summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--mm/Kconfig862
-rw-r--r--mm/Kconfig.debug172
-rw-r--r--mm/Makefile122
-rw-r--r--mm/backing-dev.c1031
-rw-r--r--mm/balloon_compaction.c260
-rw-r--r--mm/cleancache.c315
-rw-r--r--mm/cma.c543
-rw-r--r--mm/cma.h29
-rw-r--r--mm/cma_debug.c200
-rw-r--r--mm/compaction.c2926
-rw-r--r--mm/debug.c324
-rw-r--r--mm/debug_page_ref.c55
-rw-r--r--mm/debug_vm_pgtable.c1147
-rw-r--r--mm/dmapool.c529
-rw-r--r--mm/early_ioremap.c303
-rw-r--r--mm/fadvise.c219
-rw-r--r--mm/failslab.c62
-rw-r--r--mm/filemap.c3536
-rw-r--r--mm/frame_vector.c223
-rw-r--r--mm/frontswap.c497
-rw-r--r--mm/gup.c3040
-rw-r--r--mm/gup_benchmark.c210
-rw-r--r--mm/highmem.c484
-rw-r--r--mm/hmm.c599
-rw-r--r--mm/huge_memory.c3015
-rw-r--r--mm/hugetlb.c5788
-rw-r--r--mm/hugetlb_cgroup.c812
-rw-r--r--mm/hwpoison-inject.c110
-rw-r--r--mm/init-mm.c42
-rw-r--r--mm/internal.h646
-rw-r--r--mm/interval_tree.c111
-rw-r--r--mm/ioremap.c289
-rw-r--r--mm/kasan/Makefile34
-rw-r--r--mm/kasan/common.c931
-rw-r--r--mm/kasan/generic.c369
-rw-r--r--mm/kasan/generic_report.c165
-rw-r--r--mm/kasan/init.c497
-rw-r--r--mm/kasan/kasan.h299
-rw-r--r--mm/kasan/quarantine.c376
-rw-r--r--mm/kasan/report.c599
-rw-r--r--mm/kasan/tags.c200
-rw-r--r--mm/kasan/tags_report.c93
-rw-r--r--mm/khugepaged.c2400
-rw-r--r--mm/kmemleak.c1995
-rw-r--r--mm/ksm.c3205
-rw-r--r--mm/list_lru.c649
-rw-r--r--mm/maccess.c321
-rw-r--r--mm/madvise.c1247
-rw-r--r--mm/mapping_dirty_helpers.c349
-rw-r--r--mm/memblock.c2031
-rw-r--r--mm/memcontrol.c7535
-rw-r--r--mm/memfd.c346
-rw-r--r--mm/memory-failure.c1936
-rw-r--r--mm/memory.c5343
-rw-r--r--mm/memory_hotplug.c1906
-rw-r--r--mm/mempolicy.c3050
-rw-r--r--mm/mempool.c555
-rw-r--r--mm/memremap.c532
-rw-r--r--mm/memtest.c113
-rw-r--r--mm/migrate.c3122
-rw-r--r--mm/mincore.c280
-rw-r--r--mm/mlock.c878
-rw-r--r--mm/mm_init.c207
-rw-r--r--mm/mmap.c3854
-rw-r--r--mm/mmu_gather.c332
-rw-r--r--mm/mmu_notifier.c1139
-rw-r--r--mm/mmzone.c101
-rw-r--r--mm/mprotect.c700
-rw-r--r--mm/mremap.c815
-rw-r--r--mm/msync.c110
-rw-r--r--mm/nommu.c1853
-rw-r--r--mm/oom_kill.c1165
-rw-r--r--mm/page-writeback.c2850
-rw-r--r--mm/page_alloc.c8997
-rw-r--r--mm/page_counter.c262
-rw-r--r--mm/page_ext.c412
-rw-r--r--mm/page_idle.c235
-rw-r--r--mm/page_io.c417
-rw-r--r--mm/page_isolation.c313
-rw-r--r--mm/page_owner.c654
-rw-r--r--mm/page_poison.c144
-rw-r--r--mm/page_reporting.c364
-rw-r--r--mm/page_reporting.h54
-rw-r--r--mm/page_vma_mapped.c318
-rw-r--r--mm/pagewalk.c563
-rw-r--r--mm/percpu-internal.h285
-rw-r--r--mm/percpu-km.c120
-rw-r--r--mm/percpu-stats.c249
-rw-r--r--mm/percpu-vm.c379
-rw-r--r--mm/percpu.c3181
-rw-r--r--mm/pgalloc-track.h51
-rw-r--r--mm/pgtable-generic.c222
-rw-r--r--mm/process_vm_access.c305
-rw-r--r--mm/ptdump.c154
-rw-r--r--mm/readahead.c641
-rw-r--r--mm/rmap.c2019
-rw-r--r--mm/rodata_test.c54
-rw-r--r--mm/shmem.c4350
-rw-r--r--mm/shuffle.c183
-rw-r--r--mm/shuffle.h53
-rw-r--r--mm/slab.c4201
-rw-r--r--mm/slab.h638
-rw-r--r--mm/slab_common.c1197
-rw-r--r--mm/slob.c720
-rw-r--r--mm/slub.c5771
-rw-r--r--mm/sparse-vmemmap.c265
-rw-r--r--mm/sparse.c974
-rw-r--r--mm/swap.c1215
-rw-r--r--mm/swap_cgroup.c227
-rw-r--r--mm/swap_slots.c354
-rw-r--r--mm/swap_state.c953
-rw-r--r--mm/swapfile.c3862
-rw-r--r--mm/truncate.c950
-rw-r--r--mm/usercopy.c312
-rw-r--r--mm/userfaultfd.c694
-rw-r--r--mm/util.c1025
-rw-r--r--mm/vmacache.c117
-rw-r--r--mm/vmalloc.c3589
-rw-r--r--mm/vmpressure.c469
-rw-r--r--mm/vmscan.c4322
-rw-r--r--mm/vmstat.c2182
-rw-r--r--mm/workingset.c629
-rw-r--r--mm/z3fold.c1832
-rw-r--r--mm/zbud.c636
-rw-r--r--mm/zpool.c398
-rw-r--r--mm/zsmalloc.c2581
-rw-r--r--mm/zswap.c1379
127 files changed, 149454 insertions, 0 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
new file mode 100644
index 000000000..390165ffb
--- /dev/null
+++ b/mm/Kconfig
@@ -0,0 +1,862 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+menu "Memory Management options"
+
+config SELECT_MEMORY_MODEL
+ def_bool y
+ depends on ARCH_SELECT_MEMORY_MODEL
+
+choice
+ prompt "Memory model"
+ depends on SELECT_MEMORY_MODEL
+ default DISCONTIGMEM_MANUAL if ARCH_DISCONTIGMEM_DEFAULT
+ default SPARSEMEM_MANUAL if ARCH_SPARSEMEM_DEFAULT
+ default FLATMEM_MANUAL
+ help
+ This option allows you to change some of the ways that
+ Linux manages its memory internally. Most users will
+ only have one option here selected by the architecture
+ configuration. This is normal.
+
+config FLATMEM_MANUAL
+ bool "Flat Memory"
+ depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE
+ help
+ This option is best suited for non-NUMA systems with
+ flat address space. The FLATMEM is the most efficient
+ system in terms of performance and resource consumption
+ and it is the best option for smaller systems.
+
+ For systems that have holes in their physical address
+ spaces and for features like NUMA and memory hotplug,
+ choose "Sparse Memory".
+
+ If unsure, choose this option (Flat Memory) over any other.
+
+config DISCONTIGMEM_MANUAL
+ bool "Discontiguous Memory"
+ depends on ARCH_DISCONTIGMEM_ENABLE
+ help
+ This option provides enhanced support for discontiguous
+ memory systems, over FLATMEM. These systems have holes
+ in their physical address spaces, and this option provides
+ more efficient handling of these holes.
+
+ Although "Discontiguous Memory" is still used by several
+ architectures, it is considered deprecated in favor of
+ "Sparse Memory".
+
+ If unsure, choose "Sparse Memory" over this option.
+
+config SPARSEMEM_MANUAL
+ bool "Sparse Memory"
+ depends on ARCH_SPARSEMEM_ENABLE
+ help
+ This will be the only option for some systems, including
+ memory hot-plug systems. This is normal.
+
+ This option provides efficient support for systems with
+ holes is their physical address space and allows memory
+ hot-plug and hot-remove.
+
+ If unsure, choose "Flat Memory" over this option.
+
+endchoice
+
+config DISCONTIGMEM
+ def_bool y
+ depends on (!SELECT_MEMORY_MODEL && ARCH_DISCONTIGMEM_ENABLE) || DISCONTIGMEM_MANUAL
+
+config SPARSEMEM
+ def_bool y
+ depends on (!SELECT_MEMORY_MODEL && ARCH_SPARSEMEM_ENABLE) || SPARSEMEM_MANUAL
+
+config FLATMEM
+ def_bool y
+ depends on (!DISCONTIGMEM && !SPARSEMEM) || FLATMEM_MANUAL
+
+config FLAT_NODE_MEM_MAP
+ def_bool y
+ depends on !SPARSEMEM
+
+#
+# Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's
+# to represent different areas of memory. This variable allows
+# those dependencies to exist individually.
+#
+config NEED_MULTIPLE_NODES
+ def_bool y
+ depends on DISCONTIGMEM || NUMA
+
+#
+# SPARSEMEM_EXTREME (which is the default) does some bootmem
+# allocations when sparse_init() is called. If this cannot
+# be done on your architecture, select this option. However,
+# statically allocating the mem_section[] array can potentially
+# consume vast quantities of .bss, so be careful.
+#
+# This option will also potentially produce smaller runtime code
+# with gcc 3.4 and later.
+#
+config SPARSEMEM_STATIC
+ bool
+
+#
+# Architecture platforms which require a two level mem_section in SPARSEMEM
+# must select this option. This is usually for architecture platforms with
+# an extremely sparse physical address space.
+#
+config SPARSEMEM_EXTREME
+ def_bool y
+ depends on SPARSEMEM && !SPARSEMEM_STATIC
+
+config SPARSEMEM_VMEMMAP_ENABLE
+ bool
+
+config SPARSEMEM_VMEMMAP
+ bool "Sparse Memory virtual memmap"
+ depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE
+ default y
+ help
+ SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise
+ pfn_to_page and page_to_pfn operations. This is the most
+ efficient option when sufficient kernel resources are available.
+
+config HAVE_MEMBLOCK_PHYS_MAP
+ bool
+
+config HAVE_FAST_GUP
+ depends on MMU
+ bool
+
+# Don't discard allocated memory used to track "memory" and "reserved" memblocks
+# after early boot, so it can still be used to test for validity of memory.
+# Also, memblocks are updated with memory hot(un)plug.
+config ARCH_KEEP_MEMBLOCK
+ bool
+
+# Keep arch NUMA mapping infrastructure post-init.
+config NUMA_KEEP_MEMINFO
+ bool
+
+config MEMORY_ISOLATION
+ bool
+
+#
+# Only be set on architectures that have completely implemented memory hotplug
+# feature. If you are not sure, don't touch it.
+#
+config HAVE_BOOTMEM_INFO_NODE
+ def_bool n
+
+# eventually, we can have this option just 'select SPARSEMEM'
+config MEMORY_HOTPLUG
+ bool "Allow for memory hot-add"
+ select MEMORY_ISOLATION
+ depends on SPARSEMEM || X86_64_ACPI_NUMA
+ depends on ARCH_ENABLE_MEMORY_HOTPLUG
+ depends on 64BIT || BROKEN
+ select NUMA_KEEP_MEMINFO if NUMA
+
+config MEMORY_HOTPLUG_SPARSE
+ def_bool y
+ depends on SPARSEMEM && MEMORY_HOTPLUG
+
+config MEMORY_HOTPLUG_DEFAULT_ONLINE
+ bool "Online the newly added memory blocks by default"
+ depends on MEMORY_HOTPLUG
+ help
+ This option sets the default policy setting for memory hotplug
+ onlining policy (/sys/devices/system/memory/auto_online_blocks) which
+ determines what happens to newly added memory regions. Policy setting
+ can always be changed at runtime.
+ See Documentation/admin-guide/mm/memory-hotplug.rst for more information.
+
+ Say Y here if you want all hot-plugged memory blocks to appear in
+ 'online' state by default.
+ Say N here if you want the default policy to keep all hot-plugged
+ memory blocks in 'offline' state.
+
+config MEMORY_HOTREMOVE
+ bool "Allow for memory hot remove"
+ select HAVE_BOOTMEM_INFO_NODE if (X86_64 || PPC64)
+ depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
+ depends on MIGRATION
+
+# Heavily threaded applications may benefit from splitting the mm-wide
+# page_table_lock, so that faults on different parts of the user address
+# space can be handled with less contention: split it at this NR_CPUS.
+# Default to 4 for wider testing, though 8 might be more appropriate.
+# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
+# PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes.
+# SPARC32 allocates multiple pte tables within a single page, and therefore
+# a per-page lock leads to problems when multiple tables need to be locked
+# at the same time (e.g. copy_page_range()).
+# DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC spinlock_t also enlarge struct page.
+#
+config SPLIT_PTLOCK_CPUS
+ int
+ default "999999" if !MMU
+ default "999999" if ARM && !CPU_CACHE_VIPT
+ default "999999" if PARISC && !PA20
+ default "999999" if SPARC32
+ default "4"
+
+config ARCH_ENABLE_SPLIT_PMD_PTLOCK
+ bool
+
+#
+# support for memory balloon
+config MEMORY_BALLOON
+ bool
+
+#
+# support for memory balloon compaction
+config BALLOON_COMPACTION
+ bool "Allow for balloon memory compaction/migration"
+ def_bool y
+ depends on COMPACTION && MEMORY_BALLOON
+ help
+ Memory fragmentation introduced by ballooning might reduce
+ significantly the number of 2MB contiguous memory blocks that can be
+ used within a guest, thus imposing performance penalties associated
+ with the reduced number of transparent huge pages that could be used
+ by the guest workload. Allowing the compaction & migration for memory
+ pages enlisted as being part of memory balloon devices avoids the
+ scenario aforementioned and helps improving memory defragmentation.
+
+#
+# support for memory compaction
+config COMPACTION
+ bool "Allow for memory compaction"
+ def_bool y
+ select MIGRATION
+ depends on MMU
+ help
+ Compaction is the only memory management component to form
+ high order (larger physically contiguous) memory blocks
+ reliably. The page allocator relies on compaction heavily and
+ the lack of the feature can lead to unexpected OOM killer
+ invocations for high order memory requests. You shouldn't
+ disable this option unless there really is a strong reason for
+ it and then we would be really interested to hear about that at
+ linux-mm@kvack.org.
+
+#
+# support for free page reporting
+config PAGE_REPORTING
+ bool "Free page reporting"
+ def_bool n
+ help
+ Free page reporting allows for the incremental acquisition of
+ free pages from the buddy allocator for the purpose of reporting
+ those pages to another entity, such as a hypervisor, so that the
+ memory can be freed within the host for other uses.
+
+#
+# support for page migration
+#
+config MIGRATION
+ bool "Page migration"
+ def_bool y
+ depends on (NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA) && MMU
+ help
+ Allows the migration of the physical location of pages of processes
+ while the virtual addresses are not changed. This is useful in
+ two situations. The first is on NUMA systems to put pages nearer
+ to the processors accessing. The second is when allocating huge
+ pages as migration can relocate pages to satisfy a huge page
+ allocation instead of reclaiming.
+
+config ARCH_ENABLE_HUGEPAGE_MIGRATION
+ bool
+
+config ARCH_ENABLE_THP_MIGRATION
+ bool
+
+config CONTIG_ALLOC
+ def_bool (MEMORY_ISOLATION && COMPACTION) || CMA
+
+config PHYS_ADDR_T_64BIT
+ def_bool 64BIT
+
+config BOUNCE
+ bool "Enable bounce buffers"
+ default y
+ depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM)
+ help
+ Enable bounce buffers for devices that cannot access
+ the full range of memory available to the CPU. Enabled
+ by default when ZONE_DMA or HIGHMEM is selected, but you
+ may say n to override this.
+
+config VIRT_TO_BUS
+ bool
+ help
+ An architecture should select this if it implements the
+ deprecated interface virt_to_bus(). All new architectures
+ should probably not select this.
+
+
+config MMU_NOTIFIER
+ bool
+ select SRCU
+ select INTERVAL_TREE
+
+config KSM
+ bool "Enable KSM for page merging"
+ depends on MMU
+ select XXHASH
+ help
+ Enable Kernel Samepage Merging: KSM periodically scans those areas
+ of an application's address space that an app has advised may be
+ mergeable. When it finds pages of identical content, it replaces
+ the many instances by a single page with that content, so
+ saving memory until one or another app needs to modify the content.
+ Recommended for use with KVM, or with other duplicative applications.
+ See Documentation/vm/ksm.rst for more information: KSM is inactive
+ until a program has madvised that an area is MADV_MERGEABLE, and
+ root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
+
+config DEFAULT_MMAP_MIN_ADDR
+ int "Low address space to protect from user allocation"
+ depends on MMU
+ default 4096
+ help
+ This is the portion of low virtual memory which should be protected
+ from userspace allocation. Keeping a user from writing to low pages
+ can help reduce the impact of kernel NULL pointer bugs.
+
+ For most ia64, ppc64 and x86 users with lots of address space
+ a value of 65536 is reasonable and should cause no problems.
+ On arm and other archs it should not be higher than 32768.
+ Programs which use vm86 functionality or have some need to map
+ this low address space will need CAP_SYS_RAWIO or disable this
+ protection by setting the value to 0.
+
+ This value can be changed after boot using the
+ /proc/sys/vm/mmap_min_addr tunable.
+
+config ARCH_SUPPORTS_MEMORY_FAILURE
+ bool
+
+config MEMORY_FAILURE
+ depends on MMU
+ depends on ARCH_SUPPORTS_MEMORY_FAILURE
+ bool "Enable recovery from hardware memory errors"
+ select MEMORY_ISOLATION
+ select RAS
+ help
+ Enables code to recover from some memory failures on systems
+ with MCA recovery. This allows a system to continue running
+ even when some of its memory has uncorrected errors. This requires
+ special hardware support and typically ECC memory.
+
+config HWPOISON_INJECT
+ tristate "HWPoison pages injector"
+ depends on MEMORY_FAILURE && DEBUG_KERNEL && PROC_FS
+ select PROC_PAGE_MONITOR
+
+config NOMMU_INITIAL_TRIM_EXCESS
+ int "Turn on mmap() excess space trimming before booting"
+ depends on !MMU
+ default 1
+ help
+ The NOMMU mmap() frequently needs to allocate large contiguous chunks
+ of memory on which to store mappings, but it can only ask the system
+ allocator for chunks in 2^N*PAGE_SIZE amounts - which is frequently
+ more than it requires. To deal with this, mmap() is able to trim off
+ the excess and return it to the allocator.
+
+ If trimming is enabled, the excess is trimmed off and returned to the
+ system allocator, which can cause extra fragmentation, particularly
+ if there are a lot of transient processes.
+
+ If trimming is disabled, the excess is kept, but not used, which for
+ long-term mappings means that the space is wasted.
+
+ Trimming can be dynamically controlled through a sysctl option
+ (/proc/sys/vm/nr_trim_pages) which specifies the minimum number of
+ excess pages there must be before trimming should occur, or zero if
+ no trimming is to occur.
+
+ This option specifies the initial value of this option. The default
+ of 1 says that all excess pages should be trimmed.
+
+ See Documentation/admin-guide/mm/nommu-mmap.rst for more information.
+
+config TRANSPARENT_HUGEPAGE
+ bool "Transparent Hugepage Support"
+ depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
+ select COMPACTION
+ select XARRAY_MULTI
+ help
+ Transparent Hugepages allows the kernel to use huge pages and
+ huge tlb transparently to the applications whenever possible.
+ This feature can improve computing performance to certain
+ applications by speeding up page faults during memory
+ allocation, by reducing the number of tlb misses and by speeding
+ up the pagetable walking.
+
+ If memory constrained on embedded, you may want to say N.
+
+choice
+ prompt "Transparent Hugepage Support sysfs defaults"
+ depends on TRANSPARENT_HUGEPAGE
+ default TRANSPARENT_HUGEPAGE_ALWAYS
+ help
+ Selects the sysfs defaults for Transparent Hugepage Support.
+
+ config TRANSPARENT_HUGEPAGE_ALWAYS
+ bool "always"
+ help
+ Enabling Transparent Hugepage always, can increase the
+ memory footprint of applications without a guaranteed
+ benefit but it will work automatically for all applications.
+
+ config TRANSPARENT_HUGEPAGE_MADVISE
+ bool "madvise"
+ help
+ Enabling Transparent Hugepage madvise, will only provide a
+ performance improvement benefit to the applications using
+ madvise(MADV_HUGEPAGE) but it won't risk to increase the
+ memory footprint of applications without a guaranteed
+ benefit.
+endchoice
+
+config ARCH_WANTS_THP_SWAP
+ def_bool n
+
+config THP_SWAP
+ def_bool y
+ depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP
+ help
+ Swap transparent huge pages in one piece, without splitting.
+ XXX: For now, swap cluster backing transparent huge page
+ will be split after swapout.
+
+ For selection by architectures with reasonable THP sizes.
+
+#
+# UP and nommu archs use km based percpu allocator
+#
+config NEED_PER_CPU_KM
+ depends on !SMP
+ bool
+ default y
+
+config CLEANCACHE
+ bool "Enable cleancache driver to cache clean pages if tmem is present"
+ help
+ Cleancache can be thought of as a page-granularity victim cache
+ for clean pages that the kernel's pageframe replacement algorithm
+ (PFRA) would like to keep around, but can't since there isn't enough
+ memory. So when the PFRA "evicts" a page, it first attempts to use
+ cleancache code to put the data contained in that page into
+ "transcendent memory", memory that is not directly accessible or
+ addressable by the kernel and is of unknown and possibly
+ time-varying size. And when a cleancache-enabled
+ filesystem wishes to access a page in a file on disk, it first
+ checks cleancache to see if it already contains it; if it does,
+ the page is copied into the kernel and a disk access is avoided.
+ When a transcendent memory driver is available (such as zcache or
+ Xen transcendent memory), a significant I/O reduction
+ may be achieved. When none is available, all cleancache calls
+ are reduced to a single pointer-compare-against-NULL resulting
+ in a negligible performance hit.
+
+ If unsure, say Y to enable cleancache
+
+config FRONTSWAP
+ bool "Enable frontswap to cache swap pages if tmem is present"
+ depends on SWAP
+ help
+ Frontswap is so named because it can be thought of as the opposite
+ of a "backing" store for a swap device. The data is stored into
+ "transcendent memory", memory that is not directly accessible or
+ addressable by the kernel and is of unknown and possibly
+ time-varying size. When space in transcendent memory is available,
+ a significant swap I/O reduction may be achieved. When none is
+ available, all frontswap calls are reduced to a single pointer-
+ compare-against-NULL resulting in a negligible performance hit
+ and swap data is stored as normal on the matching swap device.
+
+ If unsure, say Y to enable frontswap.
+
+config CMA
+ bool "Contiguous Memory Allocator"
+ depends on MMU
+ select MIGRATION
+ select MEMORY_ISOLATION
+ help
+ This enables the Contiguous Memory Allocator which allows other
+ subsystems to allocate big physically-contiguous blocks of memory.
+ CMA reserves a region of memory and allows only movable pages to
+ be allocated from it. This way, the kernel can use the memory for
+ pagecache and when a subsystem requests for contiguous area, the
+ allocated pages are migrated away to serve the contiguous request.
+
+ If unsure, say "n".
+
+config CMA_DEBUG
+ bool "CMA debug messages (DEVELOPMENT)"
+ depends on DEBUG_KERNEL && CMA
+ help
+ Turns on debug messages in CMA. This produces KERN_DEBUG
+ messages for every CMA call as well as various messages while
+ processing calls such as dma_alloc_from_contiguous().
+ This option does not affect warning and error messages.
+
+config CMA_DEBUGFS
+ bool "CMA debugfs interface"
+ depends on CMA && DEBUG_FS
+ help
+ Turns on the DebugFS interface for CMA.
+
+config CMA_AREAS
+ int "Maximum count of the CMA areas"
+ depends on CMA
+ default 19 if NUMA
+ default 7
+ help
+ CMA allows to create CMA areas for particular purpose, mainly,
+ used as device private area. This parameter sets the maximum
+ number of CMA area in the system.
+
+ If unsure, leave the default value "7" in UMA and "19" in NUMA.
+
+config MEM_SOFT_DIRTY
+ bool "Track memory changes"
+ depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS
+ select PROC_PAGE_MONITOR
+ help
+ This option enables memory changes tracking by introducing a
+ soft-dirty bit on pte-s. This bit it set when someone writes
+ into a page just as regular dirty bit, but unlike the latter
+ it can be cleared by hands.
+
+ See Documentation/admin-guide/mm/soft-dirty.rst for more details.
+
+config ZSWAP
+ bool "Compressed cache for swap pages (EXPERIMENTAL)"
+ depends on FRONTSWAP && CRYPTO=y
+ select ZPOOL
+ help
+ A lightweight compressed cache for swap pages. It takes
+ pages that are in the process of being swapped out and attempts to
+ compress them into a dynamically allocated RAM-based memory pool.
+ This can result in a significant I/O reduction on swap device and,
+ in the case where decompressing from RAM is faster that swap device
+ reads, can also improve workload performance.
+
+ This is marked experimental because it is a new feature (as of
+ v3.11) that interacts heavily with memory reclaim. While these
+ interactions don't cause any known issues on simple memory setups,
+ they have not be fully explored on the large set of potential
+ configurations and workloads that exist.
+
+choice
+ prompt "Compressed cache for swap pages default compressor"
+ depends on ZSWAP
+ default ZSWAP_COMPRESSOR_DEFAULT_LZO
+ help
+ Selects the default compression algorithm for the compressed cache
+ for swap pages.
+
+ For an overview what kind of performance can be expected from
+ a particular compression algorithm please refer to the benchmarks
+ available at the following LWN page:
+ https://lwn.net/Articles/751795/
+
+ If in doubt, select 'LZO'.
+
+ The selection made here can be overridden by using the kernel
+ command line 'zswap.compressor=' option.
+
+config ZSWAP_COMPRESSOR_DEFAULT_DEFLATE
+ bool "Deflate"
+ select CRYPTO_DEFLATE
+ help
+ Use the Deflate algorithm as the default compression algorithm.
+
+config ZSWAP_COMPRESSOR_DEFAULT_LZO
+ bool "LZO"
+ select CRYPTO_LZO
+ help
+ Use the LZO algorithm as the default compression algorithm.
+
+config ZSWAP_COMPRESSOR_DEFAULT_842
+ bool "842"
+ select CRYPTO_842
+ help
+ Use the 842 algorithm as the default compression algorithm.
+
+config ZSWAP_COMPRESSOR_DEFAULT_LZ4
+ bool "LZ4"
+ select CRYPTO_LZ4
+ help
+ Use the LZ4 algorithm as the default compression algorithm.
+
+config ZSWAP_COMPRESSOR_DEFAULT_LZ4HC
+ bool "LZ4HC"
+ select CRYPTO_LZ4HC
+ help
+ Use the LZ4HC algorithm as the default compression algorithm.
+
+config ZSWAP_COMPRESSOR_DEFAULT_ZSTD
+ bool "zstd"
+ select CRYPTO_ZSTD
+ help
+ Use the zstd algorithm as the default compression algorithm.
+endchoice
+
+config ZSWAP_COMPRESSOR_DEFAULT
+ string
+ depends on ZSWAP
+ default "deflate" if ZSWAP_COMPRESSOR_DEFAULT_DEFLATE
+ default "lzo" if ZSWAP_COMPRESSOR_DEFAULT_LZO
+ default "842" if ZSWAP_COMPRESSOR_DEFAULT_842
+ default "lz4" if ZSWAP_COMPRESSOR_DEFAULT_LZ4
+ default "lz4hc" if ZSWAP_COMPRESSOR_DEFAULT_LZ4HC
+ default "zstd" if ZSWAP_COMPRESSOR_DEFAULT_ZSTD
+ default ""
+
+choice
+ prompt "Compressed cache for swap pages default allocator"
+ depends on ZSWAP
+ default ZSWAP_ZPOOL_DEFAULT_ZBUD
+ help
+ Selects the default allocator for the compressed cache for
+ swap pages.
+ The default is 'zbud' for compatibility, however please do
+ read the description of each of the allocators below before
+ making a right choice.
+
+ The selection made here can be overridden by using the kernel
+ command line 'zswap.zpool=' option.
+
+config ZSWAP_ZPOOL_DEFAULT_ZBUD
+ bool "zbud"
+ select ZBUD
+ help
+ Use the zbud allocator as the default allocator.
+
+config ZSWAP_ZPOOL_DEFAULT_Z3FOLD
+ bool "z3fold"
+ select Z3FOLD
+ help
+ Use the z3fold allocator as the default allocator.
+
+config ZSWAP_ZPOOL_DEFAULT_ZSMALLOC
+ bool "zsmalloc"
+ select ZSMALLOC
+ help
+ Use the zsmalloc allocator as the default allocator.
+endchoice
+
+config ZSWAP_ZPOOL_DEFAULT
+ string
+ depends on ZSWAP
+ default "zbud" if ZSWAP_ZPOOL_DEFAULT_ZBUD
+ default "z3fold" if ZSWAP_ZPOOL_DEFAULT_Z3FOLD
+ default "zsmalloc" if ZSWAP_ZPOOL_DEFAULT_ZSMALLOC
+ default ""
+
+config ZSWAP_DEFAULT_ON
+ bool "Enable the compressed cache for swap pages by default"
+ depends on ZSWAP
+ help
+ If selected, the compressed cache for swap pages will be enabled
+ at boot, otherwise it will be disabled.
+
+ The selection made here can be overridden by using the kernel
+ command line 'zswap.enabled=' option.
+
+config ZPOOL
+ tristate "Common API for compressed memory storage"
+ help
+ Compressed memory storage API. This allows using either zbud or
+ zsmalloc.
+
+config ZBUD
+ tristate "Low (Up to 2x) density storage for compressed pages"
+ help
+ A special purpose allocator for storing compressed pages.
+ It is designed to store up to two compressed pages per physical
+ page. While this design limits storage density, it has simple and
+ deterministic reclaim properties that make it preferable to a higher
+ density approach when reclaim will be used.
+
+config Z3FOLD
+ tristate "Up to 3x density storage for compressed pages"
+ depends on ZPOOL
+ help
+ A special purpose allocator for storing compressed pages.
+ It is designed to store up to three compressed pages per physical
+ page. It is a ZBUD derivative so the simplicity and determinism are
+ still there.
+
+config ZSMALLOC
+ tristate "Memory allocator for compressed pages"
+ depends on MMU
+ help
+ zsmalloc is a slab-based memory allocator designed to store
+ compressed RAM pages. zsmalloc uses virtual memory mapping
+ in order to reduce fragmentation. However, this results in a
+ non-standard allocator interface where a handle, not a pointer, is
+ returned by an alloc(). This handle must be mapped in order to
+ access the allocated space.
+
+config ZSMALLOC_STAT
+ bool "Export zsmalloc statistics"
+ depends on ZSMALLOC
+ select DEBUG_FS
+ help
+ This option enables code in the zsmalloc to collect various
+ statistics about whats happening in zsmalloc and exports that
+ information to userspace via debugfs.
+ If unsure, say N.
+
+config GENERIC_EARLY_IOREMAP
+ bool
+
+config MAX_STACK_SIZE_MB
+ int "Maximum user stack size for 32-bit processes (MB)"
+ default 80
+ range 8 2048
+ depends on STACK_GROWSUP && (!64BIT || COMPAT)
+ help
+ This is the maximum stack size in Megabytes in the VM layout of 32-bit
+ user processes when the stack grows upwards (currently only on parisc
+ arch). The stack will be located at the highest memory address minus
+ the given value, unless the RLIMIT_STACK hard limit is changed to a
+ smaller value in which case that is used.
+
+ A sane initial value is 80 MB.
+
+config DEFERRED_STRUCT_PAGE_INIT
+ bool "Defer initialisation of struct pages to kthreads"
+ depends on SPARSEMEM
+ depends on !NEED_PER_CPU_KM
+ depends on 64BIT
+ select PADATA
+ help
+ Ordinarily all struct pages are initialised during early boot in a
+ single thread. On very large machines this can take a considerable
+ amount of time. If this option is set, large machines will bring up
+ a subset of memmap at boot and then initialise the rest in parallel.
+ This has a potential performance impact on tasks running early in the
+ lifetime of the system until these kthreads finish the
+ initialisation.
+
+config IDLE_PAGE_TRACKING
+ bool "Enable idle page tracking"
+ depends on SYSFS && MMU
+ select PAGE_EXTENSION if !64BIT
+ help
+ This feature allows to estimate the amount of user pages that have
+ not been touched during a given period of time. This information can
+ be useful to tune memory cgroup limits and/or for job placement
+ within a compute cluster.
+
+ See Documentation/admin-guide/mm/idle_page_tracking.rst for
+ more details.
+
+config ARCH_HAS_PTE_DEVMAP
+ bool
+
+config ZONE_DEVICE
+ bool "Device memory (pmem, HMM, etc...) hotplug support"
+ depends on MEMORY_HOTPLUG
+ depends on MEMORY_HOTREMOVE
+ depends on SPARSEMEM_VMEMMAP
+ depends on ARCH_HAS_PTE_DEVMAP
+ select XARRAY_MULTI
+
+ help
+ Device memory hotplug support allows for establishing pmem,
+ or other device driver discovered memory regions, in the
+ memmap. This allows pfn_to_page() lookups of otherwise
+ "device-physical" addresses which is needed for using a DAX
+ mapping in an O_DIRECT operation, among other things.
+
+ If FS_DAX is enabled, then say Y.
+
+config DEV_PAGEMAP_OPS
+ bool
+
+#
+# Helpers to mirror range of the CPU page tables of a process into device page
+# tables.
+#
+config HMM_MIRROR
+ bool
+ depends on MMU
+
+config DEVICE_PRIVATE
+ bool "Unaddressable device memory (GPU memory, ...)"
+ depends on ZONE_DEVICE
+ select DEV_PAGEMAP_OPS
+
+ help
+ Allows creation of struct pages to represent unaddressable device
+ memory; i.e., memory that is only accessible from the device (or
+ group of devices). You likely also want to select HMM_MIRROR.
+
+config VMAP_PFN
+ bool
+
+config FRAME_VECTOR
+ bool
+
+config ARCH_USES_HIGH_VMA_FLAGS
+ bool
+config ARCH_HAS_PKEYS
+ bool
+
+config PERCPU_STATS
+ bool "Collect percpu memory statistics"
+ help
+ This feature collects and exposes statistics via debugfs. The
+ information includes global and per chunk statistics, which can
+ be used to help understand percpu memory usage.
+
+config GUP_BENCHMARK
+ bool "Enable infrastructure for get_user_pages() and related calls benchmarking"
+ help
+ Provides /sys/kernel/debug/gup_benchmark that helps with testing
+ performance of get_user_pages() and related calls.
+
+ See tools/testing/selftests/vm/gup_benchmark.c
+
+config GUP_GET_PTE_LOW_HIGH
+ bool
+
+config READ_ONLY_THP_FOR_FS
+ bool "Read-only THP for filesystems (EXPERIMENTAL)"
+ depends on TRANSPARENT_HUGEPAGE && SHMEM
+
+ help
+ Allow khugepaged to put read-only file-backed pages in THP.
+
+ This is marked experimental because it is a new feature. Write
+ support of file THPs will be developed in the next few release
+ cycles.
+
+config ARCH_HAS_PTE_SPECIAL
+ bool
+
+#
+# Some architectures require a special hugepage directory format that is
+# required to support multiple hugepage sizes. For example a4fe3ce76
+# "powerpc/mm: Allow more flexible layouts for hugepage pagetables"
+# introduced it on powerpc. This allows for a more flexible hugepage
+# pagetable layouts.
+#
+config ARCH_HAS_HUGEPD
+ bool
+
+config MAPPING_DIRTY_HELPERS
+ bool
+
+endmenu
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
new file mode 100644
index 000000000..864f129f1
--- /dev/null
+++ b/mm/Kconfig.debug
@@ -0,0 +1,172 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config PAGE_EXTENSION
+ bool "Extend memmap on extra space for more information on page"
+ help
+ Extend memmap on extra space for more information on page. This
+ could be used for debugging features that need to insert extra
+ field for every page. This extension enables us to save memory
+ by not allocating this extra memory according to boottime
+ configuration.
+
+config DEBUG_PAGEALLOC
+ bool "Debug page memory allocations"
+ depends on DEBUG_KERNEL
+ depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC
+ select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
+ help
+ Unmap pages from the kernel linear mapping after free_pages().
+ Depending on runtime enablement, this results in a small or large
+ slowdown, but helps to find certain types of memory corruption.
+
+ Also, the state of page tracking structures is checked more often as
+ pages are being allocated and freed, as unexpected state changes
+ often happen for same reasons as memory corruption (e.g. double free,
+ use-after-free). The error reports for these checks can be augmented
+ with stack traces of last allocation and freeing of the page, when
+ PAGE_OWNER is also selected and enabled on boot.
+
+ For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC,
+ fill the pages with poison patterns after free_pages() and verify
+ the patterns before alloc_pages(). Additionally, this option cannot
+ be enabled in combination with hibernation as that would result in
+ incorrect warnings of memory corruption after a resume because free
+ pages are not saved to the suspend image.
+
+ By default this option will have a small overhead, e.g. by not
+ allowing the kernel mapping to be backed by large pages on some
+ architectures. Even bigger overhead comes when the debugging is
+ enabled by DEBUG_PAGEALLOC_ENABLE_DEFAULT or the debug_pagealloc
+ command line parameter.
+
+config DEBUG_PAGEALLOC_ENABLE_DEFAULT
+ bool "Enable debug page memory allocations by default?"
+ depends on DEBUG_PAGEALLOC
+ help
+ Enable debug page memory allocations by default? This value
+ can be overridden by debug_pagealloc=off|on.
+
+config PAGE_OWNER
+ bool "Track page owner"
+ depends on DEBUG_KERNEL && STACKTRACE_SUPPORT
+ select DEBUG_FS
+ select STACKTRACE
+ select STACKDEPOT
+ select PAGE_EXTENSION
+ help
+ This keeps track of what call chain is the owner of a page, may
+ help to find bare alloc_page(s) leaks. Even if you include this
+ feature on your build, it is disabled in default. You should pass
+ "page_owner=on" to boot parameter in order to enable it. Eats
+ a fair amount of memory if enabled. See tools/vm/page_owner_sort.c
+ for user-space helper.
+
+ If unsure, say N.
+
+config PAGE_POISONING
+ bool "Poison pages after freeing"
+ select PAGE_POISONING_NO_SANITY if HIBERNATION
+ help
+ Fill the pages with poison patterns after free_pages() and verify
+ the patterns before alloc_pages. The filling of the memory helps
+ reduce the risk of information leaks from freed data. This does
+ have a potential performance impact if enabled with the
+ "page_poison=1" kernel boot option.
+
+ Note that "poison" here is not the same thing as the "HWPoison"
+ for CONFIG_MEMORY_FAILURE. This is software poisoning only.
+
+ If unsure, say N
+
+config PAGE_POISONING_NO_SANITY
+ depends on PAGE_POISONING
+ bool "Only poison, don't sanity check"
+ help
+ Skip the sanity checking on alloc, only fill the pages with
+ poison on free. This reduces some of the overhead of the
+ poisoning feature.
+
+ If you are only interested in sanitization, say Y. Otherwise
+ say N.
+
+config PAGE_POISONING_ZERO
+ bool "Use zero for poisoning instead of debugging value"
+ depends on PAGE_POISONING
+ help
+ Instead of using the existing poison value, fill the pages with
+ zeros. This makes it harder to detect when errors are occurring
+ due to sanitization but the zeroing at free means that it is
+ no longer necessary to write zeros when GFP_ZERO is used on
+ allocation.
+
+ If unsure, say N
+
+config DEBUG_PAGE_REF
+ bool "Enable tracepoint to track down page reference manipulation"
+ depends on DEBUG_KERNEL
+ depends on TRACEPOINTS
+ help
+ This is a feature to add tracepoint for tracking down page reference
+ manipulation. This tracking is useful to diagnose functional failure
+ due to migration failures caused by page reference mismatches. Be
+ careful when enabling this feature because it adds about 30 KB to the
+ kernel code. However the runtime performance overhead is virtually
+ nil until the tracepoints are actually enabled.
+
+config DEBUG_RODATA_TEST
+ bool "Testcase for the marking rodata read-only"
+ depends on STRICT_KERNEL_RWX
+ help
+ This option enables a testcase for the setting rodata read-only.
+
+config ARCH_HAS_DEBUG_WX
+ bool
+
+config DEBUG_WX
+ bool "Warn on W+X mappings at boot"
+ depends on ARCH_HAS_DEBUG_WX
+ depends on MMU
+ select PTDUMP_CORE
+ help
+ Generate a warning if any W+X mappings are found at boot.
+
+ This is useful for discovering cases where the kernel is leaving W+X
+ mappings after applying NX, as such mappings are a security risk.
+
+ Look for a message in dmesg output like this:
+
+ <arch>/mm: Checked W+X mappings: passed, no W+X pages found.
+
+ or like this, if the check failed:
+
+ <arch>/mm: Checked W+X mappings: failed, <N> W+X pages found.
+
+ Note that even if the check fails, your kernel is possibly
+ still fine, as W+X mappings are not a security hole in
+ themselves, what they do is that they make the exploitation
+ of other unfixed kernel bugs easier.
+
+ There is no runtime or memory usage effect of this option
+ once the kernel has booted up - it's a one time check.
+
+ If in doubt, say "Y".
+
+config GENERIC_PTDUMP
+ bool
+
+config PTDUMP_CORE
+ bool
+
+config PTDUMP_DEBUGFS
+ bool "Export kernel pagetable layout to userspace via debugfs"
+ depends on DEBUG_KERNEL
+ depends on DEBUG_FS
+ depends on GENERIC_PTDUMP
+ select PTDUMP_CORE
+ help
+ Say Y here if you want to show the kernel pagetable layout in a
+ debugfs file. This information is only useful for kernel developers
+ who are working in architecture specific areas of the kernel.
+ It is probably not a good idea to enable this feature in a production
+ kernel.
+
+ If in doubt, say N.
diff --git a/mm/Makefile b/mm/Makefile
new file mode 100644
index 000000000..d73aed0fc
--- /dev/null
+++ b/mm/Makefile
@@ -0,0 +1,122 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the linux memory manager.
+#
+
+KASAN_SANITIZE_slab_common.o := n
+KASAN_SANITIZE_slab.o := n
+KASAN_SANITIZE_slub.o := n
+KCSAN_SANITIZE_kmemleak.o := n
+
+# These produce frequent data race reports: most of them are due to races on
+# the same word but accesses to different bits of that word. Re-enable KCSAN
+# for these when we have more consensus on what to do about them.
+KCSAN_SANITIZE_slab_common.o := n
+KCSAN_SANITIZE_slab.o := n
+KCSAN_SANITIZE_slub.o := n
+KCSAN_SANITIZE_page_alloc.o := n
+
+# These files are disabled because they produce non-interesting and/or
+# flaky coverage that is not a function of syscall inputs. E.g. slab is out of
+# free pages, or a task is migrated between nodes.
+KCOV_INSTRUMENT_slab_common.o := n
+KCOV_INSTRUMENT_slob.o := n
+KCOV_INSTRUMENT_slab.o := n
+KCOV_INSTRUMENT_slub.o := n
+KCOV_INSTRUMENT_page_alloc.o := n
+KCOV_INSTRUMENT_debug-pagealloc.o := n
+KCOV_INSTRUMENT_kmemleak.o := n
+KCOV_INSTRUMENT_memcontrol.o := n
+KCOV_INSTRUMENT_mmzone.o := n
+KCOV_INSTRUMENT_vmstat.o := n
+KCOV_INSTRUMENT_failslab.o := n
+
+CFLAGS_init-mm.o += $(call cc-disable-warning, override-init)
+CFLAGS_init-mm.o += $(call cc-disable-warning, initializer-overrides)
+
+mmu-y := nommu.o
+mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \
+ mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
+ msync.o page_vma_mapped.o pagewalk.o \
+ pgtable-generic.o rmap.o vmalloc.o ioremap.o
+
+
+ifdef CONFIG_CROSS_MEMORY_ATTACH
+mmu-$(CONFIG_MMU) += process_vm_access.o
+endif
+
+obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
+ maccess.o page-writeback.o \
+ readahead.o swap.o truncate.o vmscan.o shmem.o \
+ util.o mmzone.o vmstat.o backing-dev.o \
+ mm_init.o percpu.o slab_common.o \
+ compaction.o vmacache.o \
+ interval_tree.o list_lru.o workingset.o \
+ debug.o gup.o $(mmu-y)
+
+# Give 'page_alloc' its own module-parameter namespace
+page-alloc-y := page_alloc.o
+page-alloc-$(CONFIG_SHUFFLE_PAGE_ALLOCATOR) += shuffle.o
+
+obj-y += page-alloc.o
+obj-y += init-mm.o
+obj-y += memblock.o
+
+ifdef CONFIG_MMU
+ obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o
+endif
+
+obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o swap_slots.o
+obj-$(CONFIG_FRONTSWAP) += frontswap.o
+obj-$(CONFIG_ZSWAP) += zswap.o
+obj-$(CONFIG_HAS_DMA) += dmapool.o
+obj-$(CONFIG_HUGETLBFS) += hugetlb.o
+obj-$(CONFIG_NUMA) += mempolicy.o
+obj-$(CONFIG_SPARSEMEM) += sparse.o
+obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
+obj-$(CONFIG_SLOB) += slob.o
+obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
+obj-$(CONFIG_KSM) += ksm.o
+obj-$(CONFIG_PAGE_POISONING) += page_poison.o
+obj-$(CONFIG_SLAB) += slab.o
+obj-$(CONFIG_SLUB) += slub.o
+obj-$(CONFIG_KASAN) += kasan/
+obj-$(CONFIG_FAILSLAB) += failslab.o
+obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
+obj-$(CONFIG_MEMTEST) += memtest.o
+obj-$(CONFIG_MIGRATION) += migrate.o
+obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
+obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
+obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
+obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o
+obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
+obj-$(CONFIG_GUP_BENCHMARK) += gup_benchmark.o
+obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
+obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
+obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
+obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o
+obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o
+obj-$(CONFIG_PAGE_OWNER) += page_owner.o
+obj-$(CONFIG_CLEANCACHE) += cleancache.o
+obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
+obj-$(CONFIG_ZPOOL) += zpool.o
+obj-$(CONFIG_ZBUD) += zbud.o
+obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
+obj-$(CONFIG_Z3FOLD) += z3fold.o
+obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
+obj-$(CONFIG_CMA) += cma.o
+obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
+obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
+obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
+obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
+obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
+obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
+obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
+obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
+obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
+obj-$(CONFIG_ZONE_DEVICE) += memremap.o
+obj-$(CONFIG_HMM_MIRROR) += hmm.o
+obj-$(CONFIG_MEMFD_CREATE) += memfd.o
+obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
+obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
+obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
new file mode 100644
index 000000000..dd08ab928
--- /dev/null
+++ b/mm/backing-dev.c
@@ -0,0 +1,1031 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/wait.h>
+#include <linux/rbtree.h>
+#include <linux/backing-dev.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/writeback.h>
+#include <linux/device.h>
+#include <trace/events/writeback.h>
+
+struct backing_dev_info noop_backing_dev_info;
+EXPORT_SYMBOL_GPL(noop_backing_dev_info);
+
+static struct class *bdi_class;
+static const char *bdi_unknown_name = "(unknown)";
+
+/*
+ * bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU
+ * reader side locking.
+ */
+DEFINE_SPINLOCK(bdi_lock);
+static u64 bdi_id_cursor;
+static struct rb_root bdi_tree = RB_ROOT;
+LIST_HEAD(bdi_list);
+
+/* bdi_wq serves all asynchronous writeback tasks */
+struct workqueue_struct *bdi_wq;
+
+#ifdef CONFIG_DEBUG_FS
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+static struct dentry *bdi_debug_root;
+
+static void bdi_debug_init(void)
+{
+ bdi_debug_root = debugfs_create_dir("bdi", NULL);
+}
+
+static int bdi_debug_stats_show(struct seq_file *m, void *v)
+{
+ struct backing_dev_info *bdi = m->private;
+ struct bdi_writeback *wb = &bdi->wb;
+ unsigned long background_thresh;
+ unsigned long dirty_thresh;
+ unsigned long wb_thresh;
+ unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
+ struct inode *inode;
+
+ nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0;
+ spin_lock(&wb->list_lock);
+ list_for_each_entry(inode, &wb->b_dirty, i_io_list)
+ nr_dirty++;
+ list_for_each_entry(inode, &wb->b_io, i_io_list)
+ nr_io++;
+ list_for_each_entry(inode, &wb->b_more_io, i_io_list)
+ nr_more_io++;
+ list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
+ if (inode->i_state & I_DIRTY_TIME)
+ nr_dirty_time++;
+ spin_unlock(&wb->list_lock);
+
+ global_dirty_limits(&background_thresh, &dirty_thresh);
+ wb_thresh = wb_calc_thresh(wb, dirty_thresh);
+
+#define K(x) ((x) << (PAGE_SHIFT - 10))
+ seq_printf(m,
+ "BdiWriteback: %10lu kB\n"
+ "BdiReclaimable: %10lu kB\n"
+ "BdiDirtyThresh: %10lu kB\n"
+ "DirtyThresh: %10lu kB\n"
+ "BackgroundThresh: %10lu kB\n"
+ "BdiDirtied: %10lu kB\n"
+ "BdiWritten: %10lu kB\n"
+ "BdiWriteBandwidth: %10lu kBps\n"
+ "b_dirty: %10lu\n"
+ "b_io: %10lu\n"
+ "b_more_io: %10lu\n"
+ "b_dirty_time: %10lu\n"
+ "bdi_list: %10u\n"
+ "state: %10lx\n",
+ (unsigned long) K(wb_stat(wb, WB_WRITEBACK)),
+ (unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)),
+ K(wb_thresh),
+ K(dirty_thresh),
+ K(background_thresh),
+ (unsigned long) K(wb_stat(wb, WB_DIRTIED)),
+ (unsigned long) K(wb_stat(wb, WB_WRITTEN)),
+ (unsigned long) K(wb->write_bandwidth),
+ nr_dirty,
+ nr_io,
+ nr_more_io,
+ nr_dirty_time,
+ !list_empty(&bdi->bdi_list), bdi->wb.state);
+#undef K
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats);
+
+static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
+{
+ bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);
+
+ debugfs_create_file("stats", 0444, bdi->debug_dir, bdi,
+ &bdi_debug_stats_fops);
+}
+
+static void bdi_debug_unregister(struct backing_dev_info *bdi)
+{
+ debugfs_remove_recursive(bdi->debug_dir);
+}
+#else
+static inline void bdi_debug_init(void)
+{
+}
+static inline void bdi_debug_register(struct backing_dev_info *bdi,
+ const char *name)
+{
+}
+static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
+{
+}
+#endif
+
+static ssize_t read_ahead_kb_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+ unsigned long read_ahead_kb;
+ ssize_t ret;
+
+ ret = kstrtoul(buf, 10, &read_ahead_kb);
+ if (ret < 0)
+ return ret;
+
+ bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);
+
+ return count;
+}
+
+#define K(pages) ((pages) << (PAGE_SHIFT - 10))
+
+#define BDI_SHOW(name, expr) \
+static ssize_t name##_show(struct device *dev, \
+ struct device_attribute *attr, char *page) \
+{ \
+ struct backing_dev_info *bdi = dev_get_drvdata(dev); \
+ \
+ return snprintf(page, PAGE_SIZE-1, "%lld\n", (long long)expr); \
+} \
+static DEVICE_ATTR_RW(name);
+
+BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))
+
+static ssize_t min_ratio_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+ unsigned int ratio;
+ ssize_t ret;
+
+ ret = kstrtouint(buf, 10, &ratio);
+ if (ret < 0)
+ return ret;
+
+ ret = bdi_set_min_ratio(bdi, ratio);
+ if (!ret)
+ ret = count;
+
+ return ret;
+}
+BDI_SHOW(min_ratio, bdi->min_ratio)
+
+static ssize_t max_ratio_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+ unsigned int ratio;
+ ssize_t ret;
+
+ ret = kstrtouint(buf, 10, &ratio);
+ if (ret < 0)
+ return ret;
+
+ ret = bdi_set_max_ratio(bdi, ratio);
+ if (!ret)
+ ret = count;
+
+ return ret;
+}
+BDI_SHOW(max_ratio, bdi->max_ratio)
+
+static ssize_t stable_pages_required_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page)
+{
+ dev_warn_once(dev,
+ "the stable_pages_required attribute has been removed. Use the stable_writes queue attribute instead.\n");
+ return snprintf(page, PAGE_SIZE-1, "%d\n", 0);
+}
+static DEVICE_ATTR_RO(stable_pages_required);
+
+static struct attribute *bdi_dev_attrs[] = {
+ &dev_attr_read_ahead_kb.attr,
+ &dev_attr_min_ratio.attr,
+ &dev_attr_max_ratio.attr,
+ &dev_attr_stable_pages_required.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(bdi_dev);
+
+static __init int bdi_class_init(void)
+{
+ bdi_class = class_create(THIS_MODULE, "bdi");
+ if (IS_ERR(bdi_class))
+ return PTR_ERR(bdi_class);
+
+ bdi_class->dev_groups = bdi_dev_groups;
+ bdi_debug_init();
+
+ return 0;
+}
+postcore_initcall(bdi_class_init);
+
+static int bdi_init(struct backing_dev_info *bdi);
+
+static int __init default_bdi_init(void)
+{
+ int err;
+
+ bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND |
+ WQ_SYSFS, 0);
+ if (!bdi_wq)
+ return -ENOMEM;
+
+ err = bdi_init(&noop_backing_dev_info);
+
+ return err;
+}
+subsys_initcall(default_bdi_init);
+
+/*
+ * This function is used when the first inode for this wb is marked dirty. It
+ * wakes-up the corresponding bdi thread which should then take care of the
+ * periodic background write-out of dirty inodes. Since the write-out would
+ * starts only 'dirty_writeback_interval' centisecs from now anyway, we just
+ * set up a timer which wakes the bdi thread up later.
+ *
+ * Note, we wouldn't bother setting up the timer, but this function is on the
+ * fast-path (used by '__mark_inode_dirty()'), so we save few context switches
+ * by delaying the wake-up.
+ *
+ * We have to be careful not to postpone flush work if it is scheduled for
+ * earlier. Thus we use queue_delayed_work().
+ */
+void wb_wakeup_delayed(struct bdi_writeback *wb)
+{
+ unsigned long timeout;
+
+ timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
+ spin_lock_bh(&wb->work_lock);
+ if (test_bit(WB_registered, &wb->state))
+ queue_delayed_work(bdi_wq, &wb->dwork, timeout);
+ spin_unlock_bh(&wb->work_lock);
+}
+
+/*
+ * Initial write bandwidth: 100 MB/s
+ */
+#define INIT_BW (100 << (20 - PAGE_SHIFT))
+
+static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
+ gfp_t gfp)
+{
+ int i, err;
+
+ memset(wb, 0, sizeof(*wb));
+
+ if (wb != &bdi->wb)
+ bdi_get(bdi);
+ wb->bdi = bdi;
+ wb->last_old_flush = jiffies;
+ INIT_LIST_HEAD(&wb->b_dirty);
+ INIT_LIST_HEAD(&wb->b_io);
+ INIT_LIST_HEAD(&wb->b_more_io);
+ INIT_LIST_HEAD(&wb->b_dirty_time);
+ spin_lock_init(&wb->list_lock);
+
+ wb->bw_time_stamp = jiffies;
+ wb->balanced_dirty_ratelimit = INIT_BW;
+ wb->dirty_ratelimit = INIT_BW;
+ wb->write_bandwidth = INIT_BW;
+ wb->avg_write_bandwidth = INIT_BW;
+
+ spin_lock_init(&wb->work_lock);
+ INIT_LIST_HEAD(&wb->work_list);
+ INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
+ wb->dirty_sleep = jiffies;
+
+ err = fprop_local_init_percpu(&wb->completions, gfp);
+ if (err)
+ goto out_put_bdi;
+
+ for (i = 0; i < NR_WB_STAT_ITEMS; i++) {
+ err = percpu_counter_init(&wb->stat[i], 0, gfp);
+ if (err)
+ goto out_destroy_stat;
+ }
+
+ return 0;
+
+out_destroy_stat:
+ while (i--)
+ percpu_counter_destroy(&wb->stat[i]);
+ fprop_local_destroy_percpu(&wb->completions);
+out_put_bdi:
+ if (wb != &bdi->wb)
+ bdi_put(bdi);
+ return err;
+}
+
+static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb);
+
+/*
+ * Remove bdi from the global list and shutdown any threads we have running
+ */
+static void wb_shutdown(struct bdi_writeback *wb)
+{
+ /* Make sure nobody queues further work */
+ spin_lock_bh(&wb->work_lock);
+ if (!test_and_clear_bit(WB_registered, &wb->state)) {
+ spin_unlock_bh(&wb->work_lock);
+ return;
+ }
+ spin_unlock_bh(&wb->work_lock);
+
+ cgwb_remove_from_bdi_list(wb);
+ /*
+ * Drain work list and shutdown the delayed_work. !WB_registered
+ * tells wb_workfn() that @wb is dying and its work_list needs to
+ * be drained no matter what.
+ */
+ mod_delayed_work(bdi_wq, &wb->dwork, 0);
+ flush_delayed_work(&wb->dwork);
+ WARN_ON(!list_empty(&wb->work_list));
+}
+
+static void wb_exit(struct bdi_writeback *wb)
+{
+ int i;
+
+ WARN_ON(delayed_work_pending(&wb->dwork));
+
+ for (i = 0; i < NR_WB_STAT_ITEMS; i++)
+ percpu_counter_destroy(&wb->stat[i]);
+
+ fprop_local_destroy_percpu(&wb->completions);
+ if (wb != &wb->bdi->wb)
+ bdi_put(wb->bdi);
+}
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+#include <linux/memcontrol.h>
+
+/*
+ * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, and memcg->cgwb_list.
+ * bdi->cgwb_tree is also RCU protected.
+ */
+static DEFINE_SPINLOCK(cgwb_lock);
+static struct workqueue_struct *cgwb_release_wq;
+
+static void cgwb_free_rcu(struct rcu_head *rcu_head)
+{
+ struct bdi_writeback *wb = container_of(rcu_head,
+ struct bdi_writeback, rcu);
+
+ percpu_ref_exit(&wb->refcnt);
+ kfree(wb);
+}
+
+static void cgwb_release_workfn(struct work_struct *work)
+{
+ struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
+ release_work);
+ struct blkcg *blkcg = css_to_blkcg(wb->blkcg_css);
+
+ mutex_lock(&wb->bdi->cgwb_release_mutex);
+ wb_shutdown(wb);
+
+ css_put(wb->memcg_css);
+ css_put(wb->blkcg_css);
+ mutex_unlock(&wb->bdi->cgwb_release_mutex);
+
+ /* triggers blkg destruction if no online users left */
+ blkcg_unpin_online(blkcg);
+
+ fprop_local_destroy_percpu(&wb->memcg_completions);
+ wb_exit(wb);
+ call_rcu(&wb->rcu, cgwb_free_rcu);
+}
+
+static void cgwb_release(struct percpu_ref *refcnt)
+{
+ struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback,
+ refcnt);
+ queue_work(cgwb_release_wq, &wb->release_work);
+}
+
+static void cgwb_kill(struct bdi_writeback *wb)
+{
+ lockdep_assert_held(&cgwb_lock);
+
+ WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id));
+ list_del(&wb->memcg_node);
+ list_del(&wb->blkcg_node);
+ percpu_ref_kill(&wb->refcnt);
+}
+
+static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
+{
+ spin_lock_irq(&cgwb_lock);
+ list_del_rcu(&wb->bdi_node);
+ spin_unlock_irq(&cgwb_lock);
+}
+
+static int cgwb_create(struct backing_dev_info *bdi,
+ struct cgroup_subsys_state *memcg_css, gfp_t gfp)
+{
+ struct mem_cgroup *memcg;
+ struct cgroup_subsys_state *blkcg_css;
+ struct blkcg *blkcg;
+ struct list_head *memcg_cgwb_list, *blkcg_cgwb_list;
+ struct bdi_writeback *wb;
+ unsigned long flags;
+ int ret = 0;
+
+ memcg = mem_cgroup_from_css(memcg_css);
+ blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
+ blkcg = css_to_blkcg(blkcg_css);
+ memcg_cgwb_list = &memcg->cgwb_list;
+ blkcg_cgwb_list = &blkcg->cgwb_list;
+
+ /* look up again under lock and discard on blkcg mismatch */
+ spin_lock_irqsave(&cgwb_lock, flags);
+ wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
+ if (wb && wb->blkcg_css != blkcg_css) {
+ cgwb_kill(wb);
+ wb = NULL;
+ }
+ spin_unlock_irqrestore(&cgwb_lock, flags);
+ if (wb)
+ goto out_put;
+
+ /* need to create a new one */
+ wb = kmalloc(sizeof(*wb), gfp);
+ if (!wb) {
+ ret = -ENOMEM;
+ goto out_put;
+ }
+
+ ret = wb_init(wb, bdi, gfp);
+ if (ret)
+ goto err_free;
+
+ ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp);
+ if (ret)
+ goto err_wb_exit;
+
+ ret = fprop_local_init_percpu(&wb->memcg_completions, gfp);
+ if (ret)
+ goto err_ref_exit;
+
+ wb->memcg_css = memcg_css;
+ wb->blkcg_css = blkcg_css;
+ INIT_WORK(&wb->release_work, cgwb_release_workfn);
+ set_bit(WB_registered, &wb->state);
+
+ /*
+ * The root wb determines the registered state of the whole bdi and
+ * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate
+ * whether they're still online. Don't link @wb if any is dead.
+ * See wb_memcg_offline() and wb_blkcg_offline().
+ */
+ ret = -ENODEV;
+ spin_lock_irqsave(&cgwb_lock, flags);
+ if (test_bit(WB_registered, &bdi->wb.state) &&
+ blkcg_cgwb_list->next && memcg_cgwb_list->next) {
+ /* we might have raced another instance of this function */
+ ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
+ if (!ret) {
+ list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
+ list_add(&wb->memcg_node, memcg_cgwb_list);
+ list_add(&wb->blkcg_node, blkcg_cgwb_list);
+ blkcg_pin_online(blkcg);
+ css_get(memcg_css);
+ css_get(blkcg_css);
+ }
+ }
+ spin_unlock_irqrestore(&cgwb_lock, flags);
+ if (ret) {
+ if (ret == -EEXIST)
+ ret = 0;
+ goto err_fprop_exit;
+ }
+ goto out_put;
+
+err_fprop_exit:
+ fprop_local_destroy_percpu(&wb->memcg_completions);
+err_ref_exit:
+ percpu_ref_exit(&wb->refcnt);
+err_wb_exit:
+ wb_exit(wb);
+err_free:
+ kfree(wb);
+out_put:
+ css_put(blkcg_css);
+ return ret;
+}
+
+/**
+ * wb_get_lookup - get wb for a given memcg
+ * @bdi: target bdi
+ * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
+ *
+ * Try to get the wb for @memcg_css on @bdi. The returned wb has its
+ * refcount incremented.
+ *
+ * This function uses css_get() on @memcg_css and thus expects its refcnt
+ * to be positive on invocation. IOW, rcu_read_lock() protection on
+ * @memcg_css isn't enough. try_get it before calling this function.
+ *
+ * A wb is keyed by its associated memcg. As blkcg implicitly enables
+ * memcg on the default hierarchy, memcg association is guaranteed to be
+ * more specific (equal or descendant to the associated blkcg) and thus can
+ * identify both the memcg and blkcg associations.
+ *
+ * Because the blkcg associated with a memcg may change as blkcg is enabled
+ * and disabled closer to root in the hierarchy, each wb keeps track of
+ * both the memcg and blkcg associated with it and verifies the blkcg on
+ * each lookup. On mismatch, the existing wb is discarded and a new one is
+ * created.
+ */
+struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi,
+ struct cgroup_subsys_state *memcg_css)
+{
+ struct bdi_writeback *wb;
+
+ if (!memcg_css->parent)
+ return &bdi->wb;
+
+ rcu_read_lock();
+ wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
+ if (wb) {
+ struct cgroup_subsys_state *blkcg_css;
+
+ /* see whether the blkcg association has changed */
+ blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
+ if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb)))
+ wb = NULL;
+ css_put(blkcg_css);
+ }
+ rcu_read_unlock();
+
+ return wb;
+}
+
+/**
+ * wb_get_create - get wb for a given memcg, create if necessary
+ * @bdi: target bdi
+ * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
+ * @gfp: allocation mask to use
+ *
+ * Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to
+ * create one. See wb_get_lookup() for more details.
+ */
+struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
+ struct cgroup_subsys_state *memcg_css,
+ gfp_t gfp)
+{
+ struct bdi_writeback *wb;
+
+ might_sleep_if(gfpflags_allow_blocking(gfp));
+
+ if (!memcg_css->parent)
+ return &bdi->wb;
+
+ do {
+ wb = wb_get_lookup(bdi, memcg_css);
+ } while (!wb && !cgwb_create(bdi, memcg_css, gfp));
+
+ return wb;
+}
+
+static int cgwb_bdi_init(struct backing_dev_info *bdi)
+{
+ int ret;
+
+ INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
+ mutex_init(&bdi->cgwb_release_mutex);
+ init_rwsem(&bdi->wb_switch_rwsem);
+
+ ret = wb_init(&bdi->wb, bdi, GFP_KERNEL);
+ if (!ret) {
+ bdi->wb.memcg_css = &root_mem_cgroup->css;
+ bdi->wb.blkcg_css = blkcg_root_css;
+ }
+ return ret;
+}
+
+static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
+{
+ struct radix_tree_iter iter;
+ void **slot;
+ struct bdi_writeback *wb;
+
+ WARN_ON(test_bit(WB_registered, &bdi->wb.state));
+
+ spin_lock_irq(&cgwb_lock);
+ radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
+ cgwb_kill(*slot);
+ spin_unlock_irq(&cgwb_lock);
+
+ mutex_lock(&bdi->cgwb_release_mutex);
+ spin_lock_irq(&cgwb_lock);
+ while (!list_empty(&bdi->wb_list)) {
+ wb = list_first_entry(&bdi->wb_list, struct bdi_writeback,
+ bdi_node);
+ spin_unlock_irq(&cgwb_lock);
+ wb_shutdown(wb);
+ spin_lock_irq(&cgwb_lock);
+ }
+ spin_unlock_irq(&cgwb_lock);
+ mutex_unlock(&bdi->cgwb_release_mutex);
+}
+
+/**
+ * wb_memcg_offline - kill all wb's associated with a memcg being offlined
+ * @memcg: memcg being offlined
+ *
+ * Also prevents creation of any new wb's associated with @memcg.
+ */
+void wb_memcg_offline(struct mem_cgroup *memcg)
+{
+ struct list_head *memcg_cgwb_list = &memcg->cgwb_list;
+ struct bdi_writeback *wb, *next;
+
+ spin_lock_irq(&cgwb_lock);
+ list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node)
+ cgwb_kill(wb);
+ memcg_cgwb_list->next = NULL; /* prevent new wb's */
+ spin_unlock_irq(&cgwb_lock);
+}
+
+/**
+ * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined
+ * @blkcg: blkcg being offlined
+ *
+ * Also prevents creation of any new wb's associated with @blkcg.
+ */
+void wb_blkcg_offline(struct blkcg *blkcg)
+{
+ struct bdi_writeback *wb, *next;
+
+ spin_lock_irq(&cgwb_lock);
+ list_for_each_entry_safe(wb, next, &blkcg->cgwb_list, blkcg_node)
+ cgwb_kill(wb);
+ blkcg->cgwb_list.next = NULL; /* prevent new wb's */
+ spin_unlock_irq(&cgwb_lock);
+}
+
+static void cgwb_bdi_register(struct backing_dev_info *bdi)
+{
+ spin_lock_irq(&cgwb_lock);
+ list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
+ spin_unlock_irq(&cgwb_lock);
+}
+
+static int __init cgwb_init(void)
+{
+ /*
+ * There can be many concurrent release work items overwhelming
+ * system_wq. Put them in a separate wq and limit concurrency.
+ * There's no point in executing many of these in parallel.
+ */
+ cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1);
+ if (!cgwb_release_wq)
+ return -ENOMEM;
+
+ return 0;
+}
+subsys_initcall(cgwb_init);
+
+#else /* CONFIG_CGROUP_WRITEBACK */
+
+static int cgwb_bdi_init(struct backing_dev_info *bdi)
+{
+ return wb_init(&bdi->wb, bdi, GFP_KERNEL);
+}
+
+static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { }
+
+static void cgwb_bdi_register(struct backing_dev_info *bdi)
+{
+ list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
+}
+
+static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
+{
+ list_del_rcu(&wb->bdi_node);
+}
+
+#endif /* CONFIG_CGROUP_WRITEBACK */
+
+static int bdi_init(struct backing_dev_info *bdi)
+{
+ int ret;
+
+ bdi->dev = NULL;
+
+ kref_init(&bdi->refcnt);
+ bdi->min_ratio = 0;
+ bdi->max_ratio = 100;
+ bdi->max_prop_frac = FPROP_FRAC_BASE;
+ INIT_LIST_HEAD(&bdi->bdi_list);
+ INIT_LIST_HEAD(&bdi->wb_list);
+ init_waitqueue_head(&bdi->wb_waitq);
+
+ ret = cgwb_bdi_init(bdi);
+
+ return ret;
+}
+
+struct backing_dev_info *bdi_alloc(int node_id)
+{
+ struct backing_dev_info *bdi;
+
+ bdi = kzalloc_node(sizeof(*bdi), GFP_KERNEL, node_id);
+ if (!bdi)
+ return NULL;
+
+ if (bdi_init(bdi)) {
+ kfree(bdi);
+ return NULL;
+ }
+ bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT;
+ bdi->ra_pages = VM_READAHEAD_PAGES;
+ bdi->io_pages = VM_READAHEAD_PAGES;
+ return bdi;
+}
+EXPORT_SYMBOL(bdi_alloc);
+
+static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp)
+{
+ struct rb_node **p = &bdi_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct backing_dev_info *bdi;
+
+ lockdep_assert_held(&bdi_lock);
+
+ while (*p) {
+ parent = *p;
+ bdi = rb_entry(parent, struct backing_dev_info, rb_node);
+
+ if (bdi->id > id)
+ p = &(*p)->rb_left;
+ else if (bdi->id < id)
+ p = &(*p)->rb_right;
+ else
+ break;
+ }
+
+ if (parentp)
+ *parentp = parent;
+ return p;
+}
+
+/**
+ * bdi_get_by_id - lookup and get bdi from its id
+ * @id: bdi id to lookup
+ *
+ * Find bdi matching @id and get it. Returns NULL if the matching bdi
+ * doesn't exist or is already unregistered.
+ */
+struct backing_dev_info *bdi_get_by_id(u64 id)
+{
+ struct backing_dev_info *bdi = NULL;
+ struct rb_node **p;
+
+ spin_lock_bh(&bdi_lock);
+ p = bdi_lookup_rb_node(id, NULL);
+ if (*p) {
+ bdi = rb_entry(*p, struct backing_dev_info, rb_node);
+ bdi_get(bdi);
+ }
+ spin_unlock_bh(&bdi_lock);
+
+ return bdi;
+}
+
+int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
+{
+ struct device *dev;
+ struct rb_node *parent, **p;
+
+ if (bdi->dev) /* The driver needs to use separate queues per device */
+ return 0;
+
+ vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args);
+ dev = device_create(bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name);
+ if (IS_ERR(dev))
+ return PTR_ERR(dev);
+
+ cgwb_bdi_register(bdi);
+ bdi->dev = dev;
+
+ bdi_debug_register(bdi, dev_name(dev));
+ set_bit(WB_registered, &bdi->wb.state);
+
+ spin_lock_bh(&bdi_lock);
+
+ bdi->id = ++bdi_id_cursor;
+
+ p = bdi_lookup_rb_node(bdi->id, &parent);
+ rb_link_node(&bdi->rb_node, parent, p);
+ rb_insert_color(&bdi->rb_node, &bdi_tree);
+
+ list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
+
+ spin_unlock_bh(&bdi_lock);
+
+ trace_writeback_bdi_register(bdi);
+ return 0;
+}
+
+int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...)
+{
+ va_list args;
+ int ret;
+
+ va_start(args, fmt);
+ ret = bdi_register_va(bdi, fmt, args);
+ va_end(args);
+ return ret;
+}
+EXPORT_SYMBOL(bdi_register);
+
+void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner)
+{
+ WARN_ON_ONCE(bdi->owner);
+ bdi->owner = owner;
+ get_device(owner);
+}
+
+/*
+ * Remove bdi from bdi_list, and ensure that it is no longer visible
+ */
+static void bdi_remove_from_list(struct backing_dev_info *bdi)
+{
+ spin_lock_bh(&bdi_lock);
+ rb_erase(&bdi->rb_node, &bdi_tree);
+ list_del_rcu(&bdi->bdi_list);
+ spin_unlock_bh(&bdi_lock);
+
+ synchronize_rcu_expedited();
+}
+
+void bdi_unregister(struct backing_dev_info *bdi)
+{
+ /* make sure nobody finds us on the bdi_list anymore */
+ bdi_remove_from_list(bdi);
+ wb_shutdown(&bdi->wb);
+ cgwb_bdi_unregister(bdi);
+
+ /*
+ * If this BDI's min ratio has been set, use bdi_set_min_ratio() to
+ * update the global bdi_min_ratio.
+ */
+ if (bdi->min_ratio)
+ bdi_set_min_ratio(bdi, 0);
+
+ if (bdi->dev) {
+ bdi_debug_unregister(bdi);
+ device_unregister(bdi->dev);
+ bdi->dev = NULL;
+ }
+
+ if (bdi->owner) {
+ put_device(bdi->owner);
+ bdi->owner = NULL;
+ }
+}
+
+static void release_bdi(struct kref *ref)
+{
+ struct backing_dev_info *bdi =
+ container_of(ref, struct backing_dev_info, refcnt);
+
+ if (test_bit(WB_registered, &bdi->wb.state))
+ bdi_unregister(bdi);
+ WARN_ON_ONCE(bdi->dev);
+ wb_exit(&bdi->wb);
+ kfree(bdi);
+}
+
+void bdi_put(struct backing_dev_info *bdi)
+{
+ kref_put(&bdi->refcnt, release_bdi);
+}
+EXPORT_SYMBOL(bdi_put);
+
+const char *bdi_dev_name(struct backing_dev_info *bdi)
+{
+ if (!bdi || !bdi->dev)
+ return bdi_unknown_name;
+ return bdi->dev_name;
+}
+EXPORT_SYMBOL_GPL(bdi_dev_name);
+
+static wait_queue_head_t congestion_wqh[2] = {
+ __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
+ __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
+ };
+static atomic_t nr_wb_congested[2];
+
+void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
+{
+ wait_queue_head_t *wqh = &congestion_wqh[sync];
+ enum wb_congested_state bit;
+
+ bit = sync ? WB_sync_congested : WB_async_congested;
+ if (test_and_clear_bit(bit, &bdi->wb.congested))
+ atomic_dec(&nr_wb_congested[sync]);
+ smp_mb__after_atomic();
+ if (waitqueue_active(wqh))
+ wake_up(wqh);
+}
+EXPORT_SYMBOL(clear_bdi_congested);
+
+void set_bdi_congested(struct backing_dev_info *bdi, int sync)
+{
+ enum wb_congested_state bit;
+
+ bit = sync ? WB_sync_congested : WB_async_congested;
+ if (!test_and_set_bit(bit, &bdi->wb.congested))
+ atomic_inc(&nr_wb_congested[sync]);
+}
+EXPORT_SYMBOL(set_bdi_congested);
+
+/**
+ * congestion_wait - wait for a backing_dev to become uncongested
+ * @sync: SYNC or ASYNC IO
+ * @timeout: timeout in jiffies
+ *
+ * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
+ * write congestion. If no backing_devs are congested then just wait for the
+ * next write to be completed.
+ */
+long congestion_wait(int sync, long timeout)
+{
+ long ret;
+ unsigned long start = jiffies;
+ DEFINE_WAIT(wait);
+ wait_queue_head_t *wqh = &congestion_wqh[sync];
+
+ prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+ ret = io_schedule_timeout(timeout);
+ finish_wait(wqh, &wait);
+
+ trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
+ jiffies_to_usecs(jiffies - start));
+
+ return ret;
+}
+EXPORT_SYMBOL(congestion_wait);
+
+/**
+ * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a pgdat to complete writes
+ * @sync: SYNC or ASYNC IO
+ * @timeout: timeout in jiffies
+ *
+ * In the event of a congested backing_dev (any backing_dev) this waits
+ * for up to @timeout jiffies for either a BDI to exit congestion of the
+ * given @sync queue or a write to complete.
+ *
+ * The return value is 0 if the sleep is for the full timeout. Otherwise,
+ * it is the number of jiffies that were still remaining when the function
+ * returned. return_value == timeout implies the function did not sleep.
+ */
+long wait_iff_congested(int sync, long timeout)
+{
+ long ret;
+ unsigned long start = jiffies;
+ DEFINE_WAIT(wait);
+ wait_queue_head_t *wqh = &congestion_wqh[sync];
+
+ /*
+ * If there is no congestion, yield if necessary instead
+ * of sleeping on the congestion queue
+ */
+ if (atomic_read(&nr_wb_congested[sync]) == 0) {
+ cond_resched();
+
+ /* In case we scheduled, work out time remaining */
+ ret = timeout - (jiffies - start);
+ if (ret < 0)
+ ret = 0;
+
+ goto out;
+ }
+
+ /* Sleep until uncongested or a write happens */
+ prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+ ret = io_schedule_timeout(timeout);
+ finish_wait(wqh, &wait);
+
+out:
+ trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
+ jiffies_to_usecs(jiffies - start));
+
+ return ret;
+}
+EXPORT_SYMBOL(wait_iff_congested);
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
new file mode 100644
index 000000000..26de020aa
--- /dev/null
+++ b/mm/balloon_compaction.c
@@ -0,0 +1,260 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * mm/balloon_compaction.c
+ *
+ * Common interface for making balloon pages movable by compaction.
+ *
+ * Copyright (C) 2012, Red Hat, Inc. Rafael Aquini <aquini@redhat.com>
+ */
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/balloon_compaction.h>
+
+static void balloon_page_enqueue_one(struct balloon_dev_info *b_dev_info,
+ struct page *page)
+{
+ /*
+ * Block others from accessing the 'page' when we get around to
+ * establishing additional references. We should be the only one
+ * holding a reference to the 'page' at this point. If we are not, then
+ * memory corruption is possible and we should stop execution.
+ */
+ BUG_ON(!trylock_page(page));
+ balloon_page_insert(b_dev_info, page);
+ unlock_page(page);
+ __count_vm_event(BALLOON_INFLATE);
+}
+
+/**
+ * balloon_page_list_enqueue() - inserts a list of pages into the balloon page
+ * list.
+ * @b_dev_info: balloon device descriptor where we will insert a new page to
+ * @pages: pages to enqueue - allocated using balloon_page_alloc.
+ *
+ * Driver must call this function to properly enqueue balloon pages before
+ * definitively removing them from the guest system.
+ *
+ * Return: number of pages that were enqueued.
+ */
+size_t balloon_page_list_enqueue(struct balloon_dev_info *b_dev_info,
+ struct list_head *pages)
+{
+ struct page *page, *tmp;
+ unsigned long flags;
+ size_t n_pages = 0;
+
+ spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ list_for_each_entry_safe(page, tmp, pages, lru) {
+ list_del(&page->lru);
+ balloon_page_enqueue_one(b_dev_info, page);
+ n_pages++;
+ }
+ spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+ return n_pages;
+}
+EXPORT_SYMBOL_GPL(balloon_page_list_enqueue);
+
+/**
+ * balloon_page_list_dequeue() - removes pages from balloon's page list and
+ * returns a list of the pages.
+ * @b_dev_info: balloon device decriptor where we will grab a page from.
+ * @pages: pointer to the list of pages that would be returned to the caller.
+ * @n_req_pages: number of requested pages.
+ *
+ * Driver must call this function to properly de-allocate a previous enlisted
+ * balloon pages before definitively releasing it back to the guest system.
+ * This function tries to remove @n_req_pages from the ballooned pages and
+ * return them to the caller in the @pages list.
+ *
+ * Note that this function may fail to dequeue some pages even if the balloon
+ * isn't empty - since the page list can be temporarily empty due to compaction
+ * of isolated pages.
+ *
+ * Return: number of pages that were added to the @pages list.
+ */
+size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info,
+ struct list_head *pages, size_t n_req_pages)
+{
+ struct page *page, *tmp;
+ unsigned long flags;
+ size_t n_pages = 0;
+
+ spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) {
+ if (n_pages == n_req_pages)
+ break;
+
+ /*
+ * Block others from accessing the 'page' while we get around to
+ * establishing additional references and preparing the 'page'
+ * to be released by the balloon driver.
+ */
+ if (!trylock_page(page))
+ continue;
+
+ if (IS_ENABLED(CONFIG_BALLOON_COMPACTION) &&
+ PageIsolated(page)) {
+ /* raced with isolation */
+ unlock_page(page);
+ continue;
+ }
+ balloon_page_delete(page);
+ __count_vm_event(BALLOON_DEFLATE);
+ list_add(&page->lru, pages);
+ unlock_page(page);
+ n_pages++;
+ }
+ spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+
+ return n_pages;
+}
+EXPORT_SYMBOL_GPL(balloon_page_list_dequeue);
+
+/*
+ * balloon_page_alloc - allocates a new page for insertion into the balloon
+ * page list.
+ *
+ * Driver must call this function to properly allocate a new balloon page.
+ * Driver must call balloon_page_enqueue before definitively removing the page
+ * from the guest system.
+ *
+ * Return: struct page for the allocated page or NULL on allocation failure.
+ */
+struct page *balloon_page_alloc(void)
+{
+ struct page *page = alloc_page(balloon_mapping_gfp_mask() |
+ __GFP_NOMEMALLOC | __GFP_NORETRY |
+ __GFP_NOWARN);
+ return page;
+}
+EXPORT_SYMBOL_GPL(balloon_page_alloc);
+
+/*
+ * balloon_page_enqueue - inserts a new page into the balloon page list.
+ *
+ * @b_dev_info: balloon device descriptor where we will insert a new page
+ * @page: new page to enqueue - allocated using balloon_page_alloc.
+ *
+ * Drivers must call this function to properly enqueue a new allocated balloon
+ * page before definitively removing the page from the guest system.
+ *
+ * Drivers must not call balloon_page_enqueue on pages that have been pushed to
+ * a list with balloon_page_push before removing them with balloon_page_pop. To
+ * enqueue a list of pages, use balloon_page_list_enqueue instead.
+ */
+void balloon_page_enqueue(struct balloon_dev_info *b_dev_info,
+ struct page *page)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ balloon_page_enqueue_one(b_dev_info, page);
+ spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+}
+EXPORT_SYMBOL_GPL(balloon_page_enqueue);
+
+/*
+ * balloon_page_dequeue - removes a page from balloon's page list and returns
+ * its address to allow the driver to release the page.
+ * @b_dev_info: balloon device decriptor where we will grab a page from.
+ *
+ * Driver must call this function to properly dequeue a previously enqueued page
+ * before definitively releasing it back to the guest system.
+ *
+ * Caller must perform its own accounting to ensure that this
+ * function is called only if some pages are actually enqueued.
+ *
+ * Note that this function may fail to dequeue some pages even if there are
+ * some enqueued pages - since the page list can be temporarily empty due to
+ * the compaction of isolated pages.
+ *
+ * TODO: remove the caller accounting requirements, and allow caller to wait
+ * until all pages can be dequeued.
+ *
+ * Return: struct page for the dequeued page, or NULL if no page was dequeued.
+ */
+struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
+{
+ unsigned long flags;
+ LIST_HEAD(pages);
+ int n_pages;
+
+ n_pages = balloon_page_list_dequeue(b_dev_info, &pages, 1);
+
+ if (n_pages != 1) {
+ /*
+ * If we are unable to dequeue a balloon page because the page
+ * list is empty and there are no isolated pages, then something
+ * went out of track and some balloon pages are lost.
+ * BUG() here, otherwise the balloon driver may get stuck in
+ * an infinite loop while attempting to release all its pages.
+ */
+ spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ if (unlikely(list_empty(&b_dev_info->pages) &&
+ !b_dev_info->isolated_pages))
+ BUG();
+ spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+ return NULL;
+ }
+ return list_first_entry(&pages, struct page, lru);
+}
+EXPORT_SYMBOL_GPL(balloon_page_dequeue);
+
+#ifdef CONFIG_BALLOON_COMPACTION
+
+bool balloon_page_isolate(struct page *page, isolate_mode_t mode)
+
+{
+ struct balloon_dev_info *b_dev_info = balloon_page_device(page);
+ unsigned long flags;
+
+ spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ list_del(&page->lru);
+ b_dev_info->isolated_pages++;
+ spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+
+ return true;
+}
+
+void balloon_page_putback(struct page *page)
+{
+ struct balloon_dev_info *b_dev_info = balloon_page_device(page);
+ unsigned long flags;
+
+ spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ list_add(&page->lru, &b_dev_info->pages);
+ b_dev_info->isolated_pages--;
+ spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+}
+
+
+/* move_to_new_page() counterpart for a ballooned page */
+int balloon_page_migrate(struct address_space *mapping,
+ struct page *newpage, struct page *page,
+ enum migrate_mode mode)
+{
+ struct balloon_dev_info *balloon = balloon_page_device(page);
+
+ /*
+ * We can not easily support the no copy case here so ignore it as it
+ * is unlikely to be used with balloon pages. See include/linux/hmm.h
+ * for a user of the MIGRATE_SYNC_NO_COPY mode.
+ */
+ if (mode == MIGRATE_SYNC_NO_COPY)
+ return -EINVAL;
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
+
+ return balloon->migratepage(balloon, newpage, page, mode);
+}
+
+const struct address_space_operations balloon_aops = {
+ .migratepage = balloon_page_migrate,
+ .isolate_page = balloon_page_isolate,
+ .putback_page = balloon_page_putback,
+};
+EXPORT_SYMBOL_GPL(balloon_aops);
+
+#endif /* CONFIG_BALLOON_COMPACTION */
diff --git a/mm/cleancache.c b/mm/cleancache.c
new file mode 100644
index 000000000..db7eee9c0
--- /dev/null
+++ b/mm/cleancache.c
@@ -0,0 +1,315 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Cleancache frontend
+ *
+ * This code provides the generic "frontend" layer to call a matching
+ * "backend" driver implementation of cleancache. See
+ * Documentation/vm/cleancache.rst for more information.
+ *
+ * Copyright (C) 2009-2010 Oracle Corp. All rights reserved.
+ * Author: Dan Magenheimer
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/exportfs.h>
+#include <linux/mm.h>
+#include <linux/debugfs.h>
+#include <linux/cleancache.h>
+
+/*
+ * cleancache_ops is set by cleancache_register_ops to contain the pointers
+ * to the cleancache "backend" implementation functions.
+ */
+static const struct cleancache_ops *cleancache_ops __read_mostly;
+
+/*
+ * Counters available via /sys/kernel/debug/cleancache (if debugfs is
+ * properly configured. These are for information only so are not protected
+ * against increment races.
+ */
+static u64 cleancache_succ_gets;
+static u64 cleancache_failed_gets;
+static u64 cleancache_puts;
+static u64 cleancache_invalidates;
+
+static void cleancache_register_ops_sb(struct super_block *sb, void *unused)
+{
+ switch (sb->cleancache_poolid) {
+ case CLEANCACHE_NO_BACKEND:
+ __cleancache_init_fs(sb);
+ break;
+ case CLEANCACHE_NO_BACKEND_SHARED:
+ __cleancache_init_shared_fs(sb);
+ break;
+ }
+}
+
+/*
+ * Register operations for cleancache. Returns 0 on success.
+ */
+int cleancache_register_ops(const struct cleancache_ops *ops)
+{
+ if (cmpxchg(&cleancache_ops, NULL, ops))
+ return -EBUSY;
+
+ /*
+ * A cleancache backend can be built as a module and hence loaded after
+ * a cleancache enabled filesystem has called cleancache_init_fs. To
+ * handle such a scenario, here we call ->init_fs or ->init_shared_fs
+ * for each active super block. To differentiate between local and
+ * shared filesystems, we temporarily initialize sb->cleancache_poolid
+ * to CLEANCACHE_NO_BACKEND or CLEANCACHE_NO_BACKEND_SHARED
+ * respectively in case there is no backend registered at the time
+ * cleancache_init_fs or cleancache_init_shared_fs is called.
+ *
+ * Since filesystems can be mounted concurrently with cleancache
+ * backend registration, we have to be careful to guarantee that all
+ * cleancache enabled filesystems that has been mounted by the time
+ * cleancache_register_ops is called has got and all mounted later will
+ * get cleancache_poolid. This is assured by the following statements
+ * tied together:
+ *
+ * a) iterate_supers skips only those super blocks that has started
+ * ->kill_sb
+ *
+ * b) if iterate_supers encounters a super block that has not finished
+ * ->mount yet, it waits until it is finished
+ *
+ * c) cleancache_init_fs is called from ->mount and
+ * cleancache_invalidate_fs is called from ->kill_sb
+ *
+ * d) we call iterate_supers after cleancache_ops has been set
+ *
+ * From a) it follows that if iterate_supers skips a super block, then
+ * either the super block is already dead, in which case we do not need
+ * to bother initializing cleancache for it, or it was mounted after we
+ * initiated iterate_supers. In the latter case, it must have seen
+ * cleancache_ops set according to d) and initialized cleancache from
+ * ->mount by itself according to c). This proves that we call
+ * ->init_fs at least once for each active super block.
+ *
+ * From b) and c) it follows that if iterate_supers encounters a super
+ * block that has already started ->init_fs, it will wait until ->mount
+ * and hence ->init_fs has finished, then check cleancache_poolid, see
+ * that it has already been set and therefore do nothing. This proves
+ * that we call ->init_fs no more than once for each super block.
+ *
+ * Combined together, the last two paragraphs prove the function
+ * correctness.
+ *
+ * Note that various cleancache callbacks may proceed before this
+ * function is called or even concurrently with it, but since
+ * CLEANCACHE_NO_BACKEND is negative, they will all result in a noop
+ * until the corresponding ->init_fs has been actually called and
+ * cleancache_ops has been set.
+ */
+ iterate_supers(cleancache_register_ops_sb, NULL);
+ return 0;
+}
+EXPORT_SYMBOL(cleancache_register_ops);
+
+/* Called by a cleancache-enabled filesystem at time of mount */
+void __cleancache_init_fs(struct super_block *sb)
+{
+ int pool_id = CLEANCACHE_NO_BACKEND;
+
+ if (cleancache_ops) {
+ pool_id = cleancache_ops->init_fs(PAGE_SIZE);
+ if (pool_id < 0)
+ pool_id = CLEANCACHE_NO_POOL;
+ }
+ sb->cleancache_poolid = pool_id;
+}
+EXPORT_SYMBOL(__cleancache_init_fs);
+
+/* Called by a cleancache-enabled clustered filesystem at time of mount */
+void __cleancache_init_shared_fs(struct super_block *sb)
+{
+ int pool_id = CLEANCACHE_NO_BACKEND_SHARED;
+
+ if (cleancache_ops) {
+ pool_id = cleancache_ops->init_shared_fs(&sb->s_uuid, PAGE_SIZE);
+ if (pool_id < 0)
+ pool_id = CLEANCACHE_NO_POOL;
+ }
+ sb->cleancache_poolid = pool_id;
+}
+EXPORT_SYMBOL(__cleancache_init_shared_fs);
+
+/*
+ * If the filesystem uses exportable filehandles, use the filehandle as
+ * the key, else use the inode number.
+ */
+static int cleancache_get_key(struct inode *inode,
+ struct cleancache_filekey *key)
+{
+ int (*fhfn)(struct inode *, __u32 *fh, int *, struct inode *);
+ int len = 0, maxlen = CLEANCACHE_KEY_MAX;
+ struct super_block *sb = inode->i_sb;
+
+ key->u.ino = inode->i_ino;
+ if (sb->s_export_op != NULL) {
+ fhfn = sb->s_export_op->encode_fh;
+ if (fhfn) {
+ len = (*fhfn)(inode, &key->u.fh[0], &maxlen, NULL);
+ if (len <= FILEID_ROOT || len == FILEID_INVALID)
+ return -1;
+ if (maxlen > CLEANCACHE_KEY_MAX)
+ return -1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * "Get" data from cleancache associated with the poolid/inode/index
+ * that were specified when the data was put to cleanache and, if
+ * successful, use it to fill the specified page with data and return 0.
+ * The pageframe is unchanged and returns -1 if the get fails.
+ * Page must be locked by caller.
+ *
+ * The function has two checks before any action is taken - whether
+ * a backend is registered and whether the sb->cleancache_poolid
+ * is correct.
+ */
+int __cleancache_get_page(struct page *page)
+{
+ int ret = -1;
+ int pool_id;
+ struct cleancache_filekey key = { .u.key = { 0 } };
+
+ if (!cleancache_ops) {
+ cleancache_failed_gets++;
+ goto out;
+ }
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ pool_id = page->mapping->host->i_sb->cleancache_poolid;
+ if (pool_id < 0)
+ goto out;
+
+ if (cleancache_get_key(page->mapping->host, &key) < 0)
+ goto out;
+
+ ret = cleancache_ops->get_page(pool_id, key, page->index, page);
+ if (ret == 0)
+ cleancache_succ_gets++;
+ else
+ cleancache_failed_gets++;
+out:
+ return ret;
+}
+EXPORT_SYMBOL(__cleancache_get_page);
+
+/*
+ * "Put" data from a page to cleancache and associate it with the
+ * (previously-obtained per-filesystem) poolid and the page's,
+ * inode and page index. Page must be locked. Note that a put_page
+ * always "succeeds", though a subsequent get_page may succeed or fail.
+ *
+ * The function has two checks before any action is taken - whether
+ * a backend is registered and whether the sb->cleancache_poolid
+ * is correct.
+ */
+void __cleancache_put_page(struct page *page)
+{
+ int pool_id;
+ struct cleancache_filekey key = { .u.key = { 0 } };
+
+ if (!cleancache_ops) {
+ cleancache_puts++;
+ return;
+ }
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ pool_id = page->mapping->host->i_sb->cleancache_poolid;
+ if (pool_id >= 0 &&
+ cleancache_get_key(page->mapping->host, &key) >= 0) {
+ cleancache_ops->put_page(pool_id, key, page->index, page);
+ cleancache_puts++;
+ }
+}
+EXPORT_SYMBOL(__cleancache_put_page);
+
+/*
+ * Invalidate any data from cleancache associated with the poolid and the
+ * page's inode and page index so that a subsequent "get" will fail.
+ *
+ * The function has two checks before any action is taken - whether
+ * a backend is registered and whether the sb->cleancache_poolid
+ * is correct.
+ */
+void __cleancache_invalidate_page(struct address_space *mapping,
+ struct page *page)
+{
+ /* careful... page->mapping is NULL sometimes when this is called */
+ int pool_id = mapping->host->i_sb->cleancache_poolid;
+ struct cleancache_filekey key = { .u.key = { 0 } };
+
+ if (!cleancache_ops)
+ return;
+
+ if (pool_id >= 0) {
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ if (cleancache_get_key(mapping->host, &key) >= 0) {
+ cleancache_ops->invalidate_page(pool_id,
+ key, page->index);
+ cleancache_invalidates++;
+ }
+ }
+}
+EXPORT_SYMBOL(__cleancache_invalidate_page);
+
+/*
+ * Invalidate all data from cleancache associated with the poolid and the
+ * mappings's inode so that all subsequent gets to this poolid/inode
+ * will fail.
+ *
+ * The function has two checks before any action is taken - whether
+ * a backend is registered and whether the sb->cleancache_poolid
+ * is correct.
+ */
+void __cleancache_invalidate_inode(struct address_space *mapping)
+{
+ int pool_id = mapping->host->i_sb->cleancache_poolid;
+ struct cleancache_filekey key = { .u.key = { 0 } };
+
+ if (!cleancache_ops)
+ return;
+
+ if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0)
+ cleancache_ops->invalidate_inode(pool_id, key);
+}
+EXPORT_SYMBOL(__cleancache_invalidate_inode);
+
+/*
+ * Called by any cleancache-enabled filesystem at time of unmount;
+ * note that pool_id is surrendered and may be returned by a subsequent
+ * cleancache_init_fs or cleancache_init_shared_fs.
+ */
+void __cleancache_invalidate_fs(struct super_block *sb)
+{
+ int pool_id;
+
+ pool_id = sb->cleancache_poolid;
+ sb->cleancache_poolid = CLEANCACHE_NO_POOL;
+
+ if (cleancache_ops && pool_id >= 0)
+ cleancache_ops->invalidate_fs(pool_id);
+}
+EXPORT_SYMBOL(__cleancache_invalidate_fs);
+
+static int __init init_cleancache(void)
+{
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *root = debugfs_create_dir("cleancache", NULL);
+
+ debugfs_create_u64("succ_gets", 0444, root, &cleancache_succ_gets);
+ debugfs_create_u64("failed_gets", 0444, root, &cleancache_failed_gets);
+ debugfs_create_u64("puts", 0444, root, &cleancache_puts);
+ debugfs_create_u64("invalidates", 0444, root, &cleancache_invalidates);
+#endif
+ return 0;
+}
+module_init(init_cleancache)
diff --git a/mm/cma.c b/mm/cma.c
new file mode 100644
index 000000000..3c3f1436f
--- /dev/null
+++ b/mm/cma.c
@@ -0,0 +1,543 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Contiguous Memory Allocator
+ *
+ * Copyright (c) 2010-2011 by Samsung Electronics.
+ * Copyright IBM Corporation, 2013
+ * Copyright LG Electronics Inc., 2014
+ * Written by:
+ * Marek Szyprowski <m.szyprowski@samsung.com>
+ * Michal Nazarewicz <mina86@mina86.com>
+ * Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ * Joonsoo Kim <iamjoonsoo.kim@lge.com>
+ */
+
+#define pr_fmt(fmt) "cma: " fmt
+
+#ifdef CONFIG_CMA_DEBUG
+#ifndef DEBUG
+# define DEBUG
+#endif
+#endif
+#define CREATE_TRACE_POINTS
+
+#include <linux/memblock.h>
+#include <linux/err.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/log2.h>
+#include <linux/cma.h>
+#include <linux/highmem.h>
+#include <linux/io.h>
+#include <linux/kmemleak.h>
+#include <trace/events/cma.h>
+
+#include "cma.h"
+
+struct cma cma_areas[MAX_CMA_AREAS];
+unsigned cma_area_count;
+static DEFINE_MUTEX(cma_mutex);
+
+phys_addr_t cma_get_base(const struct cma *cma)
+{
+ return PFN_PHYS(cma->base_pfn);
+}
+
+unsigned long cma_get_size(const struct cma *cma)
+{
+ return cma->count << PAGE_SHIFT;
+}
+
+const char *cma_get_name(const struct cma *cma)
+{
+ return cma->name;
+}
+
+static unsigned long cma_bitmap_aligned_mask(const struct cma *cma,
+ unsigned int align_order)
+{
+ if (align_order <= cma->order_per_bit)
+ return 0;
+ return (1UL << (align_order - cma->order_per_bit)) - 1;
+}
+
+/*
+ * Find the offset of the base PFN from the specified align_order.
+ * The value returned is represented in order_per_bits.
+ */
+static unsigned long cma_bitmap_aligned_offset(const struct cma *cma,
+ unsigned int align_order)
+{
+ return (cma->base_pfn & ((1UL << align_order) - 1))
+ >> cma->order_per_bit;
+}
+
+static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma,
+ unsigned long pages)
+{
+ return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit;
+}
+
+static void cma_clear_bitmap(struct cma *cma, unsigned long pfn,
+ unsigned int count)
+{
+ unsigned long bitmap_no, bitmap_count;
+
+ bitmap_no = (pfn - cma->base_pfn) >> cma->order_per_bit;
+ bitmap_count = cma_bitmap_pages_to_bits(cma, count);
+
+ mutex_lock(&cma->lock);
+ bitmap_clear(cma->bitmap, bitmap_no, bitmap_count);
+ mutex_unlock(&cma->lock);
+}
+
+static void __init cma_activate_area(struct cma *cma)
+{
+ unsigned long base_pfn = cma->base_pfn, pfn = base_pfn;
+ unsigned i = cma->count >> pageblock_order;
+ struct zone *zone;
+
+ cma->bitmap = bitmap_zalloc(cma_bitmap_maxno(cma), GFP_KERNEL);
+ if (!cma->bitmap)
+ goto out_error;
+
+ WARN_ON_ONCE(!pfn_valid(pfn));
+ zone = page_zone(pfn_to_page(pfn));
+
+ do {
+ unsigned j;
+
+ base_pfn = pfn;
+ for (j = pageblock_nr_pages; j; --j, pfn++) {
+ WARN_ON_ONCE(!pfn_valid(pfn));
+ /*
+ * alloc_contig_range requires the pfn range
+ * specified to be in the same zone. Make this
+ * simple by forcing the entire CMA resv range
+ * to be in the same zone.
+ */
+ if (page_zone(pfn_to_page(pfn)) != zone)
+ goto not_in_zone;
+ }
+ init_cma_reserved_pageblock(pfn_to_page(base_pfn));
+ } while (--i);
+
+ mutex_init(&cma->lock);
+
+#ifdef CONFIG_CMA_DEBUGFS
+ INIT_HLIST_HEAD(&cma->mem_head);
+ spin_lock_init(&cma->mem_head_lock);
+#endif
+
+ return;
+
+not_in_zone:
+ bitmap_free(cma->bitmap);
+out_error:
+ cma->count = 0;
+ pr_err("CMA area %s could not be activated\n", cma->name);
+ return;
+}
+
+static int __init cma_init_reserved_areas(void)
+{
+ int i;
+
+ for (i = 0; i < cma_area_count; i++)
+ cma_activate_area(&cma_areas[i]);
+
+ return 0;
+}
+core_initcall(cma_init_reserved_areas);
+
+/**
+ * cma_init_reserved_mem() - create custom contiguous area from reserved memory
+ * @base: Base address of the reserved area
+ * @size: Size of the reserved area (in bytes),
+ * @order_per_bit: Order of pages represented by one bit on bitmap.
+ * @name: The name of the area. If this parameter is NULL, the name of
+ * the area will be set to "cmaN", where N is a running counter of
+ * used areas.
+ * @res_cma: Pointer to store the created cma region.
+ *
+ * This function creates custom contiguous area from already reserved memory.
+ */
+int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
+ unsigned int order_per_bit,
+ const char *name,
+ struct cma **res_cma)
+{
+ struct cma *cma;
+ phys_addr_t alignment;
+
+ /* Sanity checks */
+ if (cma_area_count == ARRAY_SIZE(cma_areas)) {
+ pr_err("Not enough slots for CMA reserved regions!\n");
+ return -ENOSPC;
+ }
+
+ if (!size || !memblock_is_region_reserved(base, size))
+ return -EINVAL;
+
+ /* ensure minimal alignment required by mm core */
+ alignment = PAGE_SIZE <<
+ max_t(unsigned long, MAX_ORDER - 1, pageblock_order);
+
+ /* alignment should be aligned with order_per_bit */
+ if (!IS_ALIGNED(alignment >> PAGE_SHIFT, 1 << order_per_bit))
+ return -EINVAL;
+
+ if (ALIGN(base, alignment) != base || ALIGN(size, alignment) != size)
+ return -EINVAL;
+
+ /*
+ * Each reserved area must be initialised later, when more kernel
+ * subsystems (like slab allocator) are available.
+ */
+ cma = &cma_areas[cma_area_count];
+
+ if (name)
+ snprintf(cma->name, CMA_MAX_NAME, name);
+ else
+ snprintf(cma->name, CMA_MAX_NAME, "cma%d\n", cma_area_count);
+
+ cma->base_pfn = PFN_DOWN(base);
+ cma->count = size >> PAGE_SHIFT;
+ cma->order_per_bit = order_per_bit;
+ *res_cma = cma;
+ cma_area_count++;
+ totalcma_pages += (size / PAGE_SIZE);
+
+ return 0;
+}
+
+/**
+ * cma_declare_contiguous_nid() - reserve custom contiguous area
+ * @base: Base address of the reserved area optional, use 0 for any
+ * @size: Size of the reserved area (in bytes),
+ * @limit: End address of the reserved memory (optional, 0 for any).
+ * @alignment: Alignment for the CMA area, should be power of 2 or zero
+ * @order_per_bit: Order of pages represented by one bit on bitmap.
+ * @fixed: hint about where to place the reserved area
+ * @name: The name of the area. See function cma_init_reserved_mem()
+ * @res_cma: Pointer to store the created cma region.
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ *
+ * This function reserves memory from early allocator. It should be
+ * called by arch specific code once the early allocator (memblock or bootmem)
+ * has been activated and all other subsystems have already allocated/reserved
+ * memory. This function allows to create custom reserved areas.
+ *
+ * If @fixed is true, reserve contiguous area at exactly @base. If false,
+ * reserve in range from @base to @limit.
+ */
+int __init cma_declare_contiguous_nid(phys_addr_t base,
+ phys_addr_t size, phys_addr_t limit,
+ phys_addr_t alignment, unsigned int order_per_bit,
+ bool fixed, const char *name, struct cma **res_cma,
+ int nid)
+{
+ phys_addr_t memblock_end = memblock_end_of_DRAM();
+ phys_addr_t highmem_start;
+ int ret = 0;
+
+ /*
+ * We can't use __pa(high_memory) directly, since high_memory
+ * isn't a valid direct map VA, and DEBUG_VIRTUAL will (validly)
+ * complain. Find the boundary by adding one to the last valid
+ * address.
+ */
+ highmem_start = __pa(high_memory - 1) + 1;
+ pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n",
+ __func__, &size, &base, &limit, &alignment);
+
+ if (cma_area_count == ARRAY_SIZE(cma_areas)) {
+ pr_err("Not enough slots for CMA reserved regions!\n");
+ return -ENOSPC;
+ }
+
+ if (!size)
+ return -EINVAL;
+
+ if (alignment && !is_power_of_2(alignment))
+ return -EINVAL;
+
+ /*
+ * Sanitise input arguments.
+ * Pages both ends in CMA area could be merged into adjacent unmovable
+ * migratetype page by page allocator's buddy algorithm. In the case,
+ * you couldn't get a contiguous memory, which is not what we want.
+ */
+ alignment = max(alignment, (phys_addr_t)PAGE_SIZE <<
+ max_t(unsigned long, MAX_ORDER - 1, pageblock_order));
+ if (fixed && base & (alignment - 1)) {
+ ret = -EINVAL;
+ pr_err("Region at %pa must be aligned to %pa bytes\n",
+ &base, &alignment);
+ goto err;
+ }
+ base = ALIGN(base, alignment);
+ size = ALIGN(size, alignment);
+ limit &= ~(alignment - 1);
+
+ if (!base)
+ fixed = false;
+
+ /* size should be aligned with order_per_bit */
+ if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit))
+ return -EINVAL;
+
+ /*
+ * If allocating at a fixed base the request region must not cross the
+ * low/high memory boundary.
+ */
+ if (fixed && base < highmem_start && base + size > highmem_start) {
+ ret = -EINVAL;
+ pr_err("Region at %pa defined on low/high memory boundary (%pa)\n",
+ &base, &highmem_start);
+ goto err;
+ }
+
+ /*
+ * If the limit is unspecified or above the memblock end, its effective
+ * value will be the memblock end. Set it explicitly to simplify further
+ * checks.
+ */
+ if (limit == 0 || limit > memblock_end)
+ limit = memblock_end;
+
+ if (base + size > limit) {
+ ret = -EINVAL;
+ pr_err("Size (%pa) of region at %pa exceeds limit (%pa)\n",
+ &size, &base, &limit);
+ goto err;
+ }
+
+ /* Reserve memory */
+ if (fixed) {
+ if (memblock_is_region_reserved(base, size) ||
+ memblock_reserve(base, size) < 0) {
+ ret = -EBUSY;
+ goto err;
+ }
+ } else {
+ phys_addr_t addr = 0;
+
+ /*
+ * All pages in the reserved area must come from the same zone.
+ * If the requested region crosses the low/high memory boundary,
+ * try allocating from high memory first and fall back to low
+ * memory in case of failure.
+ */
+ if (base < highmem_start && limit > highmem_start) {
+ addr = memblock_alloc_range_nid(size, alignment,
+ highmem_start, limit, nid, true);
+ limit = highmem_start;
+ }
+
+ if (!addr) {
+ addr = memblock_alloc_range_nid(size, alignment, base,
+ limit, nid, true);
+ if (!addr) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ }
+
+ /*
+ * kmemleak scans/reads tracked objects for pointers to other
+ * objects but this address isn't mapped and accessible
+ */
+ kmemleak_ignore_phys(addr);
+ base = addr;
+ }
+
+ ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma);
+ if (ret)
+ goto free_mem;
+
+ pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M,
+ &base);
+ return 0;
+
+free_mem:
+ memblock_free(base, size);
+err:
+ pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M);
+ return ret;
+}
+
+#ifdef CONFIG_CMA_DEBUG
+static void cma_debug_show_areas(struct cma *cma)
+{
+ unsigned long next_zero_bit, next_set_bit, nr_zero;
+ unsigned long start = 0;
+ unsigned long nr_part, nr_total = 0;
+ unsigned long nbits = cma_bitmap_maxno(cma);
+
+ mutex_lock(&cma->lock);
+ pr_info("number of available pages: ");
+ for (;;) {
+ next_zero_bit = find_next_zero_bit(cma->bitmap, nbits, start);
+ if (next_zero_bit >= nbits)
+ break;
+ next_set_bit = find_next_bit(cma->bitmap, nbits, next_zero_bit);
+ nr_zero = next_set_bit - next_zero_bit;
+ nr_part = nr_zero << cma->order_per_bit;
+ pr_cont("%s%lu@%lu", nr_total ? "+" : "", nr_part,
+ next_zero_bit);
+ nr_total += nr_part;
+ start = next_zero_bit + nr_zero;
+ }
+ pr_cont("=> %lu free of %lu total pages\n", nr_total, cma->count);
+ mutex_unlock(&cma->lock);
+}
+#else
+static inline void cma_debug_show_areas(struct cma *cma) { }
+#endif
+
+/**
+ * cma_alloc() - allocate pages from contiguous area
+ * @cma: Contiguous memory region for which the allocation is performed.
+ * @count: Requested number of pages.
+ * @align: Requested alignment of pages (in PAGE_SIZE order).
+ * @no_warn: Avoid printing message about failed allocation
+ *
+ * This function allocates part of contiguous memory on specific
+ * contiguous memory area.
+ */
+struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
+ bool no_warn)
+{
+ unsigned long mask, offset;
+ unsigned long pfn = -1;
+ unsigned long start = 0;
+ unsigned long bitmap_maxno, bitmap_no, bitmap_count;
+ size_t i;
+ struct page *page = NULL;
+ int ret = -ENOMEM;
+
+ if (!cma || !cma->count || !cma->bitmap)
+ return NULL;
+
+ pr_debug("%s(cma %p, count %zu, align %d)\n", __func__, (void *)cma,
+ count, align);
+
+ if (!count)
+ return NULL;
+
+ mask = cma_bitmap_aligned_mask(cma, align);
+ offset = cma_bitmap_aligned_offset(cma, align);
+ bitmap_maxno = cma_bitmap_maxno(cma);
+ bitmap_count = cma_bitmap_pages_to_bits(cma, count);
+
+ if (bitmap_count > bitmap_maxno)
+ return NULL;
+
+ for (;;) {
+ mutex_lock(&cma->lock);
+ bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap,
+ bitmap_maxno, start, bitmap_count, mask,
+ offset);
+ if (bitmap_no >= bitmap_maxno) {
+ mutex_unlock(&cma->lock);
+ break;
+ }
+ bitmap_set(cma->bitmap, bitmap_no, bitmap_count);
+ /*
+ * It's safe to drop the lock here. We've marked this region for
+ * our exclusive use. If the migration fails we will take the
+ * lock again and unmark it.
+ */
+ mutex_unlock(&cma->lock);
+
+ pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit);
+ mutex_lock(&cma_mutex);
+ ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA,
+ GFP_KERNEL | (no_warn ? __GFP_NOWARN : 0));
+ mutex_unlock(&cma_mutex);
+ if (ret == 0) {
+ page = pfn_to_page(pfn);
+ break;
+ }
+
+ cma_clear_bitmap(cma, pfn, count);
+ if (ret != -EBUSY)
+ break;
+
+ pr_debug("%s(): memory range at %p is busy, retrying\n",
+ __func__, pfn_to_page(pfn));
+ /* try again with a bit different memory target */
+ start = bitmap_no + mask + 1;
+ }
+
+ trace_cma_alloc(pfn, page, count, align);
+
+ /*
+ * CMA can allocate multiple page blocks, which results in different
+ * blocks being marked with different tags. Reset the tags to ignore
+ * those page blocks.
+ */
+ if (page) {
+ for (i = 0; i < count; i++)
+ page_kasan_tag_reset(nth_page(page, i));
+ }
+
+ if (ret && !no_warn) {
+ pr_err("%s: alloc failed, req-size: %zu pages, ret: %d\n",
+ __func__, count, ret);
+ cma_debug_show_areas(cma);
+ }
+
+ pr_debug("%s(): returned %p\n", __func__, page);
+ return page;
+}
+
+/**
+ * cma_release() - release allocated pages
+ * @cma: Contiguous memory region for which the allocation is performed.
+ * @pages: Allocated pages.
+ * @count: Number of allocated pages.
+ *
+ * This function releases memory allocated by cma_alloc().
+ * It returns false when provided pages do not belong to contiguous area and
+ * true otherwise.
+ */
+bool cma_release(struct cma *cma, const struct page *pages, unsigned int count)
+{
+ unsigned long pfn;
+
+ if (!cma || !pages)
+ return false;
+
+ pr_debug("%s(page %p)\n", __func__, (void *)pages);
+
+ pfn = page_to_pfn(pages);
+
+ if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count)
+ return false;
+
+ VM_BUG_ON(pfn + count > cma->base_pfn + cma->count);
+
+ free_contig_range(pfn, count);
+ cma_clear_bitmap(cma, pfn, count);
+ trace_cma_release(pfn, pages, count);
+
+ return true;
+}
+
+int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data)
+{
+ int i;
+
+ for (i = 0; i < cma_area_count; i++) {
+ int ret = it(&cma_areas[i], data);
+
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
diff --git a/mm/cma.h b/mm/cma.h
new file mode 100644
index 000000000..42ae082cb
--- /dev/null
+++ b/mm/cma.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __MM_CMA_H__
+#define __MM_CMA_H__
+
+#include <linux/debugfs.h>
+
+struct cma {
+ unsigned long base_pfn;
+ unsigned long count;
+ unsigned long *bitmap;
+ unsigned int order_per_bit; /* Order of pages represented by one bit */
+ struct mutex lock;
+#ifdef CONFIG_CMA_DEBUGFS
+ struct hlist_head mem_head;
+ spinlock_t mem_head_lock;
+ struct debugfs_u32_array dfs_bitmap;
+#endif
+ char name[CMA_MAX_NAME];
+};
+
+extern struct cma cma_areas[MAX_CMA_AREAS];
+extern unsigned cma_area_count;
+
+static inline unsigned long cma_bitmap_maxno(struct cma *cma)
+{
+ return cma->count >> cma->order_per_bit;
+}
+
+#endif
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
new file mode 100644
index 000000000..d5bf8aa34
--- /dev/null
+++ b/mm/cma_debug.c
@@ -0,0 +1,200 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * CMA DebugFS Interface
+ *
+ * Copyright (c) 2015 Sasha Levin <sasha.levin@oracle.com>
+ */
+
+
+#include <linux/debugfs.h>
+#include <linux/cma.h>
+#include <linux/list.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/mm_types.h>
+
+#include "cma.h"
+
+struct cma_mem {
+ struct hlist_node node;
+ struct page *p;
+ unsigned long n;
+};
+
+static int cma_debugfs_get(void *data, u64 *val)
+{
+ unsigned long *p = data;
+
+ *val = *p;
+
+ return 0;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n");
+
+static int cma_used_get(void *data, u64 *val)
+{
+ struct cma *cma = data;
+ unsigned long used;
+
+ mutex_lock(&cma->lock);
+ /* pages counter is smaller than sizeof(int) */
+ used = bitmap_weight(cma->bitmap, (int)cma_bitmap_maxno(cma));
+ mutex_unlock(&cma->lock);
+ *val = (u64)used << cma->order_per_bit;
+
+ return 0;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(cma_used_fops, cma_used_get, NULL, "%llu\n");
+
+static int cma_maxchunk_get(void *data, u64 *val)
+{
+ struct cma *cma = data;
+ unsigned long maxchunk = 0;
+ unsigned long start, end = 0;
+ unsigned long bitmap_maxno = cma_bitmap_maxno(cma);
+
+ mutex_lock(&cma->lock);
+ for (;;) {
+ start = find_next_zero_bit(cma->bitmap, bitmap_maxno, end);
+ if (start >= bitmap_maxno)
+ break;
+ end = find_next_bit(cma->bitmap, bitmap_maxno, start);
+ maxchunk = max(end - start, maxchunk);
+ }
+ mutex_unlock(&cma->lock);
+ *val = (u64)maxchunk << cma->order_per_bit;
+
+ return 0;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(cma_maxchunk_fops, cma_maxchunk_get, NULL, "%llu\n");
+
+static void cma_add_to_cma_mem_list(struct cma *cma, struct cma_mem *mem)
+{
+ spin_lock(&cma->mem_head_lock);
+ hlist_add_head(&mem->node, &cma->mem_head);
+ spin_unlock(&cma->mem_head_lock);
+}
+
+static struct cma_mem *cma_get_entry_from_list(struct cma *cma)
+{
+ struct cma_mem *mem = NULL;
+
+ spin_lock(&cma->mem_head_lock);
+ if (!hlist_empty(&cma->mem_head)) {
+ mem = hlist_entry(cma->mem_head.first, struct cma_mem, node);
+ hlist_del_init(&mem->node);
+ }
+ spin_unlock(&cma->mem_head_lock);
+
+ return mem;
+}
+
+static int cma_free_mem(struct cma *cma, int count)
+{
+ struct cma_mem *mem = NULL;
+
+ while (count) {
+ mem = cma_get_entry_from_list(cma);
+ if (mem == NULL)
+ return 0;
+
+ if (mem->n <= count) {
+ cma_release(cma, mem->p, mem->n);
+ count -= mem->n;
+ kfree(mem);
+ } else if (cma->order_per_bit == 0) {
+ cma_release(cma, mem->p, count);
+ mem->p += count;
+ mem->n -= count;
+ count = 0;
+ cma_add_to_cma_mem_list(cma, mem);
+ } else {
+ pr_debug("cma: cannot release partial block when order_per_bit != 0\n");
+ cma_add_to_cma_mem_list(cma, mem);
+ break;
+ }
+ }
+
+ return 0;
+
+}
+
+static int cma_free_write(void *data, u64 val)
+{
+ int pages = val;
+ struct cma *cma = data;
+
+ return cma_free_mem(cma, pages);
+}
+DEFINE_DEBUGFS_ATTRIBUTE(cma_free_fops, NULL, cma_free_write, "%llu\n");
+
+static int cma_alloc_mem(struct cma *cma, int count)
+{
+ struct cma_mem *mem;
+ struct page *p;
+
+ mem = kzalloc(sizeof(*mem), GFP_KERNEL);
+ if (!mem)
+ return -ENOMEM;
+
+ p = cma_alloc(cma, count, 0, false);
+ if (!p) {
+ kfree(mem);
+ return -ENOMEM;
+ }
+
+ mem->p = p;
+ mem->n = count;
+
+ cma_add_to_cma_mem_list(cma, mem);
+
+ return 0;
+}
+
+static int cma_alloc_write(void *data, u64 val)
+{
+ int pages = val;
+ struct cma *cma = data;
+
+ return cma_alloc_mem(cma, pages);
+}
+DEFINE_DEBUGFS_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n");
+
+static void cma_debugfs_add_one(struct cma *cma, struct dentry *root_dentry)
+{
+ struct dentry *tmp;
+ char name[16];
+
+ scnprintf(name, sizeof(name), "cma-%s", cma->name);
+
+ tmp = debugfs_create_dir(name, root_dentry);
+
+ debugfs_create_file("alloc", 0200, tmp, cma, &cma_alloc_fops);
+ debugfs_create_file("free", 0200, tmp, cma, &cma_free_fops);
+ debugfs_create_file("base_pfn", 0444, tmp,
+ &cma->base_pfn, &cma_debugfs_fops);
+ debugfs_create_file("count", 0444, tmp, &cma->count, &cma_debugfs_fops);
+ debugfs_create_file("order_per_bit", 0444, tmp,
+ &cma->order_per_bit, &cma_debugfs_fops);
+ debugfs_create_file("used", 0444, tmp, cma, &cma_used_fops);
+ debugfs_create_file("maxchunk", 0444, tmp, cma, &cma_maxchunk_fops);
+
+ cma->dfs_bitmap.array = (u32 *)cma->bitmap;
+ cma->dfs_bitmap.n_elements = DIV_ROUND_UP(cma_bitmap_maxno(cma),
+ BITS_PER_BYTE * sizeof(u32));
+ debugfs_create_u32_array("bitmap", 0444, tmp, &cma->dfs_bitmap);
+}
+
+static int __init cma_debugfs_init(void)
+{
+ struct dentry *cma_debugfs_root;
+ int i;
+
+ cma_debugfs_root = debugfs_create_dir("cma", NULL);
+
+ for (i = 0; i < cma_area_count; i++)
+ cma_debugfs_add_one(&cma_areas[i], cma_debugfs_root);
+
+ return 0;
+}
+late_initcall(cma_debugfs_init);
diff --git a/mm/compaction.c b/mm/compaction.c
new file mode 100644
index 000000000..b58021666
--- /dev/null
+++ b/mm/compaction.c
@@ -0,0 +1,2926 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/mm/compaction.c
+ *
+ * Memory compaction for the reduction of external fragmentation. Note that
+ * this heavily depends upon page migration to do all the real heavy
+ * lifting
+ *
+ * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
+ */
+#include <linux/cpu.h>
+#include <linux/swap.h>
+#include <linux/migrate.h>
+#include <linux/compaction.h>
+#include <linux/mm_inline.h>
+#include <linux/sched/signal.h>
+#include <linux/backing-dev.h>
+#include <linux/sysctl.h>
+#include <linux/sysfs.h>
+#include <linux/page-isolation.h>
+#include <linux/kasan.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/page_owner.h>
+#include <linux/psi.h>
+#include "internal.h"
+
+#ifdef CONFIG_COMPACTION
+static inline void count_compact_event(enum vm_event_item item)
+{
+ count_vm_event(item);
+}
+
+static inline void count_compact_events(enum vm_event_item item, long delta)
+{
+ count_vm_events(item, delta);
+}
+#else
+#define count_compact_event(item) do { } while (0)
+#define count_compact_events(item, delta) do { } while (0)
+#endif
+
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/compaction.h>
+
+#define block_start_pfn(pfn, order) round_down(pfn, 1UL << (order))
+#define block_end_pfn(pfn, order) ALIGN((pfn) + 1, 1UL << (order))
+#define pageblock_start_pfn(pfn) block_start_pfn(pfn, pageblock_order)
+#define pageblock_end_pfn(pfn) block_end_pfn(pfn, pageblock_order)
+
+/*
+ * Fragmentation score check interval for proactive compaction purposes.
+ */
+static const unsigned int HPAGE_FRAG_CHECK_INTERVAL_MSEC = 500;
+
+/*
+ * Page order with-respect-to which proactive compaction
+ * calculates external fragmentation, which is used as
+ * the "fragmentation score" of a node/zone.
+ */
+#if defined CONFIG_TRANSPARENT_HUGEPAGE
+#define COMPACTION_HPAGE_ORDER HPAGE_PMD_ORDER
+#elif defined CONFIG_HUGETLBFS
+#define COMPACTION_HPAGE_ORDER HUGETLB_PAGE_ORDER
+#else
+#define COMPACTION_HPAGE_ORDER (PMD_SHIFT - PAGE_SHIFT)
+#endif
+
+static unsigned long release_freepages(struct list_head *freelist)
+{
+ struct page *page, *next;
+ unsigned long high_pfn = 0;
+
+ list_for_each_entry_safe(page, next, freelist, lru) {
+ unsigned long pfn = page_to_pfn(page);
+ list_del(&page->lru);
+ __free_page(page);
+ if (pfn > high_pfn)
+ high_pfn = pfn;
+ }
+
+ return high_pfn;
+}
+
+static void split_map_pages(struct list_head *list)
+{
+ unsigned int i, order, nr_pages;
+ struct page *page, *next;
+ LIST_HEAD(tmp_list);
+
+ list_for_each_entry_safe(page, next, list, lru) {
+ list_del(&page->lru);
+
+ order = page_private(page);
+ nr_pages = 1 << order;
+
+ post_alloc_hook(page, order, __GFP_MOVABLE);
+ if (order)
+ split_page(page, order);
+
+ for (i = 0; i < nr_pages; i++) {
+ list_add(&page->lru, &tmp_list);
+ page++;
+ }
+ }
+
+ list_splice(&tmp_list, list);
+}
+
+#ifdef CONFIG_COMPACTION
+
+int PageMovable(struct page *page)
+{
+ struct address_space *mapping;
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ if (!__PageMovable(page))
+ return 0;
+
+ mapping = page_mapping(page);
+ if (mapping && mapping->a_ops && mapping->a_ops->isolate_page)
+ return 1;
+
+ return 0;
+}
+EXPORT_SYMBOL(PageMovable);
+
+void __SetPageMovable(struct page *page, struct address_space *mapping)
+{
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE((unsigned long)mapping & PAGE_MAPPING_MOVABLE, page);
+ page->mapping = (void *)((unsigned long)mapping | PAGE_MAPPING_MOVABLE);
+}
+EXPORT_SYMBOL(__SetPageMovable);
+
+void __ClearPageMovable(struct page *page)
+{
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageMovable(page), page);
+ /*
+ * Clear registered address_space val with keeping PAGE_MAPPING_MOVABLE
+ * flag so that VM can catch up released page by driver after isolation.
+ * With it, VM migration doesn't try to put it back.
+ */
+ page->mapping = (void *)((unsigned long)page->mapping &
+ PAGE_MAPPING_MOVABLE);
+}
+EXPORT_SYMBOL(__ClearPageMovable);
+
+/* Do not skip compaction more than 64 times */
+#define COMPACT_MAX_DEFER_SHIFT 6
+
+/*
+ * Compaction is deferred when compaction fails to result in a page
+ * allocation success. 1 << compact_defer_shift, compactions are skipped up
+ * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT
+ */
+void defer_compaction(struct zone *zone, int order)
+{
+ zone->compact_considered = 0;
+ zone->compact_defer_shift++;
+
+ if (order < zone->compact_order_failed)
+ zone->compact_order_failed = order;
+
+ if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
+ zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;
+
+ trace_mm_compaction_defer_compaction(zone, order);
+}
+
+/* Returns true if compaction should be skipped this time */
+bool compaction_deferred(struct zone *zone, int order)
+{
+ unsigned long defer_limit = 1UL << zone->compact_defer_shift;
+
+ if (order < zone->compact_order_failed)
+ return false;
+
+ /* Avoid possible overflow */
+ if (++zone->compact_considered >= defer_limit) {
+ zone->compact_considered = defer_limit;
+ return false;
+ }
+
+ trace_mm_compaction_deferred(zone, order);
+
+ return true;
+}
+
+/*
+ * Update defer tracking counters after successful compaction of given order,
+ * which means an allocation either succeeded (alloc_success == true) or is
+ * expected to succeed.
+ */
+void compaction_defer_reset(struct zone *zone, int order,
+ bool alloc_success)
+{
+ if (alloc_success) {
+ zone->compact_considered = 0;
+ zone->compact_defer_shift = 0;
+ }
+ if (order >= zone->compact_order_failed)
+ zone->compact_order_failed = order + 1;
+
+ trace_mm_compaction_defer_reset(zone, order);
+}
+
+/* Returns true if restarting compaction after many failures */
+bool compaction_restarting(struct zone *zone, int order)
+{
+ if (order < zone->compact_order_failed)
+ return false;
+
+ return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT &&
+ zone->compact_considered >= 1UL << zone->compact_defer_shift;
+}
+
+/* Returns true if the pageblock should be scanned for pages to isolate. */
+static inline bool isolation_suitable(struct compact_control *cc,
+ struct page *page)
+{
+ if (cc->ignore_skip_hint)
+ return true;
+
+ return !get_pageblock_skip(page);
+}
+
+static void reset_cached_positions(struct zone *zone)
+{
+ zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
+ zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
+ zone->compact_cached_free_pfn =
+ pageblock_start_pfn(zone_end_pfn(zone) - 1);
+}
+
+/*
+ * Compound pages of >= pageblock_order should consistenly be skipped until
+ * released. It is always pointless to compact pages of such order (if they are
+ * migratable), and the pageblocks they occupy cannot contain any free pages.
+ */
+static bool pageblock_skip_persistent(struct page *page)
+{
+ if (!PageCompound(page))
+ return false;
+
+ page = compound_head(page);
+
+ if (compound_order(page) >= pageblock_order)
+ return true;
+
+ return false;
+}
+
+static bool
+__reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source,
+ bool check_target)
+{
+ struct page *page = pfn_to_online_page(pfn);
+ struct page *block_page;
+ struct page *end_page;
+ unsigned long block_pfn;
+
+ if (!page)
+ return false;
+ if (zone != page_zone(page))
+ return false;
+ if (pageblock_skip_persistent(page))
+ return false;
+
+ /*
+ * If skip is already cleared do no further checking once the
+ * restart points have been set.
+ */
+ if (check_source && check_target && !get_pageblock_skip(page))
+ return true;
+
+ /*
+ * If clearing skip for the target scanner, do not select a
+ * non-movable pageblock as the starting point.
+ */
+ if (!check_source && check_target &&
+ get_pageblock_migratetype(page) != MIGRATE_MOVABLE)
+ return false;
+
+ /* Ensure the start of the pageblock or zone is online and valid */
+ block_pfn = pageblock_start_pfn(pfn);
+ block_pfn = max(block_pfn, zone->zone_start_pfn);
+ block_page = pfn_to_online_page(block_pfn);
+ if (block_page) {
+ page = block_page;
+ pfn = block_pfn;
+ }
+
+ /* Ensure the end of the pageblock or zone is online and valid */
+ block_pfn = pageblock_end_pfn(pfn) - 1;
+ block_pfn = min(block_pfn, zone_end_pfn(zone) - 1);
+ end_page = pfn_to_online_page(block_pfn);
+ if (!end_page)
+ return false;
+
+ /*
+ * Only clear the hint if a sample indicates there is either a
+ * free page or an LRU page in the block. One or other condition
+ * is necessary for the block to be a migration source/target.
+ */
+ do {
+ if (pfn_valid_within(pfn)) {
+ if (check_source && PageLRU(page)) {
+ clear_pageblock_skip(page);
+ return true;
+ }
+
+ if (check_target && PageBuddy(page)) {
+ clear_pageblock_skip(page);
+ return true;
+ }
+ }
+
+ page += (1 << PAGE_ALLOC_COSTLY_ORDER);
+ pfn += (1 << PAGE_ALLOC_COSTLY_ORDER);
+ } while (page <= end_page);
+
+ return false;
+}
+
+/*
+ * This function is called to clear all cached information on pageblocks that
+ * should be skipped for page isolation when the migrate and free page scanner
+ * meet.
+ */
+static void __reset_isolation_suitable(struct zone *zone)
+{
+ unsigned long migrate_pfn = zone->zone_start_pfn;
+ unsigned long free_pfn = zone_end_pfn(zone) - 1;
+ unsigned long reset_migrate = free_pfn;
+ unsigned long reset_free = migrate_pfn;
+ bool source_set = false;
+ bool free_set = false;
+
+ if (!zone->compact_blockskip_flush)
+ return;
+
+ zone->compact_blockskip_flush = false;
+
+ /*
+ * Walk the zone and update pageblock skip information. Source looks
+ * for PageLRU while target looks for PageBuddy. When the scanner
+ * is found, both PageBuddy and PageLRU are checked as the pageblock
+ * is suitable as both source and target.
+ */
+ for (; migrate_pfn < free_pfn; migrate_pfn += pageblock_nr_pages,
+ free_pfn -= pageblock_nr_pages) {
+ cond_resched();
+
+ /* Update the migrate PFN */
+ if (__reset_isolation_pfn(zone, migrate_pfn, true, source_set) &&
+ migrate_pfn < reset_migrate) {
+ source_set = true;
+ reset_migrate = migrate_pfn;
+ zone->compact_init_migrate_pfn = reset_migrate;
+ zone->compact_cached_migrate_pfn[0] = reset_migrate;
+ zone->compact_cached_migrate_pfn[1] = reset_migrate;
+ }
+
+ /* Update the free PFN */
+ if (__reset_isolation_pfn(zone, free_pfn, free_set, true) &&
+ free_pfn > reset_free) {
+ free_set = true;
+ reset_free = free_pfn;
+ zone->compact_init_free_pfn = reset_free;
+ zone->compact_cached_free_pfn = reset_free;
+ }
+ }
+
+ /* Leave no distance if no suitable block was reset */
+ if (reset_migrate >= reset_free) {
+ zone->compact_cached_migrate_pfn[0] = migrate_pfn;
+ zone->compact_cached_migrate_pfn[1] = migrate_pfn;
+ zone->compact_cached_free_pfn = free_pfn;
+ }
+}
+
+void reset_isolation_suitable(pg_data_t *pgdat)
+{
+ int zoneid;
+
+ for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
+ struct zone *zone = &pgdat->node_zones[zoneid];
+ if (!populated_zone(zone))
+ continue;
+
+ /* Only flush if a full compaction finished recently */
+ if (zone->compact_blockskip_flush)
+ __reset_isolation_suitable(zone);
+ }
+}
+
+/*
+ * Sets the pageblock skip bit if it was clear. Note that this is a hint as
+ * locks are not required for read/writers. Returns true if it was already set.
+ */
+static bool test_and_set_skip(struct compact_control *cc, struct page *page,
+ unsigned long pfn)
+{
+ bool skip;
+
+ /* Do no update if skip hint is being ignored */
+ if (cc->ignore_skip_hint)
+ return false;
+
+ if (!IS_ALIGNED(pfn, pageblock_nr_pages))
+ return false;
+
+ skip = get_pageblock_skip(page);
+ if (!skip && !cc->no_set_skip_hint)
+ set_pageblock_skip(page);
+
+ return skip;
+}
+
+static void update_cached_migrate(struct compact_control *cc, unsigned long pfn)
+{
+ struct zone *zone = cc->zone;
+
+ pfn = pageblock_end_pfn(pfn);
+
+ /* Set for isolation rather than compaction */
+ if (cc->no_set_skip_hint)
+ return;
+
+ if (pfn > zone->compact_cached_migrate_pfn[0])
+ zone->compact_cached_migrate_pfn[0] = pfn;
+ if (cc->mode != MIGRATE_ASYNC &&
+ pfn > zone->compact_cached_migrate_pfn[1])
+ zone->compact_cached_migrate_pfn[1] = pfn;
+}
+
+/*
+ * If no pages were isolated then mark this pageblock to be skipped in the
+ * future. The information is later cleared by __reset_isolation_suitable().
+ */
+static void update_pageblock_skip(struct compact_control *cc,
+ struct page *page, unsigned long pfn)
+{
+ struct zone *zone = cc->zone;
+
+ if (cc->no_set_skip_hint)
+ return;
+
+ if (!page)
+ return;
+
+ set_pageblock_skip(page);
+
+ /* Update where async and sync compaction should restart */
+ if (pfn < zone->compact_cached_free_pfn)
+ zone->compact_cached_free_pfn = pfn;
+}
+#else
+static inline bool isolation_suitable(struct compact_control *cc,
+ struct page *page)
+{
+ return true;
+}
+
+static inline bool pageblock_skip_persistent(struct page *page)
+{
+ return false;
+}
+
+static inline void update_pageblock_skip(struct compact_control *cc,
+ struct page *page, unsigned long pfn)
+{
+}
+
+static void update_cached_migrate(struct compact_control *cc, unsigned long pfn)
+{
+}
+
+static bool test_and_set_skip(struct compact_control *cc, struct page *page,
+ unsigned long pfn)
+{
+ return false;
+}
+#endif /* CONFIG_COMPACTION */
+
+/*
+ * Compaction requires the taking of some coarse locks that are potentially
+ * very heavily contended. For async compaction, trylock and record if the
+ * lock is contended. The lock will still be acquired but compaction will
+ * abort when the current block is finished regardless of success rate.
+ * Sync compaction acquires the lock.
+ *
+ * Always returns true which makes it easier to track lock state in callers.
+ */
+static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags,
+ struct compact_control *cc)
+ __acquires(lock)
+{
+ /* Track if the lock is contended in async mode */
+ if (cc->mode == MIGRATE_ASYNC && !cc->contended) {
+ if (spin_trylock_irqsave(lock, *flags))
+ return true;
+
+ cc->contended = true;
+ }
+
+ spin_lock_irqsave(lock, *flags);
+ return true;
+}
+
+/*
+ * Compaction requires the taking of some coarse locks that are potentially
+ * very heavily contended. The lock should be periodically unlocked to avoid
+ * having disabled IRQs for a long time, even when there is nobody waiting on
+ * the lock. It might also be that allowing the IRQs will result in
+ * need_resched() becoming true. If scheduling is needed, async compaction
+ * aborts. Sync compaction schedules.
+ * Either compaction type will also abort if a fatal signal is pending.
+ * In either case if the lock was locked, it is dropped and not regained.
+ *
+ * Returns true if compaction should abort due to fatal signal pending, or
+ * async compaction due to need_resched()
+ * Returns false when compaction can continue (sync compaction might have
+ * scheduled)
+ */
+static bool compact_unlock_should_abort(spinlock_t *lock,
+ unsigned long flags, bool *locked, struct compact_control *cc)
+{
+ if (*locked) {
+ spin_unlock_irqrestore(lock, flags);
+ *locked = false;
+ }
+
+ if (fatal_signal_pending(current)) {
+ cc->contended = true;
+ return true;
+ }
+
+ cond_resched();
+
+ return false;
+}
+
+/*
+ * Isolate free pages onto a private freelist. If @strict is true, will abort
+ * returning 0 on any invalid PFNs or non-free pages inside of the pageblock
+ * (even though it may still end up isolating some pages).
+ */
+static unsigned long isolate_freepages_block(struct compact_control *cc,
+ unsigned long *start_pfn,
+ unsigned long end_pfn,
+ struct list_head *freelist,
+ unsigned int stride,
+ bool strict)
+{
+ int nr_scanned = 0, total_isolated = 0;
+ struct page *cursor;
+ unsigned long flags = 0;
+ bool locked = false;
+ unsigned long blockpfn = *start_pfn;
+ unsigned int order;
+
+ /* Strict mode is for isolation, speed is secondary */
+ if (strict)
+ stride = 1;
+
+ cursor = pfn_to_page(blockpfn);
+
+ /* Isolate free pages. */
+ for (; blockpfn < end_pfn; blockpfn += stride, cursor += stride) {
+ int isolated;
+ struct page *page = cursor;
+
+ /*
+ * Periodically drop the lock (if held) regardless of its
+ * contention, to give chance to IRQs. Abort if fatal signal
+ * pending or async compaction detects need_resched()
+ */
+ if (!(blockpfn % SWAP_CLUSTER_MAX)
+ && compact_unlock_should_abort(&cc->zone->lock, flags,
+ &locked, cc))
+ break;
+
+ nr_scanned++;
+ if (!pfn_valid_within(blockpfn))
+ goto isolate_fail;
+
+ /*
+ * For compound pages such as THP and hugetlbfs, we can save
+ * potentially a lot of iterations if we skip them at once.
+ * The check is racy, but we can consider only valid values
+ * and the only danger is skipping too much.
+ */
+ if (PageCompound(page)) {
+ const unsigned int order = compound_order(page);
+
+ if (likely(order < MAX_ORDER)) {
+ blockpfn += (1UL << order) - 1;
+ cursor += (1UL << order) - 1;
+ }
+ goto isolate_fail;
+ }
+
+ if (!PageBuddy(page))
+ goto isolate_fail;
+
+ /*
+ * If we already hold the lock, we can skip some rechecking.
+ * Note that if we hold the lock now, checked_pageblock was
+ * already set in some previous iteration (or strict is true),
+ * so it is correct to skip the suitable migration target
+ * recheck as well.
+ */
+ if (!locked) {
+ locked = compact_lock_irqsave(&cc->zone->lock,
+ &flags, cc);
+
+ /* Recheck this is a buddy page under lock */
+ if (!PageBuddy(page))
+ goto isolate_fail;
+ }
+
+ /* Found a free page, will break it into order-0 pages */
+ order = buddy_order(page);
+ isolated = __isolate_free_page(page, order);
+ if (!isolated)
+ break;
+ set_page_private(page, order);
+
+ total_isolated += isolated;
+ cc->nr_freepages += isolated;
+ list_add_tail(&page->lru, freelist);
+
+ if (!strict && cc->nr_migratepages <= cc->nr_freepages) {
+ blockpfn += isolated;
+ break;
+ }
+ /* Advance to the end of split page */
+ blockpfn += isolated - 1;
+ cursor += isolated - 1;
+ continue;
+
+isolate_fail:
+ if (strict)
+ break;
+ else
+ continue;
+
+ }
+
+ if (locked)
+ spin_unlock_irqrestore(&cc->zone->lock, flags);
+
+ /*
+ * There is a tiny chance that we have read bogus compound_order(),
+ * so be careful to not go outside of the pageblock.
+ */
+ if (unlikely(blockpfn > end_pfn))
+ blockpfn = end_pfn;
+
+ trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn,
+ nr_scanned, total_isolated);
+
+ /* Record how far we have got within the block */
+ *start_pfn = blockpfn;
+
+ /*
+ * If strict isolation is requested by CMA then check that all the
+ * pages requested were isolated. If there were any failures, 0 is
+ * returned and CMA will fail.
+ */
+ if (strict && blockpfn < end_pfn)
+ total_isolated = 0;
+
+ cc->total_free_scanned += nr_scanned;
+ if (total_isolated)
+ count_compact_events(COMPACTISOLATED, total_isolated);
+ return total_isolated;
+}
+
+/**
+ * isolate_freepages_range() - isolate free pages.
+ * @cc: Compaction control structure.
+ * @start_pfn: The first PFN to start isolating.
+ * @end_pfn: The one-past-last PFN.
+ *
+ * Non-free pages, invalid PFNs, or zone boundaries within the
+ * [start_pfn, end_pfn) range are considered errors, cause function to
+ * undo its actions and return zero.
+ *
+ * Otherwise, function returns one-past-the-last PFN of isolated page
+ * (which may be greater then end_pfn if end fell in a middle of
+ * a free page).
+ */
+unsigned long
+isolate_freepages_range(struct compact_control *cc,
+ unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned long isolated, pfn, block_start_pfn, block_end_pfn;
+ LIST_HEAD(freelist);
+
+ pfn = start_pfn;
+ block_start_pfn = pageblock_start_pfn(pfn);
+ if (block_start_pfn < cc->zone->zone_start_pfn)
+ block_start_pfn = cc->zone->zone_start_pfn;
+ block_end_pfn = pageblock_end_pfn(pfn);
+
+ for (; pfn < end_pfn; pfn += isolated,
+ block_start_pfn = block_end_pfn,
+ block_end_pfn += pageblock_nr_pages) {
+ /* Protect pfn from changing by isolate_freepages_block */
+ unsigned long isolate_start_pfn = pfn;
+
+ block_end_pfn = min(block_end_pfn, end_pfn);
+
+ /*
+ * pfn could pass the block_end_pfn if isolated freepage
+ * is more than pageblock order. In this case, we adjust
+ * scanning range to right one.
+ */
+ if (pfn >= block_end_pfn) {
+ block_start_pfn = pageblock_start_pfn(pfn);
+ block_end_pfn = pageblock_end_pfn(pfn);
+ block_end_pfn = min(block_end_pfn, end_pfn);
+ }
+
+ if (!pageblock_pfn_to_page(block_start_pfn,
+ block_end_pfn, cc->zone))
+ break;
+
+ isolated = isolate_freepages_block(cc, &isolate_start_pfn,
+ block_end_pfn, &freelist, 0, true);
+
+ /*
+ * In strict mode, isolate_freepages_block() returns 0 if
+ * there are any holes in the block (ie. invalid PFNs or
+ * non-free pages).
+ */
+ if (!isolated)
+ break;
+
+ /*
+ * If we managed to isolate pages, it is always (1 << n) *
+ * pageblock_nr_pages for some non-negative n. (Max order
+ * page may span two pageblocks).
+ */
+ }
+
+ /* __isolate_free_page() does not map the pages */
+ split_map_pages(&freelist);
+
+ if (pfn < end_pfn) {
+ /* Loop terminated early, cleanup. */
+ release_freepages(&freelist);
+ return 0;
+ }
+
+ /* We don't use freelists for anything. */
+ return pfn;
+}
+
+/* Similar to reclaim, but different enough that they don't share logic */
+static bool too_many_isolated(pg_data_t *pgdat)
+{
+ unsigned long active, inactive, isolated;
+
+ inactive = node_page_state(pgdat, NR_INACTIVE_FILE) +
+ node_page_state(pgdat, NR_INACTIVE_ANON);
+ active = node_page_state(pgdat, NR_ACTIVE_FILE) +
+ node_page_state(pgdat, NR_ACTIVE_ANON);
+ isolated = node_page_state(pgdat, NR_ISOLATED_FILE) +
+ node_page_state(pgdat, NR_ISOLATED_ANON);
+
+ return isolated > (inactive + active) / 2;
+}
+
+/**
+ * isolate_migratepages_block() - isolate all migrate-able pages within
+ * a single pageblock
+ * @cc: Compaction control structure.
+ * @low_pfn: The first PFN to isolate
+ * @end_pfn: The one-past-the-last PFN to isolate, within same pageblock
+ * @isolate_mode: Isolation mode to be used.
+ *
+ * Isolate all pages that can be migrated from the range specified by
+ * [low_pfn, end_pfn). The range is expected to be within same pageblock.
+ * Returns zero if there is a fatal signal pending, otherwise PFN of the
+ * first page that was not scanned (which may be both less, equal to or more
+ * than end_pfn).
+ *
+ * The pages are isolated on cc->migratepages list (not required to be empty),
+ * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field
+ * is neither read nor updated.
+ */
+static unsigned long
+isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
+ unsigned long end_pfn, isolate_mode_t isolate_mode)
+{
+ pg_data_t *pgdat = cc->zone->zone_pgdat;
+ unsigned long nr_scanned = 0, nr_isolated = 0;
+ struct lruvec *lruvec;
+ unsigned long flags = 0;
+ bool locked = false;
+ struct page *page = NULL, *valid_page = NULL;
+ unsigned long start_pfn = low_pfn;
+ bool skip_on_failure = false;
+ unsigned long next_skip_pfn = 0;
+ bool skip_updated = false;
+
+ /*
+ * Ensure that there are not too many pages isolated from the LRU
+ * list by either parallel reclaimers or compaction. If there are,
+ * delay for some time until fewer pages are isolated
+ */
+ while (unlikely(too_many_isolated(pgdat))) {
+ /* stop isolation if there are still pages not migrated */
+ if (cc->nr_migratepages)
+ return 0;
+
+ /* async migration should just abort */
+ if (cc->mode == MIGRATE_ASYNC)
+ return 0;
+
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
+
+ if (fatal_signal_pending(current))
+ return 0;
+ }
+
+ cond_resched();
+
+ if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) {
+ skip_on_failure = true;
+ next_skip_pfn = block_end_pfn(low_pfn, cc->order);
+ }
+
+ /* Time to isolate some pages for migration */
+ for (; low_pfn < end_pfn; low_pfn++) {
+
+ if (skip_on_failure && low_pfn >= next_skip_pfn) {
+ /*
+ * We have isolated all migration candidates in the
+ * previous order-aligned block, and did not skip it due
+ * to failure. We should migrate the pages now and
+ * hopefully succeed compaction.
+ */
+ if (nr_isolated)
+ break;
+
+ /*
+ * We failed to isolate in the previous order-aligned
+ * block. Set the new boundary to the end of the
+ * current block. Note we can't simply increase
+ * next_skip_pfn by 1 << order, as low_pfn might have
+ * been incremented by a higher number due to skipping
+ * a compound or a high-order buddy page in the
+ * previous loop iteration.
+ */
+ next_skip_pfn = block_end_pfn(low_pfn, cc->order);
+ }
+
+ /*
+ * Periodically drop the lock (if held) regardless of its
+ * contention, to give chance to IRQs. Abort completely if
+ * a fatal signal is pending.
+ */
+ if (!(low_pfn % SWAP_CLUSTER_MAX)
+ && compact_unlock_should_abort(&pgdat->lru_lock,
+ flags, &locked, cc)) {
+ low_pfn = 0;
+ goto fatal_pending;
+ }
+
+ if (!pfn_valid_within(low_pfn))
+ goto isolate_fail;
+ nr_scanned++;
+
+ page = pfn_to_page(low_pfn);
+
+ /*
+ * Check if the pageblock has already been marked skipped.
+ * Only the aligned PFN is checked as the caller isolates
+ * COMPACT_CLUSTER_MAX at a time so the second call must
+ * not falsely conclude that the block should be skipped.
+ */
+ if (!valid_page && IS_ALIGNED(low_pfn, pageblock_nr_pages)) {
+ if (!cc->ignore_skip_hint && get_pageblock_skip(page)) {
+ low_pfn = end_pfn;
+ goto isolate_abort;
+ }
+ valid_page = page;
+ }
+
+ /*
+ * Skip if free. We read page order here without zone lock
+ * which is generally unsafe, but the race window is small and
+ * the worst thing that can happen is that we skip some
+ * potential isolation targets.
+ */
+ if (PageBuddy(page)) {
+ unsigned long freepage_order = buddy_order_unsafe(page);
+
+ /*
+ * Without lock, we cannot be sure that what we got is
+ * a valid page order. Consider only values in the
+ * valid order range to prevent low_pfn overflow.
+ */
+ if (freepage_order > 0 && freepage_order < MAX_ORDER)
+ low_pfn += (1UL << freepage_order) - 1;
+ continue;
+ }
+
+ /*
+ * Regardless of being on LRU, compound pages such as THP and
+ * hugetlbfs are not to be compacted unless we are attempting
+ * an allocation much larger than the huge page size (eg CMA).
+ * We can potentially save a lot of iterations if we skip them
+ * at once. The check is racy, but we can consider only valid
+ * values and the only danger is skipping too much.
+ */
+ if (PageCompound(page) && !cc->alloc_contig) {
+ const unsigned int order = compound_order(page);
+
+ if (likely(order < MAX_ORDER))
+ low_pfn += (1UL << order) - 1;
+ goto isolate_fail;
+ }
+
+ /*
+ * Check may be lockless but that's ok as we recheck later.
+ * It's possible to migrate LRU and non-lru movable pages.
+ * Skip any other type of page
+ */
+ if (!PageLRU(page)) {
+ /*
+ * __PageMovable can return false positive so we need
+ * to verify it under page_lock.
+ */
+ if (unlikely(__PageMovable(page)) &&
+ !PageIsolated(page)) {
+ if (locked) {
+ spin_unlock_irqrestore(&pgdat->lru_lock,
+ flags);
+ locked = false;
+ }
+
+ if (!isolate_movable_page(page, isolate_mode))
+ goto isolate_success;
+ }
+
+ goto isolate_fail;
+ }
+
+ /*
+ * Migration will fail if an anonymous page is pinned in memory,
+ * so avoid taking lru_lock and isolating it unnecessarily in an
+ * admittedly racy check.
+ */
+ if (!page_mapping(page) &&
+ page_count(page) > page_mapcount(page))
+ goto isolate_fail;
+
+ /*
+ * Only allow to migrate anonymous pages in GFP_NOFS context
+ * because those do not depend on fs locks.
+ */
+ if (!(cc->gfp_mask & __GFP_FS) && page_mapping(page))
+ goto isolate_fail;
+
+ /* If we already hold the lock, we can skip some rechecking */
+ if (!locked) {
+ locked = compact_lock_irqsave(&pgdat->lru_lock,
+ &flags, cc);
+
+ /* Try get exclusive access under lock */
+ if (!skip_updated) {
+ skip_updated = true;
+ if (test_and_set_skip(cc, page, low_pfn))
+ goto isolate_abort;
+ }
+
+ /* Recheck PageLRU and PageCompound under lock */
+ if (!PageLRU(page))
+ goto isolate_fail;
+
+ /*
+ * Page become compound since the non-locked check,
+ * and it's on LRU. It can only be a THP so the order
+ * is safe to read and it's 0 for tail pages.
+ */
+ if (unlikely(PageCompound(page) && !cc->alloc_contig)) {
+ low_pfn += compound_nr(page) - 1;
+ goto isolate_fail;
+ }
+ }
+
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
+
+ /* Try isolate the page */
+ if (__isolate_lru_page(page, isolate_mode) != 0)
+ goto isolate_fail;
+
+ /* The whole page is taken off the LRU; skip the tail pages. */
+ if (PageCompound(page))
+ low_pfn += compound_nr(page) - 1;
+
+ /* Successfully isolated */
+ del_page_from_lru_list(page, lruvec, page_lru(page));
+ mod_node_page_state(page_pgdat(page),
+ NR_ISOLATED_ANON + page_is_file_lru(page),
+ thp_nr_pages(page));
+
+isolate_success:
+ list_add(&page->lru, &cc->migratepages);
+ cc->nr_migratepages += compound_nr(page);
+ nr_isolated += compound_nr(page);
+
+ /*
+ * Avoid isolating too much unless this block is being
+ * rescanned (e.g. dirty/writeback pages, parallel allocation)
+ * or a lock is contended. For contention, isolate quickly to
+ * potentially remove one source of contention.
+ */
+ if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX &&
+ !cc->rescan && !cc->contended) {
+ ++low_pfn;
+ break;
+ }
+
+ continue;
+isolate_fail:
+ if (!skip_on_failure)
+ continue;
+
+ /*
+ * We have isolated some pages, but then failed. Release them
+ * instead of migrating, as we cannot form the cc->order buddy
+ * page anyway.
+ */
+ if (nr_isolated) {
+ if (locked) {
+ spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+ locked = false;
+ }
+ putback_movable_pages(&cc->migratepages);
+ cc->nr_migratepages = 0;
+ nr_isolated = 0;
+ }
+
+ if (low_pfn < next_skip_pfn) {
+ low_pfn = next_skip_pfn - 1;
+ /*
+ * The check near the loop beginning would have updated
+ * next_skip_pfn too, but this is a bit simpler.
+ */
+ next_skip_pfn += 1UL << cc->order;
+ }
+ }
+
+ /*
+ * The PageBuddy() check could have potentially brought us outside
+ * the range to be scanned.
+ */
+ if (unlikely(low_pfn > end_pfn))
+ low_pfn = end_pfn;
+
+isolate_abort:
+ if (locked)
+ spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+
+ /*
+ * Updated the cached scanner pfn once the pageblock has been scanned
+ * Pages will either be migrated in which case there is no point
+ * scanning in the near future or migration failed in which case the
+ * failure reason may persist. The block is marked for skipping if
+ * there were no pages isolated in the block or if the block is
+ * rescanned twice in a row.
+ */
+ if (low_pfn == end_pfn && (!nr_isolated || cc->rescan)) {
+ if (valid_page && !skip_updated)
+ set_pageblock_skip(valid_page);
+ update_cached_migrate(cc, low_pfn);
+ }
+
+ trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
+ nr_scanned, nr_isolated);
+
+fatal_pending:
+ cc->total_migrate_scanned += nr_scanned;
+ if (nr_isolated)
+ count_compact_events(COMPACTISOLATED, nr_isolated);
+
+ return low_pfn;
+}
+
+/**
+ * isolate_migratepages_range() - isolate migrate-able pages in a PFN range
+ * @cc: Compaction control structure.
+ * @start_pfn: The first PFN to start isolating.
+ * @end_pfn: The one-past-last PFN.
+ *
+ * Returns zero if isolation fails fatally due to e.g. pending signal.
+ * Otherwise, function returns one-past-the-last PFN of isolated page
+ * (which may be greater than end_pfn if end fell in a middle of a THP page).
+ */
+unsigned long
+isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ unsigned long pfn, block_start_pfn, block_end_pfn;
+
+ /* Scan block by block. First and last block may be incomplete */
+ pfn = start_pfn;
+ block_start_pfn = pageblock_start_pfn(pfn);
+ if (block_start_pfn < cc->zone->zone_start_pfn)
+ block_start_pfn = cc->zone->zone_start_pfn;
+ block_end_pfn = pageblock_end_pfn(pfn);
+
+ for (; pfn < end_pfn; pfn = block_end_pfn,
+ block_start_pfn = block_end_pfn,
+ block_end_pfn += pageblock_nr_pages) {
+
+ block_end_pfn = min(block_end_pfn, end_pfn);
+
+ if (!pageblock_pfn_to_page(block_start_pfn,
+ block_end_pfn, cc->zone))
+ continue;
+
+ pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
+ ISOLATE_UNEVICTABLE);
+
+ if (!pfn)
+ break;
+
+ if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX)
+ break;
+ }
+
+ return pfn;
+}
+
+#endif /* CONFIG_COMPACTION || CONFIG_CMA */
+#ifdef CONFIG_COMPACTION
+
+static bool suitable_migration_source(struct compact_control *cc,
+ struct page *page)
+{
+ int block_mt;
+
+ if (pageblock_skip_persistent(page))
+ return false;
+
+ if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction)
+ return true;
+
+ block_mt = get_pageblock_migratetype(page);
+
+ if (cc->migratetype == MIGRATE_MOVABLE)
+ return is_migrate_movable(block_mt);
+ else
+ return block_mt == cc->migratetype;
+}
+
+/* Returns true if the page is within a block suitable for migration to */
+static bool suitable_migration_target(struct compact_control *cc,
+ struct page *page)
+{
+ /* If the page is a large free page, then disallow migration */
+ if (PageBuddy(page)) {
+ /*
+ * We are checking page_order without zone->lock taken. But
+ * the only small danger is that we skip a potentially suitable
+ * pageblock, so it's not worth to check order for valid range.
+ */
+ if (buddy_order_unsafe(page) >= pageblock_order)
+ return false;
+ }
+
+ if (cc->ignore_block_suitable)
+ return true;
+
+ /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
+ if (is_migrate_movable(get_pageblock_migratetype(page)))
+ return true;
+
+ /* Otherwise skip the block */
+ return false;
+}
+
+static inline unsigned int
+freelist_scan_limit(struct compact_control *cc)
+{
+ unsigned short shift = BITS_PER_LONG - 1;
+
+ return (COMPACT_CLUSTER_MAX >> min(shift, cc->fast_search_fail)) + 1;
+}
+
+/*
+ * Test whether the free scanner has reached the same or lower pageblock than
+ * the migration scanner, and compaction should thus terminate.
+ */
+static inline bool compact_scanners_met(struct compact_control *cc)
+{
+ return (cc->free_pfn >> pageblock_order)
+ <= (cc->migrate_pfn >> pageblock_order);
+}
+
+/*
+ * Used when scanning for a suitable migration target which scans freelists
+ * in reverse. Reorders the list such as the unscanned pages are scanned
+ * first on the next iteration of the free scanner
+ */
+static void
+move_freelist_head(struct list_head *freelist, struct page *freepage)
+{
+ LIST_HEAD(sublist);
+
+ if (!list_is_last(freelist, &freepage->lru)) {
+ list_cut_before(&sublist, freelist, &freepage->lru);
+ if (!list_empty(&sublist))
+ list_splice_tail(&sublist, freelist);
+ }
+}
+
+/*
+ * Similar to move_freelist_head except used by the migration scanner
+ * when scanning forward. It's possible for these list operations to
+ * move against each other if they search the free list exactly in
+ * lockstep.
+ */
+static void
+move_freelist_tail(struct list_head *freelist, struct page *freepage)
+{
+ LIST_HEAD(sublist);
+
+ if (!list_is_first(freelist, &freepage->lru)) {
+ list_cut_position(&sublist, freelist, &freepage->lru);
+ if (!list_empty(&sublist))
+ list_splice_tail(&sublist, freelist);
+ }
+}
+
+static void
+fast_isolate_around(struct compact_control *cc, unsigned long pfn)
+{
+ unsigned long start_pfn, end_pfn;
+ struct page *page;
+
+ /* Do not search around if there are enough pages already */
+ if (cc->nr_freepages >= cc->nr_migratepages)
+ return;
+
+ /* Minimise scanning during async compaction */
+ if (cc->direct_compaction && cc->mode == MIGRATE_ASYNC)
+ return;
+
+ /* Pageblock boundaries */
+ start_pfn = max(pageblock_start_pfn(pfn), cc->zone->zone_start_pfn);
+ end_pfn = min(pageblock_end_pfn(pfn), zone_end_pfn(cc->zone));
+
+ page = pageblock_pfn_to_page(start_pfn, end_pfn, cc->zone);
+ if (!page)
+ return;
+
+ isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false);
+
+ /* Skip this pageblock in the future as it's full or nearly full */
+ if (cc->nr_freepages < cc->nr_migratepages)
+ set_pageblock_skip(page);
+
+ return;
+}
+
+/* Search orders in round-robin fashion */
+static int next_search_order(struct compact_control *cc, int order)
+{
+ order--;
+ if (order < 0)
+ order = cc->order - 1;
+
+ /* Search wrapped around? */
+ if (order == cc->search_order) {
+ cc->search_order--;
+ if (cc->search_order < 0)
+ cc->search_order = cc->order - 1;
+ return -1;
+ }
+
+ return order;
+}
+
+static unsigned long
+fast_isolate_freepages(struct compact_control *cc)
+{
+ unsigned int limit = min(1U, freelist_scan_limit(cc) >> 1);
+ unsigned int nr_scanned = 0;
+ unsigned long low_pfn, min_pfn, highest = 0;
+ unsigned long nr_isolated = 0;
+ unsigned long distance;
+ struct page *page = NULL;
+ bool scan_start = false;
+ int order;
+
+ /* Full compaction passes in a negative order */
+ if (cc->order <= 0)
+ return cc->free_pfn;
+
+ /*
+ * If starting the scan, use a deeper search and use the highest
+ * PFN found if a suitable one is not found.
+ */
+ if (cc->free_pfn >= cc->zone->compact_init_free_pfn) {
+ limit = pageblock_nr_pages >> 1;
+ scan_start = true;
+ }
+
+ /*
+ * Preferred point is in the top quarter of the scan space but take
+ * a pfn from the top half if the search is problematic.
+ */
+ distance = (cc->free_pfn - cc->migrate_pfn);
+ low_pfn = pageblock_start_pfn(cc->free_pfn - (distance >> 2));
+ min_pfn = pageblock_start_pfn(cc->free_pfn - (distance >> 1));
+
+ if (WARN_ON_ONCE(min_pfn > low_pfn))
+ low_pfn = min_pfn;
+
+ /*
+ * Search starts from the last successful isolation order or the next
+ * order to search after a previous failure
+ */
+ cc->search_order = min_t(unsigned int, cc->order - 1, cc->search_order);
+
+ for (order = cc->search_order;
+ !page && order >= 0;
+ order = next_search_order(cc, order)) {
+ struct free_area *area = &cc->zone->free_area[order];
+ struct list_head *freelist;
+ struct page *freepage;
+ unsigned long flags;
+ unsigned int order_scanned = 0;
+ unsigned long high_pfn = 0;
+
+ if (!area->nr_free)
+ continue;
+
+ spin_lock_irqsave(&cc->zone->lock, flags);
+ freelist = &area->free_list[MIGRATE_MOVABLE];
+ list_for_each_entry_reverse(freepage, freelist, lru) {
+ unsigned long pfn;
+
+ order_scanned++;
+ nr_scanned++;
+ pfn = page_to_pfn(freepage);
+
+ if (pfn >= highest)
+ highest = max(pageblock_start_pfn(pfn),
+ cc->zone->zone_start_pfn);
+
+ if (pfn >= low_pfn) {
+ cc->fast_search_fail = 0;
+ cc->search_order = order;
+ page = freepage;
+ break;
+ }
+
+ if (pfn >= min_pfn && pfn > high_pfn) {
+ high_pfn = pfn;
+
+ /* Shorten the scan if a candidate is found */
+ limit >>= 1;
+ }
+
+ if (order_scanned >= limit)
+ break;
+ }
+
+ /* Use a minimum pfn if a preferred one was not found */
+ if (!page && high_pfn) {
+ page = pfn_to_page(high_pfn);
+
+ /* Update freepage for the list reorder below */
+ freepage = page;
+ }
+
+ /* Reorder to so a future search skips recent pages */
+ move_freelist_head(freelist, freepage);
+
+ /* Isolate the page if available */
+ if (page) {
+ if (__isolate_free_page(page, order)) {
+ set_page_private(page, order);
+ nr_isolated = 1 << order;
+ cc->nr_freepages += nr_isolated;
+ list_add_tail(&page->lru, &cc->freepages);
+ count_compact_events(COMPACTISOLATED, nr_isolated);
+ } else {
+ /* If isolation fails, abort the search */
+ order = cc->search_order + 1;
+ page = NULL;
+ }
+ }
+
+ spin_unlock_irqrestore(&cc->zone->lock, flags);
+
+ /*
+ * Smaller scan on next order so the total scan ig related
+ * to freelist_scan_limit.
+ */
+ if (order_scanned >= limit)
+ limit = min(1U, limit >> 1);
+ }
+
+ if (!page) {
+ cc->fast_search_fail++;
+ if (scan_start) {
+ /*
+ * Use the highest PFN found above min. If one was
+ * not found, be pessimistic for direct compaction
+ * and use the min mark.
+ */
+ if (highest) {
+ page = pfn_to_page(highest);
+ cc->free_pfn = highest;
+ } else {
+ if (cc->direct_compaction && pfn_valid(min_pfn)) {
+ page = pageblock_pfn_to_page(min_pfn,
+ min(pageblock_end_pfn(min_pfn),
+ zone_end_pfn(cc->zone)),
+ cc->zone);
+ cc->free_pfn = min_pfn;
+ }
+ }
+ }
+ }
+
+ if (highest && highest >= cc->zone->compact_cached_free_pfn) {
+ highest -= pageblock_nr_pages;
+ cc->zone->compact_cached_free_pfn = highest;
+ }
+
+ cc->total_free_scanned += nr_scanned;
+ if (!page)
+ return cc->free_pfn;
+
+ low_pfn = page_to_pfn(page);
+ fast_isolate_around(cc, low_pfn);
+ return low_pfn;
+}
+
+/*
+ * Based on information in the current compact_control, find blocks
+ * suitable for isolating free pages from and then isolate them.
+ */
+static void isolate_freepages(struct compact_control *cc)
+{
+ struct zone *zone = cc->zone;
+ struct page *page;
+ unsigned long block_start_pfn; /* start of current pageblock */
+ unsigned long isolate_start_pfn; /* exact pfn we start at */
+ unsigned long block_end_pfn; /* end of current pageblock */
+ unsigned long low_pfn; /* lowest pfn scanner is able to scan */
+ struct list_head *freelist = &cc->freepages;
+ unsigned int stride;
+
+ /* Try a small search of the free lists for a candidate */
+ isolate_start_pfn = fast_isolate_freepages(cc);
+ if (cc->nr_freepages)
+ goto splitmap;
+
+ /*
+ * Initialise the free scanner. The starting point is where we last
+ * successfully isolated from, zone-cached value, or the end of the
+ * zone when isolating for the first time. For looping we also need
+ * this pfn aligned down to the pageblock boundary, because we do
+ * block_start_pfn -= pageblock_nr_pages in the for loop.
+ * For ending point, take care when isolating in last pageblock of a
+ * zone which ends in the middle of a pageblock.
+ * The low boundary is the end of the pageblock the migration scanner
+ * is using.
+ */
+ isolate_start_pfn = cc->free_pfn;
+ block_start_pfn = pageblock_start_pfn(isolate_start_pfn);
+ block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
+ zone_end_pfn(zone));
+ low_pfn = pageblock_end_pfn(cc->migrate_pfn);
+ stride = cc->mode == MIGRATE_ASYNC ? COMPACT_CLUSTER_MAX : 1;
+
+ /*
+ * Isolate free pages until enough are available to migrate the
+ * pages on cc->migratepages. We stop searching if the migrate
+ * and free page scanners meet or enough free pages are isolated.
+ */
+ for (; block_start_pfn >= low_pfn;
+ block_end_pfn = block_start_pfn,
+ block_start_pfn -= pageblock_nr_pages,
+ isolate_start_pfn = block_start_pfn) {
+ unsigned long nr_isolated;
+
+ /*
+ * This can iterate a massively long zone without finding any
+ * suitable migration targets, so periodically check resched.
+ */
+ if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)))
+ cond_resched();
+
+ page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
+ zone);
+ if (!page)
+ continue;
+
+ /* Check the block is suitable for migration */
+ if (!suitable_migration_target(cc, page))
+ continue;
+
+ /* If isolation recently failed, do not retry */
+ if (!isolation_suitable(cc, page))
+ continue;
+
+ /* Found a block suitable for isolating free pages from. */
+ nr_isolated = isolate_freepages_block(cc, &isolate_start_pfn,
+ block_end_pfn, freelist, stride, false);
+
+ /* Update the skip hint if the full pageblock was scanned */
+ if (isolate_start_pfn == block_end_pfn)
+ update_pageblock_skip(cc, page, block_start_pfn);
+
+ /* Are enough freepages isolated? */
+ if (cc->nr_freepages >= cc->nr_migratepages) {
+ if (isolate_start_pfn >= block_end_pfn) {
+ /*
+ * Restart at previous pageblock if more
+ * freepages can be isolated next time.
+ */
+ isolate_start_pfn =
+ block_start_pfn - pageblock_nr_pages;
+ }
+ break;
+ } else if (isolate_start_pfn < block_end_pfn) {
+ /*
+ * If isolation failed early, do not continue
+ * needlessly.
+ */
+ break;
+ }
+
+ /* Adjust stride depending on isolation */
+ if (nr_isolated) {
+ stride = 1;
+ continue;
+ }
+ stride = min_t(unsigned int, COMPACT_CLUSTER_MAX, stride << 1);
+ }
+
+ /*
+ * Record where the free scanner will restart next time. Either we
+ * broke from the loop and set isolate_start_pfn based on the last
+ * call to isolate_freepages_block(), or we met the migration scanner
+ * and the loop terminated due to isolate_start_pfn < low_pfn
+ */
+ cc->free_pfn = isolate_start_pfn;
+
+splitmap:
+ /* __isolate_free_page() does not map the pages */
+ split_map_pages(freelist);
+}
+
+/*
+ * This is a migrate-callback that "allocates" freepages by taking pages
+ * from the isolated freelists in the block we are migrating to.
+ */
+static struct page *compaction_alloc(struct page *migratepage,
+ unsigned long data)
+{
+ struct compact_control *cc = (struct compact_control *)data;
+ struct page *freepage;
+
+ if (list_empty(&cc->freepages)) {
+ isolate_freepages(cc);
+
+ if (list_empty(&cc->freepages))
+ return NULL;
+ }
+
+ freepage = list_entry(cc->freepages.next, struct page, lru);
+ list_del(&freepage->lru);
+ cc->nr_freepages--;
+
+ return freepage;
+}
+
+/*
+ * This is a migrate-callback that "frees" freepages back to the isolated
+ * freelist. All pages on the freelist are from the same zone, so there is no
+ * special handling needed for NUMA.
+ */
+static void compaction_free(struct page *page, unsigned long data)
+{
+ struct compact_control *cc = (struct compact_control *)data;
+
+ list_add(&page->lru, &cc->freepages);
+ cc->nr_freepages++;
+}
+
+/* possible outcome of isolate_migratepages */
+typedef enum {
+ ISOLATE_ABORT, /* Abort compaction now */
+ ISOLATE_NONE, /* No pages isolated, continue scanning */
+ ISOLATE_SUCCESS, /* Pages isolated, migrate */
+} isolate_migrate_t;
+
+/*
+ * Allow userspace to control policy on scanning the unevictable LRU for
+ * compactable pages.
+ */
+#ifdef CONFIG_PREEMPT_RT
+int sysctl_compact_unevictable_allowed __read_mostly = 0;
+#else
+int sysctl_compact_unevictable_allowed __read_mostly = 1;
+#endif
+
+static inline void
+update_fast_start_pfn(struct compact_control *cc, unsigned long pfn)
+{
+ if (cc->fast_start_pfn == ULONG_MAX)
+ return;
+
+ if (!cc->fast_start_pfn)
+ cc->fast_start_pfn = pfn;
+
+ cc->fast_start_pfn = min(cc->fast_start_pfn, pfn);
+}
+
+static inline unsigned long
+reinit_migrate_pfn(struct compact_control *cc)
+{
+ if (!cc->fast_start_pfn || cc->fast_start_pfn == ULONG_MAX)
+ return cc->migrate_pfn;
+
+ cc->migrate_pfn = cc->fast_start_pfn;
+ cc->fast_start_pfn = ULONG_MAX;
+
+ return cc->migrate_pfn;
+}
+
+/*
+ * Briefly search the free lists for a migration source that already has
+ * some free pages to reduce the number of pages that need migration
+ * before a pageblock is free.
+ */
+static unsigned long fast_find_migrateblock(struct compact_control *cc)
+{
+ unsigned int limit = freelist_scan_limit(cc);
+ unsigned int nr_scanned = 0;
+ unsigned long distance;
+ unsigned long pfn = cc->migrate_pfn;
+ unsigned long high_pfn;
+ int order;
+ bool found_block = false;
+
+ /* Skip hints are relied on to avoid repeats on the fast search */
+ if (cc->ignore_skip_hint)
+ return pfn;
+
+ /*
+ * If the migrate_pfn is not at the start of a zone or the start
+ * of a pageblock then assume this is a continuation of a previous
+ * scan restarted due to COMPACT_CLUSTER_MAX.
+ */
+ if (pfn != cc->zone->zone_start_pfn && pfn != pageblock_start_pfn(pfn))
+ return pfn;
+
+ /*
+ * For smaller orders, just linearly scan as the number of pages
+ * to migrate should be relatively small and does not necessarily
+ * justify freeing up a large block for a small allocation.
+ */
+ if (cc->order <= PAGE_ALLOC_COSTLY_ORDER)
+ return pfn;
+
+ /*
+ * Only allow kcompactd and direct requests for movable pages to
+ * quickly clear out a MOVABLE pageblock for allocation. This
+ * reduces the risk that a large movable pageblock is freed for
+ * an unmovable/reclaimable small allocation.
+ */
+ if (cc->direct_compaction && cc->migratetype != MIGRATE_MOVABLE)
+ return pfn;
+
+ /*
+ * When starting the migration scanner, pick any pageblock within the
+ * first half of the search space. Otherwise try and pick a pageblock
+ * within the first eighth to reduce the chances that a migration
+ * target later becomes a source.
+ */
+ distance = (cc->free_pfn - cc->migrate_pfn) >> 1;
+ if (cc->migrate_pfn != cc->zone->zone_start_pfn)
+ distance >>= 2;
+ high_pfn = pageblock_start_pfn(cc->migrate_pfn + distance);
+
+ for (order = cc->order - 1;
+ order >= PAGE_ALLOC_COSTLY_ORDER && !found_block && nr_scanned < limit;
+ order--) {
+ struct free_area *area = &cc->zone->free_area[order];
+ struct list_head *freelist;
+ unsigned long flags;
+ struct page *freepage;
+
+ if (!area->nr_free)
+ continue;
+
+ spin_lock_irqsave(&cc->zone->lock, flags);
+ freelist = &area->free_list[MIGRATE_MOVABLE];
+ list_for_each_entry(freepage, freelist, lru) {
+ unsigned long free_pfn;
+
+ if (nr_scanned++ >= limit) {
+ move_freelist_tail(freelist, freepage);
+ break;
+ }
+
+ free_pfn = page_to_pfn(freepage);
+ if (free_pfn < high_pfn) {
+ /*
+ * Avoid if skipped recently. Ideally it would
+ * move to the tail but even safe iteration of
+ * the list assumes an entry is deleted, not
+ * reordered.
+ */
+ if (get_pageblock_skip(freepage))
+ continue;
+
+ /* Reorder to so a future search skips recent pages */
+ move_freelist_tail(freelist, freepage);
+
+ update_fast_start_pfn(cc, free_pfn);
+ pfn = pageblock_start_pfn(free_pfn);
+ if (pfn < cc->zone->zone_start_pfn)
+ pfn = cc->zone->zone_start_pfn;
+ cc->fast_search_fail = 0;
+ found_block = true;
+ set_pageblock_skip(freepage);
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&cc->zone->lock, flags);
+ }
+
+ cc->total_migrate_scanned += nr_scanned;
+
+ /*
+ * If fast scanning failed then use a cached entry for a page block
+ * that had free pages as the basis for starting a linear scan.
+ */
+ if (!found_block) {
+ cc->fast_search_fail++;
+ pfn = reinit_migrate_pfn(cc);
+ }
+ return pfn;
+}
+
+/*
+ * Isolate all pages that can be migrated from the first suitable block,
+ * starting at the block pointed to by the migrate scanner pfn within
+ * compact_control.
+ */
+static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
+{
+ unsigned long block_start_pfn;
+ unsigned long block_end_pfn;
+ unsigned long low_pfn;
+ struct page *page;
+ const isolate_mode_t isolate_mode =
+ (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
+ (cc->mode != MIGRATE_SYNC ? ISOLATE_ASYNC_MIGRATE : 0);
+ bool fast_find_block;
+
+ /*
+ * Start at where we last stopped, or beginning of the zone as
+ * initialized by compact_zone(). The first failure will use
+ * the lowest PFN as the starting point for linear scanning.
+ */
+ low_pfn = fast_find_migrateblock(cc);
+ block_start_pfn = pageblock_start_pfn(low_pfn);
+ if (block_start_pfn < cc->zone->zone_start_pfn)
+ block_start_pfn = cc->zone->zone_start_pfn;
+
+ /*
+ * fast_find_migrateblock marks a pageblock skipped so to avoid
+ * the isolation_suitable check below, check whether the fast
+ * search was successful.
+ */
+ fast_find_block = low_pfn != cc->migrate_pfn && !cc->fast_search_fail;
+
+ /* Only scan within a pageblock boundary */
+ block_end_pfn = pageblock_end_pfn(low_pfn);
+
+ /*
+ * Iterate over whole pageblocks until we find the first suitable.
+ * Do not cross the free scanner.
+ */
+ for (; block_end_pfn <= cc->free_pfn;
+ fast_find_block = false,
+ low_pfn = block_end_pfn,
+ block_start_pfn = block_end_pfn,
+ block_end_pfn += pageblock_nr_pages) {
+
+ /*
+ * This can potentially iterate a massively long zone with
+ * many pageblocks unsuitable, so periodically check if we
+ * need to schedule.
+ */
+ if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)))
+ cond_resched();
+
+ page = pageblock_pfn_to_page(block_start_pfn,
+ block_end_pfn, cc->zone);
+ if (!page)
+ continue;
+
+ /*
+ * If isolation recently failed, do not retry. Only check the
+ * pageblock once. COMPACT_CLUSTER_MAX causes a pageblock
+ * to be visited multiple times. Assume skip was checked
+ * before making it "skip" so other compaction instances do
+ * not scan the same block.
+ */
+ if (IS_ALIGNED(low_pfn, pageblock_nr_pages) &&
+ !fast_find_block && !isolation_suitable(cc, page))
+ continue;
+
+ /*
+ * For async compaction, also only scan in MOVABLE blocks
+ * without huge pages. Async compaction is optimistic to see
+ * if the minimum amount of work satisfies the allocation.
+ * The cached PFN is updated as it's possible that all
+ * remaining blocks between source and target are unsuitable
+ * and the compaction scanners fail to meet.
+ */
+ if (!suitable_migration_source(cc, page)) {
+ update_cached_migrate(cc, block_end_pfn);
+ continue;
+ }
+
+ /* Perform the isolation */
+ low_pfn = isolate_migratepages_block(cc, low_pfn,
+ block_end_pfn, isolate_mode);
+
+ if (!low_pfn)
+ return ISOLATE_ABORT;
+
+ /*
+ * Either we isolated something and proceed with migration. Or
+ * we failed and compact_zone should decide if we should
+ * continue or not.
+ */
+ break;
+ }
+
+ /* Record where migration scanner will be restarted. */
+ cc->migrate_pfn = low_pfn;
+
+ return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
+}
+
+/*
+ * order == -1 is expected when compacting via
+ * /proc/sys/vm/compact_memory
+ */
+static inline bool is_via_compact_memory(int order)
+{
+ return order == -1;
+}
+
+static bool kswapd_is_running(pg_data_t *pgdat)
+{
+ return pgdat->kswapd && (pgdat->kswapd->state == TASK_RUNNING);
+}
+
+/*
+ * A zone's fragmentation score is the external fragmentation wrt to the
+ * COMPACTION_HPAGE_ORDER scaled by the zone's size. It returns a value
+ * in the range [0, 100].
+ *
+ * The scaling factor ensures that proactive compaction focuses on larger
+ * zones like ZONE_NORMAL, rather than smaller, specialized zones like
+ * ZONE_DMA32. For smaller zones, the score value remains close to zero,
+ * and thus never exceeds the high threshold for proactive compaction.
+ */
+static unsigned int fragmentation_score_zone(struct zone *zone)
+{
+ unsigned long score;
+
+ score = zone->present_pages *
+ extfrag_for_order(zone, COMPACTION_HPAGE_ORDER);
+ return div64_ul(score, zone->zone_pgdat->node_present_pages + 1);
+}
+
+/*
+ * The per-node proactive (background) compaction process is started by its
+ * corresponding kcompactd thread when the node's fragmentation score
+ * exceeds the high threshold. The compaction process remains active till
+ * the node's score falls below the low threshold, or one of the back-off
+ * conditions is met.
+ */
+static unsigned int fragmentation_score_node(pg_data_t *pgdat)
+{
+ unsigned int score = 0;
+ int zoneid;
+
+ for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
+ struct zone *zone;
+
+ zone = &pgdat->node_zones[zoneid];
+ score += fragmentation_score_zone(zone);
+ }
+
+ return score;
+}
+
+static unsigned int fragmentation_score_wmark(pg_data_t *pgdat, bool low)
+{
+ unsigned int wmark_low;
+
+ /*
+ * Cap the low watermak to avoid excessive compaction
+ * activity in case a user sets the proactivess tunable
+ * close to 100 (maximum).
+ */
+ wmark_low = max(100U - sysctl_compaction_proactiveness, 5U);
+ return low ? wmark_low : min(wmark_low + 10, 100U);
+}
+
+static bool should_proactive_compact_node(pg_data_t *pgdat)
+{
+ int wmark_high;
+
+ if (!sysctl_compaction_proactiveness || kswapd_is_running(pgdat))
+ return false;
+
+ wmark_high = fragmentation_score_wmark(pgdat, false);
+ return fragmentation_score_node(pgdat) > wmark_high;
+}
+
+static enum compact_result __compact_finished(struct compact_control *cc)
+{
+ unsigned int order;
+ const int migratetype = cc->migratetype;
+ int ret;
+
+ /* Compaction run completes if the migrate and free scanner meet */
+ if (compact_scanners_met(cc)) {
+ /* Let the next compaction start anew. */
+ reset_cached_positions(cc->zone);
+
+ /*
+ * Mark that the PG_migrate_skip information should be cleared
+ * by kswapd when it goes to sleep. kcompactd does not set the
+ * flag itself as the decision to be clear should be directly
+ * based on an allocation request.
+ */
+ if (cc->direct_compaction)
+ cc->zone->compact_blockskip_flush = true;
+
+ if (cc->whole_zone)
+ return COMPACT_COMPLETE;
+ else
+ return COMPACT_PARTIAL_SKIPPED;
+ }
+
+ if (cc->proactive_compaction) {
+ int score, wmark_low;
+ pg_data_t *pgdat;
+
+ pgdat = cc->zone->zone_pgdat;
+ if (kswapd_is_running(pgdat))
+ return COMPACT_PARTIAL_SKIPPED;
+
+ score = fragmentation_score_zone(cc->zone);
+ wmark_low = fragmentation_score_wmark(pgdat, true);
+
+ if (score > wmark_low)
+ ret = COMPACT_CONTINUE;
+ else
+ ret = COMPACT_SUCCESS;
+
+ goto out;
+ }
+
+ if (is_via_compact_memory(cc->order))
+ return COMPACT_CONTINUE;
+
+ /*
+ * Always finish scanning a pageblock to reduce the possibility of
+ * fallbacks in the future. This is particularly important when
+ * migration source is unmovable/reclaimable but it's not worth
+ * special casing.
+ */
+ if (!IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages))
+ return COMPACT_CONTINUE;
+
+ /* Direct compactor: Is a suitable page free? */
+ ret = COMPACT_NO_SUITABLE_PAGE;
+ for (order = cc->order; order < MAX_ORDER; order++) {
+ struct free_area *area = &cc->zone->free_area[order];
+ bool can_steal;
+
+ /* Job done if page is free of the right migratetype */
+ if (!free_area_empty(area, migratetype))
+ return COMPACT_SUCCESS;
+
+#ifdef CONFIG_CMA
+ /* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
+ if (migratetype == MIGRATE_MOVABLE &&
+ !free_area_empty(area, MIGRATE_CMA))
+ return COMPACT_SUCCESS;
+#endif
+ /*
+ * Job done if allocation would steal freepages from
+ * other migratetype buddy lists.
+ */
+ if (find_suitable_fallback(area, order, migratetype,
+ true, &can_steal) != -1) {
+
+ /* movable pages are OK in any pageblock */
+ if (migratetype == MIGRATE_MOVABLE)
+ return COMPACT_SUCCESS;
+
+ /*
+ * We are stealing for a non-movable allocation. Make
+ * sure we finish compacting the current pageblock
+ * first so it is as free as possible and we won't
+ * have to steal another one soon. This only applies
+ * to sync compaction, as async compaction operates
+ * on pageblocks of the same migratetype.
+ */
+ if (cc->mode == MIGRATE_ASYNC ||
+ IS_ALIGNED(cc->migrate_pfn,
+ pageblock_nr_pages)) {
+ return COMPACT_SUCCESS;
+ }
+
+ ret = COMPACT_CONTINUE;
+ break;
+ }
+ }
+
+out:
+ if (cc->contended || fatal_signal_pending(current))
+ ret = COMPACT_CONTENDED;
+
+ return ret;
+}
+
+static enum compact_result compact_finished(struct compact_control *cc)
+{
+ int ret;
+
+ ret = __compact_finished(cc);
+ trace_mm_compaction_finished(cc->zone, cc->order, ret);
+ if (ret == COMPACT_NO_SUITABLE_PAGE)
+ ret = COMPACT_CONTINUE;
+
+ return ret;
+}
+
+/*
+ * compaction_suitable: Is this suitable to run compaction on this zone now?
+ * Returns
+ * COMPACT_SKIPPED - If there are too few free pages for compaction
+ * COMPACT_SUCCESS - If the allocation would succeed without compaction
+ * COMPACT_CONTINUE - If compaction should run now
+ */
+static enum compact_result __compaction_suitable(struct zone *zone, int order,
+ unsigned int alloc_flags,
+ int highest_zoneidx,
+ unsigned long wmark_target)
+{
+ unsigned long watermark;
+
+ if (is_via_compact_memory(order))
+ return COMPACT_CONTINUE;
+
+ watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
+ /*
+ * If watermarks for high-order allocation are already met, there
+ * should be no need for compaction at all.
+ */
+ if (zone_watermark_ok(zone, order, watermark, highest_zoneidx,
+ alloc_flags))
+ return COMPACT_SUCCESS;
+
+ /*
+ * Watermarks for order-0 must be met for compaction to be able to
+ * isolate free pages for migration targets. This means that the
+ * watermark and alloc_flags have to match, or be more pessimistic than
+ * the check in __isolate_free_page(). We don't use the direct
+ * compactor's alloc_flags, as they are not relevant for freepage
+ * isolation. We however do use the direct compactor's highest_zoneidx
+ * to skip over zones where lowmem reserves would prevent allocation
+ * even if compaction succeeds.
+ * For costly orders, we require low watermark instead of min for
+ * compaction to proceed to increase its chances.
+ * ALLOC_CMA is used, as pages in CMA pageblocks are considered
+ * suitable migration targets
+ */
+ watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
+ low_wmark_pages(zone) : min_wmark_pages(zone);
+ watermark += compact_gap(order);
+ if (!__zone_watermark_ok(zone, 0, watermark, highest_zoneidx,
+ ALLOC_CMA, wmark_target))
+ return COMPACT_SKIPPED;
+
+ return COMPACT_CONTINUE;
+}
+
+enum compact_result compaction_suitable(struct zone *zone, int order,
+ unsigned int alloc_flags,
+ int highest_zoneidx)
+{
+ enum compact_result ret;
+ int fragindex;
+
+ ret = __compaction_suitable(zone, order, alloc_flags, highest_zoneidx,
+ zone_page_state(zone, NR_FREE_PAGES));
+ /*
+ * fragmentation index determines if allocation failures are due to
+ * low memory or external fragmentation
+ *
+ * index of -1000 would imply allocations might succeed depending on
+ * watermarks, but we already failed the high-order watermark check
+ * index towards 0 implies failure is due to lack of memory
+ * index towards 1000 implies failure is due to fragmentation
+ *
+ * Only compact if a failure would be due to fragmentation. Also
+ * ignore fragindex for non-costly orders where the alternative to
+ * a successful reclaim/compaction is OOM. Fragindex and the
+ * vm.extfrag_threshold sysctl is meant as a heuristic to prevent
+ * excessive compaction for costly orders, but it should not be at the
+ * expense of system stability.
+ */
+ if (ret == COMPACT_CONTINUE && (order > PAGE_ALLOC_COSTLY_ORDER)) {
+ fragindex = fragmentation_index(zone, order);
+ if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
+ ret = COMPACT_NOT_SUITABLE_ZONE;
+ }
+
+ trace_mm_compaction_suitable(zone, order, ret);
+ if (ret == COMPACT_NOT_SUITABLE_ZONE)
+ ret = COMPACT_SKIPPED;
+
+ return ret;
+}
+
+bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
+ int alloc_flags)
+{
+ struct zone *zone;
+ struct zoneref *z;
+
+ /*
+ * Make sure at least one zone would pass __compaction_suitable if we continue
+ * retrying the reclaim.
+ */
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
+ ac->highest_zoneidx, ac->nodemask) {
+ unsigned long available;
+ enum compact_result compact_result;
+
+ /*
+ * Do not consider all the reclaimable memory because we do not
+ * want to trash just for a single high order allocation which
+ * is even not guaranteed to appear even if __compaction_suitable
+ * is happy about the watermark check.
+ */
+ available = zone_reclaimable_pages(zone) / order;
+ available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
+ compact_result = __compaction_suitable(zone, order, alloc_flags,
+ ac->highest_zoneidx, available);
+ if (compact_result != COMPACT_SKIPPED)
+ return true;
+ }
+
+ return false;
+}
+
+static enum compact_result
+compact_zone(struct compact_control *cc, struct capture_control *capc)
+{
+ enum compact_result ret;
+ unsigned long start_pfn = cc->zone->zone_start_pfn;
+ unsigned long end_pfn = zone_end_pfn(cc->zone);
+ unsigned long last_migrated_pfn;
+ const bool sync = cc->mode != MIGRATE_ASYNC;
+ bool update_cached;
+
+ /*
+ * These counters track activities during zone compaction. Initialize
+ * them before compacting a new zone.
+ */
+ cc->total_migrate_scanned = 0;
+ cc->total_free_scanned = 0;
+ cc->nr_migratepages = 0;
+ cc->nr_freepages = 0;
+ INIT_LIST_HEAD(&cc->freepages);
+ INIT_LIST_HEAD(&cc->migratepages);
+
+ cc->migratetype = gfp_migratetype(cc->gfp_mask);
+ ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags,
+ cc->highest_zoneidx);
+ /* Compaction is likely to fail */
+ if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
+ return ret;
+
+ /* huh, compaction_suitable is returning something unexpected */
+ VM_BUG_ON(ret != COMPACT_CONTINUE);
+
+ /*
+ * Clear pageblock skip if there were failures recently and compaction
+ * is about to be retried after being deferred.
+ */
+ if (compaction_restarting(cc->zone, cc->order))
+ __reset_isolation_suitable(cc->zone);
+
+ /*
+ * Setup to move all movable pages to the end of the zone. Used cached
+ * information on where the scanners should start (unless we explicitly
+ * want to compact the whole zone), but check that it is initialised
+ * by ensuring the values are within zone boundaries.
+ */
+ cc->fast_start_pfn = 0;
+ if (cc->whole_zone) {
+ cc->migrate_pfn = start_pfn;
+ cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
+ } else {
+ cc->migrate_pfn = cc->zone->compact_cached_migrate_pfn[sync];
+ cc->free_pfn = cc->zone->compact_cached_free_pfn;
+ if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
+ cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
+ cc->zone->compact_cached_free_pfn = cc->free_pfn;
+ }
+ if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
+ cc->migrate_pfn = start_pfn;
+ cc->zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
+ cc->zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
+ }
+
+ if (cc->migrate_pfn <= cc->zone->compact_init_migrate_pfn)
+ cc->whole_zone = true;
+ }
+
+ last_migrated_pfn = 0;
+
+ /*
+ * Migrate has separate cached PFNs for ASYNC and SYNC* migration on
+ * the basis that some migrations will fail in ASYNC mode. However,
+ * if the cached PFNs match and pageblocks are skipped due to having
+ * no isolation candidates, then the sync state does not matter.
+ * Until a pageblock with isolation candidates is found, keep the
+ * cached PFNs in sync to avoid revisiting the same blocks.
+ */
+ update_cached = !sync &&
+ cc->zone->compact_cached_migrate_pfn[0] == cc->zone->compact_cached_migrate_pfn[1];
+
+ trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
+ cc->free_pfn, end_pfn, sync);
+
+ migrate_prep_local();
+
+ while ((ret = compact_finished(cc)) == COMPACT_CONTINUE) {
+ int err;
+ unsigned long start_pfn = cc->migrate_pfn;
+
+ /*
+ * Avoid multiple rescans which can happen if a page cannot be
+ * isolated (dirty/writeback in async mode) or if the migrated
+ * pages are being allocated before the pageblock is cleared.
+ * The first rescan will capture the entire pageblock for
+ * migration. If it fails, it'll be marked skip and scanning
+ * will proceed as normal.
+ */
+ cc->rescan = false;
+ if (pageblock_start_pfn(last_migrated_pfn) ==
+ pageblock_start_pfn(start_pfn)) {
+ cc->rescan = true;
+ }
+
+ switch (isolate_migratepages(cc)) {
+ case ISOLATE_ABORT:
+ ret = COMPACT_CONTENDED;
+ putback_movable_pages(&cc->migratepages);
+ cc->nr_migratepages = 0;
+ goto out;
+ case ISOLATE_NONE:
+ if (update_cached) {
+ cc->zone->compact_cached_migrate_pfn[1] =
+ cc->zone->compact_cached_migrate_pfn[0];
+ }
+
+ /*
+ * We haven't isolated and migrated anything, but
+ * there might still be unflushed migrations from
+ * previous cc->order aligned block.
+ */
+ goto check_drain;
+ case ISOLATE_SUCCESS:
+ update_cached = false;
+ last_migrated_pfn = start_pfn;
+ ;
+ }
+
+ err = migrate_pages(&cc->migratepages, compaction_alloc,
+ compaction_free, (unsigned long)cc, cc->mode,
+ MR_COMPACTION);
+
+ trace_mm_compaction_migratepages(cc->nr_migratepages, err,
+ &cc->migratepages);
+
+ /* All pages were either migrated or will be released */
+ cc->nr_migratepages = 0;
+ if (err) {
+ putback_movable_pages(&cc->migratepages);
+ /*
+ * migrate_pages() may return -ENOMEM when scanners meet
+ * and we want compact_finished() to detect it
+ */
+ if (err == -ENOMEM && !compact_scanners_met(cc)) {
+ ret = COMPACT_CONTENDED;
+ goto out;
+ }
+ /*
+ * We failed to migrate at least one page in the current
+ * order-aligned block, so skip the rest of it.
+ */
+ if (cc->direct_compaction &&
+ (cc->mode == MIGRATE_ASYNC)) {
+ cc->migrate_pfn = block_end_pfn(
+ cc->migrate_pfn - 1, cc->order);
+ /* Draining pcplists is useless in this case */
+ last_migrated_pfn = 0;
+ }
+ }
+
+check_drain:
+ /*
+ * Has the migration scanner moved away from the previous
+ * cc->order aligned block where we migrated from? If yes,
+ * flush the pages that were freed, so that they can merge and
+ * compact_finished() can detect immediately if allocation
+ * would succeed.
+ */
+ if (cc->order > 0 && last_migrated_pfn) {
+ unsigned long current_block_start =
+ block_start_pfn(cc->migrate_pfn, cc->order);
+
+ if (last_migrated_pfn < current_block_start) {
+ lru_add_drain_cpu_zone(cc->zone);
+ /* No more flushing until we migrate again */
+ last_migrated_pfn = 0;
+ }
+ }
+
+ /* Stop if a page has been captured */
+ if (capc && capc->page) {
+ ret = COMPACT_SUCCESS;
+ break;
+ }
+ }
+
+out:
+ /*
+ * Release free pages and update where the free scanner should restart,
+ * so we don't leave any returned pages behind in the next attempt.
+ */
+ if (cc->nr_freepages > 0) {
+ unsigned long free_pfn = release_freepages(&cc->freepages);
+
+ cc->nr_freepages = 0;
+ VM_BUG_ON(free_pfn == 0);
+ /* The cached pfn is always the first in a pageblock */
+ free_pfn = pageblock_start_pfn(free_pfn);
+ /*
+ * Only go back, not forward. The cached pfn might have been
+ * already reset to zone end in compact_finished()
+ */
+ if (free_pfn > cc->zone->compact_cached_free_pfn)
+ cc->zone->compact_cached_free_pfn = free_pfn;
+ }
+
+ count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned);
+ count_compact_events(COMPACTFREE_SCANNED, cc->total_free_scanned);
+
+ trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
+ cc->free_pfn, end_pfn, sync, ret);
+
+ return ret;
+}
+
+static enum compact_result compact_zone_order(struct zone *zone, int order,
+ gfp_t gfp_mask, enum compact_priority prio,
+ unsigned int alloc_flags, int highest_zoneidx,
+ struct page **capture)
+{
+ enum compact_result ret;
+ struct compact_control cc = {
+ .order = order,
+ .search_order = order,
+ .gfp_mask = gfp_mask,
+ .zone = zone,
+ .mode = (prio == COMPACT_PRIO_ASYNC) ?
+ MIGRATE_ASYNC : MIGRATE_SYNC_LIGHT,
+ .alloc_flags = alloc_flags,
+ .highest_zoneidx = highest_zoneidx,
+ .direct_compaction = true,
+ .whole_zone = (prio == MIN_COMPACT_PRIORITY),
+ .ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY),
+ .ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY)
+ };
+ struct capture_control capc = {
+ .cc = &cc,
+ .page = NULL,
+ };
+
+ /*
+ * Make sure the structs are really initialized before we expose the
+ * capture control, in case we are interrupted and the interrupt handler
+ * frees a page.
+ */
+ barrier();
+ WRITE_ONCE(current->capture_control, &capc);
+
+ ret = compact_zone(&cc, &capc);
+
+ VM_BUG_ON(!list_empty(&cc.freepages));
+ VM_BUG_ON(!list_empty(&cc.migratepages));
+
+ /*
+ * Make sure we hide capture control first before we read the captured
+ * page pointer, otherwise an interrupt could free and capture a page
+ * and we would leak it.
+ */
+ WRITE_ONCE(current->capture_control, NULL);
+ *capture = READ_ONCE(capc.page);
+
+ return ret;
+}
+
+int sysctl_extfrag_threshold = 500;
+
+/**
+ * try_to_compact_pages - Direct compact to satisfy a high-order allocation
+ * @gfp_mask: The GFP mask of the current allocation
+ * @order: The order of the current allocation
+ * @alloc_flags: The allocation flags of the current allocation
+ * @ac: The context of current allocation
+ * @prio: Determines how hard direct compaction should try to succeed
+ * @capture: Pointer to free page created by compaction will be stored here
+ *
+ * This is the main entry point for direct page compaction.
+ */
+enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
+ unsigned int alloc_flags, const struct alloc_context *ac,
+ enum compact_priority prio, struct page **capture)
+{
+ int may_perform_io = gfp_mask & __GFP_IO;
+ struct zoneref *z;
+ struct zone *zone;
+ enum compact_result rc = COMPACT_SKIPPED;
+
+ /*
+ * Check if the GFP flags allow compaction - GFP_NOIO is really
+ * tricky context because the migration might require IO
+ */
+ if (!may_perform_io)
+ return COMPACT_SKIPPED;
+
+ trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio);
+
+ /* Compact each zone in the list */
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
+ ac->highest_zoneidx, ac->nodemask) {
+ enum compact_result status;
+
+ if (prio > MIN_COMPACT_PRIORITY
+ && compaction_deferred(zone, order)) {
+ rc = max_t(enum compact_result, COMPACT_DEFERRED, rc);
+ continue;
+ }
+
+ status = compact_zone_order(zone, order, gfp_mask, prio,
+ alloc_flags, ac->highest_zoneidx, capture);
+ rc = max(status, rc);
+
+ /* The allocation should succeed, stop compacting */
+ if (status == COMPACT_SUCCESS) {
+ /*
+ * We think the allocation will succeed in this zone,
+ * but it is not certain, hence the false. The caller
+ * will repeat this with true if allocation indeed
+ * succeeds in this zone.
+ */
+ compaction_defer_reset(zone, order, false);
+
+ break;
+ }
+
+ if (prio != COMPACT_PRIO_ASYNC && (status == COMPACT_COMPLETE ||
+ status == COMPACT_PARTIAL_SKIPPED))
+ /*
+ * We think that allocation won't succeed in this zone
+ * so we defer compaction there. If it ends up
+ * succeeding after all, it will be reset.
+ */
+ defer_compaction(zone, order);
+
+ /*
+ * We might have stopped compacting due to need_resched() in
+ * async compaction, or due to a fatal signal detected. In that
+ * case do not try further zones
+ */
+ if ((prio == COMPACT_PRIO_ASYNC && need_resched())
+ || fatal_signal_pending(current))
+ break;
+ }
+
+ return rc;
+}
+
+/*
+ * Compact all zones within a node till each zone's fragmentation score
+ * reaches within proactive compaction thresholds (as determined by the
+ * proactiveness tunable).
+ *
+ * It is possible that the function returns before reaching score targets
+ * due to various back-off conditions, such as, contention on per-node or
+ * per-zone locks.
+ */
+static void proactive_compact_node(pg_data_t *pgdat)
+{
+ int zoneid;
+ struct zone *zone;
+ struct compact_control cc = {
+ .order = -1,
+ .mode = MIGRATE_SYNC_LIGHT,
+ .ignore_skip_hint = true,
+ .whole_zone = true,
+ .gfp_mask = GFP_KERNEL,
+ .proactive_compaction = true,
+ };
+
+ for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
+ zone = &pgdat->node_zones[zoneid];
+ if (!populated_zone(zone))
+ continue;
+
+ cc.zone = zone;
+
+ compact_zone(&cc, NULL);
+
+ VM_BUG_ON(!list_empty(&cc.freepages));
+ VM_BUG_ON(!list_empty(&cc.migratepages));
+ }
+}
+
+/* Compact all zones within a node */
+static void compact_node(int nid)
+{
+ pg_data_t *pgdat = NODE_DATA(nid);
+ int zoneid;
+ struct zone *zone;
+ struct compact_control cc = {
+ .order = -1,
+ .mode = MIGRATE_SYNC,
+ .ignore_skip_hint = true,
+ .whole_zone = true,
+ .gfp_mask = GFP_KERNEL,
+ };
+
+
+ for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
+
+ zone = &pgdat->node_zones[zoneid];
+ if (!populated_zone(zone))
+ continue;
+
+ cc.zone = zone;
+
+ compact_zone(&cc, NULL);
+
+ VM_BUG_ON(!list_empty(&cc.freepages));
+ VM_BUG_ON(!list_empty(&cc.migratepages));
+ }
+}
+
+/* Compact all nodes in the system */
+static void compact_nodes(void)
+{
+ int nid;
+
+ /* Flush pending updates to the LRU lists */
+ lru_add_drain_all();
+
+ for_each_online_node(nid)
+ compact_node(nid);
+}
+
+/* The written value is actually unused, all memory is compacted */
+int sysctl_compact_memory;
+
+/*
+ * Tunable for proactive compaction. It determines how
+ * aggressively the kernel should compact memory in the
+ * background. It takes values in the range [0, 100].
+ */
+unsigned int __read_mostly sysctl_compaction_proactiveness = 20;
+
+/*
+ * This is the entry point for compacting all nodes via
+ * /proc/sys/vm/compact_memory
+ */
+int sysctl_compaction_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *length, loff_t *ppos)
+{
+ if (write)
+ compact_nodes();
+
+ return 0;
+}
+
+#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
+static ssize_t sysfs_compact_node(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ int nid = dev->id;
+
+ if (nid >= 0 && nid < nr_node_ids && node_online(nid)) {
+ /* Flush pending updates to the LRU lists */
+ lru_add_drain_all();
+
+ compact_node(nid);
+ }
+
+ return count;
+}
+static DEVICE_ATTR(compact, 0200, NULL, sysfs_compact_node);
+
+int compaction_register_node(struct node *node)
+{
+ return device_create_file(&node->dev, &dev_attr_compact);
+}
+
+void compaction_unregister_node(struct node *node)
+{
+ return device_remove_file(&node->dev, &dev_attr_compact);
+}
+#endif /* CONFIG_SYSFS && CONFIG_NUMA */
+
+static inline bool kcompactd_work_requested(pg_data_t *pgdat)
+{
+ return pgdat->kcompactd_max_order > 0 || kthread_should_stop();
+}
+
+static bool kcompactd_node_suitable(pg_data_t *pgdat)
+{
+ int zoneid;
+ struct zone *zone;
+ enum zone_type highest_zoneidx = pgdat->kcompactd_highest_zoneidx;
+
+ for (zoneid = 0; zoneid <= highest_zoneidx; zoneid++) {
+ zone = &pgdat->node_zones[zoneid];
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0,
+ highest_zoneidx) == COMPACT_CONTINUE)
+ return true;
+ }
+
+ return false;
+}
+
+static void kcompactd_do_work(pg_data_t *pgdat)
+{
+ /*
+ * With no special task, compact all zones so that a page of requested
+ * order is allocatable.
+ */
+ int zoneid;
+ struct zone *zone;
+ struct compact_control cc = {
+ .order = pgdat->kcompactd_max_order,
+ .search_order = pgdat->kcompactd_max_order,
+ .highest_zoneidx = pgdat->kcompactd_highest_zoneidx,
+ .mode = MIGRATE_SYNC_LIGHT,
+ .ignore_skip_hint = false,
+ .gfp_mask = GFP_KERNEL,
+ };
+ trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
+ cc.highest_zoneidx);
+ count_compact_event(KCOMPACTD_WAKE);
+
+ for (zoneid = 0; zoneid <= cc.highest_zoneidx; zoneid++) {
+ int status;
+
+ zone = &pgdat->node_zones[zoneid];
+ if (!populated_zone(zone))
+ continue;
+
+ if (compaction_deferred(zone, cc.order))
+ continue;
+
+ if (compaction_suitable(zone, cc.order, 0, zoneid) !=
+ COMPACT_CONTINUE)
+ continue;
+
+ if (kthread_should_stop())
+ return;
+
+ cc.zone = zone;
+ status = compact_zone(&cc, NULL);
+
+ if (status == COMPACT_SUCCESS) {
+ compaction_defer_reset(zone, cc.order, false);
+ } else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) {
+ /*
+ * Buddy pages may become stranded on pcps that could
+ * otherwise coalesce on the zone's free area for
+ * order >= cc.order. This is ratelimited by the
+ * upcoming deferral.
+ */
+ drain_all_pages(zone);
+
+ /*
+ * We use sync migration mode here, so we defer like
+ * sync direct compaction does.
+ */
+ defer_compaction(zone, cc.order);
+ }
+
+ count_compact_events(KCOMPACTD_MIGRATE_SCANNED,
+ cc.total_migrate_scanned);
+ count_compact_events(KCOMPACTD_FREE_SCANNED,
+ cc.total_free_scanned);
+
+ VM_BUG_ON(!list_empty(&cc.freepages));
+ VM_BUG_ON(!list_empty(&cc.migratepages));
+ }
+
+ /*
+ * Regardless of success, we are done until woken up next. But remember
+ * the requested order/highest_zoneidx in case it was higher/tighter
+ * than our current ones
+ */
+ if (pgdat->kcompactd_max_order <= cc.order)
+ pgdat->kcompactd_max_order = 0;
+ if (pgdat->kcompactd_highest_zoneidx >= cc.highest_zoneidx)
+ pgdat->kcompactd_highest_zoneidx = pgdat->nr_zones - 1;
+}
+
+void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx)
+{
+ if (!order)
+ return;
+
+ if (pgdat->kcompactd_max_order < order)
+ pgdat->kcompactd_max_order = order;
+
+ if (pgdat->kcompactd_highest_zoneidx > highest_zoneidx)
+ pgdat->kcompactd_highest_zoneidx = highest_zoneidx;
+
+ /*
+ * Pairs with implicit barrier in wait_event_freezable()
+ * such that wakeups are not missed.
+ */
+ if (!wq_has_sleeper(&pgdat->kcompactd_wait))
+ return;
+
+ if (!kcompactd_node_suitable(pgdat))
+ return;
+
+ trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order,
+ highest_zoneidx);
+ wake_up_interruptible(&pgdat->kcompactd_wait);
+}
+
+/*
+ * The background compaction daemon, started as a kernel thread
+ * from the init process.
+ */
+static int kcompactd(void *p)
+{
+ pg_data_t *pgdat = (pg_data_t*)p;
+ struct task_struct *tsk = current;
+ unsigned int proactive_defer = 0;
+
+ const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+
+ if (!cpumask_empty(cpumask))
+ set_cpus_allowed_ptr(tsk, cpumask);
+
+ set_freezable();
+
+ pgdat->kcompactd_max_order = 0;
+ pgdat->kcompactd_highest_zoneidx = pgdat->nr_zones - 1;
+
+ while (!kthread_should_stop()) {
+ unsigned long pflags;
+
+ trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
+ if (wait_event_freezable_timeout(pgdat->kcompactd_wait,
+ kcompactd_work_requested(pgdat),
+ msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC))) {
+
+ psi_memstall_enter(&pflags);
+ kcompactd_do_work(pgdat);
+ psi_memstall_leave(&pflags);
+ continue;
+ }
+
+ /* kcompactd wait timeout */
+ if (should_proactive_compact_node(pgdat)) {
+ unsigned int prev_score, score;
+
+ if (proactive_defer) {
+ proactive_defer--;
+ continue;
+ }
+ prev_score = fragmentation_score_node(pgdat);
+ proactive_compact_node(pgdat);
+ score = fragmentation_score_node(pgdat);
+ /*
+ * Defer proactive compaction if the fragmentation
+ * score did not go down i.e. no progress made.
+ */
+ proactive_defer = score < prev_score ?
+ 0 : 1 << COMPACT_MAX_DEFER_SHIFT;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * This kcompactd start function will be called by init and node-hot-add.
+ * On node-hot-add, kcompactd will moved to proper cpus if cpus are hot-added.
+ */
+int kcompactd_run(int nid)
+{
+ pg_data_t *pgdat = NODE_DATA(nid);
+ int ret = 0;
+
+ if (pgdat->kcompactd)
+ return 0;
+
+ pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid);
+ if (IS_ERR(pgdat->kcompactd)) {
+ pr_err("Failed to start kcompactd on node %d\n", nid);
+ ret = PTR_ERR(pgdat->kcompactd);
+ pgdat->kcompactd = NULL;
+ }
+ return ret;
+}
+
+/*
+ * Called by memory hotplug when all memory in a node is offlined. Caller must
+ * hold mem_hotplug_begin/end().
+ */
+void kcompactd_stop(int nid)
+{
+ struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd;
+
+ if (kcompactd) {
+ kthread_stop(kcompactd);
+ NODE_DATA(nid)->kcompactd = NULL;
+ }
+}
+
+/*
+ * It's optimal to keep kcompactd on the same CPUs as their memory, but
+ * not required for correctness. So if the last cpu in a node goes
+ * away, we get changed to run anywhere: as the first one comes back,
+ * restore their cpu bindings.
+ */
+static int kcompactd_cpu_online(unsigned int cpu)
+{
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY) {
+ pg_data_t *pgdat = NODE_DATA(nid);
+ const struct cpumask *mask;
+
+ mask = cpumask_of_node(pgdat->node_id);
+
+ if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
+ /* One of our CPUs online: restore mask */
+ set_cpus_allowed_ptr(pgdat->kcompactd, mask);
+ }
+ return 0;
+}
+
+static int __init kcompactd_init(void)
+{
+ int nid;
+ int ret;
+
+ ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+ "mm/compaction:online",
+ kcompactd_cpu_online, NULL);
+ if (ret < 0) {
+ pr_err("kcompactd: failed to register hotplug callbacks.\n");
+ return ret;
+ }
+
+ for_each_node_state(nid, N_MEMORY)
+ kcompactd_run(nid);
+ return 0;
+}
+subsys_initcall(kcompactd_init)
+
+#endif /* CONFIG_COMPACTION */
diff --git a/mm/debug.c b/mm/debug.c
new file mode 100644
index 000000000..ccca576b2
--- /dev/null
+++ b/mm/debug.c
@@ -0,0 +1,324 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * mm/debug.c
+ *
+ * mm/ specific debug routines.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/trace_events.h>
+#include <linux/memcontrol.h>
+#include <trace/events/mmflags.h>
+#include <linux/migrate.h>
+#include <linux/page_owner.h>
+#include <linux/ctype.h>
+
+#include "internal.h"
+
+const char *migrate_reason_names[MR_TYPES] = {
+ "compaction",
+ "memory_failure",
+ "memory_hotplug",
+ "syscall_or_cpuset",
+ "mempolicy_mbind",
+ "numa_misplaced",
+ "cma",
+};
+
+const struct trace_print_flags pageflag_names[] = {
+ __def_pageflag_names,
+ {0, NULL}
+};
+
+const struct trace_print_flags gfpflag_names[] = {
+ __def_gfpflag_names,
+ {0, NULL}
+};
+
+const struct trace_print_flags vmaflag_names[] = {
+ __def_vmaflag_names,
+ {0, NULL}
+};
+
+void __dump_page(struct page *page, const char *reason)
+{
+ struct page *head = compound_head(page);
+ struct address_space *mapping;
+ bool page_poisoned = PagePoisoned(page);
+ bool compound = PageCompound(page);
+ /*
+ * Accessing the pageblock without the zone lock. It could change to
+ * "isolate" again in the meantime, but since we are just dumping the
+ * state for debugging, it should be fine to accept a bit of
+ * inaccuracy here due to racing.
+ */
+ bool page_cma = is_migrate_cma_page(page);
+ int mapcount;
+ char *type = "";
+
+ /*
+ * If struct page is poisoned don't access Page*() functions as that
+ * leads to recursive loop. Page*() check for poisoned pages, and calls
+ * dump_page() when detected.
+ */
+ if (page_poisoned) {
+ pr_warn("page:%px is uninitialized and poisoned", page);
+ goto hex_only;
+ }
+
+ if (page < head || (page >= head + MAX_ORDER_NR_PAGES)) {
+ /*
+ * Corrupt page, so we cannot call page_mapping. Instead, do a
+ * safe subset of the steps that page_mapping() does. Caution:
+ * this will be misleading for tail pages, PageSwapCache pages,
+ * and potentially other situations. (See the page_mapping()
+ * implementation for what's missing here.)
+ */
+ unsigned long tmp = (unsigned long)page->mapping;
+
+ if (tmp & PAGE_MAPPING_ANON)
+ mapping = NULL;
+ else
+ mapping = (void *)(tmp & ~PAGE_MAPPING_FLAGS);
+ head = page;
+ compound = false;
+ } else {
+ mapping = page_mapping(page);
+ }
+
+ /*
+ * Avoid VM_BUG_ON() in page_mapcount().
+ * page->_mapcount space in struct page is used by sl[aou]b pages to
+ * encode own info.
+ */
+ mapcount = PageSlab(head) ? 0 : page_mapcount(page);
+
+ pr_warn("page:%p refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n",
+ page, page_ref_count(head), mapcount, mapping,
+ page_to_pgoff(page), page_to_pfn(page));
+ if (compound) {
+ if (hpage_pincount_available(page)) {
+ pr_warn("head:%p order:%u compound_mapcount:%d compound_pincount:%d\n",
+ head, compound_order(head),
+ head_compound_mapcount(head),
+ head_compound_pincount(head));
+ } else {
+ pr_warn("head:%p order:%u compound_mapcount:%d\n",
+ head, compound_order(head),
+ head_compound_mapcount(head));
+ }
+ }
+ if (PageKsm(page))
+ type = "ksm ";
+ else if (PageAnon(page))
+ type = "anon ";
+ else if (mapping) {
+ struct inode *host;
+ const struct address_space_operations *a_ops;
+ struct hlist_node *dentry_first;
+ struct dentry *dentry_ptr;
+ struct dentry dentry;
+ unsigned long ino;
+
+ /*
+ * mapping can be invalid pointer and we don't want to crash
+ * accessing it, so probe everything depending on it carefully
+ */
+ if (get_kernel_nofault(host, &mapping->host) ||
+ get_kernel_nofault(a_ops, &mapping->a_ops)) {
+ pr_warn("failed to read mapping contents, not a valid kernel address?\n");
+ goto out_mapping;
+ }
+
+ if (!host) {
+ pr_warn("aops:%ps\n", a_ops);
+ goto out_mapping;
+ }
+
+ if (get_kernel_nofault(dentry_first, &host->i_dentry.first) ||
+ get_kernel_nofault(ino, &host->i_ino)) {
+ pr_warn("aops:%ps with invalid host inode %px\n",
+ a_ops, host);
+ goto out_mapping;
+ }
+
+ if (!dentry_first) {
+ pr_warn("aops:%ps ino:%lx\n", a_ops, ino);
+ goto out_mapping;
+ }
+
+ dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias);
+ if (get_kernel_nofault(dentry, dentry_ptr)) {
+ pr_warn("aops:%ps ino:%lx with invalid dentry %px\n",
+ a_ops, ino, dentry_ptr);
+ } else {
+ /*
+ * if dentry is corrupted, the %pd handler may still
+ * crash, but it's unlikely that we reach here with a
+ * corrupted struct page
+ */
+ pr_warn("aops:%ps ino:%lx dentry name:\"%pd\"\n",
+ a_ops, ino, &dentry);
+ }
+ }
+out_mapping:
+ BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);
+
+ pr_warn("%sflags: %#lx(%pGp)%s\n", type, head->flags, &head->flags,
+ page_cma ? " CMA" : "");
+
+hex_only:
+ print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32,
+ sizeof(unsigned long), page,
+ sizeof(struct page), false);
+ if (head != page)
+ print_hex_dump(KERN_WARNING, "head: ", DUMP_PREFIX_NONE, 32,
+ sizeof(unsigned long), head,
+ sizeof(struct page), false);
+
+ if (reason)
+ pr_warn("page dumped because: %s\n", reason);
+
+#ifdef CONFIG_MEMCG
+ if (!page_poisoned && page->mem_cgroup)
+ pr_warn("page->mem_cgroup:%px\n", page->mem_cgroup);
+#endif
+}
+
+void dump_page(struct page *page, const char *reason)
+{
+ __dump_page(page, reason);
+ dump_page_owner(page);
+}
+EXPORT_SYMBOL(dump_page);
+
+#ifdef CONFIG_DEBUG_VM
+
+void dump_vma(const struct vm_area_struct *vma)
+{
+ pr_emerg("vma %px start %px end %px\n"
+ "next %px prev %px mm %px\n"
+ "prot %lx anon_vma %px vm_ops %px\n"
+ "pgoff %lx file %px private_data %px\n"
+ "flags: %#lx(%pGv)\n",
+ vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next,
+ vma->vm_prev, vma->vm_mm,
+ (unsigned long)pgprot_val(vma->vm_page_prot),
+ vma->anon_vma, vma->vm_ops, vma->vm_pgoff,
+ vma->vm_file, vma->vm_private_data,
+ vma->vm_flags, &vma->vm_flags);
+}
+EXPORT_SYMBOL(dump_vma);
+
+void dump_mm(const struct mm_struct *mm)
+{
+ pr_emerg("mm %px mmap %px seqnum %llu task_size %lu\n"
+#ifdef CONFIG_MMU
+ "get_unmapped_area %px\n"
+#endif
+ "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
+ "pgd %px mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n"
+ "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
+ "pinned_vm %llx data_vm %lx exec_vm %lx stack_vm %lx\n"
+ "start_code %lx end_code %lx start_data %lx end_data %lx\n"
+ "start_brk %lx brk %lx start_stack %lx\n"
+ "arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
+ "binfmt %px flags %lx core_state %px\n"
+#ifdef CONFIG_AIO
+ "ioctx_table %px\n"
+#endif
+#ifdef CONFIG_MEMCG
+ "owner %px "
+#endif
+ "exe_file %px\n"
+#ifdef CONFIG_MMU_NOTIFIER
+ "notifier_subscriptions %px\n"
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+ "numa_next_scan %lu numa_scan_offset %lu numa_scan_seq %d\n"
+#endif
+ "tlb_flush_pending %d\n"
+ "def_flags: %#lx(%pGv)\n",
+
+ mm, mm->mmap, (long long) mm->vmacache_seqnum, mm->task_size,
+#ifdef CONFIG_MMU
+ mm->get_unmapped_area,
+#endif
+ mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end,
+ mm->pgd, atomic_read(&mm->mm_users),
+ atomic_read(&mm->mm_count),
+ mm_pgtables_bytes(mm),
+ mm->map_count,
+ mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
+ (u64)atomic64_read(&mm->pinned_vm),
+ mm->data_vm, mm->exec_vm, mm->stack_vm,
+ mm->start_code, mm->end_code, mm->start_data, mm->end_data,
+ mm->start_brk, mm->brk, mm->start_stack,
+ mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
+ mm->binfmt, mm->flags, mm->core_state,
+#ifdef CONFIG_AIO
+ mm->ioctx_table,
+#endif
+#ifdef CONFIG_MEMCG
+ mm->owner,
+#endif
+ mm->exe_file,
+#ifdef CONFIG_MMU_NOTIFIER
+ mm->notifier_subscriptions,
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+ mm->numa_next_scan, mm->numa_scan_offset, mm->numa_scan_seq,
+#endif
+ atomic_read(&mm->tlb_flush_pending),
+ mm->def_flags, &mm->def_flags
+ );
+}
+
+static bool page_init_poisoning __read_mostly = true;
+
+static int __init setup_vm_debug(char *str)
+{
+ bool __page_init_poisoning = true;
+
+ /*
+ * Calling vm_debug with no arguments is equivalent to requesting
+ * to enable all debugging options we can control.
+ */
+ if (*str++ != '=' || !*str)
+ goto out;
+
+ __page_init_poisoning = false;
+ if (*str == '-')
+ goto out;
+
+ while (*str) {
+ switch (tolower(*str)) {
+ case'p':
+ __page_init_poisoning = true;
+ break;
+ default:
+ pr_err("vm_debug option '%c' unknown. skipped\n",
+ *str);
+ }
+
+ str++;
+ }
+out:
+ if (page_init_poisoning && !__page_init_poisoning)
+ pr_warn("Page struct poisoning disabled by kernel command line option 'vm_debug'\n");
+
+ page_init_poisoning = __page_init_poisoning;
+
+ return 1;
+}
+__setup("vm_debug", setup_vm_debug);
+
+void page_init_poison(struct page *page, size_t size)
+{
+ if (page_init_poisoning)
+ memset(page, PAGE_POISON_PATTERN, size);
+}
+EXPORT_SYMBOL_GPL(page_init_poison);
+#endif /* CONFIG_DEBUG_VM */
diff --git a/mm/debug_page_ref.c b/mm/debug_page_ref.c
new file mode 100644
index 000000000..f3b2c9d3e
--- /dev/null
+++ b/mm/debug_page_ref.c
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/mm_types.h>
+#include <linux/tracepoint.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/page_ref.h>
+
+void __page_ref_set(struct page *page, int v)
+{
+ trace_page_ref_set(page, v);
+}
+EXPORT_SYMBOL(__page_ref_set);
+EXPORT_TRACEPOINT_SYMBOL(page_ref_set);
+
+void __page_ref_mod(struct page *page, int v)
+{
+ trace_page_ref_mod(page, v);
+}
+EXPORT_SYMBOL(__page_ref_mod);
+EXPORT_TRACEPOINT_SYMBOL(page_ref_mod);
+
+void __page_ref_mod_and_test(struct page *page, int v, int ret)
+{
+ trace_page_ref_mod_and_test(page, v, ret);
+}
+EXPORT_SYMBOL(__page_ref_mod_and_test);
+EXPORT_TRACEPOINT_SYMBOL(page_ref_mod_and_test);
+
+void __page_ref_mod_and_return(struct page *page, int v, int ret)
+{
+ trace_page_ref_mod_and_return(page, v, ret);
+}
+EXPORT_SYMBOL(__page_ref_mod_and_return);
+EXPORT_TRACEPOINT_SYMBOL(page_ref_mod_and_return);
+
+void __page_ref_mod_unless(struct page *page, int v, int u)
+{
+ trace_page_ref_mod_unless(page, v, u);
+}
+EXPORT_SYMBOL(__page_ref_mod_unless);
+EXPORT_TRACEPOINT_SYMBOL(page_ref_mod_unless);
+
+void __page_ref_freeze(struct page *page, int v, int ret)
+{
+ trace_page_ref_freeze(page, v, ret);
+}
+EXPORT_SYMBOL(__page_ref_freeze);
+EXPORT_TRACEPOINT_SYMBOL(page_ref_freeze);
+
+void __page_ref_unfreeze(struct page *page, int v)
+{
+ trace_page_ref_unfreeze(page, v);
+}
+EXPORT_SYMBOL(__page_ref_unfreeze);
+EXPORT_TRACEPOINT_SYMBOL(page_ref_unfreeze);
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
new file mode 100644
index 000000000..d6fbf28eb
--- /dev/null
+++ b/mm/debug_vm_pgtable.c
@@ -0,0 +1,1147 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * This kernel test validates architecture page table helpers and
+ * accessors and helps in verifying their continued compliance with
+ * expected generic MM semantics.
+ *
+ * Copyright (C) 2019 ARM Ltd.
+ *
+ * Author: Anshuman Khandual <anshuman.khandual@arm.com>
+ */
+#define pr_fmt(fmt) "debug_vm_pgtable: [%-25s]: " fmt, __func__
+
+#include <linux/gfp.h>
+#include <linux/highmem.h>
+#include <linux/hugetlb.h>
+#include <linux/kernel.h>
+#include <linux/kconfig.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/mm_types.h>
+#include <linux/module.h>
+#include <linux/pfn_t.h>
+#include <linux/printk.h>
+#include <linux/pgtable.h>
+#include <linux/random.h>
+#include <linux/spinlock.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/start_kernel.h>
+#include <linux/sched/mm.h>
+#include <linux/io.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+
+/*
+ * Please refer Documentation/vm/arch_pgtable_helpers.rst for the semantics
+ * expectations that are being validated here. All future changes in here
+ * or the documentation need to be in sync.
+ */
+
+#define VMFLAGS (VM_READ|VM_WRITE|VM_EXEC)
+
+/*
+ * On s390 platform, the lower 4 bits are used to identify given page table
+ * entry type. But these bits might affect the ability to clear entries with
+ * pxx_clear() because of how dynamic page table folding works on s390. So
+ * while loading up the entries do not change the lower 4 bits. It does not
+ * have affect any other platform. Also avoid the 62nd bit on ppc64 that is
+ * used to mark a pte entry.
+ */
+#define S390_SKIP_MASK GENMASK(3, 0)
+#if __BITS_PER_LONG == 64
+#define PPC64_SKIP_MASK GENMASK(62, 62)
+#else
+#define PPC64_SKIP_MASK 0x0
+#endif
+#define ARCH_SKIP_MASK (S390_SKIP_MASK | PPC64_SKIP_MASK)
+#define RANDOM_ORVALUE (GENMASK(BITS_PER_LONG - 1, 0) & ~ARCH_SKIP_MASK)
+#define RANDOM_NZVALUE GENMASK(7, 0)
+
+static void __init pte_basic_tests(unsigned long pfn, int idx)
+{
+ pgprot_t prot = protection_map[idx];
+ pte_t pte = pfn_pte(pfn, prot);
+ unsigned long val = idx, *ptr = &val;
+
+ pr_debug("Validating PTE basic (%pGv)\n", ptr);
+
+ /*
+ * This test needs to be executed after the given page table entry
+ * is created with pfn_pte() to make sure that protection_map[idx]
+ * does not have the dirty bit enabled from the beginning. This is
+ * important for platforms like arm64 where (!PTE_RDONLY) indicate
+ * dirty bit being set.
+ */
+ WARN_ON(pte_dirty(pte_wrprotect(pte)));
+
+ WARN_ON(!pte_same(pte, pte));
+ WARN_ON(!pte_young(pte_mkyoung(pte_mkold(pte))));
+ WARN_ON(!pte_dirty(pte_mkdirty(pte_mkclean(pte))));
+ WARN_ON(!pte_write(pte_mkwrite(pte_wrprotect(pte))));
+ WARN_ON(pte_young(pte_mkold(pte_mkyoung(pte))));
+ WARN_ON(pte_dirty(pte_mkclean(pte_mkdirty(pte))));
+ WARN_ON(pte_write(pte_wrprotect(pte_mkwrite(pte))));
+ WARN_ON(pte_dirty(pte_wrprotect(pte_mkclean(pte))));
+ WARN_ON(!pte_dirty(pte_wrprotect(pte_mkdirty(pte))));
+}
+
+static void __init pte_advanced_tests(struct mm_struct *mm,
+ struct vm_area_struct *vma, pte_t *ptep,
+ unsigned long pfn, unsigned long vaddr,
+ pgprot_t prot)
+{
+ pte_t pte = pfn_pte(pfn, prot);
+
+ /*
+ * Architectures optimize set_pte_at by avoiding TLB flush.
+ * This requires set_pte_at to be not used to update an
+ * existing pte entry. Clear pte before we do set_pte_at
+ */
+
+ pr_debug("Validating PTE advanced\n");
+ pte = pfn_pte(pfn, prot);
+ set_pte_at(mm, vaddr, ptep, pte);
+ ptep_set_wrprotect(mm, vaddr, ptep);
+ pte = ptep_get(ptep);
+ WARN_ON(pte_write(pte));
+ ptep_get_and_clear(mm, vaddr, ptep);
+ pte = ptep_get(ptep);
+ WARN_ON(!pte_none(pte));
+
+ pte = pfn_pte(pfn, prot);
+ pte = pte_wrprotect(pte);
+ pte = pte_mkclean(pte);
+ set_pte_at(mm, vaddr, ptep, pte);
+ pte = pte_mkwrite(pte);
+ pte = pte_mkdirty(pte);
+ ptep_set_access_flags(vma, vaddr, ptep, pte, 1);
+ pte = ptep_get(ptep);
+ WARN_ON(!(pte_write(pte) && pte_dirty(pte)));
+ ptep_get_and_clear_full(mm, vaddr, ptep, 1);
+ pte = ptep_get(ptep);
+ WARN_ON(!pte_none(pte));
+
+ pte = pfn_pte(pfn, prot);
+ pte = pte_mkyoung(pte);
+ set_pte_at(mm, vaddr, ptep, pte);
+ ptep_test_and_clear_young(vma, vaddr, ptep);
+ pte = ptep_get(ptep);
+ WARN_ON(pte_young(pte));
+
+ ptep_get_and_clear_full(mm, vaddr, ptep, 1);
+}
+
+static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot)
+{
+ pte_t pte = pfn_pte(pfn, prot);
+
+ if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
+ return;
+
+ pr_debug("Validating PTE saved write\n");
+ WARN_ON(!pte_savedwrite(pte_mk_savedwrite(pte_clear_savedwrite(pte))));
+ WARN_ON(pte_savedwrite(pte_clear_savedwrite(pte_mk_savedwrite(pte))));
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void __init pmd_basic_tests(unsigned long pfn, int idx)
+{
+ pgprot_t prot = protection_map[idx];
+ unsigned long val = idx, *ptr = &val;
+ pmd_t pmd;
+
+ if (!has_transparent_hugepage())
+ return;
+
+ pr_debug("Validating PMD basic (%pGv)\n", ptr);
+ pmd = pfn_pmd(pfn, prot);
+
+ /*
+ * This test needs to be executed after the given page table entry
+ * is created with pfn_pmd() to make sure that protection_map[idx]
+ * does not have the dirty bit enabled from the beginning. This is
+ * important for platforms like arm64 where (!PTE_RDONLY) indicate
+ * dirty bit being set.
+ */
+ WARN_ON(pmd_dirty(pmd_wrprotect(pmd)));
+
+
+ WARN_ON(!pmd_same(pmd, pmd));
+ WARN_ON(!pmd_young(pmd_mkyoung(pmd_mkold(pmd))));
+ WARN_ON(!pmd_dirty(pmd_mkdirty(pmd_mkclean(pmd))));
+ WARN_ON(!pmd_write(pmd_mkwrite(pmd_wrprotect(pmd))));
+ WARN_ON(pmd_young(pmd_mkold(pmd_mkyoung(pmd))));
+ WARN_ON(pmd_dirty(pmd_mkclean(pmd_mkdirty(pmd))));
+ WARN_ON(pmd_write(pmd_wrprotect(pmd_mkwrite(pmd))));
+ WARN_ON(pmd_dirty(pmd_wrprotect(pmd_mkclean(pmd))));
+ WARN_ON(!pmd_dirty(pmd_wrprotect(pmd_mkdirty(pmd))));
+ /*
+ * A huge page does not point to next level page table
+ * entry. Hence this must qualify as pmd_bad().
+ */
+ WARN_ON(!pmd_bad(pmd_mkhuge(pmd)));
+}
+
+static void __init pmd_advanced_tests(struct mm_struct *mm,
+ struct vm_area_struct *vma, pmd_t *pmdp,
+ unsigned long pfn, unsigned long vaddr,
+ pgprot_t prot, pgtable_t pgtable)
+{
+ pmd_t pmd;
+
+ if (!has_transparent_hugepage())
+ return;
+
+ pr_debug("Validating PMD advanced\n");
+ /* Align the address wrt HPAGE_PMD_SIZE */
+ vaddr &= HPAGE_PMD_MASK;
+
+ pgtable_trans_huge_deposit(mm, pmdp, pgtable);
+
+ pmd = pfn_pmd(pfn, prot);
+ set_pmd_at(mm, vaddr, pmdp, pmd);
+ pmdp_set_wrprotect(mm, vaddr, pmdp);
+ pmd = READ_ONCE(*pmdp);
+ WARN_ON(pmd_write(pmd));
+ pmdp_huge_get_and_clear(mm, vaddr, pmdp);
+ pmd = READ_ONCE(*pmdp);
+ WARN_ON(!pmd_none(pmd));
+
+ pmd = pfn_pmd(pfn, prot);
+ pmd = pmd_wrprotect(pmd);
+ pmd = pmd_mkclean(pmd);
+ set_pmd_at(mm, vaddr, pmdp, pmd);
+ pmd = pmd_mkwrite(pmd);
+ pmd = pmd_mkdirty(pmd);
+ pmdp_set_access_flags(vma, vaddr, pmdp, pmd, 1);
+ pmd = READ_ONCE(*pmdp);
+ WARN_ON(!(pmd_write(pmd) && pmd_dirty(pmd)));
+ pmdp_huge_get_and_clear_full(vma, vaddr, pmdp, 1);
+ pmd = READ_ONCE(*pmdp);
+ WARN_ON(!pmd_none(pmd));
+
+ pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
+ pmd = pmd_mkyoung(pmd);
+ set_pmd_at(mm, vaddr, pmdp, pmd);
+ pmdp_test_and_clear_young(vma, vaddr, pmdp);
+ pmd = READ_ONCE(*pmdp);
+ WARN_ON(pmd_young(pmd));
+
+ /* Clear the pte entries */
+ pmdp_huge_get_and_clear(mm, vaddr, pmdp);
+ pgtable = pgtable_trans_huge_withdraw(mm, pmdp);
+}
+
+static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
+{
+ pmd_t pmd;
+
+ if (!has_transparent_hugepage())
+ return;
+
+ pr_debug("Validating PMD leaf\n");
+ pmd = pfn_pmd(pfn, prot);
+
+ /*
+ * PMD based THP is a leaf entry.
+ */
+ pmd = pmd_mkhuge(pmd);
+ WARN_ON(!pmd_leaf(pmd));
+}
+
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
+{
+ pmd_t pmd;
+
+ if (!arch_ioremap_pmd_supported())
+ return;
+
+ pr_debug("Validating PMD huge\n");
+ /*
+ * X86 defined pmd_set_huge() verifies that the given
+ * PMD is not a populated non-leaf entry.
+ */
+ WRITE_ONCE(*pmdp, __pmd(0));
+ WARN_ON(!pmd_set_huge(pmdp, __pfn_to_phys(pfn), prot));
+ WARN_ON(!pmd_clear_huge(pmdp));
+ pmd = READ_ONCE(*pmdp);
+ WARN_ON(!pmd_none(pmd));
+}
+#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
+static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
+
+static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
+{
+ pmd_t pmd;
+
+ if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
+ return;
+
+ if (!has_transparent_hugepage())
+ return;
+
+ pr_debug("Validating PMD saved write\n");
+ pmd = pfn_pmd(pfn, prot);
+ WARN_ON(!pmd_savedwrite(pmd_mk_savedwrite(pmd_clear_savedwrite(pmd))));
+ WARN_ON(pmd_savedwrite(pmd_clear_savedwrite(pmd_mk_savedwrite(pmd))));
+}
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx)
+{
+ pgprot_t prot = protection_map[idx];
+ unsigned long val = idx, *ptr = &val;
+ pud_t pud;
+
+ if (!has_transparent_hugepage())
+ return;
+
+ pr_debug("Validating PUD basic (%pGv)\n", ptr);
+ pud = pfn_pud(pfn, prot);
+
+ /*
+ * This test needs to be executed after the given page table entry
+ * is created with pfn_pud() to make sure that protection_map[idx]
+ * does not have the dirty bit enabled from the beginning. This is
+ * important for platforms like arm64 where (!PTE_RDONLY) indicate
+ * dirty bit being set.
+ */
+ WARN_ON(pud_dirty(pud_wrprotect(pud)));
+
+ WARN_ON(!pud_same(pud, pud));
+ WARN_ON(!pud_young(pud_mkyoung(pud_mkold(pud))));
+ WARN_ON(!pud_dirty(pud_mkdirty(pud_mkclean(pud))));
+ WARN_ON(pud_dirty(pud_mkclean(pud_mkdirty(pud))));
+ WARN_ON(!pud_write(pud_mkwrite(pud_wrprotect(pud))));
+ WARN_ON(pud_write(pud_wrprotect(pud_mkwrite(pud))));
+ WARN_ON(pud_young(pud_mkold(pud_mkyoung(pud))));
+ WARN_ON(pud_dirty(pud_wrprotect(pud_mkclean(pud))));
+ WARN_ON(!pud_dirty(pud_wrprotect(pud_mkdirty(pud))));
+
+ if (mm_pmd_folded(mm))
+ return;
+
+ /*
+ * A huge page does not point to next level page table
+ * entry. Hence this must qualify as pud_bad().
+ */
+ WARN_ON(!pud_bad(pud_mkhuge(pud)));
+}
+
+static void __init pud_advanced_tests(struct mm_struct *mm,
+ struct vm_area_struct *vma, pud_t *pudp,
+ unsigned long pfn, unsigned long vaddr,
+ pgprot_t prot)
+{
+ pud_t pud;
+
+ if (!has_transparent_hugepage())
+ return;
+
+ pr_debug("Validating PUD advanced\n");
+ /* Align the address wrt HPAGE_PUD_SIZE */
+ vaddr &= HPAGE_PUD_MASK;
+
+ pud = pfn_pud(pfn, prot);
+ set_pud_at(mm, vaddr, pudp, pud);
+ pudp_set_wrprotect(mm, vaddr, pudp);
+ pud = READ_ONCE(*pudp);
+ WARN_ON(pud_write(pud));
+
+#ifndef __PAGETABLE_PMD_FOLDED
+ pudp_huge_get_and_clear(mm, vaddr, pudp);
+ pud = READ_ONCE(*pudp);
+ WARN_ON(!pud_none(pud));
+#endif /* __PAGETABLE_PMD_FOLDED */
+ pud = pfn_pud(pfn, prot);
+ pud = pud_wrprotect(pud);
+ pud = pud_mkclean(pud);
+ set_pud_at(mm, vaddr, pudp, pud);
+ pud = pud_mkwrite(pud);
+ pud = pud_mkdirty(pud);
+ pudp_set_access_flags(vma, vaddr, pudp, pud, 1);
+ pud = READ_ONCE(*pudp);
+ WARN_ON(!(pud_write(pud) && pud_dirty(pud)));
+
+#ifndef __PAGETABLE_PMD_FOLDED
+ pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1);
+ pud = READ_ONCE(*pudp);
+ WARN_ON(!pud_none(pud));
+#endif /* __PAGETABLE_PMD_FOLDED */
+
+ pud = pfn_pud(pfn, prot);
+ pud = pud_mkyoung(pud);
+ set_pud_at(mm, vaddr, pudp, pud);
+ pudp_test_and_clear_young(vma, vaddr, pudp);
+ pud = READ_ONCE(*pudp);
+ WARN_ON(pud_young(pud));
+
+ pudp_huge_get_and_clear(mm, vaddr, pudp);
+}
+
+static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
+{
+ pud_t pud;
+
+ if (!has_transparent_hugepage())
+ return;
+
+ pr_debug("Validating PUD leaf\n");
+ pud = pfn_pud(pfn, prot);
+ /*
+ * PUD based THP is a leaf entry.
+ */
+ pud = pud_mkhuge(pud);
+ WARN_ON(!pud_leaf(pud));
+}
+
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
+{
+ pud_t pud;
+
+ if (!arch_ioremap_pud_supported())
+ return;
+
+ pr_debug("Validating PUD huge\n");
+ /*
+ * X86 defined pud_set_huge() verifies that the given
+ * PUD is not a populated non-leaf entry.
+ */
+ WRITE_ONCE(*pudp, __pud(0));
+ WARN_ON(!pud_set_huge(pudp, __pfn_to_phys(pfn), prot));
+ WARN_ON(!pud_clear_huge(pudp));
+ pud = READ_ONCE(*pudp);
+ WARN_ON(!pud_none(pud));
+}
+#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
+static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) { }
+#endif /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
+
+#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx) { }
+static void __init pud_advanced_tests(struct mm_struct *mm,
+ struct vm_area_struct *vma, pud_t *pudp,
+ unsigned long pfn, unsigned long vaddr,
+ pgprot_t prot)
+{
+}
+static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
+{
+}
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+static void __init pmd_basic_tests(unsigned long pfn, int idx) { }
+static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx) { }
+static void __init pmd_advanced_tests(struct mm_struct *mm,
+ struct vm_area_struct *vma, pmd_t *pmdp,
+ unsigned long pfn, unsigned long vaddr,
+ pgprot_t prot, pgtable_t pgtable)
+{
+}
+static void __init pud_advanced_tests(struct mm_struct *mm,
+ struct vm_area_struct *vma, pud_t *pudp,
+ unsigned long pfn, unsigned long vaddr,
+ pgprot_t prot)
+{
+}
+static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
+{
+}
+static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
+{
+}
+static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+static void __init p4d_basic_tests(unsigned long pfn, pgprot_t prot)
+{
+ p4d_t p4d;
+
+ pr_debug("Validating P4D basic\n");
+ memset(&p4d, RANDOM_NZVALUE, sizeof(p4d_t));
+ WARN_ON(!p4d_same(p4d, p4d));
+}
+
+static void __init pgd_basic_tests(unsigned long pfn, pgprot_t prot)
+{
+ pgd_t pgd;
+
+ pr_debug("Validating PGD basic\n");
+ memset(&pgd, RANDOM_NZVALUE, sizeof(pgd_t));
+ WARN_ON(!pgd_same(pgd, pgd));
+}
+
+#ifndef __PAGETABLE_PUD_FOLDED
+static void __init pud_clear_tests(struct mm_struct *mm, pud_t *pudp)
+{
+ pud_t pud = READ_ONCE(*pudp);
+
+ if (mm_pmd_folded(mm))
+ return;
+
+ pr_debug("Validating PUD clear\n");
+ pud = __pud(pud_val(pud) | RANDOM_ORVALUE);
+ WRITE_ONCE(*pudp, pud);
+ pud_clear(pudp);
+ pud = READ_ONCE(*pudp);
+ WARN_ON(!pud_none(pud));
+}
+
+static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp,
+ pmd_t *pmdp)
+{
+ pud_t pud;
+
+ if (mm_pmd_folded(mm))
+ return;
+
+ pr_debug("Validating PUD populate\n");
+ /*
+ * This entry points to next level page table page.
+ * Hence this must not qualify as pud_bad().
+ */
+ pud_populate(mm, pudp, pmdp);
+ pud = READ_ONCE(*pudp);
+ WARN_ON(pud_bad(pud));
+}
+#else /* !__PAGETABLE_PUD_FOLDED */
+static void __init pud_clear_tests(struct mm_struct *mm, pud_t *pudp) { }
+static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp,
+ pmd_t *pmdp)
+{
+}
+#endif /* PAGETABLE_PUD_FOLDED */
+
+#ifndef __PAGETABLE_P4D_FOLDED
+static void __init p4d_clear_tests(struct mm_struct *mm, p4d_t *p4dp)
+{
+ p4d_t p4d = READ_ONCE(*p4dp);
+
+ if (mm_pud_folded(mm))
+ return;
+
+ pr_debug("Validating P4D clear\n");
+ p4d = __p4d(p4d_val(p4d) | RANDOM_ORVALUE);
+ WRITE_ONCE(*p4dp, p4d);
+ p4d_clear(p4dp);
+ p4d = READ_ONCE(*p4dp);
+ WARN_ON(!p4d_none(p4d));
+}
+
+static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp,
+ pud_t *pudp)
+{
+ p4d_t p4d;
+
+ if (mm_pud_folded(mm))
+ return;
+
+ pr_debug("Validating P4D populate\n");
+ /*
+ * This entry points to next level page table page.
+ * Hence this must not qualify as p4d_bad().
+ */
+ pud_clear(pudp);
+ p4d_clear(p4dp);
+ p4d_populate(mm, p4dp, pudp);
+ p4d = READ_ONCE(*p4dp);
+ WARN_ON(p4d_bad(p4d));
+}
+
+static void __init pgd_clear_tests(struct mm_struct *mm, pgd_t *pgdp)
+{
+ pgd_t pgd = READ_ONCE(*pgdp);
+
+ if (mm_p4d_folded(mm))
+ return;
+
+ pr_debug("Validating PGD clear\n");
+ pgd = __pgd(pgd_val(pgd) | RANDOM_ORVALUE);
+ WRITE_ONCE(*pgdp, pgd);
+ pgd_clear(pgdp);
+ pgd = READ_ONCE(*pgdp);
+ WARN_ON(!pgd_none(pgd));
+}
+
+static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp,
+ p4d_t *p4dp)
+{
+ pgd_t pgd;
+
+ if (mm_p4d_folded(mm))
+ return;
+
+ pr_debug("Validating PGD populate\n");
+ /*
+ * This entry points to next level page table page.
+ * Hence this must not qualify as pgd_bad().
+ */
+ p4d_clear(p4dp);
+ pgd_clear(pgdp);
+ pgd_populate(mm, pgdp, p4dp);
+ pgd = READ_ONCE(*pgdp);
+ WARN_ON(pgd_bad(pgd));
+}
+#else /* !__PAGETABLE_P4D_FOLDED */
+static void __init p4d_clear_tests(struct mm_struct *mm, p4d_t *p4dp) { }
+static void __init pgd_clear_tests(struct mm_struct *mm, pgd_t *pgdp) { }
+static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp,
+ pud_t *pudp)
+{
+}
+static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp,
+ p4d_t *p4dp)
+{
+}
+#endif /* PAGETABLE_P4D_FOLDED */
+
+static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep,
+ unsigned long pfn, unsigned long vaddr,
+ pgprot_t prot)
+{
+ pte_t pte = pfn_pte(pfn, prot);
+
+ pr_debug("Validating PTE clear\n");
+#ifndef CONFIG_RISCV
+ pte = __pte(pte_val(pte) | RANDOM_ORVALUE);
+#endif
+ set_pte_at(mm, vaddr, ptep, pte);
+ barrier();
+ pte_clear(mm, vaddr, ptep);
+ pte = ptep_get(ptep);
+ WARN_ON(!pte_none(pte));
+}
+
+static void __init pmd_clear_tests(struct mm_struct *mm, pmd_t *pmdp)
+{
+ pmd_t pmd = READ_ONCE(*pmdp);
+
+ pr_debug("Validating PMD clear\n");
+ pmd = __pmd(pmd_val(pmd) | RANDOM_ORVALUE);
+ WRITE_ONCE(*pmdp, pmd);
+ pmd_clear(pmdp);
+ pmd = READ_ONCE(*pmdp);
+ WARN_ON(!pmd_none(pmd));
+}
+
+static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp,
+ pgtable_t pgtable)
+{
+ pmd_t pmd;
+
+ pr_debug("Validating PMD populate\n");
+ /*
+ * This entry points to next level page table page.
+ * Hence this must not qualify as pmd_bad().
+ */
+ pmd_populate(mm, pmdp, pgtable);
+ pmd = READ_ONCE(*pmdp);
+ WARN_ON(pmd_bad(pmd));
+}
+
+static void __init pte_special_tests(unsigned long pfn, pgprot_t prot)
+{
+ pte_t pte = pfn_pte(pfn, prot);
+
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL))
+ return;
+
+ pr_debug("Validating PTE special\n");
+ WARN_ON(!pte_special(pte_mkspecial(pte)));
+}
+
+static void __init pte_protnone_tests(unsigned long pfn, pgprot_t prot)
+{
+ pte_t pte = pfn_pte(pfn, prot);
+
+ if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
+ return;
+
+ pr_debug("Validating PTE protnone\n");
+ WARN_ON(!pte_protnone(pte));
+ WARN_ON(!pte_present(pte));
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void __init pmd_protnone_tests(unsigned long pfn, pgprot_t prot)
+{
+ pmd_t pmd;
+
+ if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
+ return;
+
+ if (!has_transparent_hugepage())
+ return;
+
+ pr_debug("Validating PMD protnone\n");
+ pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
+ WARN_ON(!pmd_protnone(pmd));
+ WARN_ON(!pmd_present(pmd));
+}
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+static void __init pmd_protnone_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
+static void __init pte_devmap_tests(unsigned long pfn, pgprot_t prot)
+{
+ pte_t pte = pfn_pte(pfn, prot);
+
+ pr_debug("Validating PTE devmap\n");
+ WARN_ON(!pte_devmap(pte_mkdevmap(pte)));
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot)
+{
+ pmd_t pmd;
+
+ if (!has_transparent_hugepage())
+ return;
+
+ pr_debug("Validating PMD devmap\n");
+ pmd = pfn_pmd(pfn, prot);
+ WARN_ON(!pmd_devmap(pmd_mkdevmap(pmd)));
+}
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot)
+{
+ pud_t pud;
+
+ if (!has_transparent_hugepage())
+ return;
+
+ pr_debug("Validating PUD devmap\n");
+ pud = pfn_pud(pfn, prot);
+ WARN_ON(!pud_devmap(pud_mkdevmap(pud)));
+}
+#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
+static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#else
+static void __init pte_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */
+
+static void __init pte_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+{
+ pte_t pte = pfn_pte(pfn, prot);
+
+ if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
+ return;
+
+ pr_debug("Validating PTE soft dirty\n");
+ WARN_ON(!pte_soft_dirty(pte_mksoft_dirty(pte)));
+ WARN_ON(pte_soft_dirty(pte_clear_soft_dirty(pte)));
+}
+
+static void __init pte_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+{
+ pte_t pte = pfn_pte(pfn, prot);
+
+ if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
+ return;
+
+ pr_debug("Validating PTE swap soft dirty\n");
+ WARN_ON(!pte_swp_soft_dirty(pte_swp_mksoft_dirty(pte)));
+ WARN_ON(pte_swp_soft_dirty(pte_swp_clear_soft_dirty(pte)));
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+{
+ pmd_t pmd;
+
+ if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
+ return;
+
+ if (!has_transparent_hugepage())
+ return;
+
+ pr_debug("Validating PMD soft dirty\n");
+ pmd = pfn_pmd(pfn, prot);
+ WARN_ON(!pmd_soft_dirty(pmd_mksoft_dirty(pmd)));
+ WARN_ON(pmd_soft_dirty(pmd_clear_soft_dirty(pmd)));
+}
+
+static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+{
+ pmd_t pmd;
+
+ if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) ||
+ !IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION))
+ return;
+
+ if (!has_transparent_hugepage())
+ return;
+
+ pr_debug("Validating PMD swap soft dirty\n");
+ pmd = pfn_pmd(pfn, prot);
+ WARN_ON(!pmd_swp_soft_dirty(pmd_swp_mksoft_dirty(pmd)));
+ WARN_ON(pmd_swp_soft_dirty(pmd_swp_clear_soft_dirty(pmd)));
+}
+#else /* !CONFIG_ARCH_HAS_PTE_DEVMAP */
+static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+{
+}
+#endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */
+
+static void __init pte_swap_tests(unsigned long pfn, pgprot_t prot)
+{
+ swp_entry_t swp;
+ pte_t pte;
+
+ pr_debug("Validating PTE swap\n");
+ pte = pfn_pte(pfn, prot);
+ swp = __pte_to_swp_entry(pte);
+ pte = __swp_entry_to_pte(swp);
+ WARN_ON(pfn != pte_pfn(pte));
+}
+
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+static void __init pmd_swap_tests(unsigned long pfn, pgprot_t prot)
+{
+ swp_entry_t swp;
+ pmd_t pmd;
+
+ if (!has_transparent_hugepage())
+ return;
+
+ pr_debug("Validating PMD swap\n");
+ pmd = pfn_pmd(pfn, prot);
+ swp = __pmd_to_swp_entry(pmd);
+ pmd = __swp_entry_to_pmd(swp);
+ WARN_ON(pfn != pmd_pfn(pmd));
+}
+#else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */
+static void __init pmd_swap_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
+
+static void __init swap_migration_tests(void)
+{
+ struct page *page;
+ swp_entry_t swp;
+
+ if (!IS_ENABLED(CONFIG_MIGRATION))
+ return;
+
+ pr_debug("Validating swap migration\n");
+ /*
+ * swap_migration_tests() requires a dedicated page as it needs to
+ * be locked before creating a migration entry from it. Locking the
+ * page that actually maps kernel text ('start_kernel') can be real
+ * problematic. Lets allocate a dedicated page explicitly for this
+ * purpose that will be freed subsequently.
+ */
+ page = alloc_page(GFP_KERNEL);
+ if (!page) {
+ pr_err("page allocation failed\n");
+ return;
+ }
+
+ /*
+ * make_migration_entry() expects given page to be
+ * locked, otherwise it stumbles upon a BUG_ON().
+ */
+ __SetPageLocked(page);
+ swp = make_migration_entry(page, 1);
+ WARN_ON(!is_migration_entry(swp));
+ WARN_ON(!is_write_migration_entry(swp));
+
+ make_migration_entry_read(&swp);
+ WARN_ON(!is_migration_entry(swp));
+ WARN_ON(is_write_migration_entry(swp));
+
+ swp = make_migration_entry(page, 0);
+ WARN_ON(!is_migration_entry(swp));
+ WARN_ON(is_write_migration_entry(swp));
+ __ClearPageLocked(page);
+ __free_page(page);
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot)
+{
+ struct page *page;
+ pte_t pte;
+
+ pr_debug("Validating HugeTLB basic\n");
+ /*
+ * Accessing the page associated with the pfn is safe here,
+ * as it was previously derived from a real kernel symbol.
+ */
+ page = pfn_to_page(pfn);
+ pte = mk_huge_pte(page, prot);
+
+ WARN_ON(!huge_pte_dirty(huge_pte_mkdirty(pte)));
+ WARN_ON(!huge_pte_write(huge_pte_mkwrite(huge_pte_wrprotect(pte))));
+ WARN_ON(huge_pte_write(huge_pte_wrprotect(huge_pte_mkwrite(pte))));
+
+#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
+ pte = pfn_pte(pfn, prot);
+
+ WARN_ON(!pte_huge(pte_mkhuge(pte)));
+#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
+}
+#else /* !CONFIG_HUGETLB_PAGE */
+static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_HUGETLB_PAGE */
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot)
+{
+ pmd_t pmd;
+
+ if (!has_transparent_hugepage())
+ return;
+
+ pr_debug("Validating PMD based THP\n");
+ /*
+ * pmd_trans_huge() and pmd_present() must return positive after
+ * MMU invalidation with pmd_mkinvalid(). This behavior is an
+ * optimization for transparent huge page. pmd_trans_huge() must
+ * be true if pmd_page() returns a valid THP to avoid taking the
+ * pmd_lock when others walk over non transhuge pmds (i.e. there
+ * are no THP allocated). Especially when splitting a THP and
+ * removing the present bit from the pmd, pmd_trans_huge() still
+ * needs to return true. pmd_present() should be true whenever
+ * pmd_trans_huge() returns true.
+ */
+ pmd = pfn_pmd(pfn, prot);
+ WARN_ON(!pmd_trans_huge(pmd_mkhuge(pmd)));
+
+#ifndef __HAVE_ARCH_PMDP_INVALIDATE
+ WARN_ON(!pmd_trans_huge(pmd_mkinvalid(pmd_mkhuge(pmd))));
+ WARN_ON(!pmd_present(pmd_mkinvalid(pmd_mkhuge(pmd))));
+#endif /* __HAVE_ARCH_PMDP_INVALIDATE */
+}
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot)
+{
+ pud_t pud;
+
+ if (!has_transparent_hugepage())
+ return;
+
+ pr_debug("Validating PUD based THP\n");
+ pud = pfn_pud(pfn, prot);
+ WARN_ON(!pud_trans_huge(pud_mkhuge(pud)));
+
+ /*
+ * pud_mkinvalid() has been dropped for now. Enable back
+ * these tests when it comes back with a modified pud_present().
+ *
+ * WARN_ON(!pud_trans_huge(pud_mkinvalid(pud_mkhuge(pud))));
+ * WARN_ON(!pud_present(pud_mkinvalid(pud_mkhuge(pud))));
+ */
+}
+#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+static unsigned long __init get_random_vaddr(void)
+{
+ unsigned long random_vaddr, random_pages, total_user_pages;
+
+ total_user_pages = (TASK_SIZE - FIRST_USER_ADDRESS) / PAGE_SIZE;
+
+ random_pages = get_random_long() % total_user_pages;
+ random_vaddr = FIRST_USER_ADDRESS + random_pages * PAGE_SIZE;
+
+ return random_vaddr;
+}
+
+static int __init debug_vm_pgtable(void)
+{
+ struct vm_area_struct *vma;
+ struct mm_struct *mm;
+ pgd_t *pgdp;
+ p4d_t *p4dp, *saved_p4dp;
+ pud_t *pudp, *saved_pudp;
+ pmd_t *pmdp, *saved_pmdp, pmd;
+ pte_t *ptep;
+ pgtable_t saved_ptep;
+ pgprot_t prot, protnone;
+ phys_addr_t paddr;
+ unsigned long vaddr, pte_aligned, pmd_aligned;
+ unsigned long pud_aligned, p4d_aligned, pgd_aligned;
+ spinlock_t *ptl = NULL;
+ int idx;
+
+ pr_info("Validating architecture page table helpers\n");
+ prot = vm_get_page_prot(VMFLAGS);
+ vaddr = get_random_vaddr();
+ mm = mm_alloc();
+ if (!mm) {
+ pr_err("mm_struct allocation failed\n");
+ return 1;
+ }
+
+ /*
+ * __P000 (or even __S000) will help create page table entries with
+ * PROT_NONE permission as required for pxx_protnone_tests().
+ */
+ protnone = __P000;
+
+ vma = vm_area_alloc(mm);
+ if (!vma) {
+ pr_err("vma allocation failed\n");
+ return 1;
+ }
+
+ /*
+ * PFN for mapping at PTE level is determined from a standard kernel
+ * text symbol. But pfns for higher page table levels are derived by
+ * masking lower bits of this real pfn. These derived pfns might not
+ * exist on the platform but that does not really matter as pfn_pxx()
+ * helpers will still create appropriate entries for the test. This
+ * helps avoid large memory block allocations to be used for mapping
+ * at higher page table levels.
+ */
+ paddr = __pa_symbol(&start_kernel);
+
+ pte_aligned = (paddr & PAGE_MASK) >> PAGE_SHIFT;
+ pmd_aligned = (paddr & PMD_MASK) >> PAGE_SHIFT;
+ pud_aligned = (paddr & PUD_MASK) >> PAGE_SHIFT;
+ p4d_aligned = (paddr & P4D_MASK) >> PAGE_SHIFT;
+ pgd_aligned = (paddr & PGDIR_MASK) >> PAGE_SHIFT;
+ WARN_ON(!pfn_valid(pte_aligned));
+
+ pgdp = pgd_offset(mm, vaddr);
+ p4dp = p4d_alloc(mm, pgdp, vaddr);
+ pudp = pud_alloc(mm, p4dp, vaddr);
+ pmdp = pmd_alloc(mm, pudp, vaddr);
+ /*
+ * Allocate pgtable_t
+ */
+ if (pte_alloc(mm, pmdp)) {
+ pr_err("pgtable allocation failed\n");
+ return 1;
+ }
+
+ /*
+ * Save all the page table page addresses as the page table
+ * entries will be used for testing with random or garbage
+ * values. These saved addresses will be used for freeing
+ * page table pages.
+ */
+ pmd = READ_ONCE(*pmdp);
+ saved_p4dp = p4d_offset(pgdp, 0UL);
+ saved_pudp = pud_offset(p4dp, 0UL);
+ saved_pmdp = pmd_offset(pudp, 0UL);
+ saved_ptep = pmd_pgtable(pmd);
+
+ /*
+ * Iterate over the protection_map[] to make sure that all
+ * the basic page table transformation validations just hold
+ * true irrespective of the starting protection value for a
+ * given page table entry.
+ */
+ for (idx = 0; idx < ARRAY_SIZE(protection_map); idx++) {
+ pte_basic_tests(pte_aligned, idx);
+ pmd_basic_tests(pmd_aligned, idx);
+ pud_basic_tests(mm, pud_aligned, idx);
+ }
+
+ /*
+ * Both P4D and PGD level tests are very basic which do not
+ * involve creating page table entries from the protection
+ * value and the given pfn. Hence just keep them out from
+ * the above iteration for now to save some test execution
+ * time.
+ */
+ p4d_basic_tests(p4d_aligned, prot);
+ pgd_basic_tests(pgd_aligned, prot);
+
+ pmd_leaf_tests(pmd_aligned, prot);
+ pud_leaf_tests(pud_aligned, prot);
+
+ pte_savedwrite_tests(pte_aligned, protnone);
+ pmd_savedwrite_tests(pmd_aligned, protnone);
+
+ pte_special_tests(pte_aligned, prot);
+ pte_protnone_tests(pte_aligned, protnone);
+ pmd_protnone_tests(pmd_aligned, protnone);
+
+ pte_devmap_tests(pte_aligned, prot);
+ pmd_devmap_tests(pmd_aligned, prot);
+ pud_devmap_tests(pud_aligned, prot);
+
+ pte_soft_dirty_tests(pte_aligned, prot);
+ pmd_soft_dirty_tests(pmd_aligned, prot);
+ pte_swap_soft_dirty_tests(pte_aligned, prot);
+ pmd_swap_soft_dirty_tests(pmd_aligned, prot);
+
+ pte_swap_tests(pte_aligned, prot);
+ pmd_swap_tests(pmd_aligned, prot);
+
+ swap_migration_tests();
+
+ pmd_thp_tests(pmd_aligned, prot);
+ pud_thp_tests(pud_aligned, prot);
+
+ hugetlb_basic_tests(pte_aligned, prot);
+
+ /*
+ * Page table modifying tests. They need to hold
+ * proper page table lock.
+ */
+
+ ptep = pte_offset_map_lock(mm, pmdp, vaddr, &ptl);
+ pte_clear_tests(mm, ptep, pte_aligned, vaddr, prot);
+ pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
+ pte_unmap_unlock(ptep, ptl);
+
+ ptl = pmd_lock(mm, pmdp);
+ pmd_clear_tests(mm, pmdp);
+ pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot, saved_ptep);
+ pmd_huge_tests(pmdp, pmd_aligned, prot);
+ pmd_populate_tests(mm, pmdp, saved_ptep);
+ spin_unlock(ptl);
+
+ ptl = pud_lock(mm, pudp);
+ pud_clear_tests(mm, pudp);
+ pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot);
+ pud_huge_tests(pudp, pud_aligned, prot);
+ pud_populate_tests(mm, pudp, saved_pmdp);
+ spin_unlock(ptl);
+
+ spin_lock(&mm->page_table_lock);
+ p4d_clear_tests(mm, p4dp);
+ pgd_clear_tests(mm, pgdp);
+ p4d_populate_tests(mm, p4dp, saved_pudp);
+ pgd_populate_tests(mm, pgdp, saved_p4dp);
+ spin_unlock(&mm->page_table_lock);
+
+ p4d_free(mm, saved_p4dp);
+ pud_free(mm, saved_pudp);
+ pmd_free(mm, saved_pmdp);
+ pte_free(mm, saved_ptep);
+
+ vm_area_free(vma);
+ mm_dec_nr_puds(mm);
+ mm_dec_nr_pmds(mm);
+ mm_dec_nr_ptes(mm);
+ mmdrop(mm);
+ return 0;
+}
+late_initcall(debug_vm_pgtable);
diff --git a/mm/dmapool.c b/mm/dmapool.c
new file mode 100644
index 000000000..a97c97232
--- /dev/null
+++ b/mm/dmapool.c
@@ -0,0 +1,529 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * DMA Pool allocator
+ *
+ * Copyright 2001 David Brownell
+ * Copyright 2007 Intel Corporation
+ * Author: Matthew Wilcox <willy@linux.intel.com>
+ *
+ * This allocator returns small blocks of a given size which are DMA-able by
+ * the given device. It uses the dma_alloc_coherent page allocator to get
+ * new pages, then splits them up into blocks of the required size.
+ * Many older drivers still have their own code to do this.
+ *
+ * The current design of this allocator is fairly simple. The pool is
+ * represented by the 'struct dma_pool' which keeps a doubly-linked list of
+ * allocated pages. Each page in the page_list is split into blocks of at
+ * least 'size' bytes. Free blocks are tracked in an unsorted singly-linked
+ * list of free blocks within the page. Used blocks aren't tracked, but we
+ * keep a count of how many are currently allocated from each page.
+ */
+
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/dmapool.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/export.h>
+#include <linux/mutex.h>
+#include <linux/poison.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+
+#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON)
+#define DMAPOOL_DEBUG 1
+#endif
+
+struct dma_pool { /* the pool */
+ struct list_head page_list;
+ spinlock_t lock;
+ size_t size;
+ struct device *dev;
+ size_t allocation;
+ size_t boundary;
+ char name[32];
+ struct list_head pools;
+};
+
+struct dma_page { /* cacheable header for 'allocation' bytes */
+ struct list_head page_list;
+ void *vaddr;
+ dma_addr_t dma;
+ unsigned int in_use;
+ unsigned int offset;
+};
+
+static DEFINE_MUTEX(pools_lock);
+static DEFINE_MUTEX(pools_reg_lock);
+
+static ssize_t
+show_pools(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ unsigned temp;
+ unsigned size;
+ char *next;
+ struct dma_page *page;
+ struct dma_pool *pool;
+
+ next = buf;
+ size = PAGE_SIZE;
+
+ temp = scnprintf(next, size, "poolinfo - 0.1\n");
+ size -= temp;
+ next += temp;
+
+ mutex_lock(&pools_lock);
+ list_for_each_entry(pool, &dev->dma_pools, pools) {
+ unsigned pages = 0;
+ unsigned blocks = 0;
+
+ spin_lock_irq(&pool->lock);
+ list_for_each_entry(page, &pool->page_list, page_list) {
+ pages++;
+ blocks += page->in_use;
+ }
+ spin_unlock_irq(&pool->lock);
+
+ /* per-pool info, no real statistics yet */
+ temp = scnprintf(next, size, "%-16s %4u %4zu %4zu %2u\n",
+ pool->name, blocks,
+ pages * (pool->allocation / pool->size),
+ pool->size, pages);
+ size -= temp;
+ next += temp;
+ }
+ mutex_unlock(&pools_lock);
+
+ return PAGE_SIZE - size;
+}
+
+static DEVICE_ATTR(pools, 0444, show_pools, NULL);
+
+/**
+ * dma_pool_create - Creates a pool of consistent memory blocks, for dma.
+ * @name: name of pool, for diagnostics
+ * @dev: device that will be doing the DMA
+ * @size: size of the blocks in this pool.
+ * @align: alignment requirement for blocks; must be a power of two
+ * @boundary: returned blocks won't cross this power of two boundary
+ * Context: not in_interrupt()
+ *
+ * Given one of these pools, dma_pool_alloc()
+ * may be used to allocate memory. Such memory will all have "consistent"
+ * DMA mappings, accessible by the device and its driver without using
+ * cache flushing primitives. The actual size of blocks allocated may be
+ * larger than requested because of alignment.
+ *
+ * If @boundary is nonzero, objects returned from dma_pool_alloc() won't
+ * cross that size boundary. This is useful for devices which have
+ * addressing restrictions on individual DMA transfers, such as not crossing
+ * boundaries of 4KBytes.
+ *
+ * Return: a dma allocation pool with the requested characteristics, or
+ * %NULL if one can't be created.
+ */
+struct dma_pool *dma_pool_create(const char *name, struct device *dev,
+ size_t size, size_t align, size_t boundary)
+{
+ struct dma_pool *retval;
+ size_t allocation;
+ bool empty = false;
+
+ if (align == 0)
+ align = 1;
+ else if (align & (align - 1))
+ return NULL;
+
+ if (size == 0)
+ return NULL;
+ else if (size < 4)
+ size = 4;
+
+ size = ALIGN(size, align);
+ allocation = max_t(size_t, size, PAGE_SIZE);
+
+ if (!boundary)
+ boundary = allocation;
+ else if ((boundary < size) || (boundary & (boundary - 1)))
+ return NULL;
+
+ retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev));
+ if (!retval)
+ return retval;
+
+ strlcpy(retval->name, name, sizeof(retval->name));
+
+ retval->dev = dev;
+
+ INIT_LIST_HEAD(&retval->page_list);
+ spin_lock_init(&retval->lock);
+ retval->size = size;
+ retval->boundary = boundary;
+ retval->allocation = allocation;
+
+ INIT_LIST_HEAD(&retval->pools);
+
+ /*
+ * pools_lock ensures that the ->dma_pools list does not get corrupted.
+ * pools_reg_lock ensures that there is not a race between
+ * dma_pool_create() and dma_pool_destroy() or within dma_pool_create()
+ * when the first invocation of dma_pool_create() failed on
+ * device_create_file() and the second assumes that it has been done (I
+ * know it is a short window).
+ */
+ mutex_lock(&pools_reg_lock);
+ mutex_lock(&pools_lock);
+ if (list_empty(&dev->dma_pools))
+ empty = true;
+ list_add(&retval->pools, &dev->dma_pools);
+ mutex_unlock(&pools_lock);
+ if (empty) {
+ int err;
+
+ err = device_create_file(dev, &dev_attr_pools);
+ if (err) {
+ mutex_lock(&pools_lock);
+ list_del(&retval->pools);
+ mutex_unlock(&pools_lock);
+ mutex_unlock(&pools_reg_lock);
+ kfree(retval);
+ return NULL;
+ }
+ }
+ mutex_unlock(&pools_reg_lock);
+ return retval;
+}
+EXPORT_SYMBOL(dma_pool_create);
+
+static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page)
+{
+ unsigned int offset = 0;
+ unsigned int next_boundary = pool->boundary;
+
+ do {
+ unsigned int next = offset + pool->size;
+ if (unlikely((next + pool->size) >= next_boundary)) {
+ next = next_boundary;
+ next_boundary += pool->boundary;
+ }
+ *(int *)(page->vaddr + offset) = next;
+ offset = next;
+ } while (offset < pool->allocation);
+}
+
+static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
+{
+ struct dma_page *page;
+
+ page = kmalloc(sizeof(*page), mem_flags);
+ if (!page)
+ return NULL;
+ page->vaddr = dma_alloc_coherent(pool->dev, pool->allocation,
+ &page->dma, mem_flags);
+ if (page->vaddr) {
+#ifdef DMAPOOL_DEBUG
+ memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
+#endif
+ pool_initialise_page(pool, page);
+ page->in_use = 0;
+ page->offset = 0;
+ } else {
+ kfree(page);
+ page = NULL;
+ }
+ return page;
+}
+
+static inline bool is_page_busy(struct dma_page *page)
+{
+ return page->in_use != 0;
+}
+
+static void pool_free_page(struct dma_pool *pool, struct dma_page *page)
+{
+ dma_addr_t dma = page->dma;
+
+#ifdef DMAPOOL_DEBUG
+ memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
+#endif
+ dma_free_coherent(pool->dev, pool->allocation, page->vaddr, dma);
+ list_del(&page->page_list);
+ kfree(page);
+}
+
+/**
+ * dma_pool_destroy - destroys a pool of dma memory blocks.
+ * @pool: dma pool that will be destroyed
+ * Context: !in_interrupt()
+ *
+ * Caller guarantees that no more memory from the pool is in use,
+ * and that nothing will try to use the pool after this call.
+ */
+void dma_pool_destroy(struct dma_pool *pool)
+{
+ struct dma_page *page, *tmp;
+ bool empty = false;
+
+ if (unlikely(!pool))
+ return;
+
+ mutex_lock(&pools_reg_lock);
+ mutex_lock(&pools_lock);
+ list_del(&pool->pools);
+ if (pool->dev && list_empty(&pool->dev->dma_pools))
+ empty = true;
+ mutex_unlock(&pools_lock);
+ if (empty)
+ device_remove_file(pool->dev, &dev_attr_pools);
+ mutex_unlock(&pools_reg_lock);
+
+ list_for_each_entry_safe(page, tmp, &pool->page_list, page_list) {
+ if (is_page_busy(page)) {
+ if (pool->dev)
+ dev_err(pool->dev, "%s %s, %p busy\n", __func__,
+ pool->name, page->vaddr);
+ else
+ pr_err("%s %s, %p busy\n", __func__,
+ pool->name, page->vaddr);
+ /* leak the still-in-use consistent memory */
+ list_del(&page->page_list);
+ kfree(page);
+ } else
+ pool_free_page(pool, page);
+ }
+
+ kfree(pool);
+}
+EXPORT_SYMBOL(dma_pool_destroy);
+
+/**
+ * dma_pool_alloc - get a block of consistent memory
+ * @pool: dma pool that will produce the block
+ * @mem_flags: GFP_* bitmask
+ * @handle: pointer to dma address of block
+ *
+ * Return: the kernel virtual address of a currently unused block,
+ * and reports its dma address through the handle.
+ * If such a memory block can't be allocated, %NULL is returned.
+ */
+void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
+ dma_addr_t *handle)
+{
+ unsigned long flags;
+ struct dma_page *page;
+ size_t offset;
+ void *retval;
+
+ might_sleep_if(gfpflags_allow_blocking(mem_flags));
+
+ spin_lock_irqsave(&pool->lock, flags);
+ list_for_each_entry(page, &pool->page_list, page_list) {
+ if (page->offset < pool->allocation)
+ goto ready;
+ }
+
+ /* pool_alloc_page() might sleep, so temporarily drop &pool->lock */
+ spin_unlock_irqrestore(&pool->lock, flags);
+
+ page = pool_alloc_page(pool, mem_flags & (~__GFP_ZERO));
+ if (!page)
+ return NULL;
+
+ spin_lock_irqsave(&pool->lock, flags);
+
+ list_add(&page->page_list, &pool->page_list);
+ ready:
+ page->in_use++;
+ offset = page->offset;
+ page->offset = *(int *)(page->vaddr + offset);
+ retval = offset + page->vaddr;
+ *handle = offset + page->dma;
+#ifdef DMAPOOL_DEBUG
+ {
+ int i;
+ u8 *data = retval;
+ /* page->offset is stored in first 4 bytes */
+ for (i = sizeof(page->offset); i < pool->size; i++) {
+ if (data[i] == POOL_POISON_FREED)
+ continue;
+ if (pool->dev)
+ dev_err(pool->dev, "%s %s, %p (corrupted)\n",
+ __func__, pool->name, retval);
+ else
+ pr_err("%s %s, %p (corrupted)\n",
+ __func__, pool->name, retval);
+
+ /*
+ * Dump the first 4 bytes even if they are not
+ * POOL_POISON_FREED
+ */
+ print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1,
+ data, pool->size, 1);
+ break;
+ }
+ }
+ if (!(mem_flags & __GFP_ZERO))
+ memset(retval, POOL_POISON_ALLOCATED, pool->size);
+#endif
+ spin_unlock_irqrestore(&pool->lock, flags);
+
+ if (want_init_on_alloc(mem_flags))
+ memset(retval, 0, pool->size);
+
+ return retval;
+}
+EXPORT_SYMBOL(dma_pool_alloc);
+
+static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma)
+{
+ struct dma_page *page;
+
+ list_for_each_entry(page, &pool->page_list, page_list) {
+ if (dma < page->dma)
+ continue;
+ if ((dma - page->dma) < pool->allocation)
+ return page;
+ }
+ return NULL;
+}
+
+/**
+ * dma_pool_free - put block back into dma pool
+ * @pool: the dma pool holding the block
+ * @vaddr: virtual address of block
+ * @dma: dma address of block
+ *
+ * Caller promises neither device nor driver will again touch this block
+ * unless it is first re-allocated.
+ */
+void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
+{
+ struct dma_page *page;
+ unsigned long flags;
+ unsigned int offset;
+
+ spin_lock_irqsave(&pool->lock, flags);
+ page = pool_find_page(pool, dma);
+ if (!page) {
+ spin_unlock_irqrestore(&pool->lock, flags);
+ if (pool->dev)
+ dev_err(pool->dev, "%s %s, %p/%pad (bad dma)\n",
+ __func__, pool->name, vaddr, &dma);
+ else
+ pr_err("%s %s, %p/%pad (bad dma)\n",
+ __func__, pool->name, vaddr, &dma);
+ return;
+ }
+
+ offset = vaddr - page->vaddr;
+ if (want_init_on_free())
+ memset(vaddr, 0, pool->size);
+#ifdef DMAPOOL_DEBUG
+ if ((dma - page->dma) != offset) {
+ spin_unlock_irqrestore(&pool->lock, flags);
+ if (pool->dev)
+ dev_err(pool->dev, "%s %s, %p (bad vaddr)/%pad\n",
+ __func__, pool->name, vaddr, &dma);
+ else
+ pr_err("%s %s, %p (bad vaddr)/%pad\n",
+ __func__, pool->name, vaddr, &dma);
+ return;
+ }
+ {
+ unsigned int chain = page->offset;
+ while (chain < pool->allocation) {
+ if (chain != offset) {
+ chain = *(int *)(page->vaddr + chain);
+ continue;
+ }
+ spin_unlock_irqrestore(&pool->lock, flags);
+ if (pool->dev)
+ dev_err(pool->dev, "%s %s, dma %pad already free\n",
+ __func__, pool->name, &dma);
+ else
+ pr_err("%s %s, dma %pad already free\n",
+ __func__, pool->name, &dma);
+ return;
+ }
+ }
+ memset(vaddr, POOL_POISON_FREED, pool->size);
+#endif
+
+ page->in_use--;
+ *(int *)vaddr = page->offset;
+ page->offset = offset;
+ /*
+ * Resist a temptation to do
+ * if (!is_page_busy(page)) pool_free_page(pool, page);
+ * Better have a few empty pages hang around.
+ */
+ spin_unlock_irqrestore(&pool->lock, flags);
+}
+EXPORT_SYMBOL(dma_pool_free);
+
+/*
+ * Managed DMA pool
+ */
+static void dmam_pool_release(struct device *dev, void *res)
+{
+ struct dma_pool *pool = *(struct dma_pool **)res;
+
+ dma_pool_destroy(pool);
+}
+
+static int dmam_pool_match(struct device *dev, void *res, void *match_data)
+{
+ return *(struct dma_pool **)res == match_data;
+}
+
+/**
+ * dmam_pool_create - Managed dma_pool_create()
+ * @name: name of pool, for diagnostics
+ * @dev: device that will be doing the DMA
+ * @size: size of the blocks in this pool.
+ * @align: alignment requirement for blocks; must be a power of two
+ * @allocation: returned blocks won't cross this boundary (or zero)
+ *
+ * Managed dma_pool_create(). DMA pool created with this function is
+ * automatically destroyed on driver detach.
+ *
+ * Return: a managed dma allocation pool with the requested
+ * characteristics, or %NULL if one can't be created.
+ */
+struct dma_pool *dmam_pool_create(const char *name, struct device *dev,
+ size_t size, size_t align, size_t allocation)
+{
+ struct dma_pool **ptr, *pool;
+
+ ptr = devres_alloc(dmam_pool_release, sizeof(*ptr), GFP_KERNEL);
+ if (!ptr)
+ return NULL;
+
+ pool = *ptr = dma_pool_create(name, dev, size, align, allocation);
+ if (pool)
+ devres_add(dev, ptr);
+ else
+ devres_free(ptr);
+
+ return pool;
+}
+EXPORT_SYMBOL(dmam_pool_create);
+
+/**
+ * dmam_pool_destroy - Managed dma_pool_destroy()
+ * @pool: dma pool that will be destroyed
+ *
+ * Managed dma_pool_destroy().
+ */
+void dmam_pool_destroy(struct dma_pool *pool)
+{
+ struct device *dev = pool->dev;
+
+ WARN_ON(devres_release(dev, dmam_pool_release, dmam_pool_match, pool));
+}
+EXPORT_SYMBOL(dmam_pool_destroy);
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
new file mode 100644
index 000000000..a0018ad1a
--- /dev/null
+++ b/mm/early_ioremap.c
@@ -0,0 +1,303 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Provide common bits of early_ioremap() support for architectures needing
+ * temporary mappings during boot before ioremap() is available.
+ *
+ * This is mostly a direct copy of the x86 early_ioremap implementation.
+ *
+ * (C) Copyright 1995 1996, 2014 Linus Torvalds
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <asm/fixmap.h>
+#include <asm/early_ioremap.h>
+
+#ifdef CONFIG_MMU
+static int early_ioremap_debug __initdata;
+
+static int __init early_ioremap_debug_setup(char *str)
+{
+ early_ioremap_debug = 1;
+
+ return 0;
+}
+early_param("early_ioremap_debug", early_ioremap_debug_setup);
+
+static int after_paging_init __initdata;
+
+pgprot_t __init __weak early_memremap_pgprot_adjust(resource_size_t phys_addr,
+ unsigned long size,
+ pgprot_t prot)
+{
+ return prot;
+}
+
+void __init __weak early_ioremap_shutdown(void)
+{
+}
+
+void __init early_ioremap_reset(void)
+{
+ early_ioremap_shutdown();
+ after_paging_init = 1;
+}
+
+/*
+ * Generally, ioremap() is available after paging_init() has been called.
+ * Architectures wanting to allow early_ioremap after paging_init() can
+ * define __late_set_fixmap and __late_clear_fixmap to do the right thing.
+ */
+#ifndef __late_set_fixmap
+static inline void __init __late_set_fixmap(enum fixed_addresses idx,
+ phys_addr_t phys, pgprot_t prot)
+{
+ BUG();
+}
+#endif
+
+#ifndef __late_clear_fixmap
+static inline void __init __late_clear_fixmap(enum fixed_addresses idx)
+{
+ BUG();
+}
+#endif
+
+static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;
+static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
+static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
+
+void __init early_ioremap_setup(void)
+{
+ int i;
+
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+ if (WARN_ON(prev_map[i]))
+ break;
+
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+ slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
+}
+
+static int __init check_early_ioremap_leak(void)
+{
+ int count = 0;
+ int i;
+
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+ if (prev_map[i])
+ count++;
+
+ if (WARN(count, KERN_WARNING
+ "Debug warning: early ioremap leak of %d areas detected.\n"
+ "please boot with early_ioremap_debug and report the dmesg.\n",
+ count))
+ return 1;
+ return 0;
+}
+late_initcall(check_early_ioremap_leak);
+
+static void __init __iomem *
+__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
+{
+ unsigned long offset;
+ resource_size_t last_addr;
+ unsigned int nrpages;
+ enum fixed_addresses idx;
+ int i, slot;
+
+ WARN_ON(system_state >= SYSTEM_RUNNING);
+
+ slot = -1;
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
+ if (!prev_map[i]) {
+ slot = i;
+ break;
+ }
+ }
+
+ if (WARN(slot < 0, "%s(%pa, %08lx) not found slot\n",
+ __func__, &phys_addr, size))
+ return NULL;
+
+ /* Don't allow wraparound or zero size */
+ last_addr = phys_addr + size - 1;
+ if (WARN_ON(!size || last_addr < phys_addr))
+ return NULL;
+
+ prev_size[slot] = size;
+ /*
+ * Mappings have to be page-aligned
+ */
+ offset = offset_in_page(phys_addr);
+ phys_addr &= PAGE_MASK;
+ size = PAGE_ALIGN(last_addr + 1) - phys_addr;
+
+ /*
+ * Mappings have to fit in the FIX_BTMAP area.
+ */
+ nrpages = size >> PAGE_SHIFT;
+ if (WARN_ON(nrpages > NR_FIX_BTMAPS))
+ return NULL;
+
+ /*
+ * Ok, go for it..
+ */
+ idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
+ while (nrpages > 0) {
+ if (after_paging_init)
+ __late_set_fixmap(idx, phys_addr, prot);
+ else
+ __early_set_fixmap(idx, phys_addr, prot);
+ phys_addr += PAGE_SIZE;
+ --idx;
+ --nrpages;
+ }
+ WARN(early_ioremap_debug, "%s(%pa, %08lx) [%d] => %08lx + %08lx\n",
+ __func__, &phys_addr, size, slot, offset, slot_virt[slot]);
+
+ prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
+ return prev_map[slot];
+}
+
+void __init early_iounmap(void __iomem *addr, unsigned long size)
+{
+ unsigned long virt_addr;
+ unsigned long offset;
+ unsigned int nrpages;
+ enum fixed_addresses idx;
+ int i, slot;
+
+ slot = -1;
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
+ if (prev_map[i] == addr) {
+ slot = i;
+ break;
+ }
+ }
+
+ if (WARN(slot < 0, "early_iounmap(%p, %08lx) not found slot\n",
+ addr, size))
+ return;
+
+ if (WARN(prev_size[slot] != size,
+ "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n",
+ addr, size, slot, prev_size[slot]))
+ return;
+
+ WARN(early_ioremap_debug, "early_iounmap(%p, %08lx) [%d]\n",
+ addr, size, slot);
+
+ virt_addr = (unsigned long)addr;
+ if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)))
+ return;
+
+ offset = offset_in_page(virt_addr);
+ nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT;
+
+ idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
+ while (nrpages > 0) {
+ if (after_paging_init)
+ __late_clear_fixmap(idx);
+ else
+ __early_set_fixmap(idx, 0, FIXMAP_PAGE_CLEAR);
+ --idx;
+ --nrpages;
+ }
+ prev_map[slot] = NULL;
+}
+
+/* Remap an IO device */
+void __init __iomem *
+early_ioremap(resource_size_t phys_addr, unsigned long size)
+{
+ return __early_ioremap(phys_addr, size, FIXMAP_PAGE_IO);
+}
+
+/* Remap memory */
+void __init *
+early_memremap(resource_size_t phys_addr, unsigned long size)
+{
+ pgprot_t prot = early_memremap_pgprot_adjust(phys_addr, size,
+ FIXMAP_PAGE_NORMAL);
+
+ return (__force void *)__early_ioremap(phys_addr, size, prot);
+}
+#ifdef FIXMAP_PAGE_RO
+void __init *
+early_memremap_ro(resource_size_t phys_addr, unsigned long size)
+{
+ pgprot_t prot = early_memremap_pgprot_adjust(phys_addr, size,
+ FIXMAP_PAGE_RO);
+
+ return (__force void *)__early_ioremap(phys_addr, size, prot);
+}
+#endif
+
+#ifdef CONFIG_ARCH_USE_MEMREMAP_PROT
+void __init *
+early_memremap_prot(resource_size_t phys_addr, unsigned long size,
+ unsigned long prot_val)
+{
+ return (__force void *)__early_ioremap(phys_addr, size,
+ __pgprot(prot_val));
+}
+#endif
+
+#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
+
+void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size)
+{
+ unsigned long slop, clen;
+ char *p;
+
+ while (size) {
+ slop = offset_in_page(src);
+ clen = size;
+ if (clen > MAX_MAP_CHUNK - slop)
+ clen = MAX_MAP_CHUNK - slop;
+ p = early_memremap(src & PAGE_MASK, clen + slop);
+ memcpy(dest, p + slop, clen);
+ early_memunmap(p, clen + slop);
+ dest += clen;
+ src += clen;
+ size -= clen;
+ }
+}
+
+#else /* CONFIG_MMU */
+
+void __init __iomem *
+early_ioremap(resource_size_t phys_addr, unsigned long size)
+{
+ return (__force void __iomem *)phys_addr;
+}
+
+/* Remap memory */
+void __init *
+early_memremap(resource_size_t phys_addr, unsigned long size)
+{
+ return (void *)phys_addr;
+}
+void __init *
+early_memremap_ro(resource_size_t phys_addr, unsigned long size)
+{
+ return (void *)phys_addr;
+}
+
+void __init early_iounmap(void __iomem *addr, unsigned long size)
+{
+}
+
+#endif /* CONFIG_MMU */
+
+
+void __init early_memunmap(void *addr, unsigned long size)
+{
+ early_iounmap((__force void __iomem *)addr, size);
+}
diff --git a/mm/fadvise.c b/mm/fadvise.c
new file mode 100644
index 000000000..d6baa4f45
--- /dev/null
+++ b/mm/fadvise.c
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * mm/fadvise.c
+ *
+ * Copyright (C) 2002, Linus Torvalds
+ *
+ * 11Jan2003 Andrew Morton
+ * Initial version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/backing-dev.h>
+#include <linux/pagevec.h>
+#include <linux/fadvise.h>
+#include <linux/writeback.h>
+#include <linux/syscalls.h>
+#include <linux/swap.h>
+
+#include <asm/unistd.h>
+
+#include "internal.h"
+
+/*
+ * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
+ * deactivate the pages and clear PG_Referenced.
+ */
+
+int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
+{
+ struct inode *inode;
+ struct address_space *mapping;
+ struct backing_dev_info *bdi;
+ loff_t endbyte; /* inclusive */
+ pgoff_t start_index;
+ pgoff_t end_index;
+ unsigned long nrpages;
+
+ inode = file_inode(file);
+ if (S_ISFIFO(inode->i_mode))
+ return -ESPIPE;
+
+ mapping = file->f_mapping;
+ if (!mapping || len < 0)
+ return -EINVAL;
+
+ bdi = inode_to_bdi(mapping->host);
+
+ if (IS_DAX(inode) || (bdi == &noop_backing_dev_info)) {
+ switch (advice) {
+ case POSIX_FADV_NORMAL:
+ case POSIX_FADV_RANDOM:
+ case POSIX_FADV_SEQUENTIAL:
+ case POSIX_FADV_WILLNEED:
+ case POSIX_FADV_NOREUSE:
+ case POSIX_FADV_DONTNEED:
+ /* no bad return value, but ignore advice */
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+ }
+
+ /*
+ * Careful about overflows. Len == 0 means "as much as possible". Use
+ * unsigned math because signed overflows are undefined and UBSan
+ * complains.
+ */
+ endbyte = (u64)offset + (u64)len;
+ if (!len || endbyte < len)
+ endbyte = -1;
+ else
+ endbyte--; /* inclusive */
+
+ switch (advice) {
+ case POSIX_FADV_NORMAL:
+ file->f_ra.ra_pages = bdi->ra_pages;
+ spin_lock(&file->f_lock);
+ file->f_mode &= ~FMODE_RANDOM;
+ spin_unlock(&file->f_lock);
+ break;
+ case POSIX_FADV_RANDOM:
+ spin_lock(&file->f_lock);
+ file->f_mode |= FMODE_RANDOM;
+ spin_unlock(&file->f_lock);
+ break;
+ case POSIX_FADV_SEQUENTIAL:
+ file->f_ra.ra_pages = bdi->ra_pages * 2;
+ spin_lock(&file->f_lock);
+ file->f_mode &= ~FMODE_RANDOM;
+ spin_unlock(&file->f_lock);
+ break;
+ case POSIX_FADV_WILLNEED:
+ /* First and last PARTIAL page! */
+ start_index = offset >> PAGE_SHIFT;
+ end_index = endbyte >> PAGE_SHIFT;
+
+ /* Careful about overflow on the "+1" */
+ nrpages = end_index - start_index + 1;
+ if (!nrpages)
+ nrpages = ~0UL;
+
+ force_page_cache_readahead(mapping, file, start_index, nrpages);
+ break;
+ case POSIX_FADV_NOREUSE:
+ break;
+ case POSIX_FADV_DONTNEED:
+ if (!inode_write_congested(mapping->host))
+ __filemap_fdatawrite_range(mapping, offset, endbyte,
+ WB_SYNC_NONE);
+
+ /*
+ * First and last FULL page! Partial pages are deliberately
+ * preserved on the expectation that it is better to preserve
+ * needed memory than to discard unneeded memory.
+ */
+ start_index = (offset+(PAGE_SIZE-1)) >> PAGE_SHIFT;
+ end_index = (endbyte >> PAGE_SHIFT);
+ /*
+ * The page at end_index will be inclusively discarded according
+ * by invalidate_mapping_pages(), so subtracting 1 from
+ * end_index means we will skip the last page. But if endbyte
+ * is page aligned or is at the end of file, we should not skip
+ * that page - discarding the last page is safe enough.
+ */
+ if ((endbyte & ~PAGE_MASK) != ~PAGE_MASK &&
+ endbyte != inode->i_size - 1) {
+ /* First page is tricky as 0 - 1 = -1, but pgoff_t
+ * is unsigned, so the end_index >= start_index
+ * check below would be true and we'll discard the whole
+ * file cache which is not what was asked.
+ */
+ if (end_index == 0)
+ break;
+
+ end_index--;
+ }
+
+ if (end_index >= start_index) {
+ unsigned long nr_pagevec = 0;
+
+ /*
+ * It's common to FADV_DONTNEED right after
+ * the read or write that instantiates the
+ * pages, in which case there will be some
+ * sitting on the local LRU cache. Try to
+ * avoid the expensive remote drain and the
+ * second cache tree walk below by flushing
+ * them out right away.
+ */
+ lru_add_drain();
+
+ invalidate_mapping_pagevec(mapping,
+ start_index, end_index,
+ &nr_pagevec);
+
+ /*
+ * If fewer pages were invalidated than expected then
+ * it is possible that some of the pages were on
+ * a per-cpu pagevec for a remote CPU. Drain all
+ * pagevecs and try again.
+ */
+ if (nr_pagevec) {
+ lru_add_drain_all();
+ invalidate_mapping_pages(mapping, start_index,
+ end_index);
+ }
+ }
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(generic_fadvise);
+
+int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
+{
+ if (file->f_op->fadvise)
+ return file->f_op->fadvise(file, offset, len, advice);
+
+ return generic_fadvise(file, offset, len, advice);
+}
+EXPORT_SYMBOL(vfs_fadvise);
+
+#ifdef CONFIG_ADVISE_SYSCALLS
+
+int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
+{
+ struct fd f = fdget(fd);
+ int ret;
+
+ if (!f.file)
+ return -EBADF;
+
+ ret = vfs_fadvise(f.file, offset, len, advice);
+
+ fdput(f);
+ return ret;
+}
+
+SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
+{
+ return ksys_fadvise64_64(fd, offset, len, advice);
+}
+
+#ifdef __ARCH_WANT_SYS_FADVISE64
+
+SYSCALL_DEFINE4(fadvise64, int, fd, loff_t, offset, size_t, len, int, advice)
+{
+ return ksys_fadvise64_64(fd, offset, len, advice);
+}
+
+#endif
+#endif
diff --git a/mm/failslab.c b/mm/failslab.c
new file mode 100644
index 000000000..f92fed91a
--- /dev/null
+++ b/mm/failslab.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/fault-inject.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include "slab.h"
+
+static struct {
+ struct fault_attr attr;
+ bool ignore_gfp_reclaim;
+ bool cache_filter;
+} failslab = {
+ .attr = FAULT_ATTR_INITIALIZER,
+ .ignore_gfp_reclaim = true,
+ .cache_filter = false,
+};
+
+bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags)
+{
+ /* No fault-injection for bootstrap cache */
+ if (unlikely(s == kmem_cache))
+ return false;
+
+ if (gfpflags & __GFP_NOFAIL)
+ return false;
+
+ if (failslab.ignore_gfp_reclaim &&
+ (gfpflags & __GFP_DIRECT_RECLAIM))
+ return false;
+
+ if (failslab.cache_filter && !(s->flags & SLAB_FAILSLAB))
+ return false;
+
+ return should_fail(&failslab.attr, s->object_size);
+}
+
+static int __init setup_failslab(char *str)
+{
+ return setup_fault_attr(&failslab.attr, str);
+}
+__setup("failslab=", setup_failslab);
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+static int __init failslab_debugfs_init(void)
+{
+ struct dentry *dir;
+ umode_t mode = S_IFREG | 0600;
+
+ dir = fault_create_debugfs_attr("failslab", NULL, &failslab.attr);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
+
+ debugfs_create_bool("ignore-gfp-wait", mode, dir,
+ &failslab.ignore_gfp_reclaim);
+ debugfs_create_bool("cache-filter", mode, dir,
+ &failslab.cache_filter);
+
+ return 0;
+}
+
+late_initcall(failslab_debugfs_init);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
diff --git a/mm/filemap.c b/mm/filemap.c
new file mode 100644
index 000000000..3b0d8c6dd
--- /dev/null
+++ b/mm/filemap.c
@@ -0,0 +1,3536 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/mm/filemap.c
+ *
+ * Copyright (C) 1994-1999 Linus Torvalds
+ */
+
+/*
+ * This file handles the generic file mmap semantics used by
+ * most "normal" filesystems (but you don't /have/ to use this:
+ * the NFS filesystem used to do this differently, for example)
+ */
+#include <linux/export.h>
+#include <linux/compiler.h>
+#include <linux/dax.h>
+#include <linux/fs.h>
+#include <linux/sched/signal.h>
+#include <linux/uaccess.h>
+#include <linux/capability.h>
+#include <linux/kernel_stat.h>
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/mman.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/uio.h>
+#include <linux/error-injection.h>
+#include <linux/hash.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+#include <linux/pagevec.h>
+#include <linux/blkdev.h>
+#include <linux/security.h>
+#include <linux/cpuset.h>
+#include <linux/hugetlb.h>
+#include <linux/memcontrol.h>
+#include <linux/cleancache.h>
+#include <linux/shmem_fs.h>
+#include <linux/rmap.h>
+#include <linux/delayacct.h>
+#include <linux/psi.h>
+#include <linux/ramfs.h>
+#include <linux/page_idle.h>
+#include "internal.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/filemap.h>
+
+/*
+ * FIXME: remove all knowledge of the buffer layer from the core VM
+ */
+#include <linux/buffer_head.h> /* for try_to_free_buffers */
+
+#include <asm/mman.h>
+
+/*
+ * Shared mappings implemented 30.11.1994. It's not fully working yet,
+ * though.
+ *
+ * Shared mappings now work. 15.8.1995 Bruno.
+ *
+ * finished 'unifying' the page and buffer cache and SMP-threaded the
+ * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
+ *
+ * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
+ */
+
+/*
+ * Lock ordering:
+ *
+ * ->i_mmap_rwsem (truncate_pagecache)
+ * ->private_lock (__free_pte->__set_page_dirty_buffers)
+ * ->swap_lock (exclusive_swap_page, others)
+ * ->i_pages lock
+ *
+ * ->i_mutex
+ * ->i_mmap_rwsem (truncate->unmap_mapping_range)
+ *
+ * ->mmap_lock
+ * ->i_mmap_rwsem
+ * ->page_table_lock or pte_lock (various, mainly in memory.c)
+ * ->i_pages lock (arch-dependent flush_dcache_mmap_lock)
+ *
+ * ->mmap_lock
+ * ->lock_page (access_process_vm)
+ *
+ * ->i_mutex (generic_perform_write)
+ * ->mmap_lock (fault_in_pages_readable->do_page_fault)
+ *
+ * bdi->wb.list_lock
+ * sb_lock (fs/fs-writeback.c)
+ * ->i_pages lock (__sync_single_inode)
+ *
+ * ->i_mmap_rwsem
+ * ->anon_vma.lock (vma_adjust)
+ *
+ * ->anon_vma.lock
+ * ->page_table_lock or pte_lock (anon_vma_prepare and various)
+ *
+ * ->page_table_lock or pte_lock
+ * ->swap_lock (try_to_unmap_one)
+ * ->private_lock (try_to_unmap_one)
+ * ->i_pages lock (try_to_unmap_one)
+ * ->pgdat->lru_lock (follow_page->mark_page_accessed)
+ * ->pgdat->lru_lock (check_pte_range->isolate_lru_page)
+ * ->private_lock (page_remove_rmap->set_page_dirty)
+ * ->i_pages lock (page_remove_rmap->set_page_dirty)
+ * bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
+ * ->inode->i_lock (page_remove_rmap->set_page_dirty)
+ * ->memcg->move_lock (page_remove_rmap->lock_page_memcg)
+ * bdi.wb->list_lock (zap_pte_range->set_page_dirty)
+ * ->inode->i_lock (zap_pte_range->set_page_dirty)
+ * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
+ *
+ * ->i_mmap_rwsem
+ * ->tasklist_lock (memory_failure, collect_procs_ao)
+ */
+
+static void page_cache_delete(struct address_space *mapping,
+ struct page *page, void *shadow)
+{
+ XA_STATE(xas, &mapping->i_pages, page->index);
+ unsigned int nr = 1;
+
+ mapping_set_update(&xas, mapping);
+
+ /* hugetlb pages are represented by a single entry in the xarray */
+ if (!PageHuge(page)) {
+ xas_set_order(&xas, page->index, compound_order(page));
+ nr = compound_nr(page);
+ }
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(PageTail(page), page);
+ VM_BUG_ON_PAGE(nr != 1 && shadow, page);
+
+ xas_store(&xas, shadow);
+ xas_init_marks(&xas);
+
+ page->mapping = NULL;
+ /* Leave page->index set: truncation lookup relies upon it */
+
+ if (shadow) {
+ mapping->nrexceptional += nr;
+ /*
+ * Make sure the nrexceptional update is committed before
+ * the nrpages update so that final truncate racing
+ * with reclaim does not see both counters 0 at the
+ * same time and miss a shadow entry.
+ */
+ smp_wmb();
+ }
+ mapping->nrpages -= nr;
+}
+
+static void unaccount_page_cache_page(struct address_space *mapping,
+ struct page *page)
+{
+ int nr;
+
+ /*
+ * if we're uptodate, flush out into the cleancache, otherwise
+ * invalidate any existing cleancache entries. We can't leave
+ * stale data around in the cleancache once our page is gone
+ */
+ if (PageUptodate(page) && PageMappedToDisk(page))
+ cleancache_put_page(page);
+ else
+ cleancache_invalidate_page(mapping, page);
+
+ VM_BUG_ON_PAGE(PageTail(page), page);
+ VM_BUG_ON_PAGE(page_mapped(page), page);
+ if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) {
+ int mapcount;
+
+ pr_alert("BUG: Bad page cache in process %s pfn:%05lx\n",
+ current->comm, page_to_pfn(page));
+ dump_page(page, "still mapped when deleted");
+ dump_stack();
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+
+ mapcount = page_mapcount(page);
+ if (mapping_exiting(mapping) &&
+ page_count(page) >= mapcount + 2) {
+ /*
+ * All vmas have already been torn down, so it's
+ * a good bet that actually the page is unmapped,
+ * and we'd prefer not to leak it: if we're wrong,
+ * some other bad page check should catch it later.
+ */
+ page_mapcount_reset(page);
+ page_ref_sub(page, mapcount);
+ }
+ }
+
+ /* hugetlb pages do not participate in page cache accounting. */
+ if (PageHuge(page))
+ return;
+
+ nr = thp_nr_pages(page);
+
+ __mod_lruvec_page_state(page, NR_FILE_PAGES, -nr);
+ if (PageSwapBacked(page)) {
+ __mod_lruvec_page_state(page, NR_SHMEM, -nr);
+ if (PageTransHuge(page))
+ __dec_node_page_state(page, NR_SHMEM_THPS);
+ } else if (PageTransHuge(page)) {
+ __dec_node_page_state(page, NR_FILE_THPS);
+ filemap_nr_thps_dec(mapping);
+ }
+
+ /*
+ * At this point page must be either written or cleaned by
+ * truncate. Dirty page here signals a bug and loss of
+ * unwritten data.
+ *
+ * This fixes dirty accounting after removing the page entirely
+ * but leaves PageDirty set: it has no effect for truncated
+ * page and anyway will be cleared before returning page into
+ * buddy allocator.
+ */
+ if (WARN_ON_ONCE(PageDirty(page)))
+ account_page_cleaned(page, mapping, inode_to_wb(mapping->host));
+}
+
+/*
+ * Delete a page from the page cache and free it. Caller has to make
+ * sure the page is locked and that nobody else uses it - or that usage
+ * is safe. The caller must hold the i_pages lock.
+ */
+void __delete_from_page_cache(struct page *page, void *shadow)
+{
+ struct address_space *mapping = page->mapping;
+
+ trace_mm_filemap_delete_from_page_cache(page);
+
+ unaccount_page_cache_page(mapping, page);
+ page_cache_delete(mapping, page, shadow);
+}
+
+static void page_cache_free_page(struct address_space *mapping,
+ struct page *page)
+{
+ void (*freepage)(struct page *);
+
+ freepage = mapping->a_ops->freepage;
+ if (freepage)
+ freepage(page);
+
+ if (PageTransHuge(page) && !PageHuge(page)) {
+ page_ref_sub(page, thp_nr_pages(page));
+ VM_BUG_ON_PAGE(page_count(page) <= 0, page);
+ } else {
+ put_page(page);
+ }
+}
+
+/**
+ * delete_from_page_cache - delete page from page cache
+ * @page: the page which the kernel is trying to remove from page cache
+ *
+ * This must be called only on pages that have been verified to be in the page
+ * cache and locked. It will never put the page into the free list, the caller
+ * has a reference on the page.
+ */
+void delete_from_page_cache(struct page *page)
+{
+ struct address_space *mapping = page_mapping(page);
+ unsigned long flags;
+
+ BUG_ON(!PageLocked(page));
+ xa_lock_irqsave(&mapping->i_pages, flags);
+ __delete_from_page_cache(page, NULL);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
+
+ page_cache_free_page(mapping, page);
+}
+EXPORT_SYMBOL(delete_from_page_cache);
+
+/*
+ * page_cache_delete_batch - delete several pages from page cache
+ * @mapping: the mapping to which pages belong
+ * @pvec: pagevec with pages to delete
+ *
+ * The function walks over mapping->i_pages and removes pages passed in @pvec
+ * from the mapping. The function expects @pvec to be sorted by page index
+ * and is optimised for it to be dense.
+ * It tolerates holes in @pvec (mapping entries at those indices are not
+ * modified). The function expects only THP head pages to be present in the
+ * @pvec.
+ *
+ * The function expects the i_pages lock to be held.
+ */
+static void page_cache_delete_batch(struct address_space *mapping,
+ struct pagevec *pvec)
+{
+ XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index);
+ int total_pages = 0;
+ int i = 0;
+ struct page *page;
+
+ mapping_set_update(&xas, mapping);
+ xas_for_each(&xas, page, ULONG_MAX) {
+ if (i >= pagevec_count(pvec))
+ break;
+
+ /* A swap/dax/shadow entry got inserted? Skip it. */
+ if (xa_is_value(page))
+ continue;
+ /*
+ * A page got inserted in our range? Skip it. We have our
+ * pages locked so they are protected from being removed.
+ * If we see a page whose index is higher than ours, it
+ * means our page has been removed, which shouldn't be
+ * possible because we're holding the PageLock.
+ */
+ if (page != pvec->pages[i]) {
+ VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index,
+ page);
+ continue;
+ }
+
+ WARN_ON_ONCE(!PageLocked(page));
+
+ if (page->index == xas.xa_index)
+ page->mapping = NULL;
+ /* Leave page->index set: truncation lookup relies on it */
+
+ /*
+ * Move to the next page in the vector if this is a regular
+ * page or the index is of the last sub-page of this compound
+ * page.
+ */
+ if (page->index + compound_nr(page) - 1 == xas.xa_index)
+ i++;
+ xas_store(&xas, NULL);
+ total_pages++;
+ }
+ mapping->nrpages -= total_pages;
+}
+
+void delete_from_page_cache_batch(struct address_space *mapping,
+ struct pagevec *pvec)
+{
+ int i;
+ unsigned long flags;
+
+ if (!pagevec_count(pvec))
+ return;
+
+ xa_lock_irqsave(&mapping->i_pages, flags);
+ for (i = 0; i < pagevec_count(pvec); i++) {
+ trace_mm_filemap_delete_from_page_cache(pvec->pages[i]);
+
+ unaccount_page_cache_page(mapping, pvec->pages[i]);
+ }
+ page_cache_delete_batch(mapping, pvec);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
+
+ for (i = 0; i < pagevec_count(pvec); i++)
+ page_cache_free_page(mapping, pvec->pages[i]);
+}
+
+int filemap_check_errors(struct address_space *mapping)
+{
+ int ret = 0;
+ /* Check for outstanding write errors */
+ if (test_bit(AS_ENOSPC, &mapping->flags) &&
+ test_and_clear_bit(AS_ENOSPC, &mapping->flags))
+ ret = -ENOSPC;
+ if (test_bit(AS_EIO, &mapping->flags) &&
+ test_and_clear_bit(AS_EIO, &mapping->flags))
+ ret = -EIO;
+ return ret;
+}
+EXPORT_SYMBOL(filemap_check_errors);
+
+static int filemap_check_and_keep_errors(struct address_space *mapping)
+{
+ /* Check for outstanding write errors */
+ if (test_bit(AS_EIO, &mapping->flags))
+ return -EIO;
+ if (test_bit(AS_ENOSPC, &mapping->flags))
+ return -ENOSPC;
+ return 0;
+}
+
+/**
+ * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
+ * @mapping: address space structure to write
+ * @start: offset in bytes where the range starts
+ * @end: offset in bytes where the range ends (inclusive)
+ * @sync_mode: enable synchronous operation
+ *
+ * Start writeback against all of a mapping's dirty pages that lie
+ * within the byte offsets <start, end> inclusive.
+ *
+ * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
+ * opposed to a regular memory cleansing writeback. The difference between
+ * these two operations is that if a dirty page/buffer is encountered, it must
+ * be waited upon, and not just skipped over.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+ loff_t end, int sync_mode)
+{
+ int ret;
+ struct writeback_control wbc = {
+ .sync_mode = sync_mode,
+ .nr_to_write = LONG_MAX,
+ .range_start = start,
+ .range_end = end,
+ };
+
+ if (!mapping_can_writeback(mapping) ||
+ !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+ return 0;
+
+ wbc_attach_fdatawrite_inode(&wbc, mapping->host);
+ ret = do_writepages(mapping, &wbc);
+ wbc_detach_inode(&wbc);
+ return ret;
+}
+
+static inline int __filemap_fdatawrite(struct address_space *mapping,
+ int sync_mode)
+{
+ return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
+}
+
+int filemap_fdatawrite(struct address_space *mapping)
+{
+ return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
+}
+EXPORT_SYMBOL(filemap_fdatawrite);
+
+int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+ loff_t end)
+{
+ return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
+}
+EXPORT_SYMBOL(filemap_fdatawrite_range);
+
+/**
+ * filemap_flush - mostly a non-blocking flush
+ * @mapping: target address_space
+ *
+ * This is a mostly non-blocking flush. Not suitable for data-integrity
+ * purposes - I/O may not be started against all dirty pages.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int filemap_flush(struct address_space *mapping)
+{
+ return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
+}
+EXPORT_SYMBOL(filemap_flush);
+
+/**
+ * filemap_range_has_page - check if a page exists in range.
+ * @mapping: address space within which to check
+ * @start_byte: offset in bytes where the range starts
+ * @end_byte: offset in bytes where the range ends (inclusive)
+ *
+ * Find at least one page in the range supplied, usually used to check if
+ * direct writing in this range will trigger a writeback.
+ *
+ * Return: %true if at least one page exists in the specified range,
+ * %false otherwise.
+ */
+bool filemap_range_has_page(struct address_space *mapping,
+ loff_t start_byte, loff_t end_byte)
+{
+ struct page *page;
+ XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
+ pgoff_t max = end_byte >> PAGE_SHIFT;
+
+ if (end_byte < start_byte)
+ return false;
+
+ rcu_read_lock();
+ for (;;) {
+ page = xas_find(&xas, max);
+ if (xas_retry(&xas, page))
+ continue;
+ /* Shadow entries don't count */
+ if (xa_is_value(page))
+ continue;
+ /*
+ * We don't need to try to pin this page; we're about to
+ * release the RCU lock anyway. It is enough to know that
+ * there was a page here recently.
+ */
+ break;
+ }
+ rcu_read_unlock();
+
+ return page != NULL;
+}
+EXPORT_SYMBOL(filemap_range_has_page);
+
+static void __filemap_fdatawait_range(struct address_space *mapping,
+ loff_t start_byte, loff_t end_byte)
+{
+ pgoff_t index = start_byte >> PAGE_SHIFT;
+ pgoff_t end = end_byte >> PAGE_SHIFT;
+ struct pagevec pvec;
+ int nr_pages;
+
+ if (end_byte < start_byte)
+ return;
+
+ pagevec_init(&pvec);
+ while (index <= end) {
+ unsigned i;
+
+ nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
+ end, PAGECACHE_TAG_WRITEBACK);
+ if (!nr_pages)
+ break;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ wait_on_page_writeback(page);
+ ClearPageError(page);
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+}
+
+/**
+ * filemap_fdatawait_range - wait for writeback to complete
+ * @mapping: address space structure to wait for
+ * @start_byte: offset in bytes where the range starts
+ * @end_byte: offset in bytes where the range ends (inclusive)
+ *
+ * Walk the list of under-writeback pages of the given address space
+ * in the given range and wait for all of them. Check error status of
+ * the address space and return it.
+ *
+ * Since the error status of the address space is cleared by this function,
+ * callers are responsible for checking the return value and handling and/or
+ * reporting the error.
+ *
+ * Return: error status of the address space.
+ */
+int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
+ loff_t end_byte)
+{
+ __filemap_fdatawait_range(mapping, start_byte, end_byte);
+ return filemap_check_errors(mapping);
+}
+EXPORT_SYMBOL(filemap_fdatawait_range);
+
+/**
+ * filemap_fdatawait_range_keep_errors - wait for writeback to complete
+ * @mapping: address space structure to wait for
+ * @start_byte: offset in bytes where the range starts
+ * @end_byte: offset in bytes where the range ends (inclusive)
+ *
+ * Walk the list of under-writeback pages of the given address space in the
+ * given range and wait for all of them. Unlike filemap_fdatawait_range(),
+ * this function does not clear error status of the address space.
+ *
+ * Use this function if callers don't handle errors themselves. Expected
+ * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
+ * fsfreeze(8)
+ */
+int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
+ loff_t start_byte, loff_t end_byte)
+{
+ __filemap_fdatawait_range(mapping, start_byte, end_byte);
+ return filemap_check_and_keep_errors(mapping);
+}
+EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors);
+
+/**
+ * file_fdatawait_range - wait for writeback to complete
+ * @file: file pointing to address space structure to wait for
+ * @start_byte: offset in bytes where the range starts
+ * @end_byte: offset in bytes where the range ends (inclusive)
+ *
+ * Walk the list of under-writeback pages of the address space that file
+ * refers to, in the given range and wait for all of them. Check error
+ * status of the address space vs. the file->f_wb_err cursor and return it.
+ *
+ * Since the error status of the file is advanced by this function,
+ * callers are responsible for checking the return value and handling and/or
+ * reporting the error.
+ *
+ * Return: error status of the address space vs. the file->f_wb_err cursor.
+ */
+int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
+{
+ struct address_space *mapping = file->f_mapping;
+
+ __filemap_fdatawait_range(mapping, start_byte, end_byte);
+ return file_check_and_advance_wb_err(file);
+}
+EXPORT_SYMBOL(file_fdatawait_range);
+
+/**
+ * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
+ * @mapping: address space structure to wait for
+ *
+ * Walk the list of under-writeback pages of the given address space
+ * and wait for all of them. Unlike filemap_fdatawait(), this function
+ * does not clear error status of the address space.
+ *
+ * Use this function if callers don't handle errors themselves. Expected
+ * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
+ * fsfreeze(8)
+ *
+ * Return: error status of the address space.
+ */
+int filemap_fdatawait_keep_errors(struct address_space *mapping)
+{
+ __filemap_fdatawait_range(mapping, 0, LLONG_MAX);
+ return filemap_check_and_keep_errors(mapping);
+}
+EXPORT_SYMBOL(filemap_fdatawait_keep_errors);
+
+/* Returns true if writeback might be needed or already in progress. */
+static bool mapping_needs_writeback(struct address_space *mapping)
+{
+ if (dax_mapping(mapping))
+ return mapping->nrexceptional;
+
+ return mapping->nrpages;
+}
+
+/**
+ * filemap_write_and_wait_range - write out & wait on a file range
+ * @mapping: the address_space for the pages
+ * @lstart: offset in bytes where the range starts
+ * @lend: offset in bytes where the range ends (inclusive)
+ *
+ * Write out and wait upon file offsets lstart->lend, inclusive.
+ *
+ * Note that @lend is inclusive (describes the last byte to be written) so
+ * that this function can be used to write to the very end-of-file (end = -1).
+ *
+ * Return: error status of the address space.
+ */
+int filemap_write_and_wait_range(struct address_space *mapping,
+ loff_t lstart, loff_t lend)
+{
+ int err = 0;
+
+ if (mapping_needs_writeback(mapping)) {
+ err = __filemap_fdatawrite_range(mapping, lstart, lend,
+ WB_SYNC_ALL);
+ /*
+ * Even if the above returned error, the pages may be
+ * written partially (e.g. -ENOSPC), so we wait for it.
+ * But the -EIO is special case, it may indicate the worst
+ * thing (e.g. bug) happened, so we avoid waiting for it.
+ */
+ if (err != -EIO) {
+ int err2 = filemap_fdatawait_range(mapping,
+ lstart, lend);
+ if (!err)
+ err = err2;
+ } else {
+ /* Clear any previously stored errors */
+ filemap_check_errors(mapping);
+ }
+ } else {
+ err = filemap_check_errors(mapping);
+ }
+ return err;
+}
+EXPORT_SYMBOL(filemap_write_and_wait_range);
+
+void __filemap_set_wb_err(struct address_space *mapping, int err)
+{
+ errseq_t eseq = errseq_set(&mapping->wb_err, err);
+
+ trace_filemap_set_wb_err(mapping, eseq);
+}
+EXPORT_SYMBOL(__filemap_set_wb_err);
+
+/**
+ * file_check_and_advance_wb_err - report wb error (if any) that was previously
+ * and advance wb_err to current one
+ * @file: struct file on which the error is being reported
+ *
+ * When userland calls fsync (or something like nfsd does the equivalent), we
+ * want to report any writeback errors that occurred since the last fsync (or
+ * since the file was opened if there haven't been any).
+ *
+ * Grab the wb_err from the mapping. If it matches what we have in the file,
+ * then just quickly return 0. The file is all caught up.
+ *
+ * If it doesn't match, then take the mapping value, set the "seen" flag in
+ * it and try to swap it into place. If it works, or another task beat us
+ * to it with the new value, then update the f_wb_err and return the error
+ * portion. The error at this point must be reported via proper channels
+ * (a'la fsync, or NFS COMMIT operation, etc.).
+ *
+ * While we handle mapping->wb_err with atomic operations, the f_wb_err
+ * value is protected by the f_lock since we must ensure that it reflects
+ * the latest value swapped in for this file descriptor.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int file_check_and_advance_wb_err(struct file *file)
+{
+ int err = 0;
+ errseq_t old = READ_ONCE(file->f_wb_err);
+ struct address_space *mapping = file->f_mapping;
+
+ /* Locklessly handle the common case where nothing has changed */
+ if (errseq_check(&mapping->wb_err, old)) {
+ /* Something changed, must use slow path */
+ spin_lock(&file->f_lock);
+ old = file->f_wb_err;
+ err = errseq_check_and_advance(&mapping->wb_err,
+ &file->f_wb_err);
+ trace_file_check_and_advance_wb_err(file, old);
+ spin_unlock(&file->f_lock);
+ }
+
+ /*
+ * We're mostly using this function as a drop in replacement for
+ * filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect
+ * that the legacy code would have had on these flags.
+ */
+ clear_bit(AS_EIO, &mapping->flags);
+ clear_bit(AS_ENOSPC, &mapping->flags);
+ return err;
+}
+EXPORT_SYMBOL(file_check_and_advance_wb_err);
+
+/**
+ * file_write_and_wait_range - write out & wait on a file range
+ * @file: file pointing to address_space with pages
+ * @lstart: offset in bytes where the range starts
+ * @lend: offset in bytes where the range ends (inclusive)
+ *
+ * Write out and wait upon file offsets lstart->lend, inclusive.
+ *
+ * Note that @lend is inclusive (describes the last byte to be written) so
+ * that this function can be used to write to the very end-of-file (end = -1).
+ *
+ * After writing out and waiting on the data, we check and advance the
+ * f_wb_err cursor to the latest value, and return any errors detected there.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
+{
+ int err = 0, err2;
+ struct address_space *mapping = file->f_mapping;
+
+ if (mapping_needs_writeback(mapping)) {
+ err = __filemap_fdatawrite_range(mapping, lstart, lend,
+ WB_SYNC_ALL);
+ /* See comment of filemap_write_and_wait() */
+ if (err != -EIO)
+ __filemap_fdatawait_range(mapping, lstart, lend);
+ }
+ err2 = file_check_and_advance_wb_err(file);
+ if (!err)
+ err = err2;
+ return err;
+}
+EXPORT_SYMBOL(file_write_and_wait_range);
+
+/**
+ * replace_page_cache_page - replace a pagecache page with a new one
+ * @old: page to be replaced
+ * @new: page to replace with
+ * @gfp_mask: allocation mode
+ *
+ * This function replaces a page in the pagecache with a new one. On
+ * success it acquires the pagecache reference for the new page and
+ * drops it for the old page. Both the old and new pages must be
+ * locked. This function does not add the new page to the LRU, the
+ * caller must do that.
+ *
+ * The remove + add is atomic. This function cannot fail.
+ *
+ * Return: %0
+ */
+int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
+{
+ struct address_space *mapping = old->mapping;
+ void (*freepage)(struct page *) = mapping->a_ops->freepage;
+ pgoff_t offset = old->index;
+ XA_STATE(xas, &mapping->i_pages, offset);
+ unsigned long flags;
+
+ VM_BUG_ON_PAGE(!PageLocked(old), old);
+ VM_BUG_ON_PAGE(!PageLocked(new), new);
+ VM_BUG_ON_PAGE(new->mapping, new);
+
+ get_page(new);
+ new->mapping = mapping;
+ new->index = offset;
+
+ mem_cgroup_migrate(old, new);
+
+ xas_lock_irqsave(&xas, flags);
+ xas_store(&xas, new);
+
+ old->mapping = NULL;
+ /* hugetlb pages do not participate in page cache accounting. */
+ if (!PageHuge(old))
+ __dec_lruvec_page_state(old, NR_FILE_PAGES);
+ if (!PageHuge(new))
+ __inc_lruvec_page_state(new, NR_FILE_PAGES);
+ if (PageSwapBacked(old))
+ __dec_lruvec_page_state(old, NR_SHMEM);
+ if (PageSwapBacked(new))
+ __inc_lruvec_page_state(new, NR_SHMEM);
+ xas_unlock_irqrestore(&xas, flags);
+ if (freepage)
+ freepage(old);
+ put_page(old);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(replace_page_cache_page);
+
+noinline int __add_to_page_cache_locked(struct page *page,
+ struct address_space *mapping,
+ pgoff_t offset, gfp_t gfp,
+ void **shadowp)
+{
+ XA_STATE(xas, &mapping->i_pages, offset);
+ int huge = PageHuge(page);
+ int error;
+ bool charged = false;
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(PageSwapBacked(page), page);
+ mapping_set_update(&xas, mapping);
+
+ get_page(page);
+ page->mapping = mapping;
+ page->index = offset;
+
+ if (!huge) {
+ error = mem_cgroup_charge(page, current->mm, gfp);
+ if (error)
+ goto error;
+ charged = true;
+ }
+
+ gfp &= GFP_RECLAIM_MASK;
+
+ do {
+ unsigned int order = xa_get_order(xas.xa, xas.xa_index);
+ void *entry, *old = NULL;
+
+ if (order > thp_order(page))
+ xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index),
+ order, gfp);
+ xas_lock_irq(&xas);
+ xas_for_each_conflict(&xas, entry) {
+ old = entry;
+ if (!xa_is_value(entry)) {
+ xas_set_err(&xas, -EEXIST);
+ goto unlock;
+ }
+ }
+
+ if (old) {
+ if (shadowp)
+ *shadowp = old;
+ /* entry may have been split before we acquired lock */
+ order = xa_get_order(xas.xa, xas.xa_index);
+ if (order > thp_order(page)) {
+ xas_split(&xas, old, order);
+ xas_reset(&xas);
+ }
+ }
+
+ xas_store(&xas, page);
+ if (xas_error(&xas))
+ goto unlock;
+
+ if (old)
+ mapping->nrexceptional--;
+ mapping->nrpages++;
+
+ /* hugetlb pages do not participate in page cache accounting */
+ if (!huge)
+ __inc_lruvec_page_state(page, NR_FILE_PAGES);
+unlock:
+ xas_unlock_irq(&xas);
+ } while (xas_nomem(&xas, gfp));
+
+ if (xas_error(&xas)) {
+ error = xas_error(&xas);
+ if (charged)
+ mem_cgroup_uncharge(page);
+ goto error;
+ }
+
+ trace_mm_filemap_add_to_page_cache(page);
+ return 0;
+error:
+ page->mapping = NULL;
+ /* Leave page->index set: truncation relies upon it */
+ put_page(page);
+ return error;
+}
+ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO);
+
+/**
+ * add_to_page_cache_locked - add a locked page to the pagecache
+ * @page: page to add
+ * @mapping: the page's address_space
+ * @offset: page index
+ * @gfp_mask: page allocation mode
+ *
+ * This function is used to add a page to the pagecache. It must be locked.
+ * This function does not add the page to the LRU. The caller must do that.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
+ pgoff_t offset, gfp_t gfp_mask)
+{
+ return __add_to_page_cache_locked(page, mapping, offset,
+ gfp_mask, NULL);
+}
+EXPORT_SYMBOL(add_to_page_cache_locked);
+
+int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
+ pgoff_t offset, gfp_t gfp_mask)
+{
+ void *shadow = NULL;
+ int ret;
+
+ __SetPageLocked(page);
+ ret = __add_to_page_cache_locked(page, mapping, offset,
+ gfp_mask, &shadow);
+ if (unlikely(ret))
+ __ClearPageLocked(page);
+ else {
+ /*
+ * The page might have been evicted from cache only
+ * recently, in which case it should be activated like
+ * any other repeatedly accessed page.
+ * The exception is pages getting rewritten; evicting other
+ * data from the working set, only to cache data that will
+ * get overwritten with something else, is a waste of memory.
+ */
+ WARN_ON_ONCE(PageActive(page));
+ if (!(gfp_mask & __GFP_WRITE) && shadow)
+ workingset_refault(page, shadow);
+ lru_cache_add(page);
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
+
+#ifdef CONFIG_NUMA
+struct page *__page_cache_alloc(gfp_t gfp)
+{
+ int n;
+ struct page *page;
+
+ if (cpuset_do_page_mem_spread()) {
+ unsigned int cpuset_mems_cookie;
+ do {
+ cpuset_mems_cookie = read_mems_allowed_begin();
+ n = cpuset_mem_spread_node();
+ page = __alloc_pages_node(n, gfp, 0);
+ } while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
+
+ return page;
+ }
+ return alloc_pages(gfp, 0);
+}
+EXPORT_SYMBOL(__page_cache_alloc);
+#endif
+
+/*
+ * In order to wait for pages to become available there must be
+ * waitqueues associated with pages. By using a hash table of
+ * waitqueues where the bucket discipline is to maintain all
+ * waiters on the same queue and wake all when any of the pages
+ * become available, and for the woken contexts to check to be
+ * sure the appropriate page became available, this saves space
+ * at a cost of "thundering herd" phenomena during rare hash
+ * collisions.
+ */
+#define PAGE_WAIT_TABLE_BITS 8
+#define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
+static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
+
+static wait_queue_head_t *page_waitqueue(struct page *page)
+{
+ return &page_wait_table[hash_ptr(page, PAGE_WAIT_TABLE_BITS)];
+}
+
+void __init pagecache_init(void)
+{
+ int i;
+
+ for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
+ init_waitqueue_head(&page_wait_table[i]);
+
+ page_writeback_init();
+}
+
+/*
+ * The page wait code treats the "wait->flags" somewhat unusually, because
+ * we have multiple different kinds of waits, not just the usual "exclusive"
+ * one.
+ *
+ * We have:
+ *
+ * (a) no special bits set:
+ *
+ * We're just waiting for the bit to be released, and when a waker
+ * calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up,
+ * and remove it from the wait queue.
+ *
+ * Simple and straightforward.
+ *
+ * (b) WQ_FLAG_EXCLUSIVE:
+ *
+ * The waiter is waiting to get the lock, and only one waiter should
+ * be woken up to avoid any thundering herd behavior. We'll set the
+ * WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue.
+ *
+ * This is the traditional exclusive wait.
+ *
+ * (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM:
+ *
+ * The waiter is waiting to get the bit, and additionally wants the
+ * lock to be transferred to it for fair lock behavior. If the lock
+ * cannot be taken, we stop walking the wait queue without waking
+ * the waiter.
+ *
+ * This is the "fair lock handoff" case, and in addition to setting
+ * WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see
+ * that it now has the lock.
+ */
+static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
+{
+ unsigned int flags;
+ struct wait_page_key *key = arg;
+ struct wait_page_queue *wait_page
+ = container_of(wait, struct wait_page_queue, wait);
+
+ if (!wake_page_match(wait_page, key))
+ return 0;
+
+ /*
+ * If it's a lock handoff wait, we get the bit for it, and
+ * stop walking (and do not wake it up) if we can't.
+ */
+ flags = wait->flags;
+ if (flags & WQ_FLAG_EXCLUSIVE) {
+ if (test_bit(key->bit_nr, &key->page->flags))
+ return -1;
+ if (flags & WQ_FLAG_CUSTOM) {
+ if (test_and_set_bit(key->bit_nr, &key->page->flags))
+ return -1;
+ flags |= WQ_FLAG_DONE;
+ }
+ }
+
+ /*
+ * We are holding the wait-queue lock, but the waiter that
+ * is waiting for this will be checking the flags without
+ * any locking.
+ *
+ * So update the flags atomically, and wake up the waiter
+ * afterwards to avoid any races. This store-release pairs
+ * with the load-acquire in wait_on_page_bit_common().
+ */
+ smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
+ wake_up_state(wait->private, mode);
+
+ /*
+ * Ok, we have successfully done what we're waiting for,
+ * and we can unconditionally remove the wait entry.
+ *
+ * Note that this pairs with the "finish_wait()" in the
+ * waiter, and has to be the absolute last thing we do.
+ * After this list_del_init(&wait->entry) the wait entry
+ * might be de-allocated and the process might even have
+ * exited.
+ */
+ list_del_init_careful(&wait->entry);
+ return (flags & WQ_FLAG_EXCLUSIVE) != 0;
+}
+
+static void wake_up_page_bit(struct page *page, int bit_nr)
+{
+ wait_queue_head_t *q = page_waitqueue(page);
+ struct wait_page_key key;
+ unsigned long flags;
+ wait_queue_entry_t bookmark;
+
+ key.page = page;
+ key.bit_nr = bit_nr;
+ key.page_match = 0;
+
+ bookmark.flags = 0;
+ bookmark.private = NULL;
+ bookmark.func = NULL;
+ INIT_LIST_HEAD(&bookmark.entry);
+
+ spin_lock_irqsave(&q->lock, flags);
+ __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
+
+ while (bookmark.flags & WQ_FLAG_BOOKMARK) {
+ /*
+ * Take a breather from holding the lock,
+ * allow pages that finish wake up asynchronously
+ * to acquire the lock and remove themselves
+ * from wait queue
+ */
+ spin_unlock_irqrestore(&q->lock, flags);
+ cpu_relax();
+ spin_lock_irqsave(&q->lock, flags);
+ __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
+ }
+
+ /*
+ * It is possible for other pages to have collided on the waitqueue
+ * hash, so in that case check for a page match. That prevents a long-
+ * term waiter
+ *
+ * It is still possible to miss a case here, when we woke page waiters
+ * and removed them from the waitqueue, but there are still other
+ * page waiters.
+ */
+ if (!waitqueue_active(q) || !key.page_match) {
+ ClearPageWaiters(page);
+ /*
+ * It's possible to miss clearing Waiters here, when we woke
+ * our page waiters, but the hashed waitqueue has waiters for
+ * other pages on it.
+ *
+ * That's okay, it's a rare case. The next waker will clear it.
+ */
+ }
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+
+static void wake_up_page(struct page *page, int bit)
+{
+ if (!PageWaiters(page))
+ return;
+ wake_up_page_bit(page, bit);
+}
+
+/*
+ * A choice of three behaviors for wait_on_page_bit_common():
+ */
+enum behavior {
+ EXCLUSIVE, /* Hold ref to page and take the bit when woken, like
+ * __lock_page() waiting on then setting PG_locked.
+ */
+ SHARED, /* Hold ref to page and check the bit when woken, like
+ * wait_on_page_writeback() waiting on PG_writeback.
+ */
+ DROP, /* Drop ref to page before wait, no check when woken,
+ * like put_and_wait_on_page_locked() on PG_locked.
+ */
+};
+
+/*
+ * Attempt to check (or get) the page bit, and mark us done
+ * if successful.
+ */
+static inline bool trylock_page_bit_common(struct page *page, int bit_nr,
+ struct wait_queue_entry *wait)
+{
+ if (wait->flags & WQ_FLAG_EXCLUSIVE) {
+ if (test_and_set_bit(bit_nr, &page->flags))
+ return false;
+ } else if (test_bit(bit_nr, &page->flags))
+ return false;
+
+ wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
+ return true;
+}
+
+/* How many times do we accept lock stealing from under a waiter? */
+int sysctl_page_lock_unfairness = 5;
+
+static inline int wait_on_page_bit_common(wait_queue_head_t *q,
+ struct page *page, int bit_nr, int state, enum behavior behavior)
+{
+ int unfairness = sysctl_page_lock_unfairness;
+ struct wait_page_queue wait_page;
+ wait_queue_entry_t *wait = &wait_page.wait;
+ bool thrashing = false;
+ bool delayacct = false;
+ unsigned long pflags;
+
+ if (bit_nr == PG_locked &&
+ !PageUptodate(page) && PageWorkingset(page)) {
+ if (!PageSwapBacked(page)) {
+ delayacct_thrashing_start();
+ delayacct = true;
+ }
+ psi_memstall_enter(&pflags);
+ thrashing = true;
+ }
+
+ init_wait(wait);
+ wait->func = wake_page_function;
+ wait_page.page = page;
+ wait_page.bit_nr = bit_nr;
+
+repeat:
+ wait->flags = 0;
+ if (behavior == EXCLUSIVE) {
+ wait->flags = WQ_FLAG_EXCLUSIVE;
+ if (--unfairness < 0)
+ wait->flags |= WQ_FLAG_CUSTOM;
+ }
+
+ /*
+ * Do one last check whether we can get the
+ * page bit synchronously.
+ *
+ * Do the SetPageWaiters() marking before that
+ * to let any waker we _just_ missed know they
+ * need to wake us up (otherwise they'll never
+ * even go to the slow case that looks at the
+ * page queue), and add ourselves to the wait
+ * queue if we need to sleep.
+ *
+ * This part needs to be done under the queue
+ * lock to avoid races.
+ */
+ spin_lock_irq(&q->lock);
+ SetPageWaiters(page);
+ if (!trylock_page_bit_common(page, bit_nr, wait))
+ __add_wait_queue_entry_tail(q, wait);
+ spin_unlock_irq(&q->lock);
+
+ /*
+ * From now on, all the logic will be based on
+ * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to
+ * see whether the page bit testing has already
+ * been done by the wake function.
+ *
+ * We can drop our reference to the page.
+ */
+ if (behavior == DROP)
+ put_page(page);
+
+ /*
+ * Note that until the "finish_wait()", or until
+ * we see the WQ_FLAG_WOKEN flag, we need to
+ * be very careful with the 'wait->flags', because
+ * we may race with a waker that sets them.
+ */
+ for (;;) {
+ unsigned int flags;
+
+ set_current_state(state);
+
+ /* Loop until we've been woken or interrupted */
+ flags = smp_load_acquire(&wait->flags);
+ if (!(flags & WQ_FLAG_WOKEN)) {
+ if (signal_pending_state(state, current))
+ break;
+
+ io_schedule();
+ continue;
+ }
+
+ /* If we were non-exclusive, we're done */
+ if (behavior != EXCLUSIVE)
+ break;
+
+ /* If the waker got the lock for us, we're done */
+ if (flags & WQ_FLAG_DONE)
+ break;
+
+ /*
+ * Otherwise, if we're getting the lock, we need to
+ * try to get it ourselves.
+ *
+ * And if that fails, we'll have to retry this all.
+ */
+ if (unlikely(test_and_set_bit(bit_nr, &page->flags)))
+ goto repeat;
+
+ wait->flags |= WQ_FLAG_DONE;
+ break;
+ }
+
+ /*
+ * If a signal happened, this 'finish_wait()' may remove the last
+ * waiter from the wait-queues, but the PageWaiters bit will remain
+ * set. That's ok. The next wakeup will take care of it, and trying
+ * to do it here would be difficult and prone to races.
+ */
+ finish_wait(q, wait);
+
+ if (thrashing) {
+ if (delayacct)
+ delayacct_thrashing_end();
+ psi_memstall_leave(&pflags);
+ }
+
+ /*
+ * NOTE! The wait->flags weren't stable until we've done the
+ * 'finish_wait()', and we could have exited the loop above due
+ * to a signal, and had a wakeup event happen after the signal
+ * test but before the 'finish_wait()'.
+ *
+ * So only after the finish_wait() can we reliably determine
+ * if we got woken up or not, so we can now figure out the final
+ * return value based on that state without races.
+ *
+ * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive
+ * waiter, but an exclusive one requires WQ_FLAG_DONE.
+ */
+ if (behavior == EXCLUSIVE)
+ return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR;
+
+ return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
+}
+
+void wait_on_page_bit(struct page *page, int bit_nr)
+{
+ wait_queue_head_t *q = page_waitqueue(page);
+ wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
+}
+EXPORT_SYMBOL(wait_on_page_bit);
+
+int wait_on_page_bit_killable(struct page *page, int bit_nr)
+{
+ wait_queue_head_t *q = page_waitqueue(page);
+ return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, SHARED);
+}
+EXPORT_SYMBOL(wait_on_page_bit_killable);
+
+static int __wait_on_page_locked_async(struct page *page,
+ struct wait_page_queue *wait, bool set)
+{
+ struct wait_queue_head *q = page_waitqueue(page);
+ int ret = 0;
+
+ wait->page = page;
+ wait->bit_nr = PG_locked;
+
+ spin_lock_irq(&q->lock);
+ __add_wait_queue_entry_tail(q, &wait->wait);
+ SetPageWaiters(page);
+ if (set)
+ ret = !trylock_page(page);
+ else
+ ret = PageLocked(page);
+ /*
+ * If we were succesful now, we know we're still on the
+ * waitqueue as we're still under the lock. This means it's
+ * safe to remove and return success, we know the callback
+ * isn't going to trigger.
+ */
+ if (!ret)
+ __remove_wait_queue(q, &wait->wait);
+ else
+ ret = -EIOCBQUEUED;
+ spin_unlock_irq(&q->lock);
+ return ret;
+}
+
+static int wait_on_page_locked_async(struct page *page,
+ struct wait_page_queue *wait)
+{
+ if (!PageLocked(page))
+ return 0;
+ return __wait_on_page_locked_async(compound_head(page), wait, false);
+}
+
+/**
+ * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked
+ * @page: The page to wait for.
+ *
+ * The caller should hold a reference on @page. They expect the page to
+ * become unlocked relatively soon, but do not wish to hold up migration
+ * (for example) by holding the reference while waiting for the page to
+ * come unlocked. After this function returns, the caller should not
+ * dereference @page.
+ */
+void put_and_wait_on_page_locked(struct page *page)
+{
+ wait_queue_head_t *q;
+
+ page = compound_head(page);
+ q = page_waitqueue(page);
+ wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, DROP);
+}
+
+/**
+ * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
+ * @page: Page defining the wait queue of interest
+ * @waiter: Waiter to add to the queue
+ *
+ * Add an arbitrary @waiter to the wait queue for the nominated @page.
+ */
+void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
+{
+ wait_queue_head_t *q = page_waitqueue(page);
+ unsigned long flags;
+
+ spin_lock_irqsave(&q->lock, flags);
+ __add_wait_queue_entry_tail(q, waiter);
+ SetPageWaiters(page);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL_GPL(add_page_wait_queue);
+
+#ifndef clear_bit_unlock_is_negative_byte
+
+/*
+ * PG_waiters is the high bit in the same byte as PG_lock.
+ *
+ * On x86 (and on many other architectures), we can clear PG_lock and
+ * test the sign bit at the same time. But if the architecture does
+ * not support that special operation, we just do this all by hand
+ * instead.
+ *
+ * The read of PG_waiters has to be after (or concurrently with) PG_locked
+ * being cleared, but a memory barrier should be unnecessary since it is
+ * in the same byte as PG_locked.
+ */
+static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem)
+{
+ clear_bit_unlock(nr, mem);
+ /* smp_mb__after_atomic(); */
+ return test_bit(PG_waiters, mem);
+}
+
+#endif
+
+/**
+ * unlock_page - unlock a locked page
+ * @page: the page
+ *
+ * Unlocks the page and wakes up sleepers in wait_on_page_locked().
+ * Also wakes sleepers in wait_on_page_writeback() because the wakeup
+ * mechanism between PageLocked pages and PageWriteback pages is shared.
+ * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
+ *
+ * Note that this depends on PG_waiters being the sign bit in the byte
+ * that contains PG_locked - thus the BUILD_BUG_ON(). That allows us to
+ * clear the PG_locked bit and test PG_waiters at the same time fairly
+ * portably (architectures that do LL/SC can test any bit, while x86 can
+ * test the sign bit).
+ */
+void unlock_page(struct page *page)
+{
+ BUILD_BUG_ON(PG_waiters != 7);
+ page = compound_head(page);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ if (clear_bit_unlock_is_negative_byte(PG_locked, &page->flags))
+ wake_up_page_bit(page, PG_locked);
+}
+EXPORT_SYMBOL(unlock_page);
+
+/**
+ * end_page_writeback - end writeback against a page
+ * @page: the page
+ */
+void end_page_writeback(struct page *page)
+{
+ /*
+ * TestClearPageReclaim could be used here but it is an atomic
+ * operation and overkill in this particular case. Failing to
+ * shuffle a page marked for immediate reclaim is too mild to
+ * justify taking an atomic operation penalty at the end of
+ * ever page writeback.
+ */
+ if (PageReclaim(page)) {
+ ClearPageReclaim(page);
+ rotate_reclaimable_page(page);
+ }
+
+ /*
+ * Writeback does not hold a page reference of its own, relying
+ * on truncation to wait for the clearing of PG_writeback.
+ * But here we must make sure that the page is not freed and
+ * reused before the wake_up_page().
+ */
+ get_page(page);
+ if (!test_clear_page_writeback(page))
+ BUG();
+
+ smp_mb__after_atomic();
+ wake_up_page(page, PG_writeback);
+ put_page(page);
+}
+EXPORT_SYMBOL(end_page_writeback);
+
+/*
+ * After completing I/O on a page, call this routine to update the page
+ * flags appropriately
+ */
+void page_endio(struct page *page, bool is_write, int err)
+{
+ if (!is_write) {
+ if (!err) {
+ SetPageUptodate(page);
+ } else {
+ ClearPageUptodate(page);
+ SetPageError(page);
+ }
+ unlock_page(page);
+ } else {
+ if (err) {
+ struct address_space *mapping;
+
+ SetPageError(page);
+ mapping = page_mapping(page);
+ if (mapping)
+ mapping_set_error(mapping, err);
+ }
+ end_page_writeback(page);
+ }
+}
+EXPORT_SYMBOL_GPL(page_endio);
+
+/**
+ * __lock_page - get a lock on the page, assuming we need to sleep to get it
+ * @__page: the page to lock
+ */
+void __lock_page(struct page *__page)
+{
+ struct page *page = compound_head(__page);
+ wait_queue_head_t *q = page_waitqueue(page);
+ wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE,
+ EXCLUSIVE);
+}
+EXPORT_SYMBOL(__lock_page);
+
+int __lock_page_killable(struct page *__page)
+{
+ struct page *page = compound_head(__page);
+ wait_queue_head_t *q = page_waitqueue(page);
+ return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE,
+ EXCLUSIVE);
+}
+EXPORT_SYMBOL_GPL(__lock_page_killable);
+
+int __lock_page_async(struct page *page, struct wait_page_queue *wait)
+{
+ return __wait_on_page_locked_async(page, wait, true);
+}
+
+/*
+ * Return values:
+ * 1 - page is locked; mmap_lock is still held.
+ * 0 - page is not locked.
+ * mmap_lock has been released (mmap_read_unlock(), unless flags had both
+ * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in
+ * which case mmap_lock is still held.
+ *
+ * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1
+ * with the page locked and the mmap_lock unperturbed.
+ */
+int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
+ unsigned int flags)
+{
+ if (fault_flag_allow_retry_first(flags)) {
+ /*
+ * CAUTION! In this case, mmap_lock is not released
+ * even though return 0.
+ */
+ if (flags & FAULT_FLAG_RETRY_NOWAIT)
+ return 0;
+
+ mmap_read_unlock(mm);
+ if (flags & FAULT_FLAG_KILLABLE)
+ wait_on_page_locked_killable(page);
+ else
+ wait_on_page_locked(page);
+ return 0;
+ } else {
+ if (flags & FAULT_FLAG_KILLABLE) {
+ int ret;
+
+ ret = __lock_page_killable(page);
+ if (ret) {
+ mmap_read_unlock(mm);
+ return 0;
+ }
+ } else
+ __lock_page(page);
+ return 1;
+ }
+}
+
+/**
+ * page_cache_next_miss() - Find the next gap in the page cache.
+ * @mapping: Mapping.
+ * @index: Index.
+ * @max_scan: Maximum range to search.
+ *
+ * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the
+ * gap with the lowest index.
+ *
+ * This function may be called under the rcu_read_lock. However, this will
+ * not atomically search a snapshot of the cache at a single point in time.
+ * For example, if a gap is created at index 5, then subsequently a gap is
+ * created at index 10, page_cache_next_miss covering both indices may
+ * return 10 if called under the rcu_read_lock.
+ *
+ * Return: The index of the gap if found, otherwise an index outside the
+ * range specified (in which case 'return - index >= max_scan' will be true).
+ * In the rare case of index wrap-around, 0 will be returned.
+ */
+pgoff_t page_cache_next_miss(struct address_space *mapping,
+ pgoff_t index, unsigned long max_scan)
+{
+ XA_STATE(xas, &mapping->i_pages, index);
+
+ while (max_scan--) {
+ void *entry = xas_next(&xas);
+ if (!entry || xa_is_value(entry))
+ break;
+ if (xas.xa_index == 0)
+ break;
+ }
+
+ return xas.xa_index;
+}
+EXPORT_SYMBOL(page_cache_next_miss);
+
+/**
+ * page_cache_prev_miss() - Find the previous gap in the page cache.
+ * @mapping: Mapping.
+ * @index: Index.
+ * @max_scan: Maximum range to search.
+ *
+ * Search the range [max(index - max_scan + 1, 0), index] for the
+ * gap with the highest index.
+ *
+ * This function may be called under the rcu_read_lock. However, this will
+ * not atomically search a snapshot of the cache at a single point in time.
+ * For example, if a gap is created at index 10, then subsequently a gap is
+ * created at index 5, page_cache_prev_miss() covering both indices may
+ * return 5 if called under the rcu_read_lock.
+ *
+ * Return: The index of the gap if found, otherwise an index outside the
+ * range specified (in which case 'index - return >= max_scan' will be true).
+ * In the rare case of wrap-around, ULONG_MAX will be returned.
+ */
+pgoff_t page_cache_prev_miss(struct address_space *mapping,
+ pgoff_t index, unsigned long max_scan)
+{
+ XA_STATE(xas, &mapping->i_pages, index);
+
+ while (max_scan--) {
+ void *entry = xas_prev(&xas);
+ if (!entry || xa_is_value(entry))
+ break;
+ if (xas.xa_index == ULONG_MAX)
+ break;
+ }
+
+ return xas.xa_index;
+}
+EXPORT_SYMBOL(page_cache_prev_miss);
+
+/**
+ * find_get_entry - find and get a page cache entry
+ * @mapping: the address_space to search
+ * @index: The page cache index.
+ *
+ * Looks up the page cache slot at @mapping & @offset. If there is a
+ * page cache page, the head page is returned with an increased refcount.
+ *
+ * If the slot holds a shadow entry of a previously evicted page, or a
+ * swap entry from shmem/tmpfs, it is returned.
+ *
+ * Return: The head page or shadow entry, %NULL if nothing is found.
+ */
+struct page *find_get_entry(struct address_space *mapping, pgoff_t index)
+{
+ XA_STATE(xas, &mapping->i_pages, index);
+ struct page *page;
+
+ rcu_read_lock();
+repeat:
+ xas_reset(&xas);
+ page = xas_load(&xas);
+ if (xas_retry(&xas, page))
+ goto repeat;
+ /*
+ * A shadow entry of a recently evicted page, or a swap entry from
+ * shmem/tmpfs. Return it without attempting to raise page count.
+ */
+ if (!page || xa_is_value(page))
+ goto out;
+
+ if (!page_cache_get_speculative(page))
+ goto repeat;
+
+ /*
+ * Has the page moved or been split?
+ * This is part of the lockless pagecache protocol. See
+ * include/linux/pagemap.h for details.
+ */
+ if (unlikely(page != xas_reload(&xas))) {
+ put_page(page);
+ goto repeat;
+ }
+out:
+ rcu_read_unlock();
+
+ return page;
+}
+
+/**
+ * find_lock_entry - Locate and lock a page cache entry.
+ * @mapping: The address_space to search.
+ * @index: The page cache index.
+ *
+ * Looks up the page at @mapping & @index. If there is a page in the
+ * cache, the head page is returned locked and with an increased refcount.
+ *
+ * If the slot holds a shadow entry of a previously evicted page, or a
+ * swap entry from shmem/tmpfs, it is returned.
+ *
+ * Context: May sleep.
+ * Return: The head page or shadow entry, %NULL if nothing is found.
+ */
+struct page *find_lock_entry(struct address_space *mapping, pgoff_t index)
+{
+ struct page *page;
+
+repeat:
+ page = find_get_entry(mapping, index);
+ if (page && !xa_is_value(page)) {
+ lock_page(page);
+ /* Has the page been truncated? */
+ if (unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ put_page(page);
+ goto repeat;
+ }
+ VM_BUG_ON_PAGE(!thp_contains(page, index), page);
+ }
+ return page;
+}
+
+/**
+ * pagecache_get_page - Find and get a reference to a page.
+ * @mapping: The address_space to search.
+ * @index: The page index.
+ * @fgp_flags: %FGP flags modify how the page is returned.
+ * @gfp_mask: Memory allocation flags to use if %FGP_CREAT is specified.
+ *
+ * Looks up the page cache entry at @mapping & @index.
+ *
+ * @fgp_flags can be zero or more of these flags:
+ *
+ * * %FGP_ACCESSED - The page will be marked accessed.
+ * * %FGP_LOCK - The page is returned locked.
+ * * %FGP_HEAD - If the page is present and a THP, return the head page
+ * rather than the exact page specified by the index.
+ * * %FGP_CREAT - If no page is present then a new page is allocated using
+ * @gfp_mask and added to the page cache and the VM's LRU list.
+ * The page is returned locked and with an increased refcount.
+ * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
+ * page is already in cache. If the page was allocated, unlock it before
+ * returning so the caller can do the same dance.
+ * * %FGP_WRITE - The page will be written
+ * * %FGP_NOFS - __GFP_FS will get cleared in gfp mask
+ * * %FGP_NOWAIT - Don't get blocked by page lock
+ *
+ * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
+ * if the %GFP flags specified for %FGP_CREAT are atomic.
+ *
+ * If there is a page cache page, it is returned with an increased refcount.
+ *
+ * Return: The found page or %NULL otherwise.
+ */
+struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
+ int fgp_flags, gfp_t gfp_mask)
+{
+ struct page *page;
+
+repeat:
+ page = find_get_entry(mapping, index);
+ if (xa_is_value(page))
+ page = NULL;
+ if (!page)
+ goto no_page;
+
+ if (fgp_flags & FGP_LOCK) {
+ if (fgp_flags & FGP_NOWAIT) {
+ if (!trylock_page(page)) {
+ put_page(page);
+ return NULL;
+ }
+ } else {
+ lock_page(page);
+ }
+
+ /* Has the page been truncated? */
+ if (unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ put_page(page);
+ goto repeat;
+ }
+ VM_BUG_ON_PAGE(!thp_contains(page, index), page);
+ }
+
+ if (fgp_flags & FGP_ACCESSED)
+ mark_page_accessed(page);
+ else if (fgp_flags & FGP_WRITE) {
+ /* Clear idle flag for buffer write */
+ if (page_is_idle(page))
+ clear_page_idle(page);
+ }
+ if (!(fgp_flags & FGP_HEAD))
+ page = find_subpage(page, index);
+
+no_page:
+ if (!page && (fgp_flags & FGP_CREAT)) {
+ int err;
+ if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
+ gfp_mask |= __GFP_WRITE;
+ if (fgp_flags & FGP_NOFS)
+ gfp_mask &= ~__GFP_FS;
+
+ page = __page_cache_alloc(gfp_mask);
+ if (!page)
+ return NULL;
+
+ if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
+ fgp_flags |= FGP_LOCK;
+
+ /* Init accessed so avoid atomic mark_page_accessed later */
+ if (fgp_flags & FGP_ACCESSED)
+ __SetPageReferenced(page);
+
+ err = add_to_page_cache_lru(page, mapping, index, gfp_mask);
+ if (unlikely(err)) {
+ put_page(page);
+ page = NULL;
+ if (err == -EEXIST)
+ goto repeat;
+ }
+
+ /*
+ * add_to_page_cache_lru locks the page, and for mmap we expect
+ * an unlocked page.
+ */
+ if (page && (fgp_flags & FGP_FOR_MMAP))
+ unlock_page(page);
+ }
+
+ return page;
+}
+EXPORT_SYMBOL(pagecache_get_page);
+
+/**
+ * find_get_entries - gang pagecache lookup
+ * @mapping: The address_space to search
+ * @start: The starting page cache index
+ * @nr_entries: The maximum number of entries
+ * @entries: Where the resulting entries are placed
+ * @indices: The cache indices corresponding to the entries in @entries
+ *
+ * find_get_entries() will search for and return a group of up to
+ * @nr_entries entries in the mapping. The entries are placed at
+ * @entries. find_get_entries() takes a reference against any actual
+ * pages it returns.
+ *
+ * The search returns a group of mapping-contiguous page cache entries
+ * with ascending indexes. There may be holes in the indices due to
+ * not-present pages.
+ *
+ * Any shadow entries of evicted pages, or swap entries from
+ * shmem/tmpfs, are included in the returned array.
+ *
+ * If it finds a Transparent Huge Page, head or tail, find_get_entries()
+ * stops at that page: the caller is likely to have a better way to handle
+ * the compound page as a whole, and then skip its extent, than repeatedly
+ * calling find_get_entries() to return all its tails.
+ *
+ * Return: the number of pages and shadow entries which were found.
+ */
+unsigned find_get_entries(struct address_space *mapping,
+ pgoff_t start, unsigned int nr_entries,
+ struct page **entries, pgoff_t *indices)
+{
+ XA_STATE(xas, &mapping->i_pages, start);
+ struct page *page;
+ unsigned int ret = 0;
+
+ if (!nr_entries)
+ return 0;
+
+ rcu_read_lock();
+ xas_for_each(&xas, page, ULONG_MAX) {
+ if (xas_retry(&xas, page))
+ continue;
+ /*
+ * A shadow entry of a recently evicted page, a swap
+ * entry from shmem/tmpfs or a DAX entry. Return it
+ * without attempting to raise page count.
+ */
+ if (xa_is_value(page))
+ goto export;
+
+ if (!page_cache_get_speculative(page))
+ goto retry;
+
+ /* Has the page moved or been split? */
+ if (unlikely(page != xas_reload(&xas)))
+ goto put_page;
+
+ /*
+ * Terminate early on finding a THP, to allow the caller to
+ * handle it all at once; but continue if this is hugetlbfs.
+ */
+ if (PageTransHuge(page) && !PageHuge(page)) {
+ page = find_subpage(page, xas.xa_index);
+ nr_entries = ret + 1;
+ }
+export:
+ indices[ret] = xas.xa_index;
+ entries[ret] = page;
+ if (++ret == nr_entries)
+ break;
+ continue;
+put_page:
+ put_page(page);
+retry:
+ xas_reset(&xas);
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+/**
+ * find_get_pages_range - gang pagecache lookup
+ * @mapping: The address_space to search
+ * @start: The starting page index
+ * @end: The final page index (inclusive)
+ * @nr_pages: The maximum number of pages
+ * @pages: Where the resulting pages are placed
+ *
+ * find_get_pages_range() will search for and return a group of up to @nr_pages
+ * pages in the mapping starting at index @start and up to index @end
+ * (inclusive). The pages are placed at @pages. find_get_pages_range() takes
+ * a reference against the returned pages.
+ *
+ * The search returns a group of mapping-contiguous pages with ascending
+ * indexes. There may be holes in the indices due to not-present pages.
+ * We also update @start to index the next page for the traversal.
+ *
+ * Return: the number of pages which were found. If this number is
+ * smaller than @nr_pages, the end of specified range has been
+ * reached.
+ */
+unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
+ pgoff_t end, unsigned int nr_pages,
+ struct page **pages)
+{
+ XA_STATE(xas, &mapping->i_pages, *start);
+ struct page *page;
+ unsigned ret = 0;
+
+ if (unlikely(!nr_pages))
+ return 0;
+
+ rcu_read_lock();
+ xas_for_each(&xas, page, end) {
+ if (xas_retry(&xas, page))
+ continue;
+ /* Skip over shadow, swap and DAX entries */
+ if (xa_is_value(page))
+ continue;
+
+ if (!page_cache_get_speculative(page))
+ goto retry;
+
+ /* Has the page moved or been split? */
+ if (unlikely(page != xas_reload(&xas)))
+ goto put_page;
+
+ pages[ret] = find_subpage(page, xas.xa_index);
+ if (++ret == nr_pages) {
+ *start = xas.xa_index + 1;
+ goto out;
+ }
+ continue;
+put_page:
+ put_page(page);
+retry:
+ xas_reset(&xas);
+ }
+
+ /*
+ * We come here when there is no page beyond @end. We take care to not
+ * overflow the index @start as it confuses some of the callers. This
+ * breaks the iteration when there is a page at index -1 but that is
+ * already broken anyway.
+ */
+ if (end == (pgoff_t)-1)
+ *start = (pgoff_t)-1;
+ else
+ *start = end + 1;
+out:
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/**
+ * find_get_pages_contig - gang contiguous pagecache lookup
+ * @mapping: The address_space to search
+ * @index: The starting page index
+ * @nr_pages: The maximum number of pages
+ * @pages: Where the resulting pages are placed
+ *
+ * find_get_pages_contig() works exactly like find_get_pages(), except
+ * that the returned number of pages are guaranteed to be contiguous.
+ *
+ * Return: the number of pages which were found.
+ */
+unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
+ unsigned int nr_pages, struct page **pages)
+{
+ XA_STATE(xas, &mapping->i_pages, index);
+ struct page *page;
+ unsigned int ret = 0;
+
+ if (unlikely(!nr_pages))
+ return 0;
+
+ rcu_read_lock();
+ for (page = xas_load(&xas); page; page = xas_next(&xas)) {
+ if (xas_retry(&xas, page))
+ continue;
+ /*
+ * If the entry has been swapped out, we can stop looking.
+ * No current caller is looking for DAX entries.
+ */
+ if (xa_is_value(page))
+ break;
+
+ if (!page_cache_get_speculative(page))
+ goto retry;
+
+ /* Has the page moved or been split? */
+ if (unlikely(page != xas_reload(&xas)))
+ goto put_page;
+
+ pages[ret] = find_subpage(page, xas.xa_index);
+ if (++ret == nr_pages)
+ break;
+ continue;
+put_page:
+ put_page(page);
+retry:
+ xas_reset(&xas);
+ }
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL(find_get_pages_contig);
+
+/**
+ * find_get_pages_range_tag - find and return pages in given range matching @tag
+ * @mapping: the address_space to search
+ * @index: the starting page index
+ * @end: The final page index (inclusive)
+ * @tag: the tag index
+ * @nr_pages: the maximum number of pages
+ * @pages: where the resulting pages are placed
+ *
+ * Like find_get_pages, except we only return pages which are tagged with
+ * @tag. We update @index to index the next page for the traversal.
+ *
+ * Return: the number of pages which were found.
+ */
+unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
+ pgoff_t end, xa_mark_t tag, unsigned int nr_pages,
+ struct page **pages)
+{
+ XA_STATE(xas, &mapping->i_pages, *index);
+ struct page *page;
+ unsigned ret = 0;
+
+ if (unlikely(!nr_pages))
+ return 0;
+
+ rcu_read_lock();
+ xas_for_each_marked(&xas, page, end, tag) {
+ if (xas_retry(&xas, page))
+ continue;
+ /*
+ * Shadow entries should never be tagged, but this iteration
+ * is lockless so there is a window for page reclaim to evict
+ * a page we saw tagged. Skip over it.
+ */
+ if (xa_is_value(page))
+ continue;
+
+ if (!page_cache_get_speculative(page))
+ goto retry;
+
+ /* Has the page moved or been split? */
+ if (unlikely(page != xas_reload(&xas)))
+ goto put_page;
+
+ pages[ret] = find_subpage(page, xas.xa_index);
+ if (++ret == nr_pages) {
+ *index = xas.xa_index + 1;
+ goto out;
+ }
+ continue;
+put_page:
+ put_page(page);
+retry:
+ xas_reset(&xas);
+ }
+
+ /*
+ * We come here when we got to @end. We take care to not overflow the
+ * index @index as it confuses some of the callers. This breaks the
+ * iteration when there is a page at index -1 but that is already
+ * broken anyway.
+ */
+ if (end == (pgoff_t)-1)
+ *index = (pgoff_t)-1;
+ else
+ *index = end + 1;
+out:
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL(find_get_pages_range_tag);
+
+/*
+ * CD/DVDs are error prone. When a medium error occurs, the driver may fail
+ * a _large_ part of the i/o request. Imagine the worst scenario:
+ *
+ * ---R__________________________________________B__________
+ * ^ reading here ^ bad block(assume 4k)
+ *
+ * read(R) => miss => readahead(R...B) => media error => frustrating retries
+ * => failing the whole request => read(R) => read(R+1) =>
+ * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
+ * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
+ * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
+ *
+ * It is going insane. Fix it by quickly scaling down the readahead size.
+ */
+static void shrink_readahead_size_eio(struct file_ra_state *ra)
+{
+ ra->ra_pages /= 4;
+}
+
+/**
+ * generic_file_buffered_read - generic file read routine
+ * @iocb: the iocb to read
+ * @iter: data destination
+ * @written: already copied
+ *
+ * This is a generic file read routine, and uses the
+ * mapping->a_ops->readpage() function for the actual low-level stuff.
+ *
+ * This is really ugly. But the goto's actually try to clarify some
+ * of the logic when it comes to error handling etc.
+ *
+ * Return:
+ * * total number of bytes copied, including those the were already @written
+ * * negative error code if nothing was copied
+ */
+ssize_t generic_file_buffered_read(struct kiocb *iocb,
+ struct iov_iter *iter, ssize_t written)
+{
+ struct file *filp = iocb->ki_filp;
+ struct address_space *mapping = filp->f_mapping;
+ struct inode *inode = mapping->host;
+ struct file_ra_state *ra = &filp->f_ra;
+ loff_t *ppos = &iocb->ki_pos;
+ pgoff_t index;
+ pgoff_t last_index;
+ pgoff_t prev_index;
+ unsigned long offset; /* offset into pagecache page */
+ unsigned int prev_offset;
+ int error = 0;
+
+ if (unlikely(*ppos >= inode->i_sb->s_maxbytes))
+ return 0;
+ if (unlikely(!iov_iter_count(iter)))
+ return 0;
+
+ iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
+
+ index = *ppos >> PAGE_SHIFT;
+ prev_index = ra->prev_pos >> PAGE_SHIFT;
+ prev_offset = ra->prev_pos & (PAGE_SIZE-1);
+ last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
+ offset = *ppos & ~PAGE_MASK;
+
+ /*
+ * If we've already successfully copied some data, then we
+ * can no longer safely return -EIOCBQUEUED. Hence mark
+ * an async read NOWAIT at that point.
+ */
+ if (written && (iocb->ki_flags & IOCB_WAITQ))
+ iocb->ki_flags |= IOCB_NOWAIT;
+
+ for (;;) {
+ struct page *page;
+ pgoff_t end_index;
+ loff_t isize;
+ unsigned long nr, ret;
+
+ cond_resched();
+find_page:
+ if (fatal_signal_pending(current)) {
+ error = -EINTR;
+ goto out;
+ }
+
+ page = find_get_page(mapping, index);
+ if (!page) {
+ if (iocb->ki_flags & IOCB_NOIO)
+ goto would_block;
+ page_cache_sync_readahead(mapping,
+ ra, filp,
+ index, last_index - index);
+ page = find_get_page(mapping, index);
+ if (unlikely(page == NULL))
+ goto no_cached_page;
+ }
+ if (PageReadahead(page)) {
+ if (iocb->ki_flags & IOCB_NOIO) {
+ put_page(page);
+ goto out;
+ }
+ page_cache_async_readahead(mapping,
+ ra, filp, page,
+ index, last_index - index);
+ }
+ if (!PageUptodate(page)) {
+ /*
+ * See comment in do_read_cache_page on why
+ * wait_on_page_locked is used to avoid unnecessarily
+ * serialisations and why it's safe.
+ */
+ if (iocb->ki_flags & IOCB_WAITQ) {
+ if (written) {
+ put_page(page);
+ goto out;
+ }
+ error = wait_on_page_locked_async(page,
+ iocb->ki_waitq);
+ } else {
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ put_page(page);
+ goto would_block;
+ }
+ error = wait_on_page_locked_killable(page);
+ }
+ if (unlikely(error))
+ goto readpage_error;
+ if (PageUptodate(page))
+ goto page_ok;
+
+ if (inode->i_blkbits == PAGE_SHIFT ||
+ !mapping->a_ops->is_partially_uptodate)
+ goto page_not_up_to_date;
+ /* pipes can't handle partially uptodate pages */
+ if (unlikely(iov_iter_is_pipe(iter)))
+ goto page_not_up_to_date;
+ if (!trylock_page(page))
+ goto page_not_up_to_date;
+ /* Did it get truncated before we got the lock? */
+ if (!page->mapping)
+ goto page_not_up_to_date_locked;
+ if (!mapping->a_ops->is_partially_uptodate(page,
+ offset, iter->count))
+ goto page_not_up_to_date_locked;
+ unlock_page(page);
+ }
+page_ok:
+ /*
+ * i_size must be checked after we know the page is Uptodate.
+ *
+ * Checking i_size after the check allows us to calculate
+ * the correct value for "nr", which means the zero-filled
+ * part of the page is not copied back to userspace (unless
+ * another truncate extends the file - this is desired though).
+ */
+
+ isize = i_size_read(inode);
+ end_index = (isize - 1) >> PAGE_SHIFT;
+ if (unlikely(!isize || index > end_index)) {
+ put_page(page);
+ goto out;
+ }
+
+ /* nr is the maximum number of bytes to copy from this page */
+ nr = PAGE_SIZE;
+ if (index == end_index) {
+ nr = ((isize - 1) & ~PAGE_MASK) + 1;
+ if (nr <= offset) {
+ put_page(page);
+ goto out;
+ }
+ }
+ nr = nr - offset;
+
+ /* If users can be writing to this page using arbitrary
+ * virtual addresses, take care about potential aliasing
+ * before reading the page on the kernel side.
+ */
+ if (mapping_writably_mapped(mapping))
+ flush_dcache_page(page);
+
+ /*
+ * When a sequential read accesses a page several times,
+ * only mark it as accessed the first time.
+ */
+ if (prev_index != index || offset != prev_offset)
+ mark_page_accessed(page);
+ prev_index = index;
+
+ /*
+ * Ok, we have the page, and it's up-to-date, so
+ * now we can copy it to user space...
+ */
+
+ ret = copy_page_to_iter(page, offset, nr, iter);
+ offset += ret;
+ index += offset >> PAGE_SHIFT;
+ offset &= ~PAGE_MASK;
+ prev_offset = offset;
+
+ put_page(page);
+ written += ret;
+ if (!iov_iter_count(iter))
+ goto out;
+ if (ret < nr) {
+ error = -EFAULT;
+ goto out;
+ }
+ continue;
+
+page_not_up_to_date:
+ /* Get exclusive access to the page ... */
+ if (iocb->ki_flags & IOCB_WAITQ) {
+ if (written) {
+ put_page(page);
+ goto out;
+ }
+ error = lock_page_async(page, iocb->ki_waitq);
+ } else {
+ error = lock_page_killable(page);
+ }
+ if (unlikely(error))
+ goto readpage_error;
+
+page_not_up_to_date_locked:
+ /* Did it get truncated before we got the lock? */
+ if (!page->mapping) {
+ unlock_page(page);
+ put_page(page);
+ continue;
+ }
+
+ /* Did somebody else fill it already? */
+ if (PageUptodate(page)) {
+ unlock_page(page);
+ goto page_ok;
+ }
+
+readpage:
+ if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) {
+ unlock_page(page);
+ put_page(page);
+ goto would_block;
+ }
+ /*
+ * A previous I/O error may have been due to temporary
+ * failures, eg. multipath errors.
+ * PG_error will be set again if readpage fails.
+ */
+ ClearPageError(page);
+ /* Start the actual read. The read will unlock the page. */
+ error = mapping->a_ops->readpage(filp, page);
+
+ if (unlikely(error)) {
+ if (error == AOP_TRUNCATED_PAGE) {
+ put_page(page);
+ error = 0;
+ goto find_page;
+ }
+ goto readpage_error;
+ }
+
+ if (!PageUptodate(page)) {
+ if (iocb->ki_flags & IOCB_WAITQ) {
+ if (written) {
+ put_page(page);
+ goto out;
+ }
+ error = lock_page_async(page, iocb->ki_waitq);
+ } else {
+ error = lock_page_killable(page);
+ }
+
+ if (unlikely(error))
+ goto readpage_error;
+ if (!PageUptodate(page)) {
+ if (page->mapping == NULL) {
+ /*
+ * invalidate_mapping_pages got it
+ */
+ unlock_page(page);
+ put_page(page);
+ goto find_page;
+ }
+ unlock_page(page);
+ shrink_readahead_size_eio(ra);
+ error = -EIO;
+ goto readpage_error;
+ }
+ unlock_page(page);
+ }
+
+ goto page_ok;
+
+readpage_error:
+ /* UHHUH! A synchronous read error occurred. Report it */
+ put_page(page);
+ goto out;
+
+no_cached_page:
+ /*
+ * Ok, it wasn't cached, so we need to create a new
+ * page..
+ */
+ page = page_cache_alloc(mapping);
+ if (!page) {
+ error = -ENOMEM;
+ goto out;
+ }
+ error = add_to_page_cache_lru(page, mapping, index,
+ mapping_gfp_constraint(mapping, GFP_KERNEL));
+ if (error) {
+ put_page(page);
+ if (error == -EEXIST) {
+ error = 0;
+ goto find_page;
+ }
+ goto out;
+ }
+ goto readpage;
+ }
+
+would_block:
+ error = -EAGAIN;
+out:
+ ra->prev_pos = prev_index;
+ ra->prev_pos <<= PAGE_SHIFT;
+ ra->prev_pos |= prev_offset;
+
+ *ppos = ((loff_t)index << PAGE_SHIFT) + offset;
+ file_accessed(filp);
+ return written ? written : error;
+}
+EXPORT_SYMBOL_GPL(generic_file_buffered_read);
+
+/**
+ * generic_file_read_iter - generic filesystem read routine
+ * @iocb: kernel I/O control block
+ * @iter: destination for the data read
+ *
+ * This is the "read_iter()" routine for all filesystems
+ * that can use the page cache directly.
+ *
+ * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall
+ * be returned when no data can be read without waiting for I/O requests
+ * to complete; it doesn't prevent readahead.
+ *
+ * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O
+ * requests shall be made for the read or for readahead. When no data
+ * can be read, -EAGAIN shall be returned. When readahead would be
+ * triggered, a partial, possibly empty read shall be returned.
+ *
+ * Return:
+ * * number of bytes copied, even for partial reads
+ * * negative error code (or 0 if IOCB_NOIO) if nothing was read
+ */
+ssize_t
+generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ size_t count = iov_iter_count(iter);
+ ssize_t retval = 0;
+
+ if (!count)
+ goto out; /* skip atime */
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ loff_t size;
+
+ size = i_size_read(inode);
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (filemap_range_has_page(mapping, iocb->ki_pos,
+ iocb->ki_pos + count - 1))
+ return -EAGAIN;
+ } else {
+ retval = filemap_write_and_wait_range(mapping,
+ iocb->ki_pos,
+ iocb->ki_pos + count - 1);
+ if (retval < 0)
+ goto out;
+ }
+
+ file_accessed(file);
+
+ retval = mapping->a_ops->direct_IO(iocb, iter);
+ if (retval >= 0) {
+ iocb->ki_pos += retval;
+ count -= retval;
+ }
+ iov_iter_revert(iter, count - iov_iter_count(iter));
+
+ /*
+ * Btrfs can have a short DIO read if we encounter
+ * compressed extents, so if there was an error, or if
+ * we've already read everything we wanted to, or if
+ * there was a short read because we hit EOF, go ahead
+ * and return. Otherwise fallthrough to buffered io for
+ * the rest of the read. Buffered reads will not work for
+ * DAX files, so don't bother trying.
+ */
+ if (retval < 0 || !count || iocb->ki_pos >= size ||
+ IS_DAX(inode))
+ goto out;
+ }
+
+ retval = generic_file_buffered_read(iocb, iter, retval);
+out:
+ return retval;
+}
+EXPORT_SYMBOL(generic_file_read_iter);
+
+#ifdef CONFIG_MMU
+#define MMAP_LOTSAMISS (100)
+/*
+ * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock
+ * @vmf - the vm_fault for this fault.
+ * @page - the page to lock.
+ * @fpin - the pointer to the file we may pin (or is already pinned).
+ *
+ * This works similar to lock_page_or_retry in that it can drop the mmap_lock.
+ * It differs in that it actually returns the page locked if it returns 1 and 0
+ * if it couldn't lock the page. If we did have to drop the mmap_lock then fpin
+ * will point to the pinned file and needs to be fput()'ed at a later point.
+ */
+static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
+ struct file **fpin)
+{
+ if (trylock_page(page))
+ return 1;
+
+ /*
+ * NOTE! This will make us return with VM_FAULT_RETRY, but with
+ * the mmap_lock still held. That's how FAULT_FLAG_RETRY_NOWAIT
+ * is supposed to work. We have way too many special cases..
+ */
+ if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
+ return 0;
+
+ *fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
+ if (vmf->flags & FAULT_FLAG_KILLABLE) {
+ if (__lock_page_killable(page)) {
+ /*
+ * We didn't have the right flags to drop the mmap_lock,
+ * but all fault_handlers only check for fatal signals
+ * if we return VM_FAULT_RETRY, so we need to drop the
+ * mmap_lock here and return 0 if we don't have a fpin.
+ */
+ if (*fpin == NULL)
+ mmap_read_unlock(vmf->vma->vm_mm);
+ return 0;
+ }
+ } else
+ __lock_page(page);
+ return 1;
+}
+
+
+/*
+ * Synchronous readahead happens when we don't even find a page in the page
+ * cache at all. We don't want to perform IO under the mmap sem, so if we have
+ * to drop the mmap sem we return the file that was pinned in order for us to do
+ * that. If we didn't pin a file then we return NULL. The file that is
+ * returned needs to be fput()'ed when we're done with it.
+ */
+static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
+{
+ struct file *file = vmf->vma->vm_file;
+ struct file_ra_state *ra = &file->f_ra;
+ struct address_space *mapping = file->f_mapping;
+ DEFINE_READAHEAD(ractl, file, mapping, vmf->pgoff);
+ struct file *fpin = NULL;
+ unsigned int mmap_miss;
+
+ /* If we don't want any read-ahead, don't bother */
+ if (vmf->vma->vm_flags & VM_RAND_READ)
+ return fpin;
+ if (!ra->ra_pages)
+ return fpin;
+
+ if (vmf->vma->vm_flags & VM_SEQ_READ) {
+ fpin = maybe_unlock_mmap_for_io(vmf, fpin);
+ page_cache_sync_ra(&ractl, ra, ra->ra_pages);
+ return fpin;
+ }
+
+ /* Avoid banging the cache line if not needed */
+ mmap_miss = READ_ONCE(ra->mmap_miss);
+ if (mmap_miss < MMAP_LOTSAMISS * 10)
+ WRITE_ONCE(ra->mmap_miss, ++mmap_miss);
+
+ /*
+ * Do we miss much more than hit in this file? If so,
+ * stop bothering with read-ahead. It will only hurt.
+ */
+ if (mmap_miss > MMAP_LOTSAMISS)
+ return fpin;
+
+ /*
+ * mmap read-around
+ */
+ fpin = maybe_unlock_mmap_for_io(vmf, fpin);
+ ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
+ ra->size = ra->ra_pages;
+ ra->async_size = ra->ra_pages / 4;
+ ractl._index = ra->start;
+ do_page_cache_ra(&ractl, ra->size, ra->async_size);
+ return fpin;
+}
+
+/*
+ * Asynchronous readahead happens when we find the page and PG_readahead,
+ * so we want to possibly extend the readahead further. We return the file that
+ * was pinned if we have to drop the mmap_lock in order to do IO.
+ */
+static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
+ struct page *page)
+{
+ struct file *file = vmf->vma->vm_file;
+ struct file_ra_state *ra = &file->f_ra;
+ struct address_space *mapping = file->f_mapping;
+ struct file *fpin = NULL;
+ unsigned int mmap_miss;
+ pgoff_t offset = vmf->pgoff;
+
+ /* If we don't want any read-ahead, don't bother */
+ if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
+ return fpin;
+ mmap_miss = READ_ONCE(ra->mmap_miss);
+ if (mmap_miss)
+ WRITE_ONCE(ra->mmap_miss, --mmap_miss);
+ if (PageReadahead(page)) {
+ fpin = maybe_unlock_mmap_for_io(vmf, fpin);
+ page_cache_async_readahead(mapping, ra, file,
+ page, offset, ra->ra_pages);
+ }
+ return fpin;
+}
+
+/**
+ * filemap_fault - read in file data for page fault handling
+ * @vmf: struct vm_fault containing details of the fault
+ *
+ * filemap_fault() is invoked via the vma operations vector for a
+ * mapped memory region to read in file data during a page fault.
+ *
+ * The goto's are kind of ugly, but this streamlines the normal case of having
+ * it in the page cache, and handles the special cases reasonably without
+ * having a lot of duplicated code.
+ *
+ * vma->vm_mm->mmap_lock must be held on entry.
+ *
+ * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock
+ * may be dropped before doing I/O or by lock_page_maybe_drop_mmap().
+ *
+ * If our return value does not have VM_FAULT_RETRY set, the mmap_lock
+ * has not been released.
+ *
+ * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
+ *
+ * Return: bitwise-OR of %VM_FAULT_ codes.
+ */
+vm_fault_t filemap_fault(struct vm_fault *vmf)
+{
+ int error;
+ struct file *file = vmf->vma->vm_file;
+ struct file *fpin = NULL;
+ struct address_space *mapping = file->f_mapping;
+ struct file_ra_state *ra = &file->f_ra;
+ struct inode *inode = mapping->host;
+ pgoff_t offset = vmf->pgoff;
+ pgoff_t max_off;
+ struct page *page;
+ vm_fault_t ret = 0;
+
+ max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ if (unlikely(offset >= max_off))
+ return VM_FAULT_SIGBUS;
+
+ /*
+ * Do we have something in the page cache already?
+ */
+ page = find_get_page(mapping, offset);
+ if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
+ /*
+ * We found the page, so try async readahead before
+ * waiting for the lock.
+ */
+ fpin = do_async_mmap_readahead(vmf, page);
+ } else if (!page) {
+ /* No page in the page cache at all */
+ count_vm_event(PGMAJFAULT);
+ count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
+ ret = VM_FAULT_MAJOR;
+ fpin = do_sync_mmap_readahead(vmf);
+retry_find:
+ page = pagecache_get_page(mapping, offset,
+ FGP_CREAT|FGP_FOR_MMAP,
+ vmf->gfp_mask);
+ if (!page) {
+ if (fpin)
+ goto out_retry;
+ return VM_FAULT_OOM;
+ }
+ }
+
+ if (!lock_page_maybe_drop_mmap(vmf, page, &fpin))
+ goto out_retry;
+
+ /* Did it get truncated? */
+ if (unlikely(compound_head(page)->mapping != mapping)) {
+ unlock_page(page);
+ put_page(page);
+ goto retry_find;
+ }
+ VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
+
+ /*
+ * We have a locked page in the page cache, now we need to check
+ * that it's up-to-date. If not, it is going to be due to an error.
+ */
+ if (unlikely(!PageUptodate(page)))
+ goto page_not_uptodate;
+
+ /*
+ * We've made it this far and we had to drop our mmap_lock, now is the
+ * time to return to the upper layer and have it re-find the vma and
+ * redo the fault.
+ */
+ if (fpin) {
+ unlock_page(page);
+ goto out_retry;
+ }
+
+ /*
+ * Found the page and have a reference on it.
+ * We must recheck i_size under page lock.
+ */
+ max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ if (unlikely(offset >= max_off)) {
+ unlock_page(page);
+ put_page(page);
+ return VM_FAULT_SIGBUS;
+ }
+
+ vmf->page = page;
+ return ret | VM_FAULT_LOCKED;
+
+page_not_uptodate:
+ /*
+ * Umm, take care of errors if the page isn't up-to-date.
+ * Try to re-read it _once_. We do this synchronously,
+ * because there really aren't any performance issues here
+ * and we need to check for errors.
+ */
+ ClearPageError(page);
+ fpin = maybe_unlock_mmap_for_io(vmf, fpin);
+ error = mapping->a_ops->readpage(file, page);
+ if (!error) {
+ wait_on_page_locked(page);
+ if (!PageUptodate(page))
+ error = -EIO;
+ }
+ if (fpin)
+ goto out_retry;
+ put_page(page);
+
+ if (!error || error == AOP_TRUNCATED_PAGE)
+ goto retry_find;
+
+ shrink_readahead_size_eio(ra);
+ return VM_FAULT_SIGBUS;
+
+out_retry:
+ /*
+ * We dropped the mmap_lock, we need to return to the fault handler to
+ * re-find the vma and come back and find our hopefully still populated
+ * page.
+ */
+ if (page)
+ put_page(page);
+ if (fpin)
+ fput(fpin);
+ return ret | VM_FAULT_RETRY;
+}
+EXPORT_SYMBOL(filemap_fault);
+
+void filemap_map_pages(struct vm_fault *vmf,
+ pgoff_t start_pgoff, pgoff_t end_pgoff)
+{
+ struct file *file = vmf->vma->vm_file;
+ struct address_space *mapping = file->f_mapping;
+ pgoff_t last_pgoff = start_pgoff;
+ unsigned long max_idx;
+ XA_STATE(xas, &mapping->i_pages, start_pgoff);
+ struct page *head, *page;
+ unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
+
+ rcu_read_lock();
+ xas_for_each(&xas, head, end_pgoff) {
+ if (xas_retry(&xas, head))
+ continue;
+ if (xa_is_value(head))
+ goto next;
+
+ /*
+ * Check for a locked page first, as a speculative
+ * reference may adversely influence page migration.
+ */
+ if (PageLocked(head))
+ goto next;
+ if (!page_cache_get_speculative(head))
+ goto next;
+
+ /* Has the page moved or been split? */
+ if (unlikely(head != xas_reload(&xas)))
+ goto skip;
+ page = find_subpage(head, xas.xa_index);
+
+ if (!PageUptodate(head) ||
+ PageReadahead(page) ||
+ PageHWPoison(page))
+ goto skip;
+ if (!trylock_page(head))
+ goto skip;
+
+ if (head->mapping != mapping || !PageUptodate(head))
+ goto unlock;
+
+ max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
+ if (xas.xa_index >= max_idx)
+ goto unlock;
+
+ if (mmap_miss > 0)
+ mmap_miss--;
+
+ vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
+ if (vmf->pte)
+ vmf->pte += xas.xa_index - last_pgoff;
+ last_pgoff = xas.xa_index;
+ if (alloc_set_pte(vmf, page))
+ goto unlock;
+ unlock_page(head);
+ goto next;
+unlock:
+ unlock_page(head);
+skip:
+ put_page(head);
+next:
+ /* Huge page is mapped? No need to proceed. */
+ if (pmd_trans_huge(*vmf->pmd))
+ break;
+ }
+ rcu_read_unlock();
+ WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss);
+}
+EXPORT_SYMBOL(filemap_map_pages);
+
+vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
+{
+ struct page *page = vmf->page;
+ struct inode *inode = file_inode(vmf->vma->vm_file);
+ vm_fault_t ret = VM_FAULT_LOCKED;
+
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vmf->vma->vm_file);
+ lock_page(page);
+ if (page->mapping != inode->i_mapping) {
+ unlock_page(page);
+ ret = VM_FAULT_NOPAGE;
+ goto out;
+ }
+ /*
+ * We mark the page dirty already here so that when freeze is in
+ * progress, we are guaranteed that writeback during freezing will
+ * see the dirty page and writeprotect it again.
+ */
+ set_page_dirty(page);
+ wait_for_stable_page(page);
+out:
+ sb_end_pagefault(inode->i_sb);
+ return ret;
+}
+
+const struct vm_operations_struct generic_file_vm_ops = {
+ .fault = filemap_fault,
+ .map_pages = filemap_map_pages,
+ .page_mkwrite = filemap_page_mkwrite,
+};
+
+/* This is used for a general mmap of a disk file */
+
+int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
+{
+ struct address_space *mapping = file->f_mapping;
+
+ if (!mapping->a_ops->readpage)
+ return -ENOEXEC;
+ file_accessed(file);
+ vma->vm_ops = &generic_file_vm_ops;
+ return 0;
+}
+
+/*
+ * This is for filesystems which do not implement ->writepage.
+ */
+int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+ return -EINVAL;
+ return generic_file_mmap(file, vma);
+}
+#else
+vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
+{
+ return VM_FAULT_SIGBUS;
+}
+int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
+{
+ return -ENOSYS;
+}
+int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
+{
+ return -ENOSYS;
+}
+#endif /* CONFIG_MMU */
+
+EXPORT_SYMBOL(filemap_page_mkwrite);
+EXPORT_SYMBOL(generic_file_mmap);
+EXPORT_SYMBOL(generic_file_readonly_mmap);
+
+static struct page *wait_on_page_read(struct page *page)
+{
+ if (!IS_ERR(page)) {
+ wait_on_page_locked(page);
+ if (!PageUptodate(page)) {
+ put_page(page);
+ page = ERR_PTR(-EIO);
+ }
+ }
+ return page;
+}
+
+static struct page *do_read_cache_page(struct address_space *mapping,
+ pgoff_t index,
+ int (*filler)(void *, struct page *),
+ void *data,
+ gfp_t gfp)
+{
+ struct page *page;
+ int err;
+repeat:
+ page = find_get_page(mapping, index);
+ if (!page) {
+ page = __page_cache_alloc(gfp);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+ err = add_to_page_cache_lru(page, mapping, index, gfp);
+ if (unlikely(err)) {
+ put_page(page);
+ if (err == -EEXIST)
+ goto repeat;
+ /* Presumably ENOMEM for xarray node */
+ return ERR_PTR(err);
+ }
+
+filler:
+ if (filler)
+ err = filler(data, page);
+ else
+ err = mapping->a_ops->readpage(data, page);
+
+ if (err < 0) {
+ put_page(page);
+ return ERR_PTR(err);
+ }
+
+ page = wait_on_page_read(page);
+ if (IS_ERR(page))
+ return page;
+ goto out;
+ }
+ if (PageUptodate(page))
+ goto out;
+
+ /*
+ * Page is not up to date and may be locked due to one of the following
+ * case a: Page is being filled and the page lock is held
+ * case b: Read/write error clearing the page uptodate status
+ * case c: Truncation in progress (page locked)
+ * case d: Reclaim in progress
+ *
+ * Case a, the page will be up to date when the page is unlocked.
+ * There is no need to serialise on the page lock here as the page
+ * is pinned so the lock gives no additional protection. Even if the
+ * page is truncated, the data is still valid if PageUptodate as
+ * it's a race vs truncate race.
+ * Case b, the page will not be up to date
+ * Case c, the page may be truncated but in itself, the data may still
+ * be valid after IO completes as it's a read vs truncate race. The
+ * operation must restart if the page is not uptodate on unlock but
+ * otherwise serialising on page lock to stabilise the mapping gives
+ * no additional guarantees to the caller as the page lock is
+ * released before return.
+ * Case d, similar to truncation. If reclaim holds the page lock, it
+ * will be a race with remove_mapping that determines if the mapping
+ * is valid on unlock but otherwise the data is valid and there is
+ * no need to serialise with page lock.
+ *
+ * As the page lock gives no additional guarantee, we optimistically
+ * wait on the page to be unlocked and check if it's up to date and
+ * use the page if it is. Otherwise, the page lock is required to
+ * distinguish between the different cases. The motivation is that we
+ * avoid spurious serialisations and wakeups when multiple processes
+ * wait on the same page for IO to complete.
+ */
+ wait_on_page_locked(page);
+ if (PageUptodate(page))
+ goto out;
+
+ /* Distinguish between all the cases under the safety of the lock */
+ lock_page(page);
+
+ /* Case c or d, restart the operation */
+ if (!page->mapping) {
+ unlock_page(page);
+ put_page(page);
+ goto repeat;
+ }
+
+ /* Someone else locked and filled the page in a very small window */
+ if (PageUptodate(page)) {
+ unlock_page(page);
+ goto out;
+ }
+
+ /*
+ * A previous I/O error may have been due to temporary
+ * failures.
+ * Clear page error before actual read, PG_error will be
+ * set again if read page fails.
+ */
+ ClearPageError(page);
+ goto filler;
+
+out:
+ mark_page_accessed(page);
+ return page;
+}
+
+/**
+ * read_cache_page - read into page cache, fill it if needed
+ * @mapping: the page's address_space
+ * @index: the page index
+ * @filler: function to perform the read
+ * @data: first arg to filler(data, page) function, often left as NULL
+ *
+ * Read into the page cache. If a page already exists, and PageUptodate() is
+ * not set, try to fill the page and wait for it to become unlocked.
+ *
+ * If the page does not get brought uptodate, return -EIO.
+ *
+ * Return: up to date page on success, ERR_PTR() on failure.
+ */
+struct page *read_cache_page(struct address_space *mapping,
+ pgoff_t index,
+ int (*filler)(void *, struct page *),
+ void *data)
+{
+ return do_read_cache_page(mapping, index, filler, data,
+ mapping_gfp_mask(mapping));
+}
+EXPORT_SYMBOL(read_cache_page);
+
+/**
+ * read_cache_page_gfp - read into page cache, using specified page allocation flags.
+ * @mapping: the page's address_space
+ * @index: the page index
+ * @gfp: the page allocator flags to use if allocating
+ *
+ * This is the same as "read_mapping_page(mapping, index, NULL)", but with
+ * any new page allocations done using the specified allocation flags.
+ *
+ * If the page does not get brought uptodate, return -EIO.
+ *
+ * Return: up to date page on success, ERR_PTR() on failure.
+ */
+struct page *read_cache_page_gfp(struct address_space *mapping,
+ pgoff_t index,
+ gfp_t gfp)
+{
+ return do_read_cache_page(mapping, index, NULL, NULL, gfp);
+}
+EXPORT_SYMBOL(read_cache_page_gfp);
+
+int pagecache_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ const struct address_space_operations *aops = mapping->a_ops;
+
+ return aops->write_begin(file, mapping, pos, len, flags,
+ pagep, fsdata);
+}
+EXPORT_SYMBOL(pagecache_write_begin);
+
+int pagecache_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ const struct address_space_operations *aops = mapping->a_ops;
+
+ return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
+}
+EXPORT_SYMBOL(pagecache_write_end);
+
+/*
+ * Warn about a page cache invalidation failure during a direct I/O write.
+ */
+void dio_warn_stale_pagecache(struct file *filp)
+{
+ static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
+ char pathname[128];
+ struct inode *inode = file_inode(filp);
+ char *path;
+
+ errseq_set(&inode->i_mapping->wb_err, -EIO);
+ if (__ratelimit(&_rs)) {
+ path = file_path(filp, pathname, sizeof(pathname));
+ if (IS_ERR(path))
+ path = "(unknown)";
+ pr_crit("Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O!\n");
+ pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid,
+ current->comm);
+ }
+}
+
+ssize_t
+generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ loff_t pos = iocb->ki_pos;
+ ssize_t written;
+ size_t write_len;
+ pgoff_t end;
+
+ write_len = iov_iter_count(from);
+ end = (pos + write_len - 1) >> PAGE_SHIFT;
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ /* If there are pages to writeback, return */
+ if (filemap_range_has_page(inode->i_mapping, pos,
+ pos + write_len - 1))
+ return -EAGAIN;
+ } else {
+ written = filemap_write_and_wait_range(mapping, pos,
+ pos + write_len - 1);
+ if (written)
+ goto out;
+ }
+
+ /*
+ * After a write we want buffered reads to be sure to go to disk to get
+ * the new data. We invalidate clean cached page from the region we're
+ * about to write. We do this *before* the write so that we can return
+ * without clobbering -EIOCBQUEUED from ->direct_IO().
+ */
+ written = invalidate_inode_pages2_range(mapping,
+ pos >> PAGE_SHIFT, end);
+ /*
+ * If a page can not be invalidated, return 0 to fall back
+ * to buffered write.
+ */
+ if (written) {
+ if (written == -EBUSY)
+ return 0;
+ goto out;
+ }
+
+ written = mapping->a_ops->direct_IO(iocb, from);
+
+ /*
+ * Finally, try again to invalidate clean pages which might have been
+ * cached by non-direct readahead, or faulted in by get_user_pages()
+ * if the source of the write was an mmap'ed region of the file
+ * we're writing. Either one is a pretty crazy thing to do,
+ * so we don't support it 100%. If this invalidation
+ * fails, tough, the write still worked...
+ *
+ * Most of the time we do not need this since dio_complete() will do
+ * the invalidation for us. However there are some file systems that
+ * do not end up with dio_complete() being called, so let's not break
+ * them by removing it completely.
+ *
+ * Noticeable example is a blkdev_direct_IO().
+ *
+ * Skip invalidation for async writes or if mapping has no pages.
+ */
+ if (written > 0 && mapping->nrpages &&
+ invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end))
+ dio_warn_stale_pagecache(file);
+
+ if (written > 0) {
+ pos += written;
+ write_len -= written;
+ if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
+ i_size_write(inode, pos);
+ mark_inode_dirty(inode);
+ }
+ iocb->ki_pos = pos;
+ }
+ iov_iter_revert(from, write_len - iov_iter_count(from));
+out:
+ return written;
+}
+EXPORT_SYMBOL(generic_file_direct_write);
+
+/*
+ * Find or create a page at the given pagecache position. Return the locked
+ * page. This function is specifically for buffered writes.
+ */
+struct page *grab_cache_page_write_begin(struct address_space *mapping,
+ pgoff_t index, unsigned flags)
+{
+ struct page *page;
+ int fgp_flags = FGP_LOCK|FGP_WRITE|FGP_CREAT;
+
+ if (flags & AOP_FLAG_NOFS)
+ fgp_flags |= FGP_NOFS;
+
+ page = pagecache_get_page(mapping, index, fgp_flags,
+ mapping_gfp_mask(mapping));
+ if (page)
+ wait_for_stable_page(page);
+
+ return page;
+}
+EXPORT_SYMBOL(grab_cache_page_write_begin);
+
+ssize_t generic_perform_write(struct file *file,
+ struct iov_iter *i, loff_t pos)
+{
+ struct address_space *mapping = file->f_mapping;
+ const struct address_space_operations *a_ops = mapping->a_ops;
+ long status = 0;
+ ssize_t written = 0;
+ unsigned int flags = 0;
+
+ do {
+ struct page *page;
+ unsigned long offset; /* Offset into pagecache page */
+ unsigned long bytes; /* Bytes to write to page */
+ size_t copied; /* Bytes copied from user */
+ void *fsdata = NULL;
+
+ offset = (pos & (PAGE_SIZE - 1));
+ bytes = min_t(unsigned long, PAGE_SIZE - offset,
+ iov_iter_count(i));
+
+again:
+ /*
+ * Bring in the user page that we will copy from _first_.
+ * Otherwise there's a nasty deadlock on copying from the
+ * same page as we're writing to, without it being marked
+ * up-to-date.
+ *
+ * Not only is this an optimisation, but it is also required
+ * to check that the address is actually valid, when atomic
+ * usercopies are used, below.
+ */
+ if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+ status = -EFAULT;
+ break;
+ }
+
+ if (fatal_signal_pending(current)) {
+ status = -EINTR;
+ break;
+ }
+
+ status = a_ops->write_begin(file, mapping, pos, bytes, flags,
+ &page, &fsdata);
+ if (unlikely(status < 0))
+ break;
+
+ if (mapping_writably_mapped(mapping))
+ flush_dcache_page(page);
+
+ copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+ flush_dcache_page(page);
+
+ status = a_ops->write_end(file, mapping, pos, bytes, copied,
+ page, fsdata);
+ if (unlikely(status < 0))
+ break;
+ copied = status;
+
+ cond_resched();
+
+ iov_iter_advance(i, copied);
+ if (unlikely(copied == 0)) {
+ /*
+ * If we were unable to copy any data at all, we must
+ * fall back to a single segment length write.
+ *
+ * If we didn't fallback here, we could livelock
+ * because not all segments in the iov can be copied at
+ * once without a pagefault.
+ */
+ bytes = min_t(unsigned long, PAGE_SIZE - offset,
+ iov_iter_single_seg_count(i));
+ goto again;
+ }
+ pos += copied;
+ written += copied;
+
+ balance_dirty_pages_ratelimited(mapping);
+ } while (iov_iter_count(i));
+
+ return written ? written : status;
+}
+EXPORT_SYMBOL(generic_perform_write);
+
+/**
+ * __generic_file_write_iter - write data to a file
+ * @iocb: IO state structure (file, offset, etc.)
+ * @from: iov_iter with data to write
+ *
+ * This function does all the work needed for actually writing data to a
+ * file. It does all basic checks, removes SUID from the file, updates
+ * modification times and calls proper subroutines depending on whether we
+ * do direct IO or a standard buffered write.
+ *
+ * It expects i_mutex to be grabbed unless we work on a block device or similar
+ * object which does not need locking at all.
+ *
+ * This function does *not* take care of syncing data in case of O_SYNC write.
+ * A caller has to handle it. This is mainly due to the fact that we want to
+ * avoid syncing under i_mutex.
+ *
+ * Return:
+ * * number of bytes written, even for truncated writes
+ * * negative error code if no data has been written at all
+ */
+ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space * mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ ssize_t written = 0;
+ ssize_t err;
+ ssize_t status;
+
+ /* We can write back this queue in page reclaim */
+ current->backing_dev_info = inode_to_bdi(inode);
+ err = file_remove_privs(file);
+ if (err)
+ goto out;
+
+ err = file_update_time(file);
+ if (err)
+ goto out;
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ loff_t pos, endbyte;
+
+ written = generic_file_direct_write(iocb, from);
+ /*
+ * If the write stopped short of completing, fall back to
+ * buffered writes. Some filesystems do this for writes to
+ * holes, for example. For DAX files, a buffered write will
+ * not succeed (even if it did, DAX does not handle dirty
+ * page-cache pages correctly).
+ */
+ if (written < 0 || !iov_iter_count(from) || IS_DAX(inode))
+ goto out;
+
+ status = generic_perform_write(file, from, pos = iocb->ki_pos);
+ /*
+ * If generic_perform_write() returned a synchronous error
+ * then we want to return the number of bytes which were
+ * direct-written, or the error code if that was zero. Note
+ * that this differs from normal direct-io semantics, which
+ * will return -EFOO even if some bytes were written.
+ */
+ if (unlikely(status < 0)) {
+ err = status;
+ goto out;
+ }
+ /*
+ * We need to ensure that the page cache pages are written to
+ * disk and invalidated to preserve the expected O_DIRECT
+ * semantics.
+ */
+ endbyte = pos + status - 1;
+ err = filemap_write_and_wait_range(mapping, pos, endbyte);
+ if (err == 0) {
+ iocb->ki_pos = endbyte + 1;
+ written += status;
+ invalidate_mapping_pages(mapping,
+ pos >> PAGE_SHIFT,
+ endbyte >> PAGE_SHIFT);
+ } else {
+ /*
+ * We don't know how much we wrote, so just return
+ * the number of bytes which were direct-written
+ */
+ }
+ } else {
+ written = generic_perform_write(file, from, iocb->ki_pos);
+ if (likely(written > 0))
+ iocb->ki_pos += written;
+ }
+out:
+ current->backing_dev_info = NULL;
+ return written ? written : err;
+}
+EXPORT_SYMBOL(__generic_file_write_iter);
+
+/**
+ * generic_file_write_iter - write data to a file
+ * @iocb: IO state structure
+ * @from: iov_iter with data to write
+ *
+ * This is a wrapper around __generic_file_write_iter() to be used by most
+ * filesystems. It takes care of syncing the file in case of O_SYNC file
+ * and acquires i_mutex as needed.
+ * Return:
+ * * negative error code if no data has been written at all of
+ * vfs_fsync_range() failed for a synchronous write
+ * * number of bytes written, even for truncated writes
+ */
+ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ ssize_t ret;
+
+ inode_lock(inode);
+ ret = generic_write_checks(iocb, from);
+ if (ret > 0)
+ ret = __generic_file_write_iter(iocb, from);
+ inode_unlock(inode);
+
+ if (ret > 0)
+ ret = generic_write_sync(iocb, ret);
+ return ret;
+}
+EXPORT_SYMBOL(generic_file_write_iter);
+
+/**
+ * try_to_release_page() - release old fs-specific metadata on a page
+ *
+ * @page: the page which the kernel is trying to free
+ * @gfp_mask: memory allocation flags (and I/O mode)
+ *
+ * The address_space is to try to release any data against the page
+ * (presumably at page->private).
+ *
+ * This may also be called if PG_fscache is set on a page, indicating that the
+ * page is known to the local caching routines.
+ *
+ * The @gfp_mask argument specifies whether I/O may be performed to release
+ * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS).
+ *
+ * Return: %1 if the release was successful, otherwise return zero.
+ */
+int try_to_release_page(struct page *page, gfp_t gfp_mask)
+{
+ struct address_space * const mapping = page->mapping;
+
+ BUG_ON(!PageLocked(page));
+ if (PageWriteback(page))
+ return 0;
+
+ if (mapping && mapping->a_ops->releasepage)
+ return mapping->a_ops->releasepage(page, gfp_mask);
+ return try_to_free_buffers(page);
+}
+
+EXPORT_SYMBOL(try_to_release_page);
diff --git a/mm/frame_vector.c b/mm/frame_vector.c
new file mode 100644
index 000000000..1cd81d38a
--- /dev/null
+++ b/mm/frame_vector.c
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/sched.h>
+
+/**
+ * get_vaddr_frames() - map virtual addresses to pfns
+ * @start: starting user address
+ * @nr_frames: number of pages / pfns from start to map
+ * @gup_flags: flags modifying lookup behaviour
+ * @vec: structure which receives pages / pfns of the addresses mapped.
+ * It should have space for at least nr_frames entries.
+ *
+ * This function maps virtual addresses from @start and fills @vec structure
+ * with page frame numbers or page pointers to corresponding pages (choice
+ * depends on the type of the vma underlying the virtual address). If @start
+ * belongs to a normal vma, the function grabs reference to each of the pages
+ * to pin them in memory. If @start belongs to VM_IO | VM_PFNMAP vma, we don't
+ * touch page structures and the caller must make sure pfns aren't reused for
+ * anything else while he is using them.
+ *
+ * The function returns number of pages mapped which may be less than
+ * @nr_frames. In particular we stop mapping if there are more vmas of
+ * different type underlying the specified range of virtual addresses.
+ * When the function isn't able to map a single page, it returns error.
+ *
+ * Note that get_vaddr_frames() cannot follow VM_IO mappings. It used
+ * to be able to do that, but that could (racily) return non-refcounted
+ * pfns.
+ *
+ * This function takes care of grabbing mmap_lock as necessary.
+ */
+int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
+ unsigned int gup_flags, struct frame_vector *vec)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ int ret = 0;
+ int locked;
+
+ if (nr_frames == 0)
+ return 0;
+
+ if (WARN_ON_ONCE(nr_frames > vec->nr_allocated))
+ nr_frames = vec->nr_allocated;
+
+ start = untagged_addr(start);
+
+ mmap_read_lock(mm);
+ locked = 1;
+ vma = find_vma_intersection(mm, start, start + 1);
+ if (!vma) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ /*
+ * While get_vaddr_frames() could be used for transient (kernel
+ * controlled lifetime) pinning of memory pages all current
+ * users establish long term (userspace controlled lifetime)
+ * page pinning. Treat get_vaddr_frames() like
+ * get_user_pages_longterm() and disallow it for filesystem-dax
+ * mappings.
+ */
+ if (vma_is_fsdax(vma)) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) {
+ vec->got_ref = true;
+ vec->is_pfns = false;
+ ret = pin_user_pages_locked(start, nr_frames,
+ gup_flags, (struct page **)(vec->ptrs), &locked);
+ if (likely(ret > 0))
+ goto out;
+ }
+
+ vec->nr_frames = 0;
+
+out:
+ if (locked)
+ mmap_read_unlock(mm);
+ if (!ret)
+ ret = -EFAULT;
+ if (ret > 0)
+ vec->nr_frames = ret;
+ return ret;
+}
+EXPORT_SYMBOL(get_vaddr_frames);
+
+/**
+ * put_vaddr_frames() - drop references to pages if get_vaddr_frames() acquired
+ * them
+ * @vec: frame vector to put
+ *
+ * Drop references to pages if get_vaddr_frames() acquired them. We also
+ * invalidate the frame vector so that it is prepared for the next call into
+ * get_vaddr_frames().
+ */
+void put_vaddr_frames(struct frame_vector *vec)
+{
+ struct page **pages;
+
+ if (!vec->got_ref)
+ goto out;
+ pages = frame_vector_pages(vec);
+ /*
+ * frame_vector_pages() might needed to do a conversion when
+ * get_vaddr_frames() got pages but vec was later converted to pfns.
+ * But it shouldn't really fail to convert pfns back...
+ */
+ if (WARN_ON(IS_ERR(pages)))
+ goto out;
+
+ unpin_user_pages(pages, vec->nr_frames);
+ vec->got_ref = false;
+out:
+ vec->nr_frames = 0;
+}
+EXPORT_SYMBOL(put_vaddr_frames);
+
+/**
+ * frame_vector_to_pages - convert frame vector to contain page pointers
+ * @vec: frame vector to convert
+ *
+ * Convert @vec to contain array of page pointers. If the conversion is
+ * successful, return 0. Otherwise return an error. Note that we do not grab
+ * page references for the page structures.
+ */
+int frame_vector_to_pages(struct frame_vector *vec)
+{
+ int i;
+ unsigned long *nums;
+ struct page **pages;
+
+ if (!vec->is_pfns)
+ return 0;
+ nums = frame_vector_pfns(vec);
+ for (i = 0; i < vec->nr_frames; i++)
+ if (!pfn_valid(nums[i]))
+ return -EINVAL;
+ pages = (struct page **)nums;
+ for (i = 0; i < vec->nr_frames; i++)
+ pages[i] = pfn_to_page(nums[i]);
+ vec->is_pfns = false;
+ return 0;
+}
+EXPORT_SYMBOL(frame_vector_to_pages);
+
+/**
+ * frame_vector_to_pfns - convert frame vector to contain pfns
+ * @vec: frame vector to convert
+ *
+ * Convert @vec to contain array of pfns.
+ */
+void frame_vector_to_pfns(struct frame_vector *vec)
+{
+ int i;
+ unsigned long *nums;
+ struct page **pages;
+
+ if (vec->is_pfns)
+ return;
+ pages = (struct page **)(vec->ptrs);
+ nums = (unsigned long *)pages;
+ for (i = 0; i < vec->nr_frames; i++)
+ nums[i] = page_to_pfn(pages[i]);
+ vec->is_pfns = true;
+}
+EXPORT_SYMBOL(frame_vector_to_pfns);
+
+/**
+ * frame_vector_create() - allocate & initialize structure for pinned pfns
+ * @nr_frames: number of pfns slots we should reserve
+ *
+ * Allocate and initialize struct pinned_pfns to be able to hold @nr_pfns
+ * pfns.
+ */
+struct frame_vector *frame_vector_create(unsigned int nr_frames)
+{
+ struct frame_vector *vec;
+ int size = sizeof(struct frame_vector) + sizeof(void *) * nr_frames;
+
+ if (WARN_ON_ONCE(nr_frames == 0))
+ return NULL;
+ /*
+ * This is absurdly high. It's here just to avoid strange effects when
+ * arithmetics overflows.
+ */
+ if (WARN_ON_ONCE(nr_frames > INT_MAX / sizeof(void *) / 2))
+ return NULL;
+ /*
+ * Avoid higher order allocations, use vmalloc instead. It should
+ * be rare anyway.
+ */
+ vec = kvmalloc(size, GFP_KERNEL);
+ if (!vec)
+ return NULL;
+ vec->nr_allocated = nr_frames;
+ vec->nr_frames = 0;
+ return vec;
+}
+EXPORT_SYMBOL(frame_vector_create);
+
+/**
+ * frame_vector_destroy() - free memory allocated to carry frame vector
+ * @vec: Frame vector to free
+ *
+ * Free structure allocated by frame_vector_create() to carry frames.
+ */
+void frame_vector_destroy(struct frame_vector *vec)
+{
+ /* Make sure put_vaddr_frames() got called properly... */
+ VM_BUG_ON(vec->nr_frames > 0);
+ kvfree(vec);
+}
+EXPORT_SYMBOL(frame_vector_destroy);
diff --git a/mm/frontswap.c b/mm/frontswap.c
new file mode 100644
index 000000000..2183a56c7
--- /dev/null
+++ b/mm/frontswap.c
@@ -0,0 +1,497 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Frontswap frontend
+ *
+ * This code provides the generic "frontend" layer to call a matching
+ * "backend" driver implementation of frontswap. See
+ * Documentation/vm/frontswap.rst for more information.
+ *
+ * Copyright (C) 2009-2012 Oracle Corp. All rights reserved.
+ * Author: Dan Magenheimer
+ */
+
+#include <linux/mman.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/security.h>
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/frontswap.h>
+#include <linux/swapfile.h>
+
+DEFINE_STATIC_KEY_FALSE(frontswap_enabled_key);
+
+/*
+ * frontswap_ops are added by frontswap_register_ops, and provide the
+ * frontswap "backend" implementation functions. Multiple implementations
+ * may be registered, but implementations can never deregister. This
+ * is a simple singly-linked list of all registered implementations.
+ */
+static struct frontswap_ops *frontswap_ops __read_mostly;
+
+#define for_each_frontswap_ops(ops) \
+ for ((ops) = frontswap_ops; (ops); (ops) = (ops)->next)
+
+/*
+ * If enabled, frontswap_store will return failure even on success. As
+ * a result, the swap subsystem will always write the page to swap, in
+ * effect converting frontswap into a writethrough cache. In this mode,
+ * there is no direct reduction in swap writes, but a frontswap backend
+ * can unilaterally "reclaim" any pages in use with no data loss, thus
+ * providing increases control over maximum memory usage due to frontswap.
+ */
+static bool frontswap_writethrough_enabled __read_mostly;
+
+/*
+ * If enabled, the underlying tmem implementation is capable of doing
+ * exclusive gets, so frontswap_load, on a successful tmem_get must
+ * mark the page as no longer in frontswap AND mark it dirty.
+ */
+static bool frontswap_tmem_exclusive_gets_enabled __read_mostly;
+
+#ifdef CONFIG_DEBUG_FS
+/*
+ * Counters available via /sys/kernel/debug/frontswap (if debugfs is
+ * properly configured). These are for information only so are not protected
+ * against increment races.
+ */
+static u64 frontswap_loads;
+static u64 frontswap_succ_stores;
+static u64 frontswap_failed_stores;
+static u64 frontswap_invalidates;
+
+static inline void inc_frontswap_loads(void) {
+ data_race(frontswap_loads++);
+}
+static inline void inc_frontswap_succ_stores(void) {
+ data_race(frontswap_succ_stores++);
+}
+static inline void inc_frontswap_failed_stores(void) {
+ data_race(frontswap_failed_stores++);
+}
+static inline void inc_frontswap_invalidates(void) {
+ data_race(frontswap_invalidates++);
+}
+#else
+static inline void inc_frontswap_loads(void) { }
+static inline void inc_frontswap_succ_stores(void) { }
+static inline void inc_frontswap_failed_stores(void) { }
+static inline void inc_frontswap_invalidates(void) { }
+#endif
+
+/*
+ * Due to the asynchronous nature of the backends loading potentially
+ * _after_ the swap system has been activated, we have chokepoints
+ * on all frontswap functions to not call the backend until the backend
+ * has registered.
+ *
+ * This would not guards us against the user deciding to call swapoff right as
+ * we are calling the backend to initialize (so swapon is in action).
+ * Fortunately for us, the swapon_mutex has been taken by the callee so we are
+ * OK. The other scenario where calls to frontswap_store (called via
+ * swap_writepage) is racing with frontswap_invalidate_area (called via
+ * swapoff) is again guarded by the swap subsystem.
+ *
+ * While no backend is registered all calls to frontswap_[store|load|
+ * invalidate_area|invalidate_page] are ignored or fail.
+ *
+ * The time between the backend being registered and the swap file system
+ * calling the backend (via the frontswap_* functions) is indeterminate as
+ * frontswap_ops is not atomic_t (or a value guarded by a spinlock).
+ * That is OK as we are comfortable missing some of these calls to the newly
+ * registered backend.
+ *
+ * Obviously the opposite (unloading the backend) must be done after all
+ * the frontswap_[store|load|invalidate_area|invalidate_page] start
+ * ignoring or failing the requests. However, there is currently no way
+ * to unload a backend once it is registered.
+ */
+
+/*
+ * Register operations for frontswap
+ */
+void frontswap_register_ops(struct frontswap_ops *ops)
+{
+ DECLARE_BITMAP(a, MAX_SWAPFILES);
+ DECLARE_BITMAP(b, MAX_SWAPFILES);
+ struct swap_info_struct *si;
+ unsigned int i;
+
+ bitmap_zero(a, MAX_SWAPFILES);
+ bitmap_zero(b, MAX_SWAPFILES);
+
+ spin_lock(&swap_lock);
+ plist_for_each_entry(si, &swap_active_head, list) {
+ if (!WARN_ON(!si->frontswap_map))
+ set_bit(si->type, a);
+ }
+ spin_unlock(&swap_lock);
+
+ /* the new ops needs to know the currently active swap devices */
+ for_each_set_bit(i, a, MAX_SWAPFILES)
+ ops->init(i);
+
+ /*
+ * Setting frontswap_ops must happen after the ops->init() calls
+ * above; cmpxchg implies smp_mb() which will ensure the init is
+ * complete at this point.
+ */
+ do {
+ ops->next = frontswap_ops;
+ } while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next);
+
+ static_branch_inc(&frontswap_enabled_key);
+
+ spin_lock(&swap_lock);
+ plist_for_each_entry(si, &swap_active_head, list) {
+ if (si->frontswap_map)
+ set_bit(si->type, b);
+ }
+ spin_unlock(&swap_lock);
+
+ /*
+ * On the very unlikely chance that a swap device was added or
+ * removed between setting the "a" list bits and the ops init
+ * calls, we re-check and do init or invalidate for any changed
+ * bits.
+ */
+ if (unlikely(!bitmap_equal(a, b, MAX_SWAPFILES))) {
+ for (i = 0; i < MAX_SWAPFILES; i++) {
+ if (!test_bit(i, a) && test_bit(i, b))
+ ops->init(i);
+ else if (test_bit(i, a) && !test_bit(i, b))
+ ops->invalidate_area(i);
+ }
+ }
+}
+EXPORT_SYMBOL(frontswap_register_ops);
+
+/*
+ * Enable/disable frontswap writethrough (see above).
+ */
+void frontswap_writethrough(bool enable)
+{
+ frontswap_writethrough_enabled = enable;
+}
+EXPORT_SYMBOL(frontswap_writethrough);
+
+/*
+ * Enable/disable frontswap exclusive gets (see above).
+ */
+void frontswap_tmem_exclusive_gets(bool enable)
+{
+ frontswap_tmem_exclusive_gets_enabled = enable;
+}
+EXPORT_SYMBOL(frontswap_tmem_exclusive_gets);
+
+/*
+ * Called when a swap device is swapon'd.
+ */
+void __frontswap_init(unsigned type, unsigned long *map)
+{
+ struct swap_info_struct *sis = swap_info[type];
+ struct frontswap_ops *ops;
+
+ VM_BUG_ON(sis == NULL);
+
+ /*
+ * p->frontswap is a bitmap that we MUST have to figure out which page
+ * has gone in frontswap. Without it there is no point of continuing.
+ */
+ if (WARN_ON(!map))
+ return;
+ /*
+ * Irregardless of whether the frontswap backend has been loaded
+ * before this function or it will be later, we _MUST_ have the
+ * p->frontswap set to something valid to work properly.
+ */
+ frontswap_map_set(sis, map);
+
+ for_each_frontswap_ops(ops)
+ ops->init(type);
+}
+EXPORT_SYMBOL(__frontswap_init);
+
+bool __frontswap_test(struct swap_info_struct *sis,
+ pgoff_t offset)
+{
+ if (sis->frontswap_map)
+ return test_bit(offset, sis->frontswap_map);
+ return false;
+}
+EXPORT_SYMBOL(__frontswap_test);
+
+static inline void __frontswap_set(struct swap_info_struct *sis,
+ pgoff_t offset)
+{
+ set_bit(offset, sis->frontswap_map);
+ atomic_inc(&sis->frontswap_pages);
+}
+
+static inline void __frontswap_clear(struct swap_info_struct *sis,
+ pgoff_t offset)
+{
+ clear_bit(offset, sis->frontswap_map);
+ atomic_dec(&sis->frontswap_pages);
+}
+
+/*
+ * "Store" data from a page to frontswap and associate it with the page's
+ * swaptype and offset. Page must be locked and in the swap cache.
+ * If frontswap already contains a page with matching swaptype and
+ * offset, the frontswap implementation may either overwrite the data and
+ * return success or invalidate the page from frontswap and return failure.
+ */
+int __frontswap_store(struct page *page)
+{
+ int ret = -1;
+ swp_entry_t entry = { .val = page_private(page), };
+ int type = swp_type(entry);
+ struct swap_info_struct *sis = swap_info[type];
+ pgoff_t offset = swp_offset(entry);
+ struct frontswap_ops *ops;
+
+ VM_BUG_ON(!frontswap_ops);
+ VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON(sis == NULL);
+
+ /*
+ * If a dup, we must remove the old page first; we can't leave the
+ * old page no matter if the store of the new page succeeds or fails,
+ * and we can't rely on the new page replacing the old page as we may
+ * not store to the same implementation that contains the old page.
+ */
+ if (__frontswap_test(sis, offset)) {
+ __frontswap_clear(sis, offset);
+ for_each_frontswap_ops(ops)
+ ops->invalidate_page(type, offset);
+ }
+
+ /* Try to store in each implementation, until one succeeds. */
+ for_each_frontswap_ops(ops) {
+ ret = ops->store(type, offset, page);
+ if (!ret) /* successful store */
+ break;
+ }
+ if (ret == 0) {
+ __frontswap_set(sis, offset);
+ inc_frontswap_succ_stores();
+ } else {
+ inc_frontswap_failed_stores();
+ }
+ if (frontswap_writethrough_enabled)
+ /* report failure so swap also writes to swap device */
+ ret = -1;
+ return ret;
+}
+EXPORT_SYMBOL(__frontswap_store);
+
+/*
+ * "Get" data from frontswap associated with swaptype and offset that were
+ * specified when the data was put to frontswap and use it to fill the
+ * specified page with data. Page must be locked and in the swap cache.
+ */
+int __frontswap_load(struct page *page)
+{
+ int ret = -1;
+ swp_entry_t entry = { .val = page_private(page), };
+ int type = swp_type(entry);
+ struct swap_info_struct *sis = swap_info[type];
+ pgoff_t offset = swp_offset(entry);
+ struct frontswap_ops *ops;
+
+ VM_BUG_ON(!frontswap_ops);
+ VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON(sis == NULL);
+
+ if (!__frontswap_test(sis, offset))
+ return -1;
+
+ /* Try loading from each implementation, until one succeeds. */
+ for_each_frontswap_ops(ops) {
+ ret = ops->load(type, offset, page);
+ if (!ret) /* successful load */
+ break;
+ }
+ if (ret == 0) {
+ inc_frontswap_loads();
+ if (frontswap_tmem_exclusive_gets_enabled) {
+ SetPageDirty(page);
+ __frontswap_clear(sis, offset);
+ }
+ }
+ return ret;
+}
+EXPORT_SYMBOL(__frontswap_load);
+
+/*
+ * Invalidate any data from frontswap associated with the specified swaptype
+ * and offset so that a subsequent "get" will fail.
+ */
+void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
+{
+ struct swap_info_struct *sis = swap_info[type];
+ struct frontswap_ops *ops;
+
+ VM_BUG_ON(!frontswap_ops);
+ VM_BUG_ON(sis == NULL);
+
+ if (!__frontswap_test(sis, offset))
+ return;
+
+ for_each_frontswap_ops(ops)
+ ops->invalidate_page(type, offset);
+ __frontswap_clear(sis, offset);
+ inc_frontswap_invalidates();
+}
+EXPORT_SYMBOL(__frontswap_invalidate_page);
+
+/*
+ * Invalidate all data from frontswap associated with all offsets for the
+ * specified swaptype.
+ */
+void __frontswap_invalidate_area(unsigned type)
+{
+ struct swap_info_struct *sis = swap_info[type];
+ struct frontswap_ops *ops;
+
+ VM_BUG_ON(!frontswap_ops);
+ VM_BUG_ON(sis == NULL);
+
+ if (sis->frontswap_map == NULL)
+ return;
+
+ for_each_frontswap_ops(ops)
+ ops->invalidate_area(type);
+ atomic_set(&sis->frontswap_pages, 0);
+ bitmap_zero(sis->frontswap_map, sis->max);
+}
+EXPORT_SYMBOL(__frontswap_invalidate_area);
+
+static unsigned long __frontswap_curr_pages(void)
+{
+ unsigned long totalpages = 0;
+ struct swap_info_struct *si = NULL;
+
+ assert_spin_locked(&swap_lock);
+ plist_for_each_entry(si, &swap_active_head, list)
+ totalpages += atomic_read(&si->frontswap_pages);
+ return totalpages;
+}
+
+static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
+ int *swapid)
+{
+ int ret = -EINVAL;
+ struct swap_info_struct *si = NULL;
+ int si_frontswap_pages;
+ unsigned long total_pages_to_unuse = total;
+ unsigned long pages = 0, pages_to_unuse = 0;
+
+ assert_spin_locked(&swap_lock);
+ plist_for_each_entry(si, &swap_active_head, list) {
+ si_frontswap_pages = atomic_read(&si->frontswap_pages);
+ if (total_pages_to_unuse < si_frontswap_pages) {
+ pages = pages_to_unuse = total_pages_to_unuse;
+ } else {
+ pages = si_frontswap_pages;
+ pages_to_unuse = 0; /* unuse all */
+ }
+ /* ensure there is enough RAM to fetch pages from frontswap */
+ if (security_vm_enough_memory_mm(current->mm, pages)) {
+ ret = -ENOMEM;
+ continue;
+ }
+ vm_unacct_memory(pages);
+ *unused = pages_to_unuse;
+ *swapid = si->type;
+ ret = 0;
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * Used to check if it's necessary and feasible to unuse pages.
+ * Return 1 when nothing to do, 0 when need to shrink pages,
+ * error code when there is an error.
+ */
+static int __frontswap_shrink(unsigned long target_pages,
+ unsigned long *pages_to_unuse,
+ int *type)
+{
+ unsigned long total_pages = 0, total_pages_to_unuse;
+
+ assert_spin_locked(&swap_lock);
+
+ total_pages = __frontswap_curr_pages();
+ if (total_pages <= target_pages) {
+ /* Nothing to do */
+ *pages_to_unuse = 0;
+ return 1;
+ }
+ total_pages_to_unuse = total_pages - target_pages;
+ return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type);
+}
+
+/*
+ * Frontswap, like a true swap device, may unnecessarily retain pages
+ * under certain circumstances; "shrink" frontswap is essentially a
+ * "partial swapoff" and works by calling try_to_unuse to attempt to
+ * unuse enough frontswap pages to attempt to -- subject to memory
+ * constraints -- reduce the number of pages in frontswap to the
+ * number given in the parameter target_pages.
+ */
+void frontswap_shrink(unsigned long target_pages)
+{
+ unsigned long pages_to_unuse = 0;
+ int type, ret;
+
+ /*
+ * we don't want to hold swap_lock while doing a very
+ * lengthy try_to_unuse, but swap_list may change
+ * so restart scan from swap_active_head each time
+ */
+ spin_lock(&swap_lock);
+ ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
+ spin_unlock(&swap_lock);
+ if (ret == 0)
+ try_to_unuse(type, true, pages_to_unuse);
+ return;
+}
+EXPORT_SYMBOL(frontswap_shrink);
+
+/*
+ * Count and return the number of frontswap pages across all
+ * swap devices. This is exported so that backend drivers can
+ * determine current usage without reading debugfs.
+ */
+unsigned long frontswap_curr_pages(void)
+{
+ unsigned long totalpages = 0;
+
+ spin_lock(&swap_lock);
+ totalpages = __frontswap_curr_pages();
+ spin_unlock(&swap_lock);
+
+ return totalpages;
+}
+EXPORT_SYMBOL(frontswap_curr_pages);
+
+static int __init init_frontswap(void)
+{
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *root = debugfs_create_dir("frontswap", NULL);
+ if (root == NULL)
+ return -ENXIO;
+ debugfs_create_u64("loads", 0444, root, &frontswap_loads);
+ debugfs_create_u64("succ_stores", 0444, root, &frontswap_succ_stores);
+ debugfs_create_u64("failed_stores", 0444, root,
+ &frontswap_failed_stores);
+ debugfs_create_u64("invalidates", 0444, root, &frontswap_invalidates);
+#endif
+ return 0;
+}
+
+module_init(init_frontswap);
diff --git a/mm/gup.c b/mm/gup.c
new file mode 100644
index 000000000..11307a8b2
--- /dev/null
+++ b/mm/gup.c
@@ -0,0 +1,3040 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/spinlock.h>
+
+#include <linux/mm.h>
+#include <linux/memremap.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+
+#include <linux/sched/signal.h>
+#include <linux/rwsem.h>
+#include <linux/hugetlb.h>
+#include <linux/migrate.h>
+#include <linux/mm_inline.h>
+#include <linux/sched/mm.h>
+
+#include <asm/mmu_context.h>
+#include <asm/tlbflush.h>
+
+#include "internal.h"
+
+struct follow_page_context {
+ struct dev_pagemap *pgmap;
+ unsigned int page_mask;
+};
+
+static void hpage_pincount_add(struct page *page, int refs)
+{
+ VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
+ VM_BUG_ON_PAGE(page != compound_head(page), page);
+
+ atomic_add(refs, compound_pincount_ptr(page));
+}
+
+static void hpage_pincount_sub(struct page *page, int refs)
+{
+ VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
+ VM_BUG_ON_PAGE(page != compound_head(page), page);
+
+ atomic_sub(refs, compound_pincount_ptr(page));
+}
+
+/* Equivalent to calling put_page() @refs times. */
+static void put_page_refs(struct page *page, int refs)
+{
+#ifdef CONFIG_DEBUG_VM
+ if (VM_WARN_ON_ONCE_PAGE(page_ref_count(page) < refs, page))
+ return;
+#endif
+
+ /*
+ * Calling put_page() for each ref is unnecessarily slow. Only the last
+ * ref needs a put_page().
+ */
+ if (refs > 1)
+ page_ref_sub(page, refs - 1);
+ put_page(page);
+}
+
+/*
+ * Return the compound head page with ref appropriately incremented,
+ * or NULL if that failed.
+ */
+static inline struct page *try_get_compound_head(struct page *page, int refs)
+{
+ struct page *head = compound_head(page);
+
+ if (WARN_ON_ONCE(page_ref_count(head) < 0))
+ return NULL;
+ if (unlikely(!page_cache_add_speculative(head, refs)))
+ return NULL;
+
+ /*
+ * At this point we have a stable reference to the head page; but it
+ * could be that between the compound_head() lookup and the refcount
+ * increment, the compound page was split, in which case we'd end up
+ * holding a reference on a page that has nothing to do with the page
+ * we were given anymore.
+ * So now that the head page is stable, recheck that the pages still
+ * belong together.
+ */
+ if (unlikely(compound_head(page) != head)) {
+ put_page_refs(head, refs);
+ return NULL;
+ }
+
+ return head;
+}
+
+/*
+ * try_grab_compound_head() - attempt to elevate a page's refcount, by a
+ * flags-dependent amount.
+ *
+ * "grab" names in this file mean, "look at flags to decide whether to use
+ * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount.
+ *
+ * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
+ * same time. (That's true throughout the get_user_pages*() and
+ * pin_user_pages*() APIs.) Cases:
+ *
+ * FOLL_GET: page's refcount will be incremented by 1.
+ * FOLL_PIN: page's refcount will be incremented by GUP_PIN_COUNTING_BIAS.
+ *
+ * Return: head page (with refcount appropriately incremented) for success, or
+ * NULL upon failure. If neither FOLL_GET nor FOLL_PIN was set, that's
+ * considered failure, and furthermore, a likely bug in the caller, so a warning
+ * is also emitted.
+ */
+static __maybe_unused struct page *try_grab_compound_head(struct page *page,
+ int refs,
+ unsigned int flags)
+{
+ if (flags & FOLL_GET)
+ return try_get_compound_head(page, refs);
+ else if (flags & FOLL_PIN) {
+ int orig_refs = refs;
+
+ /*
+ * Can't do FOLL_LONGTERM + FOLL_PIN with CMA in the gup fast
+ * path, so fail and let the caller fall back to the slow path.
+ */
+ if (unlikely(flags & FOLL_LONGTERM) &&
+ is_migrate_cma_page(page))
+ return NULL;
+
+ /*
+ * CAUTION: Don't use compound_head() on the page before this
+ * point, the result won't be stable.
+ */
+ page = try_get_compound_head(page, refs);
+ if (!page)
+ return NULL;
+
+ /*
+ * When pinning a compound page of order > 1 (which is what
+ * hpage_pincount_available() checks for), use an exact count to
+ * track it, via hpage_pincount_add/_sub().
+ *
+ * However, be sure to *also* increment the normal page refcount
+ * field at least once, so that the page really is pinned.
+ */
+ if (hpage_pincount_available(page))
+ hpage_pincount_add(page, refs);
+ else
+ page_ref_add(page, refs * (GUP_PIN_COUNTING_BIAS - 1));
+
+ mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED,
+ orig_refs);
+
+ return page;
+ }
+
+ WARN_ON_ONCE(1);
+ return NULL;
+}
+
+static void put_compound_head(struct page *page, int refs, unsigned int flags)
+{
+ if (flags & FOLL_PIN) {
+ mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED,
+ refs);
+
+ if (hpage_pincount_available(page))
+ hpage_pincount_sub(page, refs);
+ else
+ refs *= GUP_PIN_COUNTING_BIAS;
+ }
+
+ put_page_refs(page, refs);
+}
+
+/**
+ * try_grab_page() - elevate a page's refcount by a flag-dependent amount
+ *
+ * This might not do anything at all, depending on the flags argument.
+ *
+ * "grab" names in this file mean, "look at flags to decide whether to use
+ * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount.
+ *
+ * @page: pointer to page to be grabbed
+ * @flags: gup flags: these are the FOLL_* flag values.
+ *
+ * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same
+ * time. Cases:
+ *
+ * FOLL_GET: page's refcount will be incremented by 1.
+ * FOLL_PIN: page's refcount will be incremented by GUP_PIN_COUNTING_BIAS.
+ *
+ * Return: true for success, or if no action was required (if neither FOLL_PIN
+ * nor FOLL_GET was set, nothing is done). False for failure: FOLL_GET or
+ * FOLL_PIN was set, but the page could not be grabbed.
+ */
+bool __must_check try_grab_page(struct page *page, unsigned int flags)
+{
+ WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN));
+
+ if (flags & FOLL_GET)
+ return try_get_page(page);
+ else if (flags & FOLL_PIN) {
+ int refs = 1;
+
+ page = compound_head(page);
+
+ if (WARN_ON_ONCE(page_ref_count(page) <= 0))
+ return false;
+
+ if (hpage_pincount_available(page))
+ hpage_pincount_add(page, 1);
+ else
+ refs = GUP_PIN_COUNTING_BIAS;
+
+ /*
+ * Similar to try_grab_compound_head(): even if using the
+ * hpage_pincount_add/_sub() routines, be sure to
+ * *also* increment the normal page refcount field at least
+ * once, so that the page really is pinned.
+ */
+ page_ref_add(page, refs);
+
+ mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED, 1);
+ }
+
+ return true;
+}
+
+/**
+ * unpin_user_page() - release a dma-pinned page
+ * @page: pointer to page to be released
+ *
+ * Pages that were pinned via pin_user_pages*() must be released via either
+ * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so
+ * that such pages can be separately tracked and uniquely handled. In
+ * particular, interactions with RDMA and filesystems need special handling.
+ */
+void unpin_user_page(struct page *page)
+{
+ put_compound_head(compound_head(page), 1, FOLL_PIN);
+}
+EXPORT_SYMBOL(unpin_user_page);
+
+/**
+ * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
+ * @pages: array of pages to be maybe marked dirty, and definitely released.
+ * @npages: number of pages in the @pages array.
+ * @make_dirty: whether to mark the pages dirty
+ *
+ * "gup-pinned page" refers to a page that has had one of the get_user_pages()
+ * variants called on that page.
+ *
+ * For each page in the @pages array, make that page (or its head page, if a
+ * compound page) dirty, if @make_dirty is true, and if the page was previously
+ * listed as clean. In any case, releases all pages using unpin_user_page(),
+ * possibly via unpin_user_pages(), for the non-dirty case.
+ *
+ * Please see the unpin_user_page() documentation for details.
+ *
+ * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
+ * required, then the caller should a) verify that this is really correct,
+ * because _lock() is usually required, and b) hand code it:
+ * set_page_dirty_lock(), unpin_user_page().
+ *
+ */
+void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
+ bool make_dirty)
+{
+ unsigned long index;
+
+ /*
+ * TODO: this can be optimized for huge pages: if a series of pages is
+ * physically contiguous and part of the same compound page, then a
+ * single operation to the head page should suffice.
+ */
+
+ if (!make_dirty) {
+ unpin_user_pages(pages, npages);
+ return;
+ }
+
+ for (index = 0; index < npages; index++) {
+ struct page *page = compound_head(pages[index]);
+ /*
+ * Checking PageDirty at this point may race with
+ * clear_page_dirty_for_io(), but that's OK. Two key
+ * cases:
+ *
+ * 1) This code sees the page as already dirty, so it
+ * skips the call to set_page_dirty(). That could happen
+ * because clear_page_dirty_for_io() called
+ * page_mkclean(), followed by set_page_dirty().
+ * However, now the page is going to get written back,
+ * which meets the original intention of setting it
+ * dirty, so all is well: clear_page_dirty_for_io() goes
+ * on to call TestClearPageDirty(), and write the page
+ * back.
+ *
+ * 2) This code sees the page as clean, so it calls
+ * set_page_dirty(). The page stays dirty, despite being
+ * written back, so it gets written back again in the
+ * next writeback cycle. This is harmless.
+ */
+ if (!PageDirty(page))
+ set_page_dirty_lock(page);
+ unpin_user_page(page);
+ }
+}
+EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
+
+/**
+ * unpin_user_pages() - release an array of gup-pinned pages.
+ * @pages: array of pages to be marked dirty and released.
+ * @npages: number of pages in the @pages array.
+ *
+ * For each page in the @pages array, release the page using unpin_user_page().
+ *
+ * Please see the unpin_user_page() documentation for details.
+ */
+void unpin_user_pages(struct page **pages, unsigned long npages)
+{
+ unsigned long index;
+
+ /*
+ * If this WARN_ON() fires, then the system *might* be leaking pages (by
+ * leaving them pinned), but probably not. More likely, gup/pup returned
+ * a hard -ERRNO error to the caller, who erroneously passed it here.
+ */
+ if (WARN_ON(IS_ERR_VALUE(npages)))
+ return;
+ /*
+ * TODO: this can be optimized for huge pages: if a series of pages is
+ * physically contiguous and part of the same compound page, then a
+ * single operation to the head page should suffice.
+ */
+ for (index = 0; index < npages; index++)
+ unpin_user_page(pages[index]);
+}
+EXPORT_SYMBOL(unpin_user_pages);
+
+#ifdef CONFIG_MMU
+static struct page *no_page_table(struct vm_area_struct *vma,
+ unsigned int flags)
+{
+ /*
+ * When core dumping an enormous anonymous area that nobody
+ * has touched so far, we don't want to allocate unnecessary pages or
+ * page tables. Return error instead of NULL to skip handle_mm_fault,
+ * then get_dump_page() will return NULL to leave a hole in the dump.
+ * But we can only make this optimization where a hole would surely
+ * be zero-filled if handle_mm_fault() actually did handle it.
+ */
+ if ((flags & FOLL_DUMP) &&
+ (vma_is_anonymous(vma) || !vma->vm_ops->fault))
+ return ERR_PTR(-EFAULT);
+ return NULL;
+}
+
+static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
+ pte_t *pte, unsigned int flags)
+{
+ /* No page to get reference */
+ if (flags & FOLL_GET)
+ return -EFAULT;
+
+ if (flags & FOLL_TOUCH) {
+ pte_t entry = *pte;
+
+ if (flags & FOLL_WRITE)
+ entry = pte_mkdirty(entry);
+ entry = pte_mkyoung(entry);
+
+ if (!pte_same(*pte, entry)) {
+ set_pte_at(vma->vm_mm, address, pte, entry);
+ update_mmu_cache(vma, address, pte);
+ }
+ }
+
+ /* Proper page table entry exists, but no corresponding struct page */
+ return -EEXIST;
+}
+
+/*
+ * FOLL_FORCE can write to even unwritable pte's, but only
+ * after we've gone through a COW cycle and they are dirty.
+ */
+static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
+{
+ return pte_write(pte) ||
+ ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
+}
+
+static struct page *follow_page_pte(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd, unsigned int flags,
+ struct dev_pagemap **pgmap)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct page *page;
+ spinlock_t *ptl;
+ pte_t *ptep, pte;
+ int ret;
+
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
+ (FOLL_PIN | FOLL_GET)))
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * Considering PTE level hugetlb, like continuous-PTE hugetlb on
+ * ARM64 architecture.
+ */
+ if (is_vm_hugetlb_page(vma)) {
+ page = follow_huge_pmd_pte(vma, address, flags);
+ if (page)
+ return page;
+ return no_page_table(vma, flags);
+ }
+
+retry:
+ if (unlikely(pmd_bad(*pmd)))
+ return no_page_table(vma, flags);
+
+ ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+ pte = *ptep;
+ if (!pte_present(pte)) {
+ swp_entry_t entry;
+ /*
+ * KSM's break_ksm() relies upon recognizing a ksm page
+ * even while it is being migrated, so for that case we
+ * need migration_entry_wait().
+ */
+ if (likely(!(flags & FOLL_MIGRATION)))
+ goto no_page;
+ if (pte_none(pte))
+ goto no_page;
+ entry = pte_to_swp_entry(pte);
+ if (!is_migration_entry(entry))
+ goto no_page;
+ pte_unmap_unlock(ptep, ptl);
+ migration_entry_wait(mm, pmd, address);
+ goto retry;
+ }
+ if ((flags & FOLL_NUMA) && pte_protnone(pte))
+ goto no_page;
+ if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) {
+ pte_unmap_unlock(ptep, ptl);
+ return NULL;
+ }
+
+ page = vm_normal_page(vma, address, pte);
+ if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
+ /*
+ * Only return device mapping pages in the FOLL_GET or FOLL_PIN
+ * case since they are only valid while holding the pgmap
+ * reference.
+ */
+ *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
+ if (*pgmap)
+ page = pte_page(pte);
+ else
+ goto no_page;
+ } else if (unlikely(!page)) {
+ if (flags & FOLL_DUMP) {
+ /* Avoid special (like zero) pages in core dumps */
+ page = ERR_PTR(-EFAULT);
+ goto out;
+ }
+
+ if (is_zero_pfn(pte_pfn(pte))) {
+ page = pte_page(pte);
+ } else {
+ ret = follow_pfn_pte(vma, address, ptep, flags);
+ page = ERR_PTR(ret);
+ goto out;
+ }
+ }
+
+ if (flags & FOLL_SPLIT && PageTransCompound(page)) {
+ get_page(page);
+ pte_unmap_unlock(ptep, ptl);
+ lock_page(page);
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ if (ret)
+ return ERR_PTR(ret);
+ goto retry;
+ }
+
+ /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
+ if (unlikely(!try_grab_page(page, flags))) {
+ page = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+ /*
+ * We need to make the page accessible if and only if we are going
+ * to access its content (the FOLL_PIN case). Please see
+ * Documentation/core-api/pin_user_pages.rst for details.
+ */
+ if (flags & FOLL_PIN) {
+ ret = arch_make_page_accessible(page);
+ if (ret) {
+ unpin_user_page(page);
+ page = ERR_PTR(ret);
+ goto out;
+ }
+ }
+ if (flags & FOLL_TOUCH) {
+ if ((flags & FOLL_WRITE) &&
+ !pte_dirty(pte) && !PageDirty(page))
+ set_page_dirty(page);
+ /*
+ * pte_mkyoung() would be more correct here, but atomic care
+ * is needed to avoid losing the dirty bit: it is easier to use
+ * mark_page_accessed().
+ */
+ mark_page_accessed(page);
+ }
+ if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+ /* Do not mlock pte-mapped THP */
+ if (PageTransCompound(page))
+ goto out;
+
+ /*
+ * The preliminary mapping check is mainly to avoid the
+ * pointless overhead of lock_page on the ZERO_PAGE
+ * which might bounce very badly if there is contention.
+ *
+ * If the page is already locked, we don't need to
+ * handle it now - vmscan will handle it later if and
+ * when it attempts to reclaim the page.
+ */
+ if (page->mapping && trylock_page(page)) {
+ lru_add_drain(); /* push cached pages to LRU */
+ /*
+ * Because we lock page here, and migration is
+ * blocked by the pte's page reference, and we
+ * know the page is still mapped, we don't even
+ * need to check for file-cache page truncation.
+ */
+ mlock_vma_page(page);
+ unlock_page(page);
+ }
+ }
+out:
+ pte_unmap_unlock(ptep, ptl);
+ return page;
+no_page:
+ pte_unmap_unlock(ptep, ptl);
+ if (!pte_none(pte))
+ return NULL;
+ return no_page_table(vma, flags);
+}
+
+static struct page *follow_pmd_mask(struct vm_area_struct *vma,
+ unsigned long address, pud_t *pudp,
+ unsigned int flags,
+ struct follow_page_context *ctx)
+{
+ pmd_t *pmd, pmdval;
+ spinlock_t *ptl;
+ struct page *page;
+ struct mm_struct *mm = vma->vm_mm;
+
+ pmd = pmd_offset(pudp, address);
+ /*
+ * The READ_ONCE() will stabilize the pmdval in a register or
+ * on the stack so that it will stop changing under the code.
+ */
+ pmdval = READ_ONCE(*pmd);
+ if (pmd_none(pmdval))
+ return no_page_table(vma, flags);
+ if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) {
+ page = follow_huge_pmd_pte(vma, address, flags);
+ if (page)
+ return page;
+ return no_page_table(vma, flags);
+ }
+ if (is_hugepd(__hugepd(pmd_val(pmdval)))) {
+ page = follow_huge_pd(vma, address,
+ __hugepd(pmd_val(pmdval)), flags,
+ PMD_SHIFT);
+ if (page)
+ return page;
+ return no_page_table(vma, flags);
+ }
+retry:
+ if (!pmd_present(pmdval)) {
+ if (likely(!(flags & FOLL_MIGRATION)))
+ return no_page_table(vma, flags);
+ VM_BUG_ON(thp_migration_supported() &&
+ !is_pmd_migration_entry(pmdval));
+ if (is_pmd_migration_entry(pmdval))
+ pmd_migration_entry_wait(mm, pmd);
+ pmdval = READ_ONCE(*pmd);
+ /*
+ * MADV_DONTNEED may convert the pmd to null because
+ * mmap_lock is held in read mode
+ */
+ if (pmd_none(pmdval))
+ return no_page_table(vma, flags);
+ goto retry;
+ }
+ if (pmd_devmap(pmdval)) {
+ ptl = pmd_lock(mm, pmd);
+ page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
+ spin_unlock(ptl);
+ if (page)
+ return page;
+ }
+ if (likely(!pmd_trans_huge(pmdval)))
+ return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
+
+ if ((flags & FOLL_NUMA) && pmd_protnone(pmdval))
+ return no_page_table(vma, flags);
+
+retry_locked:
+ ptl = pmd_lock(mm, pmd);
+ if (unlikely(pmd_none(*pmd))) {
+ spin_unlock(ptl);
+ return no_page_table(vma, flags);
+ }
+ if (unlikely(!pmd_present(*pmd))) {
+ spin_unlock(ptl);
+ if (likely(!(flags & FOLL_MIGRATION)))
+ return no_page_table(vma, flags);
+ pmd_migration_entry_wait(mm, pmd);
+ goto retry_locked;
+ }
+ if (unlikely(!pmd_trans_huge(*pmd))) {
+ spin_unlock(ptl);
+ return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
+ }
+ if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
+ int ret;
+ page = pmd_page(*pmd);
+ if (is_huge_zero_page(page)) {
+ spin_unlock(ptl);
+ ret = 0;
+ split_huge_pmd(vma, pmd, address);
+ if (pmd_trans_unstable(pmd))
+ ret = -EBUSY;
+ } else if (flags & FOLL_SPLIT) {
+ if (unlikely(!try_get_page(page))) {
+ spin_unlock(ptl);
+ return ERR_PTR(-ENOMEM);
+ }
+ spin_unlock(ptl);
+ lock_page(page);
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ if (pmd_none(*pmd))
+ return no_page_table(vma, flags);
+ } else { /* flags & FOLL_SPLIT_PMD */
+ spin_unlock(ptl);
+ split_huge_pmd(vma, pmd, address);
+ ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
+ }
+
+ return ret ? ERR_PTR(ret) :
+ follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
+ }
+ page = follow_trans_huge_pmd(vma, address, pmd, flags);
+ spin_unlock(ptl);
+ ctx->page_mask = HPAGE_PMD_NR - 1;
+ return page;
+}
+
+static struct page *follow_pud_mask(struct vm_area_struct *vma,
+ unsigned long address, p4d_t *p4dp,
+ unsigned int flags,
+ struct follow_page_context *ctx)
+{
+ pud_t *pud;
+ spinlock_t *ptl;
+ struct page *page;
+ struct mm_struct *mm = vma->vm_mm;
+
+ pud = pud_offset(p4dp, address);
+ if (pud_none(*pud))
+ return no_page_table(vma, flags);
+ if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) {
+ page = follow_huge_pud(mm, address, pud, flags);
+ if (page)
+ return page;
+ return no_page_table(vma, flags);
+ }
+ if (is_hugepd(__hugepd(pud_val(*pud)))) {
+ page = follow_huge_pd(vma, address,
+ __hugepd(pud_val(*pud)), flags,
+ PUD_SHIFT);
+ if (page)
+ return page;
+ return no_page_table(vma, flags);
+ }
+ if (pud_devmap(*pud)) {
+ ptl = pud_lock(mm, pud);
+ page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
+ spin_unlock(ptl);
+ if (page)
+ return page;
+ }
+ if (unlikely(pud_bad(*pud)))
+ return no_page_table(vma, flags);
+
+ return follow_pmd_mask(vma, address, pud, flags, ctx);
+}
+
+static struct page *follow_p4d_mask(struct vm_area_struct *vma,
+ unsigned long address, pgd_t *pgdp,
+ unsigned int flags,
+ struct follow_page_context *ctx)
+{
+ p4d_t *p4d;
+ struct page *page;
+
+ p4d = p4d_offset(pgdp, address);
+ if (p4d_none(*p4d))
+ return no_page_table(vma, flags);
+ BUILD_BUG_ON(p4d_huge(*p4d));
+ if (unlikely(p4d_bad(*p4d)))
+ return no_page_table(vma, flags);
+
+ if (is_hugepd(__hugepd(p4d_val(*p4d)))) {
+ page = follow_huge_pd(vma, address,
+ __hugepd(p4d_val(*p4d)), flags,
+ P4D_SHIFT);
+ if (page)
+ return page;
+ return no_page_table(vma, flags);
+ }
+ return follow_pud_mask(vma, address, p4d, flags, ctx);
+}
+
+/**
+ * follow_page_mask - look up a page descriptor from a user-virtual address
+ * @vma: vm_area_struct mapping @address
+ * @address: virtual address to look up
+ * @flags: flags modifying lookup behaviour
+ * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a
+ * pointer to output page_mask
+ *
+ * @flags can have FOLL_ flags set, defined in <linux/mm.h>
+ *
+ * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches
+ * the device's dev_pagemap metadata to avoid repeating expensive lookups.
+ *
+ * On output, the @ctx->page_mask is set according to the size of the page.
+ *
+ * Return: the mapped (struct page *), %NULL if no mapping exists, or
+ * an error pointer if there is a mapping to something not represented
+ * by a page descriptor (see also vm_normal_page()).
+ */
+static struct page *follow_page_mask(struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags,
+ struct follow_page_context *ctx)
+{
+ pgd_t *pgd;
+ struct page *page;
+ struct mm_struct *mm = vma->vm_mm;
+
+ ctx->page_mask = 0;
+
+ /* make this handle hugepd */
+ page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
+ if (!IS_ERR(page)) {
+ WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN));
+ return page;
+ }
+
+ pgd = pgd_offset(mm, address);
+
+ if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+ return no_page_table(vma, flags);
+
+ if (pgd_huge(*pgd)) {
+ page = follow_huge_pgd(mm, address, pgd, flags);
+ if (page)
+ return page;
+ return no_page_table(vma, flags);
+ }
+ if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
+ page = follow_huge_pd(vma, address,
+ __hugepd(pgd_val(*pgd)), flags,
+ PGDIR_SHIFT);
+ if (page)
+ return page;
+ return no_page_table(vma, flags);
+ }
+
+ return follow_p4d_mask(vma, address, pgd, flags, ctx);
+}
+
+struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
+ unsigned int foll_flags)
+{
+ struct follow_page_context ctx = { NULL };
+ struct page *page;
+
+ page = follow_page_mask(vma, address, foll_flags, &ctx);
+ if (ctx.pgmap)
+ put_dev_pagemap(ctx.pgmap);
+ return page;
+}
+
+static int get_gate_page(struct mm_struct *mm, unsigned long address,
+ unsigned int gup_flags, struct vm_area_struct **vma,
+ struct page **page)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ int ret = -EFAULT;
+
+ /* user gate pages are read-only */
+ if (gup_flags & FOLL_WRITE)
+ return -EFAULT;
+ if (address > TASK_SIZE)
+ pgd = pgd_offset_k(address);
+ else
+ pgd = pgd_offset_gate(mm, address);
+ if (pgd_none(*pgd))
+ return -EFAULT;
+ p4d = p4d_offset(pgd, address);
+ if (p4d_none(*p4d))
+ return -EFAULT;
+ pud = pud_offset(p4d, address);
+ if (pud_none(*pud))
+ return -EFAULT;
+ pmd = pmd_offset(pud, address);
+ if (!pmd_present(*pmd))
+ return -EFAULT;
+ VM_BUG_ON(pmd_trans_huge(*pmd));
+ pte = pte_offset_map(pmd, address);
+ if (pte_none(*pte))
+ goto unmap;
+ *vma = get_gate_vma(mm);
+ if (!page)
+ goto out;
+ *page = vm_normal_page(*vma, address, *pte);
+ if (!*page) {
+ if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
+ goto unmap;
+ *page = pte_page(*pte);
+ }
+ if (unlikely(!try_grab_page(*page, gup_flags))) {
+ ret = -ENOMEM;
+ goto unmap;
+ }
+out:
+ ret = 0;
+unmap:
+ pte_unmap(pte);
+ return ret;
+}
+
+/*
+ * mmap_lock must be held on entry. If @locked != NULL and *@flags
+ * does not include FOLL_NOWAIT, the mmap_lock may be released. If it
+ * is, *@locked will be set to 0 and -EBUSY returned.
+ */
+static int faultin_page(struct vm_area_struct *vma,
+ unsigned long address, unsigned int *flags, int *locked)
+{
+ unsigned int fault_flags = 0;
+ vm_fault_t ret;
+
+ /* mlock all present pages, but do not fault in new pages */
+ if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
+ return -ENOENT;
+ if (*flags & FOLL_WRITE)
+ fault_flags |= FAULT_FLAG_WRITE;
+ if (*flags & FOLL_REMOTE)
+ fault_flags |= FAULT_FLAG_REMOTE;
+ if (locked)
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+ if (*flags & FOLL_NOWAIT)
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
+ if (*flags & FOLL_TRIED) {
+ /*
+ * Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED
+ * can co-exist
+ */
+ fault_flags |= FAULT_FLAG_TRIED;
+ }
+
+ ret = handle_mm_fault(vma, address, fault_flags, NULL);
+ if (ret & VM_FAULT_ERROR) {
+ int err = vm_fault_to_errno(ret, *flags);
+
+ if (err)
+ return err;
+ BUG();
+ }
+
+ if (ret & VM_FAULT_RETRY) {
+ if (locked && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
+ *locked = 0;
+ return -EBUSY;
+ }
+
+ /*
+ * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
+ * necessary, even if maybe_mkwrite decided not to set pte_write. We
+ * can thus safely do subsequent page lookups as if they were reads.
+ * But only do so when looping for pte_write is futile: in some cases
+ * userspace may also be wanting to write to the gotten user page,
+ * which a read fault here might prevent (a readonly page might get
+ * reCOWed by userspace write).
+ */
+ if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
+ *flags |= FOLL_COW;
+ return 0;
+}
+
+static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
+{
+ vm_flags_t vm_flags = vma->vm_flags;
+ int write = (gup_flags & FOLL_WRITE);
+ int foreign = (gup_flags & FOLL_REMOTE);
+
+ if (vm_flags & (VM_IO | VM_PFNMAP))
+ return -EFAULT;
+
+ if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma))
+ return -EFAULT;
+
+ if (write) {
+ if (!(vm_flags & VM_WRITE)) {
+ if (!(gup_flags & FOLL_FORCE))
+ return -EFAULT;
+ /*
+ * We used to let the write,force case do COW in a
+ * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
+ * set a breakpoint in a read-only mapping of an
+ * executable, without corrupting the file (yet only
+ * when that file had been opened for writing!).
+ * Anon pages in shared mappings are surprising: now
+ * just reject it.
+ */
+ if (!is_cow_mapping(vm_flags))
+ return -EFAULT;
+ }
+ } else if (!(vm_flags & VM_READ)) {
+ if (!(gup_flags & FOLL_FORCE))
+ return -EFAULT;
+ /*
+ * Is there actually any vma we can reach here which does not
+ * have VM_MAYREAD set?
+ */
+ if (!(vm_flags & VM_MAYREAD))
+ return -EFAULT;
+ }
+ /*
+ * gups are always data accesses, not instruction
+ * fetches, so execute=false here
+ */
+ if (!arch_vma_access_permitted(vma, write, false, foreign))
+ return -EFAULT;
+ return 0;
+}
+
+/**
+ * __get_user_pages() - pin user pages in memory
+ * @mm: mm_struct of target mm
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying pin behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long. Or NULL, if caller
+ * only intends to ensure the pages are faulted in.
+ * @vmas: array of pointers to vmas corresponding to each page.
+ * Or NULL if the caller does not require them.
+ * @locked: whether we're still with the mmap_lock held
+ *
+ * Returns either number of pages pinned (which may be less than the
+ * number requested), or an error. Details about the return value:
+ *
+ * -- If nr_pages is 0, returns 0.
+ * -- If nr_pages is >0, but no pages were pinned, returns -errno.
+ * -- If nr_pages is >0, and some pages were pinned, returns the number of
+ * pages pinned. Again, this may be less than nr_pages.
+ * -- 0 return value is possible when the fault would need to be retried.
+ *
+ * The caller is responsible for releasing returned @pages, via put_page().
+ *
+ * @vmas are valid only as long as mmap_lock is held.
+ *
+ * Must be called with mmap_lock held. It may be released. See below.
+ *
+ * __get_user_pages walks a process's page tables and takes a reference to
+ * each struct page that each user address corresponds to at a given
+ * instant. That is, it takes the page that would be accessed if a user
+ * thread accesses the given user virtual address at that instant.
+ *
+ * This does not guarantee that the page exists in the user mappings when
+ * __get_user_pages returns, and there may even be a completely different
+ * page there in some cases (eg. if mmapped pagecache has been invalidated
+ * and subsequently re faulted). However it does guarantee that the page
+ * won't be freed completely. And mostly callers simply care that the page
+ * contains data that was valid *at some point in time*. Typically, an IO
+ * or similar operation cannot guarantee anything stronger anyway because
+ * locks can't be held over the syscall boundary.
+ *
+ * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
+ * the page is written to, set_page_dirty (or set_page_dirty_lock, as
+ * appropriate) must be called after the page is finished with, and
+ * before put_page is called.
+ *
+ * If @locked != NULL, *@locked will be set to 0 when mmap_lock is
+ * released by an up_read(). That can happen if @gup_flags does not
+ * have FOLL_NOWAIT.
+ *
+ * A caller using such a combination of @locked and @gup_flags
+ * must therefore hold the mmap_lock for reading only, and recognize
+ * when it's been released. Otherwise, it must be held for either
+ * reading or writing and will not be released.
+ *
+ * In most cases, get_user_pages or get_user_pages_fast should be used
+ * instead of __get_user_pages. __get_user_pages should be used only if
+ * you need some special @gup_flags.
+ */
+static long __get_user_pages(struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas, int *locked)
+{
+ long ret = 0, i = 0;
+ struct vm_area_struct *vma = NULL;
+ struct follow_page_context ctx = { NULL };
+
+ if (!nr_pages)
+ return 0;
+
+ start = untagged_addr(start);
+
+ VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
+
+ /*
+ * If FOLL_FORCE is set then do not force a full fault as the hinting
+ * fault information is unrelated to the reference behaviour of a task
+ * using the address space
+ */
+ if (!(gup_flags & FOLL_FORCE))
+ gup_flags |= FOLL_NUMA;
+
+ do {
+ struct page *page;
+ unsigned int foll_flags = gup_flags;
+ unsigned int page_increm;
+
+ /* first iteration or cross vma bound */
+ if (!vma || start >= vma->vm_end) {
+ vma = find_extend_vma(mm, start);
+ if (!vma && in_gate_area(mm, start)) {
+ ret = get_gate_page(mm, start & PAGE_MASK,
+ gup_flags, &vma,
+ pages ? &pages[i] : NULL);
+ if (ret)
+ goto out;
+ ctx.page_mask = 0;
+ goto next_page;
+ }
+
+ if (!vma || check_vma_flags(vma, gup_flags)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ if (is_vm_hugetlb_page(vma)) {
+ i = follow_hugetlb_page(mm, vma, pages, vmas,
+ &start, &nr_pages, i,
+ gup_flags, locked);
+ if (locked && *locked == 0) {
+ /*
+ * We've got a VM_FAULT_RETRY
+ * and we've lost mmap_lock.
+ * We must stop here.
+ */
+ BUG_ON(gup_flags & FOLL_NOWAIT);
+ BUG_ON(ret != 0);
+ goto out;
+ }
+ continue;
+ }
+ }
+retry:
+ /*
+ * If we have a pending SIGKILL, don't keep faulting pages and
+ * potentially allocating memory.
+ */
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ goto out;
+ }
+ cond_resched();
+
+ page = follow_page_mask(vma, start, foll_flags, &ctx);
+ if (!page) {
+ ret = faultin_page(vma, start, &foll_flags, locked);
+ switch (ret) {
+ case 0:
+ goto retry;
+ case -EBUSY:
+ ret = 0;
+ fallthrough;
+ case -EFAULT:
+ case -ENOMEM:
+ case -EHWPOISON:
+ goto out;
+ case -ENOENT:
+ goto next_page;
+ }
+ BUG();
+ } else if (PTR_ERR(page) == -EEXIST) {
+ /*
+ * Proper page table entry exists, but no corresponding
+ * struct page.
+ */
+ goto next_page;
+ } else if (IS_ERR(page)) {
+ ret = PTR_ERR(page);
+ goto out;
+ }
+ if (pages) {
+ pages[i] = page;
+ flush_anon_page(vma, page, start);
+ flush_dcache_page(page);
+ ctx.page_mask = 0;
+ }
+next_page:
+ if (vmas) {
+ vmas[i] = vma;
+ ctx.page_mask = 0;
+ }
+ page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
+ if (page_increm > nr_pages)
+ page_increm = nr_pages;
+ i += page_increm;
+ start += page_increm * PAGE_SIZE;
+ nr_pages -= page_increm;
+ } while (nr_pages);
+out:
+ if (ctx.pgmap)
+ put_dev_pagemap(ctx.pgmap);
+ return i ? i : ret;
+}
+
+static bool vma_permits_fault(struct vm_area_struct *vma,
+ unsigned int fault_flags)
+{
+ bool write = !!(fault_flags & FAULT_FLAG_WRITE);
+ bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
+ vm_flags_t vm_flags = write ? VM_WRITE : VM_READ;
+
+ if (!(vm_flags & vma->vm_flags))
+ return false;
+
+ /*
+ * The architecture might have a hardware protection
+ * mechanism other than read/write that can deny access.
+ *
+ * gup always represents data access, not instruction
+ * fetches, so execute=false here:
+ */
+ if (!arch_vma_access_permitted(vma, write, false, foreign))
+ return false;
+
+ return true;
+}
+
+/**
+ * fixup_user_fault() - manually resolve a user page fault
+ * @mm: mm_struct of target mm
+ * @address: user address
+ * @fault_flags:flags to pass down to handle_mm_fault()
+ * @unlocked: did we unlock the mmap_lock while retrying, maybe NULL if caller
+ * does not allow retry. If NULL, the caller must guarantee
+ * that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY.
+ *
+ * This is meant to be called in the specific scenario where for locking reasons
+ * we try to access user memory in atomic context (within a pagefault_disable()
+ * section), this returns -EFAULT, and we want to resolve the user fault before
+ * trying again.
+ *
+ * Typically this is meant to be used by the futex code.
+ *
+ * The main difference with get_user_pages() is that this function will
+ * unconditionally call handle_mm_fault() which will in turn perform all the
+ * necessary SW fixup of the dirty and young bits in the PTE, while
+ * get_user_pages() only guarantees to update these in the struct page.
+ *
+ * This is important for some architectures where those bits also gate the
+ * access permission to the page because they are maintained in software. On
+ * such architectures, gup() will not be enough to make a subsequent access
+ * succeed.
+ *
+ * This function will not return with an unlocked mmap_lock. So it has not the
+ * same semantics wrt the @mm->mmap_lock as does filemap_fault().
+ */
+int fixup_user_fault(struct mm_struct *mm,
+ unsigned long address, unsigned int fault_flags,
+ bool *unlocked)
+{
+ struct vm_area_struct *vma;
+ vm_fault_t ret, major = 0;
+
+ address = untagged_addr(address);
+
+ if (unlocked)
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+
+retry:
+ vma = find_extend_vma(mm, address);
+ if (!vma || address < vma->vm_start)
+ return -EFAULT;
+
+ if (!vma_permits_fault(vma, fault_flags))
+ return -EFAULT;
+
+ if ((fault_flags & FAULT_FLAG_KILLABLE) &&
+ fatal_signal_pending(current))
+ return -EINTR;
+
+ ret = handle_mm_fault(vma, address, fault_flags, NULL);
+ major |= ret & VM_FAULT_MAJOR;
+ if (ret & VM_FAULT_ERROR) {
+ int err = vm_fault_to_errno(ret, 0);
+
+ if (err)
+ return err;
+ BUG();
+ }
+
+ if (ret & VM_FAULT_RETRY) {
+ mmap_read_lock(mm);
+ *unlocked = true;
+ fault_flags |= FAULT_FLAG_TRIED;
+ goto retry;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(fixup_user_fault);
+
+/*
+ * Please note that this function, unlike __get_user_pages will not
+ * return 0 for nr_pages > 0 without FOLL_NOWAIT
+ */
+static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
+ unsigned long start,
+ unsigned long nr_pages,
+ struct page **pages,
+ struct vm_area_struct **vmas,
+ int *locked,
+ unsigned int flags)
+{
+ long ret, pages_done;
+ bool lock_dropped;
+
+ if (locked) {
+ /* if VM_FAULT_RETRY can be returned, vmas become invalid */
+ BUG_ON(vmas);
+ /* check caller initialized locked */
+ BUG_ON(*locked != 1);
+ }
+
+ if (flags & FOLL_PIN)
+ atomic_set(&mm->has_pinned, 1);
+
+ /*
+ * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
+ * is to set FOLL_GET if the caller wants pages[] filled in (but has
+ * carelessly failed to specify FOLL_GET), so keep doing that, but only
+ * for FOLL_GET, not for the newer FOLL_PIN.
+ *
+ * FOLL_PIN always expects pages to be non-null, but no need to assert
+ * that here, as any failures will be obvious enough.
+ */
+ if (pages && !(flags & FOLL_PIN))
+ flags |= FOLL_GET;
+
+ pages_done = 0;
+ lock_dropped = false;
+ for (;;) {
+ ret = __get_user_pages(mm, start, nr_pages, flags, pages,
+ vmas, locked);
+ if (!locked)
+ /* VM_FAULT_RETRY couldn't trigger, bypass */
+ return ret;
+
+ /* VM_FAULT_RETRY cannot return errors */
+ if (!*locked) {
+ BUG_ON(ret < 0);
+ BUG_ON(ret >= nr_pages);
+ }
+
+ if (ret > 0) {
+ nr_pages -= ret;
+ pages_done += ret;
+ if (!nr_pages)
+ break;
+ }
+ if (*locked) {
+ /*
+ * VM_FAULT_RETRY didn't trigger or it was a
+ * FOLL_NOWAIT.
+ */
+ if (!pages_done)
+ pages_done = ret;
+ break;
+ }
+ /*
+ * VM_FAULT_RETRY triggered, so seek to the faulting offset.
+ * For the prefault case (!pages) we only update counts.
+ */
+ if (likely(pages))
+ pages += ret;
+ start += ret << PAGE_SHIFT;
+ lock_dropped = true;
+
+retry:
+ /*
+ * Repeat on the address that fired VM_FAULT_RETRY
+ * with both FAULT_FLAG_ALLOW_RETRY and
+ * FAULT_FLAG_TRIED. Note that GUP can be interrupted
+ * by fatal signals, so we need to check it before we
+ * start trying again otherwise it can loop forever.
+ */
+
+ if (fatal_signal_pending(current)) {
+ if (!pages_done)
+ pages_done = -EINTR;
+ break;
+ }
+
+ ret = mmap_read_lock_killable(mm);
+ if (ret) {
+ BUG_ON(ret > 0);
+ if (!pages_done)
+ pages_done = ret;
+ break;
+ }
+
+ *locked = 1;
+ ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
+ pages, NULL, locked);
+ if (!*locked) {
+ /* Continue to retry until we succeeded */
+ BUG_ON(ret != 0);
+ goto retry;
+ }
+ if (ret != 1) {
+ BUG_ON(ret > 1);
+ if (!pages_done)
+ pages_done = ret;
+ break;
+ }
+ nr_pages--;
+ pages_done++;
+ if (!nr_pages)
+ break;
+ if (likely(pages))
+ pages++;
+ start += PAGE_SIZE;
+ }
+ if (lock_dropped && *locked) {
+ /*
+ * We must let the caller know we temporarily dropped the lock
+ * and so the critical section protected by it was lost.
+ */
+ mmap_read_unlock(mm);
+ *locked = 0;
+ }
+ return pages_done;
+}
+
+/**
+ * populate_vma_page_range() - populate a range of pages in the vma.
+ * @vma: target vma
+ * @start: start address
+ * @end: end address
+ * @locked: whether the mmap_lock is still held
+ *
+ * This takes care of mlocking the pages too if VM_LOCKED is set.
+ *
+ * Return either number of pages pinned in the vma, or a negative error
+ * code on error.
+ *
+ * vma->vm_mm->mmap_lock must be held.
+ *
+ * If @locked is NULL, it may be held for read or write and will
+ * be unperturbed.
+ *
+ * If @locked is non-NULL, it must held for read only and may be
+ * released. If it's released, *@locked will be set to 0.
+ */
+long populate_vma_page_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, int *locked)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long nr_pages = (end - start) / PAGE_SIZE;
+ int gup_flags;
+
+ VM_BUG_ON(start & ~PAGE_MASK);
+ VM_BUG_ON(end & ~PAGE_MASK);
+ VM_BUG_ON_VMA(start < vma->vm_start, vma);
+ VM_BUG_ON_VMA(end > vma->vm_end, vma);
+ mmap_assert_locked(mm);
+
+ gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
+ if (vma->vm_flags & VM_LOCKONFAULT)
+ gup_flags &= ~FOLL_POPULATE;
+ /*
+ * We want to touch writable mappings with a write fault in order
+ * to break COW, except for shared mappings because these don't COW
+ * and we would not want to dirty them for nothing.
+ */
+ if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
+ gup_flags |= FOLL_WRITE;
+
+ /*
+ * We want mlock to succeed for regions that have any permissions
+ * other than PROT_NONE.
+ */
+ if (vma_is_accessible(vma))
+ gup_flags |= FOLL_FORCE;
+
+ /*
+ * We made sure addr is within a VMA, so the following will
+ * not result in a stack expansion that recurses back here.
+ */
+ return __get_user_pages(mm, start, nr_pages, gup_flags,
+ NULL, NULL, locked);
+}
+
+/*
+ * __mm_populate - populate and/or mlock pages within a range of address space.
+ *
+ * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
+ * flags. VMAs must be already marked with the desired vm_flags, and
+ * mmap_lock must not be held.
+ */
+int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long end, nstart, nend;
+ struct vm_area_struct *vma = NULL;
+ int locked = 0;
+ long ret = 0;
+
+ end = start + len;
+
+ for (nstart = start; nstart < end; nstart = nend) {
+ /*
+ * We want to fault in pages for [nstart; end) address range.
+ * Find first corresponding VMA.
+ */
+ if (!locked) {
+ locked = 1;
+ mmap_read_lock(mm);
+ vma = find_vma(mm, nstart);
+ } else if (nstart >= vma->vm_end)
+ vma = vma->vm_next;
+ if (!vma || vma->vm_start >= end)
+ break;
+ /*
+ * Set [nstart; nend) to intersection of desired address
+ * range with the first VMA. Also, skip undesirable VMA types.
+ */
+ nend = min(end, vma->vm_end);
+ if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+ continue;
+ if (nstart < vma->vm_start)
+ nstart = vma->vm_start;
+ /*
+ * Now fault in a range of pages. populate_vma_page_range()
+ * double checks the vma flags, so that it won't mlock pages
+ * if the vma was already munlocked.
+ */
+ ret = populate_vma_page_range(vma, nstart, nend, &locked);
+ if (ret < 0) {
+ if (ignore_errors) {
+ ret = 0;
+ continue; /* continue at next VMA */
+ }
+ break;
+ }
+ nend = nstart + ret * PAGE_SIZE;
+ ret = 0;
+ }
+ if (locked)
+ mmap_read_unlock(mm);
+ return ret; /* 0 or negative error code */
+}
+#else /* CONFIG_MMU */
+static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
+ unsigned long nr_pages, struct page **pages,
+ struct vm_area_struct **vmas, int *locked,
+ unsigned int foll_flags)
+{
+ struct vm_area_struct *vma;
+ unsigned long vm_flags;
+ int i;
+
+ /* calculate required read or write permissions.
+ * If FOLL_FORCE is set, we only require the "MAY" flags.
+ */
+ vm_flags = (foll_flags & FOLL_WRITE) ?
+ (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
+ vm_flags &= (foll_flags & FOLL_FORCE) ?
+ (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+
+ for (i = 0; i < nr_pages; i++) {
+ vma = find_vma(mm, start);
+ if (!vma)
+ goto finish_or_fault;
+
+ /* protect what we can, including chardevs */
+ if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
+ !(vm_flags & vma->vm_flags))
+ goto finish_or_fault;
+
+ if (pages) {
+ pages[i] = virt_to_page(start);
+ if (pages[i])
+ get_page(pages[i]);
+ }
+ if (vmas)
+ vmas[i] = vma;
+ start = (start + PAGE_SIZE) & PAGE_MASK;
+ }
+
+ return i;
+
+finish_or_fault:
+ return i ? : -EFAULT;
+}
+#endif /* !CONFIG_MMU */
+
+/**
+ * get_dump_page() - pin user page in memory while writing it to core dump
+ * @addr: user address
+ *
+ * Returns struct page pointer of user page pinned for dump,
+ * to be freed afterwards by put_page().
+ *
+ * Returns NULL on any kind of failure - a hole must then be inserted into
+ * the corefile, to preserve alignment with its headers; and also returns
+ * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
+ * allowing a hole to be left in the corefile to save diskspace.
+ *
+ * Called without mmap_lock (takes and releases the mmap_lock by itself).
+ */
+#ifdef CONFIG_ELF_CORE
+struct page *get_dump_page(unsigned long addr)
+{
+ struct mm_struct *mm = current->mm;
+ struct page *page;
+ int locked = 1;
+ int ret;
+
+ if (mmap_read_lock_killable(mm))
+ return NULL;
+ ret = __get_user_pages_locked(mm, addr, 1, &page, NULL, &locked,
+ FOLL_FORCE | FOLL_DUMP | FOLL_GET);
+ if (locked)
+ mmap_read_unlock(mm);
+ return (ret == 1) ? page : NULL;
+}
+#endif /* CONFIG_ELF_CORE */
+
+#if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA)
+static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
+{
+ long i;
+ struct vm_area_struct *vma_prev = NULL;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct vm_area_struct *vma = vmas[i];
+
+ if (vma == vma_prev)
+ continue;
+
+ vma_prev = vma;
+
+ if (vma_is_fsdax(vma))
+ return true;
+ }
+ return false;
+}
+
+#ifdef CONFIG_CMA
+static long check_and_migrate_cma_pages(struct mm_struct *mm,
+ unsigned long start,
+ unsigned long nr_pages,
+ struct page **pages,
+ struct vm_area_struct **vmas,
+ unsigned int gup_flags)
+{
+ unsigned long i, isolation_error_count;
+ bool drain_allow;
+ LIST_HEAD(cma_page_list);
+ long ret = nr_pages;
+ struct page *prev_head, *head;
+ struct migration_target_control mtc = {
+ .nid = NUMA_NO_NODE,
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_NOWARN,
+ };
+
+check_again:
+ prev_head = NULL;
+ isolation_error_count = 0;
+ drain_allow = true;
+ for (i = 0; i < nr_pages; i++) {
+ head = compound_head(pages[i]);
+ if (head == prev_head)
+ continue;
+ prev_head = head;
+ /*
+ * If we get a page from the CMA zone, since we are going to
+ * be pinning these entries, we might as well move them out
+ * of the CMA zone if possible.
+ */
+ if (is_migrate_cma_page(head)) {
+ if (PageHuge(head)) {
+ if (isolate_hugetlb(head, &cma_page_list))
+ isolation_error_count++;
+ } else {
+ if (!PageLRU(head) && drain_allow) {
+ lru_add_drain_all();
+ drain_allow = false;
+ }
+
+ if (isolate_lru_page(head)) {
+ isolation_error_count++;
+ continue;
+ }
+ list_add_tail(&head->lru, &cma_page_list);
+ mod_node_page_state(page_pgdat(head),
+ NR_ISOLATED_ANON +
+ page_is_file_lru(head),
+ thp_nr_pages(head));
+ }
+ }
+ }
+
+ /*
+ * If list is empty, and no isolation errors, means that all pages are
+ * in the correct zone.
+ */
+ if (list_empty(&cma_page_list) && !isolation_error_count)
+ return ret;
+
+ if (!list_empty(&cma_page_list)) {
+ /*
+ * drop the above get_user_pages reference.
+ */
+ if (gup_flags & FOLL_PIN)
+ unpin_user_pages(pages, nr_pages);
+ else
+ for (i = 0; i < nr_pages; i++)
+ put_page(pages[i]);
+
+ ret = migrate_pages(&cma_page_list, alloc_migration_target,
+ NULL, (unsigned long)&mtc, MIGRATE_SYNC,
+ MR_CONTIG_RANGE);
+ if (ret) {
+ if (!list_empty(&cma_page_list))
+ putback_movable_pages(&cma_page_list);
+ return ret > 0 ? -ENOMEM : ret;
+ }
+
+ /* We unpinned pages before migration, pin them again */
+ ret = __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
+ NULL, gup_flags);
+ if (ret <= 0)
+ return ret;
+ nr_pages = ret;
+ }
+
+ /*
+ * check again because pages were unpinned, and we also might have
+ * had isolation errors and need more pages to migrate.
+ */
+ goto check_again;
+}
+#else
+static long check_and_migrate_cma_pages(struct mm_struct *mm,
+ unsigned long start,
+ unsigned long nr_pages,
+ struct page **pages,
+ struct vm_area_struct **vmas,
+ unsigned int gup_flags)
+{
+ return nr_pages;
+}
+#endif /* CONFIG_CMA */
+
+/*
+ * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
+ * allows us to process the FOLL_LONGTERM flag.
+ */
+static long __gup_longterm_locked(struct mm_struct *mm,
+ unsigned long start,
+ unsigned long nr_pages,
+ struct page **pages,
+ struct vm_area_struct **vmas,
+ unsigned int gup_flags)
+{
+ struct vm_area_struct **vmas_tmp = vmas;
+ unsigned long flags = 0;
+ long rc, i;
+
+ if (gup_flags & FOLL_LONGTERM) {
+ if (!pages)
+ return -EINVAL;
+
+ if (!vmas_tmp) {
+ vmas_tmp = kcalloc(nr_pages,
+ sizeof(struct vm_area_struct *),
+ GFP_KERNEL);
+ if (!vmas_tmp)
+ return -ENOMEM;
+ }
+ flags = memalloc_nocma_save();
+ }
+
+ rc = __get_user_pages_locked(mm, start, nr_pages, pages,
+ vmas_tmp, NULL, gup_flags);
+
+ if (gup_flags & FOLL_LONGTERM) {
+ if (rc < 0)
+ goto out;
+
+ if (check_dax_vmas(vmas_tmp, rc)) {
+ if (gup_flags & FOLL_PIN)
+ unpin_user_pages(pages, rc);
+ else
+ for (i = 0; i < rc; i++)
+ put_page(pages[i]);
+ rc = -EOPNOTSUPP;
+ goto out;
+ }
+
+ rc = check_and_migrate_cma_pages(mm, start, rc, pages,
+ vmas_tmp, gup_flags);
+out:
+ memalloc_nocma_restore(flags);
+ }
+
+ if (vmas_tmp != vmas)
+ kfree(vmas_tmp);
+ return rc;
+}
+#else /* !CONFIG_FS_DAX && !CONFIG_CMA */
+static __always_inline long __gup_longterm_locked(struct mm_struct *mm,
+ unsigned long start,
+ unsigned long nr_pages,
+ struct page **pages,
+ struct vm_area_struct **vmas,
+ unsigned int flags)
+{
+ return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
+ NULL, flags);
+}
+#endif /* CONFIG_FS_DAX || CONFIG_CMA */
+
+static bool is_valid_gup_flags(unsigned int gup_flags)
+{
+ /*
+ * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
+ * never directly by the caller, so enforce that with an assertion:
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
+ return false;
+ /*
+ * FOLL_PIN is a prerequisite to FOLL_LONGTERM. Another way of saying
+ * that is, FOLL_LONGTERM is a specific case, more restrictive case of
+ * FOLL_PIN.
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
+ return false;
+
+ return true;
+}
+
+#ifdef CONFIG_MMU
+static long __get_user_pages_remote(struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas, int *locked)
+{
+ /*
+ * Parts of FOLL_LONGTERM behavior are incompatible with
+ * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
+ * vmas. However, this only comes up if locked is set, and there are
+ * callers that do request FOLL_LONGTERM, but do not set locked. So,
+ * allow what we can.
+ */
+ if (gup_flags & FOLL_LONGTERM) {
+ if (WARN_ON_ONCE(locked))
+ return -EINVAL;
+ /*
+ * This will check the vmas (even if our vmas arg is NULL)
+ * and return -ENOTSUPP if DAX isn't allowed in this case:
+ */
+ return __gup_longterm_locked(mm, start, nr_pages, pages,
+ vmas, gup_flags | FOLL_TOUCH |
+ FOLL_REMOTE);
+ }
+
+ return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
+ locked,
+ gup_flags | FOLL_TOUCH | FOLL_REMOTE);
+}
+
+/**
+ * get_user_pages_remote() - pin user pages in memory
+ * @mm: mm_struct of target mm
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying lookup behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long. Or NULL, if caller
+ * only intends to ensure the pages are faulted in.
+ * @vmas: array of pointers to vmas corresponding to each page.
+ * Or NULL if the caller does not require them.
+ * @locked: pointer to lock flag indicating whether lock is held and
+ * subsequently whether VM_FAULT_RETRY functionality can be
+ * utilised. Lock must initially be held.
+ *
+ * Returns either number of pages pinned (which may be less than the
+ * number requested), or an error. Details about the return value:
+ *
+ * -- If nr_pages is 0, returns 0.
+ * -- If nr_pages is >0, but no pages were pinned, returns -errno.
+ * -- If nr_pages is >0, and some pages were pinned, returns the number of
+ * pages pinned. Again, this may be less than nr_pages.
+ *
+ * The caller is responsible for releasing returned @pages, via put_page().
+ *
+ * @vmas are valid only as long as mmap_lock is held.
+ *
+ * Must be called with mmap_lock held for read or write.
+ *
+ * get_user_pages_remote walks a process's page tables and takes a reference
+ * to each struct page that each user address corresponds to at a given
+ * instant. That is, it takes the page that would be accessed if a user
+ * thread accesses the given user virtual address at that instant.
+ *
+ * This does not guarantee that the page exists in the user mappings when
+ * get_user_pages_remote returns, and there may even be a completely different
+ * page there in some cases (eg. if mmapped pagecache has been invalidated
+ * and subsequently re faulted). However it does guarantee that the page
+ * won't be freed completely. And mostly callers simply care that the page
+ * contains data that was valid *at some point in time*. Typically, an IO
+ * or similar operation cannot guarantee anything stronger anyway because
+ * locks can't be held over the syscall boundary.
+ *
+ * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
+ * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
+ * be called after the page is finished with, and before put_page is called.
+ *
+ * get_user_pages_remote is typically used for fewer-copy IO operations,
+ * to get a handle on the memory by some means other than accesses
+ * via the user virtual addresses. The pages may be submitted for
+ * DMA to devices or accessed via their kernel linear mapping (via the
+ * kmap APIs). Care should be taken to use the correct cache flushing APIs.
+ *
+ * See also get_user_pages_fast, for performance critical applications.
+ *
+ * get_user_pages_remote should be phased out in favor of
+ * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
+ * should use get_user_pages_remote because it cannot pass
+ * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
+ */
+long get_user_pages_remote(struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas, int *locked)
+{
+ if (!is_valid_gup_flags(gup_flags))
+ return -EINVAL;
+
+ return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
+ pages, vmas, locked);
+}
+EXPORT_SYMBOL(get_user_pages_remote);
+
+#else /* CONFIG_MMU */
+long get_user_pages_remote(struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas, int *locked)
+{
+ return 0;
+}
+
+static long __get_user_pages_remote(struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas, int *locked)
+{
+ return 0;
+}
+#endif /* !CONFIG_MMU */
+
+/**
+ * get_user_pages() - pin user pages in memory
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying lookup behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long. Or NULL, if caller
+ * only intends to ensure the pages are faulted in.
+ * @vmas: array of pointers to vmas corresponding to each page.
+ * Or NULL if the caller does not require them.
+ *
+ * This is the same as get_user_pages_remote(), just with a less-flexible
+ * calling convention where we assume that the mm being operated on belongs to
+ * the current task, and doesn't allow passing of a locked parameter. We also
+ * obviously don't pass FOLL_REMOTE in here.
+ */
+long get_user_pages(unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas)
+{
+ if (!is_valid_gup_flags(gup_flags))
+ return -EINVAL;
+
+ return __gup_longterm_locked(current->mm, start, nr_pages,
+ pages, vmas, gup_flags | FOLL_TOUCH);
+}
+EXPORT_SYMBOL(get_user_pages);
+
+/**
+ * get_user_pages_locked() is suitable to replace the form:
+ *
+ * mmap_read_lock(mm);
+ * do_something()
+ * get_user_pages(mm, ..., pages, NULL);
+ * mmap_read_unlock(mm);
+ *
+ * to:
+ *
+ * int locked = 1;
+ * mmap_read_lock(mm);
+ * do_something()
+ * get_user_pages_locked(mm, ..., pages, &locked);
+ * if (locked)
+ * mmap_read_unlock(mm);
+ *
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying lookup behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long. Or NULL, if caller
+ * only intends to ensure the pages are faulted in.
+ * @locked: pointer to lock flag indicating whether lock is held and
+ * subsequently whether VM_FAULT_RETRY functionality can be
+ * utilised. Lock must initially be held.
+ *
+ * We can leverage the VM_FAULT_RETRY functionality in the page fault
+ * paths better by using either get_user_pages_locked() or
+ * get_user_pages_unlocked().
+ *
+ */
+long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ int *locked)
+{
+ /*
+ * FIXME: Current FOLL_LONGTERM behavior is incompatible with
+ * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
+ * vmas. As there are no users of this flag in this call we simply
+ * disallow this option for now.
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
+ return -EINVAL;
+ /*
+ * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
+ * never directly by the caller, so enforce that:
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
+ return -EINVAL;
+
+ return __get_user_pages_locked(current->mm, start, nr_pages,
+ pages, NULL, locked,
+ gup_flags | FOLL_TOUCH);
+}
+EXPORT_SYMBOL(get_user_pages_locked);
+
+/*
+ * get_user_pages_unlocked() is suitable to replace the form:
+ *
+ * mmap_read_lock(mm);
+ * get_user_pages(mm, ..., pages, NULL);
+ * mmap_read_unlock(mm);
+ *
+ * with:
+ *
+ * get_user_pages_unlocked(mm, ..., pages);
+ *
+ * It is functionally equivalent to get_user_pages_fast so
+ * get_user_pages_fast should be used instead if specific gup_flags
+ * (e.g. FOLL_FORCE) are not required.
+ */
+long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
+ struct page **pages, unsigned int gup_flags)
+{
+ struct mm_struct *mm = current->mm;
+ int locked = 1;
+ long ret;
+
+ /*
+ * FIXME: Current FOLL_LONGTERM behavior is incompatible with
+ * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
+ * vmas. As there are no users of this flag in this call we simply
+ * disallow this option for now.
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
+ return -EINVAL;
+
+ mmap_read_lock(mm);
+ ret = __get_user_pages_locked(mm, start, nr_pages, pages, NULL,
+ &locked, gup_flags | FOLL_TOUCH);
+ if (locked)
+ mmap_read_unlock(mm);
+ return ret;
+}
+EXPORT_SYMBOL(get_user_pages_unlocked);
+
+/*
+ * Fast GUP
+ *
+ * get_user_pages_fast attempts to pin user pages by walking the page
+ * tables directly and avoids taking locks. Thus the walker needs to be
+ * protected from page table pages being freed from under it, and should
+ * block any THP splits.
+ *
+ * One way to achieve this is to have the walker disable interrupts, and
+ * rely on IPIs from the TLB flushing code blocking before the page table
+ * pages are freed. This is unsuitable for architectures that do not need
+ * to broadcast an IPI when invalidating TLBs.
+ *
+ * Another way to achieve this is to batch up page table containing pages
+ * belonging to more than one mm_user, then rcu_sched a callback to free those
+ * pages. Disabling interrupts will allow the fast_gup walker to both block
+ * the rcu_sched callback, and an IPI that we broadcast for splitting THPs
+ * (which is a relatively rare event). The code below adopts this strategy.
+ *
+ * Before activating this code, please be aware that the following assumptions
+ * are currently made:
+ *
+ * *) Either MMU_GATHER_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
+ * free pages containing page tables or TLB flushing requires IPI broadcast.
+ *
+ * *) ptes can be read atomically by the architecture.
+ *
+ * *) access_ok is sufficient to validate userspace address ranges.
+ *
+ * The last two assumptions can be relaxed by the addition of helper functions.
+ *
+ * This code is based heavily on the PowerPC implementation by Nick Piggin.
+ */
+#ifdef CONFIG_HAVE_FAST_GUP
+#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH
+
+/*
+ * WARNING: only to be used in the get_user_pages_fast() implementation.
+ *
+ * With get_user_pages_fast(), we walk down the pagetables without taking any
+ * locks. For this we would like to load the pointers atomically, but sometimes
+ * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE). What
+ * we do have is the guarantee that a PTE will only either go from not present
+ * to present, or present to not present or both -- it will not switch to a
+ * completely different present page without a TLB flush in between; something
+ * that we are blocking by holding interrupts off.
+ *
+ * Setting ptes from not present to present goes:
+ *
+ * ptep->pte_high = h;
+ * smp_wmb();
+ * ptep->pte_low = l;
+ *
+ * And present to not present goes:
+ *
+ * ptep->pte_low = 0;
+ * smp_wmb();
+ * ptep->pte_high = 0;
+ *
+ * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'.
+ * We load pte_high *after* loading pte_low, which ensures we don't see an older
+ * value of pte_high. *Then* we recheck pte_low, which ensures that we haven't
+ * picked up a changed pte high. We might have gotten rubbish values from
+ * pte_low and pte_high, but we are guaranteed that pte_low will not have the
+ * present bit set *unless* it is 'l'. Because get_user_pages_fast() only
+ * operates on present ptes we're safe.
+ */
+static inline pte_t gup_get_pte(pte_t *ptep)
+{
+ pte_t pte;
+
+ do {
+ pte.pte_low = ptep->pte_low;
+ smp_rmb();
+ pte.pte_high = ptep->pte_high;
+ smp_rmb();
+ } while (unlikely(pte.pte_low != ptep->pte_low));
+
+ return pte;
+}
+#else /* CONFIG_GUP_GET_PTE_LOW_HIGH */
+/*
+ * We require that the PTE can be read atomically.
+ */
+static inline pte_t gup_get_pte(pte_t *ptep)
+{
+ return ptep_get(ptep);
+}
+#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */
+
+static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
+ unsigned int flags,
+ struct page **pages)
+{
+ while ((*nr) - nr_start) {
+ struct page *page = pages[--(*nr)];
+
+ ClearPageReferenced(page);
+ if (flags & FOLL_PIN)
+ unpin_user_page(page);
+ else
+ put_page(page);
+ }
+}
+
+#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
+/*
+ * Fast-gup relies on pte change detection to avoid concurrent pgtable
+ * operations.
+ *
+ * To pin the page, fast-gup needs to do below in order:
+ * (1) pin the page (by prefetching pte), then (2) check pte not changed.
+ *
+ * For the rest of pgtable operations where pgtable updates can be racy
+ * with fast-gup, we need to do (1) clear pte, then (2) check whether page
+ * is pinned.
+ *
+ * Above will work for all pte-level operations, including THP split.
+ *
+ * For THP collapse, it's a bit more complicated because fast-gup may be
+ * walking a pgtable page that is being freed (pte is still valid but pmd
+ * can be cleared already). To avoid race in such condition, we need to
+ * also check pmd here to make sure pmd doesn't change (corresponds to
+ * pmdp_collapse_flush() in the THP collapse code path).
+ */
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
+{
+ struct dev_pagemap *pgmap = NULL;
+ int nr_start = *nr, ret = 0;
+ pte_t *ptep, *ptem;
+
+ ptem = ptep = pte_offset_map(&pmd, addr);
+ do {
+ pte_t pte = gup_get_pte(ptep);
+ struct page *head, *page;
+
+ /*
+ * Similar to the PMD case below, NUMA hinting must take slow
+ * path using the pte_protnone check.
+ */
+ if (pte_protnone(pte))
+ goto pte_unmap;
+
+ if (!pte_access_permitted(pte, flags & FOLL_WRITE))
+ goto pte_unmap;
+
+ if (pte_devmap(pte)) {
+ if (unlikely(flags & FOLL_LONGTERM))
+ goto pte_unmap;
+
+ pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
+ if (unlikely(!pgmap)) {
+ undo_dev_pagemap(nr, nr_start, flags, pages);
+ goto pte_unmap;
+ }
+ } else if (pte_special(pte))
+ goto pte_unmap;
+
+ VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+ page = pte_page(pte);
+
+ head = try_grab_compound_head(page, 1, flags);
+ if (!head)
+ goto pte_unmap;
+
+ if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
+ unlikely(pte_val(pte) != pte_val(*ptep))) {
+ put_compound_head(head, 1, flags);
+ goto pte_unmap;
+ }
+
+ VM_BUG_ON_PAGE(compound_head(page) != head, page);
+
+ /*
+ * We need to make the page accessible if and only if we are
+ * going to access its content (the FOLL_PIN case). Please
+ * see Documentation/core-api/pin_user_pages.rst for
+ * details.
+ */
+ if (flags & FOLL_PIN) {
+ ret = arch_make_page_accessible(page);
+ if (ret) {
+ unpin_user_page(page);
+ goto pte_unmap;
+ }
+ }
+ SetPageReferenced(page);
+ pages[*nr] = page;
+ (*nr)++;
+
+ } while (ptep++, addr += PAGE_SIZE, addr != end);
+
+ ret = 1;
+
+pte_unmap:
+ if (pgmap)
+ put_dev_pagemap(pgmap);
+ pte_unmap(ptem);
+ return ret;
+}
+#else
+
+/*
+ * If we can't determine whether or not a pte is special, then fail immediately
+ * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not
+ * to be special.
+ *
+ * For a futex to be placed on a THP tail page, get_futex_key requires a
+ * get_user_pages_fast_only implementation that can pin pages. Thus it's still
+ * useful to have gup_huge_pmd even if we can't operate on ptes.
+ */
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
+{
+ return 0;
+}
+#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
+
+#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
+static int __gup_device_huge(unsigned long pfn, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
+{
+ int nr_start = *nr;
+ struct dev_pagemap *pgmap = NULL;
+
+ do {
+ struct page *page = pfn_to_page(pfn);
+
+ pgmap = get_dev_pagemap(pfn, pgmap);
+ if (unlikely(!pgmap)) {
+ undo_dev_pagemap(nr, nr_start, flags, pages);
+ return 0;
+ }
+ SetPageReferenced(page);
+ pages[*nr] = page;
+ if (unlikely(!try_grab_page(page, flags))) {
+ undo_dev_pagemap(nr, nr_start, flags, pages);
+ return 0;
+ }
+ (*nr)++;
+ pfn++;
+ } while (addr += PAGE_SIZE, addr != end);
+
+ if (pgmap)
+ put_dev_pagemap(pgmap);
+ return 1;
+}
+
+static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
+{
+ unsigned long fault_pfn;
+ int nr_start = *nr;
+
+ fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
+ return 0;
+
+ if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
+ undo_dev_pagemap(nr, nr_start, flags, pages);
+ return 0;
+ }
+ return 1;
+}
+
+static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
+{
+ unsigned long fault_pfn;
+ int nr_start = *nr;
+
+ fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+ if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
+ return 0;
+
+ if (unlikely(pud_val(orig) != pud_val(*pudp))) {
+ undo_dev_pagemap(nr, nr_start, flags, pages);
+ return 0;
+ }
+ return 1;
+}
+#else
+static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
+{
+ BUILD_BUG();
+ return 0;
+}
+
+static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
+{
+ BUILD_BUG();
+ return 0;
+}
+#endif
+
+static int record_subpages(struct page *page, unsigned long addr,
+ unsigned long end, struct page **pages)
+{
+ int nr;
+
+ for (nr = 0; addr != end; addr += PAGE_SIZE)
+ pages[nr++] = page++;
+
+ return nr;
+}
+
+#ifdef CONFIG_ARCH_HAS_HUGEPD
+static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
+ unsigned long sz)
+{
+ unsigned long __boundary = (addr + sz) & ~(sz-1);
+ return (__boundary - 1 < end - 1) ? __boundary : end;
+}
+
+static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
+{
+ unsigned long pte_end;
+ struct page *head, *page;
+ pte_t pte;
+ int refs;
+
+ pte_end = (addr + sz) & ~(sz-1);
+ if (pte_end < end)
+ end = pte_end;
+
+ pte = huge_ptep_get(ptep);
+
+ if (!pte_access_permitted(pte, flags & FOLL_WRITE))
+ return 0;
+
+ /* hugepages are never "special" */
+ VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+ head = pte_page(pte);
+ page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
+ refs = record_subpages(page, addr, end, pages + *nr);
+
+ head = try_grab_compound_head(head, refs, flags);
+ if (!head)
+ return 0;
+
+ if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+ put_compound_head(head, refs, flags);
+ return 0;
+ }
+
+ *nr += refs;
+ SetPageReferenced(head);
+ return 1;
+}
+
+static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
+ unsigned int pdshift, unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
+{
+ pte_t *ptep;
+ unsigned long sz = 1UL << hugepd_shift(hugepd);
+ unsigned long next;
+
+ ptep = hugepte_offset(hugepd, addr, pdshift);
+ do {
+ next = hugepte_addr_end(addr, end, sz);
+ if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr))
+ return 0;
+ } while (ptep++, addr = next, addr != end);
+
+ return 1;
+}
+#else
+static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
+ unsigned int pdshift, unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
+{
+ return 0;
+}
+#endif /* CONFIG_ARCH_HAS_HUGEPD */
+
+static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
+{
+ struct page *head, *page;
+ int refs;
+
+ if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
+ return 0;
+
+ if (pmd_devmap(orig)) {
+ if (unlikely(flags & FOLL_LONGTERM))
+ return 0;
+ return __gup_device_huge_pmd(orig, pmdp, addr, end, flags,
+ pages, nr);
+ }
+
+ page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ refs = record_subpages(page, addr, end, pages + *nr);
+
+ head = try_grab_compound_head(pmd_page(orig), refs, flags);
+ if (!head)
+ return 0;
+
+ if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
+ put_compound_head(head, refs, flags);
+ return 0;
+ }
+
+ *nr += refs;
+ SetPageReferenced(head);
+ return 1;
+}
+
+static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
+{
+ struct page *head, *page;
+ int refs;
+
+ if (!pud_access_permitted(orig, flags & FOLL_WRITE))
+ return 0;
+
+ if (pud_devmap(orig)) {
+ if (unlikely(flags & FOLL_LONGTERM))
+ return 0;
+ return __gup_device_huge_pud(orig, pudp, addr, end, flags,
+ pages, nr);
+ }
+
+ page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+ refs = record_subpages(page, addr, end, pages + *nr);
+
+ head = try_grab_compound_head(pud_page(orig), refs, flags);
+ if (!head)
+ return 0;
+
+ if (unlikely(pud_val(orig) != pud_val(*pudp))) {
+ put_compound_head(head, refs, flags);
+ return 0;
+ }
+
+ *nr += refs;
+ SetPageReferenced(head);
+ return 1;
+}
+
+static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
+{
+ int refs;
+ struct page *head, *page;
+
+ if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
+ return 0;
+
+ BUILD_BUG_ON(pgd_devmap(orig));
+
+ page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
+ refs = record_subpages(page, addr, end, pages + *nr);
+
+ head = try_grab_compound_head(pgd_page(orig), refs, flags);
+ if (!head)
+ return 0;
+
+ if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
+ put_compound_head(head, refs, flags);
+ return 0;
+ }
+
+ *nr += refs;
+ SetPageReferenced(head);
+ return 1;
+}
+
+static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end,
+ unsigned int flags, struct page **pages, int *nr)
+{
+ unsigned long next;
+ pmd_t *pmdp;
+
+ pmdp = pmd_offset_lockless(pudp, pud, addr);
+ do {
+ pmd_t pmd = READ_ONCE(*pmdp);
+
+ next = pmd_addr_end(addr, end);
+ if (!pmd_present(pmd))
+ return 0;
+
+ if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
+ pmd_devmap(pmd))) {
+ /*
+ * NUMA hinting faults need to be handled in the GUP
+ * slowpath for accounting purposes and so that they
+ * can be serialised against THP migration.
+ */
+ if (pmd_protnone(pmd))
+ return 0;
+
+ if (!gup_huge_pmd(pmd, pmdp, addr, next, flags,
+ pages, nr))
+ return 0;
+
+ } else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) {
+ /*
+ * architecture have different format for hugetlbfs
+ * pmd format and THP pmd format
+ */
+ if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
+ PMD_SHIFT, next, flags, pages, nr))
+ return 0;
+ } else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr))
+ return 0;
+ } while (pmdp++, addr = next, addr != end);
+
+ return 1;
+}
+
+static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end,
+ unsigned int flags, struct page **pages, int *nr)
+{
+ unsigned long next;
+ pud_t *pudp;
+
+ pudp = pud_offset_lockless(p4dp, p4d, addr);
+ do {
+ pud_t pud = READ_ONCE(*pudp);
+
+ next = pud_addr_end(addr, end);
+ if (unlikely(!pud_present(pud)))
+ return 0;
+ if (unlikely(pud_huge(pud) || pud_devmap(pud))) {
+ if (!gup_huge_pud(pud, pudp, addr, next, flags,
+ pages, nr))
+ return 0;
+ } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
+ if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
+ PUD_SHIFT, next, flags, pages, nr))
+ return 0;
+ } else if (!gup_pmd_range(pudp, pud, addr, next, flags, pages, nr))
+ return 0;
+ } while (pudp++, addr = next, addr != end);
+
+ return 1;
+}
+
+static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end,
+ unsigned int flags, struct page **pages, int *nr)
+{
+ unsigned long next;
+ p4d_t *p4dp;
+
+ p4dp = p4d_offset_lockless(pgdp, pgd, addr);
+ do {
+ p4d_t p4d = READ_ONCE(*p4dp);
+
+ next = p4d_addr_end(addr, end);
+ if (p4d_none(p4d))
+ return 0;
+ BUILD_BUG_ON(p4d_huge(p4d));
+ if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
+ if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
+ P4D_SHIFT, next, flags, pages, nr))
+ return 0;
+ } else if (!gup_pud_range(p4dp, p4d, addr, next, flags, pages, nr))
+ return 0;
+ } while (p4dp++, addr = next, addr != end);
+
+ return 1;
+}
+
+static void gup_pgd_range(unsigned long addr, unsigned long end,
+ unsigned int flags, struct page **pages, int *nr)
+{
+ unsigned long next;
+ pgd_t *pgdp;
+
+ pgdp = pgd_offset(current->mm, addr);
+ do {
+ pgd_t pgd = READ_ONCE(*pgdp);
+
+ next = pgd_addr_end(addr, end);
+ if (pgd_none(pgd))
+ return;
+ if (unlikely(pgd_huge(pgd))) {
+ if (!gup_huge_pgd(pgd, pgdp, addr, next, flags,
+ pages, nr))
+ return;
+ } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
+ if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
+ PGDIR_SHIFT, next, flags, pages, nr))
+ return;
+ } else if (!gup_p4d_range(pgdp, pgd, addr, next, flags, pages, nr))
+ return;
+ } while (pgdp++, addr = next, addr != end);
+}
+#else
+static inline void gup_pgd_range(unsigned long addr, unsigned long end,
+ unsigned int flags, struct page **pages, int *nr)
+{
+}
+#endif /* CONFIG_HAVE_FAST_GUP */
+
+#ifndef gup_fast_permitted
+/*
+ * Check if it's allowed to use get_user_pages_fast_only() for the range, or
+ * we need to fall back to the slow version:
+ */
+static bool gup_fast_permitted(unsigned long start, unsigned long end)
+{
+ return true;
+}
+#endif
+
+static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages)
+{
+ int ret;
+
+ /*
+ * FIXME: FOLL_LONGTERM does not work with
+ * get_user_pages_unlocked() (see comments in that function)
+ */
+ if (gup_flags & FOLL_LONGTERM) {
+ mmap_read_lock(current->mm);
+ ret = __gup_longterm_locked(current->mm,
+ start, nr_pages,
+ pages, NULL, gup_flags);
+ mmap_read_unlock(current->mm);
+ } else {
+ ret = get_user_pages_unlocked(start, nr_pages,
+ pages, gup_flags);
+ }
+
+ return ret;
+}
+
+static unsigned long lockless_pages_from_mm(unsigned long start,
+ unsigned long end,
+ unsigned int gup_flags,
+ struct page **pages)
+{
+ unsigned long flags;
+ int nr_pinned = 0;
+ unsigned seq;
+
+ if (!IS_ENABLED(CONFIG_HAVE_FAST_GUP) ||
+ !gup_fast_permitted(start, end))
+ return 0;
+
+ if (gup_flags & FOLL_PIN) {
+ seq = raw_read_seqcount(&current->mm->write_protect_seq);
+ if (seq & 1)
+ return 0;
+ }
+
+ /*
+ * Disable interrupts. The nested form is used, in order to allow full,
+ * general purpose use of this routine.
+ *
+ * With interrupts disabled, we block page table pages from being freed
+ * from under us. See struct mmu_table_batch comments in
+ * include/asm-generic/tlb.h for more details.
+ *
+ * We do not adopt an rcu_read_lock() here as we also want to block IPIs
+ * that come from THPs splitting.
+ */
+ local_irq_save(flags);
+ gup_pgd_range(start, end, gup_flags, pages, &nr_pinned);
+ local_irq_restore(flags);
+
+ /*
+ * When pinning pages for DMA there could be a concurrent write protect
+ * from fork() via copy_page_range(), in this case always fail fast GUP.
+ */
+ if (gup_flags & FOLL_PIN) {
+ if (read_seqcount_retry(&current->mm->write_protect_seq, seq)) {
+ unpin_user_pages(pages, nr_pinned);
+ return 0;
+ }
+ }
+ return nr_pinned;
+}
+
+static int internal_get_user_pages_fast(unsigned long start,
+ unsigned long nr_pages,
+ unsigned int gup_flags,
+ struct page **pages)
+{
+ unsigned long len, end;
+ unsigned long nr_pinned;
+ int ret;
+
+ if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
+ FOLL_FORCE | FOLL_PIN | FOLL_GET |
+ FOLL_FAST_ONLY)))
+ return -EINVAL;
+
+ if (gup_flags & FOLL_PIN)
+ atomic_set(&current->mm->has_pinned, 1);
+
+ if (!(gup_flags & FOLL_FAST_ONLY))
+ might_lock_read(&current->mm->mmap_lock);
+
+ start = untagged_addr(start) & PAGE_MASK;
+ len = nr_pages << PAGE_SHIFT;
+ if (check_add_overflow(start, len, &end))
+ return 0;
+ if (unlikely(!access_ok((void __user *)start, len)))
+ return -EFAULT;
+
+ nr_pinned = lockless_pages_from_mm(start, end, gup_flags, pages);
+ if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY)
+ return nr_pinned;
+
+ /* Slow path: try to get the remaining pages with get_user_pages */
+ start += nr_pinned << PAGE_SHIFT;
+ pages += nr_pinned;
+ ret = __gup_longterm_unlocked(start, nr_pages - nr_pinned, gup_flags,
+ pages);
+ if (ret < 0) {
+ /*
+ * The caller has to unpin the pages we already pinned so
+ * returning -errno is not an option
+ */
+ if (nr_pinned)
+ return nr_pinned;
+ return ret;
+ }
+ return ret + nr_pinned;
+}
+
+/**
+ * get_user_pages_fast_only() - pin user pages in memory
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying pin behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long.
+ *
+ * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
+ * the regular GUP.
+ * Note a difference with get_user_pages_fast: this always returns the
+ * number of pages pinned, 0 if no pages were pinned.
+ *
+ * If the architecture does not support this function, simply return with no
+ * pages pinned.
+ *
+ * Careful, careful! COW breaking can go either way, so a non-write
+ * access can get ambiguous page results. If you call this function without
+ * 'write' set, you'd better be sure that you're ok with that ambiguity.
+ */
+int get_user_pages_fast_only(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages)
+{
+ int nr_pinned;
+ /*
+ * Internally (within mm/gup.c), gup fast variants must set FOLL_GET,
+ * because gup fast is always a "pin with a +1 page refcount" request.
+ *
+ * FOLL_FAST_ONLY is required in order to match the API description of
+ * this routine: no fall back to regular ("slow") GUP.
+ */
+ gup_flags |= FOLL_GET | FOLL_FAST_ONLY;
+
+ nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags,
+ pages);
+
+ /*
+ * As specified in the API description above, this routine is not
+ * allowed to return negative values. However, the common core
+ * routine internal_get_user_pages_fast() *can* return -errno.
+ * Therefore, correct for that here:
+ */
+ if (nr_pinned < 0)
+ nr_pinned = 0;
+
+ return nr_pinned;
+}
+EXPORT_SYMBOL_GPL(get_user_pages_fast_only);
+
+/**
+ * get_user_pages_fast() - pin user pages in memory
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying pin behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long.
+ *
+ * Attempt to pin user pages in memory without taking mm->mmap_lock.
+ * If not successful, it will fall back to taking the lock and
+ * calling get_user_pages().
+ *
+ * Returns number of pages pinned. This may be fewer than the number requested.
+ * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns
+ * -errno.
+ */
+int get_user_pages_fast(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages)
+{
+ if (!is_valid_gup_flags(gup_flags))
+ return -EINVAL;
+
+ /*
+ * The caller may or may not have explicitly set FOLL_GET; either way is
+ * OK. However, internally (within mm/gup.c), gup fast variants must set
+ * FOLL_GET, because gup fast is always a "pin with a +1 page refcount"
+ * request.
+ */
+ gup_flags |= FOLL_GET;
+ return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
+}
+EXPORT_SYMBOL_GPL(get_user_pages_fast);
+
+/**
+ * pin_user_pages_fast() - pin user pages in memory without taking locks
+ *
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying pin behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long.
+ *
+ * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See
+ * get_user_pages_fast() for documentation on the function arguments, because
+ * the arguments here are identical.
+ *
+ * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
+ * see Documentation/core-api/pin_user_pages.rst for further details.
+ */
+int pin_user_pages_fast(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages)
+{
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ if (WARN_ON_ONCE(gup_flags & FOLL_GET))
+ return -EINVAL;
+
+ gup_flags |= FOLL_PIN;
+ return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
+}
+EXPORT_SYMBOL_GPL(pin_user_pages_fast);
+
+/*
+ * This is the FOLL_PIN equivalent of get_user_pages_fast_only(). Behavior
+ * is the same, except that this one sets FOLL_PIN instead of FOLL_GET.
+ *
+ * The API rules are the same, too: no negative values may be returned.
+ */
+int pin_user_pages_fast_only(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages)
+{
+ int nr_pinned;
+
+ /*
+ * FOLL_GET and FOLL_PIN are mutually exclusive. Note that the API
+ * rules require returning 0, rather than -errno:
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_GET))
+ return 0;
+ /*
+ * FOLL_FAST_ONLY is required in order to match the API description of
+ * this routine: no fall back to regular ("slow") GUP.
+ */
+ gup_flags |= (FOLL_PIN | FOLL_FAST_ONLY);
+ nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags,
+ pages);
+ /*
+ * This routine is not allowed to return negative values. However,
+ * internal_get_user_pages_fast() *can* return -errno. Therefore,
+ * correct for that here:
+ */
+ if (nr_pinned < 0)
+ nr_pinned = 0;
+
+ return nr_pinned;
+}
+EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);
+
+/**
+ * pin_user_pages_remote() - pin pages of a remote process
+ *
+ * @mm: mm_struct of target mm
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying lookup behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long. Or NULL, if caller
+ * only intends to ensure the pages are faulted in.
+ * @vmas: array of pointers to vmas corresponding to each page.
+ * Or NULL if the caller does not require them.
+ * @locked: pointer to lock flag indicating whether lock is held and
+ * subsequently whether VM_FAULT_RETRY functionality can be
+ * utilised. Lock must initially be held.
+ *
+ * Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See
+ * get_user_pages_remote() for documentation on the function arguments, because
+ * the arguments here are identical.
+ *
+ * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
+ * see Documentation/core-api/pin_user_pages.rst for details.
+ */
+long pin_user_pages_remote(struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas, int *locked)
+{
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ if (WARN_ON_ONCE(gup_flags & FOLL_GET))
+ return -EINVAL;
+
+ gup_flags |= FOLL_PIN;
+ return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
+ pages, vmas, locked);
+}
+EXPORT_SYMBOL(pin_user_pages_remote);
+
+/**
+ * pin_user_pages() - pin user pages in memory for use by other devices
+ *
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying lookup behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long. Or NULL, if caller
+ * only intends to ensure the pages are faulted in.
+ * @vmas: array of pointers to vmas corresponding to each page.
+ * Or NULL if the caller does not require them.
+ *
+ * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and
+ * FOLL_PIN is set.
+ *
+ * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
+ * see Documentation/core-api/pin_user_pages.rst for details.
+ */
+long pin_user_pages(unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas)
+{
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ if (WARN_ON_ONCE(gup_flags & FOLL_GET))
+ return -EINVAL;
+
+ gup_flags |= FOLL_PIN;
+ return __gup_longterm_locked(current->mm, start, nr_pages,
+ pages, vmas, gup_flags);
+}
+EXPORT_SYMBOL(pin_user_pages);
+
+/*
+ * pin_user_pages_unlocked() is the FOLL_PIN variant of
+ * get_user_pages_unlocked(). Behavior is the same, except that this one sets
+ * FOLL_PIN and rejects FOLL_GET.
+ */
+long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
+ struct page **pages, unsigned int gup_flags)
+{
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ if (WARN_ON_ONCE(gup_flags & FOLL_GET))
+ return -EINVAL;
+
+ gup_flags |= FOLL_PIN;
+ return get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
+}
+EXPORT_SYMBOL(pin_user_pages_unlocked);
+
+/*
+ * pin_user_pages_locked() is the FOLL_PIN variant of get_user_pages_locked().
+ * Behavior is the same, except that this one sets FOLL_PIN and rejects
+ * FOLL_GET.
+ */
+long pin_user_pages_locked(unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ int *locked)
+{
+ /*
+ * FIXME: Current FOLL_LONGTERM behavior is incompatible with
+ * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
+ * vmas. As there are no users of this flag in this call we simply
+ * disallow this option for now.
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
+ return -EINVAL;
+
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ if (WARN_ON_ONCE(gup_flags & FOLL_GET))
+ return -EINVAL;
+
+ gup_flags |= FOLL_PIN;
+ return __get_user_pages_locked(current->mm, start, nr_pages,
+ pages, NULL, locked,
+ gup_flags | FOLL_TOUCH);
+}
+EXPORT_SYMBOL(pin_user_pages_locked);
diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c
new file mode 100644
index 000000000..8b3e5b5cd
--- /dev/null
+++ b/mm/gup_benchmark.c
@@ -0,0 +1,210 @@
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/ktime.h>
+#include <linux/debugfs.h>
+
+#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark)
+#define GUP_BENCHMARK _IOWR('g', 2, struct gup_benchmark)
+#define PIN_FAST_BENCHMARK _IOWR('g', 3, struct gup_benchmark)
+#define PIN_BENCHMARK _IOWR('g', 4, struct gup_benchmark)
+#define PIN_LONGTERM_BENCHMARK _IOWR('g', 5, struct gup_benchmark)
+
+struct gup_benchmark {
+ __u64 get_delta_usec;
+ __u64 put_delta_usec;
+ __u64 addr;
+ __u64 size;
+ __u32 nr_pages_per_call;
+ __u32 flags;
+ __u64 expansion[10]; /* For future use */
+};
+
+static void put_back_pages(unsigned int cmd, struct page **pages,
+ unsigned long nr_pages)
+{
+ unsigned long i;
+
+ switch (cmd) {
+ case GUP_FAST_BENCHMARK:
+ case GUP_BENCHMARK:
+ for (i = 0; i < nr_pages; i++)
+ put_page(pages[i]);
+ break;
+
+ case PIN_FAST_BENCHMARK:
+ case PIN_BENCHMARK:
+ case PIN_LONGTERM_BENCHMARK:
+ unpin_user_pages(pages, nr_pages);
+ break;
+ }
+}
+
+static void verify_dma_pinned(unsigned int cmd, struct page **pages,
+ unsigned long nr_pages)
+{
+ unsigned long i;
+ struct page *page;
+
+ switch (cmd) {
+ case PIN_FAST_BENCHMARK:
+ case PIN_BENCHMARK:
+ case PIN_LONGTERM_BENCHMARK:
+ for (i = 0; i < nr_pages; i++) {
+ page = pages[i];
+ if (WARN(!page_maybe_dma_pinned(page),
+ "pages[%lu] is NOT dma-pinned\n", i)) {
+
+ dump_page(page, "gup_benchmark failure");
+ break;
+ }
+ }
+ break;
+ }
+}
+
+static int __gup_benchmark_ioctl(unsigned int cmd,
+ struct gup_benchmark *gup)
+{
+ ktime_t start_time, end_time;
+ unsigned long i, nr_pages, addr, next;
+ int nr;
+ struct page **pages;
+ int ret = 0;
+ bool needs_mmap_lock =
+ cmd != GUP_FAST_BENCHMARK && cmd != PIN_FAST_BENCHMARK;
+
+ if (gup->size > ULONG_MAX)
+ return -EINVAL;
+
+ nr_pages = gup->size / PAGE_SIZE;
+ pages = kvcalloc(nr_pages, sizeof(void *), GFP_KERNEL);
+ if (!pages)
+ return -ENOMEM;
+
+ if (needs_mmap_lock && mmap_read_lock_killable(current->mm)) {
+ ret = -EINTR;
+ goto free_pages;
+ }
+
+ i = 0;
+ nr = gup->nr_pages_per_call;
+ start_time = ktime_get();
+ for (addr = gup->addr; addr < gup->addr + gup->size; addr = next) {
+ if (nr != gup->nr_pages_per_call)
+ break;
+
+ next = addr + nr * PAGE_SIZE;
+ if (next > gup->addr + gup->size) {
+ next = gup->addr + gup->size;
+ nr = (next - addr) / PAGE_SIZE;
+ }
+
+ /* Filter out most gup flags: only allow a tiny subset here: */
+ gup->flags &= FOLL_WRITE;
+
+ switch (cmd) {
+ case GUP_FAST_BENCHMARK:
+ nr = get_user_pages_fast(addr, nr, gup->flags,
+ pages + i);
+ break;
+ case GUP_BENCHMARK:
+ nr = get_user_pages(addr, nr, gup->flags, pages + i,
+ NULL);
+ break;
+ case PIN_FAST_BENCHMARK:
+ nr = pin_user_pages_fast(addr, nr, gup->flags,
+ pages + i);
+ break;
+ case PIN_BENCHMARK:
+ nr = pin_user_pages(addr, nr, gup->flags, pages + i,
+ NULL);
+ break;
+ case PIN_LONGTERM_BENCHMARK:
+ nr = pin_user_pages(addr, nr,
+ gup->flags | FOLL_LONGTERM,
+ pages + i, NULL);
+ break;
+ default:
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ if (nr <= 0)
+ break;
+ i += nr;
+ }
+ end_time = ktime_get();
+
+ /* Shifting the meaning of nr_pages: now it is actual number pinned: */
+ nr_pages = i;
+
+ gup->get_delta_usec = ktime_us_delta(end_time, start_time);
+ gup->size = addr - gup->addr;
+
+ /*
+ * Take an un-benchmark-timed moment to verify DMA pinned
+ * state: print a warning if any non-dma-pinned pages are found:
+ */
+ verify_dma_pinned(cmd, pages, nr_pages);
+
+ start_time = ktime_get();
+
+ put_back_pages(cmd, pages, nr_pages);
+
+ end_time = ktime_get();
+ gup->put_delta_usec = ktime_us_delta(end_time, start_time);
+
+unlock:
+ if (needs_mmap_lock)
+ mmap_read_unlock(current->mm);
+free_pages:
+ kvfree(pages);
+ return ret;
+}
+
+static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd,
+ unsigned long arg)
+{
+ struct gup_benchmark gup;
+ int ret;
+
+ switch (cmd) {
+ case GUP_FAST_BENCHMARK:
+ case GUP_BENCHMARK:
+ case PIN_FAST_BENCHMARK:
+ case PIN_BENCHMARK:
+ case PIN_LONGTERM_BENCHMARK:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (copy_from_user(&gup, (void __user *)arg, sizeof(gup)))
+ return -EFAULT;
+
+ ret = __gup_benchmark_ioctl(cmd, &gup);
+ if (ret)
+ return ret;
+
+ if (copy_to_user((void __user *)arg, &gup, sizeof(gup)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static const struct file_operations gup_benchmark_fops = {
+ .open = nonseekable_open,
+ .unlocked_ioctl = gup_benchmark_ioctl,
+};
+
+static int gup_benchmark_init(void)
+{
+ debugfs_create_file_unsafe("gup_benchmark", 0600, NULL, NULL,
+ &gup_benchmark_fops);
+
+ return 0;
+}
+
+late_initcall(gup_benchmark_init);
diff --git a/mm/highmem.c b/mm/highmem.c
new file mode 100644
index 000000000..1352a2795
--- /dev/null
+++ b/mm/highmem.c
@@ -0,0 +1,484 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * High memory handling common code and variables.
+ *
+ * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de
+ * Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de
+ *
+ *
+ * Redesigned the x86 32-bit VM architecture to deal with
+ * 64-bit physical space. With current x86 CPUs this
+ * means up to 64 Gigabytes physical RAM.
+ *
+ * Rewrote high memory support to move the page cache into
+ * high memory. Implemented permanent (schedulable) kmaps
+ * based on Linus' idea.
+ *
+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/export.h>
+#include <linux/swap.h>
+#include <linux/bio.h>
+#include <linux/pagemap.h>
+#include <linux/mempool.h>
+#include <linux/blkdev.h>
+#include <linux/init.h>
+#include <linux/hash.h>
+#include <linux/highmem.h>
+#include <linux/kgdb.h>
+#include <asm/tlbflush.h>
+#include <linux/vmalloc.h>
+
+#if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
+DEFINE_PER_CPU(int, __kmap_atomic_idx);
+#endif
+
+/*
+ * Virtual_count is not a pure "count".
+ * 0 means that it is not mapped, and has not been mapped
+ * since a TLB flush - it is usable.
+ * 1 means that there are no users, but it has been mapped
+ * since the last TLB flush - so we can't use it.
+ * n means that there are (n-1) current users of it.
+ */
+#ifdef CONFIG_HIGHMEM
+
+/*
+ * Architecture with aliasing data cache may define the following family of
+ * helper functions in its asm/highmem.h to control cache color of virtual
+ * addresses where physical memory pages are mapped by kmap.
+ */
+#ifndef get_pkmap_color
+
+/*
+ * Determine color of virtual address where the page should be mapped.
+ */
+static inline unsigned int get_pkmap_color(struct page *page)
+{
+ return 0;
+}
+#define get_pkmap_color get_pkmap_color
+
+/*
+ * Get next index for mapping inside PKMAP region for page with given color.
+ */
+static inline unsigned int get_next_pkmap_nr(unsigned int color)
+{
+ static unsigned int last_pkmap_nr;
+
+ last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK;
+ return last_pkmap_nr;
+}
+
+/*
+ * Determine if page index inside PKMAP region (pkmap_nr) of given color
+ * has wrapped around PKMAP region end. When this happens an attempt to
+ * flush all unused PKMAP slots is made.
+ */
+static inline int no_more_pkmaps(unsigned int pkmap_nr, unsigned int color)
+{
+ return pkmap_nr == 0;
+}
+
+/*
+ * Get the number of PKMAP entries of the given color. If no free slot is
+ * found after checking that many entries, kmap will sleep waiting for
+ * someone to call kunmap and free PKMAP slot.
+ */
+static inline int get_pkmap_entries_count(unsigned int color)
+{
+ return LAST_PKMAP;
+}
+
+/*
+ * Get head of a wait queue for PKMAP entries of the given color.
+ * Wait queues for different mapping colors should be independent to avoid
+ * unnecessary wakeups caused by freeing of slots of other colors.
+ */
+static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color)
+{
+ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
+
+ return &pkmap_map_wait;
+}
+#endif
+
+atomic_long_t _totalhigh_pages __read_mostly;
+EXPORT_SYMBOL(_totalhigh_pages);
+
+EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
+
+unsigned int nr_free_highpages (void)
+{
+ struct zone *zone;
+ unsigned int pages = 0;
+
+ for_each_populated_zone(zone) {
+ if (is_highmem(zone))
+ pages += zone_page_state(zone, NR_FREE_PAGES);
+ }
+
+ return pages;
+}
+
+static int pkmap_count[LAST_PKMAP];
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
+
+pte_t * pkmap_page_table;
+
+/*
+ * Most architectures have no use for kmap_high_get(), so let's abstract
+ * the disabling of IRQ out of the locking in that case to save on a
+ * potential useless overhead.
+ */
+#ifdef ARCH_NEEDS_KMAP_HIGH_GET
+#define lock_kmap() spin_lock_irq(&kmap_lock)
+#define unlock_kmap() spin_unlock_irq(&kmap_lock)
+#define lock_kmap_any(flags) spin_lock_irqsave(&kmap_lock, flags)
+#define unlock_kmap_any(flags) spin_unlock_irqrestore(&kmap_lock, flags)
+#else
+#define lock_kmap() spin_lock(&kmap_lock)
+#define unlock_kmap() spin_unlock(&kmap_lock)
+#define lock_kmap_any(flags) \
+ do { spin_lock(&kmap_lock); (void)(flags); } while (0)
+#define unlock_kmap_any(flags) \
+ do { spin_unlock(&kmap_lock); (void)(flags); } while (0)
+#endif
+
+struct page *kmap_to_page(void *vaddr)
+{
+ unsigned long addr = (unsigned long)vaddr;
+
+ if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) {
+ int i = PKMAP_NR(addr);
+ return pte_page(pkmap_page_table[i]);
+ }
+
+ return virt_to_page(addr);
+}
+EXPORT_SYMBOL(kmap_to_page);
+
+static void flush_all_zero_pkmaps(void)
+{
+ int i;
+ int need_flush = 0;
+
+ flush_cache_kmaps();
+
+ for (i = 0; i < LAST_PKMAP; i++) {
+ struct page *page;
+
+ /*
+ * zero means we don't have anything to do,
+ * >1 means that it is still in use. Only
+ * a count of 1 means that it is free but
+ * needs to be unmapped
+ */
+ if (pkmap_count[i] != 1)
+ continue;
+ pkmap_count[i] = 0;
+
+ /* sanity check */
+ BUG_ON(pte_none(pkmap_page_table[i]));
+
+ /*
+ * Don't need an atomic fetch-and-clear op here;
+ * no-one has the page mapped, and cannot get at
+ * its virtual address (and hence PTE) without first
+ * getting the kmap_lock (which is held here).
+ * So no dangers, even with speculative execution.
+ */
+ page = pte_page(pkmap_page_table[i]);
+ pte_clear(&init_mm, PKMAP_ADDR(i), &pkmap_page_table[i]);
+
+ set_page_address(page, NULL);
+ need_flush = 1;
+ }
+ if (need_flush)
+ flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
+}
+
+/**
+ * kmap_flush_unused - flush all unused kmap mappings in order to remove stray mappings
+ */
+void kmap_flush_unused(void)
+{
+ lock_kmap();
+ flush_all_zero_pkmaps();
+ unlock_kmap();
+}
+
+static inline unsigned long map_new_virtual(struct page *page)
+{
+ unsigned long vaddr;
+ int count;
+ unsigned int last_pkmap_nr;
+ unsigned int color = get_pkmap_color(page);
+
+start:
+ count = get_pkmap_entries_count(color);
+ /* Find an empty entry */
+ for (;;) {
+ last_pkmap_nr = get_next_pkmap_nr(color);
+ if (no_more_pkmaps(last_pkmap_nr, color)) {
+ flush_all_zero_pkmaps();
+ count = get_pkmap_entries_count(color);
+ }
+ if (!pkmap_count[last_pkmap_nr])
+ break; /* Found a usable entry */
+ if (--count)
+ continue;
+
+ /*
+ * Sleep for somebody else to unmap their entries
+ */
+ {
+ DECLARE_WAITQUEUE(wait, current);
+ wait_queue_head_t *pkmap_map_wait =
+ get_pkmap_wait_queue_head(color);
+
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ add_wait_queue(pkmap_map_wait, &wait);
+ unlock_kmap();
+ schedule();
+ remove_wait_queue(pkmap_map_wait, &wait);
+ lock_kmap();
+
+ /* Somebody else might have mapped it while we slept */
+ if (page_address(page))
+ return (unsigned long)page_address(page);
+
+ /* Re-start */
+ goto start;
+ }
+ }
+ vaddr = PKMAP_ADDR(last_pkmap_nr);
+ set_pte_at(&init_mm, vaddr,
+ &(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));
+
+ pkmap_count[last_pkmap_nr] = 1;
+ set_page_address(page, (void *)vaddr);
+
+ return vaddr;
+}
+
+/**
+ * kmap_high - map a highmem page into memory
+ * @page: &struct page to map
+ *
+ * Returns the page's virtual memory address.
+ *
+ * We cannot call this from interrupts, as it may block.
+ */
+void *kmap_high(struct page *page)
+{
+ unsigned long vaddr;
+
+ /*
+ * For highmem pages, we can't trust "virtual" until
+ * after we have the lock.
+ */
+ lock_kmap();
+ vaddr = (unsigned long)page_address(page);
+ if (!vaddr)
+ vaddr = map_new_virtual(page);
+ pkmap_count[PKMAP_NR(vaddr)]++;
+ BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2);
+ unlock_kmap();
+ return (void*) vaddr;
+}
+
+EXPORT_SYMBOL(kmap_high);
+
+#ifdef ARCH_NEEDS_KMAP_HIGH_GET
+/**
+ * kmap_high_get - pin a highmem page into memory
+ * @page: &struct page to pin
+ *
+ * Returns the page's current virtual memory address, or NULL if no mapping
+ * exists. If and only if a non null address is returned then a
+ * matching call to kunmap_high() is necessary.
+ *
+ * This can be called from any context.
+ */
+void *kmap_high_get(struct page *page)
+{
+ unsigned long vaddr, flags;
+
+ lock_kmap_any(flags);
+ vaddr = (unsigned long)page_address(page);
+ if (vaddr) {
+ BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 1);
+ pkmap_count[PKMAP_NR(vaddr)]++;
+ }
+ unlock_kmap_any(flags);
+ return (void*) vaddr;
+}
+#endif
+
+/**
+ * kunmap_high - unmap a highmem page into memory
+ * @page: &struct page to unmap
+ *
+ * If ARCH_NEEDS_KMAP_HIGH_GET is not defined then this may be called
+ * only from user context.
+ */
+void kunmap_high(struct page *page)
+{
+ unsigned long vaddr;
+ unsigned long nr;
+ unsigned long flags;
+ int need_wakeup;
+ unsigned int color = get_pkmap_color(page);
+ wait_queue_head_t *pkmap_map_wait;
+
+ lock_kmap_any(flags);
+ vaddr = (unsigned long)page_address(page);
+ BUG_ON(!vaddr);
+ nr = PKMAP_NR(vaddr);
+
+ /*
+ * A count must never go down to zero
+ * without a TLB flush!
+ */
+ need_wakeup = 0;
+ switch (--pkmap_count[nr]) {
+ case 0:
+ BUG();
+ case 1:
+ /*
+ * Avoid an unnecessary wake_up() function call.
+ * The common case is pkmap_count[] == 1, but
+ * no waiters.
+ * The tasks queued in the wait-queue are guarded
+ * by both the lock in the wait-queue-head and by
+ * the kmap_lock. As the kmap_lock is held here,
+ * no need for the wait-queue-head's lock. Simply
+ * test if the queue is empty.
+ */
+ pkmap_map_wait = get_pkmap_wait_queue_head(color);
+ need_wakeup = waitqueue_active(pkmap_map_wait);
+ }
+ unlock_kmap_any(flags);
+
+ /* do wake-up, if needed, race-free outside of the spin lock */
+ if (need_wakeup)
+ wake_up(pkmap_map_wait);
+}
+
+EXPORT_SYMBOL(kunmap_high);
+#endif /* CONFIG_HIGHMEM */
+
+#if defined(HASHED_PAGE_VIRTUAL)
+
+#define PA_HASH_ORDER 7
+
+/*
+ * Describes one page->virtual association
+ */
+struct page_address_map {
+ struct page *page;
+ void *virtual;
+ struct list_head list;
+};
+
+static struct page_address_map page_address_maps[LAST_PKMAP];
+
+/*
+ * Hash table bucket
+ */
+static struct page_address_slot {
+ struct list_head lh; /* List of page_address_maps */
+ spinlock_t lock; /* Protect this bucket's list */
+} ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER];
+
+static struct page_address_slot *page_slot(const struct page *page)
+{
+ return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)];
+}
+
+/**
+ * page_address - get the mapped virtual address of a page
+ * @page: &struct page to get the virtual address of
+ *
+ * Returns the page's virtual address.
+ */
+void *page_address(const struct page *page)
+{
+ unsigned long flags;
+ void *ret;
+ struct page_address_slot *pas;
+
+ if (!PageHighMem(page))
+ return lowmem_page_address(page);
+
+ pas = page_slot(page);
+ ret = NULL;
+ spin_lock_irqsave(&pas->lock, flags);
+ if (!list_empty(&pas->lh)) {
+ struct page_address_map *pam;
+
+ list_for_each_entry(pam, &pas->lh, list) {
+ if (pam->page == page) {
+ ret = pam->virtual;
+ goto done;
+ }
+ }
+ }
+done:
+ spin_unlock_irqrestore(&pas->lock, flags);
+ return ret;
+}
+
+EXPORT_SYMBOL(page_address);
+
+/**
+ * set_page_address - set a page's virtual address
+ * @page: &struct page to set
+ * @virtual: virtual address to use
+ */
+void set_page_address(struct page *page, void *virtual)
+{
+ unsigned long flags;
+ struct page_address_slot *pas;
+ struct page_address_map *pam;
+
+ BUG_ON(!PageHighMem(page));
+
+ pas = page_slot(page);
+ if (virtual) { /* Add */
+ pam = &page_address_maps[PKMAP_NR((unsigned long)virtual)];
+ pam->page = page;
+ pam->virtual = virtual;
+
+ spin_lock_irqsave(&pas->lock, flags);
+ list_add_tail(&pam->list, &pas->lh);
+ spin_unlock_irqrestore(&pas->lock, flags);
+ } else { /* Remove */
+ spin_lock_irqsave(&pas->lock, flags);
+ list_for_each_entry(pam, &pas->lh, list) {
+ if (pam->page == page) {
+ list_del(&pam->list);
+ spin_unlock_irqrestore(&pas->lock, flags);
+ goto done;
+ }
+ }
+ spin_unlock_irqrestore(&pas->lock, flags);
+ }
+done:
+ return;
+}
+
+void __init page_address_init(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) {
+ INIT_LIST_HEAD(&page_address_htable[i].lh);
+ spin_lock_init(&page_address_htable[i].lock);
+ }
+}
+
+#endif /* defined(HASHED_PAGE_VIRTUAL) */
diff --git a/mm/hmm.c b/mm/hmm.c
new file mode 100644
index 000000000..cbe9d0c66
--- /dev/null
+++ b/mm/hmm.c
@@ -0,0 +1,599 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2013 Red Hat Inc.
+ *
+ * Authors: Jérôme Glisse <jglisse@redhat.com>
+ */
+/*
+ * Refer to include/linux/hmm.h for information about heterogeneous memory
+ * management or HMM for short.
+ */
+#include <linux/pagewalk.h>
+#include <linux/hmm.h>
+#include <linux/init.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/mmzone.h>
+#include <linux/pagemap.h>
+#include <linux/swapops.h>
+#include <linux/hugetlb.h>
+#include <linux/memremap.h>
+#include <linux/sched/mm.h>
+#include <linux/jump_label.h>
+#include <linux/dma-mapping.h>
+#include <linux/mmu_notifier.h>
+#include <linux/memory_hotplug.h>
+
+struct hmm_vma_walk {
+ struct hmm_range *range;
+ unsigned long last;
+};
+
+enum {
+ HMM_NEED_FAULT = 1 << 0,
+ HMM_NEED_WRITE_FAULT = 1 << 1,
+ HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT,
+};
+
+static int hmm_pfns_fill(unsigned long addr, unsigned long end,
+ struct hmm_range *range, unsigned long cpu_flags)
+{
+ unsigned long i = (addr - range->start) >> PAGE_SHIFT;
+
+ for (; addr < end; addr += PAGE_SIZE, i++)
+ range->hmm_pfns[i] = cpu_flags;
+ return 0;
+}
+
+/*
+ * hmm_vma_fault() - fault in a range lacking valid pmd or pte(s)
+ * @addr: range virtual start address (inclusive)
+ * @end: range virtual end address (exclusive)
+ * @required_fault: HMM_NEED_* flags
+ * @walk: mm_walk structure
+ * Return: -EBUSY after page fault, or page fault error
+ *
+ * This function will be called whenever pmd_none() or pte_none() returns true,
+ * or whenever there is no page directory covering the virtual address range.
+ */
+static int hmm_vma_fault(unsigned long addr, unsigned long end,
+ unsigned int required_fault, struct mm_walk *walk)
+{
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ unsigned int fault_flags = FAULT_FLAG_REMOTE;
+
+ WARN_ON_ONCE(!required_fault);
+ hmm_vma_walk->last = addr;
+
+ if (required_fault & HMM_NEED_WRITE_FAULT) {
+ if (!(vma->vm_flags & VM_WRITE))
+ return -EPERM;
+ fault_flags |= FAULT_FLAG_WRITE;
+ }
+
+ for (; addr < end; addr += PAGE_SIZE)
+ if (handle_mm_fault(vma, addr, fault_flags, NULL) &
+ VM_FAULT_ERROR)
+ return -EFAULT;
+ return -EBUSY;
+}
+
+static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
+ unsigned long pfn_req_flags,
+ unsigned long cpu_flags)
+{
+ struct hmm_range *range = hmm_vma_walk->range;
+
+ /*
+ * So we not only consider the individual per page request we also
+ * consider the default flags requested for the range. The API can
+ * be used 2 ways. The first one where the HMM user coalesces
+ * multiple page faults into one request and sets flags per pfn for
+ * those faults. The second one where the HMM user wants to pre-
+ * fault a range with specific flags. For the latter one it is a
+ * waste to have the user pre-fill the pfn arrays with a default
+ * flags value.
+ */
+ pfn_req_flags &= range->pfn_flags_mask;
+ pfn_req_flags |= range->default_flags;
+
+ /* We aren't ask to do anything ... */
+ if (!(pfn_req_flags & HMM_PFN_REQ_FAULT))
+ return 0;
+
+ /* Need to write fault ? */
+ if ((pfn_req_flags & HMM_PFN_REQ_WRITE) &&
+ !(cpu_flags & HMM_PFN_WRITE))
+ return HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT;
+
+ /* If CPU page table is not valid then we need to fault */
+ if (!(cpu_flags & HMM_PFN_VALID))
+ return HMM_NEED_FAULT;
+ return 0;
+}
+
+static unsigned int
+hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
+ const unsigned long hmm_pfns[], unsigned long npages,
+ unsigned long cpu_flags)
+{
+ struct hmm_range *range = hmm_vma_walk->range;
+ unsigned int required_fault = 0;
+ unsigned long i;
+
+ /*
+ * If the default flags do not request to fault pages, and the mask does
+ * not allow for individual pages to be faulted, then
+ * hmm_pte_need_fault() will always return 0.
+ */
+ if (!((range->default_flags | range->pfn_flags_mask) &
+ HMM_PFN_REQ_FAULT))
+ return 0;
+
+ for (i = 0; i < npages; ++i) {
+ required_fault |= hmm_pte_need_fault(hmm_vma_walk, hmm_pfns[i],
+ cpu_flags);
+ if (required_fault == HMM_NEED_ALL_BITS)
+ return required_fault;
+ }
+ return required_fault;
+}
+
+static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
+ __always_unused int depth, struct mm_walk *walk)
+{
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ unsigned int required_fault;
+ unsigned long i, npages;
+ unsigned long *hmm_pfns;
+
+ i = (addr - range->start) >> PAGE_SHIFT;
+ npages = (end - addr) >> PAGE_SHIFT;
+ hmm_pfns = &range->hmm_pfns[i];
+ required_fault =
+ hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0);
+ if (!walk->vma) {
+ if (required_fault)
+ return -EFAULT;
+ return hmm_pfns_fill(addr, end, range, HMM_PFN_ERROR);
+ }
+ if (required_fault)
+ return hmm_vma_fault(addr, end, required_fault, walk);
+ return hmm_pfns_fill(addr, end, range, 0);
+}
+
+static inline unsigned long hmm_pfn_flags_order(unsigned long order)
+{
+ return order << HMM_PFN_ORDER_SHIFT;
+}
+
+static inline unsigned long pmd_to_hmm_pfn_flags(struct hmm_range *range,
+ pmd_t pmd)
+{
+ if (pmd_protnone(pmd))
+ return 0;
+ return (pmd_write(pmd) ? (HMM_PFN_VALID | HMM_PFN_WRITE) :
+ HMM_PFN_VALID) |
+ hmm_pfn_flags_order(PMD_SHIFT - PAGE_SHIFT);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
+ unsigned long end, unsigned long hmm_pfns[],
+ pmd_t pmd)
+{
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ unsigned long pfn, npages, i;
+ unsigned int required_fault;
+ unsigned long cpu_flags;
+
+ npages = (end - addr) >> PAGE_SHIFT;
+ cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
+ required_fault =
+ hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, cpu_flags);
+ if (required_fault)
+ return hmm_vma_fault(addr, end, required_fault, walk);
+
+ pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
+ hmm_pfns[i] = pfn | cpu_flags;
+ return 0;
+}
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
+/* stub to allow the code below to compile */
+int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
+ unsigned long end, unsigned long hmm_pfns[], pmd_t pmd);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+static inline bool hmm_is_device_private_entry(struct hmm_range *range,
+ swp_entry_t entry)
+{
+ return is_device_private_entry(entry) &&
+ device_private_entry_to_page(entry)->pgmap->owner ==
+ range->dev_private_owner;
+}
+
+static inline unsigned long pte_to_hmm_pfn_flags(struct hmm_range *range,
+ pte_t pte)
+{
+ if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte))
+ return 0;
+ return pte_write(pte) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID;
+}
+
+static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
+ unsigned long end, pmd_t *pmdp, pte_t *ptep,
+ unsigned long *hmm_pfn)
+{
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ unsigned int required_fault;
+ unsigned long cpu_flags;
+ pte_t pte = *ptep;
+ uint64_t pfn_req_flags = *hmm_pfn;
+
+ if (pte_none(pte)) {
+ required_fault =
+ hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
+ if (required_fault)
+ goto fault;
+ *hmm_pfn = 0;
+ return 0;
+ }
+
+ if (!pte_present(pte)) {
+ swp_entry_t entry = pte_to_swp_entry(pte);
+
+ /*
+ * Never fault in device private pages, but just report
+ * the PFN even if not present.
+ */
+ if (hmm_is_device_private_entry(range, entry)) {
+ cpu_flags = HMM_PFN_VALID;
+ if (is_write_device_private_entry(entry))
+ cpu_flags |= HMM_PFN_WRITE;
+ *hmm_pfn = device_private_entry_to_pfn(entry) |
+ cpu_flags;
+ return 0;
+ }
+
+ required_fault =
+ hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
+ if (!required_fault) {
+ *hmm_pfn = 0;
+ return 0;
+ }
+
+ if (!non_swap_entry(entry))
+ goto fault;
+
+ if (is_migration_entry(entry)) {
+ pte_unmap(ptep);
+ hmm_vma_walk->last = addr;
+ migration_entry_wait(walk->mm, pmdp, addr);
+ return -EBUSY;
+ }
+
+ /* Report error for everything else */
+ pte_unmap(ptep);
+ return -EFAULT;
+ }
+
+ cpu_flags = pte_to_hmm_pfn_flags(range, pte);
+ required_fault =
+ hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags);
+ if (required_fault)
+ goto fault;
+
+ /*
+ * Bypass devmap pte such as DAX page when all pfn requested
+ * flags(pfn_req_flags) are fulfilled.
+ * Since each architecture defines a struct page for the zero page, just
+ * fall through and treat it like a normal page.
+ */
+ if (!vm_normal_page(walk->vma, addr, pte) &&
+ !pte_devmap(pte) &&
+ !is_zero_pfn(pte_pfn(pte))) {
+ if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) {
+ pte_unmap(ptep);
+ return -EFAULT;
+ }
+ *hmm_pfn = HMM_PFN_ERROR;
+ return 0;
+ }
+
+ *hmm_pfn = pte_pfn(pte) | cpu_flags;
+ return 0;
+
+fault:
+ pte_unmap(ptep);
+ /* Fault any virtual address we were asked to fault */
+ return hmm_vma_fault(addr, end, required_fault, walk);
+}
+
+static int hmm_vma_walk_pmd(pmd_t *pmdp,
+ unsigned long start,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ unsigned long *hmm_pfns =
+ &range->hmm_pfns[(start - range->start) >> PAGE_SHIFT];
+ unsigned long npages = (end - start) >> PAGE_SHIFT;
+ unsigned long addr = start;
+ pte_t *ptep;
+ pmd_t pmd;
+
+again:
+ pmd = READ_ONCE(*pmdp);
+ if (pmd_none(pmd))
+ return hmm_vma_walk_hole(start, end, -1, walk);
+
+ if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
+ if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) {
+ hmm_vma_walk->last = addr;
+ pmd_migration_entry_wait(walk->mm, pmdp);
+ return -EBUSY;
+ }
+ return hmm_pfns_fill(start, end, range, 0);
+ }
+
+ if (!pmd_present(pmd)) {
+ if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
+ return -EFAULT;
+ return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
+ }
+
+ if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
+ /*
+ * No need to take pmd_lock here, even if some other thread
+ * is splitting the huge pmd we will get that event through
+ * mmu_notifier callback.
+ *
+ * So just read pmd value and check again it's a transparent
+ * huge or device mapping one and compute corresponding pfn
+ * values.
+ */
+ pmd = pmd_read_atomic(pmdp);
+ barrier();
+ if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
+ goto again;
+
+ return hmm_vma_handle_pmd(walk, addr, end, hmm_pfns, pmd);
+ }
+
+ /*
+ * We have handled all the valid cases above ie either none, migration,
+ * huge or transparent huge. At this point either it is a valid pmd
+ * entry pointing to pte directory or it is a bad pmd that will not
+ * recover.
+ */
+ if (pmd_bad(pmd)) {
+ if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
+ return -EFAULT;
+ return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
+ }
+
+ ptep = pte_offset_map(pmdp, addr);
+ for (; addr < end; addr += PAGE_SIZE, ptep++, hmm_pfns++) {
+ int r;
+
+ r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, hmm_pfns);
+ if (r) {
+ /* hmm_vma_handle_pte() did pte_unmap() */
+ return r;
+ }
+ }
+ pte_unmap(ptep - 1);
+ return 0;
+}
+
+#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \
+ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+static inline unsigned long pud_to_hmm_pfn_flags(struct hmm_range *range,
+ pud_t pud)
+{
+ if (!pud_present(pud))
+ return 0;
+ return (pud_write(pud) ? (HMM_PFN_VALID | HMM_PFN_WRITE) :
+ HMM_PFN_VALID) |
+ hmm_pfn_flags_order(PUD_SHIFT - PAGE_SHIFT);
+}
+
+static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ unsigned long addr = start;
+ pud_t pud;
+ int ret = 0;
+ spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma);
+
+ if (!ptl)
+ return 0;
+
+ /* Normally we don't want to split the huge page */
+ walk->action = ACTION_CONTINUE;
+
+ pud = READ_ONCE(*pudp);
+ if (pud_none(pud)) {
+ spin_unlock(ptl);
+ return hmm_vma_walk_hole(start, end, -1, walk);
+ }
+
+ if (pud_huge(pud) && pud_devmap(pud)) {
+ unsigned long i, npages, pfn;
+ unsigned int required_fault;
+ unsigned long *hmm_pfns;
+ unsigned long cpu_flags;
+
+ if (!pud_present(pud)) {
+ spin_unlock(ptl);
+ return hmm_vma_walk_hole(start, end, -1, walk);
+ }
+
+ i = (addr - range->start) >> PAGE_SHIFT;
+ npages = (end - addr) >> PAGE_SHIFT;
+ hmm_pfns = &range->hmm_pfns[i];
+
+ cpu_flags = pud_to_hmm_pfn_flags(range, pud);
+ required_fault = hmm_range_need_fault(hmm_vma_walk, hmm_pfns,
+ npages, cpu_flags);
+ if (required_fault) {
+ spin_unlock(ptl);
+ return hmm_vma_fault(addr, end, required_fault, walk);
+ }
+
+ pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+ for (i = 0; i < npages; ++i, ++pfn)
+ hmm_pfns[i] = pfn | cpu_flags;
+ goto out_unlock;
+ }
+
+ /* Ask for the PUD to be split */
+ walk->action = ACTION_SUBTREE;
+
+out_unlock:
+ spin_unlock(ptl);
+ return ret;
+}
+#else
+#define hmm_vma_walk_pud NULL
+#endif
+
+#ifdef CONFIG_HUGETLB_PAGE
+static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
+ unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ unsigned long addr = start, i, pfn;
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ struct vm_area_struct *vma = walk->vma;
+ unsigned int required_fault;
+ unsigned long pfn_req_flags;
+ unsigned long cpu_flags;
+ spinlock_t *ptl;
+ pte_t entry;
+
+ ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
+ entry = huge_ptep_get(pte);
+
+ i = (start - range->start) >> PAGE_SHIFT;
+ pfn_req_flags = range->hmm_pfns[i];
+ cpu_flags = pte_to_hmm_pfn_flags(range, entry) |
+ hmm_pfn_flags_order(huge_page_order(hstate_vma(vma)));
+ required_fault =
+ hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags);
+ if (required_fault) {
+ spin_unlock(ptl);
+ return hmm_vma_fault(addr, end, required_fault, walk);
+ }
+
+ pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
+ for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
+ range->hmm_pfns[i] = pfn | cpu_flags;
+
+ spin_unlock(ptl);
+ return 0;
+}
+#else
+#define hmm_vma_walk_hugetlb_entry NULL
+#endif /* CONFIG_HUGETLB_PAGE */
+
+static int hmm_vma_walk_test(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ struct vm_area_struct *vma = walk->vma;
+
+ if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)) &&
+ vma->vm_flags & VM_READ)
+ return 0;
+
+ /*
+ * vma ranges that don't have struct page backing them or map I/O
+ * devices directly cannot be handled by hmm_range_fault().
+ *
+ * If the vma does not allow read access, then assume that it does not
+ * allow write access either. HMM does not support architectures that
+ * allow write without read.
+ *
+ * If a fault is requested for an unsupported range then it is a hard
+ * failure.
+ */
+ if (hmm_range_need_fault(hmm_vma_walk,
+ range->hmm_pfns +
+ ((start - range->start) >> PAGE_SHIFT),
+ (end - start) >> PAGE_SHIFT, 0))
+ return -EFAULT;
+
+ hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
+
+ /* Skip this vma and continue processing the next vma. */
+ return 1;
+}
+
+static const struct mm_walk_ops hmm_walk_ops = {
+ .pud_entry = hmm_vma_walk_pud,
+ .pmd_entry = hmm_vma_walk_pmd,
+ .pte_hole = hmm_vma_walk_hole,
+ .hugetlb_entry = hmm_vma_walk_hugetlb_entry,
+ .test_walk = hmm_vma_walk_test,
+};
+
+/**
+ * hmm_range_fault - try to fault some address in a virtual address range
+ * @range: argument structure
+ *
+ * Returns 0 on success or one of the following error codes:
+ *
+ * -EINVAL: Invalid arguments or mm or virtual address is in an invalid vma
+ * (e.g., device file vma).
+ * -ENOMEM: Out of memory.
+ * -EPERM: Invalid permission (e.g., asking for write and range is read
+ * only).
+ * -EBUSY: The range has been invalidated and the caller needs to wait for
+ * the invalidation to finish.
+ * -EFAULT: A page was requested to be valid and could not be made valid
+ * ie it has no backing VMA or it is illegal to access
+ *
+ * This is similar to get_user_pages(), except that it can read the page tables
+ * without mutating them (ie causing faults).
+ */
+int hmm_range_fault(struct hmm_range *range)
+{
+ struct hmm_vma_walk hmm_vma_walk = {
+ .range = range,
+ .last = range->start,
+ };
+ struct mm_struct *mm = range->notifier->mm;
+ int ret;
+
+ mmap_assert_locked(mm);
+
+ do {
+ /* If range is no longer valid force retry. */
+ if (mmu_interval_check_retry(range->notifier,
+ range->notifier_seq))
+ return -EBUSY;
+ ret = walk_page_range(mm, hmm_vma_walk.last, range->end,
+ &hmm_walk_ops, &hmm_vma_walk);
+ /*
+ * When -EBUSY is returned the loop restarts with
+ * hmm_vma_walk.last set to an address that has not been stored
+ * in pfns. All entries < last in the pfn array are set to their
+ * output, and all >= are still at their input values.
+ */
+ } while (ret == -EBUSY);
+ return ret;
+}
+EXPORT_SYMBOL(hmm_range_fault);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
new file mode 100644
index 000000000..e4c690c21
--- /dev/null
+++ b/mm/huge_memory.c
@@ -0,0 +1,3015 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2009 Red Hat, Inc.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/sched/coredump.h>
+#include <linux/sched/numa_balancing.h>
+#include <linux/highmem.h>
+#include <linux/hugetlb.h>
+#include <linux/mmu_notifier.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/shrinker.h>
+#include <linux/mm_inline.h>
+#include <linux/swapops.h>
+#include <linux/dax.h>
+#include <linux/khugepaged.h>
+#include <linux/freezer.h>
+#include <linux/pfn_t.h>
+#include <linux/mman.h>
+#include <linux/memremap.h>
+#include <linux/pagemap.h>
+#include <linux/debugfs.h>
+#include <linux/migrate.h>
+#include <linux/hashtable.h>
+#include <linux/userfaultfd_k.h>
+#include <linux/page_idle.h>
+#include <linux/shmem_fs.h>
+#include <linux/oom.h>
+#include <linux/numa.h>
+#include <linux/page_owner.h>
+
+#include <asm/tlb.h>
+#include <asm/pgalloc.h>
+#include "internal.h"
+
+/*
+ * By default, transparent hugepage support is disabled in order to avoid
+ * risking an increased memory footprint for applications that are not
+ * guaranteed to benefit from it. When transparent hugepage support is
+ * enabled, it is for all mappings, and khugepaged scans all mappings.
+ * Defrag is invoked by khugepaged hugepage allocations and by page faults
+ * for all hugepage allocations.
+ */
+unsigned long transparent_hugepage_flags __read_mostly =
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
+ (1<<TRANSPARENT_HUGEPAGE_FLAG)|
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
+ (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
+#endif
+ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
+ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
+ (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
+
+static struct shrinker deferred_split_shrinker;
+
+static atomic_t huge_zero_refcount;
+struct page *huge_zero_page __read_mostly;
+unsigned long huge_zero_pfn __read_mostly = ~0UL;
+
+static inline bool file_thp_enabled(struct vm_area_struct *vma)
+{
+ return transhuge_vma_enabled(vma, vma->vm_flags) && vma->vm_file &&
+ !inode_is_open_for_write(vma->vm_file->f_inode) &&
+ (vma->vm_flags & VM_EXEC);
+}
+
+bool transparent_hugepage_active(struct vm_area_struct *vma)
+{
+ /* The addr is used to check if the vma size fits */
+ unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE;
+
+ if (!transhuge_vma_suitable(vma, addr))
+ return false;
+ if (vma_is_anonymous(vma))
+ return __transparent_hugepage_enabled(vma);
+ if (vma_is_shmem(vma))
+ return shmem_huge_enabled(vma);
+ if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS))
+ return file_thp_enabled(vma);
+
+ return false;
+}
+
+static struct page *get_huge_zero_page(void)
+{
+ struct page *zero_page;
+retry:
+ if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
+ return READ_ONCE(huge_zero_page);
+
+ zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
+ HPAGE_PMD_ORDER);
+ if (!zero_page) {
+ count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
+ return NULL;
+ }
+ count_vm_event(THP_ZERO_PAGE_ALLOC);
+ preempt_disable();
+ if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
+ preempt_enable();
+ __free_pages(zero_page, compound_order(zero_page));
+ goto retry;
+ }
+ WRITE_ONCE(huge_zero_pfn, page_to_pfn(zero_page));
+
+ /* We take additional reference here. It will be put back by shrinker */
+ atomic_set(&huge_zero_refcount, 2);
+ preempt_enable();
+ return READ_ONCE(huge_zero_page);
+}
+
+static void put_huge_zero_page(void)
+{
+ /*
+ * Counter should never go to zero here. Only shrinker can put
+ * last reference.
+ */
+ BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
+}
+
+struct page *mm_get_huge_zero_page(struct mm_struct *mm)
+{
+ if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+ return READ_ONCE(huge_zero_page);
+
+ if (!get_huge_zero_page())
+ return NULL;
+
+ if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+ put_huge_zero_page();
+
+ return READ_ONCE(huge_zero_page);
+}
+
+void mm_put_huge_zero_page(struct mm_struct *mm)
+{
+ if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+ put_huge_zero_page();
+}
+
+static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ /* we can free zero page only if last reference remains */
+ return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
+}
+
+static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
+ struct page *zero_page = xchg(&huge_zero_page, NULL);
+ BUG_ON(zero_page == NULL);
+ WRITE_ONCE(huge_zero_pfn, ~0UL);
+ __free_pages(zero_page, compound_order(zero_page));
+ return HPAGE_PMD_NR;
+ }
+
+ return 0;
+}
+
+static struct shrinker huge_zero_page_shrinker = {
+ .count_objects = shrink_huge_zero_page_count,
+ .scan_objects = shrink_huge_zero_page_scan,
+ .seeks = DEFAULT_SEEKS,
+};
+
+#ifdef CONFIG_SYSFS
+static ssize_t enabled_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
+ return sprintf(buf, "[always] madvise never\n");
+ else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags))
+ return sprintf(buf, "always [madvise] never\n");
+ else
+ return sprintf(buf, "always madvise [never]\n");
+}
+
+static ssize_t enabled_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ ssize_t ret = count;
+
+ if (sysfs_streq(buf, "always")) {
+ clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
+ } else if (sysfs_streq(buf, "madvise")) {
+ clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
+ set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ } else if (sysfs_streq(buf, "never")) {
+ clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ } else
+ ret = -EINVAL;
+
+ if (ret > 0) {
+ int err = start_stop_khugepaged();
+ if (err)
+ ret = err;
+ }
+ return ret;
+}
+static struct kobj_attribute enabled_attr =
+ __ATTR(enabled, 0644, enabled_show, enabled_store);
+
+ssize_t single_hugepage_flag_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf,
+ enum transparent_hugepage_flag flag)
+{
+ return sprintf(buf, "%d\n",
+ !!test_bit(flag, &transparent_hugepage_flags));
+}
+
+ssize_t single_hugepage_flag_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count,
+ enum transparent_hugepage_flag flag)
+{
+ unsigned long value;
+ int ret;
+
+ ret = kstrtoul(buf, 10, &value);
+ if (ret < 0)
+ return ret;
+ if (value > 1)
+ return -EINVAL;
+
+ if (value)
+ set_bit(flag, &transparent_hugepage_flags);
+ else
+ clear_bit(flag, &transparent_hugepage_flags);
+
+ return count;
+}
+
+static ssize_t defrag_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
+ return sprintf(buf, "[always] defer defer+madvise madvise never\n");
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
+ return sprintf(buf, "always [defer] defer+madvise madvise never\n");
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
+ return sprintf(buf, "always defer [defer+madvise] madvise never\n");
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
+ return sprintf(buf, "always defer defer+madvise [madvise] never\n");
+ return sprintf(buf, "always defer defer+madvise madvise [never]\n");
+}
+
+static ssize_t defrag_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ if (sysfs_streq(buf, "always")) {
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
+ } else if (sysfs_streq(buf, "defer+madvise")) {
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
+ } else if (sysfs_streq(buf, "defer")) {
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
+ } else if (sysfs_streq(buf, "madvise")) {
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
+ set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ } else if (sysfs_streq(buf, "never")) {
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ } else
+ return -EINVAL;
+
+ return count;
+}
+static struct kobj_attribute defrag_attr =
+ __ATTR(defrag, 0644, defrag_show, defrag_store);
+
+static ssize_t use_zero_page_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return single_hugepage_flag_show(kobj, attr, buf,
+ TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
+}
+static ssize_t use_zero_page_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ return single_hugepage_flag_store(kobj, attr, buf, count,
+ TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
+}
+static struct kobj_attribute use_zero_page_attr =
+ __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
+
+static ssize_t hpage_pmd_size_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", HPAGE_PMD_SIZE);
+}
+static struct kobj_attribute hpage_pmd_size_attr =
+ __ATTR_RO(hpage_pmd_size);
+
+static struct attribute *hugepage_attr[] = {
+ &enabled_attr.attr,
+ &defrag_attr.attr,
+ &use_zero_page_attr.attr,
+ &hpage_pmd_size_attr.attr,
+#ifdef CONFIG_SHMEM
+ &shmem_enabled_attr.attr,
+#endif
+ NULL,
+};
+
+static const struct attribute_group hugepage_attr_group = {
+ .attrs = hugepage_attr,
+};
+
+static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
+{
+ int err;
+
+ *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
+ if (unlikely(!*hugepage_kobj)) {
+ pr_err("failed to create transparent hugepage kobject\n");
+ return -ENOMEM;
+ }
+
+ err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
+ if (err) {
+ pr_err("failed to register transparent hugepage group\n");
+ goto delete_obj;
+ }
+
+ err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
+ if (err) {
+ pr_err("failed to register transparent hugepage group\n");
+ goto remove_hp_group;
+ }
+
+ return 0;
+
+remove_hp_group:
+ sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
+delete_obj:
+ kobject_put(*hugepage_kobj);
+ return err;
+}
+
+static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
+{
+ sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
+ sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
+ kobject_put(hugepage_kobj);
+}
+#else
+static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
+{
+ return 0;
+}
+
+static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
+{
+}
+#endif /* CONFIG_SYSFS */
+
+static int __init hugepage_init(void)
+{
+ int err;
+ struct kobject *hugepage_kobj;
+
+ if (!has_transparent_hugepage()) {
+ /*
+ * Hardware doesn't support hugepages, hence disable
+ * DAX PMD support.
+ */
+ transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_NEVER_DAX;
+ return -EINVAL;
+ }
+
+ /*
+ * hugepages can't be allocated by the buddy allocator
+ */
+ MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER >= MAX_ORDER);
+ /*
+ * we use page->mapping and page->index in second tail page
+ * as list_head: assuming THP order >= 2
+ */
+ MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
+
+ err = hugepage_init_sysfs(&hugepage_kobj);
+ if (err)
+ goto err_sysfs;
+
+ err = khugepaged_init();
+ if (err)
+ goto err_slab;
+
+ err = register_shrinker(&huge_zero_page_shrinker);
+ if (err)
+ goto err_hzp_shrinker;
+ err = register_shrinker(&deferred_split_shrinker);
+ if (err)
+ goto err_split_shrinker;
+
+ /*
+ * By default disable transparent hugepages on smaller systems,
+ * where the extra memory used could hurt more than TLB overhead
+ * is likely to save. The admin can still enable it through /sys.
+ */
+ if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
+ transparent_hugepage_flags = 0;
+ return 0;
+ }
+
+ err = start_stop_khugepaged();
+ if (err)
+ goto err_khugepaged;
+
+ return 0;
+err_khugepaged:
+ unregister_shrinker(&deferred_split_shrinker);
+err_split_shrinker:
+ unregister_shrinker(&huge_zero_page_shrinker);
+err_hzp_shrinker:
+ khugepaged_destroy();
+err_slab:
+ hugepage_exit_sysfs(hugepage_kobj);
+err_sysfs:
+ return err;
+}
+subsys_initcall(hugepage_init);
+
+static int __init setup_transparent_hugepage(char *str)
+{
+ int ret = 0;
+ if (!str)
+ goto out;
+ if (!strcmp(str, "always")) {
+ set_bit(TRANSPARENT_HUGEPAGE_FLAG,
+ &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+ &transparent_hugepage_flags);
+ ret = 1;
+ } else if (!strcmp(str, "madvise")) {
+ clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
+ &transparent_hugepage_flags);
+ set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+ &transparent_hugepage_flags);
+ ret = 1;
+ } else if (!strcmp(str, "never")) {
+ clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
+ &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+ &transparent_hugepage_flags);
+ ret = 1;
+ }
+out:
+ if (!ret)
+ pr_warn("transparent_hugepage= cannot parse, ignored\n");
+ return ret;
+}
+__setup("transparent_hugepage=", setup_transparent_hugepage);
+
+pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
+{
+ if (likely(vma->vm_flags & VM_WRITE))
+ pmd = pmd_mkwrite(pmd);
+ return pmd;
+}
+
+#ifdef CONFIG_MEMCG
+static inline struct deferred_split *get_deferred_split_queue(struct page *page)
+{
+ struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
+ struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
+
+ if (memcg)
+ return &memcg->deferred_split_queue;
+ else
+ return &pgdat->deferred_split_queue;
+}
+#else
+static inline struct deferred_split *get_deferred_split_queue(struct page *page)
+{
+ struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
+
+ return &pgdat->deferred_split_queue;
+}
+#endif
+
+void prep_transhuge_page(struct page *page)
+{
+ /*
+ * we use page->mapping and page->indexlru in second tail page
+ * as list_head: assuming THP order >= 2
+ */
+
+ INIT_LIST_HEAD(page_deferred_list(page));
+ set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
+}
+
+bool is_transparent_hugepage(struct page *page)
+{
+ if (!PageCompound(page))
+ return false;
+
+ page = compound_head(page);
+ return is_huge_zero_page(page) ||
+ page[1].compound_dtor == TRANSHUGE_PAGE_DTOR;
+}
+EXPORT_SYMBOL_GPL(is_transparent_hugepage);
+
+static unsigned long __thp_get_unmapped_area(struct file *filp,
+ unsigned long addr, unsigned long len,
+ loff_t off, unsigned long flags, unsigned long size)
+{
+ loff_t off_end = off + len;
+ loff_t off_align = round_up(off, size);
+ unsigned long len_pad, ret;
+
+ if (off_end <= off_align || (off_end - off_align) < size)
+ return 0;
+
+ len_pad = len + size;
+ if (len_pad < len || (off + len_pad) < off)
+ return 0;
+
+ ret = current->mm->get_unmapped_area(filp, addr, len_pad,
+ off >> PAGE_SHIFT, flags);
+
+ /*
+ * The failure might be due to length padding. The caller will retry
+ * without the padding.
+ */
+ if (IS_ERR_VALUE(ret))
+ return 0;
+
+ /*
+ * Do not try to align to THP boundary if allocation at the address
+ * hint succeeds.
+ */
+ if (ret == addr)
+ return addr;
+
+ ret += (off - ret) & (size - 1);
+ return ret;
+}
+
+unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+ unsigned long ret;
+ loff_t off = (loff_t)pgoff << PAGE_SHIFT;
+
+ if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
+ goto out;
+
+ ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE);
+ if (ret)
+ return ret;
+out:
+ return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
+}
+EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
+
+static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
+ struct page *page, gfp_t gfp)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ pgtable_t pgtable;
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+ vm_fault_t ret = 0;
+
+ VM_BUG_ON_PAGE(!PageCompound(page), page);
+
+ if (mem_cgroup_charge(page, vma->vm_mm, gfp)) {
+ put_page(page);
+ count_vm_event(THP_FAULT_FALLBACK);
+ count_vm_event(THP_FAULT_FALLBACK_CHARGE);
+ return VM_FAULT_FALLBACK;
+ }
+ cgroup_throttle_swaprate(page, gfp);
+
+ pgtable = pte_alloc_one(vma->vm_mm);
+ if (unlikely(!pgtable)) {
+ ret = VM_FAULT_OOM;
+ goto release;
+ }
+
+ clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
+ /*
+ * The memory barrier inside __SetPageUptodate makes sure that
+ * clear_huge_page writes become visible before the set_pmd_at()
+ * write.
+ */
+ __SetPageUptodate(page);
+
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_none(*vmf->pmd))) {
+ goto unlock_release;
+ } else {
+ pmd_t entry;
+
+ ret = check_stable_address_space(vma->vm_mm);
+ if (ret)
+ goto unlock_release;
+
+ /* Deliver the page fault to userland */
+ if (userfaultfd_missing(vma)) {
+ vm_fault_t ret2;
+
+ spin_unlock(vmf->ptl);
+ put_page(page);
+ pte_free(vma->vm_mm, pgtable);
+ ret2 = handle_userfault(vmf, VM_UFFD_MISSING);
+ VM_BUG_ON(ret2 & VM_FAULT_FALLBACK);
+ return ret2;
+ }
+
+ entry = mk_huge_pmd(page, vma->vm_page_prot);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ page_add_new_anon_rmap(page, vma, haddr, true);
+ lru_cache_add_inactive_or_unevictable(page, vma);
+ pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
+ set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
+ add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ mm_inc_nr_ptes(vma->vm_mm);
+ spin_unlock(vmf->ptl);
+ count_vm_event(THP_FAULT_ALLOC);
+ count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
+ }
+
+ return 0;
+unlock_release:
+ spin_unlock(vmf->ptl);
+release:
+ if (pgtable)
+ pte_free(vma->vm_mm, pgtable);
+ put_page(page);
+ return ret;
+
+}
+
+/*
+ * always: directly stall for all thp allocations
+ * defer: wake kswapd and fail if not immediately available
+ * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
+ * fail if not immediately available
+ * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
+ * available
+ * never: never stall for any thp allocation
+ */
+static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
+{
+ const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
+
+ /* Always do synchronous compaction */
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
+ return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
+
+ /* Kick kcompactd and fail quickly */
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
+ return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
+
+ /* Synchronous compaction if madvised, otherwise kick kcompactd */
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
+ return GFP_TRANSHUGE_LIGHT |
+ (vma_madvised ? __GFP_DIRECT_RECLAIM :
+ __GFP_KSWAPD_RECLAIM);
+
+ /* Only do synchronous compaction if madvised */
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
+ return GFP_TRANSHUGE_LIGHT |
+ (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
+
+ return GFP_TRANSHUGE_LIGHT;
+}
+
+/* Caller must hold page table lock. */
+static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
+ struct page *zero_page)
+{
+ pmd_t entry;
+ if (!pmd_none(*pmd))
+ return false;
+ entry = mk_pmd(zero_page, vma->vm_page_prot);
+ entry = pmd_mkhuge(entry);
+ if (pgtable)
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
+ set_pmd_at(mm, haddr, pmd, entry);
+ mm_inc_nr_ptes(mm);
+ return true;
+}
+
+vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ gfp_t gfp;
+ struct page *page;
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+
+ if (!transhuge_vma_suitable(vma, haddr))
+ return VM_FAULT_FALLBACK;
+ if (unlikely(anon_vma_prepare(vma)))
+ return VM_FAULT_OOM;
+ if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
+ return VM_FAULT_OOM;
+ if (!(vmf->flags & FAULT_FLAG_WRITE) &&
+ !mm_forbids_zeropage(vma->vm_mm) &&
+ transparent_hugepage_use_zero_page()) {
+ pgtable_t pgtable;
+ struct page *zero_page;
+ vm_fault_t ret;
+ pgtable = pte_alloc_one(vma->vm_mm);
+ if (unlikely(!pgtable))
+ return VM_FAULT_OOM;
+ zero_page = mm_get_huge_zero_page(vma->vm_mm);
+ if (unlikely(!zero_page)) {
+ pte_free(vma->vm_mm, pgtable);
+ count_vm_event(THP_FAULT_FALLBACK);
+ return VM_FAULT_FALLBACK;
+ }
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ ret = 0;
+ if (pmd_none(*vmf->pmd)) {
+ ret = check_stable_address_space(vma->vm_mm);
+ if (ret) {
+ spin_unlock(vmf->ptl);
+ pte_free(vma->vm_mm, pgtable);
+ } else if (userfaultfd_missing(vma)) {
+ spin_unlock(vmf->ptl);
+ pte_free(vma->vm_mm, pgtable);
+ ret = handle_userfault(vmf, VM_UFFD_MISSING);
+ VM_BUG_ON(ret & VM_FAULT_FALLBACK);
+ } else {
+ set_huge_zero_page(pgtable, vma->vm_mm, vma,
+ haddr, vmf->pmd, zero_page);
+ spin_unlock(vmf->ptl);
+ }
+ } else {
+ spin_unlock(vmf->ptl);
+ pte_free(vma->vm_mm, pgtable);
+ }
+ return ret;
+ }
+ gfp = alloc_hugepage_direct_gfpmask(vma);
+ page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
+ if (unlikely(!page)) {
+ count_vm_event(THP_FAULT_FALLBACK);
+ return VM_FAULT_FALLBACK;
+ }
+ prep_transhuge_page(page);
+ return __do_huge_pmd_anonymous_page(vmf, page, gfp);
+}
+
+static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
+ pgtable_t pgtable)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pmd_t entry;
+ spinlock_t *ptl;
+
+ ptl = pmd_lock(mm, pmd);
+ if (!pmd_none(*pmd)) {
+ if (write) {
+ if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
+ WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
+ goto out_unlock;
+ }
+ entry = pmd_mkyoung(*pmd);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
+ update_mmu_cache_pmd(vma, addr, pmd);
+ }
+
+ goto out_unlock;
+ }
+
+ entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
+ if (pfn_t_devmap(pfn))
+ entry = pmd_mkdevmap(entry);
+ if (write) {
+ entry = pmd_mkyoung(pmd_mkdirty(entry));
+ entry = maybe_pmd_mkwrite(entry, vma);
+ }
+
+ if (pgtable) {
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
+ mm_inc_nr_ptes(mm);
+ pgtable = NULL;
+ }
+
+ set_pmd_at(mm, addr, pmd, entry);
+ update_mmu_cache_pmd(vma, addr, pmd);
+
+out_unlock:
+ spin_unlock(ptl);
+ if (pgtable)
+ pte_free(mm, pgtable);
+}
+
+/**
+ * vmf_insert_pfn_pmd_prot - insert a pmd size pfn
+ * @vmf: Structure describing the fault
+ * @pfn: pfn to insert
+ * @pgprot: page protection to use
+ * @write: whether it's a write fault
+ *
+ * Insert a pmd size pfn. See vmf_insert_pfn() for additional info and
+ * also consult the vmf_insert_mixed_prot() documentation when
+ * @pgprot != @vmf->vma->vm_page_prot.
+ *
+ * Return: vm_fault_t value.
+ */
+vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn,
+ pgprot_t pgprot, bool write)
+{
+ unsigned long addr = vmf->address & PMD_MASK;
+ struct vm_area_struct *vma = vmf->vma;
+ pgtable_t pgtable = NULL;
+
+ /*
+ * If we had pmd_special, we could avoid all these restrictions,
+ * but we need to be consistent with PTEs and architectures that
+ * can't support a 'special' bit.
+ */
+ BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
+ !pfn_t_devmap(pfn));
+ BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
+ (VM_PFNMAP|VM_MIXEDMAP));
+ BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
+
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return VM_FAULT_SIGBUS;
+
+ if (arch_needs_pgtable_deposit()) {
+ pgtable = pte_alloc_one(vma->vm_mm);
+ if (!pgtable)
+ return VM_FAULT_OOM;
+ }
+
+ track_pfn_insert(vma, &pgprot, pfn);
+
+ insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
+ return VM_FAULT_NOPAGE;
+}
+EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd_prot);
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
+{
+ if (likely(vma->vm_flags & VM_WRITE))
+ pud = pud_mkwrite(pud);
+ return pud;
+}
+
+static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
+ pud_t *pud, pfn_t pfn, pgprot_t prot, bool write)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pud_t entry;
+ spinlock_t *ptl;
+
+ ptl = pud_lock(mm, pud);
+ if (!pud_none(*pud)) {
+ if (write) {
+ if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) {
+ WARN_ON_ONCE(!is_huge_zero_pud(*pud));
+ goto out_unlock;
+ }
+ entry = pud_mkyoung(*pud);
+ entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
+ if (pudp_set_access_flags(vma, addr, pud, entry, 1))
+ update_mmu_cache_pud(vma, addr, pud);
+ }
+ goto out_unlock;
+ }
+
+ entry = pud_mkhuge(pfn_t_pud(pfn, prot));
+ if (pfn_t_devmap(pfn))
+ entry = pud_mkdevmap(entry);
+ if (write) {
+ entry = pud_mkyoung(pud_mkdirty(entry));
+ entry = maybe_pud_mkwrite(entry, vma);
+ }
+ set_pud_at(mm, addr, pud, entry);
+ update_mmu_cache_pud(vma, addr, pud);
+
+out_unlock:
+ spin_unlock(ptl);
+}
+
+/**
+ * vmf_insert_pfn_pud_prot - insert a pud size pfn
+ * @vmf: Structure describing the fault
+ * @pfn: pfn to insert
+ * @pgprot: page protection to use
+ * @write: whether it's a write fault
+ *
+ * Insert a pud size pfn. See vmf_insert_pfn() for additional info and
+ * also consult the vmf_insert_mixed_prot() documentation when
+ * @pgprot != @vmf->vma->vm_page_prot.
+ *
+ * Return: vm_fault_t value.
+ */
+vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn,
+ pgprot_t pgprot, bool write)
+{
+ unsigned long addr = vmf->address & PUD_MASK;
+ struct vm_area_struct *vma = vmf->vma;
+
+ /*
+ * If we had pud_special, we could avoid all these restrictions,
+ * but we need to be consistent with PTEs and architectures that
+ * can't support a 'special' bit.
+ */
+ BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
+ !pfn_t_devmap(pfn));
+ BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
+ (VM_PFNMAP|VM_MIXEDMAP));
+ BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
+
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return VM_FAULT_SIGBUS;
+
+ track_pfn_insert(vma, &pgprot, pfn);
+
+ insert_pfn_pud(vma, addr, vmf->pud, pfn, pgprot, write);
+ return VM_FAULT_NOPAGE;
+}
+EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud_prot);
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+
+static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, int flags)
+{
+ pmd_t _pmd;
+
+ _pmd = pmd_mkyoung(*pmd);
+ if (flags & FOLL_WRITE)
+ _pmd = pmd_mkdirty(_pmd);
+ if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
+ pmd, _pmd, flags & FOLL_WRITE))
+ update_mmu_cache_pmd(vma, addr, pmd);
+}
+
+struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
+{
+ unsigned long pfn = pmd_pfn(*pmd);
+ struct mm_struct *mm = vma->vm_mm;
+ struct page *page;
+
+ assert_spin_locked(pmd_lockptr(mm, pmd));
+
+ /*
+ * When we COW a devmap PMD entry, we split it into PTEs, so we should
+ * not be in this function with `flags & FOLL_COW` set.
+ */
+ WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");
+
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
+ (FOLL_PIN | FOLL_GET)))
+ return NULL;
+
+ if (flags & FOLL_WRITE && !pmd_write(*pmd))
+ return NULL;
+
+ if (pmd_present(*pmd) && pmd_devmap(*pmd))
+ /* pass */;
+ else
+ return NULL;
+
+ if (flags & FOLL_TOUCH)
+ touch_pmd(vma, addr, pmd, flags);
+
+ /*
+ * device mapped pages can only be returned if the
+ * caller will manage the page reference count.
+ */
+ if (!(flags & (FOLL_GET | FOLL_PIN)))
+ return ERR_PTR(-EEXIST);
+
+ pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
+ *pgmap = get_dev_pagemap(pfn, *pgmap);
+ if (!*pgmap)
+ return ERR_PTR(-EFAULT);
+ page = pfn_to_page(pfn);
+ if (!try_grab_page(page, flags))
+ page = ERR_PTR(-ENOMEM);
+
+ return page;
+}
+
+int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
+ struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
+{
+ spinlock_t *dst_ptl, *src_ptl;
+ struct page *src_page;
+ pmd_t pmd;
+ pgtable_t pgtable = NULL;
+ int ret = -ENOMEM;
+
+ /* Skip if can be re-fill on fault */
+ if (!vma_is_anonymous(dst_vma))
+ return 0;
+
+ pgtable = pte_alloc_one(dst_mm);
+ if (unlikely(!pgtable))
+ goto out;
+
+ dst_ptl = pmd_lock(dst_mm, dst_pmd);
+ src_ptl = pmd_lockptr(src_mm, src_pmd);
+ spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+
+ ret = -EAGAIN;
+ pmd = *src_pmd;
+
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+ if (unlikely(is_swap_pmd(pmd))) {
+ swp_entry_t entry = pmd_to_swp_entry(pmd);
+
+ VM_BUG_ON(!is_pmd_migration_entry(pmd));
+ if (is_write_migration_entry(entry)) {
+ make_migration_entry_read(&entry);
+ pmd = swp_entry_to_pmd(entry);
+ if (pmd_swp_soft_dirty(*src_pmd))
+ pmd = pmd_swp_mksoft_dirty(pmd);
+ if (pmd_swp_uffd_wp(*src_pmd))
+ pmd = pmd_swp_mkuffd_wp(pmd);
+ set_pmd_at(src_mm, addr, src_pmd, pmd);
+ }
+ add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ mm_inc_nr_ptes(dst_mm);
+ pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
+ if (!userfaultfd_wp(dst_vma))
+ pmd = pmd_swp_clear_uffd_wp(pmd);
+ set_pmd_at(dst_mm, addr, dst_pmd, pmd);
+ ret = 0;
+ goto out_unlock;
+ }
+#endif
+
+ if (unlikely(!pmd_trans_huge(pmd))) {
+ pte_free(dst_mm, pgtable);
+ goto out_unlock;
+ }
+ /*
+ * When page table lock is held, the huge zero pmd should not be
+ * under splitting since we don't split the page itself, only pmd to
+ * a page table.
+ */
+ if (is_huge_zero_pmd(pmd)) {
+ /*
+ * get_huge_zero_page() will never allocate a new page here,
+ * since we already have a zero page to copy. It just takes a
+ * reference.
+ */
+ mm_get_huge_zero_page(dst_mm);
+ goto out_zero_page;
+ }
+
+ src_page = pmd_page(pmd);
+ VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
+
+ /*
+ * If this page is a potentially pinned page, split and retry the fault
+ * with smaller page size. Normally this should not happen because the
+ * userspace should use MADV_DONTFORK upon pinned regions. This is a
+ * best effort that the pinned pages won't be replaced by another
+ * random page during the coming copy-on-write.
+ */
+ if (unlikely(is_cow_mapping(src_vma->vm_flags) &&
+ atomic_read(&src_mm->has_pinned) &&
+ page_maybe_dma_pinned(src_page))) {
+ pte_free(dst_mm, pgtable);
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
+ __split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
+ return -EAGAIN;
+ }
+
+ get_page(src_page);
+ page_dup_rmap(src_page, true);
+ add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+out_zero_page:
+ mm_inc_nr_ptes(dst_mm);
+ pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
+ pmdp_set_wrprotect(src_mm, addr, src_pmd);
+ if (!userfaultfd_wp(dst_vma))
+ pmd = pmd_clear_uffd_wp(pmd);
+ pmd = pmd_mkold(pmd_wrprotect(pmd));
+ set_pmd_at(dst_mm, addr, dst_pmd, pmd);
+
+ ret = 0;
+out_unlock:
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
+out:
+ return ret;
+}
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
+ pud_t *pud, int flags)
+{
+ pud_t _pud;
+
+ _pud = pud_mkyoung(*pud);
+ if (flags & FOLL_WRITE)
+ _pud = pud_mkdirty(_pud);
+ if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
+ pud, _pud, flags & FOLL_WRITE))
+ update_mmu_cache_pud(vma, addr, pud);
+}
+
+struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
+ pud_t *pud, int flags, struct dev_pagemap **pgmap)
+{
+ unsigned long pfn = pud_pfn(*pud);
+ struct mm_struct *mm = vma->vm_mm;
+ struct page *page;
+
+ assert_spin_locked(pud_lockptr(mm, pud));
+
+ if (flags & FOLL_WRITE && !pud_write(*pud))
+ return NULL;
+
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
+ (FOLL_PIN | FOLL_GET)))
+ return NULL;
+
+ if (pud_present(*pud) && pud_devmap(*pud))
+ /* pass */;
+ else
+ return NULL;
+
+ if (flags & FOLL_TOUCH)
+ touch_pud(vma, addr, pud, flags);
+
+ /*
+ * device mapped pages can only be returned if the
+ * caller will manage the page reference count.
+ *
+ * At least one of FOLL_GET | FOLL_PIN must be set, so assert that here:
+ */
+ if (!(flags & (FOLL_GET | FOLL_PIN)))
+ return ERR_PTR(-EEXIST);
+
+ pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
+ *pgmap = get_dev_pagemap(pfn, *pgmap);
+ if (!*pgmap)
+ return ERR_PTR(-EFAULT);
+ page = pfn_to_page(pfn);
+ if (!try_grab_page(page, flags))
+ page = ERR_PTR(-ENOMEM);
+
+ return page;
+}
+
+int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
+ struct vm_area_struct *vma)
+{
+ spinlock_t *dst_ptl, *src_ptl;
+ pud_t pud;
+ int ret;
+
+ dst_ptl = pud_lock(dst_mm, dst_pud);
+ src_ptl = pud_lockptr(src_mm, src_pud);
+ spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+
+ ret = -EAGAIN;
+ pud = *src_pud;
+ if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
+ goto out_unlock;
+
+ /*
+ * When page table lock is held, the huge zero pud should not be
+ * under splitting since we don't split the page itself, only pud to
+ * a page table.
+ */
+ if (is_huge_zero_pud(pud)) {
+ /* No huge zero pud yet */
+ }
+
+ /* Please refer to comments in copy_huge_pmd() */
+ if (unlikely(is_cow_mapping(vma->vm_flags) &&
+ atomic_read(&src_mm->has_pinned) &&
+ page_maybe_dma_pinned(pud_page(pud)))) {
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
+ __split_huge_pud(vma, src_pud, addr);
+ return -EAGAIN;
+ }
+
+ pudp_set_wrprotect(src_mm, addr, src_pud);
+ pud = pud_mkold(pud_wrprotect(pud));
+ set_pud_at(dst_mm, addr, dst_pud, pud);
+
+ ret = 0;
+out_unlock:
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
+ return ret;
+}
+
+void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
+{
+ pud_t entry;
+ unsigned long haddr;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
+
+ vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
+ if (unlikely(!pud_same(*vmf->pud, orig_pud)))
+ goto unlock;
+
+ entry = pud_mkyoung(orig_pud);
+ if (write)
+ entry = pud_mkdirty(entry);
+ haddr = vmf->address & HPAGE_PUD_MASK;
+ if (pudp_set_access_flags(vmf->vma, haddr, vmf->pud, entry, write))
+ update_mmu_cache_pud(vmf->vma, vmf->address, vmf->pud);
+
+unlock:
+ spin_unlock(vmf->ptl);
+}
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+
+void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
+{
+ pmd_t entry;
+ unsigned long haddr;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
+
+ vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
+ goto unlock;
+
+ entry = pmd_mkyoung(orig_pmd);
+ if (write)
+ entry = pmd_mkdirty(entry);
+ haddr = vmf->address & HPAGE_PMD_MASK;
+ if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, write))
+ update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd);
+
+unlock:
+ spin_unlock(vmf->ptl);
+}
+
+vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct page *page;
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+
+ vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
+ VM_BUG_ON_VMA(!vma->anon_vma, vma);
+
+ if (is_huge_zero_pmd(orig_pmd))
+ goto fallback;
+
+ spin_lock(vmf->ptl);
+
+ if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+ spin_unlock(vmf->ptl);
+ return 0;
+ }
+
+ page = pmd_page(orig_pmd);
+ VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
+
+ /* Lock page for reuse_swap_page() */
+ if (!trylock_page(page)) {
+ get_page(page);
+ spin_unlock(vmf->ptl);
+ lock_page(page);
+ spin_lock(vmf->ptl);
+ if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+ spin_unlock(vmf->ptl);
+ unlock_page(page);
+ put_page(page);
+ return 0;
+ }
+ put_page(page);
+ }
+
+ /*
+ * We can only reuse the page if nobody else maps the huge page or it's
+ * part.
+ */
+ if (reuse_swap_page(page, NULL)) {
+ pmd_t entry;
+ entry = pmd_mkyoung(orig_pmd);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
+ update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
+ unlock_page(page);
+ spin_unlock(vmf->ptl);
+ return VM_FAULT_WRITE;
+ }
+
+ unlock_page(page);
+ spin_unlock(vmf->ptl);
+fallback:
+ __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
+ return VM_FAULT_FALLBACK;
+}
+
+/*
+ * FOLL_FORCE can write to even unwritable pmd's, but only
+ * after we've gone through a COW cycle and they are dirty.
+ */
+static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
+{
+ return pmd_write(pmd) ||
+ ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
+}
+
+struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
+ unsigned long addr,
+ pmd_t *pmd,
+ unsigned int flags)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct page *page = NULL;
+
+ assert_spin_locked(pmd_lockptr(mm, pmd));
+
+ if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
+ goto out;
+
+ /* Avoid dumping huge zero page */
+ if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
+ return ERR_PTR(-EFAULT);
+
+ /* Full NUMA hinting faults to serialise migration in fault paths */
+ if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
+ goto out;
+
+ page = pmd_page(*pmd);
+ VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
+
+ if (!try_grab_page(page, flags))
+ return ERR_PTR(-ENOMEM);
+
+ if (flags & FOLL_TOUCH)
+ touch_pmd(vma, addr, pmd, flags);
+
+ if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+ /*
+ * We don't mlock() pte-mapped THPs. This way we can avoid
+ * leaking mlocked pages into non-VM_LOCKED VMAs.
+ *
+ * For anon THP:
+ *
+ * In most cases the pmd is the only mapping of the page as we
+ * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
+ * writable private mappings in populate_vma_page_range().
+ *
+ * The only scenario when we have the page shared here is if we
+ * mlocking read-only mapping shared over fork(). We skip
+ * mlocking such pages.
+ *
+ * For file THP:
+ *
+ * We can expect PageDoubleMap() to be stable under page lock:
+ * for file pages we set it in page_add_file_rmap(), which
+ * requires page to be locked.
+ */
+
+ if (PageAnon(page) && compound_mapcount(page) != 1)
+ goto skip_mlock;
+ if (PageDoubleMap(page) || !page->mapping)
+ goto skip_mlock;
+ if (!trylock_page(page))
+ goto skip_mlock;
+ if (page->mapping && !PageDoubleMap(page))
+ mlock_vma_page(page);
+ unlock_page(page);
+ }
+skip_mlock:
+ page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
+ VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
+
+out:
+ return page;
+}
+
+/* NUMA hinting page fault entry point for trans huge pmds */
+vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct anon_vma *anon_vma = NULL;
+ struct page *page;
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+ int page_nid = NUMA_NO_NODE, this_nid = numa_node_id();
+ int target_nid, last_cpupid = -1;
+ bool page_locked;
+ bool migrated = false;
+ bool was_writable;
+ int flags = 0;
+
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_same(pmd, *vmf->pmd)))
+ goto out_unlock;
+
+ /*
+ * If there are potential migrations, wait for completion and retry
+ * without disrupting NUMA hinting information. Do not relock and
+ * check_same as the page may no longer be mapped.
+ */
+ if (unlikely(pmd_trans_migrating(*vmf->pmd))) {
+ page = pmd_page(*vmf->pmd);
+ if (!get_page_unless_zero(page))
+ goto out_unlock;
+ spin_unlock(vmf->ptl);
+ put_and_wait_on_page_locked(page);
+ goto out;
+ }
+
+ page = pmd_page(pmd);
+ BUG_ON(is_huge_zero_page(page));
+ page_nid = page_to_nid(page);
+ last_cpupid = page_cpupid_last(page);
+ count_vm_numa_event(NUMA_HINT_FAULTS);
+ if (page_nid == this_nid) {
+ count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+ flags |= TNF_FAULT_LOCAL;
+ }
+
+ /* See similar comment in do_numa_page for explanation */
+ if (!pmd_savedwrite(pmd))
+ flags |= TNF_NO_GROUP;
+
+ /*
+ * Acquire the page lock to serialise THP migrations but avoid dropping
+ * page_table_lock if at all possible
+ */
+ page_locked = trylock_page(page);
+ target_nid = mpol_misplaced(page, vma, haddr);
+ if (target_nid == NUMA_NO_NODE) {
+ /* If the page was locked, there are no parallel migrations */
+ if (page_locked)
+ goto clear_pmdnuma;
+ }
+
+ /* Migration could have started since the pmd_trans_migrating check */
+ if (!page_locked) {
+ page_nid = NUMA_NO_NODE;
+ if (!get_page_unless_zero(page))
+ goto out_unlock;
+ spin_unlock(vmf->ptl);
+ put_and_wait_on_page_locked(page);
+ goto out;
+ }
+
+ /*
+ * Page is misplaced. Page lock serialises migrations. Acquire anon_vma
+ * to serialises splits
+ */
+ get_page(page);
+ spin_unlock(vmf->ptl);
+ anon_vma = page_lock_anon_vma_read(page);
+
+ /* Confirm the PMD did not change while page_table_lock was released */
+ spin_lock(vmf->ptl);
+ if (unlikely(!pmd_same(pmd, *vmf->pmd))) {
+ unlock_page(page);
+ put_page(page);
+ page_nid = NUMA_NO_NODE;
+ goto out_unlock;
+ }
+
+ /* Bail if we fail to protect against THP splits for any reason */
+ if (unlikely(!anon_vma)) {
+ put_page(page);
+ page_nid = NUMA_NO_NODE;
+ goto clear_pmdnuma;
+ }
+
+ /*
+ * Since we took the NUMA fault, we must have observed the !accessible
+ * bit. Make sure all other CPUs agree with that, to avoid them
+ * modifying the page we're about to migrate.
+ *
+ * Must be done under PTL such that we'll observe the relevant
+ * inc_tlb_flush_pending().
+ *
+ * We are not sure a pending tlb flush here is for a huge page
+ * mapping or not. Hence use the tlb range variant
+ */
+ if (mm_tlb_flush_pending(vma->vm_mm)) {
+ flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
+ /*
+ * change_huge_pmd() released the pmd lock before
+ * invalidating the secondary MMUs sharing the primary
+ * MMU pagetables (with ->invalidate_range()). The
+ * mmu_notifier_invalidate_range_end() (which
+ * internally calls ->invalidate_range()) in
+ * change_pmd_range() will run after us, so we can't
+ * rely on it here and we need an explicit invalidate.
+ */
+ mmu_notifier_invalidate_range(vma->vm_mm, haddr,
+ haddr + HPAGE_PMD_SIZE);
+ }
+
+ /*
+ * Migrate the THP to the requested node, returns with page unlocked
+ * and access rights restored.
+ */
+ spin_unlock(vmf->ptl);
+
+ migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
+ vmf->pmd, pmd, vmf->address, page, target_nid);
+ if (migrated) {
+ flags |= TNF_MIGRATED;
+ page_nid = target_nid;
+ } else
+ flags |= TNF_MIGRATE_FAIL;
+
+ goto out;
+clear_pmdnuma:
+ BUG_ON(!PageLocked(page));
+ was_writable = pmd_savedwrite(pmd);
+ pmd = pmd_modify(pmd, vma->vm_page_prot);
+ pmd = pmd_mkyoung(pmd);
+ if (was_writable)
+ pmd = pmd_mkwrite(pmd);
+ set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
+ update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
+ unlock_page(page);
+out_unlock:
+ spin_unlock(vmf->ptl);
+
+out:
+ if (anon_vma)
+ page_unlock_anon_vma_read(anon_vma);
+
+ if (page_nid != NUMA_NO_NODE)
+ task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
+ flags);
+
+ return 0;
+}
+
+/*
+ * Return true if we do MADV_FREE successfully on entire pmd page.
+ * Otherwise, return false.
+ */
+bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ pmd_t *pmd, unsigned long addr, unsigned long next)
+{
+ spinlock_t *ptl;
+ pmd_t orig_pmd;
+ struct page *page;
+ struct mm_struct *mm = tlb->mm;
+ bool ret = false;
+
+ tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
+
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (!ptl)
+ goto out_unlocked;
+
+ orig_pmd = *pmd;
+ if (is_huge_zero_pmd(orig_pmd))
+ goto out;
+
+ if (unlikely(!pmd_present(orig_pmd))) {
+ VM_BUG_ON(thp_migration_supported() &&
+ !is_pmd_migration_entry(orig_pmd));
+ goto out;
+ }
+
+ page = pmd_page(orig_pmd);
+ /*
+ * If other processes are mapping this page, we couldn't discard
+ * the page unless they all do MADV_FREE so let's skip the page.
+ */
+ if (total_mapcount(page) != 1)
+ goto out;
+
+ if (!trylock_page(page))
+ goto out;
+
+ /*
+ * If user want to discard part-pages of THP, split it so MADV_FREE
+ * will deactivate only them.
+ */
+ if (next - addr != HPAGE_PMD_SIZE) {
+ get_page(page);
+ spin_unlock(ptl);
+ split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ goto out_unlocked;
+ }
+
+ if (PageDirty(page))
+ ClearPageDirty(page);
+ unlock_page(page);
+
+ if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
+ pmdp_invalidate(vma, addr, pmd);
+ orig_pmd = pmd_mkold(orig_pmd);
+ orig_pmd = pmd_mkclean(orig_pmd);
+
+ set_pmd_at(mm, addr, pmd, orig_pmd);
+ tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+ }
+
+ mark_page_lazyfree(page);
+ ret = true;
+out:
+ spin_unlock(ptl);
+out_unlocked:
+ return ret;
+}
+
+static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
+{
+ pgtable_t pgtable;
+
+ pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+ pte_free(mm, pgtable);
+ mm_dec_nr_ptes(mm);
+}
+
+int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ pmd_t *pmd, unsigned long addr)
+{
+ pmd_t orig_pmd;
+ spinlock_t *ptl;
+
+ tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
+
+ ptl = __pmd_trans_huge_lock(pmd, vma);
+ if (!ptl)
+ return 0;
+ /*
+ * For architectures like ppc64 we look at deposited pgtable
+ * when calling pmdp_huge_get_and_clear. So do the
+ * pgtable_trans_huge_withdraw after finishing pmdp related
+ * operations.
+ */
+ orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
+ tlb->fullmm);
+ tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+ if (vma_is_special_huge(vma)) {
+ if (arch_needs_pgtable_deposit())
+ zap_deposited_table(tlb->mm, pmd);
+ spin_unlock(ptl);
+ if (is_huge_zero_pmd(orig_pmd))
+ tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
+ } else if (is_huge_zero_pmd(orig_pmd)) {
+ zap_deposited_table(tlb->mm, pmd);
+ spin_unlock(ptl);
+ tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
+ } else {
+ struct page *page = NULL;
+ int flush_needed = 1;
+
+ if (pmd_present(orig_pmd)) {
+ page = pmd_page(orig_pmd);
+ page_remove_rmap(page, true);
+ VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+ } else if (thp_migration_supported()) {
+ swp_entry_t entry;
+
+ VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
+ entry = pmd_to_swp_entry(orig_pmd);
+ page = migration_entry_to_page(entry);
+ flush_needed = 0;
+ } else
+ WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
+
+ if (PageAnon(page)) {
+ zap_deposited_table(tlb->mm, pmd);
+ add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+ } else {
+ if (arch_needs_pgtable_deposit())
+ zap_deposited_table(tlb->mm, pmd);
+ add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR);
+ }
+
+ spin_unlock(ptl);
+ if (flush_needed)
+ tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
+ }
+ return 1;
+}
+
+#ifndef pmd_move_must_withdraw
+static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
+ spinlock_t *old_pmd_ptl,
+ struct vm_area_struct *vma)
+{
+ /*
+ * With split pmd lock we also need to move preallocated
+ * PTE page table if new_pmd is on different PMD page table.
+ *
+ * We also don't deposit and withdraw tables for file pages.
+ */
+ return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
+}
+#endif
+
+static pmd_t move_soft_dirty_pmd(pmd_t pmd)
+{
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ if (unlikely(is_pmd_migration_entry(pmd)))
+ pmd = pmd_swp_mksoft_dirty(pmd);
+ else if (pmd_present(pmd))
+ pmd = pmd_mksoft_dirty(pmd);
+#endif
+ return pmd;
+}
+
+bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
+ unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
+{
+ spinlock_t *old_ptl, *new_ptl;
+ pmd_t pmd;
+ struct mm_struct *mm = vma->vm_mm;
+ bool force_flush = false;
+
+ /*
+ * The destination pmd shouldn't be established, free_pgtables()
+ * should have release it.
+ */
+ if (WARN_ON(!pmd_none(*new_pmd))) {
+ VM_BUG_ON(pmd_trans_huge(*new_pmd));
+ return false;
+ }
+
+ /*
+ * We don't have to worry about the ordering of src and dst
+ * ptlocks because exclusive mmap_lock prevents deadlock.
+ */
+ old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
+ if (old_ptl) {
+ new_ptl = pmd_lockptr(mm, new_pmd);
+ if (new_ptl != old_ptl)
+ spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+ pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
+ if (pmd_present(pmd))
+ force_flush = true;
+ VM_BUG_ON(!pmd_none(*new_pmd));
+
+ if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
+ pgtable_t pgtable;
+ pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
+ pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
+ }
+ pmd = move_soft_dirty_pmd(pmd);
+ set_pmd_at(mm, new_addr, new_pmd, pmd);
+ if (force_flush)
+ flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
+ if (new_ptl != old_ptl)
+ spin_unlock(new_ptl);
+ spin_unlock(old_ptl);
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Returns
+ * - 0 if PMD could not be locked
+ * - 1 if PMD was locked but protections unchange and TLB flush unnecessary
+ * - HPAGE_PMD_NR is protections changed and TLB flush necessary
+ */
+int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, pgprot_t newprot, unsigned long cp_flags)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ spinlock_t *ptl;
+ pmd_t entry;
+ bool preserve_write;
+ int ret;
+ bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
+ bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
+ bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
+
+ ptl = __pmd_trans_huge_lock(pmd, vma);
+ if (!ptl)
+ return 0;
+
+ preserve_write = prot_numa && pmd_write(*pmd);
+ ret = 1;
+
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+ if (is_swap_pmd(*pmd)) {
+ swp_entry_t entry = pmd_to_swp_entry(*pmd);
+
+ VM_BUG_ON(!is_pmd_migration_entry(*pmd));
+ if (is_write_migration_entry(entry)) {
+ pmd_t newpmd;
+ /*
+ * A protection check is difficult so
+ * just be safe and disable write
+ */
+ make_migration_entry_read(&entry);
+ newpmd = swp_entry_to_pmd(entry);
+ if (pmd_swp_soft_dirty(*pmd))
+ newpmd = pmd_swp_mksoft_dirty(newpmd);
+ if (pmd_swp_uffd_wp(*pmd))
+ newpmd = pmd_swp_mkuffd_wp(newpmd);
+ set_pmd_at(mm, addr, pmd, newpmd);
+ }
+ goto unlock;
+ }
+#endif
+
+ /*
+ * Avoid trapping faults against the zero page. The read-only
+ * data is likely to be read-cached on the local CPU and
+ * local/remote hits to the zero page are not interesting.
+ */
+ if (prot_numa && is_huge_zero_pmd(*pmd))
+ goto unlock;
+
+ if (prot_numa && pmd_protnone(*pmd))
+ goto unlock;
+
+ /*
+ * In case prot_numa, we are under mmap_read_lock(mm). It's critical
+ * to not clear pmd intermittently to avoid race with MADV_DONTNEED
+ * which is also under mmap_read_lock(mm):
+ *
+ * CPU0: CPU1:
+ * change_huge_pmd(prot_numa=1)
+ * pmdp_huge_get_and_clear_notify()
+ * madvise_dontneed()
+ * zap_pmd_range()
+ * pmd_trans_huge(*pmd) == 0 (without ptl)
+ * // skip the pmd
+ * set_pmd_at();
+ * // pmd is re-established
+ *
+ * The race makes MADV_DONTNEED miss the huge pmd and don't clear it
+ * which may break userspace.
+ *
+ * pmdp_invalidate() is required to make sure we don't miss
+ * dirty/young flags set by hardware.
+ */
+ entry = pmdp_invalidate(vma, addr, pmd);
+
+ entry = pmd_modify(entry, newprot);
+ if (preserve_write)
+ entry = pmd_mk_savedwrite(entry);
+ if (uffd_wp) {
+ entry = pmd_wrprotect(entry);
+ entry = pmd_mkuffd_wp(entry);
+ } else if (uffd_wp_resolve) {
+ /*
+ * Leave the write bit to be handled by PF interrupt
+ * handler, then things like COW could be properly
+ * handled.
+ */
+ entry = pmd_clear_uffd_wp(entry);
+ }
+ ret = HPAGE_PMD_NR;
+ set_pmd_at(mm, addr, pmd, entry);
+ BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
+unlock:
+ spin_unlock(ptl);
+ return ret;
+}
+
+/*
+ * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
+ *
+ * Note that if it returns page table lock pointer, this routine returns without
+ * unlocking page table lock. So callers must unlock it.
+ */
+spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
+{
+ spinlock_t *ptl;
+ ptl = pmd_lock(vma->vm_mm, pmd);
+ if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) ||
+ pmd_devmap(*pmd)))
+ return ptl;
+ spin_unlock(ptl);
+ return NULL;
+}
+
+/*
+ * Returns true if a given pud maps a thp, false otherwise.
+ *
+ * Note that if it returns true, this routine returns without unlocking page
+ * table lock. So callers must unlock it.
+ */
+spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
+{
+ spinlock_t *ptl;
+
+ ptl = pud_lock(vma->vm_mm, pud);
+ if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
+ return ptl;
+ spin_unlock(ptl);
+ return NULL;
+}
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ pud_t *pud, unsigned long addr)
+{
+ spinlock_t *ptl;
+
+ ptl = __pud_trans_huge_lock(pud, vma);
+ if (!ptl)
+ return 0;
+ /*
+ * For architectures like ppc64 we look at deposited pgtable
+ * when calling pudp_huge_get_and_clear. So do the
+ * pgtable_trans_huge_withdraw after finishing pudp related
+ * operations.
+ */
+ pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm);
+ tlb_remove_pud_tlb_entry(tlb, pud, addr);
+ if (vma_is_special_huge(vma)) {
+ spin_unlock(ptl);
+ /* No zero page support yet */
+ } else {
+ /* No support for anonymous PUD pages yet */
+ BUG();
+ }
+ return 1;
+}
+
+static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
+ unsigned long haddr)
+{
+ VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
+ VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
+ VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
+ VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));
+
+ count_vm_event(THP_SPLIT_PUD);
+
+ pudp_huge_clear_flush_notify(vma, haddr, pud);
+}
+
+void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
+ unsigned long address)
+{
+ spinlock_t *ptl;
+ struct mmu_notifier_range range;
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ address & HPAGE_PUD_MASK,
+ (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+ ptl = pud_lock(vma->vm_mm, pud);
+ if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
+ goto out;
+ __split_huge_pud_locked(vma, pud, range.start);
+
+out:
+ spin_unlock(ptl);
+ /*
+ * No need to double call mmu_notifier->invalidate_range() callback as
+ * the above pudp_huge_clear_flush_notify() did already call it.
+ */
+ mmu_notifier_invalidate_range_only_end(&range);
+}
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+
+static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
+ unsigned long haddr, pmd_t *pmd)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pgtable_t pgtable;
+ pmd_t _pmd, old_pmd;
+ int i;
+
+ /*
+ * Leave pmd empty until pte is filled note that it is fine to delay
+ * notification until mmu_notifier_invalidate_range_end() as we are
+ * replacing a zero pmd write protected page with a zero pte write
+ * protected page.
+ *
+ * See Documentation/vm/mmu_notifier.rst
+ */
+ old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
+
+ pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+ pmd_populate(mm, &_pmd, pgtable);
+
+ for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ pte_t *pte, entry;
+ entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
+ entry = pte_mkspecial(entry);
+ if (pmd_uffd_wp(old_pmd))
+ entry = pte_mkuffd_wp(entry);
+ pte = pte_offset_map(&_pmd, haddr);
+ VM_BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, haddr, pte, entry);
+ pte_unmap(pte);
+ }
+ smp_wmb(); /* make pte visible before pmd */
+ pmd_populate(mm, pmd, pgtable);
+}
+
+static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long haddr, bool freeze)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct page *page;
+ pgtable_t pgtable;
+ pmd_t old_pmd, _pmd;
+ bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
+ unsigned long addr;
+ int i;
+
+ VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
+ VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
+ VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
+ VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
+ && !pmd_devmap(*pmd));
+
+ count_vm_event(THP_SPLIT_PMD);
+
+ if (!vma_is_anonymous(vma)) {
+ old_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+ /*
+ * We are going to unmap this huge page. So
+ * just go ahead and zap it
+ */
+ if (arch_needs_pgtable_deposit())
+ zap_deposited_table(mm, pmd);
+ if (vma_is_special_huge(vma))
+ return;
+ if (unlikely(is_pmd_migration_entry(old_pmd))) {
+ swp_entry_t entry;
+
+ entry = pmd_to_swp_entry(old_pmd);
+ page = migration_entry_to_page(entry);
+ } else {
+ page = pmd_page(old_pmd);
+ if (!PageDirty(page) && pmd_dirty(old_pmd))
+ set_page_dirty(page);
+ if (!PageReferenced(page) && pmd_young(old_pmd))
+ SetPageReferenced(page);
+ page_remove_rmap(page, true);
+ put_page(page);
+ }
+ add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
+ return;
+ }
+
+ if (is_huge_zero_pmd(*pmd)) {
+ /*
+ * FIXME: Do we want to invalidate secondary mmu by calling
+ * mmu_notifier_invalidate_range() see comments below inside
+ * __split_huge_pmd() ?
+ *
+ * We are going from a zero huge page write protected to zero
+ * small page also write protected so it does not seems useful
+ * to invalidate secondary mmu at this time.
+ */
+ return __split_huge_zero_page_pmd(vma, haddr, pmd);
+ }
+
+ /*
+ * Up to this point the pmd is present and huge and userland has the
+ * whole access to the hugepage during the split (which happens in
+ * place). If we overwrite the pmd with the not-huge version pointing
+ * to the pte here (which of course we could if all CPUs were bug
+ * free), userland could trigger a small page size TLB miss on the
+ * small sized TLB while the hugepage TLB entry is still established in
+ * the huge TLB. Some CPU doesn't like that.
+ * See http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
+ * 383 on page 105. Intel should be safe but is also warns that it's
+ * only safe if the permission and cache attributes of the two entries
+ * loaded in the two TLB is identical (which should be the case here).
+ * But it is generally safer to never allow small and huge TLB entries
+ * for the same virtual address to be loaded simultaneously. So instead
+ * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
+ * current pmd notpresent (atomically because here the pmd_trans_huge
+ * must remain set at all times on the pmd until the split is complete
+ * for this pmd), then we flush the SMP TLB and finally we write the
+ * non-huge version of the pmd entry with pmd_populate.
+ */
+ old_pmd = pmdp_invalidate(vma, haddr, pmd);
+
+ pmd_migration = is_pmd_migration_entry(old_pmd);
+ if (unlikely(pmd_migration)) {
+ swp_entry_t entry;
+
+ entry = pmd_to_swp_entry(old_pmd);
+ page = migration_entry_to_page(entry);
+ write = is_write_migration_entry(entry);
+ young = false;
+ soft_dirty = pmd_swp_soft_dirty(old_pmd);
+ uffd_wp = pmd_swp_uffd_wp(old_pmd);
+ } else {
+ page = pmd_page(old_pmd);
+ if (pmd_dirty(old_pmd))
+ SetPageDirty(page);
+ write = pmd_write(old_pmd);
+ young = pmd_young(old_pmd);
+ soft_dirty = pmd_soft_dirty(old_pmd);
+ uffd_wp = pmd_uffd_wp(old_pmd);
+ }
+ VM_BUG_ON_PAGE(!page_count(page), page);
+ page_ref_add(page, HPAGE_PMD_NR - 1);
+
+ /*
+ * Withdraw the table only after we mark the pmd entry invalid.
+ * This's critical for some architectures (Power).
+ */
+ pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+ pmd_populate(mm, &_pmd, pgtable);
+
+ for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
+ pte_t entry, *pte;
+ /*
+ * Note that NUMA hinting access restrictions are not
+ * transferred to avoid any possibility of altering
+ * permissions across VMAs.
+ */
+ if (freeze || pmd_migration) {
+ swp_entry_t swp_entry;
+ swp_entry = make_migration_entry(page + i, write);
+ entry = swp_entry_to_pte(swp_entry);
+ if (soft_dirty)
+ entry = pte_swp_mksoft_dirty(entry);
+ if (uffd_wp)
+ entry = pte_swp_mkuffd_wp(entry);
+ } else {
+ entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
+ entry = maybe_mkwrite(entry, vma);
+ if (!write)
+ entry = pte_wrprotect(entry);
+ if (!young)
+ entry = pte_mkold(entry);
+ if (soft_dirty)
+ entry = pte_mksoft_dirty(entry);
+ if (uffd_wp)
+ entry = pte_mkuffd_wp(entry);
+ }
+ pte = pte_offset_map(&_pmd, addr);
+ BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, addr, pte, entry);
+ if (!pmd_migration)
+ atomic_inc(&page[i]._mapcount);
+ pte_unmap(pte);
+ }
+
+ if (!pmd_migration) {
+ /*
+ * Set PG_double_map before dropping compound_mapcount to avoid
+ * false-negative page_mapped().
+ */
+ if (compound_mapcount(page) > 1 &&
+ !TestSetPageDoubleMap(page)) {
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ atomic_inc(&page[i]._mapcount);
+ }
+
+ lock_page_memcg(page);
+ if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
+ /* Last compound_mapcount is gone. */
+ __dec_lruvec_page_state(page, NR_ANON_THPS);
+ if (TestClearPageDoubleMap(page)) {
+ /* No need in mapcount reference anymore */
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ atomic_dec(&page[i]._mapcount);
+ }
+ }
+ unlock_page_memcg(page);
+ }
+
+ smp_wmb(); /* make pte visible before pmd */
+ pmd_populate(mm, pmd, pgtable);
+
+ if (freeze) {
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ page_remove_rmap(page + i, false);
+ put_page(page + i);
+ }
+ }
+}
+
+void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long address, bool freeze, struct page *page)
+{
+ spinlock_t *ptl;
+ struct mmu_notifier_range range;
+ bool do_unlock_page = false;
+ pmd_t _pmd;
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ address & HPAGE_PMD_MASK,
+ (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+ ptl = pmd_lock(vma->vm_mm, pmd);
+
+ /*
+ * If caller asks to setup a migration entries, we need a page to check
+ * pmd against. Otherwise we can end up replacing wrong page.
+ */
+ VM_BUG_ON(freeze && !page);
+ if (page) {
+ VM_WARN_ON_ONCE(!PageLocked(page));
+ if (page != pmd_page(*pmd))
+ goto out;
+ }
+
+repeat:
+ if (pmd_trans_huge(*pmd)) {
+ if (!page) {
+ page = pmd_page(*pmd);
+ /*
+ * An anonymous page must be locked, to ensure that a
+ * concurrent reuse_swap_page() sees stable mapcount;
+ * but reuse_swap_page() is not used on shmem or file,
+ * and page lock must not be taken when zap_pmd_range()
+ * calls __split_huge_pmd() while i_mmap_lock is held.
+ */
+ if (PageAnon(page)) {
+ if (unlikely(!trylock_page(page))) {
+ get_page(page);
+ _pmd = *pmd;
+ spin_unlock(ptl);
+ lock_page(page);
+ spin_lock(ptl);
+ if (unlikely(!pmd_same(*pmd, _pmd))) {
+ unlock_page(page);
+ put_page(page);
+ page = NULL;
+ goto repeat;
+ }
+ put_page(page);
+ }
+ do_unlock_page = true;
+ }
+ }
+ if (PageMlocked(page))
+ clear_page_mlock(page);
+ } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
+ goto out;
+ __split_huge_pmd_locked(vma, pmd, range.start, freeze);
+out:
+ spin_unlock(ptl);
+ if (do_unlock_page)
+ unlock_page(page);
+ /*
+ * No need to double call mmu_notifier->invalidate_range() callback.
+ * They are 3 cases to consider inside __split_huge_pmd_locked():
+ * 1) pmdp_huge_clear_flush_notify() call invalidate_range() obvious
+ * 2) __split_huge_zero_page_pmd() read only zero page and any write
+ * fault will trigger a flush_notify before pointing to a new page
+ * (it is fine if the secondary mmu keeps pointing to the old zero
+ * page in the meantime)
+ * 3) Split a huge pmd into pte pointing to the same page. No need
+ * to invalidate secondary tlb entry they are all still valid.
+ * any further changes to individual pte will notify. So no need
+ * to call mmu_notifier->invalidate_range()
+ */
+ mmu_notifier_invalidate_range_only_end(&range);
+}
+
+void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
+ bool freeze, struct page *page)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ pgd = pgd_offset(vma->vm_mm, address);
+ if (!pgd_present(*pgd))
+ return;
+
+ p4d = p4d_offset(pgd, address);
+ if (!p4d_present(*p4d))
+ return;
+
+ pud = pud_offset(p4d, address);
+ if (!pud_present(*pud))
+ return;
+
+ pmd = pmd_offset(pud, address);
+
+ __split_huge_pmd(vma, pmd, address, freeze, page);
+}
+
+void vma_adjust_trans_huge(struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end,
+ long adjust_next)
+{
+ /*
+ * If the new start address isn't hpage aligned and it could
+ * previously contain an hugepage: check if we need to split
+ * an huge pmd.
+ */
+ if (start & ~HPAGE_PMD_MASK &&
+ (start & HPAGE_PMD_MASK) >= vma->vm_start &&
+ (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
+ split_huge_pmd_address(vma, start, false, NULL);
+
+ /*
+ * If the new end address isn't hpage aligned and it could
+ * previously contain an hugepage: check if we need to split
+ * an huge pmd.
+ */
+ if (end & ~HPAGE_PMD_MASK &&
+ (end & HPAGE_PMD_MASK) >= vma->vm_start &&
+ (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
+ split_huge_pmd_address(vma, end, false, NULL);
+
+ /*
+ * If we're also updating the vma->vm_next->vm_start, if the new
+ * vm_next->vm_start isn't hpage aligned and it could previously
+ * contain an hugepage: check if we need to split an huge pmd.
+ */
+ if (adjust_next > 0) {
+ struct vm_area_struct *next = vma->vm_next;
+ unsigned long nstart = next->vm_start;
+ nstart += adjust_next;
+ if (nstart & ~HPAGE_PMD_MASK &&
+ (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
+ (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
+ split_huge_pmd_address(next, nstart, false, NULL);
+ }
+}
+
+static void unmap_page(struct page *page)
+{
+ enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_SYNC |
+ TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
+
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+
+ if (PageAnon(page))
+ ttu_flags |= TTU_SPLIT_FREEZE;
+
+ try_to_unmap(page, ttu_flags);
+
+ VM_WARN_ON_ONCE_PAGE(page_mapped(page), page);
+}
+
+static void remap_page(struct page *page, unsigned int nr)
+{
+ int i;
+ if (PageTransHuge(page)) {
+ remove_migration_ptes(page, page, true);
+ } else {
+ for (i = 0; i < nr; i++)
+ remove_migration_ptes(page + i, page + i, true);
+ }
+}
+
+static void __split_huge_page_tail(struct page *head, int tail,
+ struct lruvec *lruvec, struct list_head *list)
+{
+ struct page *page_tail = head + tail;
+
+ VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
+
+ /*
+ * Clone page flags before unfreezing refcount.
+ *
+ * After successful get_page_unless_zero() might follow flags change,
+ * for exmaple lock_page() which set PG_waiters.
+ */
+ page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ page_tail->flags |= (head->flags &
+ ((1L << PG_referenced) |
+ (1L << PG_swapbacked) |
+ (1L << PG_swapcache) |
+ (1L << PG_mlocked) |
+ (1L << PG_uptodate) |
+ (1L << PG_active) |
+ (1L << PG_workingset) |
+ (1L << PG_locked) |
+ (1L << PG_unevictable) |
+#ifdef CONFIG_64BIT
+ (1L << PG_arch_2) |
+#endif
+ (1L << PG_dirty)));
+
+ /* ->mapping in first tail page is compound_mapcount */
+ VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
+ page_tail);
+ page_tail->mapping = head->mapping;
+ page_tail->index = head->index + tail;
+
+ /* Page flags must be visible before we make the page non-compound. */
+ smp_wmb();
+
+ /*
+ * Clear PageTail before unfreezing page refcount.
+ *
+ * After successful get_page_unless_zero() might follow put_page()
+ * which needs correct compound_head().
+ */
+ clear_compound_head(page_tail);
+
+ /* Finally unfreeze refcount. Additional reference from page cache. */
+ page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) ||
+ PageSwapCache(head)));
+
+ if (page_is_young(head))
+ set_page_young(page_tail);
+ if (page_is_idle(head))
+ set_page_idle(page_tail);
+
+ page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
+
+ /*
+ * always add to the tail because some iterators expect new
+ * pages to show after the currently processed elements - e.g.
+ * migrate_pages
+ */
+ lru_add_page_tail(head, page_tail, lruvec, list);
+}
+
+static void __split_huge_page(struct page *page, struct list_head *list,
+ pgoff_t end, unsigned long flags)
+{
+ struct page *head = compound_head(page);
+ pg_data_t *pgdat = page_pgdat(head);
+ struct lruvec *lruvec;
+ struct address_space *swap_cache = NULL;
+ unsigned long offset = 0;
+ unsigned int nr = thp_nr_pages(head);
+ int i;
+
+ lruvec = mem_cgroup_page_lruvec(head, pgdat);
+
+ /* complete memcg works before add pages to LRU */
+ split_page_memcg(head, nr);
+
+ if (PageAnon(head) && PageSwapCache(head)) {
+ swp_entry_t entry = { .val = page_private(head) };
+
+ offset = swp_offset(entry);
+ swap_cache = swap_address_space(entry);
+ xa_lock(&swap_cache->i_pages);
+ }
+
+ for (i = nr - 1; i >= 1; i--) {
+ __split_huge_page_tail(head, i, lruvec, list);
+ /* Some pages can be beyond i_size: drop them from page cache */
+ if (head[i].index >= end) {
+ ClearPageDirty(head + i);
+ __delete_from_page_cache(head + i, NULL);
+ if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
+ shmem_uncharge(head->mapping->host, 1);
+ put_page(head + i);
+ } else if (!PageAnon(page)) {
+ __xa_store(&head->mapping->i_pages, head[i].index,
+ head + i, 0);
+ } else if (swap_cache) {
+ __xa_store(&swap_cache->i_pages, offset + i,
+ head + i, 0);
+ }
+ }
+
+ ClearPageCompound(head);
+
+ split_page_owner(head, nr);
+
+ /* See comment in __split_huge_page_tail() */
+ if (PageAnon(head)) {
+ /* Additional pin to swap cache */
+ if (PageSwapCache(head)) {
+ page_ref_add(head, 2);
+ xa_unlock(&swap_cache->i_pages);
+ } else {
+ page_ref_inc(head);
+ }
+ } else {
+ /* Additional pin to page cache */
+ page_ref_add(head, 2);
+ xa_unlock(&head->mapping->i_pages);
+ }
+
+ spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+
+ remap_page(head, nr);
+
+ if (PageSwapCache(head)) {
+ swp_entry_t entry = { .val = page_private(head) };
+
+ split_swap_cluster(entry);
+ }
+
+ for (i = 0; i < nr; i++) {
+ struct page *subpage = head + i;
+ if (subpage == page)
+ continue;
+ unlock_page(subpage);
+
+ /*
+ * Subpages may be freed if there wasn't any mapping
+ * like if add_to_swap() is running on a lru page that
+ * had its mapping zapped. And freeing these pages
+ * requires taking the lru_lock so we do the put_page
+ * of the tail pages after the split is complete.
+ */
+ put_page(subpage);
+ }
+}
+
+int total_mapcount(struct page *page)
+{
+ int i, compound, nr, ret;
+
+ VM_BUG_ON_PAGE(PageTail(page), page);
+
+ if (likely(!PageCompound(page)))
+ return atomic_read(&page->_mapcount) + 1;
+
+ compound = compound_mapcount(page);
+ nr = compound_nr(page);
+ if (PageHuge(page))
+ return compound;
+ ret = compound;
+ for (i = 0; i < nr; i++)
+ ret += atomic_read(&page[i]._mapcount) + 1;
+ /* File pages has compound_mapcount included in _mapcount */
+ if (!PageAnon(page))
+ return ret - compound * nr;
+ if (PageDoubleMap(page))
+ ret -= nr;
+ return ret;
+}
+
+/*
+ * This calculates accurately how many mappings a transparent hugepage
+ * has (unlike page_mapcount() which isn't fully accurate). This full
+ * accuracy is primarily needed to know if copy-on-write faults can
+ * reuse the page and change the mapping to read-write instead of
+ * copying them. At the same time this returns the total_mapcount too.
+ *
+ * The function returns the highest mapcount any one of the subpages
+ * has. If the return value is one, even if different processes are
+ * mapping different subpages of the transparent hugepage, they can
+ * all reuse it, because each process is reusing a different subpage.
+ *
+ * The total_mapcount is instead counting all virtual mappings of the
+ * subpages. If the total_mapcount is equal to "one", it tells the
+ * caller all mappings belong to the same "mm" and in turn the
+ * anon_vma of the transparent hugepage can become the vma->anon_vma
+ * local one as no other process may be mapping any of the subpages.
+ *
+ * It would be more accurate to replace page_mapcount() with
+ * page_trans_huge_mapcount(), however we only use
+ * page_trans_huge_mapcount() in the copy-on-write faults where we
+ * need full accuracy to avoid breaking page pinning, because
+ * page_trans_huge_mapcount() is slower than page_mapcount().
+ */
+int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
+{
+ int i, ret, _total_mapcount, mapcount;
+
+ /* hugetlbfs shouldn't call it */
+ VM_BUG_ON_PAGE(PageHuge(page), page);
+
+ if (likely(!PageTransCompound(page))) {
+ mapcount = atomic_read(&page->_mapcount) + 1;
+ if (total_mapcount)
+ *total_mapcount = mapcount;
+ return mapcount;
+ }
+
+ page = compound_head(page);
+
+ _total_mapcount = ret = 0;
+ for (i = 0; i < thp_nr_pages(page); i++) {
+ mapcount = atomic_read(&page[i]._mapcount) + 1;
+ ret = max(ret, mapcount);
+ _total_mapcount += mapcount;
+ }
+ if (PageDoubleMap(page)) {
+ ret -= 1;
+ _total_mapcount -= thp_nr_pages(page);
+ }
+ mapcount = compound_mapcount(page);
+ ret += mapcount;
+ _total_mapcount += mapcount;
+ if (total_mapcount)
+ *total_mapcount = _total_mapcount;
+ return ret;
+}
+
+/* Racy check whether the huge page can be split */
+bool can_split_huge_page(struct page *page, int *pextra_pins)
+{
+ int extra_pins;
+
+ /* Additional pins from page cache */
+ if (PageAnon(page))
+ extra_pins = PageSwapCache(page) ? thp_nr_pages(page) : 0;
+ else
+ extra_pins = thp_nr_pages(page);
+ if (pextra_pins)
+ *pextra_pins = extra_pins;
+ return total_mapcount(page) == page_count(page) - extra_pins - 1;
+}
+
+/*
+ * This function splits huge page into normal pages. @page can point to any
+ * subpage of huge page to split. Split doesn't change the position of @page.
+ *
+ * Only caller must hold pin on the @page, otherwise split fails with -EBUSY.
+ * The huge page must be locked.
+ *
+ * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
+ *
+ * Both head page and tail pages will inherit mapping, flags, and so on from
+ * the hugepage.
+ *
+ * GUP pin and PG_locked transferred to @page. Rest subpages can be freed if
+ * they are not mapped.
+ *
+ * Returns 0 if the hugepage is split successfully.
+ * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
+ * us.
+ */
+int split_huge_page_to_list(struct page *page, struct list_head *list)
+{
+ struct page *head = compound_head(page);
+ struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
+ struct deferred_split *ds_queue = get_deferred_split_queue(head);
+ struct anon_vma *anon_vma = NULL;
+ struct address_space *mapping = NULL;
+ int extra_pins, ret;
+ unsigned long flags;
+ pgoff_t end;
+
+ VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
+ VM_BUG_ON_PAGE(!PageLocked(head), head);
+ VM_BUG_ON_PAGE(!PageCompound(head), head);
+
+ if (PageWriteback(head))
+ return -EBUSY;
+
+ if (PageAnon(head)) {
+ /*
+ * The caller does not necessarily hold an mmap_lock that would
+ * prevent the anon_vma disappearing so we first we take a
+ * reference to it and then lock the anon_vma for write. This
+ * is similar to page_lock_anon_vma_read except the write lock
+ * is taken to serialise against parallel split or collapse
+ * operations.
+ */
+ anon_vma = page_get_anon_vma(head);
+ if (!anon_vma) {
+ ret = -EBUSY;
+ goto out;
+ }
+ end = -1;
+ mapping = NULL;
+ anon_vma_lock_write(anon_vma);
+ } else {
+ mapping = head->mapping;
+
+ /* Truncated ? */
+ if (!mapping) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ anon_vma = NULL;
+ i_mmap_lock_read(mapping);
+
+ /*
+ *__split_huge_page() may need to trim off pages beyond EOF:
+ * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
+ * which cannot be nested inside the page tree lock. So note
+ * end now: i_size itself may be changed at any moment, but
+ * head page lock is good enough to serialize the trimming.
+ */
+ end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
+ }
+
+ /*
+ * Racy check if we can split the page, before unmap_page() will
+ * split PMDs
+ */
+ if (!can_split_huge_page(head, &extra_pins)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ unmap_page(head);
+
+ /* prevent PageLRU to go away from under us, and freeze lru stats */
+ spin_lock_irqsave(&pgdata->lru_lock, flags);
+
+ if (mapping) {
+ XA_STATE(xas, &mapping->i_pages, page_index(head));
+
+ /*
+ * Check if the head page is present in page cache.
+ * We assume all tail are present too, if head is there.
+ */
+ xa_lock(&mapping->i_pages);
+ if (xas_load(&xas) != head)
+ goto fail;
+ }
+
+ /* Prevent deferred_split_scan() touching ->_refcount */
+ spin_lock(&ds_queue->split_queue_lock);
+ if (page_ref_freeze(head, 1 + extra_pins)) {
+ if (!list_empty(page_deferred_list(head))) {
+ ds_queue->split_queue_len--;
+ list_del(page_deferred_list(head));
+ }
+ spin_unlock(&ds_queue->split_queue_lock);
+ if (mapping) {
+ if (PageSwapBacked(head))
+ __dec_node_page_state(head, NR_SHMEM_THPS);
+ else
+ __dec_node_page_state(head, NR_FILE_THPS);
+ }
+
+ __split_huge_page(page, list, end, flags);
+ ret = 0;
+ } else {
+ spin_unlock(&ds_queue->split_queue_lock);
+fail:
+ if (mapping)
+ xa_unlock(&mapping->i_pages);
+ spin_unlock_irqrestore(&pgdata->lru_lock, flags);
+ remap_page(head, thp_nr_pages(head));
+ ret = -EBUSY;
+ }
+
+out_unlock:
+ if (anon_vma) {
+ anon_vma_unlock_write(anon_vma);
+ put_anon_vma(anon_vma);
+ }
+ if (mapping)
+ i_mmap_unlock_read(mapping);
+out:
+ count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
+ return ret;
+}
+
+void free_transhuge_page(struct page *page)
+{
+ struct deferred_split *ds_queue = get_deferred_split_queue(page);
+ unsigned long flags;
+
+ spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+ if (!list_empty(page_deferred_list(page))) {
+ ds_queue->split_queue_len--;
+ list_del(page_deferred_list(page));
+ }
+ spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
+ free_compound_page(page);
+}
+
+void deferred_split_huge_page(struct page *page)
+{
+ struct deferred_split *ds_queue = get_deferred_split_queue(page);
+#ifdef CONFIG_MEMCG
+ struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
+#endif
+ unsigned long flags;
+
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+
+ /*
+ * The try_to_unmap() in page reclaim path might reach here too,
+ * this may cause a race condition to corrupt deferred split queue.
+ * And, if page reclaim is already handling the same page, it is
+ * unnecessary to handle it again in shrinker.
+ *
+ * Check PageSwapCache to determine if the page is being
+ * handled by page reclaim since THP swap would add the page into
+ * swap cache before calling try_to_unmap().
+ */
+ if (PageSwapCache(page))
+ return;
+
+ if (!list_empty(page_deferred_list(page)))
+ return;
+
+ spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+ if (list_empty(page_deferred_list(page))) {
+ count_vm_event(THP_DEFERRED_SPLIT_PAGE);
+ list_add_tail(page_deferred_list(page), &ds_queue->split_queue);
+ ds_queue->split_queue_len++;
+#ifdef CONFIG_MEMCG
+ if (memcg)
+ memcg_set_shrinker_bit(memcg, page_to_nid(page),
+ deferred_split_shrinker.id);
+#endif
+ }
+ spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
+}
+
+static unsigned long deferred_split_count(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct pglist_data *pgdata = NODE_DATA(sc->nid);
+ struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
+
+#ifdef CONFIG_MEMCG
+ if (sc->memcg)
+ ds_queue = &sc->memcg->deferred_split_queue;
+#endif
+ return READ_ONCE(ds_queue->split_queue_len);
+}
+
+static unsigned long deferred_split_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct pglist_data *pgdata = NODE_DATA(sc->nid);
+ struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
+ unsigned long flags;
+ LIST_HEAD(list), *pos, *next;
+ struct page *page;
+ int split = 0;
+
+#ifdef CONFIG_MEMCG
+ if (sc->memcg)
+ ds_queue = &sc->memcg->deferred_split_queue;
+#endif
+
+ spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+ /* Take pin on all head pages to avoid freeing them under us */
+ list_for_each_safe(pos, next, &ds_queue->split_queue) {
+ page = list_entry((void *)pos, struct page, mapping);
+ page = compound_head(page);
+ if (get_page_unless_zero(page)) {
+ list_move(page_deferred_list(page), &list);
+ } else {
+ /* We lost race with put_compound_page() */
+ list_del_init(page_deferred_list(page));
+ ds_queue->split_queue_len--;
+ }
+ if (!--sc->nr_to_scan)
+ break;
+ }
+ spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
+
+ list_for_each_safe(pos, next, &list) {
+ page = list_entry((void *)pos, struct page, mapping);
+ if (!trylock_page(page))
+ goto next;
+ /* split_huge_page() removes page from list on success */
+ if (!split_huge_page(page))
+ split++;
+ unlock_page(page);
+next:
+ put_page(page);
+ }
+
+ spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+ list_splice_tail(&list, &ds_queue->split_queue);
+ spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
+
+ /*
+ * Stop shrinker if we didn't split any page, but the queue is empty.
+ * This can happen if pages were freed under us.
+ */
+ if (!split && list_empty(&ds_queue->split_queue))
+ return SHRINK_STOP;
+ return split;
+}
+
+static struct shrinker deferred_split_shrinker = {
+ .count_objects = deferred_split_count,
+ .scan_objects = deferred_split_scan,
+ .seeks = DEFAULT_SEEKS,
+ .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE |
+ SHRINKER_NONSLAB,
+};
+
+#ifdef CONFIG_DEBUG_FS
+static int split_huge_pages_set(void *data, u64 val)
+{
+ struct zone *zone;
+ struct page *page;
+ unsigned long pfn, max_zone_pfn;
+ unsigned long total = 0, split = 0;
+
+ if (val != 1)
+ return -EINVAL;
+
+ for_each_populated_zone(zone) {
+ max_zone_pfn = zone_end_pfn(zone);
+ for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
+ if (!pfn_valid(pfn))
+ continue;
+
+ page = pfn_to_page(pfn);
+ if (!get_page_unless_zero(page))
+ continue;
+
+ if (zone != page_zone(page))
+ goto next;
+
+ if (!PageHead(page) || PageHuge(page) || !PageLRU(page))
+ goto next;
+
+ total++;
+ lock_page(page);
+ if (!split_huge_page(page))
+ split++;
+ unlock_page(page);
+next:
+ put_page(page);
+ }
+ }
+
+ pr_info("%lu of %lu THP split\n", split, total);
+
+ return 0;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
+ "%llu\n");
+
+static int __init split_huge_pages_debugfs(void)
+{
+ debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
+ &split_huge_pages_fops);
+ return 0;
+}
+late_initcall(split_huge_pages_debugfs);
+#endif
+
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
+ struct page *page)
+{
+ struct vm_area_struct *vma = pvmw->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long address = pvmw->address;
+ pmd_t pmdval;
+ swp_entry_t entry;
+ pmd_t pmdswp;
+
+ if (!(pvmw->pmd && !pvmw->pte))
+ return;
+
+ flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
+ pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
+ if (pmd_dirty(pmdval))
+ set_page_dirty(page);
+ entry = make_migration_entry(page, pmd_write(pmdval));
+ pmdswp = swp_entry_to_pmd(entry);
+ if (pmd_soft_dirty(pmdval))
+ pmdswp = pmd_swp_mksoft_dirty(pmdswp);
+ set_pmd_at(mm, address, pvmw->pmd, pmdswp);
+ page_remove_rmap(page, true);
+ put_page(page);
+}
+
+void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
+{
+ struct vm_area_struct *vma = pvmw->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long address = pvmw->address;
+ unsigned long mmun_start = address & HPAGE_PMD_MASK;
+ pmd_t pmde;
+ swp_entry_t entry;
+
+ if (!(pvmw->pmd && !pvmw->pte))
+ return;
+
+ entry = pmd_to_swp_entry(*pvmw->pmd);
+ get_page(new);
+ pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot));
+ if (pmd_swp_soft_dirty(*pvmw->pmd))
+ pmde = pmd_mksoft_dirty(pmde);
+ if (is_write_migration_entry(entry))
+ pmde = maybe_pmd_mkwrite(pmde, vma);
+ if (pmd_swp_uffd_wp(*pvmw->pmd))
+ pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde));
+
+ flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE);
+ if (PageAnon(new))
+ page_add_anon_rmap(new, vma, mmun_start, true);
+ else
+ page_add_file_rmap(new, true);
+ set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
+ if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new))
+ mlock_vma_page(new);
+ update_mmu_cache_pmd(vma, address, pvmw->pmd);
+}
+#endif
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
new file mode 100644
index 000000000..81949f6d2
--- /dev/null
+++ b/mm/hugetlb.c
@@ -0,0 +1,5788 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Generic hugetlb support.
+ * (C) Nadia Yvette Chambers, April 2004
+ */
+#include <linux/list.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/seq_file.h>
+#include <linux/sysctl.h>
+#include <linux/highmem.h>
+#include <linux/mmu_notifier.h>
+#include <linux/nodemask.h>
+#include <linux/pagemap.h>
+#include <linux/mempolicy.h>
+#include <linux/compiler.h>
+#include <linux/cpuset.h>
+#include <linux/mutex.h>
+#include <linux/memblock.h>
+#include <linux/sysfs.h>
+#include <linux/slab.h>
+#include <linux/sched/mm.h>
+#include <linux/mmdebug.h>
+#include <linux/sched/signal.h>
+#include <linux/rmap.h>
+#include <linux/string_helpers.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/jhash.h>
+#include <linux/numa.h>
+#include <linux/llist.h>
+#include <linux/cma.h>
+
+#include <asm/page.h>
+#include <asm/pgalloc.h>
+#include <asm/tlb.h>
+
+#include <linux/io.h>
+#include <linux/hugetlb.h>
+#include <linux/hugetlb_cgroup.h>
+#include <linux/node.h>
+#include <linux/userfaultfd_k.h>
+#include <linux/page_owner.h>
+#include "internal.h"
+
+int hugetlb_max_hstate __read_mostly;
+unsigned int default_hstate_idx;
+struct hstate hstates[HUGE_MAX_HSTATE];
+
+#ifdef CONFIG_CMA
+static struct cma *hugetlb_cma[MAX_NUMNODES];
+#endif
+static unsigned long hugetlb_cma_size __initdata;
+
+/*
+ * Minimum page order among possible hugepage sizes, set to a proper value
+ * at boot time.
+ */
+static unsigned int minimum_order __read_mostly = UINT_MAX;
+
+__initdata LIST_HEAD(huge_boot_pages);
+
+/* for command line parsing */
+static struct hstate * __initdata parsed_hstate;
+static unsigned long __initdata default_hstate_max_huge_pages;
+static bool __initdata parsed_valid_hugepagesz = true;
+static bool __initdata parsed_default_hugepagesz;
+
+/*
+ * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
+ * free_huge_pages, and surplus_huge_pages.
+ */
+DEFINE_SPINLOCK(hugetlb_lock);
+
+/*
+ * Serializes faults on the same logical page. This is used to
+ * prevent spurious OOMs when the hugepage pool is fully utilized.
+ */
+static int num_fault_mutexes;
+struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
+
+static inline bool PageHugeFreed(struct page *head)
+{
+ return page_private(head + 4) == -1UL;
+}
+
+static inline void SetPageHugeFreed(struct page *head)
+{
+ set_page_private(head + 4, -1UL);
+}
+
+static inline void ClearPageHugeFreed(struct page *head)
+{
+ set_page_private(head + 4, 0);
+}
+
+/* Forward declaration */
+static int hugetlb_acct_memory(struct hstate *h, long delta);
+
+static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
+{
+ bool free = (spool->count == 0) && (spool->used_hpages == 0);
+
+ spin_unlock(&spool->lock);
+
+ /* If no pages are used, and no other handles to the subpool
+ * remain, give up any reservations based on minimum size and
+ * free the subpool */
+ if (free) {
+ if (spool->min_hpages != -1)
+ hugetlb_acct_memory(spool->hstate,
+ -spool->min_hpages);
+ kfree(spool);
+ }
+}
+
+struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
+ long min_hpages)
+{
+ struct hugepage_subpool *spool;
+
+ spool = kzalloc(sizeof(*spool), GFP_KERNEL);
+ if (!spool)
+ return NULL;
+
+ spin_lock_init(&spool->lock);
+ spool->count = 1;
+ spool->max_hpages = max_hpages;
+ spool->hstate = h;
+ spool->min_hpages = min_hpages;
+
+ if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
+ kfree(spool);
+ return NULL;
+ }
+ spool->rsv_hpages = min_hpages;
+
+ return spool;
+}
+
+void hugepage_put_subpool(struct hugepage_subpool *spool)
+{
+ spin_lock(&spool->lock);
+ BUG_ON(!spool->count);
+ spool->count--;
+ unlock_or_release_subpool(spool);
+}
+
+/*
+ * Subpool accounting for allocating and reserving pages.
+ * Return -ENOMEM if there are not enough resources to satisfy the
+ * request. Otherwise, return the number of pages by which the
+ * global pools must be adjusted (upward). The returned value may
+ * only be different than the passed value (delta) in the case where
+ * a subpool minimum size must be maintained.
+ */
+static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
+ long delta)
+{
+ long ret = delta;
+
+ if (!spool)
+ return ret;
+
+ spin_lock(&spool->lock);
+
+ if (spool->max_hpages != -1) { /* maximum size accounting */
+ if ((spool->used_hpages + delta) <= spool->max_hpages)
+ spool->used_hpages += delta;
+ else {
+ ret = -ENOMEM;
+ goto unlock_ret;
+ }
+ }
+
+ /* minimum size accounting */
+ if (spool->min_hpages != -1 && spool->rsv_hpages) {
+ if (delta > spool->rsv_hpages) {
+ /*
+ * Asking for more reserves than those already taken on
+ * behalf of subpool. Return difference.
+ */
+ ret = delta - spool->rsv_hpages;
+ spool->rsv_hpages = 0;
+ } else {
+ ret = 0; /* reserves already accounted for */
+ spool->rsv_hpages -= delta;
+ }
+ }
+
+unlock_ret:
+ spin_unlock(&spool->lock);
+ return ret;
+}
+
+/*
+ * Subpool accounting for freeing and unreserving pages.
+ * Return the number of global page reservations that must be dropped.
+ * The return value may only be different than the passed value (delta)
+ * in the case where a subpool minimum size must be maintained.
+ */
+static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
+ long delta)
+{
+ long ret = delta;
+
+ if (!spool)
+ return delta;
+
+ spin_lock(&spool->lock);
+
+ if (spool->max_hpages != -1) /* maximum size accounting */
+ spool->used_hpages -= delta;
+
+ /* minimum size accounting */
+ if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
+ if (spool->rsv_hpages + delta <= spool->min_hpages)
+ ret = 0;
+ else
+ ret = spool->rsv_hpages + delta - spool->min_hpages;
+
+ spool->rsv_hpages += delta;
+ if (spool->rsv_hpages > spool->min_hpages)
+ spool->rsv_hpages = spool->min_hpages;
+ }
+
+ /*
+ * If hugetlbfs_put_super couldn't free spool due to an outstanding
+ * quota reference, free it now.
+ */
+ unlock_or_release_subpool(spool);
+
+ return ret;
+}
+
+static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
+{
+ return HUGETLBFS_SB(inode->i_sb)->spool;
+}
+
+static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
+{
+ return subpool_inode(file_inode(vma->vm_file));
+}
+
+/* Helper that removes a struct file_region from the resv_map cache and returns
+ * it for use.
+ */
+static struct file_region *
+get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
+{
+ struct file_region *nrg = NULL;
+
+ VM_BUG_ON(resv->region_cache_count <= 0);
+
+ resv->region_cache_count--;
+ nrg = list_first_entry(&resv->region_cache, struct file_region, link);
+ list_del(&nrg->link);
+
+ nrg->from = from;
+ nrg->to = to;
+
+ return nrg;
+}
+
+static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg,
+ struct file_region *rg)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+ nrg->reservation_counter = rg->reservation_counter;
+ nrg->css = rg->css;
+ if (rg->css)
+ css_get(rg->css);
+#endif
+}
+
+/* Helper that records hugetlb_cgroup uncharge info. */
+static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
+ struct hstate *h,
+ struct resv_map *resv,
+ struct file_region *nrg)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+ if (h_cg) {
+ nrg->reservation_counter =
+ &h_cg->rsvd_hugepage[hstate_index(h)];
+ nrg->css = &h_cg->css;
+ /*
+ * The caller will hold exactly one h_cg->css reference for the
+ * whole contiguous reservation region. But this area might be
+ * scattered when there are already some file_regions reside in
+ * it. As a result, many file_regions may share only one css
+ * reference. In order to ensure that one file_region must hold
+ * exactly one h_cg->css reference, we should do css_get for
+ * each file_region and leave the reference held by caller
+ * untouched.
+ */
+ css_get(&h_cg->css);
+ if (!resv->pages_per_hpage)
+ resv->pages_per_hpage = pages_per_huge_page(h);
+ /* pages_per_hpage should be the same for all entries in
+ * a resv_map.
+ */
+ VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h));
+ } else {
+ nrg->reservation_counter = NULL;
+ nrg->css = NULL;
+ }
+#endif
+}
+
+static void put_uncharge_info(struct file_region *rg)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+ if (rg->css)
+ css_put(rg->css);
+#endif
+}
+
+static bool has_same_uncharge_info(struct file_region *rg,
+ struct file_region *org)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+ return rg && org &&
+ rg->reservation_counter == org->reservation_counter &&
+ rg->css == org->css;
+
+#else
+ return true;
+#endif
+}
+
+static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
+{
+ struct file_region *nrg = NULL, *prg = NULL;
+
+ prg = list_prev_entry(rg, link);
+ if (&prg->link != &resv->regions && prg->to == rg->from &&
+ has_same_uncharge_info(prg, rg)) {
+ prg->to = rg->to;
+
+ list_del(&rg->link);
+ put_uncharge_info(rg);
+ kfree(rg);
+
+ rg = prg;
+ }
+
+ nrg = list_next_entry(rg, link);
+ if (&nrg->link != &resv->regions && nrg->from == rg->to &&
+ has_same_uncharge_info(nrg, rg)) {
+ nrg->from = rg->from;
+
+ list_del(&rg->link);
+ put_uncharge_info(rg);
+ kfree(rg);
+ }
+}
+
+/*
+ * Must be called with resv->lock held.
+ *
+ * Calling this with regions_needed != NULL will count the number of pages
+ * to be added but will not modify the linked list. And regions_needed will
+ * indicate the number of file_regions needed in the cache to carry out to add
+ * the regions for this range.
+ */
+static long add_reservation_in_range(struct resv_map *resv, long f, long t,
+ struct hugetlb_cgroup *h_cg,
+ struct hstate *h, long *regions_needed)
+{
+ long add = 0;
+ struct list_head *head = &resv->regions;
+ long last_accounted_offset = f;
+ struct file_region *rg = NULL, *trg = NULL, *nrg = NULL;
+
+ if (regions_needed)
+ *regions_needed = 0;
+
+ /* In this loop, we essentially handle an entry for the range
+ * [last_accounted_offset, rg->from), at every iteration, with some
+ * bounds checking.
+ */
+ list_for_each_entry_safe(rg, trg, head, link) {
+ /* Skip irrelevant regions that start before our range. */
+ if (rg->from < f) {
+ /* If this region ends after the last accounted offset,
+ * then we need to update last_accounted_offset.
+ */
+ if (rg->to > last_accounted_offset)
+ last_accounted_offset = rg->to;
+ continue;
+ }
+
+ /* When we find a region that starts beyond our range, we've
+ * finished.
+ */
+ if (rg->from > t)
+ break;
+
+ /* Add an entry for last_accounted_offset -> rg->from, and
+ * update last_accounted_offset.
+ */
+ if (rg->from > last_accounted_offset) {
+ add += rg->from - last_accounted_offset;
+ if (!regions_needed) {
+ nrg = get_file_region_entry_from_cache(
+ resv, last_accounted_offset, rg->from);
+ record_hugetlb_cgroup_uncharge_info(h_cg, h,
+ resv, nrg);
+ list_add(&nrg->link, rg->link.prev);
+ coalesce_file_region(resv, nrg);
+ } else
+ *regions_needed += 1;
+ }
+
+ last_accounted_offset = rg->to;
+ }
+
+ /* Handle the case where our range extends beyond
+ * last_accounted_offset.
+ */
+ if (last_accounted_offset < t) {
+ add += t - last_accounted_offset;
+ if (!regions_needed) {
+ nrg = get_file_region_entry_from_cache(
+ resv, last_accounted_offset, t);
+ record_hugetlb_cgroup_uncharge_info(h_cg, h, resv, nrg);
+ list_add(&nrg->link, rg->link.prev);
+ coalesce_file_region(resv, nrg);
+ } else
+ *regions_needed += 1;
+ }
+
+ VM_BUG_ON(add < 0);
+ return add;
+}
+
+/* Must be called with resv->lock acquired. Will drop lock to allocate entries.
+ */
+static int allocate_file_region_entries(struct resv_map *resv,
+ int regions_needed)
+ __must_hold(&resv->lock)
+{
+ struct list_head allocated_regions;
+ int to_allocate = 0, i = 0;
+ struct file_region *trg = NULL, *rg = NULL;
+
+ VM_BUG_ON(regions_needed < 0);
+
+ INIT_LIST_HEAD(&allocated_regions);
+
+ /*
+ * Check for sufficient descriptors in the cache to accommodate
+ * the number of in progress add operations plus regions_needed.
+ *
+ * This is a while loop because when we drop the lock, some other call
+ * to region_add or region_del may have consumed some region_entries,
+ * so we keep looping here until we finally have enough entries for
+ * (adds_in_progress + regions_needed).
+ */
+ while (resv->region_cache_count <
+ (resv->adds_in_progress + regions_needed)) {
+ to_allocate = resv->adds_in_progress + regions_needed -
+ resv->region_cache_count;
+
+ /* At this point, we should have enough entries in the cache
+ * for all the existings adds_in_progress. We should only be
+ * needing to allocate for regions_needed.
+ */
+ VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);
+
+ spin_unlock(&resv->lock);
+ for (i = 0; i < to_allocate; i++) {
+ trg = kmalloc(sizeof(*trg), GFP_KERNEL);
+ if (!trg)
+ goto out_of_memory;
+ list_add(&trg->link, &allocated_regions);
+ }
+
+ spin_lock(&resv->lock);
+
+ list_splice(&allocated_regions, &resv->region_cache);
+ resv->region_cache_count += to_allocate;
+ }
+
+ return 0;
+
+out_of_memory:
+ list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
+ list_del(&rg->link);
+ kfree(rg);
+ }
+ return -ENOMEM;
+}
+
+/*
+ * Add the huge page range represented by [f, t) to the reserve
+ * map. Regions will be taken from the cache to fill in this range.
+ * Sufficient regions should exist in the cache due to the previous
+ * call to region_chg with the same range, but in some cases the cache will not
+ * have sufficient entries due to races with other code doing region_add or
+ * region_del. The extra needed entries will be allocated.
+ *
+ * regions_needed is the out value provided by a previous call to region_chg.
+ *
+ * Return the number of new huge pages added to the map. This number is greater
+ * than or equal to zero. If file_region entries needed to be allocated for
+ * this operation and we were not able to allocate, it returns -ENOMEM.
+ * region_add of regions of length 1 never allocate file_regions and cannot
+ * fail; region_chg will always allocate at least 1 entry and a region_add for
+ * 1 page will only require at most 1 entry.
+ */
+static long region_add(struct resv_map *resv, long f, long t,
+ long in_regions_needed, struct hstate *h,
+ struct hugetlb_cgroup *h_cg)
+{
+ long add = 0, actual_regions_needed = 0;
+
+ spin_lock(&resv->lock);
+retry:
+
+ /* Count how many regions are actually needed to execute this add. */
+ add_reservation_in_range(resv, f, t, NULL, NULL,
+ &actual_regions_needed);
+
+ /*
+ * Check for sufficient descriptors in the cache to accommodate
+ * this add operation. Note that actual_regions_needed may be greater
+ * than in_regions_needed, as the resv_map may have been modified since
+ * the region_chg call. In this case, we need to make sure that we
+ * allocate extra entries, such that we have enough for all the
+ * existing adds_in_progress, plus the excess needed for this
+ * operation.
+ */
+ if (actual_regions_needed > in_regions_needed &&
+ resv->region_cache_count <
+ resv->adds_in_progress +
+ (actual_regions_needed - in_regions_needed)) {
+ /* region_add operation of range 1 should never need to
+ * allocate file_region entries.
+ */
+ VM_BUG_ON(t - f <= 1);
+
+ if (allocate_file_region_entries(
+ resv, actual_regions_needed - in_regions_needed)) {
+ return -ENOMEM;
+ }
+
+ goto retry;
+ }
+
+ add = add_reservation_in_range(resv, f, t, h_cg, h, NULL);
+
+ resv->adds_in_progress -= in_regions_needed;
+
+ spin_unlock(&resv->lock);
+ VM_BUG_ON(add < 0);
+ return add;
+}
+
+/*
+ * Examine the existing reserve map and determine how many
+ * huge pages in the specified range [f, t) are NOT currently
+ * represented. This routine is called before a subsequent
+ * call to region_add that will actually modify the reserve
+ * map to add the specified range [f, t). region_chg does
+ * not change the number of huge pages represented by the
+ * map. A number of new file_region structures is added to the cache as a
+ * placeholder, for the subsequent region_add call to use. At least 1
+ * file_region structure is added.
+ *
+ * out_regions_needed is the number of regions added to the
+ * resv->adds_in_progress. This value needs to be provided to a follow up call
+ * to region_add or region_abort for proper accounting.
+ *
+ * Returns the number of huge pages that need to be added to the existing
+ * reservation map for the range [f, t). This number is greater or equal to
+ * zero. -ENOMEM is returned if a new file_region structure or cache entry
+ * is needed and can not be allocated.
+ */
+static long region_chg(struct resv_map *resv, long f, long t,
+ long *out_regions_needed)
+{
+ long chg = 0;
+
+ spin_lock(&resv->lock);
+
+ /* Count how many hugepages in this range are NOT represented. */
+ chg = add_reservation_in_range(resv, f, t, NULL, NULL,
+ out_regions_needed);
+
+ if (*out_regions_needed == 0)
+ *out_regions_needed = 1;
+
+ if (allocate_file_region_entries(resv, *out_regions_needed))
+ return -ENOMEM;
+
+ resv->adds_in_progress += *out_regions_needed;
+
+ spin_unlock(&resv->lock);
+ return chg;
+}
+
+/*
+ * Abort the in progress add operation. The adds_in_progress field
+ * of the resv_map keeps track of the operations in progress between
+ * calls to region_chg and region_add. Operations are sometimes
+ * aborted after the call to region_chg. In such cases, region_abort
+ * is called to decrement the adds_in_progress counter. regions_needed
+ * is the value returned by the region_chg call, it is used to decrement
+ * the adds_in_progress counter.
+ *
+ * NOTE: The range arguments [f, t) are not needed or used in this
+ * routine. They are kept to make reading the calling code easier as
+ * arguments will match the associated region_chg call.
+ */
+static void region_abort(struct resv_map *resv, long f, long t,
+ long regions_needed)
+{
+ spin_lock(&resv->lock);
+ VM_BUG_ON(!resv->region_cache_count);
+ resv->adds_in_progress -= regions_needed;
+ spin_unlock(&resv->lock);
+}
+
+/*
+ * Delete the specified range [f, t) from the reserve map. If the
+ * t parameter is LONG_MAX, this indicates that ALL regions after f
+ * should be deleted. Locate the regions which intersect [f, t)
+ * and either trim, delete or split the existing regions.
+ *
+ * Returns the number of huge pages deleted from the reserve map.
+ * In the normal case, the return value is zero or more. In the
+ * case where a region must be split, a new region descriptor must
+ * be allocated. If the allocation fails, -ENOMEM will be returned.
+ * NOTE: If the parameter t == LONG_MAX, then we will never split
+ * a region and possibly return -ENOMEM. Callers specifying
+ * t == LONG_MAX do not need to check for -ENOMEM error.
+ */
+static long region_del(struct resv_map *resv, long f, long t)
+{
+ struct list_head *head = &resv->regions;
+ struct file_region *rg, *trg;
+ struct file_region *nrg = NULL;
+ long del = 0;
+
+retry:
+ spin_lock(&resv->lock);
+ list_for_each_entry_safe(rg, trg, head, link) {
+ /*
+ * Skip regions before the range to be deleted. file_region
+ * ranges are normally of the form [from, to). However, there
+ * may be a "placeholder" entry in the map which is of the form
+ * (from, to) with from == to. Check for placeholder entries
+ * at the beginning of the range to be deleted.
+ */
+ if (rg->to <= f && (rg->to != rg->from || rg->to != f))
+ continue;
+
+ if (rg->from >= t)
+ break;
+
+ if (f > rg->from && t < rg->to) { /* Must split region */
+ /*
+ * Check for an entry in the cache before dropping
+ * lock and attempting allocation.
+ */
+ if (!nrg &&
+ resv->region_cache_count > resv->adds_in_progress) {
+ nrg = list_first_entry(&resv->region_cache,
+ struct file_region,
+ link);
+ list_del(&nrg->link);
+ resv->region_cache_count--;
+ }
+
+ if (!nrg) {
+ spin_unlock(&resv->lock);
+ nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
+ if (!nrg)
+ return -ENOMEM;
+ goto retry;
+ }
+
+ del += t - f;
+ hugetlb_cgroup_uncharge_file_region(
+ resv, rg, t - f, false);
+
+ /* New entry for end of split region */
+ nrg->from = t;
+ nrg->to = rg->to;
+
+ copy_hugetlb_cgroup_uncharge_info(nrg, rg);
+
+ INIT_LIST_HEAD(&nrg->link);
+
+ /* Original entry is trimmed */
+ rg->to = f;
+
+ list_add(&nrg->link, &rg->link);
+ nrg = NULL;
+ break;
+ }
+
+ if (f <= rg->from && t >= rg->to) { /* Remove entire region */
+ del += rg->to - rg->from;
+ hugetlb_cgroup_uncharge_file_region(resv, rg,
+ rg->to - rg->from, true);
+ list_del(&rg->link);
+ kfree(rg);
+ continue;
+ }
+
+ if (f <= rg->from) { /* Trim beginning of region */
+ hugetlb_cgroup_uncharge_file_region(resv, rg,
+ t - rg->from, false);
+
+ del += t - rg->from;
+ rg->from = t;
+ } else { /* Trim end of region */
+ hugetlb_cgroup_uncharge_file_region(resv, rg,
+ rg->to - f, false);
+
+ del += rg->to - f;
+ rg->to = f;
+ }
+ }
+
+ spin_unlock(&resv->lock);
+ kfree(nrg);
+ return del;
+}
+
+/*
+ * A rare out of memory error was encountered which prevented removal of
+ * the reserve map region for a page. The huge page itself was free'ed
+ * and removed from the page cache. This routine will adjust the subpool
+ * usage count, and the global reserve count if needed. By incrementing
+ * these counts, the reserve map entry which could not be deleted will
+ * appear as a "reserved" entry instead of simply dangling with incorrect
+ * counts.
+ */
+void hugetlb_fix_reserve_counts(struct inode *inode)
+{
+ struct hugepage_subpool *spool = subpool_inode(inode);
+ long rsv_adjust;
+ bool reserved = false;
+
+ rsv_adjust = hugepage_subpool_get_pages(spool, 1);
+ if (rsv_adjust > 0) {
+ struct hstate *h = hstate_inode(inode);
+
+ if (!hugetlb_acct_memory(h, 1))
+ reserved = true;
+ } else if (!rsv_adjust) {
+ reserved = true;
+ }
+
+ if (!reserved)
+ pr_warn("hugetlb: Huge Page Reserved count may go negative.\n");
+}
+
+/*
+ * Count and return the number of huge pages in the reserve map
+ * that intersect with the range [f, t).
+ */
+static long region_count(struct resv_map *resv, long f, long t)
+{
+ struct list_head *head = &resv->regions;
+ struct file_region *rg;
+ long chg = 0;
+
+ spin_lock(&resv->lock);
+ /* Locate each segment we overlap with, and count that overlap. */
+ list_for_each_entry(rg, head, link) {
+ long seg_from;
+ long seg_to;
+
+ if (rg->to <= f)
+ continue;
+ if (rg->from >= t)
+ break;
+
+ seg_from = max(rg->from, f);
+ seg_to = min(rg->to, t);
+
+ chg += seg_to - seg_from;
+ }
+ spin_unlock(&resv->lock);
+
+ return chg;
+}
+
+/*
+ * Convert the address within this vma to the page offset within
+ * the mapping, in pagecache page units; huge pages here.
+ */
+static pgoff_t vma_hugecache_offset(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ return ((address - vma->vm_start) >> huge_page_shift(h)) +
+ (vma->vm_pgoff >> huge_page_order(h));
+}
+
+pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
+ unsigned long address)
+{
+ return vma_hugecache_offset(hstate_vma(vma), vma, address);
+}
+EXPORT_SYMBOL_GPL(linear_hugepage_index);
+
+/*
+ * Return the size of the pages allocated when backing a VMA. In the majority
+ * cases this will be same size as used by the page table entries.
+ */
+unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
+{
+ if (vma->vm_ops && vma->vm_ops->pagesize)
+ return vma->vm_ops->pagesize(vma);
+ return PAGE_SIZE;
+}
+EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
+
+/*
+ * Return the page size being used by the MMU to back a VMA. In the majority
+ * of cases, the page size used by the kernel matches the MMU size. On
+ * architectures where it differs, an architecture-specific 'strong'
+ * version of this symbol is required.
+ */
+__weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
+{
+ return vma_kernel_pagesize(vma);
+}
+
+/*
+ * Flags for MAP_PRIVATE reservations. These are stored in the bottom
+ * bits of the reservation map pointer, which are always clear due to
+ * alignment.
+ */
+#define HPAGE_RESV_OWNER (1UL << 0)
+#define HPAGE_RESV_UNMAPPED (1UL << 1)
+#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
+
+/*
+ * These helpers are used to track how many pages are reserved for
+ * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
+ * is guaranteed to have their future faults succeed.
+ *
+ * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
+ * the reserve counters are updated with the hugetlb_lock held. It is safe
+ * to reset the VMA at fork() time as it is not in use yet and there is no
+ * chance of the global counters getting corrupted as a result of the values.
+ *
+ * The private mapping reservation is represented in a subtly different
+ * manner to a shared mapping. A shared mapping has a region map associated
+ * with the underlying file, this region map represents the backing file
+ * pages which have ever had a reservation assigned which this persists even
+ * after the page is instantiated. A private mapping has a region map
+ * associated with the original mmap which is attached to all VMAs which
+ * reference it, this region map represents those offsets which have consumed
+ * reservation ie. where pages have been instantiated.
+ */
+static unsigned long get_vma_private_data(struct vm_area_struct *vma)
+{
+ return (unsigned long)vma->vm_private_data;
+}
+
+static void set_vma_private_data(struct vm_area_struct *vma,
+ unsigned long value)
+{
+ vma->vm_private_data = (void *)value;
+}
+
+static void
+resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map,
+ struct hugetlb_cgroup *h_cg,
+ struct hstate *h)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+ if (!h_cg || !h) {
+ resv_map->reservation_counter = NULL;
+ resv_map->pages_per_hpage = 0;
+ resv_map->css = NULL;
+ } else {
+ resv_map->reservation_counter =
+ &h_cg->rsvd_hugepage[hstate_index(h)];
+ resv_map->pages_per_hpage = pages_per_huge_page(h);
+ resv_map->css = &h_cg->css;
+ }
+#endif
+}
+
+struct resv_map *resv_map_alloc(void)
+{
+ struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
+ struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);
+
+ if (!resv_map || !rg) {
+ kfree(resv_map);
+ kfree(rg);
+ return NULL;
+ }
+
+ kref_init(&resv_map->refs);
+ spin_lock_init(&resv_map->lock);
+ INIT_LIST_HEAD(&resv_map->regions);
+
+ resv_map->adds_in_progress = 0;
+ /*
+ * Initialize these to 0. On shared mappings, 0's here indicate these
+ * fields don't do cgroup accounting. On private mappings, these will be
+ * re-initialized to the proper values, to indicate that hugetlb cgroup
+ * reservations are to be un-charged from here.
+ */
+ resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL);
+
+ INIT_LIST_HEAD(&resv_map->region_cache);
+ list_add(&rg->link, &resv_map->region_cache);
+ resv_map->region_cache_count = 1;
+
+ return resv_map;
+}
+
+void resv_map_release(struct kref *ref)
+{
+ struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
+ struct list_head *head = &resv_map->region_cache;
+ struct file_region *rg, *trg;
+
+ /* Clear out any active regions before we release the map. */
+ region_del(resv_map, 0, LONG_MAX);
+
+ /* ... and any entries left in the cache */
+ list_for_each_entry_safe(rg, trg, head, link) {
+ list_del(&rg->link);
+ kfree(rg);
+ }
+
+ VM_BUG_ON(resv_map->adds_in_progress);
+
+ kfree(resv_map);
+}
+
+static inline struct resv_map *inode_resv_map(struct inode *inode)
+{
+ /*
+ * At inode evict time, i_mapping may not point to the original
+ * address space within the inode. This original address space
+ * contains the pointer to the resv_map. So, always use the
+ * address space embedded within the inode.
+ * The VERY common case is inode->mapping == &inode->i_data but,
+ * this may not be true for device special inodes.
+ */
+ return (struct resv_map *)(&inode->i_data)->private_data;
+}
+
+static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
+{
+ VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
+ if (vma->vm_flags & VM_MAYSHARE) {
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ struct inode *inode = mapping->host;
+
+ return inode_resv_map(inode);
+
+ } else {
+ return (struct resv_map *)(get_vma_private_data(vma) &
+ ~HPAGE_RESV_MASK);
+ }
+}
+
+static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
+{
+ VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
+ VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
+
+ set_vma_private_data(vma, (get_vma_private_data(vma) &
+ HPAGE_RESV_MASK) | (unsigned long)map);
+}
+
+static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
+{
+ VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
+ VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
+
+ set_vma_private_data(vma, get_vma_private_data(vma) | flags);
+}
+
+static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
+{
+ VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
+
+ return (get_vma_private_data(vma) & flag) != 0;
+}
+
+/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
+void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
+{
+ VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
+ if (!(vma->vm_flags & VM_MAYSHARE))
+ vma->vm_private_data = (void *)0;
+}
+
+/* Returns true if the VMA has associated reserve pages */
+static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
+{
+ if (vma->vm_flags & VM_NORESERVE) {
+ /*
+ * This address is already reserved by other process(chg == 0),
+ * so, we should decrement reserved count. Without decrementing,
+ * reserve count remains after releasing inode, because this
+ * allocated page will go into page cache and is regarded as
+ * coming from reserved pool in releasing step. Currently, we
+ * don't have any other solution to deal with this situation
+ * properly, so add work-around here.
+ */
+ if (vma->vm_flags & VM_MAYSHARE && chg == 0)
+ return true;
+ else
+ return false;
+ }
+
+ /* Shared mappings always use reserves */
+ if (vma->vm_flags & VM_MAYSHARE) {
+ /*
+ * We know VM_NORESERVE is not set. Therefore, there SHOULD
+ * be a region map for all pages. The only situation where
+ * there is no region map is if a hole was punched via
+ * fallocate. In this case, there really are no reserves to
+ * use. This situation is indicated if chg != 0.
+ */
+ if (chg)
+ return false;
+ else
+ return true;
+ }
+
+ /*
+ * Only the process that called mmap() has reserves for
+ * private mappings.
+ */
+ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+ /*
+ * Like the shared case above, a hole punch or truncate
+ * could have been performed on the private mapping.
+ * Examine the value of chg to determine if reserves
+ * actually exist or were previously consumed.
+ * Very Subtle - The value of chg comes from a previous
+ * call to vma_needs_reserves(). The reserve map for
+ * private mappings has different (opposite) semantics
+ * than that of shared mappings. vma_needs_reserves()
+ * has already taken this difference in semantics into
+ * account. Therefore, the meaning of chg is the same
+ * as in the shared case above. Code could easily be
+ * combined, but keeping it separate draws attention to
+ * subtle differences.
+ */
+ if (chg)
+ return false;
+ else
+ return true;
+ }
+
+ return false;
+}
+
+static void enqueue_huge_page(struct hstate *h, struct page *page)
+{
+ int nid = page_to_nid(page);
+ list_move(&page->lru, &h->hugepage_freelists[nid]);
+ h->free_huge_pages++;
+ h->free_huge_pages_node[nid]++;
+ SetPageHugeFreed(page);
+}
+
+static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
+{
+ struct page *page;
+ bool nocma = !!(current->flags & PF_MEMALLOC_NOCMA);
+
+ list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
+ if (nocma && is_migrate_cma_page(page))
+ continue;
+
+ if (PageHWPoison(page))
+ continue;
+
+ list_move(&page->lru, &h->hugepage_activelist);
+ set_page_refcounted(page);
+ ClearPageHugeFreed(page);
+ h->free_huge_pages--;
+ h->free_huge_pages_node[nid]--;
+ return page;
+ }
+
+ return NULL;
+}
+
+static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid,
+ nodemask_t *nmask)
+{
+ unsigned int cpuset_mems_cookie;
+ struct zonelist *zonelist;
+ struct zone *zone;
+ struct zoneref *z;
+ int node = NUMA_NO_NODE;
+
+ zonelist = node_zonelist(nid, gfp_mask);
+
+retry_cpuset:
+ cpuset_mems_cookie = read_mems_allowed_begin();
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) {
+ struct page *page;
+
+ if (!cpuset_zone_allowed(zone, gfp_mask))
+ continue;
+ /*
+ * no need to ask again on the same node. Pool is node rather than
+ * zone aware
+ */
+ if (zone_to_nid(zone) == node)
+ continue;
+ node = zone_to_nid(zone);
+
+ page = dequeue_huge_page_node_exact(h, node);
+ if (page)
+ return page;
+ }
+ if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie)))
+ goto retry_cpuset;
+
+ return NULL;
+}
+
+static struct page *dequeue_huge_page_vma(struct hstate *h,
+ struct vm_area_struct *vma,
+ unsigned long address, int avoid_reserve,
+ long chg)
+{
+ struct page *page;
+ struct mempolicy *mpol;
+ gfp_t gfp_mask;
+ nodemask_t *nodemask;
+ int nid;
+
+ /*
+ * A child process with MAP_PRIVATE mappings created by their parent
+ * have no page reserves. This check ensures that reservations are
+ * not "stolen". The child may still get SIGKILLed
+ */
+ if (!vma_has_reserves(vma, chg) &&
+ h->free_huge_pages - h->resv_huge_pages == 0)
+ goto err;
+
+ /* If reserves cannot be used, ensure enough pages are in the pool */
+ if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
+ goto err;
+
+ gfp_mask = htlb_alloc_mask(h);
+ nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
+ page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
+ if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
+ SetPagePrivate(page);
+ h->resv_huge_pages--;
+ }
+
+ mpol_cond_put(mpol);
+ return page;
+
+err:
+ return NULL;
+}
+
+/*
+ * common helper functions for hstate_next_node_to_{alloc|free}.
+ * We may have allocated or freed a huge page based on a different
+ * nodes_allowed previously, so h->next_node_to_{alloc|free} might
+ * be outside of *nodes_allowed. Ensure that we use an allowed
+ * node for alloc or free.
+ */
+static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
+{
+ nid = next_node_in(nid, *nodes_allowed);
+ VM_BUG_ON(nid >= MAX_NUMNODES);
+
+ return nid;
+}
+
+static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
+{
+ if (!node_isset(nid, *nodes_allowed))
+ nid = next_node_allowed(nid, nodes_allowed);
+ return nid;
+}
+
+/*
+ * returns the previously saved node ["this node"] from which to
+ * allocate a persistent huge page for the pool and advance the
+ * next node from which to allocate, handling wrap at end of node
+ * mask.
+ */
+static int hstate_next_node_to_alloc(struct hstate *h,
+ nodemask_t *nodes_allowed)
+{
+ int nid;
+
+ VM_BUG_ON(!nodes_allowed);
+
+ nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
+ h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
+
+ return nid;
+}
+
+/*
+ * helper for free_pool_huge_page() - return the previously saved
+ * node ["this node"] from which to free a huge page. Advance the
+ * next node id whether or not we find a free huge page to free so
+ * that the next attempt to free addresses the next node.
+ */
+static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
+{
+ int nid;
+
+ VM_BUG_ON(!nodes_allowed);
+
+ nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
+ h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
+
+ return nid;
+}
+
+#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \
+ for (nr_nodes = nodes_weight(*mask); \
+ nr_nodes > 0 && \
+ ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \
+ nr_nodes--)
+
+#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
+ for (nr_nodes = nodes_weight(*mask); \
+ nr_nodes > 0 && \
+ ((node = hstate_next_node_to_free(hs, mask)) || 1); \
+ nr_nodes--)
+
+#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
+static void destroy_compound_gigantic_page(struct page *page,
+ unsigned int order)
+{
+ int i;
+ int nr_pages = 1 << order;
+ struct page *p = page + 1;
+
+ atomic_set(compound_mapcount_ptr(page), 0);
+ atomic_set(compound_pincount_ptr(page), 0);
+
+ for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
+ clear_compound_head(p);
+ set_page_refcounted(p);
+ }
+
+ set_compound_order(page, 0);
+ page[1].compound_nr = 0;
+ __ClearPageHead(page);
+}
+
+static void free_gigantic_page(struct page *page, unsigned int order)
+{
+ /*
+ * If the page isn't allocated using the cma allocator,
+ * cma_release() returns false.
+ */
+#ifdef CONFIG_CMA
+ if (cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order))
+ return;
+#endif
+
+ free_contig_range(page_to_pfn(page), 1 << order);
+}
+
+#ifdef CONFIG_CONTIG_ALLOC
+static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nodemask)
+{
+ unsigned long nr_pages = 1UL << huge_page_order(h);
+ if (nid == NUMA_NO_NODE)
+ nid = numa_mem_id();
+
+#ifdef CONFIG_CMA
+ {
+ struct page *page;
+ int node;
+
+ if (hugetlb_cma[nid]) {
+ page = cma_alloc(hugetlb_cma[nid], nr_pages,
+ huge_page_order(h), true);
+ if (page)
+ return page;
+ }
+
+ if (!(gfp_mask & __GFP_THISNODE)) {
+ for_each_node_mask(node, *nodemask) {
+ if (node == nid || !hugetlb_cma[node])
+ continue;
+
+ page = cma_alloc(hugetlb_cma[node], nr_pages,
+ huge_page_order(h), true);
+ if (page)
+ return page;
+ }
+ }
+ }
+#endif
+
+ return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
+}
+
+#else /* !CONFIG_CONTIG_ALLOC */
+static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nodemask)
+{
+ return NULL;
+}
+#endif /* CONFIG_CONTIG_ALLOC */
+
+#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
+static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nodemask)
+{
+ return NULL;
+}
+static inline void free_gigantic_page(struct page *page, unsigned int order) { }
+static inline void destroy_compound_gigantic_page(struct page *page,
+ unsigned int order) { }
+#endif
+
+static void update_and_free_page(struct hstate *h, struct page *page)
+{
+ int i;
+ struct page *subpage = page;
+
+ if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
+ return;
+
+ h->nr_huge_pages--;
+ h->nr_huge_pages_node[page_to_nid(page)]--;
+ for (i = 0; i < pages_per_huge_page(h);
+ i++, subpage = mem_map_next(subpage, page, i)) {
+ subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
+ 1 << PG_referenced | 1 << PG_dirty |
+ 1 << PG_active | 1 << PG_private |
+ 1 << PG_writeback);
+ }
+ VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
+ VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
+ set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
+ set_page_refcounted(page);
+ if (hstate_is_gigantic(h)) {
+ /*
+ * Temporarily drop the hugetlb_lock, because
+ * we might block in free_gigantic_page().
+ */
+ spin_unlock(&hugetlb_lock);
+ destroy_compound_gigantic_page(page, huge_page_order(h));
+ free_gigantic_page(page, huge_page_order(h));
+ spin_lock(&hugetlb_lock);
+ } else {
+ __free_pages(page, huge_page_order(h));
+ }
+}
+
+struct hstate *size_to_hstate(unsigned long size)
+{
+ struct hstate *h;
+
+ for_each_hstate(h) {
+ if (huge_page_size(h) == size)
+ return h;
+ }
+ return NULL;
+}
+
+/*
+ * Test to determine whether the hugepage is "active/in-use" (i.e. being linked
+ * to hstate->hugepage_activelist.)
+ *
+ * This function can be called for tail pages, but never returns true for them.
+ */
+bool page_huge_active(struct page *page)
+{
+ return PageHeadHuge(page) && PagePrivate(&page[1]);
+}
+
+/* never called for tail page */
+void set_page_huge_active(struct page *page)
+{
+ VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
+ SetPagePrivate(&page[1]);
+}
+
+static void clear_page_huge_active(struct page *page)
+{
+ VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
+ ClearPagePrivate(&page[1]);
+}
+
+/*
+ * Internal hugetlb specific page flag. Do not use outside of the hugetlb
+ * code
+ */
+static inline bool PageHugeTemporary(struct page *page)
+{
+ if (!PageHuge(page))
+ return false;
+
+ return (unsigned long)page[2].mapping == -1U;
+}
+
+static inline void SetPageHugeTemporary(struct page *page)
+{
+ page[2].mapping = (void *)-1U;
+}
+
+static inline void ClearPageHugeTemporary(struct page *page)
+{
+ page[2].mapping = NULL;
+}
+
+static void __free_huge_page(struct page *page)
+{
+ /*
+ * Can't pass hstate in here because it is called from the
+ * compound page destructor.
+ */
+ struct hstate *h = page_hstate(page);
+ int nid = page_to_nid(page);
+ struct hugepage_subpool *spool =
+ (struct hugepage_subpool *)page_private(page);
+ bool restore_reserve;
+
+ VM_BUG_ON_PAGE(page_count(page), page);
+ VM_BUG_ON_PAGE(page_mapcount(page), page);
+
+ set_page_private(page, 0);
+ page->mapping = NULL;
+ restore_reserve = PagePrivate(page);
+ ClearPagePrivate(page);
+
+ /*
+ * If PagePrivate() was set on page, page allocation consumed a
+ * reservation. If the page was associated with a subpool, there
+ * would have been a page reserved in the subpool before allocation
+ * via hugepage_subpool_get_pages(). Since we are 'restoring' the
+ * reservtion, do not call hugepage_subpool_put_pages() as this will
+ * remove the reserved page from the subpool.
+ */
+ if (!restore_reserve) {
+ /*
+ * A return code of zero implies that the subpool will be
+ * under its minimum size if the reservation is not restored
+ * after page is free. Therefore, force restore_reserve
+ * operation.
+ */
+ if (hugepage_subpool_put_pages(spool, 1) == 0)
+ restore_reserve = true;
+ }
+
+ spin_lock(&hugetlb_lock);
+ clear_page_huge_active(page);
+ hugetlb_cgroup_uncharge_page(hstate_index(h),
+ pages_per_huge_page(h), page);
+ hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
+ pages_per_huge_page(h), page);
+ if (restore_reserve)
+ h->resv_huge_pages++;
+
+ if (PageHugeTemporary(page)) {
+ list_del(&page->lru);
+ ClearPageHugeTemporary(page);
+ update_and_free_page(h, page);
+ } else if (h->surplus_huge_pages_node[nid]) {
+ /* remove the page from active list */
+ list_del(&page->lru);
+ update_and_free_page(h, page);
+ h->surplus_huge_pages--;
+ h->surplus_huge_pages_node[nid]--;
+ } else {
+ arch_clear_hugepage_flags(page);
+ enqueue_huge_page(h, page);
+ }
+ spin_unlock(&hugetlb_lock);
+}
+
+/*
+ * As free_huge_page() can be called from a non-task context, we have
+ * to defer the actual freeing in a workqueue to prevent potential
+ * hugetlb_lock deadlock.
+ *
+ * free_hpage_workfn() locklessly retrieves the linked list of pages to
+ * be freed and frees them one-by-one. As the page->mapping pointer is
+ * going to be cleared in __free_huge_page() anyway, it is reused as the
+ * llist_node structure of a lockless linked list of huge pages to be freed.
+ */
+static LLIST_HEAD(hpage_freelist);
+
+static void free_hpage_workfn(struct work_struct *work)
+{
+ struct llist_node *node;
+ struct page *page;
+
+ node = llist_del_all(&hpage_freelist);
+
+ while (node) {
+ page = container_of((struct address_space **)node,
+ struct page, mapping);
+ node = node->next;
+ __free_huge_page(page);
+ }
+}
+static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
+
+void free_huge_page(struct page *page)
+{
+ /*
+ * Defer freeing if in non-task context to avoid hugetlb_lock deadlock.
+ */
+ if (!in_task()) {
+ /*
+ * Only call schedule_work() if hpage_freelist is previously
+ * empty. Otherwise, schedule_work() had been called but the
+ * workfn hasn't retrieved the list yet.
+ */
+ if (llist_add((struct llist_node *)&page->mapping,
+ &hpage_freelist))
+ schedule_work(&free_hpage_work);
+ return;
+ }
+
+ __free_huge_page(page);
+}
+
+static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
+{
+ INIT_LIST_HEAD(&page->lru);
+ set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
+ set_hugetlb_cgroup(page, NULL);
+ set_hugetlb_cgroup_rsvd(page, NULL);
+ spin_lock(&hugetlb_lock);
+ h->nr_huge_pages++;
+ h->nr_huge_pages_node[nid]++;
+ ClearPageHugeFreed(page);
+ spin_unlock(&hugetlb_lock);
+}
+
+static void prep_compound_gigantic_page(struct page *page, unsigned int order)
+{
+ int i;
+ int nr_pages = 1 << order;
+ struct page *p = page + 1;
+
+ /* we rely on prep_new_huge_page to set the destructor */
+ set_compound_order(page, order);
+ __ClearPageReserved(page);
+ __SetPageHead(page);
+ for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
+ /*
+ * For gigantic hugepages allocated through bootmem at
+ * boot, it's safer to be consistent with the not-gigantic
+ * hugepages and clear the PG_reserved bit from all tail pages
+ * too. Otherwise drivers using get_user_pages() to access tail
+ * pages may get the reference counting wrong if they see
+ * PG_reserved set on a tail page (despite the head page not
+ * having PG_reserved set). Enforcing this consistency between
+ * head and tail pages allows drivers to optimize away a check
+ * on the head page when they need know if put_page() is needed
+ * after get_user_pages().
+ */
+ __ClearPageReserved(p);
+ set_page_count(p, 0);
+ set_compound_head(p, page);
+ }
+ atomic_set(compound_mapcount_ptr(page), -1);
+ atomic_set(compound_pincount_ptr(page), 0);
+}
+
+/*
+ * PageHuge() only returns true for hugetlbfs pages, but not for normal or
+ * transparent huge pages. See the PageTransHuge() documentation for more
+ * details.
+ */
+int PageHuge(struct page *page)
+{
+ if (!PageCompound(page))
+ return 0;
+
+ page = compound_head(page);
+ return page[1].compound_dtor == HUGETLB_PAGE_DTOR;
+}
+EXPORT_SYMBOL_GPL(PageHuge);
+
+/*
+ * PageHeadHuge() only returns true for hugetlbfs head page, but not for
+ * normal or transparent huge pages.
+ */
+int PageHeadHuge(struct page *page_head)
+{
+ if (!PageHead(page_head))
+ return 0;
+
+ return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR;
+}
+
+/*
+ * Find and lock address space (mapping) in write mode.
+ *
+ * Upon entry, the page is locked which means that page_mapping() is
+ * stable. Due to locking order, we can only trylock_write. If we can
+ * not get the lock, simply return NULL to caller.
+ */
+struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
+{
+ struct address_space *mapping = page_mapping(hpage);
+
+ if (!mapping)
+ return mapping;
+
+ if (i_mmap_trylock_write(mapping))
+ return mapping;
+
+ return NULL;
+}
+
+pgoff_t hugetlb_basepage_index(struct page *page)
+{
+ struct page *page_head = compound_head(page);
+ pgoff_t index = page_index(page_head);
+ unsigned long compound_idx;
+
+ if (compound_order(page_head) >= MAX_ORDER)
+ compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
+ else
+ compound_idx = page - page_head;
+
+ return (index << compound_order(page_head)) + compound_idx;
+}
+
+static struct page *alloc_buddy_huge_page(struct hstate *h,
+ gfp_t gfp_mask, int nid, nodemask_t *nmask,
+ nodemask_t *node_alloc_noretry)
+{
+ int order = huge_page_order(h);
+ struct page *page;
+ bool alloc_try_hard = true;
+
+ /*
+ * By default we always try hard to allocate the page with
+ * __GFP_RETRY_MAYFAIL flag. However, if we are allocating pages in
+ * a loop (to adjust global huge page counts) and previous allocation
+ * failed, do not continue to try hard on the same node. Use the
+ * node_alloc_noretry bitmap to manage this state information.
+ */
+ if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
+ alloc_try_hard = false;
+ gfp_mask |= __GFP_COMP|__GFP_NOWARN;
+ if (alloc_try_hard)
+ gfp_mask |= __GFP_RETRY_MAYFAIL;
+ if (nid == NUMA_NO_NODE)
+ nid = numa_mem_id();
+ page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
+ if (page)
+ __count_vm_event(HTLB_BUDDY_PGALLOC);
+ else
+ __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+
+ /*
+ * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this
+ * indicates an overall state change. Clear bit so that we resume
+ * normal 'try hard' allocations.
+ */
+ if (node_alloc_noretry && page && !alloc_try_hard)
+ node_clear(nid, *node_alloc_noretry);
+
+ /*
+ * If we tried hard to get a page but failed, set bit so that
+ * subsequent attempts will not try as hard until there is an
+ * overall state change.
+ */
+ if (node_alloc_noretry && !page && alloc_try_hard)
+ node_set(nid, *node_alloc_noretry);
+
+ return page;
+}
+
+/*
+ * Common helper to allocate a fresh hugetlb page. All specific allocators
+ * should use this function to get new hugetlb pages
+ */
+static struct page *alloc_fresh_huge_page(struct hstate *h,
+ gfp_t gfp_mask, int nid, nodemask_t *nmask,
+ nodemask_t *node_alloc_noretry)
+{
+ struct page *page;
+
+ if (hstate_is_gigantic(h))
+ page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
+ else
+ page = alloc_buddy_huge_page(h, gfp_mask,
+ nid, nmask, node_alloc_noretry);
+ if (!page)
+ return NULL;
+
+ if (hstate_is_gigantic(h))
+ prep_compound_gigantic_page(page, huge_page_order(h));
+ prep_new_huge_page(h, page, page_to_nid(page));
+
+ return page;
+}
+
+/*
+ * Allocates a fresh page to the hugetlb allocator pool in the node interleaved
+ * manner.
+ */
+static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
+ nodemask_t *node_alloc_noretry)
+{
+ struct page *page;
+ int nr_nodes, node;
+ gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
+
+ for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
+ page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed,
+ node_alloc_noretry);
+ if (page)
+ break;
+ }
+
+ if (!page)
+ return 0;
+
+ put_page(page); /* free it into the hugepage allocator */
+
+ return 1;
+}
+
+/*
+ * Free huge page from pool from next node to free.
+ * Attempt to keep persistent huge pages more or less
+ * balanced over allowed nodes.
+ * Called with hugetlb_lock locked.
+ */
+static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
+ bool acct_surplus)
+{
+ int nr_nodes, node;
+ int ret = 0;
+
+ for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
+ /*
+ * If we're returning unused surplus pages, only examine
+ * nodes with surplus pages.
+ */
+ if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
+ !list_empty(&h->hugepage_freelists[node])) {
+ struct page *page =
+ list_entry(h->hugepage_freelists[node].next,
+ struct page, lru);
+ list_del(&page->lru);
+ h->free_huge_pages--;
+ h->free_huge_pages_node[node]--;
+ if (acct_surplus) {
+ h->surplus_huge_pages--;
+ h->surplus_huge_pages_node[node]--;
+ }
+ update_and_free_page(h, page);
+ ret = 1;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * Dissolve a given free hugepage into free buddy pages. This function does
+ * nothing for in-use hugepages and non-hugepages.
+ * This function returns values like below:
+ *
+ * -EBUSY: failed to dissolved free hugepages or the hugepage is in-use
+ * (allocated or reserved.)
+ * 0: successfully dissolved free hugepages or the page is not a
+ * hugepage (considered as already dissolved)
+ */
+int dissolve_free_huge_page(struct page *page)
+{
+ int rc = -EBUSY;
+
+retry:
+ /* Not to disrupt normal path by vainly holding hugetlb_lock */
+ if (!PageHuge(page))
+ return 0;
+
+ spin_lock(&hugetlb_lock);
+ if (!PageHuge(page)) {
+ rc = 0;
+ goto out;
+ }
+
+ if (!page_count(page)) {
+ struct page *head = compound_head(page);
+ struct hstate *h = page_hstate(head);
+ int nid = page_to_nid(head);
+ if (h->free_huge_pages - h->resv_huge_pages == 0)
+ goto out;
+
+ /*
+ * We should make sure that the page is already on the free list
+ * when it is dissolved.
+ */
+ if (unlikely(!PageHugeFreed(head))) {
+ spin_unlock(&hugetlb_lock);
+ cond_resched();
+
+ /*
+ * Theoretically, we should return -EBUSY when we
+ * encounter this race. In fact, we have a chance
+ * to successfully dissolve the page if we do a
+ * retry. Because the race window is quite small.
+ * If we seize this opportunity, it is an optimization
+ * for increasing the success rate of dissolving page.
+ */
+ goto retry;
+ }
+
+ /*
+ * Move PageHWPoison flag from head page to the raw error page,
+ * which makes any subpages rather than the error page reusable.
+ */
+ if (PageHWPoison(head) && page != head) {
+ SetPageHWPoison(page);
+ ClearPageHWPoison(head);
+ }
+ list_del(&head->lru);
+ h->free_huge_pages--;
+ h->free_huge_pages_node[nid]--;
+ h->max_huge_pages--;
+ update_and_free_page(h, head);
+ rc = 0;
+ }
+out:
+ spin_unlock(&hugetlb_lock);
+ return rc;
+}
+
+/*
+ * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
+ * make specified memory blocks removable from the system.
+ * Note that this will dissolve a free gigantic hugepage completely, if any
+ * part of it lies within the given range.
+ * Also note that if dissolve_free_huge_page() returns with an error, all
+ * free hugepages that were dissolved before that error are lost.
+ */
+int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned long pfn;
+ struct page *page;
+ int rc = 0;
+
+ if (!hugepages_supported())
+ return rc;
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) {
+ page = pfn_to_page(pfn);
+ rc = dissolve_free_huge_page(page);
+ if (rc)
+ break;
+ }
+
+ return rc;
+}
+
+/*
+ * Allocates a fresh surplus page from the page allocator.
+ */
+static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nmask)
+{
+ struct page *page = NULL;
+
+ if (hstate_is_gigantic(h))
+ return NULL;
+
+ spin_lock(&hugetlb_lock);
+ if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
+ goto out_unlock;
+ spin_unlock(&hugetlb_lock);
+
+ page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
+ if (!page)
+ return NULL;
+
+ spin_lock(&hugetlb_lock);
+ /*
+ * We could have raced with the pool size change.
+ * Double check that and simply deallocate the new page
+ * if we would end up overcommiting the surpluses. Abuse
+ * temporary page to workaround the nasty free_huge_page
+ * codeflow
+ */
+ if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
+ SetPageHugeTemporary(page);
+ spin_unlock(&hugetlb_lock);
+ put_page(page);
+ return NULL;
+ } else {
+ h->surplus_huge_pages++;
+ h->surplus_huge_pages_node[page_to_nid(page)]++;
+ }
+
+out_unlock:
+ spin_unlock(&hugetlb_lock);
+
+ return page;
+}
+
+static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nmask)
+{
+ struct page *page;
+
+ if (hstate_is_gigantic(h))
+ return NULL;
+
+ page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
+ if (!page)
+ return NULL;
+
+ /*
+ * We do not account these pages as surplus because they are only
+ * temporary and will be released properly on the last reference
+ */
+ SetPageHugeTemporary(page);
+
+ return page;
+}
+
+/*
+ * Use the VMA's mpolicy to allocate a huge page from the buddy.
+ */
+static
+struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ struct page *page;
+ struct mempolicy *mpol;
+ gfp_t gfp_mask = htlb_alloc_mask(h);
+ int nid;
+ nodemask_t *nodemask;
+
+ nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
+ page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
+ mpol_cond_put(mpol);
+
+ return page;
+}
+
+/* page migration callback function */
+struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
+ nodemask_t *nmask, gfp_t gfp_mask)
+{
+ spin_lock(&hugetlb_lock);
+ if (h->free_huge_pages - h->resv_huge_pages > 0) {
+ struct page *page;
+
+ page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
+ if (page) {
+ spin_unlock(&hugetlb_lock);
+ return page;
+ }
+ }
+ spin_unlock(&hugetlb_lock);
+
+ return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
+}
+
+/* mempolicy aware migration callback */
+struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
+ unsigned long address)
+{
+ struct mempolicy *mpol;
+ nodemask_t *nodemask;
+ struct page *page;
+ gfp_t gfp_mask;
+ int node;
+
+ gfp_mask = htlb_alloc_mask(h);
+ node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
+ page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask);
+ mpol_cond_put(mpol);
+
+ return page;
+}
+
+/*
+ * Increase the hugetlb pool such that it can accommodate a reservation
+ * of size 'delta'.
+ */
+static int gather_surplus_pages(struct hstate *h, int delta)
+ __must_hold(&hugetlb_lock)
+{
+ struct list_head surplus_list;
+ struct page *page, *tmp;
+ int ret, i;
+ int needed, allocated;
+ bool alloc_ok = true;
+
+ needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
+ if (needed <= 0) {
+ h->resv_huge_pages += delta;
+ return 0;
+ }
+
+ allocated = 0;
+ INIT_LIST_HEAD(&surplus_list);
+
+ ret = -ENOMEM;
+retry:
+ spin_unlock(&hugetlb_lock);
+ for (i = 0; i < needed; i++) {
+ page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
+ NUMA_NO_NODE, NULL);
+ if (!page) {
+ alloc_ok = false;
+ break;
+ }
+ list_add(&page->lru, &surplus_list);
+ cond_resched();
+ }
+ allocated += i;
+
+ /*
+ * After retaking hugetlb_lock, we need to recalculate 'needed'
+ * because either resv_huge_pages or free_huge_pages may have changed.
+ */
+ spin_lock(&hugetlb_lock);
+ needed = (h->resv_huge_pages + delta) -
+ (h->free_huge_pages + allocated);
+ if (needed > 0) {
+ if (alloc_ok)
+ goto retry;
+ /*
+ * We were not able to allocate enough pages to
+ * satisfy the entire reservation so we free what
+ * we've allocated so far.
+ */
+ goto free;
+ }
+ /*
+ * The surplus_list now contains _at_least_ the number of extra pages
+ * needed to accommodate the reservation. Add the appropriate number
+ * of pages to the hugetlb pool and free the extras back to the buddy
+ * allocator. Commit the entire reservation here to prevent another
+ * process from stealing the pages as they are added to the pool but
+ * before they are reserved.
+ */
+ needed += allocated;
+ h->resv_huge_pages += delta;
+ ret = 0;
+
+ /* Free the needed pages to the hugetlb pool */
+ list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
+ if ((--needed) < 0)
+ break;
+ /*
+ * This page is now managed by the hugetlb allocator and has
+ * no users -- drop the buddy allocator's reference.
+ */
+ put_page_testzero(page);
+ VM_BUG_ON_PAGE(page_count(page), page);
+ enqueue_huge_page(h, page);
+ }
+free:
+ spin_unlock(&hugetlb_lock);
+
+ /* Free unnecessary surplus pages to the buddy allocator */
+ list_for_each_entry_safe(page, tmp, &surplus_list, lru)
+ put_page(page);
+ spin_lock(&hugetlb_lock);
+
+ return ret;
+}
+
+/*
+ * This routine has two main purposes:
+ * 1) Decrement the reservation count (resv_huge_pages) by the value passed
+ * in unused_resv_pages. This corresponds to the prior adjustments made
+ * to the associated reservation map.
+ * 2) Free any unused surplus pages that may have been allocated to satisfy
+ * the reservation. As many as unused_resv_pages may be freed.
+ *
+ * Called with hugetlb_lock held. However, the lock could be dropped (and
+ * reacquired) during calls to cond_resched_lock. Whenever dropping the lock,
+ * we must make sure nobody else can claim pages we are in the process of
+ * freeing. Do this by ensuring resv_huge_page always is greater than the
+ * number of huge pages we plan to free when dropping the lock.
+ */
+static void return_unused_surplus_pages(struct hstate *h,
+ unsigned long unused_resv_pages)
+{
+ unsigned long nr_pages;
+
+ /* Cannot return gigantic pages currently */
+ if (hstate_is_gigantic(h))
+ goto out;
+
+ /*
+ * Part (or even all) of the reservation could have been backed
+ * by pre-allocated pages. Only free surplus pages.
+ */
+ nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
+
+ /*
+ * We want to release as many surplus pages as possible, spread
+ * evenly across all nodes with memory. Iterate across these nodes
+ * until we can no longer free unreserved surplus pages. This occurs
+ * when the nodes with surplus pages have no free pages.
+ * free_pool_huge_page() will balance the freed pages across the
+ * on-line nodes with memory and will handle the hstate accounting.
+ *
+ * Note that we decrement resv_huge_pages as we free the pages. If
+ * we drop the lock, resv_huge_pages will still be sufficiently large
+ * to cover subsequent pages we may free.
+ */
+ while (nr_pages--) {
+ h->resv_huge_pages--;
+ unused_resv_pages--;
+ if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
+ goto out;
+ cond_resched_lock(&hugetlb_lock);
+ }
+
+out:
+ /* Fully uncommit the reservation */
+ h->resv_huge_pages -= unused_resv_pages;
+}
+
+
+/*
+ * vma_needs_reservation, vma_commit_reservation and vma_end_reservation
+ * are used by the huge page allocation routines to manage reservations.
+ *
+ * vma_needs_reservation is called to determine if the huge page at addr
+ * within the vma has an associated reservation. If a reservation is
+ * needed, the value 1 is returned. The caller is then responsible for
+ * managing the global reservation and subpool usage counts. After
+ * the huge page has been allocated, vma_commit_reservation is called
+ * to add the page to the reservation map. If the page allocation fails,
+ * the reservation must be ended instead of committed. vma_end_reservation
+ * is called in such cases.
+ *
+ * In the normal case, vma_commit_reservation returns the same value
+ * as the preceding vma_needs_reservation call. The only time this
+ * is not the case is if a reserve map was changed between calls. It
+ * is the responsibility of the caller to notice the difference and
+ * take appropriate action.
+ *
+ * vma_add_reservation is used in error paths where a reservation must
+ * be restored when a newly allocated huge page must be freed. It is
+ * to be called after calling vma_needs_reservation to determine if a
+ * reservation exists.
+ */
+enum vma_resv_mode {
+ VMA_NEEDS_RESV,
+ VMA_COMMIT_RESV,
+ VMA_END_RESV,
+ VMA_ADD_RESV,
+};
+static long __vma_reservation_common(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr,
+ enum vma_resv_mode mode)
+{
+ struct resv_map *resv;
+ pgoff_t idx;
+ long ret;
+ long dummy_out_regions_needed;
+
+ resv = vma_resv_map(vma);
+ if (!resv)
+ return 1;
+
+ idx = vma_hugecache_offset(h, vma, addr);
+ switch (mode) {
+ case VMA_NEEDS_RESV:
+ ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed);
+ /* We assume that vma_reservation_* routines always operate on
+ * 1 page, and that adding to resv map a 1 page entry can only
+ * ever require 1 region.
+ */
+ VM_BUG_ON(dummy_out_regions_needed != 1);
+ break;
+ case VMA_COMMIT_RESV:
+ ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
+ /* region_add calls of range 1 should never fail. */
+ VM_BUG_ON(ret < 0);
+ break;
+ case VMA_END_RESV:
+ region_abort(resv, idx, idx + 1, 1);
+ ret = 0;
+ break;
+ case VMA_ADD_RESV:
+ if (vma->vm_flags & VM_MAYSHARE) {
+ ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
+ /* region_add calls of range 1 should never fail. */
+ VM_BUG_ON(ret < 0);
+ } else {
+ region_abort(resv, idx, idx + 1, 1);
+ ret = region_del(resv, idx, idx + 1);
+ }
+ break;
+ default:
+ BUG();
+ }
+
+ if (vma->vm_flags & VM_MAYSHARE)
+ return ret;
+ else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) {
+ /*
+ * In most cases, reserves always exist for private mappings.
+ * However, a file associated with mapping could have been
+ * hole punched or truncated after reserves were consumed.
+ * As subsequent fault on such a range will not use reserves.
+ * Subtle - The reserve map for private mappings has the
+ * opposite meaning than that of shared mappings. If NO
+ * entry is in the reserve map, it means a reservation exists.
+ * If an entry exists in the reserve map, it means the
+ * reservation has already been consumed. As a result, the
+ * return value of this routine is the opposite of the
+ * value returned from reserve map manipulation routines above.
+ */
+ if (ret)
+ return 0;
+ else
+ return 1;
+ }
+ else
+ return ret < 0 ? ret : 0;
+}
+
+static long vma_needs_reservation(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV);
+}
+
+static long vma_commit_reservation(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV);
+}
+
+static void vma_end_reservation(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
+}
+
+static long vma_add_reservation(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV);
+}
+
+/*
+ * This routine is called to restore a reservation on error paths. In the
+ * specific error paths, a huge page was allocated (via alloc_huge_page)
+ * and is about to be freed. If a reservation for the page existed,
+ * alloc_huge_page would have consumed the reservation and set PagePrivate
+ * in the newly allocated page. When the page is freed via free_huge_page,
+ * the global reservation count will be incremented if PagePrivate is set.
+ * However, free_huge_page can not adjust the reserve map. Adjust the
+ * reserve map here to be consistent with global reserve count adjustments
+ * to be made by free_huge_page.
+ */
+static void restore_reserve_on_error(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long address,
+ struct page *page)
+{
+ if (unlikely(PagePrivate(page))) {
+ long rc = vma_needs_reservation(h, vma, address);
+
+ if (unlikely(rc < 0)) {
+ /*
+ * Rare out of memory condition in reserve map
+ * manipulation. Clear PagePrivate so that
+ * global reserve count will not be incremented
+ * by free_huge_page. This will make it appear
+ * as though the reservation for this page was
+ * consumed. This may prevent the task from
+ * faulting in the page at a later time. This
+ * is better than inconsistent global huge page
+ * accounting of reserve counts.
+ */
+ ClearPagePrivate(page);
+ } else if (rc) {
+ rc = vma_add_reservation(h, vma, address);
+ if (unlikely(rc < 0))
+ /*
+ * See above comment about rare out of
+ * memory condition.
+ */
+ ClearPagePrivate(page);
+ } else
+ vma_end_reservation(h, vma, address);
+ }
+}
+
+struct page *alloc_huge_page(struct vm_area_struct *vma,
+ unsigned long addr, int avoid_reserve)
+{
+ struct hugepage_subpool *spool = subpool_vma(vma);
+ struct hstate *h = hstate_vma(vma);
+ struct page *page;
+ long map_chg, map_commit;
+ long gbl_chg;
+ int ret, idx;
+ struct hugetlb_cgroup *h_cg;
+ bool deferred_reserve;
+
+ idx = hstate_index(h);
+ /*
+ * Examine the region/reserve map to determine if the process
+ * has a reservation for the page to be allocated. A return
+ * code of zero indicates a reservation exists (no change).
+ */
+ map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
+ if (map_chg < 0)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Processes that did not create the mapping will have no
+ * reserves as indicated by the region/reserve map. Check
+ * that the allocation will not exceed the subpool limit.
+ * Allocations for MAP_NORESERVE mappings also need to be
+ * checked against any subpool limit.
+ */
+ if (map_chg || avoid_reserve) {
+ gbl_chg = hugepage_subpool_get_pages(spool, 1);
+ if (gbl_chg < 0) {
+ vma_end_reservation(h, vma, addr);
+ return ERR_PTR(-ENOSPC);
+ }
+
+ /*
+ * Even though there was no reservation in the region/reserve
+ * map, there could be reservations associated with the
+ * subpool that can be used. This would be indicated if the
+ * return value of hugepage_subpool_get_pages() is zero.
+ * However, if avoid_reserve is specified we still avoid even
+ * the subpool reservations.
+ */
+ if (avoid_reserve)
+ gbl_chg = 1;
+ }
+
+ /* If this allocation is not consuming a reservation, charge it now.
+ */
+ deferred_reserve = map_chg || avoid_reserve || !vma_resv_map(vma);
+ if (deferred_reserve) {
+ ret = hugetlb_cgroup_charge_cgroup_rsvd(
+ idx, pages_per_huge_page(h), &h_cg);
+ if (ret)
+ goto out_subpool_put;
+ }
+
+ ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
+ if (ret)
+ goto out_uncharge_cgroup_reservation;
+
+ spin_lock(&hugetlb_lock);
+ /*
+ * glb_chg is passed to indicate whether or not a page must be taken
+ * from the global free pool (global change). gbl_chg == 0 indicates
+ * a reservation exists for the allocation.
+ */
+ page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
+ if (!page) {
+ spin_unlock(&hugetlb_lock);
+ page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
+ if (!page)
+ goto out_uncharge_cgroup;
+ spin_lock(&hugetlb_lock);
+ if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
+ SetPagePrivate(page);
+ h->resv_huge_pages--;
+ }
+ list_add(&page->lru, &h->hugepage_activelist);
+ /* Fall through */
+ }
+ hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
+ /* If allocation is not consuming a reservation, also store the
+ * hugetlb_cgroup pointer on the page.
+ */
+ if (deferred_reserve) {
+ hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
+ h_cg, page);
+ }
+
+ spin_unlock(&hugetlb_lock);
+
+ set_page_private(page, (unsigned long)spool);
+
+ map_commit = vma_commit_reservation(h, vma, addr);
+ if (unlikely(map_chg > map_commit)) {
+ /*
+ * The page was added to the reservation map between
+ * vma_needs_reservation and vma_commit_reservation.
+ * This indicates a race with hugetlb_reserve_pages.
+ * Adjust for the subpool count incremented above AND
+ * in hugetlb_reserve_pages for the same page. Also,
+ * the reservation count added in hugetlb_reserve_pages
+ * no longer applies.
+ */
+ long rsv_adjust;
+
+ rsv_adjust = hugepage_subpool_put_pages(spool, 1);
+ hugetlb_acct_memory(h, -rsv_adjust);
+ if (deferred_reserve)
+ hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
+ pages_per_huge_page(h), page);
+ }
+ return page;
+
+out_uncharge_cgroup:
+ hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
+out_uncharge_cgroup_reservation:
+ if (deferred_reserve)
+ hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
+ h_cg);
+out_subpool_put:
+ if (map_chg || avoid_reserve)
+ hugepage_subpool_put_pages(spool, 1);
+ vma_end_reservation(h, vma, addr);
+ return ERR_PTR(-ENOSPC);
+}
+
+int alloc_bootmem_huge_page(struct hstate *h)
+ __attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
+int __alloc_bootmem_huge_page(struct hstate *h)
+{
+ struct huge_bootmem_page *m;
+ int nr_nodes, node;
+
+ for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
+ void *addr;
+
+ addr = memblock_alloc_try_nid_raw(
+ huge_page_size(h), huge_page_size(h),
+ 0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
+ if (addr) {
+ /*
+ * Use the beginning of the huge page to store the
+ * huge_bootmem_page struct (until gather_bootmem
+ * puts them into the mem_map).
+ */
+ m = addr;
+ goto found;
+ }
+ }
+ return 0;
+
+found:
+ BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h)));
+ /* Put them into a private list first because mem_map is not up yet */
+ INIT_LIST_HEAD(&m->list);
+ list_add(&m->list, &huge_boot_pages);
+ m->hstate = h;
+ return 1;
+}
+
+/*
+ * Put bootmem huge pages into the standard lists after mem_map is up.
+ * Note: This only applies to gigantic (order > MAX_ORDER) pages.
+ */
+static void __init gather_bootmem_prealloc(void)
+{
+ struct huge_bootmem_page *m;
+
+ list_for_each_entry(m, &huge_boot_pages, list) {
+ struct page *page = virt_to_page(m);
+ struct hstate *h = m->hstate;
+
+ VM_BUG_ON(!hstate_is_gigantic(h));
+ WARN_ON(page_count(page) != 1);
+ prep_compound_gigantic_page(page, huge_page_order(h));
+ WARN_ON(PageReserved(page));
+ prep_new_huge_page(h, page, page_to_nid(page));
+ put_page(page); /* free it into the hugepage allocator */
+
+ /*
+ * We need to restore the 'stolen' pages to totalram_pages
+ * in order to fix confusing memory reports from free(1) and
+ * other side-effects, like CommitLimit going negative.
+ */
+ adjust_managed_page_count(page, pages_per_huge_page(h));
+ cond_resched();
+ }
+}
+
+static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
+{
+ unsigned long i;
+ nodemask_t *node_alloc_noretry;
+
+ if (!hstate_is_gigantic(h)) {
+ /*
+ * Bit mask controlling how hard we retry per-node allocations.
+ * Ignore errors as lower level routines can deal with
+ * node_alloc_noretry == NULL. If this kmalloc fails at boot
+ * time, we are likely in bigger trouble.
+ */
+ node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry),
+ GFP_KERNEL);
+ } else {
+ /* allocations done at boot time */
+ node_alloc_noretry = NULL;
+ }
+
+ /* bit mask controlling how hard we retry per-node allocations */
+ if (node_alloc_noretry)
+ nodes_clear(*node_alloc_noretry);
+
+ for (i = 0; i < h->max_huge_pages; ++i) {
+ if (hstate_is_gigantic(h)) {
+ if (hugetlb_cma_size) {
+ pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
+ goto free;
+ }
+ if (!alloc_bootmem_huge_page(h))
+ break;
+ } else if (!alloc_pool_huge_page(h,
+ &node_states[N_MEMORY],
+ node_alloc_noretry))
+ break;
+ cond_resched();
+ }
+ if (i < h->max_huge_pages) {
+ char buf[32];
+
+ string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
+ pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n",
+ h->max_huge_pages, buf, i);
+ h->max_huge_pages = i;
+ }
+free:
+ kfree(node_alloc_noretry);
+}
+
+static void __init hugetlb_init_hstates(void)
+{
+ struct hstate *h;
+
+ for_each_hstate(h) {
+ if (minimum_order > huge_page_order(h))
+ minimum_order = huge_page_order(h);
+
+ /* oversize hugepages were init'ed in early boot */
+ if (!hstate_is_gigantic(h))
+ hugetlb_hstate_alloc_pages(h);
+ }
+ VM_BUG_ON(minimum_order == UINT_MAX);
+}
+
+static void __init report_hugepages(void)
+{
+ struct hstate *h;
+
+ for_each_hstate(h) {
+ char buf[32];
+
+ string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
+ pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
+ buf, h->free_huge_pages);
+ }
+}
+
+#ifdef CONFIG_HIGHMEM
+static void try_to_free_low(struct hstate *h, unsigned long count,
+ nodemask_t *nodes_allowed)
+{
+ int i;
+
+ if (hstate_is_gigantic(h))
+ return;
+
+ for_each_node_mask(i, *nodes_allowed) {
+ struct page *page, *next;
+ struct list_head *freel = &h->hugepage_freelists[i];
+ list_for_each_entry_safe(page, next, freel, lru) {
+ if (count >= h->nr_huge_pages)
+ return;
+ if (PageHighMem(page))
+ continue;
+ list_del(&page->lru);
+ update_and_free_page(h, page);
+ h->free_huge_pages--;
+ h->free_huge_pages_node[page_to_nid(page)]--;
+ }
+ }
+}
+#else
+static inline void try_to_free_low(struct hstate *h, unsigned long count,
+ nodemask_t *nodes_allowed)
+{
+}
+#endif
+
+/*
+ * Increment or decrement surplus_huge_pages. Keep node-specific counters
+ * balanced by operating on them in a round-robin fashion.
+ * Returns 1 if an adjustment was made.
+ */
+static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
+ int delta)
+{
+ int nr_nodes, node;
+
+ VM_BUG_ON(delta != -1 && delta != 1);
+
+ if (delta < 0) {
+ for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
+ if (h->surplus_huge_pages_node[node])
+ goto found;
+ }
+ } else {
+ for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
+ if (h->surplus_huge_pages_node[node] <
+ h->nr_huge_pages_node[node])
+ goto found;
+ }
+ }
+ return 0;
+
+found:
+ h->surplus_huge_pages += delta;
+ h->surplus_huge_pages_node[node] += delta;
+ return 1;
+}
+
+#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
+static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
+ nodemask_t *nodes_allowed)
+{
+ unsigned long min_count, ret;
+ NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
+
+ /*
+ * Bit mask controlling how hard we retry per-node allocations.
+ * If we can not allocate the bit mask, do not attempt to allocate
+ * the requested huge pages.
+ */
+ if (node_alloc_noretry)
+ nodes_clear(*node_alloc_noretry);
+ else
+ return -ENOMEM;
+
+ spin_lock(&hugetlb_lock);
+
+ /*
+ * Check for a node specific request.
+ * Changing node specific huge page count may require a corresponding
+ * change to the global count. In any case, the passed node mask
+ * (nodes_allowed) will restrict alloc/free to the specified node.
+ */
+ if (nid != NUMA_NO_NODE) {
+ unsigned long old_count = count;
+
+ count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
+ /*
+ * User may have specified a large count value which caused the
+ * above calculation to overflow. In this case, they wanted
+ * to allocate as many huge pages as possible. Set count to
+ * largest possible value to align with their intention.
+ */
+ if (count < old_count)
+ count = ULONG_MAX;
+ }
+
+ /*
+ * Gigantic pages runtime allocation depend on the capability for large
+ * page range allocation.
+ * If the system does not provide this feature, return an error when
+ * the user tries to allocate gigantic pages but let the user free the
+ * boottime allocated gigantic pages.
+ */
+ if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
+ if (count > persistent_huge_pages(h)) {
+ spin_unlock(&hugetlb_lock);
+ NODEMASK_FREE(node_alloc_noretry);
+ return -EINVAL;
+ }
+ /* Fall through to decrease pool */
+ }
+
+ /*
+ * Increase the pool size
+ * First take pages out of surplus state. Then make up the
+ * remaining difference by allocating fresh huge pages.
+ *
+ * We might race with alloc_surplus_huge_page() here and be unable
+ * to convert a surplus huge page to a normal huge page. That is
+ * not critical, though, it just means the overall size of the
+ * pool might be one hugepage larger than it needs to be, but
+ * within all the constraints specified by the sysctls.
+ */
+ while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
+ if (!adjust_pool_surplus(h, nodes_allowed, -1))
+ break;
+ }
+
+ while (count > persistent_huge_pages(h)) {
+ /*
+ * If this allocation races such that we no longer need the
+ * page, free_huge_page will handle it by freeing the page
+ * and reducing the surplus.
+ */
+ spin_unlock(&hugetlb_lock);
+
+ /* yield cpu to avoid soft lockup */
+ cond_resched();
+
+ ret = alloc_pool_huge_page(h, nodes_allowed,
+ node_alloc_noretry);
+ spin_lock(&hugetlb_lock);
+ if (!ret)
+ goto out;
+
+ /* Bail for signals. Probably ctrl-c from user */
+ if (signal_pending(current))
+ goto out;
+ }
+
+ /*
+ * Decrease the pool size
+ * First return free pages to the buddy allocator (being careful
+ * to keep enough around to satisfy reservations). Then place
+ * pages into surplus state as needed so the pool will shrink
+ * to the desired size as pages become free.
+ *
+ * By placing pages into the surplus state independent of the
+ * overcommit value, we are allowing the surplus pool size to
+ * exceed overcommit. There are few sane options here. Since
+ * alloc_surplus_huge_page() is checking the global counter,
+ * though, we'll note that we're not allowed to exceed surplus
+ * and won't grow the pool anywhere else. Not until one of the
+ * sysctls are changed, or the surplus pages go out of use.
+ */
+ min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
+ min_count = max(count, min_count);
+ try_to_free_low(h, min_count, nodes_allowed);
+ while (min_count < persistent_huge_pages(h)) {
+ if (!free_pool_huge_page(h, nodes_allowed, 0))
+ break;
+ cond_resched_lock(&hugetlb_lock);
+ }
+ while (count < persistent_huge_pages(h)) {
+ if (!adjust_pool_surplus(h, nodes_allowed, 1))
+ break;
+ }
+out:
+ h->max_huge_pages = persistent_huge_pages(h);
+ spin_unlock(&hugetlb_lock);
+
+ NODEMASK_FREE(node_alloc_noretry);
+
+ return 0;
+}
+
+#define HSTATE_ATTR_RO(_name) \
+ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+
+#define HSTATE_ATTR(_name) \
+ static struct kobj_attribute _name##_attr = \
+ __ATTR(_name, 0644, _name##_show, _name##_store)
+
+static struct kobject *hugepages_kobj;
+static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
+
+static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
+
+static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
+{
+ int i;
+
+ for (i = 0; i < HUGE_MAX_HSTATE; i++)
+ if (hstate_kobjs[i] == kobj) {
+ if (nidp)
+ *nidp = NUMA_NO_NODE;
+ return &hstates[i];
+ }
+
+ return kobj_to_node_hstate(kobj, nidp);
+}
+
+static ssize_t nr_hugepages_show_common(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct hstate *h;
+ unsigned long nr_huge_pages;
+ int nid;
+
+ h = kobj_to_hstate(kobj, &nid);
+ if (nid == NUMA_NO_NODE)
+ nr_huge_pages = h->nr_huge_pages;
+ else
+ nr_huge_pages = h->nr_huge_pages_node[nid];
+
+ return sprintf(buf, "%lu\n", nr_huge_pages);
+}
+
+static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
+ struct hstate *h, int nid,
+ unsigned long count, size_t len)
+{
+ int err;
+ nodemask_t nodes_allowed, *n_mask;
+
+ if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
+ return -EINVAL;
+
+ if (nid == NUMA_NO_NODE) {
+ /*
+ * global hstate attribute
+ */
+ if (!(obey_mempolicy &&
+ init_nodemask_of_mempolicy(&nodes_allowed)))
+ n_mask = &node_states[N_MEMORY];
+ else
+ n_mask = &nodes_allowed;
+ } else {
+ /*
+ * Node specific request. count adjustment happens in
+ * set_max_huge_pages() after acquiring hugetlb_lock.
+ */
+ init_nodemask_of_node(&nodes_allowed, nid);
+ n_mask = &nodes_allowed;
+ }
+
+ err = set_max_huge_pages(h, count, nid, n_mask);
+
+ return err ? err : len;
+}
+
+static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
+ struct kobject *kobj, const char *buf,
+ size_t len)
+{
+ struct hstate *h;
+ unsigned long count;
+ int nid;
+ int err;
+
+ err = kstrtoul(buf, 10, &count);
+ if (err)
+ return err;
+
+ h = kobj_to_hstate(kobj, &nid);
+ return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len);
+}
+
+static ssize_t nr_hugepages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return nr_hugepages_show_common(kobj, attr, buf);
+}
+
+static ssize_t nr_hugepages_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t len)
+{
+ return nr_hugepages_store_common(false, kobj, buf, len);
+}
+HSTATE_ATTR(nr_hugepages);
+
+#ifdef CONFIG_NUMA
+
+/*
+ * hstate attribute for optionally mempolicy-based constraint on persistent
+ * huge page alloc/free.
+ */
+static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return nr_hugepages_show_common(kobj, attr, buf);
+}
+
+static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t len)
+{
+ return nr_hugepages_store_common(true, kobj, buf, len);
+}
+HSTATE_ATTR(nr_hugepages_mempolicy);
+#endif
+
+
+static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct hstate *h = kobj_to_hstate(kobj, NULL);
+ return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
+}
+
+static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ int err;
+ unsigned long input;
+ struct hstate *h = kobj_to_hstate(kobj, NULL);
+
+ if (hstate_is_gigantic(h))
+ return -EINVAL;
+
+ err = kstrtoul(buf, 10, &input);
+ if (err)
+ return err;
+
+ spin_lock(&hugetlb_lock);
+ h->nr_overcommit_huge_pages = input;
+ spin_unlock(&hugetlb_lock);
+
+ return count;
+}
+HSTATE_ATTR(nr_overcommit_hugepages);
+
+static ssize_t free_hugepages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct hstate *h;
+ unsigned long free_huge_pages;
+ int nid;
+
+ h = kobj_to_hstate(kobj, &nid);
+ if (nid == NUMA_NO_NODE)
+ free_huge_pages = h->free_huge_pages;
+ else
+ free_huge_pages = h->free_huge_pages_node[nid];
+
+ return sprintf(buf, "%lu\n", free_huge_pages);
+}
+HSTATE_ATTR_RO(free_hugepages);
+
+static ssize_t resv_hugepages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct hstate *h = kobj_to_hstate(kobj, NULL);
+ return sprintf(buf, "%lu\n", h->resv_huge_pages);
+}
+HSTATE_ATTR_RO(resv_hugepages);
+
+static ssize_t surplus_hugepages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct hstate *h;
+ unsigned long surplus_huge_pages;
+ int nid;
+
+ h = kobj_to_hstate(kobj, &nid);
+ if (nid == NUMA_NO_NODE)
+ surplus_huge_pages = h->surplus_huge_pages;
+ else
+ surplus_huge_pages = h->surplus_huge_pages_node[nid];
+
+ return sprintf(buf, "%lu\n", surplus_huge_pages);
+}
+HSTATE_ATTR_RO(surplus_hugepages);
+
+static struct attribute *hstate_attrs[] = {
+ &nr_hugepages_attr.attr,
+ &nr_overcommit_hugepages_attr.attr,
+ &free_hugepages_attr.attr,
+ &resv_hugepages_attr.attr,
+ &surplus_hugepages_attr.attr,
+#ifdef CONFIG_NUMA
+ &nr_hugepages_mempolicy_attr.attr,
+#endif
+ NULL,
+};
+
+static const struct attribute_group hstate_attr_group = {
+ .attrs = hstate_attrs,
+};
+
+static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
+ struct kobject **hstate_kobjs,
+ const struct attribute_group *hstate_attr_group)
+{
+ int retval;
+ int hi = hstate_index(h);
+
+ hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
+ if (!hstate_kobjs[hi])
+ return -ENOMEM;
+
+ retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
+ if (retval) {
+ kobject_put(hstate_kobjs[hi]);
+ hstate_kobjs[hi] = NULL;
+ }
+
+ return retval;
+}
+
+static void __init hugetlb_sysfs_init(void)
+{
+ struct hstate *h;
+ int err;
+
+ hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
+ if (!hugepages_kobj)
+ return;
+
+ for_each_hstate(h) {
+ err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
+ hstate_kobjs, &hstate_attr_group);
+ if (err)
+ pr_err("HugeTLB: Unable to add hstate %s", h->name);
+ }
+}
+
+#ifdef CONFIG_NUMA
+
+/*
+ * node_hstate/s - associate per node hstate attributes, via their kobjects,
+ * with node devices in node_devices[] using a parallel array. The array
+ * index of a node device or _hstate == node id.
+ * This is here to avoid any static dependency of the node device driver, in
+ * the base kernel, on the hugetlb module.
+ */
+struct node_hstate {
+ struct kobject *hugepages_kobj;
+ struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
+};
+static struct node_hstate node_hstates[MAX_NUMNODES];
+
+/*
+ * A subset of global hstate attributes for node devices
+ */
+static struct attribute *per_node_hstate_attrs[] = {
+ &nr_hugepages_attr.attr,
+ &free_hugepages_attr.attr,
+ &surplus_hugepages_attr.attr,
+ NULL,
+};
+
+static const struct attribute_group per_node_hstate_attr_group = {
+ .attrs = per_node_hstate_attrs,
+};
+
+/*
+ * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj.
+ * Returns node id via non-NULL nidp.
+ */
+static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
+{
+ int nid;
+
+ for (nid = 0; nid < nr_node_ids; nid++) {
+ struct node_hstate *nhs = &node_hstates[nid];
+ int i;
+ for (i = 0; i < HUGE_MAX_HSTATE; i++)
+ if (nhs->hstate_kobjs[i] == kobj) {
+ if (nidp)
+ *nidp = nid;
+ return &hstates[i];
+ }
+ }
+
+ BUG();
+ return NULL;
+}
+
+/*
+ * Unregister hstate attributes from a single node device.
+ * No-op if no hstate attributes attached.
+ */
+static void hugetlb_unregister_node(struct node *node)
+{
+ struct hstate *h;
+ struct node_hstate *nhs = &node_hstates[node->dev.id];
+
+ if (!nhs->hugepages_kobj)
+ return; /* no hstate attributes */
+
+ for_each_hstate(h) {
+ int idx = hstate_index(h);
+ if (nhs->hstate_kobjs[idx]) {
+ kobject_put(nhs->hstate_kobjs[idx]);
+ nhs->hstate_kobjs[idx] = NULL;
+ }
+ }
+
+ kobject_put(nhs->hugepages_kobj);
+ nhs->hugepages_kobj = NULL;
+}
+
+
+/*
+ * Register hstate attributes for a single node device.
+ * No-op if attributes already registered.
+ */
+static void hugetlb_register_node(struct node *node)
+{
+ struct hstate *h;
+ struct node_hstate *nhs = &node_hstates[node->dev.id];
+ int err;
+
+ if (nhs->hugepages_kobj)
+ return; /* already allocated */
+
+ nhs->hugepages_kobj = kobject_create_and_add("hugepages",
+ &node->dev.kobj);
+ if (!nhs->hugepages_kobj)
+ return;
+
+ for_each_hstate(h) {
+ err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
+ nhs->hstate_kobjs,
+ &per_node_hstate_attr_group);
+ if (err) {
+ pr_err("HugeTLB: Unable to add hstate %s for node %d\n",
+ h->name, node->dev.id);
+ hugetlb_unregister_node(node);
+ break;
+ }
+ }
+}
+
+/*
+ * hugetlb init time: register hstate attributes for all registered node
+ * devices of nodes that have memory. All on-line nodes should have
+ * registered their associated device by this time.
+ */
+static void __init hugetlb_register_all_nodes(void)
+{
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY) {
+ struct node *node = node_devices[nid];
+ if (node->dev.id == nid)
+ hugetlb_register_node(node);
+ }
+
+ /*
+ * Let the node device driver know we're here so it can
+ * [un]register hstate attributes on node hotplug.
+ */
+ register_hugetlbfs_with_node(hugetlb_register_node,
+ hugetlb_unregister_node);
+}
+#else /* !CONFIG_NUMA */
+
+static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
+{
+ BUG();
+ if (nidp)
+ *nidp = -1;
+ return NULL;
+}
+
+static void hugetlb_register_all_nodes(void) { }
+
+#endif
+
+static int __init hugetlb_init(void)
+{
+ int i;
+
+ if (!hugepages_supported()) {
+ if (hugetlb_max_hstate || default_hstate_max_huge_pages)
+ pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n");
+ return 0;
+ }
+
+ /*
+ * Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists. Some
+ * architectures depend on setup being done here.
+ */
+ hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
+ if (!parsed_default_hugepagesz) {
+ /*
+ * If we did not parse a default huge page size, set
+ * default_hstate_idx to HPAGE_SIZE hstate. And, if the
+ * number of huge pages for this default size was implicitly
+ * specified, set that here as well.
+ * Note that the implicit setting will overwrite an explicit
+ * setting. A warning will be printed in this case.
+ */
+ default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE));
+ if (default_hstate_max_huge_pages) {
+ if (default_hstate.max_huge_pages) {
+ char buf[32];
+
+ string_get_size(huge_page_size(&default_hstate),
+ 1, STRING_UNITS_2, buf, 32);
+ pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n",
+ default_hstate.max_huge_pages, buf);
+ pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n",
+ default_hstate_max_huge_pages);
+ }
+ default_hstate.max_huge_pages =
+ default_hstate_max_huge_pages;
+ }
+ }
+
+ hugetlb_cma_check();
+ hugetlb_init_hstates();
+ gather_bootmem_prealloc();
+ report_hugepages();
+
+ hugetlb_sysfs_init();
+ hugetlb_register_all_nodes();
+ hugetlb_cgroup_file_init();
+
+#ifdef CONFIG_SMP
+ num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
+#else
+ num_fault_mutexes = 1;
+#endif
+ hugetlb_fault_mutex_table =
+ kmalloc_array(num_fault_mutexes, sizeof(struct mutex),
+ GFP_KERNEL);
+ BUG_ON(!hugetlb_fault_mutex_table);
+
+ for (i = 0; i < num_fault_mutexes; i++)
+ mutex_init(&hugetlb_fault_mutex_table[i]);
+ return 0;
+}
+subsys_initcall(hugetlb_init);
+
+/* Overwritten by architectures with more huge page sizes */
+bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size)
+{
+ return size == HPAGE_SIZE;
+}
+
+void __init hugetlb_add_hstate(unsigned int order)
+{
+ struct hstate *h;
+ unsigned long i;
+
+ if (size_to_hstate(PAGE_SIZE << order)) {
+ return;
+ }
+ BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
+ BUG_ON(order == 0);
+ h = &hstates[hugetlb_max_hstate++];
+ h->order = order;
+ h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
+ h->nr_huge_pages = 0;
+ h->free_huge_pages = 0;
+ for (i = 0; i < MAX_NUMNODES; ++i)
+ INIT_LIST_HEAD(&h->hugepage_freelists[i]);
+ INIT_LIST_HEAD(&h->hugepage_activelist);
+ h->next_nid_to_alloc = first_memory_node;
+ h->next_nid_to_free = first_memory_node;
+ snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
+ huge_page_size(h)/1024);
+
+ parsed_hstate = h;
+}
+
+/*
+ * hugepages command line processing
+ * hugepages normally follows a valid hugepagsz or default_hugepagsz
+ * specification. If not, ignore the hugepages value. hugepages can also
+ * be the first huge page command line option in which case it implicitly
+ * specifies the number of huge pages for the default size.
+ */
+static int __init hugepages_setup(char *s)
+{
+ unsigned long *mhp;
+ static unsigned long *last_mhp;
+
+ if (!parsed_valid_hugepagesz) {
+ pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
+ parsed_valid_hugepagesz = true;
+ return 0;
+ }
+
+ /*
+ * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter
+ * yet, so this hugepages= parameter goes to the "default hstate".
+ * Otherwise, it goes with the previously parsed hugepagesz or
+ * default_hugepagesz.
+ */
+ else if (!hugetlb_max_hstate)
+ mhp = &default_hstate_max_huge_pages;
+ else
+ mhp = &parsed_hstate->max_huge_pages;
+
+ if (mhp == last_mhp) {
+ pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s);
+ return 0;
+ }
+
+ if (sscanf(s, "%lu", mhp) <= 0)
+ *mhp = 0;
+
+ /*
+ * Global state is always initialized later in hugetlb_init.
+ * But we need to allocate >= MAX_ORDER hstates here early to still
+ * use the bootmem allocator.
+ */
+ if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
+ hugetlb_hstate_alloc_pages(parsed_hstate);
+
+ last_mhp = mhp;
+
+ return 1;
+}
+__setup("hugepages=", hugepages_setup);
+
+/*
+ * hugepagesz command line processing
+ * A specific huge page size can only be specified once with hugepagesz.
+ * hugepagesz is followed by hugepages on the command line. The global
+ * variable 'parsed_valid_hugepagesz' is used to determine if prior
+ * hugepagesz argument was valid.
+ */
+static int __init hugepagesz_setup(char *s)
+{
+ unsigned long size;
+ struct hstate *h;
+
+ parsed_valid_hugepagesz = false;
+ size = (unsigned long)memparse(s, NULL);
+
+ if (!arch_hugetlb_valid_size(size)) {
+ pr_err("HugeTLB: unsupported hugepagesz=%s\n", s);
+ return 0;
+ }
+
+ h = size_to_hstate(size);
+ if (h) {
+ /*
+ * hstate for this size already exists. This is normally
+ * an error, but is allowed if the existing hstate is the
+ * default hstate. More specifically, it is only allowed if
+ * the number of huge pages for the default hstate was not
+ * previously specified.
+ */
+ if (!parsed_default_hugepagesz || h != &default_hstate ||
+ default_hstate.max_huge_pages) {
+ pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s);
+ return 0;
+ }
+
+ /*
+ * No need to call hugetlb_add_hstate() as hstate already
+ * exists. But, do set parsed_hstate so that a following
+ * hugepages= parameter will be applied to this hstate.
+ */
+ parsed_hstate = h;
+ parsed_valid_hugepagesz = true;
+ return 1;
+ }
+
+ hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
+ parsed_valid_hugepagesz = true;
+ return 1;
+}
+__setup("hugepagesz=", hugepagesz_setup);
+
+/*
+ * default_hugepagesz command line input
+ * Only one instance of default_hugepagesz allowed on command line.
+ */
+static int __init default_hugepagesz_setup(char *s)
+{
+ unsigned long size;
+
+ parsed_valid_hugepagesz = false;
+ if (parsed_default_hugepagesz) {
+ pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
+ return 0;
+ }
+
+ size = (unsigned long)memparse(s, NULL);
+
+ if (!arch_hugetlb_valid_size(size)) {
+ pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s);
+ return 0;
+ }
+
+ hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
+ parsed_valid_hugepagesz = true;
+ parsed_default_hugepagesz = true;
+ default_hstate_idx = hstate_index(size_to_hstate(size));
+
+ /*
+ * The number of default huge pages (for this size) could have been
+ * specified as the first hugetlb parameter: hugepages=X. If so,
+ * then default_hstate_max_huge_pages is set. If the default huge
+ * page size is gigantic (>= MAX_ORDER), then the pages must be
+ * allocated here from bootmem allocator.
+ */
+ if (default_hstate_max_huge_pages) {
+ default_hstate.max_huge_pages = default_hstate_max_huge_pages;
+ if (hstate_is_gigantic(&default_hstate))
+ hugetlb_hstate_alloc_pages(&default_hstate);
+ default_hstate_max_huge_pages = 0;
+ }
+
+ return 1;
+}
+__setup("default_hugepagesz=", default_hugepagesz_setup);
+
+static unsigned int allowed_mems_nr(struct hstate *h)
+{
+ int node;
+ unsigned int nr = 0;
+ nodemask_t *mpol_allowed;
+ unsigned int *array = h->free_huge_pages_node;
+ gfp_t gfp_mask = htlb_alloc_mask(h);
+
+ mpol_allowed = policy_nodemask_current(gfp_mask);
+
+ for_each_node_mask(node, cpuset_current_mems_allowed) {
+ if (!mpol_allowed ||
+ (mpol_allowed && node_isset(node, *mpol_allowed)))
+ nr += array[node];
+ }
+
+ return nr;
+}
+
+#ifdef CONFIG_SYSCTL
+static int proc_hugetlb_doulongvec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *length,
+ loff_t *ppos, unsigned long *out)
+{
+ struct ctl_table dup_table;
+
+ /*
+ * In order to avoid races with __do_proc_doulongvec_minmax(), we
+ * can duplicate the @table and alter the duplicate of it.
+ */
+ dup_table = *table;
+ dup_table.data = out;
+
+ return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos);
+}
+
+static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
+ struct ctl_table *table, int write,
+ void *buffer, size_t *length, loff_t *ppos)
+{
+ struct hstate *h = &default_hstate;
+ unsigned long tmp = h->max_huge_pages;
+ int ret;
+
+ if (!hugepages_supported())
+ return -EOPNOTSUPP;
+
+ ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
+ &tmp);
+ if (ret)
+ goto out;
+
+ if (write)
+ ret = __nr_hugepages_store_common(obey_mempolicy, h,
+ NUMA_NO_NODE, tmp, *length);
+out:
+ return ret;
+}
+
+int hugetlb_sysctl_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *length, loff_t *ppos)
+{
+
+ return hugetlb_sysctl_handler_common(false, table, write,
+ buffer, length, ppos);
+}
+
+#ifdef CONFIG_NUMA
+int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *length, loff_t *ppos)
+{
+ return hugetlb_sysctl_handler_common(true, table, write,
+ buffer, length, ppos);
+}
+#endif /* CONFIG_NUMA */
+
+int hugetlb_overcommit_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *length, loff_t *ppos)
+{
+ struct hstate *h = &default_hstate;
+ unsigned long tmp;
+ int ret;
+
+ if (!hugepages_supported())
+ return -EOPNOTSUPP;
+
+ tmp = h->nr_overcommit_huge_pages;
+
+ if (write && hstate_is_gigantic(h))
+ return -EINVAL;
+
+ ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
+ &tmp);
+ if (ret)
+ goto out;
+
+ if (write) {
+ spin_lock(&hugetlb_lock);
+ h->nr_overcommit_huge_pages = tmp;
+ spin_unlock(&hugetlb_lock);
+ }
+out:
+ return ret;
+}
+
+#endif /* CONFIG_SYSCTL */
+
+void hugetlb_report_meminfo(struct seq_file *m)
+{
+ struct hstate *h;
+ unsigned long total = 0;
+
+ if (!hugepages_supported())
+ return;
+
+ for_each_hstate(h) {
+ unsigned long count = h->nr_huge_pages;
+
+ total += (PAGE_SIZE << huge_page_order(h)) * count;
+
+ if (h == &default_hstate)
+ seq_printf(m,
+ "HugePages_Total: %5lu\n"
+ "HugePages_Free: %5lu\n"
+ "HugePages_Rsvd: %5lu\n"
+ "HugePages_Surp: %5lu\n"
+ "Hugepagesize: %8lu kB\n",
+ count,
+ h->free_huge_pages,
+ h->resv_huge_pages,
+ h->surplus_huge_pages,
+ (PAGE_SIZE << huge_page_order(h)) / 1024);
+ }
+
+ seq_printf(m, "Hugetlb: %8lu kB\n", total / 1024);
+}
+
+int hugetlb_report_node_meminfo(char *buf, int len, int nid)
+{
+ struct hstate *h = &default_hstate;
+
+ if (!hugepages_supported())
+ return 0;
+
+ return sysfs_emit_at(buf, len,
+ "Node %d HugePages_Total: %5u\n"
+ "Node %d HugePages_Free: %5u\n"
+ "Node %d HugePages_Surp: %5u\n",
+ nid, h->nr_huge_pages_node[nid],
+ nid, h->free_huge_pages_node[nid],
+ nid, h->surplus_huge_pages_node[nid]);
+}
+
+void hugetlb_show_meminfo(void)
+{
+ struct hstate *h;
+ int nid;
+
+ if (!hugepages_supported())
+ return;
+
+ for_each_node_state(nid, N_MEMORY)
+ for_each_hstate(h)
+ pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
+ nid,
+ h->nr_huge_pages_node[nid],
+ h->free_huge_pages_node[nid],
+ h->surplus_huge_pages_node[nid],
+ 1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
+}
+
+void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
+{
+ seq_printf(m, "HugetlbPages:\t%8lu kB\n",
+ atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10));
+}
+
+/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
+unsigned long hugetlb_total_pages(void)
+{
+ struct hstate *h;
+ unsigned long nr_total_pages = 0;
+
+ for_each_hstate(h)
+ nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
+ return nr_total_pages;
+}
+
+static int hugetlb_acct_memory(struct hstate *h, long delta)
+{
+ int ret = -ENOMEM;
+
+ spin_lock(&hugetlb_lock);
+ /*
+ * When cpuset is configured, it breaks the strict hugetlb page
+ * reservation as the accounting is done on a global variable. Such
+ * reservation is completely rubbish in the presence of cpuset because
+ * the reservation is not checked against page availability for the
+ * current cpuset. Application can still potentially OOM'ed by kernel
+ * with lack of free htlb page in cpuset that the task is in.
+ * Attempt to enforce strict accounting with cpuset is almost
+ * impossible (or too ugly) because cpuset is too fluid that
+ * task or memory node can be dynamically moved between cpusets.
+ *
+ * The change of semantics for shared hugetlb mapping with cpuset is
+ * undesirable. However, in order to preserve some of the semantics,
+ * we fall back to check against current free page availability as
+ * a best attempt and hopefully to minimize the impact of changing
+ * semantics that cpuset has.
+ *
+ * Apart from cpuset, we also have memory policy mechanism that
+ * also determines from which node the kernel will allocate memory
+ * in a NUMA system. So similar to cpuset, we also should consider
+ * the memory policy of the current task. Similar to the description
+ * above.
+ */
+ if (delta > 0) {
+ if (gather_surplus_pages(h, delta) < 0)
+ goto out;
+
+ if (delta > allowed_mems_nr(h)) {
+ return_unused_surplus_pages(h, delta);
+ goto out;
+ }
+ }
+
+ ret = 0;
+ if (delta < 0)
+ return_unused_surplus_pages(h, (unsigned long) -delta);
+
+out:
+ spin_unlock(&hugetlb_lock);
+ return ret;
+}
+
+static void hugetlb_vm_op_open(struct vm_area_struct *vma)
+{
+ struct resv_map *resv = vma_resv_map(vma);
+
+ /*
+ * This new VMA should share its siblings reservation map if present.
+ * The VMA will only ever have a valid reservation map pointer where
+ * it is being copied for another still existing VMA. As that VMA
+ * has a reference to the reservation map it cannot disappear until
+ * after this open call completes. It is therefore safe to take a
+ * new reference here without additional locking.
+ */
+ if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+ resv_map_dup_hugetlb_cgroup_uncharge_info(resv);
+ kref_get(&resv->refs);
+ }
+}
+
+static void hugetlb_vm_op_close(struct vm_area_struct *vma)
+{
+ struct hstate *h = hstate_vma(vma);
+ struct resv_map *resv = vma_resv_map(vma);
+ struct hugepage_subpool *spool = subpool_vma(vma);
+ unsigned long reserve, start, end;
+ long gbl_reserve;
+
+ if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
+ return;
+
+ start = vma_hugecache_offset(h, vma, vma->vm_start);
+ end = vma_hugecache_offset(h, vma, vma->vm_end);
+
+ reserve = (end - start) - region_count(resv, start, end);
+ hugetlb_cgroup_uncharge_counter(resv, start, end);
+ if (reserve) {
+ /*
+ * Decrement reserve counts. The global reserve count may be
+ * adjusted if the subpool has a minimum size.
+ */
+ gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
+ hugetlb_acct_memory(h, -gbl_reserve);
+ }
+
+ kref_put(&resv->refs, resv_map_release);
+}
+
+static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
+{
+ if (addr & ~(huge_page_mask(hstate_vma(vma))))
+ return -EINVAL;
+ return 0;
+}
+
+static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
+{
+ struct hstate *hstate = hstate_vma(vma);
+
+ return 1UL << huge_page_shift(hstate);
+}
+
+/*
+ * We cannot handle pagefaults against hugetlb pages at all. They cause
+ * handle_mm_fault() to try to instantiate regular-sized pages in the
+ * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get
+ * this far.
+ */
+static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf)
+{
+ BUG();
+ return 0;
+}
+
+/*
+ * When a new function is introduced to vm_operations_struct and added
+ * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops.
+ * This is because under System V memory model, mappings created via
+ * shmget/shmat with "huge page" specified are backed by hugetlbfs files,
+ * their original vm_ops are overwritten with shm_vm_ops.
+ */
+const struct vm_operations_struct hugetlb_vm_ops = {
+ .fault = hugetlb_vm_op_fault,
+ .open = hugetlb_vm_op_open,
+ .close = hugetlb_vm_op_close,
+ .split = hugetlb_vm_op_split,
+ .pagesize = hugetlb_vm_op_pagesize,
+};
+
+static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
+ int writable)
+{
+ pte_t entry;
+
+ if (writable) {
+ entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
+ vma->vm_page_prot)));
+ } else {
+ entry = huge_pte_wrprotect(mk_huge_pte(page,
+ vma->vm_page_prot));
+ }
+ entry = pte_mkyoung(entry);
+ entry = pte_mkhuge(entry);
+ entry = arch_make_huge_pte(entry, vma, page, writable);
+
+ return entry;
+}
+
+static void set_huge_ptep_writable(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
+{
+ pte_t entry;
+
+ entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep)));
+ if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
+ update_mmu_cache(vma, address, ptep);
+}
+
+bool is_hugetlb_entry_migration(pte_t pte)
+{
+ swp_entry_t swp;
+
+ if (huge_pte_none(pte) || pte_present(pte))
+ return false;
+ swp = pte_to_swp_entry(pte);
+ if (is_migration_entry(swp))
+ return true;
+ else
+ return false;
+}
+
+static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
+{
+ swp_entry_t swp;
+
+ if (huge_pte_none(pte) || pte_present(pte))
+ return false;
+ swp = pte_to_swp_entry(pte);
+ if (is_hwpoison_entry(swp))
+ return true;
+ else
+ return false;
+}
+
+int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
+ struct vm_area_struct *vma)
+{
+ pte_t *src_pte, *dst_pte, entry, dst_entry;
+ struct page *ptepage;
+ unsigned long addr;
+ int cow;
+ struct hstate *h = hstate_vma(vma);
+ unsigned long sz = huge_page_size(h);
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ struct mmu_notifier_range range;
+ int ret = 0;
+
+ cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
+
+ if (cow) {
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
+ vma->vm_start,
+ vma->vm_end);
+ mmu_notifier_invalidate_range_start(&range);
+ } else {
+ /*
+ * For shared mappings i_mmap_rwsem must be held to call
+ * huge_pte_alloc, otherwise the returned ptep could go
+ * away if part of a shared pmd and another thread calls
+ * huge_pmd_unshare.
+ */
+ i_mmap_lock_read(mapping);
+ }
+
+ for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
+ spinlock_t *src_ptl, *dst_ptl;
+ src_pte = huge_pte_offset(src, addr, sz);
+ if (!src_pte)
+ continue;
+ dst_pte = huge_pte_alloc(dst, addr, sz);
+ if (!dst_pte) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ /*
+ * If the pagetables are shared don't copy or take references.
+ * dst_pte == src_pte is the common case of src/dest sharing.
+ *
+ * However, src could have 'unshared' and dst shares with
+ * another vma. If dst_pte !none, this implies sharing.
+ * Check here before taking page table lock, and once again
+ * after taking the lock below.
+ */
+ dst_entry = huge_ptep_get(dst_pte);
+ if ((dst_pte == src_pte) || !huge_pte_none(dst_entry))
+ continue;
+
+ dst_ptl = huge_pte_lock(h, dst, dst_pte);
+ src_ptl = huge_pte_lockptr(h, src, src_pte);
+ spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+ entry = huge_ptep_get(src_pte);
+ dst_entry = huge_ptep_get(dst_pte);
+ if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) {
+ /*
+ * Skip if src entry none. Also, skip in the
+ * unlikely case dst entry !none as this implies
+ * sharing with another vma.
+ */
+ ;
+ } else if (unlikely(is_hugetlb_entry_migration(entry) ||
+ is_hugetlb_entry_hwpoisoned(entry))) {
+ swp_entry_t swp_entry = pte_to_swp_entry(entry);
+
+ if (is_write_migration_entry(swp_entry) && cow) {
+ /*
+ * COW mappings require pages in both
+ * parent and child to be set to read.
+ */
+ make_migration_entry_read(&swp_entry);
+ entry = swp_entry_to_pte(swp_entry);
+ set_huge_swap_pte_at(src, addr, src_pte,
+ entry, sz);
+ }
+ set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
+ } else {
+ if (cow) {
+ /*
+ * No need to notify as we are downgrading page
+ * table protection not changing it to point
+ * to a new page.
+ *
+ * See Documentation/vm/mmu_notifier.rst
+ */
+ huge_ptep_set_wrprotect(src, addr, src_pte);
+ }
+ entry = huge_ptep_get(src_pte);
+ ptepage = pte_page(entry);
+ get_page(ptepage);
+ page_dup_rmap(ptepage, true);
+ set_huge_pte_at(dst, addr, dst_pte, entry);
+ hugetlb_count_add(pages_per_huge_page(h), dst);
+ }
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
+ }
+
+ if (cow)
+ mmu_notifier_invalidate_range_end(&range);
+ else
+ i_mmap_unlock_read(mapping);
+
+ return ret;
+}
+
+void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct page *ref_page)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long address;
+ pte_t *ptep;
+ pte_t pte;
+ spinlock_t *ptl;
+ struct page *page;
+ struct hstate *h = hstate_vma(vma);
+ unsigned long sz = huge_page_size(h);
+ struct mmu_notifier_range range;
+ bool force_flush = false;
+
+ WARN_ON(!is_vm_hugetlb_page(vma));
+ BUG_ON(start & ~huge_page_mask(h));
+ BUG_ON(end & ~huge_page_mask(h));
+
+ /*
+ * This is a hugetlb vma, all the pte entries should point
+ * to huge page.
+ */
+ tlb_change_page_size(tlb, sz);
+ tlb_start_vma(tlb, vma);
+
+ /*
+ * If sharing possible, alert mmu notifiers of worst case.
+ */
+ mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start,
+ end);
+ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
+ mmu_notifier_invalidate_range_start(&range);
+ address = start;
+ for (; address < end; address += sz) {
+ ptep = huge_pte_offset(mm, address, sz);
+ if (!ptep)
+ continue;
+
+ ptl = huge_pte_lock(h, mm, ptep);
+ if (huge_pmd_unshare(mm, vma, &address, ptep)) {
+ spin_unlock(ptl);
+ tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
+ force_flush = true;
+ continue;
+ }
+
+ pte = huge_ptep_get(ptep);
+ if (huge_pte_none(pte)) {
+ spin_unlock(ptl);
+ continue;
+ }
+
+ /*
+ * Migrating hugepage or HWPoisoned hugepage is already
+ * unmapped and its refcount is dropped, so just clear pte here.
+ */
+ if (unlikely(!pte_present(pte))) {
+ huge_pte_clear(mm, address, ptep, sz);
+ spin_unlock(ptl);
+ continue;
+ }
+
+ page = pte_page(pte);
+ /*
+ * If a reference page is supplied, it is because a specific
+ * page is being unmapped, not a range. Ensure the page we
+ * are about to unmap is the actual page of interest.
+ */
+ if (ref_page) {
+ if (page != ref_page) {
+ spin_unlock(ptl);
+ continue;
+ }
+ /*
+ * Mark the VMA as having unmapped its page so that
+ * future faults in this VMA will fail rather than
+ * looking like data was lost
+ */
+ set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
+ }
+
+ pte = huge_ptep_get_and_clear(mm, address, ptep);
+ tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
+ if (huge_pte_dirty(pte))
+ set_page_dirty(page);
+
+ hugetlb_count_sub(pages_per_huge_page(h), mm);
+ page_remove_rmap(page, true);
+
+ spin_unlock(ptl);
+ tlb_remove_page_size(tlb, page, huge_page_size(h));
+ /*
+ * Bail out after unmapping reference page if supplied
+ */
+ if (ref_page)
+ break;
+ }
+ mmu_notifier_invalidate_range_end(&range);
+ tlb_end_vma(tlb, vma);
+
+ /*
+ * If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We
+ * could defer the flush until now, since by holding i_mmap_rwsem we
+ * guaranteed that the last refernece would not be dropped. But we must
+ * do the flushing before we return, as otherwise i_mmap_rwsem will be
+ * dropped and the last reference to the shared PMDs page might be
+ * dropped as well.
+ *
+ * In theory we could defer the freeing of the PMD pages as well, but
+ * huge_pmd_unshare() relies on the exact page_count for the PMD page to
+ * detect sharing, so we cannot defer the release of the page either.
+ * Instead, do flush now.
+ */
+ if (force_flush)
+ tlb_flush_mmu_tlbonly(tlb);
+}
+
+void __unmap_hugepage_range_final(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, struct page *ref_page)
+{
+ __unmap_hugepage_range(tlb, vma, start, end, ref_page);
+
+ /*
+ * Clear this flag so that x86's huge_pmd_share page_table_shareable
+ * test will fail on a vma being torn down, and not grab a page table
+ * on its way out. We're lucky that the flag has such an appropriate
+ * name, and can in fact be safely cleared here. We could clear it
+ * before the __unmap_hugepage_range above, but all that's necessary
+ * is to clear it before releasing the i_mmap_rwsem. This works
+ * because in the context this is called, the VMA is about to be
+ * destroyed and the i_mmap_rwsem is held.
+ */
+ vma->vm_flags &= ~VM_MAYSHARE;
+}
+
+void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, struct page *ref_page)
+{
+ struct mm_struct *mm;
+ struct mmu_gather tlb;
+ unsigned long tlb_start = start;
+ unsigned long tlb_end = end;
+
+ /*
+ * If shared PMDs were possibly used within this vma range, adjust
+ * start/end for worst case tlb flushing.
+ * Note that we can not be sure if PMDs are shared until we try to
+ * unmap pages. However, we want to make sure TLB flushing covers
+ * the largest possible range.
+ */
+ adjust_range_if_pmd_sharing_possible(vma, &tlb_start, &tlb_end);
+
+ mm = vma->vm_mm;
+
+ tlb_gather_mmu(&tlb, mm, tlb_start, tlb_end);
+ __unmap_hugepage_range(&tlb, vma, start, end, ref_page);
+ tlb_finish_mmu(&tlb, tlb_start, tlb_end);
+}
+
+/*
+ * This is called when the original mapper is failing to COW a MAP_PRIVATE
+ * mappping it owns the reserve page for. The intention is to unmap the page
+ * from other VMAs and let the children be SIGKILLed if they are faulting the
+ * same region.
+ */
+static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct page *page, unsigned long address)
+{
+ struct hstate *h = hstate_vma(vma);
+ struct vm_area_struct *iter_vma;
+ struct address_space *mapping;
+ pgoff_t pgoff;
+
+ /*
+ * vm_pgoff is in PAGE_SIZE units, hence the different calculation
+ * from page cache lookup which is in HPAGE_SIZE units.
+ */
+ address = address & huge_page_mask(h);
+ pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
+ vma->vm_pgoff;
+ mapping = vma->vm_file->f_mapping;
+
+ /*
+ * Take the mapping lock for the duration of the table walk. As
+ * this mapping should be shared between all the VMAs,
+ * __unmap_hugepage_range() is called as the lock is already held
+ */
+ i_mmap_lock_write(mapping);
+ vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
+ /* Do not unmap the current VMA */
+ if (iter_vma == vma)
+ continue;
+
+ /*
+ * Shared VMAs have their own reserves and do not affect
+ * MAP_PRIVATE accounting but it is possible that a shared
+ * VMA is using the same page so check and skip such VMAs.
+ */
+ if (iter_vma->vm_flags & VM_MAYSHARE)
+ continue;
+
+ /*
+ * Unmap the page from other VMAs without their own reserves.
+ * They get marked to be SIGKILLed if they fault in these
+ * areas. This is because a future no-page fault on this VMA
+ * could insert a zeroed page instead of the data existing
+ * from the time of fork. This would look like data corruption
+ */
+ if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
+ unmap_hugepage_range(iter_vma, address,
+ address + huge_page_size(h), page);
+ }
+ i_mmap_unlock_write(mapping);
+}
+
+/*
+ * Hugetlb_cow() should be called with page lock of the original hugepage held.
+ * Called with hugetlb_instantiation_mutex held and pte_page locked so we
+ * cannot race with other handlers or page migration.
+ * Keep the pte_same checks anyway to make transition from the mutex easier.
+ */
+static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ struct page *pagecache_page, spinlock_t *ptl)
+{
+ pte_t pte;
+ struct hstate *h = hstate_vma(vma);
+ struct page *old_page, *new_page;
+ int outside_reserve = 0;
+ vm_fault_t ret = 0;
+ unsigned long haddr = address & huge_page_mask(h);
+ struct mmu_notifier_range range;
+
+ pte = huge_ptep_get(ptep);
+ old_page = pte_page(pte);
+
+retry_avoidcopy:
+ /* If no-one else is actually using this page, avoid the copy
+ * and just make the page writable */
+ if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
+ page_move_anon_rmap(old_page, vma);
+ set_huge_ptep_writable(vma, haddr, ptep);
+ return 0;
+ }
+
+ /*
+ * If the process that created a MAP_PRIVATE mapping is about to
+ * perform a COW due to a shared page count, attempt to satisfy
+ * the allocation without using the existing reserves. The pagecache
+ * page is used to determine if the reserve at this address was
+ * consumed or not. If reserves were used, a partial faulted mapping
+ * at the time of fork() could consume its reserves on COW instead
+ * of the full address range.
+ */
+ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
+ old_page != pagecache_page)
+ outside_reserve = 1;
+
+ get_page(old_page);
+
+ /*
+ * Drop page table lock as buddy allocator may be called. It will
+ * be acquired again before returning to the caller, as expected.
+ */
+ spin_unlock(ptl);
+ new_page = alloc_huge_page(vma, haddr, outside_reserve);
+
+ if (IS_ERR(new_page)) {
+ /*
+ * If a process owning a MAP_PRIVATE mapping fails to COW,
+ * it is due to references held by a child and an insufficient
+ * huge page pool. To guarantee the original mappers
+ * reliability, unmap the page from child processes. The child
+ * may get SIGKILLed if it later faults.
+ */
+ if (outside_reserve) {
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ pgoff_t idx;
+ u32 hash;
+
+ put_page(old_page);
+ BUG_ON(huge_pte_none(pte));
+ /*
+ * Drop hugetlb_fault_mutex and i_mmap_rwsem before
+ * unmapping. unmapping needs to hold i_mmap_rwsem
+ * in write mode. Dropping i_mmap_rwsem in read mode
+ * here is OK as COW mappings do not interact with
+ * PMD sharing.
+ *
+ * Reacquire both after unmap operation.
+ */
+ idx = vma_hugecache_offset(h, vma, haddr);
+ hash = hugetlb_fault_mutex_hash(mapping, idx);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ i_mmap_unlock_read(mapping);
+
+ unmap_ref_private(mm, vma, old_page, haddr);
+
+ i_mmap_lock_read(mapping);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+ spin_lock(ptl);
+ ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
+ if (likely(ptep &&
+ pte_same(huge_ptep_get(ptep), pte)))
+ goto retry_avoidcopy;
+ /*
+ * race occurs while re-acquiring page table
+ * lock, and our job is done.
+ */
+ return 0;
+ }
+
+ ret = vmf_error(PTR_ERR(new_page));
+ goto out_release_old;
+ }
+
+ /*
+ * When the original hugepage is shared one, it does not have
+ * anon_vma prepared.
+ */
+ if (unlikely(anon_vma_prepare(vma))) {
+ ret = VM_FAULT_OOM;
+ goto out_release_all;
+ }
+
+ copy_user_huge_page(new_page, old_page, address, vma,
+ pages_per_huge_page(h));
+ __SetPageUptodate(new_page);
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr,
+ haddr + huge_page_size(h));
+ mmu_notifier_invalidate_range_start(&range);
+
+ /*
+ * Retake the page table lock to check for racing updates
+ * before the page tables are altered
+ */
+ spin_lock(ptl);
+ ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
+ if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
+ ClearPagePrivate(new_page);
+
+ /* Break COW */
+ huge_ptep_clear_flush(vma, haddr, ptep);
+ mmu_notifier_invalidate_range(mm, range.start, range.end);
+ set_huge_pte_at(mm, haddr, ptep,
+ make_huge_pte(vma, new_page, 1));
+ page_remove_rmap(old_page, true);
+ hugepage_add_new_anon_rmap(new_page, vma, haddr);
+ set_page_huge_active(new_page);
+ /* Make the old page be freed below */
+ new_page = old_page;
+ }
+ spin_unlock(ptl);
+ mmu_notifier_invalidate_range_end(&range);
+out_release_all:
+ restore_reserve_on_error(h, vma, haddr, new_page);
+ put_page(new_page);
+out_release_old:
+ put_page(old_page);
+
+ spin_lock(ptl); /* Caller expects lock to be held */
+ return ret;
+}
+
+/* Return the pagecache page at a given address within a VMA */
+static struct page *hugetlbfs_pagecache_page(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ struct address_space *mapping;
+ pgoff_t idx;
+
+ mapping = vma->vm_file->f_mapping;
+ idx = vma_hugecache_offset(h, vma, address);
+
+ return find_lock_page(mapping, idx);
+}
+
+/*
+ * Return whether there is a pagecache page to back given address within VMA.
+ * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
+ */
+static bool hugetlbfs_pagecache_present(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ struct address_space *mapping;
+ pgoff_t idx;
+ struct page *page;
+
+ mapping = vma->vm_file->f_mapping;
+ idx = vma_hugecache_offset(h, vma, address);
+
+ page = find_get_page(mapping, idx);
+ if (page)
+ put_page(page);
+ return page != NULL;
+}
+
+int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
+ pgoff_t idx)
+{
+ struct inode *inode = mapping->host;
+ struct hstate *h = hstate_inode(inode);
+ int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+
+ if (err)
+ return err;
+ ClearPagePrivate(page);
+
+ /*
+ * set page dirty so that it will not be removed from cache/file
+ * by non-hugetlbfs specific code paths.
+ */
+ set_page_dirty(page);
+
+ spin_lock(&inode->i_lock);
+ inode->i_blocks += blocks_per_huge_page(h);
+ spin_unlock(&inode->i_lock);
+ return 0;
+}
+
+static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ struct address_space *mapping, pgoff_t idx,
+ unsigned long address, pte_t *ptep, unsigned int flags)
+{
+ struct hstate *h = hstate_vma(vma);
+ vm_fault_t ret = VM_FAULT_SIGBUS;
+ int anon_rmap = 0;
+ unsigned long size;
+ struct page *page;
+ pte_t new_pte;
+ spinlock_t *ptl;
+ unsigned long haddr = address & huge_page_mask(h);
+ bool new_page = false;
+ u32 hash = hugetlb_fault_mutex_hash(mapping, idx);
+
+ /*
+ * Currently, we are forced to kill the process in the event the
+ * original mapper has unmapped pages from the child due to a failed
+ * COW. Warn that such a situation has occurred as it may not be obvious
+ */
+ if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
+ pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
+ current->pid);
+ goto out;
+ }
+
+ /*
+ * We can not race with truncation due to holding i_mmap_rwsem.
+ * i_size is modified when holding i_mmap_rwsem, so check here
+ * once for faults beyond end of file.
+ */
+ size = i_size_read(mapping->host) >> huge_page_shift(h);
+ if (idx >= size)
+ goto out;
+
+retry:
+ page = find_lock_page(mapping, idx);
+ if (!page) {
+ /*
+ * Check for page in userfault range
+ */
+ if (userfaultfd_missing(vma)) {
+ struct vm_fault vmf = {
+ .vma = vma,
+ .address = haddr,
+ .flags = flags,
+ /*
+ * Hard to debug if it ends up being
+ * used by a callee that assumes
+ * something about the other
+ * uninitialized fields... same as in
+ * memory.c
+ */
+ };
+
+ /*
+ * vma_lock and hugetlb_fault_mutex must be dropped
+ * before handling userfault. Also mmap_lock will
+ * be dropped during handling userfault, any vma
+ * operation should be careful from here.
+ */
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ i_mmap_unlock_read(mapping);
+ return handle_userfault(&vmf, VM_UFFD_MISSING);
+ }
+
+ page = alloc_huge_page(vma, haddr, 0);
+ if (IS_ERR(page)) {
+ /*
+ * Returning error will result in faulting task being
+ * sent SIGBUS. The hugetlb fault mutex prevents two
+ * tasks from racing to fault in the same page which
+ * could result in false unable to allocate errors.
+ * Page migration does not take the fault mutex, but
+ * does a clear then write of pte's under page table
+ * lock. Page fault code could race with migration,
+ * notice the clear pte and try to allocate a page
+ * here. Before returning error, get ptl and make
+ * sure there really is no pte entry.
+ */
+ ptl = huge_pte_lock(h, mm, ptep);
+ if (!huge_pte_none(huge_ptep_get(ptep))) {
+ ret = 0;
+ spin_unlock(ptl);
+ goto out;
+ }
+ spin_unlock(ptl);
+ ret = vmf_error(PTR_ERR(page));
+ goto out;
+ }
+ clear_huge_page(page, address, pages_per_huge_page(h));
+ __SetPageUptodate(page);
+ new_page = true;
+
+ if (vma->vm_flags & VM_MAYSHARE) {
+ int err = huge_add_to_page_cache(page, mapping, idx);
+ if (err) {
+ put_page(page);
+ if (err == -EEXIST)
+ goto retry;
+ goto out;
+ }
+ } else {
+ lock_page(page);
+ if (unlikely(anon_vma_prepare(vma))) {
+ ret = VM_FAULT_OOM;
+ goto backout_unlocked;
+ }
+ anon_rmap = 1;
+ }
+ } else {
+ /*
+ * If memory error occurs between mmap() and fault, some process
+ * don't have hwpoisoned swap entry for errored virtual address.
+ * So we need to block hugepage fault by PG_hwpoison bit check.
+ */
+ if (unlikely(PageHWPoison(page))) {
+ ret = VM_FAULT_HWPOISON_LARGE |
+ VM_FAULT_SET_HINDEX(hstate_index(h));
+ goto backout_unlocked;
+ }
+ }
+
+ /*
+ * If we are going to COW a private mapping later, we examine the
+ * pending reservations for this page now. This will ensure that
+ * any allocations necessary to record that reservation occur outside
+ * the spinlock.
+ */
+ if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
+ if (vma_needs_reservation(h, vma, haddr) < 0) {
+ ret = VM_FAULT_OOM;
+ goto backout_unlocked;
+ }
+ /* Just decrements count, does not deallocate */
+ vma_end_reservation(h, vma, haddr);
+ }
+
+ ptl = huge_pte_lock(h, mm, ptep);
+ ret = 0;
+ if (!huge_pte_none(huge_ptep_get(ptep)))
+ goto backout;
+
+ if (anon_rmap) {
+ ClearPagePrivate(page);
+ hugepage_add_new_anon_rmap(page, vma, haddr);
+ } else
+ page_dup_rmap(page, true);
+ new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
+ && (vma->vm_flags & VM_SHARED)));
+ set_huge_pte_at(mm, haddr, ptep, new_pte);
+
+ hugetlb_count_add(pages_per_huge_page(h), mm);
+ if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
+ /* Optimization, do the COW without a second fault */
+ ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
+ }
+
+ spin_unlock(ptl);
+
+ /*
+ * Only make newly allocated pages active. Existing pages found
+ * in the pagecache could be !page_huge_active() if they have been
+ * isolated for migration.
+ */
+ if (new_page)
+ set_page_huge_active(page);
+
+ unlock_page(page);
+out:
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ i_mmap_unlock_read(mapping);
+ return ret;
+
+backout:
+ spin_unlock(ptl);
+backout_unlocked:
+ unlock_page(page);
+ restore_reserve_on_error(h, vma, haddr, page);
+ put_page(page);
+ goto out;
+}
+
+#ifdef CONFIG_SMP
+u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
+{
+ unsigned long key[2];
+ u32 hash;
+
+ key[0] = (unsigned long) mapping;
+ key[1] = idx;
+
+ hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0);
+
+ return hash & (num_fault_mutexes - 1);
+}
+#else
+/*
+ * For uniprocesor systems we always use a single mutex, so just
+ * return 0 and avoid the hashing overhead.
+ */
+u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
+{
+ return 0;
+}
+#endif
+
+vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags)
+{
+ pte_t *ptep, entry;
+ spinlock_t *ptl;
+ vm_fault_t ret;
+ u32 hash;
+ pgoff_t idx;
+ struct page *page = NULL;
+ struct page *pagecache_page = NULL;
+ struct hstate *h = hstate_vma(vma);
+ struct address_space *mapping;
+ int need_wait_lock = 0;
+ unsigned long haddr = address & huge_page_mask(h);
+
+ ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
+ if (ptep) {
+ /*
+ * Since we hold no locks, ptep could be stale. That is
+ * OK as we are only making decisions based on content and
+ * not actually modifying content here.
+ */
+ entry = huge_ptep_get(ptep);
+ if (unlikely(is_hugetlb_entry_migration(entry))) {
+ migration_entry_wait_huge(vma, mm, ptep);
+ return 0;
+ } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+ return VM_FAULT_HWPOISON_LARGE |
+ VM_FAULT_SET_HINDEX(hstate_index(h));
+ }
+
+ /*
+ * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
+ * until finished with ptep. This serves two purposes:
+ * 1) It prevents huge_pmd_unshare from being called elsewhere
+ * and making the ptep no longer valid.
+ * 2) It synchronizes us with i_size modifications during truncation.
+ *
+ * ptep could have already be assigned via huge_pte_offset. That
+ * is OK, as huge_pte_alloc will return the same value unless
+ * something has changed.
+ */
+ mapping = vma->vm_file->f_mapping;
+ i_mmap_lock_read(mapping);
+ ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
+ if (!ptep) {
+ i_mmap_unlock_read(mapping);
+ return VM_FAULT_OOM;
+ }
+
+ /*
+ * Serialize hugepage allocation and instantiation, so that we don't
+ * get spurious allocation failures if two CPUs race to instantiate
+ * the same page in the page cache.
+ */
+ idx = vma_hugecache_offset(h, vma, haddr);
+ hash = hugetlb_fault_mutex_hash(mapping, idx);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+ entry = huge_ptep_get(ptep);
+ if (huge_pte_none(entry))
+ /*
+ * hugetlb_no_page will drop vma lock and hugetlb fault
+ * mutex internally, which make us return immediately.
+ */
+ return hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
+
+ ret = 0;
+
+ /*
+ * entry could be a migration/hwpoison entry at this point, so this
+ * check prevents the kernel from going below assuming that we have
+ * an active hugepage in pagecache. This goto expects the 2nd page
+ * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
+ * properly handle it.
+ */
+ if (!pte_present(entry))
+ goto out_mutex;
+
+ /*
+ * If we are going to COW the mapping later, we examine the pending
+ * reservations for this page now. This will ensure that any
+ * allocations necessary to record that reservation occur outside the
+ * spinlock. For private mappings, we also lookup the pagecache
+ * page now as it is used to determine if a reservation has been
+ * consumed.
+ */
+ if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
+ if (vma_needs_reservation(h, vma, haddr) < 0) {
+ ret = VM_FAULT_OOM;
+ goto out_mutex;
+ }
+ /* Just decrements count, does not deallocate */
+ vma_end_reservation(h, vma, haddr);
+
+ if (!(vma->vm_flags & VM_MAYSHARE))
+ pagecache_page = hugetlbfs_pagecache_page(h,
+ vma, haddr);
+ }
+
+ ptl = huge_pte_lock(h, mm, ptep);
+
+ /* Check for a racing update before calling hugetlb_cow */
+ if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
+ goto out_ptl;
+
+ /*
+ * hugetlb_cow() requires page locks of pte_page(entry) and
+ * pagecache_page, so here we need take the former one
+ * when page != pagecache_page or !pagecache_page.
+ */
+ page = pte_page(entry);
+ if (page != pagecache_page)
+ if (!trylock_page(page)) {
+ need_wait_lock = 1;
+ goto out_ptl;
+ }
+
+ get_page(page);
+
+ if (flags & FAULT_FLAG_WRITE) {
+ if (!huge_pte_write(entry)) {
+ ret = hugetlb_cow(mm, vma, address, ptep,
+ pagecache_page, ptl);
+ goto out_put_page;
+ }
+ entry = huge_pte_mkdirty(entry);
+ }
+ entry = pte_mkyoung(entry);
+ if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
+ flags & FAULT_FLAG_WRITE))
+ update_mmu_cache(vma, haddr, ptep);
+out_put_page:
+ if (page != pagecache_page)
+ unlock_page(page);
+ put_page(page);
+out_ptl:
+ spin_unlock(ptl);
+
+ if (pagecache_page) {
+ unlock_page(pagecache_page);
+ put_page(pagecache_page);
+ }
+out_mutex:
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ i_mmap_unlock_read(mapping);
+ /*
+ * Generally it's safe to hold refcount during waiting page lock. But
+ * here we just wait to defer the next page fault to avoid busy loop and
+ * the page is not used after unlocked before returning from the current
+ * page fault. So we are safe from accessing freed page, even if we wait
+ * here without taking refcount.
+ */
+ if (need_wait_lock)
+ wait_on_page_locked(page);
+ return ret;
+}
+
+/*
+ * Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with
+ * modifications for huge pages.
+ */
+int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
+ pte_t *dst_pte,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ struct page **pagep)
+{
+ struct address_space *mapping;
+ pgoff_t idx;
+ unsigned long size;
+ int vm_shared = dst_vma->vm_flags & VM_SHARED;
+ struct hstate *h = hstate_vma(dst_vma);
+ pte_t _dst_pte;
+ spinlock_t *ptl;
+ int ret;
+ struct page *page;
+
+ if (!*pagep) {
+ /* If a page already exists, then it's UFFDIO_COPY for
+ * a non-missing case. Return -EEXIST.
+ */
+ if (vm_shared &&
+ hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
+ ret = -EEXIST;
+ goto out;
+ }
+
+ page = alloc_huge_page(dst_vma, dst_addr, 0);
+ if (IS_ERR(page)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = copy_huge_page_from_user(page,
+ (const void __user *) src_addr,
+ pages_per_huge_page(h), false);
+
+ /* fallback to copy_from_user outside mmap_lock */
+ if (unlikely(ret)) {
+ ret = -ENOENT;
+ *pagep = page;
+ /* don't free the page */
+ goto out;
+ }
+ } else {
+ page = *pagep;
+ *pagep = NULL;
+ }
+
+ /*
+ * The memory barrier inside __SetPageUptodate makes sure that
+ * preceding stores to the page contents become visible before
+ * the set_pte_at() write.
+ */
+ __SetPageUptodate(page);
+
+ mapping = dst_vma->vm_file->f_mapping;
+ idx = vma_hugecache_offset(h, dst_vma, dst_addr);
+
+ /*
+ * If shared, add to page cache
+ */
+ if (vm_shared) {
+ size = i_size_read(mapping->host) >> huge_page_shift(h);
+ ret = -EFAULT;
+ if (idx >= size)
+ goto out_release_nounlock;
+
+ /*
+ * Serialization between remove_inode_hugepages() and
+ * huge_add_to_page_cache() below happens through the
+ * hugetlb_fault_mutex_table that here must be hold by
+ * the caller.
+ */
+ ret = huge_add_to_page_cache(page, mapping, idx);
+ if (ret)
+ goto out_release_nounlock;
+ }
+
+ ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
+ spin_lock(ptl);
+
+ /*
+ * Recheck the i_size after holding PT lock to make sure not
+ * to leave any page mapped (as page_mapped()) beyond the end
+ * of the i_size (remove_inode_hugepages() is strict about
+ * enforcing that). If we bail out here, we'll also leave a
+ * page in the radix tree in the vm_shared case beyond the end
+ * of the i_size, but remove_inode_hugepages() will take care
+ * of it as soon as we drop the hugetlb_fault_mutex_table.
+ */
+ size = i_size_read(mapping->host) >> huge_page_shift(h);
+ ret = -EFAULT;
+ if (idx >= size)
+ goto out_release_unlock;
+
+ ret = -EEXIST;
+ if (!huge_pte_none(huge_ptep_get(dst_pte)))
+ goto out_release_unlock;
+
+ if (vm_shared) {
+ page_dup_rmap(page, true);
+ } else {
+ ClearPagePrivate(page);
+ hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
+ }
+
+ _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE);
+ if (dst_vma->vm_flags & VM_WRITE)
+ _dst_pte = huge_pte_mkdirty(_dst_pte);
+ _dst_pte = pte_mkyoung(_dst_pte);
+
+ set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+
+ (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte,
+ dst_vma->vm_flags & VM_WRITE);
+ hugetlb_count_add(pages_per_huge_page(h), dst_mm);
+
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(dst_vma, dst_addr, dst_pte);
+
+ spin_unlock(ptl);
+ set_page_huge_active(page);
+ if (vm_shared)
+ unlock_page(page);
+ ret = 0;
+out:
+ return ret;
+out_release_unlock:
+ spin_unlock(ptl);
+ if (vm_shared)
+ unlock_page(page);
+out_release_nounlock:
+ put_page(page);
+ goto out;
+}
+
+long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct page **pages, struct vm_area_struct **vmas,
+ unsigned long *position, unsigned long *nr_pages,
+ long i, unsigned int flags, int *locked)
+{
+ unsigned long pfn_offset;
+ unsigned long vaddr = *position;
+ unsigned long remainder = *nr_pages;
+ struct hstate *h = hstate_vma(vma);
+ int err = -EFAULT;
+
+ while (vaddr < vma->vm_end && remainder) {
+ pte_t *pte;
+ spinlock_t *ptl = NULL;
+ int absent;
+ struct page *page;
+
+ /*
+ * If we have a pending SIGKILL, don't keep faulting pages and
+ * potentially allocating memory.
+ */
+ if (fatal_signal_pending(current)) {
+ remainder = 0;
+ break;
+ }
+
+ /*
+ * Some archs (sparc64, sh*) have multiple pte_ts to
+ * each hugepage. We have to make sure we get the
+ * first, for the page indexing below to work.
+ *
+ * Note that page table lock is not held when pte is null.
+ */
+ pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
+ huge_page_size(h));
+ if (pte)
+ ptl = huge_pte_lock(h, mm, pte);
+ absent = !pte || huge_pte_none(huge_ptep_get(pte));
+
+ /*
+ * When coredumping, it suits get_dump_page if we just return
+ * an error where there's an empty slot with no huge pagecache
+ * to back it. This way, we avoid allocating a hugepage, and
+ * the sparse dumpfile avoids allocating disk blocks, but its
+ * huge holes still show up with zeroes where they need to be.
+ */
+ if (absent && (flags & FOLL_DUMP) &&
+ !hugetlbfs_pagecache_present(h, vma, vaddr)) {
+ if (pte)
+ spin_unlock(ptl);
+ remainder = 0;
+ break;
+ }
+
+ /*
+ * We need call hugetlb_fault for both hugepages under migration
+ * (in which case hugetlb_fault waits for the migration,) and
+ * hwpoisoned hugepages (in which case we need to prevent the
+ * caller from accessing to them.) In order to do this, we use
+ * here is_swap_pte instead of is_hugetlb_entry_migration and
+ * is_hugetlb_entry_hwpoisoned. This is because it simply covers
+ * both cases, and because we can't follow correct pages
+ * directly from any kind of swap entries.
+ */
+ if (absent || is_swap_pte(huge_ptep_get(pte)) ||
+ ((flags & FOLL_WRITE) &&
+ !huge_pte_write(huge_ptep_get(pte)))) {
+ vm_fault_t ret;
+ unsigned int fault_flags = 0;
+
+ if (pte)
+ spin_unlock(ptl);
+ if (flags & FOLL_WRITE)
+ fault_flags |= FAULT_FLAG_WRITE;
+ if (locked)
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY |
+ FAULT_FLAG_KILLABLE;
+ if (flags & FOLL_NOWAIT)
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY |
+ FAULT_FLAG_RETRY_NOWAIT;
+ if (flags & FOLL_TRIED) {
+ /*
+ * Note: FAULT_FLAG_ALLOW_RETRY and
+ * FAULT_FLAG_TRIED can co-exist
+ */
+ fault_flags |= FAULT_FLAG_TRIED;
+ }
+ ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
+ if (ret & VM_FAULT_ERROR) {
+ err = vm_fault_to_errno(ret, flags);
+ remainder = 0;
+ break;
+ }
+ if (ret & VM_FAULT_RETRY) {
+ if (locked &&
+ !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
+ *locked = 0;
+ *nr_pages = 0;
+ /*
+ * VM_FAULT_RETRY must not return an
+ * error, it will return zero
+ * instead.
+ *
+ * No need to update "position" as the
+ * caller will not check it after
+ * *nr_pages is set to 0.
+ */
+ return i;
+ }
+ continue;
+ }
+
+ pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
+ page = pte_page(huge_ptep_get(pte));
+
+ /*
+ * If subpage information not requested, update counters
+ * and skip the same_page loop below.
+ */
+ if (!pages && !vmas && !pfn_offset &&
+ (vaddr + huge_page_size(h) < vma->vm_end) &&
+ (remainder >= pages_per_huge_page(h))) {
+ vaddr += huge_page_size(h);
+ remainder -= pages_per_huge_page(h);
+ i += pages_per_huge_page(h);
+ spin_unlock(ptl);
+ continue;
+ }
+
+same_page:
+ if (pages) {
+ pages[i] = mem_map_offset(page, pfn_offset);
+ /*
+ * try_grab_page() should always succeed here, because:
+ * a) we hold the ptl lock, and b) we've just checked
+ * that the huge page is present in the page tables. If
+ * the huge page is present, then the tail pages must
+ * also be present. The ptl prevents the head page and
+ * tail pages from being rearranged in any way. So this
+ * page must be available at this point, unless the page
+ * refcount overflowed:
+ */
+ if (WARN_ON_ONCE(!try_grab_page(pages[i], flags))) {
+ spin_unlock(ptl);
+ remainder = 0;
+ err = -ENOMEM;
+ break;
+ }
+ }
+
+ if (vmas)
+ vmas[i] = vma;
+
+ vaddr += PAGE_SIZE;
+ ++pfn_offset;
+ --remainder;
+ ++i;
+ if (vaddr < vma->vm_end && remainder &&
+ pfn_offset < pages_per_huge_page(h)) {
+ /*
+ * We use pfn_offset to avoid touching the pageframes
+ * of this compound page.
+ */
+ goto same_page;
+ }
+ spin_unlock(ptl);
+ }
+ *nr_pages = remainder;
+ /*
+ * setting position is actually required only if remainder is
+ * not zero but it's faster not to add a "if (remainder)"
+ * branch.
+ */
+ *position = vaddr;
+
+ return i ? i : err;
+}
+
+#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
+/*
+ * ARCHes with special requirements for evicting HUGETLB backing TLB entries can
+ * implement this.
+ */
+#define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end)
+#endif
+
+unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
+ unsigned long address, unsigned long end, pgprot_t newprot)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long start = address;
+ pte_t *ptep;
+ pte_t pte;
+ struct hstate *h = hstate_vma(vma);
+ unsigned long pages = 0;
+ bool shared_pmd = false;
+ struct mmu_notifier_range range;
+
+ /*
+ * In the case of shared PMDs, the area to flush could be beyond
+ * start/end. Set range.start/range.end to cover the maximum possible
+ * range if PMD sharing is possible.
+ */
+ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA,
+ 0, vma, mm, start, end);
+ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
+
+ BUG_ON(address >= end);
+ flush_cache_range(vma, range.start, range.end);
+
+ mmu_notifier_invalidate_range_start(&range);
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+ for (; address < end; address += huge_page_size(h)) {
+ spinlock_t *ptl;
+ ptep = huge_pte_offset(mm, address, huge_page_size(h));
+ if (!ptep)
+ continue;
+ ptl = huge_pte_lock(h, mm, ptep);
+ if (huge_pmd_unshare(mm, vma, &address, ptep)) {
+ pages++;
+ spin_unlock(ptl);
+ shared_pmd = true;
+ continue;
+ }
+ pte = huge_ptep_get(ptep);
+ if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
+ spin_unlock(ptl);
+ continue;
+ }
+ if (unlikely(is_hugetlb_entry_migration(pte))) {
+ swp_entry_t entry = pte_to_swp_entry(pte);
+
+ if (is_write_migration_entry(entry)) {
+ pte_t newpte;
+
+ make_migration_entry_read(&entry);
+ newpte = swp_entry_to_pte(entry);
+ set_huge_swap_pte_at(mm, address, ptep,
+ newpte, huge_page_size(h));
+ pages++;
+ }
+ spin_unlock(ptl);
+ continue;
+ }
+ if (!huge_pte_none(pte)) {
+ pte_t old_pte;
+
+ old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
+ pte = pte_mkhuge(huge_pte_modify(old_pte, newprot));
+ pte = arch_make_huge_pte(pte, vma, NULL, 0);
+ huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
+ pages++;
+ }
+ spin_unlock(ptl);
+ }
+ /*
+ * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
+ * may have cleared our pud entry and done put_page on the page table:
+ * once we release i_mmap_rwsem, another task can do the final put_page
+ * and that page table be reused and filled with junk. If we actually
+ * did unshare a page of pmds, flush the range corresponding to the pud.
+ */
+ if (shared_pmd)
+ flush_hugetlb_tlb_range(vma, range.start, range.end);
+ else
+ flush_hugetlb_tlb_range(vma, start, end);
+ /*
+ * No need to call mmu_notifier_invalidate_range() we are downgrading
+ * page table protection not changing it to point to a new page.
+ *
+ * See Documentation/vm/mmu_notifier.rst
+ */
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ mmu_notifier_invalidate_range_end(&range);
+
+ return pages << h->order;
+}
+
+int hugetlb_reserve_pages(struct inode *inode,
+ long from, long to,
+ struct vm_area_struct *vma,
+ vm_flags_t vm_flags)
+{
+ long ret, chg, add = -1;
+ struct hstate *h = hstate_inode(inode);
+ struct hugepage_subpool *spool = subpool_inode(inode);
+ struct resv_map *resv_map;
+ struct hugetlb_cgroup *h_cg = NULL;
+ long gbl_reserve, regions_needed = 0;
+
+ /* This should never happen */
+ if (from > to) {
+ VM_WARN(1, "%s called with a negative range\n", __func__);
+ return -EINVAL;
+ }
+
+ /*
+ * Only apply hugepage reservation if asked. At fault time, an
+ * attempt will be made for VM_NORESERVE to allocate a page
+ * without using reserves
+ */
+ if (vm_flags & VM_NORESERVE)
+ return 0;
+
+ /*
+ * Shared mappings base their reservation on the number of pages that
+ * are already allocated on behalf of the file. Private mappings need
+ * to reserve the full area even if read-only as mprotect() may be
+ * called to make the mapping read-write. Assume !vma is a shm mapping
+ */
+ if (!vma || vma->vm_flags & VM_MAYSHARE) {
+ /*
+ * resv_map can not be NULL as hugetlb_reserve_pages is only
+ * called for inodes for which resv_maps were created (see
+ * hugetlbfs_get_inode).
+ */
+ resv_map = inode_resv_map(inode);
+
+ chg = region_chg(resv_map, from, to, &regions_needed);
+
+ } else {
+ /* Private mapping. */
+ resv_map = resv_map_alloc();
+ if (!resv_map)
+ return -ENOMEM;
+
+ chg = to - from;
+
+ set_vma_resv_map(vma, resv_map);
+ set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
+ }
+
+ if (chg < 0) {
+ ret = chg;
+ goto out_err;
+ }
+
+ ret = hugetlb_cgroup_charge_cgroup_rsvd(
+ hstate_index(h), chg * pages_per_huge_page(h), &h_cg);
+
+ if (ret < 0) {
+ ret = -ENOMEM;
+ goto out_err;
+ }
+
+ if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
+ /* For private mappings, the hugetlb_cgroup uncharge info hangs
+ * of the resv_map.
+ */
+ resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h);
+ }
+
+ /*
+ * There must be enough pages in the subpool for the mapping. If
+ * the subpool has a minimum size, there may be some global
+ * reservations already in place (gbl_reserve).
+ */
+ gbl_reserve = hugepage_subpool_get_pages(spool, chg);
+ if (gbl_reserve < 0) {
+ ret = -ENOSPC;
+ goto out_uncharge_cgroup;
+ }
+
+ /*
+ * Check enough hugepages are available for the reservation.
+ * Hand the pages back to the subpool if there are not
+ */
+ ret = hugetlb_acct_memory(h, gbl_reserve);
+ if (ret < 0) {
+ goto out_put_pages;
+ }
+
+ /*
+ * Account for the reservations made. Shared mappings record regions
+ * that have reservations as they are shared by multiple VMAs.
+ * When the last VMA disappears, the region map says how much
+ * the reservation was and the page cache tells how much of
+ * the reservation was consumed. Private mappings are per-VMA and
+ * only the consumed reservations are tracked. When the VMA
+ * disappears, the original reservation is the VMA size and the
+ * consumed reservations are stored in the map. Hence, nothing
+ * else has to be done for private mappings here
+ */
+ if (!vma || vma->vm_flags & VM_MAYSHARE) {
+ add = region_add(resv_map, from, to, regions_needed, h, h_cg);
+
+ if (unlikely(add < 0)) {
+ hugetlb_acct_memory(h, -gbl_reserve);
+ ret = add;
+ goto out_put_pages;
+ } else if (unlikely(chg > add)) {
+ /*
+ * pages in this range were added to the reserve
+ * map between region_chg and region_add. This
+ * indicates a race with alloc_huge_page. Adjust
+ * the subpool and reserve counts modified above
+ * based on the difference.
+ */
+ long rsv_adjust;
+
+ /*
+ * hugetlb_cgroup_uncharge_cgroup_rsvd() will put the
+ * reference to h_cg->css. See comment below for detail.
+ */
+ hugetlb_cgroup_uncharge_cgroup_rsvd(
+ hstate_index(h),
+ (chg - add) * pages_per_huge_page(h), h_cg);
+
+ rsv_adjust = hugepage_subpool_put_pages(spool,
+ chg - add);
+ hugetlb_acct_memory(h, -rsv_adjust);
+ } else if (h_cg) {
+ /*
+ * The file_regions will hold their own reference to
+ * h_cg->css. So we should release the reference held
+ * via hugetlb_cgroup_charge_cgroup_rsvd() when we are
+ * done.
+ */
+ hugetlb_cgroup_put_rsvd_cgroup(h_cg);
+ }
+ }
+ return 0;
+out_put_pages:
+ /* put back original number of pages, chg */
+ (void)hugepage_subpool_put_pages(spool, chg);
+out_uncharge_cgroup:
+ hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
+ chg * pages_per_huge_page(h), h_cg);
+out_err:
+ if (!vma || vma->vm_flags & VM_MAYSHARE)
+ /* Only call region_abort if the region_chg succeeded but the
+ * region_add failed or didn't run.
+ */
+ if (chg >= 0 && add < 0)
+ region_abort(resv_map, from, to, regions_needed);
+ if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
+ kref_put(&resv_map->refs, resv_map_release);
+ return ret;
+}
+
+long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
+ long freed)
+{
+ struct hstate *h = hstate_inode(inode);
+ struct resv_map *resv_map = inode_resv_map(inode);
+ long chg = 0;
+ struct hugepage_subpool *spool = subpool_inode(inode);
+ long gbl_reserve;
+
+ /*
+ * Since this routine can be called in the evict inode path for all
+ * hugetlbfs inodes, resv_map could be NULL.
+ */
+ if (resv_map) {
+ chg = region_del(resv_map, start, end);
+ /*
+ * region_del() can fail in the rare case where a region
+ * must be split and another region descriptor can not be
+ * allocated. If end == LONG_MAX, it will not fail.
+ */
+ if (chg < 0)
+ return chg;
+ }
+
+ spin_lock(&inode->i_lock);
+ inode->i_blocks -= (blocks_per_huge_page(h) * freed);
+ spin_unlock(&inode->i_lock);
+
+ /*
+ * If the subpool has a minimum size, the number of global
+ * reservations to be released may be adjusted.
+ */
+ gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
+ hugetlb_acct_memory(h, -gbl_reserve);
+
+ return 0;
+}
+
+#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+static unsigned long page_table_shareable(struct vm_area_struct *svma,
+ struct vm_area_struct *vma,
+ unsigned long addr, pgoff_t idx)
+{
+ unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
+ svma->vm_start;
+ unsigned long sbase = saddr & PUD_MASK;
+ unsigned long s_end = sbase + PUD_SIZE;
+
+ /* Allow segments to share if only one is marked locked */
+ unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
+ unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK;
+
+ /*
+ * match the virtual addresses, permission and the alignment of the
+ * page table page.
+ */
+ if (pmd_index(addr) != pmd_index(saddr) ||
+ vm_flags != svm_flags ||
+ sbase < svma->vm_start || svma->vm_end < s_end)
+ return 0;
+
+ return saddr;
+}
+
+static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
+{
+ unsigned long base = addr & PUD_MASK;
+ unsigned long end = base + PUD_SIZE;
+
+ /*
+ * check on proper vm_flags and page table alignment
+ */
+ if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end))
+ return true;
+ return false;
+}
+
+/*
+ * Determine if start,end range within vma could be mapped by shared pmd.
+ * If yes, adjust start and end to cover range associated with possible
+ * shared pmd mappings.
+ */
+void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
+ unsigned long *start, unsigned long *end)
+{
+ unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE),
+ v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
+
+ /*
+ * vma need span at least one aligned PUD size and the start,end range
+ * must at least partialy within it.
+ */
+ if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) ||
+ (*end <= v_start) || (*start >= v_end))
+ return;
+
+ /* Extend the range to be PUD aligned for a worst case scenario */
+ if (*start > v_start)
+ *start = ALIGN_DOWN(*start, PUD_SIZE);
+
+ if (*end < v_end)
+ *end = ALIGN(*end, PUD_SIZE);
+}
+
+/*
+ * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
+ * and returns the corresponding pte. While this is not necessary for the
+ * !shared pmd case because we can allocate the pmd later as well, it makes the
+ * code much cleaner.
+ *
+ * This routine must be called with i_mmap_rwsem held in at least read mode if
+ * sharing is possible. For hugetlbfs, this prevents removal of any page
+ * table entries associated with the address space. This is important as we
+ * are setting up sharing based on existing page table entries (mappings).
+ *
+ * NOTE: This routine is only called from huge_pte_alloc. Some callers of
+ * huge_pte_alloc know that sharing is not possible and do not take
+ * i_mmap_rwsem as a performance optimization. This is handled by the
+ * if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is
+ * only required for subsequent processing.
+ */
+pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+{
+ struct vm_area_struct *vma = find_vma(mm, addr);
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
+ vma->vm_pgoff;
+ struct vm_area_struct *svma;
+ unsigned long saddr;
+ pte_t *spte = NULL;
+ pte_t *pte;
+ spinlock_t *ptl;
+
+ if (!vma_shareable(vma, addr))
+ return (pte_t *)pmd_alloc(mm, pud, addr);
+
+ i_mmap_assert_locked(mapping);
+ vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
+ if (svma == vma)
+ continue;
+
+ saddr = page_table_shareable(svma, vma, addr, idx);
+ if (saddr) {
+ spte = huge_pte_offset(svma->vm_mm, saddr,
+ vma_mmu_pagesize(svma));
+ if (spte) {
+ get_page(virt_to_page(spte));
+ break;
+ }
+ }
+ }
+
+ if (!spte)
+ goto out;
+
+ ptl = huge_pte_lock(hstate_vma(vma), mm, spte);
+ if (pud_none(*pud)) {
+ pud_populate(mm, pud,
+ (pmd_t *)((unsigned long)spte & PAGE_MASK));
+ mm_inc_nr_pmds(mm);
+ } else {
+ put_page(virt_to_page(spte));
+ }
+ spin_unlock(ptl);
+out:
+ pte = (pte_t *)pmd_alloc(mm, pud, addr);
+ return pte;
+}
+
+/*
+ * unmap huge page backed by shared pte.
+ *
+ * Hugetlb pte page is ref counted at the time of mapping. If pte is shared
+ * indicated by page_count > 1, unmap is achieved by clearing pud and
+ * decrementing the ref count. If count == 1, the pte page is not shared.
+ *
+ * Called with page table lock held and i_mmap_rwsem held in write mode.
+ *
+ * returns: 1 successfully unmapped a shared pte page
+ * 0 the underlying pte page is not shared, or it is the last user
+ */
+int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long *addr, pte_t *ptep)
+{
+ pgd_t *pgd = pgd_offset(mm, *addr);
+ p4d_t *p4d = p4d_offset(pgd, *addr);
+ pud_t *pud = pud_offset(p4d, *addr);
+
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+ BUG_ON(page_count(virt_to_page(ptep)) == 0);
+ if (page_count(virt_to_page(ptep)) == 1)
+ return 0;
+
+ pud_clear(pud);
+ put_page(virt_to_page(ptep));
+ mm_dec_nr_pmds(mm);
+ /*
+ * This update of passed address optimizes loops sequentially
+ * processing addresses in increments of huge page size (PMD_SIZE
+ * in this case). By clearing the pud, a PUD_SIZE area is unmapped.
+ * Update address to the 'last page' in the cleared area so that
+ * calling loop can move to first page past this area.
+ */
+ *addr |= PUD_SIZE - PMD_SIZE;
+ return 1;
+}
+#define want_pmd_share() (1)
+#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+{
+ return NULL;
+}
+
+int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long *addr, pte_t *ptep)
+{
+ return 0;
+}
+
+void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
+ unsigned long *start, unsigned long *end)
+{
+}
+#define want_pmd_share() (0)
+#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+
+#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
+pte_t *huge_pte_alloc(struct mm_struct *mm,
+ unsigned long addr, unsigned long sz)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pte_t *pte = NULL;
+
+ pgd = pgd_offset(mm, addr);
+ p4d = p4d_alloc(mm, pgd, addr);
+ if (!p4d)
+ return NULL;
+ pud = pud_alloc(mm, p4d, addr);
+ if (pud) {
+ if (sz == PUD_SIZE) {
+ pte = (pte_t *)pud;
+ } else {
+ BUG_ON(sz != PMD_SIZE);
+ if (want_pmd_share() && pud_none(*pud))
+ pte = huge_pmd_share(mm, addr, pud);
+ else
+ pte = (pte_t *)pmd_alloc(mm, pud, addr);
+ }
+ }
+ BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte));
+
+ return pte;
+}
+
+/*
+ * huge_pte_offset() - Walk the page table to resolve the hugepage
+ * entry at address @addr
+ *
+ * Return: Pointer to page table entry (PUD or PMD) for
+ * address @addr, or NULL if a !p*d_present() entry is encountered and the
+ * size @sz doesn't match the hugepage size at this level of the page
+ * table.
+ */
+pte_t *huge_pte_offset(struct mm_struct *mm,
+ unsigned long addr, unsigned long sz)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ pgd = pgd_offset(mm, addr);
+ if (!pgd_present(*pgd))
+ return NULL;
+ p4d = p4d_offset(pgd, addr);
+ if (!p4d_present(*p4d))
+ return NULL;
+
+ pud = pud_offset(p4d, addr);
+ if (sz == PUD_SIZE)
+ /* must be pud huge, non-present or none */
+ return (pte_t *)pud;
+ if (!pud_present(*pud))
+ return NULL;
+ /* must have a valid entry and size to go further */
+
+ pmd = pmd_offset(pud, addr);
+ /* must be pmd huge, non-present or none */
+ return (pte_t *)pmd;
+}
+
+#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
+
+/*
+ * These functions are overwritable if your architecture needs its own
+ * behavior.
+ */
+struct page * __weak
+follow_huge_addr(struct mm_struct *mm, unsigned long address,
+ int write)
+{
+ return ERR_PTR(-EINVAL);
+}
+
+struct page * __weak
+follow_huge_pd(struct vm_area_struct *vma,
+ unsigned long address, hugepd_t hpd, int flags, int pdshift)
+{
+ WARN(1, "hugepd follow called with no support for hugepage directory format\n");
+ return NULL;
+}
+
+struct page * __weak
+follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, int flags)
+{
+ struct hstate *h = hstate_vma(vma);
+ struct mm_struct *mm = vma->vm_mm;
+ struct page *page = NULL;
+ spinlock_t *ptl;
+ pte_t *ptep, pte;
+
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
+ (FOLL_PIN | FOLL_GET)))
+ return NULL;
+
+retry:
+ ptep = huge_pte_offset(mm, address, huge_page_size(h));
+ if (!ptep)
+ return NULL;
+
+ ptl = huge_pte_lock(h, mm, ptep);
+ pte = huge_ptep_get(ptep);
+ if (pte_present(pte)) {
+ page = pte_page(pte) +
+ ((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
+ /*
+ * try_grab_page() should always succeed here, because: a) we
+ * hold the pmd (ptl) lock, and b) we've just checked that the
+ * huge pmd (head) page is present in the page tables. The ptl
+ * prevents the head page and tail pages from being rearranged
+ * in any way. So this page must be available at this point,
+ * unless the page refcount overflowed:
+ */
+ if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
+ page = NULL;
+ goto out;
+ }
+ } else {
+ if (is_hugetlb_entry_migration(pte)) {
+ spin_unlock(ptl);
+ __migration_entry_wait(mm, ptep, ptl);
+ goto retry;
+ }
+ /*
+ * hwpoisoned entry is treated as no_page_table in
+ * follow_page_mask().
+ */
+ }
+out:
+ spin_unlock(ptl);
+ return page;
+}
+
+struct page * __weak
+follow_huge_pud(struct mm_struct *mm, unsigned long address,
+ pud_t *pud, int flags)
+{
+ if (flags & (FOLL_GET | FOLL_PIN))
+ return NULL;
+
+ return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
+}
+
+struct page * __weak
+follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
+{
+ if (flags & (FOLL_GET | FOLL_PIN))
+ return NULL;
+
+ return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
+}
+
+int isolate_hugetlb(struct page *page, struct list_head *list)
+{
+ int ret = 0;
+
+ spin_lock(&hugetlb_lock);
+ if (!PageHeadHuge(page) || !page_huge_active(page) ||
+ !get_page_unless_zero(page)) {
+ ret = -EBUSY;
+ goto unlock;
+ }
+ clear_page_huge_active(page);
+ list_move_tail(&page->lru, list);
+unlock:
+ spin_unlock(&hugetlb_lock);
+ return ret;
+}
+
+void putback_active_hugepage(struct page *page)
+{
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+ spin_lock(&hugetlb_lock);
+ set_page_huge_active(page);
+ list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
+ spin_unlock(&hugetlb_lock);
+ put_page(page);
+}
+
+void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
+{
+ struct hstate *h = page_hstate(oldpage);
+
+ hugetlb_cgroup_migrate(oldpage, newpage);
+ set_page_owner_migrate_reason(newpage, reason);
+
+ /*
+ * transfer temporary state of the new huge page. This is
+ * reverse to other transitions because the newpage is going to
+ * be final while the old one will be freed so it takes over
+ * the temporary status.
+ *
+ * Also note that we have to transfer the per-node surplus state
+ * here as well otherwise the global surplus count will not match
+ * the per-node's.
+ */
+ if (PageHugeTemporary(newpage)) {
+ int old_nid = page_to_nid(oldpage);
+ int new_nid = page_to_nid(newpage);
+
+ SetPageHugeTemporary(oldpage);
+ ClearPageHugeTemporary(newpage);
+
+ spin_lock(&hugetlb_lock);
+ if (h->surplus_huge_pages_node[old_nid]) {
+ h->surplus_huge_pages_node[old_nid]--;
+ h->surplus_huge_pages_node[new_nid]++;
+ }
+ spin_unlock(&hugetlb_lock);
+ }
+}
+
+#ifdef CONFIG_CMA
+static bool cma_reserve_called __initdata;
+
+static int __init cmdline_parse_hugetlb_cma(char *p)
+{
+ hugetlb_cma_size = memparse(p, &p);
+ return 0;
+}
+
+early_param("hugetlb_cma", cmdline_parse_hugetlb_cma);
+
+void __init hugetlb_cma_reserve(int order)
+{
+ unsigned long size, reserved, per_node;
+ int nid;
+
+ cma_reserve_called = true;
+
+ if (!hugetlb_cma_size)
+ return;
+
+ if (hugetlb_cma_size < (PAGE_SIZE << order)) {
+ pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n",
+ (PAGE_SIZE << order) / SZ_1M);
+ return;
+ }
+
+ /*
+ * If 3 GB area is requested on a machine with 4 numa nodes,
+ * let's allocate 1 GB on first three nodes and ignore the last one.
+ */
+ per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
+ pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
+ hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
+
+ reserved = 0;
+ for_each_node_state(nid, N_ONLINE) {
+ int res;
+ char name[CMA_MAX_NAME];
+
+ size = min(per_node, hugetlb_cma_size - reserved);
+ size = round_up(size, PAGE_SIZE << order);
+
+ snprintf(name, sizeof(name), "hugetlb%d", nid);
+ res = cma_declare_contiguous_nid(0, size, 0, PAGE_SIZE << order,
+ 0, false, name,
+ &hugetlb_cma[nid], nid);
+ if (res) {
+ pr_warn("hugetlb_cma: reservation failed: err %d, node %d",
+ res, nid);
+ continue;
+ }
+
+ reserved += size;
+ pr_info("hugetlb_cma: reserved %lu MiB on node %d\n",
+ size / SZ_1M, nid);
+
+ if (reserved >= hugetlb_cma_size)
+ break;
+ }
+}
+
+void __init hugetlb_cma_check(void)
+{
+ if (!hugetlb_cma_size || cma_reserve_called)
+ return;
+
+ pr_warn("hugetlb_cma: the option isn't supported by current arch\n");
+}
+
+#endif /* CONFIG_CMA */
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
new file mode 100644
index 000000000..1348819f5
--- /dev/null
+++ b/mm/hugetlb_cgroup.c
@@ -0,0 +1,812 @@
+/*
+ *
+ * Copyright IBM Corporation, 2012
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * Cgroup v2
+ * Copyright (C) 2019 Red Hat, Inc.
+ * Author: Giuseppe Scrivano <gscrivan@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include <linux/cgroup.h>
+#include <linux/page_counter.h>
+#include <linux/slab.h>
+#include <linux/hugetlb.h>
+#include <linux/hugetlb_cgroup.h>
+
+#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
+#define MEMFILE_IDX(val) (((val) >> 16) & 0xffff)
+#define MEMFILE_ATTR(val) ((val) & 0xffff)
+
+#define hugetlb_cgroup_from_counter(counter, idx) \
+ container_of(counter, struct hugetlb_cgroup, hugepage[idx])
+
+static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
+
+static inline struct page_counter *
+__hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx,
+ bool rsvd)
+{
+ if (rsvd)
+ return &h_cg->rsvd_hugepage[idx];
+ return &h_cg->hugepage[idx];
+}
+
+static inline struct page_counter *
+hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx)
+{
+ return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, false);
+}
+
+static inline struct page_counter *
+hugetlb_cgroup_counter_from_cgroup_rsvd(struct hugetlb_cgroup *h_cg, int idx)
+{
+ return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, true);
+}
+
+static inline
+struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
+{
+ return s ? container_of(s, struct hugetlb_cgroup, css) : NULL;
+}
+
+static inline
+struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
+{
+ return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id));
+}
+
+static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
+{
+ return (h_cg == root_h_cgroup);
+}
+
+static inline struct hugetlb_cgroup *
+parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg)
+{
+ return hugetlb_cgroup_from_css(h_cg->css.parent);
+}
+
+static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
+{
+ int idx;
+
+ for (idx = 0; idx < hugetlb_max_hstate; idx++) {
+ if (page_counter_read(
+ hugetlb_cgroup_counter_from_cgroup(h_cg, idx)))
+ return true;
+ }
+ return false;
+}
+
+static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
+ struct hugetlb_cgroup *parent_h_cgroup)
+{
+ int idx;
+
+ for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
+ struct page_counter *fault_parent = NULL;
+ struct page_counter *rsvd_parent = NULL;
+ unsigned long limit;
+ int ret;
+
+ if (parent_h_cgroup) {
+ fault_parent = hugetlb_cgroup_counter_from_cgroup(
+ parent_h_cgroup, idx);
+ rsvd_parent = hugetlb_cgroup_counter_from_cgroup_rsvd(
+ parent_h_cgroup, idx);
+ }
+ page_counter_init(hugetlb_cgroup_counter_from_cgroup(h_cgroup,
+ idx),
+ fault_parent);
+ page_counter_init(
+ hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx),
+ rsvd_parent);
+
+ limit = round_down(PAGE_COUNTER_MAX,
+ 1 << huge_page_order(&hstates[idx]));
+
+ ret = page_counter_set_max(
+ hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx),
+ limit);
+ VM_BUG_ON(ret);
+ ret = page_counter_set_max(
+ hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx),
+ limit);
+ VM_BUG_ON(ret);
+ }
+}
+
+static struct cgroup_subsys_state *
+hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
+ struct hugetlb_cgroup *h_cgroup;
+
+ h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL);
+ if (!h_cgroup)
+ return ERR_PTR(-ENOMEM);
+
+ if (!parent_h_cgroup)
+ root_h_cgroup = h_cgroup;
+
+ hugetlb_cgroup_init(h_cgroup, parent_h_cgroup);
+ return &h_cgroup->css;
+}
+
+static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
+{
+ struct hugetlb_cgroup *h_cgroup;
+
+ h_cgroup = hugetlb_cgroup_from_css(css);
+ kfree(h_cgroup);
+}
+
+/*
+ * Should be called with hugetlb_lock held.
+ * Since we are holding hugetlb_lock, pages cannot get moved from
+ * active list or uncharged from the cgroup, So no need to get
+ * page reference and test for page active here. This function
+ * cannot fail.
+ */
+static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
+ struct page *page)
+{
+ unsigned int nr_pages;
+ struct page_counter *counter;
+ struct hugetlb_cgroup *page_hcg;
+ struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
+
+ page_hcg = hugetlb_cgroup_from_page(page);
+ /*
+ * We can have pages in active list without any cgroup
+ * ie, hugepage with less than 3 pages. We can safely
+ * ignore those pages.
+ */
+ if (!page_hcg || page_hcg != h_cg)
+ goto out;
+
+ nr_pages = compound_nr(page);
+ if (!parent) {
+ parent = root_h_cgroup;
+ /* root has no limit */
+ page_counter_charge(&parent->hugepage[idx], nr_pages);
+ }
+ counter = &h_cg->hugepage[idx];
+ /* Take the pages off the local counter */
+ page_counter_cancel(counter, nr_pages);
+
+ set_hugetlb_cgroup(page, parent);
+out:
+ return;
+}
+
+/*
+ * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
+ * the parent cgroup.
+ */
+static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
+{
+ struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
+ struct hstate *h;
+ struct page *page;
+ int idx;
+
+ do {
+ idx = 0;
+ for_each_hstate(h) {
+ spin_lock(&hugetlb_lock);
+ list_for_each_entry(page, &h->hugepage_activelist, lru)
+ hugetlb_cgroup_move_parent(idx, h_cg, page);
+
+ spin_unlock(&hugetlb_lock);
+ idx++;
+ }
+ cond_resched();
+ } while (hugetlb_cgroup_have_usage(h_cg));
+}
+
+static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx,
+ enum hugetlb_memory_event event)
+{
+ atomic_long_inc(&hugetlb->events_local[idx][event]);
+ cgroup_file_notify(&hugetlb->events_local_file[idx]);
+
+ do {
+ atomic_long_inc(&hugetlb->events[idx][event]);
+ cgroup_file_notify(&hugetlb->events_file[idx]);
+ } while ((hugetlb = parent_hugetlb_cgroup(hugetlb)) &&
+ !hugetlb_cgroup_is_root(hugetlb));
+}
+
+static int __hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
+ struct hugetlb_cgroup **ptr,
+ bool rsvd)
+{
+ int ret = 0;
+ struct page_counter *counter;
+ struct hugetlb_cgroup *h_cg = NULL;
+
+ if (hugetlb_cgroup_disabled())
+ goto done;
+ /*
+ * We don't charge any cgroup if the compound page have less
+ * than 3 pages.
+ */
+ if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
+ goto done;
+again:
+ rcu_read_lock();
+ h_cg = hugetlb_cgroup_from_task(current);
+ if (!css_tryget(&h_cg->css)) {
+ rcu_read_unlock();
+ goto again;
+ }
+ rcu_read_unlock();
+
+ if (!page_counter_try_charge(
+ __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd),
+ nr_pages, &counter)) {
+ ret = -ENOMEM;
+ hugetlb_event(h_cg, idx, HUGETLB_MAX);
+ css_put(&h_cg->css);
+ goto done;
+ }
+ /* Reservations take a reference to the css because they do not get
+ * reparented.
+ */
+ if (!rsvd)
+ css_put(&h_cg->css);
+done:
+ *ptr = h_cg;
+ return ret;
+}
+
+int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
+ struct hugetlb_cgroup **ptr)
+{
+ return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, false);
+}
+
+int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages,
+ struct hugetlb_cgroup **ptr)
+{
+ return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, true);
+}
+
+/* Should be called with hugetlb_lock held */
+static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
+ struct hugetlb_cgroup *h_cg,
+ struct page *page, bool rsvd)
+{
+ if (hugetlb_cgroup_disabled() || !h_cg)
+ return;
+
+ __set_hugetlb_cgroup(page, h_cg, rsvd);
+ return;
+}
+
+void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
+ struct hugetlb_cgroup *h_cg,
+ struct page *page)
+{
+ __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, page, false);
+}
+
+void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages,
+ struct hugetlb_cgroup *h_cg,
+ struct page *page)
+{
+ __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, page, true);
+}
+
+/*
+ * Should be called with hugetlb_lock held
+ */
+static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
+ struct page *page, bool rsvd)
+{
+ struct hugetlb_cgroup *h_cg;
+
+ if (hugetlb_cgroup_disabled())
+ return;
+ lockdep_assert_held(&hugetlb_lock);
+ h_cg = __hugetlb_cgroup_from_page(page, rsvd);
+ if (unlikely(!h_cg))
+ return;
+ __set_hugetlb_cgroup(page, NULL, rsvd);
+
+ page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx,
+ rsvd),
+ nr_pages);
+
+ if (rsvd)
+ css_put(&h_cg->css);
+
+ return;
+}
+
+void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
+ struct page *page)
+{
+ __hugetlb_cgroup_uncharge_page(idx, nr_pages, page, false);
+}
+
+void hugetlb_cgroup_uncharge_page_rsvd(int idx, unsigned long nr_pages,
+ struct page *page)
+{
+ __hugetlb_cgroup_uncharge_page(idx, nr_pages, page, true);
+}
+
+static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
+ struct hugetlb_cgroup *h_cg,
+ bool rsvd)
+{
+ if (hugetlb_cgroup_disabled() || !h_cg)
+ return;
+
+ if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
+ return;
+
+ page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx,
+ rsvd),
+ nr_pages);
+
+ if (rsvd)
+ css_put(&h_cg->css);
+}
+
+void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
+ struct hugetlb_cgroup *h_cg)
+{
+ __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, false);
+}
+
+void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages,
+ struct hugetlb_cgroup *h_cg)
+{
+ __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, true);
+}
+
+void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start,
+ unsigned long end)
+{
+ if (hugetlb_cgroup_disabled() || !resv || !resv->reservation_counter ||
+ !resv->css)
+ return;
+
+ page_counter_uncharge(resv->reservation_counter,
+ (end - start) * resv->pages_per_hpage);
+ css_put(resv->css);
+}
+
+void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv,
+ struct file_region *rg,
+ unsigned long nr_pages,
+ bool region_del)
+{
+ if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages)
+ return;
+
+ if (rg->reservation_counter && resv->pages_per_hpage && nr_pages > 0 &&
+ !resv->reservation_counter) {
+ page_counter_uncharge(rg->reservation_counter,
+ nr_pages * resv->pages_per_hpage);
+ /*
+ * Only do css_put(rg->css) when we delete the entire region
+ * because one file_region must hold exactly one css reference.
+ */
+ if (region_del)
+ css_put(rg->css);
+ }
+}
+
+enum {
+ RES_USAGE,
+ RES_RSVD_USAGE,
+ RES_LIMIT,
+ RES_RSVD_LIMIT,
+ RES_MAX_USAGE,
+ RES_RSVD_MAX_USAGE,
+ RES_FAILCNT,
+ RES_RSVD_FAILCNT,
+};
+
+static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct page_counter *counter;
+ struct page_counter *rsvd_counter;
+ struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
+
+ counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)];
+ rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(cft->private)];
+
+ switch (MEMFILE_ATTR(cft->private)) {
+ case RES_USAGE:
+ return (u64)page_counter_read(counter) * PAGE_SIZE;
+ case RES_RSVD_USAGE:
+ return (u64)page_counter_read(rsvd_counter) * PAGE_SIZE;
+ case RES_LIMIT:
+ return (u64)counter->max * PAGE_SIZE;
+ case RES_RSVD_LIMIT:
+ return (u64)rsvd_counter->max * PAGE_SIZE;
+ case RES_MAX_USAGE:
+ return (u64)counter->watermark * PAGE_SIZE;
+ case RES_RSVD_MAX_USAGE:
+ return (u64)rsvd_counter->watermark * PAGE_SIZE;
+ case RES_FAILCNT:
+ return counter->failcnt;
+ case RES_RSVD_FAILCNT:
+ return rsvd_counter->failcnt;
+ default:
+ BUG();
+ }
+}
+
+static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v)
+{
+ int idx;
+ u64 val;
+ struct cftype *cft = seq_cft(seq);
+ unsigned long limit;
+ struct page_counter *counter;
+ struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
+
+ idx = MEMFILE_IDX(cft->private);
+ counter = &h_cg->hugepage[idx];
+
+ limit = round_down(PAGE_COUNTER_MAX,
+ 1 << huge_page_order(&hstates[idx]));
+
+ switch (MEMFILE_ATTR(cft->private)) {
+ case RES_RSVD_USAGE:
+ counter = &h_cg->rsvd_hugepage[idx];
+ fallthrough;
+ case RES_USAGE:
+ val = (u64)page_counter_read(counter);
+ seq_printf(seq, "%llu\n", val * PAGE_SIZE);
+ break;
+ case RES_RSVD_LIMIT:
+ counter = &h_cg->rsvd_hugepage[idx];
+ fallthrough;
+ case RES_LIMIT:
+ val = (u64)counter->max;
+ if (val == limit)
+ seq_puts(seq, "max\n");
+ else
+ seq_printf(seq, "%llu\n", val * PAGE_SIZE);
+ break;
+ default:
+ BUG();
+ }
+
+ return 0;
+}
+
+static DEFINE_MUTEX(hugetlb_limit_mutex);
+
+static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off,
+ const char *max)
+{
+ int ret, idx;
+ unsigned long nr_pages;
+ struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
+ bool rsvd = false;
+
+ if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */
+ return -EINVAL;
+
+ buf = strstrip(buf);
+ ret = page_counter_memparse(buf, max, &nr_pages);
+ if (ret)
+ return ret;
+
+ idx = MEMFILE_IDX(of_cft(of)->private);
+ nr_pages = round_down(nr_pages, 1 << huge_page_order(&hstates[idx]));
+
+ switch (MEMFILE_ATTR(of_cft(of)->private)) {
+ case RES_RSVD_LIMIT:
+ rsvd = true;
+ fallthrough;
+ case RES_LIMIT:
+ mutex_lock(&hugetlb_limit_mutex);
+ ret = page_counter_set_max(
+ __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd),
+ nr_pages);
+ mutex_unlock(&hugetlb_limit_mutex);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ return ret ?: nbytes;
+}
+
+static ssize_t hugetlb_cgroup_write_legacy(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return hugetlb_cgroup_write(of, buf, nbytes, off, "-1");
+}
+
+static ssize_t hugetlb_cgroup_write_dfl(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return hugetlb_cgroup_write(of, buf, nbytes, off, "max");
+}
+
+static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ int ret = 0;
+ struct page_counter *counter, *rsvd_counter;
+ struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
+
+ counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)];
+ rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(of_cft(of)->private)];
+
+ switch (MEMFILE_ATTR(of_cft(of)->private)) {
+ case RES_MAX_USAGE:
+ page_counter_reset_watermark(counter);
+ break;
+ case RES_RSVD_MAX_USAGE:
+ page_counter_reset_watermark(rsvd_counter);
+ break;
+ case RES_FAILCNT:
+ counter->failcnt = 0;
+ break;
+ case RES_RSVD_FAILCNT:
+ rsvd_counter->failcnt = 0;
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ return ret ?: nbytes;
+}
+
+static char *mem_fmt(char *buf, int size, unsigned long hsize)
+{
+ if (hsize >= (1UL << 30))
+ snprintf(buf, size, "%luGB", hsize >> 30);
+ else if (hsize >= (1UL << 20))
+ snprintf(buf, size, "%luMB", hsize >> 20);
+ else
+ snprintf(buf, size, "%luKB", hsize >> 10);
+ return buf;
+}
+
+static int __hugetlb_events_show(struct seq_file *seq, bool local)
+{
+ int idx;
+ long max;
+ struct cftype *cft = seq_cft(seq);
+ struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
+
+ idx = MEMFILE_IDX(cft->private);
+
+ if (local)
+ max = atomic_long_read(&h_cg->events_local[idx][HUGETLB_MAX]);
+ else
+ max = atomic_long_read(&h_cg->events[idx][HUGETLB_MAX]);
+
+ seq_printf(seq, "max %lu\n", max);
+
+ return 0;
+}
+
+static int hugetlb_events_show(struct seq_file *seq, void *v)
+{
+ return __hugetlb_events_show(seq, false);
+}
+
+static int hugetlb_events_local_show(struct seq_file *seq, void *v)
+{
+ return __hugetlb_events_show(seq, true);
+}
+
+static void __init __hugetlb_cgroup_file_dfl_init(int idx)
+{
+ char buf[32];
+ struct cftype *cft;
+ struct hstate *h = &hstates[idx];
+
+ /* format the size */
+ mem_fmt(buf, sizeof(buf), huge_page_size(h));
+
+ /* Add the limit file */
+ cft = &h->cgroup_files_dfl[0];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max", buf);
+ cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
+ cft->seq_show = hugetlb_cgroup_read_u64_max;
+ cft->write = hugetlb_cgroup_write_dfl;
+ cft->flags = CFTYPE_NOT_ON_ROOT;
+
+ /* Add the reservation limit file */
+ cft = &h->cgroup_files_dfl[1];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.max", buf);
+ cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_LIMIT);
+ cft->seq_show = hugetlb_cgroup_read_u64_max;
+ cft->write = hugetlb_cgroup_write_dfl;
+ cft->flags = CFTYPE_NOT_ON_ROOT;
+
+ /* Add the current usage file */
+ cft = &h->cgroup_files_dfl[2];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.current", buf);
+ cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
+ cft->seq_show = hugetlb_cgroup_read_u64_max;
+ cft->flags = CFTYPE_NOT_ON_ROOT;
+
+ /* Add the current reservation usage file */
+ cft = &h->cgroup_files_dfl[3];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.current", buf);
+ cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_USAGE);
+ cft->seq_show = hugetlb_cgroup_read_u64_max;
+ cft->flags = CFTYPE_NOT_ON_ROOT;
+
+ /* Add the events file */
+ cft = &h->cgroup_files_dfl[4];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events", buf);
+ cft->private = MEMFILE_PRIVATE(idx, 0);
+ cft->seq_show = hugetlb_events_show;
+ cft->file_offset = offsetof(struct hugetlb_cgroup, events_file[idx]);
+ cft->flags = CFTYPE_NOT_ON_ROOT;
+
+ /* Add the events.local file */
+ cft = &h->cgroup_files_dfl[5];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events.local", buf);
+ cft->private = MEMFILE_PRIVATE(idx, 0);
+ cft->seq_show = hugetlb_events_local_show;
+ cft->file_offset = offsetof(struct hugetlb_cgroup,
+ events_local_file[idx]);
+ cft->flags = CFTYPE_NOT_ON_ROOT;
+
+ /* NULL terminate the last cft */
+ cft = &h->cgroup_files_dfl[6];
+ memset(cft, 0, sizeof(*cft));
+
+ WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys,
+ h->cgroup_files_dfl));
+}
+
+static void __init __hugetlb_cgroup_file_legacy_init(int idx)
+{
+ char buf[32];
+ struct cftype *cft;
+ struct hstate *h = &hstates[idx];
+
+ /* format the size */
+ mem_fmt(buf, sizeof(buf), huge_page_size(h));
+
+ /* Add the limit file */
+ cft = &h->cgroup_files_legacy[0];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
+ cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
+ cft->read_u64 = hugetlb_cgroup_read_u64;
+ cft->write = hugetlb_cgroup_write_legacy;
+
+ /* Add the reservation limit file */
+ cft = &h->cgroup_files_legacy[1];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.limit_in_bytes", buf);
+ cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_LIMIT);
+ cft->read_u64 = hugetlb_cgroup_read_u64;
+ cft->write = hugetlb_cgroup_write_legacy;
+
+ /* Add the usage file */
+ cft = &h->cgroup_files_legacy[2];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
+ cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
+ cft->read_u64 = hugetlb_cgroup_read_u64;
+
+ /* Add the reservation usage file */
+ cft = &h->cgroup_files_legacy[3];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.usage_in_bytes", buf);
+ cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_USAGE);
+ cft->read_u64 = hugetlb_cgroup_read_u64;
+
+ /* Add the MAX usage file */
+ cft = &h->cgroup_files_legacy[4];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
+ cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
+ cft->write = hugetlb_cgroup_reset;
+ cft->read_u64 = hugetlb_cgroup_read_u64;
+
+ /* Add the MAX reservation usage file */
+ cft = &h->cgroup_files_legacy[5];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.max_usage_in_bytes", buf);
+ cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_MAX_USAGE);
+ cft->write = hugetlb_cgroup_reset;
+ cft->read_u64 = hugetlb_cgroup_read_u64;
+
+ /* Add the failcntfile */
+ cft = &h->cgroup_files_legacy[6];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
+ cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
+ cft->write = hugetlb_cgroup_reset;
+ cft->read_u64 = hugetlb_cgroup_read_u64;
+
+ /* Add the reservation failcntfile */
+ cft = &h->cgroup_files_legacy[7];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.failcnt", buf);
+ cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_FAILCNT);
+ cft->write = hugetlb_cgroup_reset;
+ cft->read_u64 = hugetlb_cgroup_read_u64;
+
+ /* NULL terminate the last cft */
+ cft = &h->cgroup_files_legacy[8];
+ memset(cft, 0, sizeof(*cft));
+
+ WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys,
+ h->cgroup_files_legacy));
+}
+
+static void __init __hugetlb_cgroup_file_init(int idx)
+{
+ __hugetlb_cgroup_file_dfl_init(idx);
+ __hugetlb_cgroup_file_legacy_init(idx);
+}
+
+void __init hugetlb_cgroup_file_init(void)
+{
+ struct hstate *h;
+
+ for_each_hstate(h) {
+ /*
+ * Add cgroup control files only if the huge page consists
+ * of more than two normal pages. This is because we use
+ * page[2].private for storing cgroup details.
+ */
+ if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER)
+ __hugetlb_cgroup_file_init(hstate_index(h));
+ }
+}
+
+/*
+ * hugetlb_lock will make sure a parallel cgroup rmdir won't happen
+ * when we migrate hugepages
+ */
+void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
+{
+ struct hugetlb_cgroup *h_cg;
+ struct hugetlb_cgroup *h_cg_rsvd;
+ struct hstate *h = page_hstate(oldhpage);
+
+ if (hugetlb_cgroup_disabled())
+ return;
+
+ VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage);
+ spin_lock(&hugetlb_lock);
+ h_cg = hugetlb_cgroup_from_page(oldhpage);
+ h_cg_rsvd = hugetlb_cgroup_from_page_rsvd(oldhpage);
+ set_hugetlb_cgroup(oldhpage, NULL);
+ set_hugetlb_cgroup_rsvd(oldhpage, NULL);
+
+ /* move the h_cg details to new cgroup */
+ set_hugetlb_cgroup(newhpage, h_cg);
+ set_hugetlb_cgroup_rsvd(newhpage, h_cg_rsvd);
+ list_move(&newhpage->lru, &h->hugepage_activelist);
+ spin_unlock(&hugetlb_lock);
+ return;
+}
+
+static struct cftype hugetlb_files[] = {
+ {} /* terminate */
+};
+
+struct cgroup_subsys hugetlb_cgrp_subsys = {
+ .css_alloc = hugetlb_cgroup_css_alloc,
+ .css_offline = hugetlb_cgroup_css_offline,
+ .css_free = hugetlb_cgroup_css_free,
+ .dfl_cftypes = hugetlb_files,
+ .legacy_cftypes = hugetlb_files,
+};
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
new file mode 100644
index 000000000..1ae1ebc2b
--- /dev/null
+++ b/mm/hwpoison-inject.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Inject a hwpoison memory failure on a arbitrary pfn */
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/hugetlb.h>
+#include "internal.h"
+
+static struct dentry *hwpoison_dir;
+
+static int hwpoison_inject(void *data, u64 val)
+{
+ unsigned long pfn = val;
+ struct page *p;
+ struct page *hpage;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!pfn_valid(pfn))
+ return -ENXIO;
+
+ p = pfn_to_page(pfn);
+ hpage = compound_head(p);
+
+ if (!hwpoison_filter_enable)
+ goto inject;
+
+ shake_page(hpage, 0);
+ /*
+ * This implies unable to support non-LRU pages.
+ */
+ if (!PageLRU(hpage) && !PageHuge(p))
+ return 0;
+
+ /*
+ * do a racy check to make sure PG_hwpoison will only be set for
+ * the targeted owner (or on a free page).
+ * memory_failure() will redo the check reliably inside page lock.
+ */
+ err = hwpoison_filter(hpage);
+ if (err)
+ return 0;
+
+inject:
+ pr_info("Injecting memory failure at pfn %#lx\n", pfn);
+ return memory_failure(pfn, 0);
+}
+
+static int hwpoison_unpoison(void *data, u64 val)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return unpoison_memory(val);
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n");
+DEFINE_DEBUGFS_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n");
+
+static void pfn_inject_exit(void)
+{
+ debugfs_remove_recursive(hwpoison_dir);
+}
+
+static int pfn_inject_init(void)
+{
+ hwpoison_dir = debugfs_create_dir("hwpoison", NULL);
+
+ /*
+ * Note that the below poison/unpoison interfaces do not involve
+ * hardware status change, hence do not require hardware support.
+ * They are mainly for testing hwpoison in software level.
+ */
+ debugfs_create_file("corrupt-pfn", 0200, hwpoison_dir, NULL,
+ &hwpoison_fops);
+
+ debugfs_create_file("unpoison-pfn", 0200, hwpoison_dir, NULL,
+ &unpoison_fops);
+
+ debugfs_create_u32("corrupt-filter-enable", 0600, hwpoison_dir,
+ &hwpoison_filter_enable);
+
+ debugfs_create_u32("corrupt-filter-dev-major", 0600, hwpoison_dir,
+ &hwpoison_filter_dev_major);
+
+ debugfs_create_u32("corrupt-filter-dev-minor", 0600, hwpoison_dir,
+ &hwpoison_filter_dev_minor);
+
+ debugfs_create_u64("corrupt-filter-flags-mask", 0600, hwpoison_dir,
+ &hwpoison_filter_flags_mask);
+
+ debugfs_create_u64("corrupt-filter-flags-value", 0600, hwpoison_dir,
+ &hwpoison_filter_flags_value);
+
+#ifdef CONFIG_MEMCG
+ debugfs_create_u64("corrupt-filter-memcg", 0600, hwpoison_dir,
+ &hwpoison_filter_memcg);
+#endif
+
+ return 0;
+}
+
+module_init(pfn_inject_init);
+module_exit(pfn_inject_exit);
+MODULE_LICENSE("GPL");
diff --git a/mm/init-mm.c b/mm/init-mm.c
new file mode 100644
index 000000000..153162669
--- /dev/null
+++ b/mm/init-mm.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/mm_types.h>
+#include <linux/rbtree.h>
+#include <linux/rwsem.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/cpumask.h>
+#include <linux/mman.h>
+#include <linux/pgtable.h>
+
+#include <linux/atomic.h>
+#include <linux/user_namespace.h>
+#include <asm/mmu.h>
+
+#ifndef INIT_MM_CONTEXT
+#define INIT_MM_CONTEXT(name)
+#endif
+
+/*
+ * For dynamically allocated mm_structs, there is a dynamically sized cpumask
+ * at the end of the structure, the size of which depends on the maximum CPU
+ * number the system can see. That way we allocate only as much memory for
+ * mm_cpumask() as needed for the hundreds, or thousands of processes that
+ * a system typically runs.
+ *
+ * Since there is only one init_mm in the entire system, keep it simple
+ * and size this cpu_bitmask to NR_CPUS.
+ */
+struct mm_struct init_mm = {
+ .mm_rb = RB_ROOT,
+ .pgd = swapper_pg_dir,
+ .mm_users = ATOMIC_INIT(2),
+ .mm_count = ATOMIC_INIT(1),
+ .write_protect_seq = SEQCNT_ZERO(init_mm.write_protect_seq),
+ MMAP_LOCK_INITIALIZER(init_mm)
+ .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
+ .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
+ .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
+ .user_ns = &init_user_ns,
+ .cpu_bitmap = CPU_BITS_NONE,
+ INIT_MM_CONTEXT(init_mm)
+};
diff --git a/mm/internal.h b/mm/internal.h
new file mode 100644
index 000000000..840b8a330
--- /dev/null
+++ b/mm/internal.h
@@ -0,0 +1,646 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* internal.h: mm/ internal definitions
+ *
+ * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+#ifndef __MM_INTERNAL_H
+#define __MM_INTERNAL_H
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/tracepoint-defs.h>
+
+/*
+ * The set of flags that only affect watermark checking and reclaim
+ * behaviour. This is used by the MM to obey the caller constraints
+ * about IO, FS and watermark checking while ignoring placement
+ * hints such as HIGHMEM usage.
+ */
+#define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\
+ __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\
+ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\
+ __GFP_ATOMIC)
+
+/* The GFP flags allowed during early boot */
+#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS))
+
+/* Control allocation cpuset and node placement constraints */
+#define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)
+
+/* Do not use these with a slab allocator */
+#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
+
+void page_writeback_init(void);
+
+vm_fault_t do_swap_page(struct vm_fault *vmf);
+
+void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
+ unsigned long floor, unsigned long ceiling);
+
+static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
+{
+ return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP));
+}
+
+void unmap_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end,
+ struct zap_details *details);
+
+void do_page_cache_ra(struct readahead_control *, unsigned long nr_to_read,
+ unsigned long lookahead_size);
+void force_page_cache_ra(struct readahead_control *, struct file_ra_state *,
+ unsigned long nr);
+static inline void force_page_cache_readahead(struct address_space *mapping,
+ struct file *file, pgoff_t index, unsigned long nr_to_read)
+{
+ DEFINE_READAHEAD(ractl, file, mapping, index);
+ force_page_cache_ra(&ractl, &file->f_ra, nr_to_read);
+}
+
+struct page *find_get_entry(struct address_space *mapping, pgoff_t index);
+struct page *find_lock_entry(struct address_space *mapping, pgoff_t index);
+
+/**
+ * page_evictable - test whether a page is evictable
+ * @page: the page to test
+ *
+ * Test whether page is evictable--i.e., should be placed on active/inactive
+ * lists vs unevictable list.
+ *
+ * Reasons page might not be evictable:
+ * (1) page's mapping marked unevictable
+ * (2) page is part of an mlocked VMA
+ *
+ */
+static inline bool page_evictable(struct page *page)
+{
+ bool ret;
+
+ /* Prevent address_space of inode and swap cache from being freed */
+ rcu_read_lock();
+ ret = !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
+ rcu_read_unlock();
+ return ret;
+}
+
+/*
+ * Turn a non-refcounted page (->_refcount == 0) into refcounted with
+ * a count of one.
+ */
+static inline void set_page_refcounted(struct page *page)
+{
+ VM_BUG_ON_PAGE(PageTail(page), page);
+ VM_BUG_ON_PAGE(page_ref_count(page), page);
+ set_page_count(page, 1);
+}
+
+extern unsigned long highest_memmap_pfn;
+
+/*
+ * Maximum number of reclaim retries without progress before the OOM
+ * killer is consider the only way forward.
+ */
+#define MAX_RECLAIM_RETRIES 16
+
+/*
+ * in mm/vmscan.c:
+ */
+extern int isolate_lru_page(struct page *page);
+extern void putback_lru_page(struct page *page);
+
+/*
+ * in mm/rmap.c:
+ */
+extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
+
+/*
+ * in mm/page_alloc.c
+ */
+
+/*
+ * Structure for holding the mostly immutable allocation parameters passed
+ * between functions involved in allocations, including the alloc_pages*
+ * family of functions.
+ *
+ * nodemask, migratetype and highest_zoneidx are initialized only once in
+ * __alloc_pages_nodemask() and then never change.
+ *
+ * zonelist, preferred_zone and highest_zoneidx are set first in
+ * __alloc_pages_nodemask() for the fast path, and might be later changed
+ * in __alloc_pages_slowpath(). All other functions pass the whole structure
+ * by a const pointer.
+ */
+struct alloc_context {
+ struct zonelist *zonelist;
+ nodemask_t *nodemask;
+ struct zoneref *preferred_zoneref;
+ int migratetype;
+
+ /*
+ * highest_zoneidx represents highest usable zone index of
+ * the allocation request. Due to the nature of the zone,
+ * memory on lower zone than the highest_zoneidx will be
+ * protected by lowmem_reserve[highest_zoneidx].
+ *
+ * highest_zoneidx is also used by reclaim/compaction to limit
+ * the target zone since higher zone than this index cannot be
+ * usable for this allocation request.
+ */
+ enum zone_type highest_zoneidx;
+ bool spread_dirty_pages;
+};
+
+/*
+ * Locate the struct page for both the matching buddy in our
+ * pair (buddy1) and the combined O(n+1) page they form (page).
+ *
+ * 1) Any buddy B1 will have an order O twin B2 which satisfies
+ * the following equation:
+ * B2 = B1 ^ (1 << O)
+ * For example, if the starting buddy (buddy2) is #8 its order
+ * 1 buddy is #10:
+ * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
+ *
+ * 2) Any buddy B will have an order O+1 parent P which
+ * satisfies the following equation:
+ * P = B & ~(1 << O)
+ *
+ * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
+ */
+static inline unsigned long
+__find_buddy_pfn(unsigned long page_pfn, unsigned int order)
+{
+ return page_pfn ^ (1 << order);
+}
+
+extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
+ unsigned long end_pfn, struct zone *zone);
+
+static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
+ unsigned long end_pfn, struct zone *zone)
+{
+ if (zone->contiguous)
+ return pfn_to_page(start_pfn);
+
+ return __pageblock_pfn_to_page(start_pfn, end_pfn, zone);
+}
+
+extern int __isolate_free_page(struct page *page, unsigned int order);
+extern void __putback_isolated_page(struct page *page, unsigned int order,
+ int mt);
+extern void memblock_free_pages(struct page *page, unsigned long pfn,
+ unsigned int order);
+extern void __free_pages_core(struct page *page, unsigned int order);
+extern void prep_compound_page(struct page *page, unsigned int order);
+extern void post_alloc_hook(struct page *page, unsigned int order,
+ gfp_t gfp_flags);
+extern int user_min_free_kbytes;
+
+extern void zone_pcp_update(struct zone *zone);
+extern void zone_pcp_reset(struct zone *zone);
+
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+
+/*
+ * in mm/compaction.c
+ */
+/*
+ * compact_control is used to track pages being migrated and the free pages
+ * they are being migrated to during memory compaction. The free_pfn starts
+ * at the end of a zone and migrate_pfn begins at the start. Movable pages
+ * are moved to the end of a zone during a compaction run and the run
+ * completes when free_pfn <= migrate_pfn
+ */
+struct compact_control {
+ struct list_head freepages; /* List of free pages to migrate to */
+ struct list_head migratepages; /* List of pages being migrated */
+ unsigned int nr_freepages; /* Number of isolated free pages */
+ unsigned int nr_migratepages; /* Number of pages to migrate */
+ unsigned long free_pfn; /* isolate_freepages search base */
+ unsigned long migrate_pfn; /* isolate_migratepages search base */
+ unsigned long fast_start_pfn; /* a pfn to start linear scan from */
+ struct zone *zone;
+ unsigned long total_migrate_scanned;
+ unsigned long total_free_scanned;
+ unsigned short fast_search_fail;/* failures to use free list searches */
+ short search_order; /* order to start a fast search at */
+ const gfp_t gfp_mask; /* gfp mask of a direct compactor */
+ int order; /* order a direct compactor needs */
+ int migratetype; /* migratetype of direct compactor */
+ const unsigned int alloc_flags; /* alloc flags of a direct compactor */
+ const int highest_zoneidx; /* zone index of a direct compactor */
+ enum migrate_mode mode; /* Async or sync migration mode */
+ bool ignore_skip_hint; /* Scan blocks even if marked skip */
+ bool no_set_skip_hint; /* Don't mark blocks for skipping */
+ bool ignore_block_suitable; /* Scan blocks considered unsuitable */
+ bool direct_compaction; /* False from kcompactd or /proc/... */
+ bool proactive_compaction; /* kcompactd proactive compaction */
+ bool whole_zone; /* Whole zone should/has been scanned */
+ bool contended; /* Signal lock or sched contention */
+ bool rescan; /* Rescanning the same pageblock */
+ bool alloc_contig; /* alloc_contig_range allocation */
+};
+
+/*
+ * Used in direct compaction when a page should be taken from the freelists
+ * immediately when one is created during the free path.
+ */
+struct capture_control {
+ struct compact_control *cc;
+ struct page *page;
+};
+
+unsigned long
+isolate_freepages_range(struct compact_control *cc,
+ unsigned long start_pfn, unsigned long end_pfn);
+unsigned long
+isolate_migratepages_range(struct compact_control *cc,
+ unsigned long low_pfn, unsigned long end_pfn);
+int find_suitable_fallback(struct free_area *area, unsigned int order,
+ int migratetype, bool only_stealable, bool *can_steal);
+
+#endif
+
+/*
+ * This function returns the order of a free page in the buddy system. In
+ * general, page_zone(page)->lock must be held by the caller to prevent the
+ * page from being allocated in parallel and returning garbage as the order.
+ * If a caller does not hold page_zone(page)->lock, it must guarantee that the
+ * page cannot be allocated or merged in parallel. Alternatively, it must
+ * handle invalid values gracefully, and use buddy_order_unsafe() below.
+ */
+static inline unsigned int buddy_order(struct page *page)
+{
+ /* PageBuddy() must be checked by the caller */
+ return page_private(page);
+}
+
+/*
+ * Like buddy_order(), but for callers who cannot afford to hold the zone lock.
+ * PageBuddy() should be checked first by the caller to minimize race window,
+ * and invalid values must be handled gracefully.
+ *
+ * READ_ONCE is used so that if the caller assigns the result into a local
+ * variable and e.g. tests it for valid range before using, the compiler cannot
+ * decide to remove the variable and inline the page_private(page) multiple
+ * times, potentially observing different values in the tests and the actual
+ * use of the result.
+ */
+#define buddy_order_unsafe(page) READ_ONCE(page_private(page))
+
+static inline bool is_cow_mapping(vm_flags_t flags)
+{
+ return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
+}
+
+/*
+ * These three helpers classifies VMAs for virtual memory accounting.
+ */
+
+/*
+ * Executable code area - executable, not writable, not stack
+ */
+static inline bool is_exec_mapping(vm_flags_t flags)
+{
+ return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC;
+}
+
+/*
+ * Stack area - atomatically grows in one direction
+ *
+ * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous:
+ * do_mmap() forbids all other combinations.
+ */
+static inline bool is_stack_mapping(vm_flags_t flags)
+{
+ return (flags & VM_STACK) == VM_STACK;
+}
+
+/*
+ * Data area - private, writable, not stack
+ */
+static inline bool is_data_mapping(vm_flags_t flags)
+{
+ return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE;
+}
+
+/* mm/util.c */
+void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct vm_area_struct *prev);
+void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma);
+
+#ifdef CONFIG_MMU
+extern long populate_vma_page_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, int *nonblocking);
+extern void munlock_vma_pages_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end);
+static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
+{
+ munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
+}
+
+/*
+ * must be called with vma's mmap_lock held for read or write, and page locked.
+ */
+extern void mlock_vma_page(struct page *page);
+extern unsigned int munlock_vma_page(struct page *page);
+
+/*
+ * Clear the page's PageMlocked(). This can be useful in a situation where
+ * we want to unconditionally remove a page from the pagecache -- e.g.,
+ * on truncation or freeing.
+ *
+ * It is legal to call this function for any page, mlocked or not.
+ * If called for a page that is still mapped by mlocked vmas, all we do
+ * is revert to lazy LRU behaviour -- semantics are not broken.
+ */
+extern void clear_page_mlock(struct page *page);
+
+/*
+ * mlock_migrate_page - called only from migrate_misplaced_transhuge_page()
+ * (because that does not go through the full procedure of migration ptes):
+ * to migrate the Mlocked page flag; update statistics.
+ */
+static inline void mlock_migrate_page(struct page *newpage, struct page *page)
+{
+ if (TestClearPageMlocked(page)) {
+ int nr_pages = thp_nr_pages(page);
+
+ /* Holding pmd lock, no change in irq context: __mod is safe */
+ __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
+ SetPageMlocked(newpage);
+ __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages);
+ }
+}
+
+extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
+
+/*
+ * At what user virtual address is page expected in vma?
+ * Returns -EFAULT if all of the page is outside the range of vma.
+ * If page is a compound head, the entire compound page is considered.
+ */
+static inline unsigned long
+vma_address(struct page *page, struct vm_area_struct *vma)
+{
+ pgoff_t pgoff;
+ unsigned long address;
+
+ VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */
+ pgoff = page_to_pgoff(page);
+ if (pgoff >= vma->vm_pgoff) {
+ address = vma->vm_start +
+ ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ /* Check for address beyond vma (or wrapped through 0?) */
+ if (address < vma->vm_start || address >= vma->vm_end)
+ address = -EFAULT;
+ } else if (PageHead(page) &&
+ pgoff + compound_nr(page) - 1 >= vma->vm_pgoff) {
+ /* Test above avoids possibility of wrap to 0 on 32-bit */
+ address = vma->vm_start;
+ } else {
+ address = -EFAULT;
+ }
+ return address;
+}
+
+/*
+ * Then at what user virtual address will none of the page be found in vma?
+ * Assumes that vma_address() already returned a good starting address.
+ * If page is a compound head, the entire compound page is considered.
+ */
+static inline unsigned long
+vma_address_end(struct page *page, struct vm_area_struct *vma)
+{
+ pgoff_t pgoff;
+ unsigned long address;
+
+ VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */
+ pgoff = page_to_pgoff(page) + compound_nr(page);
+ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ /* Check for address beyond vma (or wrapped through 0?) */
+ if (address < vma->vm_start || address > vma->vm_end)
+ address = vma->vm_end;
+ return address;
+}
+
+static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
+ struct file *fpin)
+{
+ int flags = vmf->flags;
+
+ if (fpin)
+ return fpin;
+
+ /*
+ * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
+ * anything, so we only pin the file and drop the mmap_lock if only
+ * FAULT_FLAG_ALLOW_RETRY is set, while this is the first attempt.
+ */
+ if (fault_flag_allow_retry_first(flags) &&
+ !(flags & FAULT_FLAG_RETRY_NOWAIT)) {
+ fpin = get_file(vmf->vma->vm_file);
+ mmap_read_unlock(vmf->vma->vm_mm);
+ }
+ return fpin;
+}
+
+#else /* !CONFIG_MMU */
+static inline void clear_page_mlock(struct page *page) { }
+static inline void mlock_vma_page(struct page *page) { }
+static inline void mlock_migrate_page(struct page *new, struct page *old) { }
+
+#endif /* !CONFIG_MMU */
+
+/*
+ * Return the mem_map entry representing the 'offset' subpage within
+ * the maximally aligned gigantic page 'base'. Handle any discontiguity
+ * in the mem_map at MAX_ORDER_NR_PAGES boundaries.
+ */
+static inline struct page *mem_map_offset(struct page *base, int offset)
+{
+ if (unlikely(offset >= MAX_ORDER_NR_PAGES))
+ return nth_page(base, offset);
+ return base + offset;
+}
+
+/*
+ * Iterator over all subpages within the maximally aligned gigantic
+ * page 'base'. Handle any discontiguity in the mem_map.
+ */
+static inline struct page *mem_map_next(struct page *iter,
+ struct page *base, int offset)
+{
+ if (unlikely((offset & (MAX_ORDER_NR_PAGES - 1)) == 0)) {
+ unsigned long pfn = page_to_pfn(base) + offset;
+ if (!pfn_valid(pfn))
+ return NULL;
+ return pfn_to_page(pfn);
+ }
+ return iter + 1;
+}
+
+/* Memory initialisation debug and verification */
+enum mminit_level {
+ MMINIT_WARNING,
+ MMINIT_VERIFY,
+ MMINIT_TRACE
+};
+
+#ifdef CONFIG_DEBUG_MEMORY_INIT
+
+extern int mminit_loglevel;
+
+#define mminit_dprintk(level, prefix, fmt, arg...) \
+do { \
+ if (level < mminit_loglevel) { \
+ if (level <= MMINIT_WARNING) \
+ pr_warn("mminit::" prefix " " fmt, ##arg); \
+ else \
+ printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \
+ } \
+} while (0)
+
+extern void mminit_verify_pageflags_layout(void);
+extern void mminit_verify_zonelist(void);
+#else
+
+static inline void mminit_dprintk(enum mminit_level level,
+ const char *prefix, const char *fmt, ...)
+{
+}
+
+static inline void mminit_verify_pageflags_layout(void)
+{
+}
+
+static inline void mminit_verify_zonelist(void)
+{
+}
+#endif /* CONFIG_DEBUG_MEMORY_INIT */
+
+/* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */
+#if defined(CONFIG_SPARSEMEM)
+extern void mminit_validate_memmodel_limits(unsigned long *start_pfn,
+ unsigned long *end_pfn);
+#else
+static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
+ unsigned long *end_pfn)
+{
+}
+#endif /* CONFIG_SPARSEMEM */
+
+#define NODE_RECLAIM_NOSCAN -2
+#define NODE_RECLAIM_FULL -1
+#define NODE_RECLAIM_SOME 0
+#define NODE_RECLAIM_SUCCESS 1
+
+#ifdef CONFIG_NUMA
+extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
+#else
+static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
+ unsigned int order)
+{
+ return NODE_RECLAIM_NOSCAN;
+}
+#endif
+
+extern int hwpoison_filter(struct page *p);
+
+extern u32 hwpoison_filter_dev_major;
+extern u32 hwpoison_filter_dev_minor;
+extern u64 hwpoison_filter_flags_mask;
+extern u64 hwpoison_filter_flags_value;
+extern u64 hwpoison_filter_memcg;
+extern u32 hwpoison_filter_enable;
+
+extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long,
+ unsigned long, unsigned long,
+ unsigned long, unsigned long);
+
+extern void set_pageblock_order(void);
+unsigned int reclaim_clean_pages_from_list(struct zone *zone,
+ struct list_head *page_list);
+/* The ALLOC_WMARK bits are used as an index to zone->watermark */
+#define ALLOC_WMARK_MIN WMARK_MIN
+#define ALLOC_WMARK_LOW WMARK_LOW
+#define ALLOC_WMARK_HIGH WMARK_HIGH
+#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
+
+/* Mask to get the watermark bits */
+#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
+
+/*
+ * Only MMU archs have async oom victim reclaim - aka oom_reaper so we
+ * cannot assume a reduced access to memory reserves is sufficient for
+ * !MMU
+ */
+#ifdef CONFIG_MMU
+#define ALLOC_OOM 0x08
+#else
+#define ALLOC_OOM ALLOC_NO_WATERMARKS
+#endif
+
+#define ALLOC_HARDER 0x10 /* try to alloc harder */
+#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
+#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
+#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
+#ifdef CONFIG_ZONE_DMA32
+#define ALLOC_NOFRAGMENT 0x100 /* avoid mixing pageblock types */
+#else
+#define ALLOC_NOFRAGMENT 0x0
+#endif
+#define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */
+
+enum ttu_flags;
+struct tlbflush_unmap_batch;
+
+
+/*
+ * only for MM internal work items which do not depend on
+ * any allocations or locks which might depend on allocations
+ */
+extern struct workqueue_struct *mm_percpu_wq;
+
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+void try_to_unmap_flush(void);
+void try_to_unmap_flush_dirty(void);
+void flush_tlb_batched_pending(struct mm_struct *mm);
+#else
+static inline void try_to_unmap_flush(void)
+{
+}
+static inline void try_to_unmap_flush_dirty(void)
+{
+}
+static inline void flush_tlb_batched_pending(struct mm_struct *mm)
+{
+}
+#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
+
+extern const struct trace_print_flags pageflag_names[];
+extern const struct trace_print_flags vmaflag_names[];
+extern const struct trace_print_flags gfpflag_names[];
+
+static inline bool is_migrate_highatomic(enum migratetype migratetype)
+{
+ return migratetype == MIGRATE_HIGHATOMIC;
+}
+
+static inline bool is_migrate_highatomic_page(struct page *page)
+{
+ return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC;
+}
+
+void setup_zone_pageset(struct zone *zone);
+
+struct migration_target_control {
+ int nid; /* preferred node id */
+ nodemask_t *nmask;
+ gfp_t gfp_mask;
+};
+
+#endif /* __MM_INTERNAL_H */
diff --git a/mm/interval_tree.c b/mm/interval_tree.c
new file mode 100644
index 000000000..11c75fb07
--- /dev/null
+++ b/mm/interval_tree.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * mm/interval_tree.c - interval tree for mapping->i_mmap
+ *
+ * Copyright (C) 2012, Michel Lespinasse <walken@google.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/rmap.h>
+#include <linux/interval_tree_generic.h>
+
+static inline unsigned long vma_start_pgoff(struct vm_area_struct *v)
+{
+ return v->vm_pgoff;
+}
+
+static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
+{
+ return v->vm_pgoff + vma_pages(v) - 1;
+}
+
+INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb,
+ unsigned long, shared.rb_subtree_last,
+ vma_start_pgoff, vma_last_pgoff,, vma_interval_tree)
+
+/* Insert node immediately after prev in the interval tree */
+void vma_interval_tree_insert_after(struct vm_area_struct *node,
+ struct vm_area_struct *prev,
+ struct rb_root_cached *root)
+{
+ struct rb_node **link;
+ struct vm_area_struct *parent;
+ unsigned long last = vma_last_pgoff(node);
+
+ VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node);
+
+ if (!prev->shared.rb.rb_right) {
+ parent = prev;
+ link = &prev->shared.rb.rb_right;
+ } else {
+ parent = rb_entry(prev->shared.rb.rb_right,
+ struct vm_area_struct, shared.rb);
+ if (parent->shared.rb_subtree_last < last)
+ parent->shared.rb_subtree_last = last;
+ while (parent->shared.rb.rb_left) {
+ parent = rb_entry(parent->shared.rb.rb_left,
+ struct vm_area_struct, shared.rb);
+ if (parent->shared.rb_subtree_last < last)
+ parent->shared.rb_subtree_last = last;
+ }
+ link = &parent->shared.rb.rb_left;
+ }
+
+ node->shared.rb_subtree_last = last;
+ rb_link_node(&node->shared.rb, &parent->shared.rb, link);
+ rb_insert_augmented(&node->shared.rb, &root->rb_root,
+ &vma_interval_tree_augment);
+}
+
+static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc)
+{
+ return vma_start_pgoff(avc->vma);
+}
+
+static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc)
+{
+ return vma_last_pgoff(avc->vma);
+}
+
+INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last,
+ avc_start_pgoff, avc_last_pgoff,
+ static inline, __anon_vma_interval_tree)
+
+void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
+ struct rb_root_cached *root)
+{
+#ifdef CONFIG_DEBUG_VM_RB
+ node->cached_vma_start = avc_start_pgoff(node);
+ node->cached_vma_last = avc_last_pgoff(node);
+#endif
+ __anon_vma_interval_tree_insert(node, root);
+}
+
+void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
+ struct rb_root_cached *root)
+{
+ __anon_vma_interval_tree_remove(node, root);
+}
+
+struct anon_vma_chain *
+anon_vma_interval_tree_iter_first(struct rb_root_cached *root,
+ unsigned long first, unsigned long last)
+{
+ return __anon_vma_interval_tree_iter_first(root, first, last);
+}
+
+struct anon_vma_chain *
+anon_vma_interval_tree_iter_next(struct anon_vma_chain *node,
+ unsigned long first, unsigned long last)
+{
+ return __anon_vma_interval_tree_iter_next(node, first, last);
+}
+
+#ifdef CONFIG_DEBUG_VM_RB
+void anon_vma_interval_tree_verify(struct anon_vma_chain *node)
+{
+ WARN_ON_ONCE(node->cached_vma_start != avc_start_pgoff(node));
+ WARN_ON_ONCE(node->cached_vma_last != avc_last_pgoff(node));
+}
+#endif
diff --git a/mm/ioremap.c b/mm/ioremap.c
new file mode 100644
index 000000000..5fa1ab41d
--- /dev/null
+++ b/mm/ioremap.c
@@ -0,0 +1,289 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Re-map IO memory to kernel address space so that we can access it.
+ * This is needed for high PCI addresses that aren't mapped in the
+ * 640k-1MB IO memory area on PC's
+ *
+ * (C) Copyright 1995 1996 Linus Torvalds
+ */
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/io.h>
+#include <linux/export.h>
+#include <asm/cacheflush.h>
+
+#include "pgalloc-track.h"
+
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+static int __read_mostly ioremap_p4d_capable;
+static int __read_mostly ioremap_pud_capable;
+static int __read_mostly ioremap_pmd_capable;
+static int __read_mostly ioremap_huge_disabled;
+
+static int __init set_nohugeiomap(char *str)
+{
+ ioremap_huge_disabled = 1;
+ return 0;
+}
+early_param("nohugeiomap", set_nohugeiomap);
+
+void __init ioremap_huge_init(void)
+{
+ if (!ioremap_huge_disabled) {
+ if (arch_ioremap_p4d_supported())
+ ioremap_p4d_capable = 1;
+ if (arch_ioremap_pud_supported())
+ ioremap_pud_capable = 1;
+ if (arch_ioremap_pmd_supported())
+ ioremap_pmd_capable = 1;
+ }
+}
+
+static inline int ioremap_p4d_enabled(void)
+{
+ return ioremap_p4d_capable;
+}
+
+static inline int ioremap_pud_enabled(void)
+{
+ return ioremap_pud_capable;
+}
+
+static inline int ioremap_pmd_enabled(void)
+{
+ return ioremap_pmd_capable;
+}
+
+#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
+static inline int ioremap_p4d_enabled(void) { return 0; }
+static inline int ioremap_pud_enabled(void) { return 0; }
+static inline int ioremap_pmd_enabled(void) { return 0; }
+#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
+
+static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
+ pgtbl_mod_mask *mask)
+{
+ pte_t *pte;
+ u64 pfn;
+
+ pfn = phys_addr >> PAGE_SHIFT;
+ pte = pte_alloc_kernel_track(pmd, addr, mask);
+ if (!pte)
+ return -ENOMEM;
+ do {
+ BUG_ON(!pte_none(*pte));
+ set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
+ pfn++;
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+ *mask |= PGTBL_PTE_MODIFIED;
+ return 0;
+}
+
+static int ioremap_try_huge_pmd(pmd_t *pmd, unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr,
+ pgprot_t prot)
+{
+ if (!ioremap_pmd_enabled())
+ return 0;
+
+ if ((end - addr) != PMD_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, PMD_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(phys_addr, PMD_SIZE))
+ return 0;
+
+ if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
+ return 0;
+
+ return pmd_set_huge(pmd, phys_addr, prot);
+}
+
+static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
+ pgtbl_mod_mask *mask)
+{
+ pmd_t *pmd;
+ unsigned long next;
+
+ pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
+ if (!pmd)
+ return -ENOMEM;
+ do {
+ next = pmd_addr_end(addr, end);
+
+ if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr, prot)) {
+ *mask |= PGTBL_PMD_MODIFIED;
+ continue;
+ }
+
+ if (ioremap_pte_range(pmd, addr, next, phys_addr, prot, mask))
+ return -ENOMEM;
+ } while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
+ return 0;
+}
+
+static int ioremap_try_huge_pud(pud_t *pud, unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr,
+ pgprot_t prot)
+{
+ if (!ioremap_pud_enabled())
+ return 0;
+
+ if ((end - addr) != PUD_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, PUD_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(phys_addr, PUD_SIZE))
+ return 0;
+
+ if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
+ return 0;
+
+ return pud_set_huge(pud, phys_addr, prot);
+}
+
+static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
+ pgtbl_mod_mask *mask)
+{
+ pud_t *pud;
+ unsigned long next;
+
+ pud = pud_alloc_track(&init_mm, p4d, addr, mask);
+ if (!pud)
+ return -ENOMEM;
+ do {
+ next = pud_addr_end(addr, end);
+
+ if (ioremap_try_huge_pud(pud, addr, next, phys_addr, prot)) {
+ *mask |= PGTBL_PUD_MODIFIED;
+ continue;
+ }
+
+ if (ioremap_pmd_range(pud, addr, next, phys_addr, prot, mask))
+ return -ENOMEM;
+ } while (pud++, phys_addr += (next - addr), addr = next, addr != end);
+ return 0;
+}
+
+static int ioremap_try_huge_p4d(p4d_t *p4d, unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr,
+ pgprot_t prot)
+{
+ if (!ioremap_p4d_enabled())
+ return 0;
+
+ if ((end - addr) != P4D_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, P4D_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(phys_addr, P4D_SIZE))
+ return 0;
+
+ if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
+ return 0;
+
+ return p4d_set_huge(p4d, phys_addr, prot);
+}
+
+static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
+ pgtbl_mod_mask *mask)
+{
+ p4d_t *p4d;
+ unsigned long next;
+
+ p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
+ if (!p4d)
+ return -ENOMEM;
+ do {
+ next = p4d_addr_end(addr, end);
+
+ if (ioremap_try_huge_p4d(p4d, addr, next, phys_addr, prot)) {
+ *mask |= PGTBL_P4D_MODIFIED;
+ continue;
+ }
+
+ if (ioremap_pud_range(p4d, addr, next, phys_addr, prot, mask))
+ return -ENOMEM;
+ } while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
+ return 0;
+}
+
+int ioremap_page_range(unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
+{
+ pgd_t *pgd;
+ unsigned long start;
+ unsigned long next;
+ int err;
+ pgtbl_mod_mask mask = 0;
+
+ might_sleep();
+ BUG_ON(addr >= end);
+
+ start = addr;
+ pgd = pgd_offset_k(addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ err = ioremap_p4d_range(pgd, addr, next, phys_addr, prot,
+ &mask);
+ if (err)
+ break;
+ } while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
+
+ flush_cache_vmap(start, end);
+
+ if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
+ arch_sync_kernel_mappings(start, end);
+
+ return err;
+}
+
+#ifdef CONFIG_GENERIC_IOREMAP
+void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long prot)
+{
+ unsigned long offset, vaddr;
+ phys_addr_t last_addr;
+ struct vm_struct *area;
+
+ /* Disallow wrap-around or zero size */
+ last_addr = addr + size - 1;
+ if (!size || last_addr < addr)
+ return NULL;
+
+ /* Page-align mappings */
+ offset = addr & (~PAGE_MASK);
+ addr -= offset;
+ size = PAGE_ALIGN(size + offset);
+
+ area = get_vm_area_caller(size, VM_IOREMAP,
+ __builtin_return_address(0));
+ if (!area)
+ return NULL;
+ vaddr = (unsigned long)area->addr;
+
+ if (ioremap_page_range(vaddr, vaddr + size, addr, __pgprot(prot))) {
+ free_vm_area(area);
+ return NULL;
+ }
+
+ return (void __iomem *)(vaddr + offset);
+}
+EXPORT_SYMBOL(ioremap_prot);
+
+void iounmap(volatile void __iomem *addr)
+{
+ vunmap((void *)((unsigned long)addr & PAGE_MASK));
+}
+EXPORT_SYMBOL(iounmap);
+#endif /* CONFIG_GENERIC_IOREMAP */
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
new file mode 100644
index 000000000..370d970e5
--- /dev/null
+++ b/mm/kasan/Makefile
@@ -0,0 +1,34 @@
+# SPDX-License-Identifier: GPL-2.0
+KASAN_SANITIZE := n
+UBSAN_SANITIZE := n
+KCOV_INSTRUMENT := n
+
+# Disable ftrace to avoid recursion.
+CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_generic.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_generic_report.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_init.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_quarantine.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_tags.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_tags_report.o = $(CC_FLAGS_FTRACE)
+
+# Function splitter causes unnecessary splits in __asan_load1/__asan_store1
+# see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533
+CC_FLAGS_KASAN_RUNTIME := $(call cc-option, -fno-conserve-stack)
+CC_FLAGS_KASAN_RUNTIME += -fno-stack-protector
+# Disable branch tracing to avoid recursion.
+CC_FLAGS_KASAN_RUNTIME += -DDISABLE_BRANCH_PROFILING
+
+CFLAGS_common.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_generic.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_generic_report.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_init.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_quarantine.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_report.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_tags_report.o := $(CC_FLAGS_KASAN_RUNTIME)
+
+obj-$(CONFIG_KASAN) := common.o init.o report.o
+obj-$(CONFIG_KASAN_GENERIC) += generic.o generic_report.o quarantine.o
+obj-$(CONFIG_KASAN_SW_TAGS) += tags.o tags_report.o
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
new file mode 100644
index 000000000..950fd372a
--- /dev/null
+++ b/mm/kasan/common.c
@@ -0,0 +1,931 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file contains common generic and tag-based KASAN code.
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
+ *
+ * Some code borrowed from https://github.com/xairy/kasan-prototype by
+ * Andrey Konovalov <andreyknvl@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/kasan.h>
+#include <linux/kernel.h>
+#include <linux/kmemleak.h>
+#include <linux/linkage.h>
+#include <linux/memblock.h>
+#include <linux/memory.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/slab.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+#include <linux/bug.h>
+
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+
+#include "kasan.h"
+#include "../slab.h"
+
+depot_stack_handle_t kasan_save_stack(gfp_t flags)
+{
+ unsigned long entries[KASAN_STACK_DEPTH];
+ unsigned int nr_entries;
+
+ nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
+ nr_entries = filter_irq_stacks(entries, nr_entries);
+ return stack_depot_save(entries, nr_entries, flags);
+}
+
+void kasan_set_track(struct kasan_track *track, gfp_t flags)
+{
+ track->pid = current->pid;
+ track->stack = kasan_save_stack(flags);
+}
+
+void kasan_enable_current(void)
+{
+ current->kasan_depth++;
+}
+
+void kasan_disable_current(void)
+{
+ current->kasan_depth--;
+}
+
+bool __kasan_check_read(const volatile void *p, unsigned int size)
+{
+ return check_memory_region((unsigned long)p, size, false, _RET_IP_);
+}
+EXPORT_SYMBOL(__kasan_check_read);
+
+bool __kasan_check_write(const volatile void *p, unsigned int size)
+{
+ return check_memory_region((unsigned long)p, size, true, _RET_IP_);
+}
+EXPORT_SYMBOL(__kasan_check_write);
+
+#undef memset
+void *memset(void *addr, int c, size_t len)
+{
+ if (!check_memory_region((unsigned long)addr, len, true, _RET_IP_))
+ return NULL;
+
+ return __memset(addr, c, len);
+}
+
+#ifdef __HAVE_ARCH_MEMMOVE
+#undef memmove
+void *memmove(void *dest, const void *src, size_t len)
+{
+ if (!check_memory_region((unsigned long)src, len, false, _RET_IP_) ||
+ !check_memory_region((unsigned long)dest, len, true, _RET_IP_))
+ return NULL;
+
+ return __memmove(dest, src, len);
+}
+#endif
+
+#undef memcpy
+void *memcpy(void *dest, const void *src, size_t len)
+{
+ if (!check_memory_region((unsigned long)src, len, false, _RET_IP_) ||
+ !check_memory_region((unsigned long)dest, len, true, _RET_IP_))
+ return NULL;
+
+ return __memcpy(dest, src, len);
+}
+
+/*
+ * Poisons the shadow memory for 'size' bytes starting from 'addr'.
+ * Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE.
+ */
+void kasan_poison_shadow(const void *address, size_t size, u8 value)
+{
+ void *shadow_start, *shadow_end;
+
+ /*
+ * Perform shadow offset calculation based on untagged address, as
+ * some of the callers (e.g. kasan_poison_object_data) pass tagged
+ * addresses to this function.
+ */
+ address = reset_tag(address);
+
+ shadow_start = kasan_mem_to_shadow(address);
+ shadow_end = kasan_mem_to_shadow(address + size);
+
+ __memset(shadow_start, value, shadow_end - shadow_start);
+}
+
+void kasan_unpoison_shadow(const void *address, size_t size)
+{
+ u8 tag = get_tag(address);
+
+ /*
+ * Perform shadow offset calculation based on untagged address, as
+ * some of the callers (e.g. kasan_unpoison_object_data) pass tagged
+ * addresses to this function.
+ */
+ address = reset_tag(address);
+
+ kasan_poison_shadow(address, size, tag);
+
+ if (size & KASAN_SHADOW_MASK) {
+ u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size);
+
+ if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
+ *shadow = tag;
+ else
+ *shadow = size & KASAN_SHADOW_MASK;
+ }
+}
+
+static void __kasan_unpoison_stack(struct task_struct *task, const void *sp)
+{
+ void *base = task_stack_page(task);
+ size_t size = sp - base;
+
+ kasan_unpoison_shadow(base, size);
+}
+
+/* Unpoison the entire stack for a task. */
+void kasan_unpoison_task_stack(struct task_struct *task)
+{
+ __kasan_unpoison_stack(task, task_stack_page(task) + THREAD_SIZE);
+}
+
+/* Unpoison the stack for the current task beyond a watermark sp value. */
+asmlinkage void kasan_unpoison_task_stack_below(const void *watermark)
+{
+ /*
+ * Calculate the task stack base address. Avoid using 'current'
+ * because this function is called by early resume code which hasn't
+ * yet set up the percpu register (%gs).
+ */
+ void *base = (void *)((unsigned long)watermark & ~(THREAD_SIZE - 1));
+
+ kasan_unpoison_shadow(base, watermark - base);
+}
+
+void kasan_alloc_pages(struct page *page, unsigned int order)
+{
+ u8 tag;
+ unsigned long i;
+
+ if (unlikely(PageHighMem(page)))
+ return;
+
+ tag = random_tag();
+ for (i = 0; i < (1 << order); i++)
+ page_kasan_tag_set(page + i, tag);
+ kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order);
+}
+
+void kasan_free_pages(struct page *page, unsigned int order)
+{
+ if (likely(!PageHighMem(page)))
+ kasan_poison_shadow(page_address(page),
+ PAGE_SIZE << order,
+ KASAN_FREE_PAGE);
+}
+
+/*
+ * Adaptive redzone policy taken from the userspace AddressSanitizer runtime.
+ * For larger allocations larger redzones are used.
+ */
+static inline unsigned int optimal_redzone(unsigned int object_size)
+{
+ if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
+ return 0;
+
+ return
+ object_size <= 64 - 16 ? 16 :
+ object_size <= 128 - 32 ? 32 :
+ object_size <= 512 - 64 ? 64 :
+ object_size <= 4096 - 128 ? 128 :
+ object_size <= (1 << 14) - 256 ? 256 :
+ object_size <= (1 << 15) - 512 ? 512 :
+ object_size <= (1 << 16) - 1024 ? 1024 : 2048;
+}
+
+void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
+ slab_flags_t *flags)
+{
+ unsigned int orig_size = *size;
+ unsigned int redzone_size;
+ int redzone_adjust;
+
+ /* Add alloc meta. */
+ cache->kasan_info.alloc_meta_offset = *size;
+ *size += sizeof(struct kasan_alloc_meta);
+
+ /* Add free meta. */
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC) &&
+ (cache->flags & SLAB_TYPESAFE_BY_RCU || cache->ctor ||
+ cache->object_size < sizeof(struct kasan_free_meta))) {
+ cache->kasan_info.free_meta_offset = *size;
+ *size += sizeof(struct kasan_free_meta);
+ }
+
+ redzone_size = optimal_redzone(cache->object_size);
+ redzone_adjust = redzone_size - (*size - cache->object_size);
+ if (redzone_adjust > 0)
+ *size += redzone_adjust;
+
+ *size = min_t(unsigned int, KMALLOC_MAX_SIZE,
+ max(*size, cache->object_size + redzone_size));
+
+ /*
+ * If the metadata doesn't fit, don't enable KASAN at all.
+ */
+ if (*size <= cache->kasan_info.alloc_meta_offset ||
+ *size <= cache->kasan_info.free_meta_offset) {
+ cache->kasan_info.alloc_meta_offset = 0;
+ cache->kasan_info.free_meta_offset = 0;
+ *size = orig_size;
+ return;
+ }
+
+ *flags |= SLAB_KASAN;
+}
+
+size_t kasan_metadata_size(struct kmem_cache *cache)
+{
+ return (cache->kasan_info.alloc_meta_offset ?
+ sizeof(struct kasan_alloc_meta) : 0) +
+ (cache->kasan_info.free_meta_offset ?
+ sizeof(struct kasan_free_meta) : 0);
+}
+
+struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
+ const void *object)
+{
+ return (void *)object + cache->kasan_info.alloc_meta_offset;
+}
+
+struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
+ const void *object)
+{
+ BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32);
+ return (void *)object + cache->kasan_info.free_meta_offset;
+}
+
+void kasan_poison_slab(struct page *page)
+{
+ unsigned long i;
+
+ for (i = 0; i < compound_nr(page); i++)
+ page_kasan_tag_reset(page + i);
+ kasan_poison_shadow(page_address(page), page_size(page),
+ KASAN_KMALLOC_REDZONE);
+}
+
+void kasan_unpoison_object_data(struct kmem_cache *cache, void *object)
+{
+ kasan_unpoison_shadow(object, cache->object_size);
+}
+
+void kasan_poison_object_data(struct kmem_cache *cache, void *object)
+{
+ kasan_poison_shadow(object,
+ round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE),
+ KASAN_KMALLOC_REDZONE);
+}
+
+/*
+ * This function assigns a tag to an object considering the following:
+ * 1. A cache might have a constructor, which might save a pointer to a slab
+ * object somewhere (e.g. in the object itself). We preassign a tag for
+ * each object in caches with constructors during slab creation and reuse
+ * the same tag each time a particular object is allocated.
+ * 2. A cache might be SLAB_TYPESAFE_BY_RCU, which means objects can be
+ * accessed after being freed. We preassign tags for objects in these
+ * caches as well.
+ * 3. For SLAB allocator we can't preassign tags randomly since the freelist
+ * is stored as an array of indexes instead of a linked list. Assign tags
+ * based on objects indexes, so that objects that are next to each other
+ * get different tags.
+ */
+static u8 assign_tag(struct kmem_cache *cache, const void *object,
+ bool init, bool keep_tag)
+{
+ /*
+ * 1. When an object is kmalloc()'ed, two hooks are called:
+ * kasan_slab_alloc() and kasan_kmalloc(). We assign the
+ * tag only in the first one.
+ * 2. We reuse the same tag for krealloc'ed objects.
+ */
+ if (keep_tag)
+ return get_tag(object);
+
+ /*
+ * If the cache neither has a constructor nor has SLAB_TYPESAFE_BY_RCU
+ * set, assign a tag when the object is being allocated (init == false).
+ */
+ if (!cache->ctor && !(cache->flags & SLAB_TYPESAFE_BY_RCU))
+ return init ? KASAN_TAG_KERNEL : random_tag();
+
+ /* For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU: */
+#ifdef CONFIG_SLAB
+ /* For SLAB assign tags based on the object index in the freelist. */
+ return (u8)obj_to_index(cache, virt_to_page(object), (void *)object);
+#else
+ /*
+ * For SLUB assign a random tag during slab creation, otherwise reuse
+ * the already assigned tag.
+ */
+ return init ? random_tag() : get_tag(object);
+#endif
+}
+
+void * __must_check kasan_init_slab_obj(struct kmem_cache *cache,
+ const void *object)
+{
+ struct kasan_alloc_meta *alloc_info;
+
+ if (!(cache->flags & SLAB_KASAN))
+ return (void *)object;
+
+ alloc_info = get_alloc_info(cache, object);
+ __memset(alloc_info, 0, sizeof(*alloc_info));
+
+ if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
+ object = set_tag(object,
+ assign_tag(cache, object, true, false));
+
+ return (void *)object;
+}
+
+static inline bool shadow_invalid(u8 tag, s8 shadow_byte)
+{
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+ return shadow_byte < 0 ||
+ shadow_byte >= KASAN_SHADOW_SCALE_SIZE;
+
+ /* else CONFIG_KASAN_SW_TAGS: */
+ if ((u8)shadow_byte == KASAN_TAG_INVALID)
+ return true;
+ if ((tag != KASAN_TAG_KERNEL) && (tag != (u8)shadow_byte))
+ return true;
+
+ return false;
+}
+
+static bool __kasan_slab_free(struct kmem_cache *cache, void *object,
+ unsigned long ip, bool quarantine)
+{
+ s8 shadow_byte;
+ u8 tag;
+ void *tagged_object;
+ unsigned long rounded_up_size;
+
+ tag = get_tag(object);
+ tagged_object = object;
+ object = reset_tag(object);
+
+ if (unlikely(nearest_obj(cache, virt_to_head_page(object), object) !=
+ object)) {
+ kasan_report_invalid_free(tagged_object, ip);
+ return true;
+ }
+
+ /* RCU slabs could be legally used after free within the RCU period */
+ if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU))
+ return false;
+
+ shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object));
+ if (shadow_invalid(tag, shadow_byte)) {
+ kasan_report_invalid_free(tagged_object, ip);
+ return true;
+ }
+
+ rounded_up_size = round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE);
+ kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE);
+
+ if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine) ||
+ unlikely(!(cache->flags & SLAB_KASAN)))
+ return false;
+
+ kasan_set_free_info(cache, object, tag);
+
+ quarantine_put(get_free_info(cache, object), cache);
+
+ return IS_ENABLED(CONFIG_KASAN_GENERIC);
+}
+
+bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip)
+{
+ return __kasan_slab_free(cache, object, ip, true);
+}
+
+static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object,
+ size_t size, gfp_t flags, bool keep_tag)
+{
+ unsigned long redzone_start;
+ unsigned long redzone_end;
+ u8 tag = 0xff;
+
+ if (gfpflags_allow_blocking(flags))
+ quarantine_reduce();
+
+ if (unlikely(object == NULL))
+ return NULL;
+
+ redzone_start = round_up((unsigned long)(object + size),
+ KASAN_SHADOW_SCALE_SIZE);
+ redzone_end = round_up((unsigned long)object + cache->object_size,
+ KASAN_SHADOW_SCALE_SIZE);
+
+ if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
+ tag = assign_tag(cache, object, false, keep_tag);
+
+ /* Tag is ignored in set_tag without CONFIG_KASAN_SW_TAGS */
+ kasan_unpoison_shadow(set_tag(object, tag), size);
+ kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
+ KASAN_KMALLOC_REDZONE);
+
+ if (cache->flags & SLAB_KASAN)
+ kasan_set_track(&get_alloc_info(cache, object)->alloc_track, flags);
+
+ return set_tag(object, tag);
+}
+
+void * __must_check kasan_slab_alloc(struct kmem_cache *cache, void *object,
+ gfp_t flags)
+{
+ return __kasan_kmalloc(cache, object, cache->object_size, flags, false);
+}
+
+void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
+ size_t size, gfp_t flags)
+{
+ return __kasan_kmalloc(cache, object, size, flags, true);
+}
+EXPORT_SYMBOL(kasan_kmalloc);
+
+void * __must_check kasan_kmalloc_large(const void *ptr, size_t size,
+ gfp_t flags)
+{
+ struct page *page;
+ unsigned long redzone_start;
+ unsigned long redzone_end;
+
+ if (gfpflags_allow_blocking(flags))
+ quarantine_reduce();
+
+ if (unlikely(ptr == NULL))
+ return NULL;
+
+ page = virt_to_page(ptr);
+ redzone_start = round_up((unsigned long)(ptr + size),
+ KASAN_SHADOW_SCALE_SIZE);
+ redzone_end = (unsigned long)ptr + page_size(page);
+
+ kasan_unpoison_shadow(ptr, size);
+ kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
+ KASAN_PAGE_REDZONE);
+
+ return (void *)ptr;
+}
+
+void * __must_check kasan_krealloc(const void *object, size_t size, gfp_t flags)
+{
+ struct page *page;
+
+ if (unlikely(object == ZERO_SIZE_PTR))
+ return (void *)object;
+
+ page = virt_to_head_page(object);
+
+ if (unlikely(!PageSlab(page)))
+ return kasan_kmalloc_large(object, size, flags);
+ else
+ return __kasan_kmalloc(page->slab_cache, object, size,
+ flags, true);
+}
+
+void kasan_poison_kfree(void *ptr, unsigned long ip)
+{
+ struct page *page;
+
+ page = virt_to_head_page(ptr);
+
+ if (unlikely(!PageSlab(page))) {
+ if (ptr != page_address(page)) {
+ kasan_report_invalid_free(ptr, ip);
+ return;
+ }
+ kasan_poison_shadow(ptr, page_size(page), KASAN_FREE_PAGE);
+ } else {
+ __kasan_slab_free(page->slab_cache, ptr, ip, false);
+ }
+}
+
+void kasan_kfree_large(void *ptr, unsigned long ip)
+{
+ if (ptr != page_address(virt_to_head_page(ptr)))
+ kasan_report_invalid_free(ptr, ip);
+ /* The object will be poisoned by page_alloc. */
+}
+
+#ifndef CONFIG_KASAN_VMALLOC
+int kasan_module_alloc(void *addr, size_t size)
+{
+ void *ret;
+ size_t scaled_size;
+ size_t shadow_size;
+ unsigned long shadow_start;
+
+ shadow_start = (unsigned long)kasan_mem_to_shadow(addr);
+ scaled_size = (size + KASAN_SHADOW_MASK) >> KASAN_SHADOW_SCALE_SHIFT;
+ shadow_size = round_up(scaled_size, PAGE_SIZE);
+
+ if (WARN_ON(!PAGE_ALIGNED(shadow_start)))
+ return -EINVAL;
+
+ ret = __vmalloc_node_range(shadow_size, 1, shadow_start,
+ shadow_start + shadow_size,
+ GFP_KERNEL,
+ PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE,
+ __builtin_return_address(0));
+
+ if (ret) {
+ __memset(ret, KASAN_SHADOW_INIT, shadow_size);
+ find_vm_area(addr)->flags |= VM_KASAN;
+ kmemleak_ignore(ret);
+ return 0;
+ }
+
+ return -ENOMEM;
+}
+
+void kasan_free_shadow(const struct vm_struct *vm)
+{
+ if (vm->flags & VM_KASAN)
+ vfree(kasan_mem_to_shadow(vm->addr));
+}
+#endif
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static bool shadow_mapped(unsigned long addr)
+{
+ pgd_t *pgd = pgd_offset_k(addr);
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ if (pgd_none(*pgd))
+ return false;
+ p4d = p4d_offset(pgd, addr);
+ if (p4d_none(*p4d))
+ return false;
+ pud = pud_offset(p4d, addr);
+ if (pud_none(*pud))
+ return false;
+
+ /*
+ * We can't use pud_large() or pud_huge(), the first one is
+ * arch-specific, the last one depends on HUGETLB_PAGE. So let's abuse
+ * pud_bad(), if pud is bad then it's bad because it's huge.
+ */
+ if (pud_bad(*pud))
+ return true;
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd))
+ return false;
+
+ if (pmd_bad(*pmd))
+ return true;
+ pte = pte_offset_kernel(pmd, addr);
+ return !pte_none(*pte);
+}
+
+static int __meminit kasan_mem_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct memory_notify *mem_data = data;
+ unsigned long nr_shadow_pages, start_kaddr, shadow_start;
+ unsigned long shadow_end, shadow_size;
+
+ nr_shadow_pages = mem_data->nr_pages >> KASAN_SHADOW_SCALE_SHIFT;
+ start_kaddr = (unsigned long)pfn_to_kaddr(mem_data->start_pfn);
+ shadow_start = (unsigned long)kasan_mem_to_shadow((void *)start_kaddr);
+ shadow_size = nr_shadow_pages << PAGE_SHIFT;
+ shadow_end = shadow_start + shadow_size;
+
+ if (WARN_ON(mem_data->nr_pages % KASAN_SHADOW_SCALE_SIZE) ||
+ WARN_ON(start_kaddr % (KASAN_SHADOW_SCALE_SIZE << PAGE_SHIFT)))
+ return NOTIFY_BAD;
+
+ switch (action) {
+ case MEM_GOING_ONLINE: {
+ void *ret;
+
+ /*
+ * If shadow is mapped already than it must have been mapped
+ * during the boot. This could happen if we onlining previously
+ * offlined memory.
+ */
+ if (shadow_mapped(shadow_start))
+ return NOTIFY_OK;
+
+ ret = __vmalloc_node_range(shadow_size, PAGE_SIZE, shadow_start,
+ shadow_end, GFP_KERNEL,
+ PAGE_KERNEL, VM_NO_GUARD,
+ pfn_to_nid(mem_data->start_pfn),
+ __builtin_return_address(0));
+ if (!ret)
+ return NOTIFY_BAD;
+
+ kmemleak_ignore(ret);
+ return NOTIFY_OK;
+ }
+ case MEM_CANCEL_ONLINE:
+ case MEM_OFFLINE: {
+ struct vm_struct *vm;
+
+ /*
+ * shadow_start was either mapped during boot by kasan_init()
+ * or during memory online by __vmalloc_node_range().
+ * In the latter case we can use vfree() to free shadow.
+ * Non-NULL result of the find_vm_area() will tell us if
+ * that was the second case.
+ *
+ * Currently it's not possible to free shadow mapped
+ * during boot by kasan_init(). It's because the code
+ * to do that hasn't been written yet. So we'll just
+ * leak the memory.
+ */
+ vm = find_vm_area((void *)shadow_start);
+ if (vm)
+ vfree((void *)shadow_start);
+ }
+ }
+
+ return NOTIFY_OK;
+}
+
+static int __init kasan_memhotplug_init(void)
+{
+ hotplug_memory_notifier(kasan_mem_notifier, 0);
+
+ return 0;
+}
+
+core_initcall(kasan_memhotplug_init);
+#endif
+
+#ifdef CONFIG_KASAN_VMALLOC
+static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
+ void *unused)
+{
+ unsigned long page;
+ pte_t pte;
+
+ if (likely(!pte_none(*ptep)))
+ return 0;
+
+ page = __get_free_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+
+ memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE);
+ pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL);
+
+ spin_lock(&init_mm.page_table_lock);
+ if (likely(pte_none(*ptep))) {
+ set_pte_at(&init_mm, addr, ptep, pte);
+ page = 0;
+ }
+ spin_unlock(&init_mm.page_table_lock);
+ if (page)
+ free_page(page);
+ return 0;
+}
+
+int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
+{
+ unsigned long shadow_start, shadow_end;
+ int ret;
+
+ if (!is_vmalloc_or_module_addr((void *)addr))
+ return 0;
+
+ shadow_start = (unsigned long)kasan_mem_to_shadow((void *)addr);
+ shadow_start = ALIGN_DOWN(shadow_start, PAGE_SIZE);
+ shadow_end = (unsigned long)kasan_mem_to_shadow((void *)addr + size);
+ shadow_end = ALIGN(shadow_end, PAGE_SIZE);
+
+ ret = apply_to_page_range(&init_mm, shadow_start,
+ shadow_end - shadow_start,
+ kasan_populate_vmalloc_pte, NULL);
+ if (ret)
+ return ret;
+
+ flush_cache_vmap(shadow_start, shadow_end);
+
+ /*
+ * We need to be careful about inter-cpu effects here. Consider:
+ *
+ * CPU#0 CPU#1
+ * WRITE_ONCE(p, vmalloc(100)); while (x = READ_ONCE(p)) ;
+ * p[99] = 1;
+ *
+ * With compiler instrumentation, that ends up looking like this:
+ *
+ * CPU#0 CPU#1
+ * // vmalloc() allocates memory
+ * // let a = area->addr
+ * // we reach kasan_populate_vmalloc
+ * // and call kasan_unpoison_shadow:
+ * STORE shadow(a), unpoison_val
+ * ...
+ * STORE shadow(a+99), unpoison_val x = LOAD p
+ * // rest of vmalloc process <data dependency>
+ * STORE p, a LOAD shadow(x+99)
+ *
+ * If there is no barrier between the end of unpoisioning the shadow
+ * and the store of the result to p, the stores could be committed
+ * in a different order by CPU#0, and CPU#1 could erroneously observe
+ * poison in the shadow.
+ *
+ * We need some sort of barrier between the stores.
+ *
+ * In the vmalloc() case, this is provided by a smp_wmb() in
+ * clear_vm_uninitialized_flag(). In the per-cpu allocator and in
+ * get_vm_area() and friends, the caller gets shadow allocated but
+ * doesn't have any pages mapped into the virtual address space that
+ * has been reserved. Mapping those pages in will involve taking and
+ * releasing a page-table lock, which will provide the barrier.
+ */
+
+ return 0;
+}
+
+/*
+ * Poison the shadow for a vmalloc region. Called as part of the
+ * freeing process at the time the region is freed.
+ */
+void kasan_poison_vmalloc(const void *start, unsigned long size)
+{
+ if (!is_vmalloc_or_module_addr(start))
+ return;
+
+ size = round_up(size, KASAN_SHADOW_SCALE_SIZE);
+ kasan_poison_shadow(start, size, KASAN_VMALLOC_INVALID);
+}
+
+void kasan_unpoison_vmalloc(const void *start, unsigned long size)
+{
+ if (!is_vmalloc_or_module_addr(start))
+ return;
+
+ kasan_unpoison_shadow(start, size);
+}
+
+static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
+ void *unused)
+{
+ unsigned long page;
+
+ page = (unsigned long)__va(pte_pfn(*ptep) << PAGE_SHIFT);
+
+ spin_lock(&init_mm.page_table_lock);
+
+ if (likely(!pte_none(*ptep))) {
+ pte_clear(&init_mm, addr, ptep);
+ free_page(page);
+ }
+ spin_unlock(&init_mm.page_table_lock);
+
+ return 0;
+}
+
+/*
+ * Release the backing for the vmalloc region [start, end), which
+ * lies within the free region [free_region_start, free_region_end).
+ *
+ * This can be run lazily, long after the region was freed. It runs
+ * under vmap_area_lock, so it's not safe to interact with the vmalloc/vmap
+ * infrastructure.
+ *
+ * How does this work?
+ * -------------------
+ *
+ * We have a region that is page aligned, labelled as A.
+ * That might not map onto the shadow in a way that is page-aligned:
+ *
+ * start end
+ * v v
+ * |????????|????????|AAAAAAAA|AA....AA|AAAAAAAA|????????| < vmalloc
+ * -------- -------- -------- -------- --------
+ * | | | | |
+ * | | | /-------/ |
+ * \-------\|/------/ |/---------------/
+ * ||| ||
+ * |??AAAAAA|AAAAAAAA|AA??????| < shadow
+ * (1) (2) (3)
+ *
+ * First we align the start upwards and the end downwards, so that the
+ * shadow of the region aligns with shadow page boundaries. In the
+ * example, this gives us the shadow page (2). This is the shadow entirely
+ * covered by this allocation.
+ *
+ * Then we have the tricky bits. We want to know if we can free the
+ * partially covered shadow pages - (1) and (3) in the example. For this,
+ * we are given the start and end of the free region that contains this
+ * allocation. Extending our previous example, we could have:
+ *
+ * free_region_start free_region_end
+ * | start end |
+ * v v v v
+ * |FFFFFFFF|FFFFFFFF|AAAAAAAA|AA....AA|AAAAAAAA|FFFFFFFF| < vmalloc
+ * -------- -------- -------- -------- --------
+ * | | | | |
+ * | | | /-------/ |
+ * \-------\|/------/ |/---------------/
+ * ||| ||
+ * |FFAAAAAA|AAAAAAAA|AAF?????| < shadow
+ * (1) (2) (3)
+ *
+ * Once again, we align the start of the free region up, and the end of
+ * the free region down so that the shadow is page aligned. So we can free
+ * page (1) - we know no allocation currently uses anything in that page,
+ * because all of it is in the vmalloc free region. But we cannot free
+ * page (3), because we can't be sure that the rest of it is unused.
+ *
+ * We only consider pages that contain part of the original region for
+ * freeing: we don't try to free other pages from the free region or we'd
+ * end up trying to free huge chunks of virtual address space.
+ *
+ * Concurrency
+ * -----------
+ *
+ * How do we know that we're not freeing a page that is simultaneously
+ * being used for a fresh allocation in kasan_populate_vmalloc(_pte)?
+ *
+ * We _can_ have kasan_release_vmalloc and kasan_populate_vmalloc running
+ * at the same time. While we run under free_vmap_area_lock, the population
+ * code does not.
+ *
+ * free_vmap_area_lock instead operates to ensure that the larger range
+ * [free_region_start, free_region_end) is safe: because __alloc_vmap_area and
+ * the per-cpu region-finding algorithm both run under free_vmap_area_lock,
+ * no space identified as free will become used while we are running. This
+ * means that so long as we are careful with alignment and only free shadow
+ * pages entirely covered by the free region, we will not run in to any
+ * trouble - any simultaneous allocations will be for disjoint regions.
+ */
+void kasan_release_vmalloc(unsigned long start, unsigned long end,
+ unsigned long free_region_start,
+ unsigned long free_region_end)
+{
+ void *shadow_start, *shadow_end;
+ unsigned long region_start, region_end;
+ unsigned long size;
+
+ region_start = ALIGN(start, PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE);
+ region_end = ALIGN_DOWN(end, PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE);
+
+ free_region_start = ALIGN(free_region_start,
+ PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE);
+
+ if (start != region_start &&
+ free_region_start < region_start)
+ region_start -= PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE;
+
+ free_region_end = ALIGN_DOWN(free_region_end,
+ PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE);
+
+ if (end != region_end &&
+ free_region_end > region_end)
+ region_end += PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE;
+
+ shadow_start = kasan_mem_to_shadow((void *)region_start);
+ shadow_end = kasan_mem_to_shadow((void *)region_end);
+
+ if (shadow_end > shadow_start) {
+ size = shadow_end - shadow_start;
+ apply_to_existing_page_range(&init_mm,
+ (unsigned long)shadow_start,
+ size, kasan_depopulate_vmalloc_pte,
+ NULL);
+ flush_tlb_kernel_range((unsigned long)shadow_start,
+ (unsigned long)shadow_end);
+ }
+}
+#endif
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
new file mode 100644
index 000000000..248264b9c
--- /dev/null
+++ b/mm/kasan/generic.c
@@ -0,0 +1,369 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file contains core generic KASAN code.
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
+ *
+ * Some code borrowed from https://github.com/xairy/kasan-prototype by
+ * Andrey Konovalov <andreyknvl@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/export.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/kasan.h>
+#include <linux/kernel.h>
+#include <linux/kmemleak.h>
+#include <linux/linkage.h>
+#include <linux/memblock.h>
+#include <linux/memory.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/slab.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+#include <linux/bug.h>
+
+#include "kasan.h"
+#include "../slab.h"
+
+/*
+ * All functions below always inlined so compiler could
+ * perform better optimizations in each of __asan_loadX/__assn_storeX
+ * depending on memory access size X.
+ */
+
+static __always_inline bool memory_is_poisoned_1(unsigned long addr)
+{
+ s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr);
+
+ if (unlikely(shadow_value)) {
+ s8 last_accessible_byte = addr & KASAN_SHADOW_MASK;
+ return unlikely(last_accessible_byte >= shadow_value);
+ }
+
+ return false;
+}
+
+static __always_inline bool memory_is_poisoned_2_4_8(unsigned long addr,
+ unsigned long size)
+{
+ u8 *shadow_addr = (u8 *)kasan_mem_to_shadow((void *)addr);
+
+ /*
+ * Access crosses 8(shadow size)-byte boundary. Such access maps
+ * into 2 shadow bytes, so we need to check them both.
+ */
+ if (unlikely(((addr + size - 1) & KASAN_SHADOW_MASK) < size - 1))
+ return *shadow_addr || memory_is_poisoned_1(addr + size - 1);
+
+ return memory_is_poisoned_1(addr + size - 1);
+}
+
+static __always_inline bool memory_is_poisoned_16(unsigned long addr)
+{
+ u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
+
+ /* Unaligned 16-bytes access maps into 3 shadow bytes. */
+ if (unlikely(!IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE)))
+ return *shadow_addr || memory_is_poisoned_1(addr + 15);
+
+ return *shadow_addr;
+}
+
+static __always_inline unsigned long bytes_is_nonzero(const u8 *start,
+ size_t size)
+{
+ while (size) {
+ if (unlikely(*start))
+ return (unsigned long)start;
+ start++;
+ size--;
+ }
+
+ return 0;
+}
+
+static __always_inline unsigned long memory_is_nonzero(const void *start,
+ const void *end)
+{
+ unsigned int words;
+ unsigned long ret;
+ unsigned int prefix = (unsigned long)start % 8;
+
+ if (end - start <= 16)
+ return bytes_is_nonzero(start, end - start);
+
+ if (prefix) {
+ prefix = 8 - prefix;
+ ret = bytes_is_nonzero(start, prefix);
+ if (unlikely(ret))
+ return ret;
+ start += prefix;
+ }
+
+ words = (end - start) / 8;
+ while (words) {
+ if (unlikely(*(u64 *)start))
+ return bytes_is_nonzero(start, 8);
+ start += 8;
+ words--;
+ }
+
+ return bytes_is_nonzero(start, (end - start) % 8);
+}
+
+static __always_inline bool memory_is_poisoned_n(unsigned long addr,
+ size_t size)
+{
+ unsigned long ret;
+
+ ret = memory_is_nonzero(kasan_mem_to_shadow((void *)addr),
+ kasan_mem_to_shadow((void *)addr + size - 1) + 1);
+
+ if (unlikely(ret)) {
+ unsigned long last_byte = addr + size - 1;
+ s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte);
+
+ if (unlikely(ret != (unsigned long)last_shadow ||
+ ((long)(last_byte & KASAN_SHADOW_MASK) >= *last_shadow)))
+ return true;
+ }
+ return false;
+}
+
+static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
+{
+ if (__builtin_constant_p(size)) {
+ switch (size) {
+ case 1:
+ return memory_is_poisoned_1(addr);
+ case 2:
+ case 4:
+ case 8:
+ return memory_is_poisoned_2_4_8(addr, size);
+ case 16:
+ return memory_is_poisoned_16(addr);
+ default:
+ BUILD_BUG();
+ }
+ }
+
+ return memory_is_poisoned_n(addr, size);
+}
+
+static __always_inline bool check_memory_region_inline(unsigned long addr,
+ size_t size, bool write,
+ unsigned long ret_ip)
+{
+ if (unlikely(size == 0))
+ return true;
+
+ if (unlikely(addr + size < addr))
+ return !kasan_report(addr, size, write, ret_ip);
+
+ if (unlikely((void *)addr <
+ kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
+ return !kasan_report(addr, size, write, ret_ip);
+ }
+
+ if (likely(!memory_is_poisoned(addr, size)))
+ return true;
+
+ return !kasan_report(addr, size, write, ret_ip);
+}
+
+bool check_memory_region(unsigned long addr, size_t size, bool write,
+ unsigned long ret_ip)
+{
+ return check_memory_region_inline(addr, size, write, ret_ip);
+}
+
+void kasan_cache_shrink(struct kmem_cache *cache)
+{
+ quarantine_remove_cache(cache);
+}
+
+void kasan_cache_shutdown(struct kmem_cache *cache)
+{
+ if (!__kmem_cache_empty(cache))
+ quarantine_remove_cache(cache);
+}
+
+static void register_global(struct kasan_global *global)
+{
+ size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE);
+
+ kasan_unpoison_shadow(global->beg, global->size);
+
+ kasan_poison_shadow(global->beg + aligned_size,
+ global->size_with_redzone - aligned_size,
+ KASAN_GLOBAL_REDZONE);
+}
+
+void __asan_register_globals(struct kasan_global *globals, size_t size)
+{
+ int i;
+
+ for (i = 0; i < size; i++)
+ register_global(&globals[i]);
+}
+EXPORT_SYMBOL(__asan_register_globals);
+
+void __asan_unregister_globals(struct kasan_global *globals, size_t size)
+{
+}
+EXPORT_SYMBOL(__asan_unregister_globals);
+
+#define DEFINE_ASAN_LOAD_STORE(size) \
+ void __asan_load##size(unsigned long addr) \
+ { \
+ check_memory_region_inline(addr, size, false, _RET_IP_);\
+ } \
+ EXPORT_SYMBOL(__asan_load##size); \
+ __alias(__asan_load##size) \
+ void __asan_load##size##_noabort(unsigned long); \
+ EXPORT_SYMBOL(__asan_load##size##_noabort); \
+ void __asan_store##size(unsigned long addr) \
+ { \
+ check_memory_region_inline(addr, size, true, _RET_IP_); \
+ } \
+ EXPORT_SYMBOL(__asan_store##size); \
+ __alias(__asan_store##size) \
+ void __asan_store##size##_noabort(unsigned long); \
+ EXPORT_SYMBOL(__asan_store##size##_noabort)
+
+DEFINE_ASAN_LOAD_STORE(1);
+DEFINE_ASAN_LOAD_STORE(2);
+DEFINE_ASAN_LOAD_STORE(4);
+DEFINE_ASAN_LOAD_STORE(8);
+DEFINE_ASAN_LOAD_STORE(16);
+
+void __asan_loadN(unsigned long addr, size_t size)
+{
+ check_memory_region(addr, size, false, _RET_IP_);
+}
+EXPORT_SYMBOL(__asan_loadN);
+
+__alias(__asan_loadN)
+void __asan_loadN_noabort(unsigned long, size_t);
+EXPORT_SYMBOL(__asan_loadN_noabort);
+
+void __asan_storeN(unsigned long addr, size_t size)
+{
+ check_memory_region(addr, size, true, _RET_IP_);
+}
+EXPORT_SYMBOL(__asan_storeN);
+
+__alias(__asan_storeN)
+void __asan_storeN_noabort(unsigned long, size_t);
+EXPORT_SYMBOL(__asan_storeN_noabort);
+
+/* to shut up compiler complaints */
+void __asan_handle_no_return(void) {}
+EXPORT_SYMBOL(__asan_handle_no_return);
+
+/* Emitted by compiler to poison alloca()ed objects. */
+void __asan_alloca_poison(unsigned long addr, size_t size)
+{
+ size_t rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE);
+ size_t padding_size = round_up(size, KASAN_ALLOCA_REDZONE_SIZE) -
+ rounded_up_size;
+ size_t rounded_down_size = round_down(size, KASAN_SHADOW_SCALE_SIZE);
+
+ const void *left_redzone = (const void *)(addr -
+ KASAN_ALLOCA_REDZONE_SIZE);
+ const void *right_redzone = (const void *)(addr + rounded_up_size);
+
+ WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE));
+
+ kasan_unpoison_shadow((const void *)(addr + rounded_down_size),
+ size - rounded_down_size);
+ kasan_poison_shadow(left_redzone, KASAN_ALLOCA_REDZONE_SIZE,
+ KASAN_ALLOCA_LEFT);
+ kasan_poison_shadow(right_redzone,
+ padding_size + KASAN_ALLOCA_REDZONE_SIZE,
+ KASAN_ALLOCA_RIGHT);
+}
+EXPORT_SYMBOL(__asan_alloca_poison);
+
+/* Emitted by compiler to unpoison alloca()ed areas when the stack unwinds. */
+void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom)
+{
+ if (unlikely(!stack_top || stack_top > stack_bottom))
+ return;
+
+ kasan_unpoison_shadow(stack_top, stack_bottom - stack_top);
+}
+EXPORT_SYMBOL(__asan_allocas_unpoison);
+
+/* Emitted by the compiler to [un]poison local variables. */
+#define DEFINE_ASAN_SET_SHADOW(byte) \
+ void __asan_set_shadow_##byte(const void *addr, size_t size) \
+ { \
+ __memset((void *)addr, 0x##byte, size); \
+ } \
+ EXPORT_SYMBOL(__asan_set_shadow_##byte)
+
+DEFINE_ASAN_SET_SHADOW(00);
+DEFINE_ASAN_SET_SHADOW(f1);
+DEFINE_ASAN_SET_SHADOW(f2);
+DEFINE_ASAN_SET_SHADOW(f3);
+DEFINE_ASAN_SET_SHADOW(f5);
+DEFINE_ASAN_SET_SHADOW(f8);
+
+void kasan_record_aux_stack(void *addr)
+{
+ struct page *page = kasan_addr_to_page(addr);
+ struct kmem_cache *cache;
+ struct kasan_alloc_meta *alloc_info;
+ void *object;
+
+ if (!(page && PageSlab(page)))
+ return;
+
+ cache = page->slab_cache;
+ object = nearest_obj(cache, page, addr);
+ alloc_info = get_alloc_info(cache, object);
+
+ /*
+ * record the last two call_rcu() call stacks.
+ */
+ alloc_info->aux_stack[1] = alloc_info->aux_stack[0];
+ alloc_info->aux_stack[0] = kasan_save_stack(GFP_NOWAIT);
+}
+
+void kasan_set_free_info(struct kmem_cache *cache,
+ void *object, u8 tag)
+{
+ struct kasan_free_meta *free_meta;
+
+ free_meta = get_free_info(cache, object);
+ kasan_set_track(&free_meta->free_track, GFP_NOWAIT);
+
+ /*
+ * the object was freed and has free track set
+ */
+ *(u8 *)kasan_mem_to_shadow(object) = KASAN_KMALLOC_FREETRACK;
+}
+
+struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
+ void *object, u8 tag)
+{
+ if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_KMALLOC_FREETRACK)
+ return NULL;
+ return &get_free_info(cache, object)->free_track;
+}
diff --git a/mm/kasan/generic_report.c b/mm/kasan/generic_report.c
new file mode 100644
index 000000000..a38c7a9e1
--- /dev/null
+++ b/mm/kasan/generic_report.c
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file contains generic KASAN specific error reporting code.
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
+ *
+ * Some code borrowed from https://github.com/xairy/kasan-prototype by
+ * Andrey Konovalov <andreyknvl@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/bitops.h>
+#include <linux/ftrace.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/stackdepot.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/kasan.h>
+#include <linux/module.h>
+
+#include <asm/sections.h>
+
+#include "kasan.h"
+#include "../slab.h"
+
+void *find_first_bad_addr(void *addr, size_t size)
+{
+ void *p = addr;
+
+ while (p < addr + size && !(*(u8 *)kasan_mem_to_shadow(p)))
+ p += KASAN_SHADOW_SCALE_SIZE;
+ return p;
+}
+
+static const char *get_shadow_bug_type(struct kasan_access_info *info)
+{
+ const char *bug_type = "unknown-crash";
+ u8 *shadow_addr;
+
+ shadow_addr = (u8 *)kasan_mem_to_shadow(info->first_bad_addr);
+
+ /*
+ * If shadow byte value is in [0, KASAN_SHADOW_SCALE_SIZE) we can look
+ * at the next shadow byte to determine the type of the bad access.
+ */
+ if (*shadow_addr > 0 && *shadow_addr <= KASAN_SHADOW_SCALE_SIZE - 1)
+ shadow_addr++;
+
+ switch (*shadow_addr) {
+ case 0 ... KASAN_SHADOW_SCALE_SIZE - 1:
+ /*
+ * In theory it's still possible to see these shadow values
+ * due to a data race in the kernel code.
+ */
+ bug_type = "out-of-bounds";
+ break;
+ case KASAN_PAGE_REDZONE:
+ case KASAN_KMALLOC_REDZONE:
+ bug_type = "slab-out-of-bounds";
+ break;
+ case KASAN_GLOBAL_REDZONE:
+ bug_type = "global-out-of-bounds";
+ break;
+ case KASAN_STACK_LEFT:
+ case KASAN_STACK_MID:
+ case KASAN_STACK_RIGHT:
+ case KASAN_STACK_PARTIAL:
+ bug_type = "stack-out-of-bounds";
+ break;
+ case KASAN_FREE_PAGE:
+ case KASAN_KMALLOC_FREE:
+ case KASAN_KMALLOC_FREETRACK:
+ bug_type = "use-after-free";
+ break;
+ case KASAN_ALLOCA_LEFT:
+ case KASAN_ALLOCA_RIGHT:
+ bug_type = "alloca-out-of-bounds";
+ break;
+ case KASAN_VMALLOC_INVALID:
+ bug_type = "vmalloc-out-of-bounds";
+ break;
+ }
+
+ return bug_type;
+}
+
+static const char *get_wild_bug_type(struct kasan_access_info *info)
+{
+ const char *bug_type = "unknown-crash";
+
+ if ((unsigned long)info->access_addr < PAGE_SIZE)
+ bug_type = "null-ptr-deref";
+ else if ((unsigned long)info->access_addr < TASK_SIZE)
+ bug_type = "user-memory-access";
+ else
+ bug_type = "wild-memory-access";
+
+ return bug_type;
+}
+
+const char *get_bug_type(struct kasan_access_info *info)
+{
+ /*
+ * If access_size is a negative number, then it has reason to be
+ * defined as out-of-bounds bug type.
+ *
+ * Casting negative numbers to size_t would indeed turn up as
+ * a large size_t and its value will be larger than ULONG_MAX/2,
+ * so that this can qualify as out-of-bounds.
+ */
+ if (info->access_addr + info->access_size < info->access_addr)
+ return "out-of-bounds";
+
+ if (addr_has_shadow(info->access_addr))
+ return get_shadow_bug_type(info);
+ return get_wild_bug_type(info);
+}
+
+#define DEFINE_ASAN_REPORT_LOAD(size) \
+void __asan_report_load##size##_noabort(unsigned long addr) \
+{ \
+ kasan_report(addr, size, false, _RET_IP_); \
+} \
+EXPORT_SYMBOL(__asan_report_load##size##_noabort)
+
+#define DEFINE_ASAN_REPORT_STORE(size) \
+void __asan_report_store##size##_noabort(unsigned long addr) \
+{ \
+ kasan_report(addr, size, true, _RET_IP_); \
+} \
+EXPORT_SYMBOL(__asan_report_store##size##_noabort)
+
+DEFINE_ASAN_REPORT_LOAD(1);
+DEFINE_ASAN_REPORT_LOAD(2);
+DEFINE_ASAN_REPORT_LOAD(4);
+DEFINE_ASAN_REPORT_LOAD(8);
+DEFINE_ASAN_REPORT_LOAD(16);
+DEFINE_ASAN_REPORT_STORE(1);
+DEFINE_ASAN_REPORT_STORE(2);
+DEFINE_ASAN_REPORT_STORE(4);
+DEFINE_ASAN_REPORT_STORE(8);
+DEFINE_ASAN_REPORT_STORE(16);
+
+void __asan_report_load_n_noabort(unsigned long addr, size_t size)
+{
+ kasan_report(addr, size, false, _RET_IP_);
+}
+EXPORT_SYMBOL(__asan_report_load_n_noabort);
+
+void __asan_report_store_n_noabort(unsigned long addr, size_t size)
+{
+ kasan_report(addr, size, true, _RET_IP_);
+}
+EXPORT_SYMBOL(__asan_report_store_n_noabort);
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
new file mode 100644
index 000000000..b8c6ec172
--- /dev/null
+++ b/mm/kasan/init.c
@@ -0,0 +1,497 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file contains some kasan initialization code.
+ *
+ * Copyright (c) 2015 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/memblock.h>
+#include <linux/init.h>
+#include <linux/kasan.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/pfn.h>
+#include <linux/slab.h>
+
+#include <asm/page.h>
+#include <asm/pgalloc.h>
+
+#include "kasan.h"
+
+/*
+ * This page serves two purposes:
+ * - It used as early shadow memory. The entire shadow region populated
+ * with this page, before we will be able to setup normal shadow memory.
+ * - Latter it reused it as zero shadow to cover large ranges of memory
+ * that allowed to access, but not handled by kasan (vmalloc/vmemmap ...).
+ */
+unsigned char kasan_early_shadow_page[PAGE_SIZE] __page_aligned_bss;
+
+#if CONFIG_PGTABLE_LEVELS > 4
+p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D] __page_aligned_bss;
+static inline bool kasan_p4d_table(pgd_t pgd)
+{
+ return pgd_page(pgd) == virt_to_page(lm_alias(kasan_early_shadow_p4d));
+}
+#else
+static inline bool kasan_p4d_table(pgd_t pgd)
+{
+ return false;
+}
+#endif
+#if CONFIG_PGTABLE_LEVELS > 3
+pud_t kasan_early_shadow_pud[PTRS_PER_PUD] __page_aligned_bss;
+static inline bool kasan_pud_table(p4d_t p4d)
+{
+ return p4d_page(p4d) == virt_to_page(lm_alias(kasan_early_shadow_pud));
+}
+#else
+static inline bool kasan_pud_table(p4d_t p4d)
+{
+ return false;
+}
+#endif
+#if CONFIG_PGTABLE_LEVELS > 2
+pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD] __page_aligned_bss;
+static inline bool kasan_pmd_table(pud_t pud)
+{
+ return pud_page(pud) == virt_to_page(lm_alias(kasan_early_shadow_pmd));
+}
+#else
+static inline bool kasan_pmd_table(pud_t pud)
+{
+ return false;
+}
+#endif
+pte_t kasan_early_shadow_pte[PTRS_PER_PTE] __page_aligned_bss;
+
+static inline bool kasan_pte_table(pmd_t pmd)
+{
+ return pmd_page(pmd) == virt_to_page(lm_alias(kasan_early_shadow_pte));
+}
+
+static inline bool kasan_early_shadow_page_entry(pte_t pte)
+{
+ return pte_page(pte) == virt_to_page(lm_alias(kasan_early_shadow_page));
+}
+
+static __init void *early_alloc(size_t size, int node)
+{
+ void *ptr = memblock_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS),
+ MEMBLOCK_ALLOC_ACCESSIBLE, node);
+
+ if (!ptr)
+ panic("%s: Failed to allocate %zu bytes align=%zx nid=%d from=%llx\n",
+ __func__, size, size, node, (u64)__pa(MAX_DMA_ADDRESS));
+
+ return ptr;
+}
+
+static void __ref zero_pte_populate(pmd_t *pmd, unsigned long addr,
+ unsigned long end)
+{
+ pte_t *pte = pte_offset_kernel(pmd, addr);
+ pte_t zero_pte;
+
+ zero_pte = pfn_pte(PFN_DOWN(__pa_symbol(kasan_early_shadow_page)),
+ PAGE_KERNEL);
+ zero_pte = pte_wrprotect(zero_pte);
+
+ while (addr + PAGE_SIZE <= end) {
+ set_pte_at(&init_mm, addr, pte, zero_pte);
+ addr += PAGE_SIZE;
+ pte = pte_offset_kernel(pmd, addr);
+ }
+}
+
+static int __ref zero_pmd_populate(pud_t *pud, unsigned long addr,
+ unsigned long end)
+{
+ pmd_t *pmd = pmd_offset(pud, addr);
+ unsigned long next;
+
+ do {
+ next = pmd_addr_end(addr, end);
+
+ if (IS_ALIGNED(addr, PMD_SIZE) && end - addr >= PMD_SIZE) {
+ pmd_populate_kernel(&init_mm, pmd,
+ lm_alias(kasan_early_shadow_pte));
+ continue;
+ }
+
+ if (pmd_none(*pmd)) {
+ pte_t *p;
+
+ if (slab_is_available())
+ p = pte_alloc_one_kernel(&init_mm);
+ else
+ p = early_alloc(PAGE_SIZE, NUMA_NO_NODE);
+ if (!p)
+ return -ENOMEM;
+
+ pmd_populate_kernel(&init_mm, pmd, p);
+ }
+ zero_pte_populate(pmd, addr, next);
+ } while (pmd++, addr = next, addr != end);
+
+ return 0;
+}
+
+static int __ref zero_pud_populate(p4d_t *p4d, unsigned long addr,
+ unsigned long end)
+{
+ pud_t *pud = pud_offset(p4d, addr);
+ unsigned long next;
+
+ do {
+ next = pud_addr_end(addr, end);
+ if (IS_ALIGNED(addr, PUD_SIZE) && end - addr >= PUD_SIZE) {
+ pmd_t *pmd;
+
+ pud_populate(&init_mm, pud,
+ lm_alias(kasan_early_shadow_pmd));
+ pmd = pmd_offset(pud, addr);
+ pmd_populate_kernel(&init_mm, pmd,
+ lm_alias(kasan_early_shadow_pte));
+ continue;
+ }
+
+ if (pud_none(*pud)) {
+ pmd_t *p;
+
+ if (slab_is_available()) {
+ p = pmd_alloc(&init_mm, pud, addr);
+ if (!p)
+ return -ENOMEM;
+ } else {
+ pud_populate(&init_mm, pud,
+ early_alloc(PAGE_SIZE, NUMA_NO_NODE));
+ }
+ }
+ zero_pmd_populate(pud, addr, next);
+ } while (pud++, addr = next, addr != end);
+
+ return 0;
+}
+
+static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr,
+ unsigned long end)
+{
+ p4d_t *p4d = p4d_offset(pgd, addr);
+ unsigned long next;
+
+ do {
+ next = p4d_addr_end(addr, end);
+ if (IS_ALIGNED(addr, P4D_SIZE) && end - addr >= P4D_SIZE) {
+ pud_t *pud;
+ pmd_t *pmd;
+
+ p4d_populate(&init_mm, p4d,
+ lm_alias(kasan_early_shadow_pud));
+ pud = pud_offset(p4d, addr);
+ pud_populate(&init_mm, pud,
+ lm_alias(kasan_early_shadow_pmd));
+ pmd = pmd_offset(pud, addr);
+ pmd_populate_kernel(&init_mm, pmd,
+ lm_alias(kasan_early_shadow_pte));
+ continue;
+ }
+
+ if (p4d_none(*p4d)) {
+ pud_t *p;
+
+ if (slab_is_available()) {
+ p = pud_alloc(&init_mm, p4d, addr);
+ if (!p)
+ return -ENOMEM;
+ } else {
+ p4d_populate(&init_mm, p4d,
+ early_alloc(PAGE_SIZE, NUMA_NO_NODE));
+ }
+ }
+ zero_pud_populate(p4d, addr, next);
+ } while (p4d++, addr = next, addr != end);
+
+ return 0;
+}
+
+/**
+ * kasan_populate_early_shadow - populate shadow memory region with
+ * kasan_early_shadow_page
+ * @shadow_start - start of the memory range to populate
+ * @shadow_end - end of the memory range to populate
+ */
+int __ref kasan_populate_early_shadow(const void *shadow_start,
+ const void *shadow_end)
+{
+ unsigned long addr = (unsigned long)shadow_start;
+ unsigned long end = (unsigned long)shadow_end;
+ pgd_t *pgd = pgd_offset_k(addr);
+ unsigned long next;
+
+ do {
+ next = pgd_addr_end(addr, end);
+
+ if (IS_ALIGNED(addr, PGDIR_SIZE) && end - addr >= PGDIR_SIZE) {
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ /*
+ * kasan_early_shadow_pud should be populated with pmds
+ * at this moment.
+ * [pud,pmd]_populate*() below needed only for
+ * 3,2 - level page tables where we don't have
+ * puds,pmds, so pgd_populate(), pud_populate()
+ * is noops.
+ */
+ pgd_populate(&init_mm, pgd,
+ lm_alias(kasan_early_shadow_p4d));
+ p4d = p4d_offset(pgd, addr);
+ p4d_populate(&init_mm, p4d,
+ lm_alias(kasan_early_shadow_pud));
+ pud = pud_offset(p4d, addr);
+ pud_populate(&init_mm, pud,
+ lm_alias(kasan_early_shadow_pmd));
+ pmd = pmd_offset(pud, addr);
+ pmd_populate_kernel(&init_mm, pmd,
+ lm_alias(kasan_early_shadow_pte));
+ continue;
+ }
+
+ if (pgd_none(*pgd)) {
+ p4d_t *p;
+
+ if (slab_is_available()) {
+ p = p4d_alloc(&init_mm, pgd, addr);
+ if (!p)
+ return -ENOMEM;
+ } else {
+ pgd_populate(&init_mm, pgd,
+ early_alloc(PAGE_SIZE, NUMA_NO_NODE));
+ }
+ }
+ zero_p4d_populate(pgd, addr, next);
+ } while (pgd++, addr = next, addr != end);
+
+ return 0;
+}
+
+static void kasan_free_pte(pte_t *pte_start, pmd_t *pmd)
+{
+ pte_t *pte;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ pte = pte_start + i;
+ if (!pte_none(*pte))
+ return;
+ }
+
+ pte_free_kernel(&init_mm, (pte_t *)page_to_virt(pmd_page(*pmd)));
+ pmd_clear(pmd);
+}
+
+static void kasan_free_pmd(pmd_t *pmd_start, pud_t *pud)
+{
+ pmd_t *pmd;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ pmd = pmd_start + i;
+ if (!pmd_none(*pmd))
+ return;
+ }
+
+ pmd_free(&init_mm, (pmd_t *)page_to_virt(pud_page(*pud)));
+ pud_clear(pud);
+}
+
+static void kasan_free_pud(pud_t *pud_start, p4d_t *p4d)
+{
+ pud_t *pud;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PUD; i++) {
+ pud = pud_start + i;
+ if (!pud_none(*pud))
+ return;
+ }
+
+ pud_free(&init_mm, (pud_t *)page_to_virt(p4d_page(*p4d)));
+ p4d_clear(p4d);
+}
+
+static void kasan_free_p4d(p4d_t *p4d_start, pgd_t *pgd)
+{
+ p4d_t *p4d;
+ int i;
+
+ for (i = 0; i < PTRS_PER_P4D; i++) {
+ p4d = p4d_start + i;
+ if (!p4d_none(*p4d))
+ return;
+ }
+
+ p4d_free(&init_mm, (p4d_t *)page_to_virt(pgd_page(*pgd)));
+ pgd_clear(pgd);
+}
+
+static void kasan_remove_pte_table(pte_t *pte, unsigned long addr,
+ unsigned long end)
+{
+ unsigned long next;
+
+ for (; addr < end; addr = next, pte++) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ if (next > end)
+ next = end;
+
+ if (!pte_present(*pte))
+ continue;
+
+ if (WARN_ON(!kasan_early_shadow_page_entry(*pte)))
+ continue;
+ pte_clear(&init_mm, addr, pte);
+ }
+}
+
+static void kasan_remove_pmd_table(pmd_t *pmd, unsigned long addr,
+ unsigned long end)
+{
+ unsigned long next;
+
+ for (; addr < end; addr = next, pmd++) {
+ pte_t *pte;
+
+ next = pmd_addr_end(addr, end);
+
+ if (!pmd_present(*pmd))
+ continue;
+
+ if (kasan_pte_table(*pmd)) {
+ if (IS_ALIGNED(addr, PMD_SIZE) &&
+ IS_ALIGNED(next, PMD_SIZE)) {
+ pmd_clear(pmd);
+ continue;
+ }
+ }
+ pte = pte_offset_kernel(pmd, addr);
+ kasan_remove_pte_table(pte, addr, next);
+ kasan_free_pte(pte_offset_kernel(pmd, 0), pmd);
+ }
+}
+
+static void kasan_remove_pud_table(pud_t *pud, unsigned long addr,
+ unsigned long end)
+{
+ unsigned long next;
+
+ for (; addr < end; addr = next, pud++) {
+ pmd_t *pmd, *pmd_base;
+
+ next = pud_addr_end(addr, end);
+
+ if (!pud_present(*pud))
+ continue;
+
+ if (kasan_pmd_table(*pud)) {
+ if (IS_ALIGNED(addr, PUD_SIZE) &&
+ IS_ALIGNED(next, PUD_SIZE)) {
+ pud_clear(pud);
+ continue;
+ }
+ }
+ pmd = pmd_offset(pud, addr);
+ pmd_base = pmd_offset(pud, 0);
+ kasan_remove_pmd_table(pmd, addr, next);
+ kasan_free_pmd(pmd_base, pud);
+ }
+}
+
+static void kasan_remove_p4d_table(p4d_t *p4d, unsigned long addr,
+ unsigned long end)
+{
+ unsigned long next;
+
+ for (; addr < end; addr = next, p4d++) {
+ pud_t *pud;
+
+ next = p4d_addr_end(addr, end);
+
+ if (!p4d_present(*p4d))
+ continue;
+
+ if (kasan_pud_table(*p4d)) {
+ if (IS_ALIGNED(addr, P4D_SIZE) &&
+ IS_ALIGNED(next, P4D_SIZE)) {
+ p4d_clear(p4d);
+ continue;
+ }
+ }
+ pud = pud_offset(p4d, addr);
+ kasan_remove_pud_table(pud, addr, next);
+ kasan_free_pud(pud_offset(p4d, 0), p4d);
+ }
+}
+
+void kasan_remove_zero_shadow(void *start, unsigned long size)
+{
+ unsigned long addr, end, next;
+ pgd_t *pgd;
+
+ addr = (unsigned long)kasan_mem_to_shadow(start);
+ end = addr + (size >> KASAN_SHADOW_SCALE_SHIFT);
+
+ if (WARN_ON((unsigned long)start %
+ (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)) ||
+ WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)))
+ return;
+
+ for (; addr < end; addr = next) {
+ p4d_t *p4d;
+
+ next = pgd_addr_end(addr, end);
+
+ pgd = pgd_offset_k(addr);
+ if (!pgd_present(*pgd))
+ continue;
+
+ if (kasan_p4d_table(*pgd)) {
+ if (IS_ALIGNED(addr, PGDIR_SIZE) &&
+ IS_ALIGNED(next, PGDIR_SIZE)) {
+ pgd_clear(pgd);
+ continue;
+ }
+ }
+
+ p4d = p4d_offset(pgd, addr);
+ kasan_remove_p4d_table(p4d, addr, next);
+ kasan_free_p4d(p4d_offset(pgd, 0), pgd);
+ }
+}
+
+int kasan_add_zero_shadow(void *start, unsigned long size)
+{
+ int ret;
+ void *shadow_start, *shadow_end;
+
+ shadow_start = kasan_mem_to_shadow(start);
+ shadow_end = shadow_start + (size >> KASAN_SHADOW_SCALE_SHIFT);
+
+ if (WARN_ON((unsigned long)start %
+ (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)) ||
+ WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)))
+ return -EINVAL;
+
+ ret = kasan_populate_early_shadow(shadow_start, shadow_end);
+ if (ret)
+ kasan_remove_zero_shadow(start, size);
+ return ret;
+}
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
new file mode 100644
index 000000000..ac4994567
--- /dev/null
+++ b/mm/kasan/kasan.h
@@ -0,0 +1,299 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __MM_KASAN_KASAN_H
+#define __MM_KASAN_KASAN_H
+
+#include <linux/kasan.h>
+#include <linux/stackdepot.h>
+
+#define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT)
+#define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1)
+
+#define KASAN_TAG_KERNEL 0xFF /* native kernel pointers tag */
+#define KASAN_TAG_INVALID 0xFE /* inaccessible memory tag */
+#define KASAN_TAG_MAX 0xFD /* maximum value for random tags */
+
+#ifdef CONFIG_KASAN_GENERIC
+#define KASAN_FREE_PAGE 0xFF /* page was freed */
+#define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */
+#define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */
+#define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */
+#define KASAN_KMALLOC_FREETRACK 0xFA /* object was freed and has free track set */
+#else
+#define KASAN_FREE_PAGE KASAN_TAG_INVALID
+#define KASAN_PAGE_REDZONE KASAN_TAG_INVALID
+#define KASAN_KMALLOC_REDZONE KASAN_TAG_INVALID
+#define KASAN_KMALLOC_FREE KASAN_TAG_INVALID
+#define KASAN_KMALLOC_FREETRACK KASAN_TAG_INVALID
+#endif
+
+#define KASAN_GLOBAL_REDZONE 0xF9 /* redzone for global variable */
+#define KASAN_VMALLOC_INVALID 0xF8 /* unallocated space in vmapped page */
+
+/*
+ * Stack redzone shadow values
+ * (Those are compiler's ABI, don't change them)
+ */
+#define KASAN_STACK_LEFT 0xF1
+#define KASAN_STACK_MID 0xF2
+#define KASAN_STACK_RIGHT 0xF3
+#define KASAN_STACK_PARTIAL 0xF4
+
+/*
+ * alloca redzone shadow values
+ */
+#define KASAN_ALLOCA_LEFT 0xCA
+#define KASAN_ALLOCA_RIGHT 0xCB
+
+#define KASAN_ALLOCA_REDZONE_SIZE 32
+
+/*
+ * Stack frame marker (compiler ABI).
+ */
+#define KASAN_CURRENT_STACK_FRAME_MAGIC 0x41B58AB3
+
+/* Don't break randconfig/all*config builds */
+#ifndef KASAN_ABI_VERSION
+#define KASAN_ABI_VERSION 1
+#endif
+
+struct kasan_access_info {
+ const void *access_addr;
+ const void *first_bad_addr;
+ size_t access_size;
+ bool is_write;
+ unsigned long ip;
+};
+
+/* The layout of struct dictated by compiler */
+struct kasan_source_location {
+ const char *filename;
+ int line_no;
+ int column_no;
+};
+
+/* The layout of struct dictated by compiler */
+struct kasan_global {
+ const void *beg; /* Address of the beginning of the global variable. */
+ size_t size; /* Size of the global variable. */
+ size_t size_with_redzone; /* Size of the variable + size of the red zone. 32 bytes aligned */
+ const void *name;
+ const void *module_name; /* Name of the module where the global variable is declared. */
+ unsigned long has_dynamic_init; /* This needed for C++ */
+#if KASAN_ABI_VERSION >= 4
+ struct kasan_source_location *location;
+#endif
+#if KASAN_ABI_VERSION >= 5
+ char *odr_indicator;
+#endif
+};
+
+/**
+ * Structures to keep alloc and free tracks *
+ */
+
+#define KASAN_STACK_DEPTH 64
+
+struct kasan_track {
+ u32 pid;
+ depot_stack_handle_t stack;
+};
+
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+#define KASAN_NR_FREE_STACKS 5
+#else
+#define KASAN_NR_FREE_STACKS 1
+#endif
+
+struct kasan_alloc_meta {
+ struct kasan_track alloc_track;
+#ifdef CONFIG_KASAN_GENERIC
+ /*
+ * call_rcu() call stack is stored into struct kasan_alloc_meta.
+ * The free stack is stored into struct kasan_free_meta.
+ */
+ depot_stack_handle_t aux_stack[2];
+#else
+ struct kasan_track free_track[KASAN_NR_FREE_STACKS];
+#endif
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+ u8 free_pointer_tag[KASAN_NR_FREE_STACKS];
+ u8 free_track_idx;
+#endif
+};
+
+struct qlist_node {
+ struct qlist_node *next;
+};
+struct kasan_free_meta {
+ /* This field is used while the object is in the quarantine.
+ * Otherwise it might be used for the allocator freelist.
+ */
+ struct qlist_node quarantine_link;
+#ifdef CONFIG_KASAN_GENERIC
+ struct kasan_track free_track;
+#endif
+};
+
+struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
+ const void *object);
+struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
+ const void *object);
+
+static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
+{
+ return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET)
+ << KASAN_SHADOW_SCALE_SHIFT);
+}
+
+static inline bool addr_has_shadow(const void *addr)
+{
+ return (addr >= kasan_shadow_to_mem((void *)KASAN_SHADOW_START));
+}
+
+void kasan_poison_shadow(const void *address, size_t size, u8 value);
+
+/**
+ * check_memory_region - Check memory region, and report if invalid access.
+ * @addr: the accessed address
+ * @size: the accessed size
+ * @write: true if access is a write access
+ * @ret_ip: return address
+ * @return: true if access was valid, false if invalid
+ */
+bool check_memory_region(unsigned long addr, size_t size, bool write,
+ unsigned long ret_ip);
+
+void *find_first_bad_addr(void *addr, size_t size);
+const char *get_bug_type(struct kasan_access_info *info);
+
+bool kasan_report(unsigned long addr, size_t size,
+ bool is_write, unsigned long ip);
+void kasan_report_invalid_free(void *object, unsigned long ip);
+
+struct page *kasan_addr_to_page(const void *addr);
+
+depot_stack_handle_t kasan_save_stack(gfp_t flags);
+void kasan_set_track(struct kasan_track *track, gfp_t flags);
+void kasan_set_free_info(struct kmem_cache *cache, void *object, u8 tag);
+struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
+ void *object, u8 tag);
+
+#if defined(CONFIG_KASAN_GENERIC) && \
+ (defined(CONFIG_SLAB) || defined(CONFIG_SLUB))
+void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache);
+void quarantine_reduce(void);
+void quarantine_remove_cache(struct kmem_cache *cache);
+#else
+static inline void quarantine_put(struct kasan_free_meta *info,
+ struct kmem_cache *cache) { }
+static inline void quarantine_reduce(void) { }
+static inline void quarantine_remove_cache(struct kmem_cache *cache) { }
+#endif
+
+#ifdef CONFIG_KASAN_SW_TAGS
+
+void print_tags(u8 addr_tag, const void *addr);
+
+u8 random_tag(void);
+
+#else
+
+static inline void print_tags(u8 addr_tag, const void *addr) { }
+
+static inline u8 random_tag(void)
+{
+ return 0;
+}
+
+#endif
+
+#ifndef arch_kasan_set_tag
+static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
+{
+ return addr;
+}
+#endif
+#ifndef arch_kasan_reset_tag
+#define arch_kasan_reset_tag(addr) ((void *)(addr))
+#endif
+#ifndef arch_kasan_get_tag
+#define arch_kasan_get_tag(addr) 0
+#endif
+
+#define set_tag(addr, tag) ((void *)arch_kasan_set_tag((addr), (tag)))
+#define reset_tag(addr) ((void *)arch_kasan_reset_tag(addr))
+#define get_tag(addr) arch_kasan_get_tag(addr)
+
+/*
+ * Exported functions for interfaces called from assembly or from generated
+ * code. Declarations here to avoid warning about missing declarations.
+ */
+asmlinkage void kasan_unpoison_task_stack_below(const void *watermark);
+void __asan_register_globals(struct kasan_global *globals, size_t size);
+void __asan_unregister_globals(struct kasan_global *globals, size_t size);
+void __asan_handle_no_return(void);
+void __asan_alloca_poison(unsigned long addr, size_t size);
+void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom);
+
+void __asan_load1(unsigned long addr);
+void __asan_store1(unsigned long addr);
+void __asan_load2(unsigned long addr);
+void __asan_store2(unsigned long addr);
+void __asan_load4(unsigned long addr);
+void __asan_store4(unsigned long addr);
+void __asan_load8(unsigned long addr);
+void __asan_store8(unsigned long addr);
+void __asan_load16(unsigned long addr);
+void __asan_store16(unsigned long addr);
+void __asan_loadN(unsigned long addr, size_t size);
+void __asan_storeN(unsigned long addr, size_t size);
+
+void __asan_load1_noabort(unsigned long addr);
+void __asan_store1_noabort(unsigned long addr);
+void __asan_load2_noabort(unsigned long addr);
+void __asan_store2_noabort(unsigned long addr);
+void __asan_load4_noabort(unsigned long addr);
+void __asan_store4_noabort(unsigned long addr);
+void __asan_load8_noabort(unsigned long addr);
+void __asan_store8_noabort(unsigned long addr);
+void __asan_load16_noabort(unsigned long addr);
+void __asan_store16_noabort(unsigned long addr);
+void __asan_loadN_noabort(unsigned long addr, size_t size);
+void __asan_storeN_noabort(unsigned long addr, size_t size);
+
+void __asan_report_load1_noabort(unsigned long addr);
+void __asan_report_store1_noabort(unsigned long addr);
+void __asan_report_load2_noabort(unsigned long addr);
+void __asan_report_store2_noabort(unsigned long addr);
+void __asan_report_load4_noabort(unsigned long addr);
+void __asan_report_store4_noabort(unsigned long addr);
+void __asan_report_load8_noabort(unsigned long addr);
+void __asan_report_store8_noabort(unsigned long addr);
+void __asan_report_load16_noabort(unsigned long addr);
+void __asan_report_store16_noabort(unsigned long addr);
+void __asan_report_load_n_noabort(unsigned long addr, size_t size);
+void __asan_report_store_n_noabort(unsigned long addr, size_t size);
+
+void __asan_set_shadow_00(const void *addr, size_t size);
+void __asan_set_shadow_f1(const void *addr, size_t size);
+void __asan_set_shadow_f2(const void *addr, size_t size);
+void __asan_set_shadow_f3(const void *addr, size_t size);
+void __asan_set_shadow_f5(const void *addr, size_t size);
+void __asan_set_shadow_f8(const void *addr, size_t size);
+
+void __hwasan_load1_noabort(unsigned long addr);
+void __hwasan_store1_noabort(unsigned long addr);
+void __hwasan_load2_noabort(unsigned long addr);
+void __hwasan_store2_noabort(unsigned long addr);
+void __hwasan_load4_noabort(unsigned long addr);
+void __hwasan_store4_noabort(unsigned long addr);
+void __hwasan_load8_noabort(unsigned long addr);
+void __hwasan_store8_noabort(unsigned long addr);
+void __hwasan_load16_noabort(unsigned long addr);
+void __hwasan_store16_noabort(unsigned long addr);
+void __hwasan_loadN_noabort(unsigned long addr, size_t size);
+void __hwasan_storeN_noabort(unsigned long addr, size_t size);
+
+void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size);
+
+#endif
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
new file mode 100644
index 000000000..622193846
--- /dev/null
+++ b/mm/kasan/quarantine.c
@@ -0,0 +1,376 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KASAN quarantine.
+ *
+ * Author: Alexander Potapenko <glider@google.com>
+ * Copyright (C) 2016 Google, Inc.
+ *
+ * Based on code by Dmitry Chernenkov.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ */
+
+#include <linux/gfp.h>
+#include <linux/hash.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/percpu.h>
+#include <linux/printk.h>
+#include <linux/shrinker.h>
+#include <linux/slab.h>
+#include <linux/srcu.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/cpuhotplug.h>
+
+#include "../slab.h"
+#include "kasan.h"
+
+/* Data structure and operations for quarantine queues. */
+
+/*
+ * Each queue is a signle-linked list, which also stores the total size of
+ * objects inside of it.
+ */
+struct qlist_head {
+ struct qlist_node *head;
+ struct qlist_node *tail;
+ size_t bytes;
+ bool offline;
+};
+
+#define QLIST_INIT { NULL, NULL, 0 }
+
+static bool qlist_empty(struct qlist_head *q)
+{
+ return !q->head;
+}
+
+static void qlist_init(struct qlist_head *q)
+{
+ q->head = q->tail = NULL;
+ q->bytes = 0;
+}
+
+static void qlist_put(struct qlist_head *q, struct qlist_node *qlink,
+ size_t size)
+{
+ if (unlikely(qlist_empty(q)))
+ q->head = qlink;
+ else
+ q->tail->next = qlink;
+ q->tail = qlink;
+ qlink->next = NULL;
+ q->bytes += size;
+}
+
+static void qlist_move_all(struct qlist_head *from, struct qlist_head *to)
+{
+ if (unlikely(qlist_empty(from)))
+ return;
+
+ if (qlist_empty(to)) {
+ *to = *from;
+ qlist_init(from);
+ return;
+ }
+
+ to->tail->next = from->head;
+ to->tail = from->tail;
+ to->bytes += from->bytes;
+
+ qlist_init(from);
+}
+
+#define QUARANTINE_PERCPU_SIZE (1 << 20)
+#define QUARANTINE_BATCHES \
+ (1024 > 4 * CONFIG_NR_CPUS ? 1024 : 4 * CONFIG_NR_CPUS)
+
+/*
+ * The object quarantine consists of per-cpu queues and a global queue,
+ * guarded by quarantine_lock.
+ */
+static DEFINE_PER_CPU(struct qlist_head, cpu_quarantine);
+
+/* Round-robin FIFO array of batches. */
+static struct qlist_head global_quarantine[QUARANTINE_BATCHES];
+static int quarantine_head;
+static int quarantine_tail;
+/* Total size of all objects in global_quarantine across all batches. */
+static unsigned long quarantine_size;
+static DEFINE_RAW_SPINLOCK(quarantine_lock);
+DEFINE_STATIC_SRCU(remove_cache_srcu);
+
+/* Maximum size of the global queue. */
+static unsigned long quarantine_max_size;
+
+/*
+ * Target size of a batch in global_quarantine.
+ * Usually equal to QUARANTINE_PERCPU_SIZE unless we have too much RAM.
+ */
+static unsigned long quarantine_batch_size;
+
+/*
+ * The fraction of physical memory the quarantine is allowed to occupy.
+ * Quarantine doesn't support memory shrinker with SLAB allocator, so we keep
+ * the ratio low to avoid OOM.
+ */
+#define QUARANTINE_FRACTION 32
+
+static struct kmem_cache *qlink_to_cache(struct qlist_node *qlink)
+{
+ return virt_to_head_page(qlink)->slab_cache;
+}
+
+static void *qlink_to_object(struct qlist_node *qlink, struct kmem_cache *cache)
+{
+ struct kasan_free_meta *free_info =
+ container_of(qlink, struct kasan_free_meta,
+ quarantine_link);
+
+ return ((void *)free_info) - cache->kasan_info.free_meta_offset;
+}
+
+static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache)
+{
+ void *object = qlink_to_object(qlink, cache);
+ unsigned long flags;
+
+ if (IS_ENABLED(CONFIG_SLAB))
+ local_irq_save(flags);
+
+ *(u8 *)kasan_mem_to_shadow(object) = KASAN_KMALLOC_FREE;
+ ___cache_free(cache, object, _THIS_IP_);
+
+ if (IS_ENABLED(CONFIG_SLAB))
+ local_irq_restore(flags);
+}
+
+static void qlist_free_all(struct qlist_head *q, struct kmem_cache *cache)
+{
+ struct qlist_node *qlink;
+
+ if (unlikely(qlist_empty(q)))
+ return;
+
+ qlink = q->head;
+ while (qlink) {
+ struct kmem_cache *obj_cache =
+ cache ? cache : qlink_to_cache(qlink);
+ struct qlist_node *next = qlink->next;
+
+ qlink_free(qlink, obj_cache);
+ qlink = next;
+ }
+ qlist_init(q);
+}
+
+void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache)
+{
+ unsigned long flags;
+ struct qlist_head *q;
+ struct qlist_head temp = QLIST_INIT;
+
+ /*
+ * Note: irq must be disabled until after we move the batch to the
+ * global quarantine. Otherwise quarantine_remove_cache() can miss
+ * some objects belonging to the cache if they are in our local temp
+ * list. quarantine_remove_cache() executes on_each_cpu() at the
+ * beginning which ensures that it either sees the objects in per-cpu
+ * lists or in the global quarantine.
+ */
+ local_irq_save(flags);
+
+ q = this_cpu_ptr(&cpu_quarantine);
+ if (q->offline) {
+ local_irq_restore(flags);
+ return;
+ }
+ qlist_put(q, &info->quarantine_link, cache->size);
+ if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE)) {
+ qlist_move_all(q, &temp);
+
+ raw_spin_lock(&quarantine_lock);
+ WRITE_ONCE(quarantine_size, quarantine_size + temp.bytes);
+ qlist_move_all(&temp, &global_quarantine[quarantine_tail]);
+ if (global_quarantine[quarantine_tail].bytes >=
+ READ_ONCE(quarantine_batch_size)) {
+ int new_tail;
+
+ new_tail = quarantine_tail + 1;
+ if (new_tail == QUARANTINE_BATCHES)
+ new_tail = 0;
+ if (new_tail != quarantine_head)
+ quarantine_tail = new_tail;
+ }
+ raw_spin_unlock(&quarantine_lock);
+ }
+
+ local_irq_restore(flags);
+}
+
+void quarantine_reduce(void)
+{
+ size_t total_size, new_quarantine_size, percpu_quarantines;
+ unsigned long flags;
+ int srcu_idx;
+ struct qlist_head to_free = QLIST_INIT;
+
+ if (likely(READ_ONCE(quarantine_size) <=
+ READ_ONCE(quarantine_max_size)))
+ return;
+
+ /*
+ * srcu critical section ensures that quarantine_remove_cache()
+ * will not miss objects belonging to the cache while they are in our
+ * local to_free list. srcu is chosen because (1) it gives us private
+ * grace period domain that does not interfere with anything else,
+ * and (2) it allows synchronize_srcu() to return without waiting
+ * if there are no pending read critical sections (which is the
+ * expected case).
+ */
+ srcu_idx = srcu_read_lock(&remove_cache_srcu);
+ raw_spin_lock_irqsave(&quarantine_lock, flags);
+
+ /*
+ * Update quarantine size in case of hotplug. Allocate a fraction of
+ * the installed memory to quarantine minus per-cpu queue limits.
+ */
+ total_size = (totalram_pages() << PAGE_SHIFT) /
+ QUARANTINE_FRACTION;
+ percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus();
+ new_quarantine_size = (total_size < percpu_quarantines) ?
+ 0 : total_size - percpu_quarantines;
+ WRITE_ONCE(quarantine_max_size, new_quarantine_size);
+ /* Aim at consuming at most 1/2 of slots in quarantine. */
+ WRITE_ONCE(quarantine_batch_size, max((size_t)QUARANTINE_PERCPU_SIZE,
+ 2 * total_size / QUARANTINE_BATCHES));
+
+ if (likely(quarantine_size > quarantine_max_size)) {
+ qlist_move_all(&global_quarantine[quarantine_head], &to_free);
+ WRITE_ONCE(quarantine_size, quarantine_size - to_free.bytes);
+ quarantine_head++;
+ if (quarantine_head == QUARANTINE_BATCHES)
+ quarantine_head = 0;
+ }
+
+ raw_spin_unlock_irqrestore(&quarantine_lock, flags);
+
+ qlist_free_all(&to_free, NULL);
+ srcu_read_unlock(&remove_cache_srcu, srcu_idx);
+}
+
+static void qlist_move_cache(struct qlist_head *from,
+ struct qlist_head *to,
+ struct kmem_cache *cache)
+{
+ struct qlist_node *curr;
+
+ if (unlikely(qlist_empty(from)))
+ return;
+
+ curr = from->head;
+ qlist_init(from);
+ while (curr) {
+ struct qlist_node *next = curr->next;
+ struct kmem_cache *obj_cache = qlink_to_cache(curr);
+
+ if (obj_cache == cache)
+ qlist_put(to, curr, obj_cache->size);
+ else
+ qlist_put(from, curr, obj_cache->size);
+
+ curr = next;
+ }
+}
+
+static void per_cpu_remove_cache(void *arg)
+{
+ struct kmem_cache *cache = arg;
+ struct qlist_head to_free = QLIST_INIT;
+ struct qlist_head *q;
+
+ q = this_cpu_ptr(&cpu_quarantine);
+ /*
+ * Ensure the ordering between the writing to q->offline and
+ * per_cpu_remove_cache. Prevent cpu_quarantine from being corrupted
+ * by interrupt.
+ */
+ if (READ_ONCE(q->offline))
+ return;
+ qlist_move_cache(q, &to_free, cache);
+ qlist_free_all(&to_free, cache);
+}
+
+/* Free all quarantined objects belonging to cache. */
+void quarantine_remove_cache(struct kmem_cache *cache)
+{
+ unsigned long flags, i;
+ struct qlist_head to_free = QLIST_INIT;
+
+ /*
+ * Must be careful to not miss any objects that are being moved from
+ * per-cpu list to the global quarantine in quarantine_put(),
+ * nor objects being freed in quarantine_reduce(). on_each_cpu()
+ * achieves the first goal, while synchronize_srcu() achieves the
+ * second.
+ */
+ on_each_cpu(per_cpu_remove_cache, cache, 1);
+
+ raw_spin_lock_irqsave(&quarantine_lock, flags);
+ for (i = 0; i < QUARANTINE_BATCHES; i++) {
+ if (qlist_empty(&global_quarantine[i]))
+ continue;
+ qlist_move_cache(&global_quarantine[i], &to_free, cache);
+ /* Scanning whole quarantine can take a while. */
+ raw_spin_unlock_irqrestore(&quarantine_lock, flags);
+ cond_resched();
+ raw_spin_lock_irqsave(&quarantine_lock, flags);
+ }
+ raw_spin_unlock_irqrestore(&quarantine_lock, flags);
+
+ qlist_free_all(&to_free, cache);
+
+ synchronize_srcu(&remove_cache_srcu);
+}
+
+static int kasan_cpu_online(unsigned int cpu)
+{
+ this_cpu_ptr(&cpu_quarantine)->offline = false;
+ return 0;
+}
+
+static int kasan_cpu_offline(unsigned int cpu)
+{
+ struct qlist_head *q;
+
+ q = this_cpu_ptr(&cpu_quarantine);
+ /* Ensure the ordering between the writing to q->offline and
+ * qlist_free_all. Otherwise, cpu_quarantine may be corrupted
+ * by interrupt.
+ */
+ WRITE_ONCE(q->offline, true);
+ barrier();
+ qlist_free_all(q, NULL);
+ return 0;
+}
+
+static int __init kasan_cpu_quarantine_init(void)
+{
+ int ret = 0;
+
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mm/kasan:online",
+ kasan_cpu_online, kasan_cpu_offline);
+ if (ret < 0)
+ pr_err("kasan cpu quarantine register failed [%d]\n", ret);
+ return ret;
+}
+late_initcall(kasan_cpu_quarantine_init);
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
new file mode 100644
index 000000000..98b08807c
--- /dev/null
+++ b/mm/kasan/report.c
@@ -0,0 +1,599 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file contains common generic and tag-based KASAN error reporting code.
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
+ *
+ * Some code borrowed from https://github.com/xairy/kasan-prototype by
+ * Andrey Konovalov <andreyknvl@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/bitops.h>
+#include <linux/ftrace.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/stackdepot.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/kasan.h>
+#include <linux/module.h>
+#include <linux/sched/task_stack.h>
+#include <linux/uaccess.h>
+
+#include <asm/sections.h>
+
+#include <kunit/test.h>
+
+#include "kasan.h"
+#include "../slab.h"
+
+/* Shadow layout customization. */
+#define SHADOW_BYTES_PER_BLOCK 1
+#define SHADOW_BLOCKS_PER_ROW 16
+#define SHADOW_BYTES_PER_ROW (SHADOW_BLOCKS_PER_ROW * SHADOW_BYTES_PER_BLOCK)
+#define SHADOW_ROWS_AROUND_ADDR 2
+
+static unsigned long kasan_flags;
+
+#define KASAN_BIT_REPORTED 0
+#define KASAN_BIT_MULTI_SHOT 1
+
+bool kasan_save_enable_multi_shot(void)
+{
+ return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
+}
+EXPORT_SYMBOL_GPL(kasan_save_enable_multi_shot);
+
+void kasan_restore_multi_shot(bool enabled)
+{
+ if (!enabled)
+ clear_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
+}
+EXPORT_SYMBOL_GPL(kasan_restore_multi_shot);
+
+static int __init kasan_set_multi_shot(char *str)
+{
+ set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
+ return 1;
+}
+__setup("kasan_multi_shot", kasan_set_multi_shot);
+
+static void print_error_description(struct kasan_access_info *info)
+{
+ pr_err("BUG: KASAN: %s in %pS\n",
+ get_bug_type(info), (void *)info->ip);
+ pr_err("%s of size %zu at addr %px by task %s/%d\n",
+ info->is_write ? "Write" : "Read", info->access_size,
+ info->access_addr, current->comm, task_pid_nr(current));
+}
+
+static DEFINE_SPINLOCK(report_lock);
+
+static void start_report(unsigned long *flags)
+{
+ /*
+ * Make sure we don't end up in loop.
+ */
+ kasan_disable_current();
+ spin_lock_irqsave(&report_lock, *flags);
+ pr_err("==================================================================\n");
+}
+
+static void end_report(unsigned long *flags)
+{
+ pr_err("==================================================================\n");
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+ spin_unlock_irqrestore(&report_lock, *flags);
+ if (!test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags))
+ check_panic_on_warn("KASAN");
+ kasan_enable_current();
+}
+
+static void print_stack(depot_stack_handle_t stack)
+{
+ unsigned long *entries;
+ unsigned int nr_entries;
+
+ nr_entries = stack_depot_fetch(stack, &entries);
+ stack_trace_print(entries, nr_entries, 0);
+}
+
+static void print_track(struct kasan_track *track, const char *prefix)
+{
+ pr_err("%s by task %u:\n", prefix, track->pid);
+ if (track->stack) {
+ print_stack(track->stack);
+ } else {
+ pr_err("(stack is not available)\n");
+ }
+}
+
+struct page *kasan_addr_to_page(const void *addr)
+{
+ if ((addr >= (void *)PAGE_OFFSET) &&
+ (addr < high_memory))
+ return virt_to_head_page(addr);
+ return NULL;
+}
+
+static void describe_object_addr(struct kmem_cache *cache, void *object,
+ const void *addr)
+{
+ unsigned long access_addr = (unsigned long)addr;
+ unsigned long object_addr = (unsigned long)object;
+ const char *rel_type;
+ int rel_bytes;
+
+ pr_err("The buggy address belongs to the object at %px\n"
+ " which belongs to the cache %s of size %d\n",
+ object, cache->name, cache->object_size);
+
+ if (!addr)
+ return;
+
+ if (access_addr < object_addr) {
+ rel_type = "to the left";
+ rel_bytes = object_addr - access_addr;
+ } else if (access_addr >= object_addr + cache->object_size) {
+ rel_type = "to the right";
+ rel_bytes = access_addr - (object_addr + cache->object_size);
+ } else {
+ rel_type = "inside";
+ rel_bytes = access_addr - object_addr;
+ }
+
+ pr_err("The buggy address is located %d bytes %s of\n"
+ " %d-byte region [%px, %px)\n",
+ rel_bytes, rel_type, cache->object_size, (void *)object_addr,
+ (void *)(object_addr + cache->object_size));
+}
+
+static void describe_object(struct kmem_cache *cache, void *object,
+ const void *addr, u8 tag)
+{
+ struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object);
+
+ if (cache->flags & SLAB_KASAN) {
+ struct kasan_track *free_track;
+
+ print_track(&alloc_info->alloc_track, "Allocated");
+ pr_err("\n");
+ free_track = kasan_get_free_track(cache, object, tag);
+ if (free_track) {
+ print_track(free_track, "Freed");
+ pr_err("\n");
+ }
+
+#ifdef CONFIG_KASAN_GENERIC
+ if (alloc_info->aux_stack[0]) {
+ pr_err("Last call_rcu():\n");
+ print_stack(alloc_info->aux_stack[0]);
+ pr_err("\n");
+ }
+ if (alloc_info->aux_stack[1]) {
+ pr_err("Second to last call_rcu():\n");
+ print_stack(alloc_info->aux_stack[1]);
+ pr_err("\n");
+ }
+#endif
+ }
+
+ describe_object_addr(cache, object, addr);
+}
+
+static inline bool kernel_or_module_addr(const void *addr)
+{
+ if (addr >= (void *)_stext && addr < (void *)_end)
+ return true;
+ if (is_module_address((unsigned long)addr))
+ return true;
+ return false;
+}
+
+static inline bool init_task_stack_addr(const void *addr)
+{
+ return addr >= (void *)&init_thread_union.stack &&
+ (addr <= (void *)&init_thread_union.stack +
+ sizeof(init_thread_union.stack));
+}
+
+static bool __must_check tokenize_frame_descr(const char **frame_descr,
+ char *token, size_t max_tok_len,
+ unsigned long *value)
+{
+ const char *sep = strchr(*frame_descr, ' ');
+
+ if (sep == NULL)
+ sep = *frame_descr + strlen(*frame_descr);
+
+ if (token != NULL) {
+ const size_t tok_len = sep - *frame_descr;
+
+ if (tok_len + 1 > max_tok_len) {
+ pr_err("KASAN internal error: frame description too long: %s\n",
+ *frame_descr);
+ return false;
+ }
+
+ /* Copy token (+ 1 byte for '\0'). */
+ strlcpy(token, *frame_descr, tok_len + 1);
+ }
+
+ /* Advance frame_descr past separator. */
+ *frame_descr = sep + 1;
+
+ if (value != NULL && kstrtoul(token, 10, value)) {
+ pr_err("KASAN internal error: not a valid number: %s\n", token);
+ return false;
+ }
+
+ return true;
+}
+
+static void print_decoded_frame_descr(const char *frame_descr)
+{
+ /*
+ * We need to parse the following string:
+ * "n alloc_1 alloc_2 ... alloc_n"
+ * where alloc_i looks like
+ * "offset size len name"
+ * or "offset size len name:line".
+ */
+
+ char token[64];
+ unsigned long num_objects;
+
+ if (!tokenize_frame_descr(&frame_descr, token, sizeof(token),
+ &num_objects))
+ return;
+
+ pr_err("\n");
+ pr_err("this frame has %lu %s:\n", num_objects,
+ num_objects == 1 ? "object" : "objects");
+
+ while (num_objects--) {
+ unsigned long offset;
+ unsigned long size;
+
+ /* access offset */
+ if (!tokenize_frame_descr(&frame_descr, token, sizeof(token),
+ &offset))
+ return;
+ /* access size */
+ if (!tokenize_frame_descr(&frame_descr, token, sizeof(token),
+ &size))
+ return;
+ /* name length (unused) */
+ if (!tokenize_frame_descr(&frame_descr, NULL, 0, NULL))
+ return;
+ /* object name */
+ if (!tokenize_frame_descr(&frame_descr, token, sizeof(token),
+ NULL))
+ return;
+
+ /* Strip line number; without filename it's not very helpful. */
+ strreplace(token, ':', '\0');
+
+ /* Finally, print object information. */
+ pr_err(" [%lu, %lu) '%s'", offset, offset + size, token);
+ }
+}
+
+static bool __must_check get_address_stack_frame_info(const void *addr,
+ unsigned long *offset,
+ const char **frame_descr,
+ const void **frame_pc)
+{
+ unsigned long aligned_addr;
+ unsigned long mem_ptr;
+ const u8 *shadow_bottom;
+ const u8 *shadow_ptr;
+ const unsigned long *frame;
+
+ BUILD_BUG_ON(IS_ENABLED(CONFIG_STACK_GROWSUP));
+
+ /*
+ * NOTE: We currently only support printing frame information for
+ * accesses to the task's own stack.
+ */
+ if (!object_is_on_stack(addr))
+ return false;
+
+ aligned_addr = round_down((unsigned long)addr, sizeof(long));
+ mem_ptr = round_down(aligned_addr, KASAN_SHADOW_SCALE_SIZE);
+ shadow_ptr = kasan_mem_to_shadow((void *)aligned_addr);
+ shadow_bottom = kasan_mem_to_shadow(end_of_stack(current));
+
+ while (shadow_ptr >= shadow_bottom && *shadow_ptr != KASAN_STACK_LEFT) {
+ shadow_ptr--;
+ mem_ptr -= KASAN_SHADOW_SCALE_SIZE;
+ }
+
+ while (shadow_ptr >= shadow_bottom && *shadow_ptr == KASAN_STACK_LEFT) {
+ shadow_ptr--;
+ mem_ptr -= KASAN_SHADOW_SCALE_SIZE;
+ }
+
+ if (shadow_ptr < shadow_bottom)
+ return false;
+
+ frame = (const unsigned long *)(mem_ptr + KASAN_SHADOW_SCALE_SIZE);
+ if (frame[0] != KASAN_CURRENT_STACK_FRAME_MAGIC) {
+ pr_err("KASAN internal error: frame info validation failed; invalid marker: %lu\n",
+ frame[0]);
+ return false;
+ }
+
+ *offset = (unsigned long)addr - (unsigned long)frame;
+ *frame_descr = (const char *)frame[1];
+ *frame_pc = (void *)frame[2];
+
+ return true;
+}
+
+static void print_address_stack_frame(const void *addr)
+{
+ unsigned long offset;
+ const char *frame_descr;
+ const void *frame_pc;
+
+ if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
+ return;
+
+ if (!get_address_stack_frame_info(addr, &offset, &frame_descr,
+ &frame_pc))
+ return;
+
+ /*
+ * get_address_stack_frame_info only returns true if the given addr is
+ * on the current task's stack.
+ */
+ pr_err("\n");
+ pr_err("addr %px is located in stack of task %s/%d at offset %lu in frame:\n",
+ addr, current->comm, task_pid_nr(current), offset);
+ pr_err(" %pS\n", frame_pc);
+
+ if (!frame_descr)
+ return;
+
+ print_decoded_frame_descr(frame_descr);
+}
+
+static void print_address_description(void *addr, u8 tag)
+{
+ struct page *page = kasan_addr_to_page(addr);
+
+ dump_stack();
+ pr_err("\n");
+
+ if (page && PageSlab(page)) {
+ struct kmem_cache *cache = page->slab_cache;
+ void *object = nearest_obj(cache, page, addr);
+
+ describe_object(cache, object, addr, tag);
+ }
+
+ if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) {
+ pr_err("The buggy address belongs to the variable:\n");
+ pr_err(" %pS\n", addr);
+ }
+
+ if (page) {
+ pr_err("The buggy address belongs to the page:\n");
+ dump_page(page, "kasan: bad access detected");
+ }
+
+ print_address_stack_frame(addr);
+}
+
+static bool row_is_guilty(const void *row, const void *guilty)
+{
+ return (row <= guilty) && (guilty < row + SHADOW_BYTES_PER_ROW);
+}
+
+static int shadow_pointer_offset(const void *row, const void *shadow)
+{
+ /* The length of ">ff00ff00ff00ff00: " is
+ * 3 + (BITS_PER_LONG/8)*2 chars.
+ */
+ return 3 + (BITS_PER_LONG/8)*2 + (shadow - row)*2 +
+ (shadow - row) / SHADOW_BYTES_PER_BLOCK + 1;
+}
+
+static void print_shadow_for_address(const void *addr)
+{
+ int i;
+ const void *shadow = kasan_mem_to_shadow(addr);
+ const void *shadow_row;
+
+ shadow_row = (void *)round_down((unsigned long)shadow,
+ SHADOW_BYTES_PER_ROW)
+ - SHADOW_ROWS_AROUND_ADDR * SHADOW_BYTES_PER_ROW;
+
+ pr_err("Memory state around the buggy address:\n");
+
+ for (i = -SHADOW_ROWS_AROUND_ADDR; i <= SHADOW_ROWS_AROUND_ADDR; i++) {
+ const void *kaddr = kasan_shadow_to_mem(shadow_row);
+ char buffer[4 + (BITS_PER_LONG/8)*2];
+ char shadow_buf[SHADOW_BYTES_PER_ROW];
+
+ snprintf(buffer, sizeof(buffer),
+ (i == 0) ? ">%px: " : " %px: ", kaddr);
+ /*
+ * We should not pass a shadow pointer to generic
+ * function, because generic functions may try to
+ * access kasan mapping for the passed address.
+ */
+ memcpy(shadow_buf, shadow_row, SHADOW_BYTES_PER_ROW);
+ print_hex_dump(KERN_ERR, buffer,
+ DUMP_PREFIX_NONE, SHADOW_BYTES_PER_ROW, 1,
+ shadow_buf, SHADOW_BYTES_PER_ROW, 0);
+
+ if (row_is_guilty(shadow_row, shadow))
+ pr_err("%*c\n",
+ shadow_pointer_offset(shadow_row, shadow),
+ '^');
+
+ shadow_row += SHADOW_BYTES_PER_ROW;
+ }
+}
+
+static bool report_enabled(void)
+{
+ if (current->kasan_depth)
+ return false;
+ if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags))
+ return true;
+ return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags);
+}
+
+#if IS_ENABLED(CONFIG_KUNIT)
+static void kasan_update_kunit_status(struct kunit *cur_test)
+{
+ struct kunit_resource *resource;
+ struct kunit_kasan_expectation *kasan_data;
+
+ resource = kunit_find_named_resource(cur_test, "kasan_data");
+
+ if (!resource) {
+ kunit_set_failure(cur_test);
+ return;
+ }
+
+ kasan_data = (struct kunit_kasan_expectation *)resource->data;
+ kasan_data->report_found = true;
+ kunit_put_resource(resource);
+}
+#endif /* IS_ENABLED(CONFIG_KUNIT) */
+
+void kasan_report_invalid_free(void *object, unsigned long ip)
+{
+ unsigned long flags;
+ u8 tag = get_tag(object);
+
+ object = reset_tag(object);
+
+#if IS_ENABLED(CONFIG_KUNIT)
+ if (current->kunit_test)
+ kasan_update_kunit_status(current->kunit_test);
+#endif /* IS_ENABLED(CONFIG_KUNIT) */
+
+ start_report(&flags);
+ pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip);
+ print_tags(tag, object);
+ pr_err("\n");
+ print_address_description(object, tag);
+ pr_err("\n");
+ print_shadow_for_address(object);
+ end_report(&flags);
+}
+
+static void __kasan_report(unsigned long addr, size_t size, bool is_write,
+ unsigned long ip)
+{
+ struct kasan_access_info info;
+ void *tagged_addr;
+ void *untagged_addr;
+ unsigned long flags;
+
+#if IS_ENABLED(CONFIG_KUNIT)
+ if (current->kunit_test)
+ kasan_update_kunit_status(current->kunit_test);
+#endif /* IS_ENABLED(CONFIG_KUNIT) */
+
+ disable_trace_on_warning();
+
+ tagged_addr = (void *)addr;
+ untagged_addr = reset_tag(tagged_addr);
+
+ info.access_addr = tagged_addr;
+ if (addr_has_shadow(untagged_addr))
+ info.first_bad_addr = find_first_bad_addr(tagged_addr, size);
+ else
+ info.first_bad_addr = untagged_addr;
+ info.access_size = size;
+ info.is_write = is_write;
+ info.ip = ip;
+
+ start_report(&flags);
+
+ print_error_description(&info);
+ if (addr_has_shadow(untagged_addr))
+ print_tags(get_tag(tagged_addr), info.first_bad_addr);
+ pr_err("\n");
+
+ if (addr_has_shadow(untagged_addr)) {
+ print_address_description(untagged_addr, get_tag(tagged_addr));
+ pr_err("\n");
+ print_shadow_for_address(info.first_bad_addr);
+ } else {
+ dump_stack();
+ }
+
+ end_report(&flags);
+}
+
+bool kasan_report(unsigned long addr, size_t size, bool is_write,
+ unsigned long ip)
+{
+ unsigned long flags = user_access_save();
+ bool ret = false;
+
+ if (likely(report_enabled())) {
+ __kasan_report(addr, size, is_write, ip);
+ ret = true;
+ }
+
+ user_access_restore(flags);
+
+ return ret;
+}
+
+/*
+ * With CONFIG_KASAN, accesses to bogus pointers (outside the high
+ * canonical half of the address space) cause out-of-bounds shadow memory reads
+ * before the actual access. For addresses in the low canonical half of the
+ * address space, as well as most non-canonical addresses, that out-of-bounds
+ * shadow memory access lands in the non-canonical part of the address space.
+ * Help the user figure out what the original bogus pointer was.
+ */
+void kasan_non_canonical_hook(unsigned long addr)
+{
+ unsigned long orig_addr;
+ const char *bug_type;
+
+ if (addr < KASAN_SHADOW_OFFSET)
+ return;
+
+ orig_addr = (addr - KASAN_SHADOW_OFFSET) << KASAN_SHADOW_SCALE_SHIFT;
+ /*
+ * For faults near the shadow address for NULL, we can be fairly certain
+ * that this is a KASAN shadow memory access.
+ * For faults that correspond to shadow for low canonical addresses, we
+ * can still be pretty sure - that shadow region is a fairly narrow
+ * chunk of the non-canonical address space.
+ * But faults that look like shadow for non-canonical addresses are a
+ * really large chunk of the address space. In that case, we still
+ * print the decoded address, but make it clear that this is not
+ * necessarily what's actually going on.
+ */
+ if (orig_addr < PAGE_SIZE)
+ bug_type = "null-ptr-deref";
+ else if (orig_addr < TASK_SIZE)
+ bug_type = "probably user-memory-access";
+ else
+ bug_type = "maybe wild-memory-access";
+ pr_alert("KASAN: %s in range [0x%016lx-0x%016lx]\n", bug_type,
+ orig_addr, orig_addr + KASAN_SHADOW_MASK);
+}
diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c
new file mode 100644
index 000000000..e02a36a51
--- /dev/null
+++ b/mm/kasan/tags.c
@@ -0,0 +1,200 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file contains core tag-based KASAN code.
+ *
+ * Copyright (c) 2018 Google, Inc.
+ * Author: Andrey Konovalov <andreyknvl@google.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/export.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/kasan.h>
+#include <linux/kernel.h>
+#include <linux/kmemleak.h>
+#include <linux/linkage.h>
+#include <linux/memblock.h>
+#include <linux/memory.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/slab.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+#include <linux/bug.h>
+
+#include "kasan.h"
+#include "../slab.h"
+
+static DEFINE_PER_CPU(u32, prng_state);
+
+void kasan_init_tags(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ per_cpu(prng_state, cpu) = (u32)get_cycles();
+}
+
+/*
+ * If a preemption happens between this_cpu_read and this_cpu_write, the only
+ * side effect is that we'll give a few allocated in different contexts objects
+ * the same tag. Since tag-based KASAN is meant to be used a probabilistic
+ * bug-detection debug feature, this doesn't have significant negative impact.
+ *
+ * Ideally the tags use strong randomness to prevent any attempts to predict
+ * them during explicit exploit attempts. But strong randomness is expensive,
+ * and we did an intentional trade-off to use a PRNG. This non-atomic RMW
+ * sequence has in fact positive effect, since interrupts that randomly skew
+ * PRNG at unpredictable points do only good.
+ */
+u8 random_tag(void)
+{
+ u32 state = this_cpu_read(prng_state);
+
+ state = 1664525 * state + 1013904223;
+ this_cpu_write(prng_state, state);
+
+ return (u8)(state % (KASAN_TAG_MAX + 1));
+}
+
+void *kasan_reset_tag(const void *addr)
+{
+ return reset_tag(addr);
+}
+
+bool check_memory_region(unsigned long addr, size_t size, bool write,
+ unsigned long ret_ip)
+{
+ u8 tag;
+ u8 *shadow_first, *shadow_last, *shadow;
+ void *untagged_addr;
+
+ if (unlikely(size == 0))
+ return true;
+
+ if (unlikely(addr + size < addr))
+ return !kasan_report(addr, size, write, ret_ip);
+
+ tag = get_tag((const void *)addr);
+
+ /*
+ * Ignore accesses for pointers tagged with 0xff (native kernel
+ * pointer tag) to suppress false positives caused by kmap.
+ *
+ * Some kernel code was written to account for archs that don't keep
+ * high memory mapped all the time, but rather map and unmap particular
+ * pages when needed. Instead of storing a pointer to the kernel memory,
+ * this code saves the address of the page structure and offset within
+ * that page for later use. Those pages are then mapped and unmapped
+ * with kmap/kunmap when necessary and virt_to_page is used to get the
+ * virtual address of the page. For arm64 (that keeps the high memory
+ * mapped all the time), kmap is turned into a page_address call.
+
+ * The issue is that with use of the page_address + virt_to_page
+ * sequence the top byte value of the original pointer gets lost (gets
+ * set to KASAN_TAG_KERNEL (0xFF)).
+ */
+ if (tag == KASAN_TAG_KERNEL)
+ return true;
+
+ untagged_addr = reset_tag((const void *)addr);
+ if (unlikely(untagged_addr <
+ kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
+ return !kasan_report(addr, size, write, ret_ip);
+ }
+ shadow_first = kasan_mem_to_shadow(untagged_addr);
+ shadow_last = kasan_mem_to_shadow(untagged_addr + size - 1);
+ for (shadow = shadow_first; shadow <= shadow_last; shadow++) {
+ if (*shadow != tag) {
+ return !kasan_report(addr, size, write, ret_ip);
+ }
+ }
+
+ return true;
+}
+
+#define DEFINE_HWASAN_LOAD_STORE(size) \
+ void __hwasan_load##size##_noabort(unsigned long addr) \
+ { \
+ check_memory_region(addr, size, false, _RET_IP_); \
+ } \
+ EXPORT_SYMBOL(__hwasan_load##size##_noabort); \
+ void __hwasan_store##size##_noabort(unsigned long addr) \
+ { \
+ check_memory_region(addr, size, true, _RET_IP_); \
+ } \
+ EXPORT_SYMBOL(__hwasan_store##size##_noabort)
+
+DEFINE_HWASAN_LOAD_STORE(1);
+DEFINE_HWASAN_LOAD_STORE(2);
+DEFINE_HWASAN_LOAD_STORE(4);
+DEFINE_HWASAN_LOAD_STORE(8);
+DEFINE_HWASAN_LOAD_STORE(16);
+
+void __hwasan_loadN_noabort(unsigned long addr, unsigned long size)
+{
+ check_memory_region(addr, size, false, _RET_IP_);
+}
+EXPORT_SYMBOL(__hwasan_loadN_noabort);
+
+void __hwasan_storeN_noabort(unsigned long addr, unsigned long size)
+{
+ check_memory_region(addr, size, true, _RET_IP_);
+}
+EXPORT_SYMBOL(__hwasan_storeN_noabort);
+
+void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size)
+{
+ kasan_poison_shadow((void *)addr, size, tag);
+}
+EXPORT_SYMBOL(__hwasan_tag_memory);
+
+void kasan_set_free_info(struct kmem_cache *cache,
+ void *object, u8 tag)
+{
+ struct kasan_alloc_meta *alloc_meta;
+ u8 idx = 0;
+
+ alloc_meta = get_alloc_info(cache, object);
+
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+ idx = alloc_meta->free_track_idx;
+ alloc_meta->free_pointer_tag[idx] = tag;
+ alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS;
+#endif
+
+ kasan_set_track(&alloc_meta->free_track[idx], GFP_NOWAIT);
+}
+
+struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
+ void *object, u8 tag)
+{
+ struct kasan_alloc_meta *alloc_meta;
+ int i = 0;
+
+ alloc_meta = get_alloc_info(cache, object);
+
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+ for (i = 0; i < KASAN_NR_FREE_STACKS; i++) {
+ if (alloc_meta->free_pointer_tag[i] == tag)
+ break;
+ }
+ if (i == KASAN_NR_FREE_STACKS)
+ i = alloc_meta->free_track_idx;
+#endif
+
+ return &alloc_meta->free_track[i];
+}
diff --git a/mm/kasan/tags_report.c b/mm/kasan/tags_report.c
new file mode 100644
index 000000000..bee43717d
--- /dev/null
+++ b/mm/kasan/tags_report.c
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file contains tag-based KASAN specific error reporting code.
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
+ *
+ * Some code borrowed from https://github.com/xairy/kasan-prototype by
+ * Andrey Konovalov <andreyknvl@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/bitops.h>
+#include <linux/ftrace.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/stackdepot.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/kasan.h>
+#include <linux/module.h>
+
+#include <asm/sections.h>
+
+#include "kasan.h"
+#include "../slab.h"
+
+const char *get_bug_type(struct kasan_access_info *info)
+{
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+ struct kasan_alloc_meta *alloc_meta;
+ struct kmem_cache *cache;
+ struct page *page;
+ const void *addr;
+ void *object;
+ u8 tag;
+ int i;
+
+ tag = get_tag(info->access_addr);
+ addr = reset_tag(info->access_addr);
+ page = kasan_addr_to_page(addr);
+ if (page && PageSlab(page)) {
+ cache = page->slab_cache;
+ object = nearest_obj(cache, page, (void *)addr);
+ alloc_meta = get_alloc_info(cache, object);
+
+ for (i = 0; i < KASAN_NR_FREE_STACKS; i++)
+ if (alloc_meta->free_pointer_tag[i] == tag)
+ return "use-after-free";
+ return "out-of-bounds";
+ }
+
+#endif
+ /*
+ * If access_size is a negative number, then it has reason to be
+ * defined as out-of-bounds bug type.
+ *
+ * Casting negative numbers to size_t would indeed turn up as
+ * a large size_t and its value will be larger than ULONG_MAX/2,
+ * so that this can qualify as out-of-bounds.
+ */
+ if (info->access_addr + info->access_size < info->access_addr)
+ return "out-of-bounds";
+
+ return "invalid-access";
+}
+
+void *find_first_bad_addr(void *addr, size_t size)
+{
+ u8 tag = get_tag(addr);
+ void *p = reset_tag(addr);
+ void *end = p + size;
+
+ while (p < end && tag == *(u8 *)kasan_mem_to_shadow(p))
+ p += KASAN_SHADOW_SCALE_SIZE;
+ return p;
+}
+
+void print_tags(u8 addr_tag, const void *addr)
+{
+ u8 *shadow = (u8 *)kasan_mem_to_shadow(addr);
+
+ pr_err("Pointer tag: [%02x], memory tag: [%02x]\n", addr_tag, *shadow);
+}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
new file mode 100644
index 000000000..28e18777e
--- /dev/null
+++ b/mm/khugepaged.c
@@ -0,0 +1,2400 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/coredump.h>
+#include <linux/mmu_notifier.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/mm_inline.h>
+#include <linux/kthread.h>
+#include <linux/khugepaged.h>
+#include <linux/freezer.h>
+#include <linux/mman.h>
+#include <linux/hashtable.h>
+#include <linux/userfaultfd_k.h>
+#include <linux/page_idle.h>
+#include <linux/swapops.h>
+#include <linux/shmem_fs.h>
+
+#include <asm/tlb.h>
+#include <asm/pgalloc.h>
+#include "internal.h"
+
+enum scan_result {
+ SCAN_FAIL,
+ SCAN_SUCCEED,
+ SCAN_PMD_NULL,
+ SCAN_EXCEED_NONE_PTE,
+ SCAN_EXCEED_SWAP_PTE,
+ SCAN_EXCEED_SHARED_PTE,
+ SCAN_PTE_NON_PRESENT,
+ SCAN_PTE_UFFD_WP,
+ SCAN_PAGE_RO,
+ SCAN_LACK_REFERENCED_PAGE,
+ SCAN_PAGE_NULL,
+ SCAN_SCAN_ABORT,
+ SCAN_PAGE_COUNT,
+ SCAN_PAGE_LRU,
+ SCAN_PAGE_LOCK,
+ SCAN_PAGE_ANON,
+ SCAN_PAGE_COMPOUND,
+ SCAN_ANY_PROCESS,
+ SCAN_VMA_NULL,
+ SCAN_VMA_CHECK,
+ SCAN_ADDRESS_RANGE,
+ SCAN_SWAP_CACHE_PAGE,
+ SCAN_DEL_PAGE_LRU,
+ SCAN_ALLOC_HUGE_PAGE_FAIL,
+ SCAN_CGROUP_CHARGE_FAIL,
+ SCAN_TRUNCATED,
+ SCAN_PAGE_HAS_PRIVATE,
+};
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/huge_memory.h>
+
+static struct task_struct *khugepaged_thread __read_mostly;
+static DEFINE_MUTEX(khugepaged_mutex);
+
+/* default scan 8*512 pte (or vmas) every 30 second */
+static unsigned int khugepaged_pages_to_scan __read_mostly;
+static unsigned int khugepaged_pages_collapsed;
+static unsigned int khugepaged_full_scans;
+static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
+/* during fragmentation poll the hugepage allocator once every minute */
+static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
+static unsigned long khugepaged_sleep_expire;
+static DEFINE_SPINLOCK(khugepaged_mm_lock);
+static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
+/*
+ * default collapse hugepages if there is at least one pte mapped like
+ * it would have happened if the vma was large enough during page
+ * fault.
+ */
+static unsigned int khugepaged_max_ptes_none __read_mostly;
+static unsigned int khugepaged_max_ptes_swap __read_mostly;
+static unsigned int khugepaged_max_ptes_shared __read_mostly;
+
+#define MM_SLOTS_HASH_BITS 10
+static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
+
+static struct kmem_cache *mm_slot_cache __read_mostly;
+
+#define MAX_PTE_MAPPED_THP 8
+
+/**
+ * struct mm_slot - hash lookup from mm to mm_slot
+ * @hash: hash collision list
+ * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
+ * @mm: the mm that this information is valid for
+ */
+struct mm_slot {
+ struct hlist_node hash;
+ struct list_head mm_node;
+ struct mm_struct *mm;
+
+ /* pte-mapped THP in this mm */
+ int nr_pte_mapped_thp;
+ unsigned long pte_mapped_thp[MAX_PTE_MAPPED_THP];
+};
+
+/**
+ * struct khugepaged_scan - cursor for scanning
+ * @mm_head: the head of the mm list to scan
+ * @mm_slot: the current mm_slot we are scanning
+ * @address: the next address inside that to be scanned
+ *
+ * There is only the one khugepaged_scan instance of this cursor structure.
+ */
+struct khugepaged_scan {
+ struct list_head mm_head;
+ struct mm_slot *mm_slot;
+ unsigned long address;
+};
+
+static struct khugepaged_scan khugepaged_scan = {
+ .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
+};
+
+#ifdef CONFIG_SYSFS
+static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
+}
+
+static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long msecs;
+ int err;
+
+ err = kstrtoul(buf, 10, &msecs);
+ if (err || msecs > UINT_MAX)
+ return -EINVAL;
+
+ khugepaged_scan_sleep_millisecs = msecs;
+ khugepaged_sleep_expire = 0;
+ wake_up_interruptible(&khugepaged_wait);
+
+ return count;
+}
+static struct kobj_attribute scan_sleep_millisecs_attr =
+ __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
+ scan_sleep_millisecs_store);
+
+static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
+}
+
+static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long msecs;
+ int err;
+
+ err = kstrtoul(buf, 10, &msecs);
+ if (err || msecs > UINT_MAX)
+ return -EINVAL;
+
+ khugepaged_alloc_sleep_millisecs = msecs;
+ khugepaged_sleep_expire = 0;
+ wake_up_interruptible(&khugepaged_wait);
+
+ return count;
+}
+static struct kobj_attribute alloc_sleep_millisecs_attr =
+ __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
+ alloc_sleep_millisecs_store);
+
+static ssize_t pages_to_scan_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
+}
+static ssize_t pages_to_scan_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ unsigned long pages;
+
+ err = kstrtoul(buf, 10, &pages);
+ if (err || !pages || pages > UINT_MAX)
+ return -EINVAL;
+
+ khugepaged_pages_to_scan = pages;
+
+ return count;
+}
+static struct kobj_attribute pages_to_scan_attr =
+ __ATTR(pages_to_scan, 0644, pages_to_scan_show,
+ pages_to_scan_store);
+
+static ssize_t pages_collapsed_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
+}
+static struct kobj_attribute pages_collapsed_attr =
+ __ATTR_RO(pages_collapsed);
+
+static ssize_t full_scans_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_full_scans);
+}
+static struct kobj_attribute full_scans_attr =
+ __ATTR_RO(full_scans);
+
+static ssize_t khugepaged_defrag_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return single_hugepage_flag_show(kobj, attr, buf,
+ TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
+}
+static ssize_t khugepaged_defrag_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ return single_hugepage_flag_store(kobj, attr, buf, count,
+ TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
+}
+static struct kobj_attribute khugepaged_defrag_attr =
+ __ATTR(defrag, 0644, khugepaged_defrag_show,
+ khugepaged_defrag_store);
+
+/*
+ * max_ptes_none controls if khugepaged should collapse hugepages over
+ * any unmapped ptes in turn potentially increasing the memory
+ * footprint of the vmas. When max_ptes_none is 0 khugepaged will not
+ * reduce the available free memory in the system as it
+ * runs. Increasing max_ptes_none will instead potentially reduce the
+ * free memory in the system during the khugepaged scan.
+ */
+static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
+}
+static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ unsigned long max_ptes_none;
+
+ err = kstrtoul(buf, 10, &max_ptes_none);
+ if (err || max_ptes_none > HPAGE_PMD_NR-1)
+ return -EINVAL;
+
+ khugepaged_max_ptes_none = max_ptes_none;
+
+ return count;
+}
+static struct kobj_attribute khugepaged_max_ptes_none_attr =
+ __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
+ khugepaged_max_ptes_none_store);
+
+static ssize_t khugepaged_max_ptes_swap_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_max_ptes_swap);
+}
+
+static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ unsigned long max_ptes_swap;
+
+ err = kstrtoul(buf, 10, &max_ptes_swap);
+ if (err || max_ptes_swap > HPAGE_PMD_NR-1)
+ return -EINVAL;
+
+ khugepaged_max_ptes_swap = max_ptes_swap;
+
+ return count;
+}
+
+static struct kobj_attribute khugepaged_max_ptes_swap_attr =
+ __ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show,
+ khugepaged_max_ptes_swap_store);
+
+static ssize_t khugepaged_max_ptes_shared_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_max_ptes_shared);
+}
+
+static ssize_t khugepaged_max_ptes_shared_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ unsigned long max_ptes_shared;
+
+ err = kstrtoul(buf, 10, &max_ptes_shared);
+ if (err || max_ptes_shared > HPAGE_PMD_NR-1)
+ return -EINVAL;
+
+ khugepaged_max_ptes_shared = max_ptes_shared;
+
+ return count;
+}
+
+static struct kobj_attribute khugepaged_max_ptes_shared_attr =
+ __ATTR(max_ptes_shared, 0644, khugepaged_max_ptes_shared_show,
+ khugepaged_max_ptes_shared_store);
+
+static struct attribute *khugepaged_attr[] = {
+ &khugepaged_defrag_attr.attr,
+ &khugepaged_max_ptes_none_attr.attr,
+ &khugepaged_max_ptes_swap_attr.attr,
+ &khugepaged_max_ptes_shared_attr.attr,
+ &pages_to_scan_attr.attr,
+ &pages_collapsed_attr.attr,
+ &full_scans_attr.attr,
+ &scan_sleep_millisecs_attr.attr,
+ &alloc_sleep_millisecs_attr.attr,
+ NULL,
+};
+
+struct attribute_group khugepaged_attr_group = {
+ .attrs = khugepaged_attr,
+ .name = "khugepaged",
+};
+#endif /* CONFIG_SYSFS */
+
+int hugepage_madvise(struct vm_area_struct *vma,
+ unsigned long *vm_flags, int advice)
+{
+ switch (advice) {
+ case MADV_HUGEPAGE:
+#ifdef CONFIG_S390
+ /*
+ * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
+ * can't handle this properly after s390_enable_sie, so we simply
+ * ignore the madvise to prevent qemu from causing a SIGSEGV.
+ */
+ if (mm_has_pgste(vma->vm_mm))
+ return 0;
+#endif
+ *vm_flags &= ~VM_NOHUGEPAGE;
+ *vm_flags |= VM_HUGEPAGE;
+ /*
+ * If the vma become good for khugepaged to scan,
+ * register it here without waiting a page fault that
+ * may not happen any time soon.
+ */
+ if (!(*vm_flags & VM_NO_KHUGEPAGED) &&
+ khugepaged_enter_vma_merge(vma, *vm_flags))
+ return -ENOMEM;
+ break;
+ case MADV_NOHUGEPAGE:
+ *vm_flags &= ~VM_HUGEPAGE;
+ *vm_flags |= VM_NOHUGEPAGE;
+ /*
+ * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
+ * this vma even if we leave the mm registered in khugepaged if
+ * it got registered before VM_NOHUGEPAGE was set.
+ */
+ break;
+ }
+
+ return 0;
+}
+
+int __init khugepaged_init(void)
+{
+ mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
+ sizeof(struct mm_slot),
+ __alignof__(struct mm_slot), 0, NULL);
+ if (!mm_slot_cache)
+ return -ENOMEM;
+
+ khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
+ khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
+ khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
+ khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;
+
+ return 0;
+}
+
+void __init khugepaged_destroy(void)
+{
+ kmem_cache_destroy(mm_slot_cache);
+}
+
+static inline struct mm_slot *alloc_mm_slot(void)
+{
+ if (!mm_slot_cache) /* initialization failed */
+ return NULL;
+ return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
+}
+
+static inline void free_mm_slot(struct mm_slot *mm_slot)
+{
+ kmem_cache_free(mm_slot_cache, mm_slot);
+}
+
+static struct mm_slot *get_mm_slot(struct mm_struct *mm)
+{
+ struct mm_slot *mm_slot;
+
+ hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm)
+ if (mm == mm_slot->mm)
+ return mm_slot;
+
+ return NULL;
+}
+
+static void insert_to_mm_slots_hash(struct mm_struct *mm,
+ struct mm_slot *mm_slot)
+{
+ mm_slot->mm = mm;
+ hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
+}
+
+static inline int khugepaged_test_exit(struct mm_struct *mm)
+{
+ return atomic_read(&mm->mm_users) == 0;
+}
+
+static bool hugepage_vma_check(struct vm_area_struct *vma,
+ unsigned long vm_flags)
+{
+ if (!transhuge_vma_enabled(vma, vm_flags))
+ return false;
+
+ if (vma->vm_file && !IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) -
+ vma->vm_pgoff, HPAGE_PMD_NR))
+ return false;
+
+ /* Enabled via shmem mount options or sysfs settings. */
+ if (shmem_file(vma->vm_file))
+ return shmem_huge_enabled(vma);
+
+ /* THP settings require madvise. */
+ if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always())
+ return false;
+
+ /* Only regular file is valid */
+ if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file &&
+ (vm_flags & VM_DENYWRITE)) {
+ struct inode *inode = vma->vm_file->f_inode;
+
+ return S_ISREG(inode->i_mode);
+ }
+
+ if (!vma->anon_vma || vma->vm_ops)
+ return false;
+ if (vma_is_temporary_stack(vma))
+ return false;
+ return !(vm_flags & VM_NO_KHUGEPAGED);
+}
+
+int __khugepaged_enter(struct mm_struct *mm)
+{
+ struct mm_slot *mm_slot;
+ int wakeup;
+
+ mm_slot = alloc_mm_slot();
+ if (!mm_slot)
+ return -ENOMEM;
+
+ /* __khugepaged_exit() must not run from under us */
+ VM_BUG_ON_MM(atomic_read(&mm->mm_users) == 0, mm);
+ if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
+ free_mm_slot(mm_slot);
+ return 0;
+ }
+
+ spin_lock(&khugepaged_mm_lock);
+ insert_to_mm_slots_hash(mm, mm_slot);
+ /*
+ * Insert just behind the scanning cursor, to let the area settle
+ * down a little.
+ */
+ wakeup = list_empty(&khugepaged_scan.mm_head);
+ list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
+ spin_unlock(&khugepaged_mm_lock);
+
+ mmgrab(mm);
+ if (wakeup)
+ wake_up_interruptible(&khugepaged_wait);
+
+ return 0;
+}
+
+int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
+ unsigned long vm_flags)
+{
+ unsigned long hstart, hend;
+
+ /*
+ * khugepaged only supports read-only files for non-shmem files.
+ * khugepaged does not yet work on special mappings. And
+ * file-private shmem THP is not supported.
+ */
+ if (!hugepage_vma_check(vma, vm_flags))
+ return 0;
+
+ hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+ hend = vma->vm_end & HPAGE_PMD_MASK;
+ if (hstart < hend)
+ return khugepaged_enter(vma, vm_flags);
+ return 0;
+}
+
+void __khugepaged_exit(struct mm_struct *mm)
+{
+ struct mm_slot *mm_slot;
+ int free = 0;
+
+ spin_lock(&khugepaged_mm_lock);
+ mm_slot = get_mm_slot(mm);
+ if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
+ hash_del(&mm_slot->hash);
+ list_del(&mm_slot->mm_node);
+ free = 1;
+ }
+ spin_unlock(&khugepaged_mm_lock);
+
+ if (free) {
+ clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
+ free_mm_slot(mm_slot);
+ mmdrop(mm);
+ } else if (mm_slot) {
+ /*
+ * This is required to serialize against
+ * khugepaged_test_exit() (which is guaranteed to run
+ * under mmap sem read mode). Stop here (after we
+ * return all pagetables will be destroyed) until
+ * khugepaged has finished working on the pagetables
+ * under the mmap_lock.
+ */
+ mmap_write_lock(mm);
+ mmap_write_unlock(mm);
+ }
+}
+
+static void release_pte_page(struct page *page)
+{
+ mod_node_page_state(page_pgdat(page),
+ NR_ISOLATED_ANON + page_is_file_lru(page),
+ -compound_nr(page));
+ unlock_page(page);
+ putback_lru_page(page);
+}
+
+static void release_pte_pages(pte_t *pte, pte_t *_pte,
+ struct list_head *compound_pagelist)
+{
+ struct page *page, *tmp;
+
+ while (--_pte >= pte) {
+ pte_t pteval = *_pte;
+
+ page = pte_page(pteval);
+ if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)) &&
+ !PageCompound(page))
+ release_pte_page(page);
+ }
+
+ list_for_each_entry_safe(page, tmp, compound_pagelist, lru) {
+ list_del(&page->lru);
+ release_pte_page(page);
+ }
+}
+
+static bool is_refcount_suitable(struct page *page)
+{
+ int expected_refcount;
+
+ expected_refcount = total_mapcount(page);
+ if (PageSwapCache(page))
+ expected_refcount += compound_nr(page);
+
+ return page_count(page) == expected_refcount;
+}
+
+static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
+ unsigned long address,
+ pte_t *pte,
+ struct list_head *compound_pagelist)
+{
+ struct page *page = NULL;
+ pte_t *_pte;
+ int none_or_zero = 0, shared = 0, result = 0, referenced = 0;
+ bool writable = false;
+
+ for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
+ _pte++, address += PAGE_SIZE) {
+ pte_t pteval = *_pte;
+ if (pte_none(pteval) || (pte_present(pteval) &&
+ is_zero_pfn(pte_pfn(pteval)))) {
+ if (!userfaultfd_armed(vma) &&
+ ++none_or_zero <= khugepaged_max_ptes_none) {
+ continue;
+ } else {
+ result = SCAN_EXCEED_NONE_PTE;
+ goto out;
+ }
+ }
+ if (!pte_present(pteval)) {
+ result = SCAN_PTE_NON_PRESENT;
+ goto out;
+ }
+ if (pte_uffd_wp(pteval)) {
+ result = SCAN_PTE_UFFD_WP;
+ goto out;
+ }
+ page = vm_normal_page(vma, address, pteval);
+ if (unlikely(!page)) {
+ result = SCAN_PAGE_NULL;
+ goto out;
+ }
+
+ VM_BUG_ON_PAGE(!PageAnon(page), page);
+
+ if (page_mapcount(page) > 1 &&
+ ++shared > khugepaged_max_ptes_shared) {
+ result = SCAN_EXCEED_SHARED_PTE;
+ goto out;
+ }
+
+ if (PageCompound(page)) {
+ struct page *p;
+ page = compound_head(page);
+
+ /*
+ * Check if we have dealt with the compound page
+ * already
+ */
+ list_for_each_entry(p, compound_pagelist, lru) {
+ if (page == p)
+ goto next;
+ }
+ }
+
+ /*
+ * We can do it before isolate_lru_page because the
+ * page can't be freed from under us. NOTE: PG_lock
+ * is needed to serialize against split_huge_page
+ * when invoked from the VM.
+ */
+ if (!trylock_page(page)) {
+ result = SCAN_PAGE_LOCK;
+ goto out;
+ }
+
+ /*
+ * Check if the page has any GUP (or other external) pins.
+ *
+ * The page table that maps the page has been already unlinked
+ * from the page table tree and this process cannot get
+ * an additinal pin on the page.
+ *
+ * New pins can come later if the page is shared across fork,
+ * but not from this process. The other process cannot write to
+ * the page, only trigger CoW.
+ */
+ if (!is_refcount_suitable(page)) {
+ unlock_page(page);
+ result = SCAN_PAGE_COUNT;
+ goto out;
+ }
+ if (!pte_write(pteval) && PageSwapCache(page) &&
+ !reuse_swap_page(page, NULL)) {
+ /*
+ * Page is in the swap cache and cannot be re-used.
+ * It cannot be collapsed into a THP.
+ */
+ unlock_page(page);
+ result = SCAN_SWAP_CACHE_PAGE;
+ goto out;
+ }
+
+ /*
+ * Isolate the page to avoid collapsing an hugepage
+ * currently in use by the VM.
+ */
+ if (isolate_lru_page(page)) {
+ unlock_page(page);
+ result = SCAN_DEL_PAGE_LRU;
+ goto out;
+ }
+ mod_node_page_state(page_pgdat(page),
+ NR_ISOLATED_ANON + page_is_file_lru(page),
+ compound_nr(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+
+ if (PageCompound(page))
+ list_add_tail(&page->lru, compound_pagelist);
+next:
+ /* There should be enough young pte to collapse the page */
+ if (pte_young(pteval) ||
+ page_is_young(page) || PageReferenced(page) ||
+ mmu_notifier_test_young(vma->vm_mm, address))
+ referenced++;
+
+ if (pte_write(pteval))
+ writable = true;
+ }
+
+ if (unlikely(!writable)) {
+ result = SCAN_PAGE_RO;
+ } else if (unlikely(!referenced)) {
+ result = SCAN_LACK_REFERENCED_PAGE;
+ } else {
+ result = SCAN_SUCCEED;
+ trace_mm_collapse_huge_page_isolate(page, none_or_zero,
+ referenced, writable, result);
+ return 1;
+ }
+out:
+ release_pte_pages(pte, _pte, compound_pagelist);
+ trace_mm_collapse_huge_page_isolate(page, none_or_zero,
+ referenced, writable, result);
+ return 0;
+}
+
+static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
+ struct vm_area_struct *vma,
+ unsigned long address,
+ spinlock_t *ptl,
+ struct list_head *compound_pagelist)
+{
+ struct page *src_page, *tmp;
+ pte_t *_pte;
+ for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
+ _pte++, page++, address += PAGE_SIZE) {
+ pte_t pteval = *_pte;
+
+ if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
+ clear_user_highpage(page, address);
+ add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
+ if (is_zero_pfn(pte_pfn(pteval))) {
+ /*
+ * ptl mostly unnecessary.
+ */
+ spin_lock(ptl);
+ /*
+ * paravirt calls inside pte_clear here are
+ * superfluous.
+ */
+ pte_clear(vma->vm_mm, address, _pte);
+ spin_unlock(ptl);
+ }
+ } else {
+ src_page = pte_page(pteval);
+ copy_user_highpage(page, src_page, address, vma);
+ if (!PageCompound(src_page))
+ release_pte_page(src_page);
+ /*
+ * ptl mostly unnecessary, but preempt has to
+ * be disabled to update the per-cpu stats
+ * inside page_remove_rmap().
+ */
+ spin_lock(ptl);
+ /*
+ * paravirt calls inside pte_clear here are
+ * superfluous.
+ */
+ pte_clear(vma->vm_mm, address, _pte);
+ page_remove_rmap(src_page, false);
+ spin_unlock(ptl);
+ free_page_and_swap_cache(src_page);
+ }
+ }
+
+ list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) {
+ list_del(&src_page->lru);
+ release_pte_page(src_page);
+ }
+}
+
+static void khugepaged_alloc_sleep(void)
+{
+ DEFINE_WAIT(wait);
+
+ add_wait_queue(&khugepaged_wait, &wait);
+ freezable_schedule_timeout_interruptible(
+ msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
+ remove_wait_queue(&khugepaged_wait, &wait);
+}
+
+static int khugepaged_node_load[MAX_NUMNODES];
+
+static bool khugepaged_scan_abort(int nid)
+{
+ int i;
+
+ /*
+ * If node_reclaim_mode is disabled, then no extra effort is made to
+ * allocate memory locally.
+ */
+ if (!node_reclaim_mode)
+ return false;
+
+ /* If there is a count for this node already, it must be acceptable */
+ if (khugepaged_node_load[nid])
+ return false;
+
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ if (!khugepaged_node_load[i])
+ continue;
+ if (node_distance(nid, i) > node_reclaim_distance)
+ return true;
+ }
+ return false;
+}
+
+/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
+static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
+{
+ return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT;
+}
+
+#ifdef CONFIG_NUMA
+static int khugepaged_find_target_node(void)
+{
+ static int last_khugepaged_target_node = NUMA_NO_NODE;
+ int nid, target_node = 0, max_value = 0;
+
+ /* find first node with max normal pages hit */
+ for (nid = 0; nid < MAX_NUMNODES; nid++)
+ if (khugepaged_node_load[nid] > max_value) {
+ max_value = khugepaged_node_load[nid];
+ target_node = nid;
+ }
+
+ /* do some balance if several nodes have the same hit record */
+ if (target_node <= last_khugepaged_target_node)
+ for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
+ nid++)
+ if (max_value == khugepaged_node_load[nid]) {
+ target_node = nid;
+ break;
+ }
+
+ last_khugepaged_target_node = target_node;
+ return target_node;
+}
+
+static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
+{
+ if (IS_ERR(*hpage)) {
+ if (!*wait)
+ return false;
+
+ *wait = false;
+ *hpage = NULL;
+ khugepaged_alloc_sleep();
+ } else if (*hpage) {
+ put_page(*hpage);
+ *hpage = NULL;
+ }
+
+ return true;
+}
+
+static struct page *
+khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
+{
+ VM_BUG_ON_PAGE(*hpage, *hpage);
+
+ *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
+ if (unlikely(!*hpage)) {
+ count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+ *hpage = ERR_PTR(-ENOMEM);
+ return NULL;
+ }
+
+ prep_transhuge_page(*hpage);
+ count_vm_event(THP_COLLAPSE_ALLOC);
+ return *hpage;
+}
+#else
+static int khugepaged_find_target_node(void)
+{
+ return 0;
+}
+
+static inline struct page *alloc_khugepaged_hugepage(void)
+{
+ struct page *page;
+
+ page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(),
+ HPAGE_PMD_ORDER);
+ if (page)
+ prep_transhuge_page(page);
+ return page;
+}
+
+static struct page *khugepaged_alloc_hugepage(bool *wait)
+{
+ struct page *hpage;
+
+ do {
+ hpage = alloc_khugepaged_hugepage();
+ if (!hpage) {
+ count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+ if (!*wait)
+ return NULL;
+
+ *wait = false;
+ khugepaged_alloc_sleep();
+ } else
+ count_vm_event(THP_COLLAPSE_ALLOC);
+ } while (unlikely(!hpage) && likely(khugepaged_enabled()));
+
+ return hpage;
+}
+
+static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
+{
+ /*
+ * If the hpage allocated earlier was briefly exposed in page cache
+ * before collapse_file() failed, it is possible that racing lookups
+ * have not yet completed, and would then be unpleasantly surprised by
+ * finding the hpage reused for the same mapping at a different offset.
+ * Just release the previous allocation if there is any danger of that.
+ */
+ if (*hpage && page_count(*hpage) > 1) {
+ put_page(*hpage);
+ *hpage = NULL;
+ }
+
+ if (!*hpage)
+ *hpage = khugepaged_alloc_hugepage(wait);
+
+ if (unlikely(!*hpage))
+ return false;
+
+ return true;
+}
+
+static struct page *
+khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
+{
+ VM_BUG_ON(!*hpage);
+
+ return *hpage;
+}
+#endif
+
+/*
+ * If mmap_lock temporarily dropped, revalidate vma
+ * before taking mmap_lock.
+ * Return 0 if succeeds, otherwise return none-zero
+ * value (scan code).
+ */
+
+static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
+ struct vm_area_struct **vmap)
+{
+ struct vm_area_struct *vma;
+ unsigned long hstart, hend;
+
+ if (unlikely(khugepaged_test_exit(mm)))
+ return SCAN_ANY_PROCESS;
+
+ *vmap = vma = find_vma(mm, address);
+ if (!vma)
+ return SCAN_VMA_NULL;
+
+ hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+ hend = vma->vm_end & HPAGE_PMD_MASK;
+ if (address < hstart || address + HPAGE_PMD_SIZE > hend)
+ return SCAN_ADDRESS_RANGE;
+ if (!hugepage_vma_check(vma, vma->vm_flags))
+ return SCAN_VMA_CHECK;
+ /* Anon VMA expected */
+ if (!vma->anon_vma || vma->vm_ops)
+ return SCAN_VMA_CHECK;
+ return 0;
+}
+
+/*
+ * Bring missing pages in from swap, to complete THP collapse.
+ * Only done if khugepaged_scan_pmd believes it is worthwhile.
+ *
+ * Called and returns without pte mapped or spinlocks held,
+ * but with mmap_lock held to protect against vma changes.
+ */
+
+static bool __collapse_huge_page_swapin(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd,
+ int referenced)
+{
+ int swapped_in = 0;
+ vm_fault_t ret = 0;
+ struct vm_fault vmf = {
+ .vma = vma,
+ .address = address,
+ .flags = FAULT_FLAG_ALLOW_RETRY,
+ .pmd = pmd,
+ .pgoff = linear_page_index(vma, address),
+ };
+
+ vmf.pte = pte_offset_map(pmd, address);
+ for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE;
+ vmf.pte++, vmf.address += PAGE_SIZE) {
+ vmf.orig_pte = *vmf.pte;
+ if (!is_swap_pte(vmf.orig_pte))
+ continue;
+ swapped_in++;
+ ret = do_swap_page(&vmf);
+
+ /* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */
+ if (ret & VM_FAULT_RETRY) {
+ mmap_read_lock(mm);
+ if (hugepage_vma_revalidate(mm, address, &vmf.vma)) {
+ /* vma is no longer available, don't continue to swapin */
+ trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
+ return false;
+ }
+ /* check if the pmd is still valid */
+ if (mm_find_pmd(mm, address) != pmd) {
+ trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
+ return false;
+ }
+ }
+ if (ret & VM_FAULT_ERROR) {
+ trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
+ return false;
+ }
+ /* pte is unmapped now, we need to map it */
+ vmf.pte = pte_offset_map(pmd, vmf.address);
+ }
+ vmf.pte--;
+ pte_unmap(vmf.pte);
+
+ /* Drain LRU add pagevec to remove extra pin on the swapped in pages */
+ if (swapped_in)
+ lru_add_drain();
+
+ trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1);
+ return true;
+}
+
+static void collapse_huge_page(struct mm_struct *mm,
+ unsigned long address,
+ struct page **hpage,
+ int node, int referenced, int unmapped)
+{
+ LIST_HEAD(compound_pagelist);
+ pmd_t *pmd, _pmd;
+ pte_t *pte;
+ pgtable_t pgtable;
+ struct page *new_page;
+ spinlock_t *pmd_ptl, *pte_ptl;
+ int isolated = 0, result = 0;
+ struct vm_area_struct *vma;
+ struct mmu_notifier_range range;
+ gfp_t gfp;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+ /* Only allocate from the target node */
+ gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
+
+ /*
+ * Before allocating the hugepage, release the mmap_lock read lock.
+ * The allocation can take potentially a long time if it involves
+ * sync compaction, and we do not need to hold the mmap_lock during
+ * that. We will recheck the vma after taking it again in write mode.
+ */
+ mmap_read_unlock(mm);
+ new_page = khugepaged_alloc_page(hpage, gfp, node);
+ if (!new_page) {
+ result = SCAN_ALLOC_HUGE_PAGE_FAIL;
+ goto out_nolock;
+ }
+
+ if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) {
+ result = SCAN_CGROUP_CHARGE_FAIL;
+ goto out_nolock;
+ }
+ count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
+
+ mmap_read_lock(mm);
+ result = hugepage_vma_revalidate(mm, address, &vma);
+ if (result) {
+ mmap_read_unlock(mm);
+ goto out_nolock;
+ }
+
+ pmd = mm_find_pmd(mm, address);
+ if (!pmd) {
+ result = SCAN_PMD_NULL;
+ mmap_read_unlock(mm);
+ goto out_nolock;
+ }
+
+ /*
+ * __collapse_huge_page_swapin always returns with mmap_lock locked.
+ * If it fails, we release mmap_lock and jump out_nolock.
+ * Continuing to collapse causes inconsistency.
+ */
+ if (unmapped && !__collapse_huge_page_swapin(mm, vma, address,
+ pmd, referenced)) {
+ mmap_read_unlock(mm);
+ goto out_nolock;
+ }
+
+ mmap_read_unlock(mm);
+ /*
+ * Prevent all access to pagetables with the exception of
+ * gup_fast later handled by the ptep_clear_flush and the VM
+ * handled by the anon_vma lock + PG_lock.
+ */
+ mmap_write_lock(mm);
+ result = hugepage_vma_revalidate(mm, address, &vma);
+ if (result)
+ goto out;
+ /* check if the pmd is still valid */
+ if (mm_find_pmd(mm, address) != pmd)
+ goto out;
+
+ anon_vma_lock_write(vma->anon_vma);
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
+ address, address + HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+
+ pte = pte_offset_map(pmd, address);
+ pte_ptl = pte_lockptr(mm, pmd);
+
+ pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
+ /*
+ * This removes any huge TLB entry from the CPU so we won't allow
+ * huge and small TLB entries for the same virtual address to
+ * avoid the risk of CPU bugs in that area.
+ *
+ * Parallel fast GUP is fine since fast GUP will back off when
+ * it detects PMD is changed.
+ */
+ _pmd = pmdp_collapse_flush(vma, address, pmd);
+ spin_unlock(pmd_ptl);
+ mmu_notifier_invalidate_range_end(&range);
+ tlb_remove_table_sync_one();
+
+ spin_lock(pte_ptl);
+ isolated = __collapse_huge_page_isolate(vma, address, pte,
+ &compound_pagelist);
+ spin_unlock(pte_ptl);
+
+ if (unlikely(!isolated)) {
+ pte_unmap(pte);
+ spin_lock(pmd_ptl);
+ BUG_ON(!pmd_none(*pmd));
+ /*
+ * We can only use set_pmd_at when establishing
+ * hugepmds and never for establishing regular pmds that
+ * points to regular pagetables. Use pmd_populate for that
+ */
+ pmd_populate(mm, pmd, pmd_pgtable(_pmd));
+ spin_unlock(pmd_ptl);
+ anon_vma_unlock_write(vma->anon_vma);
+ result = SCAN_FAIL;
+ goto out;
+ }
+
+ /*
+ * All pages are isolated and locked so anon_vma rmap
+ * can't run anymore.
+ */
+ anon_vma_unlock_write(vma->anon_vma);
+
+ __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl,
+ &compound_pagelist);
+ pte_unmap(pte);
+ __SetPageUptodate(new_page);
+ pgtable = pmd_pgtable(_pmd);
+
+ _pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
+ _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
+
+ /*
+ * spin_lock() below is not the equivalent of smp_wmb(), so
+ * this is needed to avoid the copy_huge_page writes to become
+ * visible after the set_pmd_at() write.
+ */
+ smp_wmb();
+
+ spin_lock(pmd_ptl);
+ BUG_ON(!pmd_none(*pmd));
+ page_add_new_anon_rmap(new_page, vma, address, true);
+ lru_cache_add_inactive_or_unevictable(new_page, vma);
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
+ set_pmd_at(mm, address, pmd, _pmd);
+ update_mmu_cache_pmd(vma, address, pmd);
+ spin_unlock(pmd_ptl);
+
+ *hpage = NULL;
+
+ khugepaged_pages_collapsed++;
+ result = SCAN_SUCCEED;
+out_up_write:
+ mmap_write_unlock(mm);
+out_nolock:
+ if (!IS_ERR_OR_NULL(*hpage))
+ mem_cgroup_uncharge(*hpage);
+ trace_mm_collapse_huge_page(mm, isolated, result);
+ return;
+out:
+ goto out_up_write;
+}
+
+static int khugepaged_scan_pmd(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address,
+ struct page **hpage)
+{
+ pmd_t *pmd;
+ pte_t *pte, *_pte;
+ int ret = 0, result = 0, referenced = 0;
+ int none_or_zero = 0, shared = 0;
+ struct page *page = NULL;
+ unsigned long _address;
+ spinlock_t *ptl;
+ int node = NUMA_NO_NODE, unmapped = 0;
+ bool writable = false;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+ pmd = mm_find_pmd(mm, address);
+ if (!pmd) {
+ result = SCAN_PMD_NULL;
+ goto out;
+ }
+
+ memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
+ pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+ for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
+ _pte++, _address += PAGE_SIZE) {
+ pte_t pteval = *_pte;
+ if (is_swap_pte(pteval)) {
+ if (++unmapped <= khugepaged_max_ptes_swap) {
+ /*
+ * Always be strict with uffd-wp
+ * enabled swap entries. Please see
+ * comment below for pte_uffd_wp().
+ */
+ if (pte_swp_uffd_wp(pteval)) {
+ result = SCAN_PTE_UFFD_WP;
+ goto out_unmap;
+ }
+ continue;
+ } else {
+ result = SCAN_EXCEED_SWAP_PTE;
+ goto out_unmap;
+ }
+ }
+ if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
+ if (!userfaultfd_armed(vma) &&
+ ++none_or_zero <= khugepaged_max_ptes_none) {
+ continue;
+ } else {
+ result = SCAN_EXCEED_NONE_PTE;
+ goto out_unmap;
+ }
+ }
+ if (!pte_present(pteval)) {
+ result = SCAN_PTE_NON_PRESENT;
+ goto out_unmap;
+ }
+ if (pte_uffd_wp(pteval)) {
+ /*
+ * Don't collapse the page if any of the small
+ * PTEs are armed with uffd write protection.
+ * Here we can also mark the new huge pmd as
+ * write protected if any of the small ones is
+ * marked but that could bring uknown
+ * userfault messages that falls outside of
+ * the registered range. So, just be simple.
+ */
+ result = SCAN_PTE_UFFD_WP;
+ goto out_unmap;
+ }
+ if (pte_write(pteval))
+ writable = true;
+
+ page = vm_normal_page(vma, _address, pteval);
+ if (unlikely(!page)) {
+ result = SCAN_PAGE_NULL;
+ goto out_unmap;
+ }
+
+ if (page_mapcount(page) > 1 &&
+ ++shared > khugepaged_max_ptes_shared) {
+ result = SCAN_EXCEED_SHARED_PTE;
+ goto out_unmap;
+ }
+
+ page = compound_head(page);
+
+ /*
+ * Record which node the original page is from and save this
+ * information to khugepaged_node_load[].
+ * Khupaged will allocate hugepage from the node has the max
+ * hit record.
+ */
+ node = page_to_nid(page);
+ if (khugepaged_scan_abort(node)) {
+ result = SCAN_SCAN_ABORT;
+ goto out_unmap;
+ }
+ khugepaged_node_load[node]++;
+ if (!PageLRU(page)) {
+ result = SCAN_PAGE_LRU;
+ goto out_unmap;
+ }
+ if (PageLocked(page)) {
+ result = SCAN_PAGE_LOCK;
+ goto out_unmap;
+ }
+ if (!PageAnon(page)) {
+ result = SCAN_PAGE_ANON;
+ goto out_unmap;
+ }
+
+ /*
+ * Check if the page has any GUP (or other external) pins.
+ *
+ * Here the check is racy it may see totmal_mapcount > refcount
+ * in some cases.
+ * For example, one process with one forked child process.
+ * The parent has the PMD split due to MADV_DONTNEED, then
+ * the child is trying unmap the whole PMD, but khugepaged
+ * may be scanning the parent between the child has
+ * PageDoubleMap flag cleared and dec the mapcount. So
+ * khugepaged may see total_mapcount > refcount.
+ *
+ * But such case is ephemeral we could always retry collapse
+ * later. However it may report false positive if the page
+ * has excessive GUP pins (i.e. 512). Anyway the same check
+ * will be done again later the risk seems low.
+ */
+ if (!is_refcount_suitable(page)) {
+ result = SCAN_PAGE_COUNT;
+ goto out_unmap;
+ }
+ if (pte_young(pteval) ||
+ page_is_young(page) || PageReferenced(page) ||
+ mmu_notifier_test_young(vma->vm_mm, address))
+ referenced++;
+ }
+ if (!writable) {
+ result = SCAN_PAGE_RO;
+ } else if (!referenced || (unmapped && referenced < HPAGE_PMD_NR/2)) {
+ result = SCAN_LACK_REFERENCED_PAGE;
+ } else {
+ result = SCAN_SUCCEED;
+ ret = 1;
+ }
+out_unmap:
+ pte_unmap_unlock(pte, ptl);
+ if (ret) {
+ node = khugepaged_find_target_node();
+ /* collapse_huge_page will return with the mmap_lock released */
+ collapse_huge_page(mm, address, hpage, node,
+ referenced, unmapped);
+ }
+out:
+ trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
+ none_or_zero, result, unmapped);
+ return ret;
+}
+
+static void collect_mm_slot(struct mm_slot *mm_slot)
+{
+ struct mm_struct *mm = mm_slot->mm;
+
+ lockdep_assert_held(&khugepaged_mm_lock);
+
+ if (khugepaged_test_exit(mm)) {
+ /* free mm_slot */
+ hash_del(&mm_slot->hash);
+ list_del(&mm_slot->mm_node);
+
+ /*
+ * Not strictly needed because the mm exited already.
+ *
+ * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
+ */
+
+ /* khugepaged_mm_lock actually not necessary for the below */
+ free_mm_slot(mm_slot);
+ mmdrop(mm);
+ }
+}
+
+#ifdef CONFIG_SHMEM
+/*
+ * Notify khugepaged that given addr of the mm is pte-mapped THP. Then
+ * khugepaged should try to collapse the page table.
+ */
+static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
+ unsigned long addr)
+{
+ struct mm_slot *mm_slot;
+
+ VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
+
+ spin_lock(&khugepaged_mm_lock);
+ mm_slot = get_mm_slot(mm);
+ if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP))
+ mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr;
+ spin_unlock(&khugepaged_mm_lock);
+ return 0;
+}
+
+/**
+ * Try to collapse a pte-mapped THP for mm at address haddr.
+ *
+ * This function checks whether all the PTEs in the PMD are pointing to the
+ * right THP. If so, retract the page table so the THP can refault in with
+ * as pmd-mapped.
+ */
+void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
+{
+ unsigned long haddr = addr & HPAGE_PMD_MASK;
+ struct vm_area_struct *vma = find_vma(mm, haddr);
+ struct page *hpage;
+ pte_t *start_pte, *pte;
+ pmd_t *pmd, _pmd;
+ spinlock_t *ptl;
+ int count = 0;
+ int i;
+ struct mmu_notifier_range range;
+
+ if (!vma || !vma->vm_file ||
+ vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE)
+ return;
+
+ /*
+ * This vm_flags may not have VM_HUGEPAGE if the page was not
+ * collapsed by this mm. But we can still collapse if the page is
+ * the valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check()
+ * will not fail the vma for missing VM_HUGEPAGE
+ */
+ if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE))
+ return;
+
+ hpage = find_lock_page(vma->vm_file->f_mapping,
+ linear_page_index(vma, haddr));
+ if (!hpage)
+ return;
+
+ if (!PageHead(hpage))
+ goto drop_hpage;
+
+ pmd = mm_find_pmd(mm, haddr);
+ if (!pmd)
+ goto drop_hpage;
+
+ /*
+ * We need to lock the mapping so that from here on, only GUP-fast and
+ * hardware page walks can access the parts of the page tables that
+ * we're operating on.
+ */
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+
+ /*
+ * This spinlock should be unnecessary: Nobody else should be accessing
+ * the page tables under spinlock protection here, only
+ * lockless_pages_from_mm() and the hardware page walker can access page
+ * tables while all the high-level locks are held in write mode.
+ */
+ start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
+
+ /* step 1: check all mapped PTEs are to the right huge page */
+ for (i = 0, addr = haddr, pte = start_pte;
+ i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+ struct page *page;
+
+ /* empty pte, skip */
+ if (pte_none(*pte))
+ continue;
+
+ /* page swapped out, abort */
+ if (!pte_present(*pte))
+ goto abort;
+
+ page = vm_normal_page(vma, addr, *pte);
+
+ /*
+ * Note that uprobe, debugger, or MAP_PRIVATE may change the
+ * page table, but the new page will not be a subpage of hpage.
+ */
+ if (hpage + i != page)
+ goto abort;
+ count++;
+ }
+
+ /* step 2: adjust rmap */
+ for (i = 0, addr = haddr, pte = start_pte;
+ i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+ struct page *page;
+
+ if (pte_none(*pte))
+ continue;
+ page = vm_normal_page(vma, addr, *pte);
+ page_remove_rmap(page, false);
+ }
+
+ pte_unmap_unlock(start_pte, ptl);
+
+ /* step 3: set proper refcount and mm_counters. */
+ if (count) {
+ page_ref_sub(hpage, count);
+ add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
+ }
+
+ /* step 4: collapse pmd */
+ /* we make no change to anon, but protect concurrent anon page lookup */
+ if (vma->anon_vma)
+ anon_vma_lock_write(vma->anon_vma);
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, haddr,
+ haddr + HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+ _pmd = pmdp_collapse_flush(vma, haddr, pmd);
+ mm_dec_nr_ptes(mm);
+ tlb_remove_table_sync_one();
+ mmu_notifier_invalidate_range_end(&range);
+ pte_free(mm, pmd_pgtable(_pmd));
+
+ if (vma->anon_vma)
+ anon_vma_unlock_write(vma->anon_vma);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+
+drop_hpage:
+ unlock_page(hpage);
+ put_page(hpage);
+ return;
+
+abort:
+ pte_unmap_unlock(start_pte, ptl);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ goto drop_hpage;
+}
+
+static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
+{
+ struct mm_struct *mm = mm_slot->mm;
+ int i;
+
+ if (likely(mm_slot->nr_pte_mapped_thp == 0))
+ return 0;
+
+ if (!mmap_write_trylock(mm))
+ return -EBUSY;
+
+ if (unlikely(khugepaged_test_exit(mm)))
+ goto out;
+
+ for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++)
+ collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i]);
+
+out:
+ mm_slot->nr_pte_mapped_thp = 0;
+ mmap_write_unlock(mm);
+ return 0;
+}
+
+static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
+{
+ struct vm_area_struct *vma;
+ struct mm_struct *mm;
+ unsigned long addr;
+ pmd_t *pmd, _pmd;
+
+ i_mmap_lock_write(mapping);
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
+ /*
+ * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
+ * got written to. These VMAs are likely not worth investing
+ * mmap_write_lock(mm) as PMD-mapping is likely to be split
+ * later.
+ *
+ * Not that vma->anon_vma check is racy: it can be set up after
+ * the check but before we took mmap_lock by the fault path.
+ * But page lock would prevent establishing any new ptes of the
+ * page, so we are safe.
+ *
+ * An alternative would be drop the check, but check that page
+ * table is clear before calling pmdp_collapse_flush() under
+ * ptl. It has higher chance to recover THP for the VMA, but
+ * has higher cost too. It would also probably require locking
+ * the anon_vma.
+ */
+ if (vma->anon_vma)
+ continue;
+ addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ if (addr & ~HPAGE_PMD_MASK)
+ continue;
+ if (vma->vm_end < addr + HPAGE_PMD_SIZE)
+ continue;
+ mm = vma->vm_mm;
+ pmd = mm_find_pmd(mm, addr);
+ if (!pmd)
+ continue;
+ /*
+ * We need exclusive mmap_lock to retract page table.
+ *
+ * We use trylock due to lock inversion: we need to acquire
+ * mmap_lock while holding page lock. Fault path does it in
+ * reverse order. Trylock is a way to avoid deadlock.
+ */
+ if (mmap_write_trylock(mm)) {
+ if (!khugepaged_test_exit(mm)) {
+ struct mmu_notifier_range range;
+
+ mmu_notifier_range_init(&range,
+ MMU_NOTIFY_CLEAR, 0,
+ NULL, mm, addr,
+ addr + HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+ /* assume page table is clear */
+ _pmd = pmdp_collapse_flush(vma, addr, pmd);
+ mm_dec_nr_ptes(mm);
+ tlb_remove_table_sync_one();
+ pte_free(mm, pmd_pgtable(_pmd));
+ mmu_notifier_invalidate_range_end(&range);
+ }
+ mmap_write_unlock(mm);
+ } else {
+ /* Try again later */
+ khugepaged_add_pte_mapped_thp(mm, addr);
+ }
+ }
+ i_mmap_unlock_write(mapping);
+}
+
+/**
+ * collapse_file - collapse filemap/tmpfs/shmem pages into huge one.
+ *
+ * Basic scheme is simple, details are more complex:
+ * - allocate and lock a new huge page;
+ * - scan page cache replacing old pages with the new one
+ * + swap/gup in pages if necessary;
+ * + fill in gaps;
+ * + keep old pages around in case rollback is required;
+ * - if replacing succeeds:
+ * + copy data over;
+ * + free old pages;
+ * + unlock huge page;
+ * - if replacing failed;
+ * + put all pages back and unfreeze them;
+ * + restore gaps in the page cache;
+ * + unlock and free huge page;
+ */
+static void collapse_file(struct mm_struct *mm,
+ struct file *file, pgoff_t start,
+ struct page **hpage, int node)
+{
+ struct address_space *mapping = file->f_mapping;
+ gfp_t gfp;
+ struct page *new_page;
+ pgoff_t index, end = start + HPAGE_PMD_NR;
+ LIST_HEAD(pagelist);
+ XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
+ int nr_none = 0, result = SCAN_SUCCEED;
+ bool is_shmem = shmem_file(file);
+
+ VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
+ VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
+
+ /* Only allocate from the target node */
+ gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
+
+ new_page = khugepaged_alloc_page(hpage, gfp, node);
+ if (!new_page) {
+ result = SCAN_ALLOC_HUGE_PAGE_FAIL;
+ goto out;
+ }
+
+ if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) {
+ result = SCAN_CGROUP_CHARGE_FAIL;
+ goto out;
+ }
+ count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
+
+ /* This will be less messy when we use multi-index entries */
+ do {
+ xas_lock_irq(&xas);
+ xas_create_range(&xas);
+ if (!xas_error(&xas))
+ break;
+ xas_unlock_irq(&xas);
+ if (!xas_nomem(&xas, GFP_KERNEL)) {
+ result = SCAN_FAIL;
+ goto out;
+ }
+ } while (1);
+
+ __SetPageLocked(new_page);
+ if (is_shmem)
+ __SetPageSwapBacked(new_page);
+ new_page->index = start;
+ new_page->mapping = mapping;
+
+ /*
+ * At this point the new_page is locked and not up-to-date.
+ * It's safe to insert it into the page cache, because nobody would
+ * be able to map it or use it in another way until we unlock it.
+ */
+
+ xas_set(&xas, start);
+ for (index = start; index < end; index++) {
+ struct page *page = xas_next(&xas);
+
+ VM_BUG_ON(index != xas.xa_index);
+ if (is_shmem) {
+ if (!page) {
+ /*
+ * Stop if extent has been truncated or
+ * hole-punched, and is now completely
+ * empty.
+ */
+ if (index == start) {
+ if (!xas_next_entry(&xas, end - 1)) {
+ result = SCAN_TRUNCATED;
+ goto xa_locked;
+ }
+ xas_set(&xas, index);
+ }
+ if (!shmem_charge(mapping->host, 1)) {
+ result = SCAN_FAIL;
+ goto xa_locked;
+ }
+ xas_store(&xas, new_page);
+ nr_none++;
+ continue;
+ }
+
+ if (xa_is_value(page) || !PageUptodate(page)) {
+ xas_unlock_irq(&xas);
+ /* swap in or instantiate fallocated page */
+ if (shmem_getpage(mapping->host, index, &page,
+ SGP_NOHUGE)) {
+ result = SCAN_FAIL;
+ goto xa_unlocked;
+ }
+ } else if (trylock_page(page)) {
+ get_page(page);
+ xas_unlock_irq(&xas);
+ } else {
+ result = SCAN_PAGE_LOCK;
+ goto xa_locked;
+ }
+ } else { /* !is_shmem */
+ if (!page || xa_is_value(page)) {
+ xas_unlock_irq(&xas);
+ page_cache_sync_readahead(mapping, &file->f_ra,
+ file, index,
+ end - index);
+ /* drain pagevecs to help isolate_lru_page() */
+ lru_add_drain();
+ page = find_lock_page(mapping, index);
+ if (unlikely(page == NULL)) {
+ result = SCAN_FAIL;
+ goto xa_unlocked;
+ }
+ } else if (PageDirty(page)) {
+ /*
+ * khugepaged only works on read-only fd,
+ * so this page is dirty because it hasn't
+ * been flushed since first write. There
+ * won't be new dirty pages.
+ *
+ * Trigger async flush here and hope the
+ * writeback is done when khugepaged
+ * revisits this page.
+ *
+ * This is a one-off situation. We are not
+ * forcing writeback in loop.
+ */
+ xas_unlock_irq(&xas);
+ filemap_flush(mapping);
+ result = SCAN_FAIL;
+ goto xa_unlocked;
+ } else if (PageWriteback(page)) {
+ xas_unlock_irq(&xas);
+ result = SCAN_FAIL;
+ goto xa_unlocked;
+ } else if (trylock_page(page)) {
+ get_page(page);
+ xas_unlock_irq(&xas);
+ } else {
+ result = SCAN_PAGE_LOCK;
+ goto xa_locked;
+ }
+ }
+
+ /*
+ * The page must be locked, so we can drop the i_pages lock
+ * without racing with truncate.
+ */
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+
+ /* make sure the page is up to date */
+ if (unlikely(!PageUptodate(page))) {
+ result = SCAN_FAIL;
+ goto out_unlock;
+ }
+
+ /*
+ * If file was truncated then extended, or hole-punched, before
+ * we locked the first page, then a THP might be there already.
+ */
+ if (PageTransCompound(page)) {
+ result = SCAN_PAGE_COMPOUND;
+ goto out_unlock;
+ }
+
+ if (page_mapping(page) != mapping) {
+ result = SCAN_TRUNCATED;
+ goto out_unlock;
+ }
+
+ if (!is_shmem && (PageDirty(page) ||
+ PageWriteback(page))) {
+ /*
+ * khugepaged only works on read-only fd, so this
+ * page is dirty because it hasn't been flushed
+ * since first write.
+ */
+ result = SCAN_FAIL;
+ goto out_unlock;
+ }
+
+ if (isolate_lru_page(page)) {
+ result = SCAN_DEL_PAGE_LRU;
+ goto out_unlock;
+ }
+
+ if (page_has_private(page) &&
+ !try_to_release_page(page, GFP_KERNEL)) {
+ result = SCAN_PAGE_HAS_PRIVATE;
+ putback_lru_page(page);
+ goto out_unlock;
+ }
+
+ if (page_mapped(page))
+ unmap_mapping_pages(mapping, index, 1, false);
+
+ xas_lock_irq(&xas);
+ xas_set(&xas, index);
+
+ VM_BUG_ON_PAGE(page != xas_load(&xas), page);
+ VM_BUG_ON_PAGE(page_mapped(page), page);
+
+ /*
+ * The page is expected to have page_count() == 3:
+ * - we hold a pin on it;
+ * - one reference from page cache;
+ * - one from isolate_lru_page;
+ */
+ if (!page_ref_freeze(page, 3)) {
+ result = SCAN_PAGE_COUNT;
+ xas_unlock_irq(&xas);
+ putback_lru_page(page);
+ goto out_unlock;
+ }
+
+ /*
+ * Add the page to the list to be able to undo the collapse if
+ * something go wrong.
+ */
+ list_add_tail(&page->lru, &pagelist);
+
+ /* Finally, replace with the new page. */
+ xas_store(&xas, new_page);
+ continue;
+out_unlock:
+ unlock_page(page);
+ put_page(page);
+ goto xa_unlocked;
+ }
+
+ if (is_shmem)
+ __inc_node_page_state(new_page, NR_SHMEM_THPS);
+ else {
+ __inc_node_page_state(new_page, NR_FILE_THPS);
+ filemap_nr_thps_inc(mapping);
+ }
+
+ if (nr_none) {
+ __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none);
+ if (is_shmem)
+ __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
+ }
+
+xa_locked:
+ xas_unlock_irq(&xas);
+xa_unlocked:
+
+ if (result == SCAN_SUCCEED) {
+ struct page *page, *tmp;
+
+ /*
+ * Replacing old pages with new one has succeeded, now we
+ * need to copy the content and free the old pages.
+ */
+ index = start;
+ list_for_each_entry_safe(page, tmp, &pagelist, lru) {
+ while (index < page->index) {
+ clear_highpage(new_page + (index % HPAGE_PMD_NR));
+ index++;
+ }
+ copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
+ page);
+ list_del(&page->lru);
+ page->mapping = NULL;
+ page_ref_unfreeze(page, 1);
+ ClearPageActive(page);
+ ClearPageUnevictable(page);
+ unlock_page(page);
+ put_page(page);
+ index++;
+ }
+ while (index < end) {
+ clear_highpage(new_page + (index % HPAGE_PMD_NR));
+ index++;
+ }
+
+ SetPageUptodate(new_page);
+ page_ref_add(new_page, HPAGE_PMD_NR - 1);
+ if (is_shmem)
+ set_page_dirty(new_page);
+ lru_cache_add(new_page);
+
+ /*
+ * Remove pte page tables, so we can re-fault the page as huge.
+ */
+ retract_page_tables(mapping, start);
+ *hpage = NULL;
+
+ khugepaged_pages_collapsed++;
+ } else {
+ struct page *page;
+
+ /* Something went wrong: roll back page cache changes */
+ xas_lock_irq(&xas);
+ mapping->nrpages -= nr_none;
+
+ if (is_shmem)
+ shmem_uncharge(mapping->host, nr_none);
+
+ xas_set(&xas, start);
+ xas_for_each(&xas, page, end - 1) {
+ page = list_first_entry_or_null(&pagelist,
+ struct page, lru);
+ if (!page || xas.xa_index < page->index) {
+ if (!nr_none)
+ break;
+ nr_none--;
+ /* Put holes back where they were */
+ xas_store(&xas, NULL);
+ continue;
+ }
+
+ VM_BUG_ON_PAGE(page->index != xas.xa_index, page);
+
+ /* Unfreeze the page. */
+ list_del(&page->lru);
+ page_ref_unfreeze(page, 2);
+ xas_store(&xas, page);
+ xas_pause(&xas);
+ xas_unlock_irq(&xas);
+ unlock_page(page);
+ putback_lru_page(page);
+ xas_lock_irq(&xas);
+ }
+ VM_BUG_ON(nr_none);
+ xas_unlock_irq(&xas);
+
+ new_page->mapping = NULL;
+ }
+
+ unlock_page(new_page);
+out:
+ VM_BUG_ON(!list_empty(&pagelist));
+ if (!IS_ERR_OR_NULL(*hpage))
+ mem_cgroup_uncharge(*hpage);
+ /* TODO: tracepoints */
+}
+
+static void khugepaged_scan_file(struct mm_struct *mm,
+ struct file *file, pgoff_t start, struct page **hpage)
+{
+ struct page *page = NULL;
+ struct address_space *mapping = file->f_mapping;
+ XA_STATE(xas, &mapping->i_pages, start);
+ int present, swap;
+ int node = NUMA_NO_NODE;
+ int result = SCAN_SUCCEED;
+
+ present = 0;
+ swap = 0;
+ memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
+ rcu_read_lock();
+ xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) {
+ if (xas_retry(&xas, page))
+ continue;
+
+ if (xa_is_value(page)) {
+ if (++swap > khugepaged_max_ptes_swap) {
+ result = SCAN_EXCEED_SWAP_PTE;
+ break;
+ }
+ continue;
+ }
+
+ if (PageTransCompound(page)) {
+ result = SCAN_PAGE_COMPOUND;
+ break;
+ }
+
+ node = page_to_nid(page);
+ if (khugepaged_scan_abort(node)) {
+ result = SCAN_SCAN_ABORT;
+ break;
+ }
+ khugepaged_node_load[node]++;
+
+ if (!PageLRU(page)) {
+ result = SCAN_PAGE_LRU;
+ break;
+ }
+
+ if (page_count(page) !=
+ 1 + page_mapcount(page) + page_has_private(page)) {
+ result = SCAN_PAGE_COUNT;
+ break;
+ }
+
+ /*
+ * We probably should check if the page is referenced here, but
+ * nobody would transfer pte_young() to PageReferenced() for us.
+ * And rmap walk here is just too costly...
+ */
+
+ present++;
+
+ if (need_resched()) {
+ xas_pause(&xas);
+ cond_resched_rcu();
+ }
+ }
+ rcu_read_unlock();
+
+ if (result == SCAN_SUCCEED) {
+ if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
+ result = SCAN_EXCEED_NONE_PTE;
+ } else {
+ node = khugepaged_find_target_node();
+ collapse_file(mm, file, start, hpage, node);
+ }
+ }
+
+ /* TODO: tracepoints */
+}
+#else
+static void khugepaged_scan_file(struct mm_struct *mm,
+ struct file *file, pgoff_t start, struct page **hpage)
+{
+ BUILD_BUG();
+}
+
+static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
+{
+ return 0;
+}
+#endif
+
+static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
+ struct page **hpage)
+ __releases(&khugepaged_mm_lock)
+ __acquires(&khugepaged_mm_lock)
+{
+ struct mm_slot *mm_slot;
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+ int progress = 0;
+
+ VM_BUG_ON(!pages);
+ lockdep_assert_held(&khugepaged_mm_lock);
+
+ if (khugepaged_scan.mm_slot)
+ mm_slot = khugepaged_scan.mm_slot;
+ else {
+ mm_slot = list_entry(khugepaged_scan.mm_head.next,
+ struct mm_slot, mm_node);
+ khugepaged_scan.address = 0;
+ khugepaged_scan.mm_slot = mm_slot;
+ }
+ spin_unlock(&khugepaged_mm_lock);
+ khugepaged_collapse_pte_mapped_thps(mm_slot);
+
+ mm = mm_slot->mm;
+ /*
+ * Don't wait for semaphore (to avoid long wait times). Just move to
+ * the next mm on the list.
+ */
+ vma = NULL;
+ if (unlikely(!mmap_read_trylock(mm)))
+ goto breakouterloop_mmap_lock;
+ if (likely(!khugepaged_test_exit(mm)))
+ vma = find_vma(mm, khugepaged_scan.address);
+
+ progress++;
+ for (; vma; vma = vma->vm_next) {
+ unsigned long hstart, hend;
+
+ cond_resched();
+ if (unlikely(khugepaged_test_exit(mm))) {
+ progress++;
+ break;
+ }
+ if (!hugepage_vma_check(vma, vma->vm_flags)) {
+skip:
+ progress++;
+ continue;
+ }
+ hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+ hend = vma->vm_end & HPAGE_PMD_MASK;
+ if (hstart >= hend)
+ goto skip;
+ if (khugepaged_scan.address > hend)
+ goto skip;
+ if (khugepaged_scan.address < hstart)
+ khugepaged_scan.address = hstart;
+ VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
+ if (shmem_file(vma->vm_file) && !shmem_huge_enabled(vma))
+ goto skip;
+
+ while (khugepaged_scan.address < hend) {
+ int ret;
+ cond_resched();
+ if (unlikely(khugepaged_test_exit(mm)))
+ goto breakouterloop;
+
+ VM_BUG_ON(khugepaged_scan.address < hstart ||
+ khugepaged_scan.address + HPAGE_PMD_SIZE >
+ hend);
+ if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
+ struct file *file = get_file(vma->vm_file);
+ pgoff_t pgoff = linear_page_index(vma,
+ khugepaged_scan.address);
+
+ mmap_read_unlock(mm);
+ ret = 1;
+ khugepaged_scan_file(mm, file, pgoff, hpage);
+ fput(file);
+ } else {
+ ret = khugepaged_scan_pmd(mm, vma,
+ khugepaged_scan.address,
+ hpage);
+ }
+ /* move to next address */
+ khugepaged_scan.address += HPAGE_PMD_SIZE;
+ progress += HPAGE_PMD_NR;
+ if (ret)
+ /* we released mmap_lock so break loop */
+ goto breakouterloop_mmap_lock;
+ if (progress >= pages)
+ goto breakouterloop;
+ }
+ }
+breakouterloop:
+ mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */
+breakouterloop_mmap_lock:
+
+ spin_lock(&khugepaged_mm_lock);
+ VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
+ /*
+ * Release the current mm_slot if this mm is about to die, or
+ * if we scanned all vmas of this mm.
+ */
+ if (khugepaged_test_exit(mm) || !vma) {
+ /*
+ * Make sure that if mm_users is reaching zero while
+ * khugepaged runs here, khugepaged_exit will find
+ * mm_slot not pointing to the exiting mm.
+ */
+ if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
+ khugepaged_scan.mm_slot = list_entry(
+ mm_slot->mm_node.next,
+ struct mm_slot, mm_node);
+ khugepaged_scan.address = 0;
+ } else {
+ khugepaged_scan.mm_slot = NULL;
+ khugepaged_full_scans++;
+ }
+
+ collect_mm_slot(mm_slot);
+ }
+
+ return progress;
+}
+
+static int khugepaged_has_work(void)
+{
+ return !list_empty(&khugepaged_scan.mm_head) &&
+ khugepaged_enabled();
+}
+
+static int khugepaged_wait_event(void)
+{
+ return !list_empty(&khugepaged_scan.mm_head) ||
+ kthread_should_stop();
+}
+
+static void khugepaged_do_scan(void)
+{
+ struct page *hpage = NULL;
+ unsigned int progress = 0, pass_through_head = 0;
+ unsigned int pages = khugepaged_pages_to_scan;
+ bool wait = true;
+
+ barrier(); /* write khugepaged_pages_to_scan to local stack */
+
+ lru_add_drain_all();
+
+ while (progress < pages) {
+ if (!khugepaged_prealloc_page(&hpage, &wait))
+ break;
+
+ cond_resched();
+
+ if (unlikely(kthread_should_stop() || try_to_freeze()))
+ break;
+
+ spin_lock(&khugepaged_mm_lock);
+ if (!khugepaged_scan.mm_slot)
+ pass_through_head++;
+ if (khugepaged_has_work() &&
+ pass_through_head < 2)
+ progress += khugepaged_scan_mm_slot(pages - progress,
+ &hpage);
+ else
+ progress = pages;
+ spin_unlock(&khugepaged_mm_lock);
+ }
+
+ if (!IS_ERR_OR_NULL(hpage))
+ put_page(hpage);
+}
+
+static bool khugepaged_should_wakeup(void)
+{
+ return kthread_should_stop() ||
+ time_after_eq(jiffies, khugepaged_sleep_expire);
+}
+
+static void khugepaged_wait_work(void)
+{
+ if (khugepaged_has_work()) {
+ const unsigned long scan_sleep_jiffies =
+ msecs_to_jiffies(khugepaged_scan_sleep_millisecs);
+
+ if (!scan_sleep_jiffies)
+ return;
+
+ khugepaged_sleep_expire = jiffies + scan_sleep_jiffies;
+ wait_event_freezable_timeout(khugepaged_wait,
+ khugepaged_should_wakeup(),
+ scan_sleep_jiffies);
+ return;
+ }
+
+ if (khugepaged_enabled())
+ wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
+}
+
+static int khugepaged(void *none)
+{
+ struct mm_slot *mm_slot;
+
+ set_freezable();
+ set_user_nice(current, MAX_NICE);
+
+ while (!kthread_should_stop()) {
+ khugepaged_do_scan();
+ khugepaged_wait_work();
+ }
+
+ spin_lock(&khugepaged_mm_lock);
+ mm_slot = khugepaged_scan.mm_slot;
+ khugepaged_scan.mm_slot = NULL;
+ if (mm_slot)
+ collect_mm_slot(mm_slot);
+ spin_unlock(&khugepaged_mm_lock);
+ return 0;
+}
+
+static void set_recommended_min_free_kbytes(void)
+{
+ struct zone *zone;
+ int nr_zones = 0;
+ unsigned long recommended_min;
+
+ for_each_populated_zone(zone) {
+ /*
+ * We don't need to worry about fragmentation of
+ * ZONE_MOVABLE since it only has movable pages.
+ */
+ if (zone_idx(zone) > gfp_zone(GFP_USER))
+ continue;
+
+ nr_zones++;
+ }
+
+ /* Ensure 2 pageblocks are free to assist fragmentation avoidance */
+ recommended_min = pageblock_nr_pages * nr_zones * 2;
+
+ /*
+ * Make sure that on average at least two pageblocks are almost free
+ * of another type, one for a migratetype to fall back to and a
+ * second to avoid subsequent fallbacks of other types There are 3
+ * MIGRATE_TYPES we care about.
+ */
+ recommended_min += pageblock_nr_pages * nr_zones *
+ MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
+
+ /* don't ever allow to reserve more than 5% of the lowmem */
+ recommended_min = min(recommended_min,
+ (unsigned long) nr_free_buffer_pages() / 20);
+ recommended_min <<= (PAGE_SHIFT-10);
+
+ if (recommended_min > min_free_kbytes) {
+ if (user_min_free_kbytes >= 0)
+ pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
+ min_free_kbytes, recommended_min);
+
+ min_free_kbytes = recommended_min;
+ }
+ setup_per_zone_wmarks();
+}
+
+int start_stop_khugepaged(void)
+{
+ int err = 0;
+
+ mutex_lock(&khugepaged_mutex);
+ if (khugepaged_enabled()) {
+ if (!khugepaged_thread)
+ khugepaged_thread = kthread_run(khugepaged, NULL,
+ "khugepaged");
+ if (IS_ERR(khugepaged_thread)) {
+ pr_err("khugepaged: kthread_run(khugepaged) failed\n");
+ err = PTR_ERR(khugepaged_thread);
+ khugepaged_thread = NULL;
+ goto fail;
+ }
+
+ if (!list_empty(&khugepaged_scan.mm_head))
+ wake_up_interruptible(&khugepaged_wait);
+
+ set_recommended_min_free_kbytes();
+ } else if (khugepaged_thread) {
+ kthread_stop(khugepaged_thread);
+ khugepaged_thread = NULL;
+ }
+fail:
+ mutex_unlock(&khugepaged_mutex);
+ return err;
+}
+
+void khugepaged_min_free_kbytes_update(void)
+{
+ mutex_lock(&khugepaged_mutex);
+ if (khugepaged_enabled() && khugepaged_thread)
+ set_recommended_min_free_kbytes();
+ mutex_unlock(&khugepaged_mutex);
+}
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
new file mode 100644
index 000000000..4801751cb
--- /dev/null
+++ b/mm/kmemleak.c
@@ -0,0 +1,1995 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * mm/kmemleak.c
+ *
+ * Copyright (C) 2008 ARM Limited
+ * Written by Catalin Marinas <catalin.marinas@arm.com>
+ *
+ * For more information on the algorithm and kmemleak usage, please see
+ * Documentation/dev-tools/kmemleak.rst.
+ *
+ * Notes on locking
+ * ----------------
+ *
+ * The following locks and mutexes are used by kmemleak:
+ *
+ * - kmemleak_lock (raw_spinlock_t): protects the object_list modifications and
+ * accesses to the object_tree_root. The object_list is the main list
+ * holding the metadata (struct kmemleak_object) for the allocated memory
+ * blocks. The object_tree_root is a red black tree used to look-up
+ * metadata based on a pointer to the corresponding memory block. The
+ * kmemleak_object structures are added to the object_list and
+ * object_tree_root in the create_object() function called from the
+ * kmemleak_alloc() callback and removed in delete_object() called from the
+ * kmemleak_free() callback
+ * - kmemleak_object.lock (raw_spinlock_t): protects a kmemleak_object.
+ * Accesses to the metadata (e.g. count) are protected by this lock. Note
+ * that some members of this structure may be protected by other means
+ * (atomic or kmemleak_lock). This lock is also held when scanning the
+ * corresponding memory block to avoid the kernel freeing it via the
+ * kmemleak_free() callback. This is less heavyweight than holding a global
+ * lock like kmemleak_lock during scanning.
+ * - scan_mutex (mutex): ensures that only one thread may scan the memory for
+ * unreferenced objects at a time. The gray_list contains the objects which
+ * are already referenced or marked as false positives and need to be
+ * scanned. This list is only modified during a scanning episode when the
+ * scan_mutex is held. At the end of a scan, the gray_list is always empty.
+ * Note that the kmemleak_object.use_count is incremented when an object is
+ * added to the gray_list and therefore cannot be freed. This mutex also
+ * prevents multiple users of the "kmemleak" debugfs file together with
+ * modifications to the memory scanning parameters including the scan_thread
+ * pointer
+ *
+ * Locks and mutexes are acquired/nested in the following order:
+ *
+ * scan_mutex [-> object->lock] -> kmemleak_lock -> other_object->lock (SINGLE_DEPTH_NESTING)
+ *
+ * No kmemleak_lock and object->lock nesting is allowed outside scan_mutex
+ * regions.
+ *
+ * The kmemleak_object structures have a use_count incremented or decremented
+ * using the get_object()/put_object() functions. When the use_count becomes
+ * 0, this count can no longer be incremented and put_object() schedules the
+ * kmemleak_object freeing via an RCU callback. All calls to the get_object()
+ * function must be protected by rcu_read_lock() to avoid accessing a freed
+ * structure.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
+#include <linux/jiffies.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/kthread.h>
+#include <linux/rbtree.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/cpumask.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/rcupdate.h>
+#include <linux/stacktrace.h>
+#include <linux/cache.h>
+#include <linux/percpu.h>
+#include <linux/memblock.h>
+#include <linux/pfn.h>
+#include <linux/mmzone.h>
+#include <linux/slab.h>
+#include <linux/thread_info.h>
+#include <linux/err.h>
+#include <linux/uaccess.h>
+#include <linux/string.h>
+#include <linux/nodemask.h>
+#include <linux/mm.h>
+#include <linux/workqueue.h>
+#include <linux/crc32.h>
+
+#include <asm/sections.h>
+#include <asm/processor.h>
+#include <linux/atomic.h>
+
+#include <linux/kasan.h>
+#include <linux/kmemleak.h>
+#include <linux/memory_hotplug.h>
+
+/*
+ * Kmemleak configuration and common defines.
+ */
+#define MAX_TRACE 16 /* stack trace length */
+#define MSECS_MIN_AGE 5000 /* minimum object age for reporting */
+#define SECS_FIRST_SCAN 60 /* delay before the first scan */
+#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */
+#define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */
+
+#define BYTES_PER_POINTER sizeof(void *)
+
+/* GFP bitmask for kmemleak internal allocations */
+#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \
+ __GFP_NORETRY | __GFP_NOMEMALLOC | \
+ __GFP_NOWARN)
+
+/* scanning area inside a memory block */
+struct kmemleak_scan_area {
+ struct hlist_node node;
+ unsigned long start;
+ size_t size;
+};
+
+#define KMEMLEAK_GREY 0
+#define KMEMLEAK_BLACK -1
+
+/*
+ * Structure holding the metadata for each allocated memory block.
+ * Modifications to such objects should be made while holding the
+ * object->lock. Insertions or deletions from object_list, gray_list or
+ * rb_node are already protected by the corresponding locks or mutex (see
+ * the notes on locking above). These objects are reference-counted
+ * (use_count) and freed using the RCU mechanism.
+ */
+struct kmemleak_object {
+ raw_spinlock_t lock;
+ unsigned int flags; /* object status flags */
+ struct list_head object_list;
+ struct list_head gray_list;
+ struct rb_node rb_node;
+ struct rcu_head rcu; /* object_list lockless traversal */
+ /* object usage count; object freed when use_count == 0 */
+ atomic_t use_count;
+ unsigned long pointer;
+ size_t size;
+ /* pass surplus references to this pointer */
+ unsigned long excess_ref;
+ /* minimum number of a pointers found before it is considered leak */
+ int min_count;
+ /* the total number of pointers found pointing to this object */
+ int count;
+ /* checksum for detecting modified objects */
+ u32 checksum;
+ /* memory ranges to be scanned inside an object (empty for all) */
+ struct hlist_head area_list;
+ unsigned long trace[MAX_TRACE];
+ unsigned int trace_len;
+ unsigned long jiffies; /* creation timestamp */
+ pid_t pid; /* pid of the current task */
+ char comm[TASK_COMM_LEN]; /* executable name */
+};
+
+/* flag representing the memory block allocation status */
+#define OBJECT_ALLOCATED (1 << 0)
+/* flag set after the first reporting of an unreference object */
+#define OBJECT_REPORTED (1 << 1)
+/* flag set to not scan the object */
+#define OBJECT_NO_SCAN (1 << 2)
+/* flag set to fully scan the object when scan_area allocation failed */
+#define OBJECT_FULL_SCAN (1 << 3)
+
+#define HEX_PREFIX " "
+/* number of bytes to print per line; must be 16 or 32 */
+#define HEX_ROW_SIZE 16
+/* number of bytes to print at a time (1, 2, 4, 8) */
+#define HEX_GROUP_SIZE 1
+/* include ASCII after the hex output */
+#define HEX_ASCII 1
+/* max number of lines to be printed */
+#define HEX_MAX_LINES 2
+
+/* the list of all allocated objects */
+static LIST_HEAD(object_list);
+/* the list of gray-colored objects (see color_gray comment below) */
+static LIST_HEAD(gray_list);
+/* memory pool allocation */
+static struct kmemleak_object mem_pool[CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE];
+static int mem_pool_free_count = ARRAY_SIZE(mem_pool);
+static LIST_HEAD(mem_pool_free_list);
+/* search tree for object boundaries */
+static struct rb_root object_tree_root = RB_ROOT;
+/* protecting the access to object_list and object_tree_root */
+static DEFINE_RAW_SPINLOCK(kmemleak_lock);
+
+/* allocation caches for kmemleak internal data */
+static struct kmem_cache *object_cache;
+static struct kmem_cache *scan_area_cache;
+
+/* set if tracing memory operations is enabled */
+static int kmemleak_enabled = 1;
+/* same as above but only for the kmemleak_free() callback */
+static int kmemleak_free_enabled = 1;
+/* set in the late_initcall if there were no errors */
+static int kmemleak_initialized;
+/* set if a kmemleak warning was issued */
+static int kmemleak_warning;
+/* set if a fatal kmemleak error has occurred */
+static int kmemleak_error;
+
+/* minimum and maximum address that may be valid pointers */
+static unsigned long min_addr = ULONG_MAX;
+static unsigned long max_addr;
+
+static struct task_struct *scan_thread;
+/* used to avoid reporting of recently allocated objects */
+static unsigned long jiffies_min_age;
+static unsigned long jiffies_last_scan;
+/* delay between automatic memory scannings */
+static signed long jiffies_scan_wait;
+/* enables or disables the task stacks scanning */
+static int kmemleak_stack_scan = 1;
+/* protects the memory scanning, parameters and debug/kmemleak file access */
+static DEFINE_MUTEX(scan_mutex);
+/* setting kmemleak=on, will set this var, skipping the disable */
+static int kmemleak_skip_disable;
+/* If there are leaks that can be reported */
+static bool kmemleak_found_leaks;
+
+static bool kmemleak_verbose;
+module_param_named(verbose, kmemleak_verbose, bool, 0600);
+
+static void kmemleak_disable(void);
+
+/*
+ * Print a warning and dump the stack trace.
+ */
+#define kmemleak_warn(x...) do { \
+ pr_warn(x); \
+ dump_stack(); \
+ kmemleak_warning = 1; \
+} while (0)
+
+/*
+ * Macro invoked when a serious kmemleak condition occurred and cannot be
+ * recovered from. Kmemleak will be disabled and further allocation/freeing
+ * tracing no longer available.
+ */
+#define kmemleak_stop(x...) do { \
+ kmemleak_warn(x); \
+ kmemleak_disable(); \
+} while (0)
+
+#define warn_or_seq_printf(seq, fmt, ...) do { \
+ if (seq) \
+ seq_printf(seq, fmt, ##__VA_ARGS__); \
+ else \
+ pr_warn(fmt, ##__VA_ARGS__); \
+} while (0)
+
+static void warn_or_seq_hex_dump(struct seq_file *seq, int prefix_type,
+ int rowsize, int groupsize, const void *buf,
+ size_t len, bool ascii)
+{
+ if (seq)
+ seq_hex_dump(seq, HEX_PREFIX, prefix_type, rowsize, groupsize,
+ buf, len, ascii);
+ else
+ print_hex_dump(KERN_WARNING, pr_fmt(HEX_PREFIX), prefix_type,
+ rowsize, groupsize, buf, len, ascii);
+}
+
+/*
+ * Printing of the objects hex dump to the seq file. The number of lines to be
+ * printed is limited to HEX_MAX_LINES to prevent seq file spamming. The
+ * actual number of printed bytes depends on HEX_ROW_SIZE. It must be called
+ * with the object->lock held.
+ */
+static void hex_dump_object(struct seq_file *seq,
+ struct kmemleak_object *object)
+{
+ const u8 *ptr = (const u8 *)object->pointer;
+ size_t len;
+
+ /* limit the number of lines to HEX_MAX_LINES */
+ len = min_t(size_t, object->size, HEX_MAX_LINES * HEX_ROW_SIZE);
+
+ warn_or_seq_printf(seq, " hex dump (first %zu bytes):\n", len);
+ kasan_disable_current();
+ warn_or_seq_hex_dump(seq, DUMP_PREFIX_NONE, HEX_ROW_SIZE,
+ HEX_GROUP_SIZE, ptr, len, HEX_ASCII);
+ kasan_enable_current();
+}
+
+/*
+ * Object colors, encoded with count and min_count:
+ * - white - orphan object, not enough references to it (count < min_count)
+ * - gray - not orphan, not marked as false positive (min_count == 0) or
+ * sufficient references to it (count >= min_count)
+ * - black - ignore, it doesn't contain references (e.g. text section)
+ * (min_count == -1). No function defined for this color.
+ * Newly created objects don't have any color assigned (object->count == -1)
+ * before the next memory scan when they become white.
+ */
+static bool color_white(const struct kmemleak_object *object)
+{
+ return object->count != KMEMLEAK_BLACK &&
+ object->count < object->min_count;
+}
+
+static bool color_gray(const struct kmemleak_object *object)
+{
+ return object->min_count != KMEMLEAK_BLACK &&
+ object->count >= object->min_count;
+}
+
+/*
+ * Objects are considered unreferenced only if their color is white, they have
+ * not be deleted and have a minimum age to avoid false positives caused by
+ * pointers temporarily stored in CPU registers.
+ */
+static bool unreferenced_object(struct kmemleak_object *object)
+{
+ return (color_white(object) && object->flags & OBJECT_ALLOCATED) &&
+ time_before_eq(object->jiffies + jiffies_min_age,
+ jiffies_last_scan);
+}
+
+/*
+ * Printing of the unreferenced objects information to the seq file. The
+ * print_unreferenced function must be called with the object->lock held.
+ */
+static void print_unreferenced(struct seq_file *seq,
+ struct kmemleak_object *object)
+{
+ int i;
+ unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies);
+
+ warn_or_seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n",
+ object->pointer, object->size);
+ warn_or_seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n",
+ object->comm, object->pid, object->jiffies,
+ msecs_age / 1000, msecs_age % 1000);
+ hex_dump_object(seq, object);
+ warn_or_seq_printf(seq, " backtrace:\n");
+
+ for (i = 0; i < object->trace_len; i++) {
+ void *ptr = (void *)object->trace[i];
+ warn_or_seq_printf(seq, " [<%p>] %pS\n", ptr, ptr);
+ }
+}
+
+/*
+ * Print the kmemleak_object information. This function is used mainly for
+ * debugging special cases when kmemleak operations. It must be called with
+ * the object->lock held.
+ */
+static void dump_object_info(struct kmemleak_object *object)
+{
+ pr_notice("Object 0x%08lx (size %zu):\n",
+ object->pointer, object->size);
+ pr_notice(" comm \"%s\", pid %d, jiffies %lu\n",
+ object->comm, object->pid, object->jiffies);
+ pr_notice(" min_count = %d\n", object->min_count);
+ pr_notice(" count = %d\n", object->count);
+ pr_notice(" flags = 0x%x\n", object->flags);
+ pr_notice(" checksum = %u\n", object->checksum);
+ pr_notice(" backtrace:\n");
+ stack_trace_print(object->trace, object->trace_len, 4);
+}
+
+/*
+ * Look-up a memory block metadata (kmemleak_object) in the object search
+ * tree based on a pointer value. If alias is 0, only values pointing to the
+ * beginning of the memory block are allowed. The kmemleak_lock must be held
+ * when calling this function.
+ */
+static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
+{
+ struct rb_node *rb = object_tree_root.rb_node;
+
+ while (rb) {
+ struct kmemleak_object *object =
+ rb_entry(rb, struct kmemleak_object, rb_node);
+ if (ptr < object->pointer)
+ rb = object->rb_node.rb_left;
+ else if (object->pointer + object->size <= ptr)
+ rb = object->rb_node.rb_right;
+ else if (object->pointer == ptr || alias)
+ return object;
+ else {
+ kmemleak_warn("Found object by alias at 0x%08lx\n",
+ ptr);
+ dump_object_info(object);
+ break;
+ }
+ }
+ return NULL;
+}
+
+/*
+ * Increment the object use_count. Return 1 if successful or 0 otherwise. Note
+ * that once an object's use_count reached 0, the RCU freeing was already
+ * registered and the object should no longer be used. This function must be
+ * called under the protection of rcu_read_lock().
+ */
+static int get_object(struct kmemleak_object *object)
+{
+ return atomic_inc_not_zero(&object->use_count);
+}
+
+/*
+ * Memory pool allocation and freeing. kmemleak_lock must not be held.
+ */
+static struct kmemleak_object *mem_pool_alloc(gfp_t gfp)
+{
+ unsigned long flags;
+ struct kmemleak_object *object;
+
+ /* try the slab allocator first */
+ if (object_cache) {
+ object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
+ if (object)
+ return object;
+ }
+
+ /* slab allocation failed, try the memory pool */
+ raw_spin_lock_irqsave(&kmemleak_lock, flags);
+ object = list_first_entry_or_null(&mem_pool_free_list,
+ typeof(*object), object_list);
+ if (object)
+ list_del(&object->object_list);
+ else if (mem_pool_free_count)
+ object = &mem_pool[--mem_pool_free_count];
+ else
+ pr_warn_once("Memory pool empty, consider increasing CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE\n");
+ raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
+
+ return object;
+}
+
+/*
+ * Return the object to either the slab allocator or the memory pool.
+ */
+static void mem_pool_free(struct kmemleak_object *object)
+{
+ unsigned long flags;
+
+ if (object < mem_pool || object >= mem_pool + ARRAY_SIZE(mem_pool)) {
+ kmem_cache_free(object_cache, object);
+ return;
+ }
+
+ /* add the object to the memory pool free list */
+ raw_spin_lock_irqsave(&kmemleak_lock, flags);
+ list_add(&object->object_list, &mem_pool_free_list);
+ raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
+}
+
+/*
+ * RCU callback to free a kmemleak_object.
+ */
+static void free_object_rcu(struct rcu_head *rcu)
+{
+ struct hlist_node *tmp;
+ struct kmemleak_scan_area *area;
+ struct kmemleak_object *object =
+ container_of(rcu, struct kmemleak_object, rcu);
+
+ /*
+ * Once use_count is 0 (guaranteed by put_object), there is no other
+ * code accessing this object, hence no need for locking.
+ */
+ hlist_for_each_entry_safe(area, tmp, &object->area_list, node) {
+ hlist_del(&area->node);
+ kmem_cache_free(scan_area_cache, area);
+ }
+ mem_pool_free(object);
+}
+
+/*
+ * Decrement the object use_count. Once the count is 0, free the object using
+ * an RCU callback. Since put_object() may be called via the kmemleak_free() ->
+ * delete_object() path, the delayed RCU freeing ensures that there is no
+ * recursive call to the kernel allocator. Lock-less RCU object_list traversal
+ * is also possible.
+ */
+static void put_object(struct kmemleak_object *object)
+{
+ if (!atomic_dec_and_test(&object->use_count))
+ return;
+
+ /* should only get here after delete_object was called */
+ WARN_ON(object->flags & OBJECT_ALLOCATED);
+
+ /*
+ * It may be too early for the RCU callbacks, however, there is no
+ * concurrent object_list traversal when !object_cache and all objects
+ * came from the memory pool. Free the object directly.
+ */
+ if (object_cache)
+ call_rcu(&object->rcu, free_object_rcu);
+ else
+ free_object_rcu(&object->rcu);
+}
+
+/*
+ * Look up an object in the object search tree and increase its use_count.
+ */
+static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
+{
+ unsigned long flags;
+ struct kmemleak_object *object;
+
+ rcu_read_lock();
+ raw_spin_lock_irqsave(&kmemleak_lock, flags);
+ object = lookup_object(ptr, alias);
+ raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
+
+ /* check whether the object is still available */
+ if (object && !get_object(object))
+ object = NULL;
+ rcu_read_unlock();
+
+ return object;
+}
+
+/*
+ * Remove an object from the object_tree_root and object_list. Must be called
+ * with the kmemleak_lock held _if_ kmemleak is still enabled.
+ */
+static void __remove_object(struct kmemleak_object *object)
+{
+ rb_erase(&object->rb_node, &object_tree_root);
+ list_del_rcu(&object->object_list);
+}
+
+/*
+ * Look up an object in the object search tree and remove it from both
+ * object_tree_root and object_list. The returned object's use_count should be
+ * at least 1, as initially set by create_object().
+ */
+static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int alias)
+{
+ unsigned long flags;
+ struct kmemleak_object *object;
+
+ raw_spin_lock_irqsave(&kmemleak_lock, flags);
+ object = lookup_object(ptr, alias);
+ if (object)
+ __remove_object(object);
+ raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
+
+ return object;
+}
+
+/*
+ * Save stack trace to the given array of MAX_TRACE size.
+ */
+static int __save_stack_trace(unsigned long *trace)
+{
+ return stack_trace_save(trace, MAX_TRACE, 2);
+}
+
+/*
+ * Create the metadata (struct kmemleak_object) corresponding to an allocated
+ * memory block and add it to the object_list and object_tree_root.
+ */
+static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
+ int min_count, gfp_t gfp)
+{
+ unsigned long flags;
+ struct kmemleak_object *object, *parent;
+ struct rb_node **link, *rb_parent;
+ unsigned long untagged_ptr;
+
+ object = mem_pool_alloc(gfp);
+ if (!object) {
+ pr_warn("Cannot allocate a kmemleak_object structure\n");
+ kmemleak_disable();
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&object->object_list);
+ INIT_LIST_HEAD(&object->gray_list);
+ INIT_HLIST_HEAD(&object->area_list);
+ raw_spin_lock_init(&object->lock);
+ atomic_set(&object->use_count, 1);
+ object->flags = OBJECT_ALLOCATED;
+ object->pointer = ptr;
+ object->size = size;
+ object->excess_ref = 0;
+ object->min_count = min_count;
+ object->count = 0; /* white color initially */
+ object->jiffies = jiffies;
+ object->checksum = 0;
+
+ /* task information */
+ if (in_irq()) {
+ object->pid = 0;
+ strncpy(object->comm, "hardirq", sizeof(object->comm));
+ } else if (in_serving_softirq()) {
+ object->pid = 0;
+ strncpy(object->comm, "softirq", sizeof(object->comm));
+ } else {
+ object->pid = current->pid;
+ /*
+ * There is a small chance of a race with set_task_comm(),
+ * however using get_task_comm() here may cause locking
+ * dependency issues with current->alloc_lock. In the worst
+ * case, the command line is not correct.
+ */
+ strncpy(object->comm, current->comm, sizeof(object->comm));
+ }
+
+ /* kernel backtrace */
+ object->trace_len = __save_stack_trace(object->trace);
+
+ raw_spin_lock_irqsave(&kmemleak_lock, flags);
+
+ untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);
+ min_addr = min(min_addr, untagged_ptr);
+ max_addr = max(max_addr, untagged_ptr + size);
+ link = &object_tree_root.rb_node;
+ rb_parent = NULL;
+ while (*link) {
+ rb_parent = *link;
+ parent = rb_entry(rb_parent, struct kmemleak_object, rb_node);
+ if (ptr + size <= parent->pointer)
+ link = &parent->rb_node.rb_left;
+ else if (parent->pointer + parent->size <= ptr)
+ link = &parent->rb_node.rb_right;
+ else {
+ kmemleak_stop("Cannot insert 0x%lx into the object search tree (overlaps existing)\n",
+ ptr);
+ /*
+ * No need for parent->lock here since "parent" cannot
+ * be freed while the kmemleak_lock is held.
+ */
+ dump_object_info(parent);
+ kmem_cache_free(object_cache, object);
+ object = NULL;
+ goto out;
+ }
+ }
+ rb_link_node(&object->rb_node, rb_parent, link);
+ rb_insert_color(&object->rb_node, &object_tree_root);
+
+ list_add_tail_rcu(&object->object_list, &object_list);
+out:
+ raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
+ return object;
+}
+
+/*
+ * Mark the object as not allocated and schedule RCU freeing via put_object().
+ */
+static void __delete_object(struct kmemleak_object *object)
+{
+ unsigned long flags;
+
+ WARN_ON(!(object->flags & OBJECT_ALLOCATED));
+ WARN_ON(atomic_read(&object->use_count) < 1);
+
+ /*
+ * Locking here also ensures that the corresponding memory block
+ * cannot be freed when it is being scanned.
+ */
+ raw_spin_lock_irqsave(&object->lock, flags);
+ object->flags &= ~OBJECT_ALLOCATED;
+ raw_spin_unlock_irqrestore(&object->lock, flags);
+ put_object(object);
+}
+
+/*
+ * Look up the metadata (struct kmemleak_object) corresponding to ptr and
+ * delete it.
+ */
+static void delete_object_full(unsigned long ptr)
+{
+ struct kmemleak_object *object;
+
+ object = find_and_remove_object(ptr, 0);
+ if (!object) {
+#ifdef DEBUG
+ kmemleak_warn("Freeing unknown object at 0x%08lx\n",
+ ptr);
+#endif
+ return;
+ }
+ __delete_object(object);
+}
+
+/*
+ * Look up the metadata (struct kmemleak_object) corresponding to ptr and
+ * delete it. If the memory block is partially freed, the function may create
+ * additional metadata for the remaining parts of the block.
+ */
+static void delete_object_part(unsigned long ptr, size_t size)
+{
+ struct kmemleak_object *object;
+ unsigned long start, end;
+
+ object = find_and_remove_object(ptr, 1);
+ if (!object) {
+#ifdef DEBUG
+ kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n",
+ ptr, size);
+#endif
+ return;
+ }
+
+ /*
+ * Create one or two objects that may result from the memory block
+ * split. Note that partial freeing is only done by free_bootmem() and
+ * this happens before kmemleak_init() is called.
+ */
+ start = object->pointer;
+ end = object->pointer + object->size;
+ if (ptr > start)
+ create_object(start, ptr - start, object->min_count,
+ GFP_KERNEL);
+ if (ptr + size < end)
+ create_object(ptr + size, end - ptr - size, object->min_count,
+ GFP_KERNEL);
+
+ __delete_object(object);
+}
+
+static void __paint_it(struct kmemleak_object *object, int color)
+{
+ object->min_count = color;
+ if (color == KMEMLEAK_BLACK)
+ object->flags |= OBJECT_NO_SCAN;
+}
+
+static void paint_it(struct kmemleak_object *object, int color)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&object->lock, flags);
+ __paint_it(object, color);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
+}
+
+static void paint_ptr(unsigned long ptr, int color)
+{
+ struct kmemleak_object *object;
+
+ object = find_and_get_object(ptr, 0);
+ if (!object) {
+ kmemleak_warn("Trying to color unknown object at 0x%08lx as %s\n",
+ ptr,
+ (color == KMEMLEAK_GREY) ? "Grey" :
+ (color == KMEMLEAK_BLACK) ? "Black" : "Unknown");
+ return;
+ }
+ paint_it(object, color);
+ put_object(object);
+}
+
+/*
+ * Mark an object permanently as gray-colored so that it can no longer be
+ * reported as a leak. This is used in general to mark a false positive.
+ */
+static void make_gray_object(unsigned long ptr)
+{
+ paint_ptr(ptr, KMEMLEAK_GREY);
+}
+
+/*
+ * Mark the object as black-colored so that it is ignored from scans and
+ * reporting.
+ */
+static void make_black_object(unsigned long ptr)
+{
+ paint_ptr(ptr, KMEMLEAK_BLACK);
+}
+
+/*
+ * Add a scanning area to the object. If at least one such area is added,
+ * kmemleak will only scan these ranges rather than the whole memory block.
+ */
+static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
+{
+ unsigned long flags;
+ struct kmemleak_object *object;
+ struct kmemleak_scan_area *area = NULL;
+ unsigned long untagged_ptr;
+ unsigned long untagged_objp;
+
+ object = find_and_get_object(ptr, 1);
+ if (!object) {
+ kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n",
+ ptr);
+ return;
+ }
+
+ untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);
+ untagged_objp = (unsigned long)kasan_reset_tag((void *)object->pointer);
+
+ if (scan_area_cache)
+ area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
+
+ raw_spin_lock_irqsave(&object->lock, flags);
+ if (!area) {
+ pr_warn_once("Cannot allocate a scan area, scanning the full object\n");
+ /* mark the object for full scan to avoid false positives */
+ object->flags |= OBJECT_FULL_SCAN;
+ goto out_unlock;
+ }
+ if (size == SIZE_MAX) {
+ size = untagged_objp + object->size - untagged_ptr;
+ } else if (untagged_ptr + size > untagged_objp + object->size) {
+ kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr);
+ dump_object_info(object);
+ kmem_cache_free(scan_area_cache, area);
+ goto out_unlock;
+ }
+
+ INIT_HLIST_NODE(&area->node);
+ area->start = ptr;
+ area->size = size;
+
+ hlist_add_head(&area->node, &object->area_list);
+out_unlock:
+ raw_spin_unlock_irqrestore(&object->lock, flags);
+ put_object(object);
+}
+
+/*
+ * Any surplus references (object already gray) to 'ptr' are passed to
+ * 'excess_ref'. This is used in the vmalloc() case where a pointer to
+ * vm_struct may be used as an alternative reference to the vmalloc'ed object
+ * (see free_thread_stack()).
+ */
+static void object_set_excess_ref(unsigned long ptr, unsigned long excess_ref)
+{
+ unsigned long flags;
+ struct kmemleak_object *object;
+
+ object = find_and_get_object(ptr, 0);
+ if (!object) {
+ kmemleak_warn("Setting excess_ref on unknown object at 0x%08lx\n",
+ ptr);
+ return;
+ }
+
+ raw_spin_lock_irqsave(&object->lock, flags);
+ object->excess_ref = excess_ref;
+ raw_spin_unlock_irqrestore(&object->lock, flags);
+ put_object(object);
+}
+
+/*
+ * Set the OBJECT_NO_SCAN flag for the object corresponding to the give
+ * pointer. Such object will not be scanned by kmemleak but references to it
+ * are searched.
+ */
+static void object_no_scan(unsigned long ptr)
+{
+ unsigned long flags;
+ struct kmemleak_object *object;
+
+ object = find_and_get_object(ptr, 0);
+ if (!object) {
+ kmemleak_warn("Not scanning unknown object at 0x%08lx\n", ptr);
+ return;
+ }
+
+ raw_spin_lock_irqsave(&object->lock, flags);
+ object->flags |= OBJECT_NO_SCAN;
+ raw_spin_unlock_irqrestore(&object->lock, flags);
+ put_object(object);
+}
+
+/**
+ * kmemleak_alloc - register a newly allocated object
+ * @ptr: pointer to beginning of the object
+ * @size: size of the object
+ * @min_count: minimum number of references to this object. If during memory
+ * scanning a number of references less than @min_count is found,
+ * the object is reported as a memory leak. If @min_count is 0,
+ * the object is never reported as a leak. If @min_count is -1,
+ * the object is ignored (not scanned and not reported as a leak)
+ * @gfp: kmalloc() flags used for kmemleak internal memory allocations
+ *
+ * This function is called from the kernel allocators when a new object
+ * (memory block) is allocated (kmem_cache_alloc, kmalloc etc.).
+ */
+void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
+ gfp_t gfp)
+{
+ pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count);
+
+ if (kmemleak_enabled && ptr && !IS_ERR(ptr))
+ create_object((unsigned long)ptr, size, min_count, gfp);
+}
+EXPORT_SYMBOL_GPL(kmemleak_alloc);
+
+/**
+ * kmemleak_alloc_percpu - register a newly allocated __percpu object
+ * @ptr: __percpu pointer to beginning of the object
+ * @size: size of the object
+ * @gfp: flags used for kmemleak internal memory allocations
+ *
+ * This function is called from the kernel percpu allocator when a new object
+ * (memory block) is allocated (alloc_percpu).
+ */
+void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
+ gfp_t gfp)
+{
+ unsigned int cpu;
+
+ pr_debug("%s(0x%p, %zu)\n", __func__, ptr, size);
+
+ /*
+ * Percpu allocations are only scanned and not reported as leaks
+ * (min_count is set to 0).
+ */
+ if (kmemleak_enabled && ptr && !IS_ERR(ptr))
+ for_each_possible_cpu(cpu)
+ create_object((unsigned long)per_cpu_ptr(ptr, cpu),
+ size, 0, gfp);
+}
+EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu);
+
+/**
+ * kmemleak_vmalloc - register a newly vmalloc'ed object
+ * @area: pointer to vm_struct
+ * @size: size of the object
+ * @gfp: __vmalloc() flags used for kmemleak internal memory allocations
+ *
+ * This function is called from the vmalloc() kernel allocator when a new
+ * object (memory block) is allocated.
+ */
+void __ref kmemleak_vmalloc(const struct vm_struct *area, size_t size, gfp_t gfp)
+{
+ pr_debug("%s(0x%p, %zu)\n", __func__, area, size);
+
+ /*
+ * A min_count = 2 is needed because vm_struct contains a reference to
+ * the virtual address of the vmalloc'ed block.
+ */
+ if (kmemleak_enabled) {
+ create_object((unsigned long)area->addr, size, 2, gfp);
+ object_set_excess_ref((unsigned long)area,
+ (unsigned long)area->addr);
+ }
+}
+EXPORT_SYMBOL_GPL(kmemleak_vmalloc);
+
+/**
+ * kmemleak_free - unregister a previously registered object
+ * @ptr: pointer to beginning of the object
+ *
+ * This function is called from the kernel allocators when an object (memory
+ * block) is freed (kmem_cache_free, kfree, vfree etc.).
+ */
+void __ref kmemleak_free(const void *ptr)
+{
+ pr_debug("%s(0x%p)\n", __func__, ptr);
+
+ if (kmemleak_free_enabled && ptr && !IS_ERR(ptr))
+ delete_object_full((unsigned long)ptr);
+}
+EXPORT_SYMBOL_GPL(kmemleak_free);
+
+/**
+ * kmemleak_free_part - partially unregister a previously registered object
+ * @ptr: pointer to the beginning or inside the object. This also
+ * represents the start of the range to be freed
+ * @size: size to be unregistered
+ *
+ * This function is called when only a part of a memory block is freed
+ * (usually from the bootmem allocator).
+ */
+void __ref kmemleak_free_part(const void *ptr, size_t size)
+{
+ pr_debug("%s(0x%p)\n", __func__, ptr);
+
+ if (kmemleak_enabled && ptr && !IS_ERR(ptr))
+ delete_object_part((unsigned long)ptr, size);
+}
+EXPORT_SYMBOL_GPL(kmemleak_free_part);
+
+/**
+ * kmemleak_free_percpu - unregister a previously registered __percpu object
+ * @ptr: __percpu pointer to beginning of the object
+ *
+ * This function is called from the kernel percpu allocator when an object
+ * (memory block) is freed (free_percpu).
+ */
+void __ref kmemleak_free_percpu(const void __percpu *ptr)
+{
+ unsigned int cpu;
+
+ pr_debug("%s(0x%p)\n", __func__, ptr);
+
+ if (kmemleak_free_enabled && ptr && !IS_ERR(ptr))
+ for_each_possible_cpu(cpu)
+ delete_object_full((unsigned long)per_cpu_ptr(ptr,
+ cpu));
+}
+EXPORT_SYMBOL_GPL(kmemleak_free_percpu);
+
+/**
+ * kmemleak_update_trace - update object allocation stack trace
+ * @ptr: pointer to beginning of the object
+ *
+ * Override the object allocation stack trace for cases where the actual
+ * allocation place is not always useful.
+ */
+void __ref kmemleak_update_trace(const void *ptr)
+{
+ struct kmemleak_object *object;
+ unsigned long flags;
+
+ pr_debug("%s(0x%p)\n", __func__, ptr);
+
+ if (!kmemleak_enabled || IS_ERR_OR_NULL(ptr))
+ return;
+
+ object = find_and_get_object((unsigned long)ptr, 1);
+ if (!object) {
+#ifdef DEBUG
+ kmemleak_warn("Updating stack trace for unknown object at %p\n",
+ ptr);
+#endif
+ return;
+ }
+
+ raw_spin_lock_irqsave(&object->lock, flags);
+ object->trace_len = __save_stack_trace(object->trace);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
+
+ put_object(object);
+}
+EXPORT_SYMBOL(kmemleak_update_trace);
+
+/**
+ * kmemleak_not_leak - mark an allocated object as false positive
+ * @ptr: pointer to beginning of the object
+ *
+ * Calling this function on an object will cause the memory block to no longer
+ * be reported as leak and always be scanned.
+ */
+void __ref kmemleak_not_leak(const void *ptr)
+{
+ pr_debug("%s(0x%p)\n", __func__, ptr);
+
+ if (kmemleak_enabled && ptr && !IS_ERR(ptr))
+ make_gray_object((unsigned long)ptr);
+}
+EXPORT_SYMBOL(kmemleak_not_leak);
+
+/**
+ * kmemleak_ignore - ignore an allocated object
+ * @ptr: pointer to beginning of the object
+ *
+ * Calling this function on an object will cause the memory block to be
+ * ignored (not scanned and not reported as a leak). This is usually done when
+ * it is known that the corresponding block is not a leak and does not contain
+ * any references to other allocated memory blocks.
+ */
+void __ref kmemleak_ignore(const void *ptr)
+{
+ pr_debug("%s(0x%p)\n", __func__, ptr);
+
+ if (kmemleak_enabled && ptr && !IS_ERR(ptr))
+ make_black_object((unsigned long)ptr);
+}
+EXPORT_SYMBOL(kmemleak_ignore);
+
+/**
+ * kmemleak_scan_area - limit the range to be scanned in an allocated object
+ * @ptr: pointer to beginning or inside the object. This also
+ * represents the start of the scan area
+ * @size: size of the scan area
+ * @gfp: kmalloc() flags used for kmemleak internal memory allocations
+ *
+ * This function is used when it is known that only certain parts of an object
+ * contain references to other objects. Kmemleak will only scan these areas
+ * reducing the number false negatives.
+ */
+void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
+{
+ pr_debug("%s(0x%p)\n", __func__, ptr);
+
+ if (kmemleak_enabled && ptr && size && !IS_ERR(ptr))
+ add_scan_area((unsigned long)ptr, size, gfp);
+}
+EXPORT_SYMBOL(kmemleak_scan_area);
+
+/**
+ * kmemleak_no_scan - do not scan an allocated object
+ * @ptr: pointer to beginning of the object
+ *
+ * This function notifies kmemleak not to scan the given memory block. Useful
+ * in situations where it is known that the given object does not contain any
+ * references to other objects. Kmemleak will not scan such objects reducing
+ * the number of false negatives.
+ */
+void __ref kmemleak_no_scan(const void *ptr)
+{
+ pr_debug("%s(0x%p)\n", __func__, ptr);
+
+ if (kmemleak_enabled && ptr && !IS_ERR(ptr))
+ object_no_scan((unsigned long)ptr);
+}
+EXPORT_SYMBOL(kmemleak_no_scan);
+
+/**
+ * kmemleak_alloc_phys - similar to kmemleak_alloc but taking a physical
+ * address argument
+ * @phys: physical address of the object
+ * @size: size of the object
+ * @min_count: minimum number of references to this object.
+ * See kmemleak_alloc()
+ * @gfp: kmalloc() flags used for kmemleak internal memory allocations
+ */
+void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, int min_count,
+ gfp_t gfp)
+{
+ if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn)
+ kmemleak_alloc(__va(phys), size, min_count, gfp);
+}
+EXPORT_SYMBOL(kmemleak_alloc_phys);
+
+/**
+ * kmemleak_free_part_phys - similar to kmemleak_free_part but taking a
+ * physical address argument
+ * @phys: physical address if the beginning or inside an object. This
+ * also represents the start of the range to be freed
+ * @size: size to be unregistered
+ */
+void __ref kmemleak_free_part_phys(phys_addr_t phys, size_t size)
+{
+ if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn)
+ kmemleak_free_part(__va(phys), size);
+}
+EXPORT_SYMBOL(kmemleak_free_part_phys);
+
+/**
+ * kmemleak_not_leak_phys - similar to kmemleak_not_leak but taking a physical
+ * address argument
+ * @phys: physical address of the object
+ */
+void __ref kmemleak_not_leak_phys(phys_addr_t phys)
+{
+ if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn)
+ kmemleak_not_leak(__va(phys));
+}
+EXPORT_SYMBOL(kmemleak_not_leak_phys);
+
+/**
+ * kmemleak_ignore_phys - similar to kmemleak_ignore but taking a physical
+ * address argument
+ * @phys: physical address of the object
+ */
+void __ref kmemleak_ignore_phys(phys_addr_t phys)
+{
+ if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn)
+ kmemleak_ignore(__va(phys));
+}
+EXPORT_SYMBOL(kmemleak_ignore_phys);
+
+/*
+ * Update an object's checksum and return true if it was modified.
+ */
+static bool update_checksum(struct kmemleak_object *object)
+{
+ u32 old_csum = object->checksum;
+
+ kasan_disable_current();
+ kcsan_disable_current();
+ object->checksum = crc32(0, (void *)object->pointer, object->size);
+ kasan_enable_current();
+ kcsan_enable_current();
+
+ return object->checksum != old_csum;
+}
+
+/*
+ * Update an object's references. object->lock must be held by the caller.
+ */
+static void update_refs(struct kmemleak_object *object)
+{
+ if (!color_white(object)) {
+ /* non-orphan, ignored or new */
+ return;
+ }
+
+ /*
+ * Increase the object's reference count (number of pointers to the
+ * memory block). If this count reaches the required minimum, the
+ * object's color will become gray and it will be added to the
+ * gray_list.
+ */
+ object->count++;
+ if (color_gray(object)) {
+ /* put_object() called when removing from gray_list */
+ WARN_ON(!get_object(object));
+ list_add_tail(&object->gray_list, &gray_list);
+ }
+}
+
+/*
+ * Memory scanning is a long process and it needs to be interruptable. This
+ * function checks whether such interrupt condition occurred.
+ */
+static int scan_should_stop(void)
+{
+ if (!kmemleak_enabled)
+ return 1;
+
+ /*
+ * This function may be called from either process or kthread context,
+ * hence the need to check for both stop conditions.
+ */
+ if (current->mm)
+ return signal_pending(current);
+ else
+ return kthread_should_stop();
+
+ return 0;
+}
+
+/*
+ * Scan a memory block (exclusive range) for valid pointers and add those
+ * found to the gray list.
+ */
+static void scan_block(void *_start, void *_end,
+ struct kmemleak_object *scanned)
+{
+ unsigned long *ptr;
+ unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER);
+ unsigned long *end = _end - (BYTES_PER_POINTER - 1);
+ unsigned long flags;
+ unsigned long untagged_ptr;
+
+ raw_spin_lock_irqsave(&kmemleak_lock, flags);
+ for (ptr = start; ptr < end; ptr++) {
+ struct kmemleak_object *object;
+ unsigned long pointer;
+ unsigned long excess_ref;
+
+ if (scan_should_stop())
+ break;
+
+ kasan_disable_current();
+ pointer = *ptr;
+ kasan_enable_current();
+
+ untagged_ptr = (unsigned long)kasan_reset_tag((void *)pointer);
+ if (untagged_ptr < min_addr || untagged_ptr >= max_addr)
+ continue;
+
+ /*
+ * No need for get_object() here since we hold kmemleak_lock.
+ * object->use_count cannot be dropped to 0 while the object
+ * is still present in object_tree_root and object_list
+ * (with updates protected by kmemleak_lock).
+ */
+ object = lookup_object(pointer, 1);
+ if (!object)
+ continue;
+ if (object == scanned)
+ /* self referenced, ignore */
+ continue;
+
+ /*
+ * Avoid the lockdep recursive warning on object->lock being
+ * previously acquired in scan_object(). These locks are
+ * enclosed by scan_mutex.
+ */
+ raw_spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
+ /* only pass surplus references (object already gray) */
+ if (color_gray(object)) {
+ excess_ref = object->excess_ref;
+ /* no need for update_refs() if object already gray */
+ } else {
+ excess_ref = 0;
+ update_refs(object);
+ }
+ raw_spin_unlock(&object->lock);
+
+ if (excess_ref) {
+ object = lookup_object(excess_ref, 0);
+ if (!object)
+ continue;
+ if (object == scanned)
+ /* circular reference, ignore */
+ continue;
+ raw_spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
+ update_refs(object);
+ raw_spin_unlock(&object->lock);
+ }
+ }
+ raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
+}
+
+/*
+ * Scan a large memory block in MAX_SCAN_SIZE chunks to reduce the latency.
+ */
+#ifdef CONFIG_SMP
+static void scan_large_block(void *start, void *end)
+{
+ void *next;
+
+ while (start < end) {
+ next = min(start + MAX_SCAN_SIZE, end);
+ scan_block(start, next, NULL);
+ start = next;
+ cond_resched();
+ }
+}
+#endif
+
+/*
+ * Scan a memory block corresponding to a kmemleak_object. A condition is
+ * that object->use_count >= 1.
+ */
+static void scan_object(struct kmemleak_object *object)
+{
+ struct kmemleak_scan_area *area;
+ unsigned long flags;
+
+ /*
+ * Once the object->lock is acquired, the corresponding memory block
+ * cannot be freed (the same lock is acquired in delete_object).
+ */
+ raw_spin_lock_irqsave(&object->lock, flags);
+ if (object->flags & OBJECT_NO_SCAN)
+ goto out;
+ if (!(object->flags & OBJECT_ALLOCATED))
+ /* already freed object */
+ goto out;
+ if (hlist_empty(&object->area_list) ||
+ object->flags & OBJECT_FULL_SCAN) {
+ void *start = (void *)object->pointer;
+ void *end = (void *)(object->pointer + object->size);
+ void *next;
+
+ do {
+ next = min(start + MAX_SCAN_SIZE, end);
+ scan_block(start, next, object);
+
+ start = next;
+ if (start >= end)
+ break;
+
+ raw_spin_unlock_irqrestore(&object->lock, flags);
+ cond_resched();
+ raw_spin_lock_irqsave(&object->lock, flags);
+ } while (object->flags & OBJECT_ALLOCATED);
+ } else
+ hlist_for_each_entry(area, &object->area_list, node)
+ scan_block((void *)area->start,
+ (void *)(area->start + area->size),
+ object);
+out:
+ raw_spin_unlock_irqrestore(&object->lock, flags);
+}
+
+/*
+ * Scan the objects already referenced (gray objects). More objects will be
+ * referenced and, if there are no memory leaks, all the objects are scanned.
+ */
+static void scan_gray_list(void)
+{
+ struct kmemleak_object *object, *tmp;
+
+ /*
+ * The list traversal is safe for both tail additions and removals
+ * from inside the loop. The kmemleak objects cannot be freed from
+ * outside the loop because their use_count was incremented.
+ */
+ object = list_entry(gray_list.next, typeof(*object), gray_list);
+ while (&object->gray_list != &gray_list) {
+ cond_resched();
+
+ /* may add new objects to the list */
+ if (!scan_should_stop())
+ scan_object(object);
+
+ tmp = list_entry(object->gray_list.next, typeof(*object),
+ gray_list);
+
+ /* remove the object from the list and release it */
+ list_del(&object->gray_list);
+ put_object(object);
+
+ object = tmp;
+ }
+ WARN_ON(!list_empty(&gray_list));
+}
+
+/*
+ * Scan data sections and all the referenced memory blocks allocated via the
+ * kernel's standard allocators. This function must be called with the
+ * scan_mutex held.
+ */
+static void kmemleak_scan(void)
+{
+ unsigned long flags;
+ struct kmemleak_object *object;
+ struct zone *zone;
+ int __maybe_unused i;
+ int new_leaks = 0;
+
+ jiffies_last_scan = jiffies;
+
+ /* prepare the kmemleak_object's */
+ rcu_read_lock();
+ list_for_each_entry_rcu(object, &object_list, object_list) {
+ raw_spin_lock_irqsave(&object->lock, flags);
+#ifdef DEBUG
+ /*
+ * With a few exceptions there should be a maximum of
+ * 1 reference to any object at this point.
+ */
+ if (atomic_read(&object->use_count) > 1) {
+ pr_debug("object->use_count = %d\n",
+ atomic_read(&object->use_count));
+ dump_object_info(object);
+ }
+#endif
+ /* reset the reference count (whiten the object) */
+ object->count = 0;
+ if (color_gray(object) && get_object(object))
+ list_add_tail(&object->gray_list, &gray_list);
+
+ raw_spin_unlock_irqrestore(&object->lock, flags);
+ }
+ rcu_read_unlock();
+
+#ifdef CONFIG_SMP
+ /* per-cpu sections scanning */
+ for_each_possible_cpu(i)
+ scan_large_block(__per_cpu_start + per_cpu_offset(i),
+ __per_cpu_end + per_cpu_offset(i));
+#endif
+
+ /*
+ * Struct page scanning for each node.
+ */
+ get_online_mems();
+ for_each_populated_zone(zone) {
+ unsigned long start_pfn = zone->zone_start_pfn;
+ unsigned long end_pfn = zone_end_pfn(zone);
+ unsigned long pfn;
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ struct page *page = pfn_to_online_page(pfn);
+
+ if (!page)
+ continue;
+
+ /* only scan pages belonging to this zone */
+ if (page_zone(page) != zone)
+ continue;
+ /* only scan if page is in use */
+ if (page_count(page) == 0)
+ continue;
+ scan_block(page, page + 1, NULL);
+ if (!(pfn & 63))
+ cond_resched();
+ }
+ }
+ put_online_mems();
+
+ /*
+ * Scanning the task stacks (may introduce false negatives).
+ */
+ if (kmemleak_stack_scan) {
+ struct task_struct *p, *g;
+
+ rcu_read_lock();
+ for_each_process_thread(g, p) {
+ void *stack = try_get_task_stack(p);
+ if (stack) {
+ scan_block(stack, stack + THREAD_SIZE, NULL);
+ put_task_stack(p);
+ }
+ }
+ rcu_read_unlock();
+ }
+
+ /*
+ * Scan the objects already referenced from the sections scanned
+ * above.
+ */
+ scan_gray_list();
+
+ /*
+ * Check for new or unreferenced objects modified since the previous
+ * scan and color them gray until the next scan.
+ */
+ rcu_read_lock();
+ list_for_each_entry_rcu(object, &object_list, object_list) {
+ raw_spin_lock_irqsave(&object->lock, flags);
+ if (color_white(object) && (object->flags & OBJECT_ALLOCATED)
+ && update_checksum(object) && get_object(object)) {
+ /* color it gray temporarily */
+ object->count = object->min_count;
+ list_add_tail(&object->gray_list, &gray_list);
+ }
+ raw_spin_unlock_irqrestore(&object->lock, flags);
+ }
+ rcu_read_unlock();
+
+ /*
+ * Re-scan the gray list for modified unreferenced objects.
+ */
+ scan_gray_list();
+
+ /*
+ * If scanning was stopped do not report any new unreferenced objects.
+ */
+ if (scan_should_stop())
+ return;
+
+ /*
+ * Scanning result reporting.
+ */
+ rcu_read_lock();
+ list_for_each_entry_rcu(object, &object_list, object_list) {
+ raw_spin_lock_irqsave(&object->lock, flags);
+ if (unreferenced_object(object) &&
+ !(object->flags & OBJECT_REPORTED)) {
+ object->flags |= OBJECT_REPORTED;
+
+ if (kmemleak_verbose)
+ print_unreferenced(NULL, object);
+
+ new_leaks++;
+ }
+ raw_spin_unlock_irqrestore(&object->lock, flags);
+ }
+ rcu_read_unlock();
+
+ if (new_leaks) {
+ kmemleak_found_leaks = true;
+
+ pr_info("%d new suspected memory leaks (see /sys/kernel/debug/kmemleak)\n",
+ new_leaks);
+ }
+
+}
+
+/*
+ * Thread function performing automatic memory scanning. Unreferenced objects
+ * at the end of a memory scan are reported but only the first time.
+ */
+static int kmemleak_scan_thread(void *arg)
+{
+ static int first_run = IS_ENABLED(CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN);
+
+ pr_info("Automatic memory scanning thread started\n");
+ set_user_nice(current, 10);
+
+ /*
+ * Wait before the first scan to allow the system to fully initialize.
+ */
+ if (first_run) {
+ signed long timeout = msecs_to_jiffies(SECS_FIRST_SCAN * 1000);
+ first_run = 0;
+ while (timeout && !kthread_should_stop())
+ timeout = schedule_timeout_interruptible(timeout);
+ }
+
+ while (!kthread_should_stop()) {
+ signed long timeout = jiffies_scan_wait;
+
+ mutex_lock(&scan_mutex);
+ kmemleak_scan();
+ mutex_unlock(&scan_mutex);
+
+ /* wait before the next scan */
+ while (timeout && !kthread_should_stop())
+ timeout = schedule_timeout_interruptible(timeout);
+ }
+
+ pr_info("Automatic memory scanning thread ended\n");
+
+ return 0;
+}
+
+/*
+ * Start the automatic memory scanning thread. This function must be called
+ * with the scan_mutex held.
+ */
+static void start_scan_thread(void)
+{
+ if (scan_thread)
+ return;
+ scan_thread = kthread_run(kmemleak_scan_thread, NULL, "kmemleak");
+ if (IS_ERR(scan_thread)) {
+ pr_warn("Failed to create the scan thread\n");
+ scan_thread = NULL;
+ }
+}
+
+/*
+ * Stop the automatic memory scanning thread.
+ */
+static void stop_scan_thread(void)
+{
+ if (scan_thread) {
+ kthread_stop(scan_thread);
+ scan_thread = NULL;
+ }
+}
+
+/*
+ * Iterate over the object_list and return the first valid object at or after
+ * the required position with its use_count incremented. The function triggers
+ * a memory scanning when the pos argument points to the first position.
+ */
+static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct kmemleak_object *object;
+ loff_t n = *pos;
+ int err;
+
+ err = mutex_lock_interruptible(&scan_mutex);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(object, &object_list, object_list) {
+ if (n-- > 0)
+ continue;
+ if (get_object(object))
+ goto out;
+ }
+ object = NULL;
+out:
+ return object;
+}
+
+/*
+ * Return the next object in the object_list. The function decrements the
+ * use_count of the previous object and increases that of the next one.
+ */
+static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct kmemleak_object *prev_obj = v;
+ struct kmemleak_object *next_obj = NULL;
+ struct kmemleak_object *obj = prev_obj;
+
+ ++(*pos);
+
+ list_for_each_entry_continue_rcu(obj, &object_list, object_list) {
+ if (get_object(obj)) {
+ next_obj = obj;
+ break;
+ }
+ }
+
+ put_object(prev_obj);
+ return next_obj;
+}
+
+/*
+ * Decrement the use_count of the last object required, if any.
+ */
+static void kmemleak_seq_stop(struct seq_file *seq, void *v)
+{
+ if (!IS_ERR(v)) {
+ /*
+ * kmemleak_seq_start may return ERR_PTR if the scan_mutex
+ * waiting was interrupted, so only release it if !IS_ERR.
+ */
+ rcu_read_unlock();
+ mutex_unlock(&scan_mutex);
+ if (v)
+ put_object(v);
+ }
+}
+
+/*
+ * Print the information for an unreferenced object to the seq file.
+ */
+static int kmemleak_seq_show(struct seq_file *seq, void *v)
+{
+ struct kmemleak_object *object = v;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&object->lock, flags);
+ if ((object->flags & OBJECT_REPORTED) && unreferenced_object(object))
+ print_unreferenced(seq, object);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
+ return 0;
+}
+
+static const struct seq_operations kmemleak_seq_ops = {
+ .start = kmemleak_seq_start,
+ .next = kmemleak_seq_next,
+ .stop = kmemleak_seq_stop,
+ .show = kmemleak_seq_show,
+};
+
+static int kmemleak_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &kmemleak_seq_ops);
+}
+
+static int dump_str_object_info(const char *str)
+{
+ unsigned long flags;
+ struct kmemleak_object *object;
+ unsigned long addr;
+
+ if (kstrtoul(str, 0, &addr))
+ return -EINVAL;
+ object = find_and_get_object(addr, 0);
+ if (!object) {
+ pr_info("Unknown object at 0x%08lx\n", addr);
+ return -EINVAL;
+ }
+
+ raw_spin_lock_irqsave(&object->lock, flags);
+ dump_object_info(object);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
+
+ put_object(object);
+ return 0;
+}
+
+/*
+ * We use grey instead of black to ensure we can do future scans on the same
+ * objects. If we did not do future scans these black objects could
+ * potentially contain references to newly allocated objects in the future and
+ * we'd end up with false positives.
+ */
+static void kmemleak_clear(void)
+{
+ struct kmemleak_object *object;
+ unsigned long flags;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(object, &object_list, object_list) {
+ raw_spin_lock_irqsave(&object->lock, flags);
+ if ((object->flags & OBJECT_REPORTED) &&
+ unreferenced_object(object))
+ __paint_it(object, KMEMLEAK_GREY);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
+ }
+ rcu_read_unlock();
+
+ kmemleak_found_leaks = false;
+}
+
+static void __kmemleak_do_cleanup(void);
+
+/*
+ * File write operation to configure kmemleak at run-time. The following
+ * commands can be written to the /sys/kernel/debug/kmemleak file:
+ * off - disable kmemleak (irreversible)
+ * stack=on - enable the task stacks scanning
+ * stack=off - disable the tasks stacks scanning
+ * scan=on - start the automatic memory scanning thread
+ * scan=off - stop the automatic memory scanning thread
+ * scan=... - set the automatic memory scanning period in seconds (0 to
+ * disable it)
+ * scan - trigger a memory scan
+ * clear - mark all current reported unreferenced kmemleak objects as
+ * grey to ignore printing them, or free all kmemleak objects
+ * if kmemleak has been disabled.
+ * dump=... - dump information about the object found at the given address
+ */
+static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
+ size_t size, loff_t *ppos)
+{
+ char buf[64];
+ int buf_size;
+ int ret;
+
+ buf_size = min(size, (sizeof(buf) - 1));
+ if (strncpy_from_user(buf, user_buf, buf_size) < 0)
+ return -EFAULT;
+ buf[buf_size] = 0;
+
+ ret = mutex_lock_interruptible(&scan_mutex);
+ if (ret < 0)
+ return ret;
+
+ if (strncmp(buf, "clear", 5) == 0) {
+ if (kmemleak_enabled)
+ kmemleak_clear();
+ else
+ __kmemleak_do_cleanup();
+ goto out;
+ }
+
+ if (!kmemleak_enabled) {
+ ret = -EPERM;
+ goto out;
+ }
+
+ if (strncmp(buf, "off", 3) == 0)
+ kmemleak_disable();
+ else if (strncmp(buf, "stack=on", 8) == 0)
+ kmemleak_stack_scan = 1;
+ else if (strncmp(buf, "stack=off", 9) == 0)
+ kmemleak_stack_scan = 0;
+ else if (strncmp(buf, "scan=on", 7) == 0)
+ start_scan_thread();
+ else if (strncmp(buf, "scan=off", 8) == 0)
+ stop_scan_thread();
+ else if (strncmp(buf, "scan=", 5) == 0) {
+ unsigned long secs;
+
+ ret = kstrtoul(buf + 5, 0, &secs);
+ if (ret < 0)
+ goto out;
+ stop_scan_thread();
+ if (secs) {
+ jiffies_scan_wait = msecs_to_jiffies(secs * 1000);
+ start_scan_thread();
+ }
+ } else if (strncmp(buf, "scan", 4) == 0)
+ kmemleak_scan();
+ else if (strncmp(buf, "dump=", 5) == 0)
+ ret = dump_str_object_info(buf + 5);
+ else
+ ret = -EINVAL;
+
+out:
+ mutex_unlock(&scan_mutex);
+ if (ret < 0)
+ return ret;
+
+ /* ignore the rest of the buffer, only one command at a time */
+ *ppos += size;
+ return size;
+}
+
+static const struct file_operations kmemleak_fops = {
+ .owner = THIS_MODULE,
+ .open = kmemleak_open,
+ .read = seq_read,
+ .write = kmemleak_write,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static void __kmemleak_do_cleanup(void)
+{
+ struct kmemleak_object *object, *tmp;
+
+ /*
+ * Kmemleak has already been disabled, no need for RCU list traversal
+ * or kmemleak_lock held.
+ */
+ list_for_each_entry_safe(object, tmp, &object_list, object_list) {
+ __remove_object(object);
+ __delete_object(object);
+ }
+}
+
+/*
+ * Stop the memory scanning thread and free the kmemleak internal objects if
+ * no previous scan thread (otherwise, kmemleak may still have some useful
+ * information on memory leaks).
+ */
+static void kmemleak_do_cleanup(struct work_struct *work)
+{
+ stop_scan_thread();
+
+ mutex_lock(&scan_mutex);
+ /*
+ * Once it is made sure that kmemleak_scan has stopped, it is safe to no
+ * longer track object freeing. Ordering of the scan thread stopping and
+ * the memory accesses below is guaranteed by the kthread_stop()
+ * function.
+ */
+ kmemleak_free_enabled = 0;
+ mutex_unlock(&scan_mutex);
+
+ if (!kmemleak_found_leaks)
+ __kmemleak_do_cleanup();
+ else
+ pr_info("Kmemleak disabled without freeing internal data. Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\".\n");
+}
+
+static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup);
+
+/*
+ * Disable kmemleak. No memory allocation/freeing will be traced once this
+ * function is called. Disabling kmemleak is an irreversible operation.
+ */
+static void kmemleak_disable(void)
+{
+ /* atomically check whether it was already invoked */
+ if (cmpxchg(&kmemleak_error, 0, 1))
+ return;
+
+ /* stop any memory operation tracing */
+ kmemleak_enabled = 0;
+
+ /* check whether it is too early for a kernel thread */
+ if (kmemleak_initialized)
+ schedule_work(&cleanup_work);
+ else
+ kmemleak_free_enabled = 0;
+
+ pr_info("Kernel memory leak detector disabled\n");
+}
+
+/*
+ * Allow boot-time kmemleak disabling (enabled by default).
+ */
+static int __init kmemleak_boot_config(char *str)
+{
+ if (!str)
+ return -EINVAL;
+ if (strcmp(str, "off") == 0)
+ kmemleak_disable();
+ else if (strcmp(str, "on") == 0)
+ kmemleak_skip_disable = 1;
+ else
+ return -EINVAL;
+ return 0;
+}
+early_param("kmemleak", kmemleak_boot_config);
+
+/*
+ * Kmemleak initialization.
+ */
+void __init kmemleak_init(void)
+{
+#ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF
+ if (!kmemleak_skip_disable) {
+ kmemleak_disable();
+ return;
+ }
+#endif
+
+ if (kmemleak_error)
+ return;
+
+ jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
+ jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);
+
+ object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
+ scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
+
+ /* register the data/bss sections */
+ create_object((unsigned long)_sdata, _edata - _sdata,
+ KMEMLEAK_GREY, GFP_ATOMIC);
+ create_object((unsigned long)__bss_start, __bss_stop - __bss_start,
+ KMEMLEAK_GREY, GFP_ATOMIC);
+ /* only register .data..ro_after_init if not within .data */
+ if (&__start_ro_after_init < &_sdata || &__end_ro_after_init > &_edata)
+ create_object((unsigned long)__start_ro_after_init,
+ __end_ro_after_init - __start_ro_after_init,
+ KMEMLEAK_GREY, GFP_ATOMIC);
+}
+
+/*
+ * Late initialization function.
+ */
+static int __init kmemleak_late_init(void)
+{
+ kmemleak_initialized = 1;
+
+ debugfs_create_file("kmemleak", 0644, NULL, NULL, &kmemleak_fops);
+
+ if (kmemleak_error) {
+ /*
+ * Some error occurred and kmemleak was disabled. There is a
+ * small chance that kmemleak_disable() was called immediately
+ * after setting kmemleak_initialized and we may end up with
+ * two clean-up threads but serialized by scan_mutex.
+ */
+ schedule_work(&cleanup_work);
+ return -ENOMEM;
+ }
+
+ if (IS_ENABLED(CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN)) {
+ mutex_lock(&scan_mutex);
+ start_scan_thread();
+ mutex_unlock(&scan_mutex);
+ }
+
+ pr_info("Kernel memory leak detector initialized (mem pool available: %d)\n",
+ mem_pool_free_count);
+
+ return 0;
+}
+late_initcall(kmemleak_late_init);
diff --git a/mm/ksm.c b/mm/ksm.c
new file mode 100644
index 000000000..25b8362a4
--- /dev/null
+++ b/mm/ksm.c
@@ -0,0 +1,3205 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Memory merging support.
+ *
+ * This code enables dynamic sharing of identical pages found in different
+ * memory areas, even if they are not shared by fork()
+ *
+ * Copyright (C) 2008-2009 Red Hat, Inc.
+ * Authors:
+ * Izik Eidus
+ * Andrea Arcangeli
+ * Chris Wright
+ * Hugh Dickins
+ */
+
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/mman.h>
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/coredump.h>
+#include <linux/rwsem.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/spinlock.h>
+#include <linux/xxhash.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/wait.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+#include <linux/memory.h>
+#include <linux/mmu_notifier.h>
+#include <linux/swap.h>
+#include <linux/ksm.h>
+#include <linux/hashtable.h>
+#include <linux/freezer.h>
+#include <linux/oom.h>
+#include <linux/numa.h>
+
+#include <asm/tlbflush.h>
+#include "internal.h"
+
+#ifdef CONFIG_NUMA
+#define NUMA(x) (x)
+#define DO_NUMA(x) do { (x); } while (0)
+#else
+#define NUMA(x) (0)
+#define DO_NUMA(x) do { } while (0)
+#endif
+
+/**
+ * DOC: Overview
+ *
+ * A few notes about the KSM scanning process,
+ * to make it easier to understand the data structures below:
+ *
+ * In order to reduce excessive scanning, KSM sorts the memory pages by their
+ * contents into a data structure that holds pointers to the pages' locations.
+ *
+ * Since the contents of the pages may change at any moment, KSM cannot just
+ * insert the pages into a normal sorted tree and expect it to find anything.
+ * Therefore KSM uses two data structures - the stable and the unstable tree.
+ *
+ * The stable tree holds pointers to all the merged pages (ksm pages), sorted
+ * by their contents. Because each such page is write-protected, searching on
+ * this tree is fully assured to be working (except when pages are unmapped),
+ * and therefore this tree is called the stable tree.
+ *
+ * The stable tree node includes information required for reverse
+ * mapping from a KSM page to virtual addresses that map this page.
+ *
+ * In order to avoid large latencies of the rmap walks on KSM pages,
+ * KSM maintains two types of nodes in the stable tree:
+ *
+ * * the regular nodes that keep the reverse mapping structures in a
+ * linked list
+ * * the "chains" that link nodes ("dups") that represent the same
+ * write protected memory content, but each "dup" corresponds to a
+ * different KSM page copy of that content
+ *
+ * Internally, the regular nodes, "dups" and "chains" are represented
+ * using the same struct stable_node structure.
+ *
+ * In addition to the stable tree, KSM uses a second data structure called the
+ * unstable tree: this tree holds pointers to pages which have been found to
+ * be "unchanged for a period of time". The unstable tree sorts these pages
+ * by their contents, but since they are not write-protected, KSM cannot rely
+ * upon the unstable tree to work correctly - the unstable tree is liable to
+ * be corrupted as its contents are modified, and so it is called unstable.
+ *
+ * KSM solves this problem by several techniques:
+ *
+ * 1) The unstable tree is flushed every time KSM completes scanning all
+ * memory areas, and then the tree is rebuilt again from the beginning.
+ * 2) KSM will only insert into the unstable tree, pages whose hash value
+ * has not changed since the previous scan of all memory areas.
+ * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
+ * colors of the nodes and not on their contents, assuring that even when
+ * the tree gets "corrupted" it won't get out of balance, so scanning time
+ * remains the same (also, searching and inserting nodes in an rbtree uses
+ * the same algorithm, so we have no overhead when we flush and rebuild).
+ * 4) KSM never flushes the stable tree, which means that even if it were to
+ * take 10 attempts to find a page in the unstable tree, once it is found,
+ * it is secured in the stable tree. (When we scan a new page, we first
+ * compare it against the stable tree, and then against the unstable tree.)
+ *
+ * If the merge_across_nodes tunable is unset, then KSM maintains multiple
+ * stable trees and multiple unstable trees: one of each for each NUMA node.
+ */
+
+/**
+ * struct mm_slot - ksm information per mm that is being scanned
+ * @link: link to the mm_slots hash list
+ * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
+ * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
+ * @mm: the mm that this information is valid for
+ */
+struct mm_slot {
+ struct hlist_node link;
+ struct list_head mm_list;
+ struct rmap_item *rmap_list;
+ struct mm_struct *mm;
+};
+
+/**
+ * struct ksm_scan - cursor for scanning
+ * @mm_slot: the current mm_slot we are scanning
+ * @address: the next address inside that to be scanned
+ * @rmap_list: link to the next rmap to be scanned in the rmap_list
+ * @seqnr: count of completed full scans (needed when removing unstable node)
+ *
+ * There is only the one ksm_scan instance of this cursor structure.
+ */
+struct ksm_scan {
+ struct mm_slot *mm_slot;
+ unsigned long address;
+ struct rmap_item **rmap_list;
+ unsigned long seqnr;
+};
+
+/**
+ * struct stable_node - node of the stable rbtree
+ * @node: rb node of this ksm page in the stable tree
+ * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
+ * @hlist_dup: linked into the stable_node->hlist with a stable_node chain
+ * @list: linked into migrate_nodes, pending placement in the proper node tree
+ * @hlist: hlist head of rmap_items using this ksm page
+ * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
+ * @chain_prune_time: time of the last full garbage collection
+ * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN
+ * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
+ */
+struct stable_node {
+ union {
+ struct rb_node node; /* when node of stable tree */
+ struct { /* when listed for migration */
+ struct list_head *head;
+ struct {
+ struct hlist_node hlist_dup;
+ struct list_head list;
+ };
+ };
+ };
+ struct hlist_head hlist;
+ union {
+ unsigned long kpfn;
+ unsigned long chain_prune_time;
+ };
+ /*
+ * STABLE_NODE_CHAIN can be any negative number in
+ * rmap_hlist_len negative range, but better not -1 to be able
+ * to reliably detect underflows.
+ */
+#define STABLE_NODE_CHAIN -1024
+ int rmap_hlist_len;
+#ifdef CONFIG_NUMA
+ int nid;
+#endif
+};
+
+/**
+ * struct rmap_item - reverse mapping item for virtual addresses
+ * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
+ * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
+ * @nid: NUMA node id of unstable tree in which linked (may not match page)
+ * @mm: the memory structure this rmap_item is pointing into
+ * @address: the virtual address this rmap_item tracks (+ flags in low bits)
+ * @oldchecksum: previous checksum of the page at that virtual address
+ * @node: rb node of this rmap_item in the unstable tree
+ * @head: pointer to stable_node heading this list in the stable tree
+ * @hlist: link into hlist of rmap_items hanging off that stable_node
+ */
+struct rmap_item {
+ struct rmap_item *rmap_list;
+ union {
+ struct anon_vma *anon_vma; /* when stable */
+#ifdef CONFIG_NUMA
+ int nid; /* when node of unstable tree */
+#endif
+ };
+ struct mm_struct *mm;
+ unsigned long address; /* + low bits used for flags below */
+ unsigned int oldchecksum; /* when unstable */
+ union {
+ struct rb_node node; /* when node of unstable tree */
+ struct { /* when listed from stable tree */
+ struct stable_node *head;
+ struct hlist_node hlist;
+ };
+ };
+};
+
+#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */
+#define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */
+#define STABLE_FLAG 0x200 /* is listed from the stable tree */
+#define KSM_FLAG_MASK (SEQNR_MASK|UNSTABLE_FLAG|STABLE_FLAG)
+ /* to mask all the flags */
+
+/* The stable and unstable tree heads */
+static struct rb_root one_stable_tree[1] = { RB_ROOT };
+static struct rb_root one_unstable_tree[1] = { RB_ROOT };
+static struct rb_root *root_stable_tree = one_stable_tree;
+static struct rb_root *root_unstable_tree = one_unstable_tree;
+
+/* Recently migrated nodes of stable tree, pending proper placement */
+static LIST_HEAD(migrate_nodes);
+#define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev)
+
+#define MM_SLOTS_HASH_BITS 10
+static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
+
+static struct mm_slot ksm_mm_head = {
+ .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
+};
+static struct ksm_scan ksm_scan = {
+ .mm_slot = &ksm_mm_head,
+};
+
+static struct kmem_cache *rmap_item_cache;
+static struct kmem_cache *stable_node_cache;
+static struct kmem_cache *mm_slot_cache;
+
+/* The number of nodes in the stable tree */
+static unsigned long ksm_pages_shared;
+
+/* The number of page slots additionally sharing those nodes */
+static unsigned long ksm_pages_sharing;
+
+/* The number of nodes in the unstable tree */
+static unsigned long ksm_pages_unshared;
+
+/* The number of rmap_items in use: to calculate pages_volatile */
+static unsigned long ksm_rmap_items;
+
+/* The number of stable_node chains */
+static unsigned long ksm_stable_node_chains;
+
+/* The number of stable_node dups linked to the stable_node chains */
+static unsigned long ksm_stable_node_dups;
+
+/* Delay in pruning stale stable_node_dups in the stable_node_chains */
+static int ksm_stable_node_chains_prune_millisecs = 2000;
+
+/* Maximum number of page slots sharing a stable node */
+static int ksm_max_page_sharing = 256;
+
+/* Number of pages ksmd should scan in one batch */
+static unsigned int ksm_thread_pages_to_scan = 100;
+
+/* Milliseconds ksmd should sleep between batches */
+static unsigned int ksm_thread_sleep_millisecs = 20;
+
+/* Checksum of an empty (zeroed) page */
+static unsigned int zero_checksum __read_mostly;
+
+/* Whether to merge empty (zeroed) pages with actual zero pages */
+static bool ksm_use_zero_pages __read_mostly;
+
+#ifdef CONFIG_NUMA
+/* Zeroed when merging across nodes is not allowed */
+static unsigned int ksm_merge_across_nodes = 1;
+static int ksm_nr_node_ids = 1;
+#else
+#define ksm_merge_across_nodes 1U
+#define ksm_nr_node_ids 1
+#endif
+
+#define KSM_RUN_STOP 0
+#define KSM_RUN_MERGE 1
+#define KSM_RUN_UNMERGE 2
+#define KSM_RUN_OFFLINE 4
+static unsigned long ksm_run = KSM_RUN_STOP;
+static void wait_while_offlining(void);
+
+static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
+static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait);
+static DEFINE_MUTEX(ksm_thread_mutex);
+static DEFINE_SPINLOCK(ksm_mmlist_lock);
+
+#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
+ sizeof(struct __struct), __alignof__(struct __struct),\
+ (__flags), NULL)
+
+static int __init ksm_slab_init(void)
+{
+ rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
+ if (!rmap_item_cache)
+ goto out;
+
+ stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
+ if (!stable_node_cache)
+ goto out_free1;
+
+ mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
+ if (!mm_slot_cache)
+ goto out_free2;
+
+ return 0;
+
+out_free2:
+ kmem_cache_destroy(stable_node_cache);
+out_free1:
+ kmem_cache_destroy(rmap_item_cache);
+out:
+ return -ENOMEM;
+}
+
+static void __init ksm_slab_free(void)
+{
+ kmem_cache_destroy(mm_slot_cache);
+ kmem_cache_destroy(stable_node_cache);
+ kmem_cache_destroy(rmap_item_cache);
+ mm_slot_cache = NULL;
+}
+
+static __always_inline bool is_stable_node_chain(struct stable_node *chain)
+{
+ return chain->rmap_hlist_len == STABLE_NODE_CHAIN;
+}
+
+static __always_inline bool is_stable_node_dup(struct stable_node *dup)
+{
+ return dup->head == STABLE_NODE_DUP_HEAD;
+}
+
+static inline void stable_node_chain_add_dup(struct stable_node *dup,
+ struct stable_node *chain)
+{
+ VM_BUG_ON(is_stable_node_dup(dup));
+ dup->head = STABLE_NODE_DUP_HEAD;
+ VM_BUG_ON(!is_stable_node_chain(chain));
+ hlist_add_head(&dup->hlist_dup, &chain->hlist);
+ ksm_stable_node_dups++;
+}
+
+static inline void __stable_node_dup_del(struct stable_node *dup)
+{
+ VM_BUG_ON(!is_stable_node_dup(dup));
+ hlist_del(&dup->hlist_dup);
+ ksm_stable_node_dups--;
+}
+
+static inline void stable_node_dup_del(struct stable_node *dup)
+{
+ VM_BUG_ON(is_stable_node_chain(dup));
+ if (is_stable_node_dup(dup))
+ __stable_node_dup_del(dup);
+ else
+ rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid));
+#ifdef CONFIG_DEBUG_VM
+ dup->head = NULL;
+#endif
+}
+
+static inline struct rmap_item *alloc_rmap_item(void)
+{
+ struct rmap_item *rmap_item;
+
+ rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
+ __GFP_NORETRY | __GFP_NOWARN);
+ if (rmap_item)
+ ksm_rmap_items++;
+ return rmap_item;
+}
+
+static inline void free_rmap_item(struct rmap_item *rmap_item)
+{
+ ksm_rmap_items--;
+ rmap_item->mm = NULL; /* debug safety */
+ kmem_cache_free(rmap_item_cache, rmap_item);
+}
+
+static inline struct stable_node *alloc_stable_node(void)
+{
+ /*
+ * The allocation can take too long with GFP_KERNEL when memory is under
+ * pressure, which may lead to hung task warnings. Adding __GFP_HIGH
+ * grants access to memory reserves, helping to avoid this problem.
+ */
+ return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH);
+}
+
+static inline void free_stable_node(struct stable_node *stable_node)
+{
+ VM_BUG_ON(stable_node->rmap_hlist_len &&
+ !is_stable_node_chain(stable_node));
+ kmem_cache_free(stable_node_cache, stable_node);
+}
+
+static inline struct mm_slot *alloc_mm_slot(void)
+{
+ if (!mm_slot_cache) /* initialization failed */
+ return NULL;
+ return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
+}
+
+static inline void free_mm_slot(struct mm_slot *mm_slot)
+{
+ kmem_cache_free(mm_slot_cache, mm_slot);
+}
+
+static struct mm_slot *get_mm_slot(struct mm_struct *mm)
+{
+ struct mm_slot *slot;
+
+ hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm)
+ if (slot->mm == mm)
+ return slot;
+
+ return NULL;
+}
+
+static void insert_to_mm_slots_hash(struct mm_struct *mm,
+ struct mm_slot *mm_slot)
+{
+ mm_slot->mm = mm;
+ hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
+}
+
+/*
+ * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
+ * page tables after it has passed through ksm_exit() - which, if necessary,
+ * takes mmap_lock briefly to serialize against them. ksm_exit() does not set
+ * a special flag: they can just back out as soon as mm_users goes to zero.
+ * ksm_test_exit() is used throughout to make this test for exit: in some
+ * places for correctness, in some places just to avoid unnecessary work.
+ */
+static inline bool ksm_test_exit(struct mm_struct *mm)
+{
+ return atomic_read(&mm->mm_users) == 0;
+}
+
+/*
+ * We use break_ksm to break COW on a ksm page: it's a stripped down
+ *
+ * if (get_user_pages(addr, 1, FOLL_WRITE, &page, NULL) == 1)
+ * put_page(page);
+ *
+ * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
+ * in case the application has unmapped and remapped mm,addr meanwhile.
+ * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
+ * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
+ *
+ * FAULT_FLAG/FOLL_REMOTE are because we do this outside the context
+ * of the process that owns 'vma'. We also do not want to enforce
+ * protection keys here anyway.
+ */
+static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
+{
+ struct page *page;
+ vm_fault_t ret = 0;
+
+ do {
+ cond_resched();
+ page = follow_page(vma, addr,
+ FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
+ if (IS_ERR_OR_NULL(page))
+ break;
+ if (PageKsm(page))
+ ret = handle_mm_fault(vma, addr,
+ FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
+ NULL);
+ else
+ ret = VM_FAULT_WRITE;
+ put_page(page);
+ } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
+ /*
+ * We must loop because handle_mm_fault() may back out if there's
+ * any difficulty e.g. if pte accessed bit gets updated concurrently.
+ *
+ * VM_FAULT_WRITE is what we have been hoping for: it indicates that
+ * COW has been broken, even if the vma does not permit VM_WRITE;
+ * but note that a concurrent fault might break PageKsm for us.
+ *
+ * VM_FAULT_SIGBUS could occur if we race with truncation of the
+ * backing file, which also invalidates anonymous pages: that's
+ * okay, that truncation will have unmapped the PageKsm for us.
+ *
+ * VM_FAULT_OOM: at the time of writing (late July 2009), setting
+ * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
+ * current task has TIF_MEMDIE set, and will be OOM killed on return
+ * to user; and ksmd, having no mm, would never be chosen for that.
+ *
+ * But if the mm is in a limited mem_cgroup, then the fault may fail
+ * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
+ * even ksmd can fail in this way - though it's usually breaking ksm
+ * just to undo a merge it made a moment before, so unlikely to oom.
+ *
+ * That's a pity: we might therefore have more kernel pages allocated
+ * than we're counting as nodes in the stable tree; but ksm_do_scan
+ * will retry to break_cow on each pass, so should recover the page
+ * in due course. The important thing is to not let VM_MERGEABLE
+ * be cleared while any such pages might remain in the area.
+ */
+ return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
+}
+
+static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
+ unsigned long addr)
+{
+ struct vm_area_struct *vma;
+ if (ksm_test_exit(mm))
+ return NULL;
+ vma = find_vma(mm, addr);
+ if (!vma || vma->vm_start > addr)
+ return NULL;
+ if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
+ return NULL;
+ return vma;
+}
+
+static void break_cow(struct rmap_item *rmap_item)
+{
+ struct mm_struct *mm = rmap_item->mm;
+ unsigned long addr = rmap_item->address;
+ struct vm_area_struct *vma;
+
+ /*
+ * It is not an accident that whenever we want to break COW
+ * to undo, we also need to drop a reference to the anon_vma.
+ */
+ put_anon_vma(rmap_item->anon_vma);
+
+ mmap_read_lock(mm);
+ vma = find_mergeable_vma(mm, addr);
+ if (vma)
+ break_ksm(vma, addr);
+ mmap_read_unlock(mm);
+}
+
+static struct page *get_mergeable_page(struct rmap_item *rmap_item)
+{
+ struct mm_struct *mm = rmap_item->mm;
+ unsigned long addr = rmap_item->address;
+ struct vm_area_struct *vma;
+ struct page *page;
+
+ mmap_read_lock(mm);
+ vma = find_mergeable_vma(mm, addr);
+ if (!vma)
+ goto out;
+
+ page = follow_page(vma, addr, FOLL_GET);
+ if (IS_ERR_OR_NULL(page))
+ goto out;
+ if (PageAnon(page)) {
+ flush_anon_page(vma, page, addr);
+ flush_dcache_page(page);
+ } else {
+ put_page(page);
+out:
+ page = NULL;
+ }
+ mmap_read_unlock(mm);
+ return page;
+}
+
+/*
+ * This helper is used for getting right index into array of tree roots.
+ * When merge_across_nodes knob is set to 1, there are only two rb-trees for
+ * stable and unstable pages from all nodes with roots in index 0. Otherwise,
+ * every node has its own stable and unstable tree.
+ */
+static inline int get_kpfn_nid(unsigned long kpfn)
+{
+ return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
+}
+
+static struct stable_node *alloc_stable_node_chain(struct stable_node *dup,
+ struct rb_root *root)
+{
+ struct stable_node *chain = alloc_stable_node();
+ VM_BUG_ON(is_stable_node_chain(dup));
+ if (likely(chain)) {
+ INIT_HLIST_HEAD(&chain->hlist);
+ chain->chain_prune_time = jiffies;
+ chain->rmap_hlist_len = STABLE_NODE_CHAIN;
+#if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA)
+ chain->nid = NUMA_NO_NODE; /* debug */
+#endif
+ ksm_stable_node_chains++;
+
+ /*
+ * Put the stable node chain in the first dimension of
+ * the stable tree and at the same time remove the old
+ * stable node.
+ */
+ rb_replace_node(&dup->node, &chain->node, root);
+
+ /*
+ * Move the old stable node to the second dimension
+ * queued in the hlist_dup. The invariant is that all
+ * dup stable_nodes in the chain->hlist point to pages
+ * that are write protected and have the exact same
+ * content.
+ */
+ stable_node_chain_add_dup(dup, chain);
+ }
+ return chain;
+}
+
+static inline void free_stable_node_chain(struct stable_node *chain,
+ struct rb_root *root)
+{
+ rb_erase(&chain->node, root);
+ free_stable_node(chain);
+ ksm_stable_node_chains--;
+}
+
+static void remove_node_from_stable_tree(struct stable_node *stable_node)
+{
+ struct rmap_item *rmap_item;
+
+ /* check it's not STABLE_NODE_CHAIN or negative */
+ BUG_ON(stable_node->rmap_hlist_len < 0);
+
+ hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
+ if (rmap_item->hlist.next)
+ ksm_pages_sharing--;
+ else
+ ksm_pages_shared--;
+ VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
+ stable_node->rmap_hlist_len--;
+ put_anon_vma(rmap_item->anon_vma);
+ rmap_item->address &= PAGE_MASK;
+ cond_resched();
+ }
+
+ /*
+ * We need the second aligned pointer of the migrate_nodes
+ * list_head to stay clear from the rb_parent_color union
+ * (aligned and different than any node) and also different
+ * from &migrate_nodes. This will verify that future list.h changes
+ * don't break STABLE_NODE_DUP_HEAD. Only recent gcc can handle it.
+ */
+#if defined(GCC_VERSION) && GCC_VERSION >= 40903
+ BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes);
+ BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1);
+#endif
+
+ if (stable_node->head == &migrate_nodes)
+ list_del(&stable_node->list);
+ else
+ stable_node_dup_del(stable_node);
+ free_stable_node(stable_node);
+}
+
+enum get_ksm_page_flags {
+ GET_KSM_PAGE_NOLOCK,
+ GET_KSM_PAGE_LOCK,
+ GET_KSM_PAGE_TRYLOCK
+};
+
+/*
+ * get_ksm_page: checks if the page indicated by the stable node
+ * is still its ksm page, despite having held no reference to it.
+ * In which case we can trust the content of the page, and it
+ * returns the gotten page; but if the page has now been zapped,
+ * remove the stale node from the stable tree and return NULL.
+ * But beware, the stable node's page might be being migrated.
+ *
+ * You would expect the stable_node to hold a reference to the ksm page.
+ * But if it increments the page's count, swapping out has to wait for
+ * ksmd to come around again before it can free the page, which may take
+ * seconds or even minutes: much too unresponsive. So instead we use a
+ * "keyhole reference": access to the ksm page from the stable node peeps
+ * out through its keyhole to see if that page still holds the right key,
+ * pointing back to this stable node. This relies on freeing a PageAnon
+ * page to reset its page->mapping to NULL, and relies on no other use of
+ * a page to put something that might look like our key in page->mapping.
+ * is on its way to being freed; but it is an anomaly to bear in mind.
+ */
+static struct page *get_ksm_page(struct stable_node *stable_node,
+ enum get_ksm_page_flags flags)
+{
+ struct page *page;
+ void *expected_mapping;
+ unsigned long kpfn;
+
+ expected_mapping = (void *)((unsigned long)stable_node |
+ PAGE_MAPPING_KSM);
+again:
+ kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */
+ page = pfn_to_page(kpfn);
+ if (READ_ONCE(page->mapping) != expected_mapping)
+ goto stale;
+
+ /*
+ * We cannot do anything with the page while its refcount is 0.
+ * Usually 0 means free, or tail of a higher-order page: in which
+ * case this node is no longer referenced, and should be freed;
+ * however, it might mean that the page is under page_ref_freeze().
+ * The __remove_mapping() case is easy, again the node is now stale;
+ * the same is in reuse_ksm_page() case; but if page is swapcache
+ * in migrate_page_move_mapping(), it might still be our page,
+ * in which case it's essential to keep the node.
+ */
+ while (!get_page_unless_zero(page)) {
+ /*
+ * Another check for page->mapping != expected_mapping would
+ * work here too. We have chosen the !PageSwapCache test to
+ * optimize the common case, when the page is or is about to
+ * be freed: PageSwapCache is cleared (under spin_lock_irq)
+ * in the ref_freeze section of __remove_mapping(); but Anon
+ * page->mapping reset to NULL later, in free_pages_prepare().
+ */
+ if (!PageSwapCache(page))
+ goto stale;
+ cpu_relax();
+ }
+
+ if (READ_ONCE(page->mapping) != expected_mapping) {
+ put_page(page);
+ goto stale;
+ }
+
+ if (flags == GET_KSM_PAGE_TRYLOCK) {
+ if (!trylock_page(page)) {
+ put_page(page);
+ return ERR_PTR(-EBUSY);
+ }
+ } else if (flags == GET_KSM_PAGE_LOCK)
+ lock_page(page);
+
+ if (flags != GET_KSM_PAGE_NOLOCK) {
+ if (READ_ONCE(page->mapping) != expected_mapping) {
+ unlock_page(page);
+ put_page(page);
+ goto stale;
+ }
+ }
+ return page;
+
+stale:
+ /*
+ * We come here from above when page->mapping or !PageSwapCache
+ * suggests that the node is stale; but it might be under migration.
+ * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
+ * before checking whether node->kpfn has been changed.
+ */
+ smp_rmb();
+ if (READ_ONCE(stable_node->kpfn) != kpfn)
+ goto again;
+ remove_node_from_stable_tree(stable_node);
+ return NULL;
+}
+
+/*
+ * Removing rmap_item from stable or unstable tree.
+ * This function will clean the information from the stable/unstable tree.
+ */
+static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
+{
+ if (rmap_item->address & STABLE_FLAG) {
+ struct stable_node *stable_node;
+ struct page *page;
+
+ stable_node = rmap_item->head;
+ page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
+ if (!page)
+ goto out;
+
+ hlist_del(&rmap_item->hlist);
+ unlock_page(page);
+ put_page(page);
+
+ if (!hlist_empty(&stable_node->hlist))
+ ksm_pages_sharing--;
+ else
+ ksm_pages_shared--;
+ VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
+ stable_node->rmap_hlist_len--;
+
+ put_anon_vma(rmap_item->anon_vma);
+ rmap_item->head = NULL;
+ rmap_item->address &= PAGE_MASK;
+
+ } else if (rmap_item->address & UNSTABLE_FLAG) {
+ unsigned char age;
+ /*
+ * Usually ksmd can and must skip the rb_erase, because
+ * root_unstable_tree was already reset to RB_ROOT.
+ * But be careful when an mm is exiting: do the rb_erase
+ * if this rmap_item was inserted by this scan, rather
+ * than left over from before.
+ */
+ age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
+ BUG_ON(age > 1);
+ if (!age)
+ rb_erase(&rmap_item->node,
+ root_unstable_tree + NUMA(rmap_item->nid));
+ ksm_pages_unshared--;
+ rmap_item->address &= PAGE_MASK;
+ }
+out:
+ cond_resched(); /* we're called from many long loops */
+}
+
+static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
+ struct rmap_item **rmap_list)
+{
+ while (*rmap_list) {
+ struct rmap_item *rmap_item = *rmap_list;
+ *rmap_list = rmap_item->rmap_list;
+ remove_rmap_item_from_tree(rmap_item);
+ free_rmap_item(rmap_item);
+ }
+}
+
+/*
+ * Though it's very tempting to unmerge rmap_items from stable tree rather
+ * than check every pte of a given vma, the locking doesn't quite work for
+ * that - an rmap_item is assigned to the stable tree after inserting ksm
+ * page and upping mmap_lock. Nor does it fit with the way we skip dup'ing
+ * rmap_items from parent to child at fork time (so as not to waste time
+ * if exit comes before the next scan reaches it).
+ *
+ * Similarly, although we'd like to remove rmap_items (so updating counts
+ * and freeing memory) when unmerging an area, it's easier to leave that
+ * to the next pass of ksmd - consider, for example, how ksmd might be
+ * in cmp_and_merge_page on one of the rmap_items we would be removing.
+ */
+static int unmerge_ksm_pages(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ unsigned long addr;
+ int err = 0;
+
+ for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
+ if (ksm_test_exit(vma->vm_mm))
+ break;
+ if (signal_pending(current))
+ err = -ERESTARTSYS;
+ else
+ err = break_ksm(vma, addr);
+ }
+ return err;
+}
+
+static inline struct stable_node *page_stable_node(struct page *page)
+{
+ return PageKsm(page) ? page_rmapping(page) : NULL;
+}
+
+static inline void set_page_stable_node(struct page *page,
+ struct stable_node *stable_node)
+{
+ page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
+}
+
+#ifdef CONFIG_SYSFS
+/*
+ * Only called through the sysfs control interface:
+ */
+static int remove_stable_node(struct stable_node *stable_node)
+{
+ struct page *page;
+ int err;
+
+ page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
+ if (!page) {
+ /*
+ * get_ksm_page did remove_node_from_stable_tree itself.
+ */
+ return 0;
+ }
+
+ /*
+ * Page could be still mapped if this races with __mmput() running in
+ * between ksm_exit() and exit_mmap(). Just refuse to let
+ * merge_across_nodes/max_page_sharing be switched.
+ */
+ err = -EBUSY;
+ if (!page_mapped(page)) {
+ /*
+ * The stable node did not yet appear stale to get_ksm_page(),
+ * since that allows for an unmapped ksm page to be recognized
+ * right up until it is freed; but the node is safe to remove.
+ * This page might be in a pagevec waiting to be freed,
+ * or it might be PageSwapCache (perhaps under writeback),
+ * or it might have been removed from swapcache a moment ago.
+ */
+ set_page_stable_node(page, NULL);
+ remove_node_from_stable_tree(stable_node);
+ err = 0;
+ }
+
+ unlock_page(page);
+ put_page(page);
+ return err;
+}
+
+static int remove_stable_node_chain(struct stable_node *stable_node,
+ struct rb_root *root)
+{
+ struct stable_node *dup;
+ struct hlist_node *hlist_safe;
+
+ if (!is_stable_node_chain(stable_node)) {
+ VM_BUG_ON(is_stable_node_dup(stable_node));
+ if (remove_stable_node(stable_node))
+ return true;
+ else
+ return false;
+ }
+
+ hlist_for_each_entry_safe(dup, hlist_safe,
+ &stable_node->hlist, hlist_dup) {
+ VM_BUG_ON(!is_stable_node_dup(dup));
+ if (remove_stable_node(dup))
+ return true;
+ }
+ BUG_ON(!hlist_empty(&stable_node->hlist));
+ free_stable_node_chain(stable_node, root);
+ return false;
+}
+
+static int remove_all_stable_nodes(void)
+{
+ struct stable_node *stable_node, *next;
+ int nid;
+ int err = 0;
+
+ for (nid = 0; nid < ksm_nr_node_ids; nid++) {
+ while (root_stable_tree[nid].rb_node) {
+ stable_node = rb_entry(root_stable_tree[nid].rb_node,
+ struct stable_node, node);
+ if (remove_stable_node_chain(stable_node,
+ root_stable_tree + nid)) {
+ err = -EBUSY;
+ break; /* proceed to next nid */
+ }
+ cond_resched();
+ }
+ }
+ list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
+ if (remove_stable_node(stable_node))
+ err = -EBUSY;
+ cond_resched();
+ }
+ return err;
+}
+
+static int unmerge_and_remove_all_rmap_items(void)
+{
+ struct mm_slot *mm_slot;
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+ int err = 0;
+
+ spin_lock(&ksm_mmlist_lock);
+ ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
+ struct mm_slot, mm_list);
+ spin_unlock(&ksm_mmlist_lock);
+
+ for (mm_slot = ksm_scan.mm_slot;
+ mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
+ mm = mm_slot->mm;
+ mmap_read_lock(mm);
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (ksm_test_exit(mm))
+ break;
+ if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
+ continue;
+ err = unmerge_ksm_pages(vma,
+ vma->vm_start, vma->vm_end);
+ if (err)
+ goto error;
+ }
+
+ remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
+ mmap_read_unlock(mm);
+
+ spin_lock(&ksm_mmlist_lock);
+ ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
+ struct mm_slot, mm_list);
+ if (ksm_test_exit(mm)) {
+ hash_del(&mm_slot->link);
+ list_del(&mm_slot->mm_list);
+ spin_unlock(&ksm_mmlist_lock);
+
+ free_mm_slot(mm_slot);
+ clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+ mmdrop(mm);
+ } else
+ spin_unlock(&ksm_mmlist_lock);
+ }
+
+ /* Clean up stable nodes, but don't worry if some are still busy */
+ remove_all_stable_nodes();
+ ksm_scan.seqnr = 0;
+ return 0;
+
+error:
+ mmap_read_unlock(mm);
+ spin_lock(&ksm_mmlist_lock);
+ ksm_scan.mm_slot = &ksm_mm_head;
+ spin_unlock(&ksm_mmlist_lock);
+ return err;
+}
+#endif /* CONFIG_SYSFS */
+
+static u32 calc_checksum(struct page *page)
+{
+ u32 checksum;
+ void *addr = kmap_atomic(page);
+ checksum = xxhash(addr, PAGE_SIZE, 0);
+ kunmap_atomic(addr);
+ return checksum;
+}
+
+static int write_protect_page(struct vm_area_struct *vma, struct page *page,
+ pte_t *orig_pte)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct page_vma_mapped_walk pvmw = {
+ .page = page,
+ .vma = vma,
+ };
+ int swapped;
+ int err = -EFAULT;
+ struct mmu_notifier_range range;
+
+ pvmw.address = page_address_in_vma(page, vma);
+ if (pvmw.address == -EFAULT)
+ goto out;
+
+ BUG_ON(PageTransCompound(page));
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+ pvmw.address,
+ pvmw.address + PAGE_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+
+ if (!page_vma_mapped_walk(&pvmw))
+ goto out_mn;
+ if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
+ goto out_unlock;
+
+ if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
+ (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) ||
+ mm_tlb_flush_pending(mm)) {
+ pte_t entry;
+
+ swapped = PageSwapCache(page);
+ flush_cache_page(vma, pvmw.address, page_to_pfn(page));
+ /*
+ * Ok this is tricky, when get_user_pages_fast() run it doesn't
+ * take any lock, therefore the check that we are going to make
+ * with the pagecount against the mapcount is racey and
+ * O_DIRECT can happen right after the check.
+ * So we clear the pte and flush the tlb before the check
+ * this assure us that no O_DIRECT can happen after the check
+ * or in the middle of the check.
+ *
+ * No need to notify as we are downgrading page table to read
+ * only not changing it to point to a new page.
+ *
+ * See Documentation/vm/mmu_notifier.rst
+ */
+ entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
+ /*
+ * Check that no O_DIRECT or similar I/O is in progress on the
+ * page
+ */
+ if (page_mapcount(page) + 1 + swapped != page_count(page)) {
+ set_pte_at(mm, pvmw.address, pvmw.pte, entry);
+ goto out_unlock;
+ }
+ if (pte_dirty(entry))
+ set_page_dirty(page);
+
+ if (pte_protnone(entry))
+ entry = pte_mkclean(pte_clear_savedwrite(entry));
+ else
+ entry = pte_mkclean(pte_wrprotect(entry));
+ set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
+ }
+ *orig_pte = *pvmw.pte;
+ err = 0;
+
+out_unlock:
+ page_vma_mapped_walk_done(&pvmw);
+out_mn:
+ mmu_notifier_invalidate_range_end(&range);
+out:
+ return err;
+}
+
+/**
+ * replace_page - replace page in vma by new ksm page
+ * @vma: vma that holds the pte pointing to page
+ * @page: the page we are replacing by kpage
+ * @kpage: the ksm page we replace page by
+ * @orig_pte: the original value of the pte
+ *
+ * Returns 0 on success, -EFAULT on failure.
+ */
+static int replace_page(struct vm_area_struct *vma, struct page *page,
+ struct page *kpage, pte_t orig_pte)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pmd_t *pmd;
+ pte_t *ptep;
+ pte_t newpte;
+ spinlock_t *ptl;
+ unsigned long addr;
+ int err = -EFAULT;
+ struct mmu_notifier_range range;
+
+ addr = page_address_in_vma(page, vma);
+ if (addr == -EFAULT)
+ goto out;
+
+ pmd = mm_find_pmd(mm, addr);
+ if (!pmd)
+ goto out;
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
+ addr + PAGE_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+
+ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!pte_same(*ptep, orig_pte)) {
+ pte_unmap_unlock(ptep, ptl);
+ goto out_mn;
+ }
+
+ /*
+ * No need to check ksm_use_zero_pages here: we can only have a
+ * zero_page here if ksm_use_zero_pages was enabled already.
+ */
+ if (!is_zero_pfn(page_to_pfn(kpage))) {
+ get_page(kpage);
+ page_add_anon_rmap(kpage, vma, addr, false);
+ newpte = mk_pte(kpage, vma->vm_page_prot);
+ } else {
+ newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage),
+ vma->vm_page_prot));
+ /*
+ * We're replacing an anonymous page with a zero page, which is
+ * not anonymous. We need to do proper accounting otherwise we
+ * will get wrong values in /proc, and a BUG message in dmesg
+ * when tearing down the mm.
+ */
+ dec_mm_counter(mm, MM_ANONPAGES);
+ }
+
+ flush_cache_page(vma, addr, pte_pfn(*ptep));
+ /*
+ * No need to notify as we are replacing a read only page with another
+ * read only page with the same content.
+ *
+ * See Documentation/vm/mmu_notifier.rst
+ */
+ ptep_clear_flush(vma, addr, ptep);
+ set_pte_at_notify(mm, addr, ptep, newpte);
+
+ page_remove_rmap(page, false);
+ if (!page_mapped(page))
+ try_to_free_swap(page);
+ put_page(page);
+
+ pte_unmap_unlock(ptep, ptl);
+ err = 0;
+out_mn:
+ mmu_notifier_invalidate_range_end(&range);
+out:
+ return err;
+}
+
+/*
+ * try_to_merge_one_page - take two pages and merge them into one
+ * @vma: the vma that holds the pte pointing to page
+ * @page: the PageAnon page that we want to replace with kpage
+ * @kpage: the PageKsm page that we want to map instead of page,
+ * or NULL the first time when we want to use page as kpage.
+ *
+ * This function returns 0 if the pages were merged, -EFAULT otherwise.
+ */
+static int try_to_merge_one_page(struct vm_area_struct *vma,
+ struct page *page, struct page *kpage)
+{
+ pte_t orig_pte = __pte(0);
+ int err = -EFAULT;
+
+ if (page == kpage) /* ksm page forked */
+ return 0;
+
+ if (!PageAnon(page))
+ goto out;
+
+ /*
+ * We need the page lock to read a stable PageSwapCache in
+ * write_protect_page(). We use trylock_page() instead of
+ * lock_page() because we don't want to wait here - we
+ * prefer to continue scanning and merging different pages,
+ * then come back to this page when it is unlocked.
+ */
+ if (!trylock_page(page))
+ goto out;
+
+ if (PageTransCompound(page)) {
+ if (split_huge_page(page))
+ goto out_unlock;
+ }
+
+ /*
+ * If this anonymous page is mapped only here, its pte may need
+ * to be write-protected. If it's mapped elsewhere, all of its
+ * ptes are necessarily already write-protected. But in either
+ * case, we need to lock and check page_count is not raised.
+ */
+ if (write_protect_page(vma, page, &orig_pte) == 0) {
+ if (!kpage) {
+ /*
+ * While we hold page lock, upgrade page from
+ * PageAnon+anon_vma to PageKsm+NULL stable_node:
+ * stable_tree_insert() will update stable_node.
+ */
+ set_page_stable_node(page, NULL);
+ mark_page_accessed(page);
+ /*
+ * Page reclaim just frees a clean page with no dirty
+ * ptes: make sure that the ksm page would be swapped.
+ */
+ if (!PageDirty(page))
+ SetPageDirty(page);
+ err = 0;
+ } else if (pages_identical(page, kpage))
+ err = replace_page(vma, page, kpage, orig_pte);
+ }
+
+ if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
+ munlock_vma_page(page);
+ if (!PageMlocked(kpage)) {
+ unlock_page(page);
+ lock_page(kpage);
+ mlock_vma_page(kpage);
+ page = kpage; /* for final unlock */
+ }
+ }
+
+out_unlock:
+ unlock_page(page);
+out:
+ return err;
+}
+
+/*
+ * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
+ * but no new kernel page is allocated: kpage must already be a ksm page.
+ *
+ * This function returns 0 if the pages were merged, -EFAULT otherwise.
+ */
+static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
+ struct page *page, struct page *kpage)
+{
+ struct mm_struct *mm = rmap_item->mm;
+ struct vm_area_struct *vma;
+ int err = -EFAULT;
+
+ mmap_read_lock(mm);
+ vma = find_mergeable_vma(mm, rmap_item->address);
+ if (!vma)
+ goto out;
+
+ err = try_to_merge_one_page(vma, page, kpage);
+ if (err)
+ goto out;
+
+ /* Unstable nid is in union with stable anon_vma: remove first */
+ remove_rmap_item_from_tree(rmap_item);
+
+ /* Must get reference to anon_vma while still holding mmap_lock */
+ rmap_item->anon_vma = vma->anon_vma;
+ get_anon_vma(vma->anon_vma);
+out:
+ mmap_read_unlock(mm);
+ return err;
+}
+
+/*
+ * try_to_merge_two_pages - take two identical pages and prepare them
+ * to be merged into one page.
+ *
+ * This function returns the kpage if we successfully merged two identical
+ * pages into one ksm page, NULL otherwise.
+ *
+ * Note that this function upgrades page to ksm page: if one of the pages
+ * is already a ksm page, try_to_merge_with_ksm_page should be used.
+ */
+static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
+ struct page *page,
+ struct rmap_item *tree_rmap_item,
+ struct page *tree_page)
+{
+ int err;
+
+ err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
+ if (!err) {
+ err = try_to_merge_with_ksm_page(tree_rmap_item,
+ tree_page, page);
+ /*
+ * If that fails, we have a ksm page with only one pte
+ * pointing to it: so break it.
+ */
+ if (err)
+ break_cow(rmap_item);
+ }
+ return err ? NULL : page;
+}
+
+static __always_inline
+bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset)
+{
+ VM_BUG_ON(stable_node->rmap_hlist_len < 0);
+ /*
+ * Check that at least one mapping still exists, otherwise
+ * there's no much point to merge and share with this
+ * stable_node, as the underlying tree_page of the other
+ * sharer is going to be freed soon.
+ */
+ return stable_node->rmap_hlist_len &&
+ stable_node->rmap_hlist_len + offset < ksm_max_page_sharing;
+}
+
+static __always_inline
+bool is_page_sharing_candidate(struct stable_node *stable_node)
+{
+ return __is_page_sharing_candidate(stable_node, 0);
+}
+
+static struct page *stable_node_dup(struct stable_node **_stable_node_dup,
+ struct stable_node **_stable_node,
+ struct rb_root *root,
+ bool prune_stale_stable_nodes)
+{
+ struct stable_node *dup, *found = NULL, *stable_node = *_stable_node;
+ struct hlist_node *hlist_safe;
+ struct page *_tree_page, *tree_page = NULL;
+ int nr = 0;
+ int found_rmap_hlist_len;
+
+ if (!prune_stale_stable_nodes ||
+ time_before(jiffies, stable_node->chain_prune_time +
+ msecs_to_jiffies(
+ ksm_stable_node_chains_prune_millisecs)))
+ prune_stale_stable_nodes = false;
+ else
+ stable_node->chain_prune_time = jiffies;
+
+ hlist_for_each_entry_safe(dup, hlist_safe,
+ &stable_node->hlist, hlist_dup) {
+ cond_resched();
+ /*
+ * We must walk all stable_node_dup to prune the stale
+ * stable nodes during lookup.
+ *
+ * get_ksm_page can drop the nodes from the
+ * stable_node->hlist if they point to freed pages
+ * (that's why we do a _safe walk). The "dup"
+ * stable_node parameter itself will be freed from
+ * under us if it returns NULL.
+ */
+ _tree_page = get_ksm_page(dup, GET_KSM_PAGE_NOLOCK);
+ if (!_tree_page)
+ continue;
+ nr += 1;
+ if (is_page_sharing_candidate(dup)) {
+ if (!found ||
+ dup->rmap_hlist_len > found_rmap_hlist_len) {
+ if (found)
+ put_page(tree_page);
+ found = dup;
+ found_rmap_hlist_len = found->rmap_hlist_len;
+ tree_page = _tree_page;
+
+ /* skip put_page for found dup */
+ if (!prune_stale_stable_nodes)
+ break;
+ continue;
+ }
+ }
+ put_page(_tree_page);
+ }
+
+ if (found) {
+ /*
+ * nr is counting all dups in the chain only if
+ * prune_stale_stable_nodes is true, otherwise we may
+ * break the loop at nr == 1 even if there are
+ * multiple entries.
+ */
+ if (prune_stale_stable_nodes && nr == 1) {
+ /*
+ * If there's not just one entry it would
+ * corrupt memory, better BUG_ON. In KSM
+ * context with no lock held it's not even
+ * fatal.
+ */
+ BUG_ON(stable_node->hlist.first->next);
+
+ /*
+ * There's just one entry and it is below the
+ * deduplication limit so drop the chain.
+ */
+ rb_replace_node(&stable_node->node, &found->node,
+ root);
+ free_stable_node(stable_node);
+ ksm_stable_node_chains--;
+ ksm_stable_node_dups--;
+ /*
+ * NOTE: the caller depends on the stable_node
+ * to be equal to stable_node_dup if the chain
+ * was collapsed.
+ */
+ *_stable_node = found;
+ /*
+ * Just for robustneess as stable_node is
+ * otherwise left as a stable pointer, the
+ * compiler shall optimize it away at build
+ * time.
+ */
+ stable_node = NULL;
+ } else if (stable_node->hlist.first != &found->hlist_dup &&
+ __is_page_sharing_candidate(found, 1)) {
+ /*
+ * If the found stable_node dup can accept one
+ * more future merge (in addition to the one
+ * that is underway) and is not at the head of
+ * the chain, put it there so next search will
+ * be quicker in the !prune_stale_stable_nodes
+ * case.
+ *
+ * NOTE: it would be inaccurate to use nr > 1
+ * instead of checking the hlist.first pointer
+ * directly, because in the
+ * prune_stale_stable_nodes case "nr" isn't
+ * the position of the found dup in the chain,
+ * but the total number of dups in the chain.
+ */
+ hlist_del(&found->hlist_dup);
+ hlist_add_head(&found->hlist_dup,
+ &stable_node->hlist);
+ }
+ }
+
+ *_stable_node_dup = found;
+ return tree_page;
+}
+
+static struct stable_node *stable_node_dup_any(struct stable_node *stable_node,
+ struct rb_root *root)
+{
+ if (!is_stable_node_chain(stable_node))
+ return stable_node;
+ if (hlist_empty(&stable_node->hlist)) {
+ free_stable_node_chain(stable_node, root);
+ return NULL;
+ }
+ return hlist_entry(stable_node->hlist.first,
+ typeof(*stable_node), hlist_dup);
+}
+
+/*
+ * Like for get_ksm_page, this function can free the *_stable_node and
+ * *_stable_node_dup if the returned tree_page is NULL.
+ *
+ * It can also free and overwrite *_stable_node with the found
+ * stable_node_dup if the chain is collapsed (in which case
+ * *_stable_node will be equal to *_stable_node_dup like if the chain
+ * never existed). It's up to the caller to verify tree_page is not
+ * NULL before dereferencing *_stable_node or *_stable_node_dup.
+ *
+ * *_stable_node_dup is really a second output parameter of this
+ * function and will be overwritten in all cases, the caller doesn't
+ * need to initialize it.
+ */
+static struct page *__stable_node_chain(struct stable_node **_stable_node_dup,
+ struct stable_node **_stable_node,
+ struct rb_root *root,
+ bool prune_stale_stable_nodes)
+{
+ struct stable_node *stable_node = *_stable_node;
+ if (!is_stable_node_chain(stable_node)) {
+ if (is_page_sharing_candidate(stable_node)) {
+ *_stable_node_dup = stable_node;
+ return get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK);
+ }
+ /*
+ * _stable_node_dup set to NULL means the stable_node
+ * reached the ksm_max_page_sharing limit.
+ */
+ *_stable_node_dup = NULL;
+ return NULL;
+ }
+ return stable_node_dup(_stable_node_dup, _stable_node, root,
+ prune_stale_stable_nodes);
+}
+
+static __always_inline struct page *chain_prune(struct stable_node **s_n_d,
+ struct stable_node **s_n,
+ struct rb_root *root)
+{
+ return __stable_node_chain(s_n_d, s_n, root, true);
+}
+
+static __always_inline struct page *chain(struct stable_node **s_n_d,
+ struct stable_node *s_n,
+ struct rb_root *root)
+{
+ struct stable_node *old_stable_node = s_n;
+ struct page *tree_page;
+
+ tree_page = __stable_node_chain(s_n_d, &s_n, root, false);
+ /* not pruning dups so s_n cannot have changed */
+ VM_BUG_ON(s_n != old_stable_node);
+ return tree_page;
+}
+
+/*
+ * stable_tree_search - search for page inside the stable tree
+ *
+ * This function checks if there is a page inside the stable tree
+ * with identical content to the page that we are scanning right now.
+ *
+ * This function returns the stable tree node of identical content if found,
+ * NULL otherwise.
+ */
+static struct page *stable_tree_search(struct page *page)
+{
+ int nid;
+ struct rb_root *root;
+ struct rb_node **new;
+ struct rb_node *parent;
+ struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
+ struct stable_node *page_node;
+
+ page_node = page_stable_node(page);
+ if (page_node && page_node->head != &migrate_nodes) {
+ /* ksm page forked */
+ get_page(page);
+ return page;
+ }
+
+ nid = get_kpfn_nid(page_to_pfn(page));
+ root = root_stable_tree + nid;
+again:
+ new = &root->rb_node;
+ parent = NULL;
+
+ while (*new) {
+ struct page *tree_page;
+ int ret;
+
+ cond_resched();
+ stable_node = rb_entry(*new, struct stable_node, node);
+ stable_node_any = NULL;
+ tree_page = chain_prune(&stable_node_dup, &stable_node, root);
+ /*
+ * NOTE: stable_node may have been freed by
+ * chain_prune() if the returned stable_node_dup is
+ * not NULL. stable_node_dup may have been inserted in
+ * the rbtree instead as a regular stable_node (in
+ * order to collapse the stable_node chain if a single
+ * stable_node dup was found in it). In such case the
+ * stable_node is overwritten by the calleee to point
+ * to the stable_node_dup that was collapsed in the
+ * stable rbtree and stable_node will be equal to
+ * stable_node_dup like if the chain never existed.
+ */
+ if (!stable_node_dup) {
+ /*
+ * Either all stable_node dups were full in
+ * this stable_node chain, or this chain was
+ * empty and should be rb_erased.
+ */
+ stable_node_any = stable_node_dup_any(stable_node,
+ root);
+ if (!stable_node_any) {
+ /* rb_erase just run */
+ goto again;
+ }
+ /*
+ * Take any of the stable_node dups page of
+ * this stable_node chain to let the tree walk
+ * continue. All KSM pages belonging to the
+ * stable_node dups in a stable_node chain
+ * have the same content and they're
+ * write protected at all times. Any will work
+ * fine to continue the walk.
+ */
+ tree_page = get_ksm_page(stable_node_any,
+ GET_KSM_PAGE_NOLOCK);
+ }
+ VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
+ if (!tree_page) {
+ /*
+ * If we walked over a stale stable_node,
+ * get_ksm_page() will call rb_erase() and it
+ * may rebalance the tree from under us. So
+ * restart the search from scratch. Returning
+ * NULL would be safe too, but we'd generate
+ * false negative insertions just because some
+ * stable_node was stale.
+ */
+ goto again;
+ }
+
+ ret = memcmp_pages(page, tree_page);
+ put_page(tree_page);
+
+ parent = *new;
+ if (ret < 0)
+ new = &parent->rb_left;
+ else if (ret > 0)
+ new = &parent->rb_right;
+ else {
+ if (page_node) {
+ VM_BUG_ON(page_node->head != &migrate_nodes);
+ /*
+ * Test if the migrated page should be merged
+ * into a stable node dup. If the mapcount is
+ * 1 we can migrate it with another KSM page
+ * without adding it to the chain.
+ */
+ if (page_mapcount(page) > 1)
+ goto chain_append;
+ }
+
+ if (!stable_node_dup) {
+ /*
+ * If the stable_node is a chain and
+ * we got a payload match in memcmp
+ * but we cannot merge the scanned
+ * page in any of the existing
+ * stable_node dups because they're
+ * all full, we need to wait the
+ * scanned page to find itself a match
+ * in the unstable tree to create a
+ * brand new KSM page to add later to
+ * the dups of this stable_node.
+ */
+ return NULL;
+ }
+
+ /*
+ * Lock and unlock the stable_node's page (which
+ * might already have been migrated) so that page
+ * migration is sure to notice its raised count.
+ * It would be more elegant to return stable_node
+ * than kpage, but that involves more changes.
+ */
+ tree_page = get_ksm_page(stable_node_dup,
+ GET_KSM_PAGE_TRYLOCK);
+
+ if (PTR_ERR(tree_page) == -EBUSY)
+ return ERR_PTR(-EBUSY);
+
+ if (unlikely(!tree_page))
+ /*
+ * The tree may have been rebalanced,
+ * so re-evaluate parent and new.
+ */
+ goto again;
+ unlock_page(tree_page);
+
+ if (get_kpfn_nid(stable_node_dup->kpfn) !=
+ NUMA(stable_node_dup->nid)) {
+ put_page(tree_page);
+ goto replace;
+ }
+ return tree_page;
+ }
+ }
+
+ if (!page_node)
+ return NULL;
+
+ list_del(&page_node->list);
+ DO_NUMA(page_node->nid = nid);
+ rb_link_node(&page_node->node, parent, new);
+ rb_insert_color(&page_node->node, root);
+out:
+ if (is_page_sharing_candidate(page_node)) {
+ get_page(page);
+ return page;
+ } else
+ return NULL;
+
+replace:
+ /*
+ * If stable_node was a chain and chain_prune collapsed it,
+ * stable_node has been updated to be the new regular
+ * stable_node. A collapse of the chain is indistinguishable
+ * from the case there was no chain in the stable
+ * rbtree. Otherwise stable_node is the chain and
+ * stable_node_dup is the dup to replace.
+ */
+ if (stable_node_dup == stable_node) {
+ VM_BUG_ON(is_stable_node_chain(stable_node_dup));
+ VM_BUG_ON(is_stable_node_dup(stable_node_dup));
+ /* there is no chain */
+ if (page_node) {
+ VM_BUG_ON(page_node->head != &migrate_nodes);
+ list_del(&page_node->list);
+ DO_NUMA(page_node->nid = nid);
+ rb_replace_node(&stable_node_dup->node,
+ &page_node->node,
+ root);
+ if (is_page_sharing_candidate(page_node))
+ get_page(page);
+ else
+ page = NULL;
+ } else {
+ rb_erase(&stable_node_dup->node, root);
+ page = NULL;
+ }
+ } else {
+ VM_BUG_ON(!is_stable_node_chain(stable_node));
+ __stable_node_dup_del(stable_node_dup);
+ if (page_node) {
+ VM_BUG_ON(page_node->head != &migrate_nodes);
+ list_del(&page_node->list);
+ DO_NUMA(page_node->nid = nid);
+ stable_node_chain_add_dup(page_node, stable_node);
+ if (is_page_sharing_candidate(page_node))
+ get_page(page);
+ else
+ page = NULL;
+ } else {
+ page = NULL;
+ }
+ }
+ stable_node_dup->head = &migrate_nodes;
+ list_add(&stable_node_dup->list, stable_node_dup->head);
+ return page;
+
+chain_append:
+ /* stable_node_dup could be null if it reached the limit */
+ if (!stable_node_dup)
+ stable_node_dup = stable_node_any;
+ /*
+ * If stable_node was a chain and chain_prune collapsed it,
+ * stable_node has been updated to be the new regular
+ * stable_node. A collapse of the chain is indistinguishable
+ * from the case there was no chain in the stable
+ * rbtree. Otherwise stable_node is the chain and
+ * stable_node_dup is the dup to replace.
+ */
+ if (stable_node_dup == stable_node) {
+ VM_BUG_ON(is_stable_node_chain(stable_node_dup));
+ VM_BUG_ON(is_stable_node_dup(stable_node_dup));
+ /* chain is missing so create it */
+ stable_node = alloc_stable_node_chain(stable_node_dup,
+ root);
+ if (!stable_node)
+ return NULL;
+ }
+ /*
+ * Add this stable_node dup that was
+ * migrated to the stable_node chain
+ * of the current nid for this page
+ * content.
+ */
+ VM_BUG_ON(!is_stable_node_chain(stable_node));
+ VM_BUG_ON(!is_stable_node_dup(stable_node_dup));
+ VM_BUG_ON(page_node->head != &migrate_nodes);
+ list_del(&page_node->list);
+ DO_NUMA(page_node->nid = nid);
+ stable_node_chain_add_dup(page_node, stable_node);
+ goto out;
+}
+
+/*
+ * stable_tree_insert - insert stable tree node pointing to new ksm page
+ * into the stable tree.
+ *
+ * This function returns the stable tree node just allocated on success,
+ * NULL otherwise.
+ */
+static struct stable_node *stable_tree_insert(struct page *kpage)
+{
+ int nid;
+ unsigned long kpfn;
+ struct rb_root *root;
+ struct rb_node **new;
+ struct rb_node *parent;
+ struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
+ bool need_chain = false;
+
+ kpfn = page_to_pfn(kpage);
+ nid = get_kpfn_nid(kpfn);
+ root = root_stable_tree + nid;
+again:
+ parent = NULL;
+ new = &root->rb_node;
+
+ while (*new) {
+ struct page *tree_page;
+ int ret;
+
+ cond_resched();
+ stable_node = rb_entry(*new, struct stable_node, node);
+ stable_node_any = NULL;
+ tree_page = chain(&stable_node_dup, stable_node, root);
+ if (!stable_node_dup) {
+ /*
+ * Either all stable_node dups were full in
+ * this stable_node chain, or this chain was
+ * empty and should be rb_erased.
+ */
+ stable_node_any = stable_node_dup_any(stable_node,
+ root);
+ if (!stable_node_any) {
+ /* rb_erase just run */
+ goto again;
+ }
+ /*
+ * Take any of the stable_node dups page of
+ * this stable_node chain to let the tree walk
+ * continue. All KSM pages belonging to the
+ * stable_node dups in a stable_node chain
+ * have the same content and they're
+ * write protected at all times. Any will work
+ * fine to continue the walk.
+ */
+ tree_page = get_ksm_page(stable_node_any,
+ GET_KSM_PAGE_NOLOCK);
+ }
+ VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
+ if (!tree_page) {
+ /*
+ * If we walked over a stale stable_node,
+ * get_ksm_page() will call rb_erase() and it
+ * may rebalance the tree from under us. So
+ * restart the search from scratch. Returning
+ * NULL would be safe too, but we'd generate
+ * false negative insertions just because some
+ * stable_node was stale.
+ */
+ goto again;
+ }
+
+ ret = memcmp_pages(kpage, tree_page);
+ put_page(tree_page);
+
+ parent = *new;
+ if (ret < 0)
+ new = &parent->rb_left;
+ else if (ret > 0)
+ new = &parent->rb_right;
+ else {
+ need_chain = true;
+ break;
+ }
+ }
+
+ stable_node_dup = alloc_stable_node();
+ if (!stable_node_dup)
+ return NULL;
+
+ INIT_HLIST_HEAD(&stable_node_dup->hlist);
+ stable_node_dup->kpfn = kpfn;
+ set_page_stable_node(kpage, stable_node_dup);
+ stable_node_dup->rmap_hlist_len = 0;
+ DO_NUMA(stable_node_dup->nid = nid);
+ if (!need_chain) {
+ rb_link_node(&stable_node_dup->node, parent, new);
+ rb_insert_color(&stable_node_dup->node, root);
+ } else {
+ if (!is_stable_node_chain(stable_node)) {
+ struct stable_node *orig = stable_node;
+ /* chain is missing so create it */
+ stable_node = alloc_stable_node_chain(orig, root);
+ if (!stable_node) {
+ free_stable_node(stable_node_dup);
+ return NULL;
+ }
+ }
+ stable_node_chain_add_dup(stable_node_dup, stable_node);
+ }
+
+ return stable_node_dup;
+}
+
+/*
+ * unstable_tree_search_insert - search for identical page,
+ * else insert rmap_item into the unstable tree.
+ *
+ * This function searches for a page in the unstable tree identical to the
+ * page currently being scanned; and if no identical page is found in the
+ * tree, we insert rmap_item as a new object into the unstable tree.
+ *
+ * This function returns pointer to rmap_item found to be identical
+ * to the currently scanned page, NULL otherwise.
+ *
+ * This function does both searching and inserting, because they share
+ * the same walking algorithm in an rbtree.
+ */
+static
+struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
+ struct page *page,
+ struct page **tree_pagep)
+{
+ struct rb_node **new;
+ struct rb_root *root;
+ struct rb_node *parent = NULL;
+ int nid;
+
+ nid = get_kpfn_nid(page_to_pfn(page));
+ root = root_unstable_tree + nid;
+ new = &root->rb_node;
+
+ while (*new) {
+ struct rmap_item *tree_rmap_item;
+ struct page *tree_page;
+ int ret;
+
+ cond_resched();
+ tree_rmap_item = rb_entry(*new, struct rmap_item, node);
+ tree_page = get_mergeable_page(tree_rmap_item);
+ if (!tree_page)
+ return NULL;
+
+ /*
+ * Don't substitute a ksm page for a forked page.
+ */
+ if (page == tree_page) {
+ put_page(tree_page);
+ return NULL;
+ }
+
+ ret = memcmp_pages(page, tree_page);
+
+ parent = *new;
+ if (ret < 0) {
+ put_page(tree_page);
+ new = &parent->rb_left;
+ } else if (ret > 0) {
+ put_page(tree_page);
+ new = &parent->rb_right;
+ } else if (!ksm_merge_across_nodes &&
+ page_to_nid(tree_page) != nid) {
+ /*
+ * If tree_page has been migrated to another NUMA node,
+ * it will be flushed out and put in the right unstable
+ * tree next time: only merge with it when across_nodes.
+ */
+ put_page(tree_page);
+ return NULL;
+ } else {
+ *tree_pagep = tree_page;
+ return tree_rmap_item;
+ }
+ }
+
+ rmap_item->address |= UNSTABLE_FLAG;
+ rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
+ DO_NUMA(rmap_item->nid = nid);
+ rb_link_node(&rmap_item->node, parent, new);
+ rb_insert_color(&rmap_item->node, root);
+
+ ksm_pages_unshared++;
+ return NULL;
+}
+
+/*
+ * stable_tree_append - add another rmap_item to the linked list of
+ * rmap_items hanging off a given node of the stable tree, all sharing
+ * the same ksm page.
+ */
+static void stable_tree_append(struct rmap_item *rmap_item,
+ struct stable_node *stable_node,
+ bool max_page_sharing_bypass)
+{
+ /*
+ * rmap won't find this mapping if we don't insert the
+ * rmap_item in the right stable_node
+ * duplicate. page_migration could break later if rmap breaks,
+ * so we can as well crash here. We really need to check for
+ * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check
+ * for other negative values as an underflow if detected here
+ * for the first time (and not when decreasing rmap_hlist_len)
+ * would be sign of memory corruption in the stable_node.
+ */
+ BUG_ON(stable_node->rmap_hlist_len < 0);
+
+ stable_node->rmap_hlist_len++;
+ if (!max_page_sharing_bypass)
+ /* possibly non fatal but unexpected overflow, only warn */
+ WARN_ON_ONCE(stable_node->rmap_hlist_len >
+ ksm_max_page_sharing);
+
+ rmap_item->head = stable_node;
+ rmap_item->address |= STABLE_FLAG;
+ hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
+
+ if (rmap_item->hlist.next)
+ ksm_pages_sharing++;
+ else
+ ksm_pages_shared++;
+}
+
+/*
+ * cmp_and_merge_page - first see if page can be merged into the stable tree;
+ * if not, compare checksum to previous and if it's the same, see if page can
+ * be inserted into the unstable tree, or merged with a page already there and
+ * both transferred to the stable tree.
+ *
+ * @page: the page that we are searching identical page to.
+ * @rmap_item: the reverse mapping into the virtual address of this page
+ */
+static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
+{
+ struct mm_struct *mm = rmap_item->mm;
+ struct rmap_item *tree_rmap_item;
+ struct page *tree_page = NULL;
+ struct stable_node *stable_node;
+ struct page *kpage;
+ unsigned int checksum;
+ int err;
+ bool max_page_sharing_bypass = false;
+
+ stable_node = page_stable_node(page);
+ if (stable_node) {
+ if (stable_node->head != &migrate_nodes &&
+ get_kpfn_nid(READ_ONCE(stable_node->kpfn)) !=
+ NUMA(stable_node->nid)) {
+ stable_node_dup_del(stable_node);
+ stable_node->head = &migrate_nodes;
+ list_add(&stable_node->list, stable_node->head);
+ }
+ if (stable_node->head != &migrate_nodes &&
+ rmap_item->head == stable_node)
+ return;
+ /*
+ * If it's a KSM fork, allow it to go over the sharing limit
+ * without warnings.
+ */
+ if (!is_page_sharing_candidate(stable_node))
+ max_page_sharing_bypass = true;
+ }
+
+ /* We first start with searching the page inside the stable tree */
+ kpage = stable_tree_search(page);
+ if (kpage == page && rmap_item->head == stable_node) {
+ put_page(kpage);
+ return;
+ }
+
+ remove_rmap_item_from_tree(rmap_item);
+
+ if (kpage) {
+ if (PTR_ERR(kpage) == -EBUSY)
+ return;
+
+ err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
+ if (!err) {
+ /*
+ * The page was successfully merged:
+ * add its rmap_item to the stable tree.
+ */
+ lock_page(kpage);
+ stable_tree_append(rmap_item, page_stable_node(kpage),
+ max_page_sharing_bypass);
+ unlock_page(kpage);
+ }
+ put_page(kpage);
+ return;
+ }
+
+ /*
+ * If the hash value of the page has changed from the last time
+ * we calculated it, this page is changing frequently: therefore we
+ * don't want to insert it in the unstable tree, and we don't want
+ * to waste our time searching for something identical to it there.
+ */
+ checksum = calc_checksum(page);
+ if (rmap_item->oldchecksum != checksum) {
+ rmap_item->oldchecksum = checksum;
+ return;
+ }
+
+ /*
+ * Same checksum as an empty page. We attempt to merge it with the
+ * appropriate zero page if the user enabled this via sysfs.
+ */
+ if (ksm_use_zero_pages && (checksum == zero_checksum)) {
+ struct vm_area_struct *vma;
+
+ mmap_read_lock(mm);
+ vma = find_mergeable_vma(mm, rmap_item->address);
+ if (vma) {
+ err = try_to_merge_one_page(vma, page,
+ ZERO_PAGE(rmap_item->address));
+ } else {
+ /*
+ * If the vma is out of date, we do not need to
+ * continue.
+ */
+ err = 0;
+ }
+ mmap_read_unlock(mm);
+ /*
+ * In case of failure, the page was not really empty, so we
+ * need to continue. Otherwise we're done.
+ */
+ if (!err)
+ return;
+ }
+ tree_rmap_item =
+ unstable_tree_search_insert(rmap_item, page, &tree_page);
+ if (tree_rmap_item) {
+ bool split;
+
+ kpage = try_to_merge_two_pages(rmap_item, page,
+ tree_rmap_item, tree_page);
+ /*
+ * If both pages we tried to merge belong to the same compound
+ * page, then we actually ended up increasing the reference
+ * count of the same compound page twice, and split_huge_page
+ * failed.
+ * Here we set a flag if that happened, and we use it later to
+ * try split_huge_page again. Since we call put_page right
+ * afterwards, the reference count will be correct and
+ * split_huge_page should succeed.
+ */
+ split = PageTransCompound(page)
+ && compound_head(page) == compound_head(tree_page);
+ put_page(tree_page);
+ if (kpage) {
+ /*
+ * The pages were successfully merged: insert new
+ * node in the stable tree and add both rmap_items.
+ */
+ lock_page(kpage);
+ stable_node = stable_tree_insert(kpage);
+ if (stable_node) {
+ stable_tree_append(tree_rmap_item, stable_node,
+ false);
+ stable_tree_append(rmap_item, stable_node,
+ false);
+ }
+ unlock_page(kpage);
+
+ /*
+ * If we fail to insert the page into the stable tree,
+ * we will have 2 virtual addresses that are pointing
+ * to a ksm page left outside the stable tree,
+ * in which case we need to break_cow on both.
+ */
+ if (!stable_node) {
+ break_cow(tree_rmap_item);
+ break_cow(rmap_item);
+ }
+ } else if (split) {
+ /*
+ * We are here if we tried to merge two pages and
+ * failed because they both belonged to the same
+ * compound page. We will split the page now, but no
+ * merging will take place.
+ * We do not want to add the cost of a full lock; if
+ * the page is locked, it is better to skip it and
+ * perhaps try again later.
+ */
+ if (!trylock_page(page))
+ return;
+ split_huge_page(page);
+ unlock_page(page);
+ }
+ }
+}
+
+static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
+ struct rmap_item **rmap_list,
+ unsigned long addr)
+{
+ struct rmap_item *rmap_item;
+
+ while (*rmap_list) {
+ rmap_item = *rmap_list;
+ if ((rmap_item->address & PAGE_MASK) == addr)
+ return rmap_item;
+ if (rmap_item->address > addr)
+ break;
+ *rmap_list = rmap_item->rmap_list;
+ remove_rmap_item_from_tree(rmap_item);
+ free_rmap_item(rmap_item);
+ }
+
+ rmap_item = alloc_rmap_item();
+ if (rmap_item) {
+ /* It has already been zeroed */
+ rmap_item->mm = mm_slot->mm;
+ rmap_item->address = addr;
+ rmap_item->rmap_list = *rmap_list;
+ *rmap_list = rmap_item;
+ }
+ return rmap_item;
+}
+
+static struct rmap_item *scan_get_next_rmap_item(struct page **page)
+{
+ struct mm_struct *mm;
+ struct mm_slot *slot;
+ struct vm_area_struct *vma;
+ struct rmap_item *rmap_item;
+ int nid;
+
+ if (list_empty(&ksm_mm_head.mm_list))
+ return NULL;
+
+ slot = ksm_scan.mm_slot;
+ if (slot == &ksm_mm_head) {
+ /*
+ * A number of pages can hang around indefinitely on per-cpu
+ * pagevecs, raised page count preventing write_protect_page
+ * from merging them. Though it doesn't really matter much,
+ * it is puzzling to see some stuck in pages_volatile until
+ * other activity jostles them out, and they also prevented
+ * LTP's KSM test from succeeding deterministically; so drain
+ * them here (here rather than on entry to ksm_do_scan(),
+ * so we don't IPI too often when pages_to_scan is set low).
+ */
+ lru_add_drain_all();
+
+ /*
+ * Whereas stale stable_nodes on the stable_tree itself
+ * get pruned in the regular course of stable_tree_search(),
+ * those moved out to the migrate_nodes list can accumulate:
+ * so prune them once before each full scan.
+ */
+ if (!ksm_merge_across_nodes) {
+ struct stable_node *stable_node, *next;
+ struct page *page;
+
+ list_for_each_entry_safe(stable_node, next,
+ &migrate_nodes, list) {
+ page = get_ksm_page(stable_node,
+ GET_KSM_PAGE_NOLOCK);
+ if (page)
+ put_page(page);
+ cond_resched();
+ }
+ }
+
+ for (nid = 0; nid < ksm_nr_node_ids; nid++)
+ root_unstable_tree[nid] = RB_ROOT;
+
+ spin_lock(&ksm_mmlist_lock);
+ slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
+ ksm_scan.mm_slot = slot;
+ spin_unlock(&ksm_mmlist_lock);
+ /*
+ * Although we tested list_empty() above, a racing __ksm_exit
+ * of the last mm on the list may have removed it since then.
+ */
+ if (slot == &ksm_mm_head)
+ return NULL;
+next_mm:
+ ksm_scan.address = 0;
+ ksm_scan.rmap_list = &slot->rmap_list;
+ }
+
+ mm = slot->mm;
+ mmap_read_lock(mm);
+ if (ksm_test_exit(mm))
+ vma = NULL;
+ else
+ vma = find_vma(mm, ksm_scan.address);
+
+ for (; vma; vma = vma->vm_next) {
+ if (!(vma->vm_flags & VM_MERGEABLE))
+ continue;
+ if (ksm_scan.address < vma->vm_start)
+ ksm_scan.address = vma->vm_start;
+ if (!vma->anon_vma)
+ ksm_scan.address = vma->vm_end;
+
+ while (ksm_scan.address < vma->vm_end) {
+ if (ksm_test_exit(mm))
+ break;
+ *page = follow_page(vma, ksm_scan.address, FOLL_GET);
+ if (IS_ERR_OR_NULL(*page)) {
+ ksm_scan.address += PAGE_SIZE;
+ cond_resched();
+ continue;
+ }
+ if (PageAnon(*page)) {
+ flush_anon_page(vma, *page, ksm_scan.address);
+ flush_dcache_page(*page);
+ rmap_item = get_next_rmap_item(slot,
+ ksm_scan.rmap_list, ksm_scan.address);
+ if (rmap_item) {
+ ksm_scan.rmap_list =
+ &rmap_item->rmap_list;
+ ksm_scan.address += PAGE_SIZE;
+ } else
+ put_page(*page);
+ mmap_read_unlock(mm);
+ return rmap_item;
+ }
+ put_page(*page);
+ ksm_scan.address += PAGE_SIZE;
+ cond_resched();
+ }
+ }
+
+ if (ksm_test_exit(mm)) {
+ ksm_scan.address = 0;
+ ksm_scan.rmap_list = &slot->rmap_list;
+ }
+ /*
+ * Nuke all the rmap_items that are above this current rmap:
+ * because there were no VM_MERGEABLE vmas with such addresses.
+ */
+ remove_trailing_rmap_items(slot, ksm_scan.rmap_list);
+
+ spin_lock(&ksm_mmlist_lock);
+ ksm_scan.mm_slot = list_entry(slot->mm_list.next,
+ struct mm_slot, mm_list);
+ if (ksm_scan.address == 0) {
+ /*
+ * We've completed a full scan of all vmas, holding mmap_lock
+ * throughout, and found no VM_MERGEABLE: so do the same as
+ * __ksm_exit does to remove this mm from all our lists now.
+ * This applies either when cleaning up after __ksm_exit
+ * (but beware: we can reach here even before __ksm_exit),
+ * or when all VM_MERGEABLE areas have been unmapped (and
+ * mmap_lock then protects against race with MADV_MERGEABLE).
+ */
+ hash_del(&slot->link);
+ list_del(&slot->mm_list);
+ spin_unlock(&ksm_mmlist_lock);
+
+ free_mm_slot(slot);
+ clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+ mmap_read_unlock(mm);
+ mmdrop(mm);
+ } else {
+ mmap_read_unlock(mm);
+ /*
+ * mmap_read_unlock(mm) first because after
+ * spin_unlock(&ksm_mmlist_lock) run, the "mm" may
+ * already have been freed under us by __ksm_exit()
+ * because the "mm_slot" is still hashed and
+ * ksm_scan.mm_slot doesn't point to it anymore.
+ */
+ spin_unlock(&ksm_mmlist_lock);
+ }
+
+ /* Repeat until we've completed scanning the whole list */
+ slot = ksm_scan.mm_slot;
+ if (slot != &ksm_mm_head)
+ goto next_mm;
+
+ ksm_scan.seqnr++;
+ return NULL;
+}
+
+/**
+ * ksm_do_scan - the ksm scanner main worker function.
+ * @scan_npages: number of pages we want to scan before we return.
+ */
+static void ksm_do_scan(unsigned int scan_npages)
+{
+ struct rmap_item *rmap_item;
+ struct page *page;
+
+ while (scan_npages-- && likely(!freezing(current))) {
+ cond_resched();
+ rmap_item = scan_get_next_rmap_item(&page);
+ if (!rmap_item)
+ return;
+ cmp_and_merge_page(page, rmap_item);
+ put_page(page);
+ }
+}
+
+static int ksmd_should_run(void)
+{
+ return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
+}
+
+static int ksm_scan_thread(void *nothing)
+{
+ unsigned int sleep_ms;
+
+ set_freezable();
+ set_user_nice(current, 5);
+
+ while (!kthread_should_stop()) {
+ mutex_lock(&ksm_thread_mutex);
+ wait_while_offlining();
+ if (ksmd_should_run())
+ ksm_do_scan(ksm_thread_pages_to_scan);
+ mutex_unlock(&ksm_thread_mutex);
+
+ try_to_freeze();
+
+ if (ksmd_should_run()) {
+ sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs);
+ wait_event_interruptible_timeout(ksm_iter_wait,
+ sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
+ msecs_to_jiffies(sleep_ms));
+ } else {
+ wait_event_freezable(ksm_thread_wait,
+ ksmd_should_run() || kthread_should_stop());
+ }
+ }
+ return 0;
+}
+
+int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, int advice, unsigned long *vm_flags)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ int err;
+
+ switch (advice) {
+ case MADV_MERGEABLE:
+ /*
+ * Be somewhat over-protective for now!
+ */
+ if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
+ VM_PFNMAP | VM_IO | VM_DONTEXPAND |
+ VM_HUGETLB | VM_MIXEDMAP))
+ return 0; /* just ignore the advice */
+
+ if (vma_is_dax(vma))
+ return 0;
+
+#ifdef VM_SAO
+ if (*vm_flags & VM_SAO)
+ return 0;
+#endif
+#ifdef VM_SPARC_ADI
+ if (*vm_flags & VM_SPARC_ADI)
+ return 0;
+#endif
+
+ if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
+ err = __ksm_enter(mm);
+ if (err)
+ return err;
+ }
+
+ *vm_flags |= VM_MERGEABLE;
+ break;
+
+ case MADV_UNMERGEABLE:
+ if (!(*vm_flags & VM_MERGEABLE))
+ return 0; /* just ignore the advice */
+
+ if (vma->anon_vma) {
+ err = unmerge_ksm_pages(vma, start, end);
+ if (err)
+ return err;
+ }
+
+ *vm_flags &= ~VM_MERGEABLE;
+ break;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ksm_madvise);
+
+int __ksm_enter(struct mm_struct *mm)
+{
+ struct mm_slot *mm_slot;
+ int needs_wakeup;
+
+ mm_slot = alloc_mm_slot();
+ if (!mm_slot)
+ return -ENOMEM;
+
+ /* Check ksm_run too? Would need tighter locking */
+ needs_wakeup = list_empty(&ksm_mm_head.mm_list);
+
+ spin_lock(&ksm_mmlist_lock);
+ insert_to_mm_slots_hash(mm, mm_slot);
+ /*
+ * When KSM_RUN_MERGE (or KSM_RUN_STOP),
+ * insert just behind the scanning cursor, to let the area settle
+ * down a little; when fork is followed by immediate exec, we don't
+ * want ksmd to waste time setting up and tearing down an rmap_list.
+ *
+ * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
+ * scanning cursor, otherwise KSM pages in newly forked mms will be
+ * missed: then we might as well insert at the end of the list.
+ */
+ if (ksm_run & KSM_RUN_UNMERGE)
+ list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
+ else
+ list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
+ spin_unlock(&ksm_mmlist_lock);
+
+ set_bit(MMF_VM_MERGEABLE, &mm->flags);
+ mmgrab(mm);
+
+ if (needs_wakeup)
+ wake_up_interruptible(&ksm_thread_wait);
+
+ return 0;
+}
+
+void __ksm_exit(struct mm_struct *mm)
+{
+ struct mm_slot *mm_slot;
+ int easy_to_free = 0;
+
+ /*
+ * This process is exiting: if it's straightforward (as is the
+ * case when ksmd was never running), free mm_slot immediately.
+ * But if it's at the cursor or has rmap_items linked to it, use
+ * mmap_lock to synchronize with any break_cows before pagetables
+ * are freed, and leave the mm_slot on the list for ksmd to free.
+ * Beware: ksm may already have noticed it exiting and freed the slot.
+ */
+
+ spin_lock(&ksm_mmlist_lock);
+ mm_slot = get_mm_slot(mm);
+ if (mm_slot && ksm_scan.mm_slot != mm_slot) {
+ if (!mm_slot->rmap_list) {
+ hash_del(&mm_slot->link);
+ list_del(&mm_slot->mm_list);
+ easy_to_free = 1;
+ } else {
+ list_move(&mm_slot->mm_list,
+ &ksm_scan.mm_slot->mm_list);
+ }
+ }
+ spin_unlock(&ksm_mmlist_lock);
+
+ if (easy_to_free) {
+ free_mm_slot(mm_slot);
+ clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+ mmdrop(mm);
+ } else if (mm_slot) {
+ mmap_write_lock(mm);
+ mmap_write_unlock(mm);
+ }
+}
+
+struct page *ksm_might_need_to_copy(struct page *page,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ struct anon_vma *anon_vma = page_anon_vma(page);
+ struct page *new_page;
+
+ if (PageKsm(page)) {
+ if (page_stable_node(page) &&
+ !(ksm_run & KSM_RUN_UNMERGE))
+ return page; /* no need to copy it */
+ } else if (!anon_vma) {
+ return page; /* no need to copy it */
+ } else if (anon_vma->root == vma->anon_vma->root &&
+ page->index == linear_page_index(vma, address)) {
+ return page; /* still no need to copy it */
+ }
+ if (!PageUptodate(page))
+ return page; /* let do_swap_page report the error */
+
+ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+ if (new_page && mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL)) {
+ put_page(new_page);
+ new_page = NULL;
+ }
+ if (new_page) {
+ copy_user_highpage(new_page, page, address, vma);
+
+ SetPageDirty(new_page);
+ __SetPageUptodate(new_page);
+ __SetPageLocked(new_page);
+ }
+
+ return new_page;
+}
+
+void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
+{
+ struct stable_node *stable_node;
+ struct rmap_item *rmap_item;
+ int search_new_forks = 0;
+
+ VM_BUG_ON_PAGE(!PageKsm(page), page);
+
+ /*
+ * Rely on the page lock to protect against concurrent modifications
+ * to that page's node of the stable tree.
+ */
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+
+ stable_node = page_stable_node(page);
+ if (!stable_node)
+ return;
+again:
+ hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
+ struct anon_vma *anon_vma = rmap_item->anon_vma;
+ struct anon_vma_chain *vmac;
+ struct vm_area_struct *vma;
+
+ cond_resched();
+ anon_vma_lock_read(anon_vma);
+ anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
+ 0, ULONG_MAX) {
+ unsigned long addr;
+
+ cond_resched();
+ vma = vmac->vma;
+
+ /* Ignore the stable/unstable/sqnr flags */
+ addr = rmap_item->address & ~KSM_FLAG_MASK;
+
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ continue;
+ /*
+ * Initially we examine only the vma which covers this
+ * rmap_item; but later, if there is still work to do,
+ * we examine covering vmas in other mms: in case they
+ * were forked from the original since ksmd passed.
+ */
+ if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
+ continue;
+
+ if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
+ continue;
+
+ if (!rwc->rmap_one(page, vma, addr, rwc->arg)) {
+ anon_vma_unlock_read(anon_vma);
+ return;
+ }
+ if (rwc->done && rwc->done(page)) {
+ anon_vma_unlock_read(anon_vma);
+ return;
+ }
+ }
+ anon_vma_unlock_read(anon_vma);
+ }
+ if (!search_new_forks++)
+ goto again;
+}
+
+#ifdef CONFIG_MIGRATION
+void ksm_migrate_page(struct page *newpage, struct page *oldpage)
+{
+ struct stable_node *stable_node;
+
+ VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
+ VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
+ VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage);
+
+ stable_node = page_stable_node(newpage);
+ if (stable_node) {
+ VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage);
+ stable_node->kpfn = page_to_pfn(newpage);
+ /*
+ * newpage->mapping was set in advance; now we need smp_wmb()
+ * to make sure that the new stable_node->kpfn is visible
+ * to get_ksm_page() before it can see that oldpage->mapping
+ * has gone stale (or that PageSwapCache has been cleared).
+ */
+ smp_wmb();
+ set_page_stable_node(oldpage, NULL);
+ }
+}
+#endif /* CONFIG_MIGRATION */
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+static void wait_while_offlining(void)
+{
+ while (ksm_run & KSM_RUN_OFFLINE) {
+ mutex_unlock(&ksm_thread_mutex);
+ wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
+ TASK_UNINTERRUPTIBLE);
+ mutex_lock(&ksm_thread_mutex);
+ }
+}
+
+static bool stable_node_dup_remove_range(struct stable_node *stable_node,
+ unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ if (stable_node->kpfn >= start_pfn &&
+ stable_node->kpfn < end_pfn) {
+ /*
+ * Don't get_ksm_page, page has already gone:
+ * which is why we keep kpfn instead of page*
+ */
+ remove_node_from_stable_tree(stable_node);
+ return true;
+ }
+ return false;
+}
+
+static bool stable_node_chain_remove_range(struct stable_node *stable_node,
+ unsigned long start_pfn,
+ unsigned long end_pfn,
+ struct rb_root *root)
+{
+ struct stable_node *dup;
+ struct hlist_node *hlist_safe;
+
+ if (!is_stable_node_chain(stable_node)) {
+ VM_BUG_ON(is_stable_node_dup(stable_node));
+ return stable_node_dup_remove_range(stable_node, start_pfn,
+ end_pfn);
+ }
+
+ hlist_for_each_entry_safe(dup, hlist_safe,
+ &stable_node->hlist, hlist_dup) {
+ VM_BUG_ON(!is_stable_node_dup(dup));
+ stable_node_dup_remove_range(dup, start_pfn, end_pfn);
+ }
+ if (hlist_empty(&stable_node->hlist)) {
+ free_stable_node_chain(stable_node, root);
+ return true; /* notify caller that tree was rebalanced */
+ } else
+ return false;
+}
+
+static void ksm_check_stable_tree(unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ struct stable_node *stable_node, *next;
+ struct rb_node *node;
+ int nid;
+
+ for (nid = 0; nid < ksm_nr_node_ids; nid++) {
+ node = rb_first(root_stable_tree + nid);
+ while (node) {
+ stable_node = rb_entry(node, struct stable_node, node);
+ if (stable_node_chain_remove_range(stable_node,
+ start_pfn, end_pfn,
+ root_stable_tree +
+ nid))
+ node = rb_first(root_stable_tree + nid);
+ else
+ node = rb_next(node);
+ cond_resched();
+ }
+ }
+ list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
+ if (stable_node->kpfn >= start_pfn &&
+ stable_node->kpfn < end_pfn)
+ remove_node_from_stable_tree(stable_node);
+ cond_resched();
+ }
+}
+
+static int ksm_memory_callback(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ struct memory_notify *mn = arg;
+
+ switch (action) {
+ case MEM_GOING_OFFLINE:
+ /*
+ * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
+ * and remove_all_stable_nodes() while memory is going offline:
+ * it is unsafe for them to touch the stable tree at this time.
+ * But unmerge_ksm_pages(), rmap lookups and other entry points
+ * which do not need the ksm_thread_mutex are all safe.
+ */
+ mutex_lock(&ksm_thread_mutex);
+ ksm_run |= KSM_RUN_OFFLINE;
+ mutex_unlock(&ksm_thread_mutex);
+ break;
+
+ case MEM_OFFLINE:
+ /*
+ * Most of the work is done by page migration; but there might
+ * be a few stable_nodes left over, still pointing to struct
+ * pages which have been offlined: prune those from the tree,
+ * otherwise get_ksm_page() might later try to access a
+ * non-existent struct page.
+ */
+ ksm_check_stable_tree(mn->start_pfn,
+ mn->start_pfn + mn->nr_pages);
+ fallthrough;
+ case MEM_CANCEL_OFFLINE:
+ mutex_lock(&ksm_thread_mutex);
+ ksm_run &= ~KSM_RUN_OFFLINE;
+ mutex_unlock(&ksm_thread_mutex);
+
+ smp_mb(); /* wake_up_bit advises this */
+ wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
+ break;
+ }
+ return NOTIFY_OK;
+}
+#else
+static void wait_while_offlining(void)
+{
+}
+#endif /* CONFIG_MEMORY_HOTREMOVE */
+
+#ifdef CONFIG_SYSFS
+/*
+ * This all compiles without CONFIG_SYSFS, but is a waste of space.
+ */
+
+#define KSM_ATTR_RO(_name) \
+ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+#define KSM_ATTR(_name) \
+ static struct kobj_attribute _name##_attr = \
+ __ATTR(_name, 0644, _name##_show, _name##_store)
+
+static ssize_t sleep_millisecs_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
+}
+
+static ssize_t sleep_millisecs_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long msecs;
+ int err;
+
+ err = kstrtoul(buf, 10, &msecs);
+ if (err || msecs > UINT_MAX)
+ return -EINVAL;
+
+ ksm_thread_sleep_millisecs = msecs;
+ wake_up_interruptible(&ksm_iter_wait);
+
+ return count;
+}
+KSM_ATTR(sleep_millisecs);
+
+static ssize_t pages_to_scan_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
+}
+
+static ssize_t pages_to_scan_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ unsigned long nr_pages;
+
+ err = kstrtoul(buf, 10, &nr_pages);
+ if (err || nr_pages > UINT_MAX)
+ return -EINVAL;
+
+ ksm_thread_pages_to_scan = nr_pages;
+
+ return count;
+}
+KSM_ATTR(pages_to_scan);
+
+static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%lu\n", ksm_run);
+}
+
+static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ unsigned long flags;
+
+ err = kstrtoul(buf, 10, &flags);
+ if (err || flags > UINT_MAX)
+ return -EINVAL;
+ if (flags > KSM_RUN_UNMERGE)
+ return -EINVAL;
+
+ /*
+ * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
+ * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
+ * breaking COW to free the pages_shared (but leaves mm_slots
+ * on the list for when ksmd may be set running again).
+ */
+
+ mutex_lock(&ksm_thread_mutex);
+ wait_while_offlining();
+ if (ksm_run != flags) {
+ ksm_run = flags;
+ if (flags & KSM_RUN_UNMERGE) {
+ set_current_oom_origin();
+ err = unmerge_and_remove_all_rmap_items();
+ clear_current_oom_origin();
+ if (err) {
+ ksm_run = KSM_RUN_STOP;
+ count = err;
+ }
+ }
+ }
+ mutex_unlock(&ksm_thread_mutex);
+
+ if (flags & KSM_RUN_MERGE)
+ wake_up_interruptible(&ksm_thread_wait);
+
+ return count;
+}
+KSM_ATTR(run);
+
+#ifdef CONFIG_NUMA
+static ssize_t merge_across_nodes_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", ksm_merge_across_nodes);
+}
+
+static ssize_t merge_across_nodes_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ unsigned long knob;
+
+ err = kstrtoul(buf, 10, &knob);
+ if (err)
+ return err;
+ if (knob > 1)
+ return -EINVAL;
+
+ mutex_lock(&ksm_thread_mutex);
+ wait_while_offlining();
+ if (ksm_merge_across_nodes != knob) {
+ if (ksm_pages_shared || remove_all_stable_nodes())
+ err = -EBUSY;
+ else if (root_stable_tree == one_stable_tree) {
+ struct rb_root *buf;
+ /*
+ * This is the first time that we switch away from the
+ * default of merging across nodes: must now allocate
+ * a buffer to hold as many roots as may be needed.
+ * Allocate stable and unstable together:
+ * MAXSMP NODES_SHIFT 10 will use 16kB.
+ */
+ buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf),
+ GFP_KERNEL);
+ /* Let us assume that RB_ROOT is NULL is zero */
+ if (!buf)
+ err = -ENOMEM;
+ else {
+ root_stable_tree = buf;
+ root_unstable_tree = buf + nr_node_ids;
+ /* Stable tree is empty but not the unstable */
+ root_unstable_tree[0] = one_unstable_tree[0];
+ }
+ }
+ if (!err) {
+ ksm_merge_across_nodes = knob;
+ ksm_nr_node_ids = knob ? 1 : nr_node_ids;
+ }
+ }
+ mutex_unlock(&ksm_thread_mutex);
+
+ return err ? err : count;
+}
+KSM_ATTR(merge_across_nodes);
+#endif
+
+static ssize_t use_zero_pages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", ksm_use_zero_pages);
+}
+static ssize_t use_zero_pages_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ bool value;
+
+ err = kstrtobool(buf, &value);
+ if (err)
+ return -EINVAL;
+
+ ksm_use_zero_pages = value;
+
+ return count;
+}
+KSM_ATTR(use_zero_pages);
+
+static ssize_t max_page_sharing_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", ksm_max_page_sharing);
+}
+
+static ssize_t max_page_sharing_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ int knob;
+
+ err = kstrtoint(buf, 10, &knob);
+ if (err)
+ return err;
+ /*
+ * When a KSM page is created it is shared by 2 mappings. This
+ * being a signed comparison, it implicitly verifies it's not
+ * negative.
+ */
+ if (knob < 2)
+ return -EINVAL;
+
+ if (READ_ONCE(ksm_max_page_sharing) == knob)
+ return count;
+
+ mutex_lock(&ksm_thread_mutex);
+ wait_while_offlining();
+ if (ksm_max_page_sharing != knob) {
+ if (ksm_pages_shared || remove_all_stable_nodes())
+ err = -EBUSY;
+ else
+ ksm_max_page_sharing = knob;
+ }
+ mutex_unlock(&ksm_thread_mutex);
+
+ return err ? err : count;
+}
+KSM_ATTR(max_page_sharing);
+
+static ssize_t pages_shared_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", ksm_pages_shared);
+}
+KSM_ATTR_RO(pages_shared);
+
+static ssize_t pages_sharing_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", ksm_pages_sharing);
+}
+KSM_ATTR_RO(pages_sharing);
+
+static ssize_t pages_unshared_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", ksm_pages_unshared);
+}
+KSM_ATTR_RO(pages_unshared);
+
+static ssize_t pages_volatile_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ long ksm_pages_volatile;
+
+ ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
+ - ksm_pages_sharing - ksm_pages_unshared;
+ /*
+ * It was not worth any locking to calculate that statistic,
+ * but it might therefore sometimes be negative: conceal that.
+ */
+ if (ksm_pages_volatile < 0)
+ ksm_pages_volatile = 0;
+ return sprintf(buf, "%ld\n", ksm_pages_volatile);
+}
+KSM_ATTR_RO(pages_volatile);
+
+static ssize_t stable_node_dups_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", ksm_stable_node_dups);
+}
+KSM_ATTR_RO(stable_node_dups);
+
+static ssize_t stable_node_chains_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", ksm_stable_node_chains);
+}
+KSM_ATTR_RO(stable_node_chains);
+
+static ssize_t
+stable_node_chains_prune_millisecs_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", ksm_stable_node_chains_prune_millisecs);
+}
+
+static ssize_t
+stable_node_chains_prune_millisecs_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long msecs;
+ int err;
+
+ err = kstrtoul(buf, 10, &msecs);
+ if (err || msecs > UINT_MAX)
+ return -EINVAL;
+
+ ksm_stable_node_chains_prune_millisecs = msecs;
+
+ return count;
+}
+KSM_ATTR(stable_node_chains_prune_millisecs);
+
+static ssize_t full_scans_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", ksm_scan.seqnr);
+}
+KSM_ATTR_RO(full_scans);
+
+static struct attribute *ksm_attrs[] = {
+ &sleep_millisecs_attr.attr,
+ &pages_to_scan_attr.attr,
+ &run_attr.attr,
+ &pages_shared_attr.attr,
+ &pages_sharing_attr.attr,
+ &pages_unshared_attr.attr,
+ &pages_volatile_attr.attr,
+ &full_scans_attr.attr,
+#ifdef CONFIG_NUMA
+ &merge_across_nodes_attr.attr,
+#endif
+ &max_page_sharing_attr.attr,
+ &stable_node_chains_attr.attr,
+ &stable_node_dups_attr.attr,
+ &stable_node_chains_prune_millisecs_attr.attr,
+ &use_zero_pages_attr.attr,
+ NULL,
+};
+
+static const struct attribute_group ksm_attr_group = {
+ .attrs = ksm_attrs,
+ .name = "ksm",
+};
+#endif /* CONFIG_SYSFS */
+
+static int __init ksm_init(void)
+{
+ struct task_struct *ksm_thread;
+ int err;
+
+ /* The correct value depends on page size and endianness */
+ zero_checksum = calc_checksum(ZERO_PAGE(0));
+ /* Default to false for backwards compatibility */
+ ksm_use_zero_pages = false;
+
+ err = ksm_slab_init();
+ if (err)
+ goto out;
+
+ ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
+ if (IS_ERR(ksm_thread)) {
+ pr_err("ksm: creating kthread failed\n");
+ err = PTR_ERR(ksm_thread);
+ goto out_free;
+ }
+
+#ifdef CONFIG_SYSFS
+ err = sysfs_create_group(mm_kobj, &ksm_attr_group);
+ if (err) {
+ pr_err("ksm: register sysfs failed\n");
+ kthread_stop(ksm_thread);
+ goto out_free;
+ }
+#else
+ ksm_run = KSM_RUN_MERGE; /* no way for user to start it */
+
+#endif /* CONFIG_SYSFS */
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+ /* There is no significance to this priority 100 */
+ hotplug_memory_notifier(ksm_memory_callback, 100);
+#endif
+ return 0;
+
+out_free:
+ ksm_slab_free();
+out:
+ return err;
+}
+subsys_initcall(ksm_init);
diff --git a/mm/list_lru.c b/mm/list_lru.c
new file mode 100644
index 000000000..fe2300816
--- /dev/null
+++ b/mm/list_lru.c
@@ -0,0 +1,649 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved.
+ * Authors: David Chinner and Glauber Costa
+ *
+ * Generic LRU infrastructure
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/list_lru.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/memcontrol.h>
+#include "slab.h"
+
+#ifdef CONFIG_MEMCG_KMEM
+static LIST_HEAD(list_lrus);
+static DEFINE_MUTEX(list_lrus_mutex);
+
+static void list_lru_register(struct list_lru *lru)
+{
+ mutex_lock(&list_lrus_mutex);
+ list_add(&lru->list, &list_lrus);
+ mutex_unlock(&list_lrus_mutex);
+}
+
+static void list_lru_unregister(struct list_lru *lru)
+{
+ mutex_lock(&list_lrus_mutex);
+ list_del(&lru->list);
+ mutex_unlock(&list_lrus_mutex);
+}
+
+static int lru_shrinker_id(struct list_lru *lru)
+{
+ return lru->shrinker_id;
+}
+
+static inline bool list_lru_memcg_aware(struct list_lru *lru)
+{
+ return lru->memcg_aware;
+}
+
+static inline struct list_lru_one *
+list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
+{
+ struct list_lru_memcg *memcg_lrus;
+ /*
+ * Either lock or RCU protects the array of per cgroup lists
+ * from relocation (see memcg_update_list_lru_node).
+ */
+ memcg_lrus = rcu_dereference_check(nlru->memcg_lrus,
+ lockdep_is_held(&nlru->lock));
+ if (memcg_lrus && idx >= 0)
+ return memcg_lrus->lru[idx];
+ return &nlru->lru;
+}
+
+static inline struct list_lru_one *
+list_lru_from_kmem(struct list_lru_node *nlru, void *ptr,
+ struct mem_cgroup **memcg_ptr)
+{
+ struct list_lru_one *l = &nlru->lru;
+ struct mem_cgroup *memcg = NULL;
+
+ if (!nlru->memcg_lrus)
+ goto out;
+
+ memcg = mem_cgroup_from_obj(ptr);
+ if (!memcg)
+ goto out;
+
+ l = list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg));
+out:
+ if (memcg_ptr)
+ *memcg_ptr = memcg;
+ return l;
+}
+#else
+static void list_lru_register(struct list_lru *lru)
+{
+}
+
+static void list_lru_unregister(struct list_lru *lru)
+{
+}
+
+static int lru_shrinker_id(struct list_lru *lru)
+{
+ return -1;
+}
+
+static inline bool list_lru_memcg_aware(struct list_lru *lru)
+{
+ return false;
+}
+
+static inline struct list_lru_one *
+list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
+{
+ return &nlru->lru;
+}
+
+static inline struct list_lru_one *
+list_lru_from_kmem(struct list_lru_node *nlru, void *ptr,
+ struct mem_cgroup **memcg_ptr)
+{
+ if (memcg_ptr)
+ *memcg_ptr = NULL;
+ return &nlru->lru;
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
+bool list_lru_add(struct list_lru *lru, struct list_head *item)
+{
+ int nid = page_to_nid(virt_to_page(item));
+ struct list_lru_node *nlru = &lru->node[nid];
+ struct mem_cgroup *memcg;
+ struct list_lru_one *l;
+
+ spin_lock(&nlru->lock);
+ if (list_empty(item)) {
+ l = list_lru_from_kmem(nlru, item, &memcg);
+ list_add_tail(item, &l->list);
+ /* Set shrinker bit if the first element was added */
+ if (!l->nr_items++)
+ memcg_set_shrinker_bit(memcg, nid,
+ lru_shrinker_id(lru));
+ nlru->nr_items++;
+ spin_unlock(&nlru->lock);
+ return true;
+ }
+ spin_unlock(&nlru->lock);
+ return false;
+}
+EXPORT_SYMBOL_GPL(list_lru_add);
+
+bool list_lru_del(struct list_lru *lru, struct list_head *item)
+{
+ int nid = page_to_nid(virt_to_page(item));
+ struct list_lru_node *nlru = &lru->node[nid];
+ struct list_lru_one *l;
+
+ spin_lock(&nlru->lock);
+ if (!list_empty(item)) {
+ l = list_lru_from_kmem(nlru, item, NULL);
+ list_del_init(item);
+ l->nr_items--;
+ nlru->nr_items--;
+ spin_unlock(&nlru->lock);
+ return true;
+ }
+ spin_unlock(&nlru->lock);
+ return false;
+}
+EXPORT_SYMBOL_GPL(list_lru_del);
+
+void list_lru_isolate(struct list_lru_one *list, struct list_head *item)
+{
+ list_del_init(item);
+ list->nr_items--;
+}
+EXPORT_SYMBOL_GPL(list_lru_isolate);
+
+void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
+ struct list_head *head)
+{
+ list_move(item, head);
+ list->nr_items--;
+}
+EXPORT_SYMBOL_GPL(list_lru_isolate_move);
+
+unsigned long list_lru_count_one(struct list_lru *lru,
+ int nid, struct mem_cgroup *memcg)
+{
+ struct list_lru_node *nlru = &lru->node[nid];
+ struct list_lru_one *l;
+ unsigned long count;
+
+ rcu_read_lock();
+ l = list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg));
+ count = READ_ONCE(l->nr_items);
+ rcu_read_unlock();
+
+ return count;
+}
+EXPORT_SYMBOL_GPL(list_lru_count_one);
+
+unsigned long list_lru_count_node(struct list_lru *lru, int nid)
+{
+ struct list_lru_node *nlru;
+
+ nlru = &lru->node[nid];
+ return nlru->nr_items;
+}
+EXPORT_SYMBOL_GPL(list_lru_count_node);
+
+static unsigned long
+__list_lru_walk_one(struct list_lru_node *nlru, int memcg_idx,
+ list_lru_walk_cb isolate, void *cb_arg,
+ unsigned long *nr_to_walk)
+{
+
+ struct list_lru_one *l;
+ struct list_head *item, *n;
+ unsigned long isolated = 0;
+
+ l = list_lru_from_memcg_idx(nlru, memcg_idx);
+restart:
+ list_for_each_safe(item, n, &l->list) {
+ enum lru_status ret;
+
+ /*
+ * decrement nr_to_walk first so that we don't livelock if we
+ * get stuck on large numbers of LRU_RETRY items
+ */
+ if (!*nr_to_walk)
+ break;
+ --*nr_to_walk;
+
+ ret = isolate(item, l, &nlru->lock, cb_arg);
+ switch (ret) {
+ case LRU_REMOVED_RETRY:
+ assert_spin_locked(&nlru->lock);
+ fallthrough;
+ case LRU_REMOVED:
+ isolated++;
+ nlru->nr_items--;
+ /*
+ * If the lru lock has been dropped, our list
+ * traversal is now invalid and so we have to
+ * restart from scratch.
+ */
+ if (ret == LRU_REMOVED_RETRY)
+ goto restart;
+ break;
+ case LRU_ROTATE:
+ list_move_tail(item, &l->list);
+ break;
+ case LRU_SKIP:
+ break;
+ case LRU_RETRY:
+ /*
+ * The lru lock has been dropped, our list traversal is
+ * now invalid and so we have to restart from scratch.
+ */
+ assert_spin_locked(&nlru->lock);
+ goto restart;
+ default:
+ BUG();
+ }
+ }
+ return isolated;
+}
+
+unsigned long
+list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
+ list_lru_walk_cb isolate, void *cb_arg,
+ unsigned long *nr_to_walk)
+{
+ struct list_lru_node *nlru = &lru->node[nid];
+ unsigned long ret;
+
+ spin_lock(&nlru->lock);
+ ret = __list_lru_walk_one(nlru, memcg_cache_id(memcg), isolate, cb_arg,
+ nr_to_walk);
+ spin_unlock(&nlru->lock);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(list_lru_walk_one);
+
+unsigned long
+list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
+ list_lru_walk_cb isolate, void *cb_arg,
+ unsigned long *nr_to_walk)
+{
+ struct list_lru_node *nlru = &lru->node[nid];
+ unsigned long ret;
+
+ spin_lock_irq(&nlru->lock);
+ ret = __list_lru_walk_one(nlru, memcg_cache_id(memcg), isolate, cb_arg,
+ nr_to_walk);
+ spin_unlock_irq(&nlru->lock);
+ return ret;
+}
+
+unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
+ list_lru_walk_cb isolate, void *cb_arg,
+ unsigned long *nr_to_walk)
+{
+ long isolated = 0;
+ int memcg_idx;
+
+ isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg,
+ nr_to_walk);
+ if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) {
+ for_each_memcg_cache_index(memcg_idx) {
+ struct list_lru_node *nlru = &lru->node[nid];
+
+ spin_lock(&nlru->lock);
+ isolated += __list_lru_walk_one(nlru, memcg_idx,
+ isolate, cb_arg,
+ nr_to_walk);
+ spin_unlock(&nlru->lock);
+
+ if (*nr_to_walk <= 0)
+ break;
+ }
+ }
+ return isolated;
+}
+EXPORT_SYMBOL_GPL(list_lru_walk_node);
+
+static void init_one_lru(struct list_lru_one *l)
+{
+ INIT_LIST_HEAD(&l->list);
+ l->nr_items = 0;
+}
+
+#ifdef CONFIG_MEMCG_KMEM
+static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus,
+ int begin, int end)
+{
+ int i;
+
+ for (i = begin; i < end; i++)
+ kfree(memcg_lrus->lru[i]);
+}
+
+static int __memcg_init_list_lru_node(struct list_lru_memcg *memcg_lrus,
+ int begin, int end)
+{
+ int i;
+
+ for (i = begin; i < end; i++) {
+ struct list_lru_one *l;
+
+ l = kmalloc(sizeof(struct list_lru_one), GFP_KERNEL);
+ if (!l)
+ goto fail;
+
+ init_one_lru(l);
+ memcg_lrus->lru[i] = l;
+ }
+ return 0;
+fail:
+ __memcg_destroy_list_lru_node(memcg_lrus, begin, i);
+ return -ENOMEM;
+}
+
+static int memcg_init_list_lru_node(struct list_lru_node *nlru)
+{
+ struct list_lru_memcg *memcg_lrus;
+ int size = memcg_nr_cache_ids;
+
+ memcg_lrus = kvmalloc(sizeof(*memcg_lrus) +
+ size * sizeof(void *), GFP_KERNEL);
+ if (!memcg_lrus)
+ return -ENOMEM;
+
+ if (__memcg_init_list_lru_node(memcg_lrus, 0, size)) {
+ kvfree(memcg_lrus);
+ return -ENOMEM;
+ }
+ RCU_INIT_POINTER(nlru->memcg_lrus, memcg_lrus);
+
+ return 0;
+}
+
+static void memcg_destroy_list_lru_node(struct list_lru_node *nlru)
+{
+ struct list_lru_memcg *memcg_lrus;
+ /*
+ * This is called when shrinker has already been unregistered,
+ * and nobody can use it. So, there is no need to use kvfree_rcu_local().
+ */
+ memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus, true);
+ __memcg_destroy_list_lru_node(memcg_lrus, 0, memcg_nr_cache_ids);
+ kvfree(memcg_lrus);
+}
+
+static void kvfree_rcu_local(struct rcu_head *head)
+{
+ struct list_lru_memcg *mlru;
+
+ mlru = container_of(head, struct list_lru_memcg, rcu);
+ kvfree(mlru);
+}
+
+static int memcg_update_list_lru_node(struct list_lru_node *nlru,
+ int old_size, int new_size)
+{
+ struct list_lru_memcg *old, *new;
+
+ BUG_ON(old_size > new_size);
+
+ old = rcu_dereference_protected(nlru->memcg_lrus,
+ lockdep_is_held(&list_lrus_mutex));
+ new = kvmalloc(sizeof(*new) + new_size * sizeof(void *), GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+
+ if (__memcg_init_list_lru_node(new, old_size, new_size)) {
+ kvfree(new);
+ return -ENOMEM;
+ }
+
+ memcpy(&new->lru, &old->lru, old_size * sizeof(void *));
+
+ /*
+ * The locking below allows readers that hold nlru->lock avoid taking
+ * rcu_read_lock (see list_lru_from_memcg_idx).
+ *
+ * Since list_lru_{add,del} may be called under an IRQ-safe lock,
+ * we have to use IRQ-safe primitives here to avoid deadlock.
+ */
+ spin_lock_irq(&nlru->lock);
+ rcu_assign_pointer(nlru->memcg_lrus, new);
+ spin_unlock_irq(&nlru->lock);
+
+ call_rcu(&old->rcu, kvfree_rcu_local);
+ return 0;
+}
+
+static void memcg_cancel_update_list_lru_node(struct list_lru_node *nlru,
+ int old_size, int new_size)
+{
+ struct list_lru_memcg *memcg_lrus;
+
+ memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus,
+ lockdep_is_held(&list_lrus_mutex));
+ /* do not bother shrinking the array back to the old size, because we
+ * cannot handle allocation failures here */
+ __memcg_destroy_list_lru_node(memcg_lrus, old_size, new_size);
+}
+
+static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
+{
+ int i;
+
+ lru->memcg_aware = memcg_aware;
+
+ if (!memcg_aware)
+ return 0;
+
+ for_each_node(i) {
+ if (memcg_init_list_lru_node(&lru->node[i]))
+ goto fail;
+ }
+ return 0;
+fail:
+ for (i = i - 1; i >= 0; i--) {
+ if (!lru->node[i].memcg_lrus)
+ continue;
+ memcg_destroy_list_lru_node(&lru->node[i]);
+ }
+ return -ENOMEM;
+}
+
+static void memcg_destroy_list_lru(struct list_lru *lru)
+{
+ int i;
+
+ if (!list_lru_memcg_aware(lru))
+ return;
+
+ for_each_node(i)
+ memcg_destroy_list_lru_node(&lru->node[i]);
+}
+
+static int memcg_update_list_lru(struct list_lru *lru,
+ int old_size, int new_size)
+{
+ int i;
+
+ if (!list_lru_memcg_aware(lru))
+ return 0;
+
+ for_each_node(i) {
+ if (memcg_update_list_lru_node(&lru->node[i],
+ old_size, new_size))
+ goto fail;
+ }
+ return 0;
+fail:
+ for (i = i - 1; i >= 0; i--) {
+ if (!lru->node[i].memcg_lrus)
+ continue;
+
+ memcg_cancel_update_list_lru_node(&lru->node[i],
+ old_size, new_size);
+ }
+ return -ENOMEM;
+}
+
+static void memcg_cancel_update_list_lru(struct list_lru *lru,
+ int old_size, int new_size)
+{
+ int i;
+
+ if (!list_lru_memcg_aware(lru))
+ return;
+
+ for_each_node(i)
+ memcg_cancel_update_list_lru_node(&lru->node[i],
+ old_size, new_size);
+}
+
+int memcg_update_all_list_lrus(int new_size)
+{
+ int ret = 0;
+ struct list_lru *lru;
+ int old_size = memcg_nr_cache_ids;
+
+ mutex_lock(&list_lrus_mutex);
+ list_for_each_entry(lru, &list_lrus, list) {
+ ret = memcg_update_list_lru(lru, old_size, new_size);
+ if (ret)
+ goto fail;
+ }
+out:
+ mutex_unlock(&list_lrus_mutex);
+ return ret;
+fail:
+ list_for_each_entry_continue_reverse(lru, &list_lrus, list)
+ memcg_cancel_update_list_lru(lru, old_size, new_size);
+ goto out;
+}
+
+static void memcg_drain_list_lru_node(struct list_lru *lru, int nid,
+ int src_idx, struct mem_cgroup *dst_memcg)
+{
+ struct list_lru_node *nlru = &lru->node[nid];
+ int dst_idx = dst_memcg->kmemcg_id;
+ struct list_lru_one *src, *dst;
+
+ /*
+ * Since list_lru_{add,del} may be called under an IRQ-safe lock,
+ * we have to use IRQ-safe primitives here to avoid deadlock.
+ */
+ spin_lock_irq(&nlru->lock);
+
+ src = list_lru_from_memcg_idx(nlru, src_idx);
+ dst = list_lru_from_memcg_idx(nlru, dst_idx);
+
+ list_splice_init(&src->list, &dst->list);
+
+ if (src->nr_items) {
+ dst->nr_items += src->nr_items;
+ memcg_set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
+ src->nr_items = 0;
+ }
+
+ spin_unlock_irq(&nlru->lock);
+}
+
+static void memcg_drain_list_lru(struct list_lru *lru,
+ int src_idx, struct mem_cgroup *dst_memcg)
+{
+ int i;
+
+ if (!list_lru_memcg_aware(lru))
+ return;
+
+ for_each_node(i)
+ memcg_drain_list_lru_node(lru, i, src_idx, dst_memcg);
+}
+
+void memcg_drain_all_list_lrus(int src_idx, struct mem_cgroup *dst_memcg)
+{
+ struct list_lru *lru;
+
+ mutex_lock(&list_lrus_mutex);
+ list_for_each_entry(lru, &list_lrus, list)
+ memcg_drain_list_lru(lru, src_idx, dst_memcg);
+ mutex_unlock(&list_lrus_mutex);
+}
+#else
+static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
+{
+ return 0;
+}
+
+static void memcg_destroy_list_lru(struct list_lru *lru)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
+int __list_lru_init(struct list_lru *lru, bool memcg_aware,
+ struct lock_class_key *key, struct shrinker *shrinker)
+{
+ int i;
+ int err = -ENOMEM;
+
+#ifdef CONFIG_MEMCG_KMEM
+ if (shrinker)
+ lru->shrinker_id = shrinker->id;
+ else
+ lru->shrinker_id = -1;
+#endif
+ memcg_get_cache_ids();
+
+ lru->node = kcalloc(nr_node_ids, sizeof(*lru->node), GFP_KERNEL);
+ if (!lru->node)
+ goto out;
+
+ for_each_node(i) {
+ spin_lock_init(&lru->node[i].lock);
+ if (key)
+ lockdep_set_class(&lru->node[i].lock, key);
+ init_one_lru(&lru->node[i].lru);
+ }
+
+ err = memcg_init_list_lru(lru, memcg_aware);
+ if (err) {
+ kfree(lru->node);
+ /* Do this so a list_lru_destroy() doesn't crash: */
+ lru->node = NULL;
+ goto out;
+ }
+
+ list_lru_register(lru);
+out:
+ memcg_put_cache_ids();
+ return err;
+}
+EXPORT_SYMBOL_GPL(__list_lru_init);
+
+void list_lru_destroy(struct list_lru *lru)
+{
+ /* Already destroyed or not yet initialized? */
+ if (!lru->node)
+ return;
+
+ memcg_get_cache_ids();
+
+ list_lru_unregister(lru);
+
+ memcg_destroy_list_lru(lru);
+ kfree(lru->node);
+ lru->node = NULL;
+
+#ifdef CONFIG_MEMCG_KMEM
+ lru->shrinker_id = -1;
+#endif
+ memcg_put_cache_ids();
+}
+EXPORT_SYMBOL_GPL(list_lru_destroy);
diff --git a/mm/maccess.c b/mm/maccess.c
new file mode 100644
index 000000000..f6ea117a6
--- /dev/null
+++ b/mm/maccess.c
@@ -0,0 +1,321 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Access kernel or user memory without faulting.
+ */
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+
+bool __weak copy_from_kernel_nofault_allowed(const void *unsafe_src,
+ size_t size)
+{
+ return true;
+}
+
+#ifdef HAVE_GET_KERNEL_NOFAULT
+
+#define copy_from_kernel_nofault_loop(dst, src, len, type, err_label) \
+ while (len >= sizeof(type)) { \
+ __get_kernel_nofault(dst, src, type, err_label); \
+ dst += sizeof(type); \
+ src += sizeof(type); \
+ len -= sizeof(type); \
+ }
+
+long copy_from_kernel_nofault(void *dst, const void *src, size_t size)
+{
+ if (!copy_from_kernel_nofault_allowed(src, size))
+ return -ERANGE;
+
+ pagefault_disable();
+ copy_from_kernel_nofault_loop(dst, src, size, u64, Efault);
+ copy_from_kernel_nofault_loop(dst, src, size, u32, Efault);
+ copy_from_kernel_nofault_loop(dst, src, size, u16, Efault);
+ copy_from_kernel_nofault_loop(dst, src, size, u8, Efault);
+ pagefault_enable();
+ return 0;
+Efault:
+ pagefault_enable();
+ return -EFAULT;
+}
+EXPORT_SYMBOL_GPL(copy_from_kernel_nofault);
+
+#define copy_to_kernel_nofault_loop(dst, src, len, type, err_label) \
+ while (len >= sizeof(type)) { \
+ __put_kernel_nofault(dst, src, type, err_label); \
+ dst += sizeof(type); \
+ src += sizeof(type); \
+ len -= sizeof(type); \
+ }
+
+long copy_to_kernel_nofault(void *dst, const void *src, size_t size)
+{
+ pagefault_disable();
+ copy_to_kernel_nofault_loop(dst, src, size, u64, Efault);
+ copy_to_kernel_nofault_loop(dst, src, size, u32, Efault);
+ copy_to_kernel_nofault_loop(dst, src, size, u16, Efault);
+ copy_to_kernel_nofault_loop(dst, src, size, u8, Efault);
+ pagefault_enable();
+ return 0;
+Efault:
+ pagefault_enable();
+ return -EFAULT;
+}
+
+long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count)
+{
+ const void *src = unsafe_addr;
+
+ if (unlikely(count <= 0))
+ return 0;
+ if (!copy_from_kernel_nofault_allowed(unsafe_addr, count))
+ return -ERANGE;
+
+ pagefault_disable();
+ do {
+ __get_kernel_nofault(dst, src, u8, Efault);
+ dst++;
+ src++;
+ } while (dst[-1] && src - unsafe_addr < count);
+ pagefault_enable();
+
+ dst[-1] = '\0';
+ return src - unsafe_addr;
+Efault:
+ pagefault_enable();
+ dst[0] = '\0';
+ return -EFAULT;
+}
+#else /* HAVE_GET_KERNEL_NOFAULT */
+/**
+ * copy_from_kernel_nofault(): safely attempt to read from kernel-space
+ * @dst: pointer to the buffer that shall take the data
+ * @src: address to read from
+ * @size: size of the data chunk
+ *
+ * Safely read from kernel address @src to the buffer at @dst. If a kernel
+ * fault happens, handle that and return -EFAULT. If @src is not a valid kernel
+ * address, return -ERANGE.
+ *
+ * We ensure that the copy_from_user is executed in atomic context so that
+ * do_page_fault() doesn't attempt to take mmap_lock. This makes
+ * copy_from_kernel_nofault() suitable for use within regions where the caller
+ * already holds mmap_lock, or other locks which nest inside mmap_lock.
+ */
+long copy_from_kernel_nofault(void *dst, const void *src, size_t size)
+{
+ long ret;
+ mm_segment_t old_fs = get_fs();
+
+ if (!copy_from_kernel_nofault_allowed(src, size))
+ return -ERANGE;
+
+ set_fs(KERNEL_DS);
+ pagefault_disable();
+ ret = __copy_from_user_inatomic(dst, (__force const void __user *)src,
+ size);
+ pagefault_enable();
+ set_fs(old_fs);
+
+ if (ret)
+ return -EFAULT;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(copy_from_kernel_nofault);
+
+/**
+ * copy_to_kernel_nofault(): safely attempt to write to a location
+ * @dst: address to write to
+ * @src: pointer to the data that shall be written
+ * @size: size of the data chunk
+ *
+ * Safely write to address @dst from the buffer at @src. If a kernel fault
+ * happens, handle that and return -EFAULT.
+ */
+long copy_to_kernel_nofault(void *dst, const void *src, size_t size)
+{
+ long ret;
+ mm_segment_t old_fs = get_fs();
+
+ set_fs(KERNEL_DS);
+ pagefault_disable();
+ ret = __copy_to_user_inatomic((__force void __user *)dst, src, size);
+ pagefault_enable();
+ set_fs(old_fs);
+
+ if (ret)
+ return -EFAULT;
+ return 0;
+}
+
+/**
+ * strncpy_from_kernel_nofault: - Copy a NUL terminated string from unsafe
+ * address.
+ * @dst: Destination address, in kernel space. This buffer must be at
+ * least @count bytes long.
+ * @unsafe_addr: Unsafe address.
+ * @count: Maximum number of bytes to copy, including the trailing NUL.
+ *
+ * Copies a NUL-terminated string from unsafe address to kernel buffer.
+ *
+ * On success, returns the length of the string INCLUDING the trailing NUL.
+ *
+ * If access fails, returns -EFAULT (some data may have been copied and the
+ * trailing NUL added). If @unsafe_addr is not a valid kernel address, return
+ * -ERANGE.
+ *
+ * If @count is smaller than the length of the string, copies @count-1 bytes,
+ * sets the last byte of @dst buffer to NUL and returns @count.
+ */
+long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count)
+{
+ mm_segment_t old_fs = get_fs();
+ const void *src = unsafe_addr;
+ long ret;
+
+ if (unlikely(count <= 0))
+ return 0;
+ if (!copy_from_kernel_nofault_allowed(unsafe_addr, count))
+ return -ERANGE;
+
+ set_fs(KERNEL_DS);
+ pagefault_disable();
+
+ do {
+ ret = __get_user(*dst++, (const char __user __force *)src++);
+ } while (dst[-1] && ret == 0 && src - unsafe_addr < count);
+
+ dst[-1] = '\0';
+ pagefault_enable();
+ set_fs(old_fs);
+
+ return ret ? -EFAULT : src - unsafe_addr;
+}
+#endif /* HAVE_GET_KERNEL_NOFAULT */
+
+/**
+ * copy_from_user_nofault(): safely attempt to read from a user-space location
+ * @dst: pointer to the buffer that shall take the data
+ * @src: address to read from. This must be a user address.
+ * @size: size of the data chunk
+ *
+ * Safely read from user address @src to the buffer at @dst. If a kernel fault
+ * happens, handle that and return -EFAULT.
+ */
+long copy_from_user_nofault(void *dst, const void __user *src, size_t size)
+{
+ long ret = -EFAULT;
+ mm_segment_t old_fs = force_uaccess_begin();
+
+ if (access_ok(src, size)) {
+ pagefault_disable();
+ ret = __copy_from_user_inatomic(dst, src, size);
+ pagefault_enable();
+ }
+ force_uaccess_end(old_fs);
+
+ if (ret)
+ return -EFAULT;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(copy_from_user_nofault);
+
+/**
+ * copy_to_user_nofault(): safely attempt to write to a user-space location
+ * @dst: address to write to
+ * @src: pointer to the data that shall be written
+ * @size: size of the data chunk
+ *
+ * Safely write to address @dst from the buffer at @src. If a kernel fault
+ * happens, handle that and return -EFAULT.
+ */
+long copy_to_user_nofault(void __user *dst, const void *src, size_t size)
+{
+ long ret = -EFAULT;
+ mm_segment_t old_fs = force_uaccess_begin();
+
+ if (access_ok(dst, size)) {
+ pagefault_disable();
+ ret = __copy_to_user_inatomic(dst, src, size);
+ pagefault_enable();
+ }
+ force_uaccess_end(old_fs);
+
+ if (ret)
+ return -EFAULT;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(copy_to_user_nofault);
+
+/**
+ * strncpy_from_user_nofault: - Copy a NUL terminated string from unsafe user
+ * address.
+ * @dst: Destination address, in kernel space. This buffer must be at
+ * least @count bytes long.
+ * @unsafe_addr: Unsafe user address.
+ * @count: Maximum number of bytes to copy, including the trailing NUL.
+ *
+ * Copies a NUL-terminated string from unsafe user address to kernel buffer.
+ *
+ * On success, returns the length of the string INCLUDING the trailing NUL.
+ *
+ * If access fails, returns -EFAULT (some data may have been copied
+ * and the trailing NUL added).
+ *
+ * If @count is smaller than the length of the string, copies @count-1 bytes,
+ * sets the last byte of @dst buffer to NUL and returns @count.
+ */
+long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr,
+ long count)
+{
+ mm_segment_t old_fs;
+ long ret;
+
+ if (unlikely(count <= 0))
+ return 0;
+
+ old_fs = force_uaccess_begin();
+ pagefault_disable();
+ ret = strncpy_from_user(dst, unsafe_addr, count);
+ pagefault_enable();
+ force_uaccess_end(old_fs);
+
+ if (ret >= count) {
+ ret = count;
+ dst[ret - 1] = '\0';
+ } else if (ret > 0) {
+ ret++;
+ }
+
+ return ret;
+}
+
+/**
+ * strnlen_user_nofault: - Get the size of a user string INCLUDING final NUL.
+ * @unsafe_addr: The string to measure.
+ * @count: Maximum count (including NUL)
+ *
+ * Get the size of a NUL-terminated string in user space without pagefault.
+ *
+ * Returns the size of the string INCLUDING the terminating NUL.
+ *
+ * If the string is too long, returns a number larger than @count. User
+ * has to check the return value against "> count".
+ * On exception (or invalid count), returns 0.
+ *
+ * Unlike strnlen_user, this can be used from IRQ handler etc. because
+ * it disables pagefaults.
+ */
+long strnlen_user_nofault(const void __user *unsafe_addr, long count)
+{
+ mm_segment_t old_fs;
+ int ret;
+
+ old_fs = force_uaccess_begin();
+ pagefault_disable();
+ ret = strnlen_user(unsafe_addr, count);
+ pagefault_enable();
+ force_uaccess_end(old_fs);
+
+ return ret;
+}
diff --git a/mm/madvise.c b/mm/madvise.c
new file mode 100644
index 000000000..f71fc88f0
--- /dev/null
+++ b/mm/madvise.c
@@ -0,0 +1,1247 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/mm/madvise.c
+ *
+ * Copyright (C) 1999 Linus Torvalds
+ * Copyright (C) 2002 Christoph Hellwig
+ */
+
+#include <linux/mman.h>
+#include <linux/pagemap.h>
+#include <linux/syscalls.h>
+#include <linux/mempolicy.h>
+#include <linux/page-isolation.h>
+#include <linux/page_idle.h>
+#include <linux/userfaultfd_k.h>
+#include <linux/hugetlb.h>
+#include <linux/falloc.h>
+#include <linux/fadvise.h>
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/uio.h>
+#include <linux/ksm.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
+#include <linux/pagewalk.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/shmem_fs.h>
+#include <linux/mmu_notifier.h>
+
+#include <asm/tlb.h>
+
+#include "internal.h"
+
+struct madvise_walk_private {
+ struct mmu_gather *tlb;
+ bool pageout;
+};
+
+/*
+ * Any behaviour which results in changes to the vma->vm_flags needs to
+ * take mmap_lock for writing. Others, which simply traverse vmas, need
+ * to only take it for reading.
+ */
+static int madvise_need_mmap_write(int behavior)
+{
+ switch (behavior) {
+ case MADV_REMOVE:
+ case MADV_WILLNEED:
+ case MADV_DONTNEED:
+ case MADV_COLD:
+ case MADV_PAGEOUT:
+ case MADV_FREE:
+ return 0;
+ default:
+ /* be safe, default to 1. list exceptions explicitly */
+ return 1;
+ }
+}
+
+/*
+ * We can potentially split a vm area into separate
+ * areas, each area with its own behavior.
+ */
+static long madvise_behavior(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end, int behavior)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ int error = 0;
+ pgoff_t pgoff;
+ unsigned long new_flags = vma->vm_flags;
+
+ switch (behavior) {
+ case MADV_NORMAL:
+ new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
+ break;
+ case MADV_SEQUENTIAL:
+ new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
+ break;
+ case MADV_RANDOM:
+ new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
+ break;
+ case MADV_DONTFORK:
+ new_flags |= VM_DONTCOPY;
+ break;
+ case MADV_DOFORK:
+ if (vma->vm_flags & VM_IO) {
+ error = -EINVAL;
+ goto out;
+ }
+ new_flags &= ~VM_DONTCOPY;
+ break;
+ case MADV_WIPEONFORK:
+ /* MADV_WIPEONFORK is only supported on anonymous memory. */
+ if (vma->vm_file || vma->vm_flags & VM_SHARED) {
+ error = -EINVAL;
+ goto out;
+ }
+ new_flags |= VM_WIPEONFORK;
+ break;
+ case MADV_KEEPONFORK:
+ new_flags &= ~VM_WIPEONFORK;
+ break;
+ case MADV_DONTDUMP:
+ new_flags |= VM_DONTDUMP;
+ break;
+ case MADV_DODUMP:
+ if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
+ error = -EINVAL;
+ goto out;
+ }
+ new_flags &= ~VM_DONTDUMP;
+ break;
+ case MADV_MERGEABLE:
+ case MADV_UNMERGEABLE:
+ error = ksm_madvise(vma, start, end, behavior, &new_flags);
+ if (error)
+ goto out_convert_errno;
+ break;
+ case MADV_HUGEPAGE:
+ case MADV_NOHUGEPAGE:
+ error = hugepage_madvise(vma, &new_flags, behavior);
+ if (error)
+ goto out_convert_errno;
+ break;
+ }
+
+ if (new_flags == vma->vm_flags) {
+ *prev = vma;
+ goto out;
+ }
+
+ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+ *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
+ vma->vm_file, pgoff, vma_policy(vma),
+ vma->vm_userfaultfd_ctx);
+ if (*prev) {
+ vma = *prev;
+ goto success;
+ }
+
+ *prev = vma;
+
+ if (start != vma->vm_start) {
+ if (unlikely(mm->map_count >= sysctl_max_map_count)) {
+ error = -ENOMEM;
+ goto out;
+ }
+ error = __split_vma(mm, vma, start, 1);
+ if (error)
+ goto out_convert_errno;
+ }
+
+ if (end != vma->vm_end) {
+ if (unlikely(mm->map_count >= sysctl_max_map_count)) {
+ error = -ENOMEM;
+ goto out;
+ }
+ error = __split_vma(mm, vma, end, 0);
+ if (error)
+ goto out_convert_errno;
+ }
+
+success:
+ /*
+ * vm_flags is protected by the mmap_lock held in write mode.
+ */
+ vma->vm_flags = new_flags;
+
+out_convert_errno:
+ /*
+ * madvise() returns EAGAIN if kernel resources, such as
+ * slab, are temporarily unavailable.
+ */
+ if (error == -ENOMEM)
+ error = -EAGAIN;
+out:
+ return error;
+}
+
+#ifdef CONFIG_SWAP
+static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
+ unsigned long end, struct mm_walk *walk)
+{
+ pte_t *orig_pte;
+ struct vm_area_struct *vma = walk->private;
+ unsigned long index;
+
+ if (pmd_none_or_trans_huge_or_clear_bad(pmd))
+ return 0;
+
+ for (index = start; index != end; index += PAGE_SIZE) {
+ pte_t pte;
+ swp_entry_t entry;
+ struct page *page;
+ spinlock_t *ptl;
+
+ orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
+ pte = *(orig_pte + ((index - start) / PAGE_SIZE));
+ pte_unmap_unlock(orig_pte, ptl);
+
+ if (pte_present(pte) || pte_none(pte))
+ continue;
+ entry = pte_to_swp_entry(pte);
+ if (unlikely(non_swap_entry(entry)))
+ continue;
+
+ page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
+ vma, index, false);
+ if (page)
+ put_page(page);
+ }
+
+ return 0;
+}
+
+static const struct mm_walk_ops swapin_walk_ops = {
+ .pmd_entry = swapin_walk_pmd_entry,
+};
+
+static void force_shm_swapin_readahead(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct address_space *mapping)
+{
+ XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
+ pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
+ struct page *page;
+
+ rcu_read_lock();
+ xas_for_each(&xas, page, end_index) {
+ swp_entry_t swap;
+
+ if (!xa_is_value(page))
+ continue;
+ xas_pause(&xas);
+ rcu_read_unlock();
+
+ swap = radix_to_swp_entry(page);
+ page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
+ NULL, 0, false);
+ if (page)
+ put_page(page);
+
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+
+ lru_add_drain(); /* Push any new pages onto the LRU now */
+}
+#endif /* CONFIG_SWAP */
+
+/*
+ * Schedule all required I/O operations. Do not wait for completion.
+ */
+static long madvise_willneed(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct file *file = vma->vm_file;
+ loff_t offset;
+
+ *prev = vma;
+#ifdef CONFIG_SWAP
+ if (!file) {
+ walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
+ lru_add_drain(); /* Push any new pages onto the LRU now */
+ return 0;
+ }
+
+ if (shmem_mapping(file->f_mapping)) {
+ force_shm_swapin_readahead(vma, start, end,
+ file->f_mapping);
+ return 0;
+ }
+#else
+ if (!file)
+ return -EBADF;
+#endif
+
+ if (IS_DAX(file_inode(file))) {
+ /* no bad return value, but ignore advice */
+ return 0;
+ }
+
+ /*
+ * Filesystem's fadvise may need to take various locks. We need to
+ * explicitly grab a reference because the vma (and hence the
+ * vma's reference to the file) can go away as soon as we drop
+ * mmap_lock.
+ */
+ *prev = NULL; /* tell sys_madvise we drop mmap_lock */
+ get_file(file);
+ offset = (loff_t)(start - vma->vm_start)
+ + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+ mmap_read_unlock(mm);
+ vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
+ fput(file);
+ mmap_read_lock(mm);
+ return 0;
+}
+
+static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct madvise_walk_private *private = walk->private;
+ struct mmu_gather *tlb = private->tlb;
+ bool pageout = private->pageout;
+ struct mm_struct *mm = tlb->mm;
+ struct vm_area_struct *vma = walk->vma;
+ pte_t *orig_pte, *pte, ptent;
+ spinlock_t *ptl;
+ struct page *page = NULL;
+ LIST_HEAD(page_list);
+
+ if (fatal_signal_pending(current))
+ return -EINTR;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (pmd_trans_huge(*pmd)) {
+ pmd_t orig_pmd;
+ unsigned long next = pmd_addr_end(addr, end);
+
+ tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (!ptl)
+ return 0;
+
+ orig_pmd = *pmd;
+ if (is_huge_zero_pmd(orig_pmd))
+ goto huge_unlock;
+
+ if (unlikely(!pmd_present(orig_pmd))) {
+ VM_BUG_ON(thp_migration_supported() &&
+ !is_pmd_migration_entry(orig_pmd));
+ goto huge_unlock;
+ }
+
+ page = pmd_page(orig_pmd);
+
+ /* Do not interfere with other mappings of this page */
+ if (page_mapcount(page) != 1)
+ goto huge_unlock;
+
+ if (next - addr != HPAGE_PMD_SIZE) {
+ int err;
+
+ get_page(page);
+ spin_unlock(ptl);
+ lock_page(page);
+ err = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ if (!err)
+ goto regular_page;
+ return 0;
+ }
+
+ if (pmd_young(orig_pmd)) {
+ pmdp_invalidate(vma, addr, pmd);
+ orig_pmd = pmd_mkold(orig_pmd);
+
+ set_pmd_at(mm, addr, pmd, orig_pmd);
+ tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+ }
+
+ ClearPageReferenced(page);
+ test_and_clear_page_young(page);
+ if (pageout) {
+ if (!isolate_lru_page(page)) {
+ if (PageUnevictable(page))
+ putback_lru_page(page);
+ else
+ list_add(&page->lru, &page_list);
+ }
+ } else
+ deactivate_page(page);
+huge_unlock:
+ spin_unlock(ptl);
+ if (pageout)
+ reclaim_pages(&page_list);
+ return 0;
+ }
+
+regular_page:
+ if (pmd_trans_unstable(pmd))
+ return 0;
+#endif
+ tlb_change_page_size(tlb, PAGE_SIZE);
+ orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ flush_tlb_batched_pending(mm);
+ arch_enter_lazy_mmu_mode();
+ for (; addr < end; pte++, addr += PAGE_SIZE) {
+ ptent = *pte;
+
+ if (pte_none(ptent))
+ continue;
+
+ if (!pte_present(ptent))
+ continue;
+
+ page = vm_normal_page(vma, addr, ptent);
+ if (!page)
+ continue;
+
+ /*
+ * Creating a THP page is expensive so split it only if we
+ * are sure it's worth. Split it if we are only owner.
+ */
+ if (PageTransCompound(page)) {
+ if (page_mapcount(page) != 1)
+ break;
+ get_page(page);
+ if (!trylock_page(page)) {
+ put_page(page);
+ break;
+ }
+ pte_unmap_unlock(orig_pte, ptl);
+ if (split_huge_page(page)) {
+ unlock_page(page);
+ put_page(page);
+ pte_offset_map_lock(mm, pmd, addr, &ptl);
+ break;
+ }
+ unlock_page(page);
+ put_page(page);
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ pte--;
+ addr -= PAGE_SIZE;
+ continue;
+ }
+
+ /*
+ * Do not interfere with other mappings of this page and
+ * non-LRU page.
+ */
+ if (!PageLRU(page) || page_mapcount(page) != 1)
+ continue;
+
+ VM_BUG_ON_PAGE(PageTransCompound(page), page);
+
+ if (pte_young(ptent)) {
+ ptent = ptep_get_and_clear_full(mm, addr, pte,
+ tlb->fullmm);
+ ptent = pte_mkold(ptent);
+ set_pte_at(mm, addr, pte, ptent);
+ tlb_remove_tlb_entry(tlb, pte, addr);
+ }
+
+ /*
+ * We are deactivating a page for accelerating reclaiming.
+ * VM couldn't reclaim the page unless we clear PG_young.
+ * As a side effect, it makes confuse idle-page tracking
+ * because they will miss recent referenced history.
+ */
+ ClearPageReferenced(page);
+ test_and_clear_page_young(page);
+ if (pageout) {
+ if (!isolate_lru_page(page)) {
+ if (PageUnevictable(page))
+ putback_lru_page(page);
+ else
+ list_add(&page->lru, &page_list);
+ }
+ } else
+ deactivate_page(page);
+ }
+
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(orig_pte, ptl);
+ if (pageout)
+ reclaim_pages(&page_list);
+ cond_resched();
+
+ return 0;
+}
+
+static const struct mm_walk_ops cold_walk_ops = {
+ .pmd_entry = madvise_cold_or_pageout_pte_range,
+};
+
+static void madvise_cold_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ struct madvise_walk_private walk_private = {
+ .pageout = false,
+ .tlb = tlb,
+ };
+
+ tlb_start_vma(tlb, vma);
+ walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
+ tlb_end_vma(tlb, vma);
+}
+
+static long madvise_cold(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start_addr, unsigned long end_addr)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct mmu_gather tlb;
+
+ *prev = vma;
+ if (!can_madv_lru_vma(vma))
+ return -EINVAL;
+
+ lru_add_drain();
+ tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+ madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
+ tlb_finish_mmu(&tlb, start_addr, end_addr);
+
+ return 0;
+}
+
+static void madvise_pageout_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ struct madvise_walk_private walk_private = {
+ .pageout = true,
+ .tlb = tlb,
+ };
+
+ tlb_start_vma(tlb, vma);
+ walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
+ tlb_end_vma(tlb, vma);
+}
+
+static inline bool can_do_pageout(struct vm_area_struct *vma)
+{
+ if (vma_is_anonymous(vma))
+ return true;
+ if (!vma->vm_file)
+ return false;
+ /*
+ * paging out pagecache only for non-anonymous mappings that correspond
+ * to the files the calling process could (if tried) open for writing;
+ * otherwise we'd be including shared non-exclusive mappings, which
+ * opens a side channel.
+ */
+ return inode_owner_or_capable(file_inode(vma->vm_file)) ||
+ inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
+}
+
+static long madvise_pageout(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start_addr, unsigned long end_addr)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct mmu_gather tlb;
+
+ *prev = vma;
+ if (!can_madv_lru_vma(vma))
+ return -EINVAL;
+
+ if (!can_do_pageout(vma))
+ return 0;
+
+ lru_add_drain();
+ tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+ madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
+ tlb_finish_mmu(&tlb, start_addr, end_addr);
+
+ return 0;
+}
+
+static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+
+{
+ struct mmu_gather *tlb = walk->private;
+ struct mm_struct *mm = tlb->mm;
+ struct vm_area_struct *vma = walk->vma;
+ spinlock_t *ptl;
+ pte_t *orig_pte, *pte, ptent;
+ struct page *page;
+ int nr_swap = 0;
+ unsigned long next;
+
+ next = pmd_addr_end(addr, end);
+ if (pmd_trans_huge(*pmd))
+ if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
+ goto next;
+
+ if (pmd_trans_unstable(pmd))
+ return 0;
+
+ tlb_change_page_size(tlb, PAGE_SIZE);
+ orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ flush_tlb_batched_pending(mm);
+ arch_enter_lazy_mmu_mode();
+ for (; addr != end; pte++, addr += PAGE_SIZE) {
+ ptent = *pte;
+
+ if (pte_none(ptent))
+ continue;
+ /*
+ * If the pte has swp_entry, just clear page table to
+ * prevent swap-in which is more expensive rather than
+ * (page allocation + zeroing).
+ */
+ if (!pte_present(ptent)) {
+ swp_entry_t entry;
+
+ entry = pte_to_swp_entry(ptent);
+ if (non_swap_entry(entry))
+ continue;
+ nr_swap--;
+ free_swap_and_cache(entry);
+ pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ continue;
+ }
+
+ page = vm_normal_page(vma, addr, ptent);
+ if (!page)
+ continue;
+
+ /*
+ * If pmd isn't transhuge but the page is THP and
+ * is owned by only this process, split it and
+ * deactivate all pages.
+ */
+ if (PageTransCompound(page)) {
+ if (page_mapcount(page) != 1)
+ goto out;
+ get_page(page);
+ if (!trylock_page(page)) {
+ put_page(page);
+ goto out;
+ }
+ pte_unmap_unlock(orig_pte, ptl);
+ if (split_huge_page(page)) {
+ unlock_page(page);
+ put_page(page);
+ pte_offset_map_lock(mm, pmd, addr, &ptl);
+ goto out;
+ }
+ unlock_page(page);
+ put_page(page);
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ pte--;
+ addr -= PAGE_SIZE;
+ continue;
+ }
+
+ VM_BUG_ON_PAGE(PageTransCompound(page), page);
+
+ if (PageSwapCache(page) || PageDirty(page)) {
+ if (!trylock_page(page))
+ continue;
+ /*
+ * If page is shared with others, we couldn't clear
+ * PG_dirty of the page.
+ */
+ if (page_mapcount(page) != 1) {
+ unlock_page(page);
+ continue;
+ }
+
+ if (PageSwapCache(page) && !try_to_free_swap(page)) {
+ unlock_page(page);
+ continue;
+ }
+
+ ClearPageDirty(page);
+ unlock_page(page);
+ }
+
+ if (pte_young(ptent) || pte_dirty(ptent)) {
+ /*
+ * Some of architecture(ex, PPC) don't update TLB
+ * with set_pte_at and tlb_remove_tlb_entry so for
+ * the portability, remap the pte with old|clean
+ * after pte clearing.
+ */
+ ptent = ptep_get_and_clear_full(mm, addr, pte,
+ tlb->fullmm);
+
+ ptent = pte_mkold(ptent);
+ ptent = pte_mkclean(ptent);
+ set_pte_at(mm, addr, pte, ptent);
+ tlb_remove_tlb_entry(tlb, pte, addr);
+ }
+ mark_page_lazyfree(page);
+ }
+out:
+ if (nr_swap) {
+ if (current->mm == mm)
+ sync_mm_rss(mm);
+
+ add_mm_counter(mm, MM_SWAPENTS, nr_swap);
+ }
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(orig_pte, ptl);
+ cond_resched();
+next:
+ return 0;
+}
+
+static const struct mm_walk_ops madvise_free_walk_ops = {
+ .pmd_entry = madvise_free_pte_range,
+};
+
+static int madvise_free_single_vma(struct vm_area_struct *vma,
+ unsigned long start_addr, unsigned long end_addr)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct mmu_notifier_range range;
+ struct mmu_gather tlb;
+
+ /* MADV_FREE works for only anon vma at the moment */
+ if (!vma_is_anonymous(vma))
+ return -EINVAL;
+
+ range.start = max(vma->vm_start, start_addr);
+ if (range.start >= vma->vm_end)
+ return -EINVAL;
+ range.end = min(vma->vm_end, end_addr);
+ if (range.end <= vma->vm_start)
+ return -EINVAL;
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+ range.start, range.end);
+
+ lru_add_drain();
+ tlb_gather_mmu(&tlb, mm, range.start, range.end);
+ update_hiwater_rss(mm);
+
+ mmu_notifier_invalidate_range_start(&range);
+ tlb_start_vma(&tlb, vma);
+ walk_page_range(vma->vm_mm, range.start, range.end,
+ &madvise_free_walk_ops, &tlb);
+ tlb_end_vma(&tlb, vma);
+ mmu_notifier_invalidate_range_end(&range);
+ tlb_finish_mmu(&tlb, range.start, range.end);
+
+ return 0;
+}
+
+/*
+ * Application no longer needs these pages. If the pages are dirty,
+ * it's OK to just throw them away. The app will be more careful about
+ * data it wants to keep. Be sure to free swap resources too. The
+ * zap_page_range call sets things up for shrink_active_list to actually free
+ * these pages later if no one else has touched them in the meantime,
+ * although we could add these pages to a global reuse list for
+ * shrink_active_list to pick up before reclaiming other pages.
+ *
+ * NB: This interface discards data rather than pushes it out to swap,
+ * as some implementations do. This has performance implications for
+ * applications like large transactional databases which want to discard
+ * pages in anonymous maps after committing to backing store the data
+ * that was kept in them. There is no reason to write this data out to
+ * the swap area if the application is discarding it.
+ *
+ * An interface that causes the system to free clean pages and flush
+ * dirty pages is already available as msync(MS_INVALIDATE).
+ */
+static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ zap_page_range(vma, start, end - start);
+ return 0;
+}
+
+static long madvise_dontneed_free(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end,
+ int behavior)
+{
+ struct mm_struct *mm = vma->vm_mm;
+
+ *prev = vma;
+ if (!can_madv_lru_vma(vma))
+ return -EINVAL;
+
+ if (!userfaultfd_remove(vma, start, end)) {
+ *prev = NULL; /* mmap_lock has been dropped, prev is stale */
+
+ mmap_read_lock(mm);
+ vma = find_vma(mm, start);
+ if (!vma)
+ return -ENOMEM;
+ if (start < vma->vm_start) {
+ /*
+ * This "vma" under revalidation is the one
+ * with the lowest vma->vm_start where start
+ * is also < vma->vm_end. If start <
+ * vma->vm_start it means an hole materialized
+ * in the user address space within the
+ * virtual range passed to MADV_DONTNEED
+ * or MADV_FREE.
+ */
+ return -ENOMEM;
+ }
+ if (!can_madv_lru_vma(vma))
+ return -EINVAL;
+ if (end > vma->vm_end) {
+ /*
+ * Don't fail if end > vma->vm_end. If the old
+ * vma was splitted while the mmap_lock was
+ * released the effect of the concurrent
+ * operation may not cause madvise() to
+ * have an undefined result. There may be an
+ * adjacent next vma that we'll walk
+ * next. userfaultfd_remove() will generate an
+ * UFFD_EVENT_REMOVE repetition on the
+ * end-vma->vm_end range, but the manager can
+ * handle a repetition fine.
+ */
+ end = vma->vm_end;
+ }
+ VM_WARN_ON(start >= end);
+ }
+
+ if (behavior == MADV_DONTNEED)
+ return madvise_dontneed_single_vma(vma, start, end);
+ else if (behavior == MADV_FREE)
+ return madvise_free_single_vma(vma, start, end);
+ else
+ return -EINVAL;
+}
+
+/*
+ * Application wants to free up the pages and associated backing store.
+ * This is effectively punching a hole into the middle of a file.
+ */
+static long madvise_remove(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end)
+{
+ loff_t offset;
+ int error;
+ struct file *f;
+ struct mm_struct *mm = vma->vm_mm;
+
+ *prev = NULL; /* tell sys_madvise we drop mmap_lock */
+
+ if (vma->vm_flags & VM_LOCKED)
+ return -EINVAL;
+
+ f = vma->vm_file;
+
+ if (!f || !f->f_mapping || !f->f_mapping->host) {
+ return -EINVAL;
+ }
+
+ if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
+ return -EACCES;
+
+ offset = (loff_t)(start - vma->vm_start)
+ + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+
+ /*
+ * Filesystem's fallocate may need to take i_mutex. We need to
+ * explicitly grab a reference because the vma (and hence the
+ * vma's reference to the file) can go away as soon as we drop
+ * mmap_lock.
+ */
+ get_file(f);
+ if (userfaultfd_remove(vma, start, end)) {
+ /* mmap_lock was not released by userfaultfd_remove() */
+ mmap_read_unlock(mm);
+ }
+ error = vfs_fallocate(f,
+ FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+ offset, end - start);
+ fput(f);
+ mmap_read_lock(mm);
+ return error;
+}
+
+#ifdef CONFIG_MEMORY_FAILURE
+/*
+ * Error injection support for memory error handling.
+ */
+static int madvise_inject_error(int behavior,
+ unsigned long start, unsigned long end)
+{
+ struct zone *zone;
+ unsigned long size;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+
+ for (; start < end; start += size) {
+ unsigned long pfn;
+ struct page *page;
+ int ret;
+
+ ret = get_user_pages_fast(start, 1, 0, &page);
+ if (ret != 1)
+ return ret;
+ pfn = page_to_pfn(page);
+
+ /*
+ * When soft offlining hugepages, after migrating the page
+ * we dissolve it, therefore in the second loop "page" will
+ * no longer be a compound page.
+ */
+ size = page_size(compound_head(page));
+
+ if (behavior == MADV_SOFT_OFFLINE) {
+ pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
+ pfn, start);
+ ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
+ } else {
+ pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
+ pfn, start);
+ ret = memory_failure(pfn, MF_COUNT_INCREASED);
+ }
+
+ if (ret)
+ return ret;
+ }
+
+ /* Ensure that all poisoned pages are removed from per-cpu lists */
+ for_each_populated_zone(zone)
+ drain_all_pages(zone);
+
+ return 0;
+}
+#endif
+
+static long
+madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
+ unsigned long start, unsigned long end, int behavior)
+{
+ switch (behavior) {
+ case MADV_REMOVE:
+ return madvise_remove(vma, prev, start, end);
+ case MADV_WILLNEED:
+ return madvise_willneed(vma, prev, start, end);
+ case MADV_COLD:
+ return madvise_cold(vma, prev, start, end);
+ case MADV_PAGEOUT:
+ return madvise_pageout(vma, prev, start, end);
+ case MADV_FREE:
+ case MADV_DONTNEED:
+ return madvise_dontneed_free(vma, prev, start, end, behavior);
+ default:
+ return madvise_behavior(vma, prev, start, end, behavior);
+ }
+}
+
+static bool
+madvise_behavior_valid(int behavior)
+{
+ switch (behavior) {
+ case MADV_DOFORK:
+ case MADV_DONTFORK:
+ case MADV_NORMAL:
+ case MADV_SEQUENTIAL:
+ case MADV_RANDOM:
+ case MADV_REMOVE:
+ case MADV_WILLNEED:
+ case MADV_DONTNEED:
+ case MADV_FREE:
+ case MADV_COLD:
+ case MADV_PAGEOUT:
+#ifdef CONFIG_KSM
+ case MADV_MERGEABLE:
+ case MADV_UNMERGEABLE:
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ case MADV_HUGEPAGE:
+ case MADV_NOHUGEPAGE:
+#endif
+ case MADV_DONTDUMP:
+ case MADV_DODUMP:
+ case MADV_WIPEONFORK:
+ case MADV_KEEPONFORK:
+#ifdef CONFIG_MEMORY_FAILURE
+ case MADV_SOFT_OFFLINE:
+ case MADV_HWPOISON:
+#endif
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+static bool
+process_madvise_behavior_valid(int behavior)
+{
+ switch (behavior) {
+ case MADV_COLD:
+ case MADV_PAGEOUT:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/*
+ * The madvise(2) system call.
+ *
+ * Applications can use madvise() to advise the kernel how it should
+ * handle paging I/O in this VM area. The idea is to help the kernel
+ * use appropriate read-ahead and caching techniques. The information
+ * provided is advisory only, and can be safely disregarded by the
+ * kernel without affecting the correct operation of the application.
+ *
+ * behavior values:
+ * MADV_NORMAL - the default behavior is to read clusters. This
+ * results in some read-ahead and read-behind.
+ * MADV_RANDOM - the system should read the minimum amount of data
+ * on any access, since it is unlikely that the appli-
+ * cation will need more than what it asks for.
+ * MADV_SEQUENTIAL - pages in the given range will probably be accessed
+ * once, so they can be aggressively read ahead, and
+ * can be freed soon after they are accessed.
+ * MADV_WILLNEED - the application is notifying the system to read
+ * some pages ahead.
+ * MADV_DONTNEED - the application is finished with the given range,
+ * so the kernel can free resources associated with it.
+ * MADV_FREE - the application marks pages in the given range as lazy free,
+ * where actual purges are postponed until memory pressure happens.
+ * MADV_REMOVE - the application wants to free up the given range of
+ * pages and associated backing store.
+ * MADV_DONTFORK - omit this area from child's address space when forking:
+ * typically, to avoid COWing pages pinned by get_user_pages().
+ * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
+ * MADV_WIPEONFORK - present the child process with zero-filled memory in this
+ * range after a fork.
+ * MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
+ * MADV_HWPOISON - trigger memory error handler as if the given memory range
+ * were corrupted by unrecoverable hardware memory failure.
+ * MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
+ * MADV_MERGEABLE - the application recommends that KSM try to merge pages in
+ * this area with pages of identical content from other such areas.
+ * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
+ * MADV_HUGEPAGE - the application wants to back the given range by transparent
+ * huge pages in the future. Existing pages might be coalesced and
+ * new pages might be allocated as THP.
+ * MADV_NOHUGEPAGE - mark the given range as not worth being backed by
+ * transparent huge pages so the existing pages will not be
+ * coalesced into THP and new pages will not be allocated as THP.
+ * MADV_DONTDUMP - the application wants to prevent pages in the given range
+ * from being included in its core dump.
+ * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
+ * MADV_COLD - the application is not expected to use this memory soon,
+ * deactivate pages in this range so that they can be reclaimed
+ * easily if memory pressure hanppens.
+ * MADV_PAGEOUT - the application is not expected to use this memory soon,
+ * page out the pages in this range immediately.
+ *
+ * return values:
+ * zero - success
+ * -EINVAL - start + len < 0, start is not page-aligned,
+ * "behavior" is not a valid value, or application
+ * is attempting to release locked or shared pages,
+ * or the specified address range includes file, Huge TLB,
+ * MAP_SHARED or VMPFNMAP range.
+ * -ENOMEM - addresses in the specified range are not currently
+ * mapped, or are outside the AS of the process.
+ * -EIO - an I/O error occurred while paging in data.
+ * -EBADF - map exists, but area maps something that isn't a file.
+ * -EAGAIN - a kernel resource was temporarily unavailable.
+ */
+int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
+{
+ unsigned long end, tmp;
+ struct vm_area_struct *vma, *prev;
+ int unmapped_error = 0;
+ int error = -EINVAL;
+ int write;
+ size_t len;
+ struct blk_plug plug;
+
+ start = untagged_addr(start);
+
+ if (!madvise_behavior_valid(behavior))
+ return error;
+
+ if (!PAGE_ALIGNED(start))
+ return error;
+ len = PAGE_ALIGN(len_in);
+
+ /* Check to see whether len was rounded up from small -ve to zero */
+ if (len_in && !len)
+ return error;
+
+ end = start + len;
+ if (end < start)
+ return error;
+
+ error = 0;
+ if (end == start)
+ return error;
+
+#ifdef CONFIG_MEMORY_FAILURE
+ if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
+ return madvise_inject_error(behavior, start, start + len_in);
+#endif
+
+ write = madvise_need_mmap_write(behavior);
+ if (write) {
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+ } else {
+ mmap_read_lock(mm);
+ }
+
+ /*
+ * If the interval [start,end) covers some unmapped address
+ * ranges, just ignore them, but return -ENOMEM at the end.
+ * - different from the way of handling in mlock etc.
+ */
+ vma = find_vma_prev(mm, start, &prev);
+ if (vma && start > vma->vm_start)
+ prev = vma;
+
+ blk_start_plug(&plug);
+ for (;;) {
+ /* Still start < end. */
+ error = -ENOMEM;
+ if (!vma)
+ goto out;
+
+ /* Here start < (end|vma->vm_end). */
+ if (start < vma->vm_start) {
+ unmapped_error = -ENOMEM;
+ start = vma->vm_start;
+ if (start >= end)
+ goto out;
+ }
+
+ /* Here vma->vm_start <= start < (end|vma->vm_end) */
+ tmp = vma->vm_end;
+ if (end < tmp)
+ tmp = end;
+
+ /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
+ error = madvise_vma(vma, &prev, start, tmp, behavior);
+ if (error)
+ goto out;
+ start = tmp;
+ if (prev && start < prev->vm_end)
+ start = prev->vm_end;
+ error = unmapped_error;
+ if (start >= end)
+ goto out;
+ if (prev)
+ vma = prev->vm_next;
+ else /* madvise_remove dropped mmap_lock */
+ vma = find_vma(mm, start);
+ }
+out:
+ blk_finish_plug(&plug);
+ if (write)
+ mmap_write_unlock(mm);
+ else
+ mmap_read_unlock(mm);
+
+ return error;
+}
+
+SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
+{
+ return do_madvise(current->mm, start, len_in, behavior);
+}
+
+SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
+ size_t, vlen, int, behavior, unsigned int, flags)
+{
+ ssize_t ret;
+ struct iovec iovstack[UIO_FASTIOV], iovec;
+ struct iovec *iov = iovstack;
+ struct iov_iter iter;
+ struct pid *pid;
+ struct task_struct *task;
+ struct mm_struct *mm;
+ size_t total_len;
+ unsigned int f_flags;
+
+ if (flags != 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
+ if (ret < 0)
+ goto out;
+
+ pid = pidfd_get_pid(pidfd, &f_flags);
+ if (IS_ERR(pid)) {
+ ret = PTR_ERR(pid);
+ goto free_iov;
+ }
+
+ task = get_pid_task(pid, PIDTYPE_PID);
+ if (!task) {
+ ret = -ESRCH;
+ goto put_pid;
+ }
+
+ if (!process_madvise_behavior_valid(behavior)) {
+ ret = -EINVAL;
+ goto release_task;
+ }
+
+ /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
+ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
+ if (IS_ERR_OR_NULL(mm)) {
+ ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
+ goto release_task;
+ }
+
+ /*
+ * Require CAP_SYS_NICE for influencing process performance. Note that
+ * only non-destructive hints are currently supported.
+ */
+ if (!capable(CAP_SYS_NICE)) {
+ ret = -EPERM;
+ goto release_mm;
+ }
+
+ total_len = iov_iter_count(&iter);
+
+ while (iov_iter_count(&iter)) {
+ iovec = iov_iter_iovec(&iter);
+ ret = do_madvise(mm, (unsigned long)iovec.iov_base,
+ iovec.iov_len, behavior);
+ if (ret < 0)
+ break;
+ iov_iter_advance(&iter, iovec.iov_len);
+ }
+
+ ret = (total_len - iov_iter_count(&iter)) ? : ret;
+
+release_mm:
+ mmput(mm);
+release_task:
+ put_task_struct(task);
+put_pid:
+ put_pid(pid);
+free_iov:
+ kfree(iov);
+out:
+ return ret;
+}
diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c
new file mode 100644
index 000000000..2c7d03675
--- /dev/null
+++ b/mm/mapping_dirty_helpers.c
@@ -0,0 +1,349 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/pagewalk.h>
+#include <linux/hugetlb.h>
+#include <linux/bitops.h>
+#include <linux/mmu_notifier.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+
+/**
+ * struct wp_walk - Private struct for pagetable walk callbacks
+ * @range: Range for mmu notifiers
+ * @tlbflush_start: Address of first modified pte
+ * @tlbflush_end: Address of last modified pte + 1
+ * @total: Total number of modified ptes
+ */
+struct wp_walk {
+ struct mmu_notifier_range range;
+ unsigned long tlbflush_start;
+ unsigned long tlbflush_end;
+ unsigned long total;
+};
+
+/**
+ * wp_pte - Write-protect a pte
+ * @pte: Pointer to the pte
+ * @addr: The virtual page address
+ * @walk: pagetable walk callback argument
+ *
+ * The function write-protects a pte and records the range in
+ * virtual address space of touched ptes for efficient range TLB flushes.
+ */
+static int wp_pte(pte_t *pte, unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct wp_walk *wpwalk = walk->private;
+ pte_t ptent = *pte;
+
+ if (pte_write(ptent)) {
+ pte_t old_pte = ptep_modify_prot_start(walk->vma, addr, pte);
+
+ ptent = pte_wrprotect(old_pte);
+ ptep_modify_prot_commit(walk->vma, addr, pte, old_pte, ptent);
+ wpwalk->total++;
+ wpwalk->tlbflush_start = min(wpwalk->tlbflush_start, addr);
+ wpwalk->tlbflush_end = max(wpwalk->tlbflush_end,
+ addr + PAGE_SIZE);
+ }
+
+ return 0;
+}
+
+/**
+ * struct clean_walk - Private struct for the clean_record_pte function.
+ * @base: struct wp_walk we derive from
+ * @bitmap_pgoff: Address_space Page offset of the first bit in @bitmap
+ * @bitmap: Bitmap with one bit for each page offset in the address_space range
+ * covered.
+ * @start: Address_space page offset of first modified pte relative
+ * to @bitmap_pgoff
+ * @end: Address_space page offset of last modified pte relative
+ * to @bitmap_pgoff
+ */
+struct clean_walk {
+ struct wp_walk base;
+ pgoff_t bitmap_pgoff;
+ unsigned long *bitmap;
+ pgoff_t start;
+ pgoff_t end;
+};
+
+#define to_clean_walk(_wpwalk) container_of(_wpwalk, struct clean_walk, base)
+
+/**
+ * clean_record_pte - Clean a pte and record its address space offset in a
+ * bitmap
+ * @pte: Pointer to the pte
+ * @addr: The virtual page address
+ * @walk: pagetable walk callback argument
+ *
+ * The function cleans a pte and records the range in
+ * virtual address space of touched ptes for efficient TLB flushes.
+ * It also records dirty ptes in a bitmap representing page offsets
+ * in the address_space, as well as the first and last of the bits
+ * touched.
+ */
+static int clean_record_pte(pte_t *pte, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ struct wp_walk *wpwalk = walk->private;
+ struct clean_walk *cwalk = to_clean_walk(wpwalk);
+ pte_t ptent = *pte;
+
+ if (pte_dirty(ptent)) {
+ pgoff_t pgoff = ((addr - walk->vma->vm_start) >> PAGE_SHIFT) +
+ walk->vma->vm_pgoff - cwalk->bitmap_pgoff;
+ pte_t old_pte = ptep_modify_prot_start(walk->vma, addr, pte);
+
+ ptent = pte_mkclean(old_pte);
+ ptep_modify_prot_commit(walk->vma, addr, pte, old_pte, ptent);
+
+ wpwalk->total++;
+ wpwalk->tlbflush_start = min(wpwalk->tlbflush_start, addr);
+ wpwalk->tlbflush_end = max(wpwalk->tlbflush_end,
+ addr + PAGE_SIZE);
+
+ __set_bit(pgoff, cwalk->bitmap);
+ cwalk->start = min(cwalk->start, pgoff);
+ cwalk->end = max(cwalk->end, pgoff + 1);
+ }
+
+ return 0;
+}
+
+/*
+ * wp_clean_pmd_entry - The pagewalk pmd callback.
+ *
+ * Dirty-tracking should take place on the PTE level, so
+ * WARN() if encountering a dirty huge pmd.
+ * Furthermore, never split huge pmds, since that currently
+ * causes dirty info loss. The pagefault handler should do
+ * that if needed.
+ */
+static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ pmd_t pmdval = pmd_read_atomic(pmd);
+
+ if (!pmd_trans_unstable(&pmdval))
+ return 0;
+
+ if (pmd_none(pmdval)) {
+ walk->action = ACTION_AGAIN;
+ return 0;
+ }
+
+ /* Huge pmd, present or migrated */
+ walk->action = ACTION_CONTINUE;
+ if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval))
+ WARN_ON(pmd_write(pmdval) || pmd_dirty(pmdval));
+
+ return 0;
+}
+
+/*
+ * wp_clean_pud_entry - The pagewalk pud callback.
+ *
+ * Dirty-tracking should take place on the PTE level, so
+ * WARN() if encountering a dirty huge puds.
+ * Furthermore, never split huge puds, since that currently
+ * causes dirty info loss. The pagefault handler should do
+ * that if needed.
+ */
+static int wp_clean_pud_entry(pud_t *pud, unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ pud_t pudval = READ_ONCE(*pud);
+
+ if (!pud_trans_unstable(&pudval))
+ return 0;
+
+ if (pud_none(pudval)) {
+ walk->action = ACTION_AGAIN;
+ return 0;
+ }
+
+ /* Huge pud */
+ walk->action = ACTION_CONTINUE;
+ if (pud_trans_huge(pudval) || pud_devmap(pudval))
+ WARN_ON(pud_write(pudval) || pud_dirty(pudval));
+
+ return 0;
+}
+
+/*
+ * wp_clean_pre_vma - The pagewalk pre_vma callback.
+ *
+ * The pre_vma callback performs the cache flush, stages the tlb flush
+ * and calls the necessary mmu notifiers.
+ */
+static int wp_clean_pre_vma(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct wp_walk *wpwalk = walk->private;
+
+ wpwalk->tlbflush_start = end;
+ wpwalk->tlbflush_end = start;
+
+ mmu_notifier_range_init(&wpwalk->range, MMU_NOTIFY_PROTECTION_PAGE, 0,
+ walk->vma, walk->mm, start, end);
+ mmu_notifier_invalidate_range_start(&wpwalk->range);
+ flush_cache_range(walk->vma, start, end);
+
+ /*
+ * We're not using tlb_gather_mmu() since typically
+ * only a small subrange of PTEs are affected, whereas
+ * tlb_gather_mmu() records the full range.
+ */
+ inc_tlb_flush_pending(walk->mm);
+
+ return 0;
+}
+
+/*
+ * wp_clean_post_vma - The pagewalk post_vma callback.
+ *
+ * The post_vma callback performs the tlb flush and calls necessary mmu
+ * notifiers.
+ */
+static void wp_clean_post_vma(struct mm_walk *walk)
+{
+ struct wp_walk *wpwalk = walk->private;
+
+ if (mm_tlb_flush_nested(walk->mm))
+ flush_tlb_range(walk->vma, wpwalk->range.start,
+ wpwalk->range.end);
+ else if (wpwalk->tlbflush_end > wpwalk->tlbflush_start)
+ flush_tlb_range(walk->vma, wpwalk->tlbflush_start,
+ wpwalk->tlbflush_end);
+
+ mmu_notifier_invalidate_range_end(&wpwalk->range);
+ dec_tlb_flush_pending(walk->mm);
+}
+
+/*
+ * wp_clean_test_walk - The pagewalk test_walk callback.
+ *
+ * Won't perform dirty-tracking on COW, read-only or HUGETLB vmas.
+ */
+static int wp_clean_test_walk(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ unsigned long vm_flags = READ_ONCE(walk->vma->vm_flags);
+
+ /* Skip non-applicable VMAs */
+ if ((vm_flags & (VM_SHARED | VM_MAYWRITE | VM_HUGETLB)) !=
+ (VM_SHARED | VM_MAYWRITE))
+ return 1;
+
+ return 0;
+}
+
+static const struct mm_walk_ops clean_walk_ops = {
+ .pte_entry = clean_record_pte,
+ .pmd_entry = wp_clean_pmd_entry,
+ .pud_entry = wp_clean_pud_entry,
+ .test_walk = wp_clean_test_walk,
+ .pre_vma = wp_clean_pre_vma,
+ .post_vma = wp_clean_post_vma
+};
+
+static const struct mm_walk_ops wp_walk_ops = {
+ .pte_entry = wp_pte,
+ .pmd_entry = wp_clean_pmd_entry,
+ .pud_entry = wp_clean_pud_entry,
+ .test_walk = wp_clean_test_walk,
+ .pre_vma = wp_clean_pre_vma,
+ .post_vma = wp_clean_post_vma
+};
+
+/**
+ * wp_shared_mapping_range - Write-protect all ptes in an address space range
+ * @mapping: The address_space we want to write protect
+ * @first_index: The first page offset in the range
+ * @nr: Number of incremental page offsets to cover
+ *
+ * Note: This function currently skips transhuge page-table entries, since
+ * it's intended for dirty-tracking on the PTE level. It will warn on
+ * encountering transhuge write-enabled entries, though, and can easily be
+ * extended to handle them as well.
+ *
+ * Return: The number of ptes actually write-protected. Note that
+ * already write-protected ptes are not counted.
+ */
+unsigned long wp_shared_mapping_range(struct address_space *mapping,
+ pgoff_t first_index, pgoff_t nr)
+{
+ struct wp_walk wpwalk = { .total = 0 };
+
+ i_mmap_lock_read(mapping);
+ WARN_ON(walk_page_mapping(mapping, first_index, nr, &wp_walk_ops,
+ &wpwalk));
+ i_mmap_unlock_read(mapping);
+
+ return wpwalk.total;
+}
+EXPORT_SYMBOL_GPL(wp_shared_mapping_range);
+
+/**
+ * clean_record_shared_mapping_range - Clean and record all ptes in an
+ * address space range
+ * @mapping: The address_space we want to clean
+ * @first_index: The first page offset in the range
+ * @nr: Number of incremental page offsets to cover
+ * @bitmap_pgoff: The page offset of the first bit in @bitmap
+ * @bitmap: Pointer to a bitmap of at least @nr bits. The bitmap needs to
+ * cover the whole range @first_index..@first_index + @nr.
+ * @start: Pointer to number of the first set bit in @bitmap.
+ * is modified as new bits are set by the function.
+ * @end: Pointer to the number of the last set bit in @bitmap.
+ * none set. The value is modified as new bits are set by the function.
+ *
+ * Note: When this function returns there is no guarantee that a CPU has
+ * not already dirtied new ptes. However it will not clean any ptes not
+ * reported in the bitmap. The guarantees are as follows:
+ * a) All ptes dirty when the function starts executing will end up recorded
+ * in the bitmap.
+ * b) All ptes dirtied after that will either remain dirty, be recorded in the
+ * bitmap or both.
+ *
+ * If a caller needs to make sure all dirty ptes are picked up and none
+ * additional are added, it first needs to write-protect the address-space
+ * range and make sure new writers are blocked in page_mkwrite() or
+ * pfn_mkwrite(). And then after a TLB flush following the write-protection
+ * pick up all dirty bits.
+ *
+ * Note: This function currently skips transhuge page-table entries, since
+ * it's intended for dirty-tracking on the PTE level. It will warn on
+ * encountering transhuge dirty entries, though, and can easily be extended
+ * to handle them as well.
+ *
+ * Return: The number of dirty ptes actually cleaned.
+ */
+unsigned long clean_record_shared_mapping_range(struct address_space *mapping,
+ pgoff_t first_index, pgoff_t nr,
+ pgoff_t bitmap_pgoff,
+ unsigned long *bitmap,
+ pgoff_t *start,
+ pgoff_t *end)
+{
+ bool none_set = (*start >= *end);
+ struct clean_walk cwalk = {
+ .base = { .total = 0 },
+ .bitmap_pgoff = bitmap_pgoff,
+ .bitmap = bitmap,
+ .start = none_set ? nr : *start,
+ .end = none_set ? 0 : *end,
+ };
+
+ i_mmap_lock_read(mapping);
+ WARN_ON(walk_page_mapping(mapping, first_index, nr, &clean_walk_ops,
+ &cwalk.base));
+ i_mmap_unlock_read(mapping);
+
+ *start = cwalk.start;
+ *end = cwalk.end;
+
+ return cwalk.base.total;
+}
+EXPORT_SYMBOL_GPL(clean_record_shared_mapping_range);
diff --git a/mm/memblock.c b/mm/memblock.c
new file mode 100644
index 000000000..f72d53957
--- /dev/null
+++ b/mm/memblock.c
@@ -0,0 +1,2031 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Procedures for maintaining information about logical memory blocks.
+ *
+ * Peter Bergner, IBM Corp. June 2001.
+ * Copyright (C) 2001 Peter Bergner.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/bitops.h>
+#include <linux/poison.h>
+#include <linux/pfn.h>
+#include <linux/debugfs.h>
+#include <linux/kmemleak.h>
+#include <linux/seq_file.h>
+#include <linux/memblock.h>
+
+#include <asm/sections.h>
+#include <linux/io.h>
+
+#include "internal.h"
+
+#define INIT_MEMBLOCK_REGIONS 128
+#define INIT_PHYSMEM_REGIONS 4
+
+#ifndef INIT_MEMBLOCK_RESERVED_REGIONS
+# define INIT_MEMBLOCK_RESERVED_REGIONS INIT_MEMBLOCK_REGIONS
+#endif
+
+/**
+ * DOC: memblock overview
+ *
+ * Memblock is a method of managing memory regions during the early
+ * boot period when the usual kernel memory allocators are not up and
+ * running.
+ *
+ * Memblock views the system memory as collections of contiguous
+ * regions. There are several types of these collections:
+ *
+ * * ``memory`` - describes the physical memory available to the
+ * kernel; this may differ from the actual physical memory installed
+ * in the system, for instance when the memory is restricted with
+ * ``mem=`` command line parameter
+ * * ``reserved`` - describes the regions that were allocated
+ * * ``physmem`` - describes the actual physical memory available during
+ * boot regardless of the possible restrictions and memory hot(un)plug;
+ * the ``physmem`` type is only available on some architectures.
+ *
+ * Each region is represented by struct memblock_region that
+ * defines the region extents, its attributes and NUMA node id on NUMA
+ * systems. Every memory type is described by the struct memblock_type
+ * which contains an array of memory regions along with
+ * the allocator metadata. The "memory" and "reserved" types are nicely
+ * wrapped with struct memblock. This structure is statically
+ * initialized at build time. The region arrays are initially sized to
+ * %INIT_MEMBLOCK_REGIONS for "memory" and %INIT_MEMBLOCK_RESERVED_REGIONS
+ * for "reserved". The region array for "physmem" is initially sized to
+ * %INIT_PHYSMEM_REGIONS.
+ * The memblock_allow_resize() enables automatic resizing of the region
+ * arrays during addition of new regions. This feature should be used
+ * with care so that memory allocated for the region array will not
+ * overlap with areas that should be reserved, for example initrd.
+ *
+ * The early architecture setup should tell memblock what the physical
+ * memory layout is by using memblock_add() or memblock_add_node()
+ * functions. The first function does not assign the region to a NUMA
+ * node and it is appropriate for UMA systems. Yet, it is possible to
+ * use it on NUMA systems as well and assign the region to a NUMA node
+ * later in the setup process using memblock_set_node(). The
+ * memblock_add_node() performs such an assignment directly.
+ *
+ * Once memblock is setup the memory can be allocated using one of the
+ * API variants:
+ *
+ * * memblock_phys_alloc*() - these functions return the **physical**
+ * address of the allocated memory
+ * * memblock_alloc*() - these functions return the **virtual** address
+ * of the allocated memory.
+ *
+ * Note, that both API variants use implicit assumptions about allowed
+ * memory ranges and the fallback methods. Consult the documentation
+ * of memblock_alloc_internal() and memblock_alloc_range_nid()
+ * functions for more elaborate description.
+ *
+ * As the system boot progresses, the architecture specific mem_init()
+ * function frees all the memory to the buddy page allocator.
+ *
+ * Unless an architecture enables %CONFIG_ARCH_KEEP_MEMBLOCK, the
+ * memblock data structures (except "physmem") will be discarded after the
+ * system initialization completes.
+ */
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+struct pglist_data __refdata contig_page_data;
+EXPORT_SYMBOL(contig_page_data);
+#endif
+
+unsigned long max_low_pfn;
+unsigned long min_low_pfn;
+unsigned long max_pfn;
+unsigned long long max_possible_pfn;
+
+static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
+static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock;
+#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
+static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS];
+#endif
+
+struct memblock memblock __initdata_memblock = {
+ .memory.regions = memblock_memory_init_regions,
+ .memory.cnt = 1, /* empty dummy entry */
+ .memory.max = INIT_MEMBLOCK_REGIONS,
+ .memory.name = "memory",
+
+ .reserved.regions = memblock_reserved_init_regions,
+ .reserved.cnt = 1, /* empty dummy entry */
+ .reserved.max = INIT_MEMBLOCK_RESERVED_REGIONS,
+ .reserved.name = "reserved",
+
+ .bottom_up = false,
+ .current_limit = MEMBLOCK_ALLOC_ANYWHERE,
+};
+
+#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
+struct memblock_type physmem = {
+ .regions = memblock_physmem_init_regions,
+ .cnt = 1, /* empty dummy entry */
+ .max = INIT_PHYSMEM_REGIONS,
+ .name = "physmem",
+};
+#endif
+
+/*
+ * keep a pointer to &memblock.memory in the text section to use it in
+ * __next_mem_range() and its helpers.
+ * For architectures that do not keep memblock data after init, this
+ * pointer will be reset to NULL at memblock_discard()
+ */
+static __refdata struct memblock_type *memblock_memory = &memblock.memory;
+
+#define for_each_memblock_type(i, memblock_type, rgn) \
+ for (i = 0, rgn = &memblock_type->regions[0]; \
+ i < memblock_type->cnt; \
+ i++, rgn = &memblock_type->regions[i])
+
+#define memblock_dbg(fmt, ...) \
+ do { \
+ if (memblock_debug) \
+ pr_info(fmt, ##__VA_ARGS__); \
+ } while (0)
+
+static int memblock_debug __initdata_memblock;
+static bool system_has_some_mirror __initdata_memblock = false;
+static int memblock_can_resize __initdata_memblock;
+static int memblock_memory_in_slab __initdata_memblock = 0;
+static int memblock_reserved_in_slab __initdata_memblock = 0;
+
+static enum memblock_flags __init_memblock choose_memblock_flags(void)
+{
+ return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE;
+}
+
+/* adjust *@size so that (@base + *@size) doesn't overflow, return new size */
+static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size)
+{
+ return *size = min(*size, PHYS_ADDR_MAX - base);
+}
+
+/*
+ * Address comparison utilities
+ */
+static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1,
+ phys_addr_t base2, phys_addr_t size2)
+{
+ return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
+}
+
+bool __init_memblock memblock_overlaps_region(struct memblock_type *type,
+ phys_addr_t base, phys_addr_t size)
+{
+ unsigned long i;
+
+ memblock_cap_size(base, &size);
+
+ for (i = 0; i < type->cnt; i++)
+ if (memblock_addrs_overlap(base, size, type->regions[i].base,
+ type->regions[i].size))
+ break;
+ return i < type->cnt;
+}
+
+/**
+ * __memblock_find_range_bottom_up - find free area utility in bottom-up
+ * @start: start of candidate range
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or
+ * %MEMBLOCK_ALLOC_ACCESSIBLE
+ * @size: size of free area to find
+ * @align: alignment of free area to find
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ * @flags: pick from blocks based on memory attributes
+ *
+ * Utility called from memblock_find_in_range_node(), find free area bottom-up.
+ *
+ * Return:
+ * Found address on success, 0 on failure.
+ */
+static phys_addr_t __init_memblock
+__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
+ phys_addr_t size, phys_addr_t align, int nid,
+ enum memblock_flags flags)
+{
+ phys_addr_t this_start, this_end, cand;
+ u64 i;
+
+ for_each_free_mem_range(i, nid, flags, &this_start, &this_end, NULL) {
+ this_start = clamp(this_start, start, end);
+ this_end = clamp(this_end, start, end);
+
+ cand = round_up(this_start, align);
+ if (cand < this_end && this_end - cand >= size)
+ return cand;
+ }
+
+ return 0;
+}
+
+/**
+ * __memblock_find_range_top_down - find free area utility, in top-down
+ * @start: start of candidate range
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or
+ * %MEMBLOCK_ALLOC_ACCESSIBLE
+ * @size: size of free area to find
+ * @align: alignment of free area to find
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ * @flags: pick from blocks based on memory attributes
+ *
+ * Utility called from memblock_find_in_range_node(), find free area top-down.
+ *
+ * Return:
+ * Found address on success, 0 on failure.
+ */
+static phys_addr_t __init_memblock
+__memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
+ phys_addr_t size, phys_addr_t align, int nid,
+ enum memblock_flags flags)
+{
+ phys_addr_t this_start, this_end, cand;
+ u64 i;
+
+ for_each_free_mem_range_reverse(i, nid, flags, &this_start, &this_end,
+ NULL) {
+ this_start = clamp(this_start, start, end);
+ this_end = clamp(this_end, start, end);
+
+ if (this_end < size)
+ continue;
+
+ cand = round_down(this_end - size, align);
+ if (cand >= this_start)
+ return cand;
+ }
+
+ return 0;
+}
+
+/**
+ * memblock_find_in_range_node - find free area in given range and node
+ * @size: size of free area to find
+ * @align: alignment of free area to find
+ * @start: start of candidate range
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or
+ * %MEMBLOCK_ALLOC_ACCESSIBLE
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ * @flags: pick from blocks based on memory attributes
+ *
+ * Find @size free area aligned to @align in the specified range and node.
+ *
+ * Return:
+ * Found address on success, 0 on failure.
+ */
+static phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
+ phys_addr_t align, phys_addr_t start,
+ phys_addr_t end, int nid,
+ enum memblock_flags flags)
+{
+ /* pump up @end */
+ if (end == MEMBLOCK_ALLOC_ACCESSIBLE ||
+ end == MEMBLOCK_ALLOC_KASAN)
+ end = memblock.current_limit;
+
+ /* avoid allocating the first page */
+ start = max_t(phys_addr_t, start, PAGE_SIZE);
+ end = max(start, end);
+
+ if (memblock_bottom_up())
+ return __memblock_find_range_bottom_up(start, end, size, align,
+ nid, flags);
+ else
+ return __memblock_find_range_top_down(start, end, size, align,
+ nid, flags);
+}
+
+/**
+ * memblock_find_in_range - find free area in given range
+ * @start: start of candidate range
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or
+ * %MEMBLOCK_ALLOC_ACCESSIBLE
+ * @size: size of free area to find
+ * @align: alignment of free area to find
+ *
+ * Find @size free area aligned to @align in the specified range.
+ *
+ * Return:
+ * Found address on success, 0 on failure.
+ */
+phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
+ phys_addr_t end, phys_addr_t size,
+ phys_addr_t align)
+{
+ phys_addr_t ret;
+ enum memblock_flags flags = choose_memblock_flags();
+
+again:
+ ret = memblock_find_in_range_node(size, align, start, end,
+ NUMA_NO_NODE, flags);
+
+ if (!ret && (flags & MEMBLOCK_MIRROR)) {
+ pr_warn("Could not allocate %pap bytes of mirrored memory\n",
+ &size);
+ flags &= ~MEMBLOCK_MIRROR;
+ goto again;
+ }
+
+ return ret;
+}
+
+static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
+{
+ type->total_size -= type->regions[r].size;
+ memmove(&type->regions[r], &type->regions[r + 1],
+ (type->cnt - (r + 1)) * sizeof(type->regions[r]));
+ type->cnt--;
+
+ /* Special case for empty arrays */
+ if (type->cnt == 0) {
+ WARN_ON(type->total_size != 0);
+ type->cnt = 1;
+ type->regions[0].base = 0;
+ type->regions[0].size = 0;
+ type->regions[0].flags = 0;
+ memblock_set_region_node(&type->regions[0], MAX_NUMNODES);
+ }
+}
+
+#ifndef CONFIG_ARCH_KEEP_MEMBLOCK
+/**
+ * memblock_discard - discard memory and reserved arrays if they were allocated
+ */
+void __init memblock_discard(void)
+{
+ phys_addr_t addr, size;
+
+ if (memblock.reserved.regions != memblock_reserved_init_regions) {
+ addr = __pa(memblock.reserved.regions);
+ size = PAGE_ALIGN(sizeof(struct memblock_region) *
+ memblock.reserved.max);
+ if (memblock_reserved_in_slab)
+ kfree(memblock.reserved.regions);
+ else
+ __memblock_free_late(addr, size);
+ }
+
+ if (memblock.memory.regions != memblock_memory_init_regions) {
+ addr = __pa(memblock.memory.regions);
+ size = PAGE_ALIGN(sizeof(struct memblock_region) *
+ memblock.memory.max);
+ if (memblock_memory_in_slab)
+ kfree(memblock.memory.regions);
+ else
+ __memblock_free_late(addr, size);
+ }
+
+ memblock_memory = NULL;
+}
+#endif
+
+/**
+ * memblock_double_array - double the size of the memblock regions array
+ * @type: memblock type of the regions array being doubled
+ * @new_area_start: starting address of memory range to avoid overlap with
+ * @new_area_size: size of memory range to avoid overlap with
+ *
+ * Double the size of the @type regions array. If memblock is being used to
+ * allocate memory for a new reserved regions array and there is a previously
+ * allocated memory range [@new_area_start, @new_area_start + @new_area_size]
+ * waiting to be reserved, ensure the memory used by the new array does
+ * not overlap.
+ *
+ * Return:
+ * 0 on success, -1 on failure.
+ */
+static int __init_memblock memblock_double_array(struct memblock_type *type,
+ phys_addr_t new_area_start,
+ phys_addr_t new_area_size)
+{
+ struct memblock_region *new_array, *old_array;
+ phys_addr_t old_alloc_size, new_alloc_size;
+ phys_addr_t old_size, new_size, addr, new_end;
+ int use_slab = slab_is_available();
+ int *in_slab;
+
+ /* We don't allow resizing until we know about the reserved regions
+ * of memory that aren't suitable for allocation
+ */
+ if (!memblock_can_resize)
+ return -1;
+
+ /* Calculate new doubled size */
+ old_size = type->max * sizeof(struct memblock_region);
+ new_size = old_size << 1;
+ /*
+ * We need to allocated new one align to PAGE_SIZE,
+ * so we can free them completely later.
+ */
+ old_alloc_size = PAGE_ALIGN(old_size);
+ new_alloc_size = PAGE_ALIGN(new_size);
+
+ /* Retrieve the slab flag */
+ if (type == &memblock.memory)
+ in_slab = &memblock_memory_in_slab;
+ else
+ in_slab = &memblock_reserved_in_slab;
+
+ /* Try to find some space for it */
+ if (use_slab) {
+ new_array = kmalloc(new_size, GFP_KERNEL);
+ addr = new_array ? __pa(new_array) : 0;
+ } else {
+ /* only exclude range when trying to double reserved.regions */
+ if (type != &memblock.reserved)
+ new_area_start = new_area_size = 0;
+
+ addr = memblock_find_in_range(new_area_start + new_area_size,
+ memblock.current_limit,
+ new_alloc_size, PAGE_SIZE);
+ if (!addr && new_area_size)
+ addr = memblock_find_in_range(0,
+ min(new_area_start, memblock.current_limit),
+ new_alloc_size, PAGE_SIZE);
+
+ new_array = addr ? __va(addr) : NULL;
+ }
+ if (!addr) {
+ pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
+ type->name, type->max, type->max * 2);
+ return -1;
+ }
+
+ new_end = addr + new_size - 1;
+ memblock_dbg("memblock: %s is doubled to %ld at [%pa-%pa]",
+ type->name, type->max * 2, &addr, &new_end);
+
+ /*
+ * Found space, we now need to move the array over before we add the
+ * reserved region since it may be our reserved array itself that is
+ * full.
+ */
+ memcpy(new_array, type->regions, old_size);
+ memset(new_array + type->max, 0, old_size);
+ old_array = type->regions;
+ type->regions = new_array;
+ type->max <<= 1;
+
+ /* Free old array. We needn't free it if the array is the static one */
+ if (*in_slab)
+ kfree(old_array);
+ else if (old_array != memblock_memory_init_regions &&
+ old_array != memblock_reserved_init_regions)
+ memblock_free(__pa(old_array), old_alloc_size);
+
+ /*
+ * Reserve the new array if that comes from the memblock. Otherwise, we
+ * needn't do it
+ */
+ if (!use_slab)
+ BUG_ON(memblock_reserve(addr, new_alloc_size));
+
+ /* Update slab flag */
+ *in_slab = use_slab;
+
+ return 0;
+}
+
+/**
+ * memblock_merge_regions - merge neighboring compatible regions
+ * @type: memblock type to scan
+ *
+ * Scan @type and merge neighboring compatible regions.
+ */
+static void __init_memblock memblock_merge_regions(struct memblock_type *type)
+{
+ int i = 0;
+
+ /* cnt never goes below 1 */
+ while (i < type->cnt - 1) {
+ struct memblock_region *this = &type->regions[i];
+ struct memblock_region *next = &type->regions[i + 1];
+
+ if (this->base + this->size != next->base ||
+ memblock_get_region_node(this) !=
+ memblock_get_region_node(next) ||
+ this->flags != next->flags) {
+ BUG_ON(this->base + this->size > next->base);
+ i++;
+ continue;
+ }
+
+ this->size += next->size;
+ /* move forward from next + 1, index of which is i + 2 */
+ memmove(next, next + 1, (type->cnt - (i + 2)) * sizeof(*next));
+ type->cnt--;
+ }
+}
+
+/**
+ * memblock_insert_region - insert new memblock region
+ * @type: memblock type to insert into
+ * @idx: index for the insertion point
+ * @base: base address of the new region
+ * @size: size of the new region
+ * @nid: node id of the new region
+ * @flags: flags of the new region
+ *
+ * Insert new memblock region [@base, @base + @size) into @type at @idx.
+ * @type must already have extra room to accommodate the new region.
+ */
+static void __init_memblock memblock_insert_region(struct memblock_type *type,
+ int idx, phys_addr_t base,
+ phys_addr_t size,
+ int nid,
+ enum memblock_flags flags)
+{
+ struct memblock_region *rgn = &type->regions[idx];
+
+ BUG_ON(type->cnt >= type->max);
+ memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn));
+ rgn->base = base;
+ rgn->size = size;
+ rgn->flags = flags;
+ memblock_set_region_node(rgn, nid);
+ type->cnt++;
+ type->total_size += size;
+}
+
+/**
+ * memblock_add_range - add new memblock region
+ * @type: memblock type to add new region into
+ * @base: base address of the new region
+ * @size: size of the new region
+ * @nid: nid of the new region
+ * @flags: flags of the new region
+ *
+ * Add new memblock region [@base, @base + @size) into @type. The new region
+ * is allowed to overlap with existing ones - overlaps don't affect already
+ * existing regions. @type is guaranteed to be minimal (all neighbouring
+ * compatible regions are merged) after the addition.
+ *
+ * Return:
+ * 0 on success, -errno on failure.
+ */
+static int __init_memblock memblock_add_range(struct memblock_type *type,
+ phys_addr_t base, phys_addr_t size,
+ int nid, enum memblock_flags flags)
+{
+ bool insert = false;
+ phys_addr_t obase = base;
+ phys_addr_t end = base + memblock_cap_size(base, &size);
+ int idx, nr_new;
+ struct memblock_region *rgn;
+
+ if (!size)
+ return 0;
+
+ /* special case for empty array */
+ if (type->regions[0].size == 0) {
+ WARN_ON(type->cnt != 1 || type->total_size);
+ type->regions[0].base = base;
+ type->regions[0].size = size;
+ type->regions[0].flags = flags;
+ memblock_set_region_node(&type->regions[0], nid);
+ type->total_size = size;
+ return 0;
+ }
+repeat:
+ /*
+ * The following is executed twice. Once with %false @insert and
+ * then with %true. The first counts the number of regions needed
+ * to accommodate the new area. The second actually inserts them.
+ */
+ base = obase;
+ nr_new = 0;
+
+ for_each_memblock_type(idx, type, rgn) {
+ phys_addr_t rbase = rgn->base;
+ phys_addr_t rend = rbase + rgn->size;
+
+ if (rbase >= end)
+ break;
+ if (rend <= base)
+ continue;
+ /*
+ * @rgn overlaps. If it separates the lower part of new
+ * area, insert that portion.
+ */
+ if (rbase > base) {
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+ WARN_ON(nid != memblock_get_region_node(rgn));
+#endif
+ WARN_ON(flags != rgn->flags);
+ nr_new++;
+ if (insert)
+ memblock_insert_region(type, idx++, base,
+ rbase - base, nid,
+ flags);
+ }
+ /* area below @rend is dealt with, forget about it */
+ base = min(rend, end);
+ }
+
+ /* insert the remaining portion */
+ if (base < end) {
+ nr_new++;
+ if (insert)
+ memblock_insert_region(type, idx, base, end - base,
+ nid, flags);
+ }
+
+ if (!nr_new)
+ return 0;
+
+ /*
+ * If this was the first round, resize array and repeat for actual
+ * insertions; otherwise, merge and return.
+ */
+ if (!insert) {
+ while (type->cnt + nr_new > type->max)
+ if (memblock_double_array(type, obase, size) < 0)
+ return -ENOMEM;
+ insert = true;
+ goto repeat;
+ } else {
+ memblock_merge_regions(type);
+ return 0;
+ }
+}
+
+/**
+ * memblock_add_node - add new memblock region within a NUMA node
+ * @base: base address of the new region
+ * @size: size of the new region
+ * @nid: nid of the new region
+ *
+ * Add new memblock region [@base, @base + @size) to the "memory"
+ * type. See memblock_add_range() description for mode details
+ *
+ * Return:
+ * 0 on success, -errno on failure.
+ */
+int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
+ int nid)
+{
+ return memblock_add_range(&memblock.memory, base, size, nid, 0);
+}
+
+/**
+ * memblock_add - add new memblock region
+ * @base: base address of the new region
+ * @size: size of the new region
+ *
+ * Add new memblock region [@base, @base + @size) to the "memory"
+ * type. See memblock_add_range() description for mode details
+ *
+ * Return:
+ * 0 on success, -errno on failure.
+ */
+int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
+{
+ phys_addr_t end = base + size - 1;
+
+ memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
+ &base, &end, (void *)_RET_IP_);
+
+ return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0);
+}
+
+/**
+ * memblock_isolate_range - isolate given range into disjoint memblocks
+ * @type: memblock type to isolate range for
+ * @base: base of range to isolate
+ * @size: size of range to isolate
+ * @start_rgn: out parameter for the start of isolated region
+ * @end_rgn: out parameter for the end of isolated region
+ *
+ * Walk @type and ensure that regions don't cross the boundaries defined by
+ * [@base, @base + @size). Crossing regions are split at the boundaries,
+ * which may create at most two more regions. The index of the first
+ * region inside the range is returned in *@start_rgn and end in *@end_rgn.
+ *
+ * Return:
+ * 0 on success, -errno on failure.
+ */
+static int __init_memblock memblock_isolate_range(struct memblock_type *type,
+ phys_addr_t base, phys_addr_t size,
+ int *start_rgn, int *end_rgn)
+{
+ phys_addr_t end = base + memblock_cap_size(base, &size);
+ int idx;
+ struct memblock_region *rgn;
+
+ *start_rgn = *end_rgn = 0;
+
+ if (!size)
+ return 0;
+
+ /* we'll create at most two more regions */
+ while (type->cnt + 2 > type->max)
+ if (memblock_double_array(type, base, size) < 0)
+ return -ENOMEM;
+
+ for_each_memblock_type(idx, type, rgn) {
+ phys_addr_t rbase = rgn->base;
+ phys_addr_t rend = rbase + rgn->size;
+
+ if (rbase >= end)
+ break;
+ if (rend <= base)
+ continue;
+
+ if (rbase < base) {
+ /*
+ * @rgn intersects from below. Split and continue
+ * to process the next region - the new top half.
+ */
+ rgn->base = base;
+ rgn->size -= base - rbase;
+ type->total_size -= base - rbase;
+ memblock_insert_region(type, idx, rbase, base - rbase,
+ memblock_get_region_node(rgn),
+ rgn->flags);
+ } else if (rend > end) {
+ /*
+ * @rgn intersects from above. Split and redo the
+ * current region - the new bottom half.
+ */
+ rgn->base = end;
+ rgn->size -= end - rbase;
+ type->total_size -= end - rbase;
+ memblock_insert_region(type, idx--, rbase, end - rbase,
+ memblock_get_region_node(rgn),
+ rgn->flags);
+ } else {
+ /* @rgn is fully contained, record it */
+ if (!*end_rgn)
+ *start_rgn = idx;
+ *end_rgn = idx + 1;
+ }
+ }
+
+ return 0;
+}
+
+static int __init_memblock memblock_remove_range(struct memblock_type *type,
+ phys_addr_t base, phys_addr_t size)
+{
+ int start_rgn, end_rgn;
+ int i, ret;
+
+ ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
+ if (ret)
+ return ret;
+
+ for (i = end_rgn - 1; i >= start_rgn; i--)
+ memblock_remove_region(type, i);
+ return 0;
+}
+
+int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
+{
+ phys_addr_t end = base + size - 1;
+
+ memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
+ &base, &end, (void *)_RET_IP_);
+
+ return memblock_remove_range(&memblock.memory, base, size);
+}
+
+/**
+ * memblock_free - free boot memory block
+ * @base: phys starting address of the boot memory block
+ * @size: size of the boot memory block in bytes
+ *
+ * Free boot memory block previously allocated by memblock_alloc_xx() API.
+ * The freeing memory will not be released to the buddy allocator.
+ */
+int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
+{
+ phys_addr_t end = base + size - 1;
+
+ memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
+ &base, &end, (void *)_RET_IP_);
+
+ kmemleak_free_part_phys(base, size);
+ return memblock_remove_range(&memblock.reserved, base, size);
+}
+
+int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
+{
+ phys_addr_t end = base + size - 1;
+
+ memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
+ &base, &end, (void *)_RET_IP_);
+
+ return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0);
+}
+
+#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
+int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size)
+{
+ phys_addr_t end = base + size - 1;
+
+ memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
+ &base, &end, (void *)_RET_IP_);
+
+ return memblock_add_range(&physmem, base, size, MAX_NUMNODES, 0);
+}
+#endif
+
+/**
+ * memblock_setclr_flag - set or clear flag for a memory region
+ * @base: base address of the region
+ * @size: size of the region
+ * @set: set or clear the flag
+ * @flag: the flag to udpate
+ *
+ * This function isolates region [@base, @base + @size), and sets/clears flag
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+static int __init_memblock memblock_setclr_flag(phys_addr_t base,
+ phys_addr_t size, int set, int flag)
+{
+ struct memblock_type *type = &memblock.memory;
+ int i, ret, start_rgn, end_rgn;
+
+ ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
+ if (ret)
+ return ret;
+
+ for (i = start_rgn; i < end_rgn; i++) {
+ struct memblock_region *r = &type->regions[i];
+
+ if (set)
+ r->flags |= flag;
+ else
+ r->flags &= ~flag;
+ }
+
+ memblock_merge_regions(type);
+ return 0;
+}
+
+/**
+ * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
+{
+ return memblock_setclr_flag(base, size, 1, MEMBLOCK_HOTPLUG);
+}
+
+/**
+ * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
+{
+ return memblock_setclr_flag(base, size, 0, MEMBLOCK_HOTPLUG);
+}
+
+/**
+ * memblock_mark_mirror - Mark mirrored memory with flag MEMBLOCK_MIRROR.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
+{
+ system_has_some_mirror = true;
+
+ return memblock_setclr_flag(base, size, 1, MEMBLOCK_MIRROR);
+}
+
+/**
+ * memblock_mark_nomap - Mark a memory region with flag MEMBLOCK_NOMAP.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size)
+{
+ return memblock_setclr_flag(base, size, 1, MEMBLOCK_NOMAP);
+}
+
+/**
+ * memblock_clear_nomap - Clear flag MEMBLOCK_NOMAP for a specified region.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size)
+{
+ return memblock_setclr_flag(base, size, 0, MEMBLOCK_NOMAP);
+}
+
+static bool should_skip_region(struct memblock_type *type,
+ struct memblock_region *m,
+ int nid, int flags)
+{
+ int m_nid = memblock_get_region_node(m);
+
+ /* we never skip regions when iterating memblock.reserved or physmem */
+ if (type != memblock_memory)
+ return false;
+
+ /* only memory regions are associated with nodes, check it */
+ if (nid != NUMA_NO_NODE && nid != m_nid)
+ return true;
+
+ /* skip hotpluggable memory regions if needed */
+ if (movable_node_is_enabled() && memblock_is_hotpluggable(m) &&
+ !(flags & MEMBLOCK_HOTPLUG))
+ return true;
+
+ /* if we want mirror memory skip non-mirror memory regions */
+ if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m))
+ return true;
+
+ /* skip nomap memory unless we were asked for it explicitly */
+ if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m))
+ return true;
+
+ return false;
+}
+
+/**
+ * __next_mem_range - next function for for_each_free_mem_range() etc.
+ * @idx: pointer to u64 loop variable
+ * @nid: node selector, %NUMA_NO_NODE for all nodes
+ * @flags: pick from blocks based on memory attributes
+ * @type_a: pointer to memblock_type from where the range is taken
+ * @type_b: pointer to memblock_type which excludes memory from being taken
+ * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ * @out_nid: ptr to int for nid of the range, can be %NULL
+ *
+ * Find the first area from *@idx which matches @nid, fill the out
+ * parameters, and update *@idx for the next iteration. The lower 32bit of
+ * *@idx contains index into type_a and the upper 32bit indexes the
+ * areas before each region in type_b. For example, if type_b regions
+ * look like the following,
+ *
+ * 0:[0-16), 1:[32-48), 2:[128-130)
+ *
+ * The upper 32bit indexes the following regions.
+ *
+ * 0:[0-0), 1:[16-32), 2:[48-128), 3:[130-MAX)
+ *
+ * As both region arrays are sorted, the function advances the two indices
+ * in lockstep and returns each intersection.
+ */
+void __next_mem_range(u64 *idx, int nid, enum memblock_flags flags,
+ struct memblock_type *type_a,
+ struct memblock_type *type_b, phys_addr_t *out_start,
+ phys_addr_t *out_end, int *out_nid)
+{
+ int idx_a = *idx & 0xffffffff;
+ int idx_b = *idx >> 32;
+
+ if (WARN_ONCE(nid == MAX_NUMNODES,
+ "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
+ nid = NUMA_NO_NODE;
+
+ for (; idx_a < type_a->cnt; idx_a++) {
+ struct memblock_region *m = &type_a->regions[idx_a];
+
+ phys_addr_t m_start = m->base;
+ phys_addr_t m_end = m->base + m->size;
+ int m_nid = memblock_get_region_node(m);
+
+ if (should_skip_region(type_a, m, nid, flags))
+ continue;
+
+ if (!type_b) {
+ if (out_start)
+ *out_start = m_start;
+ if (out_end)
+ *out_end = m_end;
+ if (out_nid)
+ *out_nid = m_nid;
+ idx_a++;
+ *idx = (u32)idx_a | (u64)idx_b << 32;
+ return;
+ }
+
+ /* scan areas before each reservation */
+ for (; idx_b < type_b->cnt + 1; idx_b++) {
+ struct memblock_region *r;
+ phys_addr_t r_start;
+ phys_addr_t r_end;
+
+ r = &type_b->regions[idx_b];
+ r_start = idx_b ? r[-1].base + r[-1].size : 0;
+ r_end = idx_b < type_b->cnt ?
+ r->base : PHYS_ADDR_MAX;
+
+ /*
+ * if idx_b advanced past idx_a,
+ * break out to advance idx_a
+ */
+ if (r_start >= m_end)
+ break;
+ /* if the two regions intersect, we're done */
+ if (m_start < r_end) {
+ if (out_start)
+ *out_start =
+ max(m_start, r_start);
+ if (out_end)
+ *out_end = min(m_end, r_end);
+ if (out_nid)
+ *out_nid = m_nid;
+ /*
+ * The region which ends first is
+ * advanced for the next iteration.
+ */
+ if (m_end <= r_end)
+ idx_a++;
+ else
+ idx_b++;
+ *idx = (u32)idx_a | (u64)idx_b << 32;
+ return;
+ }
+ }
+ }
+
+ /* signal end of iteration */
+ *idx = ULLONG_MAX;
+}
+
+/**
+ * __next_mem_range_rev - generic next function for for_each_*_range_rev()
+ *
+ * @idx: pointer to u64 loop variable
+ * @nid: node selector, %NUMA_NO_NODE for all nodes
+ * @flags: pick from blocks based on memory attributes
+ * @type_a: pointer to memblock_type from where the range is taken
+ * @type_b: pointer to memblock_type which excludes memory from being taken
+ * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ * @out_nid: ptr to int for nid of the range, can be %NULL
+ *
+ * Finds the next range from type_a which is not marked as unsuitable
+ * in type_b.
+ *
+ * Reverse of __next_mem_range().
+ */
+void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
+ enum memblock_flags flags,
+ struct memblock_type *type_a,
+ struct memblock_type *type_b,
+ phys_addr_t *out_start,
+ phys_addr_t *out_end, int *out_nid)
+{
+ int idx_a = *idx & 0xffffffff;
+ int idx_b = *idx >> 32;
+
+ if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
+ nid = NUMA_NO_NODE;
+
+ if (*idx == (u64)ULLONG_MAX) {
+ idx_a = type_a->cnt - 1;
+ if (type_b != NULL)
+ idx_b = type_b->cnt;
+ else
+ idx_b = 0;
+ }
+
+ for (; idx_a >= 0; idx_a--) {
+ struct memblock_region *m = &type_a->regions[idx_a];
+
+ phys_addr_t m_start = m->base;
+ phys_addr_t m_end = m->base + m->size;
+ int m_nid = memblock_get_region_node(m);
+
+ if (should_skip_region(type_a, m, nid, flags))
+ continue;
+
+ if (!type_b) {
+ if (out_start)
+ *out_start = m_start;
+ if (out_end)
+ *out_end = m_end;
+ if (out_nid)
+ *out_nid = m_nid;
+ idx_a--;
+ *idx = (u32)idx_a | (u64)idx_b << 32;
+ return;
+ }
+
+ /* scan areas before each reservation */
+ for (; idx_b >= 0; idx_b--) {
+ struct memblock_region *r;
+ phys_addr_t r_start;
+ phys_addr_t r_end;
+
+ r = &type_b->regions[idx_b];
+ r_start = idx_b ? r[-1].base + r[-1].size : 0;
+ r_end = idx_b < type_b->cnt ?
+ r->base : PHYS_ADDR_MAX;
+ /*
+ * if idx_b advanced past idx_a,
+ * break out to advance idx_a
+ */
+
+ if (r_end <= m_start)
+ break;
+ /* if the two regions intersect, we're done */
+ if (m_end > r_start) {
+ if (out_start)
+ *out_start = max(m_start, r_start);
+ if (out_end)
+ *out_end = min(m_end, r_end);
+ if (out_nid)
+ *out_nid = m_nid;
+ if (m_start >= r_start)
+ idx_a--;
+ else
+ idx_b--;
+ *idx = (u32)idx_a | (u64)idx_b << 32;
+ return;
+ }
+ }
+ }
+ /* signal end of iteration */
+ *idx = ULLONG_MAX;
+}
+
+/*
+ * Common iterator interface used to define for_each_mem_pfn_range().
+ */
+void __init_memblock __next_mem_pfn_range(int *idx, int nid,
+ unsigned long *out_start_pfn,
+ unsigned long *out_end_pfn, int *out_nid)
+{
+ struct memblock_type *type = &memblock.memory;
+ struct memblock_region *r;
+ int r_nid;
+
+ while (++*idx < type->cnt) {
+ r = &type->regions[*idx];
+ r_nid = memblock_get_region_node(r);
+
+ if (PFN_UP(r->base) >= PFN_DOWN(r->base + r->size))
+ continue;
+ if (nid == MAX_NUMNODES || nid == r_nid)
+ break;
+ }
+ if (*idx >= type->cnt) {
+ *idx = -1;
+ return;
+ }
+
+ if (out_start_pfn)
+ *out_start_pfn = PFN_UP(r->base);
+ if (out_end_pfn)
+ *out_end_pfn = PFN_DOWN(r->base + r->size);
+ if (out_nid)
+ *out_nid = r_nid;
+}
+
+/**
+ * memblock_set_node - set node ID on memblock regions
+ * @base: base of area to set node ID for
+ * @size: size of area to set node ID for
+ * @type: memblock type to set node ID for
+ * @nid: node ID to set
+ *
+ * Set the nid of memblock @type regions in [@base, @base + @size) to @nid.
+ * Regions which cross the area boundaries are split as necessary.
+ *
+ * Return:
+ * 0 on success, -errno on failure.
+ */
+int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
+ struct memblock_type *type, int nid)
+{
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+ int start_rgn, end_rgn;
+ int i, ret;
+
+ ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
+ if (ret)
+ return ret;
+
+ for (i = start_rgn; i < end_rgn; i++)
+ memblock_set_region_node(&type->regions[i], nid);
+
+ memblock_merge_regions(type);
+#endif
+ return 0;
+}
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+/**
+ * __next_mem_pfn_range_in_zone - iterator for for_each_*_range_in_zone()
+ *
+ * @idx: pointer to u64 loop variable
+ * @zone: zone in which all of the memory blocks reside
+ * @out_spfn: ptr to ulong for start pfn of the range, can be %NULL
+ * @out_epfn: ptr to ulong for end pfn of the range, can be %NULL
+ *
+ * This function is meant to be a zone/pfn specific wrapper for the
+ * for_each_mem_range type iterators. Specifically they are used in the
+ * deferred memory init routines and as such we were duplicating much of
+ * this logic throughout the code. So instead of having it in multiple
+ * locations it seemed like it would make more sense to centralize this to
+ * one new iterator that does everything they need.
+ */
+void __init_memblock
+__next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
+ unsigned long *out_spfn, unsigned long *out_epfn)
+{
+ int zone_nid = zone_to_nid(zone);
+ phys_addr_t spa, epa;
+ int nid;
+
+ __next_mem_range(idx, zone_nid, MEMBLOCK_NONE,
+ &memblock.memory, &memblock.reserved,
+ &spa, &epa, &nid);
+
+ while (*idx != U64_MAX) {
+ unsigned long epfn = PFN_DOWN(epa);
+ unsigned long spfn = PFN_UP(spa);
+
+ /*
+ * Verify the end is at least past the start of the zone and
+ * that we have at least one PFN to initialize.
+ */
+ if (zone->zone_start_pfn < epfn && spfn < epfn) {
+ /* if we went too far just stop searching */
+ if (zone_end_pfn(zone) <= spfn) {
+ *idx = U64_MAX;
+ break;
+ }
+
+ if (out_spfn)
+ *out_spfn = max(zone->zone_start_pfn, spfn);
+ if (out_epfn)
+ *out_epfn = min(zone_end_pfn(zone), epfn);
+
+ return;
+ }
+
+ __next_mem_range(idx, zone_nid, MEMBLOCK_NONE,
+ &memblock.memory, &memblock.reserved,
+ &spa, &epa, &nid);
+ }
+
+ /* signal end of iteration */
+ if (out_spfn)
+ *out_spfn = ULONG_MAX;
+ if (out_epfn)
+ *out_epfn = 0;
+}
+
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+
+/**
+ * memblock_alloc_range_nid - allocate boot memory block
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @start: the lower bound of the memory region to allocate (phys address)
+ * @end: the upper bound of the memory region to allocate (phys address)
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ * @exact_nid: control the allocation fall back to other nodes
+ *
+ * The allocation is performed from memory region limited by
+ * memblock.current_limit if @end == %MEMBLOCK_ALLOC_ACCESSIBLE.
+ *
+ * If the specified node can not hold the requested memory and @exact_nid
+ * is false, the allocation falls back to any node in the system.
+ *
+ * For systems with memory mirroring, the allocation is attempted first
+ * from the regions with mirroring enabled and then retried from any
+ * memory region.
+ *
+ * In addition, function sets the min_count to 0 using kmemleak_alloc_phys for
+ * allocated boot memory block, so that it is never reported as leaks.
+ *
+ * Return:
+ * Physical address of allocated memory block on success, %0 on failure.
+ */
+phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
+ phys_addr_t align, phys_addr_t start,
+ phys_addr_t end, int nid,
+ bool exact_nid)
+{
+ enum memblock_flags flags = choose_memblock_flags();
+ phys_addr_t found;
+
+ if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
+ nid = NUMA_NO_NODE;
+
+ if (!align) {
+ /* Can't use WARNs this early in boot on powerpc */
+ dump_stack();
+ align = SMP_CACHE_BYTES;
+ }
+
+again:
+ found = memblock_find_in_range_node(size, align, start, end, nid,
+ flags);
+ if (found && !memblock_reserve(found, size))
+ goto done;
+
+ if (nid != NUMA_NO_NODE && !exact_nid) {
+ found = memblock_find_in_range_node(size, align, start,
+ end, NUMA_NO_NODE,
+ flags);
+ if (found && !memblock_reserve(found, size))
+ goto done;
+ }
+
+ if (flags & MEMBLOCK_MIRROR) {
+ flags &= ~MEMBLOCK_MIRROR;
+ pr_warn("Could not allocate %pap bytes of mirrored memory\n",
+ &size);
+ goto again;
+ }
+
+ return 0;
+
+done:
+ /* Skip kmemleak for kasan_init() due to high volume. */
+ if (end != MEMBLOCK_ALLOC_KASAN)
+ /*
+ * The min_count is set to 0 so that memblock allocated
+ * blocks are never reported as leaks. This is because many
+ * of these blocks are only referred via the physical
+ * address which is not looked up by kmemleak.
+ */
+ kmemleak_alloc_phys(found, size, 0, 0);
+
+ return found;
+}
+
+/**
+ * memblock_phys_alloc_range - allocate a memory block inside specified range
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @start: the lower bound of the memory region to allocate (physical address)
+ * @end: the upper bound of the memory region to allocate (physical address)
+ *
+ * Allocate @size bytes in the between @start and @end.
+ *
+ * Return: physical address of the allocated memory block on success,
+ * %0 on failure.
+ */
+phys_addr_t __init memblock_phys_alloc_range(phys_addr_t size,
+ phys_addr_t align,
+ phys_addr_t start,
+ phys_addr_t end)
+{
+ return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE,
+ false);
+}
+
+/**
+ * memblock_phys_alloc_try_nid - allocate a memory block from specified MUMA node
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ *
+ * Allocates memory block from the specified NUMA node. If the node
+ * has no available memory, attempts to allocated from any node in the
+ * system.
+ *
+ * Return: physical address of the allocated memory block on success,
+ * %0 on failure.
+ */
+phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid)
+{
+ return memblock_alloc_range_nid(size, align, 0,
+ MEMBLOCK_ALLOC_ACCESSIBLE, nid, false);
+}
+
+/**
+ * memblock_alloc_internal - allocate boot memory block
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @min_addr: the lower bound of the memory region to allocate (phys address)
+ * @max_addr: the upper bound of the memory region to allocate (phys address)
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ * @exact_nid: control the allocation fall back to other nodes
+ *
+ * Allocates memory block using memblock_alloc_range_nid() and
+ * converts the returned physical address to virtual.
+ *
+ * The @min_addr limit is dropped if it can not be satisfied and the allocation
+ * will fall back to memory below @min_addr. Other constraints, such
+ * as node and mirrored memory will be handled again in
+ * memblock_alloc_range_nid().
+ *
+ * Return:
+ * Virtual address of allocated memory block on success, NULL on failure.
+ */
+static void * __init memblock_alloc_internal(
+ phys_addr_t size, phys_addr_t align,
+ phys_addr_t min_addr, phys_addr_t max_addr,
+ int nid, bool exact_nid)
+{
+ phys_addr_t alloc;
+
+ /*
+ * Detect any accidental use of these APIs after slab is ready, as at
+ * this moment memblock may be deinitialized already and its
+ * internal data may be destroyed (after execution of memblock_free_all)
+ */
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc_node(size, GFP_NOWAIT, nid);
+
+ if (max_addr > memblock.current_limit)
+ max_addr = memblock.current_limit;
+
+ alloc = memblock_alloc_range_nid(size, align, min_addr, max_addr, nid,
+ exact_nid);
+
+ /* retry allocation without lower limit */
+ if (!alloc && min_addr)
+ alloc = memblock_alloc_range_nid(size, align, 0, max_addr, nid,
+ exact_nid);
+
+ if (!alloc)
+ return NULL;
+
+ return phys_to_virt(alloc);
+}
+
+/**
+ * memblock_alloc_exact_nid_raw - allocate boot memory block on the exact node
+ * without zeroing memory
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @min_addr: the lower bound of the memory region from where the allocation
+ * is preferred (phys address)
+ * @max_addr: the upper bound of the memory region from where the allocation
+ * is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to
+ * allocate only from memory limited by memblock.current_limit value
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ *
+ * Public function, provides additional debug information (including caller
+ * info), if enabled. Does not zero allocated memory.
+ *
+ * Return:
+ * Virtual address of allocated memory block on success, NULL on failure.
+ */
+void * __init memblock_alloc_exact_nid_raw(
+ phys_addr_t size, phys_addr_t align,
+ phys_addr_t min_addr, phys_addr_t max_addr,
+ int nid)
+{
+ void *ptr;
+
+ memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n",
+ __func__, (u64)size, (u64)align, nid, &min_addr,
+ &max_addr, (void *)_RET_IP_);
+
+ ptr = memblock_alloc_internal(size, align,
+ min_addr, max_addr, nid, true);
+ if (ptr && size > 0)
+ page_init_poison(ptr, size);
+
+ return ptr;
+}
+
+/**
+ * memblock_alloc_try_nid_raw - allocate boot memory block without zeroing
+ * memory and without panicking
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @min_addr: the lower bound of the memory region from where the allocation
+ * is preferred (phys address)
+ * @max_addr: the upper bound of the memory region from where the allocation
+ * is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to
+ * allocate only from memory limited by memblock.current_limit value
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ *
+ * Public function, provides additional debug information (including caller
+ * info), if enabled. Does not zero allocated memory, does not panic if request
+ * cannot be satisfied.
+ *
+ * Return:
+ * Virtual address of allocated memory block on success, NULL on failure.
+ */
+void * __init memblock_alloc_try_nid_raw(
+ phys_addr_t size, phys_addr_t align,
+ phys_addr_t min_addr, phys_addr_t max_addr,
+ int nid)
+{
+ void *ptr;
+
+ memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n",
+ __func__, (u64)size, (u64)align, nid, &min_addr,
+ &max_addr, (void *)_RET_IP_);
+
+ ptr = memblock_alloc_internal(size, align,
+ min_addr, max_addr, nid, false);
+ if (ptr && size > 0)
+ page_init_poison(ptr, size);
+
+ return ptr;
+}
+
+/**
+ * memblock_alloc_try_nid - allocate boot memory block
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @min_addr: the lower bound of the memory region from where the allocation
+ * is preferred (phys address)
+ * @max_addr: the upper bound of the memory region from where the allocation
+ * is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to
+ * allocate only from memory limited by memblock.current_limit value
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ *
+ * Public function, provides additional debug information (including caller
+ * info), if enabled. This function zeroes the allocated memory.
+ *
+ * Return:
+ * Virtual address of allocated memory block on success, NULL on failure.
+ */
+void * __init memblock_alloc_try_nid(
+ phys_addr_t size, phys_addr_t align,
+ phys_addr_t min_addr, phys_addr_t max_addr,
+ int nid)
+{
+ void *ptr;
+
+ memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n",
+ __func__, (u64)size, (u64)align, nid, &min_addr,
+ &max_addr, (void *)_RET_IP_);
+ ptr = memblock_alloc_internal(size, align,
+ min_addr, max_addr, nid, false);
+ if (ptr)
+ memset(ptr, 0, size);
+
+ return ptr;
+}
+
+/**
+ * __memblock_free_late - free pages directly to buddy allocator
+ * @base: phys starting address of the boot memory block
+ * @size: size of the boot memory block in bytes
+ *
+ * This is only useful when the memblock allocator has already been torn
+ * down, but we are still initializing the system. Pages are released directly
+ * to the buddy allocator.
+ */
+void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
+{
+ phys_addr_t cursor, end;
+
+ end = base + size - 1;
+ memblock_dbg("%s: [%pa-%pa] %pS\n",
+ __func__, &base, &end, (void *)_RET_IP_);
+ kmemleak_free_part_phys(base, size);
+ cursor = PFN_UP(base);
+ end = PFN_DOWN(base + size);
+
+ for (; cursor < end; cursor++) {
+ memblock_free_pages(pfn_to_page(cursor), cursor, 0);
+ totalram_pages_inc();
+ }
+}
+
+/*
+ * Remaining API functions
+ */
+
+phys_addr_t __init_memblock memblock_phys_mem_size(void)
+{
+ return memblock.memory.total_size;
+}
+
+phys_addr_t __init_memblock memblock_reserved_size(void)
+{
+ return memblock.reserved.total_size;
+}
+
+/* lowest address */
+phys_addr_t __init_memblock memblock_start_of_DRAM(void)
+{
+ return memblock.memory.regions[0].base;
+}
+
+phys_addr_t __init_memblock memblock_end_of_DRAM(void)
+{
+ int idx = memblock.memory.cnt - 1;
+
+ return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size);
+}
+
+static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit)
+{
+ phys_addr_t max_addr = PHYS_ADDR_MAX;
+ struct memblock_region *r;
+
+ /*
+ * translate the memory @limit size into the max address within one of
+ * the memory memblock regions, if the @limit exceeds the total size
+ * of those regions, max_addr will keep original value PHYS_ADDR_MAX
+ */
+ for_each_mem_region(r) {
+ if (limit <= r->size) {
+ max_addr = r->base + limit;
+ break;
+ }
+ limit -= r->size;
+ }
+
+ return max_addr;
+}
+
+void __init memblock_enforce_memory_limit(phys_addr_t limit)
+{
+ phys_addr_t max_addr;
+
+ if (!limit)
+ return;
+
+ max_addr = __find_max_addr(limit);
+
+ /* @limit exceeds the total size of the memory, do nothing */
+ if (max_addr == PHYS_ADDR_MAX)
+ return;
+
+ /* truncate both memory and reserved regions */
+ memblock_remove_range(&memblock.memory, max_addr,
+ PHYS_ADDR_MAX);
+ memblock_remove_range(&memblock.reserved, max_addr,
+ PHYS_ADDR_MAX);
+}
+
+void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size)
+{
+ int start_rgn, end_rgn;
+ int i, ret;
+
+ if (!size)
+ return;
+
+ ret = memblock_isolate_range(&memblock.memory, base, size,
+ &start_rgn, &end_rgn);
+ if (ret)
+ return;
+
+ /* remove all the MAP regions */
+ for (i = memblock.memory.cnt - 1; i >= end_rgn; i--)
+ if (!memblock_is_nomap(&memblock.memory.regions[i]))
+ memblock_remove_region(&memblock.memory, i);
+
+ for (i = start_rgn - 1; i >= 0; i--)
+ if (!memblock_is_nomap(&memblock.memory.regions[i]))
+ memblock_remove_region(&memblock.memory, i);
+
+ /* truncate the reserved regions */
+ memblock_remove_range(&memblock.reserved, 0, base);
+ memblock_remove_range(&memblock.reserved,
+ base + size, PHYS_ADDR_MAX);
+}
+
+void __init memblock_mem_limit_remove_map(phys_addr_t limit)
+{
+ phys_addr_t max_addr;
+
+ if (!limit)
+ return;
+
+ max_addr = __find_max_addr(limit);
+
+ /* @limit exceeds the total size of the memory, do nothing */
+ if (max_addr == PHYS_ADDR_MAX)
+ return;
+
+ memblock_cap_memory_range(0, max_addr);
+}
+
+static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr)
+{
+ unsigned int left = 0, right = type->cnt;
+
+ do {
+ unsigned int mid = (right + left) / 2;
+
+ if (addr < type->regions[mid].base)
+ right = mid;
+ else if (addr >= (type->regions[mid].base +
+ type->regions[mid].size))
+ left = mid + 1;
+ else
+ return mid;
+ } while (left < right);
+ return -1;
+}
+
+bool __init_memblock memblock_is_reserved(phys_addr_t addr)
+{
+ return memblock_search(&memblock.reserved, addr) != -1;
+}
+
+bool __init_memblock memblock_is_memory(phys_addr_t addr)
+{
+ return memblock_search(&memblock.memory, addr) != -1;
+}
+
+bool __init_memblock memblock_is_map_memory(phys_addr_t addr)
+{
+ int i = memblock_search(&memblock.memory, addr);
+
+ if (i == -1)
+ return false;
+ return !memblock_is_nomap(&memblock.memory.regions[i]);
+}
+
+int __init_memblock memblock_search_pfn_nid(unsigned long pfn,
+ unsigned long *start_pfn, unsigned long *end_pfn)
+{
+ struct memblock_type *type = &memblock.memory;
+ int mid = memblock_search(type, PFN_PHYS(pfn));
+
+ if (mid == -1)
+ return -1;
+
+ *start_pfn = PFN_DOWN(type->regions[mid].base);
+ *end_pfn = PFN_DOWN(type->regions[mid].base + type->regions[mid].size);
+
+ return memblock_get_region_node(&type->regions[mid]);
+}
+
+/**
+ * memblock_is_region_memory - check if a region is a subset of memory
+ * @base: base of region to check
+ * @size: size of region to check
+ *
+ * Check if the region [@base, @base + @size) is a subset of a memory block.
+ *
+ * Return:
+ * 0 if false, non-zero if true
+ */
+bool __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
+{
+ int idx = memblock_search(&memblock.memory, base);
+ phys_addr_t end = base + memblock_cap_size(base, &size);
+
+ if (idx == -1)
+ return false;
+ return (memblock.memory.regions[idx].base +
+ memblock.memory.regions[idx].size) >= end;
+}
+
+/**
+ * memblock_is_region_reserved - check if a region intersects reserved memory
+ * @base: base of region to check
+ * @size: size of region to check
+ *
+ * Check if the region [@base, @base + @size) intersects a reserved
+ * memory block.
+ *
+ * Return:
+ * True if they intersect, false if not.
+ */
+bool __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
+{
+ return memblock_overlaps_region(&memblock.reserved, base, size);
+}
+
+void __init_memblock memblock_trim_memory(phys_addr_t align)
+{
+ phys_addr_t start, end, orig_start, orig_end;
+ struct memblock_region *r;
+
+ for_each_mem_region(r) {
+ orig_start = r->base;
+ orig_end = r->base + r->size;
+ start = round_up(orig_start, align);
+ end = round_down(orig_end, align);
+
+ if (start == orig_start && end == orig_end)
+ continue;
+
+ if (start < end) {
+ r->base = start;
+ r->size = end - start;
+ } else {
+ memblock_remove_region(&memblock.memory,
+ r - memblock.memory.regions);
+ r--;
+ }
+ }
+}
+
+void __init_memblock memblock_set_current_limit(phys_addr_t limit)
+{
+ memblock.current_limit = limit;
+}
+
+phys_addr_t __init_memblock memblock_get_current_limit(void)
+{
+ return memblock.current_limit;
+}
+
+static void __init_memblock memblock_dump(struct memblock_type *type)
+{
+ phys_addr_t base, end, size;
+ enum memblock_flags flags;
+ int idx;
+ struct memblock_region *rgn;
+
+ pr_info(" %s.cnt = 0x%lx\n", type->name, type->cnt);
+
+ for_each_memblock_type(idx, type, rgn) {
+ char nid_buf[32] = "";
+
+ base = rgn->base;
+ size = rgn->size;
+ end = base + size - 1;
+ flags = rgn->flags;
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+ if (memblock_get_region_node(rgn) != MAX_NUMNODES)
+ snprintf(nid_buf, sizeof(nid_buf), " on node %d",
+ memblock_get_region_node(rgn));
+#endif
+ pr_info(" %s[%#x]\t[%pa-%pa], %pa bytes%s flags: %#x\n",
+ type->name, idx, &base, &end, &size, nid_buf, flags);
+ }
+}
+
+static void __init_memblock __memblock_dump_all(void)
+{
+ pr_info("MEMBLOCK configuration:\n");
+ pr_info(" memory size = %pa reserved size = %pa\n",
+ &memblock.memory.total_size,
+ &memblock.reserved.total_size);
+
+ memblock_dump(&memblock.memory);
+ memblock_dump(&memblock.reserved);
+#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
+ memblock_dump(&physmem);
+#endif
+}
+
+void __init_memblock memblock_dump_all(void)
+{
+ if (memblock_debug)
+ __memblock_dump_all();
+}
+
+void __init memblock_allow_resize(void)
+{
+ memblock_can_resize = 1;
+}
+
+static int __init early_memblock(char *p)
+{
+ if (p && strstr(p, "debug"))
+ memblock_debug = 1;
+ return 0;
+}
+early_param("memblock", early_memblock);
+
+static void __init __free_pages_memory(unsigned long start, unsigned long end)
+{
+ int order;
+
+ while (start < end) {
+ order = min(MAX_ORDER - 1UL, __ffs(start));
+
+ while (start + (1UL << order) > end)
+ order--;
+
+ memblock_free_pages(pfn_to_page(start), start, order);
+
+ start += (1UL << order);
+ }
+}
+
+static unsigned long __init __free_memory_core(phys_addr_t start,
+ phys_addr_t end)
+{
+ unsigned long start_pfn = PFN_UP(start);
+ unsigned long end_pfn = min_t(unsigned long,
+ PFN_DOWN(end), max_low_pfn);
+
+ if (start_pfn >= end_pfn)
+ return 0;
+
+ __free_pages_memory(start_pfn, end_pfn);
+
+ return end_pfn - start_pfn;
+}
+
+static unsigned long __init free_low_memory_core_early(void)
+{
+ unsigned long count = 0;
+ phys_addr_t start, end;
+ u64 i;
+
+ memblock_clear_hotplug(0, -1);
+
+ for_each_reserved_mem_range(i, &start, &end)
+ reserve_bootmem_region(start, end);
+
+ /*
+ * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
+ * because in some case like Node0 doesn't have RAM installed
+ * low ram will be on Node1
+ */
+ for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
+ NULL)
+ count += __free_memory_core(start, end);
+
+ return count;
+}
+
+static int reset_managed_pages_done __initdata;
+
+void reset_node_managed_pages(pg_data_t *pgdat)
+{
+ struct zone *z;
+
+ for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
+ atomic_long_set(&z->managed_pages, 0);
+}
+
+void __init reset_all_zones_managed_pages(void)
+{
+ struct pglist_data *pgdat;
+
+ if (reset_managed_pages_done)
+ return;
+
+ for_each_online_pgdat(pgdat)
+ reset_node_managed_pages(pgdat);
+
+ reset_managed_pages_done = 1;
+}
+
+/**
+ * memblock_free_all - release free pages to the buddy allocator
+ *
+ * Return: the number of pages actually released.
+ */
+unsigned long __init memblock_free_all(void)
+{
+ unsigned long pages;
+
+ reset_all_zones_managed_pages();
+
+ pages = free_low_memory_core_early();
+ totalram_pages_add(pages);
+
+ return pages;
+}
+
+#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_ARCH_KEEP_MEMBLOCK)
+
+static int memblock_debug_show(struct seq_file *m, void *private)
+{
+ struct memblock_type *type = m->private;
+ struct memblock_region *reg;
+ int i;
+ phys_addr_t end;
+
+ for (i = 0; i < type->cnt; i++) {
+ reg = &type->regions[i];
+ end = reg->base + reg->size - 1;
+
+ seq_printf(m, "%4d: ", i);
+ seq_printf(m, "%pa..%pa\n", &reg->base, &end);
+ }
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(memblock_debug);
+
+static int __init memblock_init_debugfs(void)
+{
+ struct dentry *root = debugfs_create_dir("memblock", NULL);
+
+ debugfs_create_file("memory", 0444, root,
+ &memblock.memory, &memblock_debug_fops);
+ debugfs_create_file("reserved", 0444, root,
+ &memblock.reserved, &memblock_debug_fops);
+#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
+ debugfs_create_file("physmem", 0444, root, &physmem,
+ &memblock_debug_fops);
+#endif
+
+ return 0;
+}
+__initcall(memblock_init_debugfs);
+
+#endif /* CONFIG_DEBUG_FS */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
new file mode 100644
index 000000000..ddc8ed096
--- /dev/null
+++ b/mm/memcontrol.c
@@ -0,0 +1,7535 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* memcontrol.c - Memory Controller
+ *
+ * Copyright IBM Corporation, 2007
+ * Author Balbir Singh <balbir@linux.vnet.ibm.com>
+ *
+ * Copyright 2007 OpenVZ SWsoft Inc
+ * Author: Pavel Emelianov <xemul@openvz.org>
+ *
+ * Memory thresholds
+ * Copyright (C) 2009 Nokia Corporation
+ * Author: Kirill A. Shutemov
+ *
+ * Kernel Memory Controller
+ * Copyright (C) 2012 Parallels Inc. and Google Inc.
+ * Authors: Glauber Costa and Suleiman Souhlal
+ *
+ * Native page reclaim
+ * Charge lifetime sanitation
+ * Lockless page tracking & accounting
+ * Unified hierarchy configuration model
+ * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
+ */
+
+#include <linux/page_counter.h>
+#include <linux/memcontrol.h>
+#include <linux/cgroup.h>
+#include <linux/pagewalk.h>
+#include <linux/sched/mm.h>
+#include <linux/shmem_fs.h>
+#include <linux/hugetlb.h>
+#include <linux/pagemap.h>
+#include <linux/vm_event_item.h>
+#include <linux/smp.h>
+#include <linux/page-flags.h>
+#include <linux/backing-dev.h>
+#include <linux/bit_spinlock.h>
+#include <linux/rcupdate.h>
+#include <linux/limits.h>
+#include <linux/export.h>
+#include <linux/mutex.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/spinlock.h>
+#include <linux/eventfd.h>
+#include <linux/poll.h>
+#include <linux/sort.h>
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+#include <linux/vmpressure.h>
+#include <linux/mm_inline.h>
+#include <linux/swap_cgroup.h>
+#include <linux/cpu.h>
+#include <linux/oom.h>
+#include <linux/lockdep.h>
+#include <linux/file.h>
+#include <linux/tracehook.h>
+#include <linux/psi.h>
+#include <linux/seq_buf.h>
+#include "internal.h"
+#include <net/sock.h>
+#include <net/ip.h>
+#include "slab.h"
+
+#include <linux/uaccess.h>
+
+#include <trace/events/vmscan.h>
+
+struct cgroup_subsys memory_cgrp_subsys __read_mostly;
+EXPORT_SYMBOL(memory_cgrp_subsys);
+
+struct mem_cgroup *root_mem_cgroup __read_mostly;
+
+/* Active memory cgroup to use from an interrupt context */
+DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
+
+/* Socket memory accounting disabled? */
+static bool cgroup_memory_nosocket;
+
+/* Kernel memory accounting disabled? */
+static bool cgroup_memory_nokmem;
+
+/* Whether the swap controller is active */
+#ifdef CONFIG_MEMCG_SWAP
+bool cgroup_memory_noswap __read_mostly;
+#else
+#define cgroup_memory_noswap 1
+#endif
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
+#endif
+
+/* Whether legacy memory+swap accounting is active */
+static bool do_memsw_account(void)
+{
+ return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
+}
+
+#define THRESHOLDS_EVENTS_TARGET 128
+#define SOFTLIMIT_EVENTS_TARGET 1024
+
+/*
+ * Cgroups above their limits are maintained in a RB-Tree, independent of
+ * their hierarchy representation
+ */
+
+struct mem_cgroup_tree_per_node {
+ struct rb_root rb_root;
+ struct rb_node *rb_rightmost;
+ spinlock_t lock;
+};
+
+struct mem_cgroup_tree {
+ struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
+};
+
+static struct mem_cgroup_tree soft_limit_tree __read_mostly;
+
+/* for OOM */
+struct mem_cgroup_eventfd_list {
+ struct list_head list;
+ struct eventfd_ctx *eventfd;
+};
+
+/*
+ * cgroup_event represents events which userspace want to receive.
+ */
+struct mem_cgroup_event {
+ /*
+ * memcg which the event belongs to.
+ */
+ struct mem_cgroup *memcg;
+ /*
+ * eventfd to signal userspace about the event.
+ */
+ struct eventfd_ctx *eventfd;
+ /*
+ * Each of these stored in a list by the cgroup.
+ */
+ struct list_head list;
+ /*
+ * register_event() callback will be used to add new userspace
+ * waiter for changes related to this event. Use eventfd_signal()
+ * on eventfd to send notification to userspace.
+ */
+ int (*register_event)(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args);
+ /*
+ * unregister_event() callback will be called when userspace closes
+ * the eventfd or on cgroup removing. This callback must be set,
+ * if you want provide notification functionality.
+ */
+ void (*unregister_event)(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd);
+ /*
+ * All fields below needed to unregister event when
+ * userspace closes eventfd.
+ */
+ poll_table pt;
+ wait_queue_head_t *wqh;
+ wait_queue_entry_t wait;
+ struct work_struct remove;
+};
+
+static void mem_cgroup_threshold(struct mem_cgroup *memcg);
+static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
+
+/* Stuffs for move charges at task migration. */
+/*
+ * Types of charges to be moved.
+ */
+#define MOVE_ANON 0x1U
+#define MOVE_FILE 0x2U
+#define MOVE_MASK (MOVE_ANON | MOVE_FILE)
+
+/* "mc" and its members are protected by cgroup_mutex */
+static struct move_charge_struct {
+ spinlock_t lock; /* for from, to */
+ struct mm_struct *mm;
+ struct mem_cgroup *from;
+ struct mem_cgroup *to;
+ unsigned long flags;
+ unsigned long precharge;
+ unsigned long moved_charge;
+ unsigned long moved_swap;
+ struct task_struct *moving_task; /* a task moving charges */
+ wait_queue_head_t waitq; /* a waitq for other context */
+} mc = {
+ .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
+ .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
+};
+
+/*
+ * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
+ * limit reclaim to prevent infinite loops, if they ever occur.
+ */
+#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
+#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
+
+/* for encoding cft->private value on file */
+enum res_type {
+ _MEM,
+ _MEMSWAP,
+ _OOM_TYPE,
+ _KMEM,
+ _TCP,
+};
+
+#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
+#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
+#define MEMFILE_ATTR(val) ((val) & 0xffff)
+/* Used for OOM nofiier */
+#define OOM_CONTROL (0)
+
+/*
+ * Iteration constructs for visiting all cgroups (under a tree). If
+ * loops are exited prematurely (break), mem_cgroup_iter_break() must
+ * be used for reference counting.
+ */
+#define for_each_mem_cgroup_tree(iter, root) \
+ for (iter = mem_cgroup_iter(root, NULL, NULL); \
+ iter != NULL; \
+ iter = mem_cgroup_iter(root, iter, NULL))
+
+#define for_each_mem_cgroup(iter) \
+ for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
+ iter != NULL; \
+ iter = mem_cgroup_iter(NULL, iter, NULL))
+
+static inline bool task_is_dying(void)
+{
+ return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
+ (current->flags & PF_EXITING);
+}
+
+/* Some nice accessors for the vmpressure. */
+struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
+{
+ if (!memcg)
+ memcg = root_mem_cgroup;
+ return &memcg->vmpressure;
+}
+
+struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
+{
+ return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
+}
+
+#ifdef CONFIG_MEMCG_KMEM
+static DEFINE_SPINLOCK(objcg_lock);
+
+static void obj_cgroup_release(struct percpu_ref *ref)
+{
+ struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
+ struct mem_cgroup *memcg;
+ unsigned int nr_bytes;
+ unsigned int nr_pages;
+ unsigned long flags;
+
+ /*
+ * At this point all allocated objects are freed, and
+ * objcg->nr_charged_bytes can't have an arbitrary byte value.
+ * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
+ *
+ * The following sequence can lead to it:
+ * 1) CPU0: objcg == stock->cached_objcg
+ * 2) CPU1: we do a small allocation (e.g. 92 bytes),
+ * PAGE_SIZE bytes are charged
+ * 3) CPU1: a process from another memcg is allocating something,
+ * the stock if flushed,
+ * objcg->nr_charged_bytes = PAGE_SIZE - 92
+ * 5) CPU0: we do release this object,
+ * 92 bytes are added to stock->nr_bytes
+ * 6) CPU0: stock is flushed,
+ * 92 bytes are added to objcg->nr_charged_bytes
+ *
+ * In the result, nr_charged_bytes == PAGE_SIZE.
+ * This page will be uncharged in obj_cgroup_release().
+ */
+ nr_bytes = atomic_read(&objcg->nr_charged_bytes);
+ WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
+ nr_pages = nr_bytes >> PAGE_SHIFT;
+
+ spin_lock_irqsave(&objcg_lock, flags);
+ memcg = obj_cgroup_memcg(objcg);
+ if (nr_pages)
+ __memcg_kmem_uncharge(memcg, nr_pages);
+ list_del(&objcg->list);
+ mem_cgroup_put(memcg);
+ spin_unlock_irqrestore(&objcg_lock, flags);
+
+ percpu_ref_exit(ref);
+ kfree_rcu(objcg, rcu);
+}
+
+static struct obj_cgroup *obj_cgroup_alloc(void)
+{
+ struct obj_cgroup *objcg;
+ int ret;
+
+ objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
+ if (!objcg)
+ return NULL;
+
+ ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
+ GFP_KERNEL);
+ if (ret) {
+ kfree(objcg);
+ return NULL;
+ }
+ INIT_LIST_HEAD(&objcg->list);
+ return objcg;
+}
+
+static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
+ struct mem_cgroup *parent)
+{
+ struct obj_cgroup *objcg, *iter;
+
+ objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
+
+ spin_lock_irq(&objcg_lock);
+
+ /* Move active objcg to the parent's list */
+ xchg(&objcg->memcg, parent);
+ css_get(&parent->css);
+ list_add(&objcg->list, &parent->objcg_list);
+
+ /* Move already reparented objcgs to the parent's list */
+ list_for_each_entry(iter, &memcg->objcg_list, list) {
+ css_get(&parent->css);
+ xchg(&iter->memcg, parent);
+ css_put(&memcg->css);
+ }
+ list_splice(&memcg->objcg_list, &parent->objcg_list);
+
+ spin_unlock_irq(&objcg_lock);
+
+ percpu_ref_kill(&objcg->refcnt);
+}
+
+/*
+ * This will be used as a shrinker list's index.
+ * The main reason for not using cgroup id for this:
+ * this works better in sparse environments, where we have a lot of memcgs,
+ * but only a few kmem-limited. Or also, if we have, for instance, 200
+ * memcgs, and none but the 200th is kmem-limited, we'd have to have a
+ * 200 entry array for that.
+ *
+ * The current size of the caches array is stored in memcg_nr_cache_ids. It
+ * will double each time we have to increase it.
+ */
+static DEFINE_IDA(memcg_cache_ida);
+int memcg_nr_cache_ids;
+
+/* Protects memcg_nr_cache_ids */
+static DECLARE_RWSEM(memcg_cache_ids_sem);
+
+void memcg_get_cache_ids(void)
+{
+ down_read(&memcg_cache_ids_sem);
+}
+
+void memcg_put_cache_ids(void)
+{
+ up_read(&memcg_cache_ids_sem);
+}
+
+/*
+ * MIN_SIZE is different than 1, because we would like to avoid going through
+ * the alloc/free process all the time. In a small machine, 4 kmem-limited
+ * cgroups is a reasonable guess. In the future, it could be a parameter or
+ * tunable, but that is strictly not necessary.
+ *
+ * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
+ * this constant directly from cgroup, but it is understandable that this is
+ * better kept as an internal representation in cgroup.c. In any case, the
+ * cgrp_id space is not getting any smaller, and we don't have to necessarily
+ * increase ours as well if it increases.
+ */
+#define MEMCG_CACHES_MIN_SIZE 4
+#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
+
+/*
+ * A lot of the calls to the cache allocation functions are expected to be
+ * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
+ * conditional to this static branch, we'll have to allow modules that does
+ * kmem_cache_alloc and the such to see this symbol as well
+ */
+DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
+EXPORT_SYMBOL(memcg_kmem_enabled_key);
+#endif
+
+static int memcg_shrinker_map_size;
+static DEFINE_MUTEX(memcg_shrinker_map_mutex);
+
+static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
+{
+ kvfree(container_of(head, struct memcg_shrinker_map, rcu));
+}
+
+static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
+ int size, int old_size)
+{
+ struct memcg_shrinker_map *new, *old;
+ int nid;
+
+ lockdep_assert_held(&memcg_shrinker_map_mutex);
+
+ for_each_node(nid) {
+ old = rcu_dereference_protected(
+ mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true);
+ /* Not yet online memcg */
+ if (!old)
+ return 0;
+
+ new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
+ if (!new)
+ return -ENOMEM;
+
+ /* Set all old bits, clear all new bits */
+ memset(new->map, (int)0xff, old_size);
+ memset((void *)new->map + old_size, 0, size - old_size);
+
+ rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
+ call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
+ }
+
+ return 0;
+}
+
+static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup_per_node *pn;
+ struct memcg_shrinker_map *map;
+ int nid;
+
+ if (mem_cgroup_is_root(memcg))
+ return;
+
+ for_each_node(nid) {
+ pn = mem_cgroup_nodeinfo(memcg, nid);
+ map = rcu_dereference_protected(pn->shrinker_map, true);
+ if (map)
+ kvfree(map);
+ rcu_assign_pointer(pn->shrinker_map, NULL);
+ }
+}
+
+static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
+{
+ struct memcg_shrinker_map *map;
+ int nid, size, ret = 0;
+
+ if (mem_cgroup_is_root(memcg))
+ return 0;
+
+ mutex_lock(&memcg_shrinker_map_mutex);
+ size = memcg_shrinker_map_size;
+ for_each_node(nid) {
+ map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid);
+ if (!map) {
+ memcg_free_shrinker_maps(memcg);
+ ret = -ENOMEM;
+ break;
+ }
+ rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map);
+ }
+ mutex_unlock(&memcg_shrinker_map_mutex);
+
+ return ret;
+}
+
+int memcg_expand_shrinker_maps(int new_id)
+{
+ int size, old_size, ret = 0;
+ struct mem_cgroup *memcg;
+
+ size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
+ old_size = memcg_shrinker_map_size;
+ if (size <= old_size)
+ return 0;
+
+ mutex_lock(&memcg_shrinker_map_mutex);
+ if (!root_mem_cgroup)
+ goto unlock;
+
+ for_each_mem_cgroup(memcg) {
+ if (mem_cgroup_is_root(memcg))
+ continue;
+ ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
+ if (ret) {
+ mem_cgroup_iter_break(NULL, memcg);
+ goto unlock;
+ }
+ }
+unlock:
+ if (!ret)
+ memcg_shrinker_map_size = size;
+ mutex_unlock(&memcg_shrinker_map_mutex);
+ return ret;
+}
+
+void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
+{
+ if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
+ struct memcg_shrinker_map *map;
+
+ rcu_read_lock();
+ map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
+ /* Pairs with smp mb in shrink_slab() */
+ smp_mb__before_atomic();
+ set_bit(shrinker_id, map->map);
+ rcu_read_unlock();
+ }
+}
+
+/**
+ * mem_cgroup_css_from_page - css of the memcg associated with a page
+ * @page: page of interest
+ *
+ * If memcg is bound to the default hierarchy, css of the memcg associated
+ * with @page is returned. The returned css remains associated with @page
+ * until it is released.
+ *
+ * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
+ * is returned.
+ */
+struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
+{
+ struct mem_cgroup *memcg;
+
+ memcg = page->mem_cgroup;
+
+ if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ memcg = root_mem_cgroup;
+
+ return &memcg->css;
+}
+
+/**
+ * page_cgroup_ino - return inode number of the memcg a page is charged to
+ * @page: the page
+ *
+ * Look up the closest online ancestor of the memory cgroup @page is charged to
+ * and return its inode number or 0 if @page is not charged to any cgroup. It
+ * is safe to call this function without holding a reference to @page.
+ *
+ * Note, this function is inherently racy, because there is nothing to prevent
+ * the cgroup inode from getting torn down and potentially reallocated a moment
+ * after page_cgroup_ino() returns, so it only should be used by callers that
+ * do not care (such as procfs interfaces).
+ */
+ino_t page_cgroup_ino(struct page *page)
+{
+ struct mem_cgroup *memcg;
+ unsigned long ino = 0;
+
+ rcu_read_lock();
+ memcg = page->mem_cgroup;
+
+ /*
+ * The lowest bit set means that memcg isn't a valid
+ * memcg pointer, but a obj_cgroups pointer.
+ * In this case the page is shared and doesn't belong
+ * to any specific memory cgroup.
+ */
+ if ((unsigned long) memcg & 0x1UL)
+ memcg = NULL;
+
+ while (memcg && !(memcg->css.flags & CSS_ONLINE))
+ memcg = parent_mem_cgroup(memcg);
+ if (memcg)
+ ino = cgroup_ino(memcg->css.cgroup);
+ rcu_read_unlock();
+ return ino;
+}
+
+static struct mem_cgroup_per_node *
+mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
+{
+ int nid = page_to_nid(page);
+
+ return memcg->nodeinfo[nid];
+}
+
+static struct mem_cgroup_tree_per_node *
+soft_limit_tree_node(int nid)
+{
+ return soft_limit_tree.rb_tree_per_node[nid];
+}
+
+static struct mem_cgroup_tree_per_node *
+soft_limit_tree_from_page(struct page *page)
+{
+ int nid = page_to_nid(page);
+
+ return soft_limit_tree.rb_tree_per_node[nid];
+}
+
+static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
+ struct mem_cgroup_tree_per_node *mctz,
+ unsigned long new_usage_in_excess)
+{
+ struct rb_node **p = &mctz->rb_root.rb_node;
+ struct rb_node *parent = NULL;
+ struct mem_cgroup_per_node *mz_node;
+ bool rightmost = true;
+
+ if (mz->on_tree)
+ return;
+
+ mz->usage_in_excess = new_usage_in_excess;
+ if (!mz->usage_in_excess)
+ return;
+ while (*p) {
+ parent = *p;
+ mz_node = rb_entry(parent, struct mem_cgroup_per_node,
+ tree_node);
+ if (mz->usage_in_excess < mz_node->usage_in_excess) {
+ p = &(*p)->rb_left;
+ rightmost = false;
+ }
+
+ /*
+ * We can't avoid mem cgroups that are over their soft
+ * limit by the same amount
+ */
+ else if (mz->usage_in_excess >= mz_node->usage_in_excess)
+ p = &(*p)->rb_right;
+ }
+
+ if (rightmost)
+ mctz->rb_rightmost = &mz->tree_node;
+
+ rb_link_node(&mz->tree_node, parent, p);
+ rb_insert_color(&mz->tree_node, &mctz->rb_root);
+ mz->on_tree = true;
+}
+
+static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
+ struct mem_cgroup_tree_per_node *mctz)
+{
+ if (!mz->on_tree)
+ return;
+
+ if (&mz->tree_node == mctz->rb_rightmost)
+ mctz->rb_rightmost = rb_prev(&mz->tree_node);
+
+ rb_erase(&mz->tree_node, &mctz->rb_root);
+ mz->on_tree = false;
+}
+
+static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
+ struct mem_cgroup_tree_per_node *mctz)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&mctz->lock, flags);
+ __mem_cgroup_remove_exceeded(mz, mctz);
+ spin_unlock_irqrestore(&mctz->lock, flags);
+}
+
+static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
+{
+ unsigned long nr_pages = page_counter_read(&memcg->memory);
+ unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
+ unsigned long excess = 0;
+
+ if (nr_pages > soft_limit)
+ excess = nr_pages - soft_limit;
+
+ return excess;
+}
+
+static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
+{
+ unsigned long excess;
+ struct mem_cgroup_per_node *mz;
+ struct mem_cgroup_tree_per_node *mctz;
+
+ mctz = soft_limit_tree_from_page(page);
+ if (!mctz)
+ return;
+ /*
+ * Necessary to update all ancestors when hierarchy is used.
+ * because their event counter is not touched.
+ */
+ for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+ mz = mem_cgroup_page_nodeinfo(memcg, page);
+ excess = soft_limit_excess(memcg);
+ /*
+ * We have to update the tree if mz is on RB-tree or
+ * mem is over its softlimit.
+ */
+ if (excess || mz->on_tree) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&mctz->lock, flags);
+ /* if on-tree, remove it */
+ if (mz->on_tree)
+ __mem_cgroup_remove_exceeded(mz, mctz);
+ /*
+ * Insert again. mz->usage_in_excess will be updated.
+ * If excess is 0, no tree ops.
+ */
+ __mem_cgroup_insert_exceeded(mz, mctz, excess);
+ spin_unlock_irqrestore(&mctz->lock, flags);
+ }
+ }
+}
+
+static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup_tree_per_node *mctz;
+ struct mem_cgroup_per_node *mz;
+ int nid;
+
+ for_each_node(nid) {
+ mz = mem_cgroup_nodeinfo(memcg, nid);
+ mctz = soft_limit_tree_node(nid);
+ if (mctz)
+ mem_cgroup_remove_exceeded(mz, mctz);
+ }
+}
+
+static struct mem_cgroup_per_node *
+__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
+{
+ struct mem_cgroup_per_node *mz;
+
+retry:
+ mz = NULL;
+ if (!mctz->rb_rightmost)
+ goto done; /* Nothing to reclaim from */
+
+ mz = rb_entry(mctz->rb_rightmost,
+ struct mem_cgroup_per_node, tree_node);
+ /*
+ * Remove the node now but someone else can add it back,
+ * we will to add it back at the end of reclaim to its correct
+ * position in the tree.
+ */
+ __mem_cgroup_remove_exceeded(mz, mctz);
+ if (!soft_limit_excess(mz->memcg) ||
+ !css_tryget(&mz->memcg->css))
+ goto retry;
+done:
+ return mz;
+}
+
+static struct mem_cgroup_per_node *
+mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
+{
+ struct mem_cgroup_per_node *mz;
+
+ spin_lock_irq(&mctz->lock);
+ mz = __mem_cgroup_largest_soft_limit_node(mctz);
+ spin_unlock_irq(&mctz->lock);
+ return mz;
+}
+
+/**
+ * __mod_memcg_state - update cgroup memory statistics
+ * @memcg: the memory cgroup
+ * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
+ * @val: delta to add to the counter, can be negative
+ */
+void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
+{
+ long x, threshold = MEMCG_CHARGE_BATCH;
+
+ if (mem_cgroup_disabled())
+ return;
+
+ if (memcg_stat_item_in_bytes(idx))
+ threshold <<= PAGE_SHIFT;
+
+ x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
+ if (unlikely(abs(x) > threshold)) {
+ struct mem_cgroup *mi;
+
+ /*
+ * Batch local counters to keep them in sync with
+ * the hierarchical ones.
+ */
+ __this_cpu_add(memcg->vmstats_local->stat[idx], x);
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ atomic_long_add(x, &mi->vmstats[idx]);
+ x = 0;
+ }
+ __this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
+}
+
+static struct mem_cgroup_per_node *
+parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
+{
+ struct mem_cgroup *parent;
+
+ parent = parent_mem_cgroup(pn->memcg);
+ if (!parent)
+ return NULL;
+ return mem_cgroup_nodeinfo(parent, nid);
+}
+
+void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+ int val)
+{
+ struct mem_cgroup_per_node *pn;
+ struct mem_cgroup *memcg;
+ long x, threshold = MEMCG_CHARGE_BATCH;
+
+ pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+ memcg = pn->memcg;
+
+ /* Update memcg */
+ __mod_memcg_state(memcg, idx, val);
+
+ /* Update lruvec */
+ __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
+
+ if (vmstat_item_in_bytes(idx))
+ threshold <<= PAGE_SHIFT;
+
+ x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
+ if (unlikely(abs(x) > threshold)) {
+ pg_data_t *pgdat = lruvec_pgdat(lruvec);
+ struct mem_cgroup_per_node *pi;
+
+ for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
+ atomic_long_add(x, &pi->lruvec_stat[idx]);
+ x = 0;
+ }
+ __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
+}
+
+/**
+ * __mod_lruvec_state - update lruvec memory statistics
+ * @lruvec: the lruvec
+ * @idx: the stat item
+ * @val: delta to add to the counter, can be negative
+ *
+ * The lruvec is the intersection of the NUMA node and a cgroup. This
+ * function updates the all three counters that are affected by a
+ * change of state at this level: per-node, per-cgroup, per-lruvec.
+ */
+void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+ int val)
+{
+ /* Update node */
+ __mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
+
+ /* Update memcg and lruvec */
+ if (!mem_cgroup_disabled())
+ __mod_memcg_lruvec_state(lruvec, idx, val);
+}
+
+void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
+{
+ pg_data_t *pgdat = page_pgdat(virt_to_page(p));
+ struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_obj(p);
+
+ /*
+ * Untracked pages have no memcg, no lruvec. Update only the
+ * node. If we reparent the slab objects to the root memcg,
+ * when we free the slab object, we need to update the per-memcg
+ * vmstats to keep it correct for the root memcg.
+ */
+ if (!memcg) {
+ __mod_node_page_state(pgdat, idx, val);
+ } else {
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ __mod_lruvec_state(lruvec, idx, val);
+ }
+ rcu_read_unlock();
+}
+
+void mod_memcg_obj_state(void *p, int idx, int val)
+{
+ struct mem_cgroup *memcg;
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_obj(p);
+ if (memcg)
+ mod_memcg_state(memcg, idx, val);
+ rcu_read_unlock();
+}
+
+/**
+ * __count_memcg_events - account VM events in a cgroup
+ * @memcg: the memory cgroup
+ * @idx: the event item
+ * @count: the number of events that occured
+ */
+void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
+ unsigned long count)
+{
+ unsigned long x;
+
+ if (mem_cgroup_disabled())
+ return;
+
+ x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
+ if (unlikely(x > MEMCG_CHARGE_BATCH)) {
+ struct mem_cgroup *mi;
+
+ /*
+ * Batch local counters to keep them in sync with
+ * the hierarchical ones.
+ */
+ __this_cpu_add(memcg->vmstats_local->events[idx], x);
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ atomic_long_add(x, &mi->vmevents[idx]);
+ x = 0;
+ }
+ __this_cpu_write(memcg->vmstats_percpu->events[idx], x);
+}
+
+static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
+{
+ return atomic_long_read(&memcg->vmevents[event]);
+}
+
+static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
+{
+ long x = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ x += per_cpu(memcg->vmstats_local->events[event], cpu);
+ return x;
+}
+
+static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
+ struct page *page,
+ int nr_pages)
+{
+ /* pagein of a big page is an event. So, ignore page size */
+ if (nr_pages > 0)
+ __count_memcg_events(memcg, PGPGIN, 1);
+ else {
+ __count_memcg_events(memcg, PGPGOUT, 1);
+ nr_pages = -nr_pages; /* for event */
+ }
+
+ __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
+}
+
+static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
+ enum mem_cgroup_events_target target)
+{
+ unsigned long val, next;
+
+ val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
+ next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
+ /* from time_after() in jiffies.h */
+ if ((long)(next - val) < 0) {
+ switch (target) {
+ case MEM_CGROUP_TARGET_THRESH:
+ next = val + THRESHOLDS_EVENTS_TARGET;
+ break;
+ case MEM_CGROUP_TARGET_SOFTLIMIT:
+ next = val + SOFTLIMIT_EVENTS_TARGET;
+ break;
+ default:
+ break;
+ }
+ __this_cpu_write(memcg->vmstats_percpu->targets[target], next);
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Check events in order.
+ *
+ */
+static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
+{
+ /* threshold event is triggered in finer grain than soft limit */
+ if (unlikely(mem_cgroup_event_ratelimit(memcg,
+ MEM_CGROUP_TARGET_THRESH))) {
+ bool do_softlimit;
+
+ do_softlimit = mem_cgroup_event_ratelimit(memcg,
+ MEM_CGROUP_TARGET_SOFTLIMIT);
+ mem_cgroup_threshold(memcg);
+ if (unlikely(do_softlimit))
+ mem_cgroup_update_tree(memcg, page);
+ }
+}
+
+struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
+{
+ /*
+ * mm_update_next_owner() may clear mm->owner to NULL
+ * if it races with swapoff, page migration, etc.
+ * So this can be called with p == NULL.
+ */
+ if (unlikely(!p))
+ return NULL;
+
+ return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
+}
+EXPORT_SYMBOL(mem_cgroup_from_task);
+
+/**
+ * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
+ * @mm: mm from which memcg should be extracted. It can be NULL.
+ *
+ * Obtain a reference on mm->memcg and returns it if successful. Otherwise
+ * root_mem_cgroup is returned. However if mem_cgroup is disabled, NULL is
+ * returned.
+ */
+struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
+{
+ struct mem_cgroup *memcg;
+
+ if (mem_cgroup_disabled())
+ return NULL;
+
+ rcu_read_lock();
+ do {
+ /*
+ * Page cache insertions can happen withou an
+ * actual mm context, e.g. during disk probing
+ * on boot, loopback IO, acct() writes etc.
+ */
+ if (unlikely(!mm))
+ memcg = root_mem_cgroup;
+ else {
+ memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+ if (unlikely(!memcg))
+ memcg = root_mem_cgroup;
+ }
+ } while (!css_tryget(&memcg->css));
+ rcu_read_unlock();
+ return memcg;
+}
+EXPORT_SYMBOL(get_mem_cgroup_from_mm);
+
+/**
+ * get_mem_cgroup_from_page: Obtain a reference on given page's memcg.
+ * @page: page from which memcg should be extracted.
+ *
+ * Obtain a reference on page->memcg and returns it if successful. Otherwise
+ * root_mem_cgroup is returned.
+ */
+struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
+{
+ struct mem_cgroup *memcg = page->mem_cgroup;
+
+ if (mem_cgroup_disabled())
+ return NULL;
+
+ rcu_read_lock();
+ /* Page should not get uncharged and freed memcg under us. */
+ if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css)))
+ memcg = root_mem_cgroup;
+ rcu_read_unlock();
+ return memcg;
+}
+EXPORT_SYMBOL(get_mem_cgroup_from_page);
+
+static __always_inline struct mem_cgroup *active_memcg(void)
+{
+ if (in_interrupt())
+ return this_cpu_read(int_active_memcg);
+ else
+ return current->active_memcg;
+}
+
+static __always_inline struct mem_cgroup *get_active_memcg(void)
+{
+ struct mem_cgroup *memcg;
+
+ rcu_read_lock();
+ memcg = active_memcg();
+ /* remote memcg must hold a ref. */
+ if (memcg && WARN_ON_ONCE(!css_tryget(&memcg->css)))
+ memcg = root_mem_cgroup;
+ rcu_read_unlock();
+
+ return memcg;
+}
+
+static __always_inline bool memcg_kmem_bypass(void)
+{
+ /* Allow remote memcg charging from any context. */
+ if (unlikely(active_memcg()))
+ return false;
+
+ /* Memcg to charge can't be determined. */
+ if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
+ return true;
+
+ return false;
+}
+
+/**
+ * If active memcg is set, do not fallback to current->mm->memcg.
+ */
+static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
+{
+ if (memcg_kmem_bypass())
+ return NULL;
+
+ if (unlikely(active_memcg()))
+ return get_active_memcg();
+
+ return get_mem_cgroup_from_mm(current->mm);
+}
+
+/**
+ * mem_cgroup_iter - iterate over memory cgroup hierarchy
+ * @root: hierarchy root
+ * @prev: previously returned memcg, NULL on first invocation
+ * @reclaim: cookie for shared reclaim walks, NULL for full walks
+ *
+ * Returns references to children of the hierarchy below @root, or
+ * @root itself, or %NULL after a full round-trip.
+ *
+ * Caller must pass the return value in @prev on subsequent
+ * invocations for reference counting, or use mem_cgroup_iter_break()
+ * to cancel a hierarchy walk before the round-trip is complete.
+ *
+ * Reclaimers can specify a node in @reclaim to divide up the memcgs
+ * in the hierarchy among all concurrent reclaimers operating on the
+ * same node.
+ */
+struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
+ struct mem_cgroup *prev,
+ struct mem_cgroup_reclaim_cookie *reclaim)
+{
+ struct mem_cgroup_reclaim_iter *iter;
+ struct cgroup_subsys_state *css = NULL;
+ struct mem_cgroup *memcg = NULL;
+ struct mem_cgroup *pos = NULL;
+
+ if (mem_cgroup_disabled())
+ return NULL;
+
+ if (!root)
+ root = root_mem_cgroup;
+
+ if (prev && !reclaim)
+ pos = prev;
+
+ if (!root->use_hierarchy && root != root_mem_cgroup) {
+ if (prev)
+ goto out;
+ return root;
+ }
+
+ rcu_read_lock();
+
+ if (reclaim) {
+ struct mem_cgroup_per_node *mz;
+
+ mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
+ iter = &mz->iter;
+
+ if (prev && reclaim->generation != iter->generation)
+ goto out_unlock;
+
+ while (1) {
+ pos = READ_ONCE(iter->position);
+ if (!pos || css_tryget(&pos->css))
+ break;
+ /*
+ * css reference reached zero, so iter->position will
+ * be cleared by ->css_released. However, we should not
+ * rely on this happening soon, because ->css_released
+ * is called from a work queue, and by busy-waiting we
+ * might block it. So we clear iter->position right
+ * away.
+ */
+ (void)cmpxchg(&iter->position, pos, NULL);
+ }
+ }
+
+ if (pos)
+ css = &pos->css;
+
+ for (;;) {
+ css = css_next_descendant_pre(css, &root->css);
+ if (!css) {
+ /*
+ * Reclaimers share the hierarchy walk, and a
+ * new one might jump in right at the end of
+ * the hierarchy - make sure they see at least
+ * one group and restart from the beginning.
+ */
+ if (!prev)
+ continue;
+ break;
+ }
+
+ /*
+ * Verify the css and acquire a reference. The root
+ * is provided by the caller, so we know it's alive
+ * and kicking, and don't take an extra reference.
+ */
+ memcg = mem_cgroup_from_css(css);
+
+ if (css == &root->css)
+ break;
+
+ if (css_tryget(css))
+ break;
+
+ memcg = NULL;
+ }
+
+ if (reclaim) {
+ /*
+ * The position could have already been updated by a competing
+ * thread, so check that the value hasn't changed since we read
+ * it to avoid reclaiming from the same cgroup twice.
+ */
+ (void)cmpxchg(&iter->position, pos, memcg);
+
+ if (pos)
+ css_put(&pos->css);
+
+ if (!memcg)
+ iter->generation++;
+ else if (!prev)
+ reclaim->generation = iter->generation;
+ }
+
+out_unlock:
+ rcu_read_unlock();
+out:
+ if (prev && prev != root)
+ css_put(&prev->css);
+
+ return memcg;
+}
+
+/**
+ * mem_cgroup_iter_break - abort a hierarchy walk prematurely
+ * @root: hierarchy root
+ * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
+ */
+void mem_cgroup_iter_break(struct mem_cgroup *root,
+ struct mem_cgroup *prev)
+{
+ if (!root)
+ root = root_mem_cgroup;
+ if (prev && prev != root)
+ css_put(&prev->css);
+}
+
+static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
+ struct mem_cgroup *dead_memcg)
+{
+ struct mem_cgroup_reclaim_iter *iter;
+ struct mem_cgroup_per_node *mz;
+ int nid;
+
+ for_each_node(nid) {
+ mz = mem_cgroup_nodeinfo(from, nid);
+ iter = &mz->iter;
+ cmpxchg(&iter->position, dead_memcg, NULL);
+ }
+}
+
+static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
+{
+ struct mem_cgroup *memcg = dead_memcg;
+ struct mem_cgroup *last;
+
+ do {
+ __invalidate_reclaim_iterators(memcg, dead_memcg);
+ last = memcg;
+ } while ((memcg = parent_mem_cgroup(memcg)));
+
+ /*
+ * When cgruop1 non-hierarchy mode is used,
+ * parent_mem_cgroup() does not walk all the way up to the
+ * cgroup root (root_mem_cgroup). So we have to handle
+ * dead_memcg from cgroup root separately.
+ */
+ if (last != root_mem_cgroup)
+ __invalidate_reclaim_iterators(root_mem_cgroup,
+ dead_memcg);
+}
+
+/**
+ * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
+ * @memcg: hierarchy root
+ * @fn: function to call for each task
+ * @arg: argument passed to @fn
+ *
+ * This function iterates over tasks attached to @memcg or to any of its
+ * descendants and calls @fn for each task. If @fn returns a non-zero
+ * value, the function breaks the iteration loop and returns the value.
+ * Otherwise, it will iterate over all tasks and return 0.
+ *
+ * This function must not be called for the root memory cgroup.
+ */
+int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
+ int (*fn)(struct task_struct *, void *), void *arg)
+{
+ struct mem_cgroup *iter;
+ int ret = 0;
+
+ BUG_ON(memcg == root_mem_cgroup);
+
+ for_each_mem_cgroup_tree(iter, memcg) {
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
+ while (!ret && (task = css_task_iter_next(&it)))
+ ret = fn(task, arg);
+ css_task_iter_end(&it);
+ if (ret) {
+ mem_cgroup_iter_break(memcg, iter);
+ break;
+ }
+ }
+ return ret;
+}
+
+/**
+ * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
+ * @page: the page
+ * @pgdat: pgdat of the page
+ *
+ * This function relies on page->mem_cgroup being stable - see the
+ * access rules in commit_charge().
+ */
+struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
+{
+ struct mem_cgroup_per_node *mz;
+ struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
+
+ if (mem_cgroup_disabled()) {
+ lruvec = &pgdat->__lruvec;
+ goto out;
+ }
+
+ memcg = page->mem_cgroup;
+ /*
+ * Swapcache readahead pages are added to the LRU - and
+ * possibly migrated - before they are charged.
+ */
+ if (!memcg)
+ memcg = root_mem_cgroup;
+
+ mz = mem_cgroup_page_nodeinfo(memcg, page);
+ lruvec = &mz->lruvec;
+out:
+ /*
+ * Since a node can be onlined after the mem_cgroup was created,
+ * we have to be prepared to initialize lruvec->zone here;
+ * and if offlined then reonlined, we need to reinitialize it.
+ */
+ if (unlikely(lruvec->pgdat != pgdat))
+ lruvec->pgdat = pgdat;
+ return lruvec;
+}
+
+/**
+ * mem_cgroup_update_lru_size - account for adding or removing an lru page
+ * @lruvec: mem_cgroup per zone lru vector
+ * @lru: index of lru list the page is sitting on
+ * @zid: zone id of the accounted pages
+ * @nr_pages: positive when adding or negative when removing
+ *
+ * This function must be called under lru_lock, just before a page is added
+ * to or just after a page is removed from an lru list (that ordering being
+ * so as to allow it to check that lru_size 0 is consistent with list_empty).
+ */
+void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
+ int zid, int nr_pages)
+{
+ struct mem_cgroup_per_node *mz;
+ unsigned long *lru_size;
+ long size;
+
+ if (mem_cgroup_disabled())
+ return;
+
+ mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+ lru_size = &mz->lru_zone_size[zid][lru];
+
+ if (nr_pages < 0)
+ *lru_size += nr_pages;
+
+ size = *lru_size;
+ if (WARN_ONCE(size < 0,
+ "%s(%p, %d, %d): lru_size %ld\n",
+ __func__, lruvec, lru, nr_pages, size)) {
+ VM_BUG_ON(1);
+ *lru_size = 0;
+ }
+
+ if (nr_pages > 0)
+ *lru_size += nr_pages;
+}
+
+/**
+ * mem_cgroup_margin - calculate chargeable space of a memory cgroup
+ * @memcg: the memory cgroup
+ *
+ * Returns the maximum amount of memory @mem can be charged with, in
+ * pages.
+ */
+static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
+{
+ unsigned long margin = 0;
+ unsigned long count;
+ unsigned long limit;
+
+ count = page_counter_read(&memcg->memory);
+ limit = READ_ONCE(memcg->memory.max);
+ if (count < limit)
+ margin = limit - count;
+
+ if (do_memsw_account()) {
+ count = page_counter_read(&memcg->memsw);
+ limit = READ_ONCE(memcg->memsw.max);
+ if (count < limit)
+ margin = min(margin, limit - count);
+ else
+ margin = 0;
+ }
+
+ return margin;
+}
+
+/*
+ * A routine for checking "mem" is under move_account() or not.
+ *
+ * Checking a cgroup is mc.from or mc.to or under hierarchy of
+ * moving cgroups. This is for waiting at high-memory pressure
+ * caused by "move".
+ */
+static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *from;
+ struct mem_cgroup *to;
+ bool ret = false;
+ /*
+ * Unlike task_move routines, we access mc.to, mc.from not under
+ * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
+ */
+ spin_lock(&mc.lock);
+ from = mc.from;
+ to = mc.to;
+ if (!from)
+ goto unlock;
+
+ ret = mem_cgroup_is_descendant(from, memcg) ||
+ mem_cgroup_is_descendant(to, memcg);
+unlock:
+ spin_unlock(&mc.lock);
+ return ret;
+}
+
+static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
+{
+ if (mc.moving_task && current != mc.moving_task) {
+ if (mem_cgroup_under_move(memcg)) {
+ DEFINE_WAIT(wait);
+ prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
+ /* moving charge context might have finished. */
+ if (mc.moving_task)
+ schedule();
+ finish_wait(&mc.waitq, &wait);
+ return true;
+ }
+ }
+ return false;
+}
+
+struct memory_stat {
+ const char *name;
+ unsigned int ratio;
+ unsigned int idx;
+};
+
+static struct memory_stat memory_stats[] = {
+ { "anon", PAGE_SIZE, NR_ANON_MAPPED },
+ { "file", PAGE_SIZE, NR_FILE_PAGES },
+ { "kernel_stack", 1024, NR_KERNEL_STACK_KB },
+ { "percpu", 1, MEMCG_PERCPU_B },
+ { "sock", PAGE_SIZE, MEMCG_SOCK },
+ { "shmem", PAGE_SIZE, NR_SHMEM },
+ { "file_mapped", PAGE_SIZE, NR_FILE_MAPPED },
+ { "file_dirty", PAGE_SIZE, NR_FILE_DIRTY },
+ { "file_writeback", PAGE_SIZE, NR_WRITEBACK },
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ /*
+ * The ratio will be initialized in memory_stats_init(). Because
+ * on some architectures, the macro of HPAGE_PMD_SIZE is not
+ * constant(e.g. powerpc).
+ */
+ { "anon_thp", 0, NR_ANON_THPS },
+#endif
+ { "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON },
+ { "active_anon", PAGE_SIZE, NR_ACTIVE_ANON },
+ { "inactive_file", PAGE_SIZE, NR_INACTIVE_FILE },
+ { "active_file", PAGE_SIZE, NR_ACTIVE_FILE },
+ { "unevictable", PAGE_SIZE, NR_UNEVICTABLE },
+
+ /*
+ * Note: The slab_reclaimable and slab_unreclaimable must be
+ * together and slab_reclaimable must be in front.
+ */
+ { "slab_reclaimable", 1, NR_SLAB_RECLAIMABLE_B },
+ { "slab_unreclaimable", 1, NR_SLAB_UNRECLAIMABLE_B },
+
+ /* The memory events */
+ { "workingset_refault_anon", 1, WORKINGSET_REFAULT_ANON },
+ { "workingset_refault_file", 1, WORKINGSET_REFAULT_FILE },
+ { "workingset_activate_anon", 1, WORKINGSET_ACTIVATE_ANON },
+ { "workingset_activate_file", 1, WORKINGSET_ACTIVATE_FILE },
+ { "workingset_restore_anon", 1, WORKINGSET_RESTORE_ANON },
+ { "workingset_restore_file", 1, WORKINGSET_RESTORE_FILE },
+ { "workingset_nodereclaim", 1, WORKINGSET_NODERECLAIM },
+};
+
+static int __init memory_stats_init(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (memory_stats[i].idx == NR_ANON_THPS)
+ memory_stats[i].ratio = HPAGE_PMD_SIZE;
+#endif
+ VM_BUG_ON(!memory_stats[i].ratio);
+ VM_BUG_ON(memory_stats[i].idx >= MEMCG_NR_STAT);
+ }
+
+ return 0;
+}
+pure_initcall(memory_stats_init);
+
+static char *memory_stat_format(struct mem_cgroup *memcg)
+{
+ struct seq_buf s;
+ int i;
+
+ seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
+ if (!s.buffer)
+ return NULL;
+
+ /*
+ * Provide statistics on the state of the memory subsystem as
+ * well as cumulative event counters that show past behavior.
+ *
+ * This list is ordered following a combination of these gradients:
+ * 1) generic big picture -> specifics and details
+ * 2) reflecting userspace activity -> reflecting kernel heuristics
+ *
+ * Current memory state:
+ */
+
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
+ u64 size;
+
+ size = memcg_page_state(memcg, memory_stats[i].idx);
+ size *= memory_stats[i].ratio;
+ seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);
+
+ if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
+ size = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
+ memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B);
+ seq_buf_printf(&s, "slab %llu\n", size);
+ }
+ }
+
+ /* Accumulated memory events */
+
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT),
+ memcg_events(memcg, PGFAULT));
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
+ memcg_events(memcg, PGMAJFAULT));
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL),
+ memcg_events(memcg, PGREFILL));
+ seq_buf_printf(&s, "pgscan %lu\n",
+ memcg_events(memcg, PGSCAN_KSWAPD) +
+ memcg_events(memcg, PGSCAN_DIRECT));
+ seq_buf_printf(&s, "pgsteal %lu\n",
+ memcg_events(memcg, PGSTEAL_KSWAPD) +
+ memcg_events(memcg, PGSTEAL_DIRECT));
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE),
+ memcg_events(memcg, PGACTIVATE));
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE),
+ memcg_events(memcg, PGDEACTIVATE));
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE),
+ memcg_events(memcg, PGLAZYFREE));
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED),
+ memcg_events(memcg, PGLAZYFREED));
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC),
+ memcg_events(memcg, THP_FAULT_ALLOC));
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC),
+ memcg_events(memcg, THP_COLLAPSE_ALLOC));
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+ /* The above should easily fit into one page */
+ WARN_ON_ONCE(seq_buf_has_overflowed(&s));
+
+ return s.buffer;
+}
+
+#define K(x) ((x) << (PAGE_SHIFT-10))
+/**
+ * mem_cgroup_print_oom_context: Print OOM information relevant to
+ * memory controller.
+ * @memcg: The memory cgroup that went over limit
+ * @p: Task that is going to be killed
+ *
+ * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
+ * enabled
+ */
+void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
+{
+ rcu_read_lock();
+
+ if (memcg) {
+ pr_cont(",oom_memcg=");
+ pr_cont_cgroup_path(memcg->css.cgroup);
+ } else
+ pr_cont(",global_oom");
+ if (p) {
+ pr_cont(",task_memcg=");
+ pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
+ }
+ rcu_read_unlock();
+}
+
+/**
+ * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
+ * memory controller.
+ * @memcg: The memory cgroup that went over limit
+ */
+void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
+{
+ char *buf;
+
+ pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
+ K((u64)page_counter_read(&memcg->memory)),
+ K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
+ K((u64)page_counter_read(&memcg->swap)),
+ K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
+ else {
+ pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
+ K((u64)page_counter_read(&memcg->memsw)),
+ K((u64)memcg->memsw.max), memcg->memsw.failcnt);
+ pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
+ K((u64)page_counter_read(&memcg->kmem)),
+ K((u64)memcg->kmem.max), memcg->kmem.failcnt);
+ }
+
+ pr_info("Memory cgroup stats for ");
+ pr_cont_cgroup_path(memcg->css.cgroup);
+ pr_cont(":");
+ buf = memory_stat_format(memcg);
+ if (!buf)
+ return;
+ pr_info("%s", buf);
+ kfree(buf);
+}
+
+/*
+ * Return the memory (and swap, if configured) limit for a memcg.
+ */
+unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
+{
+ unsigned long max = READ_ONCE(memcg->memory.max);
+
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
+ if (mem_cgroup_swappiness(memcg))
+ max += min(READ_ONCE(memcg->swap.max),
+ (unsigned long)total_swap_pages);
+ } else { /* v1 */
+ if (mem_cgroup_swappiness(memcg)) {
+ /* Calculate swap excess capacity from memsw limit */
+ unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
+
+ max += min(swap, (unsigned long)total_swap_pages);
+ }
+ }
+ return max;
+}
+
+unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
+{
+ return page_counter_read(&memcg->memory);
+}
+
+static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
+ int order)
+{
+ struct oom_control oc = {
+ .zonelist = NULL,
+ .nodemask = NULL,
+ .memcg = memcg,
+ .gfp_mask = gfp_mask,
+ .order = order,
+ };
+ bool ret = true;
+
+ if (mutex_lock_killable(&oom_lock))
+ return true;
+
+ if (mem_cgroup_margin(memcg) >= (1 << order))
+ goto unlock;
+
+ /*
+ * A few threads which were not waiting at mutex_lock_killable() can
+ * fail to bail out. Therefore, check again after holding oom_lock.
+ */
+ ret = task_is_dying() || out_of_memory(&oc);
+
+unlock:
+ mutex_unlock(&oom_lock);
+ return ret;
+}
+
+static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
+ pg_data_t *pgdat,
+ gfp_t gfp_mask,
+ unsigned long *total_scanned)
+{
+ struct mem_cgroup *victim = NULL;
+ int total = 0;
+ int loop = 0;
+ unsigned long excess;
+ unsigned long nr_scanned;
+ struct mem_cgroup_reclaim_cookie reclaim = {
+ .pgdat = pgdat,
+ };
+
+ excess = soft_limit_excess(root_memcg);
+
+ while (1) {
+ victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
+ if (!victim) {
+ loop++;
+ if (loop >= 2) {
+ /*
+ * If we have not been able to reclaim
+ * anything, it might because there are
+ * no reclaimable pages under this hierarchy
+ */
+ if (!total)
+ break;
+ /*
+ * We want to do more targeted reclaim.
+ * excess >> 2 is not to excessive so as to
+ * reclaim too much, nor too less that we keep
+ * coming back to reclaim from this cgroup
+ */
+ if (total >= (excess >> 2) ||
+ (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
+ break;
+ }
+ continue;
+ }
+ total += mem_cgroup_shrink_node(victim, gfp_mask, false,
+ pgdat, &nr_scanned);
+ *total_scanned += nr_scanned;
+ if (!soft_limit_excess(root_memcg))
+ break;
+ }
+ mem_cgroup_iter_break(root_memcg, victim);
+ return total;
+}
+
+#ifdef CONFIG_LOCKDEP
+static struct lockdep_map memcg_oom_lock_dep_map = {
+ .name = "memcg_oom_lock",
+};
+#endif
+
+static DEFINE_SPINLOCK(memcg_oom_lock);
+
+/*
+ * Check OOM-Killer is already running under our hierarchy.
+ * If someone is running, return false.
+ */
+static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *iter, *failed = NULL;
+
+ spin_lock(&memcg_oom_lock);
+
+ for_each_mem_cgroup_tree(iter, memcg) {
+ if (iter->oom_lock) {
+ /*
+ * this subtree of our hierarchy is already locked
+ * so we cannot give a lock.
+ */
+ failed = iter;
+ mem_cgroup_iter_break(memcg, iter);
+ break;
+ } else
+ iter->oom_lock = true;
+ }
+
+ if (failed) {
+ /*
+ * OK, we failed to lock the whole subtree so we have
+ * to clean up what we set up to the failing subtree
+ */
+ for_each_mem_cgroup_tree(iter, memcg) {
+ if (iter == failed) {
+ mem_cgroup_iter_break(memcg, iter);
+ break;
+ }
+ iter->oom_lock = false;
+ }
+ } else
+ mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
+
+ spin_unlock(&memcg_oom_lock);
+
+ return !failed;
+}
+
+static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *iter;
+
+ spin_lock(&memcg_oom_lock);
+ mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
+ for_each_mem_cgroup_tree(iter, memcg)
+ iter->oom_lock = false;
+ spin_unlock(&memcg_oom_lock);
+}
+
+static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *iter;
+
+ spin_lock(&memcg_oom_lock);
+ for_each_mem_cgroup_tree(iter, memcg)
+ iter->under_oom++;
+ spin_unlock(&memcg_oom_lock);
+}
+
+static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *iter;
+
+ /*
+ * Be careful about under_oom underflows becase a child memcg
+ * could have been added after mem_cgroup_mark_under_oom.
+ */
+ spin_lock(&memcg_oom_lock);
+ for_each_mem_cgroup_tree(iter, memcg)
+ if (iter->under_oom > 0)
+ iter->under_oom--;
+ spin_unlock(&memcg_oom_lock);
+}
+
+static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
+
+struct oom_wait_info {
+ struct mem_cgroup *memcg;
+ wait_queue_entry_t wait;
+};
+
+static int memcg_oom_wake_function(wait_queue_entry_t *wait,
+ unsigned mode, int sync, void *arg)
+{
+ struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
+ struct mem_cgroup *oom_wait_memcg;
+ struct oom_wait_info *oom_wait_info;
+
+ oom_wait_info = container_of(wait, struct oom_wait_info, wait);
+ oom_wait_memcg = oom_wait_info->memcg;
+
+ if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
+ !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
+ return 0;
+ return autoremove_wake_function(wait, mode, sync, arg);
+}
+
+static void memcg_oom_recover(struct mem_cgroup *memcg)
+{
+ /*
+ * For the following lockless ->under_oom test, the only required
+ * guarantee is that it must see the state asserted by an OOM when
+ * this function is called as a result of userland actions
+ * triggered by the notification of the OOM. This is trivially
+ * achieved by invoking mem_cgroup_mark_under_oom() before
+ * triggering notification.
+ */
+ if (memcg && memcg->under_oom)
+ __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
+}
+
+enum oom_status {
+ OOM_SUCCESS,
+ OOM_FAILED,
+ OOM_ASYNC,
+ OOM_SKIPPED
+};
+
+static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
+{
+ enum oom_status ret;
+ bool locked;
+
+ if (order > PAGE_ALLOC_COSTLY_ORDER)
+ return OOM_SKIPPED;
+
+ memcg_memory_event(memcg, MEMCG_OOM);
+
+ /*
+ * We are in the middle of the charge context here, so we
+ * don't want to block when potentially sitting on a callstack
+ * that holds all kinds of filesystem and mm locks.
+ *
+ * cgroup1 allows disabling the OOM killer and waiting for outside
+ * handling until the charge can succeed; remember the context and put
+ * the task to sleep at the end of the page fault when all locks are
+ * released.
+ *
+ * On the other hand, in-kernel OOM killer allows for an async victim
+ * memory reclaim (oom_reaper) and that means that we are not solely
+ * relying on the oom victim to make a forward progress and we can
+ * invoke the oom killer here.
+ *
+ * Please note that mem_cgroup_out_of_memory might fail to find a
+ * victim and then we have to bail out from the charge path.
+ */
+ if (memcg->oom_kill_disable) {
+ if (!current->in_user_fault)
+ return OOM_SKIPPED;
+ css_get(&memcg->css);
+ current->memcg_in_oom = memcg;
+ current->memcg_oom_gfp_mask = mask;
+ current->memcg_oom_order = order;
+
+ return OOM_ASYNC;
+ }
+
+ mem_cgroup_mark_under_oom(memcg);
+
+ locked = mem_cgroup_oom_trylock(memcg);
+
+ if (locked)
+ mem_cgroup_oom_notify(memcg);
+
+ mem_cgroup_unmark_under_oom(memcg);
+ if (mem_cgroup_out_of_memory(memcg, mask, order))
+ ret = OOM_SUCCESS;
+ else
+ ret = OOM_FAILED;
+
+ if (locked)
+ mem_cgroup_oom_unlock(memcg);
+
+ return ret;
+}
+
+/**
+ * mem_cgroup_oom_synchronize - complete memcg OOM handling
+ * @handle: actually kill/wait or just clean up the OOM state
+ *
+ * This has to be called at the end of a page fault if the memcg OOM
+ * handler was enabled.
+ *
+ * Memcg supports userspace OOM handling where failed allocations must
+ * sleep on a waitqueue until the userspace task resolves the
+ * situation. Sleeping directly in the charge context with all kinds
+ * of locks held is not a good idea, instead we remember an OOM state
+ * in the task and mem_cgroup_oom_synchronize() has to be called at
+ * the end of the page fault to complete the OOM handling.
+ *
+ * Returns %true if an ongoing memcg OOM situation was detected and
+ * completed, %false otherwise.
+ */
+bool mem_cgroup_oom_synchronize(bool handle)
+{
+ struct mem_cgroup *memcg = current->memcg_in_oom;
+ struct oom_wait_info owait;
+ bool locked;
+
+ /* OOM is global, do not handle */
+ if (!memcg)
+ return false;
+
+ if (!handle)
+ goto cleanup;
+
+ owait.memcg = memcg;
+ owait.wait.flags = 0;
+ owait.wait.func = memcg_oom_wake_function;
+ owait.wait.private = current;
+ INIT_LIST_HEAD(&owait.wait.entry);
+
+ prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+ mem_cgroup_mark_under_oom(memcg);
+
+ locked = mem_cgroup_oom_trylock(memcg);
+
+ if (locked)
+ mem_cgroup_oom_notify(memcg);
+
+ if (locked && !memcg->oom_kill_disable) {
+ mem_cgroup_unmark_under_oom(memcg);
+ finish_wait(&memcg_oom_waitq, &owait.wait);
+ mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
+ current->memcg_oom_order);
+ } else {
+ schedule();
+ mem_cgroup_unmark_under_oom(memcg);
+ finish_wait(&memcg_oom_waitq, &owait.wait);
+ }
+
+ if (locked) {
+ mem_cgroup_oom_unlock(memcg);
+ /*
+ * There is no guarantee that an OOM-lock contender
+ * sees the wakeups triggered by the OOM kill
+ * uncharges. Wake any sleepers explicitely.
+ */
+ memcg_oom_recover(memcg);
+ }
+cleanup:
+ current->memcg_in_oom = NULL;
+ css_put(&memcg->css);
+ return true;
+}
+
+/**
+ * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
+ * @victim: task to be killed by the OOM killer
+ * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
+ *
+ * Returns a pointer to a memory cgroup, which has to be cleaned up
+ * by killing all belonging OOM-killable tasks.
+ *
+ * Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
+ */
+struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
+ struct mem_cgroup *oom_domain)
+{
+ struct mem_cgroup *oom_group = NULL;
+ struct mem_cgroup *memcg;
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return NULL;
+
+ if (!oom_domain)
+ oom_domain = root_mem_cgroup;
+
+ rcu_read_lock();
+
+ memcg = mem_cgroup_from_task(victim);
+ if (memcg == root_mem_cgroup)
+ goto out;
+
+ /*
+ * If the victim task has been asynchronously moved to a different
+ * memory cgroup, we might end up killing tasks outside oom_domain.
+ * In this case it's better to ignore memory.group.oom.
+ */
+ if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
+ goto out;
+
+ /*
+ * Traverse the memory cgroup hierarchy from the victim task's
+ * cgroup up to the OOMing cgroup (or root) to find the
+ * highest-level memory cgroup with oom.group set.
+ */
+ for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+ if (memcg->oom_group)
+ oom_group = memcg;
+
+ if (memcg == oom_domain)
+ break;
+ }
+
+ if (oom_group)
+ css_get(&oom_group->css);
+out:
+ rcu_read_unlock();
+
+ return oom_group;
+}
+
+void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
+{
+ pr_info("Tasks in ");
+ pr_cont_cgroup_path(memcg->css.cgroup);
+ pr_cont(" are going to be killed due to memory.oom.group set\n");
+}
+
+/**
+ * lock_page_memcg - lock a page->mem_cgroup binding
+ * @page: the page
+ *
+ * This function protects unlocked LRU pages from being moved to
+ * another cgroup.
+ *
+ * It ensures lifetime of the returned memcg. Caller is responsible
+ * for the lifetime of the page; __unlock_page_memcg() is available
+ * when @page might get freed inside the locked section.
+ */
+struct mem_cgroup *lock_page_memcg(struct page *page)
+{
+ struct page *head = compound_head(page); /* rmap on tail pages */
+ struct mem_cgroup *memcg;
+ unsigned long flags;
+
+ /*
+ * The RCU lock is held throughout the transaction. The fast
+ * path can get away without acquiring the memcg->move_lock
+ * because page moving starts with an RCU grace period.
+ *
+ * The RCU lock also protects the memcg from being freed when
+ * the page state that is going to change is the only thing
+ * preventing the page itself from being freed. E.g. writeback
+ * doesn't hold a page reference and relies on PG_writeback to
+ * keep off truncation, migration and so forth.
+ */
+ rcu_read_lock();
+
+ if (mem_cgroup_disabled())
+ return NULL;
+again:
+ memcg = head->mem_cgroup;
+ if (unlikely(!memcg))
+ return NULL;
+
+ if (atomic_read(&memcg->moving_account) <= 0)
+ return memcg;
+
+ spin_lock_irqsave(&memcg->move_lock, flags);
+ if (memcg != head->mem_cgroup) {
+ spin_unlock_irqrestore(&memcg->move_lock, flags);
+ goto again;
+ }
+
+ /*
+ * When charge migration first begins, we can have locked and
+ * unlocked page stat updates happening concurrently. Track
+ * the task who has the lock for unlock_page_memcg().
+ */
+ memcg->move_lock_task = current;
+ memcg->move_lock_flags = flags;
+
+ return memcg;
+}
+EXPORT_SYMBOL(lock_page_memcg);
+
+/**
+ * __unlock_page_memcg - unlock and unpin a memcg
+ * @memcg: the memcg
+ *
+ * Unlock and unpin a memcg returned by lock_page_memcg().
+ */
+void __unlock_page_memcg(struct mem_cgroup *memcg)
+{
+ if (memcg && memcg->move_lock_task == current) {
+ unsigned long flags = memcg->move_lock_flags;
+
+ memcg->move_lock_task = NULL;
+ memcg->move_lock_flags = 0;
+
+ spin_unlock_irqrestore(&memcg->move_lock, flags);
+ }
+
+ rcu_read_unlock();
+}
+
+/**
+ * unlock_page_memcg - unlock a page->mem_cgroup binding
+ * @page: the page
+ */
+void unlock_page_memcg(struct page *page)
+{
+ struct page *head = compound_head(page);
+
+ __unlock_page_memcg(head->mem_cgroup);
+}
+EXPORT_SYMBOL(unlock_page_memcg);
+
+struct memcg_stock_pcp {
+ struct mem_cgroup *cached; /* this never be root cgroup */
+ unsigned int nr_pages;
+
+#ifdef CONFIG_MEMCG_KMEM
+ struct obj_cgroup *cached_objcg;
+ unsigned int nr_bytes;
+#endif
+
+ struct work_struct work;
+ unsigned long flags;
+#define FLUSHING_CACHED_CHARGE 0
+};
+static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
+static DEFINE_MUTEX(percpu_charge_mutex);
+
+#ifdef CONFIG_MEMCG_KMEM
+static void drain_obj_stock(struct memcg_stock_pcp *stock);
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+ struct mem_cgroup *root_memcg);
+
+#else
+static inline void drain_obj_stock(struct memcg_stock_pcp *stock)
+{
+}
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+ struct mem_cgroup *root_memcg)
+{
+ return false;
+}
+#endif
+
+/**
+ * consume_stock: Try to consume stocked charge on this cpu.
+ * @memcg: memcg to consume from.
+ * @nr_pages: how many pages to charge.
+ *
+ * The charges will only happen if @memcg matches the current cpu's memcg
+ * stock, and at least @nr_pages are available in that stock. Failure to
+ * service an allocation will refill the stock.
+ *
+ * returns true if successful, false otherwise.
+ */
+static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
+{
+ struct memcg_stock_pcp *stock;
+ unsigned long flags;
+ bool ret = false;
+
+ if (nr_pages > MEMCG_CHARGE_BATCH)
+ return ret;
+
+ local_irq_save(flags);
+
+ stock = this_cpu_ptr(&memcg_stock);
+ if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
+ stock->nr_pages -= nr_pages;
+ ret = true;
+ }
+
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+/*
+ * Returns stocks cached in percpu and reset cached information.
+ */
+static void drain_stock(struct memcg_stock_pcp *stock)
+{
+ struct mem_cgroup *old = stock->cached;
+
+ if (!old)
+ return;
+
+ if (stock->nr_pages) {
+ page_counter_uncharge(&old->memory, stock->nr_pages);
+ if (do_memsw_account())
+ page_counter_uncharge(&old->memsw, stock->nr_pages);
+ stock->nr_pages = 0;
+ }
+
+ css_put(&old->css);
+ stock->cached = NULL;
+}
+
+static void drain_local_stock(struct work_struct *dummy)
+{
+ struct memcg_stock_pcp *stock;
+ unsigned long flags;
+
+ /*
+ * The only protection from memory hotplug vs. drain_stock races is
+ * that we always operate on local CPU stock here with IRQ disabled
+ */
+ local_irq_save(flags);
+
+ stock = this_cpu_ptr(&memcg_stock);
+ drain_obj_stock(stock);
+ drain_stock(stock);
+ clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
+
+ local_irq_restore(flags);
+}
+
+/*
+ * Cache charges(val) to local per_cpu area.
+ * This will be consumed by consume_stock() function, later.
+ */
+static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
+{
+ struct memcg_stock_pcp *stock;
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ stock = this_cpu_ptr(&memcg_stock);
+ if (stock->cached != memcg) { /* reset if necessary */
+ drain_stock(stock);
+ css_get(&memcg->css);
+ stock->cached = memcg;
+ }
+ stock->nr_pages += nr_pages;
+
+ if (stock->nr_pages > MEMCG_CHARGE_BATCH)
+ drain_stock(stock);
+
+ local_irq_restore(flags);
+}
+
+/*
+ * Drains all per-CPU charge caches for given root_memcg resp. subtree
+ * of the hierarchy under it.
+ */
+static void drain_all_stock(struct mem_cgroup *root_memcg)
+{
+ int cpu, curcpu;
+
+ /* If someone's already draining, avoid adding running more workers. */
+ if (!mutex_trylock(&percpu_charge_mutex))
+ return;
+ /*
+ * Notify other cpus that system-wide "drain" is running
+ * We do not care about races with the cpu hotplug because cpu down
+ * as well as workers from this path always operate on the local
+ * per-cpu data. CPU up doesn't touch memcg_stock at all.
+ */
+ curcpu = get_cpu();
+ for_each_online_cpu(cpu) {
+ struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
+ struct mem_cgroup *memcg;
+ bool flush = false;
+
+ rcu_read_lock();
+ memcg = stock->cached;
+ if (memcg && stock->nr_pages &&
+ mem_cgroup_is_descendant(memcg, root_memcg))
+ flush = true;
+ if (obj_stock_flush_required(stock, root_memcg))
+ flush = true;
+ rcu_read_unlock();
+
+ if (flush &&
+ !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
+ if (cpu == curcpu)
+ drain_local_stock(&stock->work);
+ else
+ schedule_work_on(cpu, &stock->work);
+ }
+ }
+ put_cpu();
+ mutex_unlock(&percpu_charge_mutex);
+}
+
+static int memcg_hotplug_cpu_dead(unsigned int cpu)
+{
+ struct memcg_stock_pcp *stock;
+ struct mem_cgroup *memcg, *mi;
+
+ stock = &per_cpu(memcg_stock, cpu);
+ drain_stock(stock);
+
+ for_each_mem_cgroup(memcg) {
+ int i;
+
+ for (i = 0; i < MEMCG_NR_STAT; i++) {
+ int nid;
+ long x;
+
+ x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
+ if (x)
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ atomic_long_add(x, &memcg->vmstats[i]);
+
+ if (i >= NR_VM_NODE_STAT_ITEMS)
+ continue;
+
+ for_each_node(nid) {
+ struct mem_cgroup_per_node *pn;
+
+ pn = mem_cgroup_nodeinfo(memcg, nid);
+ x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
+ if (x)
+ do {
+ atomic_long_add(x, &pn->lruvec_stat[i]);
+ } while ((pn = parent_nodeinfo(pn, nid)));
+ }
+ }
+
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
+ long x;
+
+ x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
+ if (x)
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ atomic_long_add(x, &memcg->vmevents[i]);
+ }
+ }
+
+ return 0;
+}
+
+static unsigned long reclaim_high(struct mem_cgroup *memcg,
+ unsigned int nr_pages,
+ gfp_t gfp_mask)
+{
+ unsigned long nr_reclaimed = 0;
+
+ do {
+ unsigned long pflags;
+
+ if (page_counter_read(&memcg->memory) <=
+ READ_ONCE(memcg->memory.high))
+ continue;
+
+ memcg_memory_event(memcg, MEMCG_HIGH);
+
+ psi_memstall_enter(&pflags);
+ nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
+ gfp_mask, true);
+ psi_memstall_leave(&pflags);
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
+ !mem_cgroup_is_root(memcg));
+
+ return nr_reclaimed;
+}
+
+static void high_work_func(struct work_struct *work)
+{
+ struct mem_cgroup *memcg;
+
+ memcg = container_of(work, struct mem_cgroup, high_work);
+ reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
+}
+
+/*
+ * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
+ * enough to still cause a significant slowdown in most cases, while still
+ * allowing diagnostics and tracing to proceed without becoming stuck.
+ */
+#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
+
+/*
+ * When calculating the delay, we use these either side of the exponentiation to
+ * maintain precision and scale to a reasonable number of jiffies (see the table
+ * below.
+ *
+ * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
+ * overage ratio to a delay.
+ * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
+ * proposed penalty in order to reduce to a reasonable number of jiffies, and
+ * to produce a reasonable delay curve.
+ *
+ * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
+ * reasonable delay curve compared to precision-adjusted overage, not
+ * penalising heavily at first, but still making sure that growth beyond the
+ * limit penalises misbehaviour cgroups by slowing them down exponentially. For
+ * example, with a high of 100 megabytes:
+ *
+ * +-------+------------------------+
+ * | usage | time to allocate in ms |
+ * +-------+------------------------+
+ * | 100M | 0 |
+ * | 101M | 6 |
+ * | 102M | 25 |
+ * | 103M | 57 |
+ * | 104M | 102 |
+ * | 105M | 159 |
+ * | 106M | 230 |
+ * | 107M | 313 |
+ * | 108M | 409 |
+ * | 109M | 518 |
+ * | 110M | 639 |
+ * | 111M | 774 |
+ * | 112M | 921 |
+ * | 113M | 1081 |
+ * | 114M | 1254 |
+ * | 115M | 1439 |
+ * | 116M | 1638 |
+ * | 117M | 1849 |
+ * | 118M | 2000 |
+ * | 119M | 2000 |
+ * | 120M | 2000 |
+ * +-------+------------------------+
+ */
+ #define MEMCG_DELAY_PRECISION_SHIFT 20
+ #define MEMCG_DELAY_SCALING_SHIFT 14
+
+static u64 calculate_overage(unsigned long usage, unsigned long high)
+{
+ u64 overage;
+
+ if (usage <= high)
+ return 0;
+
+ /*
+ * Prevent division by 0 in overage calculation by acting as if
+ * it was a threshold of 1 page
+ */
+ high = max(high, 1UL);
+
+ overage = usage - high;
+ overage <<= MEMCG_DELAY_PRECISION_SHIFT;
+ return div64_u64(overage, high);
+}
+
+static u64 mem_find_max_overage(struct mem_cgroup *memcg)
+{
+ u64 overage, max_overage = 0;
+
+ do {
+ overage = calculate_overage(page_counter_read(&memcg->memory),
+ READ_ONCE(memcg->memory.high));
+ max_overage = max(overage, max_overage);
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
+ !mem_cgroup_is_root(memcg));
+
+ return max_overage;
+}
+
+static u64 swap_find_max_overage(struct mem_cgroup *memcg)
+{
+ u64 overage, max_overage = 0;
+
+ do {
+ overage = calculate_overage(page_counter_read(&memcg->swap),
+ READ_ONCE(memcg->swap.high));
+ if (overage)
+ memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
+ max_overage = max(overage, max_overage);
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
+ !mem_cgroup_is_root(memcg));
+
+ return max_overage;
+}
+
+/*
+ * Get the number of jiffies that we should penalise a mischievous cgroup which
+ * is exceeding its memory.high by checking both it and its ancestors.
+ */
+static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
+ unsigned int nr_pages,
+ u64 max_overage)
+{
+ unsigned long penalty_jiffies;
+
+ if (!max_overage)
+ return 0;
+
+ /*
+ * We use overage compared to memory.high to calculate the number of
+ * jiffies to sleep (penalty_jiffies). Ideally this value should be
+ * fairly lenient on small overages, and increasingly harsh when the
+ * memcg in question makes it clear that it has no intention of stopping
+ * its crazy behaviour, so we exponentially increase the delay based on
+ * overage amount.
+ */
+ penalty_jiffies = max_overage * max_overage * HZ;
+ penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
+ penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
+
+ /*
+ * Factor in the task's own contribution to the overage, such that four
+ * N-sized allocations are throttled approximately the same as one
+ * 4N-sized allocation.
+ *
+ * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
+ * larger the current charge patch is than that.
+ */
+ return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
+}
+
+/*
+ * Scheduled by try_charge() to be executed from the userland return path
+ * and reclaims memory over the high limit.
+ */
+void mem_cgroup_handle_over_high(void)
+{
+ unsigned long penalty_jiffies;
+ unsigned long pflags;
+ unsigned long nr_reclaimed;
+ unsigned int nr_pages = current->memcg_nr_pages_over_high;
+ int nr_retries = MAX_RECLAIM_RETRIES;
+ struct mem_cgroup *memcg;
+ bool in_retry = false;
+
+ if (likely(!nr_pages))
+ return;
+
+ memcg = get_mem_cgroup_from_mm(current->mm);
+ current->memcg_nr_pages_over_high = 0;
+
+retry_reclaim:
+ /*
+ * The allocating task should reclaim at least the batch size, but for
+ * subsequent retries we only want to do what's necessary to prevent oom
+ * or breaching resource isolation.
+ *
+ * This is distinct from memory.max or page allocator behaviour because
+ * memory.high is currently batched, whereas memory.max and the page
+ * allocator run every time an allocation is made.
+ */
+ nr_reclaimed = reclaim_high(memcg,
+ in_retry ? SWAP_CLUSTER_MAX : nr_pages,
+ GFP_KERNEL);
+
+ /*
+ * memory.high is breached and reclaim is unable to keep up. Throttle
+ * allocators proactively to slow down excessive growth.
+ */
+ penalty_jiffies = calculate_high_delay(memcg, nr_pages,
+ mem_find_max_overage(memcg));
+
+ penalty_jiffies += calculate_high_delay(memcg, nr_pages,
+ swap_find_max_overage(memcg));
+
+ /*
+ * Clamp the max delay per usermode return so as to still keep the
+ * application moving forwards and also permit diagnostics, albeit
+ * extremely slowly.
+ */
+ penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
+
+ /*
+ * Don't sleep if the amount of jiffies this memcg owes us is so low
+ * that it's not even worth doing, in an attempt to be nice to those who
+ * go only a small amount over their memory.high value and maybe haven't
+ * been aggressively reclaimed enough yet.
+ */
+ if (penalty_jiffies <= HZ / 100)
+ goto out;
+
+ /*
+ * If reclaim is making forward progress but we're still over
+ * memory.high, we want to encourage that rather than doing allocator
+ * throttling.
+ */
+ if (nr_reclaimed || nr_retries--) {
+ in_retry = true;
+ goto retry_reclaim;
+ }
+
+ /*
+ * If we exit early, we're guaranteed to die (since
+ * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
+ * need to account for any ill-begotten jiffies to pay them off later.
+ */
+ psi_memstall_enter(&pflags);
+ schedule_timeout_killable(penalty_jiffies);
+ psi_memstall_leave(&pflags);
+
+out:
+ css_put(&memcg->css);
+}
+
+static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
+ unsigned int nr_pages)
+{
+ unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
+ int nr_retries = MAX_RECLAIM_RETRIES;
+ struct mem_cgroup *mem_over_limit;
+ struct page_counter *counter;
+ enum oom_status oom_status;
+ unsigned long nr_reclaimed;
+ bool passed_oom = false;
+ bool may_swap = true;
+ bool drained = false;
+ unsigned long pflags;
+
+ if (mem_cgroup_is_root(memcg))
+ return 0;
+retry:
+ if (consume_stock(memcg, nr_pages))
+ return 0;
+
+ if (!do_memsw_account() ||
+ page_counter_try_charge(&memcg->memsw, batch, &counter)) {
+ if (page_counter_try_charge(&memcg->memory, batch, &counter))
+ goto done_restock;
+ if (do_memsw_account())
+ page_counter_uncharge(&memcg->memsw, batch);
+ mem_over_limit = mem_cgroup_from_counter(counter, memory);
+ } else {
+ mem_over_limit = mem_cgroup_from_counter(counter, memsw);
+ may_swap = false;
+ }
+
+ if (batch > nr_pages) {
+ batch = nr_pages;
+ goto retry;
+ }
+
+ /*
+ * Memcg doesn't have a dedicated reserve for atomic
+ * allocations. But like the global atomic pool, we need to
+ * put the burden of reclaim on regular allocation requests
+ * and let these go through as privileged allocations.
+ */
+ if (gfp_mask & __GFP_ATOMIC)
+ goto force;
+
+ /*
+ * Prevent unbounded recursion when reclaim operations need to
+ * allocate memory. This might exceed the limits temporarily,
+ * but we prefer facilitating memory reclaim and getting back
+ * under the limit over triggering OOM kills in these cases.
+ */
+ if (unlikely(current->flags & PF_MEMALLOC))
+ goto force;
+
+ if (unlikely(task_in_memcg_oom(current)))
+ goto nomem;
+
+ if (!gfpflags_allow_blocking(gfp_mask))
+ goto nomem;
+
+ memcg_memory_event(mem_over_limit, MEMCG_MAX);
+
+ psi_memstall_enter(&pflags);
+ nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
+ gfp_mask, may_swap);
+ psi_memstall_leave(&pflags);
+
+ if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
+ goto retry;
+
+ if (!drained) {
+ drain_all_stock(mem_over_limit);
+ drained = true;
+ goto retry;
+ }
+
+ if (gfp_mask & __GFP_NORETRY)
+ goto nomem;
+ /*
+ * Even though the limit is exceeded at this point, reclaim
+ * may have been able to free some pages. Retry the charge
+ * before killing the task.
+ *
+ * Only for regular pages, though: huge pages are rather
+ * unlikely to succeed so close to the limit, and we fall back
+ * to regular pages anyway in case of failure.
+ */
+ if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
+ goto retry;
+ /*
+ * At task move, charge accounts can be doubly counted. So, it's
+ * better to wait until the end of task_move if something is going on.
+ */
+ if (mem_cgroup_wait_acct_move(mem_over_limit))
+ goto retry;
+
+ if (nr_retries--)
+ goto retry;
+
+ if (gfp_mask & __GFP_RETRY_MAYFAIL)
+ goto nomem;
+
+ if (gfp_mask & __GFP_NOFAIL)
+ goto force;
+
+ /* Avoid endless loop for tasks bypassed by the oom killer */
+ if (passed_oom && task_is_dying())
+ goto nomem;
+
+ /*
+ * keep retrying as long as the memcg oom killer is able to make
+ * a forward progress or bypass the charge if the oom killer
+ * couldn't make any progress.
+ */
+ oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
+ get_order(nr_pages * PAGE_SIZE));
+ if (oom_status == OOM_SUCCESS) {
+ passed_oom = true;
+ nr_retries = MAX_RECLAIM_RETRIES;
+ goto retry;
+ }
+nomem:
+ if (!(gfp_mask & __GFP_NOFAIL))
+ return -ENOMEM;
+force:
+ /*
+ * The allocation either can't fail or will lead to more memory
+ * being freed very soon. Allow memory usage go over the limit
+ * temporarily by force charging it.
+ */
+ page_counter_charge(&memcg->memory, nr_pages);
+ if (do_memsw_account())
+ page_counter_charge(&memcg->memsw, nr_pages);
+
+ return 0;
+
+done_restock:
+ if (batch > nr_pages)
+ refill_stock(memcg, batch - nr_pages);
+
+ /*
+ * If the hierarchy is above the normal consumption range, schedule
+ * reclaim on returning to userland. We can perform reclaim here
+ * if __GFP_RECLAIM but let's always punt for simplicity and so that
+ * GFP_KERNEL can consistently be used during reclaim. @memcg is
+ * not recorded as it most likely matches current's and won't
+ * change in the meantime. As high limit is checked again before
+ * reclaim, the cost of mismatch is negligible.
+ */
+ do {
+ bool mem_high, swap_high;
+
+ mem_high = page_counter_read(&memcg->memory) >
+ READ_ONCE(memcg->memory.high);
+ swap_high = page_counter_read(&memcg->swap) >
+ READ_ONCE(memcg->swap.high);
+
+ /* Don't bother a random interrupted task */
+ if (in_interrupt()) {
+ if (mem_high) {
+ schedule_work(&memcg->high_work);
+ break;
+ }
+ continue;
+ }
+
+ if (mem_high || swap_high) {
+ /*
+ * The allocating tasks in this cgroup will need to do
+ * reclaim or be throttled to prevent further growth
+ * of the memory or swap footprints.
+ *
+ * Target some best-effort fairness between the tasks,
+ * and distribute reclaim work and delay penalties
+ * based on how much each task is actually allocating.
+ */
+ current->memcg_nr_pages_over_high += batch;
+ set_notify_resume(current);
+ break;
+ }
+ } while ((memcg = parent_mem_cgroup(memcg)));
+
+ return 0;
+}
+
+#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU)
+static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
+{
+ if (mem_cgroup_is_root(memcg))
+ return;
+
+ page_counter_uncharge(&memcg->memory, nr_pages);
+ if (do_memsw_account())
+ page_counter_uncharge(&memcg->memsw, nr_pages);
+}
+#endif
+
+static void commit_charge(struct page *page, struct mem_cgroup *memcg)
+{
+ VM_BUG_ON_PAGE(page->mem_cgroup, page);
+ /*
+ * Any of the following ensures page->mem_cgroup stability:
+ *
+ * - the page lock
+ * - LRU isolation
+ * - lock_page_memcg()
+ * - exclusive reference
+ */
+ page->mem_cgroup = memcg;
+}
+
+#ifdef CONFIG_MEMCG_KMEM
+/*
+ * The allocated objcg pointers array is not accounted directly.
+ * Moreover, it should not come from DMA buffer and is not readily
+ * reclaimable. So those GFP bits should be masked off.
+ */
+#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \
+ __GFP_ACCOUNT | __GFP_NOFAIL)
+
+int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
+ gfp_t gfp)
+{
+ unsigned int objects = objs_per_slab_page(s, page);
+ void *vec;
+
+ gfp &= ~OBJCGS_CLEAR_MASK;
+ vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
+ page_to_nid(page));
+ if (!vec)
+ return -ENOMEM;
+
+ if (cmpxchg(&page->obj_cgroups, NULL,
+ (struct obj_cgroup **) ((unsigned long)vec | 0x1UL)))
+ kfree(vec);
+ else
+ kmemleak_not_leak(vec);
+
+ return 0;
+}
+
+/*
+ * Returns a pointer to the memory cgroup to which the kernel object is charged.
+ *
+ * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
+ * cgroup_mutex, etc.
+ */
+struct mem_cgroup *mem_cgroup_from_obj(void *p)
+{
+ struct page *page;
+
+ if (mem_cgroup_disabled())
+ return NULL;
+
+ page = virt_to_head_page(p);
+
+ /*
+ * If page->mem_cgroup is set, it's either a simple mem_cgroup pointer
+ * or a pointer to obj_cgroup vector. In the latter case the lowest
+ * bit of the pointer is set.
+ * The page->mem_cgroup pointer can be asynchronously changed
+ * from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed
+ * from a valid memcg pointer to objcg vector or back.
+ */
+ if (!page->mem_cgroup)
+ return NULL;
+
+ /*
+ * Slab objects are accounted individually, not per-page.
+ * Memcg membership data for each individual object is saved in
+ * the page->obj_cgroups.
+ */
+ if (page_has_obj_cgroups(page)) {
+ struct obj_cgroup *objcg;
+ unsigned int off;
+
+ off = obj_to_index(page->slab_cache, page, p);
+ objcg = page_obj_cgroups(page)[off];
+ if (objcg)
+ return obj_cgroup_memcg(objcg);
+
+ return NULL;
+ }
+
+ /* All other pages use page->mem_cgroup */
+ return page->mem_cgroup;
+}
+
+__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
+{
+ struct obj_cgroup *objcg = NULL;
+ struct mem_cgroup *memcg;
+
+ if (memcg_kmem_bypass())
+ return NULL;
+
+ rcu_read_lock();
+ if (unlikely(active_memcg()))
+ memcg = active_memcg();
+ else
+ memcg = mem_cgroup_from_task(current);
+
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
+ objcg = rcu_dereference(memcg->objcg);
+ if (objcg && obj_cgroup_tryget(objcg))
+ break;
+ objcg = NULL;
+ }
+ rcu_read_unlock();
+
+ return objcg;
+}
+
+static int memcg_alloc_cache_id(void)
+{
+ int id, size;
+ int err;
+
+ id = ida_simple_get(&memcg_cache_ida,
+ 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
+ if (id < 0)
+ return id;
+
+ if (id < memcg_nr_cache_ids)
+ return id;
+
+ /*
+ * There's no space for the new id in memcg_caches arrays,
+ * so we have to grow them.
+ */
+ down_write(&memcg_cache_ids_sem);
+
+ size = 2 * (id + 1);
+ if (size < MEMCG_CACHES_MIN_SIZE)
+ size = MEMCG_CACHES_MIN_SIZE;
+ else if (size > MEMCG_CACHES_MAX_SIZE)
+ size = MEMCG_CACHES_MAX_SIZE;
+
+ err = memcg_update_all_list_lrus(size);
+ if (!err)
+ memcg_nr_cache_ids = size;
+
+ up_write(&memcg_cache_ids_sem);
+
+ if (err) {
+ ida_simple_remove(&memcg_cache_ida, id);
+ return err;
+ }
+ return id;
+}
+
+static void memcg_free_cache_id(int id)
+{
+ ida_simple_remove(&memcg_cache_ida, id);
+}
+
+/**
+ * __memcg_kmem_charge: charge a number of kernel pages to a memcg
+ * @memcg: memory cgroup to charge
+ * @gfp: reclaim mode
+ * @nr_pages: number of pages to charge
+ *
+ * Returns 0 on success, an error code on failure.
+ */
+int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
+ unsigned int nr_pages)
+{
+ struct page_counter *counter;
+ int ret;
+
+ ret = try_charge(memcg, gfp, nr_pages);
+ if (ret)
+ return ret;
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
+ !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
+
+ /*
+ * Enforce __GFP_NOFAIL allocation because callers are not
+ * prepared to see failures and likely do not have any failure
+ * handling code.
+ */
+ if (gfp & __GFP_NOFAIL) {
+ page_counter_charge(&memcg->kmem, nr_pages);
+ return 0;
+ }
+ cancel_charge(memcg, nr_pages);
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+/**
+ * __memcg_kmem_uncharge: uncharge a number of kernel pages from a memcg
+ * @memcg: memcg to uncharge
+ * @nr_pages: number of pages to uncharge
+ */
+void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
+{
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ page_counter_uncharge(&memcg->kmem, nr_pages);
+
+ refill_stock(memcg, nr_pages);
+}
+
+/**
+ * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
+ * @page: page to charge
+ * @gfp: reclaim mode
+ * @order: allocation order
+ *
+ * Returns 0 on success, an error code on failure.
+ */
+int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
+{
+ struct mem_cgroup *memcg;
+ int ret = 0;
+
+ memcg = get_mem_cgroup_from_current();
+ if (memcg && !mem_cgroup_is_root(memcg)) {
+ ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
+ if (!ret) {
+ page->mem_cgroup = memcg;
+ __SetPageKmemcg(page);
+ return 0;
+ }
+ css_put(&memcg->css);
+ }
+ return ret;
+}
+
+/**
+ * __memcg_kmem_uncharge_page: uncharge a kmem page
+ * @page: page to uncharge
+ * @order: allocation order
+ */
+void __memcg_kmem_uncharge_page(struct page *page, int order)
+{
+ struct mem_cgroup *memcg = page->mem_cgroup;
+ unsigned int nr_pages = 1 << order;
+
+ if (!memcg)
+ return;
+
+ VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
+ __memcg_kmem_uncharge(memcg, nr_pages);
+ page->mem_cgroup = NULL;
+ css_put(&memcg->css);
+
+ /* slab pages do not have PageKmemcg flag set */
+ if (PageKmemcg(page))
+ __ClearPageKmemcg(page);
+}
+
+static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
+{
+ struct memcg_stock_pcp *stock;
+ unsigned long flags;
+ bool ret = false;
+
+ local_irq_save(flags);
+
+ stock = this_cpu_ptr(&memcg_stock);
+ if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
+ stock->nr_bytes -= nr_bytes;
+ ret = true;
+ }
+
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+static void drain_obj_stock(struct memcg_stock_pcp *stock)
+{
+ struct obj_cgroup *old = stock->cached_objcg;
+
+ if (!old)
+ return;
+
+ if (stock->nr_bytes) {
+ unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
+ unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
+
+ if (nr_pages) {
+ struct mem_cgroup *memcg;
+
+ rcu_read_lock();
+retry:
+ memcg = obj_cgroup_memcg(old);
+ if (unlikely(!css_tryget(&memcg->css)))
+ goto retry;
+ rcu_read_unlock();
+
+ __memcg_kmem_uncharge(memcg, nr_pages);
+ css_put(&memcg->css);
+ }
+
+ /*
+ * The leftover is flushed to the centralized per-memcg value.
+ * On the next attempt to refill obj stock it will be moved
+ * to a per-cpu stock (probably, on an other CPU), see
+ * refill_obj_stock().
+ *
+ * How often it's flushed is a trade-off between the memory
+ * limit enforcement accuracy and potential CPU contention,
+ * so it might be changed in the future.
+ */
+ atomic_add(nr_bytes, &old->nr_charged_bytes);
+ stock->nr_bytes = 0;
+ }
+
+ obj_cgroup_put(old);
+ stock->cached_objcg = NULL;
+}
+
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+ struct mem_cgroup *root_memcg)
+{
+ struct mem_cgroup *memcg;
+
+ if (stock->cached_objcg) {
+ memcg = obj_cgroup_memcg(stock->cached_objcg);
+ if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
+ return true;
+ }
+
+ return false;
+}
+
+static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
+{
+ struct memcg_stock_pcp *stock;
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ stock = this_cpu_ptr(&memcg_stock);
+ if (stock->cached_objcg != objcg) { /* reset if necessary */
+ drain_obj_stock(stock);
+ obj_cgroup_get(objcg);
+ stock->cached_objcg = objcg;
+ stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0);
+ }
+ stock->nr_bytes += nr_bytes;
+
+ if (stock->nr_bytes > PAGE_SIZE)
+ drain_obj_stock(stock);
+
+ local_irq_restore(flags);
+}
+
+int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
+{
+ struct mem_cgroup *memcg;
+ unsigned int nr_pages, nr_bytes;
+ int ret;
+
+ if (consume_obj_stock(objcg, size))
+ return 0;
+
+ /*
+ * In theory, memcg->nr_charged_bytes can have enough
+ * pre-charged bytes to satisfy the allocation. However,
+ * flushing memcg->nr_charged_bytes requires two atomic
+ * operations, and memcg->nr_charged_bytes can't be big,
+ * so it's better to ignore it and try grab some new pages.
+ * memcg->nr_charged_bytes will be flushed in
+ * refill_obj_stock(), called from this function or
+ * independently later.
+ */
+ rcu_read_lock();
+retry:
+ memcg = obj_cgroup_memcg(objcg);
+ if (unlikely(!css_tryget(&memcg->css)))
+ goto retry;
+ rcu_read_unlock();
+
+ nr_pages = size >> PAGE_SHIFT;
+ nr_bytes = size & (PAGE_SIZE - 1);
+
+ if (nr_bytes)
+ nr_pages += 1;
+
+ ret = __memcg_kmem_charge(memcg, gfp, nr_pages);
+ if (!ret && nr_bytes)
+ refill_obj_stock(objcg, PAGE_SIZE - nr_bytes);
+
+ css_put(&memcg->css);
+ return ret;
+}
+
+void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
+{
+ refill_obj_stock(objcg, size);
+}
+
+#endif /* CONFIG_MEMCG_KMEM */
+
+/*
+ * Because head->mem_cgroup is not set on tails, set it now.
+ */
+void split_page_memcg(struct page *head, unsigned int nr)
+{
+ struct mem_cgroup *memcg = head->mem_cgroup;
+ int kmemcg = PageKmemcg(head);
+ int i;
+
+ if (mem_cgroup_disabled() || !memcg)
+ return;
+
+ for (i = 1; i < nr; i++) {
+ head[i].mem_cgroup = memcg;
+ if (kmemcg)
+ __SetPageKmemcg(head + i);
+ }
+ css_get_many(&memcg->css, nr - 1);
+}
+
+#ifdef CONFIG_MEMCG_SWAP
+/**
+ * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
+ * @entry: swap entry to be moved
+ * @from: mem_cgroup which the entry is moved from
+ * @to: mem_cgroup which the entry is moved to
+ *
+ * It succeeds only when the swap_cgroup's record for this entry is the same
+ * as the mem_cgroup's id of @from.
+ *
+ * Returns 0 on success, -EINVAL on failure.
+ *
+ * The caller must have charged to @to, IOW, called page_counter_charge() about
+ * both res and memsw, and called css_get().
+ */
+static int mem_cgroup_move_swap_account(swp_entry_t entry,
+ struct mem_cgroup *from, struct mem_cgroup *to)
+{
+ unsigned short old_id, new_id;
+
+ old_id = mem_cgroup_id(from);
+ new_id = mem_cgroup_id(to);
+
+ if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
+ mod_memcg_state(from, MEMCG_SWAP, -1);
+ mod_memcg_state(to, MEMCG_SWAP, 1);
+ return 0;
+ }
+ return -EINVAL;
+}
+#else
+static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
+ struct mem_cgroup *from, struct mem_cgroup *to)
+{
+ return -EINVAL;
+}
+#endif
+
+static DEFINE_MUTEX(memcg_max_mutex);
+
+static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
+ unsigned long max, bool memsw)
+{
+ bool enlarge = false;
+ bool drained = false;
+ int ret;
+ bool limits_invariant;
+ struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
+
+ do {
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+
+ mutex_lock(&memcg_max_mutex);
+ /*
+ * Make sure that the new limit (memsw or memory limit) doesn't
+ * break our basic invariant rule memory.max <= memsw.max.
+ */
+ limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
+ max <= memcg->memsw.max;
+ if (!limits_invariant) {
+ mutex_unlock(&memcg_max_mutex);
+ ret = -EINVAL;
+ break;
+ }
+ if (max > counter->max)
+ enlarge = true;
+ ret = page_counter_set_max(counter, max);
+ mutex_unlock(&memcg_max_mutex);
+
+ if (!ret)
+ break;
+
+ if (!drained) {
+ drain_all_stock(memcg);
+ drained = true;
+ continue;
+ }
+
+ if (!try_to_free_mem_cgroup_pages(memcg, 1,
+ GFP_KERNEL, !memsw)) {
+ ret = -EBUSY;
+ break;
+ }
+ } while (true);
+
+ if (!ret && enlarge)
+ memcg_oom_recover(memcg);
+
+ return ret;
+}
+
+unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
+ gfp_t gfp_mask,
+ unsigned long *total_scanned)
+{
+ unsigned long nr_reclaimed = 0;
+ struct mem_cgroup_per_node *mz, *next_mz = NULL;
+ unsigned long reclaimed;
+ int loop = 0;
+ struct mem_cgroup_tree_per_node *mctz;
+ unsigned long excess;
+ unsigned long nr_scanned;
+
+ if (order > 0)
+ return 0;
+
+ mctz = soft_limit_tree_node(pgdat->node_id);
+
+ /*
+ * Do not even bother to check the largest node if the root
+ * is empty. Do it lockless to prevent lock bouncing. Races
+ * are acceptable as soft limit is best effort anyway.
+ */
+ if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
+ return 0;
+
+ /*
+ * This loop can run a while, specially if mem_cgroup's continuously
+ * keep exceeding their soft limit and putting the system under
+ * pressure
+ */
+ do {
+ if (next_mz)
+ mz = next_mz;
+ else
+ mz = mem_cgroup_largest_soft_limit_node(mctz);
+ if (!mz)
+ break;
+
+ nr_scanned = 0;
+ reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
+ gfp_mask, &nr_scanned);
+ nr_reclaimed += reclaimed;
+ *total_scanned += nr_scanned;
+ spin_lock_irq(&mctz->lock);
+ __mem_cgroup_remove_exceeded(mz, mctz);
+
+ /*
+ * If we failed to reclaim anything from this memory cgroup
+ * it is time to move on to the next cgroup
+ */
+ next_mz = NULL;
+ if (!reclaimed)
+ next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
+
+ excess = soft_limit_excess(mz->memcg);
+ /*
+ * One school of thought says that we should not add
+ * back the node to the tree if reclaim returns 0.
+ * But our reclaim could return 0, simply because due
+ * to priority we are exposing a smaller subset of
+ * memory to reclaim from. Consider this as a longer
+ * term TODO.
+ */
+ /* If excess == 0, no tree ops */
+ __mem_cgroup_insert_exceeded(mz, mctz, excess);
+ spin_unlock_irq(&mctz->lock);
+ css_put(&mz->memcg->css);
+ loop++;
+ /*
+ * Could not reclaim anything and there are no more
+ * mem cgroups to try or we seem to be looping without
+ * reclaiming anything.
+ */
+ if (!nr_reclaimed &&
+ (next_mz == NULL ||
+ loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
+ break;
+ } while (!nr_reclaimed);
+ if (next_mz)
+ css_put(&next_mz->memcg->css);
+ return nr_reclaimed;
+}
+
+/*
+ * Test whether @memcg has children, dead or alive. Note that this
+ * function doesn't care whether @memcg has use_hierarchy enabled and
+ * returns %true if there are child csses according to the cgroup
+ * hierarchy. Testing use_hierarchy is the caller's responsibility.
+ */
+static inline bool memcg_has_children(struct mem_cgroup *memcg)
+{
+ bool ret;
+
+ rcu_read_lock();
+ ret = css_next_child(NULL, &memcg->css);
+ rcu_read_unlock();
+ return ret;
+}
+
+/*
+ * Reclaims as many pages from the given memcg as possible.
+ *
+ * Caller is responsible for holding css reference for memcg.
+ */
+static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
+{
+ int nr_retries = MAX_RECLAIM_RETRIES;
+
+ /* we call try-to-free pages for make this cgroup empty */
+ lru_add_drain_all();
+
+ drain_all_stock(memcg);
+
+ /* try to free all pages in this cgroup */
+ while (nr_retries && page_counter_read(&memcg->memory)) {
+ int progress;
+
+ if (signal_pending(current))
+ return -EINTR;
+
+ progress = try_to_free_mem_cgroup_pages(memcg, 1,
+ GFP_KERNEL, true);
+ if (!progress) {
+ nr_retries--;
+ /* maybe some writeback is necessary */
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
+ }
+
+ }
+
+ return 0;
+}
+
+static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+
+ if (mem_cgroup_is_root(memcg))
+ return -EINVAL;
+ return mem_cgroup_force_empty(memcg) ?: nbytes;
+}
+
+static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return mem_cgroup_from_css(css)->use_hierarchy;
+}
+
+static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 val)
+{
+ int retval = 0;
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
+
+ if (memcg->use_hierarchy == val)
+ return 0;
+
+ /*
+ * If parent's use_hierarchy is set, we can't make any modifications
+ * in the child subtrees. If it is unset, then the change can
+ * occur, provided the current cgroup has no children.
+ *
+ * For the root cgroup, parent_mem is NULL, we allow value to be
+ * set if there are no children.
+ */
+ if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
+ (val == 1 || val == 0)) {
+ if (!memcg_has_children(memcg))
+ memcg->use_hierarchy = val;
+ else
+ retval = -EBUSY;
+ } else
+ retval = -EINVAL;
+
+ return retval;
+}
+
+static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
+{
+ unsigned long val;
+
+ if (mem_cgroup_is_root(memcg)) {
+ val = memcg_page_state(memcg, NR_FILE_PAGES) +
+ memcg_page_state(memcg, NR_ANON_MAPPED);
+ if (swap)
+ val += memcg_page_state(memcg, MEMCG_SWAP);
+ } else {
+ if (!swap)
+ val = page_counter_read(&memcg->memory);
+ else
+ val = page_counter_read(&memcg->memsw);
+ }
+ return val;
+}
+
+enum {
+ RES_USAGE,
+ RES_LIMIT,
+ RES_MAX_USAGE,
+ RES_FAILCNT,
+ RES_SOFT_LIMIT,
+};
+
+static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct page_counter *counter;
+
+ switch (MEMFILE_TYPE(cft->private)) {
+ case _MEM:
+ counter = &memcg->memory;
+ break;
+ case _MEMSWAP:
+ counter = &memcg->memsw;
+ break;
+ case _KMEM:
+ counter = &memcg->kmem;
+ break;
+ case _TCP:
+ counter = &memcg->tcpmem;
+ break;
+ default:
+ BUG();
+ }
+
+ switch (MEMFILE_ATTR(cft->private)) {
+ case RES_USAGE:
+ if (counter == &memcg->memory)
+ return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
+ if (counter == &memcg->memsw)
+ return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
+ return (u64)page_counter_read(counter) * PAGE_SIZE;
+ case RES_LIMIT:
+ return (u64)counter->max * PAGE_SIZE;
+ case RES_MAX_USAGE:
+ return (u64)counter->watermark * PAGE_SIZE;
+ case RES_FAILCNT:
+ return counter->failcnt;
+ case RES_SOFT_LIMIT:
+ return (u64)memcg->soft_limit * PAGE_SIZE;
+ default:
+ BUG();
+ }
+}
+
+static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
+{
+ unsigned long stat[MEMCG_NR_STAT] = {0};
+ struct mem_cgroup *mi;
+ int node, cpu, i;
+
+ for_each_online_cpu(cpu)
+ for (i = 0; i < MEMCG_NR_STAT; i++)
+ stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
+
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ for (i = 0; i < MEMCG_NR_STAT; i++)
+ atomic_long_add(stat[i], &mi->vmstats[i]);
+
+ for_each_node(node) {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
+ struct mem_cgroup_per_node *pi;
+
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ stat[i] = 0;
+
+ for_each_online_cpu(cpu)
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ stat[i] += per_cpu(
+ pn->lruvec_stat_cpu->count[i], cpu);
+
+ for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ atomic_long_add(stat[i], &pi->lruvec_stat[i]);
+ }
+}
+
+static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
+{
+ unsigned long events[NR_VM_EVENT_ITEMS];
+ struct mem_cgroup *mi;
+ int cpu, i;
+
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+ events[i] = 0;
+
+ for_each_online_cpu(cpu)
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+ events[i] += per_cpu(memcg->vmstats_percpu->events[i],
+ cpu);
+
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+ atomic_long_add(events[i], &mi->vmevents[i]);
+}
+
+#ifdef CONFIG_MEMCG_KMEM
+static int memcg_online_kmem(struct mem_cgroup *memcg)
+{
+ struct obj_cgroup *objcg;
+ int memcg_id;
+
+ if (cgroup_memory_nokmem)
+ return 0;
+
+ BUG_ON(memcg->kmemcg_id >= 0);
+ BUG_ON(memcg->kmem_state);
+
+ memcg_id = memcg_alloc_cache_id();
+ if (memcg_id < 0)
+ return memcg_id;
+
+ objcg = obj_cgroup_alloc();
+ if (!objcg) {
+ memcg_free_cache_id(memcg_id);
+ return -ENOMEM;
+ }
+ objcg->memcg = memcg;
+ rcu_assign_pointer(memcg->objcg, objcg);
+
+ static_branch_enable(&memcg_kmem_enabled_key);
+
+ /*
+ * A memory cgroup is considered kmem-online as soon as it gets
+ * kmemcg_id. Setting the id after enabling static branching will
+ * guarantee no one starts accounting before all call sites are
+ * patched.
+ */
+ memcg->kmemcg_id = memcg_id;
+ memcg->kmem_state = KMEM_ONLINE;
+
+ return 0;
+}
+
+static void memcg_offline_kmem(struct mem_cgroup *memcg)
+{
+ struct cgroup_subsys_state *css;
+ struct mem_cgroup *parent, *child;
+ int kmemcg_id;
+
+ if (memcg->kmem_state != KMEM_ONLINE)
+ return;
+
+ memcg->kmem_state = KMEM_ALLOCATED;
+
+ parent = parent_mem_cgroup(memcg);
+ if (!parent)
+ parent = root_mem_cgroup;
+
+ memcg_reparent_objcgs(memcg, parent);
+
+ kmemcg_id = memcg->kmemcg_id;
+ BUG_ON(kmemcg_id < 0);
+
+ /*
+ * Change kmemcg_id of this cgroup and all its descendants to the
+ * parent's id, and then move all entries from this cgroup's list_lrus
+ * to ones of the parent. After we have finished, all list_lrus
+ * corresponding to this cgroup are guaranteed to remain empty. The
+ * ordering is imposed by list_lru_node->lock taken by
+ * memcg_drain_all_list_lrus().
+ */
+ rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */
+ css_for_each_descendant_pre(css, &memcg->css) {
+ child = mem_cgroup_from_css(css);
+ BUG_ON(child->kmemcg_id != kmemcg_id);
+ child->kmemcg_id = parent->kmemcg_id;
+ if (!memcg->use_hierarchy)
+ break;
+ }
+ rcu_read_unlock();
+
+ memcg_drain_all_list_lrus(kmemcg_id, parent);
+
+ memcg_free_cache_id(kmemcg_id);
+}
+
+static void memcg_free_kmem(struct mem_cgroup *memcg)
+{
+ /* css_alloc() failed, offlining didn't happen */
+ if (unlikely(memcg->kmem_state == KMEM_ONLINE))
+ memcg_offline_kmem(memcg);
+}
+#else
+static int memcg_online_kmem(struct mem_cgroup *memcg)
+{
+ return 0;
+}
+static void memcg_offline_kmem(struct mem_cgroup *memcg)
+{
+}
+static void memcg_free_kmem(struct mem_cgroup *memcg)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
+static int memcg_update_kmem_max(struct mem_cgroup *memcg,
+ unsigned long max)
+{
+ int ret;
+
+ mutex_lock(&memcg_max_mutex);
+ ret = page_counter_set_max(&memcg->kmem, max);
+ mutex_unlock(&memcg_max_mutex);
+ return ret;
+}
+
+static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
+{
+ int ret;
+
+ mutex_lock(&memcg_max_mutex);
+
+ ret = page_counter_set_max(&memcg->tcpmem, max);
+ if (ret)
+ goto out;
+
+ if (!memcg->tcpmem_active) {
+ /*
+ * The active flag needs to be written after the static_key
+ * update. This is what guarantees that the socket activation
+ * function is the last one to run. See mem_cgroup_sk_alloc()
+ * for details, and note that we don't mark any socket as
+ * belonging to this memcg until that flag is up.
+ *
+ * We need to do this, because static_keys will span multiple
+ * sites, but we can't control their order. If we mark a socket
+ * as accounted, but the accounting functions are not patched in
+ * yet, we'll lose accounting.
+ *
+ * We never race with the readers in mem_cgroup_sk_alloc(),
+ * because when this value change, the code to process it is not
+ * patched in yet.
+ */
+ static_branch_inc(&memcg_sockets_enabled_key);
+ memcg->tcpmem_active = true;
+ }
+out:
+ mutex_unlock(&memcg_max_mutex);
+ return ret;
+}
+
+/*
+ * The user of this function is...
+ * RES_LIMIT.
+ */
+static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long nr_pages;
+ int ret;
+
+ buf = strstrip(buf);
+ ret = page_counter_memparse(buf, "-1", &nr_pages);
+ if (ret)
+ return ret;
+
+ switch (MEMFILE_ATTR(of_cft(of)->private)) {
+ case RES_LIMIT:
+ if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
+ ret = -EINVAL;
+ break;
+ }
+ switch (MEMFILE_TYPE(of_cft(of)->private)) {
+ case _MEM:
+ ret = mem_cgroup_resize_max(memcg, nr_pages, false);
+ break;
+ case _MEMSWAP:
+ ret = mem_cgroup_resize_max(memcg, nr_pages, true);
+ break;
+ case _KMEM:
+ pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
+ "Please report your usecase to linux-mm@kvack.org if you "
+ "depend on this functionality.\n");
+ ret = memcg_update_kmem_max(memcg, nr_pages);
+ break;
+ case _TCP:
+ ret = memcg_update_tcp_max(memcg, nr_pages);
+ break;
+ }
+ break;
+ case RES_SOFT_LIMIT:
+ memcg->soft_limit = nr_pages;
+ ret = 0;
+ break;
+ }
+ return ret ?: nbytes;
+}
+
+static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ struct page_counter *counter;
+
+ switch (MEMFILE_TYPE(of_cft(of)->private)) {
+ case _MEM:
+ counter = &memcg->memory;
+ break;
+ case _MEMSWAP:
+ counter = &memcg->memsw;
+ break;
+ case _KMEM:
+ counter = &memcg->kmem;
+ break;
+ case _TCP:
+ counter = &memcg->tcpmem;
+ break;
+ default:
+ BUG();
+ }
+
+ switch (MEMFILE_ATTR(of_cft(of)->private)) {
+ case RES_MAX_USAGE:
+ page_counter_reset_watermark(counter);
+ break;
+ case RES_FAILCNT:
+ counter->failcnt = 0;
+ break;
+ default:
+ BUG();
+ }
+
+ return nbytes;
+}
+
+static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return mem_cgroup_from_css(css)->move_charge_at_immigrate;
+}
+
+#ifdef CONFIG_MMU
+static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 val)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. "
+ "Please report your usecase to linux-mm@kvack.org if you "
+ "depend on this functionality.\n");
+
+ if (val & ~MOVE_MASK)
+ return -EINVAL;
+
+ /*
+ * No kind of locking is needed in here, because ->can_attach() will
+ * check this value once in the beginning of the process, and then carry
+ * on with stale data. This means that changes to this value will only
+ * affect task migrations starting after the change.
+ */
+ memcg->move_charge_at_immigrate = val;
+ return 0;
+}
+#else
+static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 val)
+{
+ return -ENOSYS;
+}
+#endif
+
+#ifdef CONFIG_NUMA
+
+#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
+#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
+#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
+
+static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
+ int nid, unsigned int lru_mask, bool tree)
+{
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+ unsigned long nr = 0;
+ enum lru_list lru;
+
+ VM_BUG_ON((unsigned)nid >= nr_node_ids);
+
+ for_each_lru(lru) {
+ if (!(BIT(lru) & lru_mask))
+ continue;
+ if (tree)
+ nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
+ else
+ nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
+ }
+ return nr;
+}
+
+static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
+ unsigned int lru_mask,
+ bool tree)
+{
+ unsigned long nr = 0;
+ enum lru_list lru;
+
+ for_each_lru(lru) {
+ if (!(BIT(lru) & lru_mask))
+ continue;
+ if (tree)
+ nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
+ else
+ nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
+ }
+ return nr;
+}
+
+static int memcg_numa_stat_show(struct seq_file *m, void *v)
+{
+ struct numa_stat {
+ const char *name;
+ unsigned int lru_mask;
+ };
+
+ static const struct numa_stat stats[] = {
+ { "total", LRU_ALL },
+ { "file", LRU_ALL_FILE },
+ { "anon", LRU_ALL_ANON },
+ { "unevictable", BIT(LRU_UNEVICTABLE) },
+ };
+ const struct numa_stat *stat;
+ int nid;
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+ for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
+ seq_printf(m, "%s=%lu", stat->name,
+ mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
+ false));
+ for_each_node_state(nid, N_MEMORY)
+ seq_printf(m, " N%d=%lu", nid,
+ mem_cgroup_node_nr_lru_pages(memcg, nid,
+ stat->lru_mask, false));
+ seq_putc(m, '\n');
+ }
+
+ for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
+
+ seq_printf(m, "hierarchical_%s=%lu", stat->name,
+ mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
+ true));
+ for_each_node_state(nid, N_MEMORY)
+ seq_printf(m, " N%d=%lu", nid,
+ mem_cgroup_node_nr_lru_pages(memcg, nid,
+ stat->lru_mask, true));
+ seq_putc(m, '\n');
+ }
+
+ return 0;
+}
+#endif /* CONFIG_NUMA */
+
+static const unsigned int memcg1_stats[] = {
+ NR_FILE_PAGES,
+ NR_ANON_MAPPED,
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ NR_ANON_THPS,
+#endif
+ NR_SHMEM,
+ NR_FILE_MAPPED,
+ NR_FILE_DIRTY,
+ NR_WRITEBACK,
+ MEMCG_SWAP,
+};
+
+static const char *const memcg1_stat_names[] = {
+ "cache",
+ "rss",
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ "rss_huge",
+#endif
+ "shmem",
+ "mapped_file",
+ "dirty",
+ "writeback",
+ "swap",
+};
+
+/* Universal VM events cgroup1 shows, original sort order */
+static const unsigned int memcg1_events[] = {
+ PGPGIN,
+ PGPGOUT,
+ PGFAULT,
+ PGMAJFAULT,
+};
+
+static int memcg_stat_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+ unsigned long memory, memsw;
+ struct mem_cgroup *mi;
+ unsigned int i;
+
+ BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
+
+ for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
+ unsigned long nr;
+
+ if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
+ continue;
+ nr = memcg_page_state_local(memcg, memcg1_stats[i]);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (memcg1_stats[i] == NR_ANON_THPS)
+ nr *= HPAGE_PMD_NR;
+#endif
+ seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE);
+ }
+
+ for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
+ seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]),
+ memcg_events_local(memcg, memcg1_events[i]));
+
+ for (i = 0; i < NR_LRU_LISTS; i++)
+ seq_printf(m, "%s %lu\n", lru_list_name(i),
+ memcg_page_state_local(memcg, NR_LRU_BASE + i) *
+ PAGE_SIZE);
+
+ /* Hierarchical information */
+ memory = memsw = PAGE_COUNTER_MAX;
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
+ memory = min(memory, READ_ONCE(mi->memory.max));
+ memsw = min(memsw, READ_ONCE(mi->memsw.max));
+ }
+ seq_printf(m, "hierarchical_memory_limit %llu\n",
+ (u64)memory * PAGE_SIZE);
+ if (do_memsw_account())
+ seq_printf(m, "hierarchical_memsw_limit %llu\n",
+ (u64)memsw * PAGE_SIZE);
+
+ for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
+ unsigned long nr;
+
+ if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
+ continue;
+ nr = memcg_page_state(memcg, memcg1_stats[i]);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (memcg1_stats[i] == NR_ANON_THPS)
+ nr *= HPAGE_PMD_NR;
+#endif
+ seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
+ (u64)nr * PAGE_SIZE);
+ }
+
+ for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
+ seq_printf(m, "total_%s %llu\n",
+ vm_event_name(memcg1_events[i]),
+ (u64)memcg_events(memcg, memcg1_events[i]));
+
+ for (i = 0; i < NR_LRU_LISTS; i++)
+ seq_printf(m, "total_%s %llu\n", lru_list_name(i),
+ (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
+ PAGE_SIZE);
+
+#ifdef CONFIG_DEBUG_VM
+ {
+ pg_data_t *pgdat;
+ struct mem_cgroup_per_node *mz;
+ unsigned long anon_cost = 0;
+ unsigned long file_cost = 0;
+
+ for_each_online_pgdat(pgdat) {
+ mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
+
+ anon_cost += mz->lruvec.anon_cost;
+ file_cost += mz->lruvec.file_cost;
+ }
+ seq_printf(m, "anon_cost %lu\n", anon_cost);
+ seq_printf(m, "file_cost %lu\n", file_cost);
+ }
+#endif
+
+ return 0;
+}
+
+static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ return mem_cgroup_swappiness(memcg);
+}
+
+static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 val)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ if (val > 100)
+ return -EINVAL;
+
+ if (css->parent)
+ memcg->swappiness = val;
+ else
+ vm_swappiness = val;
+
+ return 0;
+}
+
+static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
+{
+ struct mem_cgroup_threshold_ary *t;
+ unsigned long usage;
+ int i;
+
+ rcu_read_lock();
+ if (!swap)
+ t = rcu_dereference(memcg->thresholds.primary);
+ else
+ t = rcu_dereference(memcg->memsw_thresholds.primary);
+
+ if (!t)
+ goto unlock;
+
+ usage = mem_cgroup_usage(memcg, swap);
+
+ /*
+ * current_threshold points to threshold just below or equal to usage.
+ * If it's not true, a threshold was crossed after last
+ * call of __mem_cgroup_threshold().
+ */
+ i = t->current_threshold;
+
+ /*
+ * Iterate backward over array of thresholds starting from
+ * current_threshold and check if a threshold is crossed.
+ * If none of thresholds below usage is crossed, we read
+ * only one element of the array here.
+ */
+ for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
+ eventfd_signal(t->entries[i].eventfd, 1);
+
+ /* i = current_threshold + 1 */
+ i++;
+
+ /*
+ * Iterate forward over array of thresholds starting from
+ * current_threshold+1 and check if a threshold is crossed.
+ * If none of thresholds above usage is crossed, we read
+ * only one element of the array here.
+ */
+ for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
+ eventfd_signal(t->entries[i].eventfd, 1);
+
+ /* Update current_threshold */
+ t->current_threshold = i - 1;
+unlock:
+ rcu_read_unlock();
+}
+
+static void mem_cgroup_threshold(struct mem_cgroup *memcg)
+{
+ while (memcg) {
+ __mem_cgroup_threshold(memcg, false);
+ if (do_memsw_account())
+ __mem_cgroup_threshold(memcg, true);
+
+ memcg = parent_mem_cgroup(memcg);
+ }
+}
+
+static int compare_thresholds(const void *a, const void *b)
+{
+ const struct mem_cgroup_threshold *_a = a;
+ const struct mem_cgroup_threshold *_b = b;
+
+ if (_a->threshold > _b->threshold)
+ return 1;
+
+ if (_a->threshold < _b->threshold)
+ return -1;
+
+ return 0;
+}
+
+static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup_eventfd_list *ev;
+
+ spin_lock(&memcg_oom_lock);
+
+ list_for_each_entry(ev, &memcg->oom_notify, list)
+ eventfd_signal(ev->eventfd, 1);
+
+ spin_unlock(&memcg_oom_lock);
+ return 0;
+}
+
+static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *iter;
+
+ for_each_mem_cgroup_tree(iter, memcg)
+ mem_cgroup_oom_notify_cb(iter);
+}
+
+static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args, enum res_type type)
+{
+ struct mem_cgroup_thresholds *thresholds;
+ struct mem_cgroup_threshold_ary *new;
+ unsigned long threshold;
+ unsigned long usage;
+ int i, size, ret;
+
+ ret = page_counter_memparse(args, "-1", &threshold);
+ if (ret)
+ return ret;
+
+ mutex_lock(&memcg->thresholds_lock);
+
+ if (type == _MEM) {
+ thresholds = &memcg->thresholds;
+ usage = mem_cgroup_usage(memcg, false);
+ } else if (type == _MEMSWAP) {
+ thresholds = &memcg->memsw_thresholds;
+ usage = mem_cgroup_usage(memcg, true);
+ } else
+ BUG();
+
+ /* Check if a threshold crossed before adding a new one */
+ if (thresholds->primary)
+ __mem_cgroup_threshold(memcg, type == _MEMSWAP);
+
+ size = thresholds->primary ? thresholds->primary->size + 1 : 1;
+
+ /* Allocate memory for new array of thresholds */
+ new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
+ if (!new) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+ new->size = size;
+
+ /* Copy thresholds (if any) to new array */
+ if (thresholds->primary)
+ memcpy(new->entries, thresholds->primary->entries,
+ flex_array_size(new, entries, size - 1));
+
+ /* Add new threshold */
+ new->entries[size - 1].eventfd = eventfd;
+ new->entries[size - 1].threshold = threshold;
+
+ /* Sort thresholds. Registering of new threshold isn't time-critical */
+ sort(new->entries, size, sizeof(*new->entries),
+ compare_thresholds, NULL);
+
+ /* Find current threshold */
+ new->current_threshold = -1;
+ for (i = 0; i < size; i++) {
+ if (new->entries[i].threshold <= usage) {
+ /*
+ * new->current_threshold will not be used until
+ * rcu_assign_pointer(), so it's safe to increment
+ * it here.
+ */
+ ++new->current_threshold;
+ } else
+ break;
+ }
+
+ /* Free old spare buffer and save old primary buffer as spare */
+ kfree(thresholds->spare);
+ thresholds->spare = thresholds->primary;
+
+ rcu_assign_pointer(thresholds->primary, new);
+
+ /* To be sure that nobody uses thresholds */
+ synchronize_rcu();
+
+unlock:
+ mutex_unlock(&memcg->thresholds_lock);
+
+ return ret;
+}
+
+static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args)
+{
+ return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
+}
+
+static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args)
+{
+ return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
+}
+
+static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, enum res_type type)
+{
+ struct mem_cgroup_thresholds *thresholds;
+ struct mem_cgroup_threshold_ary *new;
+ unsigned long usage;
+ int i, j, size, entries;
+
+ mutex_lock(&memcg->thresholds_lock);
+
+ if (type == _MEM) {
+ thresholds = &memcg->thresholds;
+ usage = mem_cgroup_usage(memcg, false);
+ } else if (type == _MEMSWAP) {
+ thresholds = &memcg->memsw_thresholds;
+ usage = mem_cgroup_usage(memcg, true);
+ } else
+ BUG();
+
+ if (!thresholds->primary)
+ goto unlock;
+
+ /* Check if a threshold crossed before removing */
+ __mem_cgroup_threshold(memcg, type == _MEMSWAP);
+
+ /* Calculate new number of threshold */
+ size = entries = 0;
+ for (i = 0; i < thresholds->primary->size; i++) {
+ if (thresholds->primary->entries[i].eventfd != eventfd)
+ size++;
+ else
+ entries++;
+ }
+
+ new = thresholds->spare;
+
+ /* If no items related to eventfd have been cleared, nothing to do */
+ if (!entries)
+ goto unlock;
+
+ /* Set thresholds array to NULL if we don't have thresholds */
+ if (!size) {
+ kfree(new);
+ new = NULL;
+ goto swap_buffers;
+ }
+
+ new->size = size;
+
+ /* Copy thresholds and find current threshold */
+ new->current_threshold = -1;
+ for (i = 0, j = 0; i < thresholds->primary->size; i++) {
+ if (thresholds->primary->entries[i].eventfd == eventfd)
+ continue;
+
+ new->entries[j] = thresholds->primary->entries[i];
+ if (new->entries[j].threshold <= usage) {
+ /*
+ * new->current_threshold will not be used
+ * until rcu_assign_pointer(), so it's safe to increment
+ * it here.
+ */
+ ++new->current_threshold;
+ }
+ j++;
+ }
+
+swap_buffers:
+ /* Swap primary and spare array */
+ thresholds->spare = thresholds->primary;
+
+ rcu_assign_pointer(thresholds->primary, new);
+
+ /* To be sure that nobody uses thresholds */
+ synchronize_rcu();
+
+ /* If all events are unregistered, free the spare array */
+ if (!new) {
+ kfree(thresholds->spare);
+ thresholds->spare = NULL;
+ }
+unlock:
+ mutex_unlock(&memcg->thresholds_lock);
+}
+
+static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd)
+{
+ return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
+}
+
+static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd)
+{
+ return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
+}
+
+static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args)
+{
+ struct mem_cgroup_eventfd_list *event;
+
+ event = kmalloc(sizeof(*event), GFP_KERNEL);
+ if (!event)
+ return -ENOMEM;
+
+ spin_lock(&memcg_oom_lock);
+
+ event->eventfd = eventfd;
+ list_add(&event->list, &memcg->oom_notify);
+
+ /* already in OOM ? */
+ if (memcg->under_oom)
+ eventfd_signal(eventfd, 1);
+ spin_unlock(&memcg_oom_lock);
+
+ return 0;
+}
+
+static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd)
+{
+ struct mem_cgroup_eventfd_list *ev, *tmp;
+
+ spin_lock(&memcg_oom_lock);
+
+ list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
+ if (ev->eventfd == eventfd) {
+ list_del(&ev->list);
+ kfree(ev);
+ }
+ }
+
+ spin_unlock(&memcg_oom_lock);
+}
+
+static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
+
+ seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
+ seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
+ seq_printf(sf, "oom_kill %lu\n",
+ atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
+ return 0;
+}
+
+static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 val)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ /* cannot set to root cgroup and only 0 and 1 are allowed */
+ if (!css->parent || !((val == 0) || (val == 1)))
+ return -EINVAL;
+
+ memcg->oom_kill_disable = val;
+ if (!val)
+ memcg_oom_recover(memcg);
+
+ return 0;
+}
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+#include <trace/events/writeback.h>
+
+static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
+{
+ return wb_domain_init(&memcg->cgwb_domain, gfp);
+}
+
+static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
+{
+ wb_domain_exit(&memcg->cgwb_domain);
+}
+
+static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
+{
+ wb_domain_size_changed(&memcg->cgwb_domain);
+}
+
+struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+
+ if (!memcg->css.parent)
+ return NULL;
+
+ return &memcg->cgwb_domain;
+}
+
+/*
+ * idx can be of type enum memcg_stat_item or node_stat_item.
+ * Keep in sync with memcg_exact_page().
+ */
+static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
+{
+ long x = atomic_long_read(&memcg->vmstats[idx]);
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx];
+ if (x < 0)
+ x = 0;
+ return x;
+}
+
+/**
+ * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
+ * @wb: bdi_writeback in question
+ * @pfilepages: out parameter for number of file pages
+ * @pheadroom: out parameter for number of allocatable pages according to memcg
+ * @pdirty: out parameter for number of dirty pages
+ * @pwriteback: out parameter for number of pages under writeback
+ *
+ * Determine the numbers of file, headroom, dirty, and writeback pages in
+ * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom
+ * is a bit more involved.
+ *
+ * A memcg's headroom is "min(max, high) - used". In the hierarchy, the
+ * headroom is calculated as the lowest headroom of itself and the
+ * ancestors. Note that this doesn't consider the actual amount of
+ * available memory in the system. The caller should further cap
+ * *@pheadroom accordingly.
+ */
+void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
+ unsigned long *pheadroom, unsigned long *pdirty,
+ unsigned long *pwriteback)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+ struct mem_cgroup *parent;
+
+ *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
+
+ *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
+ *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
+ memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
+ *pheadroom = PAGE_COUNTER_MAX;
+
+ while ((parent = parent_mem_cgroup(memcg))) {
+ unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
+ READ_ONCE(memcg->memory.high));
+ unsigned long used = page_counter_read(&memcg->memory);
+
+ *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
+ memcg = parent;
+ }
+}
+
+/*
+ * Foreign dirty flushing
+ *
+ * There's an inherent mismatch between memcg and writeback. The former
+ * trackes ownership per-page while the latter per-inode. This was a
+ * deliberate design decision because honoring per-page ownership in the
+ * writeback path is complicated, may lead to higher CPU and IO overheads
+ * and deemed unnecessary given that write-sharing an inode across
+ * different cgroups isn't a common use-case.
+ *
+ * Combined with inode majority-writer ownership switching, this works well
+ * enough in most cases but there are some pathological cases. For
+ * example, let's say there are two cgroups A and B which keep writing to
+ * different but confined parts of the same inode. B owns the inode and
+ * A's memory is limited far below B's. A's dirty ratio can rise enough to
+ * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
+ * triggering background writeback. A will be slowed down without a way to
+ * make writeback of the dirty pages happen.
+ *
+ * Conditions like the above can lead to a cgroup getting repatedly and
+ * severely throttled after making some progress after each
+ * dirty_expire_interval while the underyling IO device is almost
+ * completely idle.
+ *
+ * Solving this problem completely requires matching the ownership tracking
+ * granularities between memcg and writeback in either direction. However,
+ * the more egregious behaviors can be avoided by simply remembering the
+ * most recent foreign dirtying events and initiating remote flushes on
+ * them when local writeback isn't enough to keep the memory clean enough.
+ *
+ * The following two functions implement such mechanism. When a foreign
+ * page - a page whose memcg and writeback ownerships don't match - is
+ * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
+ * bdi_writeback on the page owning memcg. When balance_dirty_pages()
+ * decides that the memcg needs to sleep due to high dirty ratio, it calls
+ * mem_cgroup_flush_foreign() which queues writeback on the recorded
+ * foreign bdi_writebacks which haven't expired. Both the numbers of
+ * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
+ * limited to MEMCG_CGWB_FRN_CNT.
+ *
+ * The mechanism only remembers IDs and doesn't hold any object references.
+ * As being wrong occasionally doesn't matter, updates and accesses to the
+ * records are lockless and racy.
+ */
+void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
+ struct bdi_writeback *wb)
+{
+ struct mem_cgroup *memcg = page->mem_cgroup;
+ struct memcg_cgwb_frn *frn;
+ u64 now = get_jiffies_64();
+ u64 oldest_at = now;
+ int oldest = -1;
+ int i;
+
+ trace_track_foreign_dirty(page, wb);
+
+ /*
+ * Pick the slot to use. If there is already a slot for @wb, keep
+ * using it. If not replace the oldest one which isn't being
+ * written out.
+ */
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
+ frn = &memcg->cgwb_frn[i];
+ if (frn->bdi_id == wb->bdi->id &&
+ frn->memcg_id == wb->memcg_css->id)
+ break;
+ if (time_before64(frn->at, oldest_at) &&
+ atomic_read(&frn->done.cnt) == 1) {
+ oldest = i;
+ oldest_at = frn->at;
+ }
+ }
+
+ if (i < MEMCG_CGWB_FRN_CNT) {
+ /*
+ * Re-using an existing one. Update timestamp lazily to
+ * avoid making the cacheline hot. We want them to be
+ * reasonably up-to-date and significantly shorter than
+ * dirty_expire_interval as that's what expires the record.
+ * Use the shorter of 1s and dirty_expire_interval / 8.
+ */
+ unsigned long update_intv =
+ min_t(unsigned long, HZ,
+ msecs_to_jiffies(dirty_expire_interval * 10) / 8);
+
+ if (time_before64(frn->at, now - update_intv))
+ frn->at = now;
+ } else if (oldest >= 0) {
+ /* replace the oldest free one */
+ frn = &memcg->cgwb_frn[oldest];
+ frn->bdi_id = wb->bdi->id;
+ frn->memcg_id = wb->memcg_css->id;
+ frn->at = now;
+ }
+}
+
+/* issue foreign writeback flushes for recorded foreign dirtying events */
+void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+ unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
+ u64 now = jiffies_64;
+ int i;
+
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
+ struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
+
+ /*
+ * If the record is older than dirty_expire_interval,
+ * writeback on it has already started. No need to kick it
+ * off again. Also, don't start a new one if there's
+ * already one in flight.
+ */
+ if (time_after64(frn->at, now - intv) &&
+ atomic_read(&frn->done.cnt) == 1) {
+ frn->at = 0;
+ trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
+ cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
+ WB_REASON_FOREIGN_FLUSH,
+ &frn->done);
+ }
+ }
+}
+
+#else /* CONFIG_CGROUP_WRITEBACK */
+
+static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
+{
+ return 0;
+}
+
+static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
+{
+}
+
+static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
+{
+}
+
+#endif /* CONFIG_CGROUP_WRITEBACK */
+
+/*
+ * DO NOT USE IN NEW FILES.
+ *
+ * "cgroup.event_control" implementation.
+ *
+ * This is way over-engineered. It tries to support fully configurable
+ * events for each user. Such level of flexibility is completely
+ * unnecessary especially in the light of the planned unified hierarchy.
+ *
+ * Please deprecate this and replace with something simpler if at all
+ * possible.
+ */
+
+/*
+ * Unregister event and free resources.
+ *
+ * Gets called from workqueue.
+ */
+static void memcg_event_remove(struct work_struct *work)
+{
+ struct mem_cgroup_event *event =
+ container_of(work, struct mem_cgroup_event, remove);
+ struct mem_cgroup *memcg = event->memcg;
+
+ remove_wait_queue(event->wqh, &event->wait);
+
+ event->unregister_event(memcg, event->eventfd);
+
+ /* Notify userspace the event is going away. */
+ eventfd_signal(event->eventfd, 1);
+
+ eventfd_ctx_put(event->eventfd);
+ kfree(event);
+ css_put(&memcg->css);
+}
+
+/*
+ * Gets called on EPOLLHUP on eventfd when user closes it.
+ *
+ * Called with wqh->lock held and interrupts disabled.
+ */
+static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
+ int sync, void *key)
+{
+ struct mem_cgroup_event *event =
+ container_of(wait, struct mem_cgroup_event, wait);
+ struct mem_cgroup *memcg = event->memcg;
+ __poll_t flags = key_to_poll(key);
+
+ if (flags & EPOLLHUP) {
+ /*
+ * If the event has been detached at cgroup removal, we
+ * can simply return knowing the other side will cleanup
+ * for us.
+ *
+ * We can't race against event freeing since the other
+ * side will require wqh->lock via remove_wait_queue(),
+ * which we hold.
+ */
+ spin_lock(&memcg->event_list_lock);
+ if (!list_empty(&event->list)) {
+ list_del_init(&event->list);
+ /*
+ * We are in atomic context, but cgroup_event_remove()
+ * may sleep, so we have to call it in workqueue.
+ */
+ schedule_work(&event->remove);
+ }
+ spin_unlock(&memcg->event_list_lock);
+ }
+
+ return 0;
+}
+
+static void memcg_event_ptable_queue_proc(struct file *file,
+ wait_queue_head_t *wqh, poll_table *pt)
+{
+ struct mem_cgroup_event *event =
+ container_of(pt, struct mem_cgroup_event, pt);
+
+ event->wqh = wqh;
+ add_wait_queue(wqh, &event->wait);
+}
+
+/*
+ * DO NOT USE IN NEW FILES.
+ *
+ * Parse input and register new cgroup event handler.
+ *
+ * Input must be in format '<event_fd> <control_fd> <args>'.
+ * Interpretation of args is defined by control file implementation.
+ */
+static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup_subsys_state *css = of_css(of);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup_event *event;
+ struct cgroup_subsys_state *cfile_css;
+ unsigned int efd, cfd;
+ struct fd efile;
+ struct fd cfile;
+ struct dentry *cdentry;
+ const char *name;
+ char *endp;
+ int ret;
+
+ buf = strstrip(buf);
+
+ efd = simple_strtoul(buf, &endp, 10);
+ if (*endp != ' ')
+ return -EINVAL;
+ buf = endp + 1;
+
+ cfd = simple_strtoul(buf, &endp, 10);
+ if ((*endp != ' ') && (*endp != '\0'))
+ return -EINVAL;
+ buf = endp + 1;
+
+ event = kzalloc(sizeof(*event), GFP_KERNEL);
+ if (!event)
+ return -ENOMEM;
+
+ event->memcg = memcg;
+ INIT_LIST_HEAD(&event->list);
+ init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
+ init_waitqueue_func_entry(&event->wait, memcg_event_wake);
+ INIT_WORK(&event->remove, memcg_event_remove);
+
+ efile = fdget(efd);
+ if (!efile.file) {
+ ret = -EBADF;
+ goto out_kfree;
+ }
+
+ event->eventfd = eventfd_ctx_fileget(efile.file);
+ if (IS_ERR(event->eventfd)) {
+ ret = PTR_ERR(event->eventfd);
+ goto out_put_efile;
+ }
+
+ cfile = fdget(cfd);
+ if (!cfile.file) {
+ ret = -EBADF;
+ goto out_put_eventfd;
+ }
+
+ /* the process need read permission on control file */
+ /* AV: shouldn't we check that it's been opened for read instead? */
+ ret = inode_permission(file_inode(cfile.file), MAY_READ);
+ if (ret < 0)
+ goto out_put_cfile;
+
+ /*
+ * The control file must be a regular cgroup1 file. As a regular cgroup
+ * file can't be renamed, it's safe to access its name afterwards.
+ */
+ cdentry = cfile.file->f_path.dentry;
+ if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
+ ret = -EINVAL;
+ goto out_put_cfile;
+ }
+
+ /*
+ * Determine the event callbacks and set them in @event. This used
+ * to be done via struct cftype but cgroup core no longer knows
+ * about these events. The following is crude but the whole thing
+ * is for compatibility anyway.
+ *
+ * DO NOT ADD NEW FILES.
+ */
+ name = cdentry->d_name.name;
+
+ if (!strcmp(name, "memory.usage_in_bytes")) {
+ event->register_event = mem_cgroup_usage_register_event;
+ event->unregister_event = mem_cgroup_usage_unregister_event;
+ } else if (!strcmp(name, "memory.oom_control")) {
+ event->register_event = mem_cgroup_oom_register_event;
+ event->unregister_event = mem_cgroup_oom_unregister_event;
+ } else if (!strcmp(name, "memory.pressure_level")) {
+ event->register_event = vmpressure_register_event;
+ event->unregister_event = vmpressure_unregister_event;
+ } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
+ event->register_event = memsw_cgroup_usage_register_event;
+ event->unregister_event = memsw_cgroup_usage_unregister_event;
+ } else {
+ ret = -EINVAL;
+ goto out_put_cfile;
+ }
+
+ /*
+ * Verify @cfile should belong to @css. Also, remaining events are
+ * automatically removed on cgroup destruction but the removal is
+ * asynchronous, so take an extra ref on @css.
+ */
+ cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
+ &memory_cgrp_subsys);
+ ret = -EINVAL;
+ if (IS_ERR(cfile_css))
+ goto out_put_cfile;
+ if (cfile_css != css) {
+ css_put(cfile_css);
+ goto out_put_cfile;
+ }
+
+ ret = event->register_event(memcg, event->eventfd, buf);
+ if (ret)
+ goto out_put_css;
+
+ vfs_poll(efile.file, &event->pt);
+
+ spin_lock(&memcg->event_list_lock);
+ list_add(&event->list, &memcg->event_list);
+ spin_unlock(&memcg->event_list_lock);
+
+ fdput(cfile);
+ fdput(efile);
+
+ return nbytes;
+
+out_put_css:
+ css_put(css);
+out_put_cfile:
+ fdput(cfile);
+out_put_eventfd:
+ eventfd_ctx_put(event->eventfd);
+out_put_efile:
+ fdput(efile);
+out_kfree:
+ kfree(event);
+
+ return ret;
+}
+
+static struct cftype mem_cgroup_legacy_files[] = {
+ {
+ .name = "usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "max_usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
+ .write = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "limit_in_bytes",
+ .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
+ .write = mem_cgroup_write,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "soft_limit_in_bytes",
+ .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
+ .write = mem_cgroup_write,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "failcnt",
+ .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
+ .write = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "stat",
+ .seq_show = memcg_stat_show,
+ },
+ {
+ .name = "force_empty",
+ .write = mem_cgroup_force_empty_write,
+ },
+ {
+ .name = "use_hierarchy",
+ .write_u64 = mem_cgroup_hierarchy_write,
+ .read_u64 = mem_cgroup_hierarchy_read,
+ },
+ {
+ .name = "cgroup.event_control", /* XXX: for compat */
+ .write = memcg_write_event_control,
+ .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
+ },
+ {
+ .name = "swappiness",
+ .read_u64 = mem_cgroup_swappiness_read,
+ .write_u64 = mem_cgroup_swappiness_write,
+ },
+ {
+ .name = "move_charge_at_immigrate",
+ .read_u64 = mem_cgroup_move_charge_read,
+ .write_u64 = mem_cgroup_move_charge_write,
+ },
+ {
+ .name = "oom_control",
+ .seq_show = mem_cgroup_oom_control_read,
+ .write_u64 = mem_cgroup_oom_control_write,
+ .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
+ },
+ {
+ .name = "pressure_level",
+ },
+#ifdef CONFIG_NUMA
+ {
+ .name = "numa_stat",
+ .seq_show = memcg_numa_stat_show,
+ },
+#endif
+ {
+ .name = "kmem.limit_in_bytes",
+ .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
+ .write = mem_cgroup_write,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "kmem.usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "kmem.failcnt",
+ .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
+ .write = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "kmem.max_usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
+ .write = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+#if defined(CONFIG_MEMCG_KMEM) && \
+ (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
+ {
+ .name = "kmem.slabinfo",
+ .seq_show = memcg_slab_show,
+ },
+#endif
+ {
+ .name = "kmem.tcp.limit_in_bytes",
+ .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
+ .write = mem_cgroup_write,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "kmem.tcp.usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "kmem.tcp.failcnt",
+ .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
+ .write = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "kmem.tcp.max_usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
+ .write = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ { }, /* terminate */
+};
+
+/*
+ * Private memory cgroup IDR
+ *
+ * Swap-out records and page cache shadow entries need to store memcg
+ * references in constrained space, so we maintain an ID space that is
+ * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
+ * memory-controlled cgroups to 64k.
+ *
+ * However, there usually are many references to the offline CSS after
+ * the cgroup has been destroyed, such as page cache or reclaimable
+ * slab objects, that don't need to hang on to the ID. We want to keep
+ * those dead CSS from occupying IDs, or we might quickly exhaust the
+ * relatively small ID space and prevent the creation of new cgroups
+ * even when there are much fewer than 64k cgroups - possibly none.
+ *
+ * Maintain a private 16-bit ID space for memcg, and allow the ID to
+ * be freed and recycled when it's no longer needed, which is usually
+ * when the CSS is offlined.
+ *
+ * The only exception to that are records of swapped out tmpfs/shmem
+ * pages that need to be attributed to live ancestors on swapin. But
+ * those references are manageable from userspace.
+ */
+
+static DEFINE_IDR(mem_cgroup_idr);
+
+static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
+{
+ if (memcg->id.id > 0) {
+ idr_remove(&mem_cgroup_idr, memcg->id.id);
+ memcg->id.id = 0;
+ }
+}
+
+static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg,
+ unsigned int n)
+{
+ refcount_add(n, &memcg->id.ref);
+}
+
+static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
+{
+ if (refcount_sub_and_test(n, &memcg->id.ref)) {
+ mem_cgroup_id_remove(memcg);
+
+ /* Memcg ID pins CSS */
+ css_put(&memcg->css);
+ }
+}
+
+static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
+{
+ mem_cgroup_id_put_many(memcg, 1);
+}
+
+/**
+ * mem_cgroup_from_id - look up a memcg from a memcg id
+ * @id: the memcg id to look up
+ *
+ * Caller must hold rcu_read_lock().
+ */
+struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ return idr_find(&mem_cgroup_idr, id);
+}
+
+static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
+{
+ struct mem_cgroup_per_node *pn;
+ int tmp = node;
+ /*
+ * This routine is called against possible nodes.
+ * But it's BUG to call kmalloc() against offline node.
+ *
+ * TODO: this routine can waste much memory for nodes which will
+ * never be onlined. It's better to use memory hotplug callback
+ * function.
+ */
+ if (!node_state(node, N_NORMAL_MEMORY))
+ tmp = -1;
+ pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
+ if (!pn)
+ return 1;
+
+ pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat,
+ GFP_KERNEL_ACCOUNT);
+ if (!pn->lruvec_stat_local) {
+ kfree(pn);
+ return 1;
+ }
+
+ pn->lruvec_stat_cpu = alloc_percpu_gfp(struct lruvec_stat,
+ GFP_KERNEL_ACCOUNT);
+ if (!pn->lruvec_stat_cpu) {
+ free_percpu(pn->lruvec_stat_local);
+ kfree(pn);
+ return 1;
+ }
+
+ lruvec_init(&pn->lruvec);
+ pn->usage_in_excess = 0;
+ pn->on_tree = false;
+ pn->memcg = memcg;
+
+ memcg->nodeinfo[node] = pn;
+ return 0;
+}
+
+static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
+{
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
+
+ if (!pn)
+ return;
+
+ free_percpu(pn->lruvec_stat_cpu);
+ free_percpu(pn->lruvec_stat_local);
+ kfree(pn);
+}
+
+static void __mem_cgroup_free(struct mem_cgroup *memcg)
+{
+ int node;
+
+ for_each_node(node)
+ free_mem_cgroup_per_node_info(memcg, node);
+ free_percpu(memcg->vmstats_percpu);
+ free_percpu(memcg->vmstats_local);
+ kfree(memcg);
+}
+
+static void mem_cgroup_free(struct mem_cgroup *memcg)
+{
+ memcg_wb_domain_exit(memcg);
+ /*
+ * Flush percpu vmstats and vmevents to guarantee the value correctness
+ * on parent's and all ancestor levels.
+ */
+ memcg_flush_percpu_vmstats(memcg);
+ memcg_flush_percpu_vmevents(memcg);
+ __mem_cgroup_free(memcg);
+}
+
+static struct mem_cgroup *mem_cgroup_alloc(void)
+{
+ struct mem_cgroup *memcg;
+ unsigned int size;
+ int node;
+ int __maybe_unused i;
+ long error = -ENOMEM;
+
+ size = sizeof(struct mem_cgroup);
+ size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
+
+ memcg = kzalloc(size, GFP_KERNEL);
+ if (!memcg)
+ return ERR_PTR(error);
+
+ memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
+ 1, MEM_CGROUP_ID_MAX,
+ GFP_KERNEL);
+ if (memcg->id.id < 0) {
+ error = memcg->id.id;
+ goto fail;
+ }
+
+ memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu,
+ GFP_KERNEL_ACCOUNT);
+ if (!memcg->vmstats_local)
+ goto fail;
+
+ memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
+ GFP_KERNEL_ACCOUNT);
+ if (!memcg->vmstats_percpu)
+ goto fail;
+
+ for_each_node(node)
+ if (alloc_mem_cgroup_per_node_info(memcg, node))
+ goto fail;
+
+ if (memcg_wb_domain_init(memcg, GFP_KERNEL))
+ goto fail;
+
+ INIT_WORK(&memcg->high_work, high_work_func);
+ INIT_LIST_HEAD(&memcg->oom_notify);
+ mutex_init(&memcg->thresholds_lock);
+ spin_lock_init(&memcg->move_lock);
+ vmpressure_init(&memcg->vmpressure);
+ INIT_LIST_HEAD(&memcg->event_list);
+ spin_lock_init(&memcg->event_list_lock);
+ memcg->socket_pressure = jiffies;
+#ifdef CONFIG_MEMCG_KMEM
+ memcg->kmemcg_id = -1;
+ INIT_LIST_HEAD(&memcg->objcg_list);
+#endif
+#ifdef CONFIG_CGROUP_WRITEBACK
+ INIT_LIST_HEAD(&memcg->cgwb_list);
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
+ memcg->cgwb_frn[i].done =
+ __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
+ INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
+ memcg->deferred_split_queue.split_queue_len = 0;
+#endif
+ idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
+ return memcg;
+fail:
+ mem_cgroup_id_remove(memcg);
+ __mem_cgroup_free(memcg);
+ return ERR_PTR(error);
+}
+
+static struct cgroup_subsys_state * __ref
+mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
+ struct mem_cgroup *memcg, *old_memcg;
+ long error = -ENOMEM;
+
+ old_memcg = set_active_memcg(parent);
+ memcg = mem_cgroup_alloc();
+ set_active_memcg(old_memcg);
+ if (IS_ERR(memcg))
+ return ERR_CAST(memcg);
+
+ page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
+ memcg->soft_limit = PAGE_COUNTER_MAX;
+ page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
+ if (parent) {
+ memcg->swappiness = mem_cgroup_swappiness(parent);
+ memcg->oom_kill_disable = parent->oom_kill_disable;
+ }
+ if (!parent) {
+ page_counter_init(&memcg->memory, NULL);
+ page_counter_init(&memcg->swap, NULL);
+ page_counter_init(&memcg->kmem, NULL);
+ page_counter_init(&memcg->tcpmem, NULL);
+ } else if (parent->use_hierarchy) {
+ memcg->use_hierarchy = true;
+ page_counter_init(&memcg->memory, &parent->memory);
+ page_counter_init(&memcg->swap, &parent->swap);
+ page_counter_init(&memcg->kmem, &parent->kmem);
+ page_counter_init(&memcg->tcpmem, &parent->tcpmem);
+ } else {
+ page_counter_init(&memcg->memory, &root_mem_cgroup->memory);
+ page_counter_init(&memcg->swap, &root_mem_cgroup->swap);
+ page_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);
+ page_counter_init(&memcg->tcpmem, &root_mem_cgroup->tcpmem);
+ /*
+ * Deeper hierachy with use_hierarchy == false doesn't make
+ * much sense so let cgroup subsystem know about this
+ * unfortunate state in our controller.
+ */
+ if (parent != root_mem_cgroup)
+ memory_cgrp_subsys.broken_hierarchy = true;
+ }
+
+ /* The following stuff does not apply to the root */
+ if (!parent) {
+ root_mem_cgroup = memcg;
+ return &memcg->css;
+ }
+
+ error = memcg_online_kmem(memcg);
+ if (error)
+ goto fail;
+
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
+ static_branch_inc(&memcg_sockets_enabled_key);
+
+ return &memcg->css;
+fail:
+ mem_cgroup_id_remove(memcg);
+ mem_cgroup_free(memcg);
+ return ERR_PTR(error);
+}
+
+static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ /*
+ * A memcg must be visible for memcg_expand_shrinker_maps()
+ * by the time the maps are allocated. So, we allocate maps
+ * here, when for_each_mem_cgroup() can't skip it.
+ */
+ if (memcg_alloc_shrinker_maps(memcg)) {
+ mem_cgroup_id_remove(memcg);
+ return -ENOMEM;
+ }
+
+ /* Online state pins memcg ID, memcg ID pins CSS */
+ refcount_set(&memcg->id.ref, 1);
+ css_get(css);
+ return 0;
+}
+
+static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup_event *event, *tmp;
+
+ /*
+ * Unregister events and notify userspace.
+ * Notify userspace about cgroup removing only after rmdir of cgroup
+ * directory to avoid race between userspace and kernelspace.
+ */
+ spin_lock(&memcg->event_list_lock);
+ list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
+ list_del_init(&event->list);
+ schedule_work(&event->remove);
+ }
+ spin_unlock(&memcg->event_list_lock);
+
+ page_counter_set_min(&memcg->memory, 0);
+ page_counter_set_low(&memcg->memory, 0);
+
+ memcg_offline_kmem(memcg);
+ wb_memcg_offline(memcg);
+
+ drain_all_stock(memcg);
+
+ mem_cgroup_id_put(memcg);
+}
+
+static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ invalidate_reclaim_iterators(memcg);
+}
+
+static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ int __maybe_unused i;
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
+ wb_wait_for_completion(&memcg->cgwb_frn[i].done);
+#endif
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
+ static_branch_dec(&memcg_sockets_enabled_key);
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
+ static_branch_dec(&memcg_sockets_enabled_key);
+
+ vmpressure_cleanup(&memcg->vmpressure);
+ cancel_work_sync(&memcg->high_work);
+ mem_cgroup_remove_from_trees(memcg);
+ memcg_free_shrinker_maps(memcg);
+ memcg_free_kmem(memcg);
+ mem_cgroup_free(memcg);
+}
+
+/**
+ * mem_cgroup_css_reset - reset the states of a mem_cgroup
+ * @css: the target css
+ *
+ * Reset the states of the mem_cgroup associated with @css. This is
+ * invoked when the userland requests disabling on the default hierarchy
+ * but the memcg is pinned through dependency. The memcg should stop
+ * applying policies and should revert to the vanilla state as it may be
+ * made visible again.
+ *
+ * The current implementation only resets the essential configurations.
+ * This needs to be expanded to cover all the visible parts.
+ */
+static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
+ page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
+ page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
+ page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
+ page_counter_set_min(&memcg->memory, 0);
+ page_counter_set_low(&memcg->memory, 0);
+ page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
+ memcg->soft_limit = PAGE_COUNTER_MAX;
+ page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
+ memcg_wb_domain_size_changed(memcg);
+}
+
+#ifdef CONFIG_MMU
+/* Handlers for move charge at task migration. */
+static int mem_cgroup_do_precharge(unsigned long count)
+{
+ int ret;
+
+ /* Try a single bulk charge without reclaim first, kswapd may wake */
+ ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
+ if (!ret) {
+ mc.precharge += count;
+ return ret;
+ }
+
+ /* Try charges one by one with reclaim, but do not retry */
+ while (count--) {
+ ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
+ if (ret)
+ return ret;
+ mc.precharge++;
+ cond_resched();
+ }
+ return 0;
+}
+
+union mc_target {
+ struct page *page;
+ swp_entry_t ent;
+};
+
+enum mc_target_type {
+ MC_TARGET_NONE = 0,
+ MC_TARGET_PAGE,
+ MC_TARGET_SWAP,
+ MC_TARGET_DEVICE,
+};
+
+static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
+ unsigned long addr, pte_t ptent)
+{
+ struct page *page = vm_normal_page(vma, addr, ptent);
+
+ if (!page || !page_mapped(page))
+ return NULL;
+ if (PageAnon(page)) {
+ if (!(mc.flags & MOVE_ANON))
+ return NULL;
+ } else {
+ if (!(mc.flags & MOVE_FILE))
+ return NULL;
+ }
+ if (!get_page_unless_zero(page))
+ return NULL;
+
+ return page;
+}
+
+#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
+static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
+ pte_t ptent, swp_entry_t *entry)
+{
+ struct page *page = NULL;
+ swp_entry_t ent = pte_to_swp_entry(ptent);
+
+ if (!(mc.flags & MOVE_ANON))
+ return NULL;
+
+ /*
+ * Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE page belonging to
+ * a device and because they are not accessible by CPU they are store
+ * as special swap entry in the CPU page table.
+ */
+ if (is_device_private_entry(ent)) {
+ page = device_private_entry_to_page(ent);
+ /*
+ * MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have
+ * a refcount of 1 when free (unlike normal page)
+ */
+ if (!page_ref_add_unless(page, 1, 1))
+ return NULL;
+ return page;
+ }
+
+ if (non_swap_entry(ent))
+ return NULL;
+
+ /*
+ * Because lookup_swap_cache() updates some statistics counter,
+ * we call find_get_page() with swapper_space directly.
+ */
+ page = find_get_page(swap_address_space(ent), swp_offset(ent));
+ entry->val = ent.val;
+
+ return page;
+}
+#else
+static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
+ pte_t ptent, swp_entry_t *entry)
+{
+ return NULL;
+}
+#endif
+
+static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
+ unsigned long addr, pte_t ptent, swp_entry_t *entry)
+{
+ if (!vma->vm_file) /* anonymous vma */
+ return NULL;
+ if (!(mc.flags & MOVE_FILE))
+ return NULL;
+
+ /* page is moved even if it's not RSS of this task(page-faulted). */
+ /* shmem/tmpfs may report page out on swap: account for that too. */
+ return find_get_incore_page(vma->vm_file->f_mapping,
+ linear_page_index(vma, addr));
+}
+
+/**
+ * mem_cgroup_move_account - move account of the page
+ * @page: the page
+ * @compound: charge the page as compound or small page
+ * @from: mem_cgroup which the page is moved from.
+ * @to: mem_cgroup which the page is moved to. @from != @to.
+ *
+ * The caller must make sure the page is not on LRU (isolate_page() is useful.)
+ *
+ * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
+ * from old cgroup.
+ */
+static int mem_cgroup_move_account(struct page *page,
+ bool compound,
+ struct mem_cgroup *from,
+ struct mem_cgroup *to)
+{
+ struct lruvec *from_vec, *to_vec;
+ struct pglist_data *pgdat;
+ unsigned int nr_pages = compound ? thp_nr_pages(page) : 1;
+ int ret;
+
+ VM_BUG_ON(from == to);
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON(compound && !PageTransHuge(page));
+
+ /*
+ * Prevent mem_cgroup_migrate() from looking at
+ * page->mem_cgroup of its source page while we change it.
+ */
+ ret = -EBUSY;
+ if (!trylock_page(page))
+ goto out;
+
+ ret = -EINVAL;
+ if (page->mem_cgroup != from)
+ goto out_unlock;
+
+ pgdat = page_pgdat(page);
+ from_vec = mem_cgroup_lruvec(from, pgdat);
+ to_vec = mem_cgroup_lruvec(to, pgdat);
+
+ lock_page_memcg(page);
+
+ if (PageAnon(page)) {
+ if (page_mapped(page)) {
+ __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
+ if (PageTransHuge(page)) {
+ __dec_lruvec_state(from_vec, NR_ANON_THPS);
+ __inc_lruvec_state(to_vec, NR_ANON_THPS);
+ }
+
+ }
+ } else {
+ __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
+
+ if (PageSwapBacked(page)) {
+ __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
+ }
+
+ if (page_mapped(page)) {
+ __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
+ }
+
+ if (PageDirty(page)) {
+ struct address_space *mapping = page_mapping(page);
+
+ if (mapping_can_writeback(mapping)) {
+ __mod_lruvec_state(from_vec, NR_FILE_DIRTY,
+ -nr_pages);
+ __mod_lruvec_state(to_vec, NR_FILE_DIRTY,
+ nr_pages);
+ }
+ }
+ }
+
+ if (PageWriteback(page)) {
+ __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
+ }
+
+ /*
+ * All state has been migrated, let's switch to the new memcg.
+ *
+ * It is safe to change page->mem_cgroup here because the page
+ * is referenced, charged, isolated, and locked: we can't race
+ * with (un)charging, migration, LRU putback, or anything else
+ * that would rely on a stable page->mem_cgroup.
+ *
+ * Note that lock_page_memcg is a memcg lock, not a page lock,
+ * to save space. As soon as we switch page->mem_cgroup to a
+ * new memcg that isn't locked, the above state can change
+ * concurrently again. Make sure we're truly done with it.
+ */
+ smp_mb();
+
+ css_get(&to->css);
+ css_put(&from->css);
+
+ page->mem_cgroup = to;
+
+ __unlock_page_memcg(from);
+
+ ret = 0;
+
+ local_irq_disable();
+ mem_cgroup_charge_statistics(to, page, nr_pages);
+ memcg_check_events(to, page);
+ mem_cgroup_charge_statistics(from, page, -nr_pages);
+ memcg_check_events(from, page);
+ local_irq_enable();
+out_unlock:
+ unlock_page(page);
+out:
+ return ret;
+}
+
+/**
+ * get_mctgt_type - get target type of moving charge
+ * @vma: the vma the pte to be checked belongs
+ * @addr: the address corresponding to the pte to be checked
+ * @ptent: the pte to be checked
+ * @target: the pointer the target page or swap ent will be stored(can be NULL)
+ *
+ * Returns
+ * 0(MC_TARGET_NONE): if the pte is not a target for move charge.
+ * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
+ * move charge. if @target is not NULL, the page is stored in target->page
+ * with extra refcnt got(Callers should handle it).
+ * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
+ * target for charge migration. if @target is not NULL, the entry is stored
+ * in target->ent.
+ * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
+ * (so ZONE_DEVICE page and thus not on the lru).
+ * For now we such page is charge like a regular page would be as for all
+ * intent and purposes it is just special memory taking the place of a
+ * regular page.
+ *
+ * See Documentations/vm/hmm.txt and include/linux/hmm.h
+ *
+ * Called with pte lock held.
+ */
+
+static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
+ unsigned long addr, pte_t ptent, union mc_target *target)
+{
+ struct page *page = NULL;
+ enum mc_target_type ret = MC_TARGET_NONE;
+ swp_entry_t ent = { .val = 0 };
+
+ if (pte_present(ptent))
+ page = mc_handle_present_pte(vma, addr, ptent);
+ else if (is_swap_pte(ptent))
+ page = mc_handle_swap_pte(vma, ptent, &ent);
+ else if (pte_none(ptent))
+ page = mc_handle_file_pte(vma, addr, ptent, &ent);
+
+ if (!page && !ent.val)
+ return ret;
+ if (page) {
+ /*
+ * Do only loose check w/o serialization.
+ * mem_cgroup_move_account() checks the page is valid or
+ * not under LRU exclusion.
+ */
+ if (page->mem_cgroup == mc.from) {
+ ret = MC_TARGET_PAGE;
+ if (is_device_private_page(page))
+ ret = MC_TARGET_DEVICE;
+ if (target)
+ target->page = page;
+ }
+ if (!ret || !target)
+ put_page(page);
+ }
+ /*
+ * There is a swap entry and a page doesn't exist or isn't charged.
+ * But we cannot move a tail-page in a THP.
+ */
+ if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
+ mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
+ ret = MC_TARGET_SWAP;
+ if (target)
+ target->ent = ent;
+ }
+ return ret;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * We don't consider PMD mapped swapping or file mapped pages because THP does
+ * not support them for now.
+ * Caller should make sure that pmd_trans_huge(pmd) is true.
+ */
+static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmd, union mc_target *target)
+{
+ struct page *page = NULL;
+ enum mc_target_type ret = MC_TARGET_NONE;
+
+ if (unlikely(is_swap_pmd(pmd))) {
+ VM_BUG_ON(thp_migration_supported() &&
+ !is_pmd_migration_entry(pmd));
+ return ret;
+ }
+ page = pmd_page(pmd);
+ VM_BUG_ON_PAGE(!page || !PageHead(page), page);
+ if (!(mc.flags & MOVE_ANON))
+ return ret;
+ if (page->mem_cgroup == mc.from) {
+ ret = MC_TARGET_PAGE;
+ if (target) {
+ get_page(page);
+ target->page = page;
+ }
+ }
+ return ret;
+}
+#else
+static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmd, union mc_target *target)
+{
+ return MC_TARGET_NONE;
+}
+#endif
+
+static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct vm_area_struct *vma = walk->vma;
+ pte_t *pte;
+ spinlock_t *ptl;
+
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
+ /*
+ * Note their can not be MC_TARGET_DEVICE for now as we do not
+ * support transparent huge page with MEMORY_DEVICE_PRIVATE but
+ * this might change.
+ */
+ if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
+ mc.precharge += HPAGE_PMD_NR;
+ spin_unlock(ptl);
+ return 0;
+ }
+
+ if (pmd_trans_unstable(pmd))
+ return 0;
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ for (; addr != end; pte++, addr += PAGE_SIZE)
+ if (get_mctgt_type(vma, addr, *pte, NULL))
+ mc.precharge++; /* increment precharge temporarily */
+ pte_unmap_unlock(pte - 1, ptl);
+ cond_resched();
+
+ return 0;
+}
+
+static const struct mm_walk_ops precharge_walk_ops = {
+ .pmd_entry = mem_cgroup_count_precharge_pte_range,
+};
+
+static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
+{
+ unsigned long precharge;
+
+ mmap_read_lock(mm);
+ walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
+ mmap_read_unlock(mm);
+
+ precharge = mc.precharge;
+ mc.precharge = 0;
+
+ return precharge;
+}
+
+static int mem_cgroup_precharge_mc(struct mm_struct *mm)
+{
+ unsigned long precharge = mem_cgroup_count_precharge(mm);
+
+ VM_BUG_ON(mc.moving_task);
+ mc.moving_task = current;
+ return mem_cgroup_do_precharge(precharge);
+}
+
+/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
+static void __mem_cgroup_clear_mc(void)
+{
+ struct mem_cgroup *from = mc.from;
+ struct mem_cgroup *to = mc.to;
+
+ /* we must uncharge all the leftover precharges from mc.to */
+ if (mc.precharge) {
+ cancel_charge(mc.to, mc.precharge);
+ mc.precharge = 0;
+ }
+ /*
+ * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
+ * we must uncharge here.
+ */
+ if (mc.moved_charge) {
+ cancel_charge(mc.from, mc.moved_charge);
+ mc.moved_charge = 0;
+ }
+ /* we must fixup refcnts and charges */
+ if (mc.moved_swap) {
+ /* uncharge swap account from the old cgroup */
+ if (!mem_cgroup_is_root(mc.from))
+ page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
+
+ mem_cgroup_id_put_many(mc.from, mc.moved_swap);
+
+ /*
+ * we charged both to->memory and to->memsw, so we
+ * should uncharge to->memory.
+ */
+ if (!mem_cgroup_is_root(mc.to))
+ page_counter_uncharge(&mc.to->memory, mc.moved_swap);
+
+ mc.moved_swap = 0;
+ }
+ memcg_oom_recover(from);
+ memcg_oom_recover(to);
+ wake_up_all(&mc.waitq);
+}
+
+static void mem_cgroup_clear_mc(void)
+{
+ struct mm_struct *mm = mc.mm;
+
+ /*
+ * we must clear moving_task before waking up waiters at the end of
+ * task migration.
+ */
+ mc.moving_task = NULL;
+ __mem_cgroup_clear_mc();
+ spin_lock(&mc.lock);
+ mc.from = NULL;
+ mc.to = NULL;
+ mc.mm = NULL;
+ spin_unlock(&mc.lock);
+
+ mmput(mm);
+}
+
+static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
+{
+ struct cgroup_subsys_state *css;
+ struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */
+ struct mem_cgroup *from;
+ struct task_struct *leader, *p;
+ struct mm_struct *mm;
+ unsigned long move_flags;
+ int ret = 0;
+
+ /* charge immigration isn't supported on the default hierarchy */
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return 0;
+
+ /*
+ * Multi-process migrations only happen on the default hierarchy
+ * where charge immigration is not used. Perform charge
+ * immigration if @tset contains a leader and whine if there are
+ * multiple.
+ */
+ p = NULL;
+ cgroup_taskset_for_each_leader(leader, css, tset) {
+ WARN_ON_ONCE(p);
+ p = leader;
+ memcg = mem_cgroup_from_css(css);
+ }
+ if (!p)
+ return 0;
+
+ /*
+ * We are now commited to this value whatever it is. Changes in this
+ * tunable will only affect upcoming migrations, not the current one.
+ * So we need to save it, and keep it going.
+ */
+ move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
+ if (!move_flags)
+ return 0;
+
+ from = mem_cgroup_from_task(p);
+
+ VM_BUG_ON(from == memcg);
+
+ mm = get_task_mm(p);
+ if (!mm)
+ return 0;
+ /* We move charges only when we move a owner of the mm */
+ if (mm->owner == p) {
+ VM_BUG_ON(mc.from);
+ VM_BUG_ON(mc.to);
+ VM_BUG_ON(mc.precharge);
+ VM_BUG_ON(mc.moved_charge);
+ VM_BUG_ON(mc.moved_swap);
+
+ spin_lock(&mc.lock);
+ mc.mm = mm;
+ mc.from = from;
+ mc.to = memcg;
+ mc.flags = move_flags;
+ spin_unlock(&mc.lock);
+ /* We set mc.moving_task later */
+
+ ret = mem_cgroup_precharge_mc(mm);
+ if (ret)
+ mem_cgroup_clear_mc();
+ } else {
+ mmput(mm);
+ }
+ return ret;
+}
+
+static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
+{
+ if (mc.to)
+ mem_cgroup_clear_mc();
+}
+
+static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ int ret = 0;
+ struct vm_area_struct *vma = walk->vma;
+ pte_t *pte;
+ spinlock_t *ptl;
+ enum mc_target_type target_type;
+ union mc_target target;
+ struct page *page;
+
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
+ if (mc.precharge < HPAGE_PMD_NR) {
+ spin_unlock(ptl);
+ return 0;
+ }
+ target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
+ if (target_type == MC_TARGET_PAGE) {
+ page = target.page;
+ if (!isolate_lru_page(page)) {
+ if (!mem_cgroup_move_account(page, true,
+ mc.from, mc.to)) {
+ mc.precharge -= HPAGE_PMD_NR;
+ mc.moved_charge += HPAGE_PMD_NR;
+ }
+ putback_lru_page(page);
+ }
+ put_page(page);
+ } else if (target_type == MC_TARGET_DEVICE) {
+ page = target.page;
+ if (!mem_cgroup_move_account(page, true,
+ mc.from, mc.to)) {
+ mc.precharge -= HPAGE_PMD_NR;
+ mc.moved_charge += HPAGE_PMD_NR;
+ }
+ put_page(page);
+ }
+ spin_unlock(ptl);
+ return 0;
+ }
+
+ if (pmd_trans_unstable(pmd))
+ return 0;
+retry:
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ for (; addr != end; addr += PAGE_SIZE) {
+ pte_t ptent = *(pte++);
+ bool device = false;
+ swp_entry_t ent;
+
+ if (!mc.precharge)
+ break;
+
+ switch (get_mctgt_type(vma, addr, ptent, &target)) {
+ case MC_TARGET_DEVICE:
+ device = true;
+ fallthrough;
+ case MC_TARGET_PAGE:
+ page = target.page;
+ /*
+ * We can have a part of the split pmd here. Moving it
+ * can be done but it would be too convoluted so simply
+ * ignore such a partial THP and keep it in original
+ * memcg. There should be somebody mapping the head.
+ */
+ if (PageTransCompound(page))
+ goto put;
+ if (!device && isolate_lru_page(page))
+ goto put;
+ if (!mem_cgroup_move_account(page, false,
+ mc.from, mc.to)) {
+ mc.precharge--;
+ /* we uncharge from mc.from later. */
+ mc.moved_charge++;
+ }
+ if (!device)
+ putback_lru_page(page);
+put: /* get_mctgt_type() gets the page */
+ put_page(page);
+ break;
+ case MC_TARGET_SWAP:
+ ent = target.ent;
+ if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
+ mc.precharge--;
+ mem_cgroup_id_get_many(mc.to, 1);
+ /* we fixup other refcnts and charges later. */
+ mc.moved_swap++;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ pte_unmap_unlock(pte - 1, ptl);
+ cond_resched();
+
+ if (addr != end) {
+ /*
+ * We have consumed all precharges we got in can_attach().
+ * We try charge one by one, but don't do any additional
+ * charges to mc.to if we have failed in charge once in attach()
+ * phase.
+ */
+ ret = mem_cgroup_do_precharge(1);
+ if (!ret)
+ goto retry;
+ }
+
+ return ret;
+}
+
+static const struct mm_walk_ops charge_walk_ops = {
+ .pmd_entry = mem_cgroup_move_charge_pte_range,
+};
+
+static void mem_cgroup_move_charge(void)
+{
+ lru_add_drain_all();
+ /*
+ * Signal lock_page_memcg() to take the memcg's move_lock
+ * while we're moving its pages to another memcg. Then wait
+ * for already started RCU-only updates to finish.
+ */
+ atomic_inc(&mc.from->moving_account);
+ synchronize_rcu();
+retry:
+ if (unlikely(!mmap_read_trylock(mc.mm))) {
+ /*
+ * Someone who are holding the mmap_lock might be waiting in
+ * waitq. So we cancel all extra charges, wake up all waiters,
+ * and retry. Because we cancel precharges, we might not be able
+ * to move enough charges, but moving charge is a best-effort
+ * feature anyway, so it wouldn't be a big problem.
+ */
+ __mem_cgroup_clear_mc();
+ cond_resched();
+ goto retry;
+ }
+ /*
+ * When we have consumed all precharges and failed in doing
+ * additional charge, the page walk just aborts.
+ */
+ walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
+ NULL);
+
+ mmap_read_unlock(mc.mm);
+ atomic_dec(&mc.from->moving_account);
+}
+
+static void mem_cgroup_move_task(void)
+{
+ if (mc.to) {
+ mem_cgroup_move_charge();
+ mem_cgroup_clear_mc();
+ }
+}
+#else /* !CONFIG_MMU */
+static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
+{
+ return 0;
+}
+static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
+{
+}
+static void mem_cgroup_move_task(void)
+{
+}
+#endif
+
+/*
+ * Cgroup retains root cgroups across [un]mount cycles making it necessary
+ * to verify whether we're attached to the default hierarchy on each mount
+ * attempt.
+ */
+static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
+{
+ /*
+ * use_hierarchy is forced on the default hierarchy. cgroup core
+ * guarantees that @root doesn't have any children, so turning it
+ * on for the root memcg is enough.
+ */
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ root_mem_cgroup->use_hierarchy = true;
+ else
+ root_mem_cgroup->use_hierarchy = false;
+}
+
+static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
+{
+ if (value == PAGE_COUNTER_MAX)
+ seq_puts(m, "max\n");
+ else
+ seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
+
+ return 0;
+}
+
+static u64 memory_current_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
+}
+
+static int memory_min_show(struct seq_file *m, void *v)
+{
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
+}
+
+static ssize_t memory_min_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long min;
+ int err;
+
+ buf = strstrip(buf);
+ err = page_counter_memparse(buf, "max", &min);
+ if (err)
+ return err;
+
+ page_counter_set_min(&memcg->memory, min);
+
+ return nbytes;
+}
+
+static int memory_low_show(struct seq_file *m, void *v)
+{
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
+}
+
+static ssize_t memory_low_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long low;
+ int err;
+
+ buf = strstrip(buf);
+ err = page_counter_memparse(buf, "max", &low);
+ if (err)
+ return err;
+
+ page_counter_set_low(&memcg->memory, low);
+
+ return nbytes;
+}
+
+static int memory_high_show(struct seq_file *m, void *v)
+{
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.high));
+}
+
+static ssize_t memory_high_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned int nr_retries = MAX_RECLAIM_RETRIES;
+ bool drained = false;
+ unsigned long high;
+ int err;
+
+ buf = strstrip(buf);
+ err = page_counter_memparse(buf, "max", &high);
+ if (err)
+ return err;
+
+ page_counter_set_high(&memcg->memory, high);
+
+ for (;;) {
+ unsigned long nr_pages = page_counter_read(&memcg->memory);
+ unsigned long reclaimed;
+
+ if (nr_pages <= high)
+ break;
+
+ if (signal_pending(current))
+ break;
+
+ if (!drained) {
+ drain_all_stock(memcg);
+ drained = true;
+ continue;
+ }
+
+ reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
+ GFP_KERNEL, true);
+
+ if (!reclaimed && !nr_retries--)
+ break;
+ }
+
+ memcg_wb_domain_size_changed(memcg);
+ return nbytes;
+}
+
+static int memory_max_show(struct seq_file *m, void *v)
+{
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
+}
+
+static ssize_t memory_max_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
+ bool drained = false;
+ unsigned long max;
+ int err;
+
+ buf = strstrip(buf);
+ err = page_counter_memparse(buf, "max", &max);
+ if (err)
+ return err;
+
+ xchg(&memcg->memory.max, max);
+
+ for (;;) {
+ unsigned long nr_pages = page_counter_read(&memcg->memory);
+
+ if (nr_pages <= max)
+ break;
+
+ if (signal_pending(current))
+ break;
+
+ if (!drained) {
+ drain_all_stock(memcg);
+ drained = true;
+ continue;
+ }
+
+ if (nr_reclaims) {
+ if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
+ GFP_KERNEL, true))
+ nr_reclaims--;
+ continue;
+ }
+
+ memcg_memory_event(memcg, MEMCG_OOM);
+ if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
+ break;
+ }
+
+ memcg_wb_domain_size_changed(memcg);
+ return nbytes;
+}
+
+static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
+{
+ seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
+ seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
+ seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
+ seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
+ seq_printf(m, "oom_kill %lu\n",
+ atomic_long_read(&events[MEMCG_OOM_KILL]));
+}
+
+static int memory_events_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+ __memory_events_show(m, memcg->memory_events);
+ return 0;
+}
+
+static int memory_events_local_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+ __memory_events_show(m, memcg->memory_events_local);
+ return 0;
+}
+
+static int memory_stat_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+ char *buf;
+
+ buf = memory_stat_format(memcg);
+ if (!buf)
+ return -ENOMEM;
+ seq_puts(m, buf);
+ kfree(buf);
+ return 0;
+}
+
+#ifdef CONFIG_NUMA
+static int memory_numa_stat_show(struct seq_file *m, void *v)
+{
+ int i;
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
+ int nid;
+
+ if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS)
+ continue;
+
+ seq_printf(m, "%s", memory_stats[i].name);
+ for_each_node_state(nid, N_MEMORY) {
+ u64 size;
+ struct lruvec *lruvec;
+
+ lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+ size = lruvec_page_state(lruvec, memory_stats[i].idx);
+ size *= memory_stats[i].ratio;
+ seq_printf(m, " N%d=%llu", nid, size);
+ }
+ seq_putc(m, '\n');
+ }
+
+ return 0;
+}
+#endif
+
+static int memory_oom_group_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+ seq_printf(m, "%d\n", memcg->oom_group);
+
+ return 0;
+}
+
+static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ int ret, oom_group;
+
+ buf = strstrip(buf);
+ if (!buf)
+ return -EINVAL;
+
+ ret = kstrtoint(buf, 0, &oom_group);
+ if (ret)
+ return ret;
+
+ if (oom_group != 0 && oom_group != 1)
+ return -EINVAL;
+
+ memcg->oom_group = oom_group;
+
+ return nbytes;
+}
+
+static struct cftype memory_files[] = {
+ {
+ .name = "current",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = memory_current_read,
+ },
+ {
+ .name = "min",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = memory_min_show,
+ .write = memory_min_write,
+ },
+ {
+ .name = "low",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = memory_low_show,
+ .write = memory_low_write,
+ },
+ {
+ .name = "high",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = memory_high_show,
+ .write = memory_high_write,
+ },
+ {
+ .name = "max",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = memory_max_show,
+ .write = memory_max_write,
+ },
+ {
+ .name = "events",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .file_offset = offsetof(struct mem_cgroup, events_file),
+ .seq_show = memory_events_show,
+ },
+ {
+ .name = "events.local",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .file_offset = offsetof(struct mem_cgroup, events_local_file),
+ .seq_show = memory_events_local_show,
+ },
+ {
+ .name = "stat",
+ .seq_show = memory_stat_show,
+ },
+#ifdef CONFIG_NUMA
+ {
+ .name = "numa_stat",
+ .seq_show = memory_numa_stat_show,
+ },
+#endif
+ {
+ .name = "oom.group",
+ .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
+ .seq_show = memory_oom_group_show,
+ .write = memory_oom_group_write,
+ },
+ { } /* terminate */
+};
+
+struct cgroup_subsys memory_cgrp_subsys = {
+ .css_alloc = mem_cgroup_css_alloc,
+ .css_online = mem_cgroup_css_online,
+ .css_offline = mem_cgroup_css_offline,
+ .css_released = mem_cgroup_css_released,
+ .css_free = mem_cgroup_css_free,
+ .css_reset = mem_cgroup_css_reset,
+ .can_attach = mem_cgroup_can_attach,
+ .cancel_attach = mem_cgroup_cancel_attach,
+ .post_attach = mem_cgroup_move_task,
+ .bind = mem_cgroup_bind,
+ .dfl_cftypes = memory_files,
+ .legacy_cftypes = mem_cgroup_legacy_files,
+ .early_init = 0,
+};
+
+/*
+ * This function calculates an individual cgroup's effective
+ * protection which is derived from its own memory.min/low, its
+ * parent's and siblings' settings, as well as the actual memory
+ * distribution in the tree.
+ *
+ * The following rules apply to the effective protection values:
+ *
+ * 1. At the first level of reclaim, effective protection is equal to
+ * the declared protection in memory.min and memory.low.
+ *
+ * 2. To enable safe delegation of the protection configuration, at
+ * subsequent levels the effective protection is capped to the
+ * parent's effective protection.
+ *
+ * 3. To make complex and dynamic subtrees easier to configure, the
+ * user is allowed to overcommit the declared protection at a given
+ * level. If that is the case, the parent's effective protection is
+ * distributed to the children in proportion to how much protection
+ * they have declared and how much of it they are utilizing.
+ *
+ * This makes distribution proportional, but also work-conserving:
+ * if one cgroup claims much more protection than it uses memory,
+ * the unused remainder is available to its siblings.
+ *
+ * 4. Conversely, when the declared protection is undercommitted at a
+ * given level, the distribution of the larger parental protection
+ * budget is NOT proportional. A cgroup's protection from a sibling
+ * is capped to its own memory.min/low setting.
+ *
+ * 5. However, to allow protecting recursive subtrees from each other
+ * without having to declare each individual cgroup's fixed share
+ * of the ancestor's claim to protection, any unutilized -
+ * "floating" - protection from up the tree is distributed in
+ * proportion to each cgroup's *usage*. This makes the protection
+ * neutral wrt sibling cgroups and lets them compete freely over
+ * the shared parental protection budget, but it protects the
+ * subtree as a whole from neighboring subtrees.
+ *
+ * Note that 4. and 5. are not in conflict: 4. is about protecting
+ * against immediate siblings whereas 5. is about protecting against
+ * neighboring subtrees.
+ */
+static unsigned long effective_protection(unsigned long usage,
+ unsigned long parent_usage,
+ unsigned long setting,
+ unsigned long parent_effective,
+ unsigned long siblings_protected)
+{
+ unsigned long protected;
+ unsigned long ep;
+
+ protected = min(usage, setting);
+ /*
+ * If all cgroups at this level combined claim and use more
+ * protection then what the parent affords them, distribute
+ * shares in proportion to utilization.
+ *
+ * We are using actual utilization rather than the statically
+ * claimed protection in order to be work-conserving: claimed
+ * but unused protection is available to siblings that would
+ * otherwise get a smaller chunk than what they claimed.
+ */
+ if (siblings_protected > parent_effective)
+ return protected * parent_effective / siblings_protected;
+
+ /*
+ * Ok, utilized protection of all children is within what the
+ * parent affords them, so we know whatever this child claims
+ * and utilizes is effectively protected.
+ *
+ * If there is unprotected usage beyond this value, reclaim
+ * will apply pressure in proportion to that amount.
+ *
+ * If there is unutilized protection, the cgroup will be fully
+ * shielded from reclaim, but we do return a smaller value for
+ * protection than what the group could enjoy in theory. This
+ * is okay. With the overcommit distribution above, effective
+ * protection is always dependent on how memory is actually
+ * consumed among the siblings anyway.
+ */
+ ep = protected;
+
+ /*
+ * If the children aren't claiming (all of) the protection
+ * afforded to them by the parent, distribute the remainder in
+ * proportion to the (unprotected) memory of each cgroup. That
+ * way, cgroups that aren't explicitly prioritized wrt each
+ * other compete freely over the allowance, but they are
+ * collectively protected from neighboring trees.
+ *
+ * We're using unprotected memory for the weight so that if
+ * some cgroups DO claim explicit protection, we don't protect
+ * the same bytes twice.
+ *
+ * Check both usage and parent_usage against the respective
+ * protected values. One should imply the other, but they
+ * aren't read atomically - make sure the division is sane.
+ */
+ if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT))
+ return ep;
+ if (parent_effective > siblings_protected &&
+ parent_usage > siblings_protected &&
+ usage > protected) {
+ unsigned long unclaimed;
+
+ unclaimed = parent_effective - siblings_protected;
+ unclaimed *= usage - protected;
+ unclaimed /= parent_usage - siblings_protected;
+
+ ep += unclaimed;
+ }
+
+ return ep;
+}
+
+/**
+ * mem_cgroup_protected - check if memory consumption is in the normal range
+ * @root: the top ancestor of the sub-tree being checked
+ * @memcg: the memory cgroup to check
+ *
+ * WARNING: This function is not stateless! It can only be used as part
+ * of a top-down tree iteration, not for isolated queries.
+ */
+void mem_cgroup_calculate_protection(struct mem_cgroup *root,
+ struct mem_cgroup *memcg)
+{
+ unsigned long usage, parent_usage;
+ struct mem_cgroup *parent;
+
+ if (mem_cgroup_disabled())
+ return;
+
+ if (!root)
+ root = root_mem_cgroup;
+
+ /*
+ * Effective values of the reclaim targets are ignored so they
+ * can be stale. Have a look at mem_cgroup_protection for more
+ * details.
+ * TODO: calculation should be more robust so that we do not need
+ * that special casing.
+ */
+ if (memcg == root)
+ return;
+
+ usage = page_counter_read(&memcg->memory);
+ if (!usage)
+ return;
+
+ parent = parent_mem_cgroup(memcg);
+ /* No parent means a non-hierarchical mode on v1 memcg */
+ if (!parent)
+ return;
+
+ if (parent == root) {
+ memcg->memory.emin = READ_ONCE(memcg->memory.min);
+ memcg->memory.elow = READ_ONCE(memcg->memory.low);
+ return;
+ }
+
+ parent_usage = page_counter_read(&parent->memory);
+
+ WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage,
+ READ_ONCE(memcg->memory.min),
+ READ_ONCE(parent->memory.emin),
+ atomic_long_read(&parent->memory.children_min_usage)));
+
+ WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage,
+ READ_ONCE(memcg->memory.low),
+ READ_ONCE(parent->memory.elow),
+ atomic_long_read(&parent->memory.children_low_usage)));
+}
+
+/**
+ * mem_cgroup_charge - charge a newly allocated page to a cgroup
+ * @page: page to charge
+ * @mm: mm context of the victim
+ * @gfp_mask: reclaim mode
+ *
+ * Try to charge @page to the memcg that @mm belongs to, reclaiming
+ * pages according to @gfp_mask if necessary.
+ *
+ * Returns 0 on success. Otherwise, an error code is returned.
+ */
+int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
+{
+ unsigned int nr_pages = thp_nr_pages(page);
+ struct mem_cgroup *memcg = NULL;
+ int ret = 0;
+
+ if (mem_cgroup_disabled())
+ goto out;
+
+ if (PageSwapCache(page)) {
+ swp_entry_t ent = { .val = page_private(page), };
+ unsigned short id;
+
+ /*
+ * Every swap fault against a single page tries to charge the
+ * page, bail as early as possible. shmem_unuse() encounters
+ * already charged pages, too. page->mem_cgroup is protected
+ * by the page lock, which serializes swap cache removal, which
+ * in turn serializes uncharging.
+ */
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ if (compound_head(page)->mem_cgroup)
+ goto out;
+
+ id = lookup_swap_cgroup_id(ent);
+ rcu_read_lock();
+ memcg = mem_cgroup_from_id(id);
+ if (memcg && !css_tryget_online(&memcg->css))
+ memcg = NULL;
+ rcu_read_unlock();
+ }
+
+ if (!memcg)
+ memcg = get_mem_cgroup_from_mm(mm);
+
+ ret = try_charge(memcg, gfp_mask, nr_pages);
+ if (ret)
+ goto out_put;
+
+ css_get(&memcg->css);
+ commit_charge(page, memcg);
+
+ local_irq_disable();
+ mem_cgroup_charge_statistics(memcg, page, nr_pages);
+ memcg_check_events(memcg, page);
+ local_irq_enable();
+
+ /*
+ * Cgroup1's unified memory+swap counter has been charged with the
+ * new swapcache page, finish the transfer by uncharging the swap
+ * slot. The swap slot would also get uncharged when it dies, but
+ * it can stick around indefinitely and we'd count the page twice
+ * the entire time.
+ *
+ * Cgroup2 has separate resource counters for memory and swap,
+ * so this is a non-issue here. Memory and swap charge lifetimes
+ * correspond 1:1 to page and swap slot lifetimes: we charge the
+ * page to memory here, and uncharge swap when the slot is freed.
+ */
+ if (do_memsw_account() && PageSwapCache(page)) {
+ swp_entry_t entry = { .val = page_private(page) };
+ /*
+ * The swap entry might not get freed for a long time,
+ * let's not wait for it. The page already received a
+ * memory+swap charge, drop the swap entry duplicate.
+ */
+ mem_cgroup_uncharge_swap(entry, nr_pages);
+ }
+
+out_put:
+ css_put(&memcg->css);
+out:
+ return ret;
+}
+
+struct uncharge_gather {
+ struct mem_cgroup *memcg;
+ unsigned long nr_pages;
+ unsigned long pgpgout;
+ unsigned long nr_kmem;
+ struct page *dummy_page;
+};
+
+static inline void uncharge_gather_clear(struct uncharge_gather *ug)
+{
+ memset(ug, 0, sizeof(*ug));
+}
+
+static void uncharge_batch(const struct uncharge_gather *ug)
+{
+ unsigned long flags;
+
+ if (!mem_cgroup_is_root(ug->memcg)) {
+ page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
+ if (do_memsw_account())
+ page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
+ page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
+ memcg_oom_recover(ug->memcg);
+ }
+
+ local_irq_save(flags);
+ __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
+ __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages);
+ memcg_check_events(ug->memcg, ug->dummy_page);
+ local_irq_restore(flags);
+
+ /* drop reference from uncharge_page */
+ css_put(&ug->memcg->css);
+}
+
+static void uncharge_page(struct page *page, struct uncharge_gather *ug)
+{
+ unsigned long nr_pages;
+
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+
+ if (!page->mem_cgroup)
+ return;
+
+ /*
+ * Nobody should be changing or seriously looking at
+ * page->mem_cgroup at this point, we have fully
+ * exclusive access to the page.
+ */
+
+ if (ug->memcg != page->mem_cgroup) {
+ if (ug->memcg) {
+ uncharge_batch(ug);
+ uncharge_gather_clear(ug);
+ }
+ ug->memcg = page->mem_cgroup;
+
+ /* pairs with css_put in uncharge_batch */
+ css_get(&ug->memcg->css);
+ }
+
+ nr_pages = compound_nr(page);
+ ug->nr_pages += nr_pages;
+
+ if (!PageKmemcg(page)) {
+ ug->pgpgout++;
+ } else {
+ ug->nr_kmem += nr_pages;
+ __ClearPageKmemcg(page);
+ }
+
+ ug->dummy_page = page;
+ page->mem_cgroup = NULL;
+ css_put(&ug->memcg->css);
+}
+
+static void uncharge_list(struct list_head *page_list)
+{
+ struct uncharge_gather ug;
+ struct list_head *next;
+
+ uncharge_gather_clear(&ug);
+
+ /*
+ * Note that the list can be a single page->lru; hence the
+ * do-while loop instead of a simple list_for_each_entry().
+ */
+ next = page_list->next;
+ do {
+ struct page *page;
+
+ page = list_entry(next, struct page, lru);
+ next = page->lru.next;
+
+ uncharge_page(page, &ug);
+ } while (next != page_list);
+
+ if (ug.memcg)
+ uncharge_batch(&ug);
+}
+
+/**
+ * mem_cgroup_uncharge - uncharge a page
+ * @page: page to uncharge
+ *
+ * Uncharge a page previously charged with mem_cgroup_charge().
+ */
+void mem_cgroup_uncharge(struct page *page)
+{
+ struct uncharge_gather ug;
+
+ if (mem_cgroup_disabled())
+ return;
+
+ /* Don't touch page->lru of any random page, pre-check: */
+ if (!page->mem_cgroup)
+ return;
+
+ uncharge_gather_clear(&ug);
+ uncharge_page(page, &ug);
+ uncharge_batch(&ug);
+}
+
+/**
+ * mem_cgroup_uncharge_list - uncharge a list of page
+ * @page_list: list of pages to uncharge
+ *
+ * Uncharge a list of pages previously charged with
+ * mem_cgroup_charge().
+ */
+void mem_cgroup_uncharge_list(struct list_head *page_list)
+{
+ if (mem_cgroup_disabled())
+ return;
+
+ if (!list_empty(page_list))
+ uncharge_list(page_list);
+}
+
+/**
+ * mem_cgroup_migrate - charge a page's replacement
+ * @oldpage: currently circulating page
+ * @newpage: replacement page
+ *
+ * Charge @newpage as a replacement page for @oldpage. @oldpage will
+ * be uncharged upon free.
+ *
+ * Both pages must be locked, @newpage->mapping must be set up.
+ */
+void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
+{
+ struct mem_cgroup *memcg;
+ unsigned int nr_pages;
+ unsigned long flags;
+
+ VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
+ VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
+ VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
+ VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
+ newpage);
+
+ if (mem_cgroup_disabled())
+ return;
+
+ /* Page cache replacement: new page already charged? */
+ if (newpage->mem_cgroup)
+ return;
+
+ /* Swapcache readahead pages can get replaced before being charged */
+ memcg = oldpage->mem_cgroup;
+ if (!memcg)
+ return;
+
+ /* Force-charge the new page. The old one will be freed soon */
+ nr_pages = thp_nr_pages(newpage);
+
+ page_counter_charge(&memcg->memory, nr_pages);
+ if (do_memsw_account())
+ page_counter_charge(&memcg->memsw, nr_pages);
+
+ css_get(&memcg->css);
+ commit_charge(newpage, memcg);
+
+ local_irq_save(flags);
+ mem_cgroup_charge_statistics(memcg, newpage, nr_pages);
+ memcg_check_events(memcg, newpage);
+ local_irq_restore(flags);
+}
+
+DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
+EXPORT_SYMBOL(memcg_sockets_enabled_key);
+
+void mem_cgroup_sk_alloc(struct sock *sk)
+{
+ struct mem_cgroup *memcg;
+
+ if (!mem_cgroup_sockets_enabled)
+ return;
+
+ /* Do not associate the sock with unrelated interrupted task's memcg. */
+ if (in_interrupt())
+ return;
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_task(current);
+ if (memcg == root_mem_cgroup)
+ goto out;
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
+ goto out;
+ if (css_tryget(&memcg->css))
+ sk->sk_memcg = memcg;
+out:
+ rcu_read_unlock();
+}
+
+void mem_cgroup_sk_free(struct sock *sk)
+{
+ if (sk->sk_memcg)
+ css_put(&sk->sk_memcg->css);
+}
+
+/**
+ * mem_cgroup_charge_skmem - charge socket memory
+ * @memcg: memcg to charge
+ * @nr_pages: number of pages to charge
+ *
+ * Charges @nr_pages to @memcg. Returns %true if the charge fit within
+ * @memcg's configured limit, %false if the charge had to be forced.
+ */
+bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
+{
+ gfp_t gfp_mask = GFP_KERNEL;
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
+ struct page_counter *fail;
+
+ if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
+ memcg->tcpmem_pressure = 0;
+ return true;
+ }
+ page_counter_charge(&memcg->tcpmem, nr_pages);
+ memcg->tcpmem_pressure = 1;
+ return false;
+ }
+
+ /* Don't block in the packet receive path */
+ if (in_softirq())
+ gfp_mask = GFP_NOWAIT;
+
+ mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
+
+ if (try_charge(memcg, gfp_mask, nr_pages) == 0)
+ return true;
+
+ try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages);
+ return false;
+}
+
+/**
+ * mem_cgroup_uncharge_skmem - uncharge socket memory
+ * @memcg: memcg to uncharge
+ * @nr_pages: number of pages to uncharge
+ */
+void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
+{
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
+ page_counter_uncharge(&memcg->tcpmem, nr_pages);
+ return;
+ }
+
+ mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
+
+ refill_stock(memcg, nr_pages);
+}
+
+static int __init cgroup_memory(char *s)
+{
+ char *token;
+
+ while ((token = strsep(&s, ",")) != NULL) {
+ if (!*token)
+ continue;
+ if (!strcmp(token, "nosocket"))
+ cgroup_memory_nosocket = true;
+ if (!strcmp(token, "nokmem"))
+ cgroup_memory_nokmem = true;
+ }
+ return 1;
+}
+__setup("cgroup.memory=", cgroup_memory);
+
+/*
+ * subsys_initcall() for memory controller.
+ *
+ * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
+ * context because of lock dependencies (cgroup_lock -> cpu hotplug) but
+ * basically everything that doesn't depend on a specific mem_cgroup structure
+ * should be initialized from here.
+ */
+static int __init mem_cgroup_init(void)
+{
+ int cpu, node;
+
+ cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
+ memcg_hotplug_cpu_dead);
+
+ for_each_possible_cpu(cpu)
+ INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
+ drain_local_stock);
+
+ for_each_node(node) {
+ struct mem_cgroup_tree_per_node *rtpn;
+
+ rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
+ node_online(node) ? node : NUMA_NO_NODE);
+
+ rtpn->rb_root = RB_ROOT;
+ rtpn->rb_rightmost = NULL;
+ spin_lock_init(&rtpn->lock);
+ soft_limit_tree.rb_tree_per_node[node] = rtpn;
+ }
+
+ return 0;
+}
+subsys_initcall(mem_cgroup_init);
+
+#ifdef CONFIG_MEMCG_SWAP
+static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
+{
+ while (!refcount_inc_not_zero(&memcg->id.ref)) {
+ /*
+ * The root cgroup cannot be destroyed, so it's refcount must
+ * always be >= 1.
+ */
+ if (WARN_ON_ONCE(memcg == root_mem_cgroup)) {
+ VM_BUG_ON(1);
+ break;
+ }
+ memcg = parent_mem_cgroup(memcg);
+ if (!memcg)
+ memcg = root_mem_cgroup;
+ }
+ return memcg;
+}
+
+/**
+ * mem_cgroup_swapout - transfer a memsw charge to swap
+ * @page: page whose memsw charge to transfer
+ * @entry: swap entry to move the charge to
+ *
+ * Transfer the memsw charge of @page to @entry.
+ */
+void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
+{
+ struct mem_cgroup *memcg, *swap_memcg;
+ unsigned int nr_entries;
+ unsigned short oldid;
+
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON_PAGE(page_count(page), page);
+
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return;
+
+ memcg = page->mem_cgroup;
+
+ /* Readahead page, never charged */
+ if (!memcg)
+ return;
+
+ /*
+ * In case the memcg owning these pages has been offlined and doesn't
+ * have an ID allocated to it anymore, charge the closest online
+ * ancestor for the swap instead and transfer the memory+swap charge.
+ */
+ swap_memcg = mem_cgroup_id_get_online(memcg);
+ nr_entries = thp_nr_pages(page);
+ /* Get references for the tail pages, too */
+ if (nr_entries > 1)
+ mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
+ oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
+ nr_entries);
+ VM_BUG_ON_PAGE(oldid, page);
+ mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
+
+ page->mem_cgroup = NULL;
+
+ if (!mem_cgroup_is_root(memcg))
+ page_counter_uncharge(&memcg->memory, nr_entries);
+
+ if (!cgroup_memory_noswap && memcg != swap_memcg) {
+ if (!mem_cgroup_is_root(swap_memcg))
+ page_counter_charge(&swap_memcg->memsw, nr_entries);
+ page_counter_uncharge(&memcg->memsw, nr_entries);
+ }
+
+ /*
+ * Interrupts should be disabled here because the caller holds the
+ * i_pages lock which is taken with interrupts-off. It is
+ * important here to have the interrupts disabled because it is the
+ * only synchronisation we have for updating the per-CPU variables.
+ */
+ VM_BUG_ON(!irqs_disabled());
+ mem_cgroup_charge_statistics(memcg, page, -nr_entries);
+ memcg_check_events(memcg, page);
+
+ css_put(&memcg->css);
+}
+
+/**
+ * mem_cgroup_try_charge_swap - try charging swap space for a page
+ * @page: page being added to swap
+ * @entry: swap entry to charge
+ *
+ * Try to charge @page's memcg for the swap space at @entry.
+ *
+ * Returns 0 on success, -ENOMEM on failure.
+ */
+int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
+{
+ unsigned int nr_pages = thp_nr_pages(page);
+ struct page_counter *counter;
+ struct mem_cgroup *memcg;
+ unsigned short oldid;
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return 0;
+
+ memcg = page->mem_cgroup;
+
+ /* Readahead page, never charged */
+ if (!memcg)
+ return 0;
+
+ if (!entry.val) {
+ memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
+ return 0;
+ }
+
+ memcg = mem_cgroup_id_get_online(memcg);
+
+ if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) &&
+ !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
+ memcg_memory_event(memcg, MEMCG_SWAP_MAX);
+ memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
+ mem_cgroup_id_put(memcg);
+ return -ENOMEM;
+ }
+
+ /* Get references for the tail pages, too */
+ if (nr_pages > 1)
+ mem_cgroup_id_get_many(memcg, nr_pages - 1);
+ oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
+ VM_BUG_ON_PAGE(oldid, page);
+ mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
+
+ return 0;
+}
+
+/**
+ * mem_cgroup_uncharge_swap - uncharge swap space
+ * @entry: swap entry to uncharge
+ * @nr_pages: the amount of swap space to uncharge
+ */
+void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
+{
+ struct mem_cgroup *memcg;
+ unsigned short id;
+
+ id = swap_cgroup_record(entry, 0, nr_pages);
+ rcu_read_lock();
+ memcg = mem_cgroup_from_id(id);
+ if (memcg) {
+ if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) {
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ page_counter_uncharge(&memcg->swap, nr_pages);
+ else
+ page_counter_uncharge(&memcg->memsw, nr_pages);
+ }
+ mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
+ mem_cgroup_id_put_many(memcg, nr_pages);
+ }
+ rcu_read_unlock();
+}
+
+long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
+{
+ long nr_swap_pages = get_nr_swap_pages();
+
+ if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return nr_swap_pages;
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
+ nr_swap_pages = min_t(long, nr_swap_pages,
+ READ_ONCE(memcg->swap.max) -
+ page_counter_read(&memcg->swap));
+ return nr_swap_pages;
+}
+
+bool mem_cgroup_swap_full(struct page *page)
+{
+ struct mem_cgroup *memcg;
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+
+ if (vm_swap_full())
+ return true;
+ if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return false;
+
+ memcg = page->mem_cgroup;
+ if (!memcg)
+ return false;
+
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
+ unsigned long usage = page_counter_read(&memcg->swap);
+
+ if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
+ usage * 2 >= READ_ONCE(memcg->swap.max))
+ return true;
+ }
+
+ return false;
+}
+
+static int __init setup_swap_account(char *s)
+{
+ if (!strcmp(s, "1"))
+ cgroup_memory_noswap = 0;
+ else if (!strcmp(s, "0"))
+ cgroup_memory_noswap = 1;
+ return 1;
+}
+__setup("swapaccount=", setup_swap_account);
+
+static u64 swap_current_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
+}
+
+static int swap_high_show(struct seq_file *m, void *v)
+{
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
+}
+
+static ssize_t swap_high_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long high;
+ int err;
+
+ buf = strstrip(buf);
+ err = page_counter_memparse(buf, "max", &high);
+ if (err)
+ return err;
+
+ page_counter_set_high(&memcg->swap, high);
+
+ return nbytes;
+}
+
+static int swap_max_show(struct seq_file *m, void *v)
+{
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
+}
+
+static ssize_t swap_max_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long max;
+ int err;
+
+ buf = strstrip(buf);
+ err = page_counter_memparse(buf, "max", &max);
+ if (err)
+ return err;
+
+ xchg(&memcg->swap.max, max);
+
+ return nbytes;
+}
+
+static int swap_events_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+ seq_printf(m, "high %lu\n",
+ atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
+ seq_printf(m, "max %lu\n",
+ atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
+ seq_printf(m, "fail %lu\n",
+ atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL]));
+
+ return 0;
+}
+
+static struct cftype swap_files[] = {
+ {
+ .name = "swap.current",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = swap_current_read,
+ },
+ {
+ .name = "swap.high",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = swap_high_show,
+ .write = swap_high_write,
+ },
+ {
+ .name = "swap.max",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = swap_max_show,
+ .write = swap_max_write,
+ },
+ {
+ .name = "swap.events",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .file_offset = offsetof(struct mem_cgroup, swap_events_file),
+ .seq_show = swap_events_show,
+ },
+ { } /* terminate */
+};
+
+static struct cftype memsw_files[] = {
+ {
+ .name = "memsw.usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "memsw.max_usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
+ .write = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "memsw.limit_in_bytes",
+ .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
+ .write = mem_cgroup_write,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "memsw.failcnt",
+ .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
+ .write = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ { }, /* terminate */
+};
+
+/*
+ * If mem_cgroup_swap_init() is implemented as a subsys_initcall()
+ * instead of a core_initcall(), this could mean cgroup_memory_noswap still
+ * remains set to false even when memcg is disabled via "cgroup_disable=memory"
+ * boot parameter. This may result in premature OOPS inside
+ * mem_cgroup_get_nr_swap_pages() function in corner cases.
+ */
+static int __init mem_cgroup_swap_init(void)
+{
+ /* No memory control -> no swap control */
+ if (mem_cgroup_disabled())
+ cgroup_memory_noswap = true;
+
+ if (cgroup_memory_noswap)
+ return 0;
+
+ WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
+ WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
+
+ return 0;
+}
+core_initcall(mem_cgroup_swap_init);
+
+#endif /* CONFIG_MEMCG_SWAP */
diff --git a/mm/memfd.c b/mm/memfd.c
new file mode 100644
index 000000000..278e56366
--- /dev/null
+++ b/mm/memfd.c
@@ -0,0 +1,346 @@
+/*
+ * memfd_create system call and file sealing support
+ *
+ * Code was originally included in shmem.c, and broken out to facilitate
+ * use by hugetlbfs as well as tmpfs.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/khugepaged.h>
+#include <linux/syscalls.h>
+#include <linux/hugetlb.h>
+#include <linux/shmem_fs.h>
+#include <linux/memfd.h>
+#include <uapi/linux/memfd.h>
+
+/*
+ * We need a tag: a new tag would expand every xa_node by 8 bytes,
+ * so reuse a tag which we firmly believe is never set or cleared on tmpfs
+ * or hugetlbfs because they are memory only filesystems.
+ */
+#define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE
+#define LAST_SCAN 4 /* about 150ms max */
+
+static void memfd_tag_pins(struct xa_state *xas)
+{
+ struct page *page;
+ int latency = 0;
+ int cache_count;
+
+ lru_add_drain();
+
+ xas_lock_irq(xas);
+ xas_for_each(xas, page, ULONG_MAX) {
+ cache_count = 1;
+ if (!xa_is_value(page) &&
+ PageTransHuge(page) && !PageHuge(page))
+ cache_count = HPAGE_PMD_NR;
+
+ if (!xa_is_value(page) &&
+ page_count(page) - total_mapcount(page) != cache_count)
+ xas_set_mark(xas, MEMFD_TAG_PINNED);
+ if (cache_count != 1)
+ xas_set(xas, page->index + cache_count);
+
+ latency += cache_count;
+ if (latency < XA_CHECK_SCHED)
+ continue;
+ latency = 0;
+
+ xas_pause(xas);
+ xas_unlock_irq(xas);
+ cond_resched();
+ xas_lock_irq(xas);
+ }
+ xas_unlock_irq(xas);
+}
+
+/*
+ * Setting SEAL_WRITE requires us to verify there's no pending writer. However,
+ * via get_user_pages(), drivers might have some pending I/O without any active
+ * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages
+ * and see whether it has an elevated ref-count. If so, we tag them and wait for
+ * them to be dropped.
+ * The caller must guarantee that no new user will acquire writable references
+ * to those pages to avoid races.
+ */
+static int memfd_wait_for_pins(struct address_space *mapping)
+{
+ XA_STATE(xas, &mapping->i_pages, 0);
+ struct page *page;
+ int error, scan;
+
+ memfd_tag_pins(&xas);
+
+ error = 0;
+ for (scan = 0; scan <= LAST_SCAN; scan++) {
+ int latency = 0;
+ int cache_count;
+
+ if (!xas_marked(&xas, MEMFD_TAG_PINNED))
+ break;
+
+ if (!scan)
+ lru_add_drain_all();
+ else if (schedule_timeout_killable((HZ << scan) / 200))
+ scan = LAST_SCAN;
+
+ xas_set(&xas, 0);
+ xas_lock_irq(&xas);
+ xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) {
+ bool clear = true;
+
+ cache_count = 1;
+ if (!xa_is_value(page) &&
+ PageTransHuge(page) && !PageHuge(page))
+ cache_count = HPAGE_PMD_NR;
+
+ if (!xa_is_value(page) && cache_count !=
+ page_count(page) - total_mapcount(page)) {
+ /*
+ * On the last scan, we clean up all those tags
+ * we inserted; but make a note that we still
+ * found pages pinned.
+ */
+ if (scan == LAST_SCAN)
+ error = -EBUSY;
+ else
+ clear = false;
+ }
+ if (clear)
+ xas_clear_mark(&xas, MEMFD_TAG_PINNED);
+
+ latency += cache_count;
+ if (latency < XA_CHECK_SCHED)
+ continue;
+ latency = 0;
+
+ xas_pause(&xas);
+ xas_unlock_irq(&xas);
+ cond_resched();
+ xas_lock_irq(&xas);
+ }
+ xas_unlock_irq(&xas);
+ }
+
+ return error;
+}
+
+static unsigned int *memfd_file_seals_ptr(struct file *file)
+{
+ if (shmem_file(file))
+ return &SHMEM_I(file_inode(file))->seals;
+
+#ifdef CONFIG_HUGETLBFS
+ if (is_file_hugepages(file))
+ return &HUGETLBFS_I(file_inode(file))->seals;
+#endif
+
+ return NULL;
+}
+
+#define F_ALL_SEALS (F_SEAL_SEAL | \
+ F_SEAL_SHRINK | \
+ F_SEAL_GROW | \
+ F_SEAL_WRITE | \
+ F_SEAL_FUTURE_WRITE)
+
+static int memfd_add_seals(struct file *file, unsigned int seals)
+{
+ struct inode *inode = file_inode(file);
+ unsigned int *file_seals;
+ int error;
+
+ /*
+ * SEALING
+ * Sealing allows multiple parties to share a tmpfs or hugetlbfs file
+ * but restrict access to a specific subset of file operations. Seals
+ * can only be added, but never removed. This way, mutually untrusted
+ * parties can share common memory regions with a well-defined policy.
+ * A malicious peer can thus never perform unwanted operations on a
+ * shared object.
+ *
+ * Seals are only supported on special tmpfs or hugetlbfs files and
+ * always affect the whole underlying inode. Once a seal is set, it
+ * may prevent some kinds of access to the file. Currently, the
+ * following seals are defined:
+ * SEAL_SEAL: Prevent further seals from being set on this file
+ * SEAL_SHRINK: Prevent the file from shrinking
+ * SEAL_GROW: Prevent the file from growing
+ * SEAL_WRITE: Prevent write access to the file
+ *
+ * As we don't require any trust relationship between two parties, we
+ * must prevent seals from being removed. Therefore, sealing a file
+ * only adds a given set of seals to the file, it never touches
+ * existing seals. Furthermore, the "setting seals"-operation can be
+ * sealed itself, which basically prevents any further seal from being
+ * added.
+ *
+ * Semantics of sealing are only defined on volatile files. Only
+ * anonymous tmpfs and hugetlbfs files support sealing. More
+ * importantly, seals are never written to disk. Therefore, there's
+ * no plan to support it on other file types.
+ */
+
+ if (!(file->f_mode & FMODE_WRITE))
+ return -EPERM;
+ if (seals & ~(unsigned int)F_ALL_SEALS)
+ return -EINVAL;
+
+ inode_lock(inode);
+
+ file_seals = memfd_file_seals_ptr(file);
+ if (!file_seals) {
+ error = -EINVAL;
+ goto unlock;
+ }
+
+ if (*file_seals & F_SEAL_SEAL) {
+ error = -EPERM;
+ goto unlock;
+ }
+
+ if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) {
+ error = mapping_deny_writable(file->f_mapping);
+ if (error)
+ goto unlock;
+
+ error = memfd_wait_for_pins(file->f_mapping);
+ if (error) {
+ mapping_allow_writable(file->f_mapping);
+ goto unlock;
+ }
+ }
+
+ *file_seals |= seals;
+ error = 0;
+
+unlock:
+ inode_unlock(inode);
+ return error;
+}
+
+static int memfd_get_seals(struct file *file)
+{
+ unsigned int *seals = memfd_file_seals_ptr(file);
+
+ return seals ? *seals : -EINVAL;
+}
+
+long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ long error;
+
+ switch (cmd) {
+ case F_ADD_SEALS:
+ /* disallow upper 32bit */
+ if (arg > UINT_MAX)
+ return -EINVAL;
+
+ error = memfd_add_seals(file, arg);
+ break;
+ case F_GET_SEALS:
+ error = memfd_get_seals(file);
+ break;
+ default:
+ error = -EINVAL;
+ break;
+ }
+
+ return error;
+}
+
+#define MFD_NAME_PREFIX "memfd:"
+#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
+#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
+
+#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)
+
+SYSCALL_DEFINE2(memfd_create,
+ const char __user *, uname,
+ unsigned int, flags)
+{
+ unsigned int *file_seals;
+ struct file *file;
+ int fd, error;
+ char *name;
+ long len;
+
+ if (!(flags & MFD_HUGETLB)) {
+ if (flags & ~(unsigned int)MFD_ALL_FLAGS)
+ return -EINVAL;
+ } else {
+ /* Allow huge page size encoding in flags. */
+ if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
+ (MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
+ return -EINVAL;
+ }
+
+ /* length includes terminating zero */
+ len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
+ if (len <= 0)
+ return -EFAULT;
+ if (len > MFD_NAME_MAX_LEN + 1)
+ return -EINVAL;
+
+ name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL);
+ if (!name)
+ return -ENOMEM;
+
+ strcpy(name, MFD_NAME_PREFIX);
+ if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) {
+ error = -EFAULT;
+ goto err_name;
+ }
+
+ /* terminating-zero may have changed after strnlen_user() returned */
+ if (name[len + MFD_NAME_PREFIX_LEN - 1]) {
+ error = -EFAULT;
+ goto err_name;
+ }
+
+ fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
+ if (fd < 0) {
+ error = fd;
+ goto err_name;
+ }
+
+ if (flags & MFD_HUGETLB) {
+ struct user_struct *user = NULL;
+
+ file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user,
+ HUGETLB_ANONHUGE_INODE,
+ (flags >> MFD_HUGE_SHIFT) &
+ MFD_HUGE_MASK);
+ } else
+ file = shmem_file_setup(name, 0, VM_NORESERVE);
+ if (IS_ERR(file)) {
+ error = PTR_ERR(file);
+ goto err_fd;
+ }
+ file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
+ file->f_flags |= O_LARGEFILE;
+
+ if (flags & MFD_ALLOW_SEALING) {
+ file_seals = memfd_file_seals_ptr(file);
+ if (file_seals)
+ *file_seals &= ~F_SEAL_SEAL;
+ }
+
+ fd_install(fd, file);
+ kfree(name);
+ return fd;
+
+err_fd:
+ put_unused_fd(fd);
+err_name:
+ kfree(name);
+ return error;
+}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
new file mode 100644
index 000000000..f320ff02c
--- /dev/null
+++ b/mm/memory-failure.c
@@ -0,0 +1,1936 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2008, 2009 Intel Corporation
+ * Authors: Andi Kleen, Fengguang Wu
+ *
+ * High level machine check handler. Handles pages reported by the
+ * hardware as being corrupted usually due to a multi-bit ECC memory or cache
+ * failure.
+ *
+ * In addition there is a "soft offline" entry point that allows stop using
+ * not-yet-corrupted-by-suspicious pages without killing anything.
+ *
+ * Handles page cache pages in various states. The tricky part
+ * here is that we can access any page asynchronously in respect to
+ * other VM users, because memory failures could happen anytime and
+ * anywhere. This could violate some of their assumptions. This is why
+ * this code has to be extremely careful. Generally it tries to use
+ * normal locking rules, as in get the standard locks, even if that means
+ * the error handling takes potentially a long time.
+ *
+ * It can be very tempting to add handling for obscure cases here.
+ * In general any code for handling new cases should only be added iff:
+ * - You know how to test it.
+ * - You have a test that can be added to mce-test
+ * https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/
+ * - The case actually shows up as a frequent (top 10) page state in
+ * tools/vm/page-types when running a real workload.
+ *
+ * There are several operations here with exponential complexity because
+ * of unsuitable VM data structures. For example the operation to map back
+ * from RMAP chains to processes has to walk the complete process list and
+ * has non linear complexity with the number. But since memory corruptions
+ * are rare we hope to get away with this. This avoids impacting the core
+ * VM.
+ */
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/page-flags.h>
+#include <linux/kernel-page-flags.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/ksm.h>
+#include <linux/rmap.h>
+#include <linux/export.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/backing-dev.h>
+#include <linux/migrate.h>
+#include <linux/suspend.h>
+#include <linux/slab.h>
+#include <linux/swapops.h>
+#include <linux/hugetlb.h>
+#include <linux/memory_hotplug.h>
+#include <linux/mm_inline.h>
+#include <linux/memremap.h>
+#include <linux/kfifo.h>
+#include <linux/ratelimit.h>
+#include <linux/page-isolation.h>
+#include "internal.h"
+#include "ras/ras_event.h"
+
+int sysctl_memory_failure_early_kill __read_mostly = 0;
+
+int sysctl_memory_failure_recovery __read_mostly = 1;
+
+atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
+
+static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release)
+{
+ if (hugepage_or_freepage) {
+ /*
+ * Doing this check for free pages is also fine since dissolve_free_huge_page
+ * returns 0 for non-hugetlb pages as well.
+ */
+ if (dissolve_free_huge_page(page) || !take_page_off_buddy(page))
+ /*
+ * We could fail to take off the target page from buddy
+ * for example due to racy page allocaiton, but that's
+ * acceptable because soft-offlined page is not broken
+ * and if someone really want to use it, they should
+ * take it.
+ */
+ return false;
+ }
+
+ SetPageHWPoison(page);
+ if (release)
+ put_page(page);
+ page_ref_inc(page);
+ num_poisoned_pages_inc();
+
+ return true;
+}
+
+#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
+
+u32 hwpoison_filter_enable = 0;
+u32 hwpoison_filter_dev_major = ~0U;
+u32 hwpoison_filter_dev_minor = ~0U;
+u64 hwpoison_filter_flags_mask;
+u64 hwpoison_filter_flags_value;
+EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
+EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
+EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
+EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
+EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
+
+static int hwpoison_filter_dev(struct page *p)
+{
+ struct address_space *mapping;
+ dev_t dev;
+
+ if (hwpoison_filter_dev_major == ~0U &&
+ hwpoison_filter_dev_minor == ~0U)
+ return 0;
+
+ /*
+ * page_mapping() does not accept slab pages.
+ */
+ if (PageSlab(p))
+ return -EINVAL;
+
+ mapping = page_mapping(p);
+ if (mapping == NULL || mapping->host == NULL)
+ return -EINVAL;
+
+ dev = mapping->host->i_sb->s_dev;
+ if (hwpoison_filter_dev_major != ~0U &&
+ hwpoison_filter_dev_major != MAJOR(dev))
+ return -EINVAL;
+ if (hwpoison_filter_dev_minor != ~0U &&
+ hwpoison_filter_dev_minor != MINOR(dev))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int hwpoison_filter_flags(struct page *p)
+{
+ if (!hwpoison_filter_flags_mask)
+ return 0;
+
+ if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
+ hwpoison_filter_flags_value)
+ return 0;
+ else
+ return -EINVAL;
+}
+
+/*
+ * This allows stress tests to limit test scope to a collection of tasks
+ * by putting them under some memcg. This prevents killing unrelated/important
+ * processes such as /sbin/init. Note that the target task may share clean
+ * pages with init (eg. libc text), which is harmless. If the target task
+ * share _dirty_ pages with another task B, the test scheme must make sure B
+ * is also included in the memcg. At last, due to race conditions this filter
+ * can only guarantee that the page either belongs to the memcg tasks, or is
+ * a freed page.
+ */
+#ifdef CONFIG_MEMCG
+u64 hwpoison_filter_memcg;
+EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
+static int hwpoison_filter_task(struct page *p)
+{
+ if (!hwpoison_filter_memcg)
+ return 0;
+
+ if (page_cgroup_ino(p) != hwpoison_filter_memcg)
+ return -EINVAL;
+
+ return 0;
+}
+#else
+static int hwpoison_filter_task(struct page *p) { return 0; }
+#endif
+
+int hwpoison_filter(struct page *p)
+{
+ if (!hwpoison_filter_enable)
+ return 0;
+
+ if (hwpoison_filter_dev(p))
+ return -EINVAL;
+
+ if (hwpoison_filter_flags(p))
+ return -EINVAL;
+
+ if (hwpoison_filter_task(p))
+ return -EINVAL;
+
+ return 0;
+}
+#else
+int hwpoison_filter(struct page *p)
+{
+ return 0;
+}
+#endif
+
+EXPORT_SYMBOL_GPL(hwpoison_filter);
+
+/*
+ * Kill all processes that have a poisoned page mapped and then isolate
+ * the page.
+ *
+ * General strategy:
+ * Find all processes having the page mapped and kill them.
+ * But we keep a page reference around so that the page is not
+ * actually freed yet.
+ * Then stash the page away
+ *
+ * There's no convenient way to get back to mapped processes
+ * from the VMAs. So do a brute-force search over all
+ * running processes.
+ *
+ * Remember that machine checks are not common (or rather
+ * if they are common you have other problems), so this shouldn't
+ * be a performance issue.
+ *
+ * Also there are some races possible while we get from the
+ * error detection to actually handle it.
+ */
+
+struct to_kill {
+ struct list_head nd;
+ struct task_struct *tsk;
+ unsigned long addr;
+ short size_shift;
+};
+
+/*
+ * Send all the processes who have the page mapped a signal.
+ * ``action optional'' if they are not immediately affected by the error
+ * ``action required'' if error happened in current execution context
+ */
+static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
+{
+ struct task_struct *t = tk->tsk;
+ short addr_lsb = tk->size_shift;
+ int ret = 0;
+
+ pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
+ pfn, t->comm, t->pid);
+
+ if (flags & MF_ACTION_REQUIRED) {
+ WARN_ON_ONCE(t != current);
+ ret = force_sig_mceerr(BUS_MCEERR_AR,
+ (void __user *)tk->addr, addr_lsb);
+ } else {
+ /*
+ * Don't use force here, it's convenient if the signal
+ * can be temporarily blocked.
+ * This could cause a loop when the user sets SIGBUS
+ * to SIG_IGN, but hopefully no one will do that?
+ */
+ ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
+ addr_lsb, t); /* synchronous? */
+ }
+ if (ret < 0)
+ pr_info("Memory failure: Error sending signal to %s:%d: %d\n",
+ t->comm, t->pid, ret);
+ return ret;
+}
+
+/*
+ * When a unknown page type is encountered drain as many buffers as possible
+ * in the hope to turn the page into a LRU or free page, which we can handle.
+ */
+void shake_page(struct page *p, int access)
+{
+ if (PageHuge(p))
+ return;
+
+ if (!PageSlab(p)) {
+ lru_add_drain_all();
+ if (PageLRU(p))
+ return;
+ drain_all_pages(page_zone(p));
+ if (PageLRU(p) || is_free_buddy_page(p))
+ return;
+ }
+
+ /*
+ * Only call shrink_node_slabs here (which would also shrink
+ * other caches) if access is not potentially fatal.
+ */
+ if (access)
+ drop_slab_node(page_to_nid(p));
+}
+EXPORT_SYMBOL_GPL(shake_page);
+
+static unsigned long dev_pagemap_mapping_shift(struct page *page,
+ struct vm_area_struct *vma)
+{
+ unsigned long address = vma_address(page, vma);
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = pgd_offset(vma->vm_mm, address);
+ if (!pgd_present(*pgd))
+ return 0;
+ p4d = p4d_offset(pgd, address);
+ if (!p4d_present(*p4d))
+ return 0;
+ pud = pud_offset(p4d, address);
+ if (!pud_present(*pud))
+ return 0;
+ if (pud_devmap(*pud))
+ return PUD_SHIFT;
+ pmd = pmd_offset(pud, address);
+ if (!pmd_present(*pmd))
+ return 0;
+ if (pmd_devmap(*pmd))
+ return PMD_SHIFT;
+ pte = pte_offset_map(pmd, address);
+ if (!pte_present(*pte))
+ return 0;
+ if (pte_devmap(*pte))
+ return PAGE_SHIFT;
+ return 0;
+}
+
+/*
+ * Failure handling: if we can't find or can't kill a process there's
+ * not much we can do. We just print a message and ignore otherwise.
+ */
+
+/*
+ * Schedule a process for later kill.
+ * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
+ */
+static void add_to_kill(struct task_struct *tsk, struct page *p,
+ struct vm_area_struct *vma,
+ struct list_head *to_kill)
+{
+ struct to_kill *tk;
+
+ tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
+ if (!tk) {
+ pr_err("Memory failure: Out of memory while machine check handling\n");
+ return;
+ }
+
+ tk->addr = page_address_in_vma(p, vma);
+ if (is_zone_device_page(p))
+ tk->size_shift = dev_pagemap_mapping_shift(p, vma);
+ else
+ tk->size_shift = page_shift(compound_head(p));
+
+ /*
+ * Send SIGKILL if "tk->addr == -EFAULT". Also, as
+ * "tk->size_shift" is always non-zero for !is_zone_device_page(),
+ * so "tk->size_shift == 0" effectively checks no mapping on
+ * ZONE_DEVICE. Indeed, when a devdax page is mmapped N times
+ * to a process' address space, it's possible not all N VMAs
+ * contain mappings for the page, but at least one VMA does.
+ * Only deliver SIGBUS with payload derived from the VMA that
+ * has a mapping for the page.
+ */
+ if (tk->addr == -EFAULT) {
+ pr_info("Memory failure: Unable to find user space address %lx in %s\n",
+ page_to_pfn(p), tsk->comm);
+ } else if (tk->size_shift == 0) {
+ kfree(tk);
+ return;
+ }
+
+ get_task_struct(tsk);
+ tk->tsk = tsk;
+ list_add_tail(&tk->nd, to_kill);
+}
+
+/*
+ * Kill the processes that have been collected earlier.
+ *
+ * Only do anything when DOIT is set, otherwise just free the list
+ * (this is used for clean pages which do not need killing)
+ * Also when FAIL is set do a force kill because something went
+ * wrong earlier.
+ */
+static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
+ unsigned long pfn, int flags)
+{
+ struct to_kill *tk, *next;
+
+ list_for_each_entry_safe (tk, next, to_kill, nd) {
+ if (forcekill) {
+ /*
+ * In case something went wrong with munmapping
+ * make sure the process doesn't catch the
+ * signal and then access the memory. Just kill it.
+ */
+ if (fail || tk->addr == -EFAULT) {
+ pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
+ pfn, tk->tsk->comm, tk->tsk->pid);
+ do_send_sig_info(SIGKILL, SEND_SIG_PRIV,
+ tk->tsk, PIDTYPE_PID);
+ }
+
+ /*
+ * In theory the process could have mapped
+ * something else on the address in-between. We could
+ * check for that, but we need to tell the
+ * process anyways.
+ */
+ else if (kill_proc(tk, pfn, flags) < 0)
+ pr_err("Memory failure: %#lx: Cannot send advisory machine check signal to %s:%d\n",
+ pfn, tk->tsk->comm, tk->tsk->pid);
+ }
+ put_task_struct(tk->tsk);
+ kfree(tk);
+ }
+}
+
+/*
+ * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
+ * on behalf of the thread group. Return task_struct of the (first found)
+ * dedicated thread if found, and return NULL otherwise.
+ *
+ * We already hold read_lock(&tasklist_lock) in the caller, so we don't
+ * have to call rcu_read_lock/unlock() in this function.
+ */
+static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
+{
+ struct task_struct *t;
+
+ for_each_thread(tsk, t) {
+ if (t->flags & PF_MCE_PROCESS) {
+ if (t->flags & PF_MCE_EARLY)
+ return t;
+ } else {
+ if (sysctl_memory_failure_early_kill)
+ return t;
+ }
+ }
+ return NULL;
+}
+
+/*
+ * Determine whether a given process is "early kill" process which expects
+ * to be signaled when some page under the process is hwpoisoned.
+ * Return task_struct of the dedicated thread (main thread unless explicitly
+ * specified) if the process is "early kill," and otherwise returns NULL.
+ *
+ * Note that the above is true for Action Optional case, but not for Action
+ * Required case where SIGBUS should sent only to the current thread.
+ */
+static struct task_struct *task_early_kill(struct task_struct *tsk,
+ int force_early)
+{
+ if (!tsk->mm)
+ return NULL;
+ if (force_early) {
+ /*
+ * Comparing ->mm here because current task might represent
+ * a subthread, while tsk always points to the main thread.
+ */
+ if (tsk->mm == current->mm)
+ return current;
+ else
+ return NULL;
+ }
+ return find_early_kill_thread(tsk);
+}
+
+/*
+ * Collect processes when the error hit an anonymous page.
+ */
+static void collect_procs_anon(struct page *page, struct list_head *to_kill,
+ int force_early)
+{
+ struct vm_area_struct *vma;
+ struct task_struct *tsk;
+ struct anon_vma *av;
+ pgoff_t pgoff;
+
+ av = page_lock_anon_vma_read(page);
+ if (av == NULL) /* Not actually mapped anymore */
+ return;
+
+ pgoff = page_to_pgoff(page);
+ read_lock(&tasklist_lock);
+ for_each_process (tsk) {
+ struct anon_vma_chain *vmac;
+ struct task_struct *t = task_early_kill(tsk, force_early);
+
+ if (!t)
+ continue;
+ anon_vma_interval_tree_foreach(vmac, &av->rb_root,
+ pgoff, pgoff) {
+ vma = vmac->vma;
+ if (!page_mapped_in_vma(page, vma))
+ continue;
+ if (vma->vm_mm == t->mm)
+ add_to_kill(t, page, vma, to_kill);
+ }
+ }
+ read_unlock(&tasklist_lock);
+ page_unlock_anon_vma_read(av);
+}
+
+/*
+ * Collect processes when the error hit a file mapped page.
+ */
+static void collect_procs_file(struct page *page, struct list_head *to_kill,
+ int force_early)
+{
+ struct vm_area_struct *vma;
+ struct task_struct *tsk;
+ struct address_space *mapping = page->mapping;
+ pgoff_t pgoff;
+
+ i_mmap_lock_read(mapping);
+ read_lock(&tasklist_lock);
+ pgoff = page_to_pgoff(page);
+ for_each_process(tsk) {
+ struct task_struct *t = task_early_kill(tsk, force_early);
+
+ if (!t)
+ continue;
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
+ pgoff) {
+ /*
+ * Send early kill signal to tasks where a vma covers
+ * the page but the corrupted page is not necessarily
+ * mapped it in its pte.
+ * Assume applications who requested early kill want
+ * to be informed of all such data corruptions.
+ */
+ if (vma->vm_mm == t->mm)
+ add_to_kill(t, page, vma, to_kill);
+ }
+ }
+ read_unlock(&tasklist_lock);
+ i_mmap_unlock_read(mapping);
+}
+
+/*
+ * Collect the processes who have the corrupted page mapped to kill.
+ */
+static void collect_procs(struct page *page, struct list_head *tokill,
+ int force_early)
+{
+ if (!page->mapping)
+ return;
+
+ if (PageAnon(page))
+ collect_procs_anon(page, tokill, force_early);
+ else
+ collect_procs_file(page, tokill, force_early);
+}
+
+static const char *action_name[] = {
+ [MF_IGNORED] = "Ignored",
+ [MF_FAILED] = "Failed",
+ [MF_DELAYED] = "Delayed",
+ [MF_RECOVERED] = "Recovered",
+};
+
+static const char * const action_page_types[] = {
+ [MF_MSG_KERNEL] = "reserved kernel page",
+ [MF_MSG_KERNEL_HIGH_ORDER] = "high-order kernel page",
+ [MF_MSG_SLAB] = "kernel slab page",
+ [MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking",
+ [MF_MSG_POISONED_HUGE] = "huge page already hardware poisoned",
+ [MF_MSG_HUGE] = "huge page",
+ [MF_MSG_FREE_HUGE] = "free huge page",
+ [MF_MSG_NON_PMD_HUGE] = "non-pmd-sized huge page",
+ [MF_MSG_UNMAP_FAILED] = "unmapping failed page",
+ [MF_MSG_DIRTY_SWAPCACHE] = "dirty swapcache page",
+ [MF_MSG_CLEAN_SWAPCACHE] = "clean swapcache page",
+ [MF_MSG_DIRTY_MLOCKED_LRU] = "dirty mlocked LRU page",
+ [MF_MSG_CLEAN_MLOCKED_LRU] = "clean mlocked LRU page",
+ [MF_MSG_DIRTY_UNEVICTABLE_LRU] = "dirty unevictable LRU page",
+ [MF_MSG_CLEAN_UNEVICTABLE_LRU] = "clean unevictable LRU page",
+ [MF_MSG_DIRTY_LRU] = "dirty LRU page",
+ [MF_MSG_CLEAN_LRU] = "clean LRU page",
+ [MF_MSG_TRUNCATED_LRU] = "already truncated LRU page",
+ [MF_MSG_BUDDY] = "free buddy page",
+ [MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)",
+ [MF_MSG_DAX] = "dax page",
+ [MF_MSG_UNSPLIT_THP] = "unsplit thp",
+ [MF_MSG_UNKNOWN] = "unknown page",
+};
+
+/*
+ * XXX: It is possible that a page is isolated from LRU cache,
+ * and then kept in swap cache or failed to remove from page cache.
+ * The page count will stop it from being freed by unpoison.
+ * Stress tests should be aware of this memory leak problem.
+ */
+static int delete_from_lru_cache(struct page *p)
+{
+ if (!isolate_lru_page(p)) {
+ /*
+ * Clear sensible page flags, so that the buddy system won't
+ * complain when the page is unpoison-and-freed.
+ */
+ ClearPageActive(p);
+ ClearPageUnevictable(p);
+
+ /*
+ * Poisoned page might never drop its ref count to 0 so we have
+ * to uncharge it manually from its memcg.
+ */
+ mem_cgroup_uncharge(p);
+
+ /*
+ * drop the page count elevated by isolate_lru_page()
+ */
+ put_page(p);
+ return 0;
+ }
+ return -EIO;
+}
+
+static int truncate_error_page(struct page *p, unsigned long pfn,
+ struct address_space *mapping)
+{
+ int ret = MF_FAILED;
+
+ if (mapping->a_ops->error_remove_page) {
+ int err = mapping->a_ops->error_remove_page(mapping, p);
+
+ if (err != 0) {
+ pr_info("Memory failure: %#lx: Failed to punch page: %d\n",
+ pfn, err);
+ } else if (page_has_private(p) &&
+ !try_to_release_page(p, GFP_NOIO)) {
+ pr_info("Memory failure: %#lx: failed to release buffers\n",
+ pfn);
+ } else {
+ ret = MF_RECOVERED;
+ }
+ } else {
+ /*
+ * If the file system doesn't support it just invalidate
+ * This fails on dirty or anything with private pages
+ */
+ if (invalidate_inode_page(p))
+ ret = MF_RECOVERED;
+ else
+ pr_info("Memory failure: %#lx: Failed to invalidate\n",
+ pfn);
+ }
+
+ return ret;
+}
+
+/*
+ * Error hit kernel page.
+ * Do nothing, try to be lucky and not touch this instead. For a few cases we
+ * could be more sophisticated.
+ */
+static int me_kernel(struct page *p, unsigned long pfn)
+{
+ return MF_IGNORED;
+}
+
+/*
+ * Page in unknown state. Do nothing.
+ */
+static int me_unknown(struct page *p, unsigned long pfn)
+{
+ pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
+ return MF_FAILED;
+}
+
+/*
+ * Clean (or cleaned) page cache page.
+ */
+static int me_pagecache_clean(struct page *p, unsigned long pfn)
+{
+ struct address_space *mapping;
+
+ delete_from_lru_cache(p);
+
+ /*
+ * For anonymous pages we're done the only reference left
+ * should be the one m_f() holds.
+ */
+ if (PageAnon(p))
+ return MF_RECOVERED;
+
+ /*
+ * Now truncate the page in the page cache. This is really
+ * more like a "temporary hole punch"
+ * Don't do this for block devices when someone else
+ * has a reference, because it could be file system metadata
+ * and that's not safe to truncate.
+ */
+ mapping = page_mapping(p);
+ if (!mapping) {
+ /*
+ * Page has been teared down in the meanwhile
+ */
+ return MF_FAILED;
+ }
+
+ /*
+ * Truncation is a bit tricky. Enable it per file system for now.
+ *
+ * Open: to take i_mutex or not for this? Right now we don't.
+ */
+ return truncate_error_page(p, pfn, mapping);
+}
+
+/*
+ * Dirty pagecache page
+ * Issues: when the error hit a hole page the error is not properly
+ * propagated.
+ */
+static int me_pagecache_dirty(struct page *p, unsigned long pfn)
+{
+ struct address_space *mapping = page_mapping(p);
+
+ SetPageError(p);
+ /* TBD: print more information about the file. */
+ if (mapping) {
+ /*
+ * IO error will be reported by write(), fsync(), etc.
+ * who check the mapping.
+ * This way the application knows that something went
+ * wrong with its dirty file data.
+ *
+ * There's one open issue:
+ *
+ * The EIO will be only reported on the next IO
+ * operation and then cleared through the IO map.
+ * Normally Linux has two mechanisms to pass IO error
+ * first through the AS_EIO flag in the address space
+ * and then through the PageError flag in the page.
+ * Since we drop pages on memory failure handling the
+ * only mechanism open to use is through AS_AIO.
+ *
+ * This has the disadvantage that it gets cleared on
+ * the first operation that returns an error, while
+ * the PageError bit is more sticky and only cleared
+ * when the page is reread or dropped. If an
+ * application assumes it will always get error on
+ * fsync, but does other operations on the fd before
+ * and the page is dropped between then the error
+ * will not be properly reported.
+ *
+ * This can already happen even without hwpoisoned
+ * pages: first on metadata IO errors (which only
+ * report through AS_EIO) or when the page is dropped
+ * at the wrong time.
+ *
+ * So right now we assume that the application DTRT on
+ * the first EIO, but we're not worse than other parts
+ * of the kernel.
+ */
+ mapping_set_error(mapping, -EIO);
+ }
+
+ return me_pagecache_clean(p, pfn);
+}
+
+/*
+ * Clean and dirty swap cache.
+ *
+ * Dirty swap cache page is tricky to handle. The page could live both in page
+ * cache and swap cache(ie. page is freshly swapped in). So it could be
+ * referenced concurrently by 2 types of PTEs:
+ * normal PTEs and swap PTEs. We try to handle them consistently by calling
+ * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
+ * and then
+ * - clear dirty bit to prevent IO
+ * - remove from LRU
+ * - but keep in the swap cache, so that when we return to it on
+ * a later page fault, we know the application is accessing
+ * corrupted data and shall be killed (we installed simple
+ * interception code in do_swap_page to catch it).
+ *
+ * Clean swap cache pages can be directly isolated. A later page fault will
+ * bring in the known good data from disk.
+ */
+static int me_swapcache_dirty(struct page *p, unsigned long pfn)
+{
+ ClearPageDirty(p);
+ /* Trigger EIO in shmem: */
+ ClearPageUptodate(p);
+
+ if (!delete_from_lru_cache(p))
+ return MF_DELAYED;
+ else
+ return MF_FAILED;
+}
+
+static int me_swapcache_clean(struct page *p, unsigned long pfn)
+{
+ delete_from_swap_cache(p);
+
+ if (!delete_from_lru_cache(p))
+ return MF_RECOVERED;
+ else
+ return MF_FAILED;
+}
+
+/*
+ * Huge pages. Needs work.
+ * Issues:
+ * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
+ * To narrow down kill region to one page, we need to break up pmd.
+ */
+static int me_huge_page(struct page *p, unsigned long pfn)
+{
+ int res = 0;
+ struct page *hpage = compound_head(p);
+ struct address_space *mapping;
+
+ if (!PageHuge(hpage))
+ return MF_DELAYED;
+
+ mapping = page_mapping(hpage);
+ if (mapping) {
+ res = truncate_error_page(hpage, pfn, mapping);
+ } else {
+ unlock_page(hpage);
+ /*
+ * migration entry prevents later access on error anonymous
+ * hugepage, so we can free and dissolve it into buddy to
+ * save healthy subpages.
+ */
+ if (PageAnon(hpage))
+ put_page(hpage);
+ dissolve_free_huge_page(p);
+ res = MF_RECOVERED;
+ lock_page(hpage);
+ }
+
+ return res;
+}
+
+/*
+ * Various page states we can handle.
+ *
+ * A page state is defined by its current page->flags bits.
+ * The table matches them in order and calls the right handler.
+ *
+ * This is quite tricky because we can access page at any time
+ * in its live cycle, so all accesses have to be extremely careful.
+ *
+ * This is not complete. More states could be added.
+ * For any missing state don't attempt recovery.
+ */
+
+#define dirty (1UL << PG_dirty)
+#define sc ((1UL << PG_swapcache) | (1UL << PG_swapbacked))
+#define unevict (1UL << PG_unevictable)
+#define mlock (1UL << PG_mlocked)
+#define lru (1UL << PG_lru)
+#define head (1UL << PG_head)
+#define slab (1UL << PG_slab)
+#define reserved (1UL << PG_reserved)
+
+static struct page_state {
+ unsigned long mask;
+ unsigned long res;
+ enum mf_action_page_type type;
+ int (*action)(struct page *p, unsigned long pfn);
+} error_states[] = {
+ { reserved, reserved, MF_MSG_KERNEL, me_kernel },
+ /*
+ * free pages are specially detected outside this table:
+ * PG_buddy pages only make a small fraction of all free pages.
+ */
+
+ /*
+ * Could in theory check if slab page is free or if we can drop
+ * currently unused objects without touching them. But just
+ * treat it as standard kernel for now.
+ */
+ { slab, slab, MF_MSG_SLAB, me_kernel },
+
+ { head, head, MF_MSG_HUGE, me_huge_page },
+
+ { sc|dirty, sc|dirty, MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty },
+ { sc|dirty, sc, MF_MSG_CLEAN_SWAPCACHE, me_swapcache_clean },
+
+ { mlock|dirty, mlock|dirty, MF_MSG_DIRTY_MLOCKED_LRU, me_pagecache_dirty },
+ { mlock|dirty, mlock, MF_MSG_CLEAN_MLOCKED_LRU, me_pagecache_clean },
+
+ { unevict|dirty, unevict|dirty, MF_MSG_DIRTY_UNEVICTABLE_LRU, me_pagecache_dirty },
+ { unevict|dirty, unevict, MF_MSG_CLEAN_UNEVICTABLE_LRU, me_pagecache_clean },
+
+ { lru|dirty, lru|dirty, MF_MSG_DIRTY_LRU, me_pagecache_dirty },
+ { lru|dirty, lru, MF_MSG_CLEAN_LRU, me_pagecache_clean },
+
+ /*
+ * Catchall entry: must be at end.
+ */
+ { 0, 0, MF_MSG_UNKNOWN, me_unknown },
+};
+
+#undef dirty
+#undef sc
+#undef unevict
+#undef mlock
+#undef lru
+#undef head
+#undef slab
+#undef reserved
+
+/*
+ * "Dirty/Clean" indication is not 100% accurate due to the possibility of
+ * setting PG_dirty outside page lock. See also comment above set_page_dirty().
+ */
+static void action_result(unsigned long pfn, enum mf_action_page_type type,
+ enum mf_result result)
+{
+ trace_memory_failure_event(pfn, type, result);
+
+ pr_err("Memory failure: %#lx: recovery action for %s: %s\n",
+ pfn, action_page_types[type], action_name[result]);
+}
+
+static int page_action(struct page_state *ps, struct page *p,
+ unsigned long pfn)
+{
+ int result;
+ int count;
+
+ result = ps->action(p, pfn);
+
+ count = page_count(p) - 1;
+ if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
+ count--;
+ if (count > 0) {
+ pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
+ pfn, action_page_types[ps->type], count);
+ result = MF_FAILED;
+ }
+ action_result(pfn, ps->type, result);
+
+ /* Could do more checks here if page looks ok */
+ /*
+ * Could adjust zone counters here to correct for the missing page.
+ */
+
+ return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
+}
+
+/**
+ * get_hwpoison_page() - Get refcount for memory error handling:
+ * @page: raw error page (hit by memory error)
+ *
+ * Return: return 0 if failed to grab the refcount, otherwise true (some
+ * non-zero value.)
+ */
+static int get_hwpoison_page(struct page *page)
+{
+ struct page *head = compound_head(page);
+
+ if (!PageHuge(head) && PageTransHuge(head)) {
+ /*
+ * Non anonymous thp exists only in allocation/free time. We
+ * can't handle such a case correctly, so let's give it up.
+ * This should be better than triggering BUG_ON when kernel
+ * tries to touch the "partially handled" page.
+ */
+ if (!PageAnon(head)) {
+ pr_err("Memory failure: %#lx: non anonymous thp\n",
+ page_to_pfn(page));
+ return 0;
+ }
+ }
+
+ if (get_page_unless_zero(head)) {
+ if (head == compound_head(page))
+ return 1;
+
+ pr_info("Memory failure: %#lx cannot catch tail\n",
+ page_to_pfn(page));
+ put_page(head);
+ }
+
+ return 0;
+}
+
+/*
+ * Do all that is necessary to remove user space mappings. Unmap
+ * the pages and send SIGBUS to the processes if the data was dirty.
+ */
+static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
+ int flags, struct page **hpagep)
+{
+ enum ttu_flags ttu = TTU_IGNORE_MLOCK;
+ struct address_space *mapping;
+ LIST_HEAD(tokill);
+ bool unmap_success = true;
+ int kill = 1, forcekill;
+ struct page *hpage = *hpagep;
+ bool mlocked = PageMlocked(hpage);
+
+ /*
+ * Here we are interested only in user-mapped pages, so skip any
+ * other types of pages.
+ */
+ if (PageReserved(p) || PageSlab(p))
+ return true;
+ if (!(PageLRU(hpage) || PageHuge(p)))
+ return true;
+
+ /*
+ * This check implies we don't kill processes if their pages
+ * are in the swap cache early. Those are always late kills.
+ */
+ if (!page_mapped(p))
+ return true;
+
+ if (PageKsm(p)) {
+ pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn);
+ return false;
+ }
+
+ if (PageSwapCache(p)) {
+ pr_err("Memory failure: %#lx: keeping poisoned page in swap cache\n",
+ pfn);
+ ttu |= TTU_IGNORE_HWPOISON;
+ }
+
+ /*
+ * Propagate the dirty bit from PTEs to struct page first, because we
+ * need this to decide if we should kill or just drop the page.
+ * XXX: the dirty test could be racy: set_page_dirty() may not always
+ * be called inside page lock (it's recommended but not enforced).
+ */
+ mapping = page_mapping(hpage);
+ if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
+ mapping_can_writeback(mapping)) {
+ if (page_mkclean(hpage)) {
+ SetPageDirty(hpage);
+ } else {
+ kill = 0;
+ ttu |= TTU_IGNORE_HWPOISON;
+ pr_info("Memory failure: %#lx: corrupted page was clean: dropped without side effects\n",
+ pfn);
+ }
+ }
+
+ /*
+ * First collect all the processes that have the page
+ * mapped in dirty form. This has to be done before try_to_unmap,
+ * because ttu takes the rmap data structures down.
+ *
+ * Error handling: We ignore errors here because
+ * there's nothing that can be done.
+ */
+ if (kill)
+ collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
+
+ if (!PageHuge(hpage)) {
+ unmap_success = try_to_unmap(hpage, ttu);
+ } else {
+ if (!PageAnon(hpage)) {
+ /*
+ * For hugetlb pages in shared mappings, try_to_unmap
+ * could potentially call huge_pmd_unshare. Because of
+ * this, take semaphore in write mode here and set
+ * TTU_RMAP_LOCKED to indicate we have taken the lock
+ * at this higer level.
+ */
+ mapping = hugetlb_page_mapping_lock_write(hpage);
+ if (mapping) {
+ unmap_success = try_to_unmap(hpage,
+ ttu|TTU_RMAP_LOCKED);
+ i_mmap_unlock_write(mapping);
+ } else {
+ pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn);
+ unmap_success = false;
+ }
+ } else {
+ unmap_success = try_to_unmap(p, ttu);
+ }
+ }
+ if (!unmap_success)
+ pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
+ pfn, page_mapcount(p));
+
+ /*
+ * try_to_unmap() might put mlocked page in lru cache, so call
+ * shake_page() again to ensure that it's flushed.
+ */
+ if (mlocked)
+ shake_page(hpage, 0);
+
+ /*
+ * Now that the dirty bit has been propagated to the
+ * struct page and all unmaps done we can decide if
+ * killing is needed or not. Only kill when the page
+ * was dirty or the process is not restartable,
+ * otherwise the tokill list is merely
+ * freed. When there was a problem unmapping earlier
+ * use a more force-full uncatchable kill to prevent
+ * any accesses to the poisoned memory.
+ */
+ forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
+ kill_procs(&tokill, forcekill, !unmap_success, pfn, flags);
+
+ return unmap_success;
+}
+
+static int identify_page_state(unsigned long pfn, struct page *p,
+ unsigned long page_flags)
+{
+ struct page_state *ps;
+
+ /*
+ * The first check uses the current page flags which may not have any
+ * relevant information. The second check with the saved page flags is
+ * carried out only if the first check can't determine the page status.
+ */
+ for (ps = error_states;; ps++)
+ if ((p->flags & ps->mask) == ps->res)
+ break;
+
+ page_flags |= (p->flags & (1UL << PG_dirty));
+
+ if (!ps->mask)
+ for (ps = error_states;; ps++)
+ if ((page_flags & ps->mask) == ps->res)
+ break;
+ return page_action(ps, p, pfn);
+}
+
+static int try_to_split_thp_page(struct page *page, const char *msg)
+{
+ lock_page(page);
+ if (!PageAnon(page) || unlikely(split_huge_page(page))) {
+ unsigned long pfn = page_to_pfn(page);
+
+ unlock_page(page);
+ if (!PageAnon(page))
+ pr_info("%s: %#lx: non anonymous thp\n", msg, pfn);
+ else
+ pr_info("%s: %#lx: thp split failed\n", msg, pfn);
+ put_page(page);
+ return -EBUSY;
+ }
+ unlock_page(page);
+
+ return 0;
+}
+
+static int memory_failure_hugetlb(unsigned long pfn, int flags)
+{
+ struct page *p = pfn_to_page(pfn);
+ struct page *head = compound_head(p);
+ int res;
+ unsigned long page_flags;
+
+ if (TestSetPageHWPoison(head)) {
+ pr_err("Memory failure: %#lx: already hardware poisoned\n",
+ pfn);
+ return 0;
+ }
+
+ num_poisoned_pages_inc();
+
+ if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
+ /*
+ * Check "filter hit" and "race with other subpage."
+ */
+ lock_page(head);
+ if (PageHWPoison(head)) {
+ if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
+ || (p != head && TestSetPageHWPoison(head))) {
+ num_poisoned_pages_dec();
+ unlock_page(head);
+ return 0;
+ }
+ }
+ unlock_page(head);
+ dissolve_free_huge_page(p);
+ action_result(pfn, MF_MSG_FREE_HUGE, MF_DELAYED);
+ return 0;
+ }
+
+ lock_page(head);
+ page_flags = head->flags;
+
+ if (!PageHWPoison(head)) {
+ pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
+ num_poisoned_pages_dec();
+ unlock_page(head);
+ put_page(head);
+ return 0;
+ }
+
+ /*
+ * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so
+ * simply disable it. In order to make it work properly, we need
+ * make sure that:
+ * - conversion of a pud that maps an error hugetlb into hwpoison
+ * entry properly works, and
+ * - other mm code walking over page table is aware of pud-aligned
+ * hwpoison entries.
+ */
+ if (huge_page_size(page_hstate(head)) > PMD_SIZE) {
+ action_result(pfn, MF_MSG_NON_PMD_HUGE, MF_IGNORED);
+ res = -EBUSY;
+ goto out;
+ }
+
+ if (!hwpoison_user_mappings(p, pfn, flags, &head)) {
+ action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
+ res = -EBUSY;
+ goto out;
+ }
+
+ res = identify_page_state(pfn, p, page_flags);
+out:
+ unlock_page(head);
+ return res;
+}
+
+static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
+ struct dev_pagemap *pgmap)
+{
+ struct page *page = pfn_to_page(pfn);
+ const bool unmap_success = true;
+ unsigned long size = 0;
+ struct to_kill *tk;
+ LIST_HEAD(tokill);
+ int rc = -EBUSY;
+ loff_t start;
+ dax_entry_t cookie;
+
+ if (flags & MF_COUNT_INCREASED)
+ /*
+ * Drop the extra refcount in case we come from madvise().
+ */
+ put_page(page);
+
+ /* device metadata space is not recoverable */
+ if (!pgmap_pfn_valid(pgmap, pfn)) {
+ rc = -ENXIO;
+ goto out;
+ }
+
+ /*
+ * Prevent the inode from being freed while we are interrogating
+ * the address_space, typically this would be handled by
+ * lock_page(), but dax pages do not use the page lock. This
+ * also prevents changes to the mapping of this pfn until
+ * poison signaling is complete.
+ */
+ cookie = dax_lock_page(page);
+ if (!cookie)
+ goto out;
+
+ if (hwpoison_filter(page)) {
+ rc = 0;
+ goto unlock;
+ }
+
+ if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+ /*
+ * TODO: Handle HMM pages which may need coordination
+ * with device-side memory.
+ */
+ goto unlock;
+ }
+
+ /*
+ * Use this flag as an indication that the dax page has been
+ * remapped UC to prevent speculative consumption of poison.
+ */
+ SetPageHWPoison(page);
+
+ /*
+ * Unlike System-RAM there is no possibility to swap in a
+ * different physical page at a given virtual address, so all
+ * userspace consumption of ZONE_DEVICE memory necessitates
+ * SIGBUS (i.e. MF_MUST_KILL)
+ */
+ flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
+ collect_procs(page, &tokill, flags & MF_ACTION_REQUIRED);
+
+ list_for_each_entry(tk, &tokill, nd)
+ if (tk->size_shift)
+ size = max(size, 1UL << tk->size_shift);
+ if (size) {
+ /*
+ * Unmap the largest mapping to avoid breaking up
+ * device-dax mappings which are constant size. The
+ * actual size of the mapping being torn down is
+ * communicated in siginfo, see kill_proc()
+ */
+ start = (page->index << PAGE_SHIFT) & ~(size - 1);
+ unmap_mapping_range(page->mapping, start, size, 0);
+ }
+ kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
+ rc = 0;
+unlock:
+ dax_unlock_page(page, cookie);
+out:
+ /* drop pgmap ref acquired in caller */
+ put_dev_pagemap(pgmap);
+ action_result(pfn, MF_MSG_DAX, rc ? MF_FAILED : MF_RECOVERED);
+ return rc;
+}
+
+/**
+ * memory_failure - Handle memory failure of a page.
+ * @pfn: Page Number of the corrupted page
+ * @flags: fine tune action taken
+ *
+ * This function is called by the low level machine check code
+ * of an architecture when it detects hardware memory corruption
+ * of a page. It tries its best to recover, which includes
+ * dropping pages, killing processes etc.
+ *
+ * The function is primarily of use for corruptions that
+ * happen outside the current execution context (e.g. when
+ * detected by a background scrubber)
+ *
+ * Must run in process context (e.g. a work queue) with interrupts
+ * enabled and no spinlocks hold.
+ */
+int memory_failure(unsigned long pfn, int flags)
+{
+ struct page *p;
+ struct page *hpage;
+ struct page *orig_head;
+ struct dev_pagemap *pgmap;
+ int res;
+ unsigned long page_flags;
+
+ if (!sysctl_memory_failure_recovery)
+ panic("Memory failure on page %lx", pfn);
+
+ p = pfn_to_online_page(pfn);
+ if (!p) {
+ if (pfn_valid(pfn)) {
+ pgmap = get_dev_pagemap(pfn, NULL);
+ if (pgmap)
+ return memory_failure_dev_pagemap(pfn, flags,
+ pgmap);
+ }
+ pr_err("Memory failure: %#lx: memory outside kernel control\n",
+ pfn);
+ return -ENXIO;
+ }
+
+ if (PageHuge(p))
+ return memory_failure_hugetlb(pfn, flags);
+ if (TestSetPageHWPoison(p)) {
+ pr_err("Memory failure: %#lx: already hardware poisoned\n",
+ pfn);
+ return 0;
+ }
+
+ orig_head = hpage = compound_head(p);
+ num_poisoned_pages_inc();
+
+ /*
+ * We need/can do nothing about count=0 pages.
+ * 1) it's a free page, and therefore in safe hand:
+ * prep_new_page() will be the gate keeper.
+ * 2) it's part of a non-compound high order page.
+ * Implies some kernel user: cannot stop them from
+ * R/W the page; let's pray that the page has been
+ * used and will be freed some time later.
+ * In fact it's dangerous to directly bump up page count from 0,
+ * that may make page_ref_freeze()/page_ref_unfreeze() mismatch.
+ */
+ if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
+ if (is_free_buddy_page(p)) {
+ action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
+ return 0;
+ } else {
+ action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
+ return -EBUSY;
+ }
+ }
+
+ if (PageTransHuge(hpage)) {
+ if (try_to_split_thp_page(p, "Memory Failure") < 0) {
+ action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
+ return -EBUSY;
+ }
+ VM_BUG_ON_PAGE(!page_count(p), p);
+ }
+
+ /*
+ * We ignore non-LRU pages for good reasons.
+ * - PG_locked is only well defined for LRU pages and a few others
+ * - to avoid races with __SetPageLocked()
+ * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
+ * The check (unnecessarily) ignores LRU pages being isolated and
+ * walked by the page reclaim code, however that's not a big loss.
+ */
+ shake_page(p, 0);
+ /* shake_page could have turned it free. */
+ if (!PageLRU(p) && is_free_buddy_page(p)) {
+ if (flags & MF_COUNT_INCREASED)
+ action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
+ else
+ action_result(pfn, MF_MSG_BUDDY_2ND, MF_DELAYED);
+ return 0;
+ }
+
+ lock_page(p);
+
+ /*
+ * The page could have changed compound pages during the locking.
+ * If this happens just bail out.
+ */
+ if (PageCompound(p) && compound_head(p) != orig_head) {
+ action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
+ res = -EBUSY;
+ goto out;
+ }
+
+ /*
+ * We use page flags to determine what action should be taken, but
+ * the flags can be modified by the error containment action. One
+ * example is an mlocked page, where PG_mlocked is cleared by
+ * page_remove_rmap() in try_to_unmap_one(). So to determine page status
+ * correctly, we save a copy of the page flags at this time.
+ */
+ page_flags = p->flags;
+
+ /*
+ * unpoison always clear PG_hwpoison inside page lock
+ */
+ if (!PageHWPoison(p)) {
+ pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
+ num_poisoned_pages_dec();
+ unlock_page(p);
+ put_page(p);
+ return 0;
+ }
+ if (hwpoison_filter(p)) {
+ if (TestClearPageHWPoison(p))
+ num_poisoned_pages_dec();
+ unlock_page(p);
+ put_page(p);
+ return 0;
+ }
+
+ /*
+ * __munlock_pagevec may clear a writeback page's LRU flag without
+ * page_lock. We need wait writeback completion for this page or it
+ * may trigger vfs BUG while evict inode.
+ */
+ if (!PageTransTail(p) && !PageLRU(p) && !PageWriteback(p))
+ goto identify_page_state;
+
+ /*
+ * It's very difficult to mess with pages currently under IO
+ * and in many cases impossible, so we just avoid it here.
+ */
+ wait_on_page_writeback(p);
+
+ /*
+ * Now take care of user space mappings.
+ * Abort on fail: __delete_from_page_cache() assumes unmapped page.
+ */
+ if (!hwpoison_user_mappings(p, pfn, flags, &p)) {
+ action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
+ res = -EBUSY;
+ goto out;
+ }
+
+ /*
+ * Torn down by someone else?
+ */
+ if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
+ action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
+ res = -EBUSY;
+ goto out;
+ }
+
+identify_page_state:
+ res = identify_page_state(pfn, p, page_flags);
+out:
+ unlock_page(p);
+ return res;
+}
+EXPORT_SYMBOL_GPL(memory_failure);
+
+#define MEMORY_FAILURE_FIFO_ORDER 4
+#define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER)
+
+struct memory_failure_entry {
+ unsigned long pfn;
+ int flags;
+};
+
+struct memory_failure_cpu {
+ DECLARE_KFIFO(fifo, struct memory_failure_entry,
+ MEMORY_FAILURE_FIFO_SIZE);
+ spinlock_t lock;
+ struct work_struct work;
+};
+
+static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
+
+/**
+ * memory_failure_queue - Schedule handling memory failure of a page.
+ * @pfn: Page Number of the corrupted page
+ * @flags: Flags for memory failure handling
+ *
+ * This function is called by the low level hardware error handler
+ * when it detects hardware memory corruption of a page. It schedules
+ * the recovering of error page, including dropping pages, killing
+ * processes etc.
+ *
+ * The function is primarily of use for corruptions that
+ * happen outside the current execution context (e.g. when
+ * detected by a background scrubber)
+ *
+ * Can run in IRQ context.
+ */
+void memory_failure_queue(unsigned long pfn, int flags)
+{
+ struct memory_failure_cpu *mf_cpu;
+ unsigned long proc_flags;
+ struct memory_failure_entry entry = {
+ .pfn = pfn,
+ .flags = flags,
+ };
+
+ mf_cpu = &get_cpu_var(memory_failure_cpu);
+ spin_lock_irqsave(&mf_cpu->lock, proc_flags);
+ if (kfifo_put(&mf_cpu->fifo, entry))
+ schedule_work_on(smp_processor_id(), &mf_cpu->work);
+ else
+ pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
+ pfn);
+ spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
+ put_cpu_var(memory_failure_cpu);
+}
+EXPORT_SYMBOL_GPL(memory_failure_queue);
+
+static void memory_failure_work_func(struct work_struct *work)
+{
+ struct memory_failure_cpu *mf_cpu;
+ struct memory_failure_entry entry = { 0, };
+ unsigned long proc_flags;
+ int gotten;
+
+ mf_cpu = container_of(work, struct memory_failure_cpu, work);
+ for (;;) {
+ spin_lock_irqsave(&mf_cpu->lock, proc_flags);
+ gotten = kfifo_get(&mf_cpu->fifo, &entry);
+ spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
+ if (!gotten)
+ break;
+ if (entry.flags & MF_SOFT_OFFLINE)
+ soft_offline_page(entry.pfn, entry.flags);
+ else
+ memory_failure(entry.pfn, entry.flags);
+ }
+}
+
+/*
+ * Process memory_failure work queued on the specified CPU.
+ * Used to avoid return-to-userspace racing with the memory_failure workqueue.
+ */
+void memory_failure_queue_kick(int cpu)
+{
+ struct memory_failure_cpu *mf_cpu;
+
+ mf_cpu = &per_cpu(memory_failure_cpu, cpu);
+ cancel_work_sync(&mf_cpu->work);
+ memory_failure_work_func(&mf_cpu->work);
+}
+
+static int __init memory_failure_init(void)
+{
+ struct memory_failure_cpu *mf_cpu;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ mf_cpu = &per_cpu(memory_failure_cpu, cpu);
+ spin_lock_init(&mf_cpu->lock);
+ INIT_KFIFO(mf_cpu->fifo);
+ INIT_WORK(&mf_cpu->work, memory_failure_work_func);
+ }
+
+ return 0;
+}
+core_initcall(memory_failure_init);
+
+#define unpoison_pr_info(fmt, pfn, rs) \
+({ \
+ if (__ratelimit(rs)) \
+ pr_info(fmt, pfn); \
+})
+
+/**
+ * unpoison_memory - Unpoison a previously poisoned page
+ * @pfn: Page number of the to be unpoisoned page
+ *
+ * Software-unpoison a page that has been poisoned by
+ * memory_failure() earlier.
+ *
+ * This is only done on the software-level, so it only works
+ * for linux injected failures, not real hardware failures
+ *
+ * Returns 0 for success, otherwise -errno.
+ */
+int unpoison_memory(unsigned long pfn)
+{
+ struct page *page;
+ struct page *p;
+ int freeit = 0;
+ static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+ if (!pfn_valid(pfn))
+ return -ENXIO;
+
+ p = pfn_to_page(pfn);
+ page = compound_head(p);
+
+ if (!PageHWPoison(p)) {
+ unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
+ pfn, &unpoison_rs);
+ return 0;
+ }
+
+ if (page_count(page) > 1) {
+ unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n",
+ pfn, &unpoison_rs);
+ return 0;
+ }
+
+ if (page_mapped(page)) {
+ unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
+ pfn, &unpoison_rs);
+ return 0;
+ }
+
+ if (page_mapping(page)) {
+ unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n",
+ pfn, &unpoison_rs);
+ return 0;
+ }
+
+ /*
+ * unpoison_memory() can encounter thp only when the thp is being
+ * worked by memory_failure() and the page lock is not held yet.
+ * In such case, we yield to memory_failure() and make unpoison fail.
+ */
+ if (!PageHuge(page) && PageTransHuge(page)) {
+ unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n",
+ pfn, &unpoison_rs);
+ return 0;
+ }
+
+ if (!get_hwpoison_page(p)) {
+ if (TestClearPageHWPoison(p))
+ num_poisoned_pages_dec();
+ unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n",
+ pfn, &unpoison_rs);
+ return 0;
+ }
+
+ lock_page(page);
+ /*
+ * This test is racy because PG_hwpoison is set outside of page lock.
+ * That's acceptable because that won't trigger kernel panic. Instead,
+ * the PG_hwpoison page will be caught and isolated on the entrance to
+ * the free buddy page pool.
+ */
+ if (TestClearPageHWPoison(page)) {
+ unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
+ pfn, &unpoison_rs);
+ num_poisoned_pages_dec();
+ freeit = 1;
+ }
+ unlock_page(page);
+
+ put_page(page);
+ if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
+ put_page(page);
+
+ return 0;
+}
+EXPORT_SYMBOL(unpoison_memory);
+
+/*
+ * Safely get reference count of an arbitrary page.
+ * Returns 0 for a free page, 1 for an in-use page, -EIO for a page-type we
+ * cannot handle and -EBUSY if we raced with an allocation.
+ * We only incremented refcount in case the page was already in-use and it is
+ * a known type we can handle.
+ */
+static int get_any_page(struct page *p, int flags)
+{
+ int ret = 0, pass = 0;
+ bool count_increased = false;
+
+ if (flags & MF_COUNT_INCREASED)
+ count_increased = true;
+
+try_again:
+ if (!count_increased && !get_hwpoison_page(p)) {
+ if (page_count(p)) {
+ /* We raced with an allocation, retry. */
+ if (pass++ < 3)
+ goto try_again;
+ ret = -EBUSY;
+ } else if (!PageHuge(p) && !is_free_buddy_page(p)) {
+ /* We raced with put_page, retry. */
+ if (pass++ < 3)
+ goto try_again;
+ ret = -EIO;
+ }
+ } else {
+ if (PageHuge(p) || PageLRU(p) || __PageMovable(p)) {
+ ret = 1;
+ } else {
+ /*
+ * A page we cannot handle. Check whether we can turn
+ * it into something we can handle.
+ */
+ if (pass++ < 3) {
+ put_page(p);
+ shake_page(p, 1);
+ count_increased = false;
+ goto try_again;
+ }
+ put_page(p);
+ ret = -EIO;
+ }
+ }
+
+ return ret;
+}
+
+static bool isolate_page(struct page *page, struct list_head *pagelist)
+{
+ bool isolated = false;
+ bool lru = PageLRU(page);
+
+ if (PageHuge(page)) {
+ isolated = !isolate_hugetlb(page, pagelist);
+ } else {
+ if (lru)
+ isolated = !isolate_lru_page(page);
+ else
+ isolated = !isolate_movable_page(page, ISOLATE_UNEVICTABLE);
+
+ if (isolated)
+ list_add(&page->lru, pagelist);
+ }
+
+ if (isolated && lru)
+ inc_node_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_lru(page));
+
+ /*
+ * If we succeed to isolate the page, we grabbed another refcount on
+ * the page, so we can safely drop the one we got from get_any_pages().
+ * If we failed to isolate the page, it means that we cannot go further
+ * and we will return an error, so drop the reference we got from
+ * get_any_pages() as well.
+ */
+ put_page(page);
+ return isolated;
+}
+
+/*
+ * __soft_offline_page handles hugetlb-pages and non-hugetlb pages.
+ * If the page is a non-dirty unmapped page-cache page, it simply invalidates.
+ * If the page is mapped, it migrates the contents over.
+ */
+static int __soft_offline_page(struct page *page)
+{
+ int ret = 0;
+ unsigned long pfn = page_to_pfn(page);
+ struct page *hpage = compound_head(page);
+ char const *msg_page[] = {"page", "hugepage"};
+ bool huge = PageHuge(page);
+ LIST_HEAD(pagelist);
+ struct migration_target_control mtc = {
+ .nid = NUMA_NO_NODE,
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ };
+
+ /*
+ * Check PageHWPoison again inside page lock because PageHWPoison
+ * is set by memory_failure() outside page lock. Note that
+ * memory_failure() also double-checks PageHWPoison inside page lock,
+ * so there's no race between soft_offline_page() and memory_failure().
+ */
+ lock_page(page);
+ if (!PageHuge(page))
+ wait_on_page_writeback(page);
+ if (PageHWPoison(page)) {
+ unlock_page(page);
+ put_page(page);
+ pr_info("soft offline: %#lx page already poisoned\n", pfn);
+ return 0;
+ }
+
+ if (!PageHuge(page))
+ /*
+ * Try to invalidate first. This should work for
+ * non dirty unmapped page cache pages.
+ */
+ ret = invalidate_inode_page(page);
+ unlock_page(page);
+
+ /*
+ * RED-PEN would be better to keep it isolated here, but we
+ * would need to fix isolation locking first.
+ */
+ if (ret) {
+ pr_info("soft_offline: %#lx: invalidated\n", pfn);
+ page_handle_poison(page, false, true);
+ return 0;
+ }
+
+ if (isolate_page(hpage, &pagelist)) {
+ ret = migrate_pages(&pagelist, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE);
+ if (!ret) {
+ bool release = !huge;
+
+ if (!page_handle_poison(page, huge, release))
+ ret = -EBUSY;
+ } else {
+ if (!list_empty(&pagelist))
+ putback_movable_pages(&pagelist);
+
+ pr_info("soft offline: %#lx: %s migration failed %d, type %lx (%pGp)\n",
+ pfn, msg_page[huge], ret, page->flags, &page->flags);
+ if (ret > 0)
+ ret = -EBUSY;
+ }
+ } else {
+ pr_info("soft offline: %#lx: %s isolation failed, page count %d, type %lx (%pGp)\n",
+ pfn, msg_page[huge], page_count(page), page->flags, &page->flags);
+ ret = -EBUSY;
+ }
+ return ret;
+}
+
+static int soft_offline_in_use_page(struct page *page)
+{
+ struct page *hpage = compound_head(page);
+
+ if (!PageHuge(page) && PageTransHuge(hpage))
+ if (try_to_split_thp_page(page, "soft offline") < 0)
+ return -EBUSY;
+ return __soft_offline_page(page);
+}
+
+static void put_ref_page(struct page *page)
+{
+ if (page)
+ put_page(page);
+}
+
+/**
+ * soft_offline_page - Soft offline a page.
+ * @pfn: pfn to soft-offline
+ * @flags: flags. Same as memory_failure().
+ *
+ * Returns 0 on success, otherwise negated errno.
+ *
+ * Soft offline a page, by migration or invalidation,
+ * without killing anything. This is for the case when
+ * a page is not corrupted yet (so it's still valid to access),
+ * but has had a number of corrected errors and is better taken
+ * out.
+ *
+ * The actual policy on when to do that is maintained by
+ * user space.
+ *
+ * This should never impact any application or cause data loss,
+ * however it might take some time.
+ *
+ * This is not a 100% solution for all memory, but tries to be
+ * ``good enough'' for the majority of memory.
+ */
+int soft_offline_page(unsigned long pfn, int flags)
+{
+ int ret;
+ bool try_again = true;
+ struct page *page, *ref_page = NULL;
+
+ WARN_ON_ONCE(!pfn_valid(pfn) && (flags & MF_COUNT_INCREASED));
+
+ if (!pfn_valid(pfn))
+ return -ENXIO;
+ if (flags & MF_COUNT_INCREASED)
+ ref_page = pfn_to_page(pfn);
+
+ /* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */
+ page = pfn_to_online_page(pfn);
+ if (!page) {
+ put_ref_page(ref_page);
+ return -EIO;
+ }
+
+ if (PageHWPoison(page)) {
+ pr_info("%s: %#lx page already poisoned\n", __func__, pfn);
+ put_ref_page(ref_page);
+ return 0;
+ }
+
+retry:
+ get_online_mems();
+ ret = get_any_page(page, flags);
+ put_online_mems();
+
+ if (ret > 0) {
+ ret = soft_offline_in_use_page(page);
+ } else if (ret == 0) {
+ if (!page_handle_poison(page, true, false)) {
+ if (try_again) {
+ try_again = false;
+ flags &= ~MF_COUNT_INCREASED;
+ goto retry;
+ }
+ ret = -EBUSY;
+ }
+ } else if (ret == -EIO) {
+ pr_info("%s: %#lx: unknown page type: %lx (%pGp)\n",
+ __func__, pfn, page->flags, &page->flags);
+ }
+
+ return ret;
+}
diff --git a/mm/memory.c b/mm/memory.c
new file mode 100644
index 000000000..1d101aeae
--- /dev/null
+++ b/mm/memory.c
@@ -0,0 +1,5343 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/mm/memory.c
+ *
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ */
+
+/*
+ * demand-loading started 01.12.91 - seems it is high on the list of
+ * things wanted, and it should be easy to implement. - Linus
+ */
+
+/*
+ * Ok, demand-loading was easy, shared pages a little bit tricker. Shared
+ * pages started 02.12.91, seems to work. - Linus.
+ *
+ * Tested sharing by executing about 30 /bin/sh: under the old kernel it
+ * would have taken more than the 6M I have free, but it worked well as
+ * far as I could see.
+ *
+ * Also corrected some "invalidate()"s - I wasn't doing enough of them.
+ */
+
+/*
+ * Real VM (paging to/from disk) started 18.12.91. Much more work and
+ * thought has to go into this. Oh, well..
+ * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why.
+ * Found it. Everything seems to work now.
+ * 20.12.91 - Ok, making the swap-device changeable like the root.
+ */
+
+/*
+ * 05.04.94 - Multi-page memory management added for v1.1.
+ * Idea by Alex Bligh (alex@cconcepts.co.uk)
+ *
+ * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG
+ * (Gerhard.Wichert@pdb.siemens.de)
+ *
+ * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
+ */
+
+#include <linux/kernel_stat.h>
+#include <linux/mm.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/coredump.h>
+#include <linux/sched/numa_balancing.h>
+#include <linux/sched/task.h>
+#include <linux/hugetlb.h>
+#include <linux/mman.h>
+#include <linux/swap.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/memremap.h>
+#include <linux/ksm.h>
+#include <linux/rmap.h>
+#include <linux/export.h>
+#include <linux/delayacct.h>
+#include <linux/init.h>
+#include <linux/pfn_t.h>
+#include <linux/writeback.h>
+#include <linux/memcontrol.h>
+#include <linux/mmu_notifier.h>
+#include <linux/swapops.h>
+#include <linux/elf.h>
+#include <linux/gfp.h>
+#include <linux/migrate.h>
+#include <linux/string.h>
+#include <linux/debugfs.h>
+#include <linux/userfaultfd_k.h>
+#include <linux/dax.h>
+#include <linux/oom.h>
+#include <linux/numa.h>
+#include <linux/perf_event.h>
+#include <linux/ptrace.h>
+#include <linux/vmalloc.h>
+
+#include <trace/events/kmem.h>
+
+#include <asm/io.h>
+#include <asm/mmu_context.h>
+#include <asm/pgalloc.h>
+#include <linux/uaccess.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+
+#include "pgalloc-track.h"
+#include "internal.h"
+
+#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
+#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
+#endif
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+/* use the per-pgdat data instead for discontigmem - mbligh */
+unsigned long max_mapnr;
+EXPORT_SYMBOL(max_mapnr);
+
+struct page *mem_map;
+EXPORT_SYMBOL(mem_map);
+#endif
+
+/*
+ * A number of key systems in x86 including ioremap() rely on the assumption
+ * that high_memory defines the upper bound on direct map memory, then end
+ * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and
+ * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
+ * and ZONE_HIGHMEM.
+ */
+void *high_memory;
+EXPORT_SYMBOL(high_memory);
+
+/*
+ * Randomize the address space (stacks, mmaps, brk, etc.).
+ *
+ * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
+ * as ancient (libc5 based) binaries can segfault. )
+ */
+int randomize_va_space __read_mostly =
+#ifdef CONFIG_COMPAT_BRK
+ 1;
+#else
+ 2;
+#endif
+
+#ifndef arch_faults_on_old_pte
+static inline bool arch_faults_on_old_pte(void)
+{
+ /*
+ * Those arches which don't have hw access flag feature need to
+ * implement their own helper. By default, "true" means pagefault
+ * will be hit on old pte.
+ */
+ return true;
+}
+#endif
+
+static int __init disable_randmaps(char *s)
+{
+ randomize_va_space = 0;
+ return 1;
+}
+__setup("norandmaps", disable_randmaps);
+
+unsigned long zero_pfn __read_mostly;
+EXPORT_SYMBOL(zero_pfn);
+
+unsigned long highest_memmap_pfn __read_mostly;
+
+/*
+ * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
+ */
+static int __init init_zero_pfn(void)
+{
+ zero_pfn = page_to_pfn(ZERO_PAGE(0));
+ return 0;
+}
+early_initcall(init_zero_pfn);
+
+void mm_trace_rss_stat(struct mm_struct *mm, int member, long count)
+{
+ trace_rss_stat(mm, member, count);
+}
+
+#if defined(SPLIT_RSS_COUNTING)
+
+void sync_mm_rss(struct mm_struct *mm)
+{
+ int i;
+
+ for (i = 0; i < NR_MM_COUNTERS; i++) {
+ if (current->rss_stat.count[i]) {
+ add_mm_counter(mm, i, current->rss_stat.count[i]);
+ current->rss_stat.count[i] = 0;
+ }
+ }
+ current->rss_stat.events = 0;
+}
+
+static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
+{
+ struct task_struct *task = current;
+
+ if (likely(task->mm == mm))
+ task->rss_stat.count[member] += val;
+ else
+ add_mm_counter(mm, member, val);
+}
+#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
+#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
+
+/* sync counter once per 64 page faults */
+#define TASK_RSS_EVENTS_THRESH (64)
+static void check_sync_rss_stat(struct task_struct *task)
+{
+ if (unlikely(task != current))
+ return;
+ if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
+ sync_mm_rss(task->mm);
+}
+#else /* SPLIT_RSS_COUNTING */
+
+#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
+#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
+
+static void check_sync_rss_stat(struct task_struct *task)
+{
+}
+
+#endif /* SPLIT_RSS_COUNTING */
+
+/*
+ * Note: this doesn't free the actual pages themselves. That
+ * has been handled earlier when unmapping all the memory regions.
+ */
+static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
+ unsigned long addr)
+{
+ pgtable_t token = pmd_pgtable(*pmd);
+ pmd_clear(pmd);
+ pte_free_tlb(tlb, token, addr);
+ mm_dec_nr_ptes(tlb->mm);
+}
+
+static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
+ unsigned long addr, unsigned long end,
+ unsigned long floor, unsigned long ceiling)
+{
+ pmd_t *pmd;
+ unsigned long next;
+ unsigned long start;
+
+ start = addr;
+ pmd = pmd_offset(pud, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+ if (pmd_none_or_clear_bad(pmd))
+ continue;
+ free_pte_range(tlb, pmd, addr);
+ } while (pmd++, addr = next, addr != end);
+
+ start &= PUD_MASK;
+ if (start < floor)
+ return;
+ if (ceiling) {
+ ceiling &= PUD_MASK;
+ if (!ceiling)
+ return;
+ }
+ if (end - 1 > ceiling - 1)
+ return;
+
+ pmd = pmd_offset(pud, start);
+ pud_clear(pud);
+ pmd_free_tlb(tlb, pmd, start);
+ mm_dec_nr_pmds(tlb->mm);
+}
+
+static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
+ unsigned long addr, unsigned long end,
+ unsigned long floor, unsigned long ceiling)
+{
+ pud_t *pud;
+ unsigned long next;
+ unsigned long start;
+
+ start = addr;
+ pud = pud_offset(p4d, addr);
+ do {
+ next = pud_addr_end(addr, end);
+ if (pud_none_or_clear_bad(pud))
+ continue;
+ free_pmd_range(tlb, pud, addr, next, floor, ceiling);
+ } while (pud++, addr = next, addr != end);
+
+ start &= P4D_MASK;
+ if (start < floor)
+ return;
+ if (ceiling) {
+ ceiling &= P4D_MASK;
+ if (!ceiling)
+ return;
+ }
+ if (end - 1 > ceiling - 1)
+ return;
+
+ pud = pud_offset(p4d, start);
+ p4d_clear(p4d);
+ pud_free_tlb(tlb, pud, start);
+ mm_dec_nr_puds(tlb->mm);
+}
+
+static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
+ unsigned long addr, unsigned long end,
+ unsigned long floor, unsigned long ceiling)
+{
+ p4d_t *p4d;
+ unsigned long next;
+ unsigned long start;
+
+ start = addr;
+ p4d = p4d_offset(pgd, addr);
+ do {
+ next = p4d_addr_end(addr, end);
+ if (p4d_none_or_clear_bad(p4d))
+ continue;
+ free_pud_range(tlb, p4d, addr, next, floor, ceiling);
+ } while (p4d++, addr = next, addr != end);
+
+ start &= PGDIR_MASK;
+ if (start < floor)
+ return;
+ if (ceiling) {
+ ceiling &= PGDIR_MASK;
+ if (!ceiling)
+ return;
+ }
+ if (end - 1 > ceiling - 1)
+ return;
+
+ p4d = p4d_offset(pgd, start);
+ pgd_clear(pgd);
+ p4d_free_tlb(tlb, p4d, start);
+}
+
+/*
+ * This function frees user-level page tables of a process.
+ */
+void free_pgd_range(struct mmu_gather *tlb,
+ unsigned long addr, unsigned long end,
+ unsigned long floor, unsigned long ceiling)
+{
+ pgd_t *pgd;
+ unsigned long next;
+
+ /*
+ * The next few lines have given us lots of grief...
+ *
+ * Why are we testing PMD* at this top level? Because often
+ * there will be no work to do at all, and we'd prefer not to
+ * go all the way down to the bottom just to discover that.
+ *
+ * Why all these "- 1"s? Because 0 represents both the bottom
+ * of the address space and the top of it (using -1 for the
+ * top wouldn't help much: the masks would do the wrong thing).
+ * The rule is that addr 0 and floor 0 refer to the bottom of
+ * the address space, but end 0 and ceiling 0 refer to the top
+ * Comparisons need to use "end - 1" and "ceiling - 1" (though
+ * that end 0 case should be mythical).
+ *
+ * Wherever addr is brought up or ceiling brought down, we must
+ * be careful to reject "the opposite 0" before it confuses the
+ * subsequent tests. But what about where end is brought down
+ * by PMD_SIZE below? no, end can't go down to 0 there.
+ *
+ * Whereas we round start (addr) and ceiling down, by different
+ * masks at different levels, in order to test whether a table
+ * now has no other vmas using it, so can be freed, we don't
+ * bother to round floor or end up - the tests don't need that.
+ */
+
+ addr &= PMD_MASK;
+ if (addr < floor) {
+ addr += PMD_SIZE;
+ if (!addr)
+ return;
+ }
+ if (ceiling) {
+ ceiling &= PMD_MASK;
+ if (!ceiling)
+ return;
+ }
+ if (end - 1 > ceiling - 1)
+ end -= PMD_SIZE;
+ if (addr > end - 1)
+ return;
+ /*
+ * We add page table cache pages with PAGE_SIZE,
+ * (see pte_free_tlb()), flush the tlb if we need
+ */
+ tlb_change_page_size(tlb, PAGE_SIZE);
+ pgd = pgd_offset(tlb->mm, addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (pgd_none_or_clear_bad(pgd))
+ continue;
+ free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
+ } while (pgd++, addr = next, addr != end);
+}
+
+void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long floor, unsigned long ceiling)
+{
+ while (vma) {
+ struct vm_area_struct *next = vma->vm_next;
+ unsigned long addr = vma->vm_start;
+
+ /*
+ * Hide vma from rmap and truncate_pagecache before freeing
+ * pgtables
+ */
+ unlink_anon_vmas(vma);
+ unlink_file_vma(vma);
+
+ if (is_vm_hugetlb_page(vma)) {
+ hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
+ floor, next ? next->vm_start : ceiling);
+ } else {
+ /*
+ * Optimization: gather nearby vmas into one call down
+ */
+ while (next && next->vm_start <= vma->vm_end + PMD_SIZE
+ && !is_vm_hugetlb_page(next)) {
+ vma = next;
+ next = vma->vm_next;
+ unlink_anon_vmas(vma);
+ unlink_file_vma(vma);
+ }
+ free_pgd_range(tlb, addr, vma->vm_end,
+ floor, next ? next->vm_start : ceiling);
+ }
+ vma = next;
+ }
+}
+
+int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
+{
+ spinlock_t *ptl;
+ pgtable_t new = pte_alloc_one(mm);
+ if (!new)
+ return -ENOMEM;
+
+ /*
+ * Ensure all pte setup (eg. pte page lock and page clearing) are
+ * visible before the pte is made visible to other CPUs by being
+ * put into page tables.
+ *
+ * The other side of the story is the pointer chasing in the page
+ * table walking code (when walking the page table without locking;
+ * ie. most of the time). Fortunately, these data accesses consist
+ * of a chain of data-dependent loads, meaning most CPUs (alpha
+ * being the notable exception) will already guarantee loads are
+ * seen in-order. See the alpha page table accessors for the
+ * smp_rmb() barriers in page table walking code.
+ */
+ smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
+
+ ptl = pmd_lock(mm, pmd);
+ if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
+ mm_inc_nr_ptes(mm);
+ pmd_populate(mm, pmd, new);
+ new = NULL;
+ }
+ spin_unlock(ptl);
+ if (new)
+ pte_free(mm, new);
+ return 0;
+}
+
+int __pte_alloc_kernel(pmd_t *pmd)
+{
+ pte_t *new = pte_alloc_one_kernel(&init_mm);
+ if (!new)
+ return -ENOMEM;
+
+ smp_wmb(); /* See comment in __pte_alloc */
+
+ spin_lock(&init_mm.page_table_lock);
+ if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
+ pmd_populate_kernel(&init_mm, pmd, new);
+ new = NULL;
+ }
+ spin_unlock(&init_mm.page_table_lock);
+ if (new)
+ pte_free_kernel(&init_mm, new);
+ return 0;
+}
+
+static inline void init_rss_vec(int *rss)
+{
+ memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
+}
+
+static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
+{
+ int i;
+
+ if (current->mm == mm)
+ sync_mm_rss(mm);
+ for (i = 0; i < NR_MM_COUNTERS; i++)
+ if (rss[i])
+ add_mm_counter(mm, i, rss[i]);
+}
+
+/*
+ * This function is called to print an error when a bad pte
+ * is found. For example, we might have a PFN-mapped pte in
+ * a region that doesn't allow it.
+ *
+ * The calling function must still handle the error.
+ */
+static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte, struct page *page)
+{
+ pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
+ p4d_t *p4d = p4d_offset(pgd, addr);
+ pud_t *pud = pud_offset(p4d, addr);
+ pmd_t *pmd = pmd_offset(pud, addr);
+ struct address_space *mapping;
+ pgoff_t index;
+ static unsigned long resume;
+ static unsigned long nr_shown;
+ static unsigned long nr_unshown;
+
+ /*
+ * Allow a burst of 60 reports, then keep quiet for that minute;
+ * or allow a steady drip of one report per second.
+ */
+ if (nr_shown == 60) {
+ if (time_before(jiffies, resume)) {
+ nr_unshown++;
+ return;
+ }
+ if (nr_unshown) {
+ pr_alert("BUG: Bad page map: %lu messages suppressed\n",
+ nr_unshown);
+ nr_unshown = 0;
+ }
+ nr_shown = 0;
+ }
+ if (nr_shown++ == 0)
+ resume = jiffies + 60 * HZ;
+
+ mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
+ index = linear_page_index(vma, addr);
+
+ pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
+ current->comm,
+ (long long)pte_val(pte), (long long)pmd_val(*pmd));
+ if (page)
+ dump_page(page, "bad pte");
+ pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
+ (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
+ pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n",
+ vma->vm_file,
+ vma->vm_ops ? vma->vm_ops->fault : NULL,
+ vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
+ mapping ? mapping->a_ops->readpage : NULL);
+ dump_stack();
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+}
+
+/*
+ * vm_normal_page -- This function gets the "struct page" associated with a pte.
+ *
+ * "Special" mappings do not wish to be associated with a "struct page" (either
+ * it doesn't exist, or it exists but they don't want to touch it). In this
+ * case, NULL is returned here. "Normal" mappings do have a struct page.
+ *
+ * There are 2 broad cases. Firstly, an architecture may define a pte_special()
+ * pte bit, in which case this function is trivial. Secondly, an architecture
+ * may not have a spare pte bit, which requires a more complicated scheme,
+ * described below.
+ *
+ * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
+ * special mapping (even if there are underlying and valid "struct pages").
+ * COWed pages of a VM_PFNMAP are always normal.
+ *
+ * The way we recognize COWed pages within VM_PFNMAP mappings is through the
+ * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
+ * set, and the vm_pgoff will point to the first PFN mapped: thus every special
+ * mapping will always honor the rule
+ *
+ * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
+ *
+ * And for normal mappings this is false.
+ *
+ * This restricts such mappings to be a linear translation from virtual address
+ * to pfn. To get around this restriction, we allow arbitrary mappings so long
+ * as the vma is not a COW mapping; in that case, we know that all ptes are
+ * special (because none can have been COWed).
+ *
+ *
+ * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
+ *
+ * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
+ * page" backing, however the difference is that _all_ pages with a struct
+ * page (that is, those where pfn_valid is true) are refcounted and considered
+ * normal pages by the VM. The disadvantage is that pages are refcounted
+ * (which can be slower and simply not an option for some PFNMAP users). The
+ * advantage is that we don't have to follow the strict linearity rule of
+ * PFNMAP mappings in order to support COWable mappings.
+ *
+ */
+struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte)
+{
+ unsigned long pfn = pte_pfn(pte);
+
+ if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
+ if (likely(!pte_special(pte)))
+ goto check_pfn;
+ if (vma->vm_ops && vma->vm_ops->find_special_page)
+ return vma->vm_ops->find_special_page(vma, addr);
+ if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
+ return NULL;
+ if (is_zero_pfn(pfn))
+ return NULL;
+ if (pte_devmap(pte))
+ return NULL;
+
+ print_bad_pte(vma, addr, pte, NULL);
+ return NULL;
+ }
+
+ /* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */
+
+ if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
+ if (vma->vm_flags & VM_MIXEDMAP) {
+ if (!pfn_valid(pfn))
+ return NULL;
+ goto out;
+ } else {
+ unsigned long off;
+ off = (addr - vma->vm_start) >> PAGE_SHIFT;
+ if (pfn == vma->vm_pgoff + off)
+ return NULL;
+ if (!is_cow_mapping(vma->vm_flags))
+ return NULL;
+ }
+ }
+
+ if (is_zero_pfn(pfn))
+ return NULL;
+
+check_pfn:
+ if (unlikely(pfn > highest_memmap_pfn)) {
+ print_bad_pte(vma, addr, pte, NULL);
+ return NULL;
+ }
+
+ /*
+ * NOTE! We still have PageReserved() pages in the page tables.
+ * eg. VDSO mappings can cause them to exist.
+ */
+out:
+ return pfn_to_page(pfn);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t pmd)
+{
+ unsigned long pfn = pmd_pfn(pmd);
+
+ /*
+ * There is no pmd_special() but there may be special pmds, e.g.
+ * in a direct-access (dax) mapping, so let's just replicate the
+ * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here.
+ */
+ if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
+ if (vma->vm_flags & VM_MIXEDMAP) {
+ if (!pfn_valid(pfn))
+ return NULL;
+ goto out;
+ } else {
+ unsigned long off;
+ off = (addr - vma->vm_start) >> PAGE_SHIFT;
+ if (pfn == vma->vm_pgoff + off)
+ return NULL;
+ if (!is_cow_mapping(vma->vm_flags))
+ return NULL;
+ }
+ }
+
+ if (pmd_devmap(pmd))
+ return NULL;
+ if (is_huge_zero_pmd(pmd))
+ return NULL;
+ if (unlikely(pfn > highest_memmap_pfn))
+ return NULL;
+
+ /*
+ * NOTE! We still have PageReserved() pages in the page tables.
+ * eg. VDSO mappings can cause them to exist.
+ */
+out:
+ return pfn_to_page(pfn);
+}
+#endif
+
+/*
+ * copy one vm_area from one task to the other. Assumes the page tables
+ * already present in the new task to be cleared in the whole range
+ * covered by this vma.
+ */
+
+static unsigned long
+copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma, unsigned long addr, int *rss)
+{
+ unsigned long vm_flags = dst_vma->vm_flags;
+ pte_t pte = *src_pte;
+ struct page *page;
+ swp_entry_t entry = pte_to_swp_entry(pte);
+
+ if (likely(!non_swap_entry(entry))) {
+ if (swap_duplicate(entry) < 0)
+ return entry.val;
+
+ /* make sure dst_mm is on swapoff's mmlist. */
+ if (unlikely(list_empty(&dst_mm->mmlist))) {
+ spin_lock(&mmlist_lock);
+ if (list_empty(&dst_mm->mmlist))
+ list_add(&dst_mm->mmlist,
+ &src_mm->mmlist);
+ spin_unlock(&mmlist_lock);
+ }
+ rss[MM_SWAPENTS]++;
+ } else if (is_migration_entry(entry)) {
+ page = migration_entry_to_page(entry);
+
+ rss[mm_counter(page)]++;
+
+ if (is_write_migration_entry(entry) &&
+ is_cow_mapping(vm_flags)) {
+ /*
+ * COW mappings require pages in both
+ * parent and child to be set to read.
+ */
+ make_migration_entry_read(&entry);
+ pte = swp_entry_to_pte(entry);
+ if (pte_swp_soft_dirty(*src_pte))
+ pte = pte_swp_mksoft_dirty(pte);
+ if (pte_swp_uffd_wp(*src_pte))
+ pte = pte_swp_mkuffd_wp(pte);
+ set_pte_at(src_mm, addr, src_pte, pte);
+ }
+ } else if (is_device_private_entry(entry)) {
+ page = device_private_entry_to_page(entry);
+
+ /*
+ * Update rss count even for unaddressable pages, as
+ * they should treated just like normal pages in this
+ * respect.
+ *
+ * We will likely want to have some new rss counters
+ * for unaddressable pages, at some point. But for now
+ * keep things as they are.
+ */
+ get_page(page);
+ rss[mm_counter(page)]++;
+ page_dup_rmap(page, false);
+
+ /*
+ * We do not preserve soft-dirty information, because so
+ * far, checkpoint/restore is the only feature that
+ * requires that. And checkpoint/restore does not work
+ * when a device driver is involved (you cannot easily
+ * save and restore device driver state).
+ */
+ if (is_write_device_private_entry(entry) &&
+ is_cow_mapping(vm_flags)) {
+ make_device_private_entry_read(&entry);
+ pte = swp_entry_to_pte(entry);
+ if (pte_swp_uffd_wp(*src_pte))
+ pte = pte_swp_mkuffd_wp(pte);
+ set_pte_at(src_mm, addr, src_pte, pte);
+ }
+ }
+ if (!userfaultfd_wp(dst_vma))
+ pte = pte_swp_clear_uffd_wp(pte);
+ set_pte_at(dst_mm, addr, dst_pte, pte);
+ return 0;
+}
+
+/*
+ * Copy a present and normal page if necessary.
+ *
+ * NOTE! The usual case is that this doesn't need to do
+ * anything, and can just return a positive value. That
+ * will let the caller know that it can just increase
+ * the page refcount and re-use the pte the traditional
+ * way.
+ *
+ * But _if_ we need to copy it because it needs to be
+ * pinned in the parent (and the child should get its own
+ * copy rather than just a reference to the same page),
+ * we'll do that here and return zero to let the caller
+ * know we're done.
+ *
+ * And if we need a pre-allocated page but don't yet have
+ * one, return a negative error to let the preallocation
+ * code know so that it can do so outside the page table
+ * lock.
+ */
+static inline int
+copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+ pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
+ struct page **prealloc, pte_t pte, struct page *page)
+{
+ struct mm_struct *src_mm = src_vma->vm_mm;
+ struct page *new_page;
+
+ if (!is_cow_mapping(src_vma->vm_flags))
+ return 1;
+
+ /*
+ * What we want to do is to check whether this page may
+ * have been pinned by the parent process. If so,
+ * instead of wrprotect the pte on both sides, we copy
+ * the page immediately so that we'll always guarantee
+ * the pinned page won't be randomly replaced in the
+ * future.
+ *
+ * The page pinning checks are just "has this mm ever
+ * seen pinning", along with the (inexact) check of
+ * the page count. That might give false positives for
+ * for pinning, but it will work correctly.
+ */
+ if (likely(!atomic_read(&src_mm->has_pinned)))
+ return 1;
+ if (likely(!page_maybe_dma_pinned(page)))
+ return 1;
+
+ /*
+ * The vma->anon_vma of the child process may be NULL
+ * because the entire vma does not contain anonymous pages.
+ * A BUG will occur when the copy_present_page() passes
+ * a copy of a non-anonymous page of that vma to the
+ * page_add_new_anon_rmap() to set up new anonymous rmap.
+ * Return 1 if the page is not an anonymous page.
+ */
+ if (!PageAnon(page))
+ return 1;
+
+ new_page = *prealloc;
+ if (!new_page)
+ return -EAGAIN;
+
+ /*
+ * We have a prealloc page, all good! Take it
+ * over and copy the page & arm it.
+ */
+ *prealloc = NULL;
+ copy_user_highpage(new_page, page, addr, src_vma);
+ __SetPageUptodate(new_page);
+ page_add_new_anon_rmap(new_page, dst_vma, addr, false);
+ lru_cache_add_inactive_or_unevictable(new_page, dst_vma);
+ rss[mm_counter(new_page)]++;
+
+ /* All done, just insert the new page copy in the child */
+ pte = mk_pte(new_page, dst_vma->vm_page_prot);
+ pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
+ if (userfaultfd_pte_wp(dst_vma, *src_pte))
+ /* Uffd-wp needs to be delivered to dest pte as well */
+ pte = pte_wrprotect(pte_mkuffd_wp(pte));
+ set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
+ return 0;
+}
+
+/*
+ * Copy one pte. Returns 0 if succeeded, or -EAGAIN if one preallocated page
+ * is required to copy this pte.
+ */
+static inline int
+copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+ pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
+ struct page **prealloc)
+{
+ struct mm_struct *src_mm = src_vma->vm_mm;
+ unsigned long vm_flags = src_vma->vm_flags;
+ pte_t pte = *src_pte;
+ struct page *page;
+
+ page = vm_normal_page(src_vma, addr, pte);
+ if (page) {
+ int retval;
+
+ retval = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
+ addr, rss, prealloc, pte, page);
+ if (retval <= 0)
+ return retval;
+
+ get_page(page);
+ page_dup_rmap(page, false);
+ rss[mm_counter(page)]++;
+ }
+
+ /*
+ * If it's a COW mapping, write protect it both
+ * in the parent and the child
+ */
+ if (is_cow_mapping(vm_flags) && pte_write(pte)) {
+ ptep_set_wrprotect(src_mm, addr, src_pte);
+ pte = pte_wrprotect(pte);
+ }
+
+ /*
+ * If it's a shared mapping, mark it clean in
+ * the child
+ */
+ if (vm_flags & VM_SHARED)
+ pte = pte_mkclean(pte);
+ pte = pte_mkold(pte);
+
+ if (!userfaultfd_wp(dst_vma))
+ pte = pte_clear_uffd_wp(pte);
+
+ set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
+ return 0;
+}
+
+static inline struct page *
+page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ struct page *new_page;
+
+ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
+ if (!new_page)
+ return NULL;
+
+ if (mem_cgroup_charge(new_page, src_mm, GFP_KERNEL)) {
+ put_page(new_page);
+ return NULL;
+ }
+ cgroup_throttle_swaprate(new_page, GFP_KERNEL);
+
+ return new_page;
+}
+
+static int
+copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+ pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
+ unsigned long end)
+{
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
+ struct mm_struct *src_mm = src_vma->vm_mm;
+ pte_t *orig_src_pte, *orig_dst_pte;
+ pte_t *src_pte, *dst_pte;
+ spinlock_t *src_ptl, *dst_ptl;
+ int progress, ret = 0;
+ int rss[NR_MM_COUNTERS];
+ swp_entry_t entry = (swp_entry_t){0};
+ struct page *prealloc = NULL;
+
+again:
+ progress = 0;
+ init_rss_vec(rss);
+
+ dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
+ if (!dst_pte) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ src_pte = pte_offset_map(src_pmd, addr);
+ src_ptl = pte_lockptr(src_mm, src_pmd);
+ spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+ orig_src_pte = src_pte;
+ orig_dst_pte = dst_pte;
+ arch_enter_lazy_mmu_mode();
+
+ do {
+ /*
+ * We are holding two locks at this point - either of them
+ * could generate latencies in another task on another CPU.
+ */
+ if (progress >= 32) {
+ progress = 0;
+ if (need_resched() ||
+ spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
+ break;
+ }
+ if (pte_none(*src_pte)) {
+ progress++;
+ continue;
+ }
+ if (unlikely(!pte_present(*src_pte))) {
+ entry.val = copy_nonpresent_pte(dst_mm, src_mm,
+ dst_pte, src_pte,
+ dst_vma, src_vma,
+ addr, rss);
+ if (entry.val)
+ break;
+ progress += 8;
+ continue;
+ }
+ /* copy_present_pte() will clear `*prealloc' if consumed */
+ ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
+ addr, rss, &prealloc);
+ /*
+ * If we need a pre-allocated page for this pte, drop the
+ * locks, allocate, and try again.
+ */
+ if (unlikely(ret == -EAGAIN))
+ break;
+ if (unlikely(prealloc)) {
+ /*
+ * pre-alloc page cannot be reused by next time so as
+ * to strictly follow mempolicy (e.g., alloc_page_vma()
+ * will allocate page according to address). This
+ * could only happen if one pinned pte changed.
+ */
+ put_page(prealloc);
+ prealloc = NULL;
+ }
+ progress += 8;
+ } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
+
+ arch_leave_lazy_mmu_mode();
+ spin_unlock(src_ptl);
+ pte_unmap(orig_src_pte);
+ add_mm_rss_vec(dst_mm, rss);
+ pte_unmap_unlock(orig_dst_pte, dst_ptl);
+ cond_resched();
+
+ if (entry.val) {
+ if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ entry.val = 0;
+ } else if (ret) {
+ WARN_ON_ONCE(ret != -EAGAIN);
+ prealloc = page_copy_prealloc(src_mm, src_vma, addr);
+ if (!prealloc)
+ return -ENOMEM;
+ /* We've captured and resolved the error. Reset, try again. */
+ ret = 0;
+ }
+ if (addr != end)
+ goto again;
+out:
+ if (unlikely(prealloc))
+ put_page(prealloc);
+ return ret;
+}
+
+static inline int
+copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+ pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
+ unsigned long end)
+{
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
+ struct mm_struct *src_mm = src_vma->vm_mm;
+ pmd_t *src_pmd, *dst_pmd;
+ unsigned long next;
+
+ dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
+ if (!dst_pmd)
+ return -ENOMEM;
+ src_pmd = pmd_offset(src_pud, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+ if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
+ || pmd_devmap(*src_pmd)) {
+ int err;
+ VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
+ err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
+ addr, dst_vma, src_vma);
+ if (err == -ENOMEM)
+ return -ENOMEM;
+ if (!err)
+ continue;
+ /* fall through */
+ }
+ if (pmd_none_or_clear_bad(src_pmd))
+ continue;
+ if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd,
+ addr, next))
+ return -ENOMEM;
+ } while (dst_pmd++, src_pmd++, addr = next, addr != end);
+ return 0;
+}
+
+static inline int
+copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+ p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr,
+ unsigned long end)
+{
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
+ struct mm_struct *src_mm = src_vma->vm_mm;
+ pud_t *src_pud, *dst_pud;
+ unsigned long next;
+
+ dst_pud = pud_alloc(dst_mm, dst_p4d, addr);
+ if (!dst_pud)
+ return -ENOMEM;
+ src_pud = pud_offset(src_p4d, addr);
+ do {
+ next = pud_addr_end(addr, end);
+ if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
+ int err;
+
+ VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma);
+ err = copy_huge_pud(dst_mm, src_mm,
+ dst_pud, src_pud, addr, src_vma);
+ if (err == -ENOMEM)
+ return -ENOMEM;
+ if (!err)
+ continue;
+ /* fall through */
+ }
+ if (pud_none_or_clear_bad(src_pud))
+ continue;
+ if (copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud,
+ addr, next))
+ return -ENOMEM;
+ } while (dst_pud++, src_pud++, addr = next, addr != end);
+ return 0;
+}
+
+static inline int
+copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+ pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr,
+ unsigned long end)
+{
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
+ p4d_t *src_p4d, *dst_p4d;
+ unsigned long next;
+
+ dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr);
+ if (!dst_p4d)
+ return -ENOMEM;
+ src_p4d = p4d_offset(src_pgd, addr);
+ do {
+ next = p4d_addr_end(addr, end);
+ if (p4d_none_or_clear_bad(src_p4d))
+ continue;
+ if (copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d,
+ addr, next))
+ return -ENOMEM;
+ } while (dst_p4d++, src_p4d++, addr = next, addr != end);
+ return 0;
+}
+
+int
+copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
+{
+ pgd_t *src_pgd, *dst_pgd;
+ unsigned long next;
+ unsigned long addr = src_vma->vm_start;
+ unsigned long end = src_vma->vm_end;
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
+ struct mm_struct *src_mm = src_vma->vm_mm;
+ struct mmu_notifier_range range;
+ bool is_cow;
+ int ret;
+
+ /*
+ * Don't copy ptes where a page fault will fill them correctly.
+ * Fork becomes much lighter when there are big shared or private
+ * readonly mappings. The tradeoff is that copy_page_range is more
+ * efficient than faulting.
+ */
+ if (!(src_vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
+ !src_vma->anon_vma)
+ return 0;
+
+ if (is_vm_hugetlb_page(src_vma))
+ return copy_hugetlb_page_range(dst_mm, src_mm, src_vma);
+
+ if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
+ /*
+ * We do not free on error cases below as remove_vma
+ * gets called on error from higher level routine
+ */
+ ret = track_pfn_copy(src_vma);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * We need to invalidate the secondary MMU mappings only when
+ * there could be a permission downgrade on the ptes of the
+ * parent mm. And a permission downgrade will only happen if
+ * is_cow_mapping() returns true.
+ */
+ is_cow = is_cow_mapping(src_vma->vm_flags);
+
+ if (is_cow) {
+ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
+ 0, src_vma, src_mm, addr, end);
+ mmu_notifier_invalidate_range_start(&range);
+ /*
+ * Disabling preemption is not needed for the write side, as
+ * the read side doesn't spin, but goes to the mmap_lock.
+ *
+ * Use the raw variant of the seqcount_t write API to avoid
+ * lockdep complaining about preemptibility.
+ */
+ mmap_assert_write_locked(src_mm);
+ raw_write_seqcount_begin(&src_mm->write_protect_seq);
+ }
+
+ ret = 0;
+ dst_pgd = pgd_offset(dst_mm, addr);
+ src_pgd = pgd_offset(src_mm, addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (pgd_none_or_clear_bad(src_pgd))
+ continue;
+ if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
+ addr, next))) {
+ ret = -ENOMEM;
+ break;
+ }
+ } while (dst_pgd++, src_pgd++, addr = next, addr != end);
+
+ if (is_cow) {
+ raw_write_seqcount_end(&src_mm->write_protect_seq);
+ mmu_notifier_invalidate_range_end(&range);
+ }
+ return ret;
+}
+
+/* Whether we should zap all COWed (private) pages too */
+static inline bool should_zap_cows(struct zap_details *details)
+{
+ /* By default, zap all pages */
+ if (!details)
+ return true;
+
+ /* Or, we zap COWed pages only if the caller wants to */
+ return !details->check_mapping;
+}
+
+static unsigned long zap_pte_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ struct zap_details *details)
+{
+ struct mm_struct *mm = tlb->mm;
+ int force_flush = 0;
+ int rss[NR_MM_COUNTERS];
+ spinlock_t *ptl;
+ pte_t *start_pte;
+ pte_t *pte;
+ swp_entry_t entry;
+
+ tlb_change_page_size(tlb, PAGE_SIZE);
+again:
+ init_rss_vec(rss);
+ start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ pte = start_pte;
+ flush_tlb_batched_pending(mm);
+ arch_enter_lazy_mmu_mode();
+ do {
+ pte_t ptent = *pte;
+ if (pte_none(ptent))
+ continue;
+
+ if (need_resched())
+ break;
+
+ if (pte_present(ptent)) {
+ struct page *page;
+
+ page = vm_normal_page(vma, addr, ptent);
+ if (unlikely(details) && page) {
+ /*
+ * unmap_shared_mapping_pages() wants to
+ * invalidate cache without truncating:
+ * unmap shared but keep private pages.
+ */
+ if (details->check_mapping &&
+ details->check_mapping != page_rmapping(page))
+ continue;
+ }
+ ptent = ptep_get_and_clear_full(mm, addr, pte,
+ tlb->fullmm);
+ tlb_remove_tlb_entry(tlb, pte, addr);
+ if (unlikely(!page))
+ continue;
+
+ if (!PageAnon(page)) {
+ if (pte_dirty(ptent)) {
+ force_flush = 1;
+ set_page_dirty(page);
+ }
+ if (pte_young(ptent) &&
+ likely(!(vma->vm_flags & VM_SEQ_READ)))
+ mark_page_accessed(page);
+ }
+ rss[mm_counter(page)]--;
+ page_remove_rmap(page, false);
+ if (unlikely(page_mapcount(page) < 0))
+ print_bad_pte(vma, addr, ptent, page);
+ if (unlikely(__tlb_remove_page(tlb, page))) {
+ force_flush = 1;
+ addr += PAGE_SIZE;
+ break;
+ }
+ continue;
+ }
+
+ entry = pte_to_swp_entry(ptent);
+ if (is_device_private_entry(entry)) {
+ struct page *page = device_private_entry_to_page(entry);
+
+ if (unlikely(details && details->check_mapping)) {
+ /*
+ * unmap_shared_mapping_pages() wants to
+ * invalidate cache without truncating:
+ * unmap shared but keep private pages.
+ */
+ if (details->check_mapping !=
+ page_rmapping(page))
+ continue;
+ }
+
+ pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ rss[mm_counter(page)]--;
+ page_remove_rmap(page, false);
+ put_page(page);
+ continue;
+ }
+
+ if (!non_swap_entry(entry)) {
+ /* Genuine swap entry, hence a private anon page */
+ if (!should_zap_cows(details))
+ continue;
+ rss[MM_SWAPENTS]--;
+ } else if (is_migration_entry(entry)) {
+ struct page *page;
+
+ page = migration_entry_to_page(entry);
+ if (details && details->check_mapping &&
+ details->check_mapping != page_rmapping(page))
+ continue;
+ rss[mm_counter(page)]--;
+ }
+ if (unlikely(!free_swap_and_cache(entry)))
+ print_bad_pte(vma, addr, ptent, NULL);
+ pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+
+ add_mm_rss_vec(mm, rss);
+ arch_leave_lazy_mmu_mode();
+
+ /* Do the actual TLB flush before dropping ptl */
+ if (force_flush)
+ tlb_flush_mmu_tlbonly(tlb);
+ pte_unmap_unlock(start_pte, ptl);
+
+ /*
+ * If we forced a TLB flush (either due to running out of
+ * batch buffers or because we needed to flush dirty TLB
+ * entries before releasing the ptl), free the batched
+ * memory too. Restart if we didn't do everything.
+ */
+ if (force_flush) {
+ force_flush = 0;
+ tlb_flush_mmu(tlb);
+ }
+
+ if (addr != end) {
+ cond_resched();
+ goto again;
+ }
+
+ return addr;
+}
+
+static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pud_t *pud,
+ unsigned long addr, unsigned long end,
+ struct zap_details *details)
+{
+ pmd_t *pmd;
+ unsigned long next;
+
+ pmd = pmd_offset(pud, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+ if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
+ if (next - addr != HPAGE_PMD_SIZE)
+ __split_huge_pmd(vma, pmd, addr, false, NULL);
+ else if (zap_huge_pmd(tlb, vma, pmd, addr))
+ goto next;
+ /* fall through */
+ } else if (details && details->single_page &&
+ PageTransCompound(details->single_page) &&
+ next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
+ spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
+ /*
+ * Take and drop THP pmd lock so that we cannot return
+ * prematurely, while zap_huge_pmd() has cleared *pmd,
+ * but not yet decremented compound_mapcount().
+ */
+ spin_unlock(ptl);
+ }
+
+ /*
+ * Here there can be other concurrent MADV_DONTNEED or
+ * trans huge page faults running, and if the pmd is
+ * none or trans huge it can change under us. This is
+ * because MADV_DONTNEED holds the mmap_lock in read
+ * mode.
+ */
+ if (pmd_none_or_trans_huge_or_clear_bad(pmd))
+ goto next;
+ next = zap_pte_range(tlb, vma, pmd, addr, next, details);
+next:
+ cond_resched();
+ } while (pmd++, addr = next, addr != end);
+
+ return addr;
+}
+
+static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, p4d_t *p4d,
+ unsigned long addr, unsigned long end,
+ struct zap_details *details)
+{
+ pud_t *pud;
+ unsigned long next;
+
+ pud = pud_offset(p4d, addr);
+ do {
+ next = pud_addr_end(addr, end);
+ if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
+ if (next - addr != HPAGE_PUD_SIZE) {
+ mmap_assert_locked(tlb->mm);
+ split_huge_pud(vma, pud, addr);
+ } else if (zap_huge_pud(tlb, vma, pud, addr))
+ goto next;
+ /* fall through */
+ }
+ if (pud_none_or_clear_bad(pud))
+ continue;
+ next = zap_pmd_range(tlb, vma, pud, addr, next, details);
+next:
+ cond_resched();
+ } while (pud++, addr = next, addr != end);
+
+ return addr;
+}
+
+static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pgd_t *pgd,
+ unsigned long addr, unsigned long end,
+ struct zap_details *details)
+{
+ p4d_t *p4d;
+ unsigned long next;
+
+ p4d = p4d_offset(pgd, addr);
+ do {
+ next = p4d_addr_end(addr, end);
+ if (p4d_none_or_clear_bad(p4d))
+ continue;
+ next = zap_pud_range(tlb, vma, p4d, addr, next, details);
+ } while (p4d++, addr = next, addr != end);
+
+ return addr;
+}
+
+void unmap_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end,
+ struct zap_details *details)
+{
+ pgd_t *pgd;
+ unsigned long next;
+
+ BUG_ON(addr >= end);
+ tlb_start_vma(tlb, vma);
+ pgd = pgd_offset(vma->vm_mm, addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (pgd_none_or_clear_bad(pgd))
+ continue;
+ next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
+ } while (pgd++, addr = next, addr != end);
+ tlb_end_vma(tlb, vma);
+}
+
+
+static void unmap_single_vma(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long start_addr,
+ unsigned long end_addr,
+ struct zap_details *details)
+{
+ unsigned long start = max(vma->vm_start, start_addr);
+ unsigned long end;
+
+ if (start >= vma->vm_end)
+ return;
+ end = min(vma->vm_end, end_addr);
+ if (end <= vma->vm_start)
+ return;
+
+ if (vma->vm_file)
+ uprobe_munmap(vma, start, end);
+
+ if (unlikely(vma->vm_flags & VM_PFNMAP))
+ untrack_pfn(vma, 0, 0);
+
+ if (start != end) {
+ if (unlikely(is_vm_hugetlb_page(vma))) {
+ /*
+ * It is undesirable to test vma->vm_file as it
+ * should be non-null for valid hugetlb area.
+ * However, vm_file will be NULL in the error
+ * cleanup path of mmap_region. When
+ * hugetlbfs ->mmap method fails,
+ * mmap_region() nullifies vma->vm_file
+ * before calling this function to clean up.
+ * Since no pte has actually been setup, it is
+ * safe to do nothing in this case.
+ */
+ if (vma->vm_file) {
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+ __unmap_hugepage_range_final(tlb, vma, start, end, NULL);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ }
+ } else
+ unmap_page_range(tlb, vma, start, end, details);
+ }
+}
+
+/**
+ * unmap_vmas - unmap a range of memory covered by a list of vma's
+ * @tlb: address of the caller's struct mmu_gather
+ * @vma: the starting vma
+ * @start_addr: virtual address at which to start unmapping
+ * @end_addr: virtual address at which to end unmapping
+ *
+ * Unmap all pages in the vma list.
+ *
+ * Only addresses between `start' and `end' will be unmapped.
+ *
+ * The VMA list must be sorted in ascending virtual address order.
+ *
+ * unmap_vmas() assumes that the caller will flush the whole unmapped address
+ * range after unmap_vmas() returns. So the only responsibility here is to
+ * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
+ * drops the lock and schedules.
+ */
+void unmap_vmas(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long start_addr,
+ unsigned long end_addr)
+{
+ struct mmu_notifier_range range;
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
+ start_addr, end_addr);
+ mmu_notifier_invalidate_range_start(&range);
+ for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
+ unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
+ mmu_notifier_invalidate_range_end(&range);
+}
+
+/**
+ * zap_page_range - remove user pages in a given range
+ * @vma: vm_area_struct holding the applicable pages
+ * @start: starting address of pages to zap
+ * @size: number of bytes to zap
+ *
+ * Caller must protect the VMA list
+ */
+void zap_page_range(struct vm_area_struct *vma, unsigned long start,
+ unsigned long size)
+{
+ struct mmu_notifier_range range;
+ struct mmu_gather tlb;
+
+ lru_add_drain();
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ start, start + size);
+ tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end);
+ update_hiwater_rss(vma->vm_mm);
+ mmu_notifier_invalidate_range_start(&range);
+ for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
+ unmap_single_vma(&tlb, vma, start, range.end, NULL);
+ mmu_notifier_invalidate_range_end(&range);
+ tlb_finish_mmu(&tlb, start, range.end);
+}
+
+/**
+ * zap_page_range_single - remove user pages in a given range
+ * @vma: vm_area_struct holding the applicable pages
+ * @address: starting address of pages to zap
+ * @size: number of bytes to zap
+ * @details: details of shared cache invalidation
+ *
+ * The range must fit into one VMA.
+ */
+static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
+ unsigned long size, struct zap_details *details)
+{
+ struct mmu_notifier_range range;
+ struct mmu_gather tlb;
+
+ lru_add_drain();
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ address, address + size);
+ tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end);
+ update_hiwater_rss(vma->vm_mm);
+ mmu_notifier_invalidate_range_start(&range);
+ unmap_single_vma(&tlb, vma, address, range.end, details);
+ mmu_notifier_invalidate_range_end(&range);
+ tlb_finish_mmu(&tlb, address, range.end);
+}
+
+/**
+ * zap_vma_ptes - remove ptes mapping the vma
+ * @vma: vm_area_struct holding ptes to be zapped
+ * @address: starting address of pages to zap
+ * @size: number of bytes to zap
+ *
+ * This function only unmaps ptes assigned to VM_PFNMAP vmas.
+ *
+ * The entire address range must be fully contained within the vma.
+ *
+ */
+void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
+ unsigned long size)
+{
+ if (address < vma->vm_start || address + size > vma->vm_end ||
+ !(vma->vm_flags & VM_PFNMAP))
+ return;
+
+ zap_page_range_single(vma, address, size, NULL);
+}
+EXPORT_SYMBOL_GPL(zap_vma_ptes);
+
+static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ pgd = pgd_offset(mm, addr);
+ p4d = p4d_alloc(mm, pgd, addr);
+ if (!p4d)
+ return NULL;
+ pud = pud_alloc(mm, p4d, addr);
+ if (!pud)
+ return NULL;
+ pmd = pmd_alloc(mm, pud, addr);
+ if (!pmd)
+ return NULL;
+
+ VM_BUG_ON(pmd_trans_huge(*pmd));
+ return pmd;
+}
+
+pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
+ spinlock_t **ptl)
+{
+ pmd_t *pmd = walk_to_pmd(mm, addr);
+
+ if (!pmd)
+ return NULL;
+ return pte_alloc_map_lock(mm, pmd, addr, ptl);
+}
+
+static int validate_page_before_insert(struct page *page)
+{
+ if (PageAnon(page) || PageSlab(page) || page_has_type(page))
+ return -EINVAL;
+ flush_dcache_page(page);
+ return 0;
+}
+
+static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
+ unsigned long addr, struct page *page, pgprot_t prot)
+{
+ if (!pte_none(*pte))
+ return -EBUSY;
+ /* Ok, finally just insert the thing.. */
+ get_page(page);
+ inc_mm_counter_fast(mm, mm_counter_file(page));
+ page_add_file_rmap(page, false);
+ set_pte_at(mm, addr, pte, mk_pte(page, prot));
+ return 0;
+}
+
+/*
+ * This is the old fallback for page remapping.
+ *
+ * For historical reasons, it only allows reserved pages. Only
+ * old drivers should use this, and they needed to mark their
+ * pages reserved for the old functions anyway.
+ */
+static int insert_page(struct vm_area_struct *vma, unsigned long addr,
+ struct page *page, pgprot_t prot)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ int retval;
+ pte_t *pte;
+ spinlock_t *ptl;
+
+ retval = validate_page_before_insert(page);
+ if (retval)
+ goto out;
+ retval = -ENOMEM;
+ pte = get_locked_pte(mm, addr, &ptl);
+ if (!pte)
+ goto out;
+ retval = insert_page_into_pte_locked(mm, pte, addr, page, prot);
+ pte_unmap_unlock(pte, ptl);
+out:
+ return retval;
+}
+
+#ifdef pte_index
+static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte,
+ unsigned long addr, struct page *page, pgprot_t prot)
+{
+ int err;
+
+ if (!page_count(page))
+ return -EINVAL;
+ err = validate_page_before_insert(page);
+ if (err)
+ return err;
+ return insert_page_into_pte_locked(mm, pte, addr, page, prot);
+}
+
+/* insert_pages() amortizes the cost of spinlock operations
+ * when inserting pages in a loop. Arch *must* define pte_index.
+ */
+static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
+ struct page **pages, unsigned long *num, pgprot_t prot)
+{
+ pmd_t *pmd = NULL;
+ pte_t *start_pte, *pte;
+ spinlock_t *pte_lock;
+ struct mm_struct *const mm = vma->vm_mm;
+ unsigned long curr_page_idx = 0;
+ unsigned long remaining_pages_total = *num;
+ unsigned long pages_to_write_in_pmd;
+ int ret;
+more:
+ ret = -EFAULT;
+ pmd = walk_to_pmd(mm, addr);
+ if (!pmd)
+ goto out;
+
+ pages_to_write_in_pmd = min_t(unsigned long,
+ remaining_pages_total, PTRS_PER_PTE - pte_index(addr));
+
+ /* Allocate the PTE if necessary; takes PMD lock once only. */
+ ret = -ENOMEM;
+ if (pte_alloc(mm, pmd))
+ goto out;
+
+ while (pages_to_write_in_pmd) {
+ int pte_idx = 0;
+ const int batch_size = min_t(int, pages_to_write_in_pmd, 8);
+
+ start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
+ for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
+ int err = insert_page_in_batch_locked(mm, pte,
+ addr, pages[curr_page_idx], prot);
+ if (unlikely(err)) {
+ pte_unmap_unlock(start_pte, pte_lock);
+ ret = err;
+ remaining_pages_total -= pte_idx;
+ goto out;
+ }
+ addr += PAGE_SIZE;
+ ++curr_page_idx;
+ }
+ pte_unmap_unlock(start_pte, pte_lock);
+ pages_to_write_in_pmd -= batch_size;
+ remaining_pages_total -= batch_size;
+ }
+ if (remaining_pages_total)
+ goto more;
+ ret = 0;
+out:
+ *num = remaining_pages_total;
+ return ret;
+}
+#endif /* ifdef pte_index */
+
+/**
+ * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock.
+ * @vma: user vma to map to
+ * @addr: target start user address of these pages
+ * @pages: source kernel pages
+ * @num: in: number of pages to map. out: number of pages that were *not*
+ * mapped. (0 means all pages were successfully mapped).
+ *
+ * Preferred over vm_insert_page() when inserting multiple pages.
+ *
+ * In case of error, we may have mapped a subset of the provided
+ * pages. It is the caller's responsibility to account for this case.
+ *
+ * The same restrictions apply as in vm_insert_page().
+ */
+int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
+ struct page **pages, unsigned long *num)
+{
+#ifdef pte_index
+ const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1;
+
+ if (addr < vma->vm_start || end_addr >= vma->vm_end)
+ return -EFAULT;
+ if (!(vma->vm_flags & VM_MIXEDMAP)) {
+ BUG_ON(mmap_read_trylock(vma->vm_mm));
+ BUG_ON(vma->vm_flags & VM_PFNMAP);
+ vma->vm_flags |= VM_MIXEDMAP;
+ }
+ /* Defer page refcount checking till we're about to map that page. */
+ return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
+#else
+ unsigned long idx = 0, pgcount = *num;
+ int err = -EINVAL;
+
+ for (; idx < pgcount; ++idx) {
+ err = vm_insert_page(vma, addr + (PAGE_SIZE * idx), pages[idx]);
+ if (err)
+ break;
+ }
+ *num = pgcount - idx;
+ return err;
+#endif /* ifdef pte_index */
+}
+EXPORT_SYMBOL(vm_insert_pages);
+
+/**
+ * vm_insert_page - insert single page into user vma
+ * @vma: user vma to map to
+ * @addr: target user address of this page
+ * @page: source kernel page
+ *
+ * This allows drivers to insert individual pages they've allocated
+ * into a user vma.
+ *
+ * The page has to be a nice clean _individual_ kernel allocation.
+ * If you allocate a compound page, you need to have marked it as
+ * such (__GFP_COMP), or manually just split the page up yourself
+ * (see split_page()).
+ *
+ * NOTE! Traditionally this was done with "remap_pfn_range()" which
+ * took an arbitrary page protection parameter. This doesn't allow
+ * that. Your vma protection will have to be set up correctly, which
+ * means that if you want a shared writable mapping, you'd better
+ * ask for a shared writable mapping!
+ *
+ * The page does not need to be reserved.
+ *
+ * Usually this function is called from f_op->mmap() handler
+ * under mm->mmap_lock write-lock, so it can change vma->vm_flags.
+ * Caller must set VM_MIXEDMAP on vma if it wants to call this
+ * function from other places, for example from page-fault handler.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
+ struct page *page)
+{
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return -EFAULT;
+ if (!page_count(page))
+ return -EINVAL;
+ if (!(vma->vm_flags & VM_MIXEDMAP)) {
+ BUG_ON(mmap_read_trylock(vma->vm_mm));
+ BUG_ON(vma->vm_flags & VM_PFNMAP);
+ vma->vm_flags |= VM_MIXEDMAP;
+ }
+ return insert_page(vma, addr, page, vma->vm_page_prot);
+}
+EXPORT_SYMBOL(vm_insert_page);
+
+/*
+ * __vm_map_pages - maps range of kernel pages into user vma
+ * @vma: user vma to map to
+ * @pages: pointer to array of source kernel pages
+ * @num: number of pages in page array
+ * @offset: user's requested vm_pgoff
+ *
+ * This allows drivers to map range of kernel pages into a user vma.
+ *
+ * Return: 0 on success and error code otherwise.
+ */
+static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
+ unsigned long num, unsigned long offset)
+{
+ unsigned long count = vma_pages(vma);
+ unsigned long uaddr = vma->vm_start;
+ int ret, i;
+
+ /* Fail if the user requested offset is beyond the end of the object */
+ if (offset >= num)
+ return -ENXIO;
+
+ /* Fail if the user requested size exceeds available object size */
+ if (count > num - offset)
+ return -ENXIO;
+
+ for (i = 0; i < count; i++) {
+ ret = vm_insert_page(vma, uaddr, pages[offset + i]);
+ if (ret < 0)
+ return ret;
+ uaddr += PAGE_SIZE;
+ }
+
+ return 0;
+}
+
+/**
+ * vm_map_pages - maps range of kernel pages starts with non zero offset
+ * @vma: user vma to map to
+ * @pages: pointer to array of source kernel pages
+ * @num: number of pages in page array
+ *
+ * Maps an object consisting of @num pages, catering for the user's
+ * requested vm_pgoff
+ *
+ * If we fail to insert any page into the vma, the function will return
+ * immediately leaving any previously inserted pages present. Callers
+ * from the mmap handler may immediately return the error as their caller
+ * will destroy the vma, removing any successfully inserted pages. Other
+ * callers should make their own arrangements for calling unmap_region().
+ *
+ * Context: Process context. Called by mmap handlers.
+ * Return: 0 on success and error code otherwise.
+ */
+int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
+ unsigned long num)
+{
+ return __vm_map_pages(vma, pages, num, vma->vm_pgoff);
+}
+EXPORT_SYMBOL(vm_map_pages);
+
+/**
+ * vm_map_pages_zero - map range of kernel pages starts with zero offset
+ * @vma: user vma to map to
+ * @pages: pointer to array of source kernel pages
+ * @num: number of pages in page array
+ *
+ * Similar to vm_map_pages(), except that it explicitly sets the offset
+ * to 0. This function is intended for the drivers that did not consider
+ * vm_pgoff.
+ *
+ * Context: Process context. Called by mmap handlers.
+ * Return: 0 on success and error code otherwise.
+ */
+int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
+ unsigned long num)
+{
+ return __vm_map_pages(vma, pages, num, 0);
+}
+EXPORT_SYMBOL(vm_map_pages_zero);
+
+static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
+ pfn_t pfn, pgprot_t prot, bool mkwrite)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pte_t *pte, entry;
+ spinlock_t *ptl;
+
+ pte = get_locked_pte(mm, addr, &ptl);
+ if (!pte)
+ return VM_FAULT_OOM;
+ if (!pte_none(*pte)) {
+ if (mkwrite) {
+ /*
+ * For read faults on private mappings the PFN passed
+ * in may not match the PFN we have mapped if the
+ * mapped PFN is a writeable COW page. In the mkwrite
+ * case we are creating a writable PTE for a shared
+ * mapping and we expect the PFNs to match. If they
+ * don't match, we are likely racing with block
+ * allocation and mapping invalidation so just skip the
+ * update.
+ */
+ if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) {
+ WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte)));
+ goto out_unlock;
+ }
+ entry = pte_mkyoung(*pte);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ if (ptep_set_access_flags(vma, addr, pte, entry, 1))
+ update_mmu_cache(vma, addr, pte);
+ }
+ goto out_unlock;
+ }
+
+ /* Ok, finally just insert the thing.. */
+ if (pfn_t_devmap(pfn))
+ entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
+ else
+ entry = pte_mkspecial(pfn_t_pte(pfn, prot));
+
+ if (mkwrite) {
+ entry = pte_mkyoung(entry);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ }
+
+ set_pte_at(mm, addr, pte, entry);
+ update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
+
+out_unlock:
+ pte_unmap_unlock(pte, ptl);
+ return VM_FAULT_NOPAGE;
+}
+
+/**
+ * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
+ * @vma: user vma to map to
+ * @addr: target user address of this page
+ * @pfn: source kernel pfn
+ * @pgprot: pgprot flags for the inserted page
+ *
+ * This is exactly like vmf_insert_pfn(), except that it allows drivers
+ * to override pgprot on a per-page basis.
+ *
+ * This only makes sense for IO mappings, and it makes no sense for
+ * COW mappings. In general, using multiple vmas is preferable;
+ * vmf_insert_pfn_prot should only be used if using multiple VMAs is
+ * impractical.
+ *
+ * See vmf_insert_mixed_prot() for a discussion of the implication of using
+ * a value of @pgprot different from that of @vma->vm_page_prot.
+ *
+ * Context: Process context. May allocate using %GFP_KERNEL.
+ * Return: vm_fault_t value.
+ */
+vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, pgprot_t pgprot)
+{
+ /*
+ * Technically, architectures with pte_special can avoid all these
+ * restrictions (same for remap_pfn_range). However we would like
+ * consistency in testing and feature parity among all, so we should
+ * try to keep these invariants in place for everybody.
+ */
+ BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
+ BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
+ (VM_PFNMAP|VM_MIXEDMAP));
+ BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
+ BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
+
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return VM_FAULT_SIGBUS;
+
+ if (!pfn_modify_allowed(pfn, pgprot))
+ return VM_FAULT_SIGBUS;
+
+ track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
+
+ return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
+ false);
+}
+EXPORT_SYMBOL(vmf_insert_pfn_prot);
+
+/**
+ * vmf_insert_pfn - insert single pfn into user vma
+ * @vma: user vma to map to
+ * @addr: target user address of this page
+ * @pfn: source kernel pfn
+ *
+ * Similar to vm_insert_page, this allows drivers to insert individual pages
+ * they've allocated into a user vma. Same comments apply.
+ *
+ * This function should only be called from a vm_ops->fault handler, and
+ * in that case the handler should return the result of this function.
+ *
+ * vma cannot be a COW mapping.
+ *
+ * As this is called only for pages that do not currently exist, we
+ * do not need to flush old virtual caches or the TLB.
+ *
+ * Context: Process context. May allocate using %GFP_KERNEL.
+ * Return: vm_fault_t value.
+ */
+vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn)
+{
+ return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
+}
+EXPORT_SYMBOL(vmf_insert_pfn);
+
+static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
+{
+ /* these checks mirror the abort conditions in vm_normal_page */
+ if (vma->vm_flags & VM_MIXEDMAP)
+ return true;
+ if (pfn_t_devmap(pfn))
+ return true;
+ if (pfn_t_special(pfn))
+ return true;
+ if (is_zero_pfn(pfn_t_to_pfn(pfn)))
+ return true;
+ return false;
+}
+
+static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
+ unsigned long addr, pfn_t pfn, pgprot_t pgprot,
+ bool mkwrite)
+{
+ int err;
+
+ BUG_ON(!vm_mixed_ok(vma, pfn));
+
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return VM_FAULT_SIGBUS;
+
+ track_pfn_insert(vma, &pgprot, pfn);
+
+ if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
+ return VM_FAULT_SIGBUS;
+
+ /*
+ * If we don't have pte special, then we have to use the pfn_valid()
+ * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
+ * refcount the page if pfn_valid is true (hence insert_page rather
+ * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
+ * without pte special, it would there be refcounted as a normal page.
+ */
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) &&
+ !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
+ struct page *page;
+
+ /*
+ * At this point we are committed to insert_page()
+ * regardless of whether the caller specified flags that
+ * result in pfn_t_has_page() == false.
+ */
+ page = pfn_to_page(pfn_t_to_pfn(pfn));
+ err = insert_page(vma, addr, page, pgprot);
+ } else {
+ return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
+ }
+
+ if (err == -ENOMEM)
+ return VM_FAULT_OOM;
+ if (err < 0 && err != -EBUSY)
+ return VM_FAULT_SIGBUS;
+
+ return VM_FAULT_NOPAGE;
+}
+
+/**
+ * vmf_insert_mixed_prot - insert single pfn into user vma with specified pgprot
+ * @vma: user vma to map to
+ * @addr: target user address of this page
+ * @pfn: source kernel pfn
+ * @pgprot: pgprot flags for the inserted page
+ *
+ * This is exactly like vmf_insert_mixed(), except that it allows drivers
+ * to override pgprot on a per-page basis.
+ *
+ * Typically this function should be used by drivers to set caching- and
+ * encryption bits different than those of @vma->vm_page_prot, because
+ * the caching- or encryption mode may not be known at mmap() time.
+ * This is ok as long as @vma->vm_page_prot is not used by the core vm
+ * to set caching and encryption bits for those vmas (except for COW pages).
+ * This is ensured by core vm only modifying these page table entries using
+ * functions that don't touch caching- or encryption bits, using pte_modify()
+ * if needed. (See for example mprotect()).
+ * Also when new page-table entries are created, this is only done using the
+ * fault() callback, and never using the value of vma->vm_page_prot,
+ * except for page-table entries that point to anonymous pages as the result
+ * of COW.
+ *
+ * Context: Process context. May allocate using %GFP_KERNEL.
+ * Return: vm_fault_t value.
+ */
+vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr,
+ pfn_t pfn, pgprot_t pgprot)
+{
+ return __vm_insert_mixed(vma, addr, pfn, pgprot, false);
+}
+EXPORT_SYMBOL(vmf_insert_mixed_prot);
+
+vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+ pfn_t pfn)
+{
+ return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, false);
+}
+EXPORT_SYMBOL(vmf_insert_mixed);
+
+/*
+ * If the insertion of PTE failed because someone else already added a
+ * different entry in the mean time, we treat that as success as we assume
+ * the same entry was actually inserted.
+ */
+vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
+ unsigned long addr, pfn_t pfn)
+{
+ return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, true);
+}
+EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
+
+/*
+ * maps a range of physical memory into the requested pages. the old
+ * mappings are removed. any references to nonexistent pages results
+ * in null mappings (currently treated as "copy-on-access")
+ */
+static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ unsigned long pfn, pgprot_t prot)
+{
+ pte_t *pte, *mapped_pte;
+ spinlock_t *ptl;
+ int err = 0;
+
+ mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
+ if (!pte)
+ return -ENOMEM;
+ arch_enter_lazy_mmu_mode();
+ do {
+ BUG_ON(!pte_none(*pte));
+ if (!pfn_modify_allowed(pfn, prot)) {
+ err = -EACCES;
+ break;
+ }
+ set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
+ pfn++;
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(mapped_pte, ptl);
+ return err;
+}
+
+static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
+ unsigned long addr, unsigned long end,
+ unsigned long pfn, pgprot_t prot)
+{
+ pmd_t *pmd;
+ unsigned long next;
+ int err;
+
+ pfn -= addr >> PAGE_SHIFT;
+ pmd = pmd_alloc(mm, pud, addr);
+ if (!pmd)
+ return -ENOMEM;
+ VM_BUG_ON(pmd_trans_huge(*pmd));
+ do {
+ next = pmd_addr_end(addr, end);
+ err = remap_pte_range(mm, pmd, addr, next,
+ pfn + (addr >> PAGE_SHIFT), prot);
+ if (err)
+ return err;
+ } while (pmd++, addr = next, addr != end);
+ return 0;
+}
+
+static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
+ unsigned long addr, unsigned long end,
+ unsigned long pfn, pgprot_t prot)
+{
+ pud_t *pud;
+ unsigned long next;
+ int err;
+
+ pfn -= addr >> PAGE_SHIFT;
+ pud = pud_alloc(mm, p4d, addr);
+ if (!pud)
+ return -ENOMEM;
+ do {
+ next = pud_addr_end(addr, end);
+ err = remap_pmd_range(mm, pud, addr, next,
+ pfn + (addr >> PAGE_SHIFT), prot);
+ if (err)
+ return err;
+ } while (pud++, addr = next, addr != end);
+ return 0;
+}
+
+static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
+ unsigned long addr, unsigned long end,
+ unsigned long pfn, pgprot_t prot)
+{
+ p4d_t *p4d;
+ unsigned long next;
+ int err;
+
+ pfn -= addr >> PAGE_SHIFT;
+ p4d = p4d_alloc(mm, pgd, addr);
+ if (!p4d)
+ return -ENOMEM;
+ do {
+ next = p4d_addr_end(addr, end);
+ err = remap_pud_range(mm, p4d, addr, next,
+ pfn + (addr >> PAGE_SHIFT), prot);
+ if (err)
+ return err;
+ } while (p4d++, addr = next, addr != end);
+ return 0;
+}
+
+/**
+ * remap_pfn_range - remap kernel memory to userspace
+ * @vma: user vma to map to
+ * @addr: target page aligned user address to start at
+ * @pfn: page frame number of kernel physical memory address
+ * @size: size of mapping area
+ * @prot: page protection flags for this mapping
+ *
+ * Note: this is only safe if the mm semaphore is held when called.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+ pgd_t *pgd;
+ unsigned long next;
+ unsigned long end = addr + PAGE_ALIGN(size);
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long remap_pfn = pfn;
+ int err;
+
+ if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
+ return -EINVAL;
+
+ /*
+ * Physically remapped pages are special. Tell the
+ * rest of the world about it:
+ * VM_IO tells people not to look at these pages
+ * (accesses can have side effects).
+ * VM_PFNMAP tells the core MM that the base pages are just
+ * raw PFN mappings, and do not have a "struct page" associated
+ * with them.
+ * VM_DONTEXPAND
+ * Disable vma merging and expanding with mremap().
+ * VM_DONTDUMP
+ * Omit vma from core dump, even when VM_IO turned off.
+ *
+ * There's a horrible special case to handle copy-on-write
+ * behaviour that some programs depend on. We mark the "original"
+ * un-COW'ed pages by matching them up with "vma->vm_pgoff".
+ * See vm_normal_page() for details.
+ */
+ if (is_cow_mapping(vma->vm_flags)) {
+ if (addr != vma->vm_start || end != vma->vm_end)
+ return -EINVAL;
+ vma->vm_pgoff = pfn;
+ }
+
+ err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size));
+ if (err)
+ return -EINVAL;
+
+ vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
+
+ BUG_ON(addr >= end);
+ pfn -= addr >> PAGE_SHIFT;
+ pgd = pgd_offset(mm, addr);
+ flush_cache_range(vma, addr, end);
+ do {
+ next = pgd_addr_end(addr, end);
+ err = remap_p4d_range(mm, pgd, addr, next,
+ pfn + (addr >> PAGE_SHIFT), prot);
+ if (err)
+ break;
+ } while (pgd++, addr = next, addr != end);
+
+ if (err)
+ untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size));
+
+ return err;
+}
+EXPORT_SYMBOL(remap_pfn_range);
+
+/**
+ * vm_iomap_memory - remap memory to userspace
+ * @vma: user vma to map to
+ * @start: start of the physical memory to be mapped
+ * @len: size of area
+ *
+ * This is a simplified io_remap_pfn_range() for common driver use. The
+ * driver just needs to give us the physical memory range to be mapped,
+ * we'll figure out the rest from the vma information.
+ *
+ * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
+ * whatever write-combining details or similar.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
+{
+ unsigned long vm_len, pfn, pages;
+
+ /* Check that the physical memory area passed in looks valid */
+ if (start + len < start)
+ return -EINVAL;
+ /*
+ * You *really* shouldn't map things that aren't page-aligned,
+ * but we've historically allowed it because IO memory might
+ * just have smaller alignment.
+ */
+ len += start & ~PAGE_MASK;
+ pfn = start >> PAGE_SHIFT;
+ pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
+ if (pfn + pages < pfn)
+ return -EINVAL;
+
+ /* We start the mapping 'vm_pgoff' pages into the area */
+ if (vma->vm_pgoff > pages)
+ return -EINVAL;
+ pfn += vma->vm_pgoff;
+ pages -= vma->vm_pgoff;
+
+ /* Can we fit all of the mapping? */
+ vm_len = vma->vm_end - vma->vm_start;
+ if (vm_len >> PAGE_SHIFT > pages)
+ return -EINVAL;
+
+ /* Ok, let it rip */
+ return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
+}
+EXPORT_SYMBOL(vm_iomap_memory);
+
+static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ pte_fn_t fn, void *data, bool create,
+ pgtbl_mod_mask *mask)
+{
+ pte_t *pte;
+ int err = 0;
+ spinlock_t *ptl;
+
+ if (create) {
+ pte = (mm == &init_mm) ?
+ pte_alloc_kernel_track(pmd, addr, mask) :
+ pte_alloc_map_lock(mm, pmd, addr, &ptl);
+ if (!pte)
+ return -ENOMEM;
+ } else {
+ pte = (mm == &init_mm) ?
+ pte_offset_kernel(pmd, addr) :
+ pte_offset_map_lock(mm, pmd, addr, &ptl);
+ }
+
+ BUG_ON(pmd_huge(*pmd));
+
+ arch_enter_lazy_mmu_mode();
+
+ if (fn) {
+ do {
+ if (create || !pte_none(*pte)) {
+ err = fn(pte++, addr, data);
+ if (err)
+ break;
+ }
+ } while (addr += PAGE_SIZE, addr != end);
+ }
+ *mask |= PGTBL_PTE_MODIFIED;
+
+ arch_leave_lazy_mmu_mode();
+
+ if (mm != &init_mm)
+ pte_unmap_unlock(pte-1, ptl);
+ return err;
+}
+
+static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
+ unsigned long addr, unsigned long end,
+ pte_fn_t fn, void *data, bool create,
+ pgtbl_mod_mask *mask)
+{
+ pmd_t *pmd;
+ unsigned long next;
+ int err = 0;
+
+ BUG_ON(pud_huge(*pud));
+
+ if (create) {
+ pmd = pmd_alloc_track(mm, pud, addr, mask);
+ if (!pmd)
+ return -ENOMEM;
+ } else {
+ pmd = pmd_offset(pud, addr);
+ }
+ do {
+ next = pmd_addr_end(addr, end);
+ if (create || !pmd_none_or_clear_bad(pmd)) {
+ err = apply_to_pte_range(mm, pmd, addr, next, fn, data,
+ create, mask);
+ if (err)
+ break;
+ }
+ } while (pmd++, addr = next, addr != end);
+ return err;
+}
+
+static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
+ unsigned long addr, unsigned long end,
+ pte_fn_t fn, void *data, bool create,
+ pgtbl_mod_mask *mask)
+{
+ pud_t *pud;
+ unsigned long next;
+ int err = 0;
+
+ if (create) {
+ pud = pud_alloc_track(mm, p4d, addr, mask);
+ if (!pud)
+ return -ENOMEM;
+ } else {
+ pud = pud_offset(p4d, addr);
+ }
+ do {
+ next = pud_addr_end(addr, end);
+ if (create || !pud_none_or_clear_bad(pud)) {
+ err = apply_to_pmd_range(mm, pud, addr, next, fn, data,
+ create, mask);
+ if (err)
+ break;
+ }
+ } while (pud++, addr = next, addr != end);
+ return err;
+}
+
+static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
+ unsigned long addr, unsigned long end,
+ pte_fn_t fn, void *data, bool create,
+ pgtbl_mod_mask *mask)
+{
+ p4d_t *p4d;
+ unsigned long next;
+ int err = 0;
+
+ if (create) {
+ p4d = p4d_alloc_track(mm, pgd, addr, mask);
+ if (!p4d)
+ return -ENOMEM;
+ } else {
+ p4d = p4d_offset(pgd, addr);
+ }
+ do {
+ next = p4d_addr_end(addr, end);
+ if (create || !p4d_none_or_clear_bad(p4d)) {
+ err = apply_to_pud_range(mm, p4d, addr, next, fn, data,
+ create, mask);
+ if (err)
+ break;
+ }
+ } while (p4d++, addr = next, addr != end);
+ return err;
+}
+
+static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
+ unsigned long size, pte_fn_t fn,
+ void *data, bool create)
+{
+ pgd_t *pgd;
+ unsigned long start = addr, next;
+ unsigned long end = addr + size;
+ pgtbl_mod_mask mask = 0;
+ int err = 0;
+
+ if (WARN_ON(addr >= end))
+ return -EINVAL;
+
+ pgd = pgd_offset(mm, addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (!create && pgd_none_or_clear_bad(pgd))
+ continue;
+ err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create, &mask);
+ if (err)
+ break;
+ } while (pgd++, addr = next, addr != end);
+
+ if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
+ arch_sync_kernel_mappings(start, start + size);
+
+ return err;
+}
+
+/*
+ * Scan a region of virtual memory, filling in page tables as necessary
+ * and calling a provided function on each leaf page table.
+ */
+int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
+ unsigned long size, pte_fn_t fn, void *data)
+{
+ return __apply_to_page_range(mm, addr, size, fn, data, true);
+}
+EXPORT_SYMBOL_GPL(apply_to_page_range);
+
+/*
+ * Scan a region of virtual memory, calling a provided function on
+ * each leaf page table where it exists.
+ *
+ * Unlike apply_to_page_range, this does _not_ fill in page tables
+ * where they are absent.
+ */
+int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
+ unsigned long size, pte_fn_t fn, void *data)
+{
+ return __apply_to_page_range(mm, addr, size, fn, data, false);
+}
+EXPORT_SYMBOL_GPL(apply_to_existing_page_range);
+
+/*
+ * handle_pte_fault chooses page fault handler according to an entry which was
+ * read non-atomically. Before making any commitment, on those architectures
+ * or configurations (e.g. i386 with PAE) which might give a mix of unmatched
+ * parts, do_swap_page must check under lock before unmapping the pte and
+ * proceeding (but do_wp_page is only called after already making such a check;
+ * and do_anonymous_page can safely check later on).
+ */
+static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
+ pte_t *page_table, pte_t orig_pte)
+{
+ int same = 1;
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
+ if (sizeof(pte_t) > sizeof(unsigned long)) {
+ spinlock_t *ptl = pte_lockptr(mm, pmd);
+ spin_lock(ptl);
+ same = pte_same(*page_table, orig_pte);
+ spin_unlock(ptl);
+ }
+#endif
+ pte_unmap(page_table);
+ return same;
+}
+
+static inline bool cow_user_page(struct page *dst, struct page *src,
+ struct vm_fault *vmf)
+{
+ bool ret;
+ void *kaddr;
+ void __user *uaddr;
+ bool locked = false;
+ struct vm_area_struct *vma = vmf->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long addr = vmf->address;
+
+ if (likely(src)) {
+ copy_user_highpage(dst, src, addr, vma);
+ return true;
+ }
+
+ /*
+ * If the source page was a PFN mapping, we don't have
+ * a "struct page" for it. We do a best-effort copy by
+ * just copying from the original user address. If that
+ * fails, we just zero-fill it. Live with it.
+ */
+ kaddr = kmap_atomic(dst);
+ uaddr = (void __user *)(addr & PAGE_MASK);
+
+ /*
+ * On architectures with software "accessed" bits, we would
+ * take a double page fault, so mark it accessed here.
+ */
+ if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
+ pte_t entry;
+
+ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
+ locked = true;
+ if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
+ /*
+ * Other thread has already handled the fault
+ * and update local tlb only
+ */
+ update_mmu_tlb(vma, addr, vmf->pte);
+ ret = false;
+ goto pte_unlock;
+ }
+
+ entry = pte_mkyoung(vmf->orig_pte);
+ if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
+ update_mmu_cache(vma, addr, vmf->pte);
+ }
+
+ /*
+ * This really shouldn't fail, because the page is there
+ * in the page tables. But it might just be unreadable,
+ * in which case we just give up and fill the result with
+ * zeroes.
+ */
+ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
+ if (locked)
+ goto warn;
+
+ /* Re-validate under PTL if the page is still mapped */
+ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
+ locked = true;
+ if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
+ /* The PTE changed under us, update local tlb */
+ update_mmu_tlb(vma, addr, vmf->pte);
+ ret = false;
+ goto pte_unlock;
+ }
+
+ /*
+ * The same page can be mapped back since last copy attempt.
+ * Try to copy again under PTL.
+ */
+ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
+ /*
+ * Give a warn in case there can be some obscure
+ * use-case
+ */
+warn:
+ WARN_ON_ONCE(1);
+ clear_page(kaddr);
+ }
+ }
+
+ ret = true;
+
+pte_unlock:
+ if (locked)
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ kunmap_atomic(kaddr);
+ flush_dcache_page(dst);
+
+ return ret;
+}
+
+static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
+{
+ struct file *vm_file = vma->vm_file;
+
+ if (vm_file)
+ return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO;
+
+ /*
+ * Special mappings (e.g. VDSO) do not have any file so fake
+ * a default GFP_KERNEL for them.
+ */
+ return GFP_KERNEL;
+}
+
+/*
+ * Notify the address space that the page is about to become writable so that
+ * it can prohibit this or wait for the page to get into an appropriate state.
+ *
+ * We do this without the lock held, so that it can sleep if it needs to.
+ */
+static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
+{
+ vm_fault_t ret;
+ struct page *page = vmf->page;
+ unsigned int old_flags = vmf->flags;
+
+ vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
+
+ if (vmf->vma->vm_file &&
+ IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host))
+ return VM_FAULT_SIGBUS;
+
+ ret = vmf->vma->vm_ops->page_mkwrite(vmf);
+ /* Restore original flags so that caller is not surprised */
+ vmf->flags = old_flags;
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
+ return ret;
+ if (unlikely(!(ret & VM_FAULT_LOCKED))) {
+ lock_page(page);
+ if (!page->mapping) {
+ unlock_page(page);
+ return 0; /* retry */
+ }
+ ret |= VM_FAULT_LOCKED;
+ } else
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ return ret;
+}
+
+/*
+ * Handle dirtying of a page in shared file mapping on a write fault.
+ *
+ * The function expects the page to be locked and unlocks it.
+ */
+static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct address_space *mapping;
+ struct page *page = vmf->page;
+ bool dirtied;
+ bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
+
+ dirtied = set_page_dirty(page);
+ VM_BUG_ON_PAGE(PageAnon(page), page);
+ /*
+ * Take a local copy of the address_space - page.mapping may be zeroed
+ * by truncate after unlock_page(). The address_space itself remains
+ * pinned by vma->vm_file's reference. We rely on unlock_page()'s
+ * release semantics to prevent the compiler from undoing this copying.
+ */
+ mapping = page_rmapping(page);
+ unlock_page(page);
+
+ if (!page_mkwrite)
+ file_update_time(vma->vm_file);
+
+ /*
+ * Throttle page dirtying rate down to writeback speed.
+ *
+ * mapping may be NULL here because some device drivers do not
+ * set page.mapping but still dirty their pages
+ *
+ * Drop the mmap_lock before waiting on IO, if we can. The file
+ * is pinning the mapping, as per above.
+ */
+ if ((dirtied || page_mkwrite) && mapping) {
+ struct file *fpin;
+
+ fpin = maybe_unlock_mmap_for_io(vmf, NULL);
+ balance_dirty_pages_ratelimited(mapping);
+ if (fpin) {
+ fput(fpin);
+ return VM_FAULT_RETRY;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Handle write page faults for pages that can be reused in the current vma
+ *
+ * This can happen either due to the mapping being with the VM_SHARED flag,
+ * or due to us being the last reference standing to the page. In either
+ * case, all we need to do here is to mark the page as writable and update
+ * any related book-keeping.
+ */
+static inline void wp_page_reuse(struct vm_fault *vmf)
+ __releases(vmf->ptl)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct page *page = vmf->page;
+ pte_t entry;
+ /*
+ * Clear the pages cpupid information as the existing
+ * information potentially belongs to a now completely
+ * unrelated process.
+ */
+ if (page)
+ page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
+
+ flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
+ entry = pte_mkyoung(vmf->orig_pte);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
+ update_mmu_cache(vma, vmf->address, vmf->pte);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ count_vm_event(PGREUSE);
+}
+
+/*
+ * Handle the case of a page which we actually need to copy to a new page.
+ *
+ * Called with mmap_lock locked and the old page referenced, but
+ * without the ptl held.
+ *
+ * High level logic flow:
+ *
+ * - Allocate a page, copy the content of the old page to the new one.
+ * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
+ * - Take the PTL. If the pte changed, bail out and release the allocated page
+ * - If the pte is still the way we remember it, update the page table and all
+ * relevant references. This includes dropping the reference the page-table
+ * held to the old page, as well as updating the rmap.
+ * - In any case, unlock the PTL and drop the reference we took to the old page.
+ */
+static vm_fault_t wp_page_copy(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ struct page *old_page = vmf->page;
+ struct page *new_page = NULL;
+ pte_t entry;
+ int page_copied = 0;
+ struct mmu_notifier_range range;
+
+ if (unlikely(anon_vma_prepare(vma)))
+ goto oom;
+
+ if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
+ new_page = alloc_zeroed_user_highpage_movable(vma,
+ vmf->address);
+ if (!new_page)
+ goto oom;
+ } else {
+ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
+ vmf->address);
+ if (!new_page)
+ goto oom;
+
+ if (!cow_user_page(new_page, old_page, vmf)) {
+ /*
+ * COW failed, if the fault was solved by other,
+ * it's fine. If not, userspace would re-fault on
+ * the same address and we will handle the fault
+ * from the second attempt.
+ */
+ put_page(new_page);
+ if (old_page)
+ put_page(old_page);
+ return 0;
+ }
+ }
+
+ if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
+ goto oom_free_new;
+ cgroup_throttle_swaprate(new_page, GFP_KERNEL);
+
+ __SetPageUptodate(new_page);
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+ vmf->address & PAGE_MASK,
+ (vmf->address & PAGE_MASK) + PAGE_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+
+ /*
+ * Re-check the pte - we dropped the lock
+ */
+ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
+ if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
+ if (old_page) {
+ if (!PageAnon(old_page)) {
+ dec_mm_counter_fast(mm,
+ mm_counter_file(old_page));
+ inc_mm_counter_fast(mm, MM_ANONPAGES);
+ }
+ } else {
+ inc_mm_counter_fast(mm, MM_ANONPAGES);
+ }
+ flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
+ entry = mk_pte(new_page, vma->vm_page_prot);
+ entry = pte_sw_mkyoung(entry);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ /*
+ * Clear the pte entry and flush it first, before updating the
+ * pte with the new entry. This will avoid a race condition
+ * seen in the presence of one thread doing SMC and another
+ * thread doing COW.
+ */
+ ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
+ page_add_new_anon_rmap(new_page, vma, vmf->address, false);
+ lru_cache_add_inactive_or_unevictable(new_page, vma);
+ /*
+ * We call the notify macro here because, when using secondary
+ * mmu page tables (such as kvm shadow page tables), we want the
+ * new page to be mapped directly into the secondary page table.
+ */
+ set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
+ update_mmu_cache(vma, vmf->address, vmf->pte);
+ if (old_page) {
+ /*
+ * Only after switching the pte to the new page may
+ * we remove the mapcount here. Otherwise another
+ * process may come and find the rmap count decremented
+ * before the pte is switched to the new page, and
+ * "reuse" the old page writing into it while our pte
+ * here still points into it and can be read by other
+ * threads.
+ *
+ * The critical issue is to order this
+ * page_remove_rmap with the ptp_clear_flush above.
+ * Those stores are ordered by (if nothing else,)
+ * the barrier present in the atomic_add_negative
+ * in page_remove_rmap.
+ *
+ * Then the TLB flush in ptep_clear_flush ensures that
+ * no process can access the old page before the
+ * decremented mapcount is visible. And the old page
+ * cannot be reused until after the decremented
+ * mapcount is visible. So transitively, TLBs to
+ * old page will be flushed before it can be reused.
+ */
+ page_remove_rmap(old_page, false);
+ }
+
+ /* Free the old page.. */
+ new_page = old_page;
+ page_copied = 1;
+ } else {
+ update_mmu_tlb(vma, vmf->address, vmf->pte);
+ }
+
+ if (new_page)
+ put_page(new_page);
+
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ /*
+ * No need to double call mmu_notifier->invalidate_range() callback as
+ * the above ptep_clear_flush_notify() did already call it.
+ */
+ mmu_notifier_invalidate_range_only_end(&range);
+ if (old_page) {
+ /*
+ * Don't let another task, with possibly unlocked vma,
+ * keep the mlocked page.
+ */
+ if (page_copied && (vma->vm_flags & VM_LOCKED)) {
+ lock_page(old_page); /* LRU manipulation */
+ if (PageMlocked(old_page))
+ munlock_vma_page(old_page);
+ unlock_page(old_page);
+ }
+ put_page(old_page);
+ }
+ return page_copied ? VM_FAULT_WRITE : 0;
+oom_free_new:
+ put_page(new_page);
+oom:
+ if (old_page)
+ put_page(old_page);
+ return VM_FAULT_OOM;
+}
+
+/**
+ * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
+ * writeable once the page is prepared
+ *
+ * @vmf: structure describing the fault
+ *
+ * This function handles all that is needed to finish a write page fault in a
+ * shared mapping due to PTE being read-only once the mapped page is prepared.
+ * It handles locking of PTE and modifying it.
+ *
+ * The function expects the page to be locked or other protection against
+ * concurrent faults / writeback (such as DAX radix tree locks).
+ *
+ * Return: %VM_FAULT_WRITE on success, %0 when PTE got changed before
+ * we acquired PTE lock.
+ */
+vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
+{
+ WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
+ vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
+ /*
+ * We might have raced with another page fault while we released the
+ * pte_offset_map_lock.
+ */
+ if (!pte_same(*vmf->pte, vmf->orig_pte)) {
+ update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return VM_FAULT_NOPAGE;
+ }
+ wp_page_reuse(vmf);
+ return 0;
+}
+
+/*
+ * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
+ * mapping
+ */
+static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+
+ if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
+ vm_fault_t ret;
+
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ vmf->flags |= FAULT_FLAG_MKWRITE;
+ ret = vma->vm_ops->pfn_mkwrite(vmf);
+ if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
+ return ret;
+ return finish_mkwrite_fault(vmf);
+ }
+ wp_page_reuse(vmf);
+ return VM_FAULT_WRITE;
+}
+
+static vm_fault_t wp_page_shared(struct vm_fault *vmf)
+ __releases(vmf->ptl)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ vm_fault_t ret = VM_FAULT_WRITE;
+
+ get_page(vmf->page);
+
+ if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
+ vm_fault_t tmp;
+
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ tmp = do_page_mkwrite(vmf);
+ if (unlikely(!tmp || (tmp &
+ (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
+ put_page(vmf->page);
+ return tmp;
+ }
+ tmp = finish_mkwrite_fault(vmf);
+ if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
+ unlock_page(vmf->page);
+ put_page(vmf->page);
+ return tmp;
+ }
+ } else {
+ wp_page_reuse(vmf);
+ lock_page(vmf->page);
+ }
+ ret |= fault_dirty_shared_page(vmf);
+ put_page(vmf->page);
+
+ return ret;
+}
+
+/*
+ * This routine handles present pages, when users try to write
+ * to a shared page. It is done by copying the page to a new address
+ * and decrementing the shared-page counter for the old page.
+ *
+ * Note that this routine assumes that the protection checks have been
+ * done by the caller (the low-level page fault routine in most cases).
+ * Thus we can safely just mark it writable once we've done any necessary
+ * COW.
+ *
+ * We also mark the page dirty at this point even though the page will
+ * change only once the write actually happens. This avoids a few races,
+ * and potentially makes it more efficient.
+ *
+ * We enter with non-exclusive mmap_lock (to exclude vma changes,
+ * but allow concurrent faults), with pte both mapped and locked.
+ * We return with mmap_lock still held, but pte unmapped and unlocked.
+ */
+static vm_fault_t do_wp_page(struct vm_fault *vmf)
+ __releases(vmf->ptl)
+{
+ struct vm_area_struct *vma = vmf->vma;
+
+ if (userfaultfd_pte_wp(vma, *vmf->pte)) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return handle_userfault(vmf, VM_UFFD_WP);
+ }
+
+ /*
+ * Userfaultfd write-protect can defer flushes. Ensure the TLB
+ * is flushed in this case before copying.
+ */
+ if (unlikely(userfaultfd_wp(vmf->vma) &&
+ mm_tlb_flush_pending(vmf->vma->vm_mm)))
+ flush_tlb_page(vmf->vma, vmf->address);
+
+ vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
+ if (!vmf->page) {
+ /*
+ * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
+ * VM_PFNMAP VMA.
+ *
+ * We should not cow pages in a shared writeable mapping.
+ * Just mark the pages writable and/or call ops->pfn_mkwrite.
+ */
+ if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+ (VM_WRITE|VM_SHARED))
+ return wp_pfn_shared(vmf);
+
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return wp_page_copy(vmf);
+ }
+
+ /*
+ * Take out anonymous pages first, anonymous shared vmas are
+ * not dirty accountable.
+ */
+ if (PageAnon(vmf->page)) {
+ struct page *page = vmf->page;
+
+ /* PageKsm() doesn't necessarily raise the page refcount */
+ if (PageKsm(page) || page_count(page) != 1)
+ goto copy;
+ if (!trylock_page(page))
+ goto copy;
+ if (PageKsm(page) || page_mapcount(page) != 1 || page_count(page) != 1) {
+ unlock_page(page);
+ goto copy;
+ }
+ /*
+ * Ok, we've got the only map reference, and the only
+ * page count reference, and the page is locked,
+ * it's dark out, and we're wearing sunglasses. Hit it.
+ */
+ unlock_page(page);
+ wp_page_reuse(vmf);
+ return VM_FAULT_WRITE;
+ } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+ (VM_WRITE|VM_SHARED))) {
+ return wp_page_shared(vmf);
+ }
+copy:
+ /*
+ * Ok, we need to copy. Oh, well..
+ */
+ get_page(vmf->page);
+
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return wp_page_copy(vmf);
+}
+
+static void unmap_mapping_range_vma(struct vm_area_struct *vma,
+ unsigned long start_addr, unsigned long end_addr,
+ struct zap_details *details)
+{
+ zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
+}
+
+static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
+ struct zap_details *details)
+{
+ struct vm_area_struct *vma;
+ pgoff_t vba, vea, zba, zea;
+
+ vma_interval_tree_foreach(vma, root,
+ details->first_index, details->last_index) {
+
+ vba = vma->vm_pgoff;
+ vea = vba + vma_pages(vma) - 1;
+ zba = details->first_index;
+ if (zba < vba)
+ zba = vba;
+ zea = details->last_index;
+ if (zea > vea)
+ zea = vea;
+
+ unmap_mapping_range_vma(vma,
+ ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
+ ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
+ details);
+ }
+}
+
+/**
+ * unmap_mapping_page() - Unmap single page from processes.
+ * @page: The locked page to be unmapped.
+ *
+ * Unmap this page from any userspace process which still has it mmaped.
+ * Typically, for efficiency, the range of nearby pages has already been
+ * unmapped by unmap_mapping_pages() or unmap_mapping_range(). But once
+ * truncation or invalidation holds the lock on a page, it may find that
+ * the page has been remapped again: and then uses unmap_mapping_page()
+ * to unmap it finally.
+ */
+void unmap_mapping_page(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ struct zap_details details = { };
+
+ VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON(PageTail(page));
+
+ details.check_mapping = mapping;
+ details.first_index = page->index;
+ details.last_index = page->index + thp_nr_pages(page) - 1;
+ details.single_page = page;
+
+ i_mmap_lock_write(mapping);
+ if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
+ unmap_mapping_range_tree(&mapping->i_mmap, &details);
+ i_mmap_unlock_write(mapping);
+}
+
+/**
+ * unmap_mapping_pages() - Unmap pages from processes.
+ * @mapping: The address space containing pages to be unmapped.
+ * @start: Index of first page to be unmapped.
+ * @nr: Number of pages to be unmapped. 0 to unmap to end of file.
+ * @even_cows: Whether to unmap even private COWed pages.
+ *
+ * Unmap the pages in this address space from any userspace process which
+ * has them mmaped. Generally, you want to remove COWed pages as well when
+ * a file is being truncated, but not when invalidating pages from the page
+ * cache.
+ */
+void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
+ pgoff_t nr, bool even_cows)
+{
+ struct zap_details details = { };
+
+ details.check_mapping = even_cows ? NULL : mapping;
+ details.first_index = start;
+ details.last_index = start + nr - 1;
+ if (details.last_index < details.first_index)
+ details.last_index = ULONG_MAX;
+
+ i_mmap_lock_write(mapping);
+ if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
+ unmap_mapping_range_tree(&mapping->i_mmap, &details);
+ i_mmap_unlock_write(mapping);
+}
+
+/**
+ * unmap_mapping_range - unmap the portion of all mmaps in the specified
+ * address_space corresponding to the specified byte range in the underlying
+ * file.
+ *
+ * @mapping: the address space containing mmaps to be unmapped.
+ * @holebegin: byte in first page to unmap, relative to the start of
+ * the underlying file. This will be rounded down to a PAGE_SIZE
+ * boundary. Note that this is different from truncate_pagecache(), which
+ * must keep the partial page. In contrast, we must get rid of
+ * partial pages.
+ * @holelen: size of prospective hole in bytes. This will be rounded
+ * up to a PAGE_SIZE boundary. A holelen of zero truncates to the
+ * end of the file.
+ * @even_cows: 1 when truncating a file, unmap even private COWed pages;
+ * but 0 when invalidating pagecache, don't throw away private data.
+ */
+void unmap_mapping_range(struct address_space *mapping,
+ loff_t const holebegin, loff_t const holelen, int even_cows)
+{
+ pgoff_t hba = (pgoff_t)(holebegin) >> PAGE_SHIFT;
+ pgoff_t hlen = ((pgoff_t)(holelen) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+ /* Check for overflow. */
+ if (sizeof(holelen) > sizeof(hlen)) {
+ long long holeend =
+ (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (holeend & ~(long long)ULONG_MAX)
+ hlen = ULONG_MAX - hba + 1;
+ }
+
+ unmap_mapping_pages(mapping, hba, hlen, even_cows);
+}
+EXPORT_SYMBOL(unmap_mapping_range);
+
+/*
+ * We enter with non-exclusive mmap_lock (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with pte unmapped and unlocked.
+ *
+ * We return with the mmap_lock locked or unlocked in the same cases
+ * as does filemap_fault().
+ */
+vm_fault_t do_swap_page(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct page *page = NULL, *swapcache;
+ swp_entry_t entry;
+ pte_t pte;
+ int locked;
+ int exclusive = 0;
+ vm_fault_t ret = 0;
+ void *shadow = NULL;
+
+ if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
+ goto out;
+
+ entry = pte_to_swp_entry(vmf->orig_pte);
+ if (unlikely(non_swap_entry(entry))) {
+ if (is_migration_entry(entry)) {
+ migration_entry_wait(vma->vm_mm, vmf->pmd,
+ vmf->address);
+ } else if (is_device_private_entry(entry)) {
+ vmf->page = device_private_entry_to_page(entry);
+ ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
+ } else if (is_hwpoison_entry(entry)) {
+ ret = VM_FAULT_HWPOISON;
+ } else {
+ print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
+ ret = VM_FAULT_SIGBUS;
+ }
+ goto out;
+ }
+
+
+ delayacct_set_flag(DELAYACCT_PF_SWAPIN);
+ page = lookup_swap_cache(entry, vma, vmf->address);
+ swapcache = page;
+
+ if (!page) {
+ struct swap_info_struct *si = swp_swap_info(entry);
+
+ if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
+ __swap_count(entry) == 1) {
+ /* skip swapcache */
+ page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
+ vmf->address);
+ if (page) {
+ int err;
+
+ __SetPageLocked(page);
+ __SetPageSwapBacked(page);
+ set_page_private(page, entry.val);
+
+ /* Tell memcg to use swap ownership records */
+ SetPageSwapCache(page);
+ err = mem_cgroup_charge(page, vma->vm_mm,
+ GFP_KERNEL);
+ ClearPageSwapCache(page);
+ if (err) {
+ ret = VM_FAULT_OOM;
+ goto out_page;
+ }
+
+ shadow = get_shadow_from_swap_cache(entry);
+ if (shadow)
+ workingset_refault(page, shadow);
+
+ lru_cache_add(page);
+ swap_readpage(page, true);
+ }
+ } else {
+ page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
+ vmf);
+ swapcache = page;
+ }
+
+ if (!page) {
+ /*
+ * Back out if somebody else faulted in this pte
+ * while we released the pte lock.
+ */
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
+ ret = VM_FAULT_OOM;
+ delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+ goto unlock;
+ }
+
+ /* Had to read the page from swap area: Major fault */
+ ret = VM_FAULT_MAJOR;
+ count_vm_event(PGMAJFAULT);
+ count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
+ } else if (PageHWPoison(page)) {
+ /*
+ * hwpoisoned dirty swapcache pages are kept for killing
+ * owner processes (which may be unknown at hwpoison time)
+ */
+ ret = VM_FAULT_HWPOISON;
+ delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+ goto out_release;
+ }
+
+ locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
+
+ delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+ if (!locked) {
+ ret |= VM_FAULT_RETRY;
+ goto out_release;
+ }
+
+ /*
+ * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
+ * release the swapcache from under us. The page pin, and pte_same
+ * test below, are not enough to exclude that. Even if it is still
+ * swapcache, we need to check that the page's swap has not changed.
+ */
+ if (unlikely((!PageSwapCache(page) ||
+ page_private(page) != entry.val)) && swapcache)
+ goto out_page;
+
+ page = ksm_might_need_to_copy(page, vma, vmf->address);
+ if (unlikely(!page)) {
+ ret = VM_FAULT_OOM;
+ page = swapcache;
+ goto out_page;
+ }
+
+ cgroup_throttle_swaprate(page, GFP_KERNEL);
+
+ /*
+ * Back out if somebody else already faulted in this pte.
+ */
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
+ if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
+ goto out_nomap;
+
+ if (unlikely(!PageUptodate(page))) {
+ ret = VM_FAULT_SIGBUS;
+ goto out_nomap;
+ }
+
+ /*
+ * The page isn't present yet, go ahead with the fault.
+ *
+ * Be careful about the sequence of operations here.
+ * To get its accounting right, reuse_swap_page() must be called
+ * while the page is counted on swap but not yet in mapcount i.e.
+ * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
+ * must be called after the swap_free(), or it will never succeed.
+ */
+
+ inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
+ dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
+ pte = mk_pte(page, vma->vm_page_prot);
+ if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
+ pte = maybe_mkwrite(pte_mkdirty(pte), vma);
+ vmf->flags &= ~FAULT_FLAG_WRITE;
+ ret |= VM_FAULT_WRITE;
+ exclusive = RMAP_EXCLUSIVE;
+ }
+ flush_icache_page(vma, page);
+ if (pte_swp_soft_dirty(vmf->orig_pte))
+ pte = pte_mksoft_dirty(pte);
+ if (pte_swp_uffd_wp(vmf->orig_pte)) {
+ pte = pte_mkuffd_wp(pte);
+ pte = pte_wrprotect(pte);
+ }
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
+ arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
+ vmf->orig_pte = pte;
+
+ /* ksm created a completely new copy */
+ if (unlikely(page != swapcache && swapcache)) {
+ page_add_new_anon_rmap(page, vma, vmf->address, false);
+ lru_cache_add_inactive_or_unevictable(page, vma);
+ } else {
+ do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
+ }
+
+ swap_free(entry);
+ if (mem_cgroup_swap_full(page) ||
+ (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
+ try_to_free_swap(page);
+ unlock_page(page);
+ if (page != swapcache && swapcache) {
+ /*
+ * Hold the lock to avoid the swap entry to be reused
+ * until we take the PT lock for the pte_same() check
+ * (to avoid false positives from pte_same). For
+ * further safety release the lock after the swap_free
+ * so that the swap count won't change under a
+ * parallel locked swapcache.
+ */
+ unlock_page(swapcache);
+ put_page(swapcache);
+ }
+
+ if (vmf->flags & FAULT_FLAG_WRITE) {
+ ret |= do_wp_page(vmf);
+ if (ret & VM_FAULT_ERROR)
+ ret &= VM_FAULT_ERROR;
+ goto out;
+ }
+
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(vma, vmf->address, vmf->pte);
+unlock:
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+out:
+ return ret;
+out_nomap:
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+out_page:
+ unlock_page(page);
+out_release:
+ put_page(page);
+ if (page != swapcache && swapcache) {
+ unlock_page(swapcache);
+ put_page(swapcache);
+ }
+ return ret;
+}
+
+/*
+ * We enter with non-exclusive mmap_lock (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_lock still held, but pte unmapped and unlocked.
+ */
+static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct page *page;
+ vm_fault_t ret = 0;
+ pte_t entry;
+
+ /* File mapping without ->vm_ops ? */
+ if (vma->vm_flags & VM_SHARED)
+ return VM_FAULT_SIGBUS;
+
+ /*
+ * Use pte_alloc() instead of pte_alloc_map(). We can't run
+ * pte_offset_map() on pmds where a huge pmd might be created
+ * from a different thread.
+ *
+ * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
+ * parallel threads are excluded by other means.
+ *
+ * Here we only have mmap_read_lock(mm).
+ */
+ if (pte_alloc(vma->vm_mm, vmf->pmd))
+ return VM_FAULT_OOM;
+
+ /* See the comment in pte_alloc_one_map() */
+ if (unlikely(pmd_trans_unstable(vmf->pmd)))
+ return 0;
+
+ /* Use the zero-page for reads */
+ if (!(vmf->flags & FAULT_FLAG_WRITE) &&
+ !mm_forbids_zeropage(vma->vm_mm)) {
+ entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
+ vma->vm_page_prot));
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (!pte_none(*vmf->pte)) {
+ update_mmu_tlb(vma, vmf->address, vmf->pte);
+ goto unlock;
+ }
+ ret = check_stable_address_space(vma->vm_mm);
+ if (ret)
+ goto unlock;
+ /* Deliver the page fault to userland, check inside PT lock */
+ if (userfaultfd_missing(vma)) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return handle_userfault(vmf, VM_UFFD_MISSING);
+ }
+ goto setpte;
+ }
+
+ /* Allocate our own private page. */
+ if (unlikely(anon_vma_prepare(vma)))
+ goto oom;
+ page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
+ if (!page)
+ goto oom;
+
+ if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
+ goto oom_free_page;
+ cgroup_throttle_swaprate(page, GFP_KERNEL);
+
+ /*
+ * The memory barrier inside __SetPageUptodate makes sure that
+ * preceding stores to the page contents become visible before
+ * the set_pte_at() write.
+ */
+ __SetPageUptodate(page);
+
+ entry = mk_pte(page, vma->vm_page_prot);
+ entry = pte_sw_mkyoung(entry);
+ if (vma->vm_flags & VM_WRITE)
+ entry = pte_mkwrite(pte_mkdirty(entry));
+
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
+ if (!pte_none(*vmf->pte)) {
+ update_mmu_cache(vma, vmf->address, vmf->pte);
+ goto release;
+ }
+
+ ret = check_stable_address_space(vma->vm_mm);
+ if (ret)
+ goto release;
+
+ /* Deliver the page fault to userland, check inside PT lock */
+ if (userfaultfd_missing(vma)) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ put_page(page);
+ return handle_userfault(vmf, VM_UFFD_MISSING);
+ }
+
+ inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
+ page_add_new_anon_rmap(page, vma, vmf->address, false);
+ lru_cache_add_inactive_or_unevictable(page, vma);
+setpte:
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
+
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(vma, vmf->address, vmf->pte);
+unlock:
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return ret;
+release:
+ put_page(page);
+ goto unlock;
+oom_free_page:
+ put_page(page);
+oom:
+ return VM_FAULT_OOM;
+}
+
+/*
+ * The mmap_lock must have been held on entry, and may have been
+ * released depending on flags and vma->vm_ops->fault() return value.
+ * See filemap_fault() and __lock_page_retry().
+ */
+static vm_fault_t __do_fault(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ vm_fault_t ret;
+
+ /*
+ * Preallocate pte before we take page_lock because this might lead to
+ * deadlocks for memcg reclaim which waits for pages under writeback:
+ * lock_page(A)
+ * SetPageWriteback(A)
+ * unlock_page(A)
+ * lock_page(B)
+ * lock_page(B)
+ * pte_alloc_one
+ * shrink_page_list
+ * wait_on_page_writeback(A)
+ * SetPageWriteback(B)
+ * unlock_page(B)
+ * # flush A, B to clear the writeback
+ */
+ if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
+ vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
+ if (!vmf->prealloc_pte)
+ return VM_FAULT_OOM;
+ smp_wmb(); /* See comment in __pte_alloc() */
+ }
+
+ ret = vma->vm_ops->fault(vmf);
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
+ VM_FAULT_DONE_COW)))
+ return ret;
+
+ if (unlikely(PageHWPoison(vmf->page))) {
+ struct page *page = vmf->page;
+ vm_fault_t poisonret = VM_FAULT_HWPOISON;
+ if (ret & VM_FAULT_LOCKED) {
+ if (page_mapped(page))
+ unmap_mapping_pages(page_mapping(page),
+ page->index, 1, false);
+ /* Retry if a clean page was removed from the cache. */
+ if (invalidate_inode_page(page))
+ poisonret = VM_FAULT_NOPAGE;
+ unlock_page(page);
+ }
+ put_page(page);
+ vmf->page = NULL;
+ return poisonret;
+ }
+
+ if (unlikely(!(ret & VM_FAULT_LOCKED)))
+ lock_page(vmf->page);
+ else
+ VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
+
+ return ret;
+}
+
+/*
+ * The ordering of these checks is important for pmds with _PAGE_DEVMAP set.
+ * If we check pmd_trans_unstable() first we will trip the bad_pmd() check
+ * inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly
+ * returning 1 but not before it spams dmesg with the pmd_clear_bad() output.
+ */
+static int pmd_devmap_trans_unstable(pmd_t *pmd)
+{
+ return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
+}
+
+static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+
+ if (!pmd_none(*vmf->pmd))
+ goto map_pte;
+ if (vmf->prealloc_pte) {
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_none(*vmf->pmd))) {
+ spin_unlock(vmf->ptl);
+ goto map_pte;
+ }
+
+ mm_inc_nr_ptes(vma->vm_mm);
+ pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
+ spin_unlock(vmf->ptl);
+ vmf->prealloc_pte = NULL;
+ } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) {
+ return VM_FAULT_OOM;
+ }
+map_pte:
+ /*
+ * If a huge pmd materialized under us just retry later. Use
+ * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead of
+ * pmd_trans_huge() to ensure the pmd didn't become pmd_trans_huge
+ * under us and then back to pmd_none, as a result of MADV_DONTNEED
+ * running immediately after a huge pmd fault in a different thread of
+ * this mm, in turn leading to a misleading pmd_trans_huge() retval.
+ * All we have to ensure is that it is a regular pmd that we can walk
+ * with pte_offset_map() and we can do that through an atomic read in
+ * C, which is what pmd_trans_unstable() provides.
+ */
+ if (pmd_devmap_trans_unstable(vmf->pmd))
+ return VM_FAULT_NOPAGE;
+
+ /*
+ * At this point we know that our vmf->pmd points to a page of ptes
+ * and it cannot become pmd_none(), pmd_devmap() or pmd_trans_huge()
+ * for the duration of the fault. If a racing MADV_DONTNEED runs and
+ * we zap the ptes pointed to by our vmf->pmd, the vmf->ptl will still
+ * be valid and we will re-check to make sure the vmf->pte isn't
+ * pte_none() under vmf->ptl protection when we return to
+ * alloc_set_pte().
+ */
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
+ return 0;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void deposit_prealloc_pte(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+
+ pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
+ /*
+ * We are going to consume the prealloc table,
+ * count that as nr_ptes.
+ */
+ mm_inc_nr_ptes(vma->vm_mm);
+ vmf->prealloc_pte = NULL;
+}
+
+static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+ pmd_t entry;
+ int i;
+ vm_fault_t ret = VM_FAULT_FALLBACK;
+
+ if (!transhuge_vma_suitable(vma, haddr))
+ return ret;
+
+ page = compound_head(page);
+ if (compound_order(page) != HPAGE_PMD_ORDER)
+ return ret;
+
+ /*
+ * Archs like ppc64 need additonal space to store information
+ * related to pte entry. Use the preallocated table for that.
+ */
+ if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
+ vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
+ if (!vmf->prealloc_pte)
+ return VM_FAULT_OOM;
+ smp_wmb(); /* See comment in __pte_alloc() */
+ }
+
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_none(*vmf->pmd)))
+ goto out;
+
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ flush_icache_page(vma, page + i);
+
+ entry = mk_huge_pmd(page, vma->vm_page_prot);
+ if (write)
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+
+ add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
+ page_add_file_rmap(page, true);
+ /*
+ * deposit and withdraw with pmd lock held
+ */
+ if (arch_needs_pgtable_deposit())
+ deposit_prealloc_pte(vmf);
+
+ set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
+
+ update_mmu_cache_pmd(vma, haddr, vmf->pmd);
+
+ /* fault is handled */
+ ret = 0;
+ count_vm_event(THP_FILE_MAPPED);
+out:
+ spin_unlock(vmf->ptl);
+ return ret;
+}
+#else
+static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
+{
+ BUILD_BUG();
+ return 0;
+}
+#endif
+
+/**
+ * alloc_set_pte - setup new PTE entry for given page and add reverse page
+ * mapping. If needed, the function allocates page table or use pre-allocated.
+ *
+ * @vmf: fault environment
+ * @page: page to map
+ *
+ * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
+ * return.
+ *
+ * Target users are page handler itself and implementations of
+ * vm_ops->map_pages.
+ *
+ * Return: %0 on success, %VM_FAULT_ code in case of error.
+ */
+vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
+ pte_t entry;
+ vm_fault_t ret;
+
+ if (pmd_none(*vmf->pmd) && PageTransCompound(page)) {
+ ret = do_set_pmd(vmf, page);
+ if (ret != VM_FAULT_FALLBACK)
+ return ret;
+ }
+
+ if (!vmf->pte) {
+ ret = pte_alloc_one_map(vmf);
+ if (ret)
+ return ret;
+ }
+
+ /* Re-check under ptl */
+ if (unlikely(!pte_none(*vmf->pte))) {
+ update_mmu_tlb(vma, vmf->address, vmf->pte);
+ return VM_FAULT_NOPAGE;
+ }
+
+ flush_icache_page(vma, page);
+ entry = mk_pte(page, vma->vm_page_prot);
+ entry = pte_sw_mkyoung(entry);
+ if (write)
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ /* copy-on-write page */
+ if (write && !(vma->vm_flags & VM_SHARED)) {
+ inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
+ page_add_new_anon_rmap(page, vma, vmf->address, false);
+ lru_cache_add_inactive_or_unevictable(page, vma);
+ } else {
+ inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
+ page_add_file_rmap(page, false);
+ }
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
+
+ /* no need to invalidate: a not-present page won't be cached */
+ update_mmu_cache(vma, vmf->address, vmf->pte);
+
+ return 0;
+}
+
+
+/**
+ * finish_fault - finish page fault once we have prepared the page to fault
+ *
+ * @vmf: structure describing the fault
+ *
+ * This function handles all that is needed to finish a page fault once the
+ * page to fault in is prepared. It handles locking of PTEs, inserts PTE for
+ * given page, adds reverse page mapping, handles memcg charges and LRU
+ * addition.
+ *
+ * The function expects the page to be locked and on success it consumes a
+ * reference of a page being mapped (for the PTE which maps it).
+ *
+ * Return: %0 on success, %VM_FAULT_ code in case of error.
+ */
+vm_fault_t finish_fault(struct vm_fault *vmf)
+{
+ struct page *page;
+ vm_fault_t ret = 0;
+
+ /* Did we COW the page? */
+ if ((vmf->flags & FAULT_FLAG_WRITE) &&
+ !(vmf->vma->vm_flags & VM_SHARED))
+ page = vmf->cow_page;
+ else
+ page = vmf->page;
+
+ /*
+ * check even for read faults because we might have lost our CoWed
+ * page
+ */
+ if (!(vmf->vma->vm_flags & VM_SHARED))
+ ret = check_stable_address_space(vmf->vma->vm_mm);
+ if (!ret)
+ ret = alloc_set_pte(vmf, page);
+ if (vmf->pte)
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return ret;
+}
+
+static unsigned long fault_around_bytes __read_mostly =
+ rounddown_pow_of_two(65536);
+
+#ifdef CONFIG_DEBUG_FS
+static int fault_around_bytes_get(void *data, u64 *val)
+{
+ *val = fault_around_bytes;
+ return 0;
+}
+
+/*
+ * fault_around_bytes must be rounded down to the nearest page order as it's
+ * what do_fault_around() expects to see.
+ */
+static int fault_around_bytes_set(void *data, u64 val)
+{
+ if (val / PAGE_SIZE > PTRS_PER_PTE)
+ return -EINVAL;
+ if (val > PAGE_SIZE)
+ fault_around_bytes = rounddown_pow_of_two(val);
+ else
+ fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */
+ return 0;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
+ fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
+
+static int __init fault_around_debugfs(void)
+{
+ debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
+ &fault_around_bytes_fops);
+ return 0;
+}
+late_initcall(fault_around_debugfs);
+#endif
+
+/*
+ * do_fault_around() tries to map few pages around the fault address. The hope
+ * is that the pages will be needed soon and this will lower the number of
+ * faults to handle.
+ *
+ * It uses vm_ops->map_pages() to map the pages, which skips the page if it's
+ * not ready to be mapped: not up-to-date, locked, etc.
+ *
+ * This function is called with the page table lock taken. In the split ptlock
+ * case the page table lock only protects only those entries which belong to
+ * the page table corresponding to the fault address.
+ *
+ * This function doesn't cross the VMA boundaries, in order to call map_pages()
+ * only once.
+ *
+ * fault_around_bytes defines how many bytes we'll try to map.
+ * do_fault_around() expects it to be set to a power of two less than or equal
+ * to PTRS_PER_PTE.
+ *
+ * The virtual address of the area that we map is naturally aligned to
+ * fault_around_bytes rounded down to the machine page size
+ * (and therefore to page order). This way it's easier to guarantee
+ * that we don't cross page table boundaries.
+ */
+static vm_fault_t do_fault_around(struct vm_fault *vmf)
+{
+ unsigned long address = vmf->address, nr_pages, mask;
+ pgoff_t start_pgoff = vmf->pgoff;
+ pgoff_t end_pgoff;
+ int off;
+ vm_fault_t ret = 0;
+
+ nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
+ mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
+
+ vmf->address = max(address & mask, vmf->vma->vm_start);
+ off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
+ start_pgoff -= off;
+
+ /*
+ * end_pgoff is either the end of the page table, the end of
+ * the vma or nr_pages from start_pgoff, depending what is nearest.
+ */
+ end_pgoff = start_pgoff -
+ ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
+ PTRS_PER_PTE - 1;
+ end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
+ start_pgoff + nr_pages - 1);
+
+ if (pmd_none(*vmf->pmd)) {
+ vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
+ if (!vmf->prealloc_pte)
+ goto out;
+ smp_wmb(); /* See comment in __pte_alloc() */
+ }
+
+ vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
+
+ /* Huge page is mapped? Page fault is solved */
+ if (pmd_trans_huge(*vmf->pmd)) {
+ ret = VM_FAULT_NOPAGE;
+ goto out;
+ }
+
+ /* ->map_pages() haven't done anything useful. Cold page cache? */
+ if (!vmf->pte)
+ goto out;
+
+ /* check if the page fault is solved */
+ vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
+ if (!pte_none(*vmf->pte))
+ ret = VM_FAULT_NOPAGE;
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+out:
+ vmf->address = address;
+ vmf->pte = NULL;
+ return ret;
+}
+
+static vm_fault_t do_read_fault(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ vm_fault_t ret = 0;
+
+ /*
+ * Let's call ->map_pages() first and use ->fault() as fallback
+ * if page by the offset is not ready to be mapped (cold cache or
+ * something).
+ */
+ if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
+ ret = do_fault_around(vmf);
+ if (ret)
+ return ret;
+ }
+
+ ret = __do_fault(vmf);
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+ return ret;
+
+ ret |= finish_fault(vmf);
+ unlock_page(vmf->page);
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+ put_page(vmf->page);
+ return ret;
+}
+
+static vm_fault_t do_cow_fault(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ vm_fault_t ret;
+
+ if (unlikely(anon_vma_prepare(vma)))
+ return VM_FAULT_OOM;
+
+ vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
+ if (!vmf->cow_page)
+ return VM_FAULT_OOM;
+
+ if (mem_cgroup_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL)) {
+ put_page(vmf->cow_page);
+ return VM_FAULT_OOM;
+ }
+ cgroup_throttle_swaprate(vmf->cow_page, GFP_KERNEL);
+
+ ret = __do_fault(vmf);
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+ goto uncharge_out;
+ if (ret & VM_FAULT_DONE_COW)
+ return ret;
+
+ copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
+ __SetPageUptodate(vmf->cow_page);
+
+ ret |= finish_fault(vmf);
+ unlock_page(vmf->page);
+ put_page(vmf->page);
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+ goto uncharge_out;
+ return ret;
+uncharge_out:
+ put_page(vmf->cow_page);
+ return ret;
+}
+
+static vm_fault_t do_shared_fault(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ vm_fault_t ret, tmp;
+
+ ret = __do_fault(vmf);
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+ return ret;
+
+ /*
+ * Check if the backing address space wants to know that the page is
+ * about to become writable
+ */
+ if (vma->vm_ops->page_mkwrite) {
+ unlock_page(vmf->page);
+ tmp = do_page_mkwrite(vmf);
+ if (unlikely(!tmp ||
+ (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
+ put_page(vmf->page);
+ return tmp;
+ }
+ }
+
+ ret |= finish_fault(vmf);
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
+ VM_FAULT_RETRY))) {
+ unlock_page(vmf->page);
+ put_page(vmf->page);
+ return ret;
+ }
+
+ ret |= fault_dirty_shared_page(vmf);
+ return ret;
+}
+
+/*
+ * We enter with non-exclusive mmap_lock (to exclude vma changes,
+ * but allow concurrent faults).
+ * The mmap_lock may have been released depending on flags and our
+ * return value. See filemap_fault() and __lock_page_or_retry().
+ * If mmap_lock is released, vma may become invalid (for example
+ * by other thread calling munmap()).
+ */
+static vm_fault_t do_fault(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct mm_struct *vm_mm = vma->vm_mm;
+ vm_fault_t ret;
+
+ /*
+ * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
+ */
+ if (!vma->vm_ops->fault) {
+ /*
+ * If we find a migration pmd entry or a none pmd entry, which
+ * should never happen, return SIGBUS
+ */
+ if (unlikely(!pmd_present(*vmf->pmd)))
+ ret = VM_FAULT_SIGBUS;
+ else {
+ vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm,
+ vmf->pmd,
+ vmf->address,
+ &vmf->ptl);
+ /*
+ * Make sure this is not a temporary clearing of pte
+ * by holding ptl and checking again. A R/M/W update
+ * of pte involves: take ptl, clearing the pte so that
+ * we don't have concurrent modification by hardware
+ * followed by an update.
+ */
+ if (unlikely(pte_none(*vmf->pte)))
+ ret = VM_FAULT_SIGBUS;
+ else
+ ret = VM_FAULT_NOPAGE;
+
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ }
+ } else if (!(vmf->flags & FAULT_FLAG_WRITE))
+ ret = do_read_fault(vmf);
+ else if (!(vma->vm_flags & VM_SHARED))
+ ret = do_cow_fault(vmf);
+ else
+ ret = do_shared_fault(vmf);
+
+ /* preallocated pagetable is unused: free it */
+ if (vmf->prealloc_pte) {
+ pte_free(vm_mm, vmf->prealloc_pte);
+ vmf->prealloc_pte = NULL;
+ }
+ return ret;
+}
+
+static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+ unsigned long addr, int page_nid,
+ int *flags)
+{
+ get_page(page);
+
+ count_vm_numa_event(NUMA_HINT_FAULTS);
+ if (page_nid == numa_node_id()) {
+ count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+ *flags |= TNF_FAULT_LOCAL;
+ }
+
+ return mpol_misplaced(page, vma, addr);
+}
+
+static vm_fault_t do_numa_page(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct page *page = NULL;
+ int page_nid = NUMA_NO_NODE;
+ int last_cpupid;
+ int target_nid;
+ bool migrated = false;
+ pte_t pte, old_pte;
+ bool was_writable = pte_savedwrite(vmf->orig_pte);
+ int flags = 0;
+
+ /*
+ * The "pte" at this point cannot be used safely without
+ * validation through pte_unmap_same(). It's of NUMA type but
+ * the pfn may be screwed if the read is non atomic.
+ */
+ vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
+ spin_lock(vmf->ptl);
+ if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ goto out;
+ }
+
+ /*
+ * Make it present again, Depending on how arch implementes non
+ * accessible ptes, some can allow access by kernel mode.
+ */
+ old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
+ pte = pte_modify(old_pte, vma->vm_page_prot);
+ pte = pte_mkyoung(pte);
+ if (was_writable)
+ pte = pte_mkwrite(pte);
+ ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
+ update_mmu_cache(vma, vmf->address, vmf->pte);
+
+ page = vm_normal_page(vma, vmf->address, pte);
+ if (!page) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return 0;
+ }
+
+ /* TODO: handle PTE-mapped THP */
+ if (PageCompound(page)) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return 0;
+ }
+
+ /*
+ * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
+ * much anyway since they can be in shared cache state. This misses
+ * the case where a mapping is writable but the process never writes
+ * to it but pte_write gets cleared during protection updates and
+ * pte_dirty has unpredictable behaviour between PTE scan updates,
+ * background writeback, dirty balancing and application behaviour.
+ */
+ if (!pte_write(pte))
+ flags |= TNF_NO_GROUP;
+
+ /*
+ * Flag if the page is shared between multiple address spaces. This
+ * is later used when determining whether to group tasks together
+ */
+ if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
+ flags |= TNF_SHARED;
+
+ last_cpupid = page_cpupid_last(page);
+ page_nid = page_to_nid(page);
+ target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
+ &flags);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ if (target_nid == NUMA_NO_NODE) {
+ put_page(page);
+ goto out;
+ }
+
+ /* Migrate to the requested node */
+ migrated = migrate_misplaced_page(page, vma, target_nid);
+ if (migrated) {
+ page_nid = target_nid;
+ flags |= TNF_MIGRATED;
+ } else
+ flags |= TNF_MIGRATE_FAIL;
+
+out:
+ if (page_nid != NUMA_NO_NODE)
+ task_numa_fault(last_cpupid, page_nid, 1, flags);
+ return 0;
+}
+
+static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
+{
+ if (vma_is_anonymous(vmf->vma))
+ return do_huge_pmd_anonymous_page(vmf);
+ if (vmf->vma->vm_ops->huge_fault)
+ return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
+ return VM_FAULT_FALLBACK;
+}
+
+/* `inline' is required to avoid gcc 4.1.2 build error */
+static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
+{
+ if (vma_is_anonymous(vmf->vma)) {
+ if (userfaultfd_huge_pmd_wp(vmf->vma, orig_pmd))
+ return handle_userfault(vmf, VM_UFFD_WP);
+ return do_huge_pmd_wp_page(vmf, orig_pmd);
+ }
+ if (vmf->vma->vm_ops->huge_fault) {
+ vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
+
+ if (!(ret & VM_FAULT_FALLBACK))
+ return ret;
+ }
+
+ /* COW or write-notify handled on pte level: split pmd. */
+ __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
+
+ return VM_FAULT_FALLBACK;
+}
+
+static vm_fault_t create_huge_pud(struct vm_fault *vmf)
+{
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
+ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+ /* No support for anonymous transparent PUD pages yet */
+ if (vma_is_anonymous(vmf->vma))
+ return VM_FAULT_FALLBACK;
+ if (vmf->vma->vm_ops->huge_fault)
+ return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+ return VM_FAULT_FALLBACK;
+}
+
+static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
+{
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
+ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+ /* No support for anonymous transparent PUD pages yet */
+ if (vma_is_anonymous(vmf->vma))
+ goto split;
+ if (vmf->vma->vm_ops->huge_fault) {
+ vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
+
+ if (!(ret & VM_FAULT_FALLBACK))
+ return ret;
+ }
+split:
+ /* COW or write-notify not handled on PUD level: split pud.*/
+ __split_huge_pud(vmf->vma, vmf->pud, vmf->address);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+ return VM_FAULT_FALLBACK;
+}
+
+/*
+ * These routines also need to handle stuff like marking pages dirty
+ * and/or accessed for architectures that don't do it in hardware (most
+ * RISC architectures). The early dirtying is also good on the i386.
+ *
+ * There is also a hook called "update_mmu_cache()" that architectures
+ * with external mmu caches can use to update those (ie the Sparc or
+ * PowerPC hashed page tables that act as extended TLBs).
+ *
+ * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow
+ * concurrent faults).
+ *
+ * The mmap_lock may have been released depending on flags and our return value.
+ * See filemap_fault() and __lock_page_or_retry().
+ */
+static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
+{
+ pte_t entry;
+
+ if (unlikely(pmd_none(*vmf->pmd))) {
+ /*
+ * Leave __pte_alloc() until later: because vm_ops->fault may
+ * want to allocate huge page, and if we expose page table
+ * for an instant, it will be difficult to retract from
+ * concurrent faults and from rmap lookups.
+ */
+ vmf->pte = NULL;
+ } else {
+ /* See comment in pte_alloc_one_map() */
+ if (pmd_devmap_trans_unstable(vmf->pmd))
+ return 0;
+ /*
+ * A regular pmd is established and it can't morph into a huge
+ * pmd from under us anymore at this point because we hold the
+ * mmap_lock read mode and khugepaged takes it in write mode.
+ * So now it's safe to run pte_offset_map().
+ */
+ vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
+ vmf->orig_pte = *vmf->pte;
+
+ /*
+ * some architectures can have larger ptes than wordsize,
+ * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and
+ * CONFIG_32BIT=y, so READ_ONCE cannot guarantee atomic
+ * accesses. The code below just needs a consistent view
+ * for the ifs and we later double check anyway with the
+ * ptl lock held. So here a barrier will do.
+ */
+ barrier();
+ if (pte_none(vmf->orig_pte)) {
+ pte_unmap(vmf->pte);
+ vmf->pte = NULL;
+ }
+ }
+
+ if (!vmf->pte) {
+ if (vma_is_anonymous(vmf->vma))
+ return do_anonymous_page(vmf);
+ else
+ return do_fault(vmf);
+ }
+
+ if (!pte_present(vmf->orig_pte))
+ return do_swap_page(vmf);
+
+ if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
+ return do_numa_page(vmf);
+
+ vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
+ spin_lock(vmf->ptl);
+ entry = vmf->orig_pte;
+ if (unlikely(!pte_same(*vmf->pte, entry))) {
+ update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
+ goto unlock;
+ }
+ if (vmf->flags & FAULT_FLAG_WRITE) {
+ if (!pte_write(entry))
+ return do_wp_page(vmf);
+ entry = pte_mkdirty(entry);
+ }
+ entry = pte_mkyoung(entry);
+ if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
+ vmf->flags & FAULT_FLAG_WRITE)) {
+ update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
+ } else {
+ /* Skip spurious TLB flush for retried page fault */
+ if (vmf->flags & FAULT_FLAG_TRIED)
+ goto unlock;
+ /*
+ * This is needed only for protection faults but the arch code
+ * is not yet telling us if this is a protection fault or not.
+ * This still avoids useless tlb flushes for .text page faults
+ * with threads.
+ */
+ if (vmf->flags & FAULT_FLAG_WRITE)
+ flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
+ }
+unlock:
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return 0;
+}
+
+/*
+ * By the time we get here, we already hold the mm semaphore
+ *
+ * The mmap_lock may have been released depending on flags and our
+ * return value. See filemap_fault() and __lock_page_or_retry().
+ */
+static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags)
+{
+ struct vm_fault vmf = {
+ .vma = vma,
+ .address = address & PAGE_MASK,
+ .flags = flags,
+ .pgoff = linear_page_index(vma, address),
+ .gfp_mask = __get_fault_gfp_mask(vma),
+ };
+ unsigned int dirty = flags & FAULT_FLAG_WRITE;
+ struct mm_struct *mm = vma->vm_mm;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ vm_fault_t ret;
+
+ pgd = pgd_offset(mm, address);
+ p4d = p4d_alloc(mm, pgd, address);
+ if (!p4d)
+ return VM_FAULT_OOM;
+
+ vmf.pud = pud_alloc(mm, p4d, address);
+ if (!vmf.pud)
+ return VM_FAULT_OOM;
+retry_pud:
+ if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) {
+ ret = create_huge_pud(&vmf);
+ if (!(ret & VM_FAULT_FALLBACK))
+ return ret;
+ } else {
+ pud_t orig_pud = *vmf.pud;
+
+ barrier();
+ if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
+
+ /* NUMA case for anonymous PUDs would go here */
+
+ if (dirty && !pud_write(orig_pud)) {
+ ret = wp_huge_pud(&vmf, orig_pud);
+ if (!(ret & VM_FAULT_FALLBACK))
+ return ret;
+ } else {
+ huge_pud_set_accessed(&vmf, orig_pud);
+ return 0;
+ }
+ }
+ }
+
+ vmf.pmd = pmd_alloc(mm, vmf.pud, address);
+ if (!vmf.pmd)
+ return VM_FAULT_OOM;
+
+ /* Huge pud page fault raced with pmd_alloc? */
+ if (pud_trans_unstable(vmf.pud))
+ goto retry_pud;
+
+ if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
+ ret = create_huge_pmd(&vmf);
+ if (!(ret & VM_FAULT_FALLBACK))
+ return ret;
+ } else {
+ pmd_t orig_pmd = *vmf.pmd;
+
+ barrier();
+ if (unlikely(is_swap_pmd(orig_pmd))) {
+ VM_BUG_ON(thp_migration_supported() &&
+ !is_pmd_migration_entry(orig_pmd));
+ if (is_pmd_migration_entry(orig_pmd))
+ pmd_migration_entry_wait(mm, vmf.pmd);
+ return 0;
+ }
+ if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
+ if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
+ return do_huge_pmd_numa_page(&vmf, orig_pmd);
+
+ if (dirty && !pmd_write(orig_pmd)) {
+ ret = wp_huge_pmd(&vmf, orig_pmd);
+ if (!(ret & VM_FAULT_FALLBACK))
+ return ret;
+ } else {
+ huge_pmd_set_accessed(&vmf, orig_pmd);
+ return 0;
+ }
+ }
+ }
+
+ return handle_pte_fault(&vmf);
+}
+
+/**
+ * mm_account_fault - Do page fault accountings
+ *
+ * @regs: the pt_regs struct pointer. When set to NULL, will skip accounting
+ * of perf event counters, but we'll still do the per-task accounting to
+ * the task who triggered this page fault.
+ * @address: the faulted address.
+ * @flags: the fault flags.
+ * @ret: the fault retcode.
+ *
+ * This will take care of most of the page fault accountings. Meanwhile, it
+ * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter
+ * updates. However note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
+ * still be in per-arch page fault handlers at the entry of page fault.
+ */
+static inline void mm_account_fault(struct pt_regs *regs,
+ unsigned long address, unsigned int flags,
+ vm_fault_t ret)
+{
+ bool major;
+
+ /*
+ * We don't do accounting for some specific faults:
+ *
+ * - Unsuccessful faults (e.g. when the address wasn't valid). That
+ * includes arch_vma_access_permitted() failing before reaching here.
+ * So this is not a "this many hardware page faults" counter. We
+ * should use the hw profiling for that.
+ *
+ * - Incomplete faults (VM_FAULT_RETRY). They will only be counted
+ * once they're completed.
+ */
+ if (ret & (VM_FAULT_ERROR | VM_FAULT_RETRY))
+ return;
+
+ /*
+ * We define the fault as a major fault when the final successful fault
+ * is VM_FAULT_MAJOR, or if it retried (which implies that we couldn't
+ * handle it immediately previously).
+ */
+ major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED);
+
+ if (major)
+ current->maj_flt++;
+ else
+ current->min_flt++;
+
+ /*
+ * If the fault is done for GUP, regs will be NULL. We only do the
+ * accounting for the per thread fault counters who triggered the
+ * fault, and we skip the perf event updates.
+ */
+ if (!regs)
+ return;
+
+ if (major)
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
+ else
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
+}
+
+/*
+ * By the time we get here, we already hold the mm semaphore
+ *
+ * The mmap_lock may have been released depending on flags and our
+ * return value. See filemap_fault() and __lock_page_or_retry().
+ */
+vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
+ unsigned int flags, struct pt_regs *regs)
+{
+ vm_fault_t ret;
+
+ __set_current_state(TASK_RUNNING);
+
+ count_vm_event(PGFAULT);
+ count_memcg_event_mm(vma->vm_mm, PGFAULT);
+
+ /* do counter updates before entering really critical section. */
+ check_sync_rss_stat(current);
+
+ if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
+ flags & FAULT_FLAG_INSTRUCTION,
+ flags & FAULT_FLAG_REMOTE))
+ return VM_FAULT_SIGSEGV;
+
+ /*
+ * Enable the memcg OOM handling for faults triggered in user
+ * space. Kernel faults are handled more gracefully.
+ */
+ if (flags & FAULT_FLAG_USER)
+ mem_cgroup_enter_user_fault();
+
+ if (unlikely(is_vm_hugetlb_page(vma)))
+ ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
+ else
+ ret = __handle_mm_fault(vma, address, flags);
+
+ if (flags & FAULT_FLAG_USER) {
+ mem_cgroup_exit_user_fault();
+ /*
+ * The task may have entered a memcg OOM situation but
+ * if the allocation error was handled gracefully (no
+ * VM_FAULT_OOM), there is no need to kill anything.
+ * Just clean up the OOM state peacefully.
+ */
+ if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
+ mem_cgroup_oom_synchronize(false);
+ }
+
+ mm_account_fault(regs, address, flags, ret);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(handle_mm_fault);
+
+#ifndef __PAGETABLE_P4D_FOLDED
+/*
+ * Allocate p4d page table.
+ * We've already handled the fast-path in-line.
+ */
+int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
+{
+ p4d_t *new = p4d_alloc_one(mm, address);
+ if (!new)
+ return -ENOMEM;
+
+ smp_wmb(); /* See comment in __pte_alloc */
+
+ spin_lock(&mm->page_table_lock);
+ if (pgd_present(*pgd)) /* Another has populated it */
+ p4d_free(mm, new);
+ else
+ pgd_populate(mm, pgd, new);
+ spin_unlock(&mm->page_table_lock);
+ return 0;
+}
+#endif /* __PAGETABLE_P4D_FOLDED */
+
+#ifndef __PAGETABLE_PUD_FOLDED
+/*
+ * Allocate page upper directory.
+ * We've already handled the fast-path in-line.
+ */
+int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
+{
+ pud_t *new = pud_alloc_one(mm, address);
+ if (!new)
+ return -ENOMEM;
+
+ smp_wmb(); /* See comment in __pte_alloc */
+
+ spin_lock(&mm->page_table_lock);
+ if (!p4d_present(*p4d)) {
+ mm_inc_nr_puds(mm);
+ p4d_populate(mm, p4d, new);
+ } else /* Another has populated it */
+ pud_free(mm, new);
+ spin_unlock(&mm->page_table_lock);
+ return 0;
+}
+#endif /* __PAGETABLE_PUD_FOLDED */
+
+#ifndef __PAGETABLE_PMD_FOLDED
+/*
+ * Allocate page middle directory.
+ * We've already handled the fast-path in-line.
+ */
+int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
+{
+ spinlock_t *ptl;
+ pmd_t *new = pmd_alloc_one(mm, address);
+ if (!new)
+ return -ENOMEM;
+
+ smp_wmb(); /* See comment in __pte_alloc */
+
+ ptl = pud_lock(mm, pud);
+ if (!pud_present(*pud)) {
+ mm_inc_nr_pmds(mm);
+ pud_populate(mm, pud, new);
+ } else /* Another has populated it */
+ pmd_free(mm, new);
+ spin_unlock(ptl);
+ return 0;
+}
+#endif /* __PAGETABLE_PMD_FOLDED */
+
+int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
+ struct mmu_notifier_range *range, pte_t **ptepp,
+ pmd_t **pmdpp, spinlock_t **ptlp)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *ptep;
+
+ pgd = pgd_offset(mm, address);
+ if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+ goto out;
+
+ p4d = p4d_offset(pgd, address);
+ if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
+ goto out;
+
+ pud = pud_offset(p4d, address);
+ if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+ goto out;
+
+ pmd = pmd_offset(pud, address);
+ VM_BUG_ON(pmd_trans_huge(*pmd));
+
+ if (pmd_huge(*pmd)) {
+ if (!pmdpp)
+ goto out;
+
+ if (range) {
+ mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0,
+ NULL, mm, address & PMD_MASK,
+ (address & PMD_MASK) + PMD_SIZE);
+ mmu_notifier_invalidate_range_start(range);
+ }
+ *ptlp = pmd_lock(mm, pmd);
+ if (pmd_huge(*pmd)) {
+ *pmdpp = pmd;
+ return 0;
+ }
+ spin_unlock(*ptlp);
+ if (range)
+ mmu_notifier_invalidate_range_end(range);
+ }
+
+ if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+ goto out;
+
+ if (range) {
+ mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
+ address & PAGE_MASK,
+ (address & PAGE_MASK) + PAGE_SIZE);
+ mmu_notifier_invalidate_range_start(range);
+ }
+ ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
+ if (!pte_present(*ptep))
+ goto unlock;
+ *ptepp = ptep;
+ return 0;
+unlock:
+ pte_unmap_unlock(ptep, *ptlp);
+ if (range)
+ mmu_notifier_invalidate_range_end(range);
+out:
+ return -EINVAL;
+}
+
+/**
+ * follow_pte - look up PTE at a user virtual address
+ * @mm: the mm_struct of the target address space
+ * @address: user virtual address
+ * @ptepp: location to store found PTE
+ * @ptlp: location to store the lock for the PTE
+ *
+ * On a successful return, the pointer to the PTE is stored in @ptepp;
+ * the corresponding lock is taken and its location is stored in @ptlp.
+ * The contents of the PTE are only stable until @ptlp is released;
+ * any further use, if any, must be protected against invalidation
+ * with MMU notifiers.
+ *
+ * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore
+ * should be taken for read.
+ *
+ * KVM uses this function. While it is arguably less bad than ``follow_pfn``,
+ * it is not a good general-purpose API.
+ *
+ * Return: zero on success, -ve otherwise.
+ */
+int follow_pte(struct mm_struct *mm, unsigned long address,
+ pte_t **ptepp, spinlock_t **ptlp)
+{
+ return follow_invalidate_pte(mm, address, NULL, ptepp, NULL, ptlp);
+}
+EXPORT_SYMBOL_GPL(follow_pte);
+
+/**
+ * follow_pfn - look up PFN at a user virtual address
+ * @vma: memory mapping
+ * @address: user virtual address
+ * @pfn: location to store found PFN
+ *
+ * Only IO mappings and raw PFN mappings are allowed.
+ *
+ * This function does not allow the caller to read the permissions
+ * of the PTE. Do not use it.
+ *
+ * Return: zero and the pfn at @pfn on success, -ve otherwise.
+ */
+int follow_pfn(struct vm_area_struct *vma, unsigned long address,
+ unsigned long *pfn)
+{
+ int ret = -EINVAL;
+ spinlock_t *ptl;
+ pte_t *ptep;
+
+ if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+ return ret;
+
+ ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
+ if (ret)
+ return ret;
+ *pfn = pte_pfn(*ptep);
+ pte_unmap_unlock(ptep, ptl);
+ return 0;
+}
+EXPORT_SYMBOL(follow_pfn);
+
+#ifdef CONFIG_HAVE_IOREMAP_PROT
+int follow_phys(struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags,
+ unsigned long *prot, resource_size_t *phys)
+{
+ int ret = -EINVAL;
+ pte_t *ptep, pte;
+ spinlock_t *ptl;
+
+ if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+ goto out;
+
+ if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
+ goto out;
+ pte = *ptep;
+
+ if ((flags & FOLL_WRITE) && !pte_write(pte))
+ goto unlock;
+
+ *prot = pgprot_val(pte_pgprot(pte));
+ *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
+
+ ret = 0;
+unlock:
+ pte_unmap_unlock(ptep, ptl);
+out:
+ return ret;
+}
+
+int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
+ void *buf, int len, int write)
+{
+ resource_size_t phys_addr;
+ unsigned long prot = 0;
+ void __iomem *maddr;
+ int offset = addr & (PAGE_SIZE-1);
+
+ if (follow_phys(vma, addr, write, &prot, &phys_addr))
+ return -EINVAL;
+
+ maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
+ if (!maddr)
+ return -ENOMEM;
+
+ if (write)
+ memcpy_toio(maddr + offset, buf, len);
+ else
+ memcpy_fromio(buf, maddr + offset, len);
+ iounmap(maddr);
+
+ return len;
+}
+EXPORT_SYMBOL_GPL(generic_access_phys);
+#endif
+
+/*
+ * Access another process' address space as given in mm. If non-NULL, use the
+ * given task for page fault accounting.
+ */
+int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long addr, void *buf, int len, unsigned int gup_flags)
+{
+ struct vm_area_struct *vma;
+ void *old_buf = buf;
+ int write = gup_flags & FOLL_WRITE;
+
+ if (mmap_read_lock_killable(mm))
+ return 0;
+
+ /* ignore errors, just check how much was successfully transferred */
+ while (len) {
+ int bytes, ret, offset;
+ void *maddr;
+ struct page *page = NULL;
+
+ ret = get_user_pages_remote(mm, addr, 1,
+ gup_flags, &page, &vma, NULL);
+ if (ret <= 0) {
+#ifndef CONFIG_HAVE_IOREMAP_PROT
+ break;
+#else
+ /*
+ * Check if this is a VM_IO | VM_PFNMAP VMA, which
+ * we can access using slightly different code.
+ */
+ vma = find_vma(mm, addr);
+ if (!vma || vma->vm_start > addr)
+ break;
+ if (vma->vm_ops && vma->vm_ops->access)
+ ret = vma->vm_ops->access(vma, addr, buf,
+ len, write);
+ if (ret <= 0)
+ break;
+ bytes = ret;
+#endif
+ } else {
+ bytes = len;
+ offset = addr & (PAGE_SIZE-1);
+ if (bytes > PAGE_SIZE-offset)
+ bytes = PAGE_SIZE-offset;
+
+ maddr = kmap(page);
+ if (write) {
+ copy_to_user_page(vma, page, addr,
+ maddr + offset, buf, bytes);
+ set_page_dirty_lock(page);
+ } else {
+ copy_from_user_page(vma, page, addr,
+ buf, maddr + offset, bytes);
+ }
+ kunmap(page);
+ put_page(page);
+ }
+ len -= bytes;
+ buf += bytes;
+ addr += bytes;
+ }
+ mmap_read_unlock(mm);
+
+ return buf - old_buf;
+}
+
+/**
+ * access_remote_vm - access another process' address space
+ * @mm: the mm_struct of the target address space
+ * @addr: start address to access
+ * @buf: source or destination buffer
+ * @len: number of bytes to transfer
+ * @gup_flags: flags modifying lookup behaviour
+ *
+ * The caller must hold a reference on @mm.
+ *
+ * Return: number of bytes copied from source to destination.
+ */
+int access_remote_vm(struct mm_struct *mm, unsigned long addr,
+ void *buf, int len, unsigned int gup_flags)
+{
+ return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags);
+}
+
+/*
+ * Access another process' address space.
+ * Source/target buffer must be kernel space,
+ * Do not walk the page table directly, use get_user_pages
+ */
+int access_process_vm(struct task_struct *tsk, unsigned long addr,
+ void *buf, int len, unsigned int gup_flags)
+{
+ struct mm_struct *mm;
+ int ret;
+
+ mm = get_task_mm(tsk);
+ if (!mm)
+ return 0;
+
+ ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags);
+
+ mmput(mm);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(access_process_vm);
+
+/*
+ * Print the name of a VMA.
+ */
+void print_vma_addr(char *prefix, unsigned long ip)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+
+ /*
+ * we might be running from an atomic context so we cannot sleep
+ */
+ if (!mmap_read_trylock(mm))
+ return;
+
+ vma = find_vma(mm, ip);
+ if (vma && vma->vm_file) {
+ struct file *f = vma->vm_file;
+ char *buf = (char *)__get_free_page(GFP_NOWAIT);
+ if (buf) {
+ char *p;
+
+ p = file_path(f, buf, PAGE_SIZE);
+ if (IS_ERR(p))
+ p = "?";
+ printk("%s%s[%lx+%lx]", prefix, kbasename(p),
+ vma->vm_start,
+ vma->vm_end - vma->vm_start);
+ free_page((unsigned long)buf);
+ }
+ }
+ mmap_read_unlock(mm);
+}
+
+#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
+void __might_fault(const char *file, int line)
+{
+ /*
+ * Some code (nfs/sunrpc) uses socket ops on kernel memory while
+ * holding the mmap_lock, this is safe because kernel memory doesn't
+ * get paged out, therefore we'll never actually fault, and the
+ * below annotations will generate false positives.
+ */
+ if (uaccess_kernel())
+ return;
+ if (pagefault_disabled())
+ return;
+ __might_sleep(file, line, 0);
+#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
+ if (current->mm)
+ might_lock_read(&current->mm->mmap_lock);
+#endif
+}
+EXPORT_SYMBOL(__might_fault);
+#endif
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
+/*
+ * Process all subpages of the specified huge page with the specified
+ * operation. The target subpage will be processed last to keep its
+ * cache lines hot.
+ */
+static inline void process_huge_page(
+ unsigned long addr_hint, unsigned int pages_per_huge_page,
+ void (*process_subpage)(unsigned long addr, int idx, void *arg),
+ void *arg)
+{
+ int i, n, base, l;
+ unsigned long addr = addr_hint &
+ ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
+
+ /* Process target subpage last to keep its cache lines hot */
+ might_sleep();
+ n = (addr_hint - addr) / PAGE_SIZE;
+ if (2 * n <= pages_per_huge_page) {
+ /* If target subpage in first half of huge page */
+ base = 0;
+ l = n;
+ /* Process subpages at the end of huge page */
+ for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
+ cond_resched();
+ process_subpage(addr + i * PAGE_SIZE, i, arg);
+ }
+ } else {
+ /* If target subpage in second half of huge page */
+ base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
+ l = pages_per_huge_page - n;
+ /* Process subpages at the begin of huge page */
+ for (i = 0; i < base; i++) {
+ cond_resched();
+ process_subpage(addr + i * PAGE_SIZE, i, arg);
+ }
+ }
+ /*
+ * Process remaining subpages in left-right-left-right pattern
+ * towards the target subpage
+ */
+ for (i = 0; i < l; i++) {
+ int left_idx = base + i;
+ int right_idx = base + 2 * l - 1 - i;
+
+ cond_resched();
+ process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
+ cond_resched();
+ process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
+ }
+}
+
+static void clear_gigantic_page(struct page *page,
+ unsigned long addr,
+ unsigned int pages_per_huge_page)
+{
+ int i;
+ struct page *p = page;
+
+ might_sleep();
+ for (i = 0; i < pages_per_huge_page;
+ i++, p = mem_map_next(p, page, i)) {
+ cond_resched();
+ clear_user_highpage(p, addr + i * PAGE_SIZE);
+ }
+}
+
+static void clear_subpage(unsigned long addr, int idx, void *arg)
+{
+ struct page *page = arg;
+
+ clear_user_highpage(page + idx, addr);
+}
+
+void clear_huge_page(struct page *page,
+ unsigned long addr_hint, unsigned int pages_per_huge_page)
+{
+ unsigned long addr = addr_hint &
+ ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
+
+ if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
+ clear_gigantic_page(page, addr, pages_per_huge_page);
+ return;
+ }
+
+ process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page);
+}
+
+static void copy_user_gigantic_page(struct page *dst, struct page *src,
+ unsigned long addr,
+ struct vm_area_struct *vma,
+ unsigned int pages_per_huge_page)
+{
+ int i;
+ struct page *dst_base = dst;
+ struct page *src_base = src;
+
+ for (i = 0; i < pages_per_huge_page; ) {
+ cond_resched();
+ copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
+
+ i++;
+ dst = mem_map_next(dst, dst_base, i);
+ src = mem_map_next(src, src_base, i);
+ }
+}
+
+struct copy_subpage_arg {
+ struct page *dst;
+ struct page *src;
+ struct vm_area_struct *vma;
+};
+
+static void copy_subpage(unsigned long addr, int idx, void *arg)
+{
+ struct copy_subpage_arg *copy_arg = arg;
+
+ copy_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
+ addr, copy_arg->vma);
+}
+
+void copy_user_huge_page(struct page *dst, struct page *src,
+ unsigned long addr_hint, struct vm_area_struct *vma,
+ unsigned int pages_per_huge_page)
+{
+ unsigned long addr = addr_hint &
+ ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
+ struct copy_subpage_arg arg = {
+ .dst = dst,
+ .src = src,
+ .vma = vma,
+ };
+
+ if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
+ copy_user_gigantic_page(dst, src, addr, vma,
+ pages_per_huge_page);
+ return;
+ }
+
+ process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
+}
+
+long copy_huge_page_from_user(struct page *dst_page,
+ const void __user *usr_src,
+ unsigned int pages_per_huge_page,
+ bool allow_pagefault)
+{
+ void *src = (void *)usr_src;
+ void *page_kaddr;
+ unsigned long i, rc = 0;
+ unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
+ struct page *subpage = dst_page;
+
+ for (i = 0; i < pages_per_huge_page;
+ i++, subpage = mem_map_next(subpage, dst_page, i)) {
+ if (allow_pagefault)
+ page_kaddr = kmap(subpage);
+ else
+ page_kaddr = kmap_atomic(subpage);
+ rc = copy_from_user(page_kaddr,
+ (const void __user *)(src + i * PAGE_SIZE),
+ PAGE_SIZE);
+ if (allow_pagefault)
+ kunmap(subpage);
+ else
+ kunmap_atomic(page_kaddr);
+
+ ret_val -= (PAGE_SIZE - rc);
+ if (rc)
+ break;
+
+ flush_dcache_page(subpage);
+
+ cond_resched();
+ }
+ return ret_val;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
+
+#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
+
+static struct kmem_cache *page_ptl_cachep;
+
+void __init ptlock_cache_init(void)
+{
+ page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
+ SLAB_PANIC, NULL);
+}
+
+bool ptlock_alloc(struct page *page)
+{
+ spinlock_t *ptl;
+
+ ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
+ if (!ptl)
+ return false;
+ page->ptl = ptl;
+ return true;
+}
+
+void ptlock_free(struct page *page)
+{
+ kmem_cache_free(page_ptl_cachep, page->ptl);
+}
+#endif
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
new file mode 100644
index 000000000..553b0705d
--- /dev/null
+++ b/mm/memory_hotplug.c
@@ -0,0 +1,1906 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/mm/memory_hotplug.c
+ *
+ * Copyright (C)
+ */
+
+#include <linux/stddef.h>
+#include <linux/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/swap.h>
+#include <linux/interrupt.h>
+#include <linux/pagemap.h>
+#include <linux/compiler.h>
+#include <linux/export.h>
+#include <linux/pagevec.h>
+#include <linux/writeback.h>
+#include <linux/slab.h>
+#include <linux/sysctl.h>
+#include <linux/cpu.h>
+#include <linux/memory.h>
+#include <linux/memremap.h>
+#include <linux/memory_hotplug.h>
+#include <linux/highmem.h>
+#include <linux/vmalloc.h>
+#include <linux/ioport.h>
+#include <linux/delay.h>
+#include <linux/migrate.h>
+#include <linux/page-isolation.h>
+#include <linux/pfn.h>
+#include <linux/suspend.h>
+#include <linux/mm_inline.h>
+#include <linux/firmware-map.h>
+#include <linux/stop_machine.h>
+#include <linux/hugetlb.h>
+#include <linux/memblock.h>
+#include <linux/compaction.h>
+#include <linux/rmap.h>
+
+#include <asm/tlbflush.h>
+
+#include "internal.h"
+#include "shuffle.h"
+
+/*
+ * online_page_callback contains pointer to current page onlining function.
+ * Initially it is generic_online_page(). If it is required it could be
+ * changed by calling set_online_page_callback() for callback registration
+ * and restore_online_page_callback() for generic callback restore.
+ */
+
+static online_page_callback_t online_page_callback = generic_online_page;
+static DEFINE_MUTEX(online_page_callback_lock);
+
+DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock);
+
+void get_online_mems(void)
+{
+ percpu_down_read(&mem_hotplug_lock);
+}
+
+void put_online_mems(void)
+{
+ percpu_up_read(&mem_hotplug_lock);
+}
+
+bool movable_node_enabled = false;
+
+#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
+int memhp_default_online_type = MMOP_OFFLINE;
+#else
+int memhp_default_online_type = MMOP_ONLINE;
+#endif
+
+static int __init setup_memhp_default_state(char *str)
+{
+ const int online_type = memhp_online_type_from_str(str);
+
+ if (online_type >= 0)
+ memhp_default_online_type = online_type;
+
+ return 1;
+}
+__setup("memhp_default_state=", setup_memhp_default_state);
+
+void mem_hotplug_begin(void)
+{
+ cpus_read_lock();
+ percpu_down_write(&mem_hotplug_lock);
+}
+
+void mem_hotplug_done(void)
+{
+ percpu_up_write(&mem_hotplug_lock);
+ cpus_read_unlock();
+}
+
+u64 max_mem_size = U64_MAX;
+
+/* add this memory to iomem resource */
+static struct resource *register_memory_resource(u64 start, u64 size,
+ const char *resource_name)
+{
+ struct resource *res;
+ unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+
+ if (strcmp(resource_name, "System RAM"))
+ flags |= IORESOURCE_SYSRAM_DRIVER_MANAGED;
+
+ /*
+ * Make sure value parsed from 'mem=' only restricts memory adding
+ * while booting, so that memory hotplug won't be impacted. Please
+ * refer to document of 'mem=' in kernel-parameters.txt for more
+ * details.
+ */
+ if (start + size > max_mem_size && system_state < SYSTEM_RUNNING)
+ return ERR_PTR(-E2BIG);
+
+ /*
+ * Request ownership of the new memory range. This might be
+ * a child of an existing resource that was present but
+ * not marked as busy.
+ */
+ res = __request_region(&iomem_resource, start, size,
+ resource_name, flags);
+
+ if (!res) {
+ pr_debug("Unable to reserve System RAM region: %016llx->%016llx\n",
+ start, start + size);
+ return ERR_PTR(-EEXIST);
+ }
+ return res;
+}
+
+static void release_memory_resource(struct resource *res)
+{
+ if (!res)
+ return;
+ release_resource(res);
+ kfree(res);
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+void get_page_bootmem(unsigned long info, struct page *page,
+ unsigned long type)
+{
+ page->freelist = (void *)type;
+ SetPagePrivate(page);
+ set_page_private(page, info);
+ page_ref_inc(page);
+}
+
+void put_page_bootmem(struct page *page)
+{
+ unsigned long type;
+
+ type = (unsigned long) page->freelist;
+ BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
+ type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
+
+ if (page_ref_dec_return(page) == 1) {
+ page->freelist = NULL;
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ INIT_LIST_HEAD(&page->lru);
+ free_reserved_page(page);
+ }
+}
+
+#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
+#ifndef CONFIG_SPARSEMEM_VMEMMAP
+static void register_page_bootmem_info_section(unsigned long start_pfn)
+{
+ unsigned long mapsize, section_nr, i;
+ struct mem_section *ms;
+ struct page *page, *memmap;
+ struct mem_section_usage *usage;
+
+ section_nr = pfn_to_section_nr(start_pfn);
+ ms = __nr_to_section(section_nr);
+
+ /* Get section's memmap address */
+ memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
+
+ /*
+ * Get page for the memmap's phys address
+ * XXX: need more consideration for sparse_vmemmap...
+ */
+ page = virt_to_page(memmap);
+ mapsize = sizeof(struct page) * PAGES_PER_SECTION;
+ mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;
+
+ /* remember memmap's page */
+ for (i = 0; i < mapsize; i++, page++)
+ get_page_bootmem(section_nr, page, SECTION_INFO);
+
+ usage = ms->usage;
+ page = virt_to_page(usage);
+
+ mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;
+
+ for (i = 0; i < mapsize; i++, page++)
+ get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
+
+}
+#else /* CONFIG_SPARSEMEM_VMEMMAP */
+static void register_page_bootmem_info_section(unsigned long start_pfn)
+{
+ unsigned long mapsize, section_nr, i;
+ struct mem_section *ms;
+ struct page *page, *memmap;
+ struct mem_section_usage *usage;
+
+ section_nr = pfn_to_section_nr(start_pfn);
+ ms = __nr_to_section(section_nr);
+
+ memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
+
+ register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
+
+ usage = ms->usage;
+ page = virt_to_page(usage);
+
+ mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;
+
+ for (i = 0; i < mapsize; i++, page++)
+ get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
+}
+#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
+
+void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
+{
+ unsigned long i, pfn, end_pfn, nr_pages;
+ int node = pgdat->node_id;
+ struct page *page;
+
+ nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
+ page = virt_to_page(pgdat);
+
+ for (i = 0; i < nr_pages; i++, page++)
+ get_page_bootmem(node, page, NODE_INFO);
+
+ pfn = pgdat->node_start_pfn;
+ end_pfn = pgdat_end_pfn(pgdat);
+
+ /* register section info */
+ for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+ /*
+ * Some platforms can assign the same pfn to multiple nodes - on
+ * node0 as well as nodeN. To avoid registering a pfn against
+ * multiple nodes we check that this pfn does not already
+ * reside in some other nodes.
+ */
+ if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node))
+ register_page_bootmem_info_section(pfn);
+ }
+}
+#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
+
+static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
+ const char *reason)
+{
+ /*
+ * Disallow all operations smaller than a sub-section and only
+ * allow operations smaller than a section for
+ * SPARSEMEM_VMEMMAP. Note that check_hotplug_memory_range()
+ * enforces a larger memory_block_size_bytes() granularity for
+ * memory that will be marked online, so this check should only
+ * fire for direct arch_{add,remove}_memory() users outside of
+ * add_memory_resource().
+ */
+ unsigned long min_align;
+
+ if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP))
+ min_align = PAGES_PER_SUBSECTION;
+ else
+ min_align = PAGES_PER_SECTION;
+ if (!IS_ALIGNED(pfn, min_align)
+ || !IS_ALIGNED(nr_pages, min_align)) {
+ WARN(1, "Misaligned __%s_pages start: %#lx end: #%lx\n",
+ reason, pfn, pfn + nr_pages - 1);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int check_hotplug_memory_addressable(unsigned long pfn,
+ unsigned long nr_pages)
+{
+ const u64 max_addr = PFN_PHYS(pfn + nr_pages) - 1;
+
+ if (max_addr >> MAX_PHYSMEM_BITS) {
+ const u64 max_allowed = (1ull << (MAX_PHYSMEM_BITS + 1)) - 1;
+ WARN(1,
+ "Hotplugged memory exceeds maximum addressable address, range=%#llx-%#llx, maximum=%#llx\n",
+ (u64)PFN_PHYS(pfn), max_addr, max_allowed);
+ return -E2BIG;
+ }
+
+ return 0;
+}
+
+/*
+ * Reasonably generic function for adding memory. It is
+ * expected that archs that support memory hotplug will
+ * call this function after deciding the zone to which to
+ * add the new pages.
+ */
+int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
+ struct mhp_params *params)
+{
+ const unsigned long end_pfn = pfn + nr_pages;
+ unsigned long cur_nr_pages;
+ int err;
+ struct vmem_altmap *altmap = params->altmap;
+
+ if (WARN_ON_ONCE(!params->pgprot.pgprot))
+ return -EINVAL;
+
+ err = check_hotplug_memory_addressable(pfn, nr_pages);
+ if (err)
+ return err;
+
+ if (altmap) {
+ /*
+ * Validate altmap is within bounds of the total request
+ */
+ if (altmap->base_pfn != pfn
+ || vmem_altmap_offset(altmap) > nr_pages) {
+ pr_warn_once("memory add fail, invalid altmap\n");
+ return -EINVAL;
+ }
+ altmap->alloc = 0;
+ }
+
+ err = check_pfn_span(pfn, nr_pages, "add");
+ if (err)
+ return err;
+
+ for (; pfn < end_pfn; pfn += cur_nr_pages) {
+ /* Select all remaining pages up to the next section boundary */
+ cur_nr_pages = min(end_pfn - pfn,
+ SECTION_ALIGN_UP(pfn + 1) - pfn);
+ err = sparse_add_section(nid, pfn, cur_nr_pages, altmap);
+ if (err)
+ break;
+ cond_resched();
+ }
+ vmemmap_populate_print_last();
+ return err;
+}
+
+/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
+static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
+ unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SUBSECTION) {
+ if (unlikely(!pfn_to_online_page(start_pfn)))
+ continue;
+
+ if (unlikely(pfn_to_nid(start_pfn) != nid))
+ continue;
+
+ if (zone != page_zone(pfn_to_page(start_pfn)))
+ continue;
+
+ return start_pfn;
+ }
+
+ return 0;
+}
+
+/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
+static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
+ unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ unsigned long pfn;
+
+ /* pfn is the end pfn of a memory section. */
+ pfn = end_pfn - 1;
+ for (; pfn >= start_pfn; pfn -= PAGES_PER_SUBSECTION) {
+ if (unlikely(!pfn_to_online_page(pfn)))
+ continue;
+
+ if (unlikely(pfn_to_nid(pfn) != nid))
+ continue;
+
+ if (zone != page_zone(pfn_to_page(pfn)))
+ continue;
+
+ return pfn;
+ }
+
+ return 0;
+}
+
+static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ unsigned long pfn;
+ int nid = zone_to_nid(zone);
+
+ zone_span_writelock(zone);
+ if (zone->zone_start_pfn == start_pfn) {
+ /*
+ * If the section is smallest section in the zone, it need
+ * shrink zone->zone_start_pfn and zone->zone_spanned_pages.
+ * In this case, we find second smallest valid mem_section
+ * for shrinking zone.
+ */
+ pfn = find_smallest_section_pfn(nid, zone, end_pfn,
+ zone_end_pfn(zone));
+ if (pfn) {
+ zone->spanned_pages = zone_end_pfn(zone) - pfn;
+ zone->zone_start_pfn = pfn;
+ } else {
+ zone->zone_start_pfn = 0;
+ zone->spanned_pages = 0;
+ }
+ } else if (zone_end_pfn(zone) == end_pfn) {
+ /*
+ * If the section is biggest section in the zone, it need
+ * shrink zone->spanned_pages.
+ * In this case, we find second biggest valid mem_section for
+ * shrinking zone.
+ */
+ pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn,
+ start_pfn);
+ if (pfn)
+ zone->spanned_pages = pfn - zone->zone_start_pfn + 1;
+ else {
+ zone->zone_start_pfn = 0;
+ zone->spanned_pages = 0;
+ }
+ }
+ zone_span_writeunlock(zone);
+}
+
+static void update_pgdat_span(struct pglist_data *pgdat)
+{
+ unsigned long node_start_pfn = 0, node_end_pfn = 0;
+ struct zone *zone;
+
+ for (zone = pgdat->node_zones;
+ zone < pgdat->node_zones + MAX_NR_ZONES; zone++) {
+ unsigned long zone_end_pfn = zone->zone_start_pfn +
+ zone->spanned_pages;
+
+ /* No need to lock the zones, they can't change. */
+ if (!zone->spanned_pages)
+ continue;
+ if (!node_end_pfn) {
+ node_start_pfn = zone->zone_start_pfn;
+ node_end_pfn = zone_end_pfn;
+ continue;
+ }
+
+ if (zone_end_pfn > node_end_pfn)
+ node_end_pfn = zone_end_pfn;
+ if (zone->zone_start_pfn < node_start_pfn)
+ node_start_pfn = zone->zone_start_pfn;
+ }
+
+ pgdat->node_start_pfn = node_start_pfn;
+ pgdat->node_spanned_pages = node_end_pfn - node_start_pfn;
+}
+
+void __ref remove_pfn_range_from_zone(struct zone *zone,
+ unsigned long start_pfn,
+ unsigned long nr_pages)
+{
+ const unsigned long end_pfn = start_pfn + nr_pages;
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ unsigned long pfn, cur_nr_pages, flags;
+
+ /* Poison struct pages because they are now uninitialized again. */
+ for (pfn = start_pfn; pfn < end_pfn; pfn += cur_nr_pages) {
+ cond_resched();
+
+ /* Select all remaining pages up to the next section boundary */
+ cur_nr_pages =
+ min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn);
+ page_init_poison(pfn_to_page(pfn),
+ sizeof(struct page) * cur_nr_pages);
+ }
+
+#ifdef CONFIG_ZONE_DEVICE
+ /*
+ * Zone shrinking code cannot properly deal with ZONE_DEVICE. So
+ * we will not try to shrink the zones - which is okay as
+ * set_zone_contiguous() cannot deal with ZONE_DEVICE either way.
+ */
+ if (zone_idx(zone) == ZONE_DEVICE)
+ return;
+#endif
+
+ clear_zone_contiguous(zone);
+
+ pgdat_resize_lock(zone->zone_pgdat, &flags);
+ shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
+ update_pgdat_span(pgdat);
+ pgdat_resize_unlock(zone->zone_pgdat, &flags);
+
+ set_zone_contiguous(zone);
+}
+
+static void __remove_section(unsigned long pfn, unsigned long nr_pages,
+ unsigned long map_offset,
+ struct vmem_altmap *altmap)
+{
+ struct mem_section *ms = __pfn_to_section(pfn);
+
+ if (WARN_ON_ONCE(!valid_section(ms)))
+ return;
+
+ sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap);
+}
+
+/**
+ * __remove_pages() - remove sections of pages
+ * @pfn: starting pageframe (must be aligned to start of a section)
+ * @nr_pages: number of pages to remove (must be multiple of section size)
+ * @altmap: alternative device page map or %NULL if default memmap is used
+ *
+ * Generic helper function to remove section mappings and sysfs entries
+ * for the section of the memory we are removing. Caller needs to make
+ * sure that pages are marked reserved and zones are adjust properly by
+ * calling offline_pages().
+ */
+void __remove_pages(unsigned long pfn, unsigned long nr_pages,
+ struct vmem_altmap *altmap)
+{
+ const unsigned long end_pfn = pfn + nr_pages;
+ unsigned long cur_nr_pages;
+ unsigned long map_offset = 0;
+
+ map_offset = vmem_altmap_offset(altmap);
+
+ if (check_pfn_span(pfn, nr_pages, "remove"))
+ return;
+
+ for (; pfn < end_pfn; pfn += cur_nr_pages) {
+ cond_resched();
+ /* Select all remaining pages up to the next section boundary */
+ cur_nr_pages = min(end_pfn - pfn,
+ SECTION_ALIGN_UP(pfn + 1) - pfn);
+ __remove_section(pfn, cur_nr_pages, map_offset, altmap);
+ map_offset = 0;
+ }
+}
+
+int set_online_page_callback(online_page_callback_t callback)
+{
+ int rc = -EINVAL;
+
+ get_online_mems();
+ mutex_lock(&online_page_callback_lock);
+
+ if (online_page_callback == generic_online_page) {
+ online_page_callback = callback;
+ rc = 0;
+ }
+
+ mutex_unlock(&online_page_callback_lock);
+ put_online_mems();
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(set_online_page_callback);
+
+int restore_online_page_callback(online_page_callback_t callback)
+{
+ int rc = -EINVAL;
+
+ get_online_mems();
+ mutex_lock(&online_page_callback_lock);
+
+ if (online_page_callback == callback) {
+ online_page_callback = generic_online_page;
+ rc = 0;
+ }
+
+ mutex_unlock(&online_page_callback_lock);
+ put_online_mems();
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(restore_online_page_callback);
+
+void generic_online_page(struct page *page, unsigned int order)
+{
+ /*
+ * Freeing the page with debug_pagealloc enabled will try to unmap it,
+ * so we should map it first. This is better than introducing a special
+ * case in page freeing fast path.
+ */
+ if (debug_pagealloc_enabled_static())
+ kernel_map_pages(page, 1 << order, 1);
+ __free_pages_core(page, order);
+ totalram_pages_add(1UL << order);
+#ifdef CONFIG_HIGHMEM
+ if (PageHighMem(page))
+ totalhigh_pages_add(1UL << order);
+#endif
+}
+EXPORT_SYMBOL_GPL(generic_online_page);
+
+static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
+{
+ const unsigned long end_pfn = start_pfn + nr_pages;
+ unsigned long pfn;
+
+ /*
+ * Online the pages in MAX_ORDER - 1 aligned chunks. The callback might
+ * decide to not expose all pages to the buddy (e.g., expose them
+ * later). We account all pages as being online and belonging to this
+ * zone ("present").
+ */
+ for (pfn = start_pfn; pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES)
+ (*online_page_callback)(pfn_to_page(pfn), MAX_ORDER - 1);
+
+ /* mark all involved sections as online */
+ online_mem_sections(start_pfn, end_pfn);
+}
+
+/* check which state of node_states will be changed when online memory */
+static void node_states_check_changes_online(unsigned long nr_pages,
+ struct zone *zone, struct memory_notify *arg)
+{
+ int nid = zone_to_nid(zone);
+
+ arg->status_change_nid = NUMA_NO_NODE;
+ arg->status_change_nid_normal = NUMA_NO_NODE;
+ arg->status_change_nid_high = NUMA_NO_NODE;
+
+ if (!node_state(nid, N_MEMORY))
+ arg->status_change_nid = nid;
+ if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
+ arg->status_change_nid_normal = nid;
+#ifdef CONFIG_HIGHMEM
+ if (zone_idx(zone) <= ZONE_HIGHMEM && !node_state(nid, N_HIGH_MEMORY))
+ arg->status_change_nid_high = nid;
+#endif
+}
+
+static void node_states_set_node(int node, struct memory_notify *arg)
+{
+ if (arg->status_change_nid_normal >= 0)
+ node_set_state(node, N_NORMAL_MEMORY);
+
+ if (arg->status_change_nid_high >= 0)
+ node_set_state(node, N_HIGH_MEMORY);
+
+ if (arg->status_change_nid >= 0)
+ node_set_state(node, N_MEMORY);
+}
+
+static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
+ unsigned long nr_pages)
+{
+ unsigned long old_end_pfn = zone_end_pfn(zone);
+
+ if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
+ zone->zone_start_pfn = start_pfn;
+
+ zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn;
+}
+
+static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn,
+ unsigned long nr_pages)
+{
+ unsigned long old_end_pfn = pgdat_end_pfn(pgdat);
+
+ if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
+ pgdat->node_start_pfn = start_pfn;
+
+ pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
+
+}
+/*
+ * Associate the pfn range with the given zone, initializing the memmaps
+ * and resizing the pgdat/zone data to span the added pages. After this
+ * call, all affected pages are PG_reserved.
+ *
+ * All aligned pageblocks are initialized to the specified migratetype
+ * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
+ * zone stats (e.g., nr_isolate_pageblock) are touched.
+ */
+void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
+ unsigned long nr_pages,
+ struct vmem_altmap *altmap, int migratetype)
+{
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ int nid = pgdat->node_id;
+ unsigned long flags;
+
+ clear_zone_contiguous(zone);
+
+ /* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */
+ pgdat_resize_lock(pgdat, &flags);
+ zone_span_writelock(zone);
+ if (zone_is_empty(zone))
+ init_currently_empty_zone(zone, start_pfn, nr_pages);
+ resize_zone_range(zone, start_pfn, nr_pages);
+ zone_span_writeunlock(zone);
+ resize_pgdat_range(pgdat, start_pfn, nr_pages);
+ pgdat_resize_unlock(pgdat, &flags);
+
+ /*
+ * TODO now we have a visible range of pages which are not associated
+ * with their zone properly. Not nice but set_pfnblock_flags_mask
+ * expects the zone spans the pfn range. All the pages in the range
+ * are reserved so nobody should be touching them so we should be safe
+ */
+ memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, 0,
+ MEMINIT_HOTPLUG, altmap, migratetype);
+
+ set_zone_contiguous(zone);
+}
+
+/*
+ * Returns a default kernel memory zone for the given pfn range.
+ * If no kernel zone covers this pfn range it will automatically go
+ * to the ZONE_NORMAL.
+ */
+static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn,
+ unsigned long nr_pages)
+{
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ int zid;
+
+ for (zid = 0; zid <= ZONE_NORMAL; zid++) {
+ struct zone *zone = &pgdat->node_zones[zid];
+
+ if (zone_intersects(zone, start_pfn, nr_pages))
+ return zone;
+ }
+
+ return &pgdat->node_zones[ZONE_NORMAL];
+}
+
+static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
+ unsigned long nr_pages)
+{
+ struct zone *kernel_zone = default_kernel_zone_for_pfn(nid, start_pfn,
+ nr_pages);
+ struct zone *movable_zone = &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
+ bool in_kernel = zone_intersects(kernel_zone, start_pfn, nr_pages);
+ bool in_movable = zone_intersects(movable_zone, start_pfn, nr_pages);
+
+ /*
+ * We inherit the existing zone in a simple case where zones do not
+ * overlap in the given range
+ */
+ if (in_kernel ^ in_movable)
+ return (in_kernel) ? kernel_zone : movable_zone;
+
+ /*
+ * If the range doesn't belong to any zone or two zones overlap in the
+ * given range then we use movable zone only if movable_node is
+ * enabled because we always online to a kernel zone by default.
+ */
+ return movable_node_enabled ? movable_zone : kernel_zone;
+}
+
+struct zone *zone_for_pfn_range(int online_type, int nid,
+ unsigned long start_pfn, unsigned long nr_pages)
+{
+ if (online_type == MMOP_ONLINE_KERNEL)
+ return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages);
+
+ if (online_type == MMOP_ONLINE_MOVABLE)
+ return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
+
+ return default_zone_for_pfn(nid, start_pfn, nr_pages);
+}
+
+int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
+ int online_type, int nid)
+{
+ unsigned long flags;
+ struct zone *zone;
+ int need_zonelists_rebuild = 0;
+ int ret;
+ struct memory_notify arg;
+
+ /* We can only online full sections (e.g., SECTION_IS_ONLINE) */
+ if (WARN_ON_ONCE(!nr_pages ||
+ !IS_ALIGNED(pfn | nr_pages, PAGES_PER_SECTION)))
+ return -EINVAL;
+
+ mem_hotplug_begin();
+
+ /* associate pfn range with the zone */
+ zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages);
+ move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);
+
+ arg.start_pfn = pfn;
+ arg.nr_pages = nr_pages;
+ node_states_check_changes_online(nr_pages, zone, &arg);
+
+ ret = memory_notify(MEM_GOING_ONLINE, &arg);
+ ret = notifier_to_errno(ret);
+ if (ret)
+ goto failed_addition;
+
+ /*
+ * Fixup the number of isolated pageblocks before marking the sections
+ * onlining, such that undo_isolate_page_range() works correctly.
+ */
+ spin_lock_irqsave(&zone->lock, flags);
+ zone->nr_isolate_pageblock += nr_pages / pageblock_nr_pages;
+ spin_unlock_irqrestore(&zone->lock, flags);
+
+ /*
+ * If this zone is not populated, then it is not in zonelist.
+ * This means the page allocator ignores this zone.
+ * So, zonelist must be updated after online.
+ */
+ if (!populated_zone(zone)) {
+ need_zonelists_rebuild = 1;
+ setup_zone_pageset(zone);
+ }
+
+ online_pages_range(pfn, nr_pages);
+ zone->present_pages += nr_pages;
+
+ pgdat_resize_lock(zone->zone_pgdat, &flags);
+ zone->zone_pgdat->node_present_pages += nr_pages;
+ pgdat_resize_unlock(zone->zone_pgdat, &flags);
+
+ node_states_set_node(nid, &arg);
+ if (need_zonelists_rebuild)
+ build_all_zonelists(NULL);
+ zone_pcp_update(zone);
+
+ /* Basic onlining is complete, allow allocation of onlined pages. */
+ undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE);
+
+ /*
+ * Freshly onlined pages aren't shuffled (e.g., all pages are placed to
+ * the tail of the freelist when undoing isolation). Shuffle the whole
+ * zone to make sure the just onlined pages are properly distributed
+ * across the whole freelist - to create an initial shuffle.
+ */
+ shuffle_zone(zone);
+
+ init_per_zone_wmark_min();
+
+ kswapd_run(nid);
+ kcompactd_run(nid);
+
+ writeback_set_ratelimit();
+
+ memory_notify(MEM_ONLINE, &arg);
+ mem_hotplug_done();
+ return 0;
+
+failed_addition:
+ pr_debug("online_pages [mem %#010llx-%#010llx] failed\n",
+ (unsigned long long) pfn << PAGE_SHIFT,
+ (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
+ memory_notify(MEM_CANCEL_ONLINE, &arg);
+ remove_pfn_range_from_zone(zone, pfn, nr_pages);
+ mem_hotplug_done();
+ return ret;
+}
+#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
+
+static void reset_node_present_pages(pg_data_t *pgdat)
+{
+ struct zone *z;
+
+ for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
+ z->present_pages = 0;
+
+ pgdat->node_present_pages = 0;
+}
+
+/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
+static pg_data_t __ref *hotadd_new_pgdat(int nid)
+{
+ struct pglist_data *pgdat;
+
+ pgdat = NODE_DATA(nid);
+ if (!pgdat) {
+ pgdat = arch_alloc_nodedata(nid);
+ if (!pgdat)
+ return NULL;
+
+ pgdat->per_cpu_nodestats =
+ alloc_percpu(struct per_cpu_nodestat);
+ arch_refresh_nodedata(nid, pgdat);
+ } else {
+ int cpu;
+ /*
+ * Reset the nr_zones, order and highest_zoneidx before reuse.
+ * Note that kswapd will init kswapd_highest_zoneidx properly
+ * when it starts in the near future.
+ */
+ pgdat->nr_zones = 0;
+ pgdat->kswapd_order = 0;
+ pgdat->kswapd_highest_zoneidx = 0;
+ for_each_online_cpu(cpu) {
+ struct per_cpu_nodestat *p;
+
+ p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
+ memset(p, 0, sizeof(*p));
+ }
+ }
+
+ /* we can use NODE_DATA(nid) from here */
+ pgdat->node_id = nid;
+ pgdat->node_start_pfn = 0;
+
+ /* init node's zones as empty zones, we don't have any present pages.*/
+ free_area_init_core_hotplug(nid);
+
+ /*
+ * The node we allocated has no zone fallback lists. For avoiding
+ * to access not-initialized zonelist, build here.
+ */
+ build_all_zonelists(pgdat);
+
+ /*
+ * When memory is hot-added, all the memory is in offline state. So
+ * clear all zones' present_pages because they will be updated in
+ * online_pages() and offline_pages().
+ */
+ reset_node_managed_pages(pgdat);
+ reset_node_present_pages(pgdat);
+
+ return pgdat;
+}
+
+static void rollback_node_hotadd(int nid)
+{
+ pg_data_t *pgdat = NODE_DATA(nid);
+
+ arch_refresh_nodedata(nid, NULL);
+ free_percpu(pgdat->per_cpu_nodestats);
+ arch_free_nodedata(pgdat);
+}
+
+
+/**
+ * try_online_node - online a node if offlined
+ * @nid: the node ID
+ * @set_node_online: Whether we want to online the node
+ * called by cpu_up() to online a node without onlined memory.
+ *
+ * Returns:
+ * 1 -> a new node has been allocated
+ * 0 -> the node is already online
+ * -ENOMEM -> the node could not be allocated
+ */
+static int __try_online_node(int nid, bool set_node_online)
+{
+ pg_data_t *pgdat;
+ int ret = 1;
+
+ if (node_online(nid))
+ return 0;
+
+ pgdat = hotadd_new_pgdat(nid);
+ if (!pgdat) {
+ pr_err("Cannot online node %d due to NULL pgdat\n", nid);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (set_node_online) {
+ node_set_online(nid);
+ ret = register_one_node(nid);
+ BUG_ON(ret);
+ }
+out:
+ return ret;
+}
+
+/*
+ * Users of this function always want to online/register the node
+ */
+int try_online_node(int nid)
+{
+ int ret;
+
+ mem_hotplug_begin();
+ ret = __try_online_node(nid, true);
+ mem_hotplug_done();
+ return ret;
+}
+
+static int check_hotplug_memory_range(u64 start, u64 size)
+{
+ /* memory range must be block size aligned */
+ if (!size || !IS_ALIGNED(start, memory_block_size_bytes()) ||
+ !IS_ALIGNED(size, memory_block_size_bytes())) {
+ pr_err("Block size [%#lx] unaligned hotplug range: start %#llx, size %#llx",
+ memory_block_size_bytes(), start, size);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int online_memory_block(struct memory_block *mem, void *arg)
+{
+ mem->online_type = memhp_default_online_type;
+ return device_online(&mem->dev);
+}
+
+/*
+ * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
+ * and online/offline operations (triggered e.g. by sysfs).
+ *
+ * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
+ */
+int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
+{
+ struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
+ u64 start, size;
+ bool new_node = false;
+ int ret;
+
+ start = res->start;
+ size = resource_size(res);
+
+ ret = check_hotplug_memory_range(start, size);
+ if (ret)
+ return ret;
+
+ if (!node_possible(nid)) {
+ WARN(1, "node %d was absent from the node_possible_map\n", nid);
+ return -EINVAL;
+ }
+
+ mem_hotplug_begin();
+
+ if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
+ memblock_add_node(start, size, nid);
+
+ ret = __try_online_node(nid, false);
+ if (ret < 0)
+ goto error;
+ new_node = ret;
+
+ /* call arch's memory hotadd */
+ ret = arch_add_memory(nid, start, size, &params);
+ if (ret < 0)
+ goto error;
+
+ /* create memory block devices after memory was added */
+ ret = create_memory_block_devices(start, size);
+ if (ret) {
+ arch_remove_memory(nid, start, size, NULL);
+ goto error;
+ }
+
+ if (new_node) {
+ /* If sysfs file of new node can't be created, cpu on the node
+ * can't be hot-added. There is no rollback way now.
+ * So, check by BUG_ON() to catch it reluctantly..
+ * We online node here. We can't roll back from here.
+ */
+ node_set_online(nid);
+ ret = __register_one_node(nid);
+ BUG_ON(ret);
+ }
+
+ /* link memory sections under this node.*/
+ link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1),
+ MEMINIT_HOTPLUG);
+
+ /* create new memmap entry */
+ if (!strcmp(res->name, "System RAM"))
+ firmware_map_add_hotplug(start, start + size, "System RAM");
+
+ /* device_online() will take the lock when calling online_pages() */
+ mem_hotplug_done();
+
+ /*
+ * In case we're allowed to merge the resource, flag it and trigger
+ * merging now that adding succeeded.
+ */
+ if (mhp_flags & MEMHP_MERGE_RESOURCE)
+ merge_system_ram_resource(res);
+
+ /* online pages if requested */
+ if (memhp_default_online_type != MMOP_OFFLINE)
+ walk_memory_blocks(start, size, NULL, online_memory_block);
+
+ return ret;
+error:
+ /* rollback pgdat allocation and others */
+ if (new_node)
+ rollback_node_hotadd(nid);
+ if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
+ memblock_remove(start, size);
+ mem_hotplug_done();
+ return ret;
+}
+
+/* requires device_hotplug_lock, see add_memory_resource() */
+int __ref __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
+{
+ struct resource *res;
+ int ret;
+
+ res = register_memory_resource(start, size, "System RAM");
+ if (IS_ERR(res))
+ return PTR_ERR(res);
+
+ ret = add_memory_resource(nid, res, mhp_flags);
+ if (ret < 0)
+ release_memory_resource(res);
+ return ret;
+}
+
+int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
+{
+ int rc;
+
+ lock_device_hotplug();
+ rc = __add_memory(nid, start, size, mhp_flags);
+ unlock_device_hotplug();
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(add_memory);
+
+/*
+ * Add special, driver-managed memory to the system as system RAM. Such
+ * memory is not exposed via the raw firmware-provided memmap as system
+ * RAM, instead, it is detected and added by a driver - during cold boot,
+ * after a reboot, and after kexec.
+ *
+ * Reasons why this memory should not be used for the initial memmap of a
+ * kexec kernel or for placing kexec images:
+ * - The booting kernel is in charge of determining how this memory will be
+ * used (e.g., use persistent memory as system RAM)
+ * - Coordination with a hypervisor is required before this memory
+ * can be used (e.g., inaccessible parts).
+ *
+ * For this memory, no entries in /sys/firmware/memmap ("raw firmware-provided
+ * memory map") are created. Also, the created memory resource is flagged
+ * with IORESOURCE_SYSRAM_DRIVER_MANAGED, so in-kernel users can special-case
+ * this memory as well (esp., not place kexec images onto it).
+ *
+ * The resource_name (visible via /proc/iomem) has to have the format
+ * "System RAM ($DRIVER)".
+ */
+int add_memory_driver_managed(int nid, u64 start, u64 size,
+ const char *resource_name, mhp_t mhp_flags)
+{
+ struct resource *res;
+ int rc;
+
+ if (!resource_name ||
+ strstr(resource_name, "System RAM (") != resource_name ||
+ resource_name[strlen(resource_name) - 1] != ')')
+ return -EINVAL;
+
+ lock_device_hotplug();
+
+ res = register_memory_resource(start, size, resource_name);
+ if (IS_ERR(res)) {
+ rc = PTR_ERR(res);
+ goto out_unlock;
+ }
+
+ rc = add_memory_resource(nid, res, mhp_flags);
+ if (rc < 0)
+ release_memory_resource(res);
+
+out_unlock:
+ unlock_device_hotplug();
+ return rc;
+}
+EXPORT_SYMBOL_GPL(add_memory_driver_managed);
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+/*
+ * Confirm all pages in a range [start, end) belong to the same zone (skipping
+ * memory holes). When true, return the zone.
+ */
+struct zone *test_pages_in_a_zone(unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ unsigned long pfn, sec_end_pfn;
+ struct zone *zone = NULL;
+ struct page *page;
+ int i;
+ for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1);
+ pfn < end_pfn;
+ pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) {
+ /* Make sure the memory section is present first */
+ if (!present_section_nr(pfn_to_section_nr(pfn)))
+ continue;
+ for (; pfn < sec_end_pfn && pfn < end_pfn;
+ pfn += MAX_ORDER_NR_PAGES) {
+ i = 0;
+ /* This is just a CONFIG_HOLES_IN_ZONE check.*/
+ while ((i < MAX_ORDER_NR_PAGES) &&
+ !pfn_valid_within(pfn + i))
+ i++;
+ if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn)
+ continue;
+ /* Check if we got outside of the zone */
+ if (zone && !zone_spans_pfn(zone, pfn + i))
+ return NULL;
+ page = pfn_to_page(pfn + i);
+ if (zone && page_zone(page) != zone)
+ return NULL;
+ zone = page_zone(page);
+ }
+ }
+
+ return zone;
+}
+
+/*
+ * Scan pfn range [start,end) to find movable/migratable pages (LRU pages,
+ * non-lru movable pages and hugepages). Will skip over most unmovable
+ * pages (esp., pages that can be skipped when offlining), but bail out on
+ * definitely unmovable pages.
+ *
+ * Returns:
+ * 0 in case a movable page is found and movable_pfn was updated.
+ * -ENOENT in case no movable page was found.
+ * -EBUSY in case a definitely unmovable page was found.
+ */
+static int scan_movable_pages(unsigned long start, unsigned long end,
+ unsigned long *movable_pfn)
+{
+ unsigned long pfn;
+
+ for (pfn = start; pfn < end; pfn++) {
+ struct page *page, *head;
+ unsigned long skip;
+
+ if (!pfn_valid(pfn))
+ continue;
+ page = pfn_to_page(pfn);
+ if (PageLRU(page))
+ goto found;
+ if (__PageMovable(page))
+ goto found;
+
+ /*
+ * PageOffline() pages that are not marked __PageMovable() and
+ * have a reference count > 0 (after MEM_GOING_OFFLINE) are
+ * definitely unmovable. If their reference count would be 0,
+ * they could at least be skipped when offlining memory.
+ */
+ if (PageOffline(page) && page_count(page))
+ return -EBUSY;
+
+ if (!PageHuge(page))
+ continue;
+ head = compound_head(page);
+ if (page_huge_active(head))
+ goto found;
+ skip = compound_nr(head) - (pfn - page_to_pfn(head));
+ pfn += skip - 1;
+ }
+ return -ENOENT;
+found:
+ *movable_pfn = pfn;
+ return 0;
+}
+
+static int
+do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned long pfn;
+ struct page *page, *head;
+ int ret = 0;
+ LIST_HEAD(source);
+ static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ if (!pfn_valid(pfn))
+ continue;
+ page = pfn_to_page(pfn);
+ head = compound_head(page);
+
+ if (PageHuge(page)) {
+ pfn = page_to_pfn(head) + compound_nr(head) - 1;
+ isolate_hugetlb(head, &source);
+ continue;
+ } else if (PageTransHuge(page))
+ pfn = page_to_pfn(head) + thp_nr_pages(page) - 1;
+
+ /*
+ * HWPoison pages have elevated reference counts so the migration would
+ * fail on them. It also doesn't make any sense to migrate them in the
+ * first place. Still try to unmap such a page in case it is still mapped
+ * (e.g. current hwpoison implementation doesn't unmap KSM pages but keep
+ * the unmap as the catch all safety net).
+ */
+ if (PageHWPoison(page)) {
+ if (WARN_ON(PageLRU(page)))
+ isolate_lru_page(page);
+ if (page_mapped(page))
+ try_to_unmap(page, TTU_IGNORE_MLOCK);
+ continue;
+ }
+
+ if (!get_page_unless_zero(page))
+ continue;
+ /*
+ * We can skip free pages. And we can deal with pages on
+ * LRU and non-lru movable pages.
+ */
+ if (PageLRU(page))
+ ret = isolate_lru_page(page);
+ else
+ ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
+ if (!ret) { /* Success */
+ list_add_tail(&page->lru, &source);
+ if (!__PageMovable(page))
+ inc_node_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_lru(page));
+
+ } else {
+ if (__ratelimit(&migrate_rs)) {
+ pr_warn("failed to isolate pfn %lx\n", pfn);
+ dump_page(page, "isolation failed");
+ }
+ }
+ put_page(page);
+ }
+ if (!list_empty(&source)) {
+ nodemask_t nmask = node_states[N_MEMORY];
+ struct migration_target_control mtc = {
+ .nmask = &nmask,
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ };
+
+ /*
+ * We have checked that migration range is on a single zone so
+ * we can use the nid of the first page to all the others.
+ */
+ mtc.nid = page_to_nid(list_first_entry(&source, struct page, lru));
+
+ /*
+ * try to allocate from a different node but reuse this node
+ * if there are no other online nodes to be used (e.g. we are
+ * offlining a part of the only existing node)
+ */
+ node_clear(mtc.nid, nmask);
+ if (nodes_empty(nmask))
+ node_set(mtc.nid, nmask);
+ ret = migrate_pages(&source, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
+ if (ret) {
+ list_for_each_entry(page, &source, lru) {
+ if (__ratelimit(&migrate_rs)) {
+ pr_warn("migrating pfn %lx failed ret:%d\n",
+ page_to_pfn(page), ret);
+ dump_page(page, "migration failure");
+ }
+ }
+ putback_movable_pages(&source);
+ }
+ }
+
+ return ret;
+}
+
+static int __init cmdline_parse_movable_node(char *p)
+{
+ movable_node_enabled = true;
+ return 0;
+}
+early_param("movable_node", cmdline_parse_movable_node);
+
+/* check which state of node_states will be changed when offline memory */
+static void node_states_check_changes_offline(unsigned long nr_pages,
+ struct zone *zone, struct memory_notify *arg)
+{
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ unsigned long present_pages = 0;
+ enum zone_type zt;
+
+ arg->status_change_nid = NUMA_NO_NODE;
+ arg->status_change_nid_normal = NUMA_NO_NODE;
+ arg->status_change_nid_high = NUMA_NO_NODE;
+
+ /*
+ * Check whether node_states[N_NORMAL_MEMORY] will be changed.
+ * If the memory to be offline is within the range
+ * [0..ZONE_NORMAL], and it is the last present memory there,
+ * the zones in that range will become empty after the offlining,
+ * thus we can determine that we need to clear the node from
+ * node_states[N_NORMAL_MEMORY].
+ */
+ for (zt = 0; zt <= ZONE_NORMAL; zt++)
+ present_pages += pgdat->node_zones[zt].present_pages;
+ if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages)
+ arg->status_change_nid_normal = zone_to_nid(zone);
+
+#ifdef CONFIG_HIGHMEM
+ /*
+ * node_states[N_HIGH_MEMORY] contains nodes which
+ * have normal memory or high memory.
+ * Here we add the present_pages belonging to ZONE_HIGHMEM.
+ * If the zone is within the range of [0..ZONE_HIGHMEM), and
+ * we determine that the zones in that range become empty,
+ * we need to clear the node for N_HIGH_MEMORY.
+ */
+ present_pages += pgdat->node_zones[ZONE_HIGHMEM].present_pages;
+ if (zone_idx(zone) <= ZONE_HIGHMEM && nr_pages >= present_pages)
+ arg->status_change_nid_high = zone_to_nid(zone);
+#endif
+
+ /*
+ * We have accounted the pages from [0..ZONE_NORMAL), and
+ * in case of CONFIG_HIGHMEM the pages from ZONE_HIGHMEM
+ * as well.
+ * Here we count the possible pages from ZONE_MOVABLE.
+ * If after having accounted all the pages, we see that the nr_pages
+ * to be offlined is over or equal to the accounted pages,
+ * we know that the node will become empty, and so, we can clear
+ * it for N_MEMORY as well.
+ */
+ present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages;
+
+ if (nr_pages >= present_pages)
+ arg->status_change_nid = zone_to_nid(zone);
+}
+
+static void node_states_clear_node(int node, struct memory_notify *arg)
+{
+ if (arg->status_change_nid_normal >= 0)
+ node_clear_state(node, N_NORMAL_MEMORY);
+
+ if (arg->status_change_nid_high >= 0)
+ node_clear_state(node, N_HIGH_MEMORY);
+
+ if (arg->status_change_nid >= 0)
+ node_clear_state(node, N_MEMORY);
+}
+
+static int count_system_ram_pages_cb(unsigned long start_pfn,
+ unsigned long nr_pages, void *data)
+{
+ unsigned long *nr_system_ram_pages = data;
+
+ *nr_system_ram_pages += nr_pages;
+ return 0;
+}
+
+int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
+{
+ const unsigned long end_pfn = start_pfn + nr_pages;
+ unsigned long pfn, system_ram_pages = 0;
+ unsigned long flags;
+ struct zone *zone;
+ struct memory_notify arg;
+ int ret, node;
+ char *reason;
+
+ /* We can only offline full sections (e.g., SECTION_IS_ONLINE) */
+ if (WARN_ON_ONCE(!nr_pages ||
+ !IS_ALIGNED(start_pfn | nr_pages, PAGES_PER_SECTION)))
+ return -EINVAL;
+
+ mem_hotplug_begin();
+
+ /*
+ * Don't allow to offline memory blocks that contain holes.
+ * Consequently, memory blocks with holes can never get onlined
+ * via the hotplug path - online_pages() - as hotplugged memory has
+ * no holes. This way, we e.g., don't have to worry about marking
+ * memory holes PG_reserved, don't need pfn_valid() checks, and can
+ * avoid using walk_system_ram_range() later.
+ */
+ walk_system_ram_range(start_pfn, nr_pages, &system_ram_pages,
+ count_system_ram_pages_cb);
+ if (system_ram_pages != nr_pages) {
+ ret = -EINVAL;
+ reason = "memory holes";
+ goto failed_removal;
+ }
+
+ /* This makes hotplug much easier...and readable.
+ we assume this for now. .*/
+ zone = test_pages_in_a_zone(start_pfn, end_pfn);
+ if (!zone) {
+ ret = -EINVAL;
+ reason = "multizone range";
+ goto failed_removal;
+ }
+ node = zone_to_nid(zone);
+
+ /* set above range as isolated */
+ ret = start_isolate_page_range(start_pfn, end_pfn,
+ MIGRATE_MOVABLE,
+ MEMORY_OFFLINE | REPORT_FAILURE);
+ if (ret) {
+ reason = "failure to isolate range";
+ goto failed_removal;
+ }
+
+ arg.start_pfn = start_pfn;
+ arg.nr_pages = nr_pages;
+ node_states_check_changes_offline(nr_pages, zone, &arg);
+
+ ret = memory_notify(MEM_GOING_OFFLINE, &arg);
+ ret = notifier_to_errno(ret);
+ if (ret) {
+ reason = "notifier failure";
+ goto failed_removal_isolated;
+ }
+
+ do {
+ pfn = start_pfn;
+ do {
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ reason = "signal backoff";
+ goto failed_removal_isolated;
+ }
+
+ cond_resched();
+ lru_add_drain_all();
+
+ ret = scan_movable_pages(pfn, end_pfn, &pfn);
+ if (!ret) {
+ /*
+ * TODO: fatal migration failures should bail
+ * out
+ */
+ do_migrate_range(pfn, end_pfn);
+ }
+ } while (!ret);
+
+ if (ret != -ENOENT) {
+ reason = "unmovable page";
+ goto failed_removal_isolated;
+ }
+
+ /*
+ * Dissolve free hugepages in the memory block before doing
+ * offlining actually in order to make hugetlbfs's object
+ * counting consistent.
+ */
+ ret = dissolve_free_huge_pages(start_pfn, end_pfn);
+ if (ret) {
+ reason = "failure to dissolve huge pages";
+ goto failed_removal_isolated;
+ }
+
+ /*
+ * per-cpu pages are drained in start_isolate_page_range, but if
+ * there are still pages that are not free, make sure that we
+ * drain again, because when we isolated range we might
+ * have raced with another thread that was adding pages to pcp
+ * list.
+ *
+ * Forward progress should be still guaranteed because
+ * pages on the pcp list can only belong to MOVABLE_ZONE
+ * because has_unmovable_pages explicitly checks for
+ * PageBuddy on freed pages on other zones.
+ */
+ ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE);
+ if (ret)
+ drain_all_pages(zone);
+ } while (ret);
+
+ /* Mark all sections offline and remove free pages from the buddy. */
+ __offline_isolated_pages(start_pfn, end_pfn);
+ pr_info("Offlined Pages %ld\n", nr_pages);
+
+ /*
+ * The memory sections are marked offline, and the pageblock flags
+ * effectively stale; nobody should be touching them. Fixup the number
+ * of isolated pageblocks, memory onlining will properly revert this.
+ */
+ spin_lock_irqsave(&zone->lock, flags);
+ zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages;
+ spin_unlock_irqrestore(&zone->lock, flags);
+
+ /* removal success */
+ adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
+ zone->present_pages -= nr_pages;
+
+ pgdat_resize_lock(zone->zone_pgdat, &flags);
+ zone->zone_pgdat->node_present_pages -= nr_pages;
+ pgdat_resize_unlock(zone->zone_pgdat, &flags);
+
+ init_per_zone_wmark_min();
+
+ if (!populated_zone(zone)) {
+ zone_pcp_reset(zone);
+ build_all_zonelists(NULL);
+ } else
+ zone_pcp_update(zone);
+
+ node_states_clear_node(node, &arg);
+ if (arg.status_change_nid >= 0) {
+ kswapd_stop(node);
+ kcompactd_stop(node);
+ }
+
+ writeback_set_ratelimit();
+
+ memory_notify(MEM_OFFLINE, &arg);
+ remove_pfn_range_from_zone(zone, start_pfn, nr_pages);
+ mem_hotplug_done();
+ return 0;
+
+failed_removal_isolated:
+ undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
+ memory_notify(MEM_CANCEL_OFFLINE, &arg);
+failed_removal:
+ pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n",
+ (unsigned long long) start_pfn << PAGE_SHIFT,
+ ((unsigned long long) end_pfn << PAGE_SHIFT) - 1,
+ reason);
+ /* pushback to free area */
+ mem_hotplug_done();
+ return ret;
+}
+
+static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
+{
+ int ret = !is_memblock_offlined(mem);
+
+ if (unlikely(ret)) {
+ phys_addr_t beginpa, endpa;
+
+ beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
+ endpa = beginpa + memory_block_size_bytes() - 1;
+ pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
+ &beginpa, &endpa);
+
+ return -EBUSY;
+ }
+ return 0;
+}
+
+static int check_cpu_on_node(pg_data_t *pgdat)
+{
+ int cpu;
+
+ for_each_present_cpu(cpu) {
+ if (cpu_to_node(cpu) == pgdat->node_id)
+ /*
+ * the cpu on this node isn't removed, and we can't
+ * offline this node.
+ */
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+static int check_no_memblock_for_node_cb(struct memory_block *mem, void *arg)
+{
+ int nid = *(int *)arg;
+
+ /*
+ * If a memory block belongs to multiple nodes, the stored nid is not
+ * reliable. However, such blocks are always online (e.g., cannot get
+ * offlined) and, therefore, are still spanned by the node.
+ */
+ return mem->nid == nid ? -EEXIST : 0;
+}
+
+/**
+ * try_offline_node
+ * @nid: the node ID
+ *
+ * Offline a node if all memory sections and cpus of the node are removed.
+ *
+ * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
+ * and online/offline operations before this call.
+ */
+void try_offline_node(int nid)
+{
+ pg_data_t *pgdat = NODE_DATA(nid);
+ int rc;
+
+ /*
+ * If the node still spans pages (especially ZONE_DEVICE), don't
+ * offline it. A node spans memory after move_pfn_range_to_zone(),
+ * e.g., after the memory block was onlined.
+ */
+ if (pgdat->node_spanned_pages)
+ return;
+
+ /*
+ * Especially offline memory blocks might not be spanned by the
+ * node. They will get spanned by the node once they get onlined.
+ * However, they link to the node in sysfs and can get onlined later.
+ */
+ rc = for_each_memory_block(&nid, check_no_memblock_for_node_cb);
+ if (rc)
+ return;
+
+ if (check_cpu_on_node(pgdat))
+ return;
+
+ /*
+ * all memory/cpu of this node are removed, we can offline this
+ * node now.
+ */
+ node_set_offline(nid);
+ unregister_one_node(nid);
+}
+EXPORT_SYMBOL(try_offline_node);
+
+static int __ref try_remove_memory(int nid, u64 start, u64 size)
+{
+ int rc = 0;
+
+ BUG_ON(check_hotplug_memory_range(start, size));
+
+ /*
+ * All memory blocks must be offlined before removing memory. Check
+ * whether all memory blocks in question are offline and return error
+ * if this is not the case.
+ */
+ rc = walk_memory_blocks(start, size, NULL, check_memblock_offlined_cb);
+ if (rc)
+ return rc;
+
+ /* remove memmap entry */
+ firmware_map_remove(start, start + size, "System RAM");
+
+ /*
+ * Memory block device removal under the device_hotplug_lock is
+ * a barrier against racing online attempts.
+ */
+ remove_memory_block_devices(start, size);
+
+ mem_hotplug_begin();
+
+ arch_remove_memory(nid, start, size, NULL);
+
+ if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
+ memblock_free(start, size);
+ memblock_remove(start, size);
+ }
+
+ release_mem_region_adjustable(start, size);
+
+ try_offline_node(nid);
+
+ mem_hotplug_done();
+ return 0;
+}
+
+/**
+ * remove_memory
+ * @nid: the node ID
+ * @start: physical address of the region to remove
+ * @size: size of the region to remove
+ *
+ * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
+ * and online/offline operations before this call, as required by
+ * try_offline_node().
+ */
+void __remove_memory(int nid, u64 start, u64 size)
+{
+
+ /*
+ * trigger BUG() if some memory is not offlined prior to calling this
+ * function
+ */
+ if (try_remove_memory(nid, start, size))
+ BUG();
+}
+
+/*
+ * Remove memory if every memory block is offline, otherwise return -EBUSY is
+ * some memory is not offline
+ */
+int remove_memory(int nid, u64 start, u64 size)
+{
+ int rc;
+
+ lock_device_hotplug();
+ rc = try_remove_memory(nid, start, size);
+ unlock_device_hotplug();
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(remove_memory);
+
+static int try_offline_memory_block(struct memory_block *mem, void *arg)
+{
+ uint8_t online_type = MMOP_ONLINE_KERNEL;
+ uint8_t **online_types = arg;
+ struct page *page;
+ int rc;
+
+ /*
+ * Sense the online_type via the zone of the memory block. Offlining
+ * with multiple zones within one memory block will be rejected
+ * by offlining code ... so we don't care about that.
+ */
+ page = pfn_to_online_page(section_nr_to_pfn(mem->start_section_nr));
+ if (page && zone_idx(page_zone(page)) == ZONE_MOVABLE)
+ online_type = MMOP_ONLINE_MOVABLE;
+
+ rc = device_offline(&mem->dev);
+ /*
+ * Default is MMOP_OFFLINE - change it only if offlining succeeded,
+ * so try_reonline_memory_block() can do the right thing.
+ */
+ if (!rc)
+ **online_types = online_type;
+
+ (*online_types)++;
+ /* Ignore if already offline. */
+ return rc < 0 ? rc : 0;
+}
+
+static int try_reonline_memory_block(struct memory_block *mem, void *arg)
+{
+ uint8_t **online_types = arg;
+ int rc;
+
+ if (**online_types != MMOP_OFFLINE) {
+ mem->online_type = **online_types;
+ rc = device_online(&mem->dev);
+ if (rc < 0)
+ pr_warn("%s: Failed to re-online memory: %d",
+ __func__, rc);
+ }
+
+ /* Continue processing all remaining memory blocks. */
+ (*online_types)++;
+ return 0;
+}
+
+/*
+ * Try to offline and remove memory. Might take a long time to finish in case
+ * memory is still in use. Primarily useful for memory devices that logically
+ * unplugged all memory (so it's no longer in use) and want to offline + remove
+ * that memory.
+ */
+int offline_and_remove_memory(int nid, u64 start, u64 size)
+{
+ const unsigned long mb_count = size / memory_block_size_bytes();
+ uint8_t *online_types, *tmp;
+ int rc;
+
+ if (!IS_ALIGNED(start, memory_block_size_bytes()) ||
+ !IS_ALIGNED(size, memory_block_size_bytes()) || !size)
+ return -EINVAL;
+
+ /*
+ * We'll remember the old online type of each memory block, so we can
+ * try to revert whatever we did when offlining one memory block fails
+ * after offlining some others succeeded.
+ */
+ online_types = kmalloc_array(mb_count, sizeof(*online_types),
+ GFP_KERNEL);
+ if (!online_types)
+ return -ENOMEM;
+ /*
+ * Initialize all states to MMOP_OFFLINE, so when we abort processing in
+ * try_offline_memory_block(), we'll skip all unprocessed blocks in
+ * try_reonline_memory_block().
+ */
+ memset(online_types, MMOP_OFFLINE, mb_count);
+
+ lock_device_hotplug();
+
+ tmp = online_types;
+ rc = walk_memory_blocks(start, size, &tmp, try_offline_memory_block);
+
+ /*
+ * In case we succeeded to offline all memory, remove it.
+ * This cannot fail as it cannot get onlined in the meantime.
+ */
+ if (!rc) {
+ rc = try_remove_memory(nid, start, size);
+ if (rc)
+ pr_err("%s: Failed to remove memory: %d", __func__, rc);
+ }
+
+ /*
+ * Rollback what we did. While memory onlining might theoretically fail
+ * (nacked by a notifier), it barely ever happens.
+ */
+ if (rc) {
+ tmp = online_types;
+ walk_memory_blocks(start, size, &tmp,
+ try_reonline_memory_block);
+ }
+ unlock_device_hotplug();
+
+ kfree(online_types);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(offline_and_remove_memory);
+#endif /* CONFIG_MEMORY_HOTREMOVE */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
new file mode 100644
index 000000000..6c98585f2
--- /dev/null
+++ b/mm/mempolicy.c
@@ -0,0 +1,3050 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Simple NUMA memory policy for the Linux kernel.
+ *
+ * Copyright 2003,2004 Andi Kleen, SuSE Labs.
+ * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
+ *
+ * NUMA policy allows the user to give hints in which node(s) memory should
+ * be allocated.
+ *
+ * Support four policies per VMA and per process:
+ *
+ * The VMA policy has priority over the process policy for a page fault.
+ *
+ * interleave Allocate memory interleaved over a set of nodes,
+ * with normal fallback if it fails.
+ * For VMA based allocations this interleaves based on the
+ * offset into the backing object or offset into the mapping
+ * for anonymous memory. For process policy an process counter
+ * is used.
+ *
+ * bind Only allocate memory on a specific set of nodes,
+ * no fallback.
+ * FIXME: memory is allocated starting with the first node
+ * to the last. It would be better if bind would truly restrict
+ * the allocation to memory nodes instead
+ *
+ * preferred Try a specific node first before normal fallback.
+ * As a special case NUMA_NO_NODE here means do the allocation
+ * on the local CPU. This is normally identical to default,
+ * but useful to set in a VMA when you have a non default
+ * process policy.
+ *
+ * default Allocate on the local node first, or when on a VMA
+ * use the process policy. This is what Linux always did
+ * in a NUMA aware kernel and still does by, ahem, default.
+ *
+ * The process policy is applied for most non interrupt memory allocations
+ * in that process' context. Interrupts ignore the policies and always
+ * try to allocate on the local CPU. The VMA policy is only applied for memory
+ * allocations for a VMA in the VM.
+ *
+ * Currently there are a few corner cases in swapping where the policy
+ * is not applied, but the majority should be handled. When process policy
+ * is used it is not remembered over swap outs/swap ins.
+ *
+ * Only the highest zone in the zone hierarchy gets policied. Allocations
+ * requesting a lower zone just use default policy. This implies that
+ * on systems with highmem kernel lowmem allocation don't get policied.
+ * Same with GFP_DMA allocations.
+ *
+ * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
+ * all users and remembered even when nobody has memory mapped.
+ */
+
+/* Notebook:
+ fix mmap readahead to honour policy and enable policy for any page cache
+ object
+ statistics for bigpages
+ global policy for page cache? currently it uses process policy. Requires
+ first item above.
+ handle mremap for shared memory (currently ignored for the policy)
+ grows down?
+ make bind policy root only? It can trigger oom much faster and the
+ kernel is not always grateful with that.
+*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/mempolicy.h>
+#include <linux/pagewalk.h>
+#include <linux/highmem.h>
+#include <linux/hugetlb.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/numa_balancing.h>
+#include <linux/sched/task.h>
+#include <linux/nodemask.h>
+#include <linux/cpuset.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/export.h>
+#include <linux/nsproxy.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/compat.h>
+#include <linux/ptrace.h>
+#include <linux/swap.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include <linux/migrate.h>
+#include <linux/ksm.h>
+#include <linux/rmap.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/ctype.h>
+#include <linux/mm_inline.h>
+#include <linux/mmu_notifier.h>
+#include <linux/printk.h>
+#include <linux/swapops.h>
+
+#include <asm/tlbflush.h>
+#include <linux/uaccess.h>
+
+#include "internal.h"
+
+/* Internal flags */
+#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
+#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
+
+static struct kmem_cache *policy_cache;
+static struct kmem_cache *sn_cache;
+
+/* Highest zone. An specific allocation for a zone below that is not
+ policied. */
+enum zone_type policy_zone = 0;
+
+/*
+ * run-time system-wide default policy => local allocation
+ */
+static struct mempolicy default_policy = {
+ .refcnt = ATOMIC_INIT(1), /* never free it */
+ .mode = MPOL_PREFERRED,
+ .flags = MPOL_F_LOCAL,
+};
+
+static struct mempolicy preferred_node_policy[MAX_NUMNODES];
+
+/**
+ * numa_map_to_online_node - Find closest online node
+ * @node: Node id to start the search
+ *
+ * Lookup the next closest node by distance if @nid is not online.
+ */
+int numa_map_to_online_node(int node)
+{
+ int min_dist = INT_MAX, dist, n, min_node;
+
+ if (node == NUMA_NO_NODE || node_online(node))
+ return node;
+
+ min_node = node;
+ for_each_online_node(n) {
+ dist = node_distance(node, n);
+ if (dist < min_dist) {
+ min_dist = dist;
+ min_node = n;
+ }
+ }
+
+ return min_node;
+}
+EXPORT_SYMBOL_GPL(numa_map_to_online_node);
+
+struct mempolicy *get_task_policy(struct task_struct *p)
+{
+ struct mempolicy *pol = p->mempolicy;
+ int node;
+
+ if (pol)
+ return pol;
+
+ node = numa_node_id();
+ if (node != NUMA_NO_NODE) {
+ pol = &preferred_node_policy[node];
+ /* preferred_node_policy is not initialised early in boot */
+ if (pol->mode)
+ return pol;
+ }
+
+ return &default_policy;
+}
+
+static const struct mempolicy_operations {
+ int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
+ void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
+} mpol_ops[MPOL_MAX];
+
+static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
+{
+ return pol->flags & MPOL_MODE_FLAGS;
+}
+
+static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
+ const nodemask_t *rel)
+{
+ nodemask_t tmp;
+ nodes_fold(tmp, *orig, nodes_weight(*rel));
+ nodes_onto(*ret, tmp, *rel);
+}
+
+static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
+{
+ if (nodes_empty(*nodes))
+ return -EINVAL;
+ pol->v.nodes = *nodes;
+ return 0;
+}
+
+static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
+{
+ if (!nodes)
+ pol->flags |= MPOL_F_LOCAL; /* local allocation */
+ else if (nodes_empty(*nodes))
+ return -EINVAL; /* no allowed nodes */
+ else
+ pol->v.preferred_node = first_node(*nodes);
+ return 0;
+}
+
+static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
+{
+ if (nodes_empty(*nodes))
+ return -EINVAL;
+ pol->v.nodes = *nodes;
+ return 0;
+}
+
+/*
+ * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
+ * any, for the new policy. mpol_new() has already validated the nodes
+ * parameter with respect to the policy mode and flags. But, we need to
+ * handle an empty nodemask with MPOL_PREFERRED here.
+ *
+ * Must be called holding task's alloc_lock to protect task's mems_allowed
+ * and mempolicy. May also be called holding the mmap_lock for write.
+ */
+static int mpol_set_nodemask(struct mempolicy *pol,
+ const nodemask_t *nodes, struct nodemask_scratch *nsc)
+{
+ int ret;
+
+ /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
+ if (pol == NULL)
+ return 0;
+ /* Check N_MEMORY */
+ nodes_and(nsc->mask1,
+ cpuset_current_mems_allowed, node_states[N_MEMORY]);
+
+ VM_BUG_ON(!nodes);
+ if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
+ nodes = NULL; /* explicit local allocation */
+ else {
+ if (pol->flags & MPOL_F_RELATIVE_NODES)
+ mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
+ else
+ nodes_and(nsc->mask2, *nodes, nsc->mask1);
+
+ if (mpol_store_user_nodemask(pol))
+ pol->w.user_nodemask = *nodes;
+ else
+ pol->w.cpuset_mems_allowed =
+ cpuset_current_mems_allowed;
+ }
+
+ if (nodes)
+ ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
+ else
+ ret = mpol_ops[pol->mode].create(pol, NULL);
+ return ret;
+}
+
+/*
+ * This function just creates a new policy, does some check and simple
+ * initialization. You must invoke mpol_set_nodemask() to set nodes.
+ */
+static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
+ nodemask_t *nodes)
+{
+ struct mempolicy *policy;
+
+ pr_debug("setting mode %d flags %d nodes[0] %lx\n",
+ mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
+
+ if (mode == MPOL_DEFAULT) {
+ if (nodes && !nodes_empty(*nodes))
+ return ERR_PTR(-EINVAL);
+ return NULL;
+ }
+ VM_BUG_ON(!nodes);
+
+ /*
+ * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
+ * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
+ * All other modes require a valid pointer to a non-empty nodemask.
+ */
+ if (mode == MPOL_PREFERRED) {
+ if (nodes_empty(*nodes)) {
+ if (((flags & MPOL_F_STATIC_NODES) ||
+ (flags & MPOL_F_RELATIVE_NODES)))
+ return ERR_PTR(-EINVAL);
+ }
+ } else if (mode == MPOL_LOCAL) {
+ if (!nodes_empty(*nodes) ||
+ (flags & MPOL_F_STATIC_NODES) ||
+ (flags & MPOL_F_RELATIVE_NODES))
+ return ERR_PTR(-EINVAL);
+ mode = MPOL_PREFERRED;
+ } else if (nodes_empty(*nodes))
+ return ERR_PTR(-EINVAL);
+ policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
+ if (!policy)
+ return ERR_PTR(-ENOMEM);
+ atomic_set(&policy->refcnt, 1);
+ policy->mode = mode;
+ policy->flags = flags;
+
+ return policy;
+}
+
+/* Slow path of a mpol destructor. */
+void __mpol_put(struct mempolicy *p)
+{
+ if (!atomic_dec_and_test(&p->refcnt))
+ return;
+ kmem_cache_free(policy_cache, p);
+}
+
+static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
+{
+}
+
+static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
+{
+ nodemask_t tmp;
+
+ if (pol->flags & MPOL_F_STATIC_NODES)
+ nodes_and(tmp, pol->w.user_nodemask, *nodes);
+ else if (pol->flags & MPOL_F_RELATIVE_NODES)
+ mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
+ else {
+ nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed,
+ *nodes);
+ pol->w.cpuset_mems_allowed = *nodes;
+ }
+
+ if (nodes_empty(tmp))
+ tmp = *nodes;
+
+ pol->v.nodes = tmp;
+}
+
+static void mpol_rebind_preferred(struct mempolicy *pol,
+ const nodemask_t *nodes)
+{
+ nodemask_t tmp;
+
+ if (pol->flags & MPOL_F_STATIC_NODES) {
+ int node = first_node(pol->w.user_nodemask);
+
+ if (node_isset(node, *nodes)) {
+ pol->v.preferred_node = node;
+ pol->flags &= ~MPOL_F_LOCAL;
+ } else
+ pol->flags |= MPOL_F_LOCAL;
+ } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
+ mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
+ pol->v.preferred_node = first_node(tmp);
+ } else if (!(pol->flags & MPOL_F_LOCAL)) {
+ pol->v.preferred_node = node_remap(pol->v.preferred_node,
+ pol->w.cpuset_mems_allowed,
+ *nodes);
+ pol->w.cpuset_mems_allowed = *nodes;
+ }
+}
+
+/*
+ * mpol_rebind_policy - Migrate a policy to a different set of nodes
+ *
+ * Per-vma policies are protected by mmap_lock. Allocations using per-task
+ * policies are protected by task->mems_allowed_seq to prevent a premature
+ * OOM/allocation failure due to parallel nodemask modification.
+ */
+static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
+{
+ if (!pol || pol->mode == MPOL_LOCAL)
+ return;
+ if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
+ nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
+ return;
+
+ mpol_ops[pol->mode].rebind(pol, newmask);
+}
+
+/*
+ * Wrapper for mpol_rebind_policy() that just requires task
+ * pointer, and updates task mempolicy.
+ *
+ * Called with task's alloc_lock held.
+ */
+
+void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
+{
+ mpol_rebind_policy(tsk->mempolicy, new);
+}
+
+/*
+ * Rebind each vma in mm to new nodemask.
+ *
+ * Call holding a reference to mm. Takes mm->mmap_lock during call.
+ */
+
+void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
+{
+ struct vm_area_struct *vma;
+
+ mmap_write_lock(mm);
+ for (vma = mm->mmap; vma; vma = vma->vm_next)
+ mpol_rebind_policy(vma->vm_policy, new);
+ mmap_write_unlock(mm);
+}
+
+static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
+ [MPOL_DEFAULT] = {
+ .rebind = mpol_rebind_default,
+ },
+ [MPOL_INTERLEAVE] = {
+ .create = mpol_new_interleave,
+ .rebind = mpol_rebind_nodemask,
+ },
+ [MPOL_PREFERRED] = {
+ .create = mpol_new_preferred,
+ .rebind = mpol_rebind_preferred,
+ },
+ [MPOL_BIND] = {
+ .create = mpol_new_bind,
+ .rebind = mpol_rebind_nodemask,
+ },
+};
+
+static int migrate_page_add(struct page *page, struct list_head *pagelist,
+ unsigned long flags);
+
+struct queue_pages {
+ struct list_head *pagelist;
+ unsigned long flags;
+ nodemask_t *nmask;
+ unsigned long start;
+ unsigned long end;
+ struct vm_area_struct *first;
+};
+
+/*
+ * Check if the page's nid is in qp->nmask.
+ *
+ * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
+ * in the invert of qp->nmask.
+ */
+static inline bool queue_pages_required(struct page *page,
+ struct queue_pages *qp)
+{
+ int nid = page_to_nid(page);
+ unsigned long flags = qp->flags;
+
+ return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
+}
+
+/*
+ * queue_pages_pmd() has four possible return values:
+ * 0 - pages are placed on the right node or queued successfully.
+ * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
+ * specified.
+ * 2 - THP was split.
+ * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
+ * existing page was already on a node that does not follow the
+ * policy.
+ */
+static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+ __releases(ptl)
+{
+ int ret = 0;
+ struct page *page;
+ struct queue_pages *qp = walk->private;
+ unsigned long flags;
+
+ if (unlikely(is_pmd_migration_entry(*pmd))) {
+ ret = -EIO;
+ goto unlock;
+ }
+ page = pmd_page(*pmd);
+ if (is_huge_zero_page(page)) {
+ spin_unlock(ptl);
+ __split_huge_pmd(walk->vma, pmd, addr, false, NULL);
+ ret = 2;
+ goto out;
+ }
+ if (!queue_pages_required(page, qp))
+ goto unlock;
+
+ flags = qp->flags;
+ /* go to thp migration */
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+ if (!vma_migratable(walk->vma) ||
+ migrate_page_add(page, qp->pagelist, flags)) {
+ ret = 1;
+ goto unlock;
+ }
+ } else
+ ret = -EIO;
+unlock:
+ spin_unlock(ptl);
+out:
+ return ret;
+}
+
+/*
+ * Scan through pages checking if pages follow certain conditions,
+ * and move them to the pagelist if they do.
+ *
+ * queue_pages_pte_range() has three possible return values:
+ * 0 - pages are placed on the right node or queued successfully.
+ * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
+ * specified.
+ * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
+ * on a node that does not follow the policy.
+ */
+static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ struct vm_area_struct *vma = walk->vma;
+ struct page *page;
+ struct queue_pages *qp = walk->private;
+ unsigned long flags = qp->flags;
+ int ret;
+ bool has_unmovable = false;
+ pte_t *pte, *mapped_pte;
+ spinlock_t *ptl;
+
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
+ ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
+ if (ret != 2)
+ return ret;
+ }
+ /* THP was split, fall through to pte walk */
+
+ if (pmd_trans_unstable(pmd))
+ return 0;
+
+ mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ for (; addr != end; pte++, addr += PAGE_SIZE) {
+ if (!pte_present(*pte))
+ continue;
+ page = vm_normal_page(vma, addr, *pte);
+ if (!page)
+ continue;
+ /*
+ * vm_normal_page() filters out zero pages, but there might
+ * still be PageReserved pages to skip, perhaps in a VDSO.
+ */
+ if (PageReserved(page))
+ continue;
+ if (!queue_pages_required(page, qp))
+ continue;
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+ /* MPOL_MF_STRICT must be specified if we get here */
+ if (!vma_migratable(vma)) {
+ has_unmovable = true;
+ break;
+ }
+
+ /*
+ * Do not abort immediately since there may be
+ * temporary off LRU pages in the range. Still
+ * need migrate other LRU pages.
+ */
+ if (migrate_page_add(page, qp->pagelist, flags))
+ has_unmovable = true;
+ } else
+ break;
+ }
+ pte_unmap_unlock(mapped_pte, ptl);
+ cond_resched();
+
+ if (has_unmovable)
+ return 1;
+
+ return addr != end ? -EIO : 0;
+}
+
+static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ int ret = 0;
+#ifdef CONFIG_HUGETLB_PAGE
+ struct queue_pages *qp = walk->private;
+ unsigned long flags = (qp->flags & MPOL_MF_VALID);
+ struct page *page;
+ spinlock_t *ptl;
+ pte_t entry;
+
+ ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
+ entry = huge_ptep_get(pte);
+ if (!pte_present(entry))
+ goto unlock;
+ page = pte_page(entry);
+ if (!queue_pages_required(page, qp))
+ goto unlock;
+
+ if (flags == MPOL_MF_STRICT) {
+ /*
+ * STRICT alone means only detecting misplaced page and no
+ * need to further check other vma.
+ */
+ ret = -EIO;
+ goto unlock;
+ }
+
+ if (!vma_migratable(walk->vma)) {
+ /*
+ * Must be STRICT with MOVE*, otherwise .test_walk() have
+ * stopped walking current vma.
+ * Detecting misplaced page but allow migrating pages which
+ * have been queued.
+ */
+ ret = 1;
+ goto unlock;
+ }
+
+ /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
+ if (flags & (MPOL_MF_MOVE_ALL) ||
+ (flags & MPOL_MF_MOVE && page_mapcount(page) == 1 &&
+ !hugetlb_pmd_shared(pte))) {
+ if (isolate_hugetlb(page, qp->pagelist) &&
+ (flags & MPOL_MF_STRICT))
+ /*
+ * Failed to isolate page but allow migrating pages
+ * which have been queued.
+ */
+ ret = 1;
+ }
+unlock:
+ spin_unlock(ptl);
+#else
+ BUG();
+#endif
+ return ret;
+}
+
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * This is used to mark a range of virtual addresses to be inaccessible.
+ * These are later cleared by a NUMA hinting fault. Depending on these
+ * faults, pages may be migrated for better NUMA placement.
+ *
+ * This is assuming that NUMA faults are handled using PROT_NONE. If
+ * an architecture makes a different choice, it will need further
+ * changes to the core.
+ */
+unsigned long change_prot_numa(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ int nr_updated;
+
+ nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);
+ if (nr_updated)
+ count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
+
+ return nr_updated;
+}
+#else
+static unsigned long change_prot_numa(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ return 0;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+static int queue_pages_test_walk(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct vm_area_struct *vma = walk->vma;
+ struct queue_pages *qp = walk->private;
+ unsigned long endvma = vma->vm_end;
+ unsigned long flags = qp->flags;
+
+ /* range check first */
+ VM_BUG_ON_VMA((vma->vm_start > start) || (vma->vm_end < end), vma);
+
+ if (!qp->first) {
+ qp->first = vma;
+ if (!(flags & MPOL_MF_DISCONTIG_OK) &&
+ (qp->start < vma->vm_start))
+ /* hole at head side of range */
+ return -EFAULT;
+ }
+ if (!(flags & MPOL_MF_DISCONTIG_OK) &&
+ ((vma->vm_end < qp->end) &&
+ (!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
+ /* hole at middle or tail of range */
+ return -EFAULT;
+
+ /*
+ * Need check MPOL_MF_STRICT to return -EIO if possible
+ * regardless of vma_migratable
+ */
+ if (!vma_migratable(vma) &&
+ !(flags & MPOL_MF_STRICT))
+ return 1;
+
+ if (endvma > end)
+ endvma = end;
+
+ if (flags & MPOL_MF_LAZY) {
+ /* Similar to task_numa_work, skip inaccessible VMAs */
+ if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
+ !(vma->vm_flags & VM_MIXEDMAP))
+ change_prot_numa(vma, start, endvma);
+ return 1;
+ }
+
+ /* queue pages from current vma */
+ if (flags & MPOL_MF_VALID)
+ return 0;
+ return 1;
+}
+
+static const struct mm_walk_ops queue_pages_walk_ops = {
+ .hugetlb_entry = queue_pages_hugetlb,
+ .pmd_entry = queue_pages_pte_range,
+ .test_walk = queue_pages_test_walk,
+};
+
+/*
+ * Walk through page tables and collect pages to be migrated.
+ *
+ * If pages found in a given range are on a set of nodes (determined by
+ * @nodes and @flags,) it's isolated and queued to the pagelist which is
+ * passed via @private.
+ *
+ * queue_pages_range() has three possible return values:
+ * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
+ * specified.
+ * 0 - queue pages successfully or no misplaced page.
+ * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
+ * memory range specified by nodemask and maxnode points outside
+ * your accessible address space (-EFAULT)
+ */
+static int
+queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
+ nodemask_t *nodes, unsigned long flags,
+ struct list_head *pagelist)
+{
+ int err;
+ struct queue_pages qp = {
+ .pagelist = pagelist,
+ .flags = flags,
+ .nmask = nodes,
+ .start = start,
+ .end = end,
+ .first = NULL,
+ };
+
+ err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
+
+ if (!qp.first)
+ /* whole range in hole */
+ err = -EFAULT;
+
+ return err;
+}
+
+/*
+ * Apply policy to a single VMA
+ * This must be called with the mmap_lock held for writing.
+ */
+static int vma_replace_policy(struct vm_area_struct *vma,
+ struct mempolicy *pol)
+{
+ int err;
+ struct mempolicy *old;
+ struct mempolicy *new;
+
+ pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
+ vma->vm_start, vma->vm_end, vma->vm_pgoff,
+ vma->vm_ops, vma->vm_file,
+ vma->vm_ops ? vma->vm_ops->set_policy : NULL);
+
+ new = mpol_dup(pol);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ if (vma->vm_ops && vma->vm_ops->set_policy) {
+ err = vma->vm_ops->set_policy(vma, new);
+ if (err)
+ goto err_out;
+ }
+
+ old = vma->vm_policy;
+ vma->vm_policy = new; /* protected by mmap_lock */
+ mpol_put(old);
+
+ return 0;
+ err_out:
+ mpol_put(new);
+ return err;
+}
+
+/* Step 2: apply policy to a range and do splits. */
+static int mbind_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, struct mempolicy *new_pol)
+{
+ struct vm_area_struct *prev;
+ struct vm_area_struct *vma;
+ int err = 0;
+ pgoff_t pgoff;
+ unsigned long vmstart;
+ unsigned long vmend;
+
+ vma = find_vma(mm, start);
+ VM_BUG_ON(!vma);
+
+ prev = vma->vm_prev;
+ if (start > vma->vm_start)
+ prev = vma;
+
+ for (; vma && vma->vm_start < end; prev = vma, vma = vma->vm_next) {
+ vmstart = max(start, vma->vm_start);
+ vmend = min(end, vma->vm_end);
+
+ if (mpol_equal(vma_policy(vma), new_pol))
+ continue;
+
+ pgoff = vma->vm_pgoff +
+ ((vmstart - vma->vm_start) >> PAGE_SHIFT);
+ prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
+ vma->anon_vma, vma->vm_file, pgoff,
+ new_pol, vma->vm_userfaultfd_ctx);
+ if (prev) {
+ vma = prev;
+ goto replace;
+ }
+ if (vma->vm_start != vmstart) {
+ err = split_vma(vma->vm_mm, vma, vmstart, 1);
+ if (err)
+ goto out;
+ }
+ if (vma->vm_end != vmend) {
+ err = split_vma(vma->vm_mm, vma, vmend, 0);
+ if (err)
+ goto out;
+ }
+ replace:
+ err = vma_replace_policy(vma, new_pol);
+ if (err)
+ goto out;
+ }
+
+ out:
+ return err;
+}
+
+/* Set the process memory policy */
+static long do_set_mempolicy(unsigned short mode, unsigned short flags,
+ nodemask_t *nodes)
+{
+ struct mempolicy *new, *old;
+ NODEMASK_SCRATCH(scratch);
+ int ret;
+
+ if (!scratch)
+ return -ENOMEM;
+
+ new = mpol_new(mode, flags, nodes);
+ if (IS_ERR(new)) {
+ ret = PTR_ERR(new);
+ goto out;
+ }
+
+ ret = mpol_set_nodemask(new, nodes, scratch);
+ if (ret) {
+ mpol_put(new);
+ goto out;
+ }
+ task_lock(current);
+ old = current->mempolicy;
+ current->mempolicy = new;
+ if (new && new->mode == MPOL_INTERLEAVE)
+ current->il_prev = MAX_NUMNODES-1;
+ task_unlock(current);
+ mpol_put(old);
+ ret = 0;
+out:
+ NODEMASK_SCRATCH_FREE(scratch);
+ return ret;
+}
+
+/*
+ * Return nodemask for policy for get_mempolicy() query
+ *
+ * Called with task's alloc_lock held
+ */
+static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
+{
+ nodes_clear(*nodes);
+ if (p == &default_policy)
+ return;
+
+ switch (p->mode) {
+ case MPOL_BIND:
+ case MPOL_INTERLEAVE:
+ *nodes = p->v.nodes;
+ break;
+ case MPOL_PREFERRED:
+ if (!(p->flags & MPOL_F_LOCAL))
+ node_set(p->v.preferred_node, *nodes);
+ /* else return empty node mask for local allocation */
+ break;
+ default:
+ BUG();
+ }
+}
+
+static int lookup_node(struct mm_struct *mm, unsigned long addr)
+{
+ struct page *p = NULL;
+ int err;
+
+ int locked = 1;
+ err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
+ if (err > 0) {
+ err = page_to_nid(p);
+ put_page(p);
+ }
+ if (locked)
+ mmap_read_unlock(mm);
+ return err;
+}
+
+/* Retrieve NUMA policy */
+static long do_get_mempolicy(int *policy, nodemask_t *nmask,
+ unsigned long addr, unsigned long flags)
+{
+ int err;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma = NULL;
+ struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
+
+ if (flags &
+ ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
+ return -EINVAL;
+
+ if (flags & MPOL_F_MEMS_ALLOWED) {
+ if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
+ return -EINVAL;
+ *policy = 0; /* just so it's initialized */
+ task_lock(current);
+ *nmask = cpuset_current_mems_allowed;
+ task_unlock(current);
+ return 0;
+ }
+
+ if (flags & MPOL_F_ADDR) {
+ /*
+ * Do NOT fall back to task policy if the
+ * vma/shared policy at addr is NULL. We
+ * want to return MPOL_DEFAULT in this case.
+ */
+ mmap_read_lock(mm);
+ vma = find_vma_intersection(mm, addr, addr+1);
+ if (!vma) {
+ mmap_read_unlock(mm);
+ return -EFAULT;
+ }
+ if (vma->vm_ops && vma->vm_ops->get_policy)
+ pol = vma->vm_ops->get_policy(vma, addr);
+ else
+ pol = vma->vm_policy;
+ } else if (addr)
+ return -EINVAL;
+
+ if (!pol)
+ pol = &default_policy; /* indicates default behavior */
+
+ if (flags & MPOL_F_NODE) {
+ if (flags & MPOL_F_ADDR) {
+ /*
+ * Take a refcount on the mpol, lookup_node()
+ * wil drop the mmap_lock, so after calling
+ * lookup_node() only "pol" remains valid, "vma"
+ * is stale.
+ */
+ pol_refcount = pol;
+ vma = NULL;
+ mpol_get(pol);
+ err = lookup_node(mm, addr);
+ if (err < 0)
+ goto out;
+ *policy = err;
+ } else if (pol == current->mempolicy &&
+ pol->mode == MPOL_INTERLEAVE) {
+ *policy = next_node_in(current->il_prev, pol->v.nodes);
+ } else {
+ err = -EINVAL;
+ goto out;
+ }
+ } else {
+ *policy = pol == &default_policy ? MPOL_DEFAULT :
+ pol->mode;
+ /*
+ * Internal mempolicy flags must be masked off before exposing
+ * the policy to userspace.
+ */
+ *policy |= (pol->flags & MPOL_MODE_FLAGS);
+ }
+
+ err = 0;
+ if (nmask) {
+ if (mpol_store_user_nodemask(pol)) {
+ *nmask = pol->w.user_nodemask;
+ } else {
+ task_lock(current);
+ get_policy_nodemask(pol, nmask);
+ task_unlock(current);
+ }
+ }
+
+ out:
+ mpol_cond_put(pol);
+ if (vma)
+ mmap_read_unlock(mm);
+ if (pol_refcount)
+ mpol_put(pol_refcount);
+ return err;
+}
+
+#ifdef CONFIG_MIGRATION
+/*
+ * page migration, thp tail pages can be passed.
+ */
+static int migrate_page_add(struct page *page, struct list_head *pagelist,
+ unsigned long flags)
+{
+ struct page *head = compound_head(page);
+ /*
+ * Avoid migrating a page that is shared with others.
+ */
+ if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
+ if (!isolate_lru_page(head)) {
+ list_add_tail(&head->lru, pagelist);
+ mod_node_page_state(page_pgdat(head),
+ NR_ISOLATED_ANON + page_is_file_lru(head),
+ thp_nr_pages(head));
+ } else if (flags & MPOL_MF_STRICT) {
+ /*
+ * Non-movable page may reach here. And, there may be
+ * temporary off LRU pages or non-LRU movable pages.
+ * Treat them as unmovable pages since they can't be
+ * isolated, so they can't be moved at the moment. It
+ * should return -EIO for this case too.
+ */
+ return -EIO;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Migrate pages from one node to a target node.
+ * Returns error or the number of pages not migrated.
+ */
+static int migrate_to_node(struct mm_struct *mm, int source, int dest,
+ int flags)
+{
+ nodemask_t nmask;
+ LIST_HEAD(pagelist);
+ int err = 0;
+ struct migration_target_control mtc = {
+ .nid = dest,
+ .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
+ };
+
+ nodes_clear(nmask);
+ node_set(source, nmask);
+
+ /*
+ * This does not "check" the range but isolates all pages that
+ * need migration. Between passing in the full user address
+ * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
+ */
+ VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
+ queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
+ flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+
+ if (!list_empty(&pagelist)) {
+ err = migrate_pages(&pagelist, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
+ if (err)
+ putback_movable_pages(&pagelist);
+ }
+
+ return err;
+}
+
+/*
+ * Move pages between the two nodesets so as to preserve the physical
+ * layout as much as possible.
+ *
+ * Returns the number of page that could not be moved.
+ */
+int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
+ const nodemask_t *to, int flags)
+{
+ int busy = 0;
+ int err;
+ nodemask_t tmp;
+
+ err = migrate_prep();
+ if (err)
+ return err;
+
+ mmap_read_lock(mm);
+
+ /*
+ * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
+ * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
+ * bit in 'tmp', and return that <source, dest> pair for migration.
+ * The pair of nodemasks 'to' and 'from' define the map.
+ *
+ * If no pair of bits is found that way, fallback to picking some
+ * pair of 'source' and 'dest' bits that are not the same. If the
+ * 'source' and 'dest' bits are the same, this represents a node
+ * that will be migrating to itself, so no pages need move.
+ *
+ * If no bits are left in 'tmp', or if all remaining bits left
+ * in 'tmp' correspond to the same bit in 'to', return false
+ * (nothing left to migrate).
+ *
+ * This lets us pick a pair of nodes to migrate between, such that
+ * if possible the dest node is not already occupied by some other
+ * source node, minimizing the risk of overloading the memory on a
+ * node that would happen if we migrated incoming memory to a node
+ * before migrating outgoing memory source that same node.
+ *
+ * A single scan of tmp is sufficient. As we go, we remember the
+ * most recent <s, d> pair that moved (s != d). If we find a pair
+ * that not only moved, but what's better, moved to an empty slot
+ * (d is not set in tmp), then we break out then, with that pair.
+ * Otherwise when we finish scanning from_tmp, we at least have the
+ * most recent <s, d> pair that moved. If we get all the way through
+ * the scan of tmp without finding any node that moved, much less
+ * moved to an empty node, then there is nothing left worth migrating.
+ */
+
+ tmp = *from;
+ while (!nodes_empty(tmp)) {
+ int s,d;
+ int source = NUMA_NO_NODE;
+ int dest = 0;
+
+ for_each_node_mask(s, tmp) {
+
+ /*
+ * do_migrate_pages() tries to maintain the relative
+ * node relationship of the pages established between
+ * threads and memory areas.
+ *
+ * However if the number of source nodes is not equal to
+ * the number of destination nodes we can not preserve
+ * this node relative relationship. In that case, skip
+ * copying memory from a node that is in the destination
+ * mask.
+ *
+ * Example: [2,3,4] -> [3,4,5] moves everything.
+ * [0-7] - > [3,4,5] moves only 0,1,2,6,7.
+ */
+
+ if ((nodes_weight(*from) != nodes_weight(*to)) &&
+ (node_isset(s, *to)))
+ continue;
+
+ d = node_remap(s, *from, *to);
+ if (s == d)
+ continue;
+
+ source = s; /* Node moved. Memorize */
+ dest = d;
+
+ /* dest not in remaining from nodes? */
+ if (!node_isset(dest, tmp))
+ break;
+ }
+ if (source == NUMA_NO_NODE)
+ break;
+
+ node_clear(source, tmp);
+ err = migrate_to_node(mm, source, dest, flags);
+ if (err > 0)
+ busy += err;
+ if (err < 0)
+ break;
+ }
+ mmap_read_unlock(mm);
+ if (err < 0)
+ return err;
+ return busy;
+
+}
+
+/*
+ * Allocate a new page for page migration based on vma policy.
+ * Start by assuming the page is mapped by the same vma as contains @start.
+ * Search forward from there, if not. N.B., this assumes that the
+ * list of pages handed to migrate_pages()--which is how we get here--
+ * is in virtual address order.
+ */
+static struct page *new_page(struct page *page, unsigned long start)
+{
+ struct vm_area_struct *vma;
+ unsigned long address;
+
+ vma = find_vma(current->mm, start);
+ while (vma) {
+ address = page_address_in_vma(page, vma);
+ if (address != -EFAULT)
+ break;
+ vma = vma->vm_next;
+ }
+
+ if (PageHuge(page)) {
+ return alloc_huge_page_vma(page_hstate(compound_head(page)),
+ vma, address);
+ } else if (PageTransHuge(page)) {
+ struct page *thp;
+
+ thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
+ HPAGE_PMD_ORDER);
+ if (!thp)
+ return NULL;
+ prep_transhuge_page(thp);
+ return thp;
+ }
+ /*
+ * if !vma, alloc_page_vma() will use task or system default policy
+ */
+ return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
+ vma, address);
+}
+#else
+
+static int migrate_page_add(struct page *page, struct list_head *pagelist,
+ unsigned long flags)
+{
+ return -EIO;
+}
+
+int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
+ const nodemask_t *to, int flags)
+{
+ return -ENOSYS;
+}
+
+static struct page *new_page(struct page *page, unsigned long start)
+{
+ return NULL;
+}
+#endif
+
+static long do_mbind(unsigned long start, unsigned long len,
+ unsigned short mode, unsigned short mode_flags,
+ nodemask_t *nmask, unsigned long flags)
+{
+ struct mm_struct *mm = current->mm;
+ struct mempolicy *new;
+ unsigned long end;
+ int err;
+ int ret;
+ LIST_HEAD(pagelist);
+
+ if (flags & ~(unsigned long)MPOL_MF_VALID)
+ return -EINVAL;
+ if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
+ return -EPERM;
+
+ if (start & ~PAGE_MASK)
+ return -EINVAL;
+
+ if (mode == MPOL_DEFAULT)
+ flags &= ~MPOL_MF_STRICT;
+
+ len = (len + PAGE_SIZE - 1) & PAGE_MASK;
+ end = start + len;
+
+ if (end < start)
+ return -EINVAL;
+ if (end == start)
+ return 0;
+
+ new = mpol_new(mode, mode_flags, nmask);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ if (flags & MPOL_MF_LAZY)
+ new->flags |= MPOL_F_MOF;
+
+ /*
+ * If we are using the default policy then operation
+ * on discontinuous address spaces is okay after all
+ */
+ if (!new)
+ flags |= MPOL_MF_DISCONTIG_OK;
+
+ pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
+ start, start + len, mode, mode_flags,
+ nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
+
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+
+ err = migrate_prep();
+ if (err)
+ goto mpol_out;
+ }
+ {
+ NODEMASK_SCRATCH(scratch);
+ if (scratch) {
+ mmap_write_lock(mm);
+ err = mpol_set_nodemask(new, nmask, scratch);
+ if (err)
+ mmap_write_unlock(mm);
+ } else
+ err = -ENOMEM;
+ NODEMASK_SCRATCH_FREE(scratch);
+ }
+ if (err)
+ goto mpol_out;
+
+ ret = queue_pages_range(mm, start, end, nmask,
+ flags | MPOL_MF_INVERT, &pagelist);
+
+ if (ret < 0) {
+ err = ret;
+ goto up_out;
+ }
+
+ err = mbind_range(mm, start, end, new);
+
+ if (!err) {
+ int nr_failed = 0;
+
+ if (!list_empty(&pagelist)) {
+ WARN_ON_ONCE(flags & MPOL_MF_LAZY);
+ nr_failed = migrate_pages(&pagelist, new_page, NULL,
+ start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
+ if (nr_failed)
+ putback_movable_pages(&pagelist);
+ }
+
+ if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
+ err = -EIO;
+ } else {
+up_out:
+ if (!list_empty(&pagelist))
+ putback_movable_pages(&pagelist);
+ }
+
+ mmap_write_unlock(mm);
+mpol_out:
+ mpol_put(new);
+ return err;
+}
+
+/*
+ * User space interface with variable sized bitmaps for nodelists.
+ */
+
+/* Copy a node mask from user space. */
+static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
+ unsigned long maxnode)
+{
+ unsigned long k;
+ unsigned long t;
+ unsigned long nlongs;
+ unsigned long endmask;
+
+ --maxnode;
+ nodes_clear(*nodes);
+ if (maxnode == 0 || !nmask)
+ return 0;
+ if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
+ return -EINVAL;
+
+ nlongs = BITS_TO_LONGS(maxnode);
+ if ((maxnode % BITS_PER_LONG) == 0)
+ endmask = ~0UL;
+ else
+ endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
+
+ /*
+ * When the user specified more nodes than supported just check
+ * if the non supported part is all zero.
+ *
+ * If maxnode have more longs than MAX_NUMNODES, check
+ * the bits in that area first. And then go through to
+ * check the rest bits which equal or bigger than MAX_NUMNODES.
+ * Otherwise, just check bits [MAX_NUMNODES, maxnode).
+ */
+ if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
+ for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
+ if (get_user(t, nmask + k))
+ return -EFAULT;
+ if (k == nlongs - 1) {
+ if (t & endmask)
+ return -EINVAL;
+ } else if (t)
+ return -EINVAL;
+ }
+ nlongs = BITS_TO_LONGS(MAX_NUMNODES);
+ endmask = ~0UL;
+ }
+
+ if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
+ unsigned long valid_mask = endmask;
+
+ valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
+ if (get_user(t, nmask + nlongs - 1))
+ return -EFAULT;
+ if (t & valid_mask)
+ return -EINVAL;
+ }
+
+ if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
+ return -EFAULT;
+ nodes_addr(*nodes)[nlongs-1] &= endmask;
+ return 0;
+}
+
+/* Copy a kernel node mask to user space */
+static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
+ nodemask_t *nodes)
+{
+ unsigned long copy = ALIGN(maxnode-1, 64) / 8;
+ unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
+
+ if (copy > nbytes) {
+ if (copy > PAGE_SIZE)
+ return -EINVAL;
+ if (clear_user((char __user *)mask + nbytes, copy - nbytes))
+ return -EFAULT;
+ copy = nbytes;
+ }
+ return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
+}
+
+static long kernel_mbind(unsigned long start, unsigned long len,
+ unsigned long mode, const unsigned long __user *nmask,
+ unsigned long maxnode, unsigned int flags)
+{
+ nodemask_t nodes;
+ int err;
+ unsigned short mode_flags;
+
+ start = untagged_addr(start);
+ mode_flags = mode & MPOL_MODE_FLAGS;
+ mode &= ~MPOL_MODE_FLAGS;
+ if (mode >= MPOL_MAX)
+ return -EINVAL;
+ if ((mode_flags & MPOL_F_STATIC_NODES) &&
+ (mode_flags & MPOL_F_RELATIVE_NODES))
+ return -EINVAL;
+ err = get_nodes(&nodes, nmask, maxnode);
+ if (err)
+ return err;
+ return do_mbind(start, len, mode, mode_flags, &nodes, flags);
+}
+
+SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
+ unsigned long, mode, const unsigned long __user *, nmask,
+ unsigned long, maxnode, unsigned int, flags)
+{
+ return kernel_mbind(start, len, mode, nmask, maxnode, flags);
+}
+
+/* Set the process memory policy */
+static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
+ unsigned long maxnode)
+{
+ int err;
+ nodemask_t nodes;
+ unsigned short flags;
+
+ flags = mode & MPOL_MODE_FLAGS;
+ mode &= ~MPOL_MODE_FLAGS;
+ if ((unsigned int)mode >= MPOL_MAX)
+ return -EINVAL;
+ if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
+ return -EINVAL;
+ err = get_nodes(&nodes, nmask, maxnode);
+ if (err)
+ return err;
+ return do_set_mempolicy(mode, flags, &nodes);
+}
+
+SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
+ unsigned long, maxnode)
+{
+ return kernel_set_mempolicy(mode, nmask, maxnode);
+}
+
+static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
+ const unsigned long __user *old_nodes,
+ const unsigned long __user *new_nodes)
+{
+ struct mm_struct *mm = NULL;
+ struct task_struct *task;
+ nodemask_t task_nodes;
+ int err;
+ nodemask_t *old;
+ nodemask_t *new;
+ NODEMASK_SCRATCH(scratch);
+
+ if (!scratch)
+ return -ENOMEM;
+
+ old = &scratch->mask1;
+ new = &scratch->mask2;
+
+ err = get_nodes(old, old_nodes, maxnode);
+ if (err)
+ goto out;
+
+ err = get_nodes(new, new_nodes, maxnode);
+ if (err)
+ goto out;
+
+ /* Find the mm_struct */
+ rcu_read_lock();
+ task = pid ? find_task_by_vpid(pid) : current;
+ if (!task) {
+ rcu_read_unlock();
+ err = -ESRCH;
+ goto out;
+ }
+ get_task_struct(task);
+
+ err = -EINVAL;
+
+ /*
+ * Check if this process has the right to modify the specified process.
+ * Use the regular "ptrace_may_access()" checks.
+ */
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
+ rcu_read_unlock();
+ err = -EPERM;
+ goto out_put;
+ }
+ rcu_read_unlock();
+
+ task_nodes = cpuset_mems_allowed(task);
+ /* Is the user allowed to access the target nodes? */
+ if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
+ err = -EPERM;
+ goto out_put;
+ }
+
+ task_nodes = cpuset_mems_allowed(current);
+ nodes_and(*new, *new, task_nodes);
+ if (nodes_empty(*new))
+ goto out_put;
+
+ err = security_task_movememory(task);
+ if (err)
+ goto out_put;
+
+ mm = get_task_mm(task);
+ put_task_struct(task);
+
+ if (!mm) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = do_migrate_pages(mm, old, new,
+ capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
+
+ mmput(mm);
+out:
+ NODEMASK_SCRATCH_FREE(scratch);
+
+ return err;
+
+out_put:
+ put_task_struct(task);
+ goto out;
+
+}
+
+SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
+ const unsigned long __user *, old_nodes,
+ const unsigned long __user *, new_nodes)
+{
+ return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
+}
+
+
+/* Retrieve NUMA policy */
+static int kernel_get_mempolicy(int __user *policy,
+ unsigned long __user *nmask,
+ unsigned long maxnode,
+ unsigned long addr,
+ unsigned long flags)
+{
+ int err;
+ int pval;
+ nodemask_t nodes;
+
+ if (nmask != NULL && maxnode < nr_node_ids)
+ return -EINVAL;
+
+ addr = untagged_addr(addr);
+
+ err = do_get_mempolicy(&pval, &nodes, addr, flags);
+
+ if (err)
+ return err;
+
+ if (policy && put_user(pval, policy))
+ return -EFAULT;
+
+ if (nmask)
+ err = copy_nodes_to_user(nmask, maxnode, &nodes);
+
+ return err;
+}
+
+SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
+ unsigned long __user *, nmask, unsigned long, maxnode,
+ unsigned long, addr, unsigned long, flags)
+{
+ return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
+}
+
+#ifdef CONFIG_COMPAT
+
+COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
+ compat_ulong_t __user *, nmask,
+ compat_ulong_t, maxnode,
+ compat_ulong_t, addr, compat_ulong_t, flags)
+{
+ long err;
+ unsigned long __user *nm = NULL;
+ unsigned long nr_bits, alloc_size;
+ DECLARE_BITMAP(bm, MAX_NUMNODES);
+
+ nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
+ alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
+
+ if (nmask)
+ nm = compat_alloc_user_space(alloc_size);
+
+ err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
+
+ if (!err && nmask) {
+ unsigned long copy_size;
+ copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
+ err = copy_from_user(bm, nm, copy_size);
+ /* ensure entire bitmap is zeroed */
+ err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
+ err |= compat_put_bitmap(nmask, bm, nr_bits);
+ }
+
+ return err;
+}
+
+COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
+ compat_ulong_t, maxnode)
+{
+ unsigned long __user *nm = NULL;
+ unsigned long nr_bits, alloc_size;
+ DECLARE_BITMAP(bm, MAX_NUMNODES);
+
+ nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
+ alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
+
+ if (nmask) {
+ if (compat_get_bitmap(bm, nmask, nr_bits))
+ return -EFAULT;
+ nm = compat_alloc_user_space(alloc_size);
+ if (copy_to_user(nm, bm, alloc_size))
+ return -EFAULT;
+ }
+
+ return kernel_set_mempolicy(mode, nm, nr_bits+1);
+}
+
+COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
+ compat_ulong_t, mode, compat_ulong_t __user *, nmask,
+ compat_ulong_t, maxnode, compat_ulong_t, flags)
+{
+ unsigned long __user *nm = NULL;
+ unsigned long nr_bits, alloc_size;
+ nodemask_t bm;
+
+ nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
+ alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
+
+ if (nmask) {
+ if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
+ return -EFAULT;
+ nm = compat_alloc_user_space(alloc_size);
+ if (copy_to_user(nm, nodes_addr(bm), alloc_size))
+ return -EFAULT;
+ }
+
+ return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
+}
+
+COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
+ compat_ulong_t, maxnode,
+ const compat_ulong_t __user *, old_nodes,
+ const compat_ulong_t __user *, new_nodes)
+{
+ unsigned long __user *old = NULL;
+ unsigned long __user *new = NULL;
+ nodemask_t tmp_mask;
+ unsigned long nr_bits;
+ unsigned long size;
+
+ nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
+ size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
+ if (old_nodes) {
+ if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
+ return -EFAULT;
+ old = compat_alloc_user_space(new_nodes ? size * 2 : size);
+ if (new_nodes)
+ new = old + size / sizeof(unsigned long);
+ if (copy_to_user(old, nodes_addr(tmp_mask), size))
+ return -EFAULT;
+ }
+ if (new_nodes) {
+ if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
+ return -EFAULT;
+ if (new == NULL)
+ new = compat_alloc_user_space(size);
+ if (copy_to_user(new, nodes_addr(tmp_mask), size))
+ return -EFAULT;
+ }
+ return kernel_migrate_pages(pid, nr_bits + 1, old, new);
+}
+
+#endif /* CONFIG_COMPAT */
+
+bool vma_migratable(struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+ return false;
+
+ /*
+ * DAX device mappings require predictable access latency, so avoid
+ * incurring periodic faults.
+ */
+ if (vma_is_dax(vma))
+ return false;
+
+ if (is_vm_hugetlb_page(vma) &&
+ !hugepage_migration_supported(hstate_vma(vma)))
+ return false;
+
+ /*
+ * Migration allocates pages in the highest zone. If we cannot
+ * do so then migration (at least from node to node) is not
+ * possible.
+ */
+ if (vma->vm_file &&
+ gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
+ < policy_zone)
+ return false;
+ return true;
+}
+
+struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ struct mempolicy *pol = NULL;
+
+ if (vma) {
+ if (vma->vm_ops && vma->vm_ops->get_policy) {
+ pol = vma->vm_ops->get_policy(vma, addr);
+ } else if (vma->vm_policy) {
+ pol = vma->vm_policy;
+
+ /*
+ * shmem_alloc_page() passes MPOL_F_SHARED policy with
+ * a pseudo vma whose vma->vm_ops=NULL. Take a reference
+ * count on these policies which will be dropped by
+ * mpol_cond_put() later
+ */
+ if (mpol_needs_cond_ref(pol))
+ mpol_get(pol);
+ }
+ }
+
+ return pol;
+}
+
+/*
+ * get_vma_policy(@vma, @addr)
+ * @vma: virtual memory area whose policy is sought
+ * @addr: address in @vma for shared policy lookup
+ *
+ * Returns effective policy for a VMA at specified address.
+ * Falls back to current->mempolicy or system default policy, as necessary.
+ * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
+ * count--added by the get_policy() vm_op, as appropriate--to protect against
+ * freeing by another task. It is the caller's responsibility to free the
+ * extra reference for shared policies.
+ */
+static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ struct mempolicy *pol = __get_vma_policy(vma, addr);
+
+ if (!pol)
+ pol = get_task_policy(current);
+
+ return pol;
+}
+
+bool vma_policy_mof(struct vm_area_struct *vma)
+{
+ struct mempolicy *pol;
+
+ if (vma->vm_ops && vma->vm_ops->get_policy) {
+ bool ret = false;
+
+ pol = vma->vm_ops->get_policy(vma, vma->vm_start);
+ if (pol && (pol->flags & MPOL_F_MOF))
+ ret = true;
+ mpol_cond_put(pol);
+
+ return ret;
+ }
+
+ pol = vma->vm_policy;
+ if (!pol)
+ pol = get_task_policy(current);
+
+ return pol->flags & MPOL_F_MOF;
+}
+
+static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
+{
+ enum zone_type dynamic_policy_zone = policy_zone;
+
+ BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
+
+ /*
+ * if policy->v.nodes has movable memory only,
+ * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
+ *
+ * policy->v.nodes is intersect with node_states[N_MEMORY].
+ * so if the following test faile, it implies
+ * policy->v.nodes has movable memory only.
+ */
+ if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
+ dynamic_policy_zone = ZONE_MOVABLE;
+
+ return zone >= dynamic_policy_zone;
+}
+
+/*
+ * Return a nodemask representing a mempolicy for filtering nodes for
+ * page allocation
+ */
+nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
+{
+ /* Lower zones don't get a nodemask applied for MPOL_BIND */
+ if (unlikely(policy->mode == MPOL_BIND) &&
+ apply_policy_zone(policy, gfp_zone(gfp)) &&
+ cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
+ return &policy->v.nodes;
+
+ return NULL;
+}
+
+/* Return the node id preferred by the given mempolicy, or the given id */
+static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
+{
+ if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
+ nd = policy->v.preferred_node;
+ else {
+ /*
+ * __GFP_THISNODE shouldn't even be used with the bind policy
+ * because we might easily break the expectation to stay on the
+ * requested node and not break the policy.
+ */
+ WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
+ }
+
+ return nd;
+}
+
+/* Do dynamic interleaving for a process */
+static unsigned interleave_nodes(struct mempolicy *policy)
+{
+ unsigned next;
+ struct task_struct *me = current;
+
+ next = next_node_in(me->il_prev, policy->v.nodes);
+ if (next < MAX_NUMNODES)
+ me->il_prev = next;
+ return next;
+}
+
+/*
+ * Depending on the memory policy provide a node from which to allocate the
+ * next slab entry.
+ */
+unsigned int mempolicy_slab_node(void)
+{
+ struct mempolicy *policy;
+ int node = numa_mem_id();
+
+ if (in_interrupt())
+ return node;
+
+ policy = current->mempolicy;
+ if (!policy || policy->flags & MPOL_F_LOCAL)
+ return node;
+
+ switch (policy->mode) {
+ case MPOL_PREFERRED:
+ /*
+ * handled MPOL_F_LOCAL above
+ */
+ return policy->v.preferred_node;
+
+ case MPOL_INTERLEAVE:
+ return interleave_nodes(policy);
+
+ case MPOL_BIND: {
+ struct zoneref *z;
+
+ /*
+ * Follow bind policy behavior and start allocation at the
+ * first node.
+ */
+ struct zonelist *zonelist;
+ enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
+ zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
+ z = first_zones_zonelist(zonelist, highest_zoneidx,
+ &policy->v.nodes);
+ return z->zone ? zone_to_nid(z->zone) : node;
+ }
+
+ default:
+ BUG();
+ }
+}
+
+/*
+ * Do static interleaving for a VMA with known offset @n. Returns the n'th
+ * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
+ * number of present nodes.
+ */
+static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
+{
+ unsigned nnodes = nodes_weight(pol->v.nodes);
+ unsigned target;
+ int i;
+ int nid;
+
+ if (!nnodes)
+ return numa_node_id();
+ target = (unsigned int)n % nnodes;
+ nid = first_node(pol->v.nodes);
+ for (i = 0; i < target; i++)
+ nid = next_node(nid, pol->v.nodes);
+ return nid;
+}
+
+/* Determine a node number for interleave */
+static inline unsigned interleave_nid(struct mempolicy *pol,
+ struct vm_area_struct *vma, unsigned long addr, int shift)
+{
+ if (vma) {
+ unsigned long off;
+
+ /*
+ * for small pages, there is no difference between
+ * shift and PAGE_SHIFT, so the bit-shift is safe.
+ * for huge pages, since vm_pgoff is in units of small
+ * pages, we need to shift off the always 0 bits to get
+ * a useful offset.
+ */
+ BUG_ON(shift < PAGE_SHIFT);
+ off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
+ off += (addr - vma->vm_start) >> shift;
+ return offset_il_node(pol, off);
+ } else
+ return interleave_nodes(pol);
+}
+
+#ifdef CONFIG_HUGETLBFS
+/*
+ * huge_node(@vma, @addr, @gfp_flags, @mpol)
+ * @vma: virtual memory area whose policy is sought
+ * @addr: address in @vma for shared policy lookup and interleave policy
+ * @gfp_flags: for requested zone
+ * @mpol: pointer to mempolicy pointer for reference counted mempolicy
+ * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
+ *
+ * Returns a nid suitable for a huge page allocation and a pointer
+ * to the struct mempolicy for conditional unref after allocation.
+ * If the effective policy is 'BIND, returns a pointer to the mempolicy's
+ * @nodemask for filtering the zonelist.
+ *
+ * Must be protected by read_mems_allowed_begin()
+ */
+int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
+ struct mempolicy **mpol, nodemask_t **nodemask)
+{
+ int nid;
+
+ *mpol = get_vma_policy(vma, addr);
+ *nodemask = NULL; /* assume !MPOL_BIND */
+
+ if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
+ nid = interleave_nid(*mpol, vma, addr,
+ huge_page_shift(hstate_vma(vma)));
+ } else {
+ nid = policy_node(gfp_flags, *mpol, numa_node_id());
+ if ((*mpol)->mode == MPOL_BIND)
+ *nodemask = &(*mpol)->v.nodes;
+ }
+ return nid;
+}
+
+/*
+ * init_nodemask_of_mempolicy
+ *
+ * If the current task's mempolicy is "default" [NULL], return 'false'
+ * to indicate default policy. Otherwise, extract the policy nodemask
+ * for 'bind' or 'interleave' policy into the argument nodemask, or
+ * initialize the argument nodemask to contain the single node for
+ * 'preferred' or 'local' policy and return 'true' to indicate presence
+ * of non-default mempolicy.
+ *
+ * We don't bother with reference counting the mempolicy [mpol_get/put]
+ * because the current task is examining it's own mempolicy and a task's
+ * mempolicy is only ever changed by the task itself.
+ *
+ * N.B., it is the caller's responsibility to free a returned nodemask.
+ */
+bool init_nodemask_of_mempolicy(nodemask_t *mask)
+{
+ struct mempolicy *mempolicy;
+ int nid;
+
+ if (!(mask && current->mempolicy))
+ return false;
+
+ task_lock(current);
+ mempolicy = current->mempolicy;
+ switch (mempolicy->mode) {
+ case MPOL_PREFERRED:
+ if (mempolicy->flags & MPOL_F_LOCAL)
+ nid = numa_node_id();
+ else
+ nid = mempolicy->v.preferred_node;
+ init_nodemask_of_node(mask, nid);
+ break;
+
+ case MPOL_BIND:
+ case MPOL_INTERLEAVE:
+ *mask = mempolicy->v.nodes;
+ break;
+
+ default:
+ BUG();
+ }
+ task_unlock(current);
+
+ return true;
+}
+#endif
+
+/*
+ * mempolicy_nodemask_intersects
+ *
+ * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
+ * policy. Otherwise, check for intersection between mask and the policy
+ * nodemask for 'bind' or 'interleave' policy. For 'perferred' or 'local'
+ * policy, always return true since it may allocate elsewhere on fallback.
+ *
+ * Takes task_lock(tsk) to prevent freeing of its mempolicy.
+ */
+bool mempolicy_nodemask_intersects(struct task_struct *tsk,
+ const nodemask_t *mask)
+{
+ struct mempolicy *mempolicy;
+ bool ret = true;
+
+ if (!mask)
+ return ret;
+ task_lock(tsk);
+ mempolicy = tsk->mempolicy;
+ if (!mempolicy)
+ goto out;
+
+ switch (mempolicy->mode) {
+ case MPOL_PREFERRED:
+ /*
+ * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
+ * allocate from, they may fallback to other nodes when oom.
+ * Thus, it's possible for tsk to have allocated memory from
+ * nodes in mask.
+ */
+ break;
+ case MPOL_BIND:
+ case MPOL_INTERLEAVE:
+ ret = nodes_intersects(mempolicy->v.nodes, *mask);
+ break;
+ default:
+ BUG();
+ }
+out:
+ task_unlock(tsk);
+ return ret;
+}
+
+/* Allocate a page in interleaved policy.
+ Own path because it needs to do special accounting. */
+static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
+ unsigned nid)
+{
+ struct page *page;
+
+ page = __alloc_pages(gfp, order, nid);
+ /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
+ if (!static_branch_likely(&vm_numa_stat_key))
+ return page;
+ if (page && page_to_nid(page) == nid) {
+ preempt_disable();
+ __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
+ preempt_enable();
+ }
+ return page;
+}
+
+/**
+ * alloc_pages_vma - Allocate a page for a VMA.
+ *
+ * @gfp:
+ * %GFP_USER user allocation.
+ * %GFP_KERNEL kernel allocations,
+ * %GFP_HIGHMEM highmem/user allocations,
+ * %GFP_FS allocation should not call back into a file system.
+ * %GFP_ATOMIC don't sleep.
+ *
+ * @order:Order of the GFP allocation.
+ * @vma: Pointer to VMA or NULL if not available.
+ * @addr: Virtual Address of the allocation. Must be inside the VMA.
+ * @node: Which node to prefer for allocation (modulo policy).
+ * @hugepage: for hugepages try only the preferred node if possible
+ *
+ * This function allocates a page from the kernel page pool and applies
+ * a NUMA policy associated with the VMA or the current process.
+ * When VMA is not NULL caller must read-lock the mmap_lock of the
+ * mm_struct of the VMA to prevent it from going away. Should be used for
+ * all allocations for pages that will be mapped into user space. Returns
+ * NULL when no page can be allocated.
+ */
+struct page *
+alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
+ unsigned long addr, int node, bool hugepage)
+{
+ struct mempolicy *pol;
+ struct page *page;
+ int preferred_nid;
+ nodemask_t *nmask;
+
+ pol = get_vma_policy(vma, addr);
+
+ if (pol->mode == MPOL_INTERLEAVE) {
+ unsigned nid;
+
+ nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
+ mpol_cond_put(pol);
+ page = alloc_page_interleave(gfp, order, nid);
+ goto out;
+ }
+
+ if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
+ int hpage_node = node;
+
+ /*
+ * For hugepage allocation and non-interleave policy which
+ * allows the current node (or other explicitly preferred
+ * node) we only try to allocate from the current/preferred
+ * node and don't fall back to other nodes, as the cost of
+ * remote accesses would likely offset THP benefits.
+ *
+ * If the policy is interleave, or does not allow the current
+ * node in its nodemask, we allocate the standard way.
+ */
+ if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
+ hpage_node = pol->v.preferred_node;
+
+ nmask = policy_nodemask(gfp, pol);
+ if (!nmask || node_isset(hpage_node, *nmask)) {
+ mpol_cond_put(pol);
+ /*
+ * First, try to allocate THP only on local node, but
+ * don't reclaim unnecessarily, just compact.
+ */
+ page = __alloc_pages_node(hpage_node,
+ gfp | __GFP_THISNODE | __GFP_NORETRY, order);
+
+ /*
+ * If hugepage allocations are configured to always
+ * synchronous compact or the vma has been madvised
+ * to prefer hugepage backing, retry allowing remote
+ * memory with both reclaim and compact as well.
+ */
+ if (!page && (gfp & __GFP_DIRECT_RECLAIM))
+ page = __alloc_pages_nodemask(gfp, order,
+ hpage_node, nmask);
+
+ goto out;
+ }
+ }
+
+ nmask = policy_nodemask(gfp, pol);
+ preferred_nid = policy_node(gfp, pol, node);
+ page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
+ mpol_cond_put(pol);
+out:
+ return page;
+}
+EXPORT_SYMBOL(alloc_pages_vma);
+
+/**
+ * alloc_pages_current - Allocate pages.
+ *
+ * @gfp:
+ * %GFP_USER user allocation,
+ * %GFP_KERNEL kernel allocation,
+ * %GFP_HIGHMEM highmem allocation,
+ * %GFP_FS don't call back into a file system.
+ * %GFP_ATOMIC don't sleep.
+ * @order: Power of two of allocation size in pages. 0 is a single page.
+ *
+ * Allocate a page from the kernel page pool. When not in
+ * interrupt context and apply the current process NUMA policy.
+ * Returns NULL when no page can be allocated.
+ */
+struct page *alloc_pages_current(gfp_t gfp, unsigned order)
+{
+ struct mempolicy *pol = &default_policy;
+ struct page *page;
+
+ if (!in_interrupt() && !(gfp & __GFP_THISNODE))
+ pol = get_task_policy(current);
+
+ /*
+ * No reference counting needed for current->mempolicy
+ * nor system default_policy
+ */
+ if (pol->mode == MPOL_INTERLEAVE)
+ page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
+ else
+ page = __alloc_pages_nodemask(gfp, order,
+ policy_node(gfp, pol, numa_node_id()),
+ policy_nodemask(gfp, pol));
+
+ return page;
+}
+EXPORT_SYMBOL(alloc_pages_current);
+
+int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
+{
+ struct mempolicy *pol = mpol_dup(vma_policy(src));
+
+ if (IS_ERR(pol))
+ return PTR_ERR(pol);
+ dst->vm_policy = pol;
+ return 0;
+}
+
+/*
+ * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
+ * rebinds the mempolicy its copying by calling mpol_rebind_policy()
+ * with the mems_allowed returned by cpuset_mems_allowed(). This
+ * keeps mempolicies cpuset relative after its cpuset moves. See
+ * further kernel/cpuset.c update_nodemask().
+ *
+ * current's mempolicy may be rebinded by the other task(the task that changes
+ * cpuset's mems), so we needn't do rebind work for current task.
+ */
+
+/* Slow path of a mempolicy duplicate */
+struct mempolicy *__mpol_dup(struct mempolicy *old)
+{
+ struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
+
+ if (!new)
+ return ERR_PTR(-ENOMEM);
+
+ /* task's mempolicy is protected by alloc_lock */
+ if (old == current->mempolicy) {
+ task_lock(current);
+ *new = *old;
+ task_unlock(current);
+ } else
+ *new = *old;
+
+ if (current_cpuset_is_being_rebound()) {
+ nodemask_t mems = cpuset_mems_allowed(current);
+ mpol_rebind_policy(new, &mems);
+ }
+ atomic_set(&new->refcnt, 1);
+ return new;
+}
+
+/* Slow path of a mempolicy comparison */
+bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
+{
+ if (!a || !b)
+ return false;
+ if (a->mode != b->mode)
+ return false;
+ if (a->flags != b->flags)
+ return false;
+ if (mpol_store_user_nodemask(a))
+ if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
+ return false;
+
+ switch (a->mode) {
+ case MPOL_BIND:
+ case MPOL_INTERLEAVE:
+ return !!nodes_equal(a->v.nodes, b->v.nodes);
+ case MPOL_PREFERRED:
+ /* a's ->flags is the same as b's */
+ if (a->flags & MPOL_F_LOCAL)
+ return true;
+ return a->v.preferred_node == b->v.preferred_node;
+ default:
+ BUG();
+ return false;
+ }
+}
+
+/*
+ * Shared memory backing store policy support.
+ *
+ * Remember policies even when nobody has shared memory mapped.
+ * The policies are kept in Red-Black tree linked from the inode.
+ * They are protected by the sp->lock rwlock, which should be held
+ * for any accesses to the tree.
+ */
+
+/*
+ * lookup first element intersecting start-end. Caller holds sp->lock for
+ * reading or for writing
+ */
+static struct sp_node *
+sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
+{
+ struct rb_node *n = sp->root.rb_node;
+
+ while (n) {
+ struct sp_node *p = rb_entry(n, struct sp_node, nd);
+
+ if (start >= p->end)
+ n = n->rb_right;
+ else if (end <= p->start)
+ n = n->rb_left;
+ else
+ break;
+ }
+ if (!n)
+ return NULL;
+ for (;;) {
+ struct sp_node *w = NULL;
+ struct rb_node *prev = rb_prev(n);
+ if (!prev)
+ break;
+ w = rb_entry(prev, struct sp_node, nd);
+ if (w->end <= start)
+ break;
+ n = prev;
+ }
+ return rb_entry(n, struct sp_node, nd);
+}
+
+/*
+ * Insert a new shared policy into the list. Caller holds sp->lock for
+ * writing.
+ */
+static void sp_insert(struct shared_policy *sp, struct sp_node *new)
+{
+ struct rb_node **p = &sp->root.rb_node;
+ struct rb_node *parent = NULL;
+ struct sp_node *nd;
+
+ while (*p) {
+ parent = *p;
+ nd = rb_entry(parent, struct sp_node, nd);
+ if (new->start < nd->start)
+ p = &(*p)->rb_left;
+ else if (new->end > nd->end)
+ p = &(*p)->rb_right;
+ else
+ BUG();
+ }
+ rb_link_node(&new->nd, parent, p);
+ rb_insert_color(&new->nd, &sp->root);
+ pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
+ new->policy ? new->policy->mode : 0);
+}
+
+/* Find shared policy intersecting idx */
+struct mempolicy *
+mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
+{
+ struct mempolicy *pol = NULL;
+ struct sp_node *sn;
+
+ if (!sp->root.rb_node)
+ return NULL;
+ read_lock(&sp->lock);
+ sn = sp_lookup(sp, idx, idx+1);
+ if (sn) {
+ mpol_get(sn->policy);
+ pol = sn->policy;
+ }
+ read_unlock(&sp->lock);
+ return pol;
+}
+
+static void sp_free(struct sp_node *n)
+{
+ mpol_put(n->policy);
+ kmem_cache_free(sn_cache, n);
+}
+
+/**
+ * mpol_misplaced - check whether current page node is valid in policy
+ *
+ * @page: page to be checked
+ * @vma: vm area where page mapped
+ * @addr: virtual address where page mapped
+ *
+ * Lookup current policy node id for vma,addr and "compare to" page's
+ * node id.
+ *
+ * Returns:
+ * -1 - not misplaced, page is in the right node
+ * node - node id where the page should be
+ *
+ * Policy determination "mimics" alloc_page_vma().
+ * Called from fault path where we know the vma and faulting address.
+ */
+int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
+{
+ struct mempolicy *pol;
+ struct zoneref *z;
+ int curnid = page_to_nid(page);
+ unsigned long pgoff;
+ int thiscpu = raw_smp_processor_id();
+ int thisnid = cpu_to_node(thiscpu);
+ int polnid = NUMA_NO_NODE;
+ int ret = -1;
+
+ pol = get_vma_policy(vma, addr);
+ if (!(pol->flags & MPOL_F_MOF))
+ goto out;
+
+ switch (pol->mode) {
+ case MPOL_INTERLEAVE:
+ pgoff = vma->vm_pgoff;
+ pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
+ polnid = offset_il_node(pol, pgoff);
+ break;
+
+ case MPOL_PREFERRED:
+ if (pol->flags & MPOL_F_LOCAL)
+ polnid = numa_node_id();
+ else
+ polnid = pol->v.preferred_node;
+ break;
+
+ case MPOL_BIND:
+
+ /*
+ * allows binding to multiple nodes.
+ * use current page if in policy nodemask,
+ * else select nearest allowed node, if any.
+ * If no allowed nodes, use current [!misplaced].
+ */
+ if (node_isset(curnid, pol->v.nodes))
+ goto out;
+ z = first_zones_zonelist(
+ node_zonelist(numa_node_id(), GFP_HIGHUSER),
+ gfp_zone(GFP_HIGHUSER),
+ &pol->v.nodes);
+ polnid = zone_to_nid(z->zone);
+ break;
+
+ default:
+ BUG();
+ }
+
+ /* Migrate the page towards the node whose CPU is referencing it */
+ if (pol->flags & MPOL_F_MORON) {
+ polnid = thisnid;
+
+ if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
+ goto out;
+ }
+
+ if (curnid != polnid)
+ ret = polnid;
+out:
+ mpol_cond_put(pol);
+
+ return ret;
+}
+
+/*
+ * Drop the (possibly final) reference to task->mempolicy. It needs to be
+ * dropped after task->mempolicy is set to NULL so that any allocation done as
+ * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
+ * policy.
+ */
+void mpol_put_task_policy(struct task_struct *task)
+{
+ struct mempolicy *pol;
+
+ task_lock(task);
+ pol = task->mempolicy;
+ task->mempolicy = NULL;
+ task_unlock(task);
+ mpol_put(pol);
+}
+
+static void sp_delete(struct shared_policy *sp, struct sp_node *n)
+{
+ pr_debug("deleting %lx-l%lx\n", n->start, n->end);
+ rb_erase(&n->nd, &sp->root);
+ sp_free(n);
+}
+
+static void sp_node_init(struct sp_node *node, unsigned long start,
+ unsigned long end, struct mempolicy *pol)
+{
+ node->start = start;
+ node->end = end;
+ node->policy = pol;
+}
+
+static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
+ struct mempolicy *pol)
+{
+ struct sp_node *n;
+ struct mempolicy *newpol;
+
+ n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
+ if (!n)
+ return NULL;
+
+ newpol = mpol_dup(pol);
+ if (IS_ERR(newpol)) {
+ kmem_cache_free(sn_cache, n);
+ return NULL;
+ }
+ newpol->flags |= MPOL_F_SHARED;
+ sp_node_init(n, start, end, newpol);
+
+ return n;
+}
+
+/* Replace a policy range. */
+static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
+ unsigned long end, struct sp_node *new)
+{
+ struct sp_node *n;
+ struct sp_node *n_new = NULL;
+ struct mempolicy *mpol_new = NULL;
+ int ret = 0;
+
+restart:
+ write_lock(&sp->lock);
+ n = sp_lookup(sp, start, end);
+ /* Take care of old policies in the same range. */
+ while (n && n->start < end) {
+ struct rb_node *next = rb_next(&n->nd);
+ if (n->start >= start) {
+ if (n->end <= end)
+ sp_delete(sp, n);
+ else
+ n->start = end;
+ } else {
+ /* Old policy spanning whole new range. */
+ if (n->end > end) {
+ if (!n_new)
+ goto alloc_new;
+
+ *mpol_new = *n->policy;
+ atomic_set(&mpol_new->refcnt, 1);
+ sp_node_init(n_new, end, n->end, mpol_new);
+ n->end = start;
+ sp_insert(sp, n_new);
+ n_new = NULL;
+ mpol_new = NULL;
+ break;
+ } else
+ n->end = start;
+ }
+ if (!next)
+ break;
+ n = rb_entry(next, struct sp_node, nd);
+ }
+ if (new)
+ sp_insert(sp, new);
+ write_unlock(&sp->lock);
+ ret = 0;
+
+err_out:
+ if (mpol_new)
+ mpol_put(mpol_new);
+ if (n_new)
+ kmem_cache_free(sn_cache, n_new);
+
+ return ret;
+
+alloc_new:
+ write_unlock(&sp->lock);
+ ret = -ENOMEM;
+ n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
+ if (!n_new)
+ goto err_out;
+ mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
+ if (!mpol_new)
+ goto err_out;
+ atomic_set(&mpol_new->refcnt, 1);
+ goto restart;
+}
+
+/**
+ * mpol_shared_policy_init - initialize shared policy for inode
+ * @sp: pointer to inode shared policy
+ * @mpol: struct mempolicy to install
+ *
+ * Install non-NULL @mpol in inode's shared policy rb-tree.
+ * On entry, the current task has a reference on a non-NULL @mpol.
+ * This must be released on exit.
+ * This is called at get_inode() calls and we can use GFP_KERNEL.
+ */
+void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
+{
+ int ret;
+
+ sp->root = RB_ROOT; /* empty tree == default mempolicy */
+ rwlock_init(&sp->lock);
+
+ if (mpol) {
+ struct vm_area_struct pvma;
+ struct mempolicy *new;
+ NODEMASK_SCRATCH(scratch);
+
+ if (!scratch)
+ goto put_mpol;
+ /* contextualize the tmpfs mount point mempolicy */
+ new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
+ if (IS_ERR(new))
+ goto free_scratch; /* no valid nodemask intersection */
+
+ task_lock(current);
+ ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
+ task_unlock(current);
+ if (ret)
+ goto put_new;
+
+ /* Create pseudo-vma that contains just the policy */
+ vma_init(&pvma, NULL);
+ pvma.vm_end = TASK_SIZE; /* policy covers entire file */
+ mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
+
+put_new:
+ mpol_put(new); /* drop initial ref */
+free_scratch:
+ NODEMASK_SCRATCH_FREE(scratch);
+put_mpol:
+ mpol_put(mpol); /* drop our incoming ref on sb mpol */
+ }
+}
+
+int mpol_set_shared_policy(struct shared_policy *info,
+ struct vm_area_struct *vma, struct mempolicy *npol)
+{
+ int err;
+ struct sp_node *new = NULL;
+ unsigned long sz = vma_pages(vma);
+
+ pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
+ vma->vm_pgoff,
+ sz, npol ? npol->mode : -1,
+ npol ? npol->flags : -1,
+ npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
+
+ if (npol) {
+ new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
+ if (!new)
+ return -ENOMEM;
+ }
+ err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
+ if (err && new)
+ sp_free(new);
+ return err;
+}
+
+/* Free a backing policy store on inode delete. */
+void mpol_free_shared_policy(struct shared_policy *p)
+{
+ struct sp_node *n;
+ struct rb_node *next;
+
+ if (!p->root.rb_node)
+ return;
+ write_lock(&p->lock);
+ next = rb_first(&p->root);
+ while (next) {
+ n = rb_entry(next, struct sp_node, nd);
+ next = rb_next(&n->nd);
+ sp_delete(p, n);
+ }
+ write_unlock(&p->lock);
+}
+
+#ifdef CONFIG_NUMA_BALANCING
+static int __initdata numabalancing_override;
+
+static void __init check_numabalancing_enable(void)
+{
+ bool numabalancing_default = false;
+
+ if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
+ numabalancing_default = true;
+
+ /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
+ if (numabalancing_override)
+ set_numabalancing_state(numabalancing_override == 1);
+
+ if (num_online_nodes() > 1 && !numabalancing_override) {
+ pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
+ numabalancing_default ? "Enabling" : "Disabling");
+ set_numabalancing_state(numabalancing_default);
+ }
+}
+
+static int __init setup_numabalancing(char *str)
+{
+ int ret = 0;
+ if (!str)
+ goto out;
+
+ if (!strcmp(str, "enable")) {
+ numabalancing_override = 1;
+ ret = 1;
+ } else if (!strcmp(str, "disable")) {
+ numabalancing_override = -1;
+ ret = 1;
+ }
+out:
+ if (!ret)
+ pr_warn("Unable to parse numa_balancing=\n");
+
+ return ret;
+}
+__setup("numa_balancing=", setup_numabalancing);
+#else
+static inline void __init check_numabalancing_enable(void)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+/* assumes fs == KERNEL_DS */
+void __init numa_policy_init(void)
+{
+ nodemask_t interleave_nodes;
+ unsigned long largest = 0;
+ int nid, prefer = 0;
+
+ policy_cache = kmem_cache_create("numa_policy",
+ sizeof(struct mempolicy),
+ 0, SLAB_PANIC, NULL);
+
+ sn_cache = kmem_cache_create("shared_policy_node",
+ sizeof(struct sp_node),
+ 0, SLAB_PANIC, NULL);
+
+ for_each_node(nid) {
+ preferred_node_policy[nid] = (struct mempolicy) {
+ .refcnt = ATOMIC_INIT(1),
+ .mode = MPOL_PREFERRED,
+ .flags = MPOL_F_MOF | MPOL_F_MORON,
+ .v = { .preferred_node = nid, },
+ };
+ }
+
+ /*
+ * Set interleaving policy for system init. Interleaving is only
+ * enabled across suitably sized nodes (default is >= 16MB), or
+ * fall back to the largest node if they're all smaller.
+ */
+ nodes_clear(interleave_nodes);
+ for_each_node_state(nid, N_MEMORY) {
+ unsigned long total_pages = node_present_pages(nid);
+
+ /* Preserve the largest node */
+ if (largest < total_pages) {
+ largest = total_pages;
+ prefer = nid;
+ }
+
+ /* Interleave this node? */
+ if ((total_pages << PAGE_SHIFT) >= (16 << 20))
+ node_set(nid, interleave_nodes);
+ }
+
+ /* All too small, use the largest */
+ if (unlikely(nodes_empty(interleave_nodes)))
+ node_set(prefer, interleave_nodes);
+
+ if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
+ pr_err("%s: interleaving failed\n", __func__);
+
+ check_numabalancing_enable();
+}
+
+/* Reset policy of current process to default */
+void numa_default_policy(void)
+{
+ do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
+}
+
+/*
+ * Parse and format mempolicy from/to strings
+ */
+
+/*
+ * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
+ */
+static const char * const policy_modes[] =
+{
+ [MPOL_DEFAULT] = "default",
+ [MPOL_PREFERRED] = "prefer",
+ [MPOL_BIND] = "bind",
+ [MPOL_INTERLEAVE] = "interleave",
+ [MPOL_LOCAL] = "local",
+};
+
+
+#ifdef CONFIG_TMPFS
+/**
+ * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
+ * @str: string containing mempolicy to parse
+ * @mpol: pointer to struct mempolicy pointer, returned on success.
+ *
+ * Format of input:
+ * <mode>[=<flags>][:<nodelist>]
+ *
+ * On success, returns 0, else 1
+ */
+int mpol_parse_str(char *str, struct mempolicy **mpol)
+{
+ struct mempolicy *new = NULL;
+ unsigned short mode_flags;
+ nodemask_t nodes;
+ char *nodelist = strchr(str, ':');
+ char *flags = strchr(str, '=');
+ int err = 1, mode;
+
+ if (flags)
+ *flags++ = '\0'; /* terminate mode string */
+
+ if (nodelist) {
+ /* NUL-terminate mode or flags string */
+ *nodelist++ = '\0';
+ if (nodelist_parse(nodelist, nodes))
+ goto out;
+ if (!nodes_subset(nodes, node_states[N_MEMORY]))
+ goto out;
+ } else
+ nodes_clear(nodes);
+
+ mode = match_string(policy_modes, MPOL_MAX, str);
+ if (mode < 0)
+ goto out;
+
+ switch (mode) {
+ case MPOL_PREFERRED:
+ /*
+ * Insist on a nodelist of one node only, although later
+ * we use first_node(nodes) to grab a single node, so here
+ * nodelist (or nodes) cannot be empty.
+ */
+ if (nodelist) {
+ char *rest = nodelist;
+ while (isdigit(*rest))
+ rest++;
+ if (*rest)
+ goto out;
+ if (nodes_empty(nodes))
+ goto out;
+ }
+ break;
+ case MPOL_INTERLEAVE:
+ /*
+ * Default to online nodes with memory if no nodelist
+ */
+ if (!nodelist)
+ nodes = node_states[N_MEMORY];
+ break;
+ case MPOL_LOCAL:
+ /*
+ * Don't allow a nodelist; mpol_new() checks flags
+ */
+ if (nodelist)
+ goto out;
+ mode = MPOL_PREFERRED;
+ break;
+ case MPOL_DEFAULT:
+ /*
+ * Insist on a empty nodelist
+ */
+ if (!nodelist)
+ err = 0;
+ goto out;
+ case MPOL_BIND:
+ /*
+ * Insist on a nodelist
+ */
+ if (!nodelist)
+ goto out;
+ }
+
+ mode_flags = 0;
+ if (flags) {
+ /*
+ * Currently, we only support two mutually exclusive
+ * mode flags.
+ */
+ if (!strcmp(flags, "static"))
+ mode_flags |= MPOL_F_STATIC_NODES;
+ else if (!strcmp(flags, "relative"))
+ mode_flags |= MPOL_F_RELATIVE_NODES;
+ else
+ goto out;
+ }
+
+ new = mpol_new(mode, mode_flags, &nodes);
+ if (IS_ERR(new))
+ goto out;
+
+ /*
+ * Save nodes for mpol_to_str() to show the tmpfs mount options
+ * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
+ */
+ if (mode != MPOL_PREFERRED)
+ new->v.nodes = nodes;
+ else if (nodelist)
+ new->v.preferred_node = first_node(nodes);
+ else
+ new->flags |= MPOL_F_LOCAL;
+
+ /*
+ * Save nodes for contextualization: this will be used to "clone"
+ * the mempolicy in a specific context [cpuset] at a later time.
+ */
+ new->w.user_nodemask = nodes;
+
+ err = 0;
+
+out:
+ /* Restore string for error message */
+ if (nodelist)
+ *--nodelist = ':';
+ if (flags)
+ *--flags = '=';
+ if (!err)
+ *mpol = new;
+ return err;
+}
+#endif /* CONFIG_TMPFS */
+
+/**
+ * mpol_to_str - format a mempolicy structure for printing
+ * @buffer: to contain formatted mempolicy string
+ * @maxlen: length of @buffer
+ * @pol: pointer to mempolicy to be formatted
+ *
+ * Convert @pol into a string. If @buffer is too short, truncate the string.
+ * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
+ * longest flag, "relative", and to display at least a few node ids.
+ */
+void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
+{
+ char *p = buffer;
+ nodemask_t nodes = NODE_MASK_NONE;
+ unsigned short mode = MPOL_DEFAULT;
+ unsigned short flags = 0;
+
+ if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
+ mode = pol->mode;
+ flags = pol->flags;
+ }
+
+ switch (mode) {
+ case MPOL_DEFAULT:
+ break;
+ case MPOL_PREFERRED:
+ if (flags & MPOL_F_LOCAL)
+ mode = MPOL_LOCAL;
+ else
+ node_set(pol->v.preferred_node, nodes);
+ break;
+ case MPOL_BIND:
+ case MPOL_INTERLEAVE:
+ nodes = pol->v.nodes;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ snprintf(p, maxlen, "unknown");
+ return;
+ }
+
+ p += snprintf(p, maxlen, "%s", policy_modes[mode]);
+
+ if (flags & MPOL_MODE_FLAGS) {
+ p += snprintf(p, buffer + maxlen - p, "=");
+
+ /*
+ * Currently, the only defined flags are mutually exclusive
+ */
+ if (flags & MPOL_F_STATIC_NODES)
+ p += snprintf(p, buffer + maxlen - p, "static");
+ else if (flags & MPOL_F_RELATIVE_NODES)
+ p += snprintf(p, buffer + maxlen - p, "relative");
+ }
+
+ if (!nodes_empty(nodes))
+ p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
+ nodemask_pr_args(&nodes));
+}
diff --git a/mm/mempool.c b/mm/mempool.c
new file mode 100644
index 000000000..f473cddda
--- /dev/null
+++ b/mm/mempool.c
@@ -0,0 +1,555 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/mm/mempool.c
+ *
+ * memory buffer pool support. Such pools are mostly used
+ * for guaranteed, deadlock-free memory allocations during
+ * extreme VM load.
+ *
+ * started by Ingo Molnar, Copyright (C) 2001
+ * debugging by David Rientjes, Copyright (C) 2015
+ */
+
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/kasan.h>
+#include <linux/kmemleak.h>
+#include <linux/export.h>
+#include <linux/mempool.h>
+#include <linux/blkdev.h>
+#include <linux/writeback.h>
+#include "slab.h"
+
+#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON)
+static void poison_error(mempool_t *pool, void *element, size_t size,
+ size_t byte)
+{
+ const int nr = pool->curr_nr;
+ const int start = max_t(int, byte - (BITS_PER_LONG / 8), 0);
+ const int end = min_t(int, byte + (BITS_PER_LONG / 8), size);
+ int i;
+
+ pr_err("BUG: mempool element poison mismatch\n");
+ pr_err("Mempool %p size %zu\n", pool, size);
+ pr_err(" nr=%d @ %p: %s0x", nr, element, start > 0 ? "... " : "");
+ for (i = start; i < end; i++)
+ pr_cont("%x ", *(u8 *)(element + i));
+ pr_cont("%s\n", end < size ? "..." : "");
+ dump_stack();
+}
+
+static void __check_element(mempool_t *pool, void *element, size_t size)
+{
+ u8 *obj = element;
+ size_t i;
+
+ for (i = 0; i < size; i++) {
+ u8 exp = (i < size - 1) ? POISON_FREE : POISON_END;
+
+ if (obj[i] != exp) {
+ poison_error(pool, element, size, i);
+ return;
+ }
+ }
+ memset(obj, POISON_INUSE, size);
+}
+
+static void check_element(mempool_t *pool, void *element)
+{
+ /* Mempools backed by slab allocator */
+ if (pool->free == mempool_free_slab || pool->free == mempool_kfree) {
+ __check_element(pool, element, ksize(element));
+ } else if (pool->free == mempool_free_pages) {
+ /* Mempools backed by page allocator */
+ int order = (int)(long)pool->pool_data;
+ void *addr = kmap_atomic((struct page *)element);
+
+ __check_element(pool, addr, 1UL << (PAGE_SHIFT + order));
+ kunmap_atomic(addr);
+ }
+}
+
+static void __poison_element(void *element, size_t size)
+{
+ u8 *obj = element;
+
+ memset(obj, POISON_FREE, size - 1);
+ obj[size - 1] = POISON_END;
+}
+
+static void poison_element(mempool_t *pool, void *element)
+{
+ /* Mempools backed by slab allocator */
+ if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) {
+ __poison_element(element, ksize(element));
+ } else if (pool->alloc == mempool_alloc_pages) {
+ /* Mempools backed by page allocator */
+ int order = (int)(long)pool->pool_data;
+ void *addr = kmap_atomic((struct page *)element);
+
+ __poison_element(addr, 1UL << (PAGE_SHIFT + order));
+ kunmap_atomic(addr);
+ }
+}
+#else /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */
+static inline void check_element(mempool_t *pool, void *element)
+{
+}
+static inline void poison_element(mempool_t *pool, void *element)
+{
+}
+#endif /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */
+
+static __always_inline void kasan_poison_element(mempool_t *pool, void *element)
+{
+ if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
+ kasan_poison_kfree(element, _RET_IP_);
+ else if (pool->alloc == mempool_alloc_pages)
+ kasan_free_pages(element, (unsigned long)pool->pool_data);
+}
+
+static void kasan_unpoison_element(mempool_t *pool, void *element)
+{
+ if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
+ kasan_unpoison_slab(element);
+ else if (pool->alloc == mempool_alloc_pages)
+ kasan_alloc_pages(element, (unsigned long)pool->pool_data);
+}
+
+static __always_inline void add_element(mempool_t *pool, void *element)
+{
+ BUG_ON(pool->curr_nr >= pool->min_nr);
+ poison_element(pool, element);
+ kasan_poison_element(pool, element);
+ pool->elements[pool->curr_nr++] = element;
+}
+
+static void *remove_element(mempool_t *pool)
+{
+ void *element = pool->elements[--pool->curr_nr];
+
+ BUG_ON(pool->curr_nr < 0);
+ kasan_unpoison_element(pool, element);
+ check_element(pool, element);
+ return element;
+}
+
+/**
+ * mempool_exit - exit a mempool initialized with mempool_init()
+ * @pool: pointer to the memory pool which was initialized with
+ * mempool_init().
+ *
+ * Free all reserved elements in @pool and @pool itself. This function
+ * only sleeps if the free_fn() function sleeps.
+ *
+ * May be called on a zeroed but uninitialized mempool (i.e. allocated with
+ * kzalloc()).
+ */
+void mempool_exit(mempool_t *pool)
+{
+ while (pool->curr_nr) {
+ void *element = remove_element(pool);
+ pool->free(element, pool->pool_data);
+ }
+ kfree(pool->elements);
+ pool->elements = NULL;
+}
+EXPORT_SYMBOL(mempool_exit);
+
+/**
+ * mempool_destroy - deallocate a memory pool
+ * @pool: pointer to the memory pool which was allocated via
+ * mempool_create().
+ *
+ * Free all reserved elements in @pool and @pool itself. This function
+ * only sleeps if the free_fn() function sleeps.
+ */
+void mempool_destroy(mempool_t *pool)
+{
+ if (unlikely(!pool))
+ return;
+
+ mempool_exit(pool);
+ kfree(pool);
+}
+EXPORT_SYMBOL(mempool_destroy);
+
+int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
+ mempool_free_t *free_fn, void *pool_data,
+ gfp_t gfp_mask, int node_id)
+{
+ spin_lock_init(&pool->lock);
+ pool->min_nr = min_nr;
+ pool->pool_data = pool_data;
+ pool->alloc = alloc_fn;
+ pool->free = free_fn;
+ init_waitqueue_head(&pool->wait);
+
+ pool->elements = kmalloc_array_node(min_nr, sizeof(void *),
+ gfp_mask, node_id);
+ if (!pool->elements)
+ return -ENOMEM;
+
+ /*
+ * First pre-allocate the guaranteed number of buffers.
+ */
+ while (pool->curr_nr < pool->min_nr) {
+ void *element;
+
+ element = pool->alloc(gfp_mask, pool->pool_data);
+ if (unlikely(!element)) {
+ mempool_exit(pool);
+ return -ENOMEM;
+ }
+ add_element(pool, element);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(mempool_init_node);
+
+/**
+ * mempool_init - initialize a memory pool
+ * @pool: pointer to the memory pool that should be initialized
+ * @min_nr: the minimum number of elements guaranteed to be
+ * allocated for this pool.
+ * @alloc_fn: user-defined element-allocation function.
+ * @free_fn: user-defined element-freeing function.
+ * @pool_data: optional private data available to the user-defined functions.
+ *
+ * Like mempool_create(), but initializes the pool in (i.e. embedded in another
+ * structure).
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
+ mempool_free_t *free_fn, void *pool_data)
+{
+ return mempool_init_node(pool, min_nr, alloc_fn, free_fn,
+ pool_data, GFP_KERNEL, NUMA_NO_NODE);
+
+}
+EXPORT_SYMBOL(mempool_init);
+
+/**
+ * mempool_create - create a memory pool
+ * @min_nr: the minimum number of elements guaranteed to be
+ * allocated for this pool.
+ * @alloc_fn: user-defined element-allocation function.
+ * @free_fn: user-defined element-freeing function.
+ * @pool_data: optional private data available to the user-defined functions.
+ *
+ * this function creates and allocates a guaranteed size, preallocated
+ * memory pool. The pool can be used from the mempool_alloc() and mempool_free()
+ * functions. This function might sleep. Both the alloc_fn() and the free_fn()
+ * functions might sleep - as long as the mempool_alloc() function is not called
+ * from IRQ contexts.
+ *
+ * Return: pointer to the created memory pool object or %NULL on error.
+ */
+mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
+ mempool_free_t *free_fn, void *pool_data)
+{
+ return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,
+ GFP_KERNEL, NUMA_NO_NODE);
+}
+EXPORT_SYMBOL(mempool_create);
+
+mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
+ mempool_free_t *free_fn, void *pool_data,
+ gfp_t gfp_mask, int node_id)
+{
+ mempool_t *pool;
+
+ pool = kzalloc_node(sizeof(*pool), gfp_mask, node_id);
+ if (!pool)
+ return NULL;
+
+ if (mempool_init_node(pool, min_nr, alloc_fn, free_fn, pool_data,
+ gfp_mask, node_id)) {
+ kfree(pool);
+ return NULL;
+ }
+
+ return pool;
+}
+EXPORT_SYMBOL(mempool_create_node);
+
+/**
+ * mempool_resize - resize an existing memory pool
+ * @pool: pointer to the memory pool which was allocated via
+ * mempool_create().
+ * @new_min_nr: the new minimum number of elements guaranteed to be
+ * allocated for this pool.
+ *
+ * This function shrinks/grows the pool. In the case of growing,
+ * it cannot be guaranteed that the pool will be grown to the new
+ * size immediately, but new mempool_free() calls will refill it.
+ * This function may sleep.
+ *
+ * Note, the caller must guarantee that no mempool_destroy is called
+ * while this function is running. mempool_alloc() & mempool_free()
+ * might be called (eg. from IRQ contexts) while this function executes.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int mempool_resize(mempool_t *pool, int new_min_nr)
+{
+ void *element;
+ void **new_elements;
+ unsigned long flags;
+
+ BUG_ON(new_min_nr <= 0);
+ might_sleep();
+
+ spin_lock_irqsave(&pool->lock, flags);
+ if (new_min_nr <= pool->min_nr) {
+ while (new_min_nr < pool->curr_nr) {
+ element = remove_element(pool);
+ spin_unlock_irqrestore(&pool->lock, flags);
+ pool->free(element, pool->pool_data);
+ spin_lock_irqsave(&pool->lock, flags);
+ }
+ pool->min_nr = new_min_nr;
+ goto out_unlock;
+ }
+ spin_unlock_irqrestore(&pool->lock, flags);
+
+ /* Grow the pool */
+ new_elements = kmalloc_array(new_min_nr, sizeof(*new_elements),
+ GFP_KERNEL);
+ if (!new_elements)
+ return -ENOMEM;
+
+ spin_lock_irqsave(&pool->lock, flags);
+ if (unlikely(new_min_nr <= pool->min_nr)) {
+ /* Raced, other resize will do our work */
+ spin_unlock_irqrestore(&pool->lock, flags);
+ kfree(new_elements);
+ goto out;
+ }
+ memcpy(new_elements, pool->elements,
+ pool->curr_nr * sizeof(*new_elements));
+ kfree(pool->elements);
+ pool->elements = new_elements;
+ pool->min_nr = new_min_nr;
+
+ while (pool->curr_nr < pool->min_nr) {
+ spin_unlock_irqrestore(&pool->lock, flags);
+ element = pool->alloc(GFP_KERNEL, pool->pool_data);
+ if (!element)
+ goto out;
+ spin_lock_irqsave(&pool->lock, flags);
+ if (pool->curr_nr < pool->min_nr) {
+ add_element(pool, element);
+ } else {
+ spin_unlock_irqrestore(&pool->lock, flags);
+ pool->free(element, pool->pool_data); /* Raced */
+ goto out;
+ }
+ }
+out_unlock:
+ spin_unlock_irqrestore(&pool->lock, flags);
+out:
+ return 0;
+}
+EXPORT_SYMBOL(mempool_resize);
+
+/**
+ * mempool_alloc - allocate an element from a specific memory pool
+ * @pool: pointer to the memory pool which was allocated via
+ * mempool_create().
+ * @gfp_mask: the usual allocation bitmask.
+ *
+ * this function only sleeps if the alloc_fn() function sleeps or
+ * returns NULL. Note that due to preallocation, this function
+ * *never* fails when called from process contexts. (it might
+ * fail if called from an IRQ context.)
+ * Note: using __GFP_ZERO is not supported.
+ *
+ * Return: pointer to the allocated element or %NULL on error.
+ */
+void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
+{
+ void *element;
+ unsigned long flags;
+ wait_queue_entry_t wait;
+ gfp_t gfp_temp;
+
+ VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
+ might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
+
+ gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */
+ gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */
+ gfp_mask |= __GFP_NOWARN; /* failures are OK */
+
+ gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO);
+
+repeat_alloc:
+
+ element = pool->alloc(gfp_temp, pool->pool_data);
+ if (likely(element != NULL))
+ return element;
+
+ spin_lock_irqsave(&pool->lock, flags);
+ if (likely(pool->curr_nr)) {
+ element = remove_element(pool);
+ spin_unlock_irqrestore(&pool->lock, flags);
+ /* paired with rmb in mempool_free(), read comment there */
+ smp_wmb();
+ /*
+ * Update the allocation stack trace as this is more useful
+ * for debugging.
+ */
+ kmemleak_update_trace(element);
+ return element;
+ }
+
+ /*
+ * We use gfp mask w/o direct reclaim or IO for the first round. If
+ * alloc failed with that and @pool was empty, retry immediately.
+ */
+ if (gfp_temp != gfp_mask) {
+ spin_unlock_irqrestore(&pool->lock, flags);
+ gfp_temp = gfp_mask;
+ goto repeat_alloc;
+ }
+
+ /* We must not sleep if !__GFP_DIRECT_RECLAIM */
+ if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) {
+ spin_unlock_irqrestore(&pool->lock, flags);
+ return NULL;
+ }
+
+ /* Let's wait for someone else to return an element to @pool */
+ init_wait(&wait);
+ prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
+
+ spin_unlock_irqrestore(&pool->lock, flags);
+
+ /*
+ * FIXME: this should be io_schedule(). The timeout is there as a
+ * workaround for some DM problems in 2.6.18.
+ */
+ io_schedule_timeout(5*HZ);
+
+ finish_wait(&pool->wait, &wait);
+ goto repeat_alloc;
+}
+EXPORT_SYMBOL(mempool_alloc);
+
+/**
+ * mempool_free - return an element to the pool.
+ * @element: pool element pointer.
+ * @pool: pointer to the memory pool which was allocated via
+ * mempool_create().
+ *
+ * this function only sleeps if the free_fn() function sleeps.
+ */
+void mempool_free(void *element, mempool_t *pool)
+{
+ unsigned long flags;
+
+ if (unlikely(element == NULL))
+ return;
+
+ /*
+ * Paired with the wmb in mempool_alloc(). The preceding read is
+ * for @element and the following @pool->curr_nr. This ensures
+ * that the visible value of @pool->curr_nr is from after the
+ * allocation of @element. This is necessary for fringe cases
+ * where @element was passed to this task without going through
+ * barriers.
+ *
+ * For example, assume @p is %NULL at the beginning and one task
+ * performs "p = mempool_alloc(...);" while another task is doing
+ * "while (!p) cpu_relax(); mempool_free(p, ...);". This function
+ * may end up using curr_nr value which is from before allocation
+ * of @p without the following rmb.
+ */
+ smp_rmb();
+
+ /*
+ * For correctness, we need a test which is guaranteed to trigger
+ * if curr_nr + #allocated == min_nr. Testing curr_nr < min_nr
+ * without locking achieves that and refilling as soon as possible
+ * is desirable.
+ *
+ * Because curr_nr visible here is always a value after the
+ * allocation of @element, any task which decremented curr_nr below
+ * min_nr is guaranteed to see curr_nr < min_nr unless curr_nr gets
+ * incremented to min_nr afterwards. If curr_nr gets incremented
+ * to min_nr after the allocation of @element, the elements
+ * allocated after that are subject to the same guarantee.
+ *
+ * Waiters happen iff curr_nr is 0 and the above guarantee also
+ * ensures that there will be frees which return elements to the
+ * pool waking up the waiters.
+ */
+ if (unlikely(READ_ONCE(pool->curr_nr) < pool->min_nr)) {
+ spin_lock_irqsave(&pool->lock, flags);
+ if (likely(pool->curr_nr < pool->min_nr)) {
+ add_element(pool, element);
+ spin_unlock_irqrestore(&pool->lock, flags);
+ wake_up(&pool->wait);
+ return;
+ }
+ spin_unlock_irqrestore(&pool->lock, flags);
+ }
+ pool->free(element, pool->pool_data);
+}
+EXPORT_SYMBOL(mempool_free);
+
+/*
+ * A commonly used alloc and free fn.
+ */
+void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data)
+{
+ struct kmem_cache *mem = pool_data;
+ VM_BUG_ON(mem->ctor);
+ return kmem_cache_alloc(mem, gfp_mask);
+}
+EXPORT_SYMBOL(mempool_alloc_slab);
+
+void mempool_free_slab(void *element, void *pool_data)
+{
+ struct kmem_cache *mem = pool_data;
+ kmem_cache_free(mem, element);
+}
+EXPORT_SYMBOL(mempool_free_slab);
+
+/*
+ * A commonly used alloc and free fn that kmalloc/kfrees the amount of memory
+ * specified by pool_data
+ */
+void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data)
+{
+ size_t size = (size_t)pool_data;
+ return kmalloc(size, gfp_mask);
+}
+EXPORT_SYMBOL(mempool_kmalloc);
+
+void mempool_kfree(void *element, void *pool_data)
+{
+ kfree(element);
+}
+EXPORT_SYMBOL(mempool_kfree);
+
+/*
+ * A simple mempool-backed page allocator that allocates pages
+ * of the order specified by pool_data.
+ */
+void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data)
+{
+ int order = (int)(long)pool_data;
+ return alloc_pages(gfp_mask, order);
+}
+EXPORT_SYMBOL(mempool_alloc_pages);
+
+void mempool_free_pages(void *element, void *pool_data)
+{
+ int order = (int)(long)pool_data;
+ __free_pages(element, order);
+}
+EXPORT_SYMBOL(mempool_free_pages);
diff --git a/mm/memremap.c b/mm/memremap.c
new file mode 100644
index 000000000..299aad0d2
--- /dev/null
+++ b/mm/memremap.c
@@ -0,0 +1,532 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2015 Intel Corporation. All rights reserved. */
+#include <linux/device.h>
+#include <linux/io.h>
+#include <linux/kasan.h>
+#include <linux/memory_hotplug.h>
+#include <linux/mm.h>
+#include <linux/pfn_t.h>
+#include <linux/swap.h>
+#include <linux/mmzone.h>
+#include <linux/swapops.h>
+#include <linux/types.h>
+#include <linux/wait_bit.h>
+#include <linux/xarray.h>
+
+static DEFINE_XARRAY(pgmap_array);
+
+/*
+ * The memremap() and memremap_pages() interfaces are alternately used
+ * to map persistent memory namespaces. These interfaces place different
+ * constraints on the alignment and size of the mapping (namespace).
+ * memremap() can map individual PAGE_SIZE pages. memremap_pages() can
+ * only map subsections (2MB), and at least one architecture (PowerPC)
+ * the minimum mapping granularity of memremap_pages() is 16MB.
+ *
+ * The role of memremap_compat_align() is to communicate the minimum
+ * arch supported alignment of a namespace such that it can freely
+ * switch modes without violating the arch constraint. Namely, do not
+ * allow a namespace to be PAGE_SIZE aligned since that namespace may be
+ * reconfigured into a mode that requires SUBSECTION_SIZE alignment.
+ */
+#ifndef CONFIG_ARCH_HAS_MEMREMAP_COMPAT_ALIGN
+unsigned long memremap_compat_align(void)
+{
+ return SUBSECTION_SIZE;
+}
+EXPORT_SYMBOL_GPL(memremap_compat_align);
+#endif
+
+#ifdef CONFIG_DEV_PAGEMAP_OPS
+DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
+EXPORT_SYMBOL(devmap_managed_key);
+
+static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
+{
+ if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
+ pgmap->type == MEMORY_DEVICE_FS_DAX)
+ static_branch_dec(&devmap_managed_key);
+}
+
+static void devmap_managed_enable_get(struct dev_pagemap *pgmap)
+{
+ if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
+ pgmap->type == MEMORY_DEVICE_FS_DAX)
+ static_branch_inc(&devmap_managed_key);
+}
+#else
+static void devmap_managed_enable_get(struct dev_pagemap *pgmap)
+{
+}
+static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
+{
+}
+#endif /* CONFIG_DEV_PAGEMAP_OPS */
+
+static void pgmap_array_delete(struct range *range)
+{
+ xa_store_range(&pgmap_array, PHYS_PFN(range->start), PHYS_PFN(range->end),
+ NULL, GFP_KERNEL);
+ synchronize_rcu();
+}
+
+static unsigned long pfn_first(struct dev_pagemap *pgmap, int range_id)
+{
+ struct range *range = &pgmap->ranges[range_id];
+ unsigned long pfn = PHYS_PFN(range->start);
+
+ if (range_id)
+ return pfn;
+ return pfn + vmem_altmap_offset(pgmap_altmap(pgmap));
+}
+
+bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn)
+{
+ int i;
+
+ for (i = 0; i < pgmap->nr_range; i++) {
+ struct range *range = &pgmap->ranges[i];
+
+ if (pfn >= PHYS_PFN(range->start) &&
+ pfn <= PHYS_PFN(range->end))
+ return pfn >= pfn_first(pgmap, i);
+ }
+
+ return false;
+}
+
+static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id)
+{
+ const struct range *range = &pgmap->ranges[range_id];
+
+ return (range->start + range_len(range)) >> PAGE_SHIFT;
+}
+
+static unsigned long pfn_next(unsigned long pfn)
+{
+ if (pfn % 1024 == 0)
+ cond_resched();
+ return pfn + 1;
+}
+
+#define for_each_device_pfn(pfn, map, i) \
+ for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(pfn))
+
+static void dev_pagemap_kill(struct dev_pagemap *pgmap)
+{
+ if (pgmap->ops && pgmap->ops->kill)
+ pgmap->ops->kill(pgmap);
+ else
+ percpu_ref_kill(pgmap->ref);
+}
+
+static void dev_pagemap_cleanup(struct dev_pagemap *pgmap)
+{
+ if (pgmap->ops && pgmap->ops->cleanup) {
+ pgmap->ops->cleanup(pgmap);
+ } else {
+ wait_for_completion(&pgmap->done);
+ percpu_ref_exit(pgmap->ref);
+ }
+ /*
+ * Undo the pgmap ref assignment for the internal case as the
+ * caller may re-enable the same pgmap.
+ */
+ if (pgmap->ref == &pgmap->internal_ref)
+ pgmap->ref = NULL;
+}
+
+static void pageunmap_range(struct dev_pagemap *pgmap, int range_id)
+{
+ struct range *range = &pgmap->ranges[range_id];
+ struct page *first_page;
+ int nid;
+
+ /* make sure to access a memmap that was actually initialized */
+ first_page = pfn_to_page(pfn_first(pgmap, range_id));
+
+ /* pages are dead and unused, undo the arch mapping */
+ nid = page_to_nid(first_page);
+
+ mem_hotplug_begin();
+ remove_pfn_range_from_zone(page_zone(first_page), PHYS_PFN(range->start),
+ PHYS_PFN(range_len(range)));
+ if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+ __remove_pages(PHYS_PFN(range->start),
+ PHYS_PFN(range_len(range)), NULL);
+ } else {
+ arch_remove_memory(nid, range->start, range_len(range),
+ pgmap_altmap(pgmap));
+ kasan_remove_zero_shadow(__va(range->start), range_len(range));
+ }
+ mem_hotplug_done();
+
+ untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range));
+ pgmap_array_delete(range);
+}
+
+void memunmap_pages(struct dev_pagemap *pgmap)
+{
+ unsigned long pfn;
+ int i;
+
+ dev_pagemap_kill(pgmap);
+ for (i = 0; i < pgmap->nr_range; i++)
+ for_each_device_pfn(pfn, pgmap, i)
+ put_page(pfn_to_page(pfn));
+ dev_pagemap_cleanup(pgmap);
+
+ for (i = 0; i < pgmap->nr_range; i++)
+ pageunmap_range(pgmap, i);
+
+ WARN_ONCE(pgmap->altmap.alloc, "failed to free all reserved pages\n");
+ devmap_managed_enable_put(pgmap);
+}
+EXPORT_SYMBOL_GPL(memunmap_pages);
+
+static void devm_memremap_pages_release(void *data)
+{
+ memunmap_pages(data);
+}
+
+static void dev_pagemap_percpu_release(struct percpu_ref *ref)
+{
+ struct dev_pagemap *pgmap =
+ container_of(ref, struct dev_pagemap, internal_ref);
+
+ complete(&pgmap->done);
+}
+
+static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
+ int range_id, int nid)
+{
+ struct range *range = &pgmap->ranges[range_id];
+ struct dev_pagemap *conflict_pgmap;
+ int error, is_ram;
+
+ if (WARN_ONCE(pgmap_altmap(pgmap) && range_id > 0,
+ "altmap not supported for multiple ranges\n"))
+ return -EINVAL;
+
+ conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->start), NULL);
+ if (conflict_pgmap) {
+ WARN(1, "Conflicting mapping in same section\n");
+ put_dev_pagemap(conflict_pgmap);
+ return -ENOMEM;
+ }
+
+ conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->end), NULL);
+ if (conflict_pgmap) {
+ WARN(1, "Conflicting mapping in same section\n");
+ put_dev_pagemap(conflict_pgmap);
+ return -ENOMEM;
+ }
+
+ is_ram = region_intersects(range->start, range_len(range),
+ IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
+
+ if (is_ram != REGION_DISJOINT) {
+ WARN_ONCE(1, "attempted on %s region %#llx-%#llx\n",
+ is_ram == REGION_MIXED ? "mixed" : "ram",
+ range->start, range->end);
+ return -ENXIO;
+ }
+
+ error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(range->start),
+ PHYS_PFN(range->end), pgmap, GFP_KERNEL));
+ if (error)
+ return error;
+
+ if (nid < 0)
+ nid = numa_mem_id();
+
+ error = track_pfn_remap(NULL, &params->pgprot, PHYS_PFN(range->start), 0,
+ range_len(range));
+ if (error)
+ goto err_pfn_remap;
+
+ mem_hotplug_begin();
+
+ /*
+ * For device private memory we call add_pages() as we only need to
+ * allocate and initialize struct page for the device memory. More-
+ * over the device memory is un-accessible thus we do not want to
+ * create a linear mapping for the memory like arch_add_memory()
+ * would do.
+ *
+ * For all other device memory types, which are accessible by
+ * the CPU, we do want the linear mapping and thus use
+ * arch_add_memory().
+ */
+ if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+ error = add_pages(nid, PHYS_PFN(range->start),
+ PHYS_PFN(range_len(range)), params);
+ } else {
+ error = kasan_add_zero_shadow(__va(range->start), range_len(range));
+ if (error) {
+ mem_hotplug_done();
+ goto err_kasan;
+ }
+
+ error = arch_add_memory(nid, range->start, range_len(range),
+ params);
+ }
+
+ if (!error) {
+ struct zone *zone;
+
+ zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
+ move_pfn_range_to_zone(zone, PHYS_PFN(range->start),
+ PHYS_PFN(range_len(range)), params->altmap,
+ MIGRATE_MOVABLE);
+ }
+
+ mem_hotplug_done();
+ if (error)
+ goto err_add_memory;
+
+ /*
+ * Initialization of the pages has been deferred until now in order
+ * to allow us to do the work while not holding the hotplug lock.
+ */
+ memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
+ PHYS_PFN(range->start),
+ PHYS_PFN(range_len(range)), pgmap);
+ percpu_ref_get_many(pgmap->ref, pfn_end(pgmap, range_id)
+ - pfn_first(pgmap, range_id));
+ return 0;
+
+err_add_memory:
+ kasan_remove_zero_shadow(__va(range->start), range_len(range));
+err_kasan:
+ untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range));
+err_pfn_remap:
+ pgmap_array_delete(range);
+ return error;
+}
+
+
+/*
+ * Not device managed version of dev_memremap_pages, undone by
+ * memunmap_pages(). Please use dev_memremap_pages if you have a struct
+ * device available.
+ */
+void *memremap_pages(struct dev_pagemap *pgmap, int nid)
+{
+ struct mhp_params params = {
+ .altmap = pgmap_altmap(pgmap),
+ .pgprot = PAGE_KERNEL,
+ };
+ const int nr_range = pgmap->nr_range;
+ int error, i;
+
+ if (WARN_ONCE(!nr_range, "nr_range must be specified\n"))
+ return ERR_PTR(-EINVAL);
+
+ switch (pgmap->type) {
+ case MEMORY_DEVICE_PRIVATE:
+ if (!IS_ENABLED(CONFIG_DEVICE_PRIVATE)) {
+ WARN(1, "Device private memory not supported\n");
+ return ERR_PTR(-EINVAL);
+ }
+ if (!pgmap->ops || !pgmap->ops->migrate_to_ram) {
+ WARN(1, "Missing migrate_to_ram method\n");
+ return ERR_PTR(-EINVAL);
+ }
+ if (!pgmap->ops->page_free) {
+ WARN(1, "Missing page_free method\n");
+ return ERR_PTR(-EINVAL);
+ }
+ if (!pgmap->owner) {
+ WARN(1, "Missing owner\n");
+ return ERR_PTR(-EINVAL);
+ }
+ break;
+ case MEMORY_DEVICE_FS_DAX:
+ if (!IS_ENABLED(CONFIG_ZONE_DEVICE) ||
+ IS_ENABLED(CONFIG_FS_DAX_LIMITED)) {
+ WARN(1, "File system DAX not supported\n");
+ return ERR_PTR(-EINVAL);
+ }
+ params.pgprot = pgprot_decrypted(params.pgprot);
+ break;
+ case MEMORY_DEVICE_GENERIC:
+ break;
+ case MEMORY_DEVICE_PCI_P2PDMA:
+ params.pgprot = pgprot_noncached(params.pgprot);
+ break;
+ default:
+ WARN(1, "Invalid pgmap type %d\n", pgmap->type);
+ break;
+ }
+
+ if (!pgmap->ref) {
+ if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup))
+ return ERR_PTR(-EINVAL);
+
+ init_completion(&pgmap->done);
+ error = percpu_ref_init(&pgmap->internal_ref,
+ dev_pagemap_percpu_release, 0, GFP_KERNEL);
+ if (error)
+ return ERR_PTR(error);
+ pgmap->ref = &pgmap->internal_ref;
+ } else {
+ if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) {
+ WARN(1, "Missing reference count teardown definition\n");
+ return ERR_PTR(-EINVAL);
+ }
+ }
+
+ devmap_managed_enable_get(pgmap);
+
+ /*
+ * Clear the pgmap nr_range as it will be incremented for each
+ * successfully processed range. This communicates how many
+ * regions to unwind in the abort case.
+ */
+ pgmap->nr_range = 0;
+ error = 0;
+ for (i = 0; i < nr_range; i++) {
+ error = pagemap_range(pgmap, &params, i, nid);
+ if (error)
+ break;
+ pgmap->nr_range++;
+ }
+
+ if (i < nr_range) {
+ memunmap_pages(pgmap);
+ pgmap->nr_range = nr_range;
+ return ERR_PTR(error);
+ }
+
+ return __va(pgmap->ranges[0].start);
+}
+EXPORT_SYMBOL_GPL(memremap_pages);
+
+/**
+ * devm_memremap_pages - remap and provide memmap backing for the given resource
+ * @dev: hosting device for @res
+ * @pgmap: pointer to a struct dev_pagemap
+ *
+ * Notes:
+ * 1/ At a minimum the res and type members of @pgmap must be initialized
+ * by the caller before passing it to this function
+ *
+ * 2/ The altmap field may optionally be initialized, in which case
+ * PGMAP_ALTMAP_VALID must be set in pgmap->flags.
+ *
+ * 3/ The ref field may optionally be provided, in which pgmap->ref must be
+ * 'live' on entry and will be killed and reaped at
+ * devm_memremap_pages_release() time, or if this routine fails.
+ *
+ * 4/ range is expected to be a host memory range that could feasibly be
+ * treated as a "System RAM" range, i.e. not a device mmio range, but
+ * this is not enforced.
+ */
+void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
+{
+ int error;
+ void *ret;
+
+ ret = memremap_pages(pgmap, dev_to_node(dev));
+ if (IS_ERR(ret))
+ return ret;
+
+ error = devm_add_action_or_reset(dev, devm_memremap_pages_release,
+ pgmap);
+ if (error)
+ return ERR_PTR(error);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(devm_memremap_pages);
+
+void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap)
+{
+ devm_release_action(dev, devm_memremap_pages_release, pgmap);
+}
+EXPORT_SYMBOL_GPL(devm_memunmap_pages);
+
+unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
+{
+ /* number of pfns from base where pfn_to_page() is valid */
+ if (altmap)
+ return altmap->reserve + altmap->free;
+ return 0;
+}
+
+void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
+{
+ altmap->alloc -= nr_pfns;
+}
+
+/**
+ * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn
+ * @pfn: page frame number to lookup page_map
+ * @pgmap: optional known pgmap that already has a reference
+ *
+ * If @pgmap is non-NULL and covers @pfn it will be returned as-is. If @pgmap
+ * is non-NULL but does not cover @pfn the reference to it will be released.
+ */
+struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
+ struct dev_pagemap *pgmap)
+{
+ resource_size_t phys = PFN_PHYS(pfn);
+
+ /*
+ * In the cached case we're already holding a live reference.
+ */
+ if (pgmap) {
+ if (phys >= pgmap->range.start && phys <= pgmap->range.end)
+ return pgmap;
+ put_dev_pagemap(pgmap);
+ }
+
+ /* fall back to slow path lookup */
+ rcu_read_lock();
+ pgmap = xa_load(&pgmap_array, PHYS_PFN(phys));
+ if (pgmap && !percpu_ref_tryget_live(pgmap->ref))
+ pgmap = NULL;
+ rcu_read_unlock();
+
+ return pgmap;
+}
+EXPORT_SYMBOL_GPL(get_dev_pagemap);
+
+#ifdef CONFIG_DEV_PAGEMAP_OPS
+void free_devmap_managed_page(struct page *page)
+{
+ /* notify page idle for dax */
+ if (!is_device_private_page(page)) {
+ wake_up_var(&page->_refcount);
+ return;
+ }
+
+ __ClearPageWaiters(page);
+
+ mem_cgroup_uncharge(page);
+
+ /*
+ * When a device_private page is freed, the page->mapping field
+ * may still contain a (stale) mapping value. For example, the
+ * lower bits of page->mapping may still identify the page as an
+ * anonymous page. Ultimately, this entire field is just stale
+ * and wrong, and it will cause errors if not cleared. One
+ * example is:
+ *
+ * migrate_vma_pages()
+ * migrate_vma_insert_page()
+ * page_add_new_anon_rmap()
+ * __page_set_anon_rmap()
+ * ...checks page->mapping, via PageAnon(page) call,
+ * and incorrectly concludes that the page is an
+ * anonymous page. Therefore, it incorrectly,
+ * silently fails to set up the new anon rmap.
+ *
+ * For other types of ZONE_DEVICE pages, migration is either
+ * handled differently or not done at all, so there is no need
+ * to clear page->mapping.
+ */
+ page->mapping = NULL;
+ page->pgmap->ops->page_free(page);
+}
+#endif /* CONFIG_DEV_PAGEMAP_OPS */
diff --git a/mm/memtest.c b/mm/memtest.c
new file mode 100644
index 000000000..f53ace709
--- /dev/null
+++ b/mm/memtest.c
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/memblock.h>
+
+static u64 patterns[] __initdata = {
+ /* The first entry has to be 0 to leave memtest with zeroed memory */
+ 0,
+ 0xffffffffffffffffULL,
+ 0x5555555555555555ULL,
+ 0xaaaaaaaaaaaaaaaaULL,
+ 0x1111111111111111ULL,
+ 0x2222222222222222ULL,
+ 0x4444444444444444ULL,
+ 0x8888888888888888ULL,
+ 0x3333333333333333ULL,
+ 0x6666666666666666ULL,
+ 0x9999999999999999ULL,
+ 0xccccccccccccccccULL,
+ 0x7777777777777777ULL,
+ 0xbbbbbbbbbbbbbbbbULL,
+ 0xddddddddddddddddULL,
+ 0xeeeeeeeeeeeeeeeeULL,
+ 0x7a6c7258554e494cULL, /* yeah ;-) */
+};
+
+static void __init reserve_bad_mem(u64 pattern, phys_addr_t start_bad, phys_addr_t end_bad)
+{
+ pr_info(" %016llx bad mem addr %pa - %pa reserved\n",
+ cpu_to_be64(pattern), &start_bad, &end_bad);
+ memblock_reserve(start_bad, end_bad - start_bad);
+}
+
+static void __init memtest(u64 pattern, phys_addr_t start_phys, phys_addr_t size)
+{
+ u64 *p, *start, *end;
+ phys_addr_t start_bad, last_bad;
+ phys_addr_t start_phys_aligned;
+ const size_t incr = sizeof(pattern);
+
+ start_phys_aligned = ALIGN(start_phys, incr);
+ start = __va(start_phys_aligned);
+ end = start + (size - (start_phys_aligned - start_phys)) / incr;
+ start_bad = 0;
+ last_bad = 0;
+
+ for (p = start; p < end; p++)
+ *p = pattern;
+
+ for (p = start; p < end; p++, start_phys_aligned += incr) {
+ if (*p == pattern)
+ continue;
+ if (start_phys_aligned == last_bad + incr) {
+ last_bad += incr;
+ continue;
+ }
+ if (start_bad)
+ reserve_bad_mem(pattern, start_bad, last_bad + incr);
+ start_bad = last_bad = start_phys_aligned;
+ }
+ if (start_bad)
+ reserve_bad_mem(pattern, start_bad, last_bad + incr);
+}
+
+static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end)
+{
+ u64 i;
+ phys_addr_t this_start, this_end;
+
+ for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &this_start,
+ &this_end, NULL) {
+ this_start = clamp(this_start, start, end);
+ this_end = clamp(this_end, start, end);
+ if (this_start < this_end) {
+ pr_info(" %pa - %pa pattern %016llx\n",
+ &this_start, &this_end, cpu_to_be64(pattern));
+ memtest(pattern, this_start, this_end - this_start);
+ }
+ }
+}
+
+/* default is disabled */
+static unsigned int memtest_pattern __initdata;
+
+static int __init parse_memtest(char *arg)
+{
+ int ret = 0;
+
+ if (arg)
+ ret = kstrtouint(arg, 0, &memtest_pattern);
+ else
+ memtest_pattern = ARRAY_SIZE(patterns);
+
+ return ret;
+}
+
+early_param("memtest", parse_memtest);
+
+void __init early_memtest(phys_addr_t start, phys_addr_t end)
+{
+ unsigned int i;
+ unsigned int idx = 0;
+
+ if (!memtest_pattern)
+ return;
+
+ pr_info("early_memtest: # of tests: %u\n", memtest_pattern);
+ for (i = memtest_pattern-1; i < UINT_MAX; --i) {
+ idx = i % ARRAY_SIZE(patterns);
+ do_one_pass(patterns[idx], start, end);
+ }
+}
diff --git a/mm/migrate.c b/mm/migrate.c
new file mode 100644
index 000000000..fcb7eb6a6
--- /dev/null
+++ b/mm/migrate.c
@@ -0,0 +1,3122 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Memory Migration functionality - linux/mm/migrate.c
+ *
+ * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
+ *
+ * Page migration was first developed in the context of the memory hotplug
+ * project. The main authors of the migration code are:
+ *
+ * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
+ * Hirokazu Takahashi <taka@valinux.co.jp>
+ * Dave Hansen <haveblue@us.ibm.com>
+ * Christoph Lameter
+ */
+
+#include <linux/migrate.h>
+#include <linux/export.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/pagemap.h>
+#include <linux/buffer_head.h>
+#include <linux/mm_inline.h>
+#include <linux/nsproxy.h>
+#include <linux/pagevec.h>
+#include <linux/ksm.h>
+#include <linux/rmap.h>
+#include <linux/topology.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/writeback.h>
+#include <linux/mempolicy.h>
+#include <linux/vmalloc.h>
+#include <linux/security.h>
+#include <linux/backing-dev.h>
+#include <linux/compaction.h>
+#include <linux/syscalls.h>
+#include <linux/compat.h>
+#include <linux/hugetlb.h>
+#include <linux/hugetlb_cgroup.h>
+#include <linux/gfp.h>
+#include <linux/pagewalk.h>
+#include <linux/pfn_t.h>
+#include <linux/memremap.h>
+#include <linux/userfaultfd_k.h>
+#include <linux/balloon_compaction.h>
+#include <linux/mmu_notifier.h>
+#include <linux/page_idle.h>
+#include <linux/page_owner.h>
+#include <linux/sched/mm.h>
+#include <linux/ptrace.h>
+#include <linux/oom.h>
+
+#include <asm/tlbflush.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/migrate.h>
+
+#include "internal.h"
+
+/*
+ * migrate_prep() needs to be called before we start compiling a list of pages
+ * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
+ * undesirable, use migrate_prep_local()
+ */
+int migrate_prep(void)
+{
+ /*
+ * Clear the LRU lists so pages can be isolated.
+ * Note that pages may be moved off the LRU after we have
+ * drained them. Those pages will fail to migrate like other
+ * pages that may be busy.
+ */
+ lru_add_drain_all();
+
+ return 0;
+}
+
+/* Do the necessary work of migrate_prep but not if it involves other CPUs */
+int migrate_prep_local(void)
+{
+ lru_add_drain();
+
+ return 0;
+}
+
+int isolate_movable_page(struct page *page, isolate_mode_t mode)
+{
+ struct address_space *mapping;
+
+ /*
+ * Avoid burning cycles with pages that are yet under __free_pages(),
+ * or just got freed under us.
+ *
+ * In case we 'win' a race for a movable page being freed under us and
+ * raise its refcount preventing __free_pages() from doing its job
+ * the put_page() at the end of this block will take care of
+ * release this page, thus avoiding a nasty leakage.
+ */
+ if (unlikely(!get_page_unless_zero(page)))
+ goto out;
+
+ /*
+ * Check PageMovable before holding a PG_lock because page's owner
+ * assumes anybody doesn't touch PG_lock of newly allocated page
+ * so unconditionally grabbing the lock ruins page's owner side.
+ */
+ if (unlikely(!__PageMovable(page)))
+ goto out_putpage;
+ /*
+ * As movable pages are not isolated from LRU lists, concurrent
+ * compaction threads can race against page migration functions
+ * as well as race against the releasing a page.
+ *
+ * In order to avoid having an already isolated movable page
+ * being (wrongly) re-isolated while it is under migration,
+ * or to avoid attempting to isolate pages being released,
+ * lets be sure we have the page lock
+ * before proceeding with the movable page isolation steps.
+ */
+ if (unlikely(!trylock_page(page)))
+ goto out_putpage;
+
+ if (!PageMovable(page) || PageIsolated(page))
+ goto out_no_isolated;
+
+ mapping = page_mapping(page);
+ VM_BUG_ON_PAGE(!mapping, page);
+
+ if (!mapping->a_ops->isolate_page(page, mode))
+ goto out_no_isolated;
+
+ /* Driver shouldn't use PG_isolated bit of page->flags */
+ WARN_ON_ONCE(PageIsolated(page));
+ __SetPageIsolated(page);
+ unlock_page(page);
+
+ return 0;
+
+out_no_isolated:
+ unlock_page(page);
+out_putpage:
+ put_page(page);
+out:
+ return -EBUSY;
+}
+
+/* It should be called on page which is PG_movable */
+void putback_movable_page(struct page *page)
+{
+ struct address_space *mapping;
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageMovable(page), page);
+ VM_BUG_ON_PAGE(!PageIsolated(page), page);
+
+ mapping = page_mapping(page);
+ mapping->a_ops->putback_page(page);
+ __ClearPageIsolated(page);
+}
+
+/*
+ * Put previously isolated pages back onto the appropriate lists
+ * from where they were once taken off for compaction/migration.
+ *
+ * This function shall be used whenever the isolated pageset has been
+ * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range()
+ * and isolate_hugetlb().
+ */
+void putback_movable_pages(struct list_head *l)
+{
+ struct page *page;
+ struct page *page2;
+
+ list_for_each_entry_safe(page, page2, l, lru) {
+ if (unlikely(PageHuge(page))) {
+ putback_active_hugepage(page);
+ continue;
+ }
+ list_del(&page->lru);
+ /*
+ * We isolated non-lru movable page so here we can use
+ * __PageMovable because LRU page's mapping cannot have
+ * PAGE_MAPPING_MOVABLE.
+ */
+ if (unlikely(__PageMovable(page))) {
+ VM_BUG_ON_PAGE(!PageIsolated(page), page);
+ lock_page(page);
+ if (PageMovable(page))
+ putback_movable_page(page);
+ else
+ __ClearPageIsolated(page);
+ unlock_page(page);
+ put_page(page);
+ } else {
+ mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
+ page_is_file_lru(page), -thp_nr_pages(page));
+ putback_lru_page(page);
+ }
+ }
+}
+
+/*
+ * Restore a potential migration pte to a working pte entry
+ */
+static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
+ unsigned long addr, void *old)
+{
+ struct page_vma_mapped_walk pvmw = {
+ .page = old,
+ .vma = vma,
+ .address = addr,
+ .flags = PVMW_SYNC | PVMW_MIGRATION,
+ };
+ struct page *new;
+ pte_t pte;
+ swp_entry_t entry;
+
+ VM_BUG_ON_PAGE(PageTail(page), page);
+ while (page_vma_mapped_walk(&pvmw)) {
+ if (PageKsm(page))
+ new = page;
+ else
+ new = page - pvmw.page->index +
+ linear_page_index(vma, pvmw.address);
+
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+ /* PMD-mapped THP migration entry */
+ if (!pvmw.pte) {
+ VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);
+ remove_migration_pmd(&pvmw, new);
+ continue;
+ }
+#endif
+
+ get_page(new);
+ pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
+ if (pte_swp_soft_dirty(*pvmw.pte))
+ pte = pte_mksoft_dirty(pte);
+
+ /*
+ * Recheck VMA as permissions can change since migration started
+ */
+ entry = pte_to_swp_entry(*pvmw.pte);
+ if (is_write_migration_entry(entry))
+ pte = maybe_mkwrite(pte, vma);
+ else if (pte_swp_uffd_wp(*pvmw.pte))
+ pte = pte_mkuffd_wp(pte);
+
+ if (unlikely(is_device_private_page(new))) {
+ entry = make_device_private_entry(new, pte_write(pte));
+ pte = swp_entry_to_pte(entry);
+ if (pte_swp_soft_dirty(*pvmw.pte))
+ pte = pte_swp_mksoft_dirty(pte);
+ if (pte_swp_uffd_wp(*pvmw.pte))
+ pte = pte_swp_mkuffd_wp(pte);
+ }
+
+#ifdef CONFIG_HUGETLB_PAGE
+ if (PageHuge(new)) {
+ pte = pte_mkhuge(pte);
+ pte = arch_make_huge_pte(pte, vma, new, 0);
+ set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
+ if (PageAnon(new))
+ hugepage_add_anon_rmap(new, vma, pvmw.address);
+ else
+ page_dup_rmap(new, true);
+ } else
+#endif
+ {
+ set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
+
+ if (PageAnon(new))
+ page_add_anon_rmap(new, vma, pvmw.address, false);
+ else
+ page_add_file_rmap(new, false);
+ }
+ if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new))
+ mlock_vma_page(new);
+
+ if (PageTransHuge(page) && PageMlocked(page))
+ clear_page_mlock(page);
+
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(vma, pvmw.address, pvmw.pte);
+ }
+
+ return true;
+}
+
+/*
+ * Get rid of all migration entries and replace them by
+ * references to the indicated page.
+ */
+void remove_migration_ptes(struct page *old, struct page *new, bool locked)
+{
+ struct rmap_walk_control rwc = {
+ .rmap_one = remove_migration_pte,
+ .arg = old,
+ };
+
+ if (locked)
+ rmap_walk_locked(new, &rwc);
+ else
+ rmap_walk(new, &rwc);
+}
+
+/*
+ * Something used the pte of a page under migration. We need to
+ * get to the page and wait until migration is finished.
+ * When we return from this function the fault will be retried.
+ */
+void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
+ spinlock_t *ptl)
+{
+ pte_t pte;
+ swp_entry_t entry;
+ struct page *page;
+
+ spin_lock(ptl);
+ pte = *ptep;
+ if (!is_swap_pte(pte))
+ goto out;
+
+ entry = pte_to_swp_entry(pte);
+ if (!is_migration_entry(entry))
+ goto out;
+
+ page = migration_entry_to_page(entry);
+ page = compound_head(page);
+
+ /*
+ * Once page cache replacement of page migration started, page_count
+ * is zero; but we must not call put_and_wait_on_page_locked() without
+ * a ref. Use get_page_unless_zero(), and just fault again if it fails.
+ */
+ if (!get_page_unless_zero(page))
+ goto out;
+ pte_unmap_unlock(ptep, ptl);
+ put_and_wait_on_page_locked(page);
+ return;
+out:
+ pte_unmap_unlock(ptep, ptl);
+}
+
+void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long address)
+{
+ spinlock_t *ptl = pte_lockptr(mm, pmd);
+ pte_t *ptep = pte_offset_map(pmd, address);
+ __migration_entry_wait(mm, ptep, ptl);
+}
+
+void migration_entry_wait_huge(struct vm_area_struct *vma,
+ struct mm_struct *mm, pte_t *pte)
+{
+ spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte);
+ __migration_entry_wait(mm, pte, ptl);
+}
+
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
+{
+ spinlock_t *ptl;
+ struct page *page;
+
+ ptl = pmd_lock(mm, pmd);
+ if (!is_pmd_migration_entry(*pmd))
+ goto unlock;
+ page = migration_entry_to_page(pmd_to_swp_entry(*pmd));
+ if (!get_page_unless_zero(page))
+ goto unlock;
+ spin_unlock(ptl);
+ put_and_wait_on_page_locked(page);
+ return;
+unlock:
+ spin_unlock(ptl);
+}
+#endif
+
+static int expected_page_refs(struct address_space *mapping, struct page *page)
+{
+ int expected_count = 1;
+
+ /*
+ * Device private pages have an extra refcount as they are
+ * ZONE_DEVICE pages.
+ */
+ expected_count += is_device_private_page(page);
+ if (mapping)
+ expected_count += thp_nr_pages(page) + page_has_private(page);
+
+ return expected_count;
+}
+
+/*
+ * Replace the page in the mapping.
+ *
+ * The number of remaining references must be:
+ * 1 for anonymous pages without a mapping
+ * 2 for pages with a mapping
+ * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
+ */
+int migrate_page_move_mapping(struct address_space *mapping,
+ struct page *newpage, struct page *page, int extra_count)
+{
+ XA_STATE(xas, &mapping->i_pages, page_index(page));
+ struct zone *oldzone, *newzone;
+ int dirty;
+ int expected_count = expected_page_refs(mapping, page) + extra_count;
+ int nr = thp_nr_pages(page);
+
+ if (!mapping) {
+ /* Anonymous page without mapping */
+ if (page_count(page) != expected_count)
+ return -EAGAIN;
+
+ /* No turning back from here */
+ newpage->index = page->index;
+ newpage->mapping = page->mapping;
+ if (PageSwapBacked(page))
+ __SetPageSwapBacked(newpage);
+
+ return MIGRATEPAGE_SUCCESS;
+ }
+
+ oldzone = page_zone(page);
+ newzone = page_zone(newpage);
+
+ xas_lock_irq(&xas);
+ if (page_count(page) != expected_count || xas_load(&xas) != page) {
+ xas_unlock_irq(&xas);
+ return -EAGAIN;
+ }
+
+ if (!page_ref_freeze(page, expected_count)) {
+ xas_unlock_irq(&xas);
+ return -EAGAIN;
+ }
+
+ /*
+ * Now we know that no one else is looking at the page:
+ * no turning back from here.
+ */
+ newpage->index = page->index;
+ newpage->mapping = page->mapping;
+ page_ref_add(newpage, nr); /* add cache reference */
+ if (PageSwapBacked(page)) {
+ __SetPageSwapBacked(newpage);
+ if (PageSwapCache(page)) {
+ SetPageSwapCache(newpage);
+ set_page_private(newpage, page_private(page));
+ }
+ } else {
+ VM_BUG_ON_PAGE(PageSwapCache(page), page);
+ }
+
+ /* Move dirty while page refs frozen and newpage not yet exposed */
+ dirty = PageDirty(page);
+ if (dirty) {
+ ClearPageDirty(page);
+ SetPageDirty(newpage);
+ }
+
+ xas_store(&xas, newpage);
+ if (PageTransHuge(page)) {
+ int i;
+
+ for (i = 1; i < nr; i++) {
+ xas_next(&xas);
+ xas_store(&xas, newpage);
+ }
+ }
+
+ /*
+ * Drop cache reference from old page by unfreezing
+ * to one less reference.
+ * We know this isn't the last reference.
+ */
+ page_ref_unfreeze(page, expected_count - nr);
+
+ xas_unlock(&xas);
+ /* Leave irq disabled to prevent preemption while updating stats */
+
+ /*
+ * If moved to a different zone then also account
+ * the page for that zone. Other VM counters will be
+ * taken care of when we establish references to the
+ * new page and drop references to the old page.
+ *
+ * Note that anonymous pages are accounted for
+ * via NR_FILE_PAGES and NR_ANON_MAPPED if they
+ * are mapped to swap space.
+ */
+ if (newzone != oldzone) {
+ struct lruvec *old_lruvec, *new_lruvec;
+ struct mem_cgroup *memcg;
+
+ memcg = page_memcg(page);
+ old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
+ new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
+
+ __mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr);
+ __mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr);
+ if (PageSwapBacked(page) && !PageSwapCache(page)) {
+ __mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
+ __mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
+ }
+ if (dirty && mapping_can_writeback(mapping)) {
+ __mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr);
+ __mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr);
+ __mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr);
+ __mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr);
+ }
+ }
+ local_irq_enable();
+
+ return MIGRATEPAGE_SUCCESS;
+}
+EXPORT_SYMBOL(migrate_page_move_mapping);
+
+/*
+ * The expected number of remaining references is the same as that
+ * of migrate_page_move_mapping().
+ */
+int migrate_huge_page_move_mapping(struct address_space *mapping,
+ struct page *newpage, struct page *page)
+{
+ XA_STATE(xas, &mapping->i_pages, page_index(page));
+ int expected_count;
+
+ xas_lock_irq(&xas);
+ expected_count = 2 + page_has_private(page);
+ if (page_count(page) != expected_count || xas_load(&xas) != page) {
+ xas_unlock_irq(&xas);
+ return -EAGAIN;
+ }
+
+ if (!page_ref_freeze(page, expected_count)) {
+ xas_unlock_irq(&xas);
+ return -EAGAIN;
+ }
+
+ newpage->index = page->index;
+ newpage->mapping = page->mapping;
+
+ get_page(newpage);
+
+ xas_store(&xas, newpage);
+
+ page_ref_unfreeze(page, expected_count - 1);
+
+ xas_unlock_irq(&xas);
+
+ return MIGRATEPAGE_SUCCESS;
+}
+
+/*
+ * Gigantic pages are so large that we do not guarantee that page++ pointer
+ * arithmetic will work across the entire page. We need something more
+ * specialized.
+ */
+static void __copy_gigantic_page(struct page *dst, struct page *src,
+ int nr_pages)
+{
+ int i;
+ struct page *dst_base = dst;
+ struct page *src_base = src;
+
+ for (i = 0; i < nr_pages; ) {
+ cond_resched();
+ copy_highpage(dst, src);
+
+ i++;
+ dst = mem_map_next(dst, dst_base, i);
+ src = mem_map_next(src, src_base, i);
+ }
+}
+
+static void copy_huge_page(struct page *dst, struct page *src)
+{
+ int i;
+ int nr_pages;
+
+ if (PageHuge(src)) {
+ /* hugetlbfs page */
+ struct hstate *h = page_hstate(src);
+ nr_pages = pages_per_huge_page(h);
+
+ if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) {
+ __copy_gigantic_page(dst, src, nr_pages);
+ return;
+ }
+ } else {
+ /* thp page */
+ BUG_ON(!PageTransHuge(src));
+ nr_pages = thp_nr_pages(src);
+ }
+
+ for (i = 0; i < nr_pages; i++) {
+ cond_resched();
+ copy_highpage(dst + i, src + i);
+ }
+}
+
+/*
+ * Copy the page to its new location
+ */
+void migrate_page_states(struct page *newpage, struct page *page)
+{
+ int cpupid;
+
+ if (PageError(page))
+ SetPageError(newpage);
+ if (PageReferenced(page))
+ SetPageReferenced(newpage);
+ if (PageUptodate(page))
+ SetPageUptodate(newpage);
+ if (TestClearPageActive(page)) {
+ VM_BUG_ON_PAGE(PageUnevictable(page), page);
+ SetPageActive(newpage);
+ } else if (TestClearPageUnevictable(page))
+ SetPageUnevictable(newpage);
+ if (PageWorkingset(page))
+ SetPageWorkingset(newpage);
+ if (PageChecked(page))
+ SetPageChecked(newpage);
+ if (PageMappedToDisk(page))
+ SetPageMappedToDisk(newpage);
+
+ /* Move dirty on pages not done by migrate_page_move_mapping() */
+ if (PageDirty(page))
+ SetPageDirty(newpage);
+
+ if (page_is_young(page))
+ set_page_young(newpage);
+ if (page_is_idle(page))
+ set_page_idle(newpage);
+
+ /*
+ * Copy NUMA information to the new page, to prevent over-eager
+ * future migrations of this same page.
+ */
+ cpupid = page_cpupid_xchg_last(page, -1);
+ page_cpupid_xchg_last(newpage, cpupid);
+
+ ksm_migrate_page(newpage, page);
+ /*
+ * Please do not reorder this without considering how mm/ksm.c's
+ * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
+ */
+ if (PageSwapCache(page))
+ ClearPageSwapCache(page);
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+
+ /*
+ * If any waiters have accumulated on the new page then
+ * wake them up.
+ */
+ if (PageWriteback(newpage))
+ end_page_writeback(newpage);
+
+ /*
+ * PG_readahead shares the same bit with PG_reclaim. The above
+ * end_page_writeback() may clear PG_readahead mistakenly, so set the
+ * bit after that.
+ */
+ if (PageReadahead(page))
+ SetPageReadahead(newpage);
+
+ copy_page_owner(page, newpage);
+
+ if (!PageHuge(page))
+ mem_cgroup_migrate(page, newpage);
+}
+EXPORT_SYMBOL(migrate_page_states);
+
+void migrate_page_copy(struct page *newpage, struct page *page)
+{
+ if (PageHuge(page) || PageTransHuge(page))
+ copy_huge_page(newpage, page);
+ else
+ copy_highpage(newpage, page);
+
+ migrate_page_states(newpage, page);
+}
+EXPORT_SYMBOL(migrate_page_copy);
+
+/************************************************************
+ * Migration functions
+ ***********************************************************/
+
+/*
+ * Common logic to directly migrate a single LRU page suitable for
+ * pages that do not use PagePrivate/PagePrivate2.
+ *
+ * Pages are locked upon entry and exit.
+ */
+int migrate_page(struct address_space *mapping,
+ struct page *newpage, struct page *page,
+ enum migrate_mode mode)
+{
+ int rc;
+
+ BUG_ON(PageWriteback(page)); /* Writeback must be complete */
+
+ rc = migrate_page_move_mapping(mapping, newpage, page, 0);
+
+ if (rc != MIGRATEPAGE_SUCCESS)
+ return rc;
+
+ if (mode != MIGRATE_SYNC_NO_COPY)
+ migrate_page_copy(newpage, page);
+ else
+ migrate_page_states(newpage, page);
+ return MIGRATEPAGE_SUCCESS;
+}
+EXPORT_SYMBOL(migrate_page);
+
+#ifdef CONFIG_BLOCK
+/* Returns true if all buffers are successfully locked */
+static bool buffer_migrate_lock_buffers(struct buffer_head *head,
+ enum migrate_mode mode)
+{
+ struct buffer_head *bh = head;
+
+ /* Simple case, sync compaction */
+ if (mode != MIGRATE_ASYNC) {
+ do {
+ lock_buffer(bh);
+ bh = bh->b_this_page;
+
+ } while (bh != head);
+
+ return true;
+ }
+
+ /* async case, we cannot block on lock_buffer so use trylock_buffer */
+ do {
+ if (!trylock_buffer(bh)) {
+ /*
+ * We failed to lock the buffer and cannot stall in
+ * async migration. Release the taken locks
+ */
+ struct buffer_head *failed_bh = bh;
+ bh = head;
+ while (bh != failed_bh) {
+ unlock_buffer(bh);
+ bh = bh->b_this_page;
+ }
+ return false;
+ }
+
+ bh = bh->b_this_page;
+ } while (bh != head);
+ return true;
+}
+
+static int __buffer_migrate_page(struct address_space *mapping,
+ struct page *newpage, struct page *page, enum migrate_mode mode,
+ bool check_refs)
+{
+ struct buffer_head *bh, *head;
+ int rc;
+ int expected_count;
+
+ if (!page_has_buffers(page))
+ return migrate_page(mapping, newpage, page, mode);
+
+ /* Check whether page does not have extra refs before we do more work */
+ expected_count = expected_page_refs(mapping, page);
+ if (page_count(page) != expected_count)
+ return -EAGAIN;
+
+ head = page_buffers(page);
+ if (!buffer_migrate_lock_buffers(head, mode))
+ return -EAGAIN;
+
+ if (check_refs) {
+ bool busy;
+ bool invalidated = false;
+
+recheck_buffers:
+ busy = false;
+ spin_lock(&mapping->private_lock);
+ bh = head;
+ do {
+ if (atomic_read(&bh->b_count)) {
+ busy = true;
+ break;
+ }
+ bh = bh->b_this_page;
+ } while (bh != head);
+ if (busy) {
+ if (invalidated) {
+ rc = -EAGAIN;
+ goto unlock_buffers;
+ }
+ spin_unlock(&mapping->private_lock);
+ invalidate_bh_lrus();
+ invalidated = true;
+ goto recheck_buffers;
+ }
+ }
+
+ rc = migrate_page_move_mapping(mapping, newpage, page, 0);
+ if (rc != MIGRATEPAGE_SUCCESS)
+ goto unlock_buffers;
+
+ attach_page_private(newpage, detach_page_private(page));
+
+ bh = head;
+ do {
+ set_bh_page(bh, newpage, bh_offset(bh));
+ bh = bh->b_this_page;
+
+ } while (bh != head);
+
+ if (mode != MIGRATE_SYNC_NO_COPY)
+ migrate_page_copy(newpage, page);
+ else
+ migrate_page_states(newpage, page);
+
+ rc = MIGRATEPAGE_SUCCESS;
+unlock_buffers:
+ if (check_refs)
+ spin_unlock(&mapping->private_lock);
+ bh = head;
+ do {
+ unlock_buffer(bh);
+ bh = bh->b_this_page;
+
+ } while (bh != head);
+
+ return rc;
+}
+
+/*
+ * Migration function for pages with buffers. This function can only be used
+ * if the underlying filesystem guarantees that no other references to "page"
+ * exist. For example attached buffer heads are accessed only under page lock.
+ */
+int buffer_migrate_page(struct address_space *mapping,
+ struct page *newpage, struct page *page, enum migrate_mode mode)
+{
+ return __buffer_migrate_page(mapping, newpage, page, mode, false);
+}
+EXPORT_SYMBOL(buffer_migrate_page);
+
+/*
+ * Same as above except that this variant is more careful and checks that there
+ * are also no buffer head references. This function is the right one for
+ * mappings where buffer heads are directly looked up and referenced (such as
+ * block device mappings).
+ */
+int buffer_migrate_page_norefs(struct address_space *mapping,
+ struct page *newpage, struct page *page, enum migrate_mode mode)
+{
+ return __buffer_migrate_page(mapping, newpage, page, mode, true);
+}
+#endif
+
+/*
+ * Writeback a page to clean the dirty state
+ */
+static int writeout(struct address_space *mapping, struct page *page)
+{
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_NONE,
+ .nr_to_write = 1,
+ .range_start = 0,
+ .range_end = LLONG_MAX,
+ .for_reclaim = 1
+ };
+ int rc;
+
+ if (!mapping->a_ops->writepage)
+ /* No write method for the address space */
+ return -EINVAL;
+
+ if (!clear_page_dirty_for_io(page))
+ /* Someone else already triggered a write */
+ return -EAGAIN;
+
+ /*
+ * A dirty page may imply that the underlying filesystem has
+ * the page on some queue. So the page must be clean for
+ * migration. Writeout may mean we loose the lock and the
+ * page state is no longer what we checked for earlier.
+ * At this point we know that the migration attempt cannot
+ * be successful.
+ */
+ remove_migration_ptes(page, page, false);
+
+ rc = mapping->a_ops->writepage(page, &wbc);
+
+ if (rc != AOP_WRITEPAGE_ACTIVATE)
+ /* unlocked. Relock */
+ lock_page(page);
+
+ return (rc < 0) ? -EIO : -EAGAIN;
+}
+
+/*
+ * Default handling if a filesystem does not provide a migration function.
+ */
+static int fallback_migrate_page(struct address_space *mapping,
+ struct page *newpage, struct page *page, enum migrate_mode mode)
+{
+ if (PageDirty(page)) {
+ /* Only writeback pages in full synchronous migration */
+ switch (mode) {
+ case MIGRATE_SYNC:
+ case MIGRATE_SYNC_NO_COPY:
+ break;
+ default:
+ return -EBUSY;
+ }
+ return writeout(mapping, page);
+ }
+
+ /*
+ * Buffers may be managed in a filesystem specific way.
+ * We must have no buffers or drop them.
+ */
+ if (page_has_private(page) &&
+ !try_to_release_page(page, GFP_KERNEL))
+ return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
+
+ return migrate_page(mapping, newpage, page, mode);
+}
+
+/*
+ * Move a page to a newly allocated page
+ * The page is locked and all ptes have been successfully removed.
+ *
+ * The new page will have replaced the old page if this function
+ * is successful.
+ *
+ * Return value:
+ * < 0 - error code
+ * MIGRATEPAGE_SUCCESS - success
+ */
+static int move_to_new_page(struct page *newpage, struct page *page,
+ enum migrate_mode mode)
+{
+ struct address_space *mapping;
+ int rc = -EAGAIN;
+ bool is_lru = !__PageMovable(page);
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
+
+ mapping = page_mapping(page);
+
+ if (likely(is_lru)) {
+ if (!mapping)
+ rc = migrate_page(mapping, newpage, page, mode);
+ else if (mapping->a_ops->migratepage)
+ /*
+ * Most pages have a mapping and most filesystems
+ * provide a migratepage callback. Anonymous pages
+ * are part of swap space which also has its own
+ * migratepage callback. This is the most common path
+ * for page migration.
+ */
+ rc = mapping->a_ops->migratepage(mapping, newpage,
+ page, mode);
+ else
+ rc = fallback_migrate_page(mapping, newpage,
+ page, mode);
+ } else {
+ /*
+ * In case of non-lru page, it could be released after
+ * isolation step. In that case, we shouldn't try migration.
+ */
+ VM_BUG_ON_PAGE(!PageIsolated(page), page);
+ if (!PageMovable(page)) {
+ rc = MIGRATEPAGE_SUCCESS;
+ __ClearPageIsolated(page);
+ goto out;
+ }
+
+ rc = mapping->a_ops->migratepage(mapping, newpage,
+ page, mode);
+ WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
+ !PageIsolated(page));
+ }
+
+ /*
+ * When successful, old pagecache page->mapping must be cleared before
+ * page is freed; but stats require that PageAnon be left as PageAnon.
+ */
+ if (rc == MIGRATEPAGE_SUCCESS) {
+ if (__PageMovable(page)) {
+ VM_BUG_ON_PAGE(!PageIsolated(page), page);
+
+ /*
+ * We clear PG_movable under page_lock so any compactor
+ * cannot try to migrate this page.
+ */
+ __ClearPageIsolated(page);
+ }
+
+ /*
+ * Anonymous and movable page->mapping will be cleared by
+ * free_pages_prepare so don't reset it here for keeping
+ * the type to work PageAnon, for example.
+ */
+ if (!PageMappingFlags(page))
+ page->mapping = NULL;
+
+ if (likely(!is_zone_device_page(newpage))) {
+ int i, nr = compound_nr(newpage);
+
+ for (i = 0; i < nr; i++)
+ flush_dcache_page(newpage + i);
+ }
+ }
+out:
+ return rc;
+}
+
+static int __unmap_and_move(struct page *page, struct page *newpage,
+ int force, enum migrate_mode mode)
+{
+ int rc = -EAGAIN;
+ int page_was_mapped = 0;
+ struct anon_vma *anon_vma = NULL;
+ bool is_lru = !__PageMovable(page);
+
+ if (!trylock_page(page)) {
+ if (!force || mode == MIGRATE_ASYNC)
+ goto out;
+
+ /*
+ * It's not safe for direct compaction to call lock_page.
+ * For example, during page readahead pages are added locked
+ * to the LRU. Later, when the IO completes the pages are
+ * marked uptodate and unlocked. However, the queueing
+ * could be merging multiple pages for one bio (e.g.
+ * mpage_readahead). If an allocation happens for the
+ * second or third page, the process can end up locking
+ * the same page twice and deadlocking. Rather than
+ * trying to be clever about what pages can be locked,
+ * avoid the use of lock_page for direct compaction
+ * altogether.
+ */
+ if (current->flags & PF_MEMALLOC)
+ goto out;
+
+ lock_page(page);
+ }
+
+ if (PageWriteback(page)) {
+ /*
+ * Only in the case of a full synchronous migration is it
+ * necessary to wait for PageWriteback. In the async case,
+ * the retry loop is too short and in the sync-light case,
+ * the overhead of stalling is too much
+ */
+ switch (mode) {
+ case MIGRATE_SYNC:
+ case MIGRATE_SYNC_NO_COPY:
+ break;
+ default:
+ rc = -EBUSY;
+ goto out_unlock;
+ }
+ if (!force)
+ goto out_unlock;
+ wait_on_page_writeback(page);
+ }
+
+ /*
+ * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
+ * we cannot notice that anon_vma is freed while we migrates a page.
+ * This get_anon_vma() delays freeing anon_vma pointer until the end
+ * of migration. File cache pages are no problem because of page_lock()
+ * File Caches may use write_page() or lock_page() in migration, then,
+ * just care Anon page here.
+ *
+ * Only page_get_anon_vma() understands the subtleties of
+ * getting a hold on an anon_vma from outside one of its mms.
+ * But if we cannot get anon_vma, then we won't need it anyway,
+ * because that implies that the anon page is no longer mapped
+ * (and cannot be remapped so long as we hold the page lock).
+ */
+ if (PageAnon(page) && !PageKsm(page))
+ anon_vma = page_get_anon_vma(page);
+
+ /*
+ * Block others from accessing the new page when we get around to
+ * establishing additional references. We are usually the only one
+ * holding a reference to newpage at this point. We used to have a BUG
+ * here if trylock_page(newpage) fails, but would like to allow for
+ * cases where there might be a race with the previous use of newpage.
+ * This is much like races on refcount of oldpage: just don't BUG().
+ */
+ if (unlikely(!trylock_page(newpage)))
+ goto out_unlock;
+
+ if (unlikely(!is_lru)) {
+ rc = move_to_new_page(newpage, page, mode);
+ goto out_unlock_both;
+ }
+
+ /*
+ * Corner case handling:
+ * 1. When a new swap-cache page is read into, it is added to the LRU
+ * and treated as swapcache but it has no rmap yet.
+ * Calling try_to_unmap() against a page->mapping==NULL page will
+ * trigger a BUG. So handle it here.
+ * 2. An orphaned page (see truncate_complete_page) might have
+ * fs-private metadata. The page can be picked up due to memory
+ * offlining. Everywhere else except page reclaim, the page is
+ * invisible to the vm, so the page can not be migrated. So try to
+ * free the metadata, so the page can be freed.
+ */
+ if (!page->mapping) {
+ VM_BUG_ON_PAGE(PageAnon(page), page);
+ if (page_has_private(page)) {
+ try_to_free_buffers(page);
+ goto out_unlock_both;
+ }
+ } else if (page_mapped(page)) {
+ /* Establish migration ptes */
+ VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
+ page);
+ try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK);
+ page_was_mapped = 1;
+ }
+
+ if (!page_mapped(page))
+ rc = move_to_new_page(newpage, page, mode);
+
+ if (page_was_mapped)
+ remove_migration_ptes(page,
+ rc == MIGRATEPAGE_SUCCESS ? newpage : page, false);
+
+out_unlock_both:
+ unlock_page(newpage);
+out_unlock:
+ /* Drop an anon_vma reference if we took one */
+ if (anon_vma)
+ put_anon_vma(anon_vma);
+ unlock_page(page);
+out:
+ /*
+ * If migration is successful, decrease refcount of the newpage
+ * which will not free the page because new page owner increased
+ * refcounter. As well, if it is LRU page, add the page to LRU
+ * list in here. Use the old state of the isolated source page to
+ * determine if we migrated a LRU page. newpage was already unlocked
+ * and possibly modified by its owner - don't rely on the page
+ * state.
+ */
+ if (rc == MIGRATEPAGE_SUCCESS) {
+ if (unlikely(!is_lru))
+ put_page(newpage);
+ else
+ putback_lru_page(newpage);
+ }
+
+ return rc;
+}
+
+/*
+ * Obtain the lock on page, remove all ptes and migrate the page
+ * to the newly allocated page in newpage.
+ */
+static int unmap_and_move(new_page_t get_new_page,
+ free_page_t put_new_page,
+ unsigned long private, struct page *page,
+ int force, enum migrate_mode mode,
+ enum migrate_reason reason)
+{
+ int rc = MIGRATEPAGE_SUCCESS;
+ struct page *newpage = NULL;
+
+ if (!thp_migration_supported() && PageTransHuge(page))
+ return -ENOMEM;
+
+ if (page_count(page) == 1) {
+ /* page was freed from under us. So we are done. */
+ ClearPageActive(page);
+ ClearPageUnevictable(page);
+ if (unlikely(__PageMovable(page))) {
+ lock_page(page);
+ if (!PageMovable(page))
+ __ClearPageIsolated(page);
+ unlock_page(page);
+ }
+ goto out;
+ }
+
+ newpage = get_new_page(page, private);
+ if (!newpage)
+ return -ENOMEM;
+
+ rc = __unmap_and_move(page, newpage, force, mode);
+ if (rc == MIGRATEPAGE_SUCCESS)
+ set_page_owner_migrate_reason(newpage, reason);
+
+out:
+ if (rc != -EAGAIN) {
+ /*
+ * A page that has been migrated has all references
+ * removed and will be freed. A page that has not been
+ * migrated will have kept its references and be restored.
+ */
+ list_del(&page->lru);
+
+ /*
+ * Compaction can migrate also non-LRU pages which are
+ * not accounted to NR_ISOLATED_*. They can be recognized
+ * as __PageMovable
+ */
+ if (likely(!__PageMovable(page)))
+ mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
+ page_is_file_lru(page), -thp_nr_pages(page));
+ }
+
+ /*
+ * If migration is successful, releases reference grabbed during
+ * isolation. Otherwise, restore the page to right list unless
+ * we want to retry.
+ */
+ if (rc == MIGRATEPAGE_SUCCESS) {
+ if (reason != MR_MEMORY_FAILURE)
+ /*
+ * We release the page in page_handle_poison.
+ */
+ put_page(page);
+ } else {
+ if (rc != -EAGAIN) {
+ if (likely(!__PageMovable(page))) {
+ putback_lru_page(page);
+ goto put_new;
+ }
+
+ lock_page(page);
+ if (PageMovable(page))
+ putback_movable_page(page);
+ else
+ __ClearPageIsolated(page);
+ unlock_page(page);
+ put_page(page);
+ }
+put_new:
+ if (put_new_page)
+ put_new_page(newpage, private);
+ else
+ put_page(newpage);
+ }
+
+ return rc;
+}
+
+/*
+ * Counterpart of unmap_and_move_page() for hugepage migration.
+ *
+ * This function doesn't wait the completion of hugepage I/O
+ * because there is no race between I/O and migration for hugepage.
+ * Note that currently hugepage I/O occurs only in direct I/O
+ * where no lock is held and PG_writeback is irrelevant,
+ * and writeback status of all subpages are counted in the reference
+ * count of the head page (i.e. if all subpages of a 2MB hugepage are
+ * under direct I/O, the reference of the head page is 512 and a bit more.)
+ * This means that when we try to migrate hugepage whose subpages are
+ * doing direct I/O, some references remain after try_to_unmap() and
+ * hugepage migration fails without data corruption.
+ *
+ * There is also no race when direct I/O is issued on the page under migration,
+ * because then pte is replaced with migration swap entry and direct I/O code
+ * will wait in the page fault for migration to complete.
+ */
+static int unmap_and_move_huge_page(new_page_t get_new_page,
+ free_page_t put_new_page, unsigned long private,
+ struct page *hpage, int force,
+ enum migrate_mode mode, int reason)
+{
+ int rc = -EAGAIN;
+ int page_was_mapped = 0;
+ struct page *new_hpage;
+ struct anon_vma *anon_vma = NULL;
+ struct address_space *mapping = NULL;
+
+ /*
+ * Migratability of hugepages depends on architectures and their size.
+ * This check is necessary because some callers of hugepage migration
+ * like soft offline and memory hotremove don't walk through page
+ * tables or check whether the hugepage is pmd-based or not before
+ * kicking migration.
+ */
+ if (!hugepage_migration_supported(page_hstate(hpage))) {
+ putback_active_hugepage(hpage);
+ return -ENOSYS;
+ }
+
+ new_hpage = get_new_page(hpage, private);
+ if (!new_hpage)
+ return -ENOMEM;
+
+ if (!trylock_page(hpage)) {
+ if (!force)
+ goto out;
+ switch (mode) {
+ case MIGRATE_SYNC:
+ case MIGRATE_SYNC_NO_COPY:
+ break;
+ default:
+ goto out;
+ }
+ lock_page(hpage);
+ }
+
+ /*
+ * Check for pages which are in the process of being freed. Without
+ * page_mapping() set, hugetlbfs specific move page routine will not
+ * be called and we could leak usage counts for subpools.
+ */
+ if (page_private(hpage) && !page_mapping(hpage)) {
+ rc = -EBUSY;
+ goto out_unlock;
+ }
+
+ if (PageAnon(hpage))
+ anon_vma = page_get_anon_vma(hpage);
+
+ if (unlikely(!trylock_page(new_hpage)))
+ goto put_anon;
+
+ if (page_mapped(hpage)) {
+ bool mapping_locked = false;
+ enum ttu_flags ttu = TTU_MIGRATION|TTU_IGNORE_MLOCK;
+
+ if (!PageAnon(hpage)) {
+ /*
+ * In shared mappings, try_to_unmap could potentially
+ * call huge_pmd_unshare. Because of this, take
+ * semaphore in write mode here and set TTU_RMAP_LOCKED
+ * to let lower levels know we have taken the lock.
+ */
+ mapping = hugetlb_page_mapping_lock_write(hpage);
+ if (unlikely(!mapping))
+ goto unlock_put_anon;
+
+ mapping_locked = true;
+ ttu |= TTU_RMAP_LOCKED;
+ }
+
+ try_to_unmap(hpage, ttu);
+ page_was_mapped = 1;
+
+ if (mapping_locked)
+ i_mmap_unlock_write(mapping);
+ }
+
+ if (!page_mapped(hpage))
+ rc = move_to_new_page(new_hpage, hpage, mode);
+
+ if (page_was_mapped)
+ remove_migration_ptes(hpage,
+ rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false);
+
+unlock_put_anon:
+ unlock_page(new_hpage);
+
+put_anon:
+ if (anon_vma)
+ put_anon_vma(anon_vma);
+
+ if (rc == MIGRATEPAGE_SUCCESS) {
+ move_hugetlb_state(hpage, new_hpage, reason);
+ put_new_page = NULL;
+ }
+
+out_unlock:
+ unlock_page(hpage);
+out:
+ if (rc != -EAGAIN)
+ putback_active_hugepage(hpage);
+
+ /*
+ * If migration was not successful and there's a freeing callback, use
+ * it. Otherwise, put_page() will drop the reference grabbed during
+ * isolation.
+ */
+ if (put_new_page)
+ put_new_page(new_hpage, private);
+ else
+ putback_active_hugepage(new_hpage);
+
+ return rc;
+}
+
+/*
+ * migrate_pages - migrate the pages specified in a list, to the free pages
+ * supplied as the target for the page migration
+ *
+ * @from: The list of pages to be migrated.
+ * @get_new_page: The function used to allocate free pages to be used
+ * as the target of the page migration.
+ * @put_new_page: The function used to free target pages if migration
+ * fails, or NULL if no special handling is necessary.
+ * @private: Private data to be passed on to get_new_page()
+ * @mode: The migration mode that specifies the constraints for
+ * page migration, if any.
+ * @reason: The reason for page migration.
+ *
+ * The function returns after 10 attempts or if no pages are movable any more
+ * because the list has become empty or no retryable pages exist any more.
+ * The caller should call putback_movable_pages() to return pages to the LRU
+ * or free list only if ret != 0.
+ *
+ * Returns the number of pages that were not migrated, or an error code.
+ */
+int migrate_pages(struct list_head *from, new_page_t get_new_page,
+ free_page_t put_new_page, unsigned long private,
+ enum migrate_mode mode, int reason)
+{
+ int retry = 1;
+ int thp_retry = 1;
+ int nr_failed = 0;
+ int nr_succeeded = 0;
+ int nr_thp_succeeded = 0;
+ int nr_thp_failed = 0;
+ int nr_thp_split = 0;
+ int pass = 0;
+ bool is_thp = false;
+ struct page *page;
+ struct page *page2;
+ int swapwrite = current->flags & PF_SWAPWRITE;
+ int rc, nr_subpages;
+
+ if (!swapwrite)
+ current->flags |= PF_SWAPWRITE;
+
+ for (pass = 0; pass < 10 && (retry || thp_retry); pass++) {
+ retry = 0;
+ thp_retry = 0;
+
+ list_for_each_entry_safe(page, page2, from, lru) {
+retry:
+ /*
+ * THP statistics is based on the source huge page.
+ * Capture required information that might get lost
+ * during migration.
+ */
+ is_thp = PageTransHuge(page) && !PageHuge(page);
+ nr_subpages = thp_nr_pages(page);
+ cond_resched();
+
+ if (PageHuge(page))
+ rc = unmap_and_move_huge_page(get_new_page,
+ put_new_page, private, page,
+ pass > 2, mode, reason);
+ else
+ rc = unmap_and_move(get_new_page, put_new_page,
+ private, page, pass > 2, mode,
+ reason);
+
+ switch(rc) {
+ case -ENOMEM:
+ /*
+ * THP migration might be unsupported or the
+ * allocation could've failed so we should
+ * retry on the same page with the THP split
+ * to base pages.
+ *
+ * Head page is retried immediately and tail
+ * pages are added to the tail of the list so
+ * we encounter them after the rest of the list
+ * is processed.
+ */
+ if (is_thp) {
+ lock_page(page);
+ rc = split_huge_page_to_list(page, from);
+ unlock_page(page);
+ if (!rc) {
+ list_safe_reset_next(page, page2, lru);
+ nr_thp_split++;
+ goto retry;
+ }
+
+ nr_thp_failed++;
+ nr_failed += nr_subpages;
+ goto out;
+ }
+ nr_failed++;
+ goto out;
+ case -EAGAIN:
+ if (is_thp) {
+ thp_retry++;
+ break;
+ }
+ retry++;
+ break;
+ case MIGRATEPAGE_SUCCESS:
+ if (is_thp) {
+ nr_thp_succeeded++;
+ nr_succeeded += nr_subpages;
+ break;
+ }
+ nr_succeeded++;
+ break;
+ default:
+ /*
+ * Permanent failure (-EBUSY, -ENOSYS, etc.):
+ * unlike -EAGAIN case, the failed page is
+ * removed from migration page list and not
+ * retried in the next outer loop.
+ */
+ if (is_thp) {
+ nr_thp_failed++;
+ nr_failed += nr_subpages;
+ break;
+ }
+ nr_failed++;
+ break;
+ }
+ }
+ }
+ nr_failed += retry + thp_retry;
+ nr_thp_failed += thp_retry;
+ rc = nr_failed;
+out:
+ count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
+ count_vm_events(PGMIGRATE_FAIL, nr_failed);
+ count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
+ count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed);
+ count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split);
+ trace_mm_migrate_pages(nr_succeeded, nr_failed, nr_thp_succeeded,
+ nr_thp_failed, nr_thp_split, mode, reason);
+
+ if (!swapwrite)
+ current->flags &= ~PF_SWAPWRITE;
+
+ return rc;
+}
+
+struct page *alloc_migration_target(struct page *page, unsigned long private)
+{
+ struct migration_target_control *mtc;
+ gfp_t gfp_mask;
+ unsigned int order = 0;
+ struct page *new_page = NULL;
+ int nid;
+ int zidx;
+
+ mtc = (struct migration_target_control *)private;
+ gfp_mask = mtc->gfp_mask;
+ nid = mtc->nid;
+ if (nid == NUMA_NO_NODE)
+ nid = page_to_nid(page);
+
+ if (PageHuge(page)) {
+ struct hstate *h = page_hstate(compound_head(page));
+
+ gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
+ return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask);
+ }
+
+ if (PageTransHuge(page)) {
+ /*
+ * clear __GFP_RECLAIM to make the migration callback
+ * consistent with regular THP allocations.
+ */
+ gfp_mask &= ~__GFP_RECLAIM;
+ gfp_mask |= GFP_TRANSHUGE;
+ order = HPAGE_PMD_ORDER;
+ }
+ zidx = zone_idx(page_zone(page));
+ if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
+ gfp_mask |= __GFP_HIGHMEM;
+
+ new_page = __alloc_pages_nodemask(gfp_mask, order, nid, mtc->nmask);
+
+ if (new_page && PageTransHuge(new_page))
+ prep_transhuge_page(new_page);
+
+ return new_page;
+}
+
+#ifdef CONFIG_NUMA
+
+static int store_status(int __user *status, int start, int value, int nr)
+{
+ while (nr-- > 0) {
+ if (put_user(value, status + start))
+ return -EFAULT;
+ start++;
+ }
+
+ return 0;
+}
+
+static int do_move_pages_to_node(struct mm_struct *mm,
+ struct list_head *pagelist, int node)
+{
+ int err;
+ struct migration_target_control mtc = {
+ .nid = node,
+ .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
+ };
+
+ err = migrate_pages(pagelist, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
+ if (err)
+ putback_movable_pages(pagelist);
+ return err;
+}
+
+/*
+ * Resolves the given address to a struct page, isolates it from the LRU and
+ * puts it to the given pagelist.
+ * Returns:
+ * errno - if the page cannot be found/isolated
+ * 0 - when it doesn't have to be migrated because it is already on the
+ * target node
+ * 1 - when it has been queued
+ */
+static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
+ int node, struct list_head *pagelist, bool migrate_all)
+{
+ struct vm_area_struct *vma;
+ struct page *page;
+ unsigned int follflags;
+ int err;
+
+ mmap_read_lock(mm);
+ err = -EFAULT;
+ vma = find_vma(mm, addr);
+ if (!vma || addr < vma->vm_start || !vma_migratable(vma))
+ goto out;
+
+ /* FOLL_DUMP to ignore special (like zero) pages */
+ follflags = FOLL_GET | FOLL_DUMP;
+ page = follow_page(vma, addr, follflags);
+
+ err = PTR_ERR(page);
+ if (IS_ERR(page))
+ goto out;
+
+ err = -ENOENT;
+ if (!page)
+ goto out;
+
+ err = 0;
+ if (page_to_nid(page) == node)
+ goto out_putpage;
+
+ err = -EACCES;
+ if (page_mapcount(page) > 1 && !migrate_all)
+ goto out_putpage;
+
+ if (PageHuge(page)) {
+ if (PageHead(page)) {
+ err = isolate_hugetlb(page, pagelist);
+ if (!err)
+ err = 1;
+ }
+ } else {
+ struct page *head;
+
+ head = compound_head(page);
+ err = isolate_lru_page(head);
+ if (err)
+ goto out_putpage;
+
+ err = 1;
+ list_add_tail(&head->lru, pagelist);
+ mod_node_page_state(page_pgdat(head),
+ NR_ISOLATED_ANON + page_is_file_lru(head),
+ thp_nr_pages(head));
+ }
+out_putpage:
+ /*
+ * Either remove the duplicate refcount from
+ * isolate_lru_page() or drop the page ref if it was
+ * not isolated.
+ */
+ put_page(page);
+out:
+ mmap_read_unlock(mm);
+ return err;
+}
+
+static int move_pages_and_store_status(struct mm_struct *mm, int node,
+ struct list_head *pagelist, int __user *status,
+ int start, int i, unsigned long nr_pages)
+{
+ int err;
+
+ if (list_empty(pagelist))
+ return 0;
+
+ err = do_move_pages_to_node(mm, pagelist, node);
+ if (err) {
+ /*
+ * Positive err means the number of failed
+ * pages to migrate. Since we are going to
+ * abort and return the number of non-migrated
+ * pages, so need to incude the rest of the
+ * nr_pages that have not been attempted as
+ * well.
+ */
+ if (err > 0)
+ err += nr_pages - i - 1;
+ return err;
+ }
+ return store_status(status, start, node, i - start);
+}
+
+/*
+ * Migrate an array of page address onto an array of nodes and fill
+ * the corresponding array of status.
+ */
+static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
+ unsigned long nr_pages,
+ const void __user * __user *pages,
+ const int __user *nodes,
+ int __user *status, int flags)
+{
+ int current_node = NUMA_NO_NODE;
+ LIST_HEAD(pagelist);
+ int start, i;
+ int err = 0, err1;
+
+ migrate_prep();
+
+ for (i = start = 0; i < nr_pages; i++) {
+ const void __user *p;
+ unsigned long addr;
+ int node;
+
+ err = -EFAULT;
+ if (get_user(p, pages + i))
+ goto out_flush;
+ if (get_user(node, nodes + i))
+ goto out_flush;
+ addr = (unsigned long)untagged_addr(p);
+
+ err = -ENODEV;
+ if (node < 0 || node >= MAX_NUMNODES)
+ goto out_flush;
+ if (!node_state(node, N_MEMORY))
+ goto out_flush;
+
+ err = -EACCES;
+ if (!node_isset(node, task_nodes))
+ goto out_flush;
+
+ if (current_node == NUMA_NO_NODE) {
+ current_node = node;
+ start = i;
+ } else if (node != current_node) {
+ err = move_pages_and_store_status(mm, current_node,
+ &pagelist, status, start, i, nr_pages);
+ if (err)
+ goto out;
+ start = i;
+ current_node = node;
+ }
+
+ /*
+ * Errors in the page lookup or isolation are not fatal and we simply
+ * report them via status
+ */
+ err = add_page_for_migration(mm, addr, current_node,
+ &pagelist, flags & MPOL_MF_MOVE_ALL);
+
+ if (err > 0) {
+ /* The page is successfully queued for migration */
+ continue;
+ }
+
+ /*
+ * If the page is already on the target node (!err), store the
+ * node, otherwise, store the err.
+ */
+ err = store_status(status, i, err ? : current_node, 1);
+ if (err)
+ goto out_flush;
+
+ err = move_pages_and_store_status(mm, current_node, &pagelist,
+ status, start, i, nr_pages);
+ if (err)
+ goto out;
+ current_node = NUMA_NO_NODE;
+ }
+out_flush:
+ /* Make sure we do not overwrite the existing error */
+ err1 = move_pages_and_store_status(mm, current_node, &pagelist,
+ status, start, i, nr_pages);
+ if (err >= 0)
+ err = err1;
+out:
+ return err;
+}
+
+/*
+ * Determine the nodes of an array of pages and store it in an array of status.
+ */
+static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
+ const void __user **pages, int *status)
+{
+ unsigned long i;
+
+ mmap_read_lock(mm);
+
+ for (i = 0; i < nr_pages; i++) {
+ unsigned long addr = (unsigned long)(*pages);
+ struct vm_area_struct *vma;
+ struct page *page;
+ int err = -EFAULT;
+
+ vma = find_vma(mm, addr);
+ if (!vma || addr < vma->vm_start)
+ goto set_status;
+
+ /* FOLL_DUMP to ignore special (like zero) pages */
+ page = follow_page(vma, addr, FOLL_DUMP);
+
+ err = PTR_ERR(page);
+ if (IS_ERR(page))
+ goto set_status;
+
+ err = page ? page_to_nid(page) : -ENOENT;
+set_status:
+ *status = err;
+
+ pages++;
+ status++;
+ }
+
+ mmap_read_unlock(mm);
+}
+
+/*
+ * Determine the nodes of a user array of pages and store it in
+ * a user array of status.
+ */
+static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
+ const void __user * __user *pages,
+ int __user *status)
+{
+#define DO_PAGES_STAT_CHUNK_NR 16
+ const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
+ int chunk_status[DO_PAGES_STAT_CHUNK_NR];
+
+ while (nr_pages) {
+ unsigned long chunk_nr;
+
+ chunk_nr = nr_pages;
+ if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
+ chunk_nr = DO_PAGES_STAT_CHUNK_NR;
+
+ if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages)))
+ break;
+
+ do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
+
+ if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
+ break;
+
+ pages += chunk_nr;
+ status += chunk_nr;
+ nr_pages -= chunk_nr;
+ }
+ return nr_pages ? -EFAULT : 0;
+}
+
+static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
+{
+ struct task_struct *task;
+ struct mm_struct *mm;
+
+ /*
+ * There is no need to check if current process has the right to modify
+ * the specified process when they are same.
+ */
+ if (!pid) {
+ mmget(current->mm);
+ *mem_nodes = cpuset_mems_allowed(current);
+ return current->mm;
+ }
+
+ /* Find the mm_struct */
+ rcu_read_lock();
+ task = find_task_by_vpid(pid);
+ if (!task) {
+ rcu_read_unlock();
+ return ERR_PTR(-ESRCH);
+ }
+ get_task_struct(task);
+
+ /*
+ * Check if this process has the right to modify the specified
+ * process. Use the regular "ptrace_may_access()" checks.
+ */
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
+ rcu_read_unlock();
+ mm = ERR_PTR(-EPERM);
+ goto out;
+ }
+ rcu_read_unlock();
+
+ mm = ERR_PTR(security_task_movememory(task));
+ if (IS_ERR(mm))
+ goto out;
+ *mem_nodes = cpuset_mems_allowed(task);
+ mm = get_task_mm(task);
+out:
+ put_task_struct(task);
+ if (!mm)
+ mm = ERR_PTR(-EINVAL);
+ return mm;
+}
+
+/*
+ * Move a list of pages in the address space of the currently executing
+ * process.
+ */
+static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
+ const void __user * __user *pages,
+ const int __user *nodes,
+ int __user *status, int flags)
+{
+ struct mm_struct *mm;
+ int err;
+ nodemask_t task_nodes;
+
+ /* Check flags */
+ if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
+ return -EINVAL;
+
+ if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
+ return -EPERM;
+
+ mm = find_mm_struct(pid, &task_nodes);
+ if (IS_ERR(mm))
+ return PTR_ERR(mm);
+
+ if (nodes)
+ err = do_pages_move(mm, task_nodes, nr_pages, pages,
+ nodes, status, flags);
+ else
+ err = do_pages_stat(mm, nr_pages, pages, status);
+
+ mmput(mm);
+ return err;
+}
+
+SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
+ const void __user * __user *, pages,
+ const int __user *, nodes,
+ int __user *, status, int, flags)
+{
+ return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags);
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages,
+ compat_uptr_t __user *, pages32,
+ const int __user *, nodes,
+ int __user *, status,
+ int, flags)
+{
+ const void __user * __user *pages;
+ int i;
+
+ pages = compat_alloc_user_space(nr_pages * sizeof(void *));
+ for (i = 0; i < nr_pages; i++) {
+ compat_uptr_t p;
+
+ if (get_user(p, pages32 + i) ||
+ put_user(compat_ptr(p), pages + i))
+ return -EFAULT;
+ }
+ return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags);
+}
+#endif /* CONFIG_COMPAT */
+
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * Returns true if this is a safe migration target node for misplaced NUMA
+ * pages. Currently it only checks the watermarks which crude
+ */
+static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
+ unsigned long nr_migrate_pages)
+{
+ int z;
+
+ for (z = pgdat->nr_zones - 1; z >= 0; z--) {
+ struct zone *zone = pgdat->node_zones + z;
+
+ if (!populated_zone(zone))
+ continue;
+
+ /* Avoid waking kswapd by allocating pages_to_migrate pages. */
+ if (!zone_watermark_ok(zone, 0,
+ high_wmark_pages(zone) +
+ nr_migrate_pages,
+ ZONE_MOVABLE, 0))
+ continue;
+ return true;
+ }
+ return false;
+}
+
+static struct page *alloc_misplaced_dst_page(struct page *page,
+ unsigned long data)
+{
+ int nid = (int) data;
+ struct page *newpage;
+
+ newpage = __alloc_pages_node(nid,
+ (GFP_HIGHUSER_MOVABLE |
+ __GFP_THISNODE | __GFP_NOMEMALLOC |
+ __GFP_NORETRY | __GFP_NOWARN) &
+ ~__GFP_RECLAIM, 0);
+
+ return newpage;
+}
+
+static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
+{
+ int page_lru;
+
+ VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
+
+ /* Avoid migrating to a node that is nearly full */
+ if (!migrate_balanced_pgdat(pgdat, compound_nr(page)))
+ return 0;
+
+ if (isolate_lru_page(page))
+ return 0;
+
+ /*
+ * migrate_misplaced_transhuge_page() skips page migration's usual
+ * check on page_count(), so we must do it here, now that the page
+ * has been isolated: a GUP pin, or any other pin, prevents migration.
+ * The expected page count is 3: 1 for page's mapcount and 1 for the
+ * caller's pin and 1 for the reference taken by isolate_lru_page().
+ */
+ if (PageTransHuge(page) && page_count(page) != 3) {
+ putback_lru_page(page);
+ return 0;
+ }
+
+ page_lru = page_is_file_lru(page);
+ mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
+ thp_nr_pages(page));
+
+ /*
+ * Isolating the page has taken another reference, so the
+ * caller's reference can be safely dropped without the page
+ * disappearing underneath us during migration.
+ */
+ put_page(page);
+ return 1;
+}
+
+bool pmd_trans_migrating(pmd_t pmd)
+{
+ struct page *page = pmd_page(pmd);
+ return PageLocked(page);
+}
+
+/*
+ * Attempt to migrate a misplaced page to the specified destination
+ * node. Caller is expected to have an elevated reference count on
+ * the page that will be dropped by this function before returning.
+ */
+int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
+ int node)
+{
+ pg_data_t *pgdat = NODE_DATA(node);
+ int isolated;
+ int nr_remaining;
+ LIST_HEAD(migratepages);
+
+ /*
+ * Don't migrate file pages that are mapped in multiple processes
+ * with execute permissions as they are probably shared libraries.
+ */
+ if (page_mapcount(page) != 1 && page_is_file_lru(page) &&
+ (vma->vm_flags & VM_EXEC))
+ goto out;
+
+ /*
+ * Also do not migrate dirty pages as not all filesystems can move
+ * dirty pages in MIGRATE_ASYNC mode which is a waste of cycles.
+ */
+ if (page_is_file_lru(page) && PageDirty(page))
+ goto out;
+
+ isolated = numamigrate_isolate_page(pgdat, page);
+ if (!isolated)
+ goto out;
+
+ list_add(&page->lru, &migratepages);
+ nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
+ NULL, node, MIGRATE_ASYNC,
+ MR_NUMA_MISPLACED);
+ if (nr_remaining) {
+ if (!list_empty(&migratepages)) {
+ list_del(&page->lru);
+ dec_node_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_lru(page));
+ putback_lru_page(page);
+ }
+ isolated = 0;
+ } else
+ count_vm_numa_event(NUMA_PAGE_MIGRATE);
+ BUG_ON(!list_empty(&migratepages));
+ return isolated;
+
+out:
+ put_page(page);
+ return 0;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
+/*
+ * Migrates a THP to a given target node. page must be locked and is unlocked
+ * before returning.
+ */
+int migrate_misplaced_transhuge_page(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ pmd_t *pmd, pmd_t entry,
+ unsigned long address,
+ struct page *page, int node)
+{
+ spinlock_t *ptl;
+ pg_data_t *pgdat = NODE_DATA(node);
+ int isolated = 0;
+ struct page *new_page = NULL;
+ int page_lru = page_is_file_lru(page);
+ unsigned long start = address & HPAGE_PMD_MASK;
+
+ new_page = alloc_pages_node(node,
+ (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
+ HPAGE_PMD_ORDER);
+ if (!new_page)
+ goto out_fail;
+ prep_transhuge_page(new_page);
+
+ isolated = numamigrate_isolate_page(pgdat, page);
+ if (!isolated) {
+ put_page(new_page);
+ goto out_fail;
+ }
+
+ /* Prepare a page as a migration target */
+ __SetPageLocked(new_page);
+ if (PageSwapBacked(page))
+ __SetPageSwapBacked(new_page);
+
+ /* anon mapping, we can simply copy page->mapping to the new page: */
+ new_page->mapping = page->mapping;
+ new_page->index = page->index;
+ /* flush the cache before copying using the kernel virtual address */
+ flush_cache_range(vma, start, start + HPAGE_PMD_SIZE);
+ migrate_page_copy(new_page, page);
+ WARN_ON(PageLRU(new_page));
+
+ /* Recheck the target PMD */
+ ptl = pmd_lock(mm, pmd);
+ if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) {
+ spin_unlock(ptl);
+
+ /* Reverse changes made by migrate_page_copy() */
+ if (TestClearPageActive(new_page))
+ SetPageActive(page);
+ if (TestClearPageUnevictable(new_page))
+ SetPageUnevictable(page);
+
+ unlock_page(new_page);
+ put_page(new_page); /* Free it */
+
+ /* Retake the callers reference and putback on LRU */
+ get_page(page);
+ putback_lru_page(page);
+ mod_node_page_state(page_pgdat(page),
+ NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
+
+ goto out_unlock;
+ }
+
+ entry = mk_huge_pmd(new_page, vma->vm_page_prot);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+
+ /*
+ * Overwrite the old entry under pagetable lock and establish
+ * the new PTE. Any parallel GUP will either observe the old
+ * page blocking on the page lock, block on the page table
+ * lock or observe the new page. The SetPageUptodate on the
+ * new page and page_add_new_anon_rmap guarantee the copy is
+ * visible before the pagetable update.
+ */
+ page_add_anon_rmap(new_page, vma, start, true);
+ /*
+ * At this point the pmd is numa/protnone (i.e. non present) and the TLB
+ * has already been flushed globally. So no TLB can be currently
+ * caching this non present pmd mapping. There's no need to clear the
+ * pmd before doing set_pmd_at(), nor to flush the TLB after
+ * set_pmd_at(). Clearing the pmd here would introduce a race
+ * condition against MADV_DONTNEED, because MADV_DONTNEED only holds the
+ * mmap_lock for reading. If the pmd is set to NULL at any given time,
+ * MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this
+ * pmd.
+ */
+ set_pmd_at(mm, start, pmd, entry);
+ update_mmu_cache_pmd(vma, address, &entry);
+
+ page_ref_unfreeze(page, 2);
+ mlock_migrate_page(new_page, page);
+ page_remove_rmap(page, true);
+ set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
+
+ spin_unlock(ptl);
+
+ /* Take an "isolate" reference and put new page on the LRU. */
+ get_page(new_page);
+ putback_lru_page(new_page);
+
+ unlock_page(new_page);
+ unlock_page(page);
+ put_page(page); /* Drop the rmap reference */
+ put_page(page); /* Drop the LRU isolation reference */
+
+ count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
+ count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
+
+ mod_node_page_state(page_pgdat(page),
+ NR_ISOLATED_ANON + page_lru,
+ -HPAGE_PMD_NR);
+ return isolated;
+
+out_fail:
+ count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
+ ptl = pmd_lock(mm, pmd);
+ if (pmd_same(*pmd, entry)) {
+ entry = pmd_modify(entry, vma->vm_page_prot);
+ set_pmd_at(mm, start, pmd, entry);
+ update_mmu_cache_pmd(vma, address, &entry);
+ }
+ spin_unlock(ptl);
+
+out_unlock:
+ unlock_page(page);
+ put_page(page);
+ return 0;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+#endif /* CONFIG_NUMA */
+
+#ifdef CONFIG_DEVICE_PRIVATE
+static int migrate_vma_collect_hole(unsigned long start,
+ unsigned long end,
+ __always_unused int depth,
+ struct mm_walk *walk)
+{
+ struct migrate_vma *migrate = walk->private;
+ unsigned long addr;
+
+ /* Only allow populating anonymous memory. */
+ if (!vma_is_anonymous(walk->vma)) {
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
+ migrate->src[migrate->npages] = 0;
+ migrate->dst[migrate->npages] = 0;
+ migrate->npages++;
+ }
+ return 0;
+ }
+
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
+ migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
+ migrate->dst[migrate->npages] = 0;
+ migrate->npages++;
+ migrate->cpages++;
+ }
+
+ return 0;
+}
+
+static int migrate_vma_collect_skip(unsigned long start,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct migrate_vma *migrate = walk->private;
+ unsigned long addr;
+
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
+ migrate->dst[migrate->npages] = 0;
+ migrate->src[migrate->npages++] = 0;
+ }
+
+ return 0;
+}
+
+static int migrate_vma_collect_pmd(pmd_t *pmdp,
+ unsigned long start,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct migrate_vma *migrate = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long addr = start, unmapped = 0;
+ spinlock_t *ptl;
+ pte_t *ptep;
+
+again:
+ if (pmd_none(*pmdp))
+ return migrate_vma_collect_hole(start, end, -1, walk);
+
+ if (pmd_trans_huge(*pmdp)) {
+ struct page *page;
+
+ ptl = pmd_lock(mm, pmdp);
+ if (unlikely(!pmd_trans_huge(*pmdp))) {
+ spin_unlock(ptl);
+ goto again;
+ }
+
+ page = pmd_page(*pmdp);
+ if (is_huge_zero_page(page)) {
+ spin_unlock(ptl);
+ split_huge_pmd(vma, pmdp, addr);
+ if (pmd_trans_unstable(pmdp))
+ return migrate_vma_collect_skip(start, end,
+ walk);
+ } else {
+ int ret;
+
+ get_page(page);
+ spin_unlock(ptl);
+ if (unlikely(!trylock_page(page)))
+ return migrate_vma_collect_skip(start, end,
+ walk);
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ if (ret)
+ return migrate_vma_collect_skip(start, end,
+ walk);
+ if (pmd_none(*pmdp))
+ return migrate_vma_collect_hole(start, end, -1,
+ walk);
+ }
+ }
+
+ if (unlikely(pmd_bad(*pmdp)))
+ return migrate_vma_collect_skip(start, end, walk);
+
+ ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ arch_enter_lazy_mmu_mode();
+
+ for (; addr < end; addr += PAGE_SIZE, ptep++) {
+ unsigned long mpfn = 0, pfn;
+ struct page *page;
+ swp_entry_t entry;
+ pte_t pte;
+
+ pte = *ptep;
+
+ if (pte_none(pte)) {
+ if (vma_is_anonymous(vma)) {
+ mpfn = MIGRATE_PFN_MIGRATE;
+ migrate->cpages++;
+ }
+ goto next;
+ }
+
+ if (!pte_present(pte)) {
+ /*
+ * Only care about unaddressable device page special
+ * page table entry. Other special swap entries are not
+ * migratable, and we ignore regular swapped page.
+ */
+ entry = pte_to_swp_entry(pte);
+ if (!is_device_private_entry(entry))
+ goto next;
+
+ page = device_private_entry_to_page(entry);
+ if (!(migrate->flags &
+ MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
+ page->pgmap->owner != migrate->pgmap_owner)
+ goto next;
+
+ mpfn = migrate_pfn(page_to_pfn(page)) |
+ MIGRATE_PFN_MIGRATE;
+ if (is_write_device_private_entry(entry))
+ mpfn |= MIGRATE_PFN_WRITE;
+ } else {
+ if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
+ goto next;
+ pfn = pte_pfn(pte);
+ if (is_zero_pfn(pfn)) {
+ mpfn = MIGRATE_PFN_MIGRATE;
+ migrate->cpages++;
+ goto next;
+ }
+ page = vm_normal_page(migrate->vma, addr, pte);
+ mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
+ mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
+ }
+
+ /* FIXME support THP */
+ if (!page || !page->mapping || PageTransCompound(page)) {
+ mpfn = 0;
+ goto next;
+ }
+
+ /*
+ * By getting a reference on the page we pin it and that blocks
+ * any kind of migration. Side effect is that it "freezes" the
+ * pte.
+ *
+ * We drop this reference after isolating the page from the lru
+ * for non device page (device page are not on the lru and thus
+ * can't be dropped from it).
+ */
+ get_page(page);
+ migrate->cpages++;
+
+ /*
+ * Optimize for the common case where page is only mapped once
+ * in one process. If we can lock the page, then we can safely
+ * set up a special migration page table entry now.
+ */
+ if (trylock_page(page)) {
+ pte_t swp_pte;
+
+ mpfn |= MIGRATE_PFN_LOCKED;
+ ptep_get_and_clear(mm, addr, ptep);
+
+ /* Setup special migration page table entry */
+ entry = make_migration_entry(page, mpfn &
+ MIGRATE_PFN_WRITE);
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_present(pte)) {
+ if (pte_soft_dirty(pte))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pte))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ } else {
+ if (pte_swp_soft_dirty(pte))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_swp_uffd_wp(pte))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ }
+ set_pte_at(mm, addr, ptep, swp_pte);
+
+ /*
+ * This is like regular unmap: we remove the rmap and
+ * drop page refcount. Page won't be freed, as we took
+ * a reference just above.
+ */
+ page_remove_rmap(page, false);
+ put_page(page);
+
+ if (pte_present(pte))
+ unmapped++;
+ }
+
+next:
+ migrate->dst[migrate->npages] = 0;
+ migrate->src[migrate->npages++] = mpfn;
+ }
+
+ /* Only flush the TLB if we actually modified any entries */
+ if (unmapped)
+ flush_tlb_range(walk->vma, start, end);
+
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(ptep - 1, ptl);
+
+ return 0;
+}
+
+static const struct mm_walk_ops migrate_vma_walk_ops = {
+ .pmd_entry = migrate_vma_collect_pmd,
+ .pte_hole = migrate_vma_collect_hole,
+};
+
+/*
+ * migrate_vma_collect() - collect pages over a range of virtual addresses
+ * @migrate: migrate struct containing all migration information
+ *
+ * This will walk the CPU page table. For each virtual address backed by a
+ * valid page, it updates the src array and takes a reference on the page, in
+ * order to pin the page until we lock it and unmap it.
+ */
+static void migrate_vma_collect(struct migrate_vma *migrate)
+{
+ struct mmu_notifier_range range;
+
+ /*
+ * Note that the pgmap_owner is passed to the mmu notifier callback so
+ * that the registered device driver can skip invalidating device
+ * private page mappings that won't be migrated.
+ */
+ mmu_notifier_range_init_migrate(&range, 0, migrate->vma,
+ migrate->vma->vm_mm, migrate->start, migrate->end,
+ migrate->pgmap_owner);
+ mmu_notifier_invalidate_range_start(&range);
+
+ walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
+ &migrate_vma_walk_ops, migrate);
+
+ mmu_notifier_invalidate_range_end(&range);
+ migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
+}
+
+/*
+ * migrate_vma_check_page() - check if page is pinned or not
+ * @page: struct page to check
+ *
+ * Pinned pages cannot be migrated. This is the same test as in
+ * migrate_page_move_mapping(), except that here we allow migration of a
+ * ZONE_DEVICE page.
+ */
+static bool migrate_vma_check_page(struct page *page)
+{
+ /*
+ * One extra ref because caller holds an extra reference, either from
+ * isolate_lru_page() for a regular page, or migrate_vma_collect() for
+ * a device page.
+ */
+ int extra = 1;
+
+ /*
+ * FIXME support THP (transparent huge page), it is bit more complex to
+ * check them than regular pages, because they can be mapped with a pmd
+ * or with a pte (split pte mapping).
+ */
+ if (PageCompound(page))
+ return false;
+
+ /* Page from ZONE_DEVICE have one extra reference */
+ if (is_zone_device_page(page)) {
+ /*
+ * Private page can never be pin as they have no valid pte and
+ * GUP will fail for those. Yet if there is a pending migration
+ * a thread might try to wait on the pte migration entry and
+ * will bump the page reference count. Sadly there is no way to
+ * differentiate a regular pin from migration wait. Hence to
+ * avoid 2 racing thread trying to migrate back to CPU to enter
+ * infinite loop (one stoping migration because the other is
+ * waiting on pte migration entry). We always return true here.
+ *
+ * FIXME proper solution is to rework migration_entry_wait() so
+ * it does not need to take a reference on page.
+ */
+ return is_device_private_page(page);
+ }
+
+ /* For file back page */
+ if (page_mapping(page))
+ extra += 1 + page_has_private(page);
+
+ if ((page_count(page) - extra) > page_mapcount(page))
+ return false;
+
+ return true;
+}
+
+/*
+ * migrate_vma_prepare() - lock pages and isolate them from the lru
+ * @migrate: migrate struct containing all migration information
+ *
+ * This locks pages that have been collected by migrate_vma_collect(). Once each
+ * page is locked it is isolated from the lru (for non-device pages). Finally,
+ * the ref taken by migrate_vma_collect() is dropped, as locked pages cannot be
+ * migrated by concurrent kernel threads.
+ */
+static void migrate_vma_prepare(struct migrate_vma *migrate)
+{
+ const unsigned long npages = migrate->npages;
+ const unsigned long start = migrate->start;
+ unsigned long addr, i, restore = 0;
+ bool allow_drain = true;
+
+ lru_add_drain();
+
+ for (i = 0; (i < npages) && migrate->cpages; i++) {
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+ bool remap = true;
+
+ if (!page)
+ continue;
+
+ if (!(migrate->src[i] & MIGRATE_PFN_LOCKED)) {
+ /*
+ * Because we are migrating several pages there can be
+ * a deadlock between 2 concurrent migration where each
+ * are waiting on each other page lock.
+ *
+ * Make migrate_vma() a best effort thing and backoff
+ * for any page we can not lock right away.
+ */
+ if (!trylock_page(page)) {
+ migrate->src[i] = 0;
+ migrate->cpages--;
+ put_page(page);
+ continue;
+ }
+ remap = false;
+ migrate->src[i] |= MIGRATE_PFN_LOCKED;
+ }
+
+ /* ZONE_DEVICE pages are not on LRU */
+ if (!is_zone_device_page(page)) {
+ if (!PageLRU(page) && allow_drain) {
+ /* Drain CPU's pagevec */
+ lru_add_drain_all();
+ allow_drain = false;
+ }
+
+ if (isolate_lru_page(page)) {
+ if (remap) {
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ migrate->cpages--;
+ restore++;
+ } else {
+ migrate->src[i] = 0;
+ unlock_page(page);
+ migrate->cpages--;
+ put_page(page);
+ }
+ continue;
+ }
+
+ /* Drop the reference we took in collect */
+ put_page(page);
+ }
+
+ if (!migrate_vma_check_page(page)) {
+ if (remap) {
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ migrate->cpages--;
+ restore++;
+
+ if (!is_zone_device_page(page)) {
+ get_page(page);
+ putback_lru_page(page);
+ }
+ } else {
+ migrate->src[i] = 0;
+ unlock_page(page);
+ migrate->cpages--;
+
+ if (!is_zone_device_page(page))
+ putback_lru_page(page);
+ else
+ put_page(page);
+ }
+ }
+ }
+
+ for (i = 0, addr = start; i < npages && restore; i++, addr += PAGE_SIZE) {
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+
+ if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
+ continue;
+
+ remove_migration_pte(page, migrate->vma, addr, page);
+
+ migrate->src[i] = 0;
+ unlock_page(page);
+ put_page(page);
+ restore--;
+ }
+}
+
+/*
+ * migrate_vma_unmap() - replace page mapping with special migration pte entry
+ * @migrate: migrate struct containing all migration information
+ *
+ * Replace page mapping (CPU page table pte) with a special migration pte entry
+ * and check again if it has been pinned. Pinned pages are restored because we
+ * cannot migrate them.
+ *
+ * This is the last step before we call the device driver callback to allocate
+ * destination memory and copy contents of original page over to new page.
+ */
+static void migrate_vma_unmap(struct migrate_vma *migrate)
+{
+ int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK;
+ const unsigned long npages = migrate->npages;
+ const unsigned long start = migrate->start;
+ unsigned long addr, i, restore = 0;
+
+ for (i = 0; i < npages; i++) {
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+
+ if (!page || !(migrate->src[i] & MIGRATE_PFN_MIGRATE))
+ continue;
+
+ if (page_mapped(page)) {
+ try_to_unmap(page, flags);
+ if (page_mapped(page))
+ goto restore;
+ }
+
+ if (migrate_vma_check_page(page))
+ continue;
+
+restore:
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ migrate->cpages--;
+ restore++;
+ }
+
+ for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) {
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+
+ if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
+ continue;
+
+ remove_migration_ptes(page, page, false);
+
+ migrate->src[i] = 0;
+ unlock_page(page);
+ restore--;
+
+ if (is_zone_device_page(page))
+ put_page(page);
+ else
+ putback_lru_page(page);
+ }
+}
+
+/**
+ * migrate_vma_setup() - prepare to migrate a range of memory
+ * @args: contains the vma, start, and pfns arrays for the migration
+ *
+ * Returns: negative errno on failures, 0 when 0 or more pages were migrated
+ * without an error.
+ *
+ * Prepare to migrate a range of memory virtual address range by collecting all
+ * the pages backing each virtual address in the range, saving them inside the
+ * src array. Then lock those pages and unmap them. Once the pages are locked
+ * and unmapped, check whether each page is pinned or not. Pages that aren't
+ * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the
+ * corresponding src array entry. Then restores any pages that are pinned, by
+ * remapping and unlocking those pages.
+ *
+ * The caller should then allocate destination memory and copy source memory to
+ * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE
+ * flag set). Once these are allocated and copied, the caller must update each
+ * corresponding entry in the dst array with the pfn value of the destination
+ * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_LOCKED flags set
+ * (destination pages must have their struct pages locked, via lock_page()).
+ *
+ * Note that the caller does not have to migrate all the pages that are marked
+ * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from
+ * device memory to system memory. If the caller cannot migrate a device page
+ * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe
+ * consequences for the userspace process, so it must be avoided if at all
+ * possible.
+ *
+ * For empty entries inside CPU page table (pte_none() or pmd_none() is true) we
+ * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus
+ * allowing the caller to allocate device memory for those unback virtual
+ * address. For this the caller simply has to allocate device memory and
+ * properly set the destination entry like for regular migration. Note that
+ * this can still fails and thus inside the device driver must check if the
+ * migration was successful for those entries after calling migrate_vma_pages()
+ * just like for regular migration.
+ *
+ * After that, the callers must call migrate_vma_pages() to go over each entry
+ * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
+ * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
+ * then migrate_vma_pages() to migrate struct page information from the source
+ * struct page to the destination struct page. If it fails to migrate the
+ * struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the
+ * src array.
+ *
+ * At this point all successfully migrated pages have an entry in the src
+ * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
+ * array entry with MIGRATE_PFN_VALID flag set.
+ *
+ * Once migrate_vma_pages() returns the caller may inspect which pages were
+ * successfully migrated, and which were not. Successfully migrated pages will
+ * have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
+ *
+ * It is safe to update device page table after migrate_vma_pages() because
+ * both destination and source page are still locked, and the mmap_lock is held
+ * in read mode (hence no one can unmap the range being migrated).
+ *
+ * Once the caller is done cleaning up things and updating its page table (if it
+ * chose to do so, this is not an obligation) it finally calls
+ * migrate_vma_finalize() to update the CPU page table to point to new pages
+ * for successfully migrated pages or otherwise restore the CPU page table to
+ * point to the original source pages.
+ */
+int migrate_vma_setup(struct migrate_vma *args)
+{
+ long nr_pages = (args->end - args->start) >> PAGE_SHIFT;
+
+ args->start &= PAGE_MASK;
+ args->end &= PAGE_MASK;
+ if (!args->vma || is_vm_hugetlb_page(args->vma) ||
+ (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
+ return -EINVAL;
+ if (nr_pages <= 0)
+ return -EINVAL;
+ if (args->start < args->vma->vm_start ||
+ args->start >= args->vma->vm_end)
+ return -EINVAL;
+ if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end)
+ return -EINVAL;
+ if (!args->src || !args->dst)
+ return -EINVAL;
+
+ memset(args->src, 0, sizeof(*args->src) * nr_pages);
+ args->cpages = 0;
+ args->npages = 0;
+
+ migrate_vma_collect(args);
+
+ if (args->cpages)
+ migrate_vma_prepare(args);
+ if (args->cpages)
+ migrate_vma_unmap(args);
+
+ /*
+ * At this point pages are locked and unmapped, and thus they have
+ * stable content and can safely be copied to destination memory that
+ * is allocated by the drivers.
+ */
+ return 0;
+
+}
+EXPORT_SYMBOL(migrate_vma_setup);
+
+/*
+ * This code closely matches the code in:
+ * __handle_mm_fault()
+ * handle_pte_fault()
+ * do_anonymous_page()
+ * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
+ * private page.
+ */
+static void migrate_vma_insert_page(struct migrate_vma *migrate,
+ unsigned long addr,
+ struct page *page,
+ unsigned long *src,
+ unsigned long *dst)
+{
+ struct vm_area_struct *vma = migrate->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ bool flush = false;
+ spinlock_t *ptl;
+ pte_t entry;
+ pgd_t *pgdp;
+ p4d_t *p4dp;
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep;
+
+ /* Only allow populating anonymous memory */
+ if (!vma_is_anonymous(vma))
+ goto abort;
+
+ pgdp = pgd_offset(mm, addr);
+ p4dp = p4d_alloc(mm, pgdp, addr);
+ if (!p4dp)
+ goto abort;
+ pudp = pud_alloc(mm, p4dp, addr);
+ if (!pudp)
+ goto abort;
+ pmdp = pmd_alloc(mm, pudp, addr);
+ if (!pmdp)
+ goto abort;
+
+ if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp))
+ goto abort;
+
+ /*
+ * Use pte_alloc() instead of pte_alloc_map(). We can't run
+ * pte_offset_map() on pmds where a huge pmd might be created
+ * from a different thread.
+ *
+ * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
+ * parallel threads are excluded by other means.
+ *
+ * Here we only have mmap_read_lock(mm).
+ */
+ if (pte_alloc(mm, pmdp))
+ goto abort;
+
+ /* See the comment in pte_alloc_one_map() */
+ if (unlikely(pmd_trans_unstable(pmdp)))
+ goto abort;
+
+ if (unlikely(anon_vma_prepare(vma)))
+ goto abort;
+ if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
+ goto abort;
+
+ /*
+ * The memory barrier inside __SetPageUptodate makes sure that
+ * preceding stores to the page contents become visible before
+ * the set_pte_at() write.
+ */
+ __SetPageUptodate(page);
+
+ if (is_zone_device_page(page)) {
+ if (is_device_private_page(page)) {
+ swp_entry_t swp_entry;
+
+ swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
+ entry = swp_entry_to_pte(swp_entry);
+ } else {
+ /*
+ * For now we only support migrating to un-addressable
+ * device memory.
+ */
+ pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
+ goto abort;
+ }
+ } else {
+ entry = mk_pte(page, vma->vm_page_prot);
+ if (vma->vm_flags & VM_WRITE)
+ entry = pte_mkwrite(pte_mkdirty(entry));
+ }
+
+ ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+
+ if (check_stable_address_space(mm))
+ goto unlock_abort;
+
+ if (pte_present(*ptep)) {
+ unsigned long pfn = pte_pfn(*ptep);
+
+ if (!is_zero_pfn(pfn))
+ goto unlock_abort;
+ flush = true;
+ } else if (!pte_none(*ptep))
+ goto unlock_abort;
+
+ /*
+ * Check for userfaultfd but do not deliver the fault. Instead,
+ * just back off.
+ */
+ if (userfaultfd_missing(vma))
+ goto unlock_abort;
+
+ inc_mm_counter(mm, MM_ANONPAGES);
+ page_add_new_anon_rmap(page, vma, addr, false);
+ if (!is_zone_device_page(page))
+ lru_cache_add_inactive_or_unevictable(page, vma);
+ get_page(page);
+
+ if (flush) {
+ flush_cache_page(vma, addr, pte_pfn(*ptep));
+ ptep_clear_flush_notify(vma, addr, ptep);
+ set_pte_at_notify(mm, addr, ptep, entry);
+ update_mmu_cache(vma, addr, ptep);
+ } else {
+ /* No need to invalidate - it was non-present before */
+ set_pte_at(mm, addr, ptep, entry);
+ update_mmu_cache(vma, addr, ptep);
+ }
+
+ pte_unmap_unlock(ptep, ptl);
+ *src = MIGRATE_PFN_MIGRATE;
+ return;
+
+unlock_abort:
+ pte_unmap_unlock(ptep, ptl);
+abort:
+ *src &= ~MIGRATE_PFN_MIGRATE;
+}
+
+/**
+ * migrate_vma_pages() - migrate meta-data from src page to dst page
+ * @migrate: migrate struct containing all migration information
+ *
+ * This migrates struct page meta-data from source struct page to destination
+ * struct page. This effectively finishes the migration from source page to the
+ * destination page.
+ */
+void migrate_vma_pages(struct migrate_vma *migrate)
+{
+ const unsigned long npages = migrate->npages;
+ const unsigned long start = migrate->start;
+ struct mmu_notifier_range range;
+ unsigned long addr, i;
+ bool notified = false;
+
+ for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
+ struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+ struct address_space *mapping;
+ int r;
+
+ if (!newpage) {
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ continue;
+ }
+
+ if (!page) {
+ if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
+ continue;
+ if (!notified) {
+ notified = true;
+
+ mmu_notifier_range_init(&range,
+ MMU_NOTIFY_CLEAR, 0,
+ NULL,
+ migrate->vma->vm_mm,
+ addr, migrate->end);
+ mmu_notifier_invalidate_range_start(&range);
+ }
+ migrate_vma_insert_page(migrate, addr, newpage,
+ &migrate->src[i],
+ &migrate->dst[i]);
+ continue;
+ }
+
+ mapping = page_mapping(page);
+
+ if (is_zone_device_page(newpage)) {
+ if (is_device_private_page(newpage)) {
+ /*
+ * For now only support private anonymous when
+ * migrating to un-addressable device memory.
+ */
+ if (mapping) {
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ continue;
+ }
+ } else {
+ /*
+ * Other types of ZONE_DEVICE page are not
+ * supported.
+ */
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ continue;
+ }
+ }
+
+ r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
+ if (r != MIGRATEPAGE_SUCCESS)
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ }
+
+ /*
+ * No need to double call mmu_notifier->invalidate_range() callback as
+ * the above ptep_clear_flush_notify() inside migrate_vma_insert_page()
+ * did already call it.
+ */
+ if (notified)
+ mmu_notifier_invalidate_range_only_end(&range);
+}
+EXPORT_SYMBOL(migrate_vma_pages);
+
+/**
+ * migrate_vma_finalize() - restore CPU page table entry
+ * @migrate: migrate struct containing all migration information
+ *
+ * This replaces the special migration pte entry with either a mapping to the
+ * new page if migration was successful for that page, or to the original page
+ * otherwise.
+ *
+ * This also unlocks the pages and puts them back on the lru, or drops the extra
+ * refcount, for device pages.
+ */
+void migrate_vma_finalize(struct migrate_vma *migrate)
+{
+ const unsigned long npages = migrate->npages;
+ unsigned long i;
+
+ for (i = 0; i < npages; i++) {
+ struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+
+ if (!page) {
+ if (newpage) {
+ unlock_page(newpage);
+ put_page(newpage);
+ }
+ continue;
+ }
+
+ if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
+ if (newpage) {
+ unlock_page(newpage);
+ put_page(newpage);
+ }
+ newpage = page;
+ }
+
+ remove_migration_ptes(page, newpage, false);
+ unlock_page(page);
+
+ if (is_zone_device_page(page))
+ put_page(page);
+ else
+ putback_lru_page(page);
+
+ if (newpage != page) {
+ unlock_page(newpage);
+ if (is_zone_device_page(newpage))
+ put_page(newpage);
+ else
+ putback_lru_page(newpage);
+ }
+ }
+}
+EXPORT_SYMBOL(migrate_vma_finalize);
+#endif /* CONFIG_DEVICE_PRIVATE */
diff --git a/mm/mincore.c b/mm/mincore.c
new file mode 100644
index 000000000..02db1a834
--- /dev/null
+++ b/mm/mincore.c
@@ -0,0 +1,280 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/mm/mincore.c
+ *
+ * Copyright (C) 1994-2006 Linus Torvalds
+ */
+
+/*
+ * The mincore() system call.
+ */
+#include <linux/pagemap.h>
+#include <linux/gfp.h>
+#include <linux/pagewalk.h>
+#include <linux/mman.h>
+#include <linux/syscalls.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/shmem_fs.h>
+#include <linux/hugetlb.h>
+#include <linux/pgtable.h>
+
+#include <linux/uaccess.h>
+
+static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+ unsigned char present;
+ unsigned char *vec = walk->private;
+
+ /*
+ * Hugepages under user process are always in RAM and never
+ * swapped out, but theoretically it needs to be checked.
+ */
+ present = pte && !huge_pte_none(huge_ptep_get(pte));
+ for (; addr != end; vec++, addr += PAGE_SIZE)
+ *vec = present;
+ walk->private = vec;
+#else
+ BUG();
+#endif
+ return 0;
+}
+
+/*
+ * Later we can get more picky about what "in core" means precisely.
+ * For now, simply check to see if the page is in the page cache,
+ * and is up to date; i.e. that no page-in operation would be required
+ * at this time if an application were to map and access this page.
+ */
+static unsigned char mincore_page(struct address_space *mapping, pgoff_t index)
+{
+ unsigned char present = 0;
+ struct page *page;
+
+ /*
+ * When tmpfs swaps out a page from a file, any process mapping that
+ * file will not get a swp_entry_t in its pte, but rather it is like
+ * any other file mapping (ie. marked !present and faulted in with
+ * tmpfs's .fault). So swapped out tmpfs mappings are tested here.
+ */
+ page = find_get_incore_page(mapping, index);
+ if (page) {
+ present = PageUptodate(page);
+ put_page(page);
+ }
+
+ return present;
+}
+
+static int __mincore_unmapped_range(unsigned long addr, unsigned long end,
+ struct vm_area_struct *vma, unsigned char *vec)
+{
+ unsigned long nr = (end - addr) >> PAGE_SHIFT;
+ int i;
+
+ if (vma->vm_file) {
+ pgoff_t pgoff;
+
+ pgoff = linear_page_index(vma, addr);
+ for (i = 0; i < nr; i++, pgoff++)
+ vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff);
+ } else {
+ for (i = 0; i < nr; i++)
+ vec[i] = 0;
+ }
+ return nr;
+}
+
+static int mincore_unmapped_range(unsigned long addr, unsigned long end,
+ __always_unused int depth,
+ struct mm_walk *walk)
+{
+ walk->private += __mincore_unmapped_range(addr, end,
+ walk->vma, walk->private);
+ return 0;
+}
+
+static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ spinlock_t *ptl;
+ struct vm_area_struct *vma = walk->vma;
+ pte_t *ptep;
+ unsigned char *vec = walk->private;
+ int nr = (end - addr) >> PAGE_SHIFT;
+
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
+ memset(vec, 1, nr);
+ spin_unlock(ptl);
+ goto out;
+ }
+
+ if (pmd_trans_unstable(pmd)) {
+ __mincore_unmapped_range(addr, end, vma, vec);
+ goto out;
+ }
+
+ ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ for (; addr != end; ptep++, addr += PAGE_SIZE) {
+ pte_t pte = *ptep;
+
+ if (pte_none(pte))
+ __mincore_unmapped_range(addr, addr + PAGE_SIZE,
+ vma, vec);
+ else if (pte_present(pte))
+ *vec = 1;
+ else { /* pte is a swap entry */
+ swp_entry_t entry = pte_to_swp_entry(pte);
+
+ if (non_swap_entry(entry)) {
+ /*
+ * migration or hwpoison entries are always
+ * uptodate
+ */
+ *vec = 1;
+ } else {
+#ifdef CONFIG_SWAP
+ *vec = mincore_page(swap_address_space(entry),
+ swp_offset(entry));
+#else
+ WARN_ON(1);
+ *vec = 1;
+#endif
+ }
+ }
+ vec++;
+ }
+ pte_unmap_unlock(ptep - 1, ptl);
+out:
+ walk->private += nr;
+ cond_resched();
+ return 0;
+}
+
+static inline bool can_do_mincore(struct vm_area_struct *vma)
+{
+ if (vma_is_anonymous(vma))
+ return true;
+ if (!vma->vm_file)
+ return false;
+ /*
+ * Reveal pagecache information only for non-anonymous mappings that
+ * correspond to the files the calling process could (if tried) open
+ * for writing; otherwise we'd be including shared non-exclusive
+ * mappings, which opens a side channel.
+ */
+ return inode_owner_or_capable(file_inode(vma->vm_file)) ||
+ inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
+}
+
+static const struct mm_walk_ops mincore_walk_ops = {
+ .pmd_entry = mincore_pte_range,
+ .pte_hole = mincore_unmapped_range,
+ .hugetlb_entry = mincore_hugetlb,
+};
+
+/*
+ * Do a chunk of "sys_mincore()". We've already checked
+ * all the arguments, we hold the mmap semaphore: we should
+ * just return the amount of info we're asked for.
+ */
+static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *vec)
+{
+ struct vm_area_struct *vma;
+ unsigned long end;
+ int err;
+
+ vma = find_vma(current->mm, addr);
+ if (!vma || addr < vma->vm_start)
+ return -ENOMEM;
+ end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
+ if (!can_do_mincore(vma)) {
+ unsigned long pages = DIV_ROUND_UP(end - addr, PAGE_SIZE);
+ memset(vec, 1, pages);
+ return pages;
+ }
+ err = walk_page_range(vma->vm_mm, addr, end, &mincore_walk_ops, vec);
+ if (err < 0)
+ return err;
+ return (end - addr) >> PAGE_SHIFT;
+}
+
+/*
+ * The mincore(2) system call.
+ *
+ * mincore() returns the memory residency status of the pages in the
+ * current process's address space specified by [addr, addr + len).
+ * The status is returned in a vector of bytes. The least significant
+ * bit of each byte is 1 if the referenced page is in memory, otherwise
+ * it is zero.
+ *
+ * Because the status of a page can change after mincore() checks it
+ * but before it returns to the application, the returned vector may
+ * contain stale information. Only locked pages are guaranteed to
+ * remain in memory.
+ *
+ * return values:
+ * zero - success
+ * -EFAULT - vec points to an illegal address
+ * -EINVAL - addr is not a multiple of PAGE_SIZE
+ * -ENOMEM - Addresses in the range [addr, addr + len] are
+ * invalid for the address space of this process, or
+ * specify one or more pages which are not currently
+ * mapped
+ * -EAGAIN - A kernel resource was temporarily unavailable.
+ */
+SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
+ unsigned char __user *, vec)
+{
+ long retval;
+ unsigned long pages;
+ unsigned char *tmp;
+
+ start = untagged_addr(start);
+
+ /* Check the start address: needs to be page-aligned.. */
+ if (start & ~PAGE_MASK)
+ return -EINVAL;
+
+ /* ..and we need to be passed a valid user-space range */
+ if (!access_ok((void __user *) start, len))
+ return -ENOMEM;
+
+ /* This also avoids any overflows on PAGE_ALIGN */
+ pages = len >> PAGE_SHIFT;
+ pages += (offset_in_page(len)) != 0;
+
+ if (!access_ok(vec, pages))
+ return -EFAULT;
+
+ tmp = (void *) __get_free_page(GFP_USER);
+ if (!tmp)
+ return -EAGAIN;
+
+ retval = 0;
+ while (pages) {
+ /*
+ * Do at most PAGE_SIZE entries per iteration, due to
+ * the temporary buffer size.
+ */
+ mmap_read_lock(current->mm);
+ retval = do_mincore(start, min(pages, PAGE_SIZE), tmp);
+ mmap_read_unlock(current->mm);
+
+ if (retval <= 0)
+ break;
+ if (copy_to_user(vec, tmp, retval)) {
+ retval = -EFAULT;
+ break;
+ }
+ pages -= retval;
+ vec += retval;
+ start += retval << PAGE_SHIFT;
+ retval = 0;
+ }
+ free_page((unsigned long) tmp);
+ return retval;
+}
diff --git a/mm/mlock.c b/mm/mlock.c
new file mode 100644
index 000000000..884b1216d
--- /dev/null
+++ b/mm/mlock.c
@@ -0,0 +1,878 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/mm/mlock.c
+ *
+ * (C) Copyright 1995 Linus Torvalds
+ * (C) Copyright 2002 Christoph Hellwig
+ */
+
+#include <linux/capability.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/sched/user.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+#include <linux/mempolicy.h>
+#include <linux/syscalls.h>
+#include <linux/sched.h>
+#include <linux/export.h>
+#include <linux/rmap.h>
+#include <linux/mmzone.h>
+#include <linux/hugetlb.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
+
+#include "internal.h"
+
+bool can_do_mlock(void)
+{
+ if (rlimit(RLIMIT_MEMLOCK) != 0)
+ return true;
+ if (capable(CAP_IPC_LOCK))
+ return true;
+ return false;
+}
+EXPORT_SYMBOL(can_do_mlock);
+
+/*
+ * Mlocked pages are marked with PageMlocked() flag for efficient testing
+ * in vmscan and, possibly, the fault path; and to support semi-accurate
+ * statistics.
+ *
+ * An mlocked page [PageMlocked(page)] is unevictable. As such, it will
+ * be placed on the LRU "unevictable" list, rather than the [in]active lists.
+ * The unevictable list is an LRU sibling list to the [in]active lists.
+ * PageUnevictable is set to indicate the unevictable state.
+ *
+ * When lazy mlocking via vmscan, it is important to ensure that the
+ * vma's VM_LOCKED status is not concurrently being modified, otherwise we
+ * may have mlocked a page that is being munlocked. So lazy mlock must take
+ * the mmap_lock for read, and verify that the vma really is locked
+ * (see mm/rmap.c).
+ */
+
+/*
+ * LRU accounting for clear_page_mlock()
+ */
+void clear_page_mlock(struct page *page)
+{
+ int nr_pages;
+
+ if (!TestClearPageMlocked(page))
+ return;
+
+ nr_pages = thp_nr_pages(page);
+ mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
+ count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
+ /*
+ * The previous TestClearPageMlocked() corresponds to the smp_mb()
+ * in __pagevec_lru_add_fn().
+ *
+ * See __pagevec_lru_add_fn for more explanation.
+ */
+ if (!isolate_lru_page(page)) {
+ putback_lru_page(page);
+ } else {
+ /*
+ * We lost the race. the page already moved to evictable list.
+ */
+ if (PageUnevictable(page))
+ count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
+ }
+}
+
+/*
+ * Mark page as mlocked if not already.
+ * If page on LRU, isolate and putback to move to unevictable list.
+ */
+void mlock_vma_page(struct page *page)
+{
+ /* Serialize with page migration */
+ BUG_ON(!PageLocked(page));
+
+ VM_BUG_ON_PAGE(PageTail(page), page);
+ VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
+
+ if (!TestSetPageMlocked(page)) {
+ int nr_pages = thp_nr_pages(page);
+
+ mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
+ count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
+ if (!isolate_lru_page(page))
+ putback_lru_page(page);
+ }
+}
+
+/*
+ * Isolate a page from LRU with optional get_page() pin.
+ * Assumes lru_lock already held and page already pinned.
+ */
+static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
+{
+ if (PageLRU(page)) {
+ struct lruvec *lruvec;
+
+ lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+ if (getpage)
+ get_page(page);
+ ClearPageLRU(page);
+ del_page_from_lru_list(page, lruvec, page_lru(page));
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Finish munlock after successful page isolation
+ *
+ * Page must be locked. This is a wrapper for try_to_munlock()
+ * and putback_lru_page() with munlock accounting.
+ */
+static void __munlock_isolated_page(struct page *page)
+{
+ /*
+ * Optimization: if the page was mapped just once, that's our mapping
+ * and we don't need to check all the other vmas.
+ */
+ if (page_mapcount(page) > 1)
+ try_to_munlock(page);
+
+ /* Did try_to_unlock() succeed or punt? */
+ if (!PageMlocked(page))
+ count_vm_events(UNEVICTABLE_PGMUNLOCKED, thp_nr_pages(page));
+
+ putback_lru_page(page);
+}
+
+/*
+ * Accounting for page isolation fail during munlock
+ *
+ * Performs accounting when page isolation fails in munlock. There is nothing
+ * else to do because it means some other task has already removed the page
+ * from the LRU. putback_lru_page() will take care of removing the page from
+ * the unevictable list, if necessary. vmscan [page_referenced()] will move
+ * the page back to the unevictable list if some other vma has it mlocked.
+ */
+static void __munlock_isolation_failed(struct page *page)
+{
+ int nr_pages = thp_nr_pages(page);
+
+ if (PageUnevictable(page))
+ __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
+ else
+ __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages);
+}
+
+/**
+ * munlock_vma_page - munlock a vma page
+ * @page: page to be unlocked, either a normal page or THP page head
+ *
+ * returns the size of the page as a page mask (0 for normal page,
+ * HPAGE_PMD_NR - 1 for THP head page)
+ *
+ * called from munlock()/munmap() path with page supposedly on the LRU.
+ * When we munlock a page, because the vma where we found the page is being
+ * munlock()ed or munmap()ed, we want to check whether other vmas hold the
+ * page locked so that we can leave it on the unevictable lru list and not
+ * bother vmscan with it. However, to walk the page's rmap list in
+ * try_to_munlock() we must isolate the page from the LRU. If some other
+ * task has removed the page from the LRU, we won't be able to do that.
+ * So we clear the PageMlocked as we might not get another chance. If we
+ * can't isolate the page, we leave it for putback_lru_page() and vmscan
+ * [page_referenced()/try_to_unmap()] to deal with.
+ */
+unsigned int munlock_vma_page(struct page *page)
+{
+ int nr_pages;
+ pg_data_t *pgdat = page_pgdat(page);
+
+ /* For try_to_munlock() and to serialize with page migration */
+ BUG_ON(!PageLocked(page));
+
+ VM_BUG_ON_PAGE(PageTail(page), page);
+
+ /*
+ * Serialize with any parallel __split_huge_page_refcount() which
+ * might otherwise copy PageMlocked to part of the tail pages before
+ * we clear it in the head page. It also stabilizes thp_nr_pages().
+ */
+ spin_lock_irq(&pgdat->lru_lock);
+
+ if (!TestClearPageMlocked(page)) {
+ /* Potentially, PTE-mapped THP: do not skip the rest PTEs */
+ nr_pages = 1;
+ goto unlock_out;
+ }
+
+ nr_pages = thp_nr_pages(page);
+ __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
+
+ if (__munlock_isolate_lru_page(page, true)) {
+ spin_unlock_irq(&pgdat->lru_lock);
+ __munlock_isolated_page(page);
+ goto out;
+ }
+ __munlock_isolation_failed(page);
+
+unlock_out:
+ spin_unlock_irq(&pgdat->lru_lock);
+
+out:
+ return nr_pages - 1;
+}
+
+/*
+ * convert get_user_pages() return value to posix mlock() error
+ */
+static int __mlock_posix_error_return(long retval)
+{
+ if (retval == -EFAULT)
+ retval = -ENOMEM;
+ else if (retval == -ENOMEM)
+ retval = -EAGAIN;
+ return retval;
+}
+
+/*
+ * Prepare page for fast batched LRU putback via putback_lru_evictable_pagevec()
+ *
+ * The fast path is available only for evictable pages with single mapping.
+ * Then we can bypass the per-cpu pvec and get better performance.
+ * when mapcount > 1 we need try_to_munlock() which can fail.
+ * when !page_evictable(), we need the full redo logic of putback_lru_page to
+ * avoid leaving evictable page in unevictable list.
+ *
+ * In case of success, @page is added to @pvec and @pgrescued is incremented
+ * in case that the page was previously unevictable. @page is also unlocked.
+ */
+static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec,
+ int *pgrescued)
+{
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+
+ if (page_mapcount(page) <= 1 && page_evictable(page)) {
+ pagevec_add(pvec, page);
+ if (TestClearPageUnevictable(page))
+ (*pgrescued)++;
+ unlock_page(page);
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Putback multiple evictable pages to the LRU
+ *
+ * Batched putback of evictable pages that bypasses the per-cpu pvec. Some of
+ * the pages might have meanwhile become unevictable but that is OK.
+ */
+static void __putback_lru_fast(struct pagevec *pvec, int pgrescued)
+{
+ count_vm_events(UNEVICTABLE_PGMUNLOCKED, pagevec_count(pvec));
+ /*
+ *__pagevec_lru_add() calls release_pages() so we don't call
+ * put_page() explicitly
+ */
+ __pagevec_lru_add(pvec);
+ count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
+}
+
+/*
+ * Munlock a batch of pages from the same zone
+ *
+ * The work is split to two main phases. First phase clears the Mlocked flag
+ * and attempts to isolate the pages, all under a single zone lru lock.
+ * The second phase finishes the munlock only for pages where isolation
+ * succeeded.
+ *
+ * Note that the pagevec may be modified during the process.
+ */
+static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
+{
+ int i;
+ int nr = pagevec_count(pvec);
+ int delta_munlocked = -nr;
+ struct pagevec pvec_putback;
+ int pgrescued = 0;
+
+ pagevec_init(&pvec_putback);
+
+ /* Phase 1: page isolation */
+ spin_lock_irq(&zone->zone_pgdat->lru_lock);
+ for (i = 0; i < nr; i++) {
+ struct page *page = pvec->pages[i];
+
+ if (TestClearPageMlocked(page)) {
+ /*
+ * We already have pin from follow_page_mask()
+ * so we can spare the get_page() here.
+ */
+ if (__munlock_isolate_lru_page(page, false))
+ continue;
+ else
+ __munlock_isolation_failed(page);
+ } else {
+ delta_munlocked++;
+ }
+
+ /*
+ * We won't be munlocking this page in the next phase
+ * but we still need to release the follow_page_mask()
+ * pin. We cannot do it under lru_lock however. If it's
+ * the last pin, __page_cache_release() would deadlock.
+ */
+ pagevec_add(&pvec_putback, pvec->pages[i]);
+ pvec->pages[i] = NULL;
+ }
+ __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
+ spin_unlock_irq(&zone->zone_pgdat->lru_lock);
+
+ /* Now we can release pins of pages that we are not munlocking */
+ pagevec_release(&pvec_putback);
+
+ /* Phase 2: page munlock */
+ for (i = 0; i < nr; i++) {
+ struct page *page = pvec->pages[i];
+
+ if (page) {
+ lock_page(page);
+ if (!__putback_lru_fast_prepare(page, &pvec_putback,
+ &pgrescued)) {
+ /*
+ * Slow path. We don't want to lose the last
+ * pin before unlock_page()
+ */
+ get_page(page); /* for putback_lru_page() */
+ __munlock_isolated_page(page);
+ unlock_page(page);
+ put_page(page); /* from follow_page_mask() */
+ }
+ }
+ }
+
+ /*
+ * Phase 3: page putback for pages that qualified for the fast path
+ * This will also call put_page() to return pin from follow_page_mask()
+ */
+ if (pagevec_count(&pvec_putback))
+ __putback_lru_fast(&pvec_putback, pgrescued);
+}
+
+/*
+ * Fill up pagevec for __munlock_pagevec using pte walk
+ *
+ * The function expects that the struct page corresponding to @start address is
+ * a non-TPH page already pinned and in the @pvec, and that it belongs to @zone.
+ *
+ * The rest of @pvec is filled by subsequent pages within the same pmd and same
+ * zone, as long as the pte's are present and vm_normal_page() succeeds. These
+ * pages also get pinned.
+ *
+ * Returns the address of the next page that should be scanned. This equals
+ * @start + PAGE_SIZE when no page could be added by the pte walk.
+ */
+static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
+ struct vm_area_struct *vma, struct zone *zone,
+ unsigned long start, unsigned long end)
+{
+ pte_t *pte;
+ spinlock_t *ptl;
+
+ /*
+ * Initialize pte walk starting at the already pinned page where we
+ * are sure that there is a pte, as it was pinned under the same
+ * mmap_lock write op.
+ */
+ pte = get_locked_pte(vma->vm_mm, start, &ptl);
+ /* Make sure we do not cross the page table boundary */
+ end = pgd_addr_end(start, end);
+ end = p4d_addr_end(start, end);
+ end = pud_addr_end(start, end);
+ end = pmd_addr_end(start, end);
+
+ /* The page next to the pinned page is the first we will try to get */
+ start += PAGE_SIZE;
+ while (start < end) {
+ struct page *page = NULL;
+ pte++;
+ if (pte_present(*pte))
+ page = vm_normal_page(vma, start, *pte);
+ /*
+ * Break if page could not be obtained or the page's node+zone does not
+ * match
+ */
+ if (!page || page_zone(page) != zone)
+ break;
+
+ /*
+ * Do not use pagevec for PTE-mapped THP,
+ * munlock_vma_pages_range() will handle them.
+ */
+ if (PageTransCompound(page))
+ break;
+
+ get_page(page);
+ /*
+ * Increase the address that will be returned *before* the
+ * eventual break due to pvec becoming full by adding the page
+ */
+ start += PAGE_SIZE;
+ if (pagevec_add(pvec, page) == 0)
+ break;
+ }
+ pte_unmap_unlock(pte, ptl);
+ return start;
+}
+
+/*
+ * munlock_vma_pages_range() - munlock all pages in the vma range.'
+ * @vma - vma containing range to be munlock()ed.
+ * @start - start address in @vma of the range
+ * @end - end of range in @vma.
+ *
+ * For mremap(), munmap() and exit().
+ *
+ * Called with @vma VM_LOCKED.
+ *
+ * Returns with VM_LOCKED cleared. Callers must be prepared to
+ * deal with this.
+ *
+ * We don't save and restore VM_LOCKED here because pages are
+ * still on lru. In unmap path, pages might be scanned by reclaim
+ * and re-mlocked by try_to_{munlock|unmap} before we unmap and
+ * free them. This will result in freeing mlocked pages.
+ */
+void munlock_vma_pages_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+
+ while (start < end) {
+ struct page *page;
+ unsigned int page_mask = 0;
+ unsigned long page_increm;
+ struct pagevec pvec;
+ struct zone *zone;
+
+ pagevec_init(&pvec);
+ /*
+ * Although FOLL_DUMP is intended for get_dump_page(),
+ * it just so happens that its special treatment of the
+ * ZERO_PAGE (returning an error instead of doing get_page)
+ * suits munlock very well (and if somehow an abnormal page
+ * has sneaked into the range, we won't oops here: great).
+ */
+ page = follow_page(vma, start, FOLL_GET | FOLL_DUMP);
+
+ if (page && !IS_ERR(page)) {
+ if (PageTransTail(page)) {
+ VM_BUG_ON_PAGE(PageMlocked(page), page);
+ put_page(page); /* follow_page_mask() */
+ } else if (PageTransHuge(page)) {
+ lock_page(page);
+ /*
+ * Any THP page found by follow_page_mask() may
+ * have gotten split before reaching
+ * munlock_vma_page(), so we need to compute
+ * the page_mask here instead.
+ */
+ page_mask = munlock_vma_page(page);
+ unlock_page(page);
+ put_page(page); /* follow_page_mask() */
+ } else {
+ /*
+ * Non-huge pages are handled in batches via
+ * pagevec. The pin from follow_page_mask()
+ * prevents them from collapsing by THP.
+ */
+ pagevec_add(&pvec, page);
+ zone = page_zone(page);
+
+ /*
+ * Try to fill the rest of pagevec using fast
+ * pte walk. This will also update start to
+ * the next page to process. Then munlock the
+ * pagevec.
+ */
+ start = __munlock_pagevec_fill(&pvec, vma,
+ zone, start, end);
+ __munlock_pagevec(&pvec, zone);
+ goto next;
+ }
+ }
+ page_increm = 1 + page_mask;
+ start += page_increm * PAGE_SIZE;
+next:
+ cond_resched();
+ }
+}
+
+/*
+ * mlock_fixup - handle mlock[all]/munlock[all] requests.
+ *
+ * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
+ * munlock is a no-op. However, for some special vmas, we go ahead and
+ * populate the ptes.
+ *
+ * For vmas that pass the filters, merge/split as appropriate.
+ */
+static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
+ unsigned long start, unsigned long end, vm_flags_t newflags)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pgoff_t pgoff;
+ int nr_pages;
+ int ret = 0;
+ int lock = !!(newflags & VM_LOCKED);
+ vm_flags_t old_flags = vma->vm_flags;
+
+ if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
+ is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
+ vma_is_dax(vma))
+ /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
+ goto out;
+
+ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+ *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
+ vma->vm_file, pgoff, vma_policy(vma),
+ vma->vm_userfaultfd_ctx);
+ if (*prev) {
+ vma = *prev;
+ goto success;
+ }
+
+ if (start != vma->vm_start) {
+ ret = split_vma(mm, vma, start, 1);
+ if (ret)
+ goto out;
+ }
+
+ if (end != vma->vm_end) {
+ ret = split_vma(mm, vma, end, 0);
+ if (ret)
+ goto out;
+ }
+
+success:
+ /*
+ * Keep track of amount of locked VM.
+ */
+ nr_pages = (end - start) >> PAGE_SHIFT;
+ if (!lock)
+ nr_pages = -nr_pages;
+ else if (old_flags & VM_LOCKED)
+ nr_pages = 0;
+ mm->locked_vm += nr_pages;
+
+ /*
+ * vm_flags is protected by the mmap_lock held in write mode.
+ * It's okay if try_to_unmap_one unmaps a page just after we
+ * set VM_LOCKED, populate_vma_page_range will bring it back.
+ */
+
+ if (lock)
+ vma->vm_flags = newflags;
+ else
+ munlock_vma_pages_range(vma, start, end);
+
+out:
+ *prev = vma;
+ return ret;
+}
+
+static int apply_vma_lock_flags(unsigned long start, size_t len,
+ vm_flags_t flags)
+{
+ unsigned long nstart, end, tmp;
+ struct vm_area_struct * vma, * prev;
+ int error;
+
+ VM_BUG_ON(offset_in_page(start));
+ VM_BUG_ON(len != PAGE_ALIGN(len));
+ end = start + len;
+ if (end < start)
+ return -EINVAL;
+ if (end == start)
+ return 0;
+ vma = find_vma(current->mm, start);
+ if (!vma || vma->vm_start > start)
+ return -ENOMEM;
+
+ prev = vma->vm_prev;
+ if (start > vma->vm_start)
+ prev = vma;
+
+ for (nstart = start ; ; ) {
+ vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
+
+ newflags |= flags;
+
+ /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
+ tmp = vma->vm_end;
+ if (tmp > end)
+ tmp = end;
+ error = mlock_fixup(vma, &prev, nstart, tmp, newflags);
+ if (error)
+ break;
+ nstart = tmp;
+ if (nstart < prev->vm_end)
+ nstart = prev->vm_end;
+ if (nstart >= end)
+ break;
+
+ vma = prev->vm_next;
+ if (!vma || vma->vm_start != nstart) {
+ error = -ENOMEM;
+ break;
+ }
+ }
+ return error;
+}
+
+/*
+ * Go through vma areas and sum size of mlocked
+ * vma pages, as return value.
+ * Note deferred memory locking case(mlock2(,,MLOCK_ONFAULT)
+ * is also counted.
+ * Return value: previously mlocked page counts
+ */
+static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm,
+ unsigned long start, size_t len)
+{
+ struct vm_area_struct *vma;
+ unsigned long count = 0;
+
+ if (mm == NULL)
+ mm = current->mm;
+
+ vma = find_vma(mm, start);
+ if (vma == NULL)
+ vma = mm->mmap;
+
+ for (; vma ; vma = vma->vm_next) {
+ if (start >= vma->vm_end)
+ continue;
+ if (start + len <= vma->vm_start)
+ break;
+ if (vma->vm_flags & VM_LOCKED) {
+ if (start > vma->vm_start)
+ count -= (start - vma->vm_start);
+ if (start + len < vma->vm_end) {
+ count += start + len - vma->vm_start;
+ break;
+ }
+ count += vma->vm_end - vma->vm_start;
+ }
+ }
+
+ return count >> PAGE_SHIFT;
+}
+
+static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
+{
+ unsigned long locked;
+ unsigned long lock_limit;
+ int error = -ENOMEM;
+
+ start = untagged_addr(start);
+
+ if (!can_do_mlock())
+ return -EPERM;
+
+ len = PAGE_ALIGN(len + (offset_in_page(start)));
+ start &= PAGE_MASK;
+
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
+ lock_limit >>= PAGE_SHIFT;
+ locked = len >> PAGE_SHIFT;
+
+ if (mmap_write_lock_killable(current->mm))
+ return -EINTR;
+
+ locked += current->mm->locked_vm;
+ if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) {
+ /*
+ * It is possible that the regions requested intersect with
+ * previously mlocked areas, that part area in "mm->locked_vm"
+ * should not be counted to new mlock increment count. So check
+ * and adjust locked count if necessary.
+ */
+ locked -= count_mm_mlocked_page_nr(current->mm,
+ start, len);
+ }
+
+ /* check against resource limits */
+ if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
+ error = apply_vma_lock_flags(start, len, flags);
+
+ mmap_write_unlock(current->mm);
+ if (error)
+ return error;
+
+ error = __mm_populate(start, len, 0);
+ if (error)
+ return __mlock_posix_error_return(error);
+ return 0;
+}
+
+SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
+{
+ return do_mlock(start, len, VM_LOCKED);
+}
+
+SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags)
+{
+ vm_flags_t vm_flags = VM_LOCKED;
+
+ if (flags & ~MLOCK_ONFAULT)
+ return -EINVAL;
+
+ if (flags & MLOCK_ONFAULT)
+ vm_flags |= VM_LOCKONFAULT;
+
+ return do_mlock(start, len, vm_flags);
+}
+
+SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
+{
+ int ret;
+
+ start = untagged_addr(start);
+
+ len = PAGE_ALIGN(len + (offset_in_page(start)));
+ start &= PAGE_MASK;
+
+ if (mmap_write_lock_killable(current->mm))
+ return -EINTR;
+ ret = apply_vma_lock_flags(start, len, 0);
+ mmap_write_unlock(current->mm);
+
+ return ret;
+}
+
+/*
+ * Take the MCL_* flags passed into mlockall (or 0 if called from munlockall)
+ * and translate into the appropriate modifications to mm->def_flags and/or the
+ * flags for all current VMAs.
+ *
+ * There are a couple of subtleties with this. If mlockall() is called multiple
+ * times with different flags, the values do not necessarily stack. If mlockall
+ * is called once including the MCL_FUTURE flag and then a second time without
+ * it, VM_LOCKED and VM_LOCKONFAULT will be cleared from mm->def_flags.
+ */
+static int apply_mlockall_flags(int flags)
+{
+ struct vm_area_struct * vma, * prev = NULL;
+ vm_flags_t to_add = 0;
+
+ current->mm->def_flags &= VM_LOCKED_CLEAR_MASK;
+ if (flags & MCL_FUTURE) {
+ current->mm->def_flags |= VM_LOCKED;
+
+ if (flags & MCL_ONFAULT)
+ current->mm->def_flags |= VM_LOCKONFAULT;
+
+ if (!(flags & MCL_CURRENT))
+ goto out;
+ }
+
+ if (flags & MCL_CURRENT) {
+ to_add |= VM_LOCKED;
+ if (flags & MCL_ONFAULT)
+ to_add |= VM_LOCKONFAULT;
+ }
+
+ for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
+ vm_flags_t newflags;
+
+ newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
+ newflags |= to_add;
+
+ /* Ignore errors */
+ mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
+ cond_resched();
+ }
+out:
+ return 0;
+}
+
+SYSCALL_DEFINE1(mlockall, int, flags)
+{
+ unsigned long lock_limit;
+ int ret;
+
+ if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)) ||
+ flags == MCL_ONFAULT)
+ return -EINVAL;
+
+ if (!can_do_mlock())
+ return -EPERM;
+
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
+ lock_limit >>= PAGE_SHIFT;
+
+ if (mmap_write_lock_killable(current->mm))
+ return -EINTR;
+
+ ret = -ENOMEM;
+ if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
+ capable(CAP_IPC_LOCK))
+ ret = apply_mlockall_flags(flags);
+ mmap_write_unlock(current->mm);
+ if (!ret && (flags & MCL_CURRENT))
+ mm_populate(0, TASK_SIZE);
+
+ return ret;
+}
+
+SYSCALL_DEFINE0(munlockall)
+{
+ int ret;
+
+ if (mmap_write_lock_killable(current->mm))
+ return -EINTR;
+ ret = apply_mlockall_flags(0);
+ mmap_write_unlock(current->mm);
+ return ret;
+}
+
+/*
+ * Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB
+ * shm segments) get accounted against the user_struct instead.
+ */
+static DEFINE_SPINLOCK(shmlock_user_lock);
+
+int user_shm_lock(size_t size, struct user_struct *user)
+{
+ unsigned long lock_limit, locked;
+ int allowed = 0;
+
+ locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
+ if (lock_limit == RLIM_INFINITY)
+ allowed = 1;
+ lock_limit >>= PAGE_SHIFT;
+ spin_lock(&shmlock_user_lock);
+ if (!allowed &&
+ locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
+ goto out;
+ get_uid(user);
+ user->locked_shm += locked;
+ allowed = 1;
+out:
+ spin_unlock(&shmlock_user_lock);
+ return allowed;
+}
+
+void user_shm_unlock(size_t size, struct user_struct *user)
+{
+ spin_lock(&shmlock_user_lock);
+ user->locked_shm -= (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ spin_unlock(&shmlock_user_lock);
+ free_uid(user);
+}
diff --git a/mm/mm_init.c b/mm/mm_init.c
new file mode 100644
index 000000000..b06a30fbe
--- /dev/null
+++ b/mm/mm_init.c
@@ -0,0 +1,207 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * mm_init.c - Memory initialisation verification and debugging
+ *
+ * Copyright 2008 IBM Corporation, 2008
+ * Author Mel Gorman <mel@csn.ul.ie>
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/kobject.h>
+#include <linux/export.h>
+#include <linux/memory.h>
+#include <linux/notifier.h>
+#include <linux/sched.h>
+#include <linux/mman.h>
+#include "internal.h"
+
+#ifdef CONFIG_DEBUG_MEMORY_INIT
+int __meminitdata mminit_loglevel;
+
+#ifndef SECTIONS_SHIFT
+#define SECTIONS_SHIFT 0
+#endif
+
+/* The zonelists are simply reported, validation is manual. */
+void __init mminit_verify_zonelist(void)
+{
+ int nid;
+
+ if (mminit_loglevel < MMINIT_VERIFY)
+ return;
+
+ for_each_online_node(nid) {
+ pg_data_t *pgdat = NODE_DATA(nid);
+ struct zone *zone;
+ struct zoneref *z;
+ struct zonelist *zonelist;
+ int i, listid, zoneid;
+
+ BUILD_BUG_ON(MAX_ZONELISTS > 2);
+ for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) {
+
+ /* Identify the zone and nodelist */
+ zoneid = i % MAX_NR_ZONES;
+ listid = i / MAX_NR_ZONES;
+ zonelist = &pgdat->node_zonelists[listid];
+ zone = &pgdat->node_zones[zoneid];
+ if (!populated_zone(zone))
+ continue;
+
+ /* Print information about the zonelist */
+ printk(KERN_DEBUG "mminit::zonelist %s %d:%s = ",
+ listid > 0 ? "thisnode" : "general", nid,
+ zone->name);
+
+ /* Iterate the zonelist */
+ for_each_zone_zonelist(zone, z, zonelist, zoneid)
+ pr_cont("%d:%s ", zone_to_nid(zone), zone->name);
+ pr_cont("\n");
+ }
+ }
+}
+
+void __init mminit_verify_pageflags_layout(void)
+{
+ int shift, width;
+ unsigned long or_mask, add_mask;
+
+ shift = 8 * sizeof(unsigned long);
+ width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
+ - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH;
+ mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
+ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n",
+ SECTIONS_WIDTH,
+ NODES_WIDTH,
+ ZONES_WIDTH,
+ LAST_CPUPID_WIDTH,
+ KASAN_TAG_WIDTH,
+ NR_PAGEFLAGS);
+ mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
+ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
+ SECTIONS_SHIFT,
+ NODES_SHIFT,
+ ZONES_SHIFT,
+ LAST_CPUPID_SHIFT,
+ KASAN_TAG_WIDTH);
+ mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts",
+ "Section %lu Node %lu Zone %lu Lastcpupid %lu Kasantag %lu\n",
+ (unsigned long)SECTIONS_PGSHIFT,
+ (unsigned long)NODES_PGSHIFT,
+ (unsigned long)ZONES_PGSHIFT,
+ (unsigned long)LAST_CPUPID_PGSHIFT,
+ (unsigned long)KASAN_TAG_PGSHIFT);
+ mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid",
+ "Node/Zone ID: %lu -> %lu\n",
+ (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT),
+ (unsigned long)ZONEID_PGOFF);
+ mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage",
+ "location: %d -> %d layout %d -> %d unused %d -> %d page-flags\n",
+ shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0);
+#ifdef NODE_NOT_IN_PAGE_FLAGS
+ mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
+ "Node not in page flags");
+#endif
+#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
+ mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
+ "Last cpupid not in page flags");
+#endif
+
+ if (SECTIONS_WIDTH) {
+ shift -= SECTIONS_WIDTH;
+ BUG_ON(shift != SECTIONS_PGSHIFT);
+ }
+ if (NODES_WIDTH) {
+ shift -= NODES_WIDTH;
+ BUG_ON(shift != NODES_PGSHIFT);
+ }
+ if (ZONES_WIDTH) {
+ shift -= ZONES_WIDTH;
+ BUG_ON(shift != ZONES_PGSHIFT);
+ }
+
+ /* Check for bitmask overlaps */
+ or_mask = (ZONES_MASK << ZONES_PGSHIFT) |
+ (NODES_MASK << NODES_PGSHIFT) |
+ (SECTIONS_MASK << SECTIONS_PGSHIFT);
+ add_mask = (ZONES_MASK << ZONES_PGSHIFT) +
+ (NODES_MASK << NODES_PGSHIFT) +
+ (SECTIONS_MASK << SECTIONS_PGSHIFT);
+ BUG_ON(or_mask != add_mask);
+}
+
+static __init int set_mminit_loglevel(char *str)
+{
+ get_option(&str, &mminit_loglevel);
+ return 0;
+}
+early_param("mminit_loglevel", set_mminit_loglevel);
+#endif /* CONFIG_DEBUG_MEMORY_INIT */
+
+struct kobject *mm_kobj;
+EXPORT_SYMBOL_GPL(mm_kobj);
+
+#ifdef CONFIG_SMP
+s32 vm_committed_as_batch = 32;
+
+void mm_compute_batch(int overcommit_policy)
+{
+ u64 memsized_batch;
+ s32 nr = num_present_cpus();
+ s32 batch = max_t(s32, nr*2, 32);
+ unsigned long ram_pages = totalram_pages();
+
+ /*
+ * For policy OVERCOMMIT_NEVER, set batch size to 0.4% of
+ * (total memory/#cpus), and lift it to 25% for other policies
+ * to easy the possible lock contention for percpu_counter
+ * vm_committed_as, while the max limit is INT_MAX
+ */
+ if (overcommit_policy == OVERCOMMIT_NEVER)
+ memsized_batch = min_t(u64, ram_pages/nr/256, INT_MAX);
+ else
+ memsized_batch = min_t(u64, ram_pages/nr/4, INT_MAX);
+
+ vm_committed_as_batch = max_t(s32, memsized_batch, batch);
+}
+
+static int __meminit mm_compute_batch_notifier(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ switch (action) {
+ case MEM_ONLINE:
+ case MEM_OFFLINE:
+ mm_compute_batch(sysctl_overcommit_memory);
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block compute_batch_nb __meminitdata = {
+ .notifier_call = mm_compute_batch_notifier,
+ .priority = IPC_CALLBACK_PRI, /* use lowest priority */
+};
+
+static int __init mm_compute_batch_init(void)
+{
+ mm_compute_batch(sysctl_overcommit_memory);
+ register_hotmemory_notifier(&compute_batch_nb);
+
+ return 0;
+}
+
+__initcall(mm_compute_batch_init);
+
+#endif
+
+static int __init mm_sysfs_init(void)
+{
+ mm_kobj = kobject_create_and_add("mm", kernel_kobj);
+ if (!mm_kobj)
+ return -ENOMEM;
+
+ return 0;
+}
+postcore_initcall(mm_sysfs_init);
diff --git a/mm/mmap.c b/mm/mmap.c
new file mode 100644
index 000000000..33ebda838
--- /dev/null
+++ b/mm/mmap.c
@@ -0,0 +1,3854 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * mm/mmap.c
+ *
+ * Written by obz.
+ *
+ * Address space accounting code <alan@lxorguk.ukuu.org.uk>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/backing-dev.h>
+#include <linux/mm.h>
+#include <linux/vmacache.h>
+#include <linux/shm.h>
+#include <linux/mman.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/syscalls.h>
+#include <linux/capability.h>
+#include <linux/init.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/personality.h>
+#include <linux/security.h>
+#include <linux/hugetlb.h>
+#include <linux/shmem_fs.h>
+#include <linux/profile.h>
+#include <linux/export.h>
+#include <linux/mount.h>
+#include <linux/mempolicy.h>
+#include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
+#include <linux/mmdebug.h>
+#include <linux/perf_event.h>
+#include <linux/audit.h>
+#include <linux/khugepaged.h>
+#include <linux/uprobes.h>
+#include <linux/rbtree_augmented.h>
+#include <linux/notifier.h>
+#include <linux/memory.h>
+#include <linux/printk.h>
+#include <linux/userfaultfd_k.h>
+#include <linux/moduleparam.h>
+#include <linux/pkeys.h>
+#include <linux/oom.h>
+#include <linux/sched/mm.h>
+
+#include <linux/uaccess.h>
+#include <asm/cacheflush.h>
+#include <asm/tlb.h>
+#include <asm/mmu_context.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/mmap.h>
+
+#include "internal.h"
+
+#ifndef arch_mmap_check
+#define arch_mmap_check(addr, len, flags) (0)
+#endif
+
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
+const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN;
+const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX;
+int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS;
+#endif
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
+const int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN;
+const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX;
+int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
+#endif
+
+static bool ignore_rlimit_data;
+core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
+
+static void unmap_region(struct mm_struct *mm,
+ struct vm_area_struct *vma, struct vm_area_struct *prev,
+ unsigned long start, unsigned long end);
+
+/* description of effects of mapping type and prot in current implementation.
+ * this is due to the limited x86 page protection hardware. The expected
+ * behavior is in parens:
+ *
+ * map_type prot
+ * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC
+ * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes
+ * w: (no) no w: (no) no w: (yes) yes w: (no) no
+ * x: (no) no x: (no) yes x: (no) yes x: (yes) yes
+ *
+ * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes
+ * w: (no) no w: (no) no w: (copy) copy w: (no) no
+ * x: (no) no x: (no) yes x: (no) yes x: (yes) yes
+ */
+pgprot_t protection_map[16] __ro_after_init = {
+ __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
+ __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
+};
+
+#ifndef CONFIG_ARCH_HAS_FILTER_PGPROT
+static inline pgprot_t arch_filter_pgprot(pgprot_t prot)
+{
+ return prot;
+}
+#endif
+
+pgprot_t vm_get_page_prot(unsigned long vm_flags)
+{
+ pgprot_t ret = __pgprot(pgprot_val(protection_map[vm_flags &
+ (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) |
+ pgprot_val(arch_vm_get_page_prot(vm_flags)));
+
+ return arch_filter_pgprot(ret);
+}
+EXPORT_SYMBOL(vm_get_page_prot);
+
+static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
+{
+ return pgprot_modify(oldprot, vm_get_page_prot(vm_flags));
+}
+
+/* Update vma->vm_page_prot to reflect vma->vm_flags. */
+void vma_set_page_prot(struct vm_area_struct *vma)
+{
+ unsigned long vm_flags = vma->vm_flags;
+ pgprot_t vm_page_prot;
+
+ vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags);
+ if (vma_wants_writenotify(vma, vm_page_prot)) {
+ vm_flags &= ~VM_SHARED;
+ vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags);
+ }
+ /* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */
+ WRITE_ONCE(vma->vm_page_prot, vm_page_prot);
+}
+
+/*
+ * Requires inode->i_mapping->i_mmap_rwsem
+ */
+static void __remove_shared_vm_struct(struct vm_area_struct *vma,
+ struct file *file, struct address_space *mapping)
+{
+ if (vma->vm_flags & VM_DENYWRITE)
+ allow_write_access(file);
+ if (vma->vm_flags & VM_SHARED)
+ mapping_unmap_writable(mapping);
+
+ flush_dcache_mmap_lock(mapping);
+ vma_interval_tree_remove(vma, &mapping->i_mmap);
+ flush_dcache_mmap_unlock(mapping);
+}
+
+/*
+ * Unlink a file-based vm structure from its interval tree, to hide
+ * vma from rmap and vmtruncate before freeing its page tables.
+ */
+void unlink_file_vma(struct vm_area_struct *vma)
+{
+ struct file *file = vma->vm_file;
+
+ if (file) {
+ struct address_space *mapping = file->f_mapping;
+ i_mmap_lock_write(mapping);
+ __remove_shared_vm_struct(vma, file, mapping);
+ i_mmap_unlock_write(mapping);
+ }
+}
+
+/*
+ * Close a vm structure and free it, returning the next.
+ */
+static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
+{
+ struct vm_area_struct *next = vma->vm_next;
+
+ might_sleep();
+ if (vma->vm_ops && vma->vm_ops->close)
+ vma->vm_ops->close(vma);
+ if (vma->vm_file)
+ fput(vma->vm_file);
+ mpol_put(vma_policy(vma));
+ vm_area_free(vma);
+ return next;
+}
+
+static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags,
+ struct list_head *uf);
+SYSCALL_DEFINE1(brk, unsigned long, brk)
+{
+ unsigned long retval;
+ unsigned long newbrk, oldbrk, origbrk;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *next;
+ unsigned long min_brk;
+ bool populate;
+ bool downgraded = false;
+ LIST_HEAD(uf);
+
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+
+ origbrk = mm->brk;
+
+#ifdef CONFIG_COMPAT_BRK
+ /*
+ * CONFIG_COMPAT_BRK can still be overridden by setting
+ * randomize_va_space to 2, which will still cause mm->start_brk
+ * to be arbitrarily shifted
+ */
+ if (current->brk_randomized)
+ min_brk = mm->start_brk;
+ else
+ min_brk = mm->end_data;
+#else
+ min_brk = mm->start_brk;
+#endif
+ if (brk < min_brk)
+ goto out;
+
+ /*
+ * Check against rlimit here. If this check is done later after the test
+ * of oldbrk with newbrk then it can escape the test and let the data
+ * segment grow beyond its set limit the in case where the limit is
+ * not page aligned -Ram Gupta
+ */
+ if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk,
+ mm->end_data, mm->start_data))
+ goto out;
+
+ newbrk = PAGE_ALIGN(brk);
+ oldbrk = PAGE_ALIGN(mm->brk);
+ if (oldbrk == newbrk) {
+ mm->brk = brk;
+ goto success;
+ }
+
+ /*
+ * Always allow shrinking brk.
+ * __do_munmap() may downgrade mmap_lock to read.
+ */
+ if (brk <= mm->brk) {
+ int ret;
+
+ /*
+ * mm->brk must to be protected by write mmap_lock so update it
+ * before downgrading mmap_lock. When __do_munmap() fails,
+ * mm->brk will be restored from origbrk.
+ */
+ mm->brk = brk;
+ ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true);
+ if (ret < 0) {
+ mm->brk = origbrk;
+ goto out;
+ } else if (ret == 1) {
+ downgraded = true;
+ }
+ goto success;
+ }
+
+ /* Check against existing mmap mappings. */
+ next = find_vma(mm, oldbrk);
+ if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
+ goto out;
+
+ /* Ok, looks good - let it rip. */
+ if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0)
+ goto out;
+ mm->brk = brk;
+
+success:
+ populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
+ if (downgraded)
+ mmap_read_unlock(mm);
+ else
+ mmap_write_unlock(mm);
+ userfaultfd_unmap_complete(mm, &uf);
+ if (populate)
+ mm_populate(oldbrk, newbrk - oldbrk);
+ return brk;
+
+out:
+ retval = origbrk;
+ mmap_write_unlock(mm);
+ return retval;
+}
+
+static inline unsigned long vma_compute_gap(struct vm_area_struct *vma)
+{
+ unsigned long gap, prev_end;
+
+ /*
+ * Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we
+ * allow two stack_guard_gaps between them here, and when choosing
+ * an unmapped area; whereas when expanding we only require one.
+ * That's a little inconsistent, but keeps the code here simpler.
+ */
+ gap = vm_start_gap(vma);
+ if (vma->vm_prev) {
+ prev_end = vm_end_gap(vma->vm_prev);
+ if (gap > prev_end)
+ gap -= prev_end;
+ else
+ gap = 0;
+ }
+ return gap;
+}
+
+#ifdef CONFIG_DEBUG_VM_RB
+static unsigned long vma_compute_subtree_gap(struct vm_area_struct *vma)
+{
+ unsigned long max = vma_compute_gap(vma), subtree_gap;
+ if (vma->vm_rb.rb_left) {
+ subtree_gap = rb_entry(vma->vm_rb.rb_left,
+ struct vm_area_struct, vm_rb)->rb_subtree_gap;
+ if (subtree_gap > max)
+ max = subtree_gap;
+ }
+ if (vma->vm_rb.rb_right) {
+ subtree_gap = rb_entry(vma->vm_rb.rb_right,
+ struct vm_area_struct, vm_rb)->rb_subtree_gap;
+ if (subtree_gap > max)
+ max = subtree_gap;
+ }
+ return max;
+}
+
+static int browse_rb(struct mm_struct *mm)
+{
+ struct rb_root *root = &mm->mm_rb;
+ int i = 0, j, bug = 0;
+ struct rb_node *nd, *pn = NULL;
+ unsigned long prev = 0, pend = 0;
+
+ for (nd = rb_first(root); nd; nd = rb_next(nd)) {
+ struct vm_area_struct *vma;
+ vma = rb_entry(nd, struct vm_area_struct, vm_rb);
+ if (vma->vm_start < prev) {
+ pr_emerg("vm_start %lx < prev %lx\n",
+ vma->vm_start, prev);
+ bug = 1;
+ }
+ if (vma->vm_start < pend) {
+ pr_emerg("vm_start %lx < pend %lx\n",
+ vma->vm_start, pend);
+ bug = 1;
+ }
+ if (vma->vm_start > vma->vm_end) {
+ pr_emerg("vm_start %lx > vm_end %lx\n",
+ vma->vm_start, vma->vm_end);
+ bug = 1;
+ }
+ spin_lock(&mm->page_table_lock);
+ if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
+ pr_emerg("free gap %lx, correct %lx\n",
+ vma->rb_subtree_gap,
+ vma_compute_subtree_gap(vma));
+ bug = 1;
+ }
+ spin_unlock(&mm->page_table_lock);
+ i++;
+ pn = nd;
+ prev = vma->vm_start;
+ pend = vma->vm_end;
+ }
+ j = 0;
+ for (nd = pn; nd; nd = rb_prev(nd))
+ j++;
+ if (i != j) {
+ pr_emerg("backwards %d, forwards %d\n", j, i);
+ bug = 1;
+ }
+ return bug ? -1 : i;
+}
+
+static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
+{
+ struct rb_node *nd;
+
+ for (nd = rb_first(root); nd; nd = rb_next(nd)) {
+ struct vm_area_struct *vma;
+ vma = rb_entry(nd, struct vm_area_struct, vm_rb);
+ VM_BUG_ON_VMA(vma != ignore &&
+ vma->rb_subtree_gap != vma_compute_subtree_gap(vma),
+ vma);
+ }
+}
+
+static void validate_mm(struct mm_struct *mm)
+{
+ int bug = 0;
+ int i = 0;
+ unsigned long highest_address = 0;
+ struct vm_area_struct *vma = mm->mmap;
+
+ while (vma) {
+ struct anon_vma *anon_vma = vma->anon_vma;
+ struct anon_vma_chain *avc;
+
+ if (anon_vma) {
+ anon_vma_lock_read(anon_vma);
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+ anon_vma_interval_tree_verify(avc);
+ anon_vma_unlock_read(anon_vma);
+ }
+
+ highest_address = vm_end_gap(vma);
+ vma = vma->vm_next;
+ i++;
+ }
+ if (i != mm->map_count) {
+ pr_emerg("map_count %d vm_next %d\n", mm->map_count, i);
+ bug = 1;
+ }
+ if (highest_address != mm->highest_vm_end) {
+ pr_emerg("mm->highest_vm_end %lx, found %lx\n",
+ mm->highest_vm_end, highest_address);
+ bug = 1;
+ }
+ i = browse_rb(mm);
+ if (i != mm->map_count) {
+ if (i != -1)
+ pr_emerg("map_count %d rb %d\n", mm->map_count, i);
+ bug = 1;
+ }
+ VM_BUG_ON_MM(bug, mm);
+}
+#else
+#define validate_mm_rb(root, ignore) do { } while (0)
+#define validate_mm(mm) do { } while (0)
+#endif
+
+RB_DECLARE_CALLBACKS_MAX(static, vma_gap_callbacks,
+ struct vm_area_struct, vm_rb,
+ unsigned long, rb_subtree_gap, vma_compute_gap)
+
+/*
+ * Update augmented rbtree rb_subtree_gap values after vma->vm_start or
+ * vma->vm_prev->vm_end values changed, without modifying the vma's position
+ * in the rbtree.
+ */
+static void vma_gap_update(struct vm_area_struct *vma)
+{
+ /*
+ * As it turns out, RB_DECLARE_CALLBACKS_MAX() already created
+ * a callback function that does exactly what we want.
+ */
+ vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
+}
+
+static inline void vma_rb_insert(struct vm_area_struct *vma,
+ struct rb_root *root)
+{
+ /* All rb_subtree_gap values must be consistent prior to insertion */
+ validate_mm_rb(root, NULL);
+
+ rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
+}
+
+static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
+{
+ /*
+ * Note rb_erase_augmented is a fairly large inline function,
+ * so make sure we instantiate it only once with our desired
+ * augmented rbtree callbacks.
+ */
+ rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
+}
+
+static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
+ struct rb_root *root,
+ struct vm_area_struct *ignore)
+{
+ /*
+ * All rb_subtree_gap values must be consistent prior to erase,
+ * with the possible exception of
+ *
+ * a. the "next" vma being erased if next->vm_start was reduced in
+ * __vma_adjust() -> __vma_unlink()
+ * b. the vma being erased in detach_vmas_to_be_unmapped() ->
+ * vma_rb_erase()
+ */
+ validate_mm_rb(root, ignore);
+
+ __vma_rb_erase(vma, root);
+}
+
+static __always_inline void vma_rb_erase(struct vm_area_struct *vma,
+ struct rb_root *root)
+{
+ vma_rb_erase_ignore(vma, root, vma);
+}
+
+/*
+ * vma has some anon_vma assigned, and is already inserted on that
+ * anon_vma's interval trees.
+ *
+ * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
+ * vma must be removed from the anon_vma's interval trees using
+ * anon_vma_interval_tree_pre_update_vma().
+ *
+ * After the update, the vma will be reinserted using
+ * anon_vma_interval_tree_post_update_vma().
+ *
+ * The entire update must be protected by exclusive mmap_lock and by
+ * the root anon_vma's mutex.
+ */
+static inline void
+anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
+{
+ struct anon_vma_chain *avc;
+
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+ anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
+}
+
+static inline void
+anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
+{
+ struct anon_vma_chain *avc;
+
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+ anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
+}
+
+static int find_vma_links(struct mm_struct *mm, unsigned long addr,
+ unsigned long end, struct vm_area_struct **pprev,
+ struct rb_node ***rb_link, struct rb_node **rb_parent)
+{
+ struct rb_node **__rb_link, *__rb_parent, *rb_prev;
+
+ __rb_link = &mm->mm_rb.rb_node;
+ rb_prev = __rb_parent = NULL;
+
+ while (*__rb_link) {
+ struct vm_area_struct *vma_tmp;
+
+ __rb_parent = *__rb_link;
+ vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
+
+ if (vma_tmp->vm_end > addr) {
+ /* Fail if an existing vma overlaps the area */
+ if (vma_tmp->vm_start < end)
+ return -ENOMEM;
+ __rb_link = &__rb_parent->rb_left;
+ } else {
+ rb_prev = __rb_parent;
+ __rb_link = &__rb_parent->rb_right;
+ }
+ }
+
+ *pprev = NULL;
+ if (rb_prev)
+ *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
+ *rb_link = __rb_link;
+ *rb_parent = __rb_parent;
+ return 0;
+}
+
+/*
+ * vma_next() - Get the next VMA.
+ * @mm: The mm_struct.
+ * @vma: The current vma.
+ *
+ * If @vma is NULL, return the first vma in the mm.
+ *
+ * Returns: The next VMA after @vma.
+ */
+static inline struct vm_area_struct *vma_next(struct mm_struct *mm,
+ struct vm_area_struct *vma)
+{
+ if (!vma)
+ return mm->mmap;
+
+ return vma->vm_next;
+}
+
+/*
+ * munmap_vma_range() - munmap VMAs that overlap a range.
+ * @mm: The mm struct
+ * @start: The start of the range.
+ * @len: The length of the range.
+ * @pprev: pointer to the pointer that will be set to previous vm_area_struct
+ * @rb_link: the rb_node
+ * @rb_parent: the parent rb_node
+ *
+ * Find all the vm_area_struct that overlap from @start to
+ * @end and munmap them. Set @pprev to the previous vm_area_struct.
+ *
+ * Returns: -ENOMEM on munmap failure or 0 on success.
+ */
+static inline int
+munmap_vma_range(struct mm_struct *mm, unsigned long start, unsigned long len,
+ struct vm_area_struct **pprev, struct rb_node ***link,
+ struct rb_node **parent, struct list_head *uf)
+{
+
+ while (find_vma_links(mm, start, start + len, pprev, link, parent))
+ if (do_munmap(mm, start, len, uf))
+ return -ENOMEM;
+
+ return 0;
+}
+static unsigned long count_vma_pages_range(struct mm_struct *mm,
+ unsigned long addr, unsigned long end)
+{
+ unsigned long nr_pages = 0;
+ struct vm_area_struct *vma;
+
+ /* Find first overlaping mapping */
+ vma = find_vma_intersection(mm, addr, end);
+ if (!vma)
+ return 0;
+
+ nr_pages = (min(end, vma->vm_end) -
+ max(addr, vma->vm_start)) >> PAGE_SHIFT;
+
+ /* Iterate over the rest of the overlaps */
+ for (vma = vma->vm_next; vma; vma = vma->vm_next) {
+ unsigned long overlap_len;
+
+ if (vma->vm_start > end)
+ break;
+
+ overlap_len = min(end, vma->vm_end) - vma->vm_start;
+ nr_pages += overlap_len >> PAGE_SHIFT;
+ }
+
+ return nr_pages;
+}
+
+void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct rb_node **rb_link, struct rb_node *rb_parent)
+{
+ /* Update tracking information for the gap following the new vma. */
+ if (vma->vm_next)
+ vma_gap_update(vma->vm_next);
+ else
+ mm->highest_vm_end = vm_end_gap(vma);
+
+ /*
+ * vma->vm_prev wasn't known when we followed the rbtree to find the
+ * correct insertion point for that vma. As a result, we could not
+ * update the vma vm_rb parents rb_subtree_gap values on the way down.
+ * So, we first insert the vma with a zero rb_subtree_gap value
+ * (to be consistent with what we did on the way down), and then
+ * immediately update the gap to the correct value. Finally we
+ * rebalance the rbtree after all augmented values have been set.
+ */
+ rb_link_node(&vma->vm_rb, rb_parent, rb_link);
+ vma->rb_subtree_gap = 0;
+ vma_gap_update(vma);
+ vma_rb_insert(vma, &mm->mm_rb);
+}
+
+static void __vma_link_file(struct vm_area_struct *vma)
+{
+ struct file *file;
+
+ file = vma->vm_file;
+ if (file) {
+ struct address_space *mapping = file->f_mapping;
+
+ if (vma->vm_flags & VM_DENYWRITE)
+ put_write_access(file_inode(file));
+ if (vma->vm_flags & VM_SHARED)
+ mapping_allow_writable(mapping);
+
+ flush_dcache_mmap_lock(mapping);
+ vma_interval_tree_insert(vma, &mapping->i_mmap);
+ flush_dcache_mmap_unlock(mapping);
+ }
+}
+
+static void
+__vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct vm_area_struct *prev, struct rb_node **rb_link,
+ struct rb_node *rb_parent)
+{
+ __vma_link_list(mm, vma, prev);
+ __vma_link_rb(mm, vma, rb_link, rb_parent);
+}
+
+static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct vm_area_struct *prev, struct rb_node **rb_link,
+ struct rb_node *rb_parent)
+{
+ struct address_space *mapping = NULL;
+
+ if (vma->vm_file) {
+ mapping = vma->vm_file->f_mapping;
+ i_mmap_lock_write(mapping);
+ }
+
+ __vma_link(mm, vma, prev, rb_link, rb_parent);
+ __vma_link_file(vma);
+
+ if (mapping)
+ i_mmap_unlock_write(mapping);
+
+ mm->map_count++;
+ validate_mm(mm);
+}
+
+/*
+ * Helper for vma_adjust() in the split_vma insert case: insert a vma into the
+ * mm's list and rbtree. It has already been inserted into the interval tree.
+ */
+static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+ struct vm_area_struct *prev;
+ struct rb_node **rb_link, *rb_parent;
+
+ if (find_vma_links(mm, vma->vm_start, vma->vm_end,
+ &prev, &rb_link, &rb_parent))
+ BUG();
+ __vma_link(mm, vma, prev, rb_link, rb_parent);
+ mm->map_count++;
+}
+
+static __always_inline void __vma_unlink(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ struct vm_area_struct *ignore)
+{
+ vma_rb_erase_ignore(vma, &mm->mm_rb, ignore);
+ __vma_unlink_list(mm, vma);
+ /* Kill the cache */
+ vmacache_invalidate(mm);
+}
+
+/*
+ * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
+ * is already present in an i_mmap tree without adjusting the tree.
+ * The following helper function should be used when such adjustments
+ * are necessary. The "insert" vma (if any) is to be inserted
+ * before we drop the necessary locks.
+ */
+int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
+ struct vm_area_struct *expand)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *next = vma->vm_next, *orig_vma = vma;
+ struct address_space *mapping = NULL;
+ struct rb_root_cached *root = NULL;
+ struct anon_vma *anon_vma = NULL;
+ struct file *file = vma->vm_file;
+ bool start_changed = false, end_changed = false;
+ long adjust_next = 0;
+ int remove_next = 0;
+
+ if (next && !insert) {
+ struct vm_area_struct *exporter = NULL, *importer = NULL;
+
+ if (end >= next->vm_end) {
+ /*
+ * vma expands, overlapping all the next, and
+ * perhaps the one after too (mprotect case 6).
+ * The only other cases that gets here are
+ * case 1, case 7 and case 8.
+ */
+ if (next == expand) {
+ /*
+ * The only case where we don't expand "vma"
+ * and we expand "next" instead is case 8.
+ */
+ VM_WARN_ON(end != next->vm_end);
+ /*
+ * remove_next == 3 means we're
+ * removing "vma" and that to do so we
+ * swapped "vma" and "next".
+ */
+ remove_next = 3;
+ VM_WARN_ON(file != next->vm_file);
+ swap(vma, next);
+ } else {
+ VM_WARN_ON(expand != vma);
+ /*
+ * case 1, 6, 7, remove_next == 2 is case 6,
+ * remove_next == 1 is case 1 or 7.
+ */
+ remove_next = 1 + (end > next->vm_end);
+ VM_WARN_ON(remove_next == 2 &&
+ end != next->vm_next->vm_end);
+ /* trim end to next, for case 6 first pass */
+ end = next->vm_end;
+ }
+
+ exporter = next;
+ importer = vma;
+
+ /*
+ * If next doesn't have anon_vma, import from vma after
+ * next, if the vma overlaps with it.
+ */
+ if (remove_next == 2 && !next->anon_vma)
+ exporter = next->vm_next;
+
+ } else if (end > next->vm_start) {
+ /*
+ * vma expands, overlapping part of the next:
+ * mprotect case 5 shifting the boundary up.
+ */
+ adjust_next = (end - next->vm_start);
+ exporter = next;
+ importer = vma;
+ VM_WARN_ON(expand != importer);
+ } else if (end < vma->vm_end) {
+ /*
+ * vma shrinks, and !insert tells it's not
+ * split_vma inserting another: so it must be
+ * mprotect case 4 shifting the boundary down.
+ */
+ adjust_next = -(vma->vm_end - end);
+ exporter = vma;
+ importer = next;
+ VM_WARN_ON(expand != importer);
+ }
+
+ /*
+ * Easily overlooked: when mprotect shifts the boundary,
+ * make sure the expanding vma has anon_vma set if the
+ * shrinking vma had, to cover any anon pages imported.
+ */
+ if (exporter && exporter->anon_vma && !importer->anon_vma) {
+ int error;
+
+ importer->anon_vma = exporter->anon_vma;
+ error = anon_vma_clone(importer, exporter);
+ if (error)
+ return error;
+ }
+ }
+again:
+ vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
+
+ if (file) {
+ mapping = file->f_mapping;
+ root = &mapping->i_mmap;
+ uprobe_munmap(vma, vma->vm_start, vma->vm_end);
+
+ if (adjust_next)
+ uprobe_munmap(next, next->vm_start, next->vm_end);
+
+ i_mmap_lock_write(mapping);
+ if (insert) {
+ /*
+ * Put into interval tree now, so instantiated pages
+ * are visible to arm/parisc __flush_dcache_page
+ * throughout; but we cannot insert into address
+ * space until vma start or end is updated.
+ */
+ __vma_link_file(insert);
+ }
+ }
+
+ anon_vma = vma->anon_vma;
+ if (!anon_vma && adjust_next)
+ anon_vma = next->anon_vma;
+ if (anon_vma) {
+ VM_WARN_ON(adjust_next && next->anon_vma &&
+ anon_vma != next->anon_vma);
+ anon_vma_lock_write(anon_vma);
+ anon_vma_interval_tree_pre_update_vma(vma);
+ if (adjust_next)
+ anon_vma_interval_tree_pre_update_vma(next);
+ }
+
+ if (file) {
+ flush_dcache_mmap_lock(mapping);
+ vma_interval_tree_remove(vma, root);
+ if (adjust_next)
+ vma_interval_tree_remove(next, root);
+ }
+
+ if (start != vma->vm_start) {
+ vma->vm_start = start;
+ start_changed = true;
+ }
+ if (end != vma->vm_end) {
+ vma->vm_end = end;
+ end_changed = true;
+ }
+ vma->vm_pgoff = pgoff;
+ if (adjust_next) {
+ next->vm_start += adjust_next;
+ next->vm_pgoff += adjust_next >> PAGE_SHIFT;
+ }
+
+ if (file) {
+ if (adjust_next)
+ vma_interval_tree_insert(next, root);
+ vma_interval_tree_insert(vma, root);
+ flush_dcache_mmap_unlock(mapping);
+ }
+
+ if (remove_next) {
+ /*
+ * vma_merge has merged next into vma, and needs
+ * us to remove next before dropping the locks.
+ */
+ if (remove_next != 3)
+ __vma_unlink(mm, next, next);
+ else
+ /*
+ * vma is not before next if they've been
+ * swapped.
+ *
+ * pre-swap() next->vm_start was reduced so
+ * tell validate_mm_rb to ignore pre-swap()
+ * "next" (which is stored in post-swap()
+ * "vma").
+ */
+ __vma_unlink(mm, next, vma);
+ if (file)
+ __remove_shared_vm_struct(next, file, mapping);
+ } else if (insert) {
+ /*
+ * split_vma has split insert from vma, and needs
+ * us to insert it before dropping the locks
+ * (it may either follow vma or precede it).
+ */
+ __insert_vm_struct(mm, insert);
+ } else {
+ if (start_changed)
+ vma_gap_update(vma);
+ if (end_changed) {
+ if (!next)
+ mm->highest_vm_end = vm_end_gap(vma);
+ else if (!adjust_next)
+ vma_gap_update(next);
+ }
+ }
+
+ if (anon_vma) {
+ anon_vma_interval_tree_post_update_vma(vma);
+ if (adjust_next)
+ anon_vma_interval_tree_post_update_vma(next);
+ anon_vma_unlock_write(anon_vma);
+ }
+
+ if (file) {
+ i_mmap_unlock_write(mapping);
+ uprobe_mmap(vma);
+
+ if (adjust_next)
+ uprobe_mmap(next);
+ }
+
+ if (remove_next) {
+ if (file) {
+ uprobe_munmap(next, next->vm_start, next->vm_end);
+ fput(file);
+ }
+ if (next->anon_vma)
+ anon_vma_merge(vma, next);
+ mm->map_count--;
+ mpol_put(vma_policy(next));
+ vm_area_free(next);
+ /*
+ * In mprotect's case 6 (see comments on vma_merge),
+ * we must remove another next too. It would clutter
+ * up the code too much to do both in one go.
+ */
+ if (remove_next != 3) {
+ /*
+ * If "next" was removed and vma->vm_end was
+ * expanded (up) over it, in turn
+ * "next->vm_prev->vm_end" changed and the
+ * "vma->vm_next" gap must be updated.
+ */
+ next = vma->vm_next;
+ } else {
+ /*
+ * For the scope of the comment "next" and
+ * "vma" considered pre-swap(): if "vma" was
+ * removed, next->vm_start was expanded (down)
+ * over it and the "next" gap must be updated.
+ * Because of the swap() the post-swap() "vma"
+ * actually points to pre-swap() "next"
+ * (post-swap() "next" as opposed is now a
+ * dangling pointer).
+ */
+ next = vma;
+ }
+ if (remove_next == 2) {
+ remove_next = 1;
+ end = next->vm_end;
+ goto again;
+ }
+ else if (next)
+ vma_gap_update(next);
+ else {
+ /*
+ * If remove_next == 2 we obviously can't
+ * reach this path.
+ *
+ * If remove_next == 3 we can't reach this
+ * path because pre-swap() next is always not
+ * NULL. pre-swap() "next" is not being
+ * removed and its next->vm_end is not altered
+ * (and furthermore "end" already matches
+ * next->vm_end in remove_next == 3).
+ *
+ * We reach this only in the remove_next == 1
+ * case if the "next" vma that was removed was
+ * the highest vma of the mm. However in such
+ * case next->vm_end == "end" and the extended
+ * "vma" has vma->vm_end == next->vm_end so
+ * mm->highest_vm_end doesn't need any update
+ * in remove_next == 1 case.
+ */
+ VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma));
+ }
+ }
+ if (insert && file)
+ uprobe_mmap(insert);
+
+ validate_mm(mm);
+
+ return 0;
+}
+
+/*
+ * If the vma has a ->close operation then the driver probably needs to release
+ * per-vma resources, so we don't attempt to merge those.
+ */
+static inline int is_mergeable_vma(struct vm_area_struct *vma,
+ struct file *file, unsigned long vm_flags,
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+{
+ /*
+ * VM_SOFTDIRTY should not prevent from VMA merging, if we
+ * match the flags but dirty bit -- the caller should mark
+ * merged VMA as dirty. If dirty bit won't be excluded from
+ * comparison, we increase pressure on the memory system forcing
+ * the kernel to generate new VMAs when old one could be
+ * extended instead.
+ */
+ if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)
+ return 0;
+ if (vma->vm_file != file)
+ return 0;
+ if (vma->vm_ops && vma->vm_ops->close)
+ return 0;
+ if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
+ return 0;
+ return 1;
+}
+
+static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
+ struct anon_vma *anon_vma2,
+ struct vm_area_struct *vma)
+{
+ /*
+ * The list_is_singular() test is to avoid merging VMA cloned from
+ * parents. This can improve scalability caused by anon_vma lock.
+ */
+ if ((!anon_vma1 || !anon_vma2) && (!vma ||
+ list_is_singular(&vma->anon_vma_chain)))
+ return 1;
+ return anon_vma1 == anon_vma2;
+}
+
+/*
+ * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
+ * in front of (at a lower virtual address and file offset than) the vma.
+ *
+ * We cannot merge two vmas if they have differently assigned (non-NULL)
+ * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
+ *
+ * We don't check here for the merged mmap wrapping around the end of pagecache
+ * indices (16TB on ia32) because do_mmap() does not permit mmap's which
+ * wrap, nor mmaps which cover the final page at index -1UL.
+ */
+static int
+can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
+ struct anon_vma *anon_vma, struct file *file,
+ pgoff_t vm_pgoff,
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+{
+ if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
+ is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
+ if (vma->vm_pgoff == vm_pgoff)
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
+ * beyond (at a higher virtual address and file offset than) the vma.
+ *
+ * We cannot merge two vmas if they have differently assigned (non-NULL)
+ * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
+ */
+static int
+can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
+ struct anon_vma *anon_vma, struct file *file,
+ pgoff_t vm_pgoff,
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+{
+ if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
+ is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
+ pgoff_t vm_pglen;
+ vm_pglen = vma_pages(vma);
+ if (vma->vm_pgoff + vm_pglen == vm_pgoff)
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
+ * whether that can be merged with its predecessor or its successor.
+ * Or both (it neatly fills a hole).
+ *
+ * In most cases - when called for mmap, brk or mremap - [addr,end) is
+ * certain not to be mapped by the time vma_merge is called; but when
+ * called for mprotect, it is certain to be already mapped (either at
+ * an offset within prev, or at the start of next), and the flags of
+ * this area are about to be changed to vm_flags - and the no-change
+ * case has already been eliminated.
+ *
+ * The following mprotect cases have to be considered, where AAAA is
+ * the area passed down from mprotect_fixup, never extending beyond one
+ * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
+ *
+ * AAAA AAAA AAAA
+ * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN
+ * cannot merge might become might become
+ * PPNNNNNNNNNN PPPPPPPPPPNN
+ * mmap, brk or case 4 below case 5 below
+ * mremap move:
+ * AAAA AAAA
+ * PPPP NNNN PPPPNNNNXXXX
+ * might become might become
+ * PPPPPPPPPPPP 1 or PPPPPPPPPPPP 6 or
+ * PPPPPPPPNNNN 2 or PPPPPPPPXXXX 7 or
+ * PPPPNNNNNNNN 3 PPPPXXXXXXXX 8
+ *
+ * It is important for case 8 that the vma NNNN overlapping the
+ * region AAAA is never going to extended over XXXX. Instead XXXX must
+ * be extended in region AAAA and NNNN must be removed. This way in
+ * all cases where vma_merge succeeds, the moment vma_adjust drops the
+ * rmap_locks, the properties of the merged vma will be already
+ * correct for the whole merged range. Some of those properties like
+ * vm_page_prot/vm_flags may be accessed by rmap_walks and they must
+ * be correct for the whole merged range immediately after the
+ * rmap_locks are released. Otherwise if XXXX would be removed and
+ * NNNN would be extended over the XXXX range, remove_migration_ptes
+ * or other rmap walkers (if working on addresses beyond the "end"
+ * parameter) may establish ptes with the wrong permissions of NNNN
+ * instead of the right permissions of XXXX.
+ */
+struct vm_area_struct *vma_merge(struct mm_struct *mm,
+ struct vm_area_struct *prev, unsigned long addr,
+ unsigned long end, unsigned long vm_flags,
+ struct anon_vma *anon_vma, struct file *file,
+ pgoff_t pgoff, struct mempolicy *policy,
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+{
+ pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
+ struct vm_area_struct *area, *next;
+ int err;
+
+ /*
+ * We later require that vma->vm_flags == vm_flags,
+ * so this tests vma->vm_flags & VM_SPECIAL, too.
+ */
+ if (vm_flags & VM_SPECIAL)
+ return NULL;
+
+ next = vma_next(mm, prev);
+ area = next;
+ if (area && area->vm_end == end) /* cases 6, 7, 8 */
+ next = next->vm_next;
+
+ /* verify some invariant that must be enforced by the caller */
+ VM_WARN_ON(prev && addr <= prev->vm_start);
+ VM_WARN_ON(area && end > area->vm_end);
+ VM_WARN_ON(addr >= end);
+
+ /*
+ * Can it merge with the predecessor?
+ */
+ if (prev && prev->vm_end == addr &&
+ mpol_equal(vma_policy(prev), policy) &&
+ can_vma_merge_after(prev, vm_flags,
+ anon_vma, file, pgoff,
+ vm_userfaultfd_ctx)) {
+ /*
+ * OK, it can. Can we now merge in the successor as well?
+ */
+ if (next && end == next->vm_start &&
+ mpol_equal(policy, vma_policy(next)) &&
+ can_vma_merge_before(next, vm_flags,
+ anon_vma, file,
+ pgoff+pglen,
+ vm_userfaultfd_ctx) &&
+ is_mergeable_anon_vma(prev->anon_vma,
+ next->anon_vma, NULL)) {
+ /* cases 1, 6 */
+ err = __vma_adjust(prev, prev->vm_start,
+ next->vm_end, prev->vm_pgoff, NULL,
+ prev);
+ } else /* cases 2, 5, 7 */
+ err = __vma_adjust(prev, prev->vm_start,
+ end, prev->vm_pgoff, NULL, prev);
+ if (err)
+ return NULL;
+ khugepaged_enter_vma_merge(prev, vm_flags);
+ return prev;
+ }
+
+ /*
+ * Can this new request be merged in front of next?
+ */
+ if (next && end == next->vm_start &&
+ mpol_equal(policy, vma_policy(next)) &&
+ can_vma_merge_before(next, vm_flags,
+ anon_vma, file, pgoff+pglen,
+ vm_userfaultfd_ctx)) {
+ if (prev && addr < prev->vm_end) /* case 4 */
+ err = __vma_adjust(prev, prev->vm_start,
+ addr, prev->vm_pgoff, NULL, next);
+ else { /* cases 3, 8 */
+ err = __vma_adjust(area, addr, next->vm_end,
+ next->vm_pgoff - pglen, NULL, next);
+ /*
+ * In case 3 area is already equal to next and
+ * this is a noop, but in case 8 "area" has
+ * been removed and next was expanded over it.
+ */
+ area = next;
+ }
+ if (err)
+ return NULL;
+ khugepaged_enter_vma_merge(area, vm_flags);
+ return area;
+ }
+
+ return NULL;
+}
+
+/*
+ * Rough compatibility check to quickly see if it's even worth looking
+ * at sharing an anon_vma.
+ *
+ * They need to have the same vm_file, and the flags can only differ
+ * in things that mprotect may change.
+ *
+ * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that
+ * we can merge the two vma's. For example, we refuse to merge a vma if
+ * there is a vm_ops->close() function, because that indicates that the
+ * driver is doing some kind of reference counting. But that doesn't
+ * really matter for the anon_vma sharing case.
+ */
+static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b)
+{
+ return a->vm_end == b->vm_start &&
+ mpol_equal(vma_policy(a), vma_policy(b)) &&
+ a->vm_file == b->vm_file &&
+ !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) &&
+ b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
+}
+
+/*
+ * Do some basic sanity checking to see if we can re-use the anon_vma
+ * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be
+ * the same as 'old', the other will be the new one that is trying
+ * to share the anon_vma.
+ *
+ * NOTE! This runs with mm_sem held for reading, so it is possible that
+ * the anon_vma of 'old' is concurrently in the process of being set up
+ * by another page fault trying to merge _that_. But that's ok: if it
+ * is being set up, that automatically means that it will be a singleton
+ * acceptable for merging, so we can do all of this optimistically. But
+ * we do that READ_ONCE() to make sure that we never re-load the pointer.
+ *
+ * IOW: that the "list_is_singular()" test on the anon_vma_chain only
+ * matters for the 'stable anon_vma' case (ie the thing we want to avoid
+ * is to return an anon_vma that is "complex" due to having gone through
+ * a fork).
+ *
+ * We also make sure that the two vma's are compatible (adjacent,
+ * and with the same memory policies). That's all stable, even with just
+ * a read lock on the mm_sem.
+ */
+static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b)
+{
+ if (anon_vma_compatible(a, b)) {
+ struct anon_vma *anon_vma = READ_ONCE(old->anon_vma);
+
+ if (anon_vma && list_is_singular(&old->anon_vma_chain))
+ return anon_vma;
+ }
+ return NULL;
+}
+
+/*
+ * find_mergeable_anon_vma is used by anon_vma_prepare, to check
+ * neighbouring vmas for a suitable anon_vma, before it goes off
+ * to allocate a new anon_vma. It checks because a repetitive
+ * sequence of mprotects and faults may otherwise lead to distinct
+ * anon_vmas being allocated, preventing vma merge in subsequent
+ * mprotect.
+ */
+struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
+{
+ struct anon_vma *anon_vma = NULL;
+
+ /* Try next first. */
+ if (vma->vm_next) {
+ anon_vma = reusable_anon_vma(vma->vm_next, vma, vma->vm_next);
+ if (anon_vma)
+ return anon_vma;
+ }
+
+ /* Try prev next. */
+ if (vma->vm_prev)
+ anon_vma = reusable_anon_vma(vma->vm_prev, vma->vm_prev, vma);
+
+ /*
+ * We might reach here with anon_vma == NULL if we can't find
+ * any reusable anon_vma.
+ * There's no absolute need to look only at touching neighbours:
+ * we could search further afield for "compatible" anon_vmas.
+ * But it would probably just be a waste of time searching,
+ * or lead to too many vmas hanging off the same anon_vma.
+ * We're trying to allow mprotect remerging later on,
+ * not trying to minimize memory used for anon_vmas.
+ */
+ return anon_vma;
+}
+
+/*
+ * If a hint addr is less than mmap_min_addr change hint to be as
+ * low as possible but still greater than mmap_min_addr
+ */
+static inline unsigned long round_hint_to_min(unsigned long hint)
+{
+ hint &= PAGE_MASK;
+ if (((void *)hint != NULL) &&
+ (hint < mmap_min_addr))
+ return PAGE_ALIGN(mmap_min_addr);
+ return hint;
+}
+
+static inline int mlock_future_check(struct mm_struct *mm,
+ unsigned long flags,
+ unsigned long len)
+{
+ unsigned long locked, lock_limit;
+
+ /* mlock MCL_FUTURE? */
+ if (flags & VM_LOCKED) {
+ locked = len >> PAGE_SHIFT;
+ locked += mm->locked_vm;
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
+ lock_limit >>= PAGE_SHIFT;
+ if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+ return -EAGAIN;
+ }
+ return 0;
+}
+
+static inline u64 file_mmap_size_max(struct file *file, struct inode *inode)
+{
+ if (S_ISREG(inode->i_mode))
+ return MAX_LFS_FILESIZE;
+
+ if (S_ISBLK(inode->i_mode))
+ return MAX_LFS_FILESIZE;
+
+ if (S_ISSOCK(inode->i_mode))
+ return MAX_LFS_FILESIZE;
+
+ /* Special "we do even unsigned file positions" case */
+ if (file->f_mode & FMODE_UNSIGNED_OFFSET)
+ return 0;
+
+ /* Yes, random drivers might want more. But I'm tired of buggy drivers */
+ return ULONG_MAX;
+}
+
+static inline bool file_mmap_ok(struct file *file, struct inode *inode,
+ unsigned long pgoff, unsigned long len)
+{
+ u64 maxsize = file_mmap_size_max(file, inode);
+
+ if (maxsize && len > maxsize)
+ return false;
+ maxsize -= len;
+ if (pgoff > maxsize >> PAGE_SHIFT)
+ return false;
+ return true;
+}
+
+/*
+ * The caller must write-lock current->mm->mmap_lock.
+ */
+unsigned long do_mmap(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long prot,
+ unsigned long flags, unsigned long pgoff,
+ unsigned long *populate, struct list_head *uf)
+{
+ struct mm_struct *mm = current->mm;
+ vm_flags_t vm_flags;
+ int pkey = 0;
+
+ *populate = 0;
+
+ if (!len)
+ return -EINVAL;
+
+ /*
+ * Does the application expect PROT_READ to imply PROT_EXEC?
+ *
+ * (the exception is when the underlying filesystem is noexec
+ * mounted, in which case we dont add PROT_EXEC.)
+ */
+ if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
+ if (!(file && path_noexec(&file->f_path)))
+ prot |= PROT_EXEC;
+
+ /* force arch specific MAP_FIXED handling in get_unmapped_area */
+ if (flags & MAP_FIXED_NOREPLACE)
+ flags |= MAP_FIXED;
+
+ if (!(flags & MAP_FIXED))
+ addr = round_hint_to_min(addr);
+
+ /* Careful about overflows.. */
+ len = PAGE_ALIGN(len);
+ if (!len)
+ return -ENOMEM;
+
+ /* offset overflow? */
+ if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
+ return -EOVERFLOW;
+
+ /* Too many mappings? */
+ if (mm->map_count > sysctl_max_map_count)
+ return -ENOMEM;
+
+ /* Obtain the address to map to. we verify (or select) it and ensure
+ * that it represents a valid section of the address space.
+ */
+ addr = get_unmapped_area(file, addr, len, pgoff, flags);
+ if (IS_ERR_VALUE(addr))
+ return addr;
+
+ if (flags & MAP_FIXED_NOREPLACE) {
+ struct vm_area_struct *vma = find_vma(mm, addr);
+
+ if (vma && vma->vm_start < addr + len)
+ return -EEXIST;
+ }
+
+ if (prot == PROT_EXEC) {
+ pkey = execute_only_pkey(mm);
+ if (pkey < 0)
+ pkey = 0;
+ }
+
+ /* Do simple checking here so the lower-level routines won't have
+ * to. we assume access permissions have been handled by the open
+ * of the memory object, so we don't do any here.
+ */
+ vm_flags = calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
+ mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+
+ if (flags & MAP_LOCKED)
+ if (!can_do_mlock())
+ return -EPERM;
+
+ if (mlock_future_check(mm, vm_flags, len))
+ return -EAGAIN;
+
+ if (file) {
+ struct inode *inode = file_inode(file);
+ unsigned long flags_mask;
+
+ if (!file_mmap_ok(file, inode, pgoff, len))
+ return -EOVERFLOW;
+
+ flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags;
+
+ switch (flags & MAP_TYPE) {
+ case MAP_SHARED:
+ /*
+ * Force use of MAP_SHARED_VALIDATE with non-legacy
+ * flags. E.g. MAP_SYNC is dangerous to use with
+ * MAP_SHARED as you don't know which consistency model
+ * you will get. We silently ignore unsupported flags
+ * with MAP_SHARED to preserve backward compatibility.
+ */
+ flags &= LEGACY_MAP_MASK;
+ fallthrough;
+ case MAP_SHARED_VALIDATE:
+ if (flags & ~flags_mask)
+ return -EOPNOTSUPP;
+ if (prot & PROT_WRITE) {
+ if (!(file->f_mode & FMODE_WRITE))
+ return -EACCES;
+ if (IS_SWAPFILE(file->f_mapping->host))
+ return -ETXTBSY;
+ }
+
+ /*
+ * Make sure we don't allow writing to an append-only
+ * file..
+ */
+ if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
+ return -EACCES;
+
+ /*
+ * Make sure there are no mandatory locks on the file.
+ */
+ if (locks_verify_locked(file))
+ return -EAGAIN;
+
+ vm_flags |= VM_SHARED | VM_MAYSHARE;
+ if (!(file->f_mode & FMODE_WRITE))
+ vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
+ fallthrough;
+ case MAP_PRIVATE:
+ if (!(file->f_mode & FMODE_READ))
+ return -EACCES;
+ if (path_noexec(&file->f_path)) {
+ if (vm_flags & VM_EXEC)
+ return -EPERM;
+ vm_flags &= ~VM_MAYEXEC;
+ }
+
+ if (!file->f_op->mmap)
+ return -ENODEV;
+ if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
+ return -EINVAL;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+ } else {
+ switch (flags & MAP_TYPE) {
+ case MAP_SHARED:
+ if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
+ return -EINVAL;
+ /*
+ * Ignore pgoff.
+ */
+ pgoff = 0;
+ vm_flags |= VM_SHARED | VM_MAYSHARE;
+ break;
+ case MAP_PRIVATE:
+ /*
+ * Set pgoff according to addr for anon_vma.
+ */
+ pgoff = addr >> PAGE_SHIFT;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ /*
+ * Set 'VM_NORESERVE' if we should not account for the
+ * memory use of this mapping.
+ */
+ if (flags & MAP_NORESERVE) {
+ /* We honor MAP_NORESERVE if allowed to overcommit */
+ if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
+ vm_flags |= VM_NORESERVE;
+
+ /* hugetlb applies strict overcommit unless MAP_NORESERVE */
+ if (file && is_file_hugepages(file))
+ vm_flags |= VM_NORESERVE;
+ }
+
+ addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
+ if (!IS_ERR_VALUE(addr) &&
+ ((vm_flags & VM_LOCKED) ||
+ (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
+ *populate = len;
+ return addr;
+}
+
+unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
+ unsigned long prot, unsigned long flags,
+ unsigned long fd, unsigned long pgoff)
+{
+ struct file *file = NULL;
+ unsigned long retval;
+
+ if (!(flags & MAP_ANONYMOUS)) {
+ audit_mmap_fd(fd, flags);
+ file = fget(fd);
+ if (!file)
+ return -EBADF;
+ if (is_file_hugepages(file)) {
+ len = ALIGN(len, huge_page_size(hstate_file(file)));
+ } else if (unlikely(flags & MAP_HUGETLB)) {
+ retval = -EINVAL;
+ goto out_fput;
+ }
+ } else if (flags & MAP_HUGETLB) {
+ struct user_struct *user = NULL;
+ struct hstate *hs;
+
+ hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
+ if (!hs)
+ return -EINVAL;
+
+ len = ALIGN(len, huge_page_size(hs));
+ /*
+ * VM_NORESERVE is used because the reservations will be
+ * taken when vm_ops->mmap() is called
+ * A dummy user value is used because we are not locking
+ * memory so no accounting is necessary
+ */
+ file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
+ VM_NORESERVE,
+ &user, HUGETLB_ANONHUGE_INODE,
+ (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+ }
+
+ flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
+
+ retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+out_fput:
+ if (file)
+ fput(file);
+ return retval;
+}
+
+SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
+ unsigned long, prot, unsigned long, flags,
+ unsigned long, fd, unsigned long, pgoff)
+{
+ return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);
+}
+
+#ifdef __ARCH_WANT_SYS_OLD_MMAP
+struct mmap_arg_struct {
+ unsigned long addr;
+ unsigned long len;
+ unsigned long prot;
+ unsigned long flags;
+ unsigned long fd;
+ unsigned long offset;
+};
+
+SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
+{
+ struct mmap_arg_struct a;
+
+ if (copy_from_user(&a, arg, sizeof(a)))
+ return -EFAULT;
+ if (offset_in_page(a.offset))
+ return -EINVAL;
+
+ return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
+ a.offset >> PAGE_SHIFT);
+}
+#endif /* __ARCH_WANT_SYS_OLD_MMAP */
+
+/*
+ * Some shared mappings will want the pages marked read-only
+ * to track write events. If so, we'll downgrade vm_page_prot
+ * to the private version (using protection_map[] without the
+ * VM_SHARED bit).
+ */
+int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
+{
+ vm_flags_t vm_flags = vma->vm_flags;
+ const struct vm_operations_struct *vm_ops = vma->vm_ops;
+
+ /* If it was private or non-writable, the write bit is already clear */
+ if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
+ return 0;
+
+ /* The backer wishes to know when pages are first written to? */
+ if (vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite))
+ return 1;
+
+ /* The open routine did something to the protections that pgprot_modify
+ * won't preserve? */
+ if (pgprot_val(vm_page_prot) !=
+ pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags)))
+ return 0;
+
+ /*
+ * Do we need to track softdirty? hugetlb does not support softdirty
+ * tracking yet.
+ */
+ if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY) &&
+ !is_vm_hugetlb_page(vma))
+ return 1;
+
+ /* Specialty mapping? */
+ if (vm_flags & VM_PFNMAP)
+ return 0;
+
+ /* Can the mapping track the dirty pages? */
+ return vma->vm_file && vma->vm_file->f_mapping &&
+ mapping_can_writeback(vma->vm_file->f_mapping);
+}
+
+/*
+ * We account for memory if it's a private writeable mapping,
+ * not hugepages and VM_NORESERVE wasn't set.
+ */
+static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
+{
+ /*
+ * hugetlb has its own accounting separate from the core VM
+ * VM_HUGETLB may not be set yet so we cannot check for that flag.
+ */
+ if (file && is_file_hugepages(file))
+ return 0;
+
+ return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
+}
+
+unsigned long mmap_region(struct file *file, unsigned long addr,
+ unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
+ struct list_head *uf)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma, *prev, *merge;
+ int error;
+ struct rb_node **rb_link, *rb_parent;
+ unsigned long charged = 0;
+
+ /* Check against address space limit. */
+ if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
+ unsigned long nr_pages;
+
+ /*
+ * MAP_FIXED may remove pages of mappings that intersects with
+ * requested mapping. Account for the pages it would unmap.
+ */
+ nr_pages = count_vma_pages_range(mm, addr, addr + len);
+
+ if (!may_expand_vm(mm, vm_flags,
+ (len >> PAGE_SHIFT) - nr_pages))
+ return -ENOMEM;
+ }
+
+ /* Clear old maps, set up prev, rb_link, rb_parent, and uf */
+ if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
+ return -ENOMEM;
+ /*
+ * Private writable mapping: check memory availability
+ */
+ if (accountable_mapping(file, vm_flags)) {
+ charged = len >> PAGE_SHIFT;
+ if (security_vm_enough_memory_mm(mm, charged))
+ return -ENOMEM;
+ vm_flags |= VM_ACCOUNT;
+ }
+
+ /*
+ * Can we just expand an old mapping?
+ */
+ vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
+ NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
+ if (vma)
+ goto out;
+
+ /*
+ * Determine the object being mapped and call the appropriate
+ * specific mapper. the address has already been validated, but
+ * not unmapped, but the maps are removed from the list.
+ */
+ vma = vm_area_alloc(mm);
+ if (!vma) {
+ error = -ENOMEM;
+ goto unacct_error;
+ }
+
+ vma->vm_start = addr;
+ vma->vm_end = addr + len;
+ vma->vm_flags = vm_flags;
+ vma->vm_page_prot = vm_get_page_prot(vm_flags);
+ vma->vm_pgoff = pgoff;
+
+ if (file) {
+ if (vm_flags & VM_DENYWRITE) {
+ error = deny_write_access(file);
+ if (error)
+ goto free_vma;
+ }
+ if (vm_flags & VM_SHARED) {
+ error = mapping_map_writable(file->f_mapping);
+ if (error)
+ goto allow_write_and_free_vma;
+ }
+
+ /* ->mmap() can change vma->vm_file, but must guarantee that
+ * vma_link() below can deny write-access if VM_DENYWRITE is set
+ * and map writably if VM_SHARED is set. This usually means the
+ * new file must not have been exposed to user-space, yet.
+ */
+ vma->vm_file = get_file(file);
+ error = call_mmap(file, vma);
+ if (error)
+ goto unmap_and_free_vma;
+
+ /* Can addr have changed??
+ *
+ * Answer: Yes, several device drivers can do it in their
+ * f_op->mmap method. -DaveM
+ * Bug: If addr is changed, prev, rb_link, rb_parent should
+ * be updated for vma_link()
+ */
+ WARN_ON_ONCE(addr != vma->vm_start);
+
+ addr = vma->vm_start;
+
+ /* If vm_flags changed after call_mmap(), we should try merge vma again
+ * as we may succeed this time.
+ */
+ if (unlikely(vm_flags != vma->vm_flags && prev)) {
+ merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
+ NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX);
+ if (merge) {
+ /* ->mmap() can change vma->vm_file and fput the original file. So
+ * fput the vma->vm_file here or we would add an extra fput for file
+ * and cause general protection fault ultimately.
+ */
+ fput(vma->vm_file);
+ vm_area_free(vma);
+ vma = merge;
+ /* Update vm_flags to pick up the change. */
+ vm_flags = vma->vm_flags;
+ goto unmap_writable;
+ }
+ }
+
+ vm_flags = vma->vm_flags;
+ } else if (vm_flags & VM_SHARED) {
+ error = shmem_zero_setup(vma);
+ if (error)
+ goto free_vma;
+ } else {
+ vma_set_anonymous(vma);
+ }
+
+ /* Allow architectures to sanity-check the vm_flags */
+ if (!arch_validate_flags(vma->vm_flags)) {
+ error = -EINVAL;
+ if (file)
+ goto close_and_free_vma;
+ else
+ goto free_vma;
+ }
+
+ vma_link(mm, vma, prev, rb_link, rb_parent);
+ /* Once vma denies write, undo our temporary denial count */
+ if (file) {
+unmap_writable:
+ if (vm_flags & VM_SHARED)
+ mapping_unmap_writable(file->f_mapping);
+ if (vm_flags & VM_DENYWRITE)
+ allow_write_access(file);
+ }
+ file = vma->vm_file;
+out:
+ perf_event_mmap(vma);
+
+ vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
+ if (vm_flags & VM_LOCKED) {
+ if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
+ is_vm_hugetlb_page(vma) ||
+ vma == get_gate_vma(current->mm))
+ vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+ else
+ mm->locked_vm += (len >> PAGE_SHIFT);
+ }
+
+ if (file)
+ uprobe_mmap(vma);
+
+ /*
+ * New (or expanded) vma always get soft dirty status.
+ * Otherwise user-space soft-dirty page tracker won't
+ * be able to distinguish situation when vma area unmapped,
+ * then new mapped in-place (which must be aimed as
+ * a completely new data area).
+ */
+ vma->vm_flags |= VM_SOFTDIRTY;
+
+ vma_set_page_prot(vma);
+
+ return addr;
+
+close_and_free_vma:
+ if (vma->vm_ops && vma->vm_ops->close)
+ vma->vm_ops->close(vma);
+unmap_and_free_vma:
+ vma->vm_file = NULL;
+ fput(file);
+
+ /* Undo any partial mapping done by a device driver. */
+ unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
+ if (vm_flags & VM_SHARED)
+ mapping_unmap_writable(file->f_mapping);
+allow_write_and_free_vma:
+ if (vm_flags & VM_DENYWRITE)
+ allow_write_access(file);
+free_vma:
+ vm_area_free(vma);
+unacct_error:
+ if (charged)
+ vm_unacct_memory(charged);
+ return error;
+}
+
+static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
+{
+ /*
+ * We implement the search by looking for an rbtree node that
+ * immediately follows a suitable gap. That is,
+ * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;
+ * - gap_end = vma->vm_start >= info->low_limit + length;
+ * - gap_end - gap_start >= length
+ */
+
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long length, low_limit, high_limit, gap_start, gap_end;
+
+ /* Adjust search length to account for worst case alignment overhead */
+ length = info->length + info->align_mask;
+ if (length < info->length)
+ return -ENOMEM;
+
+ /* Adjust search limits by the desired length */
+ if (info->high_limit < length)
+ return -ENOMEM;
+ high_limit = info->high_limit - length;
+
+ if (info->low_limit > high_limit)
+ return -ENOMEM;
+ low_limit = info->low_limit + length;
+
+ /* Check if rbtree root looks promising */
+ if (RB_EMPTY_ROOT(&mm->mm_rb))
+ goto check_highest;
+ vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
+ if (vma->rb_subtree_gap < length)
+ goto check_highest;
+
+ while (true) {
+ /* Visit left subtree if it looks promising */
+ gap_end = vm_start_gap(vma);
+ if (gap_end >= low_limit && vma->vm_rb.rb_left) {
+ struct vm_area_struct *left =
+ rb_entry(vma->vm_rb.rb_left,
+ struct vm_area_struct, vm_rb);
+ if (left->rb_subtree_gap >= length) {
+ vma = left;
+ continue;
+ }
+ }
+
+ gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
+check_current:
+ /* Check if current node has a suitable gap */
+ if (gap_start > high_limit)
+ return -ENOMEM;
+ if (gap_end >= low_limit &&
+ gap_end > gap_start && gap_end - gap_start >= length)
+ goto found;
+
+ /* Visit right subtree if it looks promising */
+ if (vma->vm_rb.rb_right) {
+ struct vm_area_struct *right =
+ rb_entry(vma->vm_rb.rb_right,
+ struct vm_area_struct, vm_rb);
+ if (right->rb_subtree_gap >= length) {
+ vma = right;
+ continue;
+ }
+ }
+
+ /* Go back up the rbtree to find next candidate node */
+ while (true) {
+ struct rb_node *prev = &vma->vm_rb;
+ if (!rb_parent(prev))
+ goto check_highest;
+ vma = rb_entry(rb_parent(prev),
+ struct vm_area_struct, vm_rb);
+ if (prev == vma->vm_rb.rb_left) {
+ gap_start = vm_end_gap(vma->vm_prev);
+ gap_end = vm_start_gap(vma);
+ goto check_current;
+ }
+ }
+ }
+
+check_highest:
+ /* Check highest gap, which does not precede any rbtree node */
+ gap_start = mm->highest_vm_end;
+ gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */
+ if (gap_start > high_limit)
+ return -ENOMEM;
+
+found:
+ /* We found a suitable gap. Clip it with the original low_limit. */
+ if (gap_start < info->low_limit)
+ gap_start = info->low_limit;
+
+ /* Adjust gap address to the desired alignment */
+ gap_start += (info->align_offset - gap_start) & info->align_mask;
+
+ VM_BUG_ON(gap_start + info->length > info->high_limit);
+ VM_BUG_ON(gap_start + info->length > gap_end);
+ return gap_start;
+}
+
+static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long length, low_limit, high_limit, gap_start, gap_end;
+
+ /* Adjust search length to account for worst case alignment overhead */
+ length = info->length + info->align_mask;
+ if (length < info->length)
+ return -ENOMEM;
+
+ /*
+ * Adjust search limits by the desired length.
+ * See implementation comment at top of unmapped_area().
+ */
+ gap_end = info->high_limit;
+ if (gap_end < length)
+ return -ENOMEM;
+ high_limit = gap_end - length;
+
+ if (info->low_limit > high_limit)
+ return -ENOMEM;
+ low_limit = info->low_limit + length;
+
+ /* Check highest gap, which does not precede any rbtree node */
+ gap_start = mm->highest_vm_end;
+ if (gap_start <= high_limit)
+ goto found_highest;
+
+ /* Check if rbtree root looks promising */
+ if (RB_EMPTY_ROOT(&mm->mm_rb))
+ return -ENOMEM;
+ vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
+ if (vma->rb_subtree_gap < length)
+ return -ENOMEM;
+
+ while (true) {
+ /* Visit right subtree if it looks promising */
+ gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
+ if (gap_start <= high_limit && vma->vm_rb.rb_right) {
+ struct vm_area_struct *right =
+ rb_entry(vma->vm_rb.rb_right,
+ struct vm_area_struct, vm_rb);
+ if (right->rb_subtree_gap >= length) {
+ vma = right;
+ continue;
+ }
+ }
+
+check_current:
+ /* Check if current node has a suitable gap */
+ gap_end = vm_start_gap(vma);
+ if (gap_end < low_limit)
+ return -ENOMEM;
+ if (gap_start <= high_limit &&
+ gap_end > gap_start && gap_end - gap_start >= length)
+ goto found;
+
+ /* Visit left subtree if it looks promising */
+ if (vma->vm_rb.rb_left) {
+ struct vm_area_struct *left =
+ rb_entry(vma->vm_rb.rb_left,
+ struct vm_area_struct, vm_rb);
+ if (left->rb_subtree_gap >= length) {
+ vma = left;
+ continue;
+ }
+ }
+
+ /* Go back up the rbtree to find next candidate node */
+ while (true) {
+ struct rb_node *prev = &vma->vm_rb;
+ if (!rb_parent(prev))
+ return -ENOMEM;
+ vma = rb_entry(rb_parent(prev),
+ struct vm_area_struct, vm_rb);
+ if (prev == vma->vm_rb.rb_right) {
+ gap_start = vma->vm_prev ?
+ vm_end_gap(vma->vm_prev) : 0;
+ goto check_current;
+ }
+ }
+ }
+
+found:
+ /* We found a suitable gap. Clip it with the original high_limit. */
+ if (gap_end > info->high_limit)
+ gap_end = info->high_limit;
+
+found_highest:
+ /* Compute highest gap address at the desired alignment */
+ gap_end -= info->length;
+ gap_end -= (gap_end - info->align_offset) & info->align_mask;
+
+ VM_BUG_ON(gap_end < info->low_limit);
+ VM_BUG_ON(gap_end < gap_start);
+ return gap_end;
+}
+
+/*
+ * Search for an unmapped address range.
+ *
+ * We are looking for a range that:
+ * - does not intersect with any VMA;
+ * - is contained within the [low_limit, high_limit) interval;
+ * - is at least the desired size.
+ * - satisfies (begin_addr & align_mask) == (align_offset & align_mask)
+ */
+unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info)
+{
+ unsigned long addr;
+
+ if (info->flags & VM_UNMAPPED_AREA_TOPDOWN)
+ addr = unmapped_area_topdown(info);
+ else
+ addr = unmapped_area(info);
+
+ trace_vm_unmapped_area(addr, info);
+ return addr;
+}
+
+/* Get an address range which is currently unmapped.
+ * For shmat() with addr=0.
+ *
+ * Ugly calling convention alert:
+ * Return value with the low bits set means error value,
+ * ie
+ * if (ret & ~PAGE_MASK)
+ * error = ret;
+ *
+ * This function "knows" that -ENOMEM has the bits set.
+ */
+#ifndef HAVE_ARCH_UNMAPPED_AREA
+unsigned long
+arch_get_unmapped_area(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma, *prev;
+ struct vm_unmapped_area_info info;
+ const unsigned long mmap_end = arch_get_mmap_end(addr);
+
+ if (len > mmap_end - mmap_min_addr)
+ return -ENOMEM;
+
+ if (flags & MAP_FIXED)
+ return addr;
+
+ if (addr) {
+ addr = PAGE_ALIGN(addr);
+ vma = find_vma_prev(mm, addr, &prev);
+ if (mmap_end - len >= addr && addr >= mmap_min_addr &&
+ (!vma || addr + len <= vm_start_gap(vma)) &&
+ (!prev || addr >= vm_end_gap(prev)))
+ return addr;
+ }
+
+ info.flags = 0;
+ info.length = len;
+ info.low_limit = mm->mmap_base;
+ info.high_limit = mmap_end;
+ info.align_mask = 0;
+ info.align_offset = 0;
+ return vm_unmapped_area(&info);
+}
+#endif
+
+/*
+ * This mmap-allocator allocates new areas top-down from below the
+ * stack's low limit (the base):
+ */
+#ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
+unsigned long
+arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ struct vm_area_struct *vma, *prev;
+ struct mm_struct *mm = current->mm;
+ struct vm_unmapped_area_info info;
+ const unsigned long mmap_end = arch_get_mmap_end(addr);
+
+ /* requested length too big for entire address space */
+ if (len > mmap_end - mmap_min_addr)
+ return -ENOMEM;
+
+ if (flags & MAP_FIXED)
+ return addr;
+
+ /* requesting a specific address */
+ if (addr) {
+ addr = PAGE_ALIGN(addr);
+ vma = find_vma_prev(mm, addr, &prev);
+ if (mmap_end - len >= addr && addr >= mmap_min_addr &&
+ (!vma || addr + len <= vm_start_gap(vma)) &&
+ (!prev || addr >= vm_end_gap(prev)))
+ return addr;
+ }
+
+ info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ info.length = len;
+ info.low_limit = max(PAGE_SIZE, mmap_min_addr);
+ info.high_limit = arch_get_mmap_base(addr, mm->mmap_base);
+ info.align_mask = 0;
+ info.align_offset = 0;
+ addr = vm_unmapped_area(&info);
+
+ /*
+ * A failed mmap() very likely causes application failure,
+ * so fall back to the bottom-up function here. This scenario
+ * can happen with large stack limits and large mmap()
+ * allocations.
+ */
+ if (offset_in_page(addr)) {
+ VM_BUG_ON(addr != -ENOMEM);
+ info.flags = 0;
+ info.low_limit = TASK_UNMAPPED_BASE;
+ info.high_limit = mmap_end;
+ addr = vm_unmapped_area(&info);
+ }
+
+ return addr;
+}
+#endif
+
+unsigned long
+get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags)
+{
+ unsigned long (*get_area)(struct file *, unsigned long,
+ unsigned long, unsigned long, unsigned long);
+
+ unsigned long error = arch_mmap_check(addr, len, flags);
+ if (error)
+ return error;
+
+ /* Careful about overflows.. */
+ if (len > TASK_SIZE)
+ return -ENOMEM;
+
+ get_area = current->mm->get_unmapped_area;
+ if (file) {
+ if (file->f_op->get_unmapped_area)
+ get_area = file->f_op->get_unmapped_area;
+ } else if (flags & MAP_SHARED) {
+ /*
+ * mmap_region() will call shmem_zero_setup() to create a file,
+ * so use shmem's get_unmapped_area in case it can be huge.
+ * do_mmap() will clear pgoff, so match alignment.
+ */
+ pgoff = 0;
+ get_area = shmem_get_unmapped_area;
+ }
+
+ addr = get_area(file, addr, len, pgoff, flags);
+ if (IS_ERR_VALUE(addr))
+ return addr;
+
+ if (addr > TASK_SIZE - len)
+ return -ENOMEM;
+ if (offset_in_page(addr))
+ return -EINVAL;
+
+ error = security_mmap_addr(addr);
+ return error ? error : addr;
+}
+
+EXPORT_SYMBOL(get_unmapped_area);
+
+/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
+struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+{
+ struct rb_node *rb_node;
+ struct vm_area_struct *vma;
+
+ /* Check the cache first. */
+ vma = vmacache_find(mm, addr);
+ if (likely(vma))
+ return vma;
+
+ rb_node = mm->mm_rb.rb_node;
+
+ while (rb_node) {
+ struct vm_area_struct *tmp;
+
+ tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
+
+ if (tmp->vm_end > addr) {
+ vma = tmp;
+ if (tmp->vm_start <= addr)
+ break;
+ rb_node = rb_node->rb_left;
+ } else
+ rb_node = rb_node->rb_right;
+ }
+
+ if (vma)
+ vmacache_update(addr, vma);
+ return vma;
+}
+
+EXPORT_SYMBOL(find_vma);
+
+/*
+ * Same as find_vma, but also return a pointer to the previous VMA in *pprev.
+ */
+struct vm_area_struct *
+find_vma_prev(struct mm_struct *mm, unsigned long addr,
+ struct vm_area_struct **pprev)
+{
+ struct vm_area_struct *vma;
+
+ vma = find_vma(mm, addr);
+ if (vma) {
+ *pprev = vma->vm_prev;
+ } else {
+ struct rb_node *rb_node = rb_last(&mm->mm_rb);
+
+ *pprev = rb_node ? rb_entry(rb_node, struct vm_area_struct, vm_rb) : NULL;
+ }
+ return vma;
+}
+
+/*
+ * Verify that the stack growth is acceptable and
+ * update accounting. This is shared with both the
+ * grow-up and grow-down cases.
+ */
+static int acct_stack_growth(struct vm_area_struct *vma,
+ unsigned long size, unsigned long grow)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long new_start;
+
+ /* address space limit tests */
+ if (!may_expand_vm(mm, vma->vm_flags, grow))
+ return -ENOMEM;
+
+ /* Stack limit test */
+ if (size > rlimit(RLIMIT_STACK))
+ return -ENOMEM;
+
+ /* mlock limit tests */
+ if (vma->vm_flags & VM_LOCKED) {
+ unsigned long locked;
+ unsigned long limit;
+ locked = mm->locked_vm + grow;
+ limit = rlimit(RLIMIT_MEMLOCK);
+ limit >>= PAGE_SHIFT;
+ if (locked > limit && !capable(CAP_IPC_LOCK))
+ return -ENOMEM;
+ }
+
+ /* Check to ensure the stack will not grow into a hugetlb-only region */
+ new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
+ vma->vm_end - size;
+ if (is_hugepage_only_range(vma->vm_mm, new_start, size))
+ return -EFAULT;
+
+ /*
+ * Overcommit.. This must be the final test, as it will
+ * update security statistics.
+ */
+ if (security_vm_enough_memory_mm(mm, grow))
+ return -ENOMEM;
+
+ return 0;
+}
+
+#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
+/*
+ * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
+ * vma is the last one with address > vma->vm_end. Have to extend vma.
+ */
+int expand_upwards(struct vm_area_struct *vma, unsigned long address)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *next;
+ unsigned long gap_addr;
+ int error = 0;
+
+ if (!(vma->vm_flags & VM_GROWSUP))
+ return -EFAULT;
+
+ /* Guard against exceeding limits of the address space. */
+ address &= PAGE_MASK;
+ if (address >= (TASK_SIZE & PAGE_MASK))
+ return -ENOMEM;
+ address += PAGE_SIZE;
+
+ /* Enforce stack_guard_gap */
+ gap_addr = address + stack_guard_gap;
+
+ /* Guard against overflow */
+ if (gap_addr < address || gap_addr > TASK_SIZE)
+ gap_addr = TASK_SIZE;
+
+ next = vma->vm_next;
+ if (next && next->vm_start < gap_addr && vma_is_accessible(next)) {
+ if (!(next->vm_flags & VM_GROWSUP))
+ return -ENOMEM;
+ /* Check that both stack segments have the same anon_vma? */
+ }
+
+ /* We must make sure the anon_vma is allocated. */
+ if (unlikely(anon_vma_prepare(vma)))
+ return -ENOMEM;
+
+ /*
+ * vma->vm_start/vm_end cannot change under us because the caller
+ * is required to hold the mmap_lock in read mode. We need the
+ * anon_vma lock to serialize against concurrent expand_stacks.
+ */
+ anon_vma_lock_write(vma->anon_vma);
+
+ /* Somebody else might have raced and expanded it already */
+ if (address > vma->vm_end) {
+ unsigned long size, grow;
+
+ size = address - vma->vm_start;
+ grow = (address - vma->vm_end) >> PAGE_SHIFT;
+
+ error = -ENOMEM;
+ if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
+ error = acct_stack_growth(vma, size, grow);
+ if (!error) {
+ /*
+ * vma_gap_update() doesn't support concurrent
+ * updates, but we only hold a shared mmap_lock
+ * lock here, so we need to protect against
+ * concurrent vma expansions.
+ * anon_vma_lock_write() doesn't help here, as
+ * we don't guarantee that all growable vmas
+ * in a mm share the same root anon vma.
+ * So, we reuse mm->page_table_lock to guard
+ * against concurrent vma expansions.
+ */
+ spin_lock(&mm->page_table_lock);
+ if (vma->vm_flags & VM_LOCKED)
+ mm->locked_vm += grow;
+ vm_stat_account(mm, vma->vm_flags, grow);
+ anon_vma_interval_tree_pre_update_vma(vma);
+ vma->vm_end = address;
+ anon_vma_interval_tree_post_update_vma(vma);
+ if (vma->vm_next)
+ vma_gap_update(vma->vm_next);
+ else
+ mm->highest_vm_end = vm_end_gap(vma);
+ spin_unlock(&mm->page_table_lock);
+
+ perf_event_mmap(vma);
+ }
+ }
+ }
+ anon_vma_unlock_write(vma->anon_vma);
+ khugepaged_enter_vma_merge(vma, vma->vm_flags);
+ validate_mm(mm);
+ return error;
+}
+#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
+
+/*
+ * vma is the first one with address < vma->vm_start. Have to extend vma.
+ */
+int expand_downwards(struct vm_area_struct *vma,
+ unsigned long address)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *prev;
+ int error = 0;
+
+ address &= PAGE_MASK;
+ if (address < mmap_min_addr)
+ return -EPERM;
+
+ /* Enforce stack_guard_gap */
+ prev = vma->vm_prev;
+ /* Check that both stack segments have the same anon_vma? */
+ if (prev && !(prev->vm_flags & VM_GROWSDOWN) &&
+ vma_is_accessible(prev)) {
+ if (address - prev->vm_end < stack_guard_gap)
+ return -ENOMEM;
+ }
+
+ /* We must make sure the anon_vma is allocated. */
+ if (unlikely(anon_vma_prepare(vma)))
+ return -ENOMEM;
+
+ /*
+ * vma->vm_start/vm_end cannot change under us because the caller
+ * is required to hold the mmap_lock in read mode. We need the
+ * anon_vma lock to serialize against concurrent expand_stacks.
+ */
+ anon_vma_lock_write(vma->anon_vma);
+
+ /* Somebody else might have raced and expanded it already */
+ if (address < vma->vm_start) {
+ unsigned long size, grow;
+
+ size = vma->vm_end - address;
+ grow = (vma->vm_start - address) >> PAGE_SHIFT;
+
+ error = -ENOMEM;
+ if (grow <= vma->vm_pgoff) {
+ error = acct_stack_growth(vma, size, grow);
+ if (!error) {
+ /*
+ * vma_gap_update() doesn't support concurrent
+ * updates, but we only hold a shared mmap_lock
+ * lock here, so we need to protect against
+ * concurrent vma expansions.
+ * anon_vma_lock_write() doesn't help here, as
+ * we don't guarantee that all growable vmas
+ * in a mm share the same root anon vma.
+ * So, we reuse mm->page_table_lock to guard
+ * against concurrent vma expansions.
+ */
+ spin_lock(&mm->page_table_lock);
+ if (vma->vm_flags & VM_LOCKED)
+ mm->locked_vm += grow;
+ vm_stat_account(mm, vma->vm_flags, grow);
+ anon_vma_interval_tree_pre_update_vma(vma);
+ vma->vm_start = address;
+ vma->vm_pgoff -= grow;
+ anon_vma_interval_tree_post_update_vma(vma);
+ vma_gap_update(vma);
+ spin_unlock(&mm->page_table_lock);
+
+ perf_event_mmap(vma);
+ }
+ }
+ }
+ anon_vma_unlock_write(vma->anon_vma);
+ khugepaged_enter_vma_merge(vma, vma->vm_flags);
+ validate_mm(mm);
+ return error;
+}
+
+/* enforced gap between the expanding stack and other mappings. */
+unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;
+
+static int __init cmdline_parse_stack_guard_gap(char *p)
+{
+ unsigned long val;
+ char *endptr;
+
+ val = simple_strtoul(p, &endptr, 10);
+ if (!*endptr)
+ stack_guard_gap = val << PAGE_SHIFT;
+
+ return 1;
+}
+__setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);
+
+#ifdef CONFIG_STACK_GROWSUP
+int expand_stack(struct vm_area_struct *vma, unsigned long address)
+{
+ return expand_upwards(vma, address);
+}
+
+struct vm_area_struct *
+find_extend_vma(struct mm_struct *mm, unsigned long addr)
+{
+ struct vm_area_struct *vma, *prev;
+
+ addr &= PAGE_MASK;
+ vma = find_vma_prev(mm, addr, &prev);
+ if (vma && (vma->vm_start <= addr))
+ return vma;
+ /* don't alter vm_end if the coredump is running */
+ if (!prev || expand_stack(prev, addr))
+ return NULL;
+ if (prev->vm_flags & VM_LOCKED)
+ populate_vma_page_range(prev, addr, prev->vm_end, NULL);
+ return prev;
+}
+#else
+int expand_stack(struct vm_area_struct *vma, unsigned long address)
+{
+ return expand_downwards(vma, address);
+}
+
+struct vm_area_struct *
+find_extend_vma(struct mm_struct *mm, unsigned long addr)
+{
+ struct vm_area_struct *vma;
+ unsigned long start;
+
+ addr &= PAGE_MASK;
+ vma = find_vma(mm, addr);
+ if (!vma)
+ return NULL;
+ if (vma->vm_start <= addr)
+ return vma;
+ if (!(vma->vm_flags & VM_GROWSDOWN))
+ return NULL;
+ start = vma->vm_start;
+ if (expand_stack(vma, addr))
+ return NULL;
+ if (vma->vm_flags & VM_LOCKED)
+ populate_vma_page_range(vma, addr, start, NULL);
+ return vma;
+}
+#endif
+
+EXPORT_SYMBOL_GPL(find_extend_vma);
+
+/*
+ * Ok - we have the memory areas we should free on the vma list,
+ * so release them, and do the vma updates.
+ *
+ * Called with the mm semaphore held.
+ */
+static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+ unsigned long nr_accounted = 0;
+
+ /* Update high watermark before we lower total_vm */
+ update_hiwater_vm(mm);
+ do {
+ long nrpages = vma_pages(vma);
+
+ if (vma->vm_flags & VM_ACCOUNT)
+ nr_accounted += nrpages;
+ vm_stat_account(mm, vma->vm_flags, -nrpages);
+ vma = remove_vma(vma);
+ } while (vma);
+ vm_unacct_memory(nr_accounted);
+ validate_mm(mm);
+}
+
+/*
+ * Get rid of page table information in the indicated region.
+ *
+ * Called with the mm semaphore held.
+ */
+static void unmap_region(struct mm_struct *mm,
+ struct vm_area_struct *vma, struct vm_area_struct *prev,
+ unsigned long start, unsigned long end)
+{
+ struct vm_area_struct *next = vma_next(mm, prev);
+ struct mmu_gather tlb;
+ struct vm_area_struct *cur_vma;
+
+ lru_add_drain();
+ tlb_gather_mmu(&tlb, mm, start, end);
+ update_hiwater_rss(mm);
+ unmap_vmas(&tlb, vma, start, end);
+
+ /*
+ * Ensure we have no stale TLB entries by the time this mapping is
+ * removed from the rmap.
+ * Note that we don't have to worry about nested flushes here because
+ * we're holding the mm semaphore for removing the mapping - so any
+ * concurrent flush in this region has to be coming through the rmap,
+ * and we synchronize against that using the rmap lock.
+ */
+ for (cur_vma = vma; cur_vma; cur_vma = cur_vma->vm_next) {
+ if ((cur_vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) != 0) {
+ tlb_flush_mmu(&tlb);
+ break;
+ }
+ }
+
+ free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
+ next ? next->vm_start : USER_PGTABLES_CEILING);
+ tlb_finish_mmu(&tlb, start, end);
+}
+
+/*
+ * Create a list of vma's touched by the unmap, removing them from the mm's
+ * vma list as we go..
+ */
+static bool
+detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct vm_area_struct *prev, unsigned long end)
+{
+ struct vm_area_struct **insertion_point;
+ struct vm_area_struct *tail_vma = NULL;
+
+ insertion_point = (prev ? &prev->vm_next : &mm->mmap);
+ vma->vm_prev = NULL;
+ do {
+ vma_rb_erase(vma, &mm->mm_rb);
+ mm->map_count--;
+ tail_vma = vma;
+ vma = vma->vm_next;
+ } while (vma && vma->vm_start < end);
+ *insertion_point = vma;
+ if (vma) {
+ vma->vm_prev = prev;
+ vma_gap_update(vma);
+ } else
+ mm->highest_vm_end = prev ? vm_end_gap(prev) : 0;
+ tail_vma->vm_next = NULL;
+
+ /* Kill the cache */
+ vmacache_invalidate(mm);
+
+ /*
+ * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or
+ * VM_GROWSUP VMA. Such VMAs can change their size under
+ * down_read(mmap_lock) and collide with the VMA we are about to unmap.
+ */
+ if (vma && (vma->vm_flags & VM_GROWSDOWN))
+ return false;
+ if (prev && (prev->vm_flags & VM_GROWSUP))
+ return false;
+ return true;
+}
+
+/*
+ * __split_vma() bypasses sysctl_max_map_count checking. We use this where it
+ * has already been checked or doesn't make sense to fail.
+ */
+int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, int new_below)
+{
+ struct vm_area_struct *new;
+ int err;
+
+ if (vma->vm_ops && vma->vm_ops->split) {
+ err = vma->vm_ops->split(vma, addr);
+ if (err)
+ return err;
+ }
+
+ new = vm_area_dup(vma);
+ if (!new)
+ return -ENOMEM;
+
+ if (new_below)
+ new->vm_end = addr;
+ else {
+ new->vm_start = addr;
+ new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
+ }
+
+ err = vma_dup_policy(vma, new);
+ if (err)
+ goto out_free_vma;
+
+ err = anon_vma_clone(new, vma);
+ if (err)
+ goto out_free_mpol;
+
+ if (new->vm_file)
+ get_file(new->vm_file);
+
+ if (new->vm_ops && new->vm_ops->open)
+ new->vm_ops->open(new);
+
+ if (new_below)
+ err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
+ ((addr - new->vm_start) >> PAGE_SHIFT), new);
+ else
+ err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
+
+ /* Success. */
+ if (!err)
+ return 0;
+
+ /* Clean everything up if vma_adjust failed. */
+ if (new->vm_ops && new->vm_ops->close)
+ new->vm_ops->close(new);
+ if (new->vm_file)
+ fput(new->vm_file);
+ unlink_anon_vmas(new);
+ out_free_mpol:
+ mpol_put(vma_policy(new));
+ out_free_vma:
+ vm_area_free(new);
+ return err;
+}
+
+/*
+ * Split a vma into two pieces at address 'addr', a new vma is allocated
+ * either for the first part or the tail.
+ */
+int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, int new_below)
+{
+ if (mm->map_count >= sysctl_max_map_count)
+ return -ENOMEM;
+
+ return __split_vma(mm, vma, addr, new_below);
+}
+
+/* Munmap is split into 2 main parts -- this part which finds
+ * what needs doing, and the areas themselves, which do the
+ * work. This now handles partial unmappings.
+ * Jeremy Fitzhardinge <jeremy@goop.org>
+ */
+int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
+ struct list_head *uf, bool downgrade)
+{
+ unsigned long end;
+ struct vm_area_struct *vma, *prev, *last;
+
+ if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
+ return -EINVAL;
+
+ len = PAGE_ALIGN(len);
+ end = start + len;
+ if (len == 0)
+ return -EINVAL;
+
+ /*
+ * arch_unmap() might do unmaps itself. It must be called
+ * and finish any rbtree manipulation before this code
+ * runs and also starts to manipulate the rbtree.
+ */
+ arch_unmap(mm, start, end);
+
+ /* Find the first overlapping VMA */
+ vma = find_vma(mm, start);
+ if (!vma)
+ return 0;
+ prev = vma->vm_prev;
+ /* we have start < vma->vm_end */
+
+ /* if it doesn't overlap, we have nothing.. */
+ if (vma->vm_start >= end)
+ return 0;
+
+ /*
+ * If we need to split any vma, do it now to save pain later.
+ *
+ * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
+ * unmapped vm_area_struct will remain in use: so lower split_vma
+ * places tmp vma above, and higher split_vma places tmp vma below.
+ */
+ if (start > vma->vm_start) {
+ int error;
+
+ /*
+ * Make sure that map_count on return from munmap() will
+ * not exceed its limit; but let map_count go just above
+ * its limit temporarily, to help free resources as expected.
+ */
+ if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
+ return -ENOMEM;
+
+ error = __split_vma(mm, vma, start, 0);
+ if (error)
+ return error;
+ prev = vma;
+ }
+
+ /* Does it split the last one? */
+ last = find_vma(mm, end);
+ if (last && end > last->vm_start) {
+ int error = __split_vma(mm, last, end, 1);
+ if (error)
+ return error;
+ }
+ vma = vma_next(mm, prev);
+
+ if (unlikely(uf)) {
+ /*
+ * If userfaultfd_unmap_prep returns an error the vmas
+ * will remain splitted, but userland will get a
+ * highly unexpected error anyway. This is no
+ * different than the case where the first of the two
+ * __split_vma fails, but we don't undo the first
+ * split, despite we could. This is unlikely enough
+ * failure that it's not worth optimizing it for.
+ */
+ int error = userfaultfd_unmap_prep(vma, start, end, uf);
+ if (error)
+ return error;
+ }
+
+ /*
+ * unlock any mlock()ed ranges before detaching vmas
+ */
+ if (mm->locked_vm) {
+ struct vm_area_struct *tmp = vma;
+ while (tmp && tmp->vm_start < end) {
+ if (tmp->vm_flags & VM_LOCKED) {
+ mm->locked_vm -= vma_pages(tmp);
+ munlock_vma_pages_all(tmp);
+ }
+
+ tmp = tmp->vm_next;
+ }
+ }
+
+ /* Detach vmas from rbtree */
+ if (!detach_vmas_to_be_unmapped(mm, vma, prev, end))
+ downgrade = false;
+
+ if (downgrade)
+ mmap_write_downgrade(mm);
+
+ unmap_region(mm, vma, prev, start, end);
+
+ /* Fix up all other VM information */
+ remove_vma_list(mm, vma);
+
+ return downgrade ? 1 : 0;
+}
+
+int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
+ struct list_head *uf)
+{
+ return __do_munmap(mm, start, len, uf, false);
+}
+
+static int __vm_munmap(unsigned long start, size_t len, bool downgrade)
+{
+ int ret;
+ struct mm_struct *mm = current->mm;
+ LIST_HEAD(uf);
+
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+
+ ret = __do_munmap(mm, start, len, &uf, downgrade);
+ /*
+ * Returning 1 indicates mmap_lock is downgraded.
+ * But 1 is not legal return value of vm_munmap() and munmap(), reset
+ * it to 0 before return.
+ */
+ if (ret == 1) {
+ mmap_read_unlock(mm);
+ ret = 0;
+ } else
+ mmap_write_unlock(mm);
+
+ userfaultfd_unmap_complete(mm, &uf);
+ return ret;
+}
+
+int vm_munmap(unsigned long start, size_t len)
+{
+ return __vm_munmap(start, len, false);
+}
+EXPORT_SYMBOL(vm_munmap);
+
+SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
+{
+ addr = untagged_addr(addr);
+ profile_munmap(addr);
+ return __vm_munmap(addr, len, true);
+}
+
+
+/*
+ * Emulation of deprecated remap_file_pages() syscall.
+ */
+SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
+ unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
+{
+
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long populate = 0;
+ unsigned long ret = -EINVAL;
+ struct file *file;
+
+ pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.rst.\n",
+ current->comm, current->pid);
+
+ if (prot)
+ return ret;
+ start = start & PAGE_MASK;
+ size = size & PAGE_MASK;
+
+ if (start + size <= start)
+ return ret;
+
+ /* Does pgoff wrap? */
+ if (pgoff + (size >> PAGE_SHIFT) < pgoff)
+ return ret;
+
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+
+ vma = find_vma(mm, start);
+
+ if (!vma || !(vma->vm_flags & VM_SHARED))
+ goto out;
+
+ if (start < vma->vm_start)
+ goto out;
+
+ if (start + size > vma->vm_end) {
+ struct vm_area_struct *next;
+
+ for (next = vma->vm_next; next; next = next->vm_next) {
+ /* hole between vmas ? */
+ if (next->vm_start != next->vm_prev->vm_end)
+ goto out;
+
+ if (next->vm_file != vma->vm_file)
+ goto out;
+
+ if (next->vm_flags != vma->vm_flags)
+ goto out;
+
+ if (start + size <= next->vm_end)
+ break;
+ }
+
+ if (!next)
+ goto out;
+ }
+
+ prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
+ prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
+ prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;
+
+ flags &= MAP_NONBLOCK;
+ flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
+ if (vma->vm_flags & VM_LOCKED) {
+ struct vm_area_struct *tmp;
+ flags |= MAP_LOCKED;
+
+ /* drop PG_Mlocked flag for over-mapped range */
+ for (tmp = vma; tmp->vm_start >= start + size;
+ tmp = tmp->vm_next) {
+ /*
+ * Split pmd and munlock page on the border
+ * of the range.
+ */
+ vma_adjust_trans_huge(tmp, start, start + size, 0);
+
+ munlock_vma_pages_range(tmp,
+ max(tmp->vm_start, start),
+ min(tmp->vm_end, start + size));
+ }
+ }
+
+ file = get_file(vma->vm_file);
+ ret = do_mmap(vma->vm_file, start, size,
+ prot, flags, pgoff, &populate, NULL);
+ fput(file);
+out:
+ mmap_write_unlock(mm);
+ if (populate)
+ mm_populate(ret, populate);
+ if (!IS_ERR_VALUE(ret))
+ ret = 0;
+ return ret;
+}
+
+/*
+ * this is really a simplified "do_mmap". it only handles
+ * anonymous maps. eventually we may be able to do some
+ * brk-specific accounting here.
+ */
+static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long flags, struct list_head *uf)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma, *prev;
+ struct rb_node **rb_link, *rb_parent;
+ pgoff_t pgoff = addr >> PAGE_SHIFT;
+ int error;
+ unsigned long mapped_addr;
+
+ /* Until we need other flags, refuse anything except VM_EXEC. */
+ if ((flags & (~VM_EXEC)) != 0)
+ return -EINVAL;
+ flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
+
+ mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
+ if (IS_ERR_VALUE(mapped_addr))
+ return mapped_addr;
+
+ error = mlock_future_check(mm, mm->def_flags, len);
+ if (error)
+ return error;
+
+ /* Clear old maps, set up prev, rb_link, rb_parent, and uf */
+ if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
+ return -ENOMEM;
+
+ /* Check against address space limits *after* clearing old maps... */
+ if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
+ return -ENOMEM;
+
+ if (mm->map_count > sysctl_max_map_count)
+ return -ENOMEM;
+
+ if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
+ return -ENOMEM;
+
+ /* Can we just expand an old private anonymous mapping? */
+ vma = vma_merge(mm, prev, addr, addr + len, flags,
+ NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
+ if (vma)
+ goto out;
+
+ /*
+ * create a vma struct for an anonymous mapping
+ */
+ vma = vm_area_alloc(mm);
+ if (!vma) {
+ vm_unacct_memory(len >> PAGE_SHIFT);
+ return -ENOMEM;
+ }
+
+ vma_set_anonymous(vma);
+ vma->vm_start = addr;
+ vma->vm_end = addr + len;
+ vma->vm_pgoff = pgoff;
+ vma->vm_flags = flags;
+ vma->vm_page_prot = vm_get_page_prot(flags);
+ vma_link(mm, vma, prev, rb_link, rb_parent);
+out:
+ perf_event_mmap(vma);
+ mm->total_vm += len >> PAGE_SHIFT;
+ mm->data_vm += len >> PAGE_SHIFT;
+ if (flags & VM_LOCKED)
+ mm->locked_vm += (len >> PAGE_SHIFT);
+ vma->vm_flags |= VM_SOFTDIRTY;
+ return 0;
+}
+
+int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long len;
+ int ret;
+ bool populate;
+ LIST_HEAD(uf);
+
+ len = PAGE_ALIGN(request);
+ if (len < request)
+ return -ENOMEM;
+ if (!len)
+ return 0;
+
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+
+ ret = do_brk_flags(addr, len, flags, &uf);
+ populate = ((mm->def_flags & VM_LOCKED) != 0);
+ mmap_write_unlock(mm);
+ userfaultfd_unmap_complete(mm, &uf);
+ if (populate && !ret)
+ mm_populate(addr, len);
+ return ret;
+}
+EXPORT_SYMBOL(vm_brk_flags);
+
+int vm_brk(unsigned long addr, unsigned long len)
+{
+ return vm_brk_flags(addr, len, 0);
+}
+EXPORT_SYMBOL(vm_brk);
+
+/* Release all mmaps. */
+void exit_mmap(struct mm_struct *mm)
+{
+ struct mmu_gather tlb;
+ struct vm_area_struct *vma;
+ unsigned long nr_accounted = 0;
+
+ /* mm's last user has gone, and its about to be pulled down */
+ mmu_notifier_release(mm);
+
+ if (unlikely(mm_is_oom_victim(mm))) {
+ /*
+ * Manually reap the mm to free as much memory as possible.
+ * Then, as the oom reaper does, set MMF_OOM_SKIP to disregard
+ * this mm from further consideration. Taking mm->mmap_lock for
+ * write after setting MMF_OOM_SKIP will guarantee that the oom
+ * reaper will not run on this mm again after mmap_lock is
+ * dropped.
+ *
+ * Nothing can be holding mm->mmap_lock here and the above call
+ * to mmu_notifier_release(mm) ensures mmu notifier callbacks in
+ * __oom_reap_task_mm() will not block.
+ *
+ * This needs to be done before calling munlock_vma_pages_all(),
+ * which clears VM_LOCKED, otherwise the oom reaper cannot
+ * reliably test it.
+ */
+ (void)__oom_reap_task_mm(mm);
+
+ set_bit(MMF_OOM_SKIP, &mm->flags);
+ mmap_write_lock(mm);
+ mmap_write_unlock(mm);
+ }
+
+ if (mm->locked_vm) {
+ vma = mm->mmap;
+ while (vma) {
+ if (vma->vm_flags & VM_LOCKED)
+ munlock_vma_pages_all(vma);
+ vma = vma->vm_next;
+ }
+ }
+
+ arch_exit_mmap(mm);
+
+ vma = mm->mmap;
+ if (!vma) /* Can happen if dup_mmap() received an OOM */
+ return;
+
+ lru_add_drain();
+ flush_cache_mm(mm);
+ tlb_gather_mmu(&tlb, mm, 0, -1);
+ /* update_hiwater_rss(mm) here? but nobody should be looking */
+ /* Use -1 here to ensure all VMAs in the mm are unmapped */
+ unmap_vmas(&tlb, vma, 0, -1);
+ free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
+ tlb_finish_mmu(&tlb, 0, -1);
+
+ /*
+ * Walk the list again, actually closing and freeing it,
+ * with preemption enabled, without holding any MM locks.
+ */
+ while (vma) {
+ if (vma->vm_flags & VM_ACCOUNT)
+ nr_accounted += vma_pages(vma);
+ vma = remove_vma(vma);
+ cond_resched();
+ }
+ vm_unacct_memory(nr_accounted);
+}
+
+/* Insert vm structure into process list sorted by address
+ * and into the inode's i_mmap tree. If vm_file is non-NULL
+ * then i_mmap_rwsem is taken here.
+ */
+int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+ struct vm_area_struct *prev;
+ struct rb_node **rb_link, *rb_parent;
+
+ if (find_vma_links(mm, vma->vm_start, vma->vm_end,
+ &prev, &rb_link, &rb_parent))
+ return -ENOMEM;
+ if ((vma->vm_flags & VM_ACCOUNT) &&
+ security_vm_enough_memory_mm(mm, vma_pages(vma)))
+ return -ENOMEM;
+
+ /*
+ * The vm_pgoff of a purely anonymous vma should be irrelevant
+ * until its first write fault, when page's anon_vma and index
+ * are set. But now set the vm_pgoff it will almost certainly
+ * end up with (unless mremap moves it elsewhere before that
+ * first wfault), so /proc/pid/maps tells a consistent story.
+ *
+ * By setting it to reflect the virtual start address of the
+ * vma, merges and splits can happen in a seamless way, just
+ * using the existing file pgoff checks and manipulations.
+ * Similarly in do_mmap and in do_brk_flags.
+ */
+ if (vma_is_anonymous(vma)) {
+ BUG_ON(vma->anon_vma);
+ vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
+ }
+
+ vma_link(mm, vma, prev, rb_link, rb_parent);
+ return 0;
+}
+
+/*
+ * Copy the vma structure to a new location in the same mm,
+ * prior to moving page table entries, to effect an mremap move.
+ */
+struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
+ unsigned long addr, unsigned long len, pgoff_t pgoff,
+ bool *need_rmap_locks)
+{
+ struct vm_area_struct *vma = *vmap;
+ unsigned long vma_start = vma->vm_start;
+ struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *new_vma, *prev;
+ struct rb_node **rb_link, *rb_parent;
+ bool faulted_in_anon_vma = true;
+
+ /*
+ * If anonymous vma has not yet been faulted, update new pgoff
+ * to match new location, to increase its chance of merging.
+ */
+ if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
+ pgoff = addr >> PAGE_SHIFT;
+ faulted_in_anon_vma = false;
+ }
+
+ if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
+ return NULL; /* should never get here */
+ new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
+ vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
+ vma->vm_userfaultfd_ctx);
+ if (new_vma) {
+ /*
+ * Source vma may have been merged into new_vma
+ */
+ if (unlikely(vma_start >= new_vma->vm_start &&
+ vma_start < new_vma->vm_end)) {
+ /*
+ * The only way we can get a vma_merge with
+ * self during an mremap is if the vma hasn't
+ * been faulted in yet and we were allowed to
+ * reset the dst vma->vm_pgoff to the
+ * destination address of the mremap to allow
+ * the merge to happen. mremap must change the
+ * vm_pgoff linearity between src and dst vmas
+ * (in turn preventing a vma_merge) to be
+ * safe. It is only safe to keep the vm_pgoff
+ * linear if there are no pages mapped yet.
+ */
+ VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
+ *vmap = vma = new_vma;
+ }
+ *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
+ } else {
+ new_vma = vm_area_dup(vma);
+ if (!new_vma)
+ goto out;
+ new_vma->vm_start = addr;
+ new_vma->vm_end = addr + len;
+ new_vma->vm_pgoff = pgoff;
+ if (vma_dup_policy(vma, new_vma))
+ goto out_free_vma;
+ if (anon_vma_clone(new_vma, vma))
+ goto out_free_mempol;
+ if (new_vma->vm_file)
+ get_file(new_vma->vm_file);
+ if (new_vma->vm_ops && new_vma->vm_ops->open)
+ new_vma->vm_ops->open(new_vma);
+ vma_link(mm, new_vma, prev, rb_link, rb_parent);
+ *need_rmap_locks = false;
+ }
+ return new_vma;
+
+out_free_mempol:
+ mpol_put(vma_policy(new_vma));
+out_free_vma:
+ vm_area_free(new_vma);
+out:
+ return NULL;
+}
+
+/*
+ * Return true if the calling process may expand its vm space by the passed
+ * number of pages
+ */
+bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
+{
+ if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT)
+ return false;
+
+ if (is_data_mapping(flags) &&
+ mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) {
+ /* Workaround for Valgrind */
+ if (rlimit(RLIMIT_DATA) == 0 &&
+ mm->data_vm + npages <= rlimit_max(RLIMIT_DATA) >> PAGE_SHIFT)
+ return true;
+
+ pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Update limits%s.\n",
+ current->comm, current->pid,
+ (mm->data_vm + npages) << PAGE_SHIFT,
+ rlimit(RLIMIT_DATA),
+ ignore_rlimit_data ? "" : " or use boot option ignore_rlimit_data");
+
+ if (!ignore_rlimit_data)
+ return false;
+ }
+
+ return true;
+}
+
+void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
+{
+ mm->total_vm += npages;
+
+ if (is_exec_mapping(flags))
+ mm->exec_vm += npages;
+ else if (is_stack_mapping(flags))
+ mm->stack_vm += npages;
+ else if (is_data_mapping(flags))
+ mm->data_vm += npages;
+}
+
+static vm_fault_t special_mapping_fault(struct vm_fault *vmf);
+
+/*
+ * Having a close hook prevents vma merging regardless of flags.
+ */
+static void special_mapping_close(struct vm_area_struct *vma)
+{
+}
+
+static const char *special_mapping_name(struct vm_area_struct *vma)
+{
+ return ((struct vm_special_mapping *)vma->vm_private_data)->name;
+}
+
+static int special_mapping_mremap(struct vm_area_struct *new_vma)
+{
+ struct vm_special_mapping *sm = new_vma->vm_private_data;
+
+ if (WARN_ON_ONCE(current->mm != new_vma->vm_mm))
+ return -EFAULT;
+
+ if (sm->mremap)
+ return sm->mremap(sm, new_vma);
+
+ return 0;
+}
+
+static const struct vm_operations_struct special_mapping_vmops = {
+ .close = special_mapping_close,
+ .fault = special_mapping_fault,
+ .mremap = special_mapping_mremap,
+ .name = special_mapping_name,
+ /* vDSO code relies that VVAR can't be accessed remotely */
+ .access = NULL,
+};
+
+static const struct vm_operations_struct legacy_special_mapping_vmops = {
+ .close = special_mapping_close,
+ .fault = special_mapping_fault,
+};
+
+static vm_fault_t special_mapping_fault(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ pgoff_t pgoff;
+ struct page **pages;
+
+ if (vma->vm_ops == &legacy_special_mapping_vmops) {
+ pages = vma->vm_private_data;
+ } else {
+ struct vm_special_mapping *sm = vma->vm_private_data;
+
+ if (sm->fault)
+ return sm->fault(sm, vmf->vma, vmf);
+
+ pages = sm->pages;
+ }
+
+ for (pgoff = vmf->pgoff; pgoff && *pages; ++pages)
+ pgoff--;
+
+ if (*pages) {
+ struct page *page = *pages;
+ get_page(page);
+ vmf->page = page;
+ return 0;
+ }
+
+ return VM_FAULT_SIGBUS;
+}
+
+static struct vm_area_struct *__install_special_mapping(
+ struct mm_struct *mm,
+ unsigned long addr, unsigned long len,
+ unsigned long vm_flags, void *priv,
+ const struct vm_operations_struct *ops)
+{
+ int ret;
+ struct vm_area_struct *vma;
+
+ vma = vm_area_alloc(mm);
+ if (unlikely(vma == NULL))
+ return ERR_PTR(-ENOMEM);
+
+ vma->vm_start = addr;
+ vma->vm_end = addr + len;
+
+ vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY;
+ vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
+
+ vma->vm_ops = ops;
+ vma->vm_private_data = priv;
+
+ ret = insert_vm_struct(mm, vma);
+ if (ret)
+ goto out;
+
+ vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT);
+
+ perf_event_mmap(vma);
+
+ return vma;
+
+out:
+ vm_area_free(vma);
+ return ERR_PTR(ret);
+}
+
+bool vma_is_special_mapping(const struct vm_area_struct *vma,
+ const struct vm_special_mapping *sm)
+{
+ return vma->vm_private_data == sm &&
+ (vma->vm_ops == &special_mapping_vmops ||
+ vma->vm_ops == &legacy_special_mapping_vmops);
+}
+
+/*
+ * Called with mm->mmap_lock held for writing.
+ * Insert a new vma covering the given region, with the given flags.
+ * Its pages are supplied by the given array of struct page *.
+ * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
+ * The region past the last page supplied will always produce SIGBUS.
+ * The array pointer and the pages it points to are assumed to stay alive
+ * for as long as this mapping might exist.
+ */
+struct vm_area_struct *_install_special_mapping(
+ struct mm_struct *mm,
+ unsigned long addr, unsigned long len,
+ unsigned long vm_flags, const struct vm_special_mapping *spec)
+{
+ return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec,
+ &special_mapping_vmops);
+}
+
+int install_special_mapping(struct mm_struct *mm,
+ unsigned long addr, unsigned long len,
+ unsigned long vm_flags, struct page **pages)
+{
+ struct vm_area_struct *vma = __install_special_mapping(
+ mm, addr, len, vm_flags, (void *)pages,
+ &legacy_special_mapping_vmops);
+
+ return PTR_ERR_OR_ZERO(vma);
+}
+
+static DEFINE_MUTEX(mm_all_locks_mutex);
+
+static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
+{
+ if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
+ /*
+ * The LSB of head.next can't change from under us
+ * because we hold the mm_all_locks_mutex.
+ */
+ down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock);
+ /*
+ * We can safely modify head.next after taking the
+ * anon_vma->root->rwsem. If some other vma in this mm shares
+ * the same anon_vma we won't take it again.
+ *
+ * No need of atomic instructions here, head.next
+ * can't change from under us thanks to the
+ * anon_vma->root->rwsem.
+ */
+ if (__test_and_set_bit(0, (unsigned long *)
+ &anon_vma->root->rb_root.rb_root.rb_node))
+ BUG();
+ }
+}
+
+static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
+{
+ if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
+ /*
+ * AS_MM_ALL_LOCKS can't change from under us because
+ * we hold the mm_all_locks_mutex.
+ *
+ * Operations on ->flags have to be atomic because
+ * even if AS_MM_ALL_LOCKS is stable thanks to the
+ * mm_all_locks_mutex, there may be other cpus
+ * changing other bitflags in parallel to us.
+ */
+ if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
+ BUG();
+ down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock);
+ }
+}
+
+/*
+ * This operation locks against the VM for all pte/vma/mm related
+ * operations that could ever happen on a certain mm. This includes
+ * vmtruncate, try_to_unmap, and all page faults.
+ *
+ * The caller must take the mmap_lock in write mode before calling
+ * mm_take_all_locks(). The caller isn't allowed to release the
+ * mmap_lock until mm_drop_all_locks() returns.
+ *
+ * mmap_lock in write mode is required in order to block all operations
+ * that could modify pagetables and free pages without need of
+ * altering the vma layout. It's also needed in write mode to avoid new
+ * anon_vmas to be associated with existing vmas.
+ *
+ * A single task can't take more than one mm_take_all_locks() in a row
+ * or it would deadlock.
+ *
+ * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
+ * mapping->flags avoid to take the same lock twice, if more than one
+ * vma in this mm is backed by the same anon_vma or address_space.
+ *
+ * We take locks in following order, accordingly to comment at beginning
+ * of mm/rmap.c:
+ * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
+ * hugetlb mapping);
+ * - all i_mmap_rwsem locks;
+ * - all anon_vma->rwseml
+ *
+ * We can take all locks within these types randomly because the VM code
+ * doesn't nest them and we protected from parallel mm_take_all_locks() by
+ * mm_all_locks_mutex.
+ *
+ * mm_take_all_locks() and mm_drop_all_locks are expensive operations
+ * that may have to take thousand of locks.
+ *
+ * mm_take_all_locks() can fail if it's interrupted by signals.
+ */
+int mm_take_all_locks(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+ struct anon_vma_chain *avc;
+
+ BUG_ON(mmap_read_trylock(mm));
+
+ mutex_lock(&mm_all_locks_mutex);
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (signal_pending(current))
+ goto out_unlock;
+ if (vma->vm_file && vma->vm_file->f_mapping &&
+ is_vm_hugetlb_page(vma))
+ vm_lock_mapping(mm, vma->vm_file->f_mapping);
+ }
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (signal_pending(current))
+ goto out_unlock;
+ if (vma->vm_file && vma->vm_file->f_mapping &&
+ !is_vm_hugetlb_page(vma))
+ vm_lock_mapping(mm, vma->vm_file->f_mapping);
+ }
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (signal_pending(current))
+ goto out_unlock;
+ if (vma->anon_vma)
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+ vm_lock_anon_vma(mm, avc->anon_vma);
+ }
+
+ return 0;
+
+out_unlock:
+ mm_drop_all_locks(mm);
+ return -EINTR;
+}
+
+static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
+{
+ if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
+ /*
+ * The LSB of head.next can't change to 0 from under
+ * us because we hold the mm_all_locks_mutex.
+ *
+ * We must however clear the bitflag before unlocking
+ * the vma so the users using the anon_vma->rb_root will
+ * never see our bitflag.
+ *
+ * No need of atomic instructions here, head.next
+ * can't change from under us until we release the
+ * anon_vma->root->rwsem.
+ */
+ if (!__test_and_clear_bit(0, (unsigned long *)
+ &anon_vma->root->rb_root.rb_root.rb_node))
+ BUG();
+ anon_vma_unlock_write(anon_vma);
+ }
+}
+
+static void vm_unlock_mapping(struct address_space *mapping)
+{
+ if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
+ /*
+ * AS_MM_ALL_LOCKS can't change to 0 from under us
+ * because we hold the mm_all_locks_mutex.
+ */
+ i_mmap_unlock_write(mapping);
+ if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
+ &mapping->flags))
+ BUG();
+ }
+}
+
+/*
+ * The mmap_lock cannot be released by the caller until
+ * mm_drop_all_locks() returns.
+ */
+void mm_drop_all_locks(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+ struct anon_vma_chain *avc;
+
+ BUG_ON(mmap_read_trylock(mm));
+ BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (vma->anon_vma)
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+ vm_unlock_anon_vma(avc->anon_vma);
+ if (vma->vm_file && vma->vm_file->f_mapping)
+ vm_unlock_mapping(vma->vm_file->f_mapping);
+ }
+
+ mutex_unlock(&mm_all_locks_mutex);
+}
+
+/*
+ * initialise the percpu counter for VM
+ */
+void __init mmap_init(void)
+{
+ int ret;
+
+ ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
+ VM_BUG_ON(ret);
+}
+
+/*
+ * Initialise sysctl_user_reserve_kbytes.
+ *
+ * This is intended to prevent a user from starting a single memory hogging
+ * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
+ * mode.
+ *
+ * The default value is min(3% of free memory, 128MB)
+ * 128MB is enough to recover with sshd/login, bash, and top/kill.
+ */
+static int init_user_reserve(void)
+{
+ unsigned long free_kbytes;
+
+ free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+
+ sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
+ return 0;
+}
+subsys_initcall(init_user_reserve);
+
+/*
+ * Initialise sysctl_admin_reserve_kbytes.
+ *
+ * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
+ * to log in and kill a memory hogging process.
+ *
+ * Systems with more than 256MB will reserve 8MB, enough to recover
+ * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
+ * only reserve 3% of free pages by default.
+ */
+static int init_admin_reserve(void)
+{
+ unsigned long free_kbytes;
+
+ free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+
+ sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
+ return 0;
+}
+subsys_initcall(init_admin_reserve);
+
+/*
+ * Reinititalise user and admin reserves if memory is added or removed.
+ *
+ * The default user reserve max is 128MB, and the default max for the
+ * admin reserve is 8MB. These are usually, but not always, enough to
+ * enable recovery from a memory hogging process using login/sshd, a shell,
+ * and tools like top. It may make sense to increase or even disable the
+ * reserve depending on the existence of swap or variations in the recovery
+ * tools. So, the admin may have changed them.
+ *
+ * If memory is added and the reserves have been eliminated or increased above
+ * the default max, then we'll trust the admin.
+ *
+ * If memory is removed and there isn't enough free memory, then we
+ * need to reset the reserves.
+ *
+ * Otherwise keep the reserve set by the admin.
+ */
+static int reserve_mem_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ unsigned long tmp, free_kbytes;
+
+ switch (action) {
+ case MEM_ONLINE:
+ /* Default max is 128MB. Leave alone if modified by operator. */
+ tmp = sysctl_user_reserve_kbytes;
+ if (0 < tmp && tmp < (1UL << 17))
+ init_user_reserve();
+
+ /* Default max is 8MB. Leave alone if modified by operator. */
+ tmp = sysctl_admin_reserve_kbytes;
+ if (0 < tmp && tmp < (1UL << 13))
+ init_admin_reserve();
+
+ break;
+ case MEM_OFFLINE:
+ free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+
+ if (sysctl_user_reserve_kbytes > free_kbytes) {
+ init_user_reserve();
+ pr_info("vm.user_reserve_kbytes reset to %lu\n",
+ sysctl_user_reserve_kbytes);
+ }
+
+ if (sysctl_admin_reserve_kbytes > free_kbytes) {
+ init_admin_reserve();
+ pr_info("vm.admin_reserve_kbytes reset to %lu\n",
+ sysctl_admin_reserve_kbytes);
+ }
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block reserve_mem_nb = {
+ .notifier_call = reserve_mem_notifier,
+};
+
+static int __meminit init_reserve_notifier(void)
+{
+ if (register_hotmemory_notifier(&reserve_mem_nb))
+ pr_err("Failed registering memory add/remove notifier for admin reserve\n");
+
+ return 0;
+}
+subsys_initcall(init_reserve_notifier);
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
new file mode 100644
index 000000000..205fdbb57
--- /dev/null
+++ b/mm/mmu_gather.c
@@ -0,0 +1,332 @@
+#include <linux/gfp.h>
+#include <linux/highmem.h>
+#include <linux/kernel.h>
+#include <linux/mmdebug.h>
+#include <linux/mm_types.h>
+#include <linux/pagemap.h>
+#include <linux/rcupdate.h>
+#include <linux/smp.h>
+#include <linux/swap.h>
+
+#include <asm/pgalloc.h>
+#include <asm/tlb.h>
+
+#ifndef CONFIG_MMU_GATHER_NO_GATHER
+
+static bool tlb_next_batch(struct mmu_gather *tlb)
+{
+ struct mmu_gather_batch *batch;
+
+ batch = tlb->active;
+ if (batch->next) {
+ tlb->active = batch->next;
+ return true;
+ }
+
+ if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
+ return false;
+
+ batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
+ if (!batch)
+ return false;
+
+ tlb->batch_count++;
+ batch->next = NULL;
+ batch->nr = 0;
+ batch->max = MAX_GATHER_BATCH;
+
+ tlb->active->next = batch;
+ tlb->active = batch;
+
+ return true;
+}
+
+static void tlb_batch_pages_flush(struct mmu_gather *tlb)
+{
+ struct mmu_gather_batch *batch;
+
+ for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
+ free_pages_and_swap_cache(batch->pages, batch->nr);
+ batch->nr = 0;
+ }
+ tlb->active = &tlb->local;
+}
+
+static void tlb_batch_list_free(struct mmu_gather *tlb)
+{
+ struct mmu_gather_batch *batch, *next;
+
+ for (batch = tlb->local.next; batch; batch = next) {
+ next = batch->next;
+ free_pages((unsigned long)batch, 0);
+ }
+ tlb->local.next = NULL;
+}
+
+bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
+{
+ struct mmu_gather_batch *batch;
+
+ VM_BUG_ON(!tlb->end);
+
+#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
+ VM_WARN_ON(tlb->page_size != page_size);
+#endif
+
+ batch = tlb->active;
+ /*
+ * Add the page and check if we are full. If so
+ * force a flush.
+ */
+ batch->pages[batch->nr++] = page;
+ if (batch->nr == batch->max) {
+ if (!tlb_next_batch(tlb))
+ return true;
+ batch = tlb->active;
+ }
+ VM_BUG_ON_PAGE(batch->nr > batch->max, page);
+
+ return false;
+}
+
+#endif /* MMU_GATHER_NO_GATHER */
+
+#ifdef CONFIG_MMU_GATHER_TABLE_FREE
+
+static void __tlb_remove_table_free(struct mmu_table_batch *batch)
+{
+ int i;
+
+ for (i = 0; i < batch->nr; i++)
+ __tlb_remove_table(batch->tables[i]);
+
+ free_page((unsigned long)batch);
+}
+
+#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE
+
+/*
+ * Semi RCU freeing of the page directories.
+ *
+ * This is needed by some architectures to implement software pagetable walkers.
+ *
+ * gup_fast() and other software pagetable walkers do a lockless page-table
+ * walk and therefore needs some synchronization with the freeing of the page
+ * directories. The chosen means to accomplish that is by disabling IRQs over
+ * the walk.
+ *
+ * Architectures that use IPIs to flush TLBs will then automagically DTRT,
+ * since we unlink the page, flush TLBs, free the page. Since the disabling of
+ * IRQs delays the completion of the TLB flush we can never observe an already
+ * freed page.
+ *
+ * Architectures that do not have this (PPC) need to delay the freeing by some
+ * other means, this is that means.
+ *
+ * What we do is batch the freed directory pages (tables) and RCU free them.
+ * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling
+ * holds off grace periods.
+ *
+ * However, in order to batch these pages we need to allocate storage, this
+ * allocation is deep inside the MM code and can thus easily fail on memory
+ * pressure. To guarantee progress we fall back to single table freeing, see
+ * the implementation of tlb_remove_table_one().
+ *
+ */
+
+static void tlb_remove_table_smp_sync(void *arg)
+{
+ /* Simply deliver the interrupt */
+}
+
+void tlb_remove_table_sync_one(void)
+{
+ /*
+ * This isn't an RCU grace period and hence the page-tables cannot be
+ * assumed to be actually RCU-freed.
+ *
+ * It is however sufficient for software page-table walkers that rely on
+ * IRQ disabling.
+ */
+ smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
+}
+
+static void tlb_remove_table_rcu(struct rcu_head *head)
+{
+ __tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu));
+}
+
+static void tlb_remove_table_free(struct mmu_table_batch *batch)
+{
+ call_rcu(&batch->rcu, tlb_remove_table_rcu);
+}
+
+#else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */
+
+static void tlb_remove_table_free(struct mmu_table_batch *batch)
+{
+ __tlb_remove_table_free(batch);
+}
+
+#endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */
+
+/*
+ * If we want tlb_remove_table() to imply TLB invalidates.
+ */
+static inline void tlb_table_invalidate(struct mmu_gather *tlb)
+{
+ if (tlb_needs_table_invalidate()) {
+ /*
+ * Invalidate page-table caches used by hardware walkers. Then
+ * we still need to RCU-sched wait while freeing the pages
+ * because software walkers can still be in-flight.
+ */
+ tlb_flush_mmu_tlbonly(tlb);
+ }
+}
+
+static void tlb_remove_table_one(void *table)
+{
+ tlb_remove_table_sync_one();
+ __tlb_remove_table(table);
+}
+
+static void tlb_table_flush(struct mmu_gather *tlb)
+{
+ struct mmu_table_batch **batch = &tlb->batch;
+
+ if (*batch) {
+ tlb_table_invalidate(tlb);
+ tlb_remove_table_free(*batch);
+ *batch = NULL;
+ }
+}
+
+void tlb_remove_table(struct mmu_gather *tlb, void *table)
+{
+ struct mmu_table_batch **batch = &tlb->batch;
+
+ if (*batch == NULL) {
+ *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
+ if (*batch == NULL) {
+ tlb_table_invalidate(tlb);
+ tlb_remove_table_one(table);
+ return;
+ }
+ (*batch)->nr = 0;
+ }
+
+ (*batch)->tables[(*batch)->nr++] = table;
+ if ((*batch)->nr == MAX_TABLE_BATCH)
+ tlb_table_flush(tlb);
+}
+
+static inline void tlb_table_init(struct mmu_gather *tlb)
+{
+ tlb->batch = NULL;
+}
+
+#else /* !CONFIG_MMU_GATHER_TABLE_FREE */
+
+static inline void tlb_table_flush(struct mmu_gather *tlb) { }
+static inline void tlb_table_init(struct mmu_gather *tlb) { }
+
+#endif /* CONFIG_MMU_GATHER_TABLE_FREE */
+
+static void tlb_flush_mmu_free(struct mmu_gather *tlb)
+{
+ tlb_table_flush(tlb);
+#ifndef CONFIG_MMU_GATHER_NO_GATHER
+ tlb_batch_pages_flush(tlb);
+#endif
+}
+
+void tlb_flush_mmu(struct mmu_gather *tlb)
+{
+ tlb_flush_mmu_tlbonly(tlb);
+ tlb_flush_mmu_free(tlb);
+}
+
+/**
+ * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
+ * @tlb: the mmu_gather structure to initialize
+ * @mm: the mm_struct of the target address space
+ * @start: start of the region that will be removed from the page-table
+ * @end: end of the region that will be removed from the page-table
+ *
+ * Called to initialize an (on-stack) mmu_gather structure for page-table
+ * tear-down from @mm. The @start and @end are set to 0 and -1
+ * respectively when @mm is without users and we're going to destroy
+ * the full address space (exit/execve).
+ */
+void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ tlb->mm = mm;
+
+ /* Is it from 0 to ~0? */
+ tlb->fullmm = !(start | (end+1));
+
+#ifndef CONFIG_MMU_GATHER_NO_GATHER
+ tlb->need_flush_all = 0;
+ tlb->local.next = NULL;
+ tlb->local.nr = 0;
+ tlb->local.max = ARRAY_SIZE(tlb->__pages);
+ tlb->active = &tlb->local;
+ tlb->batch_count = 0;
+#endif
+
+ tlb_table_init(tlb);
+#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
+ tlb->page_size = 0;
+#endif
+
+ __tlb_reset_range(tlb);
+ inc_tlb_flush_pending(tlb->mm);
+}
+
+/**
+ * tlb_finish_mmu - finish an mmu_gather structure
+ * @tlb: the mmu_gather structure to finish
+ * @start: start of the region that will be removed from the page-table
+ * @end: end of the region that will be removed from the page-table
+ *
+ * Called at the end of the shootdown operation to free up any resources that
+ * were required.
+ */
+void tlb_finish_mmu(struct mmu_gather *tlb,
+ unsigned long start, unsigned long end)
+{
+ /*
+ * If there are parallel threads are doing PTE changes on same range
+ * under non-exclusive lock (e.g., mmap_lock read-side) but defer TLB
+ * flush by batching, one thread may end up seeing inconsistent PTEs
+ * and result in having stale TLB entries. So flush TLB forcefully
+ * if we detect parallel PTE batching threads.
+ *
+ * However, some syscalls, e.g. munmap(), may free page tables, this
+ * needs force flush everything in the given range. Otherwise this
+ * may result in having stale TLB entries for some architectures,
+ * e.g. aarch64, that could specify flush what level TLB.
+ */
+ if (mm_tlb_flush_nested(tlb->mm)) {
+ /*
+ * The aarch64 yields better performance with fullmm by
+ * avoiding multiple CPUs spamming TLBI messages at the
+ * same time.
+ *
+ * On x86 non-fullmm doesn't yield significant difference
+ * against fullmm.
+ */
+ tlb->fullmm = 1;
+ __tlb_reset_range(tlb);
+ tlb->freed_tables = 1;
+ }
+
+ tlb_flush_mmu(tlb);
+
+#ifndef CONFIG_MMU_GATHER_NO_GATHER
+ tlb_batch_list_free(tlb);
+#endif
+ dec_tlb_flush_pending(tlb->mm);
+}
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
new file mode 100644
index 000000000..9165ca619
--- /dev/null
+++ b/mm/mmu_notifier.c
@@ -0,0 +1,1139 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/mm/mmu_notifier.c
+ *
+ * Copyright (C) 2008 Qumranet, Inc.
+ * Copyright (C) 2008 SGI
+ * Christoph Lameter <cl@linux.com>
+ */
+
+#include <linux/rculist.h>
+#include <linux/mmu_notifier.h>
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/err.h>
+#include <linux/interval_tree.h>
+#include <linux/srcu.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/slab.h>
+
+/* global SRCU for all MMs */
+DEFINE_STATIC_SRCU(srcu);
+
+#ifdef CONFIG_LOCKDEP
+struct lockdep_map __mmu_notifier_invalidate_range_start_map = {
+ .name = "mmu_notifier_invalidate_range_start"
+};
+#endif
+
+/*
+ * The mmu_notifier_subscriptions structure is allocated and installed in
+ * mm->notifier_subscriptions inside the mm_take_all_locks() protected
+ * critical section and it's released only when mm_count reaches zero
+ * in mmdrop().
+ */
+struct mmu_notifier_subscriptions {
+ /* all mmu notifiers registered in this mm are queued in this list */
+ struct hlist_head list;
+ bool has_itree;
+ /* to serialize the list modifications and hlist_unhashed */
+ spinlock_t lock;
+ unsigned long invalidate_seq;
+ unsigned long active_invalidate_ranges;
+ struct rb_root_cached itree;
+ wait_queue_head_t wq;
+ struct hlist_head deferred_list;
+};
+
+/*
+ * This is a collision-retry read-side/write-side 'lock', a lot like a
+ * seqcount, however this allows multiple write-sides to hold it at
+ * once. Conceptually the write side is protecting the values of the PTEs in
+ * this mm, such that PTES cannot be read into SPTEs (shadow PTEs) while any
+ * writer exists.
+ *
+ * Note that the core mm creates nested invalidate_range_start()/end() regions
+ * within the same thread, and runs invalidate_range_start()/end() in parallel
+ * on multiple CPUs. This is designed to not reduce concurrency or block
+ * progress on the mm side.
+ *
+ * As a secondary function, holding the full write side also serves to prevent
+ * writers for the itree, this is an optimization to avoid extra locking
+ * during invalidate_range_start/end notifiers.
+ *
+ * The write side has two states, fully excluded:
+ * - mm->active_invalidate_ranges != 0
+ * - subscriptions->invalidate_seq & 1 == True (odd)
+ * - some range on the mm_struct is being invalidated
+ * - the itree is not allowed to change
+ *
+ * And partially excluded:
+ * - mm->active_invalidate_ranges != 0
+ * - subscriptions->invalidate_seq & 1 == False (even)
+ * - some range on the mm_struct is being invalidated
+ * - the itree is allowed to change
+ *
+ * Operations on notifier_subscriptions->invalidate_seq (under spinlock):
+ * seq |= 1 # Begin writing
+ * seq++ # Release the writing state
+ * seq & 1 # True if a writer exists
+ *
+ * The later state avoids some expensive work on inv_end in the common case of
+ * no mmu_interval_notifier monitoring the VA.
+ */
+static bool
+mn_itree_is_invalidating(struct mmu_notifier_subscriptions *subscriptions)
+{
+ lockdep_assert_held(&subscriptions->lock);
+ return subscriptions->invalidate_seq & 1;
+}
+
+static struct mmu_interval_notifier *
+mn_itree_inv_start_range(struct mmu_notifier_subscriptions *subscriptions,
+ const struct mmu_notifier_range *range,
+ unsigned long *seq)
+{
+ struct interval_tree_node *node;
+ struct mmu_interval_notifier *res = NULL;
+
+ spin_lock(&subscriptions->lock);
+ subscriptions->active_invalidate_ranges++;
+ node = interval_tree_iter_first(&subscriptions->itree, range->start,
+ range->end - 1);
+ if (node) {
+ subscriptions->invalidate_seq |= 1;
+ res = container_of(node, struct mmu_interval_notifier,
+ interval_tree);
+ }
+
+ *seq = subscriptions->invalidate_seq;
+ spin_unlock(&subscriptions->lock);
+ return res;
+}
+
+static struct mmu_interval_notifier *
+mn_itree_inv_next(struct mmu_interval_notifier *interval_sub,
+ const struct mmu_notifier_range *range)
+{
+ struct interval_tree_node *node;
+
+ node = interval_tree_iter_next(&interval_sub->interval_tree,
+ range->start, range->end - 1);
+ if (!node)
+ return NULL;
+ return container_of(node, struct mmu_interval_notifier, interval_tree);
+}
+
+static void mn_itree_inv_end(struct mmu_notifier_subscriptions *subscriptions)
+{
+ struct mmu_interval_notifier *interval_sub;
+ struct hlist_node *next;
+
+ spin_lock(&subscriptions->lock);
+ if (--subscriptions->active_invalidate_ranges ||
+ !mn_itree_is_invalidating(subscriptions)) {
+ spin_unlock(&subscriptions->lock);
+ return;
+ }
+
+ /* Make invalidate_seq even */
+ subscriptions->invalidate_seq++;
+
+ /*
+ * The inv_end incorporates a deferred mechanism like rtnl_unlock().
+ * Adds and removes are queued until the final inv_end happens then
+ * they are progressed. This arrangement for tree updates is used to
+ * avoid using a blocking lock during invalidate_range_start.
+ */
+ hlist_for_each_entry_safe(interval_sub, next,
+ &subscriptions->deferred_list,
+ deferred_item) {
+ if (RB_EMPTY_NODE(&interval_sub->interval_tree.rb))
+ interval_tree_insert(&interval_sub->interval_tree,
+ &subscriptions->itree);
+ else
+ interval_tree_remove(&interval_sub->interval_tree,
+ &subscriptions->itree);
+ hlist_del(&interval_sub->deferred_item);
+ }
+ spin_unlock(&subscriptions->lock);
+
+ wake_up_all(&subscriptions->wq);
+}
+
+/**
+ * mmu_interval_read_begin - Begin a read side critical section against a VA
+ * range
+ * @interval_sub: The interval subscription
+ *
+ * mmu_iterval_read_begin()/mmu_iterval_read_retry() implement a
+ * collision-retry scheme similar to seqcount for the VA range under
+ * subscription. If the mm invokes invalidation during the critical section
+ * then mmu_interval_read_retry() will return true.
+ *
+ * This is useful to obtain shadow PTEs where teardown or setup of the SPTEs
+ * require a blocking context. The critical region formed by this can sleep,
+ * and the required 'user_lock' can also be a sleeping lock.
+ *
+ * The caller is required to provide a 'user_lock' to serialize both teardown
+ * and setup.
+ *
+ * The return value should be passed to mmu_interval_read_retry().
+ */
+unsigned long
+mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub)
+{
+ struct mmu_notifier_subscriptions *subscriptions =
+ interval_sub->mm->notifier_subscriptions;
+ unsigned long seq;
+ bool is_invalidating;
+
+ /*
+ * If the subscription has a different seq value under the user_lock
+ * than we started with then it has collided.
+ *
+ * If the subscription currently has the same seq value as the
+ * subscriptions seq, then it is currently between
+ * invalidate_start/end and is colliding.
+ *
+ * The locking looks broadly like this:
+ * mn_tree_invalidate_start(): mmu_interval_read_begin():
+ * spin_lock
+ * seq = READ_ONCE(interval_sub->invalidate_seq);
+ * seq == subs->invalidate_seq
+ * spin_unlock
+ * spin_lock
+ * seq = ++subscriptions->invalidate_seq
+ * spin_unlock
+ * op->invalidate_range():
+ * user_lock
+ * mmu_interval_set_seq()
+ * interval_sub->invalidate_seq = seq
+ * user_unlock
+ *
+ * [Required: mmu_interval_read_retry() == true]
+ *
+ * mn_itree_inv_end():
+ * spin_lock
+ * seq = ++subscriptions->invalidate_seq
+ * spin_unlock
+ *
+ * user_lock
+ * mmu_interval_read_retry():
+ * interval_sub->invalidate_seq != seq
+ * user_unlock
+ *
+ * Barriers are not needed here as any races here are closed by an
+ * eventual mmu_interval_read_retry(), which provides a barrier via the
+ * user_lock.
+ */
+ spin_lock(&subscriptions->lock);
+ /* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */
+ seq = READ_ONCE(interval_sub->invalidate_seq);
+ is_invalidating = seq == subscriptions->invalidate_seq;
+ spin_unlock(&subscriptions->lock);
+
+ /*
+ * interval_sub->invalidate_seq must always be set to an odd value via
+ * mmu_interval_set_seq() using the provided cur_seq from
+ * mn_itree_inv_start_range(). This ensures that if seq does wrap we
+ * will always clear the below sleep in some reasonable time as
+ * subscriptions->invalidate_seq is even in the idle state.
+ */
+ lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
+ lock_map_release(&__mmu_notifier_invalidate_range_start_map);
+ if (is_invalidating)
+ wait_event(subscriptions->wq,
+ READ_ONCE(subscriptions->invalidate_seq) != seq);
+
+ /*
+ * Notice that mmu_interval_read_retry() can already be true at this
+ * point, avoiding loops here allows the caller to provide a global
+ * time bound.
+ */
+
+ return seq;
+}
+EXPORT_SYMBOL_GPL(mmu_interval_read_begin);
+
+static void mn_itree_release(struct mmu_notifier_subscriptions *subscriptions,
+ struct mm_struct *mm)
+{
+ struct mmu_notifier_range range = {
+ .flags = MMU_NOTIFIER_RANGE_BLOCKABLE,
+ .event = MMU_NOTIFY_RELEASE,
+ .mm = mm,
+ .start = 0,
+ .end = ULONG_MAX,
+ };
+ struct mmu_interval_notifier *interval_sub;
+ unsigned long cur_seq;
+ bool ret;
+
+ for (interval_sub =
+ mn_itree_inv_start_range(subscriptions, &range, &cur_seq);
+ interval_sub;
+ interval_sub = mn_itree_inv_next(interval_sub, &range)) {
+ ret = interval_sub->ops->invalidate(interval_sub, &range,
+ cur_seq);
+ WARN_ON(!ret);
+ }
+
+ mn_itree_inv_end(subscriptions);
+}
+
+/*
+ * This function can't run concurrently against mmu_notifier_register
+ * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
+ * runs with mm_users == 0. Other tasks may still invoke mmu notifiers
+ * in parallel despite there being no task using this mm any more,
+ * through the vmas outside of the exit_mmap context, such as with
+ * vmtruncate. This serializes against mmu_notifier_unregister with
+ * the notifier_subscriptions->lock in addition to SRCU and it serializes
+ * against the other mmu notifiers with SRCU. struct mmu_notifier_subscriptions
+ * can't go away from under us as exit_mmap holds an mm_count pin
+ * itself.
+ */
+static void mn_hlist_release(struct mmu_notifier_subscriptions *subscriptions,
+ struct mm_struct *mm)
+{
+ struct mmu_notifier *subscription;
+ int id;
+
+ /*
+ * SRCU here will block mmu_notifier_unregister until
+ * ->release returns.
+ */
+ id = srcu_read_lock(&srcu);
+ hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist,
+ srcu_read_lock_held(&srcu))
+ /*
+ * If ->release runs before mmu_notifier_unregister it must be
+ * handled, as it's the only way for the driver to flush all
+ * existing sptes and stop the driver from establishing any more
+ * sptes before all the pages in the mm are freed.
+ */
+ if (subscription->ops->release)
+ subscription->ops->release(subscription, mm);
+
+ spin_lock(&subscriptions->lock);
+ while (unlikely(!hlist_empty(&subscriptions->list))) {
+ subscription = hlist_entry(subscriptions->list.first,
+ struct mmu_notifier, hlist);
+ /*
+ * We arrived before mmu_notifier_unregister so
+ * mmu_notifier_unregister will do nothing other than to wait
+ * for ->release to finish and for mmu_notifier_unregister to
+ * return.
+ */
+ hlist_del_init_rcu(&subscription->hlist);
+ }
+ spin_unlock(&subscriptions->lock);
+ srcu_read_unlock(&srcu, id);
+
+ /*
+ * synchronize_srcu here prevents mmu_notifier_release from returning to
+ * exit_mmap (which would proceed with freeing all pages in the mm)
+ * until the ->release method returns, if it was invoked by
+ * mmu_notifier_unregister.
+ *
+ * The notifier_subscriptions can't go away from under us because
+ * one mm_count is held by exit_mmap.
+ */
+ synchronize_srcu(&srcu);
+}
+
+void __mmu_notifier_release(struct mm_struct *mm)
+{
+ struct mmu_notifier_subscriptions *subscriptions =
+ mm->notifier_subscriptions;
+
+ if (subscriptions->has_itree)
+ mn_itree_release(subscriptions, mm);
+
+ if (!hlist_empty(&subscriptions->list))
+ mn_hlist_release(subscriptions, mm);
+}
+
+/*
+ * If no young bitflag is supported by the hardware, ->clear_flush_young can
+ * unmap the address and return 1 or 0 depending if the mapping previously
+ * existed or not.
+ */
+int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ struct mmu_notifier *subscription;
+ int young = 0, id;
+
+ id = srcu_read_lock(&srcu);
+ hlist_for_each_entry_rcu(subscription,
+ &mm->notifier_subscriptions->list, hlist,
+ srcu_read_lock_held(&srcu)) {
+ if (subscription->ops->clear_flush_young)
+ young |= subscription->ops->clear_flush_young(
+ subscription, mm, start, end);
+ }
+ srcu_read_unlock(&srcu, id);
+
+ return young;
+}
+
+int __mmu_notifier_clear_young(struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ struct mmu_notifier *subscription;
+ int young = 0, id;
+
+ id = srcu_read_lock(&srcu);
+ hlist_for_each_entry_rcu(subscription,
+ &mm->notifier_subscriptions->list, hlist,
+ srcu_read_lock_held(&srcu)) {
+ if (subscription->ops->clear_young)
+ young |= subscription->ops->clear_young(subscription,
+ mm, start, end);
+ }
+ srcu_read_unlock(&srcu, id);
+
+ return young;
+}
+
+int __mmu_notifier_test_young(struct mm_struct *mm,
+ unsigned long address)
+{
+ struct mmu_notifier *subscription;
+ int young = 0, id;
+
+ id = srcu_read_lock(&srcu);
+ hlist_for_each_entry_rcu(subscription,
+ &mm->notifier_subscriptions->list, hlist,
+ srcu_read_lock_held(&srcu)) {
+ if (subscription->ops->test_young) {
+ young = subscription->ops->test_young(subscription, mm,
+ address);
+ if (young)
+ break;
+ }
+ }
+ srcu_read_unlock(&srcu, id);
+
+ return young;
+}
+
+void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
+ pte_t pte)
+{
+ struct mmu_notifier *subscription;
+ int id;
+
+ id = srcu_read_lock(&srcu);
+ hlist_for_each_entry_rcu(subscription,
+ &mm->notifier_subscriptions->list, hlist,
+ srcu_read_lock_held(&srcu)) {
+ if (subscription->ops->change_pte)
+ subscription->ops->change_pte(subscription, mm, address,
+ pte);
+ }
+ srcu_read_unlock(&srcu, id);
+}
+
+static int mn_itree_invalidate(struct mmu_notifier_subscriptions *subscriptions,
+ const struct mmu_notifier_range *range)
+{
+ struct mmu_interval_notifier *interval_sub;
+ unsigned long cur_seq;
+
+ for (interval_sub =
+ mn_itree_inv_start_range(subscriptions, range, &cur_seq);
+ interval_sub;
+ interval_sub = mn_itree_inv_next(interval_sub, range)) {
+ bool ret;
+
+ ret = interval_sub->ops->invalidate(interval_sub, range,
+ cur_seq);
+ if (!ret) {
+ if (WARN_ON(mmu_notifier_range_blockable(range)))
+ continue;
+ goto out_would_block;
+ }
+ }
+ return 0;
+
+out_would_block:
+ /*
+ * On -EAGAIN the non-blocking caller is not allowed to call
+ * invalidate_range_end()
+ */
+ mn_itree_inv_end(subscriptions);
+ return -EAGAIN;
+}
+
+static int mn_hlist_invalidate_range_start(
+ struct mmu_notifier_subscriptions *subscriptions,
+ struct mmu_notifier_range *range)
+{
+ struct mmu_notifier *subscription;
+ int ret = 0;
+ int id;
+
+ id = srcu_read_lock(&srcu);
+ hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist,
+ srcu_read_lock_held(&srcu)) {
+ const struct mmu_notifier_ops *ops = subscription->ops;
+
+ if (ops->invalidate_range_start) {
+ int _ret;
+
+ if (!mmu_notifier_range_blockable(range))
+ non_block_start();
+ _ret = ops->invalidate_range_start(subscription, range);
+ if (!mmu_notifier_range_blockable(range))
+ non_block_end();
+ if (_ret) {
+ pr_info("%pS callback failed with %d in %sblockable context.\n",
+ ops->invalidate_range_start, _ret,
+ !mmu_notifier_range_blockable(range) ?
+ "non-" :
+ "");
+ WARN_ON(mmu_notifier_range_blockable(range) ||
+ _ret != -EAGAIN);
+ /*
+ * We call all the notifiers on any EAGAIN,
+ * there is no way for a notifier to know if
+ * its start method failed, thus a start that
+ * does EAGAIN can't also do end.
+ */
+ WARN_ON(ops->invalidate_range_end);
+ ret = _ret;
+ }
+ }
+ }
+
+ if (ret) {
+ /*
+ * Must be non-blocking to get here. If there are multiple
+ * notifiers and one or more failed start, any that succeeded
+ * start are expecting their end to be called. Do so now.
+ */
+ hlist_for_each_entry_rcu(subscription, &subscriptions->list,
+ hlist, srcu_read_lock_held(&srcu)) {
+ if (!subscription->ops->invalidate_range_end)
+ continue;
+
+ subscription->ops->invalidate_range_end(subscription,
+ range);
+ }
+ }
+ srcu_read_unlock(&srcu, id);
+
+ return ret;
+}
+
+int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
+{
+ struct mmu_notifier_subscriptions *subscriptions =
+ range->mm->notifier_subscriptions;
+ int ret;
+
+ if (subscriptions->has_itree) {
+ ret = mn_itree_invalidate(subscriptions, range);
+ if (ret)
+ return ret;
+ }
+ if (!hlist_empty(&subscriptions->list))
+ return mn_hlist_invalidate_range_start(subscriptions, range);
+ return 0;
+}
+
+static void
+mn_hlist_invalidate_end(struct mmu_notifier_subscriptions *subscriptions,
+ struct mmu_notifier_range *range, bool only_end)
+{
+ struct mmu_notifier *subscription;
+ int id;
+
+ id = srcu_read_lock(&srcu);
+ hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist,
+ srcu_read_lock_held(&srcu)) {
+ /*
+ * Call invalidate_range here too to avoid the need for the
+ * subsystem of having to register an invalidate_range_end
+ * call-back when there is invalidate_range already. Usually a
+ * subsystem registers either invalidate_range_start()/end() or
+ * invalidate_range(), so this will be no additional overhead
+ * (besides the pointer check).
+ *
+ * We skip call to invalidate_range() if we know it is safe ie
+ * call site use mmu_notifier_invalidate_range_only_end() which
+ * is safe to do when we know that a call to invalidate_range()
+ * already happen under page table lock.
+ */
+ if (!only_end && subscription->ops->invalidate_range)
+ subscription->ops->invalidate_range(subscription,
+ range->mm,
+ range->start,
+ range->end);
+ if (subscription->ops->invalidate_range_end) {
+ if (!mmu_notifier_range_blockable(range))
+ non_block_start();
+ subscription->ops->invalidate_range_end(subscription,
+ range);
+ if (!mmu_notifier_range_blockable(range))
+ non_block_end();
+ }
+ }
+ srcu_read_unlock(&srcu, id);
+}
+
+void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range,
+ bool only_end)
+{
+ struct mmu_notifier_subscriptions *subscriptions =
+ range->mm->notifier_subscriptions;
+
+ lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
+ if (subscriptions->has_itree)
+ mn_itree_inv_end(subscriptions);
+
+ if (!hlist_empty(&subscriptions->list))
+ mn_hlist_invalidate_end(subscriptions, range, only_end);
+ lock_map_release(&__mmu_notifier_invalidate_range_start_map);
+}
+
+void __mmu_notifier_invalidate_range(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ struct mmu_notifier *subscription;
+ int id;
+
+ id = srcu_read_lock(&srcu);
+ hlist_for_each_entry_rcu(subscription,
+ &mm->notifier_subscriptions->list, hlist,
+ srcu_read_lock_held(&srcu)) {
+ if (subscription->ops->invalidate_range)
+ subscription->ops->invalidate_range(subscription, mm,
+ start, end);
+ }
+ srcu_read_unlock(&srcu, id);
+}
+
+/*
+ * Same as mmu_notifier_register but here the caller must hold the mmap_lock in
+ * write mode. A NULL mn signals the notifier is being registered for itree
+ * mode.
+ */
+int __mmu_notifier_register(struct mmu_notifier *subscription,
+ struct mm_struct *mm)
+{
+ struct mmu_notifier_subscriptions *subscriptions = NULL;
+ int ret;
+
+ mmap_assert_write_locked(mm);
+ BUG_ON(atomic_read(&mm->mm_users) <= 0);
+
+ if (IS_ENABLED(CONFIG_LOCKDEP)) {
+ fs_reclaim_acquire(GFP_KERNEL);
+ lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
+ lock_map_release(&__mmu_notifier_invalidate_range_start_map);
+ fs_reclaim_release(GFP_KERNEL);
+ }
+
+ if (!mm->notifier_subscriptions) {
+ /*
+ * kmalloc cannot be called under mm_take_all_locks(), but we
+ * know that mm->notifier_subscriptions can't change while we
+ * hold the write side of the mmap_lock.
+ */
+ subscriptions = kzalloc(
+ sizeof(struct mmu_notifier_subscriptions), GFP_KERNEL);
+ if (!subscriptions)
+ return -ENOMEM;
+
+ INIT_HLIST_HEAD(&subscriptions->list);
+ spin_lock_init(&subscriptions->lock);
+ subscriptions->invalidate_seq = 2;
+ subscriptions->itree = RB_ROOT_CACHED;
+ init_waitqueue_head(&subscriptions->wq);
+ INIT_HLIST_HEAD(&subscriptions->deferred_list);
+ }
+
+ ret = mm_take_all_locks(mm);
+ if (unlikely(ret))
+ goto out_clean;
+
+ /*
+ * Serialize the update against mmu_notifier_unregister. A
+ * side note: mmu_notifier_release can't run concurrently with
+ * us because we hold the mm_users pin (either implicitly as
+ * current->mm or explicitly with get_task_mm() or similar).
+ * We can't race against any other mmu notifier method either
+ * thanks to mm_take_all_locks().
+ *
+ * release semantics on the initialization of the
+ * mmu_notifier_subscriptions's contents are provided for unlocked
+ * readers. acquire can only be used while holding the mmgrab or
+ * mmget, and is safe because once created the
+ * mmu_notifier_subscriptions is not freed until the mm is destroyed.
+ * As above, users holding the mmap_lock or one of the
+ * mm_take_all_locks() do not need to use acquire semantics.
+ */
+ if (subscriptions)
+ smp_store_release(&mm->notifier_subscriptions, subscriptions);
+
+ if (subscription) {
+ /* Pairs with the mmdrop in mmu_notifier_unregister_* */
+ mmgrab(mm);
+ subscription->mm = mm;
+ subscription->users = 1;
+
+ spin_lock(&mm->notifier_subscriptions->lock);
+ hlist_add_head_rcu(&subscription->hlist,
+ &mm->notifier_subscriptions->list);
+ spin_unlock(&mm->notifier_subscriptions->lock);
+ } else
+ mm->notifier_subscriptions->has_itree = true;
+
+ mm_drop_all_locks(mm);
+ BUG_ON(atomic_read(&mm->mm_users) <= 0);
+ return 0;
+
+out_clean:
+ kfree(subscriptions);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__mmu_notifier_register);
+
+/**
+ * mmu_notifier_register - Register a notifier on a mm
+ * @subscription: The notifier to attach
+ * @mm: The mm to attach the notifier to
+ *
+ * Must not hold mmap_lock nor any other VM related lock when calling
+ * this registration function. Must also ensure mm_users can't go down
+ * to zero while this runs to avoid races with mmu_notifier_release,
+ * so mm has to be current->mm or the mm should be pinned safely such
+ * as with get_task_mm(). If the mm is not current->mm, the mm_users
+ * pin should be released by calling mmput after mmu_notifier_register
+ * returns.
+ *
+ * mmu_notifier_unregister() or mmu_notifier_put() must be always called to
+ * unregister the notifier.
+ *
+ * While the caller has a mmu_notifier get the subscription->mm pointer will remain
+ * valid, and can be converted to an active mm pointer via mmget_not_zero().
+ */
+int mmu_notifier_register(struct mmu_notifier *subscription,
+ struct mm_struct *mm)
+{
+ int ret;
+
+ mmap_write_lock(mm);
+ ret = __mmu_notifier_register(subscription, mm);
+ mmap_write_unlock(mm);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_register);
+
+static struct mmu_notifier *
+find_get_mmu_notifier(struct mm_struct *mm, const struct mmu_notifier_ops *ops)
+{
+ struct mmu_notifier *subscription;
+
+ spin_lock(&mm->notifier_subscriptions->lock);
+ hlist_for_each_entry_rcu(subscription,
+ &mm->notifier_subscriptions->list, hlist,
+ lockdep_is_held(&mm->notifier_subscriptions->lock)) {
+ if (subscription->ops != ops)
+ continue;
+
+ if (likely(subscription->users != UINT_MAX))
+ subscription->users++;
+ else
+ subscription = ERR_PTR(-EOVERFLOW);
+ spin_unlock(&mm->notifier_subscriptions->lock);
+ return subscription;
+ }
+ spin_unlock(&mm->notifier_subscriptions->lock);
+ return NULL;
+}
+
+/**
+ * mmu_notifier_get_locked - Return the single struct mmu_notifier for
+ * the mm & ops
+ * @ops: The operations struct being subscribe with
+ * @mm : The mm to attach notifiers too
+ *
+ * This function either allocates a new mmu_notifier via
+ * ops->alloc_notifier(), or returns an already existing notifier on the
+ * list. The value of the ops pointer is used to determine when two notifiers
+ * are the same.
+ *
+ * Each call to mmu_notifier_get() must be paired with a call to
+ * mmu_notifier_put(). The caller must hold the write side of mm->mmap_lock.
+ *
+ * While the caller has a mmu_notifier get the mm pointer will remain valid,
+ * and can be converted to an active mm pointer via mmget_not_zero().
+ */
+struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops,
+ struct mm_struct *mm)
+{
+ struct mmu_notifier *subscription;
+ int ret;
+
+ mmap_assert_write_locked(mm);
+
+ if (mm->notifier_subscriptions) {
+ subscription = find_get_mmu_notifier(mm, ops);
+ if (subscription)
+ return subscription;
+ }
+
+ subscription = ops->alloc_notifier(mm);
+ if (IS_ERR(subscription))
+ return subscription;
+ subscription->ops = ops;
+ ret = __mmu_notifier_register(subscription, mm);
+ if (ret)
+ goto out_free;
+ return subscription;
+out_free:
+ subscription->ops->free_notifier(subscription);
+ return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_get_locked);
+
+/* this is called after the last mmu_notifier_unregister() returned */
+void __mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
+{
+ BUG_ON(!hlist_empty(&mm->notifier_subscriptions->list));
+ kfree(mm->notifier_subscriptions);
+ mm->notifier_subscriptions = LIST_POISON1; /* debug */
+}
+
+/*
+ * This releases the mm_count pin automatically and frees the mm
+ * structure if it was the last user of it. It serializes against
+ * running mmu notifiers with SRCU and against mmu_notifier_unregister
+ * with the unregister lock + SRCU. All sptes must be dropped before
+ * calling mmu_notifier_unregister. ->release or any other notifier
+ * method may be invoked concurrently with mmu_notifier_unregister,
+ * and only after mmu_notifier_unregister returned we're guaranteed
+ * that ->release or any other method can't run anymore.
+ */
+void mmu_notifier_unregister(struct mmu_notifier *subscription,
+ struct mm_struct *mm)
+{
+ BUG_ON(atomic_read(&mm->mm_count) <= 0);
+
+ if (!hlist_unhashed(&subscription->hlist)) {
+ /*
+ * SRCU here will force exit_mmap to wait for ->release to
+ * finish before freeing the pages.
+ */
+ int id;
+
+ id = srcu_read_lock(&srcu);
+ /*
+ * exit_mmap will block in mmu_notifier_release to guarantee
+ * that ->release is called before freeing the pages.
+ */
+ if (subscription->ops->release)
+ subscription->ops->release(subscription, mm);
+ srcu_read_unlock(&srcu, id);
+
+ spin_lock(&mm->notifier_subscriptions->lock);
+ /*
+ * Can not use list_del_rcu() since __mmu_notifier_release
+ * can delete it before we hold the lock.
+ */
+ hlist_del_init_rcu(&subscription->hlist);
+ spin_unlock(&mm->notifier_subscriptions->lock);
+ }
+
+ /*
+ * Wait for any running method to finish, of course including
+ * ->release if it was run by mmu_notifier_release instead of us.
+ */
+ synchronize_srcu(&srcu);
+
+ BUG_ON(atomic_read(&mm->mm_count) <= 0);
+
+ mmdrop(mm);
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
+
+static void mmu_notifier_free_rcu(struct rcu_head *rcu)
+{
+ struct mmu_notifier *subscription =
+ container_of(rcu, struct mmu_notifier, rcu);
+ struct mm_struct *mm = subscription->mm;
+
+ subscription->ops->free_notifier(subscription);
+ /* Pairs with the get in __mmu_notifier_register() */
+ mmdrop(mm);
+}
+
+/**
+ * mmu_notifier_put - Release the reference on the notifier
+ * @subscription: The notifier to act on
+ *
+ * This function must be paired with each mmu_notifier_get(), it releases the
+ * reference obtained by the get. If this is the last reference then process
+ * to free the notifier will be run asynchronously.
+ *
+ * Unlike mmu_notifier_unregister() the get/put flow only calls ops->release
+ * when the mm_struct is destroyed. Instead free_notifier is always called to
+ * release any resources held by the user.
+ *
+ * As ops->release is not guaranteed to be called, the user must ensure that
+ * all sptes are dropped, and no new sptes can be established before
+ * mmu_notifier_put() is called.
+ *
+ * This function can be called from the ops->release callback, however the
+ * caller must still ensure it is called pairwise with mmu_notifier_get().
+ *
+ * Modules calling this function must call mmu_notifier_synchronize() in
+ * their __exit functions to ensure the async work is completed.
+ */
+void mmu_notifier_put(struct mmu_notifier *subscription)
+{
+ struct mm_struct *mm = subscription->mm;
+
+ spin_lock(&mm->notifier_subscriptions->lock);
+ if (WARN_ON(!subscription->users) || --subscription->users)
+ goto out_unlock;
+ hlist_del_init_rcu(&subscription->hlist);
+ spin_unlock(&mm->notifier_subscriptions->lock);
+
+ call_srcu(&srcu, &subscription->rcu, mmu_notifier_free_rcu);
+ return;
+
+out_unlock:
+ spin_unlock(&mm->notifier_subscriptions->lock);
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_put);
+
+static int __mmu_interval_notifier_insert(
+ struct mmu_interval_notifier *interval_sub, struct mm_struct *mm,
+ struct mmu_notifier_subscriptions *subscriptions, unsigned long start,
+ unsigned long length, const struct mmu_interval_notifier_ops *ops)
+{
+ interval_sub->mm = mm;
+ interval_sub->ops = ops;
+ RB_CLEAR_NODE(&interval_sub->interval_tree.rb);
+ interval_sub->interval_tree.start = start;
+ /*
+ * Note that the representation of the intervals in the interval tree
+ * considers the ending point as contained in the interval.
+ */
+ if (length == 0 ||
+ check_add_overflow(start, length - 1,
+ &interval_sub->interval_tree.last))
+ return -EOVERFLOW;
+
+ /* Must call with a mmget() held */
+ if (WARN_ON(atomic_read(&mm->mm_users) <= 0))
+ return -EINVAL;
+
+ /* pairs with mmdrop in mmu_interval_notifier_remove() */
+ mmgrab(mm);
+
+ /*
+ * If some invalidate_range_start/end region is going on in parallel
+ * we don't know what VA ranges are affected, so we must assume this
+ * new range is included.
+ *
+ * If the itree is invalidating then we are not allowed to change
+ * it. Retrying until invalidation is done is tricky due to the
+ * possibility for live lock, instead defer the add to
+ * mn_itree_inv_end() so this algorithm is deterministic.
+ *
+ * In all cases the value for the interval_sub->invalidate_seq should be
+ * odd, see mmu_interval_read_begin()
+ */
+ spin_lock(&subscriptions->lock);
+ if (subscriptions->active_invalidate_ranges) {
+ if (mn_itree_is_invalidating(subscriptions))
+ hlist_add_head(&interval_sub->deferred_item,
+ &subscriptions->deferred_list);
+ else {
+ subscriptions->invalidate_seq |= 1;
+ interval_tree_insert(&interval_sub->interval_tree,
+ &subscriptions->itree);
+ }
+ interval_sub->invalidate_seq = subscriptions->invalidate_seq;
+ } else {
+ WARN_ON(mn_itree_is_invalidating(subscriptions));
+ /*
+ * The starting seq for a subscription not under invalidation
+ * should be odd, not equal to the current invalidate_seq and
+ * invalidate_seq should not 'wrap' to the new seq any time
+ * soon.
+ */
+ interval_sub->invalidate_seq =
+ subscriptions->invalidate_seq - 1;
+ interval_tree_insert(&interval_sub->interval_tree,
+ &subscriptions->itree);
+ }
+ spin_unlock(&subscriptions->lock);
+ return 0;
+}
+
+/**
+ * mmu_interval_notifier_insert - Insert an interval notifier
+ * @interval_sub: Interval subscription to register
+ * @start: Starting virtual address to monitor
+ * @length: Length of the range to monitor
+ * @mm: mm_struct to attach to
+ * @ops: Interval notifier operations to be called on matching events
+ *
+ * This function subscribes the interval notifier for notifications from the
+ * mm. Upon return the ops related to mmu_interval_notifier will be called
+ * whenever an event that intersects with the given range occurs.
+ *
+ * Upon return the range_notifier may not be present in the interval tree yet.
+ * The caller must use the normal interval notifier read flow via
+ * mmu_interval_read_begin() to establish SPTEs for this range.
+ */
+int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub,
+ struct mm_struct *mm, unsigned long start,
+ unsigned long length,
+ const struct mmu_interval_notifier_ops *ops)
+{
+ struct mmu_notifier_subscriptions *subscriptions;
+ int ret;
+
+ might_lock(&mm->mmap_lock);
+
+ subscriptions = smp_load_acquire(&mm->notifier_subscriptions);
+ if (!subscriptions || !subscriptions->has_itree) {
+ ret = mmu_notifier_register(NULL, mm);
+ if (ret)
+ return ret;
+ subscriptions = mm->notifier_subscriptions;
+ }
+ return __mmu_interval_notifier_insert(interval_sub, mm, subscriptions,
+ start, length, ops);
+}
+EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert);
+
+int mmu_interval_notifier_insert_locked(
+ struct mmu_interval_notifier *interval_sub, struct mm_struct *mm,
+ unsigned long start, unsigned long length,
+ const struct mmu_interval_notifier_ops *ops)
+{
+ struct mmu_notifier_subscriptions *subscriptions =
+ mm->notifier_subscriptions;
+ int ret;
+
+ mmap_assert_write_locked(mm);
+
+ if (!subscriptions || !subscriptions->has_itree) {
+ ret = __mmu_notifier_register(NULL, mm);
+ if (ret)
+ return ret;
+ subscriptions = mm->notifier_subscriptions;
+ }
+ return __mmu_interval_notifier_insert(interval_sub, mm, subscriptions,
+ start, length, ops);
+}
+EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked);
+
+static bool
+mmu_interval_seq_released(struct mmu_notifier_subscriptions *subscriptions,
+ unsigned long seq)
+{
+ bool ret;
+
+ spin_lock(&subscriptions->lock);
+ ret = subscriptions->invalidate_seq != seq;
+ spin_unlock(&subscriptions->lock);
+ return ret;
+}
+
+/**
+ * mmu_interval_notifier_remove - Remove a interval notifier
+ * @interval_sub: Interval subscription to unregister
+ *
+ * This function must be paired with mmu_interval_notifier_insert(). It cannot
+ * be called from any ops callback.
+ *
+ * Once this returns ops callbacks are no longer running on other CPUs and
+ * will not be called in future.
+ */
+void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub)
+{
+ struct mm_struct *mm = interval_sub->mm;
+ struct mmu_notifier_subscriptions *subscriptions =
+ mm->notifier_subscriptions;
+ unsigned long seq = 0;
+
+ might_sleep();
+
+ spin_lock(&subscriptions->lock);
+ if (mn_itree_is_invalidating(subscriptions)) {
+ /*
+ * remove is being called after insert put this on the
+ * deferred list, but before the deferred list was processed.
+ */
+ if (RB_EMPTY_NODE(&interval_sub->interval_tree.rb)) {
+ hlist_del(&interval_sub->deferred_item);
+ } else {
+ hlist_add_head(&interval_sub->deferred_item,
+ &subscriptions->deferred_list);
+ seq = subscriptions->invalidate_seq;
+ }
+ } else {
+ WARN_ON(RB_EMPTY_NODE(&interval_sub->interval_tree.rb));
+ interval_tree_remove(&interval_sub->interval_tree,
+ &subscriptions->itree);
+ }
+ spin_unlock(&subscriptions->lock);
+
+ /*
+ * The possible sleep on progress in the invalidation requires the
+ * caller not hold any locks held by invalidation callbacks.
+ */
+ lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
+ lock_map_release(&__mmu_notifier_invalidate_range_start_map);
+ if (seq)
+ wait_event(subscriptions->wq,
+ mmu_interval_seq_released(subscriptions, seq));
+
+ /* pairs with mmgrab in mmu_interval_notifier_insert() */
+ mmdrop(mm);
+}
+EXPORT_SYMBOL_GPL(mmu_interval_notifier_remove);
+
+/**
+ * mmu_notifier_synchronize - Ensure all mmu_notifiers are freed
+ *
+ * This function ensures that all outstanding async SRU work from
+ * mmu_notifier_put() is completed. After it returns any mmu_notifier_ops
+ * associated with an unused mmu_notifier will no longer be called.
+ *
+ * Before using the caller must ensure that all of its mmu_notifiers have been
+ * fully released via mmu_notifier_put().
+ *
+ * Modules using the mmu_notifier_put() API should call this in their __exit
+ * function to avoid module unloading races.
+ */
+void mmu_notifier_synchronize(void)
+{
+ synchronize_srcu(&srcu);
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_synchronize);
+
+bool
+mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range)
+{
+ if (!range->vma || range->event != MMU_NOTIFY_PROTECTION_VMA)
+ return false;
+ /* Return true if the vma still have the read flag set. */
+ return range->vma->vm_flags & VM_READ;
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_range_update_to_read_only);
diff --git a/mm/mmzone.c b/mm/mmzone.c
new file mode 100644
index 000000000..f337831af
--- /dev/null
+++ b/mm/mmzone.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/mm/mmzone.c
+ *
+ * management codes for pgdats, zones and page flags
+ */
+
+
+#include <linux/stddef.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+
+struct pglist_data *first_online_pgdat(void)
+{
+ return NODE_DATA(first_online_node);
+}
+
+struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
+{
+ int nid = next_online_node(pgdat->node_id);
+
+ if (nid == MAX_NUMNODES)
+ return NULL;
+ return NODE_DATA(nid);
+}
+
+/*
+ * next_zone - helper magic for for_each_zone()
+ */
+struct zone *next_zone(struct zone *zone)
+{
+ pg_data_t *pgdat = zone->zone_pgdat;
+
+ if (zone < pgdat->node_zones + MAX_NR_ZONES - 1)
+ zone++;
+ else {
+ pgdat = next_online_pgdat(pgdat);
+ if (pgdat)
+ zone = pgdat->node_zones;
+ else
+ zone = NULL;
+ }
+ return zone;
+}
+
+static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes)
+{
+#ifdef CONFIG_NUMA
+ return node_isset(zonelist_node_idx(zref), *nodes);
+#else
+ return 1;
+#endif /* CONFIG_NUMA */
+}
+
+/* Returns the next zone at or below highest_zoneidx in a zonelist */
+struct zoneref *__next_zones_zonelist(struct zoneref *z,
+ enum zone_type highest_zoneidx,
+ nodemask_t *nodes)
+{
+ /*
+ * Find the next suitable zone to use for the allocation.
+ * Only filter based on nodemask if it's set
+ */
+ if (unlikely(nodes == NULL))
+ while (zonelist_zone_idx(z) > highest_zoneidx)
+ z++;
+ else
+ while (zonelist_zone_idx(z) > highest_zoneidx ||
+ (z->zone && !zref_in_nodemask(z, nodes)))
+ z++;
+
+ return z;
+}
+
+void lruvec_init(struct lruvec *lruvec)
+{
+ enum lru_list lru;
+
+ memset(lruvec, 0, sizeof(struct lruvec));
+
+ for_each_lru(lru)
+ INIT_LIST_HEAD(&lruvec->lists[lru]);
+}
+
+#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
+int page_cpupid_xchg_last(struct page *page, int cpupid)
+{
+ unsigned long old_flags, flags;
+ int last_cpupid;
+
+ do {
+ old_flags = flags = page->flags;
+ last_cpupid = page_cpupid_last(page);
+
+ flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
+ flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
+ } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags));
+
+ return last_cpupid;
+}
+#endif
diff --git a/mm/mprotect.c b/mm/mprotect.c
new file mode 100644
index 000000000..7ea0aee0c
--- /dev/null
+++ b/mm/mprotect.c
@@ -0,0 +1,700 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * mm/mprotect.c
+ *
+ * (C) Copyright 1994 Linus Torvalds
+ * (C) Copyright 2002 Christoph Hellwig
+ *
+ * Address space accounting code <alan@lxorguk.ukuu.org.uk>
+ * (C) Copyright 2002 Red Hat Inc, All Rights Reserved
+ */
+
+#include <linux/pagewalk.h>
+#include <linux/hugetlb.h>
+#include <linux/shm.h>
+#include <linux/mman.h>
+#include <linux/fs.h>
+#include <linux/highmem.h>
+#include <linux/security.h>
+#include <linux/mempolicy.h>
+#include <linux/personality.h>
+#include <linux/syscalls.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/mmu_notifier.h>
+#include <linux/migrate.h>
+#include <linux/perf_event.h>
+#include <linux/pkeys.h>
+#include <linux/ksm.h>
+#include <linux/uaccess.h>
+#include <linux/mm_inline.h>
+#include <linux/pgtable.h>
+#include <asm/cacheflush.h>
+#include <asm/mmu_context.h>
+#include <asm/tlbflush.h>
+
+#include "internal.h"
+
+static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, unsigned long end, pgprot_t newprot,
+ unsigned long cp_flags)
+{
+ pte_t *pte, oldpte;
+ spinlock_t *ptl;
+ unsigned long pages = 0;
+ int target_node = NUMA_NO_NODE;
+ bool dirty_accountable = cp_flags & MM_CP_DIRTY_ACCT;
+ bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
+ bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
+ bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
+
+ /*
+ * Can be called with only the mmap_lock for reading by
+ * prot_numa so we must check the pmd isn't constantly
+ * changing from under us from pmd_none to pmd_trans_huge
+ * and/or the other way around.
+ */
+ if (pmd_trans_unstable(pmd))
+ return 0;
+
+ /*
+ * The pmd points to a regular pte so the pmd can't change
+ * from under us even if the mmap_lock is only hold for
+ * reading.
+ */
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+
+ /* Get target node for single threaded private VMAs */
+ if (prot_numa && !(vma->vm_flags & VM_SHARED) &&
+ atomic_read(&vma->vm_mm->mm_users) == 1)
+ target_node = numa_node_id();
+
+ flush_tlb_batched_pending(vma->vm_mm);
+ arch_enter_lazy_mmu_mode();
+ do {
+ oldpte = *pte;
+ if (pte_present(oldpte)) {
+ pte_t ptent;
+ bool preserve_write = prot_numa && pte_write(oldpte);
+
+ /*
+ * Avoid trapping faults against the zero or KSM
+ * pages. See similar comment in change_huge_pmd.
+ */
+ if (prot_numa) {
+ struct page *page;
+
+ /* Avoid TLB flush if possible */
+ if (pte_protnone(oldpte))
+ continue;
+
+ page = vm_normal_page(vma, addr, oldpte);
+ if (!page || PageKsm(page))
+ continue;
+
+ /* Also skip shared copy-on-write pages */
+ if (is_cow_mapping(vma->vm_flags) &&
+ page_count(page) != 1)
+ continue;
+
+ /*
+ * While migration can move some dirty pages,
+ * it cannot move them all from MIGRATE_ASYNC
+ * context.
+ */
+ if (page_is_file_lru(page) && PageDirty(page))
+ continue;
+
+ /*
+ * Don't mess with PTEs if page is already on the node
+ * a single-threaded process is running on.
+ */
+ if (target_node == page_to_nid(page))
+ continue;
+ }
+
+ oldpte = ptep_modify_prot_start(vma, addr, pte);
+ ptent = pte_modify(oldpte, newprot);
+ if (preserve_write)
+ ptent = pte_mk_savedwrite(ptent);
+
+ if (uffd_wp) {
+ ptent = pte_wrprotect(ptent);
+ ptent = pte_mkuffd_wp(ptent);
+ } else if (uffd_wp_resolve) {
+ /*
+ * Leave the write bit to be handled
+ * by PF interrupt handler, then
+ * things like COW could be properly
+ * handled.
+ */
+ ptent = pte_clear_uffd_wp(ptent);
+ }
+
+ /* Avoid taking write faults for known dirty pages */
+ if (dirty_accountable && pte_dirty(ptent) &&
+ (pte_soft_dirty(ptent) ||
+ !(vma->vm_flags & VM_SOFTDIRTY))) {
+ ptent = pte_mkwrite(ptent);
+ }
+ ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
+ pages++;
+ } else if (is_swap_pte(oldpte)) {
+ swp_entry_t entry = pte_to_swp_entry(oldpte);
+ pte_t newpte;
+
+ if (is_write_migration_entry(entry)) {
+ /*
+ * A protection check is difficult so
+ * just be safe and disable write
+ */
+ make_migration_entry_read(&entry);
+ newpte = swp_entry_to_pte(entry);
+ if (pte_swp_soft_dirty(oldpte))
+ newpte = pte_swp_mksoft_dirty(newpte);
+ if (pte_swp_uffd_wp(oldpte))
+ newpte = pte_swp_mkuffd_wp(newpte);
+ } else if (is_write_device_private_entry(entry)) {
+ /*
+ * We do not preserve soft-dirtiness. See
+ * copy_one_pte() for explanation.
+ */
+ make_device_private_entry_read(&entry);
+ newpte = swp_entry_to_pte(entry);
+ if (pte_swp_uffd_wp(oldpte))
+ newpte = pte_swp_mkuffd_wp(newpte);
+ } else {
+ newpte = oldpte;
+ }
+
+ if (uffd_wp)
+ newpte = pte_swp_mkuffd_wp(newpte);
+ else if (uffd_wp_resolve)
+ newpte = pte_swp_clear_uffd_wp(newpte);
+
+ if (!pte_same(oldpte, newpte)) {
+ set_pte_at(vma->vm_mm, addr, pte, newpte);
+ pages++;
+ }
+ }
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(pte - 1, ptl);
+
+ return pages;
+}
+
+/*
+ * Used when setting automatic NUMA hinting protection where it is
+ * critical that a numa hinting PMD is not confused with a bad PMD.
+ */
+static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd)
+{
+ pmd_t pmdval = pmd_read_atomic(pmd);
+
+ /* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ barrier();
+#endif
+
+ if (pmd_none(pmdval))
+ return 1;
+ if (pmd_trans_huge(pmdval))
+ return 0;
+ if (unlikely(pmd_bad(pmdval))) {
+ pmd_clear_bad(pmd);
+ return 1;
+ }
+
+ return 0;
+}
+
+static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
+ pud_t *pud, unsigned long addr, unsigned long end,
+ pgprot_t newprot, unsigned long cp_flags)
+{
+ pmd_t *pmd;
+ unsigned long next;
+ unsigned long pages = 0;
+ unsigned long nr_huge_updates = 0;
+ struct mmu_notifier_range range;
+
+ range.start = 0;
+
+ pmd = pmd_offset(pud, addr);
+ do {
+ unsigned long this_pages;
+
+ next = pmd_addr_end(addr, end);
+
+ /*
+ * Automatic NUMA balancing walks the tables with mmap_lock
+ * held for read. It's possible a parallel update to occur
+ * between pmd_trans_huge() and a pmd_none_or_clear_bad()
+ * check leading to a false positive and clearing.
+ * Hence, it's necessary to atomically read the PMD value
+ * for all the checks.
+ */
+ if (!is_swap_pmd(*pmd) && !pmd_devmap(*pmd) &&
+ pmd_none_or_clear_bad_unless_trans_huge(pmd))
+ goto next;
+
+ /* invoke the mmu notifier if the pmd is populated */
+ if (!range.start) {
+ mmu_notifier_range_init(&range,
+ MMU_NOTIFY_PROTECTION_VMA, 0,
+ vma, vma->vm_mm, addr, end);
+ mmu_notifier_invalidate_range_start(&range);
+ }
+
+ if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
+ if (next - addr != HPAGE_PMD_SIZE) {
+ __split_huge_pmd(vma, pmd, addr, false, NULL);
+ } else {
+ int nr_ptes = change_huge_pmd(vma, pmd, addr,
+ newprot, cp_flags);
+
+ if (nr_ptes) {
+ if (nr_ptes == HPAGE_PMD_NR) {
+ pages += HPAGE_PMD_NR;
+ nr_huge_updates++;
+ }
+
+ /* huge pmd was handled */
+ goto next;
+ }
+ }
+ /* fall through, the trans huge pmd just split */
+ }
+ this_pages = change_pte_range(vma, pmd, addr, next, newprot,
+ cp_flags);
+ pages += this_pages;
+next:
+ cond_resched();
+ } while (pmd++, addr = next, addr != end);
+
+ if (range.start)
+ mmu_notifier_invalidate_range_end(&range);
+
+ if (nr_huge_updates)
+ count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
+ return pages;
+}
+
+static inline unsigned long change_pud_range(struct vm_area_struct *vma,
+ p4d_t *p4d, unsigned long addr, unsigned long end,
+ pgprot_t newprot, unsigned long cp_flags)
+{
+ pud_t *pud;
+ unsigned long next;
+ unsigned long pages = 0;
+
+ pud = pud_offset(p4d, addr);
+ do {
+ next = pud_addr_end(addr, end);
+ if (pud_none_or_clear_bad(pud))
+ continue;
+ pages += change_pmd_range(vma, pud, addr, next, newprot,
+ cp_flags);
+ } while (pud++, addr = next, addr != end);
+
+ return pages;
+}
+
+static inline unsigned long change_p4d_range(struct vm_area_struct *vma,
+ pgd_t *pgd, unsigned long addr, unsigned long end,
+ pgprot_t newprot, unsigned long cp_flags)
+{
+ p4d_t *p4d;
+ unsigned long next;
+ unsigned long pages = 0;
+
+ p4d = p4d_offset(pgd, addr);
+ do {
+ next = p4d_addr_end(addr, end);
+ if (p4d_none_or_clear_bad(p4d))
+ continue;
+ pages += change_pud_range(vma, p4d, addr, next, newprot,
+ cp_flags);
+ } while (p4d++, addr = next, addr != end);
+
+ return pages;
+}
+
+static unsigned long change_protection_range(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end, pgprot_t newprot,
+ unsigned long cp_flags)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pgd_t *pgd;
+ unsigned long next;
+ unsigned long start = addr;
+ unsigned long pages = 0;
+
+ BUG_ON(addr >= end);
+ pgd = pgd_offset(mm, addr);
+ flush_cache_range(vma, addr, end);
+ inc_tlb_flush_pending(mm);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (pgd_none_or_clear_bad(pgd))
+ continue;
+ pages += change_p4d_range(vma, pgd, addr, next, newprot,
+ cp_flags);
+ } while (pgd++, addr = next, addr != end);
+
+ /* Only flush the TLB if we actually modified any entries: */
+ if (pages)
+ flush_tlb_range(vma, start, end);
+ dec_tlb_flush_pending(mm);
+
+ return pages;
+}
+
+unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, pgprot_t newprot,
+ unsigned long cp_flags)
+{
+ unsigned long pages;
+
+ BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL);
+
+ if (is_vm_hugetlb_page(vma))
+ pages = hugetlb_change_protection(vma, start, end, newprot);
+ else
+ pages = change_protection_range(vma, start, end, newprot,
+ cp_flags);
+
+ return pages;
+}
+
+static int prot_none_pte_entry(pte_t *pte, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
+ 0 : -EACCES;
+}
+
+static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask,
+ unsigned long addr, unsigned long next,
+ struct mm_walk *walk)
+{
+ return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
+ 0 : -EACCES;
+}
+
+static int prot_none_test(unsigned long addr, unsigned long next,
+ struct mm_walk *walk)
+{
+ return 0;
+}
+
+static const struct mm_walk_ops prot_none_walk_ops = {
+ .pte_entry = prot_none_pte_entry,
+ .hugetlb_entry = prot_none_hugetlb_entry,
+ .test_walk = prot_none_test,
+};
+
+int
+mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
+ unsigned long start, unsigned long end, unsigned long newflags)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long oldflags = vma->vm_flags;
+ long nrpages = (end - start) >> PAGE_SHIFT;
+ unsigned long charged = 0;
+ pgoff_t pgoff;
+ int error;
+ int dirty_accountable = 0;
+
+ if (newflags == oldflags) {
+ *pprev = vma;
+ return 0;
+ }
+
+ /*
+ * Do PROT_NONE PFN permission checks here when we can still
+ * bail out without undoing a lot of state. This is a rather
+ * uncommon case, so doesn't need to be very optimized.
+ */
+ if (arch_has_pfn_modify_check() &&
+ (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
+ (newflags & VM_ACCESS_FLAGS) == 0) {
+ pgprot_t new_pgprot = vm_get_page_prot(newflags);
+
+ error = walk_page_range(current->mm, start, end,
+ &prot_none_walk_ops, &new_pgprot);
+ if (error)
+ return error;
+ }
+
+ /*
+ * If we make a private mapping writable we increase our commit;
+ * but (without finer accounting) cannot reduce our commit if we
+ * make it unwritable again. hugetlb mapping were accounted for
+ * even if read-only so there is no need to account for them here
+ */
+ if (newflags & VM_WRITE) {
+ /* Check space limits when area turns into data. */
+ if (!may_expand_vm(mm, newflags, nrpages) &&
+ may_expand_vm(mm, oldflags, nrpages))
+ return -ENOMEM;
+ if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
+ VM_SHARED|VM_NORESERVE))) {
+ charged = nrpages;
+ if (security_vm_enough_memory_mm(mm, charged))
+ return -ENOMEM;
+ newflags |= VM_ACCOUNT;
+ }
+ }
+
+ /*
+ * First try to merge with previous and/or next vma.
+ */
+ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+ *pprev = vma_merge(mm, *pprev, start, end, newflags,
+ vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
+ vma->vm_userfaultfd_ctx);
+ if (*pprev) {
+ vma = *pprev;
+ VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY);
+ goto success;
+ }
+
+ *pprev = vma;
+
+ if (start != vma->vm_start) {
+ error = split_vma(mm, vma, start, 1);
+ if (error)
+ goto fail;
+ }
+
+ if (end != vma->vm_end) {
+ error = split_vma(mm, vma, end, 0);
+ if (error)
+ goto fail;
+ }
+
+success:
+ /*
+ * vm_flags and vm_page_prot are protected by the mmap_lock
+ * held in write mode.
+ */
+ vma->vm_flags = newflags;
+ dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot);
+ vma_set_page_prot(vma);
+
+ change_protection(vma, start, end, vma->vm_page_prot,
+ dirty_accountable ? MM_CP_DIRTY_ACCT : 0);
+
+ /*
+ * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
+ * fault on access.
+ */
+ if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED &&
+ (newflags & VM_WRITE)) {
+ populate_vma_page_range(vma, start, end, NULL);
+ }
+
+ vm_stat_account(mm, oldflags, -nrpages);
+ vm_stat_account(mm, newflags, nrpages);
+ perf_event_mmap(vma);
+ return 0;
+
+fail:
+ vm_unacct_memory(charged);
+ return error;
+}
+
+/*
+ * pkey==-1 when doing a legacy mprotect()
+ */
+static int do_mprotect_pkey(unsigned long start, size_t len,
+ unsigned long prot, int pkey)
+{
+ unsigned long nstart, end, tmp, reqprot;
+ struct vm_area_struct *vma, *prev;
+ int error = -EINVAL;
+ const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
+ const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
+ (prot & PROT_READ);
+
+ start = untagged_addr(start);
+
+ prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
+ if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
+ return -EINVAL;
+
+ if (start & ~PAGE_MASK)
+ return -EINVAL;
+ if (!len)
+ return 0;
+ len = PAGE_ALIGN(len);
+ end = start + len;
+ if (end <= start)
+ return -ENOMEM;
+ if (!arch_validate_prot(prot, start))
+ return -EINVAL;
+
+ reqprot = prot;
+
+ if (mmap_write_lock_killable(current->mm))
+ return -EINTR;
+
+ /*
+ * If userspace did not allocate the pkey, do not let
+ * them use it here.
+ */
+ error = -EINVAL;
+ if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey))
+ goto out;
+
+ vma = find_vma(current->mm, start);
+ error = -ENOMEM;
+ if (!vma)
+ goto out;
+ prev = vma->vm_prev;
+ if (unlikely(grows & PROT_GROWSDOWN)) {
+ if (vma->vm_start >= end)
+ goto out;
+ start = vma->vm_start;
+ error = -EINVAL;
+ if (!(vma->vm_flags & VM_GROWSDOWN))
+ goto out;
+ } else {
+ if (vma->vm_start > start)
+ goto out;
+ if (unlikely(grows & PROT_GROWSUP)) {
+ end = vma->vm_end;
+ error = -EINVAL;
+ if (!(vma->vm_flags & VM_GROWSUP))
+ goto out;
+ }
+ }
+ if (start > vma->vm_start)
+ prev = vma;
+
+ for (nstart = start ; ; ) {
+ unsigned long mask_off_old_flags;
+ unsigned long newflags;
+ int new_vma_pkey;
+
+ /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
+
+ /* Does the application expect PROT_READ to imply PROT_EXEC */
+ if (rier && (vma->vm_flags & VM_MAYEXEC))
+ prot |= PROT_EXEC;
+
+ /*
+ * Each mprotect() call explicitly passes r/w/x permissions.
+ * If a permission is not passed to mprotect(), it must be
+ * cleared from the VMA.
+ */
+ mask_off_old_flags = VM_READ | VM_WRITE | VM_EXEC |
+ VM_FLAGS_CLEAR;
+
+ new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey);
+ newflags = calc_vm_prot_bits(prot, new_vma_pkey);
+ newflags |= (vma->vm_flags & ~mask_off_old_flags);
+
+ /* newflags >> 4 shift VM_MAY% in place of VM_% */
+ if ((newflags & ~(newflags >> 4)) & VM_ACCESS_FLAGS) {
+ error = -EACCES;
+ goto out;
+ }
+
+ /* Allow architectures to sanity-check the new flags */
+ if (!arch_validate_flags(newflags)) {
+ error = -EINVAL;
+ goto out;
+ }
+
+ error = security_file_mprotect(vma, reqprot, prot);
+ if (error)
+ goto out;
+
+ tmp = vma->vm_end;
+ if (tmp > end)
+ tmp = end;
+ error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
+ if (error)
+ goto out;
+ nstart = tmp;
+
+ if (nstart < prev->vm_end)
+ nstart = prev->vm_end;
+ if (nstart >= end)
+ goto out;
+
+ vma = prev->vm_next;
+ if (!vma || vma->vm_start != nstart) {
+ error = -ENOMEM;
+ goto out;
+ }
+ prot = reqprot;
+ }
+out:
+ mmap_write_unlock(current->mm);
+ return error;
+}
+
+SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
+ unsigned long, prot)
+{
+ return do_mprotect_pkey(start, len, prot, -1);
+}
+
+#ifdef CONFIG_ARCH_HAS_PKEYS
+
+SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len,
+ unsigned long, prot, int, pkey)
+{
+ return do_mprotect_pkey(start, len, prot, pkey);
+}
+
+SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val)
+{
+ int pkey;
+ int ret;
+
+ /* No flags supported yet. */
+ if (flags)
+ return -EINVAL;
+ /* check for unsupported init values */
+ if (init_val & ~PKEY_ACCESS_MASK)
+ return -EINVAL;
+
+ mmap_write_lock(current->mm);
+ pkey = mm_pkey_alloc(current->mm);
+
+ ret = -ENOSPC;
+ if (pkey == -1)
+ goto out;
+
+ ret = arch_set_user_pkey_access(current, pkey, init_val);
+ if (ret) {
+ mm_pkey_free(current->mm, pkey);
+ goto out;
+ }
+ ret = pkey;
+out:
+ mmap_write_unlock(current->mm);
+ return ret;
+}
+
+SYSCALL_DEFINE1(pkey_free, int, pkey)
+{
+ int ret;
+
+ mmap_write_lock(current->mm);
+ ret = mm_pkey_free(current->mm, pkey);
+ mmap_write_unlock(current->mm);
+
+ /*
+ * We could provie warnings or errors if any VMA still
+ * has the pkey set here.
+ */
+ return ret;
+}
+
+#endif /* CONFIG_ARCH_HAS_PKEYS */
diff --git a/mm/mremap.c b/mm/mremap.c
new file mode 100644
index 000000000..3334c4022
--- /dev/null
+++ b/mm/mremap.c
@@ -0,0 +1,815 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * mm/mremap.c
+ *
+ * (C) Copyright 1996 Linus Torvalds
+ *
+ * Address space accounting code <alan@lxorguk.ukuu.org.uk>
+ * (C) Copyright 2002 Red Hat Inc, All Rights Reserved
+ */
+
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/shm.h>
+#include <linux/ksm.h>
+#include <linux/mman.h>
+#include <linux/swap.h>
+#include <linux/capability.h>
+#include <linux/fs.h>
+#include <linux/swapops.h>
+#include <linux/highmem.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/mmu_notifier.h>
+#include <linux/uaccess.h>
+#include <linux/mm-arch-hooks.h>
+#include <linux/userfaultfd_k.h>
+
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+
+#include "internal.h"
+
+static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ pgd = pgd_offset(mm, addr);
+ if (pgd_none_or_clear_bad(pgd))
+ return NULL;
+
+ p4d = p4d_offset(pgd, addr);
+ if (p4d_none_or_clear_bad(p4d))
+ return NULL;
+
+ pud = pud_offset(p4d, addr);
+ if (pud_none_or_clear_bad(pud))
+ return NULL;
+
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd))
+ return NULL;
+
+ return pmd;
+}
+
+static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ pgd = pgd_offset(mm, addr);
+ p4d = p4d_alloc(mm, pgd, addr);
+ if (!p4d)
+ return NULL;
+ pud = pud_alloc(mm, p4d, addr);
+ if (!pud)
+ return NULL;
+
+ pmd = pmd_alloc(mm, pud, addr);
+ if (!pmd)
+ return NULL;
+
+ VM_BUG_ON(pmd_trans_huge(*pmd));
+
+ return pmd;
+}
+
+static void take_rmap_locks(struct vm_area_struct *vma)
+{
+ if (vma->vm_file)
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+ if (vma->anon_vma)
+ anon_vma_lock_write(vma->anon_vma);
+}
+
+static void drop_rmap_locks(struct vm_area_struct *vma)
+{
+ if (vma->anon_vma)
+ anon_vma_unlock_write(vma->anon_vma);
+ if (vma->vm_file)
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+}
+
+static pte_t move_soft_dirty_pte(pte_t pte)
+{
+ /*
+ * Set soft dirty bit so we can notice
+ * in userspace the ptes were moved.
+ */
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ if (pte_present(pte))
+ pte = pte_mksoft_dirty(pte);
+ else if (is_swap_pte(pte))
+ pte = pte_swp_mksoft_dirty(pte);
+#endif
+ return pte;
+}
+
+static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
+ unsigned long old_addr, unsigned long old_end,
+ struct vm_area_struct *new_vma, pmd_t *new_pmd,
+ unsigned long new_addr, bool need_rmap_locks)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pte_t *old_pte, *new_pte, pte;
+ spinlock_t *old_ptl, *new_ptl;
+ bool force_flush = false;
+ unsigned long len = old_end - old_addr;
+
+ /*
+ * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
+ * locks to ensure that rmap will always observe either the old or the
+ * new ptes. This is the easiest way to avoid races with
+ * truncate_pagecache(), page migration, etc...
+ *
+ * When need_rmap_locks is false, we use other ways to avoid
+ * such races:
+ *
+ * - During exec() shift_arg_pages(), we use a specially tagged vma
+ * which rmap call sites look for using vma_is_temporary_stack().
+ *
+ * - During mremap(), new_vma is often known to be placed after vma
+ * in rmap traversal order. This ensures rmap will always observe
+ * either the old pte, or the new pte, or both (the page table locks
+ * serialize access to individual ptes, but only rmap traversal
+ * order guarantees that we won't miss both the old and new ptes).
+ */
+ if (need_rmap_locks)
+ take_rmap_locks(vma);
+
+ /*
+ * We don't have to worry about the ordering of src and dst
+ * pte locks because exclusive mmap_lock prevents deadlock.
+ */
+ old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
+ new_pte = pte_offset_map(new_pmd, new_addr);
+ new_ptl = pte_lockptr(mm, new_pmd);
+ if (new_ptl != old_ptl)
+ spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+ flush_tlb_batched_pending(vma->vm_mm);
+ arch_enter_lazy_mmu_mode();
+
+ for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
+ new_pte++, new_addr += PAGE_SIZE) {
+ if (pte_none(*old_pte))
+ continue;
+
+ pte = ptep_get_and_clear(mm, old_addr, old_pte);
+ /*
+ * If we are remapping a valid PTE, make sure
+ * to flush TLB before we drop the PTL for the
+ * PTE.
+ *
+ * NOTE! Both old and new PTL matter: the old one
+ * for racing with page_mkclean(), the new one to
+ * make sure the physical page stays valid until
+ * the TLB entry for the old mapping has been
+ * flushed.
+ */
+ if (pte_present(pte))
+ force_flush = true;
+ pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
+ pte = move_soft_dirty_pte(pte);
+ set_pte_at(mm, new_addr, new_pte, pte);
+ }
+
+ arch_leave_lazy_mmu_mode();
+ if (force_flush)
+ flush_tlb_range(vma, old_end - len, old_end);
+ if (new_ptl != old_ptl)
+ spin_unlock(new_ptl);
+ pte_unmap(new_pte - 1);
+ pte_unmap_unlock(old_pte - 1, old_ptl);
+ if (need_rmap_locks)
+ drop_rmap_locks(vma);
+}
+
+#ifdef CONFIG_HAVE_MOVE_PMD
+static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
+ unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
+{
+ spinlock_t *old_ptl, *new_ptl;
+ struct mm_struct *mm = vma->vm_mm;
+ pmd_t pmd;
+
+ /*
+ * The destination pmd shouldn't be established, free_pgtables()
+ * should have released it.
+ *
+ * However, there's a case during execve() where we use mremap
+ * to move the initial stack, and in that case the target area
+ * may overlap the source area (always moving down).
+ *
+ * If everything is PMD-aligned, that works fine, as moving
+ * each pmd down will clear the source pmd. But if we first
+ * have a few 4kB-only pages that get moved down, and then
+ * hit the "now the rest is PMD-aligned, let's do everything
+ * one pmd at a time", we will still have the old (now empty
+ * of any 4kB pages, but still there) PMD in the page table
+ * tree.
+ *
+ * Warn on it once - because we really should try to figure
+ * out how to do this better - but then say "I won't move
+ * this pmd".
+ *
+ * One alternative might be to just unmap the target pmd at
+ * this point, and verify that it really is empty. We'll see.
+ */
+ if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
+ return false;
+
+ /*
+ * We don't have to worry about the ordering of src and dst
+ * ptlocks because exclusive mmap_lock prevents deadlock.
+ */
+ old_ptl = pmd_lock(vma->vm_mm, old_pmd);
+ new_ptl = pmd_lockptr(mm, new_pmd);
+ if (new_ptl != old_ptl)
+ spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+
+ /* Clear the pmd */
+ pmd = *old_pmd;
+ pmd_clear(old_pmd);
+
+ VM_BUG_ON(!pmd_none(*new_pmd));
+
+ /* Set the new pmd */
+ set_pmd_at(mm, new_addr, new_pmd, pmd);
+ flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
+ if (new_ptl != old_ptl)
+ spin_unlock(new_ptl);
+ spin_unlock(old_ptl);
+
+ return true;
+}
+#endif
+
+unsigned long move_page_tables(struct vm_area_struct *vma,
+ unsigned long old_addr, struct vm_area_struct *new_vma,
+ unsigned long new_addr, unsigned long len,
+ bool need_rmap_locks)
+{
+ unsigned long extent, next, old_end;
+ struct mmu_notifier_range range;
+ pmd_t *old_pmd, *new_pmd;
+
+ if (!len)
+ return 0;
+
+ old_end = old_addr + len;
+ flush_cache_range(vma, old_addr, old_end);
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
+ old_addr, old_end);
+ mmu_notifier_invalidate_range_start(&range);
+
+ for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
+ cond_resched();
+ next = (old_addr + PMD_SIZE) & PMD_MASK;
+ /* even if next overflowed, extent below will be ok */
+ extent = next - old_addr;
+ if (extent > old_end - old_addr)
+ extent = old_end - old_addr;
+ next = (new_addr + PMD_SIZE) & PMD_MASK;
+ if (extent > next - new_addr)
+ extent = next - new_addr;
+ old_pmd = get_old_pmd(vma->vm_mm, old_addr);
+ if (!old_pmd)
+ continue;
+ new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
+ if (!new_pmd)
+ break;
+ if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) || pmd_devmap(*old_pmd)) {
+ if (extent == HPAGE_PMD_SIZE) {
+ bool moved;
+ /* See comment in move_ptes() */
+ if (need_rmap_locks)
+ take_rmap_locks(vma);
+ moved = move_huge_pmd(vma, old_addr, new_addr,
+ old_pmd, new_pmd);
+ if (need_rmap_locks)
+ drop_rmap_locks(vma);
+ if (moved)
+ continue;
+ }
+ split_huge_pmd(vma, old_pmd, old_addr);
+ if (pmd_trans_unstable(old_pmd))
+ continue;
+ } else if (extent == PMD_SIZE) {
+#ifdef CONFIG_HAVE_MOVE_PMD
+ /*
+ * If the extent is PMD-sized, try to speed the move by
+ * moving at the PMD level if possible.
+ */
+ bool moved;
+
+ take_rmap_locks(vma);
+ moved = move_normal_pmd(vma, old_addr, new_addr,
+ old_pmd, new_pmd);
+ drop_rmap_locks(vma);
+ if (moved)
+ continue;
+#endif
+ }
+
+ if (pte_alloc(new_vma->vm_mm, new_pmd))
+ break;
+ move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
+ new_pmd, new_addr, need_rmap_locks);
+ }
+
+ mmu_notifier_invalidate_range_end(&range);
+
+ return len + old_addr - old_end; /* how much done */
+}
+
+static unsigned long move_vma(struct vm_area_struct *vma,
+ unsigned long old_addr, unsigned long old_len,
+ unsigned long new_len, unsigned long new_addr,
+ bool *locked, unsigned long flags,
+ struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *new_vma;
+ unsigned long vm_flags = vma->vm_flags;
+ unsigned long new_pgoff;
+ unsigned long moved_len;
+ unsigned long excess = 0;
+ unsigned long hiwater_vm;
+ int split = 0;
+ int err;
+ bool need_rmap_locks;
+
+ /*
+ * We'd prefer to avoid failure later on in do_munmap:
+ * which may split one vma into three before unmapping.
+ */
+ if (mm->map_count >= sysctl_max_map_count - 3)
+ return -ENOMEM;
+
+ /*
+ * Advise KSM to break any KSM pages in the area to be moved:
+ * it would be confusing if they were to turn up at the new
+ * location, where they happen to coincide with different KSM
+ * pages recently unmapped. But leave vma->vm_flags as it was,
+ * so KSM can come around to merge on vma and new_vma afterwards.
+ */
+ err = ksm_madvise(vma, old_addr, old_addr + old_len,
+ MADV_UNMERGEABLE, &vm_flags);
+ if (err)
+ return err;
+
+ new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
+ new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
+ &need_rmap_locks);
+ if (!new_vma)
+ return -ENOMEM;
+
+ moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
+ need_rmap_locks);
+ if (moved_len < old_len) {
+ err = -ENOMEM;
+ } else if (vma->vm_ops && vma->vm_ops->mremap) {
+ err = vma->vm_ops->mremap(new_vma);
+ }
+
+ if (unlikely(err)) {
+ /*
+ * On error, move entries back from new area to old,
+ * which will succeed since page tables still there,
+ * and then proceed to unmap new area instead of old.
+ */
+ move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
+ true);
+ vma = new_vma;
+ old_len = new_len;
+ old_addr = new_addr;
+ new_addr = err;
+ } else {
+ mremap_userfaultfd_prep(new_vma, uf);
+ arch_remap(mm, old_addr, old_addr + old_len,
+ new_addr, new_addr + new_len);
+ }
+
+ /* Conceal VM_ACCOUNT so old reservation is not undone */
+ if (vm_flags & VM_ACCOUNT) {
+ vma->vm_flags &= ~VM_ACCOUNT;
+ excess = vma->vm_end - vma->vm_start - old_len;
+ if (old_addr > vma->vm_start &&
+ old_addr + old_len < vma->vm_end)
+ split = 1;
+ }
+
+ /*
+ * If we failed to move page tables we still do total_vm increment
+ * since do_munmap() will decrement it by old_len == new_len.
+ *
+ * Since total_vm is about to be raised artificially high for a
+ * moment, we need to restore high watermark afterwards: if stats
+ * are taken meanwhile, total_vm and hiwater_vm appear too high.
+ * If this were a serious issue, we'd add a flag to do_munmap().
+ */
+ hiwater_vm = mm->hiwater_vm;
+ vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT);
+
+ /* Tell pfnmap has moved from this vma */
+ if (unlikely(vma->vm_flags & VM_PFNMAP))
+ untrack_pfn_moved(vma);
+
+ if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
+ if (vm_flags & VM_ACCOUNT) {
+ /* Always put back VM_ACCOUNT since we won't unmap */
+ vma->vm_flags |= VM_ACCOUNT;
+
+ vm_acct_memory(new_len >> PAGE_SHIFT);
+ }
+
+ /*
+ * VMAs can actually be merged back together in copy_vma
+ * calling merge_vma. This can happen with anonymous vmas
+ * which have not yet been faulted, so if we were to consider
+ * this VMA split we'll end up adding VM_ACCOUNT on the
+ * next VMA, which is completely unrelated if this VMA
+ * was re-merged.
+ */
+ if (split && new_vma == vma)
+ split = 0;
+
+ /* We always clear VM_LOCKED[ONFAULT] on the old vma */
+ vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+
+ /* Because we won't unmap we don't need to touch locked_vm */
+ goto out;
+ }
+
+ if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
+ /* OOM: unable to split vma, just get accounts right */
+ vm_unacct_memory(excess >> PAGE_SHIFT);
+ excess = 0;
+ }
+
+ if (vm_flags & VM_LOCKED) {
+ mm->locked_vm += new_len >> PAGE_SHIFT;
+ *locked = true;
+ }
+out:
+ mm->hiwater_vm = hiwater_vm;
+
+ /* Restore VM_ACCOUNT if one or two pieces of vma left */
+ if (excess) {
+ vma->vm_flags |= VM_ACCOUNT;
+ if (split)
+ vma->vm_next->vm_flags |= VM_ACCOUNT;
+ }
+
+ return new_addr;
+}
+
+static struct vm_area_struct *vma_to_resize(unsigned long addr,
+ unsigned long old_len, unsigned long new_len, unsigned long flags,
+ unsigned long *p)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma = find_vma(mm, addr);
+ unsigned long pgoff;
+
+ if (!vma || vma->vm_start > addr)
+ return ERR_PTR(-EFAULT);
+
+ /*
+ * !old_len is a special case where an attempt is made to 'duplicate'
+ * a mapping. This makes no sense for private mappings as it will
+ * instead create a fresh/new mapping unrelated to the original. This
+ * is contrary to the basic idea of mremap which creates new mappings
+ * based on the original. There are no known use cases for this
+ * behavior. As a result, fail such attempts.
+ */
+ if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) {
+ pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n", current->comm, current->pid);
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (flags & MREMAP_DONTUNMAP && (!vma_is_anonymous(vma) ||
+ vma->vm_flags & VM_SHARED))
+ return ERR_PTR(-EINVAL);
+
+ if (is_vm_hugetlb_page(vma))
+ return ERR_PTR(-EINVAL);
+
+ /* We can't remap across vm area boundaries */
+ if (old_len > vma->vm_end - addr)
+ return ERR_PTR(-EFAULT);
+
+ if (new_len == old_len)
+ return vma;
+
+ /* Need to be careful about a growing mapping */
+ pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
+ pgoff += vma->vm_pgoff;
+ if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
+ return ERR_PTR(-EINVAL);
+
+ if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
+ return ERR_PTR(-EFAULT);
+
+ if (vma->vm_flags & VM_LOCKED) {
+ unsigned long locked, lock_limit;
+ locked = mm->locked_vm << PAGE_SHIFT;
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
+ locked += new_len - old_len;
+ if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+ return ERR_PTR(-EAGAIN);
+ }
+
+ if (!may_expand_vm(mm, vma->vm_flags,
+ (new_len - old_len) >> PAGE_SHIFT))
+ return ERR_PTR(-ENOMEM);
+
+ if (vma->vm_flags & VM_ACCOUNT) {
+ unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
+ if (security_vm_enough_memory_mm(mm, charged))
+ return ERR_PTR(-ENOMEM);
+ *p = charged;
+ }
+
+ return vma;
+}
+
+static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
+ unsigned long new_addr, unsigned long new_len, bool *locked,
+ unsigned long flags, struct vm_userfaultfd_ctx *uf,
+ struct list_head *uf_unmap_early,
+ struct list_head *uf_unmap)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long ret = -EINVAL;
+ unsigned long charged = 0;
+ unsigned long map_flags = 0;
+
+ if (offset_in_page(new_addr))
+ goto out;
+
+ if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
+ goto out;
+
+ /* Ensure the old/new locations do not overlap */
+ if (addr + old_len > new_addr && new_addr + new_len > addr)
+ goto out;
+
+ /*
+ * move_vma() need us to stay 4 maps below the threshold, otherwise
+ * it will bail out at the very beginning.
+ * That is a problem if we have already unmaped the regions here
+ * (new_addr, and old_addr), because userspace will not know the
+ * state of the vma's after it gets -ENOMEM.
+ * So, to avoid such scenario we can pre-compute if the whole
+ * operation has high chances to success map-wise.
+ * Worst-scenario case is when both vma's (new_addr and old_addr) get
+ * split in 3 before unmaping it.
+ * That means 2 more maps (1 for each) to the ones we already hold.
+ * Check whether current map count plus 2 still leads us to 4 maps below
+ * the threshold, otherwise return -ENOMEM here to be more safe.
+ */
+ if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
+ return -ENOMEM;
+
+ if (flags & MREMAP_FIXED) {
+ ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
+ if (ret)
+ goto out;
+ }
+
+ if (old_len >= new_len) {
+ ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
+ if (ret && old_len != new_len)
+ goto out;
+ old_len = new_len;
+ }
+
+ vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
+ goto out;
+ }
+
+ /* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */
+ if (flags & MREMAP_DONTUNMAP &&
+ !may_expand_vm(mm, vma->vm_flags, old_len >> PAGE_SHIFT)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (flags & MREMAP_FIXED)
+ map_flags |= MAP_FIXED;
+
+ if (vma->vm_flags & VM_MAYSHARE)
+ map_flags |= MAP_SHARED;
+
+ ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
+ ((addr - vma->vm_start) >> PAGE_SHIFT),
+ map_flags);
+ if (IS_ERR_VALUE(ret))
+ goto out1;
+
+ /* We got a new mapping */
+ if (!(flags & MREMAP_FIXED))
+ new_addr = ret;
+
+ ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
+ uf_unmap);
+
+ if (!(offset_in_page(ret)))
+ goto out;
+
+out1:
+ vm_unacct_memory(charged);
+
+out:
+ return ret;
+}
+
+static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
+{
+ unsigned long end = vma->vm_end + delta;
+ if (end < vma->vm_end) /* overflow */
+ return 0;
+ if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */
+ return 0;
+ if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
+ 0, MAP_FIXED) & ~PAGE_MASK)
+ return 0;
+ return 1;
+}
+
+/*
+ * Expand (or shrink) an existing mapping, potentially moving it at the
+ * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
+ *
+ * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
+ * This option implies MREMAP_MAYMOVE.
+ */
+SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
+ unsigned long, new_len, unsigned long, flags,
+ unsigned long, new_addr)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long ret = -EINVAL;
+ unsigned long charged = 0;
+ bool locked = false;
+ bool downgraded = false;
+ struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
+ LIST_HEAD(uf_unmap_early);
+ LIST_HEAD(uf_unmap);
+
+ /*
+ * There is a deliberate asymmetry here: we strip the pointer tag
+ * from the old address but leave the new address alone. This is
+ * for consistency with mmap(), where we prevent the creation of
+ * aliasing mappings in userspace by leaving the tag bits of the
+ * mapping address intact. A non-zero tag will cause the subsequent
+ * range checks to reject the address as invalid.
+ *
+ * See Documentation/arm64/tagged-address-abi.rst for more information.
+ */
+ addr = untagged_addr(addr);
+
+ if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
+ return ret;
+
+ if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
+ return ret;
+
+ /*
+ * MREMAP_DONTUNMAP is always a move and it does not allow resizing
+ * in the process.
+ */
+ if (flags & MREMAP_DONTUNMAP &&
+ (!(flags & MREMAP_MAYMOVE) || old_len != new_len))
+ return ret;
+
+
+ if (offset_in_page(addr))
+ return ret;
+
+ old_len = PAGE_ALIGN(old_len);
+ new_len = PAGE_ALIGN(new_len);
+
+ /*
+ * We allow a zero old-len as a special case
+ * for DOS-emu "duplicate shm area" thing. But
+ * a zero new-len is nonsensical.
+ */
+ if (!new_len)
+ return ret;
+
+ if (mmap_write_lock_killable(current->mm))
+ return -EINTR;
+
+ if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {
+ ret = mremap_to(addr, old_len, new_addr, new_len,
+ &locked, flags, &uf, &uf_unmap_early,
+ &uf_unmap);
+ goto out;
+ }
+
+ /*
+ * Always allow a shrinking remap: that just unmaps
+ * the unnecessary pages..
+ * __do_munmap does all the needed commit accounting, and
+ * downgrades mmap_lock to read if so directed.
+ */
+ if (old_len >= new_len) {
+ int retval;
+
+ retval = __do_munmap(mm, addr+new_len, old_len - new_len,
+ &uf_unmap, true);
+ if (retval < 0 && old_len != new_len) {
+ ret = retval;
+ goto out;
+ /* Returning 1 indicates mmap_lock is downgraded to read. */
+ } else if (retval == 1)
+ downgraded = true;
+ ret = addr;
+ goto out;
+ }
+
+ /*
+ * Ok, we need to grow..
+ */
+ vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
+ goto out;
+ }
+
+ /* old_len exactly to the end of the area..
+ */
+ if (old_len == vma->vm_end - addr) {
+ /* can we just expand the current mapping? */
+ if (vma_expandable(vma, new_len - old_len)) {
+ int pages = (new_len - old_len) >> PAGE_SHIFT;
+
+ if (vma_adjust(vma, vma->vm_start, addr + new_len,
+ vma->vm_pgoff, NULL)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ vm_stat_account(mm, vma->vm_flags, pages);
+ if (vma->vm_flags & VM_LOCKED) {
+ mm->locked_vm += pages;
+ locked = true;
+ new_addr = addr;
+ }
+ ret = addr;
+ goto out;
+ }
+ }
+
+ /*
+ * We weren't able to just expand or shrink the area,
+ * we need to create a new one and move it..
+ */
+ ret = -ENOMEM;
+ if (flags & MREMAP_MAYMOVE) {
+ unsigned long map_flags = 0;
+ if (vma->vm_flags & VM_MAYSHARE)
+ map_flags |= MAP_SHARED;
+
+ new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
+ vma->vm_pgoff +
+ ((addr - vma->vm_start) >> PAGE_SHIFT),
+ map_flags);
+ if (IS_ERR_VALUE(new_addr)) {
+ ret = new_addr;
+ goto out;
+ }
+
+ ret = move_vma(vma, addr, old_len, new_len, new_addr,
+ &locked, flags, &uf, &uf_unmap);
+ }
+out:
+ if (offset_in_page(ret)) {
+ vm_unacct_memory(charged);
+ locked = false;
+ }
+ if (downgraded)
+ mmap_read_unlock(current->mm);
+ else
+ mmap_write_unlock(current->mm);
+ if (locked && new_len > old_len)
+ mm_populate(new_addr + old_len, new_len - old_len);
+ userfaultfd_unmap_complete(mm, &uf_unmap_early);
+ mremap_userfaultfd_complete(&uf, addr, ret, old_len);
+ userfaultfd_unmap_complete(mm, &uf_unmap);
+ return ret;
+}
diff --git a/mm/msync.c b/mm/msync.c
new file mode 100644
index 000000000..69c6d2029
--- /dev/null
+++ b/mm/msync.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/mm/msync.c
+ *
+ * Copyright (C) 1994-1999 Linus Torvalds
+ */
+
+/*
+ * The msync() system call.
+ */
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/file.h>
+#include <linux/syscalls.h>
+#include <linux/sched.h>
+
+/*
+ * MS_SYNC syncs the entire file - including mappings.
+ *
+ * MS_ASYNC does not start I/O (it used to, up to 2.5.67).
+ * Nor does it marks the relevant pages dirty (it used to up to 2.6.17).
+ * Now it doesn't do anything, since dirty pages are properly tracked.
+ *
+ * The application may now run fsync() to
+ * write out the dirty pages and wait on the writeout and check the result.
+ * Or the application may run fadvise(FADV_DONTNEED) against the fd to start
+ * async writeout immediately.
+ * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to
+ * applications.
+ */
+SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
+{
+ unsigned long end;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ int unmapped_error = 0;
+ int error = -EINVAL;
+
+ start = untagged_addr(start);
+
+ if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
+ goto out;
+ if (offset_in_page(start))
+ goto out;
+ if ((flags & MS_ASYNC) && (flags & MS_SYNC))
+ goto out;
+ error = -ENOMEM;
+ len = (len + ~PAGE_MASK) & PAGE_MASK;
+ end = start + len;
+ if (end < start)
+ goto out;
+ error = 0;
+ if (end == start)
+ goto out;
+ /*
+ * If the interval [start,end) covers some unmapped address ranges,
+ * just ignore them, but return -ENOMEM at the end.
+ */
+ mmap_read_lock(mm);
+ vma = find_vma(mm, start);
+ for (;;) {
+ struct file *file;
+ loff_t fstart, fend;
+
+ /* Still start < end. */
+ error = -ENOMEM;
+ if (!vma)
+ goto out_unlock;
+ /* Here start < vma->vm_end. */
+ if (start < vma->vm_start) {
+ start = vma->vm_start;
+ if (start >= end)
+ goto out_unlock;
+ unmapped_error = -ENOMEM;
+ }
+ /* Here vma->vm_start <= start < vma->vm_end. */
+ if ((flags & MS_INVALIDATE) &&
+ (vma->vm_flags & VM_LOCKED)) {
+ error = -EBUSY;
+ goto out_unlock;
+ }
+ file = vma->vm_file;
+ fstart = (start - vma->vm_start) +
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+ fend = fstart + (min(end, vma->vm_end) - start) - 1;
+ start = vma->vm_end;
+ if ((flags & MS_SYNC) && file &&
+ (vma->vm_flags & VM_SHARED)) {
+ get_file(file);
+ mmap_read_unlock(mm);
+ error = vfs_fsync_range(file, fstart, fend, 1);
+ fput(file);
+ if (error || start >= end)
+ goto out;
+ mmap_read_lock(mm);
+ vma = find_vma(mm, start);
+ } else {
+ if (start >= end) {
+ error = 0;
+ goto out_unlock;
+ }
+ vma = vma->vm_next;
+ }
+ }
+out_unlock:
+ mmap_read_unlock(mm);
+out:
+ return error ? : unmapped_error;
+}
diff --git a/mm/nommu.c b/mm/nommu.c
new file mode 100644
index 000000000..0faf39b32
--- /dev/null
+++ b/mm/nommu.c
@@ -0,0 +1,1853 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/mm/nommu.c
+ *
+ * Replacement code for mm functions to support CPU's that don't
+ * have any form of memory management unit (thus no virtual memory).
+ *
+ * See Documentation/admin-guide/mm/nommu-mmap.rst
+ *
+ * Copyright (c) 2004-2008 David Howells <dhowells@redhat.com>
+ * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
+ * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
+ * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com>
+ * Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/sched/mm.h>
+#include <linux/vmacache.h>
+#include <linux/mman.h>
+#include <linux/swap.h>
+#include <linux/file.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
+#include <linux/compiler.h>
+#include <linux/mount.h>
+#include <linux/personality.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/audit.h>
+#include <linux/printk.h>
+
+#include <linux/uaccess.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include "internal.h"
+
+void *high_memory;
+EXPORT_SYMBOL(high_memory);
+struct page *mem_map;
+unsigned long max_mapnr;
+EXPORT_SYMBOL(max_mapnr);
+unsigned long highest_memmap_pfn;
+int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
+int heap_stack_gap = 0;
+
+atomic_long_t mmap_pages_allocated;
+
+EXPORT_SYMBOL(mem_map);
+
+/* list of mapped, potentially shareable regions */
+static struct kmem_cache *vm_region_jar;
+struct rb_root nommu_region_tree = RB_ROOT;
+DECLARE_RWSEM(nommu_region_sem);
+
+const struct vm_operations_struct generic_file_vm_ops = {
+};
+
+/*
+ * Return the total memory allocated for this pointer, not
+ * just what the caller asked for.
+ *
+ * Doesn't have to be accurate, i.e. may have races.
+ */
+unsigned int kobjsize(const void *objp)
+{
+ struct page *page;
+
+ /*
+ * If the object we have should not have ksize performed on it,
+ * return size of 0
+ */
+ if (!objp || !virt_addr_valid(objp))
+ return 0;
+
+ page = virt_to_head_page(objp);
+
+ /*
+ * If the allocator sets PageSlab, we know the pointer came from
+ * kmalloc().
+ */
+ if (PageSlab(page))
+ return ksize(objp);
+
+ /*
+ * If it's not a compound page, see if we have a matching VMA
+ * region. This test is intentionally done in reverse order,
+ * so if there's no VMA, we still fall through and hand back
+ * PAGE_SIZE for 0-order pages.
+ */
+ if (!PageCompound(page)) {
+ struct vm_area_struct *vma;
+
+ vma = find_vma(current->mm, (unsigned long)objp);
+ if (vma)
+ return vma->vm_end - vma->vm_start;
+ }
+
+ /*
+ * The ksize() function is only guaranteed to work for pointers
+ * returned by kmalloc(). So handle arbitrary pointers here.
+ */
+ return page_size(page);
+}
+
+/**
+ * follow_pfn - look up PFN at a user virtual address
+ * @vma: memory mapping
+ * @address: user virtual address
+ * @pfn: location to store found PFN
+ *
+ * Only IO mappings and raw PFN mappings are allowed.
+ *
+ * Returns zero and the pfn at @pfn on success, -ve otherwise.
+ */
+int follow_pfn(struct vm_area_struct *vma, unsigned long address,
+ unsigned long *pfn)
+{
+ if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+ return -EINVAL;
+
+ *pfn = address >> PAGE_SHIFT;
+ return 0;
+}
+EXPORT_SYMBOL(follow_pfn);
+
+LIST_HEAD(vmap_area_list);
+
+void vfree(const void *addr)
+{
+ kfree(addr);
+}
+EXPORT_SYMBOL(vfree);
+
+void *__vmalloc(unsigned long size, gfp_t gfp_mask)
+{
+ /*
+ * You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc()
+ * returns only a logical address.
+ */
+ return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM);
+}
+EXPORT_SYMBOL(__vmalloc);
+
+void *__vmalloc_node_range(unsigned long size, unsigned long align,
+ unsigned long start, unsigned long end, gfp_t gfp_mask,
+ pgprot_t prot, unsigned long vm_flags, int node,
+ const void *caller)
+{
+ return __vmalloc(size, gfp_mask);
+}
+
+void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
+ int node, const void *caller)
+{
+ return __vmalloc(size, gfp_mask);
+}
+
+static void *__vmalloc_user_flags(unsigned long size, gfp_t flags)
+{
+ void *ret;
+
+ ret = __vmalloc(size, flags);
+ if (ret) {
+ struct vm_area_struct *vma;
+
+ mmap_write_lock(current->mm);
+ vma = find_vma(current->mm, (unsigned long)ret);
+ if (vma)
+ vma->vm_flags |= VM_USERMAP;
+ mmap_write_unlock(current->mm);
+ }
+
+ return ret;
+}
+
+void *vmalloc_user(unsigned long size)
+{
+ return __vmalloc_user_flags(size, GFP_KERNEL | __GFP_ZERO);
+}
+EXPORT_SYMBOL(vmalloc_user);
+
+struct page *vmalloc_to_page(const void *addr)
+{
+ return virt_to_page(addr);
+}
+EXPORT_SYMBOL(vmalloc_to_page);
+
+unsigned long vmalloc_to_pfn(const void *addr)
+{
+ return page_to_pfn(virt_to_page(addr));
+}
+EXPORT_SYMBOL(vmalloc_to_pfn);
+
+long vread(char *buf, char *addr, unsigned long count)
+{
+ /* Don't allow overflow */
+ if ((unsigned long) buf + count < count)
+ count = -(unsigned long) buf;
+
+ memcpy(buf, addr, count);
+ return count;
+}
+
+long vwrite(char *buf, char *addr, unsigned long count)
+{
+ /* Don't allow overflow */
+ if ((unsigned long) addr + count < count)
+ count = -(unsigned long) addr;
+
+ memcpy(addr, buf, count);
+ return count;
+}
+
+/*
+ * vmalloc - allocate virtually contiguous memory
+ *
+ * @size: allocation size
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ */
+void *vmalloc(unsigned long size)
+{
+ return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM);
+}
+EXPORT_SYMBOL(vmalloc);
+
+/*
+ * vzalloc - allocate virtually contiguous memory with zero fill
+ *
+ * @size: allocation size
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ * The memory allocated is set to zero.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ */
+void *vzalloc(unsigned long size)
+{
+ return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
+}
+EXPORT_SYMBOL(vzalloc);
+
+/**
+ * vmalloc_node - allocate memory on a specific node
+ * @size: allocation size
+ * @node: numa node
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ */
+void *vmalloc_node(unsigned long size, int node)
+{
+ return vmalloc(size);
+}
+EXPORT_SYMBOL(vmalloc_node);
+
+/**
+ * vzalloc_node - allocate memory on a specific node with zero fill
+ * @size: allocation size
+ * @node: numa node
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ * The memory allocated is set to zero.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ */
+void *vzalloc_node(unsigned long size, int node)
+{
+ return vzalloc(size);
+}
+EXPORT_SYMBOL(vzalloc_node);
+
+/**
+ * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
+ * @size: allocation size
+ *
+ * Allocate enough 32bit PA addressable pages to cover @size from the
+ * page level allocator and map them into contiguous kernel virtual space.
+ */
+void *vmalloc_32(unsigned long size)
+{
+ return __vmalloc(size, GFP_KERNEL);
+}
+EXPORT_SYMBOL(vmalloc_32);
+
+/**
+ * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
+ * @size: allocation size
+ *
+ * The resulting memory area is 32bit addressable and zeroed so it can be
+ * mapped to userspace without leaking data.
+ *
+ * VM_USERMAP is set on the corresponding VMA so that subsequent calls to
+ * remap_vmalloc_range() are permissible.
+ */
+void *vmalloc_32_user(unsigned long size)
+{
+ /*
+ * We'll have to sort out the ZONE_DMA bits for 64-bit,
+ * but for now this can simply use vmalloc_user() directly.
+ */
+ return vmalloc_user(size);
+}
+EXPORT_SYMBOL(vmalloc_32_user);
+
+void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot)
+{
+ BUG();
+ return NULL;
+}
+EXPORT_SYMBOL(vmap);
+
+void vunmap(const void *addr)
+{
+ BUG();
+}
+EXPORT_SYMBOL(vunmap);
+
+void *vm_map_ram(struct page **pages, unsigned int count, int node)
+{
+ BUG();
+ return NULL;
+}
+EXPORT_SYMBOL(vm_map_ram);
+
+void vm_unmap_ram(const void *mem, unsigned int count)
+{
+ BUG();
+}
+EXPORT_SYMBOL(vm_unmap_ram);
+
+void vm_unmap_aliases(void)
+{
+}
+EXPORT_SYMBOL_GPL(vm_unmap_aliases);
+
+void free_vm_area(struct vm_struct *area)
+{
+ BUG();
+}
+EXPORT_SYMBOL_GPL(free_vm_area);
+
+int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
+ struct page *page)
+{
+ return -EINVAL;
+}
+EXPORT_SYMBOL(vm_insert_page);
+
+int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
+ unsigned long num)
+{
+ return -EINVAL;
+}
+EXPORT_SYMBOL(vm_map_pages);
+
+int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
+ unsigned long num)
+{
+ return -EINVAL;
+}
+EXPORT_SYMBOL(vm_map_pages_zero);
+
+/*
+ * sys_brk() for the most part doesn't need the global kernel
+ * lock, except when an application is doing something nasty
+ * like trying to un-brk an area that has already been mapped
+ * to a regular file. in this case, the unmapping will need
+ * to invoke file system routines that need the global lock.
+ */
+SYSCALL_DEFINE1(brk, unsigned long, brk)
+{
+ struct mm_struct *mm = current->mm;
+
+ if (brk < mm->start_brk || brk > mm->context.end_brk)
+ return mm->brk;
+
+ if (mm->brk == brk)
+ return mm->brk;
+
+ /*
+ * Always allow shrinking brk
+ */
+ if (brk <= mm->brk) {
+ mm->brk = brk;
+ return brk;
+ }
+
+ /*
+ * Ok, looks good - let it rip.
+ */
+ flush_icache_user_range(mm->brk, brk);
+ return mm->brk = brk;
+}
+
+/*
+ * initialise the percpu counter for VM and region record slabs
+ */
+void __init mmap_init(void)
+{
+ int ret;
+
+ ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
+ VM_BUG_ON(ret);
+ vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT);
+}
+
+/*
+ * validate the region tree
+ * - the caller must hold the region lock
+ */
+#ifdef CONFIG_DEBUG_NOMMU_REGIONS
+static noinline void validate_nommu_regions(void)
+{
+ struct vm_region *region, *last;
+ struct rb_node *p, *lastp;
+
+ lastp = rb_first(&nommu_region_tree);
+ if (!lastp)
+ return;
+
+ last = rb_entry(lastp, struct vm_region, vm_rb);
+ BUG_ON(last->vm_end <= last->vm_start);
+ BUG_ON(last->vm_top < last->vm_end);
+
+ while ((p = rb_next(lastp))) {
+ region = rb_entry(p, struct vm_region, vm_rb);
+ last = rb_entry(lastp, struct vm_region, vm_rb);
+
+ BUG_ON(region->vm_end <= region->vm_start);
+ BUG_ON(region->vm_top < region->vm_end);
+ BUG_ON(region->vm_start < last->vm_top);
+
+ lastp = p;
+ }
+}
+#else
+static void validate_nommu_regions(void)
+{
+}
+#endif
+
+/*
+ * add a region into the global tree
+ */
+static void add_nommu_region(struct vm_region *region)
+{
+ struct vm_region *pregion;
+ struct rb_node **p, *parent;
+
+ validate_nommu_regions();
+
+ parent = NULL;
+ p = &nommu_region_tree.rb_node;
+ while (*p) {
+ parent = *p;
+ pregion = rb_entry(parent, struct vm_region, vm_rb);
+ if (region->vm_start < pregion->vm_start)
+ p = &(*p)->rb_left;
+ else if (region->vm_start > pregion->vm_start)
+ p = &(*p)->rb_right;
+ else if (pregion == region)
+ return;
+ else
+ BUG();
+ }
+
+ rb_link_node(&region->vm_rb, parent, p);
+ rb_insert_color(&region->vm_rb, &nommu_region_tree);
+
+ validate_nommu_regions();
+}
+
+/*
+ * delete a region from the global tree
+ */
+static void delete_nommu_region(struct vm_region *region)
+{
+ BUG_ON(!nommu_region_tree.rb_node);
+
+ validate_nommu_regions();
+ rb_erase(&region->vm_rb, &nommu_region_tree);
+ validate_nommu_regions();
+}
+
+/*
+ * free a contiguous series of pages
+ */
+static void free_page_series(unsigned long from, unsigned long to)
+{
+ for (; from < to; from += PAGE_SIZE) {
+ struct page *page = virt_to_page(from);
+
+ atomic_long_dec(&mmap_pages_allocated);
+ put_page(page);
+ }
+}
+
+/*
+ * release a reference to a region
+ * - the caller must hold the region semaphore for writing, which this releases
+ * - the region may not have been added to the tree yet, in which case vm_top
+ * will equal vm_start
+ */
+static void __put_nommu_region(struct vm_region *region)
+ __releases(nommu_region_sem)
+{
+ BUG_ON(!nommu_region_tree.rb_node);
+
+ if (--region->vm_usage == 0) {
+ if (region->vm_top > region->vm_start)
+ delete_nommu_region(region);
+ up_write(&nommu_region_sem);
+
+ if (region->vm_file)
+ fput(region->vm_file);
+
+ /* IO memory and memory shared directly out of the pagecache
+ * from ramfs/tmpfs mustn't be released here */
+ if (region->vm_flags & VM_MAPPED_COPY)
+ free_page_series(region->vm_start, region->vm_top);
+ kmem_cache_free(vm_region_jar, region);
+ } else {
+ up_write(&nommu_region_sem);
+ }
+}
+
+/*
+ * release a reference to a region
+ */
+static void put_nommu_region(struct vm_region *region)
+{
+ down_write(&nommu_region_sem);
+ __put_nommu_region(region);
+}
+
+/*
+ * add a VMA into a process's mm_struct in the appropriate place in the list
+ * and tree and add to the address space's page tree also if not an anonymous
+ * page
+ * - should be called with mm->mmap_lock held writelocked
+ */
+static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+ struct vm_area_struct *pvma, *prev;
+ struct address_space *mapping;
+ struct rb_node **p, *parent, *rb_prev;
+
+ BUG_ON(!vma->vm_region);
+
+ mm->map_count++;
+ vma->vm_mm = mm;
+
+ /* add the VMA to the mapping */
+ if (vma->vm_file) {
+ mapping = vma->vm_file->f_mapping;
+
+ i_mmap_lock_write(mapping);
+ flush_dcache_mmap_lock(mapping);
+ vma_interval_tree_insert(vma, &mapping->i_mmap);
+ flush_dcache_mmap_unlock(mapping);
+ i_mmap_unlock_write(mapping);
+ }
+
+ /* add the VMA to the tree */
+ parent = rb_prev = NULL;
+ p = &mm->mm_rb.rb_node;
+ while (*p) {
+ parent = *p;
+ pvma = rb_entry(parent, struct vm_area_struct, vm_rb);
+
+ /* sort by: start addr, end addr, VMA struct addr in that order
+ * (the latter is necessary as we may get identical VMAs) */
+ if (vma->vm_start < pvma->vm_start)
+ p = &(*p)->rb_left;
+ else if (vma->vm_start > pvma->vm_start) {
+ rb_prev = parent;
+ p = &(*p)->rb_right;
+ } else if (vma->vm_end < pvma->vm_end)
+ p = &(*p)->rb_left;
+ else if (vma->vm_end > pvma->vm_end) {
+ rb_prev = parent;
+ p = &(*p)->rb_right;
+ } else if (vma < pvma)
+ p = &(*p)->rb_left;
+ else if (vma > pvma) {
+ rb_prev = parent;
+ p = &(*p)->rb_right;
+ } else
+ BUG();
+ }
+
+ rb_link_node(&vma->vm_rb, parent, p);
+ rb_insert_color(&vma->vm_rb, &mm->mm_rb);
+
+ /* add VMA to the VMA list also */
+ prev = NULL;
+ if (rb_prev)
+ prev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
+
+ __vma_link_list(mm, vma, prev);
+}
+
+/*
+ * delete a VMA from its owning mm_struct and address space
+ */
+static void delete_vma_from_mm(struct vm_area_struct *vma)
+{
+ int i;
+ struct address_space *mapping;
+ struct mm_struct *mm = vma->vm_mm;
+ struct task_struct *curr = current;
+
+ mm->map_count--;
+ for (i = 0; i < VMACACHE_SIZE; i++) {
+ /* if the vma is cached, invalidate the entire cache */
+ if (curr->vmacache.vmas[i] == vma) {
+ vmacache_invalidate(mm);
+ break;
+ }
+ }
+
+ /* remove the VMA from the mapping */
+ if (vma->vm_file) {
+ mapping = vma->vm_file->f_mapping;
+
+ i_mmap_lock_write(mapping);
+ flush_dcache_mmap_lock(mapping);
+ vma_interval_tree_remove(vma, &mapping->i_mmap);
+ flush_dcache_mmap_unlock(mapping);
+ i_mmap_unlock_write(mapping);
+ }
+
+ /* remove from the MM's tree and list */
+ rb_erase(&vma->vm_rb, &mm->mm_rb);
+
+ __vma_unlink_list(mm, vma);
+}
+
+/*
+ * destroy a VMA record
+ */
+static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+ if (vma->vm_ops && vma->vm_ops->close)
+ vma->vm_ops->close(vma);
+ if (vma->vm_file)
+ fput(vma->vm_file);
+ put_nommu_region(vma->vm_region);
+ vm_area_free(vma);
+}
+
+/*
+ * look up the first VMA in which addr resides, NULL if none
+ * - should be called with mm->mmap_lock at least held readlocked
+ */
+struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+{
+ struct vm_area_struct *vma;
+
+ /* check the cache first */
+ vma = vmacache_find(mm, addr);
+ if (likely(vma))
+ return vma;
+
+ /* trawl the list (there may be multiple mappings in which addr
+ * resides) */
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (vma->vm_start > addr)
+ return NULL;
+ if (vma->vm_end > addr) {
+ vmacache_update(addr, vma);
+ return vma;
+ }
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL(find_vma);
+
+/*
+ * find a VMA
+ * - we don't extend stack VMAs under NOMMU conditions
+ */
+struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
+{
+ return find_vma(mm, addr);
+}
+
+/*
+ * expand a stack to a given address
+ * - not supported under NOMMU conditions
+ */
+int expand_stack(struct vm_area_struct *vma, unsigned long address)
+{
+ return -ENOMEM;
+}
+
+/*
+ * look up the first VMA exactly that exactly matches addr
+ * - should be called with mm->mmap_lock at least held readlocked
+ */
+static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
+ unsigned long addr,
+ unsigned long len)
+{
+ struct vm_area_struct *vma;
+ unsigned long end = addr + len;
+
+ /* check the cache first */
+ vma = vmacache_find_exact(mm, addr, end);
+ if (vma)
+ return vma;
+
+ /* trawl the list (there may be multiple mappings in which addr
+ * resides) */
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (vma->vm_start < addr)
+ continue;
+ if (vma->vm_start > addr)
+ return NULL;
+ if (vma->vm_end == end) {
+ vmacache_update(addr, vma);
+ return vma;
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * determine whether a mapping should be permitted and, if so, what sort of
+ * mapping we're capable of supporting
+ */
+static int validate_mmap_request(struct file *file,
+ unsigned long addr,
+ unsigned long len,
+ unsigned long prot,
+ unsigned long flags,
+ unsigned long pgoff,
+ unsigned long *_capabilities)
+{
+ unsigned long capabilities, rlen;
+ int ret;
+
+ /* do the simple checks first */
+ if (flags & MAP_FIXED)
+ return -EINVAL;
+
+ if ((flags & MAP_TYPE) != MAP_PRIVATE &&
+ (flags & MAP_TYPE) != MAP_SHARED)
+ return -EINVAL;
+
+ if (!len)
+ return -EINVAL;
+
+ /* Careful about overflows.. */
+ rlen = PAGE_ALIGN(len);
+ if (!rlen || rlen > TASK_SIZE)
+ return -ENOMEM;
+
+ /* offset overflow? */
+ if ((pgoff + (rlen >> PAGE_SHIFT)) < pgoff)
+ return -EOVERFLOW;
+
+ if (file) {
+ /* files must support mmap */
+ if (!file->f_op->mmap)
+ return -ENODEV;
+
+ /* work out if what we've got could possibly be shared
+ * - we support chardevs that provide their own "memory"
+ * - we support files/blockdevs that are memory backed
+ */
+ if (file->f_op->mmap_capabilities) {
+ capabilities = file->f_op->mmap_capabilities(file);
+ } else {
+ /* no explicit capabilities set, so assume some
+ * defaults */
+ switch (file_inode(file)->i_mode & S_IFMT) {
+ case S_IFREG:
+ case S_IFBLK:
+ capabilities = NOMMU_MAP_COPY;
+ break;
+
+ case S_IFCHR:
+ capabilities =
+ NOMMU_MAP_DIRECT |
+ NOMMU_MAP_READ |
+ NOMMU_MAP_WRITE;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+ }
+
+ /* eliminate any capabilities that we can't support on this
+ * device */
+ if (!file->f_op->get_unmapped_area)
+ capabilities &= ~NOMMU_MAP_DIRECT;
+ if (!(file->f_mode & FMODE_CAN_READ))
+ capabilities &= ~NOMMU_MAP_COPY;
+
+ /* The file shall have been opened with read permission. */
+ if (!(file->f_mode & FMODE_READ))
+ return -EACCES;
+
+ if (flags & MAP_SHARED) {
+ /* do checks for writing, appending and locking */
+ if ((prot & PROT_WRITE) &&
+ !(file->f_mode & FMODE_WRITE))
+ return -EACCES;
+
+ if (IS_APPEND(file_inode(file)) &&
+ (file->f_mode & FMODE_WRITE))
+ return -EACCES;
+
+ if (locks_verify_locked(file))
+ return -EAGAIN;
+
+ if (!(capabilities & NOMMU_MAP_DIRECT))
+ return -ENODEV;
+
+ /* we mustn't privatise shared mappings */
+ capabilities &= ~NOMMU_MAP_COPY;
+ } else {
+ /* we're going to read the file into private memory we
+ * allocate */
+ if (!(capabilities & NOMMU_MAP_COPY))
+ return -ENODEV;
+
+ /* we don't permit a private writable mapping to be
+ * shared with the backing device */
+ if (prot & PROT_WRITE)
+ capabilities &= ~NOMMU_MAP_DIRECT;
+ }
+
+ if (capabilities & NOMMU_MAP_DIRECT) {
+ if (((prot & PROT_READ) && !(capabilities & NOMMU_MAP_READ)) ||
+ ((prot & PROT_WRITE) && !(capabilities & NOMMU_MAP_WRITE)) ||
+ ((prot & PROT_EXEC) && !(capabilities & NOMMU_MAP_EXEC))
+ ) {
+ capabilities &= ~NOMMU_MAP_DIRECT;
+ if (flags & MAP_SHARED) {
+ pr_warn("MAP_SHARED not completely supported on !MMU\n");
+ return -EINVAL;
+ }
+ }
+ }
+
+ /* handle executable mappings and implied executable
+ * mappings */
+ if (path_noexec(&file->f_path)) {
+ if (prot & PROT_EXEC)
+ return -EPERM;
+ } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
+ /* handle implication of PROT_EXEC by PROT_READ */
+ if (current->personality & READ_IMPLIES_EXEC) {
+ if (capabilities & NOMMU_MAP_EXEC)
+ prot |= PROT_EXEC;
+ }
+ } else if ((prot & PROT_READ) &&
+ (prot & PROT_EXEC) &&
+ !(capabilities & NOMMU_MAP_EXEC)
+ ) {
+ /* backing file is not executable, try to copy */
+ capabilities &= ~NOMMU_MAP_DIRECT;
+ }
+ } else {
+ /* anonymous mappings are always memory backed and can be
+ * privately mapped
+ */
+ capabilities = NOMMU_MAP_COPY;
+
+ /* handle PROT_EXEC implication by PROT_READ */
+ if ((prot & PROT_READ) &&
+ (current->personality & READ_IMPLIES_EXEC))
+ prot |= PROT_EXEC;
+ }
+
+ /* allow the security API to have its say */
+ ret = security_mmap_addr(addr);
+ if (ret < 0)
+ return ret;
+
+ /* looks okay */
+ *_capabilities = capabilities;
+ return 0;
+}
+
+/*
+ * we've determined that we can make the mapping, now translate what we
+ * now know into VMA flags
+ */
+static unsigned long determine_vm_flags(struct file *file,
+ unsigned long prot,
+ unsigned long flags,
+ unsigned long capabilities)
+{
+ unsigned long vm_flags;
+
+ vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags);
+ /* vm_flags |= mm->def_flags; */
+
+ if (!(capabilities & NOMMU_MAP_DIRECT)) {
+ /* attempt to share read-only copies of mapped file chunks */
+ vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+ if (file && !(prot & PROT_WRITE))
+ vm_flags |= VM_MAYSHARE;
+ } else {
+ /* overlay a shareable mapping on the backing device or inode
+ * if possible - used for chardevs, ramfs/tmpfs/shmfs and
+ * romfs/cramfs */
+ vm_flags |= VM_MAYSHARE | (capabilities & NOMMU_VMFLAGS);
+ if (flags & MAP_SHARED)
+ vm_flags |= VM_SHARED;
+ }
+
+ /* refuse to let anyone share private mappings with this process if
+ * it's being traced - otherwise breakpoints set in it may interfere
+ * with another untraced process
+ */
+ if ((flags & MAP_PRIVATE) && current->ptrace)
+ vm_flags &= ~VM_MAYSHARE;
+
+ return vm_flags;
+}
+
+/*
+ * set up a shared mapping on a file (the driver or filesystem provides and
+ * pins the storage)
+ */
+static int do_mmap_shared_file(struct vm_area_struct *vma)
+{
+ int ret;
+
+ ret = call_mmap(vma->vm_file, vma);
+ if (ret == 0) {
+ vma->vm_region->vm_top = vma->vm_region->vm_end;
+ return 0;
+ }
+ if (ret != -ENOSYS)
+ return ret;
+
+ /* getting -ENOSYS indicates that direct mmap isn't possible (as
+ * opposed to tried but failed) so we can only give a suitable error as
+ * it's not possible to make a private copy if MAP_SHARED was given */
+ return -ENODEV;
+}
+
+/*
+ * set up a private mapping or an anonymous shared mapping
+ */
+static int do_mmap_private(struct vm_area_struct *vma,
+ struct vm_region *region,
+ unsigned long len,
+ unsigned long capabilities)
+{
+ unsigned long total, point;
+ void *base;
+ int ret, order;
+
+ /* invoke the file's mapping function so that it can keep track of
+ * shared mappings on devices or memory
+ * - VM_MAYSHARE will be set if it may attempt to share
+ */
+ if (capabilities & NOMMU_MAP_DIRECT) {
+ ret = call_mmap(vma->vm_file, vma);
+ if (ret == 0) {
+ /* shouldn't return success if we're not sharing */
+ BUG_ON(!(vma->vm_flags & VM_MAYSHARE));
+ vma->vm_region->vm_top = vma->vm_region->vm_end;
+ return 0;
+ }
+ if (ret != -ENOSYS)
+ return ret;
+
+ /* getting an ENOSYS error indicates that direct mmap isn't
+ * possible (as opposed to tried but failed) so we'll try to
+ * make a private copy of the data and map that instead */
+ }
+
+
+ /* allocate some memory to hold the mapping
+ * - note that this may not return a page-aligned address if the object
+ * we're allocating is smaller than a page
+ */
+ order = get_order(len);
+ total = 1 << order;
+ point = len >> PAGE_SHIFT;
+
+ /* we don't want to allocate a power-of-2 sized page set */
+ if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages)
+ total = point;
+
+ base = alloc_pages_exact(total << PAGE_SHIFT, GFP_KERNEL);
+ if (!base)
+ goto enomem;
+
+ atomic_long_add(total, &mmap_pages_allocated);
+
+ region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY;
+ region->vm_start = (unsigned long) base;
+ region->vm_end = region->vm_start + len;
+ region->vm_top = region->vm_start + (total << PAGE_SHIFT);
+
+ vma->vm_start = region->vm_start;
+ vma->vm_end = region->vm_start + len;
+
+ if (vma->vm_file) {
+ /* read the contents of a file into the copy */
+ loff_t fpos;
+
+ fpos = vma->vm_pgoff;
+ fpos <<= PAGE_SHIFT;
+
+ ret = kernel_read(vma->vm_file, base, len, &fpos);
+ if (ret < 0)
+ goto error_free;
+
+ /* clear the last little bit */
+ if (ret < len)
+ memset(base + ret, 0, len - ret);
+
+ } else {
+ vma_set_anonymous(vma);
+ }
+
+ return 0;
+
+error_free:
+ free_page_series(region->vm_start, region->vm_top);
+ region->vm_start = vma->vm_start = 0;
+ region->vm_end = vma->vm_end = 0;
+ region->vm_top = 0;
+ return ret;
+
+enomem:
+ pr_err("Allocation of length %lu from process %d (%s) failed\n",
+ len, current->pid, current->comm);
+ show_free_areas(0, NULL);
+ return -ENOMEM;
+}
+
+/*
+ * handle mapping creation for uClinux
+ */
+unsigned long do_mmap(struct file *file,
+ unsigned long addr,
+ unsigned long len,
+ unsigned long prot,
+ unsigned long flags,
+ unsigned long pgoff,
+ unsigned long *populate,
+ struct list_head *uf)
+{
+ struct vm_area_struct *vma;
+ struct vm_region *region;
+ struct rb_node *rb;
+ vm_flags_t vm_flags;
+ unsigned long capabilities, result;
+ int ret;
+
+ *populate = 0;
+
+ /* decide whether we should attempt the mapping, and if so what sort of
+ * mapping */
+ ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
+ &capabilities);
+ if (ret < 0)
+ return ret;
+
+ /* we ignore the address hint */
+ addr = 0;
+ len = PAGE_ALIGN(len);
+
+ /* we've determined that we can make the mapping, now translate what we
+ * now know into VMA flags */
+ vm_flags = determine_vm_flags(file, prot, flags, capabilities);
+
+ /* we're going to need to record the mapping */
+ region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL);
+ if (!region)
+ goto error_getting_region;
+
+ vma = vm_area_alloc(current->mm);
+ if (!vma)
+ goto error_getting_vma;
+
+ region->vm_usage = 1;
+ region->vm_flags = vm_flags;
+ region->vm_pgoff = pgoff;
+
+ vma->vm_flags = vm_flags;
+ vma->vm_pgoff = pgoff;
+
+ if (file) {
+ region->vm_file = get_file(file);
+ vma->vm_file = get_file(file);
+ }
+
+ down_write(&nommu_region_sem);
+
+ /* if we want to share, we need to check for regions created by other
+ * mmap() calls that overlap with our proposed mapping
+ * - we can only share with a superset match on most regular files
+ * - shared mappings on character devices and memory backed files are
+ * permitted to overlap inexactly as far as we are concerned for in
+ * these cases, sharing is handled in the driver or filesystem rather
+ * than here
+ */
+ if (vm_flags & VM_MAYSHARE) {
+ struct vm_region *pregion;
+ unsigned long pglen, rpglen, pgend, rpgend, start;
+
+ pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ pgend = pgoff + pglen;
+
+ for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) {
+ pregion = rb_entry(rb, struct vm_region, vm_rb);
+
+ if (!(pregion->vm_flags & VM_MAYSHARE))
+ continue;
+
+ /* search for overlapping mappings on the same file */
+ if (file_inode(pregion->vm_file) !=
+ file_inode(file))
+ continue;
+
+ if (pregion->vm_pgoff >= pgend)
+ continue;
+
+ rpglen = pregion->vm_end - pregion->vm_start;
+ rpglen = (rpglen + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ rpgend = pregion->vm_pgoff + rpglen;
+ if (pgoff >= rpgend)
+ continue;
+
+ /* handle inexactly overlapping matches between
+ * mappings */
+ if ((pregion->vm_pgoff != pgoff || rpglen != pglen) &&
+ !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) {
+ /* new mapping is not a subset of the region */
+ if (!(capabilities & NOMMU_MAP_DIRECT))
+ goto sharing_violation;
+ continue;
+ }
+
+ /* we've found a region we can share */
+ pregion->vm_usage++;
+ vma->vm_region = pregion;
+ start = pregion->vm_start;
+ start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT;
+ vma->vm_start = start;
+ vma->vm_end = start + len;
+
+ if (pregion->vm_flags & VM_MAPPED_COPY)
+ vma->vm_flags |= VM_MAPPED_COPY;
+ else {
+ ret = do_mmap_shared_file(vma);
+ if (ret < 0) {
+ vma->vm_region = NULL;
+ vma->vm_start = 0;
+ vma->vm_end = 0;
+ pregion->vm_usage--;
+ pregion = NULL;
+ goto error_just_free;
+ }
+ }
+ fput(region->vm_file);
+ kmem_cache_free(vm_region_jar, region);
+ region = pregion;
+ result = start;
+ goto share;
+ }
+
+ /* obtain the address at which to make a shared mapping
+ * - this is the hook for quasi-memory character devices to
+ * tell us the location of a shared mapping
+ */
+ if (capabilities & NOMMU_MAP_DIRECT) {
+ addr = file->f_op->get_unmapped_area(file, addr, len,
+ pgoff, flags);
+ if (IS_ERR_VALUE(addr)) {
+ ret = addr;
+ if (ret != -ENOSYS)
+ goto error_just_free;
+
+ /* the driver refused to tell us where to site
+ * the mapping so we'll have to attempt to copy
+ * it */
+ ret = -ENODEV;
+ if (!(capabilities & NOMMU_MAP_COPY))
+ goto error_just_free;
+
+ capabilities &= ~NOMMU_MAP_DIRECT;
+ } else {
+ vma->vm_start = region->vm_start = addr;
+ vma->vm_end = region->vm_end = addr + len;
+ }
+ }
+ }
+
+ vma->vm_region = region;
+
+ /* set up the mapping
+ * - the region is filled in if NOMMU_MAP_DIRECT is still set
+ */
+ if (file && vma->vm_flags & VM_SHARED)
+ ret = do_mmap_shared_file(vma);
+ else
+ ret = do_mmap_private(vma, region, len, capabilities);
+ if (ret < 0)
+ goto error_just_free;
+ add_nommu_region(region);
+
+ /* clear anonymous mappings that don't ask for uninitialized data */
+ if (!vma->vm_file &&
+ (!IS_ENABLED(CONFIG_MMAP_ALLOW_UNINITIALIZED) ||
+ !(flags & MAP_UNINITIALIZED)))
+ memset((void *)region->vm_start, 0,
+ region->vm_end - region->vm_start);
+
+ /* okay... we have a mapping; now we have to register it */
+ result = vma->vm_start;
+
+ current->mm->total_vm += len >> PAGE_SHIFT;
+
+share:
+ add_vma_to_mm(current->mm, vma);
+
+ /* we flush the region from the icache only when the first executable
+ * mapping of it is made */
+ if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) {
+ flush_icache_user_range(region->vm_start, region->vm_end);
+ region->vm_icache_flushed = true;
+ }
+
+ up_write(&nommu_region_sem);
+
+ return result;
+
+error_just_free:
+ up_write(&nommu_region_sem);
+error:
+ if (region->vm_file)
+ fput(region->vm_file);
+ kmem_cache_free(vm_region_jar, region);
+ if (vma->vm_file)
+ fput(vma->vm_file);
+ vm_area_free(vma);
+ return ret;
+
+sharing_violation:
+ up_write(&nommu_region_sem);
+ pr_warn("Attempt to share mismatched mappings\n");
+ ret = -EINVAL;
+ goto error;
+
+error_getting_vma:
+ kmem_cache_free(vm_region_jar, region);
+ pr_warn("Allocation of vma for %lu byte allocation from process %d failed\n",
+ len, current->pid);
+ show_free_areas(0, NULL);
+ return -ENOMEM;
+
+error_getting_region:
+ pr_warn("Allocation of vm region for %lu byte allocation from process %d failed\n",
+ len, current->pid);
+ show_free_areas(0, NULL);
+ return -ENOMEM;
+}
+
+unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
+ unsigned long prot, unsigned long flags,
+ unsigned long fd, unsigned long pgoff)
+{
+ struct file *file = NULL;
+ unsigned long retval = -EBADF;
+
+ audit_mmap_fd(fd, flags);
+ if (!(flags & MAP_ANONYMOUS)) {
+ file = fget(fd);
+ if (!file)
+ goto out;
+ }
+
+ flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
+
+ retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+
+ if (file)
+ fput(file);
+out:
+ return retval;
+}
+
+SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
+ unsigned long, prot, unsigned long, flags,
+ unsigned long, fd, unsigned long, pgoff)
+{
+ return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);
+}
+
+#ifdef __ARCH_WANT_SYS_OLD_MMAP
+struct mmap_arg_struct {
+ unsigned long addr;
+ unsigned long len;
+ unsigned long prot;
+ unsigned long flags;
+ unsigned long fd;
+ unsigned long offset;
+};
+
+SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
+{
+ struct mmap_arg_struct a;
+
+ if (copy_from_user(&a, arg, sizeof(a)))
+ return -EFAULT;
+ if (offset_in_page(a.offset))
+ return -EINVAL;
+
+ return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
+ a.offset >> PAGE_SHIFT);
+}
+#endif /* __ARCH_WANT_SYS_OLD_MMAP */
+
+/*
+ * split a vma into two pieces at address 'addr', a new vma is allocated either
+ * for the first part or the tail.
+ */
+int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, int new_below)
+{
+ struct vm_area_struct *new;
+ struct vm_region *region;
+ unsigned long npages;
+
+ /* we're only permitted to split anonymous regions (these should have
+ * only a single usage on the region) */
+ if (vma->vm_file)
+ return -ENOMEM;
+
+ if (mm->map_count >= sysctl_max_map_count)
+ return -ENOMEM;
+
+ region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL);
+ if (!region)
+ return -ENOMEM;
+
+ new = vm_area_dup(vma);
+ if (!new) {
+ kmem_cache_free(vm_region_jar, region);
+ return -ENOMEM;
+ }
+
+ /* most fields are the same, copy all, and then fixup */
+ *region = *vma->vm_region;
+ new->vm_region = region;
+
+ npages = (addr - vma->vm_start) >> PAGE_SHIFT;
+
+ if (new_below) {
+ region->vm_top = region->vm_end = new->vm_end = addr;
+ } else {
+ region->vm_start = new->vm_start = addr;
+ region->vm_pgoff = new->vm_pgoff += npages;
+ }
+
+ if (new->vm_ops && new->vm_ops->open)
+ new->vm_ops->open(new);
+
+ delete_vma_from_mm(vma);
+ down_write(&nommu_region_sem);
+ delete_nommu_region(vma->vm_region);
+ if (new_below) {
+ vma->vm_region->vm_start = vma->vm_start = addr;
+ vma->vm_region->vm_pgoff = vma->vm_pgoff += npages;
+ } else {
+ vma->vm_region->vm_end = vma->vm_end = addr;
+ vma->vm_region->vm_top = addr;
+ }
+ add_nommu_region(vma->vm_region);
+ add_nommu_region(new->vm_region);
+ up_write(&nommu_region_sem);
+ add_vma_to_mm(mm, vma);
+ add_vma_to_mm(mm, new);
+ return 0;
+}
+
+/*
+ * shrink a VMA by removing the specified chunk from either the beginning or
+ * the end
+ */
+static int shrink_vma(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long from, unsigned long to)
+{
+ struct vm_region *region;
+
+ /* adjust the VMA's pointers, which may reposition it in the MM's tree
+ * and list */
+ delete_vma_from_mm(vma);
+ if (from > vma->vm_start)
+ vma->vm_end = from;
+ else
+ vma->vm_start = to;
+ add_vma_to_mm(mm, vma);
+
+ /* cut the backing region down to size */
+ region = vma->vm_region;
+ BUG_ON(region->vm_usage != 1);
+
+ down_write(&nommu_region_sem);
+ delete_nommu_region(region);
+ if (from > region->vm_start) {
+ to = region->vm_top;
+ region->vm_top = region->vm_end = from;
+ } else {
+ region->vm_start = to;
+ }
+ add_nommu_region(region);
+ up_write(&nommu_region_sem);
+
+ free_page_series(from, to);
+ return 0;
+}
+
+/*
+ * release a mapping
+ * - under NOMMU conditions the chunk to be unmapped must be backed by a single
+ * VMA, though it need not cover the whole VMA
+ */
+int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf)
+{
+ struct vm_area_struct *vma;
+ unsigned long end;
+ int ret;
+
+ len = PAGE_ALIGN(len);
+ if (len == 0)
+ return -EINVAL;
+
+ end = start + len;
+
+ /* find the first potentially overlapping VMA */
+ vma = find_vma(mm, start);
+ if (!vma) {
+ static int limit;
+ if (limit < 5) {
+ pr_warn("munmap of memory not mmapped by process %d (%s): 0x%lx-0x%lx\n",
+ current->pid, current->comm,
+ start, start + len - 1);
+ limit++;
+ }
+ return -EINVAL;
+ }
+
+ /* we're allowed to split an anonymous VMA but not a file-backed one */
+ if (vma->vm_file) {
+ do {
+ if (start > vma->vm_start)
+ return -EINVAL;
+ if (end == vma->vm_end)
+ goto erase_whole_vma;
+ vma = vma->vm_next;
+ } while (vma);
+ return -EINVAL;
+ } else {
+ /* the chunk must be a subset of the VMA found */
+ if (start == vma->vm_start && end == vma->vm_end)
+ goto erase_whole_vma;
+ if (start < vma->vm_start || end > vma->vm_end)
+ return -EINVAL;
+ if (offset_in_page(start))
+ return -EINVAL;
+ if (end != vma->vm_end && offset_in_page(end))
+ return -EINVAL;
+ if (start != vma->vm_start && end != vma->vm_end) {
+ ret = split_vma(mm, vma, start, 1);
+ if (ret < 0)
+ return ret;
+ }
+ return shrink_vma(mm, vma, start, end);
+ }
+
+erase_whole_vma:
+ delete_vma_from_mm(vma);
+ delete_vma(mm, vma);
+ return 0;
+}
+EXPORT_SYMBOL(do_munmap);
+
+int vm_munmap(unsigned long addr, size_t len)
+{
+ struct mm_struct *mm = current->mm;
+ int ret;
+
+ mmap_write_lock(mm);
+ ret = do_munmap(mm, addr, len, NULL);
+ mmap_write_unlock(mm);
+ return ret;
+}
+EXPORT_SYMBOL(vm_munmap);
+
+SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
+{
+ return vm_munmap(addr, len);
+}
+
+/*
+ * release all the mappings made in a process's VM space
+ */
+void exit_mmap(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+
+ if (!mm)
+ return;
+
+ mm->total_vm = 0;
+
+ while ((vma = mm->mmap)) {
+ mm->mmap = vma->vm_next;
+ delete_vma_from_mm(vma);
+ delete_vma(mm, vma);
+ cond_resched();
+ }
+}
+
+int vm_brk(unsigned long addr, unsigned long len)
+{
+ return -ENOMEM;
+}
+
+/*
+ * expand (or shrink) an existing mapping, potentially moving it at the same
+ * time (controlled by the MREMAP_MAYMOVE flag and available VM space)
+ *
+ * under NOMMU conditions, we only permit changing a mapping's size, and only
+ * as long as it stays within the region allocated by do_mmap_private() and the
+ * block is not shareable
+ *
+ * MREMAP_FIXED is not supported under NOMMU conditions
+ */
+static unsigned long do_mremap(unsigned long addr,
+ unsigned long old_len, unsigned long new_len,
+ unsigned long flags, unsigned long new_addr)
+{
+ struct vm_area_struct *vma;
+
+ /* insanity checks first */
+ old_len = PAGE_ALIGN(old_len);
+ new_len = PAGE_ALIGN(new_len);
+ if (old_len == 0 || new_len == 0)
+ return (unsigned long) -EINVAL;
+
+ if (offset_in_page(addr))
+ return -EINVAL;
+
+ if (flags & MREMAP_FIXED && new_addr != addr)
+ return (unsigned long) -EINVAL;
+
+ vma = find_vma_exact(current->mm, addr, old_len);
+ if (!vma)
+ return (unsigned long) -EINVAL;
+
+ if (vma->vm_end != vma->vm_start + old_len)
+ return (unsigned long) -EFAULT;
+
+ if (vma->vm_flags & VM_MAYSHARE)
+ return (unsigned long) -EPERM;
+
+ if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start)
+ return (unsigned long) -ENOMEM;
+
+ /* all checks complete - do it */
+ vma->vm_end = vma->vm_start + new_len;
+ return vma->vm_start;
+}
+
+SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
+ unsigned long, new_len, unsigned long, flags,
+ unsigned long, new_addr)
+{
+ unsigned long ret;
+
+ mmap_write_lock(current->mm);
+ ret = do_mremap(addr, old_len, new_len, flags, new_addr);
+ mmap_write_unlock(current->mm);
+ return ret;
+}
+
+struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
+ unsigned int foll_flags)
+{
+ return NULL;
+}
+
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+ if (addr != (pfn << PAGE_SHIFT))
+ return -EINVAL;
+
+ vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
+ return 0;
+}
+EXPORT_SYMBOL(remap_pfn_range);
+
+int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
+{
+ unsigned long pfn = start >> PAGE_SHIFT;
+ unsigned long vm_len = vma->vm_end - vma->vm_start;
+
+ pfn += vma->vm_pgoff;
+ return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
+}
+EXPORT_SYMBOL(vm_iomap_memory);
+
+int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
+ unsigned long pgoff)
+{
+ unsigned int size = vma->vm_end - vma->vm_start;
+
+ if (!(vma->vm_flags & VM_USERMAP))
+ return -EINVAL;
+
+ vma->vm_start = (unsigned long)(addr + (pgoff << PAGE_SHIFT));
+ vma->vm_end = vma->vm_start + size;
+
+ return 0;
+}
+EXPORT_SYMBOL(remap_vmalloc_range);
+
+unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+ return -ENOMEM;
+}
+
+vm_fault_t filemap_fault(struct vm_fault *vmf)
+{
+ BUG();
+ return 0;
+}
+EXPORT_SYMBOL(filemap_fault);
+
+void filemap_map_pages(struct vm_fault *vmf,
+ pgoff_t start_pgoff, pgoff_t end_pgoff)
+{
+ BUG();
+}
+EXPORT_SYMBOL(filemap_map_pages);
+
+int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long addr, void *buf, int len, unsigned int gup_flags)
+{
+ struct vm_area_struct *vma;
+ int write = gup_flags & FOLL_WRITE;
+
+ if (mmap_read_lock_killable(mm))
+ return 0;
+
+ /* the access must start within one of the target process's mappings */
+ vma = find_vma(mm, addr);
+ if (vma) {
+ /* don't overrun this mapping */
+ if (addr + len >= vma->vm_end)
+ len = vma->vm_end - addr;
+
+ /* only read or write mappings where it is permitted */
+ if (write && vma->vm_flags & VM_MAYWRITE)
+ copy_to_user_page(vma, NULL, addr,
+ (void *) addr, buf, len);
+ else if (!write && vma->vm_flags & VM_MAYREAD)
+ copy_from_user_page(vma, NULL, addr,
+ buf, (void *) addr, len);
+ else
+ len = 0;
+ } else {
+ len = 0;
+ }
+
+ mmap_read_unlock(mm);
+
+ return len;
+}
+
+/**
+ * access_remote_vm - access another process' address space
+ * @mm: the mm_struct of the target address space
+ * @addr: start address to access
+ * @buf: source or destination buffer
+ * @len: number of bytes to transfer
+ * @gup_flags: flags modifying lookup behaviour
+ *
+ * The caller must hold a reference on @mm.
+ */
+int access_remote_vm(struct mm_struct *mm, unsigned long addr,
+ void *buf, int len, unsigned int gup_flags)
+{
+ return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags);
+}
+
+/*
+ * Access another process' address space.
+ * - source/target buffer must be kernel space
+ */
+int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len,
+ unsigned int gup_flags)
+{
+ struct mm_struct *mm;
+
+ if (addr + len < addr)
+ return 0;
+
+ mm = get_task_mm(tsk);
+ if (!mm)
+ return 0;
+
+ len = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags);
+
+ mmput(mm);
+ return len;
+}
+EXPORT_SYMBOL_GPL(access_process_vm);
+
+/**
+ * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode
+ * @inode: The inode to check
+ * @size: The current filesize of the inode
+ * @newsize: The proposed filesize of the inode
+ *
+ * Check the shared mappings on an inode on behalf of a shrinking truncate to
+ * make sure that any outstanding VMAs aren't broken and then shrink the
+ * vm_regions that extend beyond so that do_mmap() doesn't
+ * automatically grant mappings that are too large.
+ */
+int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
+ size_t newsize)
+{
+ struct vm_area_struct *vma;
+ struct vm_region *region;
+ pgoff_t low, high;
+ size_t r_size, r_top;
+
+ low = newsize >> PAGE_SHIFT;
+ high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+ down_write(&nommu_region_sem);
+ i_mmap_lock_read(inode->i_mapping);
+
+ /* search for VMAs that fall within the dead zone */
+ vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) {
+ /* found one - only interested if it's shared out of the page
+ * cache */
+ if (vma->vm_flags & VM_SHARED) {
+ i_mmap_unlock_read(inode->i_mapping);
+ up_write(&nommu_region_sem);
+ return -ETXTBSY; /* not quite true, but near enough */
+ }
+ }
+
+ /* reduce any regions that overlap the dead zone - if in existence,
+ * these will be pointed to by VMAs that don't overlap the dead zone
+ *
+ * we don't check for any regions that start beyond the EOF as there
+ * shouldn't be any
+ */
+ vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, 0, ULONG_MAX) {
+ if (!(vma->vm_flags & VM_SHARED))
+ continue;
+
+ region = vma->vm_region;
+ r_size = region->vm_top - region->vm_start;
+ r_top = (region->vm_pgoff << PAGE_SHIFT) + r_size;
+
+ if (r_top > newsize) {
+ region->vm_top -= r_top - newsize;
+ if (region->vm_end > region->vm_top)
+ region->vm_end = region->vm_top;
+ }
+ }
+
+ i_mmap_unlock_read(inode->i_mapping);
+ up_write(&nommu_region_sem);
+ return 0;
+}
+
+/*
+ * Initialise sysctl_user_reserve_kbytes.
+ *
+ * This is intended to prevent a user from starting a single memory hogging
+ * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
+ * mode.
+ *
+ * The default value is min(3% of free memory, 128MB)
+ * 128MB is enough to recover with sshd/login, bash, and top/kill.
+ */
+static int __meminit init_user_reserve(void)
+{
+ unsigned long free_kbytes;
+
+ free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+
+ sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
+ return 0;
+}
+subsys_initcall(init_user_reserve);
+
+/*
+ * Initialise sysctl_admin_reserve_kbytes.
+ *
+ * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
+ * to log in and kill a memory hogging process.
+ *
+ * Systems with more than 256MB will reserve 8MB, enough to recover
+ * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
+ * only reserve 3% of free pages by default.
+ */
+static int __meminit init_admin_reserve(void)
+{
+ unsigned long free_kbytes;
+
+ free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+
+ sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
+ return 0;
+}
+subsys_initcall(init_admin_reserve);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
new file mode 100644
index 000000000..3d7c557fb
--- /dev/null
+++ b/mm/oom_kill.c
@@ -0,0 +1,1165 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/mm/oom_kill.c
+ *
+ * Copyright (C) 1998,2000 Rik van Riel
+ * Thanks go out to Claus Fischer for some serious inspiration and
+ * for goading me into coding this file...
+ * Copyright (C) 2010 Google, Inc.
+ * Rewritten by David Rientjes
+ *
+ * The routines in this file are used to kill a process when
+ * we're seriously out of memory. This gets called from __alloc_pages()
+ * in mm/page_alloc.c when we really run out of memory.
+ *
+ * Since we won't call these routines often (on a well-configured
+ * machine) this file will double as a 'coding guide' and a signpost
+ * for newbie kernel hackers. It features several pointers to major
+ * kernel subsystems and hints as to where to find out what things do.
+ */
+
+#include <linux/oom.h>
+#include <linux/mm.h>
+#include <linux/err.h>
+#include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/coredump.h>
+#include <linux/sched/task.h>
+#include <linux/sched/debug.h>
+#include <linux/swap.h>
+#include <linux/timex.h>
+#include <linux/jiffies.h>
+#include <linux/cpuset.h>
+#include <linux/export.h>
+#include <linux/notifier.h>
+#include <linux/memcontrol.h>
+#include <linux/mempolicy.h>
+#include <linux/security.h>
+#include <linux/ptrace.h>
+#include <linux/freezer.h>
+#include <linux/ftrace.h>
+#include <linux/ratelimit.h>
+#include <linux/kthread.h>
+#include <linux/init.h>
+#include <linux/mmu_notifier.h>
+
+#include <asm/tlb.h>
+#include "internal.h"
+#include "slab.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/oom.h>
+
+int sysctl_panic_on_oom;
+int sysctl_oom_kill_allocating_task;
+int sysctl_oom_dump_tasks = 1;
+
+/*
+ * Serializes oom killer invocations (out_of_memory()) from all contexts to
+ * prevent from over eager oom killing (e.g. when the oom killer is invoked
+ * from different domains).
+ *
+ * oom_killer_disable() relies on this lock to stabilize oom_killer_disabled
+ * and mark_oom_victim
+ */
+DEFINE_MUTEX(oom_lock);
+/* Serializes oom_score_adj and oom_score_adj_min updates */
+DEFINE_MUTEX(oom_adj_mutex);
+
+static inline bool is_memcg_oom(struct oom_control *oc)
+{
+ return oc->memcg != NULL;
+}
+
+#ifdef CONFIG_NUMA
+/**
+ * oom_cpuset_eligible() - check task eligiblity for kill
+ * @start: task struct of which task to consider
+ * @oc: pointer to struct oom_control
+ *
+ * Task eligibility is determined by whether or not a candidate task, @tsk,
+ * shares the same mempolicy nodes as current if it is bound by such a policy
+ * and whether or not it has the same set of allowed cpuset nodes.
+ *
+ * This function is assuming oom-killer context and 'current' has triggered
+ * the oom-killer.
+ */
+static bool oom_cpuset_eligible(struct task_struct *start,
+ struct oom_control *oc)
+{
+ struct task_struct *tsk;
+ bool ret = false;
+ const nodemask_t *mask = oc->nodemask;
+
+ if (is_memcg_oom(oc))
+ return true;
+
+ rcu_read_lock();
+ for_each_thread(start, tsk) {
+ if (mask) {
+ /*
+ * If this is a mempolicy constrained oom, tsk's
+ * cpuset is irrelevant. Only return true if its
+ * mempolicy intersects current, otherwise it may be
+ * needlessly killed.
+ */
+ ret = mempolicy_nodemask_intersects(tsk, mask);
+ } else {
+ /*
+ * This is not a mempolicy constrained oom, so only
+ * check the mems of tsk's cpuset.
+ */
+ ret = cpuset_mems_allowed_intersects(current, tsk);
+ }
+ if (ret)
+ break;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+#else
+static bool oom_cpuset_eligible(struct task_struct *tsk, struct oom_control *oc)
+{
+ return true;
+}
+#endif /* CONFIG_NUMA */
+
+/*
+ * The process p may have detached its own ->mm while exiting or through
+ * kthread_use_mm(), but one or more of its subthreads may still have a valid
+ * pointer. Return p, or any of its subthreads with a valid ->mm, with
+ * task_lock() held.
+ */
+struct task_struct *find_lock_task_mm(struct task_struct *p)
+{
+ struct task_struct *t;
+
+ rcu_read_lock();
+
+ for_each_thread(p, t) {
+ task_lock(t);
+ if (likely(t->mm))
+ goto found;
+ task_unlock(t);
+ }
+ t = NULL;
+found:
+ rcu_read_unlock();
+
+ return t;
+}
+
+/*
+ * order == -1 means the oom kill is required by sysrq, otherwise only
+ * for display purposes.
+ */
+static inline bool is_sysrq_oom(struct oom_control *oc)
+{
+ return oc->order == -1;
+}
+
+/* return true if the task is not adequate as candidate victim task. */
+static bool oom_unkillable_task(struct task_struct *p)
+{
+ if (is_global_init(p))
+ return true;
+ if (p->flags & PF_KTHREAD)
+ return true;
+ return false;
+}
+
+/*
+ * Print out unreclaimble slabs info when unreclaimable slabs amount is greater
+ * than all user memory (LRU pages)
+ */
+static bool is_dump_unreclaim_slabs(void)
+{
+ unsigned long nr_lru;
+
+ nr_lru = global_node_page_state(NR_ACTIVE_ANON) +
+ global_node_page_state(NR_INACTIVE_ANON) +
+ global_node_page_state(NR_ACTIVE_FILE) +
+ global_node_page_state(NR_INACTIVE_FILE) +
+ global_node_page_state(NR_ISOLATED_ANON) +
+ global_node_page_state(NR_ISOLATED_FILE) +
+ global_node_page_state(NR_UNEVICTABLE);
+
+ return (global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B) > nr_lru);
+}
+
+/**
+ * oom_badness - heuristic function to determine which candidate task to kill
+ * @p: task struct of which task we should calculate
+ * @totalpages: total present RAM allowed for page allocation
+ *
+ * The heuristic for determining which task to kill is made to be as simple and
+ * predictable as possible. The goal is to return the highest value for the
+ * task consuming the most memory to avoid subsequent oom failures.
+ */
+long oom_badness(struct task_struct *p, unsigned long totalpages)
+{
+ long points;
+ long adj;
+
+ if (oom_unkillable_task(p))
+ return LONG_MIN;
+
+ p = find_lock_task_mm(p);
+ if (!p)
+ return LONG_MIN;
+
+ /*
+ * Do not even consider tasks which are explicitly marked oom
+ * unkillable or have been already oom reaped or the are in
+ * the middle of vfork
+ */
+ adj = (long)p->signal->oom_score_adj;
+ if (adj == OOM_SCORE_ADJ_MIN ||
+ test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
+ in_vfork(p)) {
+ task_unlock(p);
+ return LONG_MIN;
+ }
+
+ /*
+ * The baseline for the badness score is the proportion of RAM that each
+ * task's rss, pagetable and swap space use.
+ */
+ points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
+ mm_pgtables_bytes(p->mm) / PAGE_SIZE;
+ task_unlock(p);
+
+ /* Normalize to oom_score_adj units */
+ adj *= totalpages / 1000;
+ points += adj;
+
+ return points;
+}
+
+static const char * const oom_constraint_text[] = {
+ [CONSTRAINT_NONE] = "CONSTRAINT_NONE",
+ [CONSTRAINT_CPUSET] = "CONSTRAINT_CPUSET",
+ [CONSTRAINT_MEMORY_POLICY] = "CONSTRAINT_MEMORY_POLICY",
+ [CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG",
+};
+
+/*
+ * Determine the type of allocation constraint.
+ */
+static enum oom_constraint constrained_alloc(struct oom_control *oc)
+{
+ struct zone *zone;
+ struct zoneref *z;
+ enum zone_type highest_zoneidx = gfp_zone(oc->gfp_mask);
+ bool cpuset_limited = false;
+ int nid;
+
+ if (is_memcg_oom(oc)) {
+ oc->totalpages = mem_cgroup_get_max(oc->memcg) ?: 1;
+ return CONSTRAINT_MEMCG;
+ }
+
+ /* Default to all available memory */
+ oc->totalpages = totalram_pages() + total_swap_pages;
+
+ if (!IS_ENABLED(CONFIG_NUMA))
+ return CONSTRAINT_NONE;
+
+ if (!oc->zonelist)
+ return CONSTRAINT_NONE;
+ /*
+ * Reach here only when __GFP_NOFAIL is used. So, we should avoid
+ * to kill current.We have to random task kill in this case.
+ * Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
+ */
+ if (oc->gfp_mask & __GFP_THISNODE)
+ return CONSTRAINT_NONE;
+
+ /*
+ * This is not a __GFP_THISNODE allocation, so a truncated nodemask in
+ * the page allocator means a mempolicy is in effect. Cpuset policy
+ * is enforced in get_page_from_freelist().
+ */
+ if (oc->nodemask &&
+ !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
+ oc->totalpages = total_swap_pages;
+ for_each_node_mask(nid, *oc->nodemask)
+ oc->totalpages += node_present_pages(nid);
+ return CONSTRAINT_MEMORY_POLICY;
+ }
+
+ /* Check this allocation failure is caused by cpuset's wall function */
+ for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
+ highest_zoneidx, oc->nodemask)
+ if (!cpuset_zone_allowed(zone, oc->gfp_mask))
+ cpuset_limited = true;
+
+ if (cpuset_limited) {
+ oc->totalpages = total_swap_pages;
+ for_each_node_mask(nid, cpuset_current_mems_allowed)
+ oc->totalpages += node_present_pages(nid);
+ return CONSTRAINT_CPUSET;
+ }
+ return CONSTRAINT_NONE;
+}
+
+static int oom_evaluate_task(struct task_struct *task, void *arg)
+{
+ struct oom_control *oc = arg;
+ long points;
+
+ if (oom_unkillable_task(task))
+ goto next;
+
+ /* p may not have freeable memory in nodemask */
+ if (!is_memcg_oom(oc) && !oom_cpuset_eligible(task, oc))
+ goto next;
+
+ /*
+ * This task already has access to memory reserves and is being killed.
+ * Don't allow any other task to have access to the reserves unless
+ * the task has MMF_OOM_SKIP because chances that it would release
+ * any memory is quite low.
+ */
+ if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
+ if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
+ goto next;
+ goto abort;
+ }
+
+ /*
+ * If task is allocating a lot of memory and has been marked to be
+ * killed first if it triggers an oom, then select it.
+ */
+ if (oom_task_origin(task)) {
+ points = LONG_MAX;
+ goto select;
+ }
+
+ points = oom_badness(task, oc->totalpages);
+ if (points == LONG_MIN || points < oc->chosen_points)
+ goto next;
+
+select:
+ if (oc->chosen)
+ put_task_struct(oc->chosen);
+ get_task_struct(task);
+ oc->chosen = task;
+ oc->chosen_points = points;
+next:
+ return 0;
+abort:
+ if (oc->chosen)
+ put_task_struct(oc->chosen);
+ oc->chosen = (void *)-1UL;
+ return 1;
+}
+
+/*
+ * Simple selection loop. We choose the process with the highest number of
+ * 'points'. In case scan was aborted, oc->chosen is set to -1.
+ */
+static void select_bad_process(struct oom_control *oc)
+{
+ oc->chosen_points = LONG_MIN;
+
+ if (is_memcg_oom(oc))
+ mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
+ else {
+ struct task_struct *p;
+
+ rcu_read_lock();
+ for_each_process(p)
+ if (oom_evaluate_task(p, oc))
+ break;
+ rcu_read_unlock();
+ }
+}
+
+static int dump_task(struct task_struct *p, void *arg)
+{
+ struct oom_control *oc = arg;
+ struct task_struct *task;
+
+ if (oom_unkillable_task(p))
+ return 0;
+
+ /* p may not have freeable memory in nodemask */
+ if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
+ return 0;
+
+ task = find_lock_task_mm(p);
+ if (!task) {
+ /*
+ * This is a kthread or all of p's threads have already
+ * detached their mm's. There's no need to report
+ * them; they can't be oom killed anyway.
+ */
+ return 0;
+ }
+
+ pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
+ task->pid, from_kuid(&init_user_ns, task_uid(task)),
+ task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
+ mm_pgtables_bytes(task->mm),
+ get_mm_counter(task->mm, MM_SWAPENTS),
+ task->signal->oom_score_adj, task->comm);
+ task_unlock(task);
+
+ return 0;
+}
+
+/**
+ * dump_tasks - dump current memory state of all system tasks
+ * @oc: pointer to struct oom_control
+ *
+ * Dumps the current memory state of all eligible tasks. Tasks not in the same
+ * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
+ * are not shown.
+ * State information includes task's pid, uid, tgid, vm size, rss,
+ * pgtables_bytes, swapents, oom_score_adj value, and name.
+ */
+static void dump_tasks(struct oom_control *oc)
+{
+ pr_info("Tasks state (memory values in pages):\n");
+ pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n");
+
+ if (is_memcg_oom(oc))
+ mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
+ else {
+ struct task_struct *p;
+
+ rcu_read_lock();
+ for_each_process(p)
+ dump_task(p, oc);
+ rcu_read_unlock();
+ }
+}
+
+static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
+{
+ /* one line summary of the oom killer context. */
+ pr_info("oom-kill:constraint=%s,nodemask=%*pbl",
+ oom_constraint_text[oc->constraint],
+ nodemask_pr_args(oc->nodemask));
+ cpuset_print_current_mems_allowed();
+ mem_cgroup_print_oom_context(oc->memcg, victim);
+ pr_cont(",task=%s,pid=%d,uid=%d\n", victim->comm, victim->pid,
+ from_kuid(&init_user_ns, task_uid(victim)));
+}
+
+static void dump_header(struct oom_control *oc, struct task_struct *p)
+{
+ pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
+ current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
+ current->signal->oom_score_adj);
+ if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
+ pr_warn("COMPACTION is disabled!!!\n");
+
+ dump_stack();
+ if (is_memcg_oom(oc))
+ mem_cgroup_print_oom_meminfo(oc->memcg);
+ else {
+ show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask);
+ if (is_dump_unreclaim_slabs())
+ dump_unreclaimable_slab();
+ }
+ if (sysctl_oom_dump_tasks)
+ dump_tasks(oc);
+ if (p)
+ dump_oom_summary(oc, p);
+}
+
+/*
+ * Number of OOM victims in flight
+ */
+static atomic_t oom_victims = ATOMIC_INIT(0);
+static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
+
+static bool oom_killer_disabled __read_mostly;
+
+#define K(x) ((x) << (PAGE_SHIFT-10))
+
+/*
+ * task->mm can be NULL if the task is the exited group leader. So to
+ * determine whether the task is using a particular mm, we examine all the
+ * task's threads: if one of those is using this mm then this task was also
+ * using it.
+ */
+bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
+{
+ struct task_struct *t;
+
+ for_each_thread(p, t) {
+ struct mm_struct *t_mm = READ_ONCE(t->mm);
+ if (t_mm)
+ return t_mm == mm;
+ }
+ return false;
+}
+
+#ifdef CONFIG_MMU
+/*
+ * OOM Reaper kernel thread which tries to reap the memory used by the OOM
+ * victim (if that is possible) to help the OOM killer to move on.
+ */
+static struct task_struct *oom_reaper_th;
+static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
+static struct task_struct *oom_reaper_list;
+static DEFINE_SPINLOCK(oom_reaper_lock);
+
+bool __oom_reap_task_mm(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+ bool ret = true;
+
+ /*
+ * Tell all users of get_user/copy_from_user etc... that the content
+ * is no longer stable. No barriers really needed because unmapping
+ * should imply barriers already and the reader would hit a page fault
+ * if it stumbled over a reaped memory.
+ */
+ set_bit(MMF_UNSTABLE, &mm->flags);
+
+ for (vma = mm->mmap ; vma; vma = vma->vm_next) {
+ if (!can_madv_lru_vma(vma))
+ continue;
+
+ /*
+ * Only anonymous pages have a good chance to be dropped
+ * without additional steps which we cannot afford as we
+ * are OOM already.
+ *
+ * We do not even care about fs backed pages because all
+ * which are reclaimable have already been reclaimed and
+ * we do not want to block exit_mmap by keeping mm ref
+ * count elevated without a good reason.
+ */
+ if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
+ struct mmu_notifier_range range;
+ struct mmu_gather tlb;
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0,
+ vma, mm, vma->vm_start,
+ vma->vm_end);
+ tlb_gather_mmu(&tlb, mm, range.start, range.end);
+ if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
+ tlb_finish_mmu(&tlb, range.start, range.end);
+ ret = false;
+ continue;
+ }
+ unmap_page_range(&tlb, vma, range.start, range.end, NULL);
+ mmu_notifier_invalidate_range_end(&range);
+ tlb_finish_mmu(&tlb, range.start, range.end);
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * Reaps the address space of the give task.
+ *
+ * Returns true on success and false if none or part of the address space
+ * has been reclaimed and the caller should retry later.
+ */
+static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
+{
+ bool ret = true;
+
+ if (!mmap_read_trylock(mm)) {
+ trace_skip_task_reaping(tsk->pid);
+ return false;
+ }
+
+ /*
+ * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
+ * work on the mm anymore. The check for MMF_OOM_SKIP must run
+ * under mmap_lock for reading because it serializes against the
+ * mmap_write_lock();mmap_write_unlock() cycle in exit_mmap().
+ */
+ if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
+ trace_skip_task_reaping(tsk->pid);
+ goto out_unlock;
+ }
+
+ trace_start_task_reaping(tsk->pid);
+
+ /* failed to reap part of the address space. Try again later */
+ ret = __oom_reap_task_mm(mm);
+ if (!ret)
+ goto out_finish;
+
+ pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
+ task_pid_nr(tsk), tsk->comm,
+ K(get_mm_counter(mm, MM_ANONPAGES)),
+ K(get_mm_counter(mm, MM_FILEPAGES)),
+ K(get_mm_counter(mm, MM_SHMEMPAGES)));
+out_finish:
+ trace_finish_task_reaping(tsk->pid);
+out_unlock:
+ mmap_read_unlock(mm);
+
+ return ret;
+}
+
+#define MAX_OOM_REAP_RETRIES 10
+static void oom_reap_task(struct task_struct *tsk)
+{
+ int attempts = 0;
+ struct mm_struct *mm = tsk->signal->oom_mm;
+
+ /* Retry the mmap_read_trylock(mm) a few times */
+ while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
+ schedule_timeout_idle(HZ/10);
+
+ if (attempts <= MAX_OOM_REAP_RETRIES ||
+ test_bit(MMF_OOM_SKIP, &mm->flags))
+ goto done;
+
+ pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
+ task_pid_nr(tsk), tsk->comm);
+ sched_show_task(tsk);
+ debug_show_all_locks();
+
+done:
+ tsk->oom_reaper_list = NULL;
+
+ /*
+ * Hide this mm from OOM killer because it has been either reaped or
+ * somebody can't call mmap_write_unlock(mm).
+ */
+ set_bit(MMF_OOM_SKIP, &mm->flags);
+
+ /* Drop a reference taken by queue_oom_reaper */
+ put_task_struct(tsk);
+}
+
+static int oom_reaper(void *unused)
+{
+ while (true) {
+ struct task_struct *tsk = NULL;
+
+ wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
+ spin_lock_irq(&oom_reaper_lock);
+ if (oom_reaper_list != NULL) {
+ tsk = oom_reaper_list;
+ oom_reaper_list = tsk->oom_reaper_list;
+ }
+ spin_unlock_irq(&oom_reaper_lock);
+
+ if (tsk)
+ oom_reap_task(tsk);
+ }
+
+ return 0;
+}
+
+static void wake_oom_reaper(struct timer_list *timer)
+{
+ struct task_struct *tsk = container_of(timer, struct task_struct,
+ oom_reaper_timer);
+ struct mm_struct *mm = tsk->signal->oom_mm;
+ unsigned long flags;
+
+ /* The victim managed to terminate on its own - see exit_mmap */
+ if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
+ put_task_struct(tsk);
+ return;
+ }
+
+ spin_lock_irqsave(&oom_reaper_lock, flags);
+ tsk->oom_reaper_list = oom_reaper_list;
+ oom_reaper_list = tsk;
+ spin_unlock_irqrestore(&oom_reaper_lock, flags);
+ trace_wake_reaper(tsk->pid);
+ wake_up(&oom_reaper_wait);
+}
+
+/*
+ * Give the OOM victim time to exit naturally before invoking the oom_reaping.
+ * The timers timeout is arbitrary... the longer it is, the longer the worst
+ * case scenario for the OOM can take. If it is too small, the oom_reaper can
+ * get in the way and release resources needed by the process exit path.
+ * e.g. The futex robust list can sit in Anon|Private memory that gets reaped
+ * before the exit path is able to wake the futex waiters.
+ */
+#define OOM_REAPER_DELAY (2*HZ)
+static void queue_oom_reaper(struct task_struct *tsk)
+{
+ /* mm is already queued? */
+ if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
+ return;
+
+ get_task_struct(tsk);
+ timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, 0);
+ tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY;
+ add_timer(&tsk->oom_reaper_timer);
+}
+
+static int __init oom_init(void)
+{
+ oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
+ return 0;
+}
+subsys_initcall(oom_init)
+#else
+static inline void queue_oom_reaper(struct task_struct *tsk)
+{
+}
+#endif /* CONFIG_MMU */
+
+/**
+ * mark_oom_victim - mark the given task as OOM victim
+ * @tsk: task to mark
+ *
+ * Has to be called with oom_lock held and never after
+ * oom has been disabled already.
+ *
+ * tsk->mm has to be non NULL and caller has to guarantee it is stable (either
+ * under task_lock or operate on the current).
+ */
+static void mark_oom_victim(struct task_struct *tsk)
+{
+ struct mm_struct *mm = tsk->mm;
+
+ WARN_ON(oom_killer_disabled);
+ /* OOM killer might race with memcg OOM */
+ if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
+ return;
+
+ /* oom_mm is bound to the signal struct life time. */
+ if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
+ mmgrab(tsk->signal->oom_mm);
+ set_bit(MMF_OOM_VICTIM, &mm->flags);
+ }
+
+ /*
+ * Make sure that the task is woken up from uninterruptible sleep
+ * if it is frozen because OOM killer wouldn't be able to free
+ * any memory and livelock. freezing_slow_path will tell the freezer
+ * that TIF_MEMDIE tasks should be ignored.
+ */
+ __thaw_task(tsk);
+ atomic_inc(&oom_victims);
+ trace_mark_victim(tsk->pid);
+}
+
+/**
+ * exit_oom_victim - note the exit of an OOM victim
+ */
+void exit_oom_victim(void)
+{
+ clear_thread_flag(TIF_MEMDIE);
+
+ if (!atomic_dec_return(&oom_victims))
+ wake_up_all(&oom_victims_wait);
+}
+
+/**
+ * oom_killer_enable - enable OOM killer
+ */
+void oom_killer_enable(void)
+{
+ oom_killer_disabled = false;
+ pr_info("OOM killer enabled.\n");
+}
+
+/**
+ * oom_killer_disable - disable OOM killer
+ * @timeout: maximum timeout to wait for oom victims in jiffies
+ *
+ * Forces all page allocations to fail rather than trigger OOM killer.
+ * Will block and wait until all OOM victims are killed or the given
+ * timeout expires.
+ *
+ * The function cannot be called when there are runnable user tasks because
+ * the userspace would see unexpected allocation failures as a result. Any
+ * new usage of this function should be consulted with MM people.
+ *
+ * Returns true if successful and false if the OOM killer cannot be
+ * disabled.
+ */
+bool oom_killer_disable(signed long timeout)
+{
+ signed long ret;
+
+ /*
+ * Make sure to not race with an ongoing OOM killer. Check that the
+ * current is not killed (possibly due to sharing the victim's memory).
+ */
+ if (mutex_lock_killable(&oom_lock))
+ return false;
+ oom_killer_disabled = true;
+ mutex_unlock(&oom_lock);
+
+ ret = wait_event_interruptible_timeout(oom_victims_wait,
+ !atomic_read(&oom_victims), timeout);
+ if (ret <= 0) {
+ oom_killer_enable();
+ return false;
+ }
+ pr_info("OOM killer disabled.\n");
+
+ return true;
+}
+
+static inline bool __task_will_free_mem(struct task_struct *task)
+{
+ struct signal_struct *sig = task->signal;
+
+ /*
+ * A coredumping process may sleep for an extended period in exit_mm(),
+ * so the oom killer cannot assume that the process will promptly exit
+ * and release memory.
+ */
+ if (sig->flags & SIGNAL_GROUP_COREDUMP)
+ return false;
+
+ if (sig->flags & SIGNAL_GROUP_EXIT)
+ return true;
+
+ if (thread_group_empty(task) && (task->flags & PF_EXITING))
+ return true;
+
+ return false;
+}
+
+/*
+ * Checks whether the given task is dying or exiting and likely to
+ * release its address space. This means that all threads and processes
+ * sharing the same mm have to be killed or exiting.
+ * Caller has to make sure that task->mm is stable (hold task_lock or
+ * it operates on the current).
+ */
+static bool task_will_free_mem(struct task_struct *task)
+{
+ struct mm_struct *mm = task->mm;
+ struct task_struct *p;
+ bool ret = true;
+
+ /*
+ * Skip tasks without mm because it might have passed its exit_mm and
+ * exit_oom_victim. oom_reaper could have rescued that but do not rely
+ * on that for now. We can consider find_lock_task_mm in future.
+ */
+ if (!mm)
+ return false;
+
+ if (!__task_will_free_mem(task))
+ return false;
+
+ /*
+ * This task has already been drained by the oom reaper so there are
+ * only small chances it will free some more
+ */
+ if (test_bit(MMF_OOM_SKIP, &mm->flags))
+ return false;
+
+ if (atomic_read(&mm->mm_users) <= 1)
+ return true;
+
+ /*
+ * Make sure that all tasks which share the mm with the given tasks
+ * are dying as well to make sure that a) nobody pins its mm and
+ * b) the task is also reapable by the oom reaper.
+ */
+ rcu_read_lock();
+ for_each_process(p) {
+ if (!process_shares_mm(p, mm))
+ continue;
+ if (same_thread_group(task, p))
+ continue;
+ ret = __task_will_free_mem(p);
+ if (!ret)
+ break;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static void __oom_kill_process(struct task_struct *victim, const char *message)
+{
+ struct task_struct *p;
+ struct mm_struct *mm;
+ bool can_oom_reap = true;
+
+ p = find_lock_task_mm(victim);
+ if (!p) {
+ pr_info("%s: OOM victim %d (%s) is already exiting. Skip killing the task\n",
+ message, task_pid_nr(victim), victim->comm);
+ put_task_struct(victim);
+ return;
+ } else if (victim != p) {
+ get_task_struct(p);
+ put_task_struct(victim);
+ victim = p;
+ }
+
+ /* Get a reference to safely compare mm after task_unlock(victim) */
+ mm = victim->mm;
+ mmgrab(mm);
+
+ /* Raise event before sending signal: task reaper must see this */
+ count_vm_event(OOM_KILL);
+ memcg_memory_event_mm(mm, MEMCG_OOM_KILL);
+
+ /*
+ * We should send SIGKILL before granting access to memory reserves
+ * in order to prevent the OOM victim from depleting the memory
+ * reserves from the user space under its control.
+ */
+ do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID);
+ mark_oom_victim(victim);
+ pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n",
+ message, task_pid_nr(victim), victim->comm, K(mm->total_vm),
+ K(get_mm_counter(mm, MM_ANONPAGES)),
+ K(get_mm_counter(mm, MM_FILEPAGES)),
+ K(get_mm_counter(mm, MM_SHMEMPAGES)),
+ from_kuid(&init_user_ns, task_uid(victim)),
+ mm_pgtables_bytes(mm) >> 10, victim->signal->oom_score_adj);
+ task_unlock(victim);
+
+ /*
+ * Kill all user processes sharing victim->mm in other thread groups, if
+ * any. They don't get access to memory reserves, though, to avoid
+ * depletion of all memory. This prevents mm->mmap_lock livelock when an
+ * oom killed thread cannot exit because it requires the semaphore and
+ * its contended by another thread trying to allocate memory itself.
+ * That thread will now get access to memory reserves since it has a
+ * pending fatal signal.
+ */
+ rcu_read_lock();
+ for_each_process(p) {
+ if (!process_shares_mm(p, mm))
+ continue;
+ if (same_thread_group(p, victim))
+ continue;
+ if (is_global_init(p)) {
+ can_oom_reap = false;
+ set_bit(MMF_OOM_SKIP, &mm->flags);
+ pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
+ task_pid_nr(victim), victim->comm,
+ task_pid_nr(p), p->comm);
+ continue;
+ }
+ /*
+ * No kthead_use_mm() user needs to read from the userspace so
+ * we are ok to reap it.
+ */
+ if (unlikely(p->flags & PF_KTHREAD))
+ continue;
+ do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
+ }
+ rcu_read_unlock();
+
+ if (can_oom_reap)
+ queue_oom_reaper(victim);
+
+ mmdrop(mm);
+ put_task_struct(victim);
+}
+#undef K
+
+/*
+ * Kill provided task unless it's secured by setting
+ * oom_score_adj to OOM_SCORE_ADJ_MIN.
+ */
+static int oom_kill_memcg_member(struct task_struct *task, void *message)
+{
+ if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN &&
+ !is_global_init(task)) {
+ get_task_struct(task);
+ __oom_kill_process(task, message);
+ }
+ return 0;
+}
+
+static void oom_kill_process(struct oom_control *oc, const char *message)
+{
+ struct task_struct *victim = oc->chosen;
+ struct mem_cgroup *oom_group;
+ static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+ /*
+ * If the task is already exiting, don't alarm the sysadmin or kill
+ * its children or threads, just give it access to memory reserves
+ * so it can die quickly
+ */
+ task_lock(victim);
+ if (task_will_free_mem(victim)) {
+ mark_oom_victim(victim);
+ queue_oom_reaper(victim);
+ task_unlock(victim);
+ put_task_struct(victim);
+ return;
+ }
+ task_unlock(victim);
+
+ if (__ratelimit(&oom_rs))
+ dump_header(oc, victim);
+
+ /*
+ * Do we need to kill the entire memory cgroup?
+ * Or even one of the ancestor memory cgroups?
+ * Check this out before killing the victim task.
+ */
+ oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);
+
+ __oom_kill_process(victim, message);
+
+ /*
+ * If necessary, kill all tasks in the selected memory cgroup.
+ */
+ if (oom_group) {
+ mem_cgroup_print_oom_group(oom_group);
+ mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member,
+ (void*)message);
+ mem_cgroup_put(oom_group);
+ }
+}
+
+/*
+ * Determines whether the kernel must panic because of the panic_on_oom sysctl.
+ */
+static void check_panic_on_oom(struct oom_control *oc)
+{
+ if (likely(!sysctl_panic_on_oom))
+ return;
+ if (sysctl_panic_on_oom != 2) {
+ /*
+ * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
+ * does not panic for cpuset, mempolicy, or memcg allocation
+ * failures.
+ */
+ if (oc->constraint != CONSTRAINT_NONE)
+ return;
+ }
+ /* Do not panic for oom kills triggered by sysrq */
+ if (is_sysrq_oom(oc))
+ return;
+ dump_header(oc, NULL);
+ panic("Out of memory: %s panic_on_oom is enabled\n",
+ sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
+}
+
+static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
+
+int register_oom_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&oom_notify_list, nb);
+}
+EXPORT_SYMBOL_GPL(register_oom_notifier);
+
+int unregister_oom_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&oom_notify_list, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_oom_notifier);
+
+/**
+ * out_of_memory - kill the "best" process when we run out of memory
+ * @oc: pointer to struct oom_control
+ *
+ * If we run out of memory, we have the choice between either
+ * killing a random task (bad), letting the system crash (worse)
+ * OR try to be smart about which process to kill. Note that we
+ * don't have to be perfect here, we just have to be good.
+ */
+bool out_of_memory(struct oom_control *oc)
+{
+ unsigned long freed = 0;
+
+ if (oom_killer_disabled)
+ return false;
+
+ if (!is_memcg_oom(oc)) {
+ blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
+ if (freed > 0)
+ /* Got some memory back in the last second. */
+ return true;
+ }
+
+ /*
+ * If current has a pending SIGKILL or is exiting, then automatically
+ * select it. The goal is to allow it to allocate so that it may
+ * quickly exit and free its memory.
+ */
+ if (task_will_free_mem(current)) {
+ mark_oom_victim(current);
+ queue_oom_reaper(current);
+ return true;
+ }
+
+ /*
+ * The OOM killer does not compensate for IO-less reclaim.
+ * pagefault_out_of_memory lost its gfp context so we have to
+ * make sure exclude 0 mask - all other users should have at least
+ * ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to
+ * invoke the OOM killer even if it is a GFP_NOFS allocation.
+ */
+ if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
+ return true;
+
+ /*
+ * Check if there were limitations on the allocation (only relevant for
+ * NUMA and memcg) that may require different handling.
+ */
+ oc->constraint = constrained_alloc(oc);
+ if (oc->constraint != CONSTRAINT_MEMORY_POLICY)
+ oc->nodemask = NULL;
+ check_panic_on_oom(oc);
+
+ if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
+ current->mm && !oom_unkillable_task(current) &&
+ oom_cpuset_eligible(current, oc) &&
+ current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
+ get_task_struct(current);
+ oc->chosen = current;
+ oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
+ return true;
+ }
+
+ select_bad_process(oc);
+ /* Found nothing?!?! */
+ if (!oc->chosen) {
+ dump_header(oc, NULL);
+ pr_warn("Out of memory and no killable processes...\n");
+ /*
+ * If we got here due to an actual allocation at the
+ * system level, we cannot survive this and will enter
+ * an endless loop in the allocator. Bail out now.
+ */
+ if (!is_sysrq_oom(oc) && !is_memcg_oom(oc))
+ panic("System is deadlocked on memory\n");
+ }
+ if (oc->chosen && oc->chosen != (void *)-1UL)
+ oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
+ "Memory cgroup out of memory");
+ return !!oc->chosen;
+}
+
+/*
+ * The pagefault handler calls here because some allocation has failed. We have
+ * to take care of the memcg OOM here because this is the only safe context without
+ * any locks held but let the oom killer triggered from the allocation context care
+ * about the global OOM.
+ */
+void pagefault_out_of_memory(void)
+{
+ static DEFINE_RATELIMIT_STATE(pfoom_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+ if (mem_cgroup_oom_synchronize(true))
+ return;
+
+ if (fatal_signal_pending(current))
+ return;
+
+ if (__ratelimit(&pfoom_rs))
+ pr_warn("Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF\n");
+}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
new file mode 100644
index 000000000..eb34d204d
--- /dev/null
+++ b/mm/page-writeback.c
@@ -0,0 +1,2850 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * mm/page-writeback.c
+ *
+ * Copyright (C) 2002, Linus Torvalds.
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
+ *
+ * Contains functions related to writing back dirty pages at the
+ * address_space level.
+ *
+ * 10Apr2002 Andrew Morton
+ * Initial version
+ */
+
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/init.h>
+#include <linux/backing-dev.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/blkdev.h>
+#include <linux/mpage.h>
+#include <linux/rmap.h>
+#include <linux/percpu.h>
+#include <linux/smp.h>
+#include <linux/sysctl.h>
+#include <linux/cpu.h>
+#include <linux/syscalls.h>
+#include <linux/buffer_head.h> /* __set_page_dirty_buffers */
+#include <linux/pagevec.h>
+#include <linux/timer.h>
+#include <linux/sched/rt.h>
+#include <linux/sched/signal.h>
+#include <linux/mm_inline.h>
+#include <trace/events/writeback.h>
+
+#include "internal.h"
+
+/*
+ * Sleep at most 200ms at a time in balance_dirty_pages().
+ */
+#define MAX_PAUSE max(HZ/5, 1)
+
+/*
+ * Try to keep balance_dirty_pages() call intervals higher than this many pages
+ * by raising pause time to max_pause when falls below it.
+ */
+#define DIRTY_POLL_THRESH (128 >> (PAGE_SHIFT - 10))
+
+/*
+ * Estimate write bandwidth at 200ms intervals.
+ */
+#define BANDWIDTH_INTERVAL max(HZ/5, 1)
+
+#define RATELIMIT_CALC_SHIFT 10
+
+/*
+ * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
+ * will look to see if it needs to force writeback or throttling.
+ */
+static long ratelimit_pages = 32;
+
+/* The following parameters are exported via /proc/sys/vm */
+
+/*
+ * Start background writeback (via writeback threads) at this percentage
+ */
+int dirty_background_ratio = 10;
+
+/*
+ * dirty_background_bytes starts at 0 (disabled) so that it is a function of
+ * dirty_background_ratio * the amount of dirtyable memory
+ */
+unsigned long dirty_background_bytes;
+
+/*
+ * free highmem will not be subtracted from the total free memory
+ * for calculating free ratios if vm_highmem_is_dirtyable is true
+ */
+int vm_highmem_is_dirtyable;
+
+/*
+ * The generator of dirty data starts writeback at this percentage
+ */
+int vm_dirty_ratio = 20;
+
+/*
+ * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
+ * vm_dirty_ratio * the amount of dirtyable memory
+ */
+unsigned long vm_dirty_bytes;
+
+/*
+ * The interval between `kupdate'-style writebacks
+ */
+unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
+
+EXPORT_SYMBOL_GPL(dirty_writeback_interval);
+
+/*
+ * The longest time for which data is allowed to remain dirty
+ */
+unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
+
+/*
+ * Flag that makes the machine dump writes/reads and block dirtyings.
+ */
+int block_dump;
+
+/*
+ * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
+ * a full sync is triggered after this time elapses without any disk activity.
+ */
+int laptop_mode;
+
+EXPORT_SYMBOL(laptop_mode);
+
+/* End of sysctl-exported parameters */
+
+struct wb_domain global_wb_domain;
+
+/* consolidated parameters for balance_dirty_pages() and its subroutines */
+struct dirty_throttle_control {
+#ifdef CONFIG_CGROUP_WRITEBACK
+ struct wb_domain *dom;
+ struct dirty_throttle_control *gdtc; /* only set in memcg dtc's */
+#endif
+ struct bdi_writeback *wb;
+ struct fprop_local_percpu *wb_completions;
+
+ unsigned long avail; /* dirtyable */
+ unsigned long dirty; /* file_dirty + write + nfs */
+ unsigned long thresh; /* dirty threshold */
+ unsigned long bg_thresh; /* dirty background threshold */
+
+ unsigned long wb_dirty; /* per-wb counterparts */
+ unsigned long wb_thresh;
+ unsigned long wb_bg_thresh;
+
+ unsigned long pos_ratio;
+};
+
+/*
+ * Length of period for aging writeout fractions of bdis. This is an
+ * arbitrarily chosen number. The longer the period, the slower fractions will
+ * reflect changes in current writeout rate.
+ */
+#define VM_COMPLETIONS_PERIOD_LEN (3*HZ)
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+#define GDTC_INIT(__wb) .wb = (__wb), \
+ .dom = &global_wb_domain, \
+ .wb_completions = &(__wb)->completions
+
+#define GDTC_INIT_NO_WB .dom = &global_wb_domain
+
+#define MDTC_INIT(__wb, __gdtc) .wb = (__wb), \
+ .dom = mem_cgroup_wb_domain(__wb), \
+ .wb_completions = &(__wb)->memcg_completions, \
+ .gdtc = __gdtc
+
+static bool mdtc_valid(struct dirty_throttle_control *dtc)
+{
+ return dtc->dom;
+}
+
+static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
+{
+ return dtc->dom;
+}
+
+static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
+{
+ return mdtc->gdtc;
+}
+
+static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
+{
+ return &wb->memcg_completions;
+}
+
+static void wb_min_max_ratio(struct bdi_writeback *wb,
+ unsigned long *minp, unsigned long *maxp)
+{
+ unsigned long this_bw = wb->avg_write_bandwidth;
+ unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
+ unsigned long long min = wb->bdi->min_ratio;
+ unsigned long long max = wb->bdi->max_ratio;
+
+ /*
+ * @wb may already be clean by the time control reaches here and
+ * the total may not include its bw.
+ */
+ if (this_bw < tot_bw) {
+ if (min) {
+ min *= this_bw;
+ min = div64_ul(min, tot_bw);
+ }
+ if (max < 100) {
+ max *= this_bw;
+ max = div64_ul(max, tot_bw);
+ }
+ }
+
+ *minp = min;
+ *maxp = max;
+}
+
+#else /* CONFIG_CGROUP_WRITEBACK */
+
+#define GDTC_INIT(__wb) .wb = (__wb), \
+ .wb_completions = &(__wb)->completions
+#define GDTC_INIT_NO_WB
+#define MDTC_INIT(__wb, __gdtc)
+
+static bool mdtc_valid(struct dirty_throttle_control *dtc)
+{
+ return false;
+}
+
+static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
+{
+ return &global_wb_domain;
+}
+
+static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
+{
+ return NULL;
+}
+
+static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
+{
+ return NULL;
+}
+
+static void wb_min_max_ratio(struct bdi_writeback *wb,
+ unsigned long *minp, unsigned long *maxp)
+{
+ *minp = wb->bdi->min_ratio;
+ *maxp = wb->bdi->max_ratio;
+}
+
+#endif /* CONFIG_CGROUP_WRITEBACK */
+
+/*
+ * In a memory zone, there is a certain amount of pages we consider
+ * available for the page cache, which is essentially the number of
+ * free and reclaimable pages, minus some zone reserves to protect
+ * lowmem and the ability to uphold the zone's watermarks without
+ * requiring writeback.
+ *
+ * This number of dirtyable pages is the base value of which the
+ * user-configurable dirty ratio is the effective number of pages that
+ * are allowed to be actually dirtied. Per individual zone, or
+ * globally by using the sum of dirtyable pages over all zones.
+ *
+ * Because the user is allowed to specify the dirty limit globally as
+ * absolute number of bytes, calculating the per-zone dirty limit can
+ * require translating the configured limit into a percentage of
+ * global dirtyable memory first.
+ */
+
+/**
+ * node_dirtyable_memory - number of dirtyable pages in a node
+ * @pgdat: the node
+ *
+ * Return: the node's number of pages potentially available for dirty
+ * page cache. This is the base value for the per-node dirty limits.
+ */
+static unsigned long node_dirtyable_memory(struct pglist_data *pgdat)
+{
+ unsigned long nr_pages = 0;
+ int z;
+
+ for (z = 0; z < MAX_NR_ZONES; z++) {
+ struct zone *zone = pgdat->node_zones + z;
+
+ if (!populated_zone(zone))
+ continue;
+
+ nr_pages += zone_page_state(zone, NR_FREE_PAGES);
+ }
+
+ /*
+ * Pages reserved for the kernel should not be considered
+ * dirtyable, to prevent a situation where reclaim has to
+ * clean pages in order to balance the zones.
+ */
+ nr_pages -= min(nr_pages, pgdat->totalreserve_pages);
+
+ nr_pages += node_page_state(pgdat, NR_INACTIVE_FILE);
+ nr_pages += node_page_state(pgdat, NR_ACTIVE_FILE);
+
+ return nr_pages;
+}
+
+static unsigned long highmem_dirtyable_memory(unsigned long total)
+{
+#ifdef CONFIG_HIGHMEM
+ int node;
+ unsigned long x = 0;
+ int i;
+
+ for_each_node_state(node, N_HIGH_MEMORY) {
+ for (i = ZONE_NORMAL + 1; i < MAX_NR_ZONES; i++) {
+ struct zone *z;
+ unsigned long nr_pages;
+
+ if (!is_highmem_idx(i))
+ continue;
+
+ z = &NODE_DATA(node)->node_zones[i];
+ if (!populated_zone(z))
+ continue;
+
+ nr_pages = zone_page_state(z, NR_FREE_PAGES);
+ /* watch for underflows */
+ nr_pages -= min(nr_pages, high_wmark_pages(z));
+ nr_pages += zone_page_state(z, NR_ZONE_INACTIVE_FILE);
+ nr_pages += zone_page_state(z, NR_ZONE_ACTIVE_FILE);
+ x += nr_pages;
+ }
+ }
+
+ /*
+ * Unreclaimable memory (kernel memory or anonymous memory
+ * without swap) can bring down the dirtyable pages below
+ * the zone's dirty balance reserve and the above calculation
+ * will underflow. However we still want to add in nodes
+ * which are below threshold (negative values) to get a more
+ * accurate calculation but make sure that the total never
+ * underflows.
+ */
+ if ((long)x < 0)
+ x = 0;
+
+ /*
+ * Make sure that the number of highmem pages is never larger
+ * than the number of the total dirtyable memory. This can only
+ * occur in very strange VM situations but we want to make sure
+ * that this does not occur.
+ */
+ return min(x, total);
+#else
+ return 0;
+#endif
+}
+
+/**
+ * global_dirtyable_memory - number of globally dirtyable pages
+ *
+ * Return: the global number of pages potentially available for dirty
+ * page cache. This is the base value for the global dirty limits.
+ */
+static unsigned long global_dirtyable_memory(void)
+{
+ unsigned long x;
+
+ x = global_zone_page_state(NR_FREE_PAGES);
+ /*
+ * Pages reserved for the kernel should not be considered
+ * dirtyable, to prevent a situation where reclaim has to
+ * clean pages in order to balance the zones.
+ */
+ x -= min(x, totalreserve_pages);
+
+ x += global_node_page_state(NR_INACTIVE_FILE);
+ x += global_node_page_state(NR_ACTIVE_FILE);
+
+ if (!vm_highmem_is_dirtyable)
+ x -= highmem_dirtyable_memory(x);
+
+ return x + 1; /* Ensure that we never return 0 */
+}
+
+/**
+ * domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain
+ * @dtc: dirty_throttle_control of interest
+ *
+ * Calculate @dtc->thresh and ->bg_thresh considering
+ * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller
+ * must ensure that @dtc->avail is set before calling this function. The
+ * dirty limits will be lifted by 1/4 for real-time tasks.
+ */
+static void domain_dirty_limits(struct dirty_throttle_control *dtc)
+{
+ const unsigned long available_memory = dtc->avail;
+ struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc);
+ unsigned long bytes = vm_dirty_bytes;
+ unsigned long bg_bytes = dirty_background_bytes;
+ /* convert ratios to per-PAGE_SIZE for higher precision */
+ unsigned long ratio = (vm_dirty_ratio * PAGE_SIZE) / 100;
+ unsigned long bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100;
+ unsigned long thresh;
+ unsigned long bg_thresh;
+ struct task_struct *tsk;
+
+ /* gdtc is !NULL iff @dtc is for memcg domain */
+ if (gdtc) {
+ unsigned long global_avail = gdtc->avail;
+
+ /*
+ * The byte settings can't be applied directly to memcg
+ * domains. Convert them to ratios by scaling against
+ * globally available memory. As the ratios are in
+ * per-PAGE_SIZE, they can be obtained by dividing bytes by
+ * number of pages.
+ */
+ if (bytes)
+ ratio = min(DIV_ROUND_UP(bytes, global_avail),
+ PAGE_SIZE);
+ if (bg_bytes)
+ bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail),
+ PAGE_SIZE);
+ bytes = bg_bytes = 0;
+ }
+
+ if (bytes)
+ thresh = DIV_ROUND_UP(bytes, PAGE_SIZE);
+ else
+ thresh = (ratio * available_memory) / PAGE_SIZE;
+
+ if (bg_bytes)
+ bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE);
+ else
+ bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE;
+
+ if (bg_thresh >= thresh)
+ bg_thresh = thresh / 2;
+ tsk = current;
+ if (rt_task(tsk)) {
+ bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
+ thresh += thresh / 4 + global_wb_domain.dirty_limit / 32;
+ }
+ dtc->thresh = thresh;
+ dtc->bg_thresh = bg_thresh;
+
+ /* we should eventually report the domain in the TP */
+ if (!gdtc)
+ trace_global_dirty_state(bg_thresh, thresh);
+}
+
+/**
+ * global_dirty_limits - background-writeback and dirty-throttling thresholds
+ * @pbackground: out parameter for bg_thresh
+ * @pdirty: out parameter for thresh
+ *
+ * Calculate bg_thresh and thresh for global_wb_domain. See
+ * domain_dirty_limits() for details.
+ */
+void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
+{
+ struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
+
+ gdtc.avail = global_dirtyable_memory();
+ domain_dirty_limits(&gdtc);
+
+ *pbackground = gdtc.bg_thresh;
+ *pdirty = gdtc.thresh;
+}
+
+/**
+ * node_dirty_limit - maximum number of dirty pages allowed in a node
+ * @pgdat: the node
+ *
+ * Return: the maximum number of dirty pages allowed in a node, based
+ * on the node's dirtyable memory.
+ */
+static unsigned long node_dirty_limit(struct pglist_data *pgdat)
+{
+ unsigned long node_memory = node_dirtyable_memory(pgdat);
+ struct task_struct *tsk = current;
+ unsigned long dirty;
+
+ if (vm_dirty_bytes)
+ dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) *
+ node_memory / global_dirtyable_memory();
+ else
+ dirty = vm_dirty_ratio * node_memory / 100;
+
+ if (rt_task(tsk))
+ dirty += dirty / 4;
+
+ return dirty;
+}
+
+/**
+ * node_dirty_ok - tells whether a node is within its dirty limits
+ * @pgdat: the node to check
+ *
+ * Return: %true when the dirty pages in @pgdat are within the node's
+ * dirty limit, %false if the limit is exceeded.
+ */
+bool node_dirty_ok(struct pglist_data *pgdat)
+{
+ unsigned long limit = node_dirty_limit(pgdat);
+ unsigned long nr_pages = 0;
+
+ nr_pages += node_page_state(pgdat, NR_FILE_DIRTY);
+ nr_pages += node_page_state(pgdat, NR_WRITEBACK);
+
+ return nr_pages <= limit;
+}
+
+int dirty_background_ratio_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret;
+
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (ret == 0 && write)
+ dirty_background_bytes = 0;
+ return ret;
+}
+
+int dirty_background_bytes_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret;
+
+ ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
+ if (ret == 0 && write)
+ dirty_background_ratio = 0;
+ return ret;
+}
+
+int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int old_ratio = vm_dirty_ratio;
+ int ret;
+
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
+ writeback_set_ratelimit();
+ vm_dirty_bytes = 0;
+ }
+ return ret;
+}
+
+int dirty_bytes_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ unsigned long old_bytes = vm_dirty_bytes;
+ int ret;
+
+ ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
+ if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
+ writeback_set_ratelimit();
+ vm_dirty_ratio = 0;
+ }
+ return ret;
+}
+
+static unsigned long wp_next_time(unsigned long cur_time)
+{
+ cur_time += VM_COMPLETIONS_PERIOD_LEN;
+ /* 0 has a special meaning... */
+ if (!cur_time)
+ return 1;
+ return cur_time;
+}
+
+static void wb_domain_writeout_inc(struct wb_domain *dom,
+ struct fprop_local_percpu *completions,
+ unsigned int max_prop_frac)
+{
+ __fprop_inc_percpu_max(&dom->completions, completions,
+ max_prop_frac);
+ /* First event after period switching was turned off? */
+ if (unlikely(!dom->period_time)) {
+ /*
+ * We can race with other __bdi_writeout_inc calls here but
+ * it does not cause any harm since the resulting time when
+ * timer will fire and what is in writeout_period_time will be
+ * roughly the same.
+ */
+ dom->period_time = wp_next_time(jiffies);
+ mod_timer(&dom->period_timer, dom->period_time);
+ }
+}
+
+/*
+ * Increment @wb's writeout completion count and the global writeout
+ * completion count. Called from test_clear_page_writeback().
+ */
+static inline void __wb_writeout_inc(struct bdi_writeback *wb)
+{
+ struct wb_domain *cgdom;
+
+ inc_wb_stat(wb, WB_WRITTEN);
+ wb_domain_writeout_inc(&global_wb_domain, &wb->completions,
+ wb->bdi->max_prop_frac);
+
+ cgdom = mem_cgroup_wb_domain(wb);
+ if (cgdom)
+ wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb),
+ wb->bdi->max_prop_frac);
+}
+
+void wb_writeout_inc(struct bdi_writeback *wb)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __wb_writeout_inc(wb);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(wb_writeout_inc);
+
+/*
+ * On idle system, we can be called long after we scheduled because we use
+ * deferred timers so count with missed periods.
+ */
+static void writeout_period(struct timer_list *t)
+{
+ struct wb_domain *dom = from_timer(dom, t, period_timer);
+ int miss_periods = (jiffies - dom->period_time) /
+ VM_COMPLETIONS_PERIOD_LEN;
+
+ if (fprop_new_period(&dom->completions, miss_periods + 1)) {
+ dom->period_time = wp_next_time(dom->period_time +
+ miss_periods * VM_COMPLETIONS_PERIOD_LEN);
+ mod_timer(&dom->period_timer, dom->period_time);
+ } else {
+ /*
+ * Aging has zeroed all fractions. Stop wasting CPU on period
+ * updates.
+ */
+ dom->period_time = 0;
+ }
+}
+
+int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
+{
+ memset(dom, 0, sizeof(*dom));
+
+ spin_lock_init(&dom->lock);
+
+ timer_setup(&dom->period_timer, writeout_period, TIMER_DEFERRABLE);
+
+ dom->dirty_limit_tstamp = jiffies;
+
+ return fprop_global_init(&dom->completions, gfp);
+}
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+void wb_domain_exit(struct wb_domain *dom)
+{
+ del_timer_sync(&dom->period_timer);
+ fprop_global_destroy(&dom->completions);
+}
+#endif
+
+/*
+ * bdi_min_ratio keeps the sum of the minimum dirty shares of all
+ * registered backing devices, which, for obvious reasons, can not
+ * exceed 100%.
+ */
+static unsigned int bdi_min_ratio;
+
+int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
+{
+ int ret = 0;
+
+ spin_lock_bh(&bdi_lock);
+ if (min_ratio > bdi->max_ratio) {
+ ret = -EINVAL;
+ } else {
+ min_ratio -= bdi->min_ratio;
+ if (bdi_min_ratio + min_ratio < 100) {
+ bdi_min_ratio += min_ratio;
+ bdi->min_ratio += min_ratio;
+ } else {
+ ret = -EINVAL;
+ }
+ }
+ spin_unlock_bh(&bdi_lock);
+
+ return ret;
+}
+
+int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
+{
+ int ret = 0;
+
+ if (max_ratio > 100)
+ return -EINVAL;
+
+ spin_lock_bh(&bdi_lock);
+ if (bdi->min_ratio > max_ratio) {
+ ret = -EINVAL;
+ } else {
+ bdi->max_ratio = max_ratio;
+ bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100;
+ }
+ spin_unlock_bh(&bdi_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL(bdi_set_max_ratio);
+
+static unsigned long dirty_freerun_ceiling(unsigned long thresh,
+ unsigned long bg_thresh)
+{
+ return (thresh + bg_thresh) / 2;
+}
+
+static unsigned long hard_dirty_limit(struct wb_domain *dom,
+ unsigned long thresh)
+{
+ return max(thresh, dom->dirty_limit);
+}
+
+/*
+ * Memory which can be further allocated to a memcg domain is capped by
+ * system-wide clean memory excluding the amount being used in the domain.
+ */
+static void mdtc_calc_avail(struct dirty_throttle_control *mdtc,
+ unsigned long filepages, unsigned long headroom)
+{
+ struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc);
+ unsigned long clean = filepages - min(filepages, mdtc->dirty);
+ unsigned long global_clean = gdtc->avail - min(gdtc->avail, gdtc->dirty);
+ unsigned long other_clean = global_clean - min(global_clean, clean);
+
+ mdtc->avail = filepages + min(headroom, other_clean);
+}
+
+/**
+ * __wb_calc_thresh - @wb's share of dirty throttling threshold
+ * @dtc: dirty_throttle_context of interest
+ *
+ * Note that balance_dirty_pages() will only seriously take it as a hard limit
+ * when sleeping max_pause per page is not enough to keep the dirty pages under
+ * control. For example, when the device is completely stalled due to some error
+ * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key.
+ * In the other normal situations, it acts more gently by throttling the tasks
+ * more (rather than completely block them) when the wb dirty pages go high.
+ *
+ * It allocates high/low dirty limits to fast/slow devices, in order to prevent
+ * - starving fast devices
+ * - piling up dirty pages (that will take long time to sync) on slow devices
+ *
+ * The wb's share of dirty limit will be adapting to its throughput and
+ * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
+ *
+ * Return: @wb's dirty limit in pages. The term "dirty" in the context of
+ * dirty balancing includes all PG_dirty and PG_writeback pages.
+ */
+static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
+{
+ struct wb_domain *dom = dtc_dom(dtc);
+ unsigned long thresh = dtc->thresh;
+ u64 wb_thresh;
+ unsigned long numerator, denominator;
+ unsigned long wb_min_ratio, wb_max_ratio;
+
+ /*
+ * Calculate this BDI's share of the thresh ratio.
+ */
+ fprop_fraction_percpu(&dom->completions, dtc->wb_completions,
+ &numerator, &denominator);
+
+ wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100;
+ wb_thresh *= numerator;
+ wb_thresh = div64_ul(wb_thresh, denominator);
+
+ wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio);
+
+ wb_thresh += (thresh * wb_min_ratio) / 100;
+ if (wb_thresh > (thresh * wb_max_ratio) / 100)
+ wb_thresh = thresh * wb_max_ratio / 100;
+
+ return wb_thresh;
+}
+
+unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
+{
+ struct dirty_throttle_control gdtc = { GDTC_INIT(wb),
+ .thresh = thresh };
+ return __wb_calc_thresh(&gdtc);
+}
+
+/*
+ * setpoint - dirty 3
+ * f(dirty) := 1.0 + (----------------)
+ * limit - setpoint
+ *
+ * it's a 3rd order polynomial that subjects to
+ *
+ * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast
+ * (2) f(setpoint) = 1.0 => the balance point
+ * (3) f(limit) = 0 => the hard limit
+ * (4) df/dx <= 0 => negative feedback control
+ * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
+ * => fast response on large errors; small oscillation near setpoint
+ */
+static long long pos_ratio_polynom(unsigned long setpoint,
+ unsigned long dirty,
+ unsigned long limit)
+{
+ long long pos_ratio;
+ long x;
+
+ x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
+ (limit - setpoint) | 1);
+ pos_ratio = x;
+ pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
+ pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
+ pos_ratio += 1 << RATELIMIT_CALC_SHIFT;
+
+ return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT);
+}
+
+/*
+ * Dirty position control.
+ *
+ * (o) global/bdi setpoints
+ *
+ * We want the dirty pages be balanced around the global/wb setpoints.
+ * When the number of dirty pages is higher/lower than the setpoint, the
+ * dirty position control ratio (and hence task dirty ratelimit) will be
+ * decreased/increased to bring the dirty pages back to the setpoint.
+ *
+ * pos_ratio = 1 << RATELIMIT_CALC_SHIFT
+ *
+ * if (dirty < setpoint) scale up pos_ratio
+ * if (dirty > setpoint) scale down pos_ratio
+ *
+ * if (wb_dirty < wb_setpoint) scale up pos_ratio
+ * if (wb_dirty > wb_setpoint) scale down pos_ratio
+ *
+ * task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT
+ *
+ * (o) global control line
+ *
+ * ^ pos_ratio
+ * |
+ * | |<===== global dirty control scope ======>|
+ * 2.0 .............*
+ * | .*
+ * | . *
+ * | . *
+ * | . *
+ * | . *
+ * | . *
+ * 1.0 ................................*
+ * | . . *
+ * | . . *
+ * | . . *
+ * | . . *
+ * | . . *
+ * 0 +------------.------------------.----------------------*------------->
+ * freerun^ setpoint^ limit^ dirty pages
+ *
+ * (o) wb control line
+ *
+ * ^ pos_ratio
+ * |
+ * | *
+ * | *
+ * | *
+ * | *
+ * | * |<=========== span ============>|
+ * 1.0 .......................*
+ * | . *
+ * | . *
+ * | . *
+ * | . *
+ * | . *
+ * | . *
+ * | . *
+ * | . *
+ * | . *
+ * | . *
+ * | . *
+ * 1/4 ...............................................* * * * * * * * * * * *
+ * | . .
+ * | . .
+ * | . .
+ * 0 +----------------------.-------------------------------.------------->
+ * wb_setpoint^ x_intercept^
+ *
+ * The wb control line won't drop below pos_ratio=1/4, so that wb_dirty can
+ * be smoothly throttled down to normal if it starts high in situations like
+ * - start writing to a slow SD card and a fast disk at the same time. The SD
+ * card's wb_dirty may rush to many times higher than wb_setpoint.
+ * - the wb dirty thresh drops quickly due to change of JBOD workload
+ */
+static void wb_position_ratio(struct dirty_throttle_control *dtc)
+{
+ struct bdi_writeback *wb = dtc->wb;
+ unsigned long write_bw = wb->avg_write_bandwidth;
+ unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
+ unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
+ unsigned long wb_thresh = dtc->wb_thresh;
+ unsigned long x_intercept;
+ unsigned long setpoint; /* dirty pages' target balance point */
+ unsigned long wb_setpoint;
+ unsigned long span;
+ long long pos_ratio; /* for scaling up/down the rate limit */
+ long x;
+
+ dtc->pos_ratio = 0;
+
+ if (unlikely(dtc->dirty >= limit))
+ return;
+
+ /*
+ * global setpoint
+ *
+ * See comment for pos_ratio_polynom().
+ */
+ setpoint = (freerun + limit) / 2;
+ pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit);
+
+ /*
+ * The strictlimit feature is a tool preventing mistrusted filesystems
+ * from growing a large number of dirty pages before throttling. For
+ * such filesystems balance_dirty_pages always checks wb counters
+ * against wb limits. Even if global "nr_dirty" is under "freerun".
+ * This is especially important for fuse which sets bdi->max_ratio to
+ * 1% by default. Without strictlimit feature, fuse writeback may
+ * consume arbitrary amount of RAM because it is accounted in
+ * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty".
+ *
+ * Here, in wb_position_ratio(), we calculate pos_ratio based on
+ * two values: wb_dirty and wb_thresh. Let's consider an example:
+ * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global
+ * limits are set by default to 10% and 20% (background and throttle).
+ * Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
+ * wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is
+ * about ~6K pages (as the average of background and throttle wb
+ * limits). The 3rd order polynomial will provide positive feedback if
+ * wb_dirty is under wb_setpoint and vice versa.
+ *
+ * Note, that we cannot use global counters in these calculations
+ * because we want to throttle process writing to a strictlimit wb
+ * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB
+ * in the example above).
+ */
+ if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
+ long long wb_pos_ratio;
+
+ if (dtc->wb_dirty < 8) {
+ dtc->pos_ratio = min_t(long long, pos_ratio * 2,
+ 2 << RATELIMIT_CALC_SHIFT);
+ return;
+ }
+
+ if (dtc->wb_dirty >= wb_thresh)
+ return;
+
+ wb_setpoint = dirty_freerun_ceiling(wb_thresh,
+ dtc->wb_bg_thresh);
+
+ if (wb_setpoint == 0 || wb_setpoint == wb_thresh)
+ return;
+
+ wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty,
+ wb_thresh);
+
+ /*
+ * Typically, for strictlimit case, wb_setpoint << setpoint
+ * and pos_ratio >> wb_pos_ratio. In the other words global
+ * state ("dirty") is not limiting factor and we have to
+ * make decision based on wb counters. But there is an
+ * important case when global pos_ratio should get precedence:
+ * global limits are exceeded (e.g. due to activities on other
+ * wb's) while given strictlimit wb is below limit.
+ *
+ * "pos_ratio * wb_pos_ratio" would work for the case above,
+ * but it would look too non-natural for the case of all
+ * activity in the system coming from a single strictlimit wb
+ * with bdi->max_ratio == 100%.
+ *
+ * Note that min() below somewhat changes the dynamics of the
+ * control system. Normally, pos_ratio value can be well over 3
+ * (when globally we are at freerun and wb is well below wb
+ * setpoint). Now the maximum pos_ratio in the same situation
+ * is 2. We might want to tweak this if we observe the control
+ * system is too slow to adapt.
+ */
+ dtc->pos_ratio = min(pos_ratio, wb_pos_ratio);
+ return;
+ }
+
+ /*
+ * We have computed basic pos_ratio above based on global situation. If
+ * the wb is over/under its share of dirty pages, we want to scale
+ * pos_ratio further down/up. That is done by the following mechanism.
+ */
+
+ /*
+ * wb setpoint
+ *
+ * f(wb_dirty) := 1.0 + k * (wb_dirty - wb_setpoint)
+ *
+ * x_intercept - wb_dirty
+ * := --------------------------
+ * x_intercept - wb_setpoint
+ *
+ * The main wb control line is a linear function that subjects to
+ *
+ * (1) f(wb_setpoint) = 1.0
+ * (2) k = - 1 / (8 * write_bw) (in single wb case)
+ * or equally: x_intercept = wb_setpoint + 8 * write_bw
+ *
+ * For single wb case, the dirty pages are observed to fluctuate
+ * regularly within range
+ * [wb_setpoint - write_bw/2, wb_setpoint + write_bw/2]
+ * for various filesystems, where (2) can yield in a reasonable 12.5%
+ * fluctuation range for pos_ratio.
+ *
+ * For JBOD case, wb_thresh (not wb_dirty!) could fluctuate up to its
+ * own size, so move the slope over accordingly and choose a slope that
+ * yields 100% pos_ratio fluctuation on suddenly doubled wb_thresh.
+ */
+ if (unlikely(wb_thresh > dtc->thresh))
+ wb_thresh = dtc->thresh;
+ /*
+ * It's very possible that wb_thresh is close to 0 not because the
+ * device is slow, but that it has remained inactive for long time.
+ * Honour such devices a reasonable good (hopefully IO efficient)
+ * threshold, so that the occasional writes won't be blocked and active
+ * writes can rampup the threshold quickly.
+ */
+ wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8);
+ /*
+ * scale global setpoint to wb's:
+ * wb_setpoint = setpoint * wb_thresh / thresh
+ */
+ x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1);
+ wb_setpoint = setpoint * (u64)x >> 16;
+ /*
+ * Use span=(8*write_bw) in single wb case as indicated by
+ * (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case.
+ *
+ * wb_thresh thresh - wb_thresh
+ * span = --------- * (8 * write_bw) + ------------------ * wb_thresh
+ * thresh thresh
+ */
+ span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16;
+ x_intercept = wb_setpoint + span;
+
+ if (dtc->wb_dirty < x_intercept - span / 4) {
+ pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty),
+ (x_intercept - wb_setpoint) | 1);
+ } else
+ pos_ratio /= 4;
+
+ /*
+ * wb reserve area, safeguard against dirty pool underrun and disk idle
+ * It may push the desired control point of global dirty pages higher
+ * than setpoint.
+ */
+ x_intercept = wb_thresh / 2;
+ if (dtc->wb_dirty < x_intercept) {
+ if (dtc->wb_dirty > x_intercept / 8)
+ pos_ratio = div_u64(pos_ratio * x_intercept,
+ dtc->wb_dirty);
+ else
+ pos_ratio *= 8;
+ }
+
+ dtc->pos_ratio = pos_ratio;
+}
+
+static void wb_update_write_bandwidth(struct bdi_writeback *wb,
+ unsigned long elapsed,
+ unsigned long written)
+{
+ const unsigned long period = roundup_pow_of_two(3 * HZ);
+ unsigned long avg = wb->avg_write_bandwidth;
+ unsigned long old = wb->write_bandwidth;
+ u64 bw;
+
+ /*
+ * bw = written * HZ / elapsed
+ *
+ * bw * elapsed + write_bandwidth * (period - elapsed)
+ * write_bandwidth = ---------------------------------------------------
+ * period
+ *
+ * @written may have decreased due to account_page_redirty().
+ * Avoid underflowing @bw calculation.
+ */
+ bw = written - min(written, wb->written_stamp);
+ bw *= HZ;
+ if (unlikely(elapsed > period)) {
+ bw = div64_ul(bw, elapsed);
+ avg = bw;
+ goto out;
+ }
+ bw += (u64)wb->write_bandwidth * (period - elapsed);
+ bw >>= ilog2(period);
+
+ /*
+ * one more level of smoothing, for filtering out sudden spikes
+ */
+ if (avg > old && old >= (unsigned long)bw)
+ avg -= (avg - old) >> 3;
+
+ if (avg < old && old <= (unsigned long)bw)
+ avg += (old - avg) >> 3;
+
+out:
+ /* keep avg > 0 to guarantee that tot > 0 if there are dirty wbs */
+ avg = max(avg, 1LU);
+ if (wb_has_dirty_io(wb)) {
+ long delta = avg - wb->avg_write_bandwidth;
+ WARN_ON_ONCE(atomic_long_add_return(delta,
+ &wb->bdi->tot_write_bandwidth) <= 0);
+ }
+ wb->write_bandwidth = bw;
+ wb->avg_write_bandwidth = avg;
+}
+
+static void update_dirty_limit(struct dirty_throttle_control *dtc)
+{
+ struct wb_domain *dom = dtc_dom(dtc);
+ unsigned long thresh = dtc->thresh;
+ unsigned long limit = dom->dirty_limit;
+
+ /*
+ * Follow up in one step.
+ */
+ if (limit < thresh) {
+ limit = thresh;
+ goto update;
+ }
+
+ /*
+ * Follow down slowly. Use the higher one as the target, because thresh
+ * may drop below dirty. This is exactly the reason to introduce
+ * dom->dirty_limit which is guaranteed to lie above the dirty pages.
+ */
+ thresh = max(thresh, dtc->dirty);
+ if (limit > thresh) {
+ limit -= (limit - thresh) >> 5;
+ goto update;
+ }
+ return;
+update:
+ dom->dirty_limit = limit;
+}
+
+static void domain_update_bandwidth(struct dirty_throttle_control *dtc,
+ unsigned long now)
+{
+ struct wb_domain *dom = dtc_dom(dtc);
+
+ /*
+ * check locklessly first to optimize away locking for the most time
+ */
+ if (time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL))
+ return;
+
+ spin_lock(&dom->lock);
+ if (time_after_eq(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) {
+ update_dirty_limit(dtc);
+ dom->dirty_limit_tstamp = now;
+ }
+ spin_unlock(&dom->lock);
+}
+
+/*
+ * Maintain wb->dirty_ratelimit, the base dirty throttle rate.
+ *
+ * Normal wb tasks will be curbed at or below it in long term.
+ * Obviously it should be around (write_bw / N) when there are N dd tasks.
+ */
+static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
+ unsigned long dirtied,
+ unsigned long elapsed)
+{
+ struct bdi_writeback *wb = dtc->wb;
+ unsigned long dirty = dtc->dirty;
+ unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
+ unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
+ unsigned long setpoint = (freerun + limit) / 2;
+ unsigned long write_bw = wb->avg_write_bandwidth;
+ unsigned long dirty_ratelimit = wb->dirty_ratelimit;
+ unsigned long dirty_rate;
+ unsigned long task_ratelimit;
+ unsigned long balanced_dirty_ratelimit;
+ unsigned long step;
+ unsigned long x;
+ unsigned long shift;
+
+ /*
+ * The dirty rate will match the writeout rate in long term, except
+ * when dirty pages are truncated by userspace or re-dirtied by FS.
+ */
+ dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed;
+
+ /*
+ * task_ratelimit reflects each dd's dirty rate for the past 200ms.
+ */
+ task_ratelimit = (u64)dirty_ratelimit *
+ dtc->pos_ratio >> RATELIMIT_CALC_SHIFT;
+ task_ratelimit++; /* it helps rampup dirty_ratelimit from tiny values */
+
+ /*
+ * A linear estimation of the "balanced" throttle rate. The theory is,
+ * if there are N dd tasks, each throttled at task_ratelimit, the wb's
+ * dirty_rate will be measured to be (N * task_ratelimit). So the below
+ * formula will yield the balanced rate limit (write_bw / N).
+ *
+ * Note that the expanded form is not a pure rate feedback:
+ * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) (1)
+ * but also takes pos_ratio into account:
+ * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio (2)
+ *
+ * (1) is not realistic because pos_ratio also takes part in balancing
+ * the dirty rate. Consider the state
+ * pos_ratio = 0.5 (3)
+ * rate = 2 * (write_bw / N) (4)
+ * If (1) is used, it will stuck in that state! Because each dd will
+ * be throttled at
+ * task_ratelimit = pos_ratio * rate = (write_bw / N) (5)
+ * yielding
+ * dirty_rate = N * task_ratelimit = write_bw (6)
+ * put (6) into (1) we get
+ * rate_(i+1) = rate_(i) (7)
+ *
+ * So we end up using (2) to always keep
+ * rate_(i+1) ~= (write_bw / N) (8)
+ * regardless of the value of pos_ratio. As long as (8) is satisfied,
+ * pos_ratio is able to drive itself to 1.0, which is not only where
+ * the dirty count meet the setpoint, but also where the slope of
+ * pos_ratio is most flat and hence task_ratelimit is least fluctuated.
+ */
+ balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw,
+ dirty_rate | 1);
+ /*
+ * balanced_dirty_ratelimit ~= (write_bw / N) <= write_bw
+ */
+ if (unlikely(balanced_dirty_ratelimit > write_bw))
+ balanced_dirty_ratelimit = write_bw;
+
+ /*
+ * We could safely do this and return immediately:
+ *
+ * wb->dirty_ratelimit = balanced_dirty_ratelimit;
+ *
+ * However to get a more stable dirty_ratelimit, the below elaborated
+ * code makes use of task_ratelimit to filter out singular points and
+ * limit the step size.
+ *
+ * The below code essentially only uses the relative value of
+ *
+ * task_ratelimit - dirty_ratelimit
+ * = (pos_ratio - 1) * dirty_ratelimit
+ *
+ * which reflects the direction and size of dirty position error.
+ */
+
+ /*
+ * dirty_ratelimit will follow balanced_dirty_ratelimit iff
+ * task_ratelimit is on the same side of dirty_ratelimit, too.
+ * For example, when
+ * - dirty_ratelimit > balanced_dirty_ratelimit
+ * - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint)
+ * lowering dirty_ratelimit will help meet both the position and rate
+ * control targets. Otherwise, don't update dirty_ratelimit if it will
+ * only help meet the rate target. After all, what the users ultimately
+ * feel and care are stable dirty rate and small position error.
+ *
+ * |task_ratelimit - dirty_ratelimit| is used to limit the step size
+ * and filter out the singular points of balanced_dirty_ratelimit. Which
+ * keeps jumping around randomly and can even leap far away at times
+ * due to the small 200ms estimation period of dirty_rate (we want to
+ * keep that period small to reduce time lags).
+ */
+ step = 0;
+
+ /*
+ * For strictlimit case, calculations above were based on wb counters
+ * and limits (starting from pos_ratio = wb_position_ratio() and up to
+ * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
+ * Hence, to calculate "step" properly, we have to use wb_dirty as
+ * "dirty" and wb_setpoint as "setpoint".
+ *
+ * We rampup dirty_ratelimit forcibly if wb_dirty is low because
+ * it's possible that wb_thresh is close to zero due to inactivity
+ * of backing device.
+ */
+ if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
+ dirty = dtc->wb_dirty;
+ if (dtc->wb_dirty < 8)
+ setpoint = dtc->wb_dirty + 1;
+ else
+ setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2;
+ }
+
+ if (dirty < setpoint) {
+ x = min3(wb->balanced_dirty_ratelimit,
+ balanced_dirty_ratelimit, task_ratelimit);
+ if (dirty_ratelimit < x)
+ step = x - dirty_ratelimit;
+ } else {
+ x = max3(wb->balanced_dirty_ratelimit,
+ balanced_dirty_ratelimit, task_ratelimit);
+ if (dirty_ratelimit > x)
+ step = dirty_ratelimit - x;
+ }
+
+ /*
+ * Don't pursue 100% rate matching. It's impossible since the balanced
+ * rate itself is constantly fluctuating. So decrease the track speed
+ * when it gets close to the target. Helps eliminate pointless tremors.
+ */
+ shift = dirty_ratelimit / (2 * step + 1);
+ if (shift < BITS_PER_LONG)
+ step = DIV_ROUND_UP(step >> shift, 8);
+ else
+ step = 0;
+
+ if (dirty_ratelimit < balanced_dirty_ratelimit)
+ dirty_ratelimit += step;
+ else
+ dirty_ratelimit -= step;
+
+ wb->dirty_ratelimit = max(dirty_ratelimit, 1UL);
+ wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
+
+ trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit);
+}
+
+static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
+ struct dirty_throttle_control *mdtc,
+ unsigned long start_time,
+ bool update_ratelimit)
+{
+ struct bdi_writeback *wb = gdtc->wb;
+ unsigned long now = jiffies;
+ unsigned long elapsed = now - wb->bw_time_stamp;
+ unsigned long dirtied;
+ unsigned long written;
+
+ lockdep_assert_held(&wb->list_lock);
+
+ /*
+ * rate-limit, only update once every 200ms.
+ */
+ if (elapsed < BANDWIDTH_INTERVAL)
+ return;
+
+ dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
+ written = percpu_counter_read(&wb->stat[WB_WRITTEN]);
+
+ /*
+ * Skip quiet periods when disk bandwidth is under-utilized.
+ * (at least 1s idle time between two flusher runs)
+ */
+ if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time))
+ goto snapshot;
+
+ if (update_ratelimit) {
+ domain_update_bandwidth(gdtc, now);
+ wb_update_dirty_ratelimit(gdtc, dirtied, elapsed);
+
+ /*
+ * @mdtc is always NULL if !CGROUP_WRITEBACK but the
+ * compiler has no way to figure that out. Help it.
+ */
+ if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) {
+ domain_update_bandwidth(mdtc, now);
+ wb_update_dirty_ratelimit(mdtc, dirtied, elapsed);
+ }
+ }
+ wb_update_write_bandwidth(wb, elapsed, written);
+
+snapshot:
+ wb->dirtied_stamp = dirtied;
+ wb->written_stamp = written;
+ wb->bw_time_stamp = now;
+}
+
+void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time)
+{
+ struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };
+
+ __wb_update_bandwidth(&gdtc, NULL, start_time, false);
+}
+
+/*
+ * After a task dirtied this many pages, balance_dirty_pages_ratelimited()
+ * will look to see if it needs to start dirty throttling.
+ *
+ * If dirty_poll_interval is too low, big NUMA machines will call the expensive
+ * global_zone_page_state() too often. So scale it near-sqrt to the safety margin
+ * (the number of pages we may dirty without exceeding the dirty limits).
+ */
+static unsigned long dirty_poll_interval(unsigned long dirty,
+ unsigned long thresh)
+{
+ if (thresh > dirty)
+ return 1UL << (ilog2(thresh - dirty) >> 1);
+
+ return 1;
+}
+
+static unsigned long wb_max_pause(struct bdi_writeback *wb,
+ unsigned long wb_dirty)
+{
+ unsigned long bw = wb->avg_write_bandwidth;
+ unsigned long t;
+
+ /*
+ * Limit pause time for small memory systems. If sleeping for too long
+ * time, a small pool of dirty/writeback pages may go empty and disk go
+ * idle.
+ *
+ * 8 serves as the safety ratio.
+ */
+ t = wb_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
+ t++;
+
+ return min_t(unsigned long, t, MAX_PAUSE);
+}
+
+static long wb_min_pause(struct bdi_writeback *wb,
+ long max_pause,
+ unsigned long task_ratelimit,
+ unsigned long dirty_ratelimit,
+ int *nr_dirtied_pause)
+{
+ long hi = ilog2(wb->avg_write_bandwidth);
+ long lo = ilog2(wb->dirty_ratelimit);
+ long t; /* target pause */
+ long pause; /* estimated next pause */
+ int pages; /* target nr_dirtied_pause */
+
+ /* target for 10ms pause on 1-dd case */
+ t = max(1, HZ / 100);
+
+ /*
+ * Scale up pause time for concurrent dirtiers in order to reduce CPU
+ * overheads.
+ *
+ * (N * 10ms) on 2^N concurrent tasks.
+ */
+ if (hi > lo)
+ t += (hi - lo) * (10 * HZ) / 1024;
+
+ /*
+ * This is a bit convoluted. We try to base the next nr_dirtied_pause
+ * on the much more stable dirty_ratelimit. However the next pause time
+ * will be computed based on task_ratelimit and the two rate limits may
+ * depart considerably at some time. Especially if task_ratelimit goes
+ * below dirty_ratelimit/2 and the target pause is max_pause, the next
+ * pause time will be max_pause*2 _trimmed down_ to max_pause. As a
+ * result task_ratelimit won't be executed faithfully, which could
+ * eventually bring down dirty_ratelimit.
+ *
+ * We apply two rules to fix it up:
+ * 1) try to estimate the next pause time and if necessary, use a lower
+ * nr_dirtied_pause so as not to exceed max_pause. When this happens,
+ * nr_dirtied_pause will be "dancing" with task_ratelimit.
+ * 2) limit the target pause time to max_pause/2, so that the normal
+ * small fluctuations of task_ratelimit won't trigger rule (1) and
+ * nr_dirtied_pause will remain as stable as dirty_ratelimit.
+ */
+ t = min(t, 1 + max_pause / 2);
+ pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
+
+ /*
+ * Tiny nr_dirtied_pause is found to hurt I/O performance in the test
+ * case fio-mmap-randwrite-64k, which does 16*{sync read, async write}.
+ * When the 16 consecutive reads are often interrupted by some dirty
+ * throttling pause during the async writes, cfq will go into idles
+ * (deadline is fine). So push nr_dirtied_pause as high as possible
+ * until reaches DIRTY_POLL_THRESH=32 pages.
+ */
+ if (pages < DIRTY_POLL_THRESH) {
+ t = max_pause;
+ pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
+ if (pages > DIRTY_POLL_THRESH) {
+ pages = DIRTY_POLL_THRESH;
+ t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit;
+ }
+ }
+
+ pause = HZ * pages / (task_ratelimit + 1);
+ if (pause > max_pause) {
+ t = max_pause;
+ pages = task_ratelimit * t / roundup_pow_of_two(HZ);
+ }
+
+ *nr_dirtied_pause = pages;
+ /*
+ * The minimal pause time will normally be half the target pause time.
+ */
+ return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
+}
+
+static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
+{
+ struct bdi_writeback *wb = dtc->wb;
+ unsigned long wb_reclaimable;
+
+ /*
+ * wb_thresh is not treated as some limiting factor as
+ * dirty_thresh, due to reasons
+ * - in JBOD setup, wb_thresh can fluctuate a lot
+ * - in a system with HDD and USB key, the USB key may somehow
+ * go into state (wb_dirty >> wb_thresh) either because
+ * wb_dirty starts high, or because wb_thresh drops low.
+ * In this case we don't want to hard throttle the USB key
+ * dirtiers for 100 seconds until wb_dirty drops under
+ * wb_thresh. Instead the auxiliary wb control line in
+ * wb_position_ratio() will let the dirtier task progress
+ * at some rate <= (write_bw / 2) for bringing down wb_dirty.
+ */
+ dtc->wb_thresh = __wb_calc_thresh(dtc);
+ dtc->wb_bg_thresh = dtc->thresh ?
+ div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;
+
+ /*
+ * In order to avoid the stacked BDI deadlock we need
+ * to ensure we accurately count the 'dirty' pages when
+ * the threshold is low.
+ *
+ * Otherwise it would be possible to get thresh+n pages
+ * reported dirty, even though there are thresh-m pages
+ * actually dirty; with m+n sitting in the percpu
+ * deltas.
+ */
+ if (dtc->wb_thresh < 2 * wb_stat_error()) {
+ wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
+ dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK);
+ } else {
+ wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE);
+ dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK);
+ }
+}
+
+/*
+ * balance_dirty_pages() must be called by processes which are generating dirty
+ * data. It looks at the number of dirty pages in the machine and will force
+ * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
+ * If we're over `background_thresh' then the writeback threads are woken to
+ * perform some writeout.
+ */
+static void balance_dirty_pages(struct bdi_writeback *wb,
+ unsigned long pages_dirtied)
+{
+ struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
+ struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
+ struct dirty_throttle_control * const gdtc = &gdtc_stor;
+ struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
+ &mdtc_stor : NULL;
+ struct dirty_throttle_control *sdtc;
+ unsigned long nr_reclaimable; /* = file_dirty */
+ long period;
+ long pause;
+ long max_pause;
+ long min_pause;
+ int nr_dirtied_pause;
+ bool dirty_exceeded = false;
+ unsigned long task_ratelimit;
+ unsigned long dirty_ratelimit;
+ struct backing_dev_info *bdi = wb->bdi;
+ bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
+ unsigned long start_time = jiffies;
+
+ for (;;) {
+ unsigned long now = jiffies;
+ unsigned long dirty, thresh, bg_thresh;
+ unsigned long m_dirty = 0; /* stop bogus uninit warnings */
+ unsigned long m_thresh = 0;
+ unsigned long m_bg_thresh = 0;
+
+ nr_reclaimable = global_node_page_state(NR_FILE_DIRTY);
+ gdtc->avail = global_dirtyable_memory();
+ gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK);
+
+ domain_dirty_limits(gdtc);
+
+ if (unlikely(strictlimit)) {
+ wb_dirty_limits(gdtc);
+
+ dirty = gdtc->wb_dirty;
+ thresh = gdtc->wb_thresh;
+ bg_thresh = gdtc->wb_bg_thresh;
+ } else {
+ dirty = gdtc->dirty;
+ thresh = gdtc->thresh;
+ bg_thresh = gdtc->bg_thresh;
+ }
+
+ if (mdtc) {
+ unsigned long filepages, headroom, writeback;
+
+ /*
+ * If @wb belongs to !root memcg, repeat the same
+ * basic calculations for the memcg domain.
+ */
+ mem_cgroup_wb_stats(wb, &filepages, &headroom,
+ &mdtc->dirty, &writeback);
+ mdtc->dirty += writeback;
+ mdtc_calc_avail(mdtc, filepages, headroom);
+
+ domain_dirty_limits(mdtc);
+
+ if (unlikely(strictlimit)) {
+ wb_dirty_limits(mdtc);
+ m_dirty = mdtc->wb_dirty;
+ m_thresh = mdtc->wb_thresh;
+ m_bg_thresh = mdtc->wb_bg_thresh;
+ } else {
+ m_dirty = mdtc->dirty;
+ m_thresh = mdtc->thresh;
+ m_bg_thresh = mdtc->bg_thresh;
+ }
+ }
+
+ /*
+ * Throttle it only when the background writeback cannot
+ * catch-up. This avoids (excessively) small writeouts
+ * when the wb limits are ramping up in case of !strictlimit.
+ *
+ * In strictlimit case make decision based on the wb counters
+ * and limits. Small writeouts when the wb limits are ramping
+ * up are the price we consciously pay for strictlimit-ing.
+ *
+ * If memcg domain is in effect, @dirty should be under
+ * both global and memcg freerun ceilings.
+ */
+ if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
+ (!mdtc ||
+ m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
+ unsigned long intv;
+ unsigned long m_intv;
+
+free_running:
+ intv = dirty_poll_interval(dirty, thresh);
+ m_intv = ULONG_MAX;
+
+ current->dirty_paused_when = now;
+ current->nr_dirtied = 0;
+ if (mdtc)
+ m_intv = dirty_poll_interval(m_dirty, m_thresh);
+ current->nr_dirtied_pause = min(intv, m_intv);
+ break;
+ }
+
+ if (unlikely(!writeback_in_progress(wb)))
+ wb_start_background_writeback(wb);
+
+ mem_cgroup_flush_foreign(wb);
+
+ /*
+ * Calculate global domain's pos_ratio and select the
+ * global dtc by default.
+ */
+ if (!strictlimit) {
+ wb_dirty_limits(gdtc);
+
+ if ((current->flags & PF_LOCAL_THROTTLE) &&
+ gdtc->wb_dirty <
+ dirty_freerun_ceiling(gdtc->wb_thresh,
+ gdtc->wb_bg_thresh))
+ /*
+ * LOCAL_THROTTLE tasks must not be throttled
+ * when below the per-wb freerun ceiling.
+ */
+ goto free_running;
+ }
+
+ dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
+ ((gdtc->dirty > gdtc->thresh) || strictlimit);
+
+ wb_position_ratio(gdtc);
+ sdtc = gdtc;
+
+ if (mdtc) {
+ /*
+ * If memcg domain is in effect, calculate its
+ * pos_ratio. @wb should satisfy constraints from
+ * both global and memcg domains. Choose the one
+ * w/ lower pos_ratio.
+ */
+ if (!strictlimit) {
+ wb_dirty_limits(mdtc);
+
+ if ((current->flags & PF_LOCAL_THROTTLE) &&
+ mdtc->wb_dirty <
+ dirty_freerun_ceiling(mdtc->wb_thresh,
+ mdtc->wb_bg_thresh))
+ /*
+ * LOCAL_THROTTLE tasks must not be
+ * throttled when below the per-wb
+ * freerun ceiling.
+ */
+ goto free_running;
+ }
+ dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
+ ((mdtc->dirty > mdtc->thresh) || strictlimit);
+
+ wb_position_ratio(mdtc);
+ if (mdtc->pos_ratio < gdtc->pos_ratio)
+ sdtc = mdtc;
+ }
+
+ if (dirty_exceeded && !wb->dirty_exceeded)
+ wb->dirty_exceeded = 1;
+
+ if (time_is_before_jiffies(wb->bw_time_stamp +
+ BANDWIDTH_INTERVAL)) {
+ spin_lock(&wb->list_lock);
+ __wb_update_bandwidth(gdtc, mdtc, start_time, true);
+ spin_unlock(&wb->list_lock);
+ }
+
+ /* throttle according to the chosen dtc */
+ dirty_ratelimit = wb->dirty_ratelimit;
+ task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
+ RATELIMIT_CALC_SHIFT;
+ max_pause = wb_max_pause(wb, sdtc->wb_dirty);
+ min_pause = wb_min_pause(wb, max_pause,
+ task_ratelimit, dirty_ratelimit,
+ &nr_dirtied_pause);
+
+ if (unlikely(task_ratelimit == 0)) {
+ period = max_pause;
+ pause = max_pause;
+ goto pause;
+ }
+ period = HZ * pages_dirtied / task_ratelimit;
+ pause = period;
+ if (current->dirty_paused_when)
+ pause -= now - current->dirty_paused_when;
+ /*
+ * For less than 1s think time (ext3/4 may block the dirtier
+ * for up to 800ms from time to time on 1-HDD; so does xfs,
+ * however at much less frequency), try to compensate it in
+ * future periods by updating the virtual time; otherwise just
+ * do a reset, as it may be a light dirtier.
+ */
+ if (pause < min_pause) {
+ trace_balance_dirty_pages(wb,
+ sdtc->thresh,
+ sdtc->bg_thresh,
+ sdtc->dirty,
+ sdtc->wb_thresh,
+ sdtc->wb_dirty,
+ dirty_ratelimit,
+ task_ratelimit,
+ pages_dirtied,
+ period,
+ min(pause, 0L),
+ start_time);
+ if (pause < -HZ) {
+ current->dirty_paused_when = now;
+ current->nr_dirtied = 0;
+ } else if (period) {
+ current->dirty_paused_when += period;
+ current->nr_dirtied = 0;
+ } else if (current->nr_dirtied_pause <= pages_dirtied)
+ current->nr_dirtied_pause += pages_dirtied;
+ break;
+ }
+ if (unlikely(pause > max_pause)) {
+ /* for occasional dropped task_ratelimit */
+ now += min(pause - max_pause, max_pause);
+ pause = max_pause;
+ }
+
+pause:
+ trace_balance_dirty_pages(wb,
+ sdtc->thresh,
+ sdtc->bg_thresh,
+ sdtc->dirty,
+ sdtc->wb_thresh,
+ sdtc->wb_dirty,
+ dirty_ratelimit,
+ task_ratelimit,
+ pages_dirtied,
+ period,
+ pause,
+ start_time);
+ __set_current_state(TASK_KILLABLE);
+ wb->dirty_sleep = now;
+ io_schedule_timeout(pause);
+
+ current->dirty_paused_when = now + pause;
+ current->nr_dirtied = 0;
+ current->nr_dirtied_pause = nr_dirtied_pause;
+
+ /*
+ * This is typically equal to (dirty < thresh) and can also
+ * keep "1000+ dd on a slow USB stick" under control.
+ */
+ if (task_ratelimit)
+ break;
+
+ /*
+ * In the case of an unresponding NFS server and the NFS dirty
+ * pages exceeds dirty_thresh, give the other good wb's a pipe
+ * to go through, so that tasks on them still remain responsive.
+ *
+ * In theory 1 page is enough to keep the consumer-producer
+ * pipe going: the flusher cleans 1 page => the task dirties 1
+ * more page. However wb_dirty has accounting errors. So use
+ * the larger and more IO friendly wb_stat_error.
+ */
+ if (sdtc->wb_dirty <= wb_stat_error())
+ break;
+
+ if (fatal_signal_pending(current))
+ break;
+ }
+
+ if (!dirty_exceeded && wb->dirty_exceeded)
+ wb->dirty_exceeded = 0;
+
+ if (writeback_in_progress(wb))
+ return;
+
+ /*
+ * In laptop mode, we wait until hitting the higher threshold before
+ * starting background writeout, and then write out all the way down
+ * to the lower threshold. So slow writers cause minimal disk activity.
+ *
+ * In normal mode, we start background writeout at the lower
+ * background_thresh, to keep the amount of dirty memory low.
+ */
+ if (laptop_mode)
+ return;
+
+ if (nr_reclaimable > gdtc->bg_thresh)
+ wb_start_background_writeback(wb);
+}
+
+static DEFINE_PER_CPU(int, bdp_ratelimits);
+
+/*
+ * Normal tasks are throttled by
+ * loop {
+ * dirty tsk->nr_dirtied_pause pages;
+ * take a snap in balance_dirty_pages();
+ * }
+ * However there is a worst case. If every task exit immediately when dirtied
+ * (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never be
+ * called to throttle the page dirties. The solution is to save the not yet
+ * throttled page dirties in dirty_throttle_leaks on task exit and charge them
+ * randomly into the running tasks. This works well for the above worst case,
+ * as the new task will pick up and accumulate the old task's leaked dirty
+ * count and eventually get throttled.
+ */
+DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
+
+/**
+ * balance_dirty_pages_ratelimited - balance dirty memory state
+ * @mapping: address_space which was dirtied
+ *
+ * Processes which are dirtying memory should call in here once for each page
+ * which was newly dirtied. The function will periodically check the system's
+ * dirty state and will initiate writeback if needed.
+ *
+ * On really big machines, get_writeback_state is expensive, so try to avoid
+ * calling it too often (ratelimiting). But once we're over the dirty memory
+ * limit we decrease the ratelimiting by a lot, to prevent individual processes
+ * from overshooting the limit by (ratelimit_pages) each.
+ */
+void balance_dirty_pages_ratelimited(struct address_space *mapping)
+{
+ struct inode *inode = mapping->host;
+ struct backing_dev_info *bdi = inode_to_bdi(inode);
+ struct bdi_writeback *wb = NULL;
+ int ratelimit;
+ int *p;
+
+ if (!(bdi->capabilities & BDI_CAP_WRITEBACK))
+ return;
+
+ if (inode_cgwb_enabled(inode))
+ wb = wb_get_create_current(bdi, GFP_KERNEL);
+ if (!wb)
+ wb = &bdi->wb;
+
+ ratelimit = current->nr_dirtied_pause;
+ if (wb->dirty_exceeded)
+ ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
+
+ preempt_disable();
+ /*
+ * This prevents one CPU to accumulate too many dirtied pages without
+ * calling into balance_dirty_pages(), which can happen when there are
+ * 1000+ tasks, all of them start dirtying pages at exactly the same
+ * time, hence all honoured too large initial task->nr_dirtied_pause.
+ */
+ p = this_cpu_ptr(&bdp_ratelimits);
+ if (unlikely(current->nr_dirtied >= ratelimit))
+ *p = 0;
+ else if (unlikely(*p >= ratelimit_pages)) {
+ *p = 0;
+ ratelimit = 0;
+ }
+ /*
+ * Pick up the dirtied pages by the exited tasks. This avoids lots of
+ * short-lived tasks (eg. gcc invocations in a kernel build) escaping
+ * the dirty throttling and livelock other long-run dirtiers.
+ */
+ p = this_cpu_ptr(&dirty_throttle_leaks);
+ if (*p > 0 && current->nr_dirtied < ratelimit) {
+ unsigned long nr_pages_dirtied;
+ nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
+ *p -= nr_pages_dirtied;
+ current->nr_dirtied += nr_pages_dirtied;
+ }
+ preempt_enable();
+
+ if (unlikely(current->nr_dirtied >= ratelimit))
+ balance_dirty_pages(wb, current->nr_dirtied);
+
+ wb_put(wb);
+}
+EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
+
+/**
+ * wb_over_bg_thresh - does @wb need to be written back?
+ * @wb: bdi_writeback of interest
+ *
+ * Determines whether background writeback should keep writing @wb or it's
+ * clean enough.
+ *
+ * Return: %true if writeback should continue.
+ */
+bool wb_over_bg_thresh(struct bdi_writeback *wb)
+{
+ struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
+ struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
+ struct dirty_throttle_control * const gdtc = &gdtc_stor;
+ struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
+ &mdtc_stor : NULL;
+
+ /*
+ * Similar to balance_dirty_pages() but ignores pages being written
+ * as we're trying to decide whether to put more under writeback.
+ */
+ gdtc->avail = global_dirtyable_memory();
+ gdtc->dirty = global_node_page_state(NR_FILE_DIRTY);
+ domain_dirty_limits(gdtc);
+
+ if (gdtc->dirty > gdtc->bg_thresh)
+ return true;
+
+ if (wb_stat(wb, WB_RECLAIMABLE) >
+ wb_calc_thresh(gdtc->wb, gdtc->bg_thresh))
+ return true;
+
+ if (mdtc) {
+ unsigned long filepages, headroom, writeback;
+
+ mem_cgroup_wb_stats(wb, &filepages, &headroom, &mdtc->dirty,
+ &writeback);
+ mdtc_calc_avail(mdtc, filepages, headroom);
+ domain_dirty_limits(mdtc); /* ditto, ignore writeback */
+
+ if (mdtc->dirty > mdtc->bg_thresh)
+ return true;
+
+ if (wb_stat(wb, WB_RECLAIMABLE) >
+ wb_calc_thresh(mdtc->wb, mdtc->bg_thresh))
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
+ */
+int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *length, loff_t *ppos)
+{
+ unsigned int old_interval = dirty_writeback_interval;
+ int ret;
+
+ ret = proc_dointvec(table, write, buffer, length, ppos);
+
+ /*
+ * Writing 0 to dirty_writeback_interval will disable periodic writeback
+ * and a different non-zero value will wakeup the writeback threads.
+ * wb_wakeup_delayed() would be more appropriate, but it's a pain to
+ * iterate over all bdis and wbs.
+ * The reason we do this is to make the change take effect immediately.
+ */
+ if (!ret && write && dirty_writeback_interval &&
+ dirty_writeback_interval != old_interval)
+ wakeup_flusher_threads(WB_REASON_PERIODIC);
+
+ return ret;
+}
+
+#ifdef CONFIG_BLOCK
+void laptop_mode_timer_fn(struct timer_list *t)
+{
+ struct backing_dev_info *backing_dev_info =
+ from_timer(backing_dev_info, t, laptop_mode_wb_timer);
+
+ wakeup_flusher_threads_bdi(backing_dev_info, WB_REASON_LAPTOP_TIMER);
+}
+
+/*
+ * We've spun up the disk and we're in laptop mode: schedule writeback
+ * of all dirty data a few seconds from now. If the flush is already scheduled
+ * then push it back - the user is still using the disk.
+ */
+void laptop_io_completion(struct backing_dev_info *info)
+{
+ mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode);
+}
+
+/*
+ * We're in laptop mode and we've just synced. The sync's writes will have
+ * caused another writeback to be scheduled by laptop_io_completion.
+ * Nothing needs to be written back anymore, so we unschedule the writeback.
+ */
+void laptop_sync_completion(void)
+{
+ struct backing_dev_info *bdi;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
+ del_timer(&bdi->laptop_mode_wb_timer);
+
+ rcu_read_unlock();
+}
+#endif
+
+/*
+ * If ratelimit_pages is too high then we can get into dirty-data overload
+ * if a large number of processes all perform writes at the same time.
+ * If it is too low then SMP machines will call the (expensive)
+ * get_writeback_state too often.
+ *
+ * Here we set ratelimit_pages to a level which ensures that when all CPUs are
+ * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
+ * thresholds.
+ */
+
+void writeback_set_ratelimit(void)
+{
+ struct wb_domain *dom = &global_wb_domain;
+ unsigned long background_thresh;
+ unsigned long dirty_thresh;
+
+ global_dirty_limits(&background_thresh, &dirty_thresh);
+ dom->dirty_limit = dirty_thresh;
+ ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
+ if (ratelimit_pages < 16)
+ ratelimit_pages = 16;
+}
+
+static int page_writeback_cpu_online(unsigned int cpu)
+{
+ writeback_set_ratelimit();
+ return 0;
+}
+
+/*
+ * Called early on to tune the page writeback dirty limits.
+ *
+ * We used to scale dirty pages according to how total memory
+ * related to pages that could be allocated for buffers.
+ *
+ * However, that was when we used "dirty_ratio" to scale with
+ * all memory, and we don't do that any more. "dirty_ratio"
+ * is now applied to total non-HIGHPAGE memory, and as such we can't
+ * get into the old insane situation any more where we had
+ * large amounts of dirty pages compared to a small amount of
+ * non-HIGHMEM memory.
+ *
+ * But we might still want to scale the dirty_ratio by how
+ * much memory the box has..
+ */
+void __init page_writeback_init(void)
+{
+ BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL));
+
+ cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mm/writeback:online",
+ page_writeback_cpu_online, NULL);
+ cpuhp_setup_state(CPUHP_MM_WRITEBACK_DEAD, "mm/writeback:dead", NULL,
+ page_writeback_cpu_online);
+}
+
+/**
+ * tag_pages_for_writeback - tag pages to be written by write_cache_pages
+ * @mapping: address space structure to write
+ * @start: starting page index
+ * @end: ending page index (inclusive)
+ *
+ * This function scans the page range from @start to @end (inclusive) and tags
+ * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is
+ * that write_cache_pages (or whoever calls this function) will then use
+ * TOWRITE tag to identify pages eligible for writeback. This mechanism is
+ * used to avoid livelocking of writeback by a process steadily creating new
+ * dirty pages in the file (thus it is important for this function to be quick
+ * so that it can tag pages faster than a dirtying process can create them).
+ */
+void tag_pages_for_writeback(struct address_space *mapping,
+ pgoff_t start, pgoff_t end)
+{
+ XA_STATE(xas, &mapping->i_pages, start);
+ unsigned int tagged = 0;
+ void *page;
+
+ xas_lock_irq(&xas);
+ xas_for_each_marked(&xas, page, end, PAGECACHE_TAG_DIRTY) {
+ xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE);
+ if (++tagged % XA_CHECK_SCHED)
+ continue;
+
+ xas_pause(&xas);
+ xas_unlock_irq(&xas);
+ cond_resched();
+ xas_lock_irq(&xas);
+ }
+ xas_unlock_irq(&xas);
+}
+EXPORT_SYMBOL(tag_pages_for_writeback);
+
+/**
+ * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @writepage: function called for each page
+ * @data: data passed to writepage function
+ *
+ * If a page is already under I/O, write_cache_pages() skips it, even
+ * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
+ * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
+ * and msync() need to guarantee that all the data which was dirty at the time
+ * the call was made get new I/O started against them. If wbc->sync_mode is
+ * WB_SYNC_ALL then we were called for data integrity and we must wait for
+ * existing IO to complete.
+ *
+ * To avoid livelocks (when other process dirties new pages), we first tag
+ * pages which should be written back with TOWRITE tag and only then start
+ * writing them. For data-integrity sync we have to be careful so that we do
+ * not miss some pages (e.g., because some other process has cleared TOWRITE
+ * tag we set). The rule we follow is that TOWRITE tag can be cleared only
+ * by the process clearing the DIRTY tag (and submitting the page for IO).
+ *
+ * To avoid deadlocks between range_cyclic writeback and callers that hold
+ * pages in PageWriteback to aggregate IO until write_cache_pages() returns,
+ * we do not loop back to the start of the file. Doing so causes a page
+ * lock/page writeback access order inversion - we should only ever lock
+ * multiple pages in ascending page->index order, and looping back to the start
+ * of the file violates that rule and causes deadlocks.
+ *
+ * Return: %0 on success, negative error code otherwise
+ */
+int write_cache_pages(struct address_space *mapping,
+ struct writeback_control *wbc, writepage_t writepage,
+ void *data)
+{
+ int ret = 0;
+ int done = 0;
+ int error;
+ struct pagevec pvec;
+ int nr_pages;
+ pgoff_t index;
+ pgoff_t end; /* Inclusive */
+ pgoff_t done_index;
+ int range_whole = 0;
+ xa_mark_t tag;
+
+ pagevec_init(&pvec);
+ if (wbc->range_cyclic) {
+ index = mapping->writeback_index; /* prev offset */
+ end = -1;
+ } else {
+ index = wbc->range_start >> PAGE_SHIFT;
+ end = wbc->range_end >> PAGE_SHIFT;
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = 1;
+ }
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) {
+ tag_pages_for_writeback(mapping, index, end);
+ tag = PAGECACHE_TAG_TOWRITE;
+ } else {
+ tag = PAGECACHE_TAG_DIRTY;
+ }
+ done_index = index;
+ while (!done && (index <= end)) {
+ int i;
+
+ nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
+ tag);
+ if (nr_pages == 0)
+ break;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ done_index = page->index;
+
+ lock_page(page);
+
+ /*
+ * Page truncated or invalidated. We can freely skip it
+ * then, even for data integrity operations: the page
+ * has disappeared concurrently, so there could be no
+ * real expectation of this data interity operation
+ * even if there is now a new, dirty page at the same
+ * pagecache address.
+ */
+ if (unlikely(page->mapping != mapping)) {
+continue_unlock:
+ unlock_page(page);
+ continue;
+ }
+
+ if (!PageDirty(page)) {
+ /* someone wrote it for us */
+ goto continue_unlock;
+ }
+
+ if (PageWriteback(page)) {
+ if (wbc->sync_mode != WB_SYNC_NONE)
+ wait_on_page_writeback(page);
+ else
+ goto continue_unlock;
+ }
+
+ BUG_ON(PageWriteback(page));
+ if (!clear_page_dirty_for_io(page))
+ goto continue_unlock;
+
+ trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
+ error = (*writepage)(page, wbc, data);
+ if (unlikely(error)) {
+ /*
+ * Handle errors according to the type of
+ * writeback. There's no need to continue for
+ * background writeback. Just push done_index
+ * past this page so media errors won't choke
+ * writeout for the entire file. For integrity
+ * writeback, we must process the entire dirty
+ * set regardless of errors because the fs may
+ * still have state to clear for each page. In
+ * that case we continue processing and return
+ * the first error.
+ */
+ if (error == AOP_WRITEPAGE_ACTIVATE) {
+ unlock_page(page);
+ error = 0;
+ } else if (wbc->sync_mode != WB_SYNC_ALL) {
+ ret = error;
+ done_index = page->index + 1;
+ done = 1;
+ break;
+ }
+ if (!ret)
+ ret = error;
+ }
+
+ /*
+ * We stop writing back only if we are not doing
+ * integrity sync. In case of integrity sync we have to
+ * keep going until we have written all the pages
+ * we tagged for writeback prior to entering this loop.
+ */
+ if (--wbc->nr_to_write <= 0 &&
+ wbc->sync_mode == WB_SYNC_NONE) {
+ done = 1;
+ break;
+ }
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+
+ /*
+ * If we hit the last page and there is more work to be done: wrap
+ * back the index back to the start of the file for the next
+ * time we are called.
+ */
+ if (wbc->range_cyclic && !done)
+ done_index = 0;
+ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+ mapping->writeback_index = done_index;
+
+ return ret;
+}
+EXPORT_SYMBOL(write_cache_pages);
+
+/*
+ * Function used by generic_writepages to call the real writepage
+ * function and set the mapping flags on error
+ */
+static int __writepage(struct page *page, struct writeback_control *wbc,
+ void *data)
+{
+ struct address_space *mapping = data;
+ int ret = mapping->a_ops->writepage(page, wbc);
+ mapping_set_error(mapping, ret);
+ return ret;
+}
+
+/**
+ * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them.
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ *
+ * This is a library function, which implements the writepages()
+ * address_space_operation.
+ *
+ * Return: %0 on success, negative error code otherwise
+ */
+int generic_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct blk_plug plug;
+ int ret;
+
+ /* deal with chardevs and other special file */
+ if (!mapping->a_ops->writepage)
+ return 0;
+
+ blk_start_plug(&plug);
+ ret = write_cache_pages(mapping, wbc, __writepage, mapping);
+ blk_finish_plug(&plug);
+ return ret;
+}
+
+EXPORT_SYMBOL(generic_writepages);
+
+int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+ int ret;
+
+ if (wbc->nr_to_write <= 0)
+ return 0;
+ while (1) {
+ if (mapping->a_ops->writepages)
+ ret = mapping->a_ops->writepages(mapping, wbc);
+ else
+ ret = generic_writepages(mapping, wbc);
+ if ((ret != -ENOMEM) || (wbc->sync_mode != WB_SYNC_ALL))
+ break;
+ cond_resched();
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ }
+ return ret;
+}
+
+/**
+ * write_one_page - write out a single page and wait on I/O
+ * @page: the page to write
+ *
+ * The page must be locked by the caller and will be unlocked upon return.
+ *
+ * Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this
+ * function returns.
+ *
+ * Return: %0 on success, negative error code otherwise
+ */
+int write_one_page(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ int ret = 0;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = 1,
+ };
+
+ BUG_ON(!PageLocked(page));
+
+ wait_on_page_writeback(page);
+
+ if (clear_page_dirty_for_io(page)) {
+ get_page(page);
+ ret = mapping->a_ops->writepage(page, &wbc);
+ if (ret == 0)
+ wait_on_page_writeback(page);
+ put_page(page);
+ } else {
+ unlock_page(page);
+ }
+
+ if (!ret)
+ ret = filemap_check_errors(mapping);
+ return ret;
+}
+EXPORT_SYMBOL(write_one_page);
+
+/*
+ * For address_spaces which do not use buffers nor write back.
+ */
+int __set_page_dirty_no_writeback(struct page *page)
+{
+ if (!PageDirty(page))
+ return !TestSetPageDirty(page);
+ return 0;
+}
+
+/*
+ * Helper function for set_page_dirty family.
+ *
+ * Caller must hold lock_page_memcg().
+ *
+ * NOTE: This relies on being atomic wrt interrupts.
+ */
+void account_page_dirtied(struct page *page, struct address_space *mapping)
+{
+ struct inode *inode = mapping->host;
+
+ trace_writeback_dirty_page(page, mapping);
+
+ if (mapping_can_writeback(mapping)) {
+ struct bdi_writeback *wb;
+
+ inode_attach_wb(inode, page);
+ wb = inode_to_wb(inode);
+
+ __inc_lruvec_page_state(page, NR_FILE_DIRTY);
+ __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
+ __inc_node_page_state(page, NR_DIRTIED);
+ inc_wb_stat(wb, WB_RECLAIMABLE);
+ inc_wb_stat(wb, WB_DIRTIED);
+ task_io_account_write(PAGE_SIZE);
+ current->nr_dirtied++;
+ this_cpu_inc(bdp_ratelimits);
+
+ mem_cgroup_track_foreign_dirty(page, wb);
+ }
+}
+
+/*
+ * Helper function for deaccounting dirty page without writeback.
+ *
+ * Caller must hold lock_page_memcg().
+ */
+void account_page_cleaned(struct page *page, struct address_space *mapping,
+ struct bdi_writeback *wb)
+{
+ if (mapping_can_writeback(mapping)) {
+ dec_lruvec_page_state(page, NR_FILE_DIRTY);
+ dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
+ dec_wb_stat(wb, WB_RECLAIMABLE);
+ task_io_account_cancelled_write(PAGE_SIZE);
+ }
+}
+
+/*
+ * For address_spaces which do not use buffers. Just tag the page as dirty in
+ * the xarray.
+ *
+ * This is also used when a single buffer is being dirtied: we want to set the
+ * page dirty in that case, but not all the buffers. This is a "bottom-up"
+ * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
+ *
+ * The caller must ensure this doesn't race with truncation. Most will simply
+ * hold the page lock, but e.g. zap_pte_range() calls with the page mapped and
+ * the pte lock held, which also locks out truncation.
+ */
+int __set_page_dirty_nobuffers(struct page *page)
+{
+ lock_page_memcg(page);
+ if (!TestSetPageDirty(page)) {
+ struct address_space *mapping = page_mapping(page);
+ unsigned long flags;
+
+ if (!mapping) {
+ unlock_page_memcg(page);
+ return 1;
+ }
+
+ xa_lock_irqsave(&mapping->i_pages, flags);
+ BUG_ON(page_mapping(page) != mapping);
+ WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
+ account_page_dirtied(page, mapping);
+ __xa_set_mark(&mapping->i_pages, page_index(page),
+ PAGECACHE_TAG_DIRTY);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
+ unlock_page_memcg(page);
+
+ if (mapping->host) {
+ /* !PageAnon && !swapper_space */
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+ }
+ return 1;
+ }
+ unlock_page_memcg(page);
+ return 0;
+}
+EXPORT_SYMBOL(__set_page_dirty_nobuffers);
+
+/*
+ * Call this whenever redirtying a page, to de-account the dirty counters
+ * (NR_DIRTIED, WB_DIRTIED, tsk->nr_dirtied), so that they match the written
+ * counters (NR_WRITTEN, WB_WRITTEN) in long term. The mismatches will lead to
+ * systematic errors in balanced_dirty_ratelimit and the dirty pages position
+ * control.
+ */
+void account_page_redirty(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+
+ if (mapping && mapping_can_writeback(mapping)) {
+ struct inode *inode = mapping->host;
+ struct bdi_writeback *wb;
+ struct wb_lock_cookie cookie = {};
+
+ wb = unlocked_inode_to_wb_begin(inode, &cookie);
+ current->nr_dirtied--;
+ dec_node_page_state(page, NR_DIRTIED);
+ dec_wb_stat(wb, WB_DIRTIED);
+ unlocked_inode_to_wb_end(inode, &cookie);
+ }
+}
+EXPORT_SYMBOL(account_page_redirty);
+
+/*
+ * When a writepage implementation decides that it doesn't want to write this
+ * page for some reason, it should redirty the locked page via
+ * redirty_page_for_writepage() and it should then unlock the page and return 0
+ */
+int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
+{
+ int ret;
+
+ wbc->pages_skipped++;
+ ret = __set_page_dirty_nobuffers(page);
+ account_page_redirty(page);
+ return ret;
+}
+EXPORT_SYMBOL(redirty_page_for_writepage);
+
+/*
+ * Dirty a page.
+ *
+ * For pages with a mapping this should be done under the page lock
+ * for the benefit of asynchronous memory errors who prefer a consistent
+ * dirty state. This rule can be broken in some special cases,
+ * but should be better not to.
+ *
+ * If the mapping doesn't provide a set_page_dirty a_op, then
+ * just fall through and assume that it wants buffer_heads.
+ */
+int set_page_dirty(struct page *page)
+{
+ struct address_space *mapping = page_mapping(page);
+
+ page = compound_head(page);
+ if (likely(mapping)) {
+ int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
+ /*
+ * readahead/lru_deactivate_page could remain
+ * PG_readahead/PG_reclaim due to race with end_page_writeback
+ * About readahead, if the page is written, the flags would be
+ * reset. So no problem.
+ * About lru_deactivate_page, if the page is redirty, the flag
+ * will be reset. So no problem. but if the page is used by readahead
+ * it will confuse readahead and make it restart the size rampup
+ * process. But it's a trivial problem.
+ */
+ if (PageReclaim(page))
+ ClearPageReclaim(page);
+#ifdef CONFIG_BLOCK
+ if (!spd)
+ spd = __set_page_dirty_buffers;
+#endif
+ return (*spd)(page);
+ }
+ if (!PageDirty(page)) {
+ if (!TestSetPageDirty(page))
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(set_page_dirty);
+
+/*
+ * set_page_dirty() is racy if the caller has no reference against
+ * page->mapping->host, and if the page is unlocked. This is because another
+ * CPU could truncate the page off the mapping and then free the mapping.
+ *
+ * Usually, the page _is_ locked, or the caller is a user-space process which
+ * holds a reference on the inode by having an open file.
+ *
+ * In other cases, the page should be locked before running set_page_dirty().
+ */
+int set_page_dirty_lock(struct page *page)
+{
+ int ret;
+
+ lock_page(page);
+ ret = set_page_dirty(page);
+ unlock_page(page);
+ return ret;
+}
+EXPORT_SYMBOL(set_page_dirty_lock);
+
+/*
+ * This cancels just the dirty bit on the kernel page itself, it does NOT
+ * actually remove dirty bits on any mmap's that may be around. It also
+ * leaves the page tagged dirty, so any sync activity will still find it on
+ * the dirty lists, and in particular, clear_page_dirty_for_io() will still
+ * look at the dirty bits in the VM.
+ *
+ * Doing this should *normally* only ever be done when a page is truncated,
+ * and is not actually mapped anywhere at all. However, fs/buffer.c does
+ * this when it notices that somebody has cleaned out all the buffers on a
+ * page without actually doing it through the VM. Can you say "ext3 is
+ * horribly ugly"? Thought you could.
+ */
+void __cancel_dirty_page(struct page *page)
+{
+ struct address_space *mapping = page_mapping(page);
+
+ if (mapping_can_writeback(mapping)) {
+ struct inode *inode = mapping->host;
+ struct bdi_writeback *wb;
+ struct wb_lock_cookie cookie = {};
+
+ lock_page_memcg(page);
+ wb = unlocked_inode_to_wb_begin(inode, &cookie);
+
+ if (TestClearPageDirty(page))
+ account_page_cleaned(page, mapping, wb);
+
+ unlocked_inode_to_wb_end(inode, &cookie);
+ unlock_page_memcg(page);
+ } else {
+ ClearPageDirty(page);
+ }
+}
+EXPORT_SYMBOL(__cancel_dirty_page);
+
+/*
+ * Clear a page's dirty flag, while caring for dirty memory accounting.
+ * Returns true if the page was previously dirty.
+ *
+ * This is for preparing to put the page under writeout. We leave the page
+ * tagged as dirty in the xarray so that a concurrent write-for-sync
+ * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage
+ * implementation will run either set_page_writeback() or set_page_dirty(),
+ * at which stage we bring the page's dirty flag and xarray dirty tag
+ * back into sync.
+ *
+ * This incoherency between the page's dirty flag and xarray tag is
+ * unfortunate, but it only exists while the page is locked.
+ */
+int clear_page_dirty_for_io(struct page *page)
+{
+ struct address_space *mapping = page_mapping(page);
+ int ret = 0;
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+
+ if (mapping && mapping_can_writeback(mapping)) {
+ struct inode *inode = mapping->host;
+ struct bdi_writeback *wb;
+ struct wb_lock_cookie cookie = {};
+
+ /*
+ * Yes, Virginia, this is indeed insane.
+ *
+ * We use this sequence to make sure that
+ * (a) we account for dirty stats properly
+ * (b) we tell the low-level filesystem to
+ * mark the whole page dirty if it was
+ * dirty in a pagetable. Only to then
+ * (c) clean the page again and return 1 to
+ * cause the writeback.
+ *
+ * This way we avoid all nasty races with the
+ * dirty bit in multiple places and clearing
+ * them concurrently from different threads.
+ *
+ * Note! Normally the "set_page_dirty(page)"
+ * has no effect on the actual dirty bit - since
+ * that will already usually be set. But we
+ * need the side effects, and it can help us
+ * avoid races.
+ *
+ * We basically use the page "master dirty bit"
+ * as a serialization point for all the different
+ * threads doing their things.
+ */
+ if (page_mkclean(page))
+ set_page_dirty(page);
+ /*
+ * We carefully synchronise fault handlers against
+ * installing a dirty pte and marking the page dirty
+ * at this point. We do this by having them hold the
+ * page lock while dirtying the page, and pages are
+ * always locked coming in here, so we get the desired
+ * exclusion.
+ */
+ wb = unlocked_inode_to_wb_begin(inode, &cookie);
+ if (TestClearPageDirty(page)) {
+ dec_lruvec_page_state(page, NR_FILE_DIRTY);
+ dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
+ dec_wb_stat(wb, WB_RECLAIMABLE);
+ ret = 1;
+ }
+ unlocked_inode_to_wb_end(inode, &cookie);
+ return ret;
+ }
+ return TestClearPageDirty(page);
+}
+EXPORT_SYMBOL(clear_page_dirty_for_io);
+
+int test_clear_page_writeback(struct page *page)
+{
+ struct address_space *mapping = page_mapping(page);
+ struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
+ int ret;
+
+ memcg = lock_page_memcg(page);
+ lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+ if (mapping && mapping_use_writeback_tags(mapping)) {
+ struct inode *inode = mapping->host;
+ struct backing_dev_info *bdi = inode_to_bdi(inode);
+ unsigned long flags;
+
+ xa_lock_irqsave(&mapping->i_pages, flags);
+ ret = TestClearPageWriteback(page);
+ if (ret) {
+ __xa_clear_mark(&mapping->i_pages, page_index(page),
+ PAGECACHE_TAG_WRITEBACK);
+ if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
+ struct bdi_writeback *wb = inode_to_wb(inode);
+
+ dec_wb_stat(wb, WB_WRITEBACK);
+ __wb_writeout_inc(wb);
+ }
+ }
+
+ if (mapping->host && !mapping_tagged(mapping,
+ PAGECACHE_TAG_WRITEBACK))
+ sb_clear_inode_writeback(mapping->host);
+
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
+ } else {
+ ret = TestClearPageWriteback(page);
+ }
+ if (ret) {
+ dec_lruvec_state(lruvec, NR_WRITEBACK);
+ dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
+ inc_node_page_state(page, NR_WRITTEN);
+ }
+ __unlock_page_memcg(memcg);
+ return ret;
+}
+
+int __test_set_page_writeback(struct page *page, bool keep_write)
+{
+ struct address_space *mapping = page_mapping(page);
+ int ret, access_ret;
+
+ lock_page_memcg(page);
+ if (mapping && mapping_use_writeback_tags(mapping)) {
+ XA_STATE(xas, &mapping->i_pages, page_index(page));
+ struct inode *inode = mapping->host;
+ struct backing_dev_info *bdi = inode_to_bdi(inode);
+ unsigned long flags;
+
+ xas_lock_irqsave(&xas, flags);
+ xas_load(&xas);
+ ret = TestSetPageWriteback(page);
+ if (!ret) {
+ bool on_wblist;
+
+ on_wblist = mapping_tagged(mapping,
+ PAGECACHE_TAG_WRITEBACK);
+
+ xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
+ if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT)
+ inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
+
+ /*
+ * We can come through here when swapping anonymous
+ * pages, so we don't necessarily have an inode to track
+ * for sync.
+ */
+ if (mapping->host && !on_wblist)
+ sb_mark_inode_writeback(mapping->host);
+ }
+ if (!PageDirty(page))
+ xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
+ if (!keep_write)
+ xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
+ xas_unlock_irqrestore(&xas, flags);
+ } else {
+ ret = TestSetPageWriteback(page);
+ }
+ if (!ret) {
+ inc_lruvec_page_state(page, NR_WRITEBACK);
+ inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
+ }
+ unlock_page_memcg(page);
+ access_ret = arch_make_page_accessible(page);
+ /*
+ * If writeback has been triggered on a page that cannot be made
+ * accessible, it is too late to recover here.
+ */
+ VM_BUG_ON_PAGE(access_ret != 0, page);
+
+ return ret;
+
+}
+EXPORT_SYMBOL(__test_set_page_writeback);
+
+/*
+ * Wait for a page to complete writeback
+ */
+void wait_on_page_writeback(struct page *page)
+{
+ while (PageWriteback(page)) {
+ trace_wait_on_page_writeback(page, page_mapping(page));
+ wait_on_page_bit(page, PG_writeback);
+ }
+}
+EXPORT_SYMBOL_GPL(wait_on_page_writeback);
+
+/**
+ * wait_for_stable_page() - wait for writeback to finish, if necessary.
+ * @page: The page to wait on.
+ *
+ * This function determines if the given page is related to a backing device
+ * that requires page contents to be held stable during writeback. If so, then
+ * it will wait for any pending writeback to complete.
+ */
+void wait_for_stable_page(struct page *page)
+{
+ page = thp_head(page);
+ if (page->mapping->host->i_sb->s_iflags & SB_I_STABLE_WRITES)
+ wait_on_page_writeback(page);
+}
+EXPORT_SYMBOL_GPL(wait_for_stable_page);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
new file mode 100644
index 000000000..124ab9324
--- /dev/null
+++ b/mm/page_alloc.c
@@ -0,0 +1,8997 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/mm/page_alloc.c
+ *
+ * Manages the free list, the system allocates free pages here.
+ * Note that kmalloc() lives in slab.c
+ *
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ * Swap reorganised 29.12.95, Stephen Tweedie
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
+ * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
+ * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
+ * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
+ * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
+ */
+
+#include <linux/stddef.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/swap.h>
+#include <linux/interrupt.h>
+#include <linux/pagemap.h>
+#include <linux/jiffies.h>
+#include <linux/memblock.h>
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/kasan.h>
+#include <linux/module.h>
+#include <linux/suspend.h>
+#include <linux/pagevec.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/ratelimit.h>
+#include <linux/oom.h>
+#include <linux/topology.h>
+#include <linux/sysctl.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/memory_hotplug.h>
+#include <linux/nodemask.h>
+#include <linux/vmalloc.h>
+#include <linux/vmstat.h>
+#include <linux/mempolicy.h>
+#include <linux/memremap.h>
+#include <linux/stop_machine.h>
+#include <linux/random.h>
+#include <linux/sort.h>
+#include <linux/pfn.h>
+#include <linux/backing-dev.h>
+#include <linux/fault-inject.h>
+#include <linux/page-isolation.h>
+#include <linux/debugobjects.h>
+#include <linux/kmemleak.h>
+#include <linux/compaction.h>
+#include <trace/events/kmem.h>
+#include <trace/events/oom.h>
+#include <linux/prefetch.h>
+#include <linux/mm_inline.h>
+#include <linux/migrate.h>
+#include <linux/hugetlb.h>
+#include <linux/sched/rt.h>
+#include <linux/sched/mm.h>
+#include <linux/page_owner.h>
+#include <linux/kthread.h>
+#include <linux/memcontrol.h>
+#include <linux/ftrace.h>
+#include <linux/lockdep.h>
+#include <linux/nmi.h>
+#include <linux/psi.h>
+#include <linux/padata.h>
+#include <linux/khugepaged.h>
+
+#include <asm/sections.h>
+#include <asm/tlbflush.h>
+#include <asm/div64.h>
+#include "internal.h"
+#include "shuffle.h"
+#include "page_reporting.h"
+
+/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
+typedef int __bitwise fpi_t;
+
+/* No special request */
+#define FPI_NONE ((__force fpi_t)0)
+
+/*
+ * Skip free page reporting notification for the (possibly merged) page.
+ * This does not hinder free page reporting from grabbing the page,
+ * reporting it and marking it "reported" - it only skips notifying
+ * the free page reporting infrastructure about a newly freed page. For
+ * example, used when temporarily pulling a page from a freelist and
+ * putting it back unmodified.
+ */
+#define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0))
+
+/*
+ * Place the (possibly merged) page to the tail of the freelist. Will ignore
+ * page shuffling (relevant code - e.g., memory onlining - is expected to
+ * shuffle the whole zone).
+ *
+ * Note: No code should rely on this flag for correctness - it's purely
+ * to allow for optimizations when handing back either fresh pages
+ * (memory onlining) or untouched pages (page isolation, free page
+ * reporting).
+ */
+#define FPI_TO_TAIL ((__force fpi_t)BIT(1))
+
+/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
+static DEFINE_MUTEX(pcp_batch_high_lock);
+#define MIN_PERCPU_PAGELIST_FRACTION (8)
+
+#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
+DEFINE_PER_CPU(int, numa_node);
+EXPORT_PER_CPU_SYMBOL(numa_node);
+#endif
+
+DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);
+
+#ifdef CONFIG_HAVE_MEMORYLESS_NODES
+/*
+ * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
+ * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
+ * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
+ * defined in <linux/topology.h>.
+ */
+DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
+EXPORT_PER_CPU_SYMBOL(_numa_mem_);
+#endif
+
+/* work_structs for global per-cpu drains */
+struct pcpu_drain {
+ struct zone *zone;
+ struct work_struct work;
+};
+static DEFINE_MUTEX(pcpu_drain_mutex);
+static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);
+
+#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
+volatile unsigned long latent_entropy __latent_entropy;
+EXPORT_SYMBOL(latent_entropy);
+#endif
+
+/*
+ * Array of node states.
+ */
+nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
+ [N_POSSIBLE] = NODE_MASK_ALL,
+ [N_ONLINE] = { { [0] = 1UL } },
+#ifndef CONFIG_NUMA
+ [N_NORMAL_MEMORY] = { { [0] = 1UL } },
+#ifdef CONFIG_HIGHMEM
+ [N_HIGH_MEMORY] = { { [0] = 1UL } },
+#endif
+ [N_MEMORY] = { { [0] = 1UL } },
+ [N_CPU] = { { [0] = 1UL } },
+#endif /* NUMA */
+};
+EXPORT_SYMBOL(node_states);
+
+atomic_long_t _totalram_pages __read_mostly;
+EXPORT_SYMBOL(_totalram_pages);
+unsigned long totalreserve_pages __read_mostly;
+unsigned long totalcma_pages __read_mostly;
+
+int percpu_pagelist_fraction;
+gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
+#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON
+DEFINE_STATIC_KEY_TRUE(init_on_alloc);
+#else
+DEFINE_STATIC_KEY_FALSE(init_on_alloc);
+#endif
+EXPORT_SYMBOL(init_on_alloc);
+
+#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON
+DEFINE_STATIC_KEY_TRUE(init_on_free);
+#else
+DEFINE_STATIC_KEY_FALSE(init_on_free);
+#endif
+EXPORT_SYMBOL(init_on_free);
+
+static int __init early_init_on_alloc(char *buf)
+{
+ int ret;
+ bool bool_result;
+
+ ret = kstrtobool(buf, &bool_result);
+ if (ret)
+ return ret;
+ if (bool_result && page_poisoning_enabled())
+ pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_alloc\n");
+ if (bool_result)
+ static_branch_enable(&init_on_alloc);
+ else
+ static_branch_disable(&init_on_alloc);
+ return 0;
+}
+early_param("init_on_alloc", early_init_on_alloc);
+
+static int __init early_init_on_free(char *buf)
+{
+ int ret;
+ bool bool_result;
+
+ ret = kstrtobool(buf, &bool_result);
+ if (ret)
+ return ret;
+ if (bool_result && page_poisoning_enabled())
+ pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_free\n");
+ if (bool_result)
+ static_branch_enable(&init_on_free);
+ else
+ static_branch_disable(&init_on_free);
+ return 0;
+}
+early_param("init_on_free", early_init_on_free);
+
+/*
+ * A cached value of the page's pageblock's migratetype, used when the page is
+ * put on a pcplist. Used to avoid the pageblock migratetype lookup when
+ * freeing from pcplists in most cases, at the cost of possibly becoming stale.
+ * Also the migratetype set in the page does not necessarily match the pcplist
+ * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
+ * other index - this ensures that it will be put on the correct CMA freelist.
+ */
+static inline int get_pcppage_migratetype(struct page *page)
+{
+ return page->index;
+}
+
+static inline void set_pcppage_migratetype(struct page *page, int migratetype)
+{
+ page->index = migratetype;
+}
+
+#ifdef CONFIG_PM_SLEEP
+/*
+ * The following functions are used by the suspend/hibernate code to temporarily
+ * change gfp_allowed_mask in order to avoid using I/O during memory allocations
+ * while devices are suspended. To avoid races with the suspend/hibernate code,
+ * they should always be called with system_transition_mutex held
+ * (gfp_allowed_mask also should only be modified with system_transition_mutex
+ * held, unless the suspend/hibernate code is guaranteed not to run in parallel
+ * with that modification).
+ */
+
+static gfp_t saved_gfp_mask;
+
+void pm_restore_gfp_mask(void)
+{
+ WARN_ON(!mutex_is_locked(&system_transition_mutex));
+ if (saved_gfp_mask) {
+ gfp_allowed_mask = saved_gfp_mask;
+ saved_gfp_mask = 0;
+ }
+}
+
+void pm_restrict_gfp_mask(void)
+{
+ WARN_ON(!mutex_is_locked(&system_transition_mutex));
+ WARN_ON(saved_gfp_mask);
+ saved_gfp_mask = gfp_allowed_mask;
+ gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
+}
+
+bool pm_suspended_storage(void)
+{
+ if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
+ return false;
+ return true;
+}
+#endif /* CONFIG_PM_SLEEP */
+
+#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
+unsigned int pageblock_order __read_mostly;
+#endif
+
+static void __free_pages_ok(struct page *page, unsigned int order,
+ fpi_t fpi_flags);
+
+/*
+ * results with 256, 32 in the lowmem_reserve sysctl:
+ * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
+ * 1G machine -> (16M dma, 784M normal, 224M high)
+ * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
+ * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
+ * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
+ *
+ * TBD: should special case ZONE_DMA32 machines here - in those we normally
+ * don't need any ZONE_NORMAL reservation
+ */
+int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
+#ifdef CONFIG_ZONE_DMA
+ [ZONE_DMA] = 256,
+#endif
+#ifdef CONFIG_ZONE_DMA32
+ [ZONE_DMA32] = 256,
+#endif
+ [ZONE_NORMAL] = 32,
+#ifdef CONFIG_HIGHMEM
+ [ZONE_HIGHMEM] = 0,
+#endif
+ [ZONE_MOVABLE] = 0,
+};
+
+static char * const zone_names[MAX_NR_ZONES] = {
+#ifdef CONFIG_ZONE_DMA
+ "DMA",
+#endif
+#ifdef CONFIG_ZONE_DMA32
+ "DMA32",
+#endif
+ "Normal",
+#ifdef CONFIG_HIGHMEM
+ "HighMem",
+#endif
+ "Movable",
+#ifdef CONFIG_ZONE_DEVICE
+ "Device",
+#endif
+};
+
+const char * const migratetype_names[MIGRATE_TYPES] = {
+ "Unmovable",
+ "Movable",
+ "Reclaimable",
+ "HighAtomic",
+#ifdef CONFIG_CMA
+ "CMA",
+#endif
+#ifdef CONFIG_MEMORY_ISOLATION
+ "Isolate",
+#endif
+};
+
+compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
+ [NULL_COMPOUND_DTOR] = NULL,
+ [COMPOUND_PAGE_DTOR] = free_compound_page,
+#ifdef CONFIG_HUGETLB_PAGE
+ [HUGETLB_PAGE_DTOR] = free_huge_page,
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ [TRANSHUGE_PAGE_DTOR] = free_transhuge_page,
+#endif
+};
+
+int min_free_kbytes = 1024;
+int user_min_free_kbytes = -1;
+#ifdef CONFIG_DISCONTIGMEM
+/*
+ * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges
+ * are not on separate NUMA nodes. Functionally this works but with
+ * watermark_boost_factor, it can reclaim prematurely as the ranges can be
+ * quite small. By default, do not boost watermarks on discontigmem as in
+ * many cases very high-order allocations like THP are likely to be
+ * unsupported and the premature reclaim offsets the advantage of long-term
+ * fragmentation avoidance.
+ */
+int watermark_boost_factor __read_mostly;
+#else
+int watermark_boost_factor __read_mostly = 15000;
+#endif
+int watermark_scale_factor = 10;
+
+static unsigned long nr_kernel_pages __initdata;
+static unsigned long nr_all_pages __initdata;
+static unsigned long dma_reserve __initdata;
+
+static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
+static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
+static unsigned long required_kernelcore __initdata;
+static unsigned long required_kernelcore_percent __initdata;
+static unsigned long required_movablecore __initdata;
+static unsigned long required_movablecore_percent __initdata;
+static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
+static bool mirrored_kernelcore __meminitdata;
+
+/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
+int movable_zone;
+EXPORT_SYMBOL(movable_zone);
+
+#if MAX_NUMNODES > 1
+unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
+unsigned int nr_online_nodes __read_mostly = 1;
+EXPORT_SYMBOL(nr_node_ids);
+EXPORT_SYMBOL(nr_online_nodes);
+#endif
+
+int page_group_by_mobility_disabled __read_mostly;
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+/*
+ * During boot we initialize deferred pages on-demand, as needed, but once
+ * page_alloc_init_late() has finished, the deferred pages are all initialized,
+ * and we can permanently disable that path.
+ */
+static DEFINE_STATIC_KEY_TRUE(deferred_pages);
+
+/*
+ * Calling kasan_free_pages() only after deferred memory initialization
+ * has completed. Poisoning pages during deferred memory init will greatly
+ * lengthen the process and cause problem in large memory systems as the
+ * deferred pages initialization is done with interrupt disabled.
+ *
+ * Assuming that there will be no reference to those newly initialized
+ * pages before they are ever allocated, this should have no effect on
+ * KASAN memory tracking as the poison will be properly inserted at page
+ * allocation time. The only corner case is when pages are allocated by
+ * on-demand allocation and then freed again before the deferred pages
+ * initialization is done, but this is not likely to happen.
+ */
+static inline void kasan_free_nondeferred_pages(struct page *page, int order)
+{
+ if (!static_branch_unlikely(&deferred_pages))
+ kasan_free_pages(page, order);
+}
+
+/* Returns true if the struct page for the pfn is uninitialised */
+static inline bool __meminit early_page_uninitialised(unsigned long pfn)
+{
+ int nid = early_pfn_to_nid(pfn);
+
+ if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
+ return true;
+
+ return false;
+}
+
+/*
+ * Returns true when the remaining initialisation should be deferred until
+ * later in the boot cycle when it can be parallelised.
+ */
+static bool __meminit
+defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
+{
+ static unsigned long prev_end_pfn, nr_initialised;
+
+ /*
+ * prev_end_pfn static that contains the end of previous zone
+ * No need to protect because called very early in boot before smp_init.
+ */
+ if (prev_end_pfn != end_pfn) {
+ prev_end_pfn = end_pfn;
+ nr_initialised = 0;
+ }
+
+ /* Always populate low zones for address-constrained allocations */
+ if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
+ return false;
+
+ if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
+ return true;
+ /*
+ * We start only with one section of pages, more pages are added as
+ * needed until the rest of deferred pages are initialized.
+ */
+ nr_initialised++;
+ if ((nr_initialised > PAGES_PER_SECTION) &&
+ (pfn & (PAGES_PER_SECTION - 1)) == 0) {
+ NODE_DATA(nid)->first_deferred_pfn = pfn;
+ return true;
+ }
+ return false;
+}
+#else
+#define kasan_free_nondeferred_pages(p, o) kasan_free_pages(p, o)
+
+static inline bool early_page_uninitialised(unsigned long pfn)
+{
+ return false;
+}
+
+static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
+{
+ return false;
+}
+#endif
+
+/* Return a pointer to the bitmap storing bits affecting a block of pages */
+static inline unsigned long *get_pageblock_bitmap(struct page *page,
+ unsigned long pfn)
+{
+#ifdef CONFIG_SPARSEMEM
+ return section_to_usemap(__pfn_to_section(pfn));
+#else
+ return page_zone(page)->pageblock_flags;
+#endif /* CONFIG_SPARSEMEM */
+}
+
+static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
+{
+#ifdef CONFIG_SPARSEMEM
+ pfn &= (PAGES_PER_SECTION-1);
+#else
+ pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
+#endif /* CONFIG_SPARSEMEM */
+ return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
+}
+
+/**
+ * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
+ * @page: The page within the block of interest
+ * @pfn: The target page frame number
+ * @mask: mask of bits that the caller is interested in
+ *
+ * Return: pageblock_bits flags
+ */
+static __always_inline
+unsigned long __get_pfnblock_flags_mask(struct page *page,
+ unsigned long pfn,
+ unsigned long mask)
+{
+ unsigned long *bitmap;
+ unsigned long bitidx, word_bitidx;
+ unsigned long word;
+
+ bitmap = get_pageblock_bitmap(page, pfn);
+ bitidx = pfn_to_bitidx(page, pfn);
+ word_bitidx = bitidx / BITS_PER_LONG;
+ bitidx &= (BITS_PER_LONG-1);
+
+ word = bitmap[word_bitidx];
+ return (word >> bitidx) & mask;
+}
+
+unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
+ unsigned long mask)
+{
+ return __get_pfnblock_flags_mask(page, pfn, mask);
+}
+
+static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
+{
+ return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
+}
+
+/**
+ * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
+ * @page: The page within the block of interest
+ * @flags: The flags to set
+ * @pfn: The target page frame number
+ * @mask: mask of bits that the caller is interested in
+ */
+void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
+ unsigned long pfn,
+ unsigned long mask)
+{
+ unsigned long *bitmap;
+ unsigned long bitidx, word_bitidx;
+ unsigned long old_word, word;
+
+ BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
+ BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
+
+ bitmap = get_pageblock_bitmap(page, pfn);
+ bitidx = pfn_to_bitidx(page, pfn);
+ word_bitidx = bitidx / BITS_PER_LONG;
+ bitidx &= (BITS_PER_LONG-1);
+
+ VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
+
+ mask <<= bitidx;
+ flags <<= bitidx;
+
+ word = READ_ONCE(bitmap[word_bitidx]);
+ for (;;) {
+ old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
+ if (word == old_word)
+ break;
+ word = old_word;
+ }
+}
+
+void set_pageblock_migratetype(struct page *page, int migratetype)
+{
+ if (unlikely(page_group_by_mobility_disabled &&
+ migratetype < MIGRATE_PCPTYPES))
+ migratetype = MIGRATE_UNMOVABLE;
+
+ set_pfnblock_flags_mask(page, (unsigned long)migratetype,
+ page_to_pfn(page), MIGRATETYPE_MASK);
+}
+
+#ifdef CONFIG_DEBUG_VM
+static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
+{
+ int ret = 0;
+ unsigned seq;
+ unsigned long pfn = page_to_pfn(page);
+ unsigned long sp, start_pfn;
+
+ do {
+ seq = zone_span_seqbegin(zone);
+ start_pfn = zone->zone_start_pfn;
+ sp = zone->spanned_pages;
+ if (!zone_spans_pfn(zone, pfn))
+ ret = 1;
+ } while (zone_span_seqretry(zone, seq));
+
+ if (ret)
+ pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
+ pfn, zone_to_nid(zone), zone->name,
+ start_pfn, start_pfn + sp);
+
+ return ret;
+}
+
+static int page_is_consistent(struct zone *zone, struct page *page)
+{
+ if (!pfn_valid_within(page_to_pfn(page)))
+ return 0;
+ if (zone != page_zone(page))
+ return 0;
+
+ return 1;
+}
+/*
+ * Temporary debugging check for pages not lying within a given zone.
+ */
+static int __maybe_unused bad_range(struct zone *zone, struct page *page)
+{
+ if (page_outside_zone_boundaries(zone, page))
+ return 1;
+ if (!page_is_consistent(zone, page))
+ return 1;
+
+ return 0;
+}
+#else
+static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
+{
+ return 0;
+}
+#endif
+
+static void bad_page(struct page *page, const char *reason)
+{
+ static unsigned long resume;
+ static unsigned long nr_shown;
+ static unsigned long nr_unshown;
+
+ /*
+ * Allow a burst of 60 reports, then keep quiet for that minute;
+ * or allow a steady drip of one report per second.
+ */
+ if (nr_shown == 60) {
+ if (time_before(jiffies, resume)) {
+ nr_unshown++;
+ goto out;
+ }
+ if (nr_unshown) {
+ pr_alert(
+ "BUG: Bad page state: %lu messages suppressed\n",
+ nr_unshown);
+ nr_unshown = 0;
+ }
+ nr_shown = 0;
+ }
+ if (nr_shown++ == 0)
+ resume = jiffies + 60 * HZ;
+
+ pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
+ current->comm, page_to_pfn(page));
+ __dump_page(page, reason);
+ dump_page_owner(page);
+
+ print_modules();
+ dump_stack();
+out:
+ /* Leave bad fields for debug, except PageBuddy could make trouble */
+ page_mapcount_reset(page); /* remove PageBuddy */
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+}
+
+/*
+ * Higher-order pages are called "compound pages". They are structured thusly:
+ *
+ * The first PAGE_SIZE page is called the "head page" and have PG_head set.
+ *
+ * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
+ * in bit 0 of page->compound_head. The rest of bits is pointer to head page.
+ *
+ * The first tail page's ->compound_dtor holds the offset in array of compound
+ * page destructors. See compound_page_dtors.
+ *
+ * The first tail page's ->compound_order holds the order of allocation.
+ * This usage means that zero-order pages may not be compound.
+ */
+
+void free_compound_page(struct page *page)
+{
+ mem_cgroup_uncharge(page);
+ __free_pages_ok(page, compound_order(page), FPI_NONE);
+}
+
+void prep_compound_page(struct page *page, unsigned int order)
+{
+ int i;
+ int nr_pages = 1 << order;
+
+ __SetPageHead(page);
+ for (i = 1; i < nr_pages; i++) {
+ struct page *p = page + i;
+ set_page_count(p, 0);
+ p->mapping = TAIL_MAPPING;
+ set_compound_head(p, page);
+ }
+
+ set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
+ set_compound_order(page, order);
+ atomic_set(compound_mapcount_ptr(page), -1);
+ if (hpage_pincount_available(page))
+ atomic_set(compound_pincount_ptr(page), 0);
+}
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+unsigned int _debug_guardpage_minorder;
+
+bool _debug_pagealloc_enabled_early __read_mostly
+ = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
+EXPORT_SYMBOL(_debug_pagealloc_enabled_early);
+DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
+EXPORT_SYMBOL(_debug_pagealloc_enabled);
+
+DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
+
+static int __init early_debug_pagealloc(char *buf)
+{
+ return kstrtobool(buf, &_debug_pagealloc_enabled_early);
+}
+early_param("debug_pagealloc", early_debug_pagealloc);
+
+void init_debug_pagealloc(void)
+{
+ if (!debug_pagealloc_enabled())
+ return;
+
+ static_branch_enable(&_debug_pagealloc_enabled);
+
+ if (!debug_guardpage_minorder())
+ return;
+
+ static_branch_enable(&_debug_guardpage_enabled);
+}
+
+static int __init debug_guardpage_minorder_setup(char *buf)
+{
+ unsigned long res;
+
+ if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
+ pr_err("Bad debug_guardpage_minorder value\n");
+ return 0;
+ }
+ _debug_guardpage_minorder = res;
+ pr_info("Setting debug_guardpage_minorder to %lu\n", res);
+ return 0;
+}
+early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
+
+static inline bool set_page_guard(struct zone *zone, struct page *page,
+ unsigned int order, int migratetype)
+{
+ if (!debug_guardpage_enabled())
+ return false;
+
+ if (order >= debug_guardpage_minorder())
+ return false;
+
+ __SetPageGuard(page);
+ INIT_LIST_HEAD(&page->lru);
+ set_page_private(page, order);
+ /* Guard pages are not available for any usage */
+ __mod_zone_freepage_state(zone, -(1 << order), migratetype);
+
+ return true;
+}
+
+static inline void clear_page_guard(struct zone *zone, struct page *page,
+ unsigned int order, int migratetype)
+{
+ if (!debug_guardpage_enabled())
+ return;
+
+ __ClearPageGuard(page);
+
+ set_page_private(page, 0);
+ if (!is_migrate_isolate(migratetype))
+ __mod_zone_freepage_state(zone, (1 << order), migratetype);
+}
+#else
+static inline bool set_page_guard(struct zone *zone, struct page *page,
+ unsigned int order, int migratetype) { return false; }
+static inline void clear_page_guard(struct zone *zone, struct page *page,
+ unsigned int order, int migratetype) {}
+#endif
+
+static inline void set_buddy_order(struct page *page, unsigned int order)
+{
+ set_page_private(page, order);
+ __SetPageBuddy(page);
+}
+
+/*
+ * This function checks whether a page is free && is the buddy
+ * we can coalesce a page and its buddy if
+ * (a) the buddy is not in a hole (check before calling!) &&
+ * (b) the buddy is in the buddy system &&
+ * (c) a page and its buddy have the same order &&
+ * (d) a page and its buddy are in the same zone.
+ *
+ * For recording whether a page is in the buddy system, we set PageBuddy.
+ * Setting, clearing, and testing PageBuddy is serialized by zone->lock.
+ *
+ * For recording page's order, we use page_private(page).
+ */
+static inline bool page_is_buddy(struct page *page, struct page *buddy,
+ unsigned int order)
+{
+ if (!page_is_guard(buddy) && !PageBuddy(buddy))
+ return false;
+
+ if (buddy_order(buddy) != order)
+ return false;
+
+ /*
+ * zone check is done late to avoid uselessly calculating
+ * zone/node ids for pages that could never merge.
+ */
+ if (page_zone_id(page) != page_zone_id(buddy))
+ return false;
+
+ VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
+
+ return true;
+}
+
+#ifdef CONFIG_COMPACTION
+static inline struct capture_control *task_capc(struct zone *zone)
+{
+ struct capture_control *capc = current->capture_control;
+
+ return unlikely(capc) &&
+ !(current->flags & PF_KTHREAD) &&
+ !capc->page &&
+ capc->cc->zone == zone ? capc : NULL;
+}
+
+static inline bool
+compaction_capture(struct capture_control *capc, struct page *page,
+ int order, int migratetype)
+{
+ if (!capc || order != capc->cc->order)
+ return false;
+
+ /* Do not accidentally pollute CMA or isolated regions*/
+ if (is_migrate_cma(migratetype) ||
+ is_migrate_isolate(migratetype))
+ return false;
+
+ /*
+ * Do not let lower order allocations polluate a movable pageblock.
+ * This might let an unmovable request use a reclaimable pageblock
+ * and vice-versa but no more than normal fallback logic which can
+ * have trouble finding a high-order free page.
+ */
+ if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
+ return false;
+
+ capc->page = page;
+ return true;
+}
+
+#else
+static inline struct capture_control *task_capc(struct zone *zone)
+{
+ return NULL;
+}
+
+static inline bool
+compaction_capture(struct capture_control *capc, struct page *page,
+ int order, int migratetype)
+{
+ return false;
+}
+#endif /* CONFIG_COMPACTION */
+
+/* Used for pages not on another list */
+static inline void add_to_free_list(struct page *page, struct zone *zone,
+ unsigned int order, int migratetype)
+{
+ struct free_area *area = &zone->free_area[order];
+
+ list_add(&page->lru, &area->free_list[migratetype]);
+ area->nr_free++;
+}
+
+/* Used for pages not on another list */
+static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
+ unsigned int order, int migratetype)
+{
+ struct free_area *area = &zone->free_area[order];
+
+ list_add_tail(&page->lru, &area->free_list[migratetype]);
+ area->nr_free++;
+}
+
+/*
+ * Used for pages which are on another list. Move the pages to the tail
+ * of the list - so the moved pages won't immediately be considered for
+ * allocation again (e.g., optimization for memory onlining).
+ */
+static inline void move_to_free_list(struct page *page, struct zone *zone,
+ unsigned int order, int migratetype)
+{
+ struct free_area *area = &zone->free_area[order];
+
+ list_move_tail(&page->lru, &area->free_list[migratetype]);
+}
+
+static inline void del_page_from_free_list(struct page *page, struct zone *zone,
+ unsigned int order)
+{
+ /* clear reported state and update reported page count */
+ if (page_reported(page))
+ __ClearPageReported(page);
+
+ list_del(&page->lru);
+ __ClearPageBuddy(page);
+ set_page_private(page, 0);
+ zone->free_area[order].nr_free--;
+}
+
+/*
+ * If this is not the largest possible page, check if the buddy
+ * of the next-highest order is free. If it is, it's possible
+ * that pages are being freed that will coalesce soon. In case,
+ * that is happening, add the free page to the tail of the list
+ * so it's less likely to be used soon and more likely to be merged
+ * as a higher order page
+ */
+static inline bool
+buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
+ struct page *page, unsigned int order)
+{
+ struct page *higher_page, *higher_buddy;
+ unsigned long combined_pfn;
+
+ if (order >= MAX_ORDER - 2)
+ return false;
+
+ if (!pfn_valid_within(buddy_pfn))
+ return false;
+
+ combined_pfn = buddy_pfn & pfn;
+ higher_page = page + (combined_pfn - pfn);
+ buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
+ higher_buddy = higher_page + (buddy_pfn - combined_pfn);
+
+ return pfn_valid_within(buddy_pfn) &&
+ page_is_buddy(higher_page, higher_buddy, order + 1);
+}
+
+/*
+ * Freeing function for a buddy system allocator.
+ *
+ * The concept of a buddy system is to maintain direct-mapped table
+ * (containing bit values) for memory blocks of various "orders".
+ * The bottom level table contains the map for the smallest allocatable
+ * units of memory (here, pages), and each level above it describes
+ * pairs of units from the levels below, hence, "buddies".
+ * At a high level, all that happens here is marking the table entry
+ * at the bottom level available, and propagating the changes upward
+ * as necessary, plus some accounting needed to play nicely with other
+ * parts of the VM system.
+ * At each level, we keep a list of pages, which are heads of continuous
+ * free pages of length of (1 << order) and marked with PageBuddy.
+ * Page's order is recorded in page_private(page) field.
+ * So when we are allocating or freeing one, we can derive the state of the
+ * other. That is, if we allocate a small block, and both were
+ * free, the remainder of the region must be split into blocks.
+ * If a block is freed, and its buddy is also free, then this
+ * triggers coalescing into a block of larger size.
+ *
+ * -- nyc
+ */
+
+static inline void __free_one_page(struct page *page,
+ unsigned long pfn,
+ struct zone *zone, unsigned int order,
+ int migratetype, fpi_t fpi_flags)
+{
+ struct capture_control *capc = task_capc(zone);
+ unsigned long buddy_pfn;
+ unsigned long combined_pfn;
+ unsigned int max_order;
+ struct page *buddy;
+ bool to_tail;
+
+ max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order);
+
+ VM_BUG_ON(!zone_is_initialized(zone));
+ VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
+
+ VM_BUG_ON(migratetype == -1);
+ if (likely(!is_migrate_isolate(migratetype)))
+ __mod_zone_freepage_state(zone, 1 << order, migratetype);
+
+ VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
+ VM_BUG_ON_PAGE(bad_range(zone, page), page);
+
+continue_merging:
+ while (order < max_order) {
+ if (compaction_capture(capc, page, order, migratetype)) {
+ __mod_zone_freepage_state(zone, -(1 << order),
+ migratetype);
+ return;
+ }
+ buddy_pfn = __find_buddy_pfn(pfn, order);
+ buddy = page + (buddy_pfn - pfn);
+
+ if (!pfn_valid_within(buddy_pfn))
+ goto done_merging;
+ if (!page_is_buddy(page, buddy, order))
+ goto done_merging;
+ /*
+ * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
+ * merge with it and move up one order.
+ */
+ if (page_is_guard(buddy))
+ clear_page_guard(zone, buddy, order, migratetype);
+ else
+ del_page_from_free_list(buddy, zone, order);
+ combined_pfn = buddy_pfn & pfn;
+ page = page + (combined_pfn - pfn);
+ pfn = combined_pfn;
+ order++;
+ }
+ if (order < MAX_ORDER - 1) {
+ /* If we are here, it means order is >= pageblock_order.
+ * We want to prevent merge between freepages on isolate
+ * pageblock and normal pageblock. Without this, pageblock
+ * isolation could cause incorrect freepage or CMA accounting.
+ *
+ * We don't want to hit this code for the more frequent
+ * low-order merging.
+ */
+ if (unlikely(has_isolate_pageblock(zone))) {
+ int buddy_mt;
+
+ buddy_pfn = __find_buddy_pfn(pfn, order);
+ buddy = page + (buddy_pfn - pfn);
+ buddy_mt = get_pageblock_migratetype(buddy);
+
+ if (migratetype != buddy_mt
+ && (is_migrate_isolate(migratetype) ||
+ is_migrate_isolate(buddy_mt)))
+ goto done_merging;
+ }
+ max_order = order + 1;
+ goto continue_merging;
+ }
+
+done_merging:
+ set_buddy_order(page, order);
+
+ if (fpi_flags & FPI_TO_TAIL)
+ to_tail = true;
+ else if (is_shuffle_order(order))
+ to_tail = shuffle_pick_tail();
+ else
+ to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
+
+ if (to_tail)
+ add_to_free_list_tail(page, zone, order, migratetype);
+ else
+ add_to_free_list(page, zone, order, migratetype);
+
+ /* Notify page reporting subsystem of freed page */
+ if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
+ page_reporting_notify_free(order);
+}
+
+/*
+ * A bad page could be due to a number of fields. Instead of multiple branches,
+ * try and check multiple fields with one check. The caller must do a detailed
+ * check if necessary.
+ */
+static inline bool page_expected_state(struct page *page,
+ unsigned long check_flags)
+{
+ if (unlikely(atomic_read(&page->_mapcount) != -1))
+ return false;
+
+ if (unlikely((unsigned long)page->mapping |
+ page_ref_count(page) |
+#ifdef CONFIG_MEMCG
+ (unsigned long)page->mem_cgroup |
+#endif
+ (page->flags & check_flags)))
+ return false;
+
+ return true;
+}
+
+static const char *page_bad_reason(struct page *page, unsigned long flags)
+{
+ const char *bad_reason = NULL;
+
+ if (unlikely(atomic_read(&page->_mapcount) != -1))
+ bad_reason = "nonzero mapcount";
+ if (unlikely(page->mapping != NULL))
+ bad_reason = "non-NULL mapping";
+ if (unlikely(page_ref_count(page) != 0))
+ bad_reason = "nonzero _refcount";
+ if (unlikely(page->flags & flags)) {
+ if (flags == PAGE_FLAGS_CHECK_AT_PREP)
+ bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
+ else
+ bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
+ }
+#ifdef CONFIG_MEMCG
+ if (unlikely(page->mem_cgroup))
+ bad_reason = "page still charged to cgroup";
+#endif
+ return bad_reason;
+}
+
+static void check_free_page_bad(struct page *page)
+{
+ bad_page(page,
+ page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
+}
+
+static inline int check_free_page(struct page *page)
+{
+ if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
+ return 0;
+
+ /* Something has gone sideways, find it */
+ check_free_page_bad(page);
+ return 1;
+}
+
+static int free_tail_pages_check(struct page *head_page, struct page *page)
+{
+ int ret = 1;
+
+ /*
+ * We rely page->lru.next never has bit 0 set, unless the page
+ * is PageTail(). Let's make sure that's true even for poisoned ->lru.
+ */
+ BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
+
+ if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
+ ret = 0;
+ goto out;
+ }
+ switch (page - head_page) {
+ case 1:
+ /* the first tail page: ->mapping may be compound_mapcount() */
+ if (unlikely(compound_mapcount(page))) {
+ bad_page(page, "nonzero compound_mapcount");
+ goto out;
+ }
+ break;
+ case 2:
+ /*
+ * the second tail page: ->mapping is
+ * deferred_list.next -- ignore value.
+ */
+ break;
+ default:
+ if (page->mapping != TAIL_MAPPING) {
+ bad_page(page, "corrupted mapping in tail page");
+ goto out;
+ }
+ break;
+ }
+ if (unlikely(!PageTail(page))) {
+ bad_page(page, "PageTail not set");
+ goto out;
+ }
+ if (unlikely(compound_head(page) != head_page)) {
+ bad_page(page, "compound_head not consistent");
+ goto out;
+ }
+ ret = 0;
+out:
+ page->mapping = NULL;
+ clear_compound_head(page);
+ return ret;
+}
+
+static void kernel_init_free_pages(struct page *page, int numpages)
+{
+ int i;
+
+ /* s390's use of memset() could override KASAN redzones. */
+ kasan_disable_current();
+ for (i = 0; i < numpages; i++)
+ clear_highpage(page + i);
+ kasan_enable_current();
+}
+
+static __always_inline bool free_pages_prepare(struct page *page,
+ unsigned int order, bool check_free)
+{
+ int bad = 0;
+
+ VM_BUG_ON_PAGE(PageTail(page), page);
+
+ trace_mm_page_free(page, order);
+
+ if (unlikely(PageHWPoison(page)) && !order) {
+ /*
+ * Do not let hwpoison pages hit pcplists/buddy
+ * Untie memcg state and reset page's owner
+ */
+ if (memcg_kmem_enabled() && PageKmemcg(page))
+ __memcg_kmem_uncharge_page(page, order);
+ reset_page_owner(page, order);
+ return false;
+ }
+
+ /*
+ * Check tail pages before head page information is cleared to
+ * avoid checking PageCompound for order-0 pages.
+ */
+ if (unlikely(order)) {
+ bool compound = PageCompound(page);
+ int i;
+
+ VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
+
+ if (compound)
+ ClearPageDoubleMap(page);
+ for (i = 1; i < (1 << order); i++) {
+ if (compound)
+ bad += free_tail_pages_check(page, page + i);
+ if (unlikely(check_free_page(page + i))) {
+ bad++;
+ continue;
+ }
+ (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ }
+ }
+ if (PageMappingFlags(page))
+ page->mapping = NULL;
+ if (memcg_kmem_enabled() && PageKmemcg(page))
+ __memcg_kmem_uncharge_page(page, order);
+ if (check_free)
+ bad += check_free_page(page);
+ if (bad)
+ return false;
+
+ page_cpupid_reset_last(page);
+ page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ reset_page_owner(page, order);
+
+ if (!PageHighMem(page)) {
+ debug_check_no_locks_freed(page_address(page),
+ PAGE_SIZE << order);
+ debug_check_no_obj_freed(page_address(page),
+ PAGE_SIZE << order);
+ }
+ if (want_init_on_free())
+ kernel_init_free_pages(page, 1 << order);
+
+ kernel_poison_pages(page, 1 << order, 0);
+ /*
+ * arch_free_page() can make the page's contents inaccessible. s390
+ * does this. So nothing which can access the page's contents should
+ * happen after this.
+ */
+ arch_free_page(page, order);
+
+ if (debug_pagealloc_enabled_static())
+ kernel_map_pages(page, 1 << order, 0);
+
+ kasan_free_nondeferred_pages(page, order);
+
+ return true;
+}
+
+#ifdef CONFIG_DEBUG_VM
+/*
+ * With DEBUG_VM enabled, order-0 pages are checked immediately when being freed
+ * to pcp lists. With debug_pagealloc also enabled, they are also rechecked when
+ * moved from pcp lists to free lists.
+ */
+static bool free_pcp_prepare(struct page *page)
+{
+ return free_pages_prepare(page, 0, true);
+}
+
+static bool bulkfree_pcp_prepare(struct page *page)
+{
+ if (debug_pagealloc_enabled_static())
+ return check_free_page(page);
+ else
+ return false;
+}
+#else
+/*
+ * With DEBUG_VM disabled, order-0 pages being freed are checked only when
+ * moving from pcp lists to free list in order to reduce overhead. With
+ * debug_pagealloc enabled, they are checked also immediately when being freed
+ * to the pcp lists.
+ */
+static bool free_pcp_prepare(struct page *page)
+{
+ if (debug_pagealloc_enabled_static())
+ return free_pages_prepare(page, 0, true);
+ else
+ return free_pages_prepare(page, 0, false);
+}
+
+static bool bulkfree_pcp_prepare(struct page *page)
+{
+ return check_free_page(page);
+}
+#endif /* CONFIG_DEBUG_VM */
+
+static inline void prefetch_buddy(struct page *page)
+{
+ unsigned long pfn = page_to_pfn(page);
+ unsigned long buddy_pfn = __find_buddy_pfn(pfn, 0);
+ struct page *buddy = page + (buddy_pfn - pfn);
+
+ prefetch(buddy);
+}
+
+/*
+ * Frees a number of pages from the PCP lists
+ * Assumes all pages on list are in same zone, and of same order.
+ * count is the number of pages to free.
+ *
+ * If the zone was previously in an "all pages pinned" state then look to
+ * see if this freeing clears that state.
+ *
+ * And clear the zone's pages_scanned counter, to hold off the "all pages are
+ * pinned" detection logic.
+ */
+static void free_pcppages_bulk(struct zone *zone, int count,
+ struct per_cpu_pages *pcp)
+{
+ int migratetype = 0;
+ int batch_free = 0;
+ int prefetch_nr = 0;
+ bool isolated_pageblocks;
+ struct page *page, *tmp;
+ LIST_HEAD(head);
+
+ /*
+ * Ensure proper count is passed which otherwise would stuck in the
+ * below while (list_empty(list)) loop.
+ */
+ count = min(pcp->count, count);
+ while (count) {
+ struct list_head *list;
+
+ /*
+ * Remove pages from lists in a round-robin fashion. A
+ * batch_free count is maintained that is incremented when an
+ * empty list is encountered. This is so more pages are freed
+ * off fuller lists instead of spinning excessively around empty
+ * lists
+ */
+ do {
+ batch_free++;
+ if (++migratetype == MIGRATE_PCPTYPES)
+ migratetype = 0;
+ list = &pcp->lists[migratetype];
+ } while (list_empty(list));
+
+ /* This is the only non-empty list. Free them all. */
+ if (batch_free == MIGRATE_PCPTYPES)
+ batch_free = count;
+
+ do {
+ page = list_last_entry(list, struct page, lru);
+ /* must delete to avoid corrupting pcp list */
+ list_del(&page->lru);
+ pcp->count--;
+
+ if (bulkfree_pcp_prepare(page))
+ continue;
+
+ list_add_tail(&page->lru, &head);
+
+ /*
+ * We are going to put the page back to the global
+ * pool, prefetch its buddy to speed up later access
+ * under zone->lock. It is believed the overhead of
+ * an additional test and calculating buddy_pfn here
+ * can be offset by reduced memory latency later. To
+ * avoid excessive prefetching due to large count, only
+ * prefetch buddy for the first pcp->batch nr of pages.
+ */
+ if (prefetch_nr++ < pcp->batch)
+ prefetch_buddy(page);
+ } while (--count && --batch_free && !list_empty(list));
+ }
+
+ spin_lock(&zone->lock);
+ isolated_pageblocks = has_isolate_pageblock(zone);
+
+ /*
+ * Use safe version since after __free_one_page(),
+ * page->lru.next will not point to original list.
+ */
+ list_for_each_entry_safe(page, tmp, &head, lru) {
+ int mt = get_pcppage_migratetype(page);
+ /* MIGRATE_ISOLATE page should not go to pcplists */
+ VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
+ /* Pageblock could have been isolated meanwhile */
+ if (unlikely(isolated_pageblocks))
+ mt = get_pageblock_migratetype(page);
+
+ __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE);
+ trace_mm_page_pcpu_drain(page, 0, mt);
+ }
+ spin_unlock(&zone->lock);
+}
+
+static void free_one_page(struct zone *zone,
+ struct page *page, unsigned long pfn,
+ unsigned int order,
+ int migratetype, fpi_t fpi_flags)
+{
+ spin_lock(&zone->lock);
+ if (unlikely(has_isolate_pageblock(zone) ||
+ is_migrate_isolate(migratetype))) {
+ migratetype = get_pfnblock_migratetype(page, pfn);
+ }
+ __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
+ spin_unlock(&zone->lock);
+}
+
+static void __meminit __init_single_page(struct page *page, unsigned long pfn,
+ unsigned long zone, int nid)
+{
+ mm_zero_struct_page(page);
+ set_page_links(page, zone, nid, pfn);
+ init_page_count(page);
+ page_mapcount_reset(page);
+ page_cpupid_reset_last(page);
+ page_kasan_tag_reset(page);
+
+ INIT_LIST_HEAD(&page->lru);
+#ifdef WANT_PAGE_VIRTUAL
+ /* The shift won't overflow because ZONE_NORMAL is below 4G. */
+ if (!is_highmem_idx(zone))
+ set_page_address(page, __va(pfn << PAGE_SHIFT));
+#endif
+}
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+static void __meminit init_reserved_page(unsigned long pfn)
+{
+ pg_data_t *pgdat;
+ int nid, zid;
+
+ if (!early_page_uninitialised(pfn))
+ return;
+
+ nid = early_pfn_to_nid(pfn);
+ pgdat = NODE_DATA(nid);
+
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ struct zone *zone = &pgdat->node_zones[zid];
+
+ if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
+ break;
+ }
+ __init_single_page(pfn_to_page(pfn), pfn, zid, nid);
+}
+#else
+static inline void init_reserved_page(unsigned long pfn)
+{
+}
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+
+/*
+ * Initialised pages do not have PageReserved set. This function is
+ * called for each range allocated by the bootmem allocator and
+ * marks the pages PageReserved. The remaining valid pages are later
+ * sent to the buddy page allocator.
+ */
+void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
+{
+ unsigned long start_pfn = PFN_DOWN(start);
+ unsigned long end_pfn = PFN_UP(end);
+
+ for (; start_pfn < end_pfn; start_pfn++) {
+ if (pfn_valid(start_pfn)) {
+ struct page *page = pfn_to_page(start_pfn);
+
+ init_reserved_page(start_pfn);
+
+ /* Avoid false-positive PageTail() */
+ INIT_LIST_HEAD(&page->lru);
+
+ /*
+ * no need for atomic set_bit because the struct
+ * page is not visible yet so nobody should
+ * access it yet.
+ */
+ __SetPageReserved(page);
+ }
+ }
+}
+
+static void __free_pages_ok(struct page *page, unsigned int order,
+ fpi_t fpi_flags)
+{
+ unsigned long flags;
+ int migratetype;
+ unsigned long pfn = page_to_pfn(page);
+
+ if (!free_pages_prepare(page, order, true))
+ return;
+
+ migratetype = get_pfnblock_migratetype(page, pfn);
+ local_irq_save(flags);
+ __count_vm_events(PGFREE, 1 << order);
+ free_one_page(page_zone(page), page, pfn, order, migratetype,
+ fpi_flags);
+ local_irq_restore(flags);
+}
+
+void __free_pages_core(struct page *page, unsigned int order)
+{
+ unsigned int nr_pages = 1 << order;
+ struct page *p = page;
+ unsigned int loop;
+
+ /*
+ * When initializing the memmap, __init_single_page() sets the refcount
+ * of all pages to 1 ("allocated"/"not free"). We have to set the
+ * refcount of all involved pages to 0.
+ */
+ prefetchw(p);
+ for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
+ prefetchw(p + 1);
+ __ClearPageReserved(p);
+ set_page_count(p, 0);
+ }
+ __ClearPageReserved(p);
+ set_page_count(p, 0);
+
+ atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
+
+ /*
+ * Bypass PCP and place fresh pages right to the tail, primarily
+ * relevant for memory onlining.
+ */
+ __free_pages_ok(page, order, FPI_TO_TAIL);
+}
+
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+
+static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
+
+#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
+
+/*
+ * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
+ */
+int __meminit __early_pfn_to_nid(unsigned long pfn,
+ struct mminit_pfnnid_cache *state)
+{
+ unsigned long start_pfn, end_pfn;
+ int nid;
+
+ if (state->last_start <= pfn && pfn < state->last_end)
+ return state->last_nid;
+
+ nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
+ if (nid != NUMA_NO_NODE) {
+ state->last_start = start_pfn;
+ state->last_end = end_pfn;
+ state->last_nid = nid;
+ }
+
+ return nid;
+}
+#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
+
+int __meminit early_pfn_to_nid(unsigned long pfn)
+{
+ static DEFINE_SPINLOCK(early_pfn_lock);
+ int nid;
+
+ spin_lock(&early_pfn_lock);
+ nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
+ if (nid < 0)
+ nid = first_online_node;
+ spin_unlock(&early_pfn_lock);
+
+ return nid;
+}
+#endif /* CONFIG_NEED_MULTIPLE_NODES */
+
+void __init memblock_free_pages(struct page *page, unsigned long pfn,
+ unsigned int order)
+{
+ if (early_page_uninitialised(pfn))
+ return;
+ __free_pages_core(page, order);
+}
+
+/*
+ * Check that the whole (or subset of) a pageblock given by the interval of
+ * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
+ * with the migration of free compaction scanner. The scanners then need to
+ * use only pfn_valid_within() check for arches that allow holes within
+ * pageblocks.
+ *
+ * Return struct page pointer of start_pfn, or NULL if checks were not passed.
+ *
+ * It's possible on some configurations to have a setup like node0 node1 node0
+ * i.e. it's possible that all pages within a zones range of pages do not
+ * belong to a single zone. We assume that a border between node0 and node1
+ * can occur within a single pageblock, but not a node0 node1 node0
+ * interleaving within a single pageblock. It is therefore sufficient to check
+ * the first and last page of a pageblock and avoid checking each individual
+ * page in a pageblock.
+ */
+struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
+ unsigned long end_pfn, struct zone *zone)
+{
+ struct page *start_page;
+ struct page *end_page;
+
+ /* end_pfn is one past the range we are checking */
+ end_pfn--;
+
+ if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
+ return NULL;
+
+ start_page = pfn_to_online_page(start_pfn);
+ if (!start_page)
+ return NULL;
+
+ if (page_zone(start_page) != zone)
+ return NULL;
+
+ end_page = pfn_to_page(end_pfn);
+
+ /* This gives a shorter code than deriving page_zone(end_page) */
+ if (page_zone_id(start_page) != page_zone_id(end_page))
+ return NULL;
+
+ return start_page;
+}
+
+void set_zone_contiguous(struct zone *zone)
+{
+ unsigned long block_start_pfn = zone->zone_start_pfn;
+ unsigned long block_end_pfn;
+
+ block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages);
+ for (; block_start_pfn < zone_end_pfn(zone);
+ block_start_pfn = block_end_pfn,
+ block_end_pfn += pageblock_nr_pages) {
+
+ block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
+
+ if (!__pageblock_pfn_to_page(block_start_pfn,
+ block_end_pfn, zone))
+ return;
+ cond_resched();
+ }
+
+ /* We confirm that there is no hole */
+ zone->contiguous = true;
+}
+
+void clear_zone_contiguous(struct zone *zone)
+{
+ zone->contiguous = false;
+}
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+static void __init deferred_free_range(unsigned long pfn,
+ unsigned long nr_pages)
+{
+ struct page *page;
+ unsigned long i;
+
+ if (!nr_pages)
+ return;
+
+ page = pfn_to_page(pfn);
+
+ /* Free a large naturally-aligned chunk if possible */
+ if (nr_pages == pageblock_nr_pages &&
+ (pfn & (pageblock_nr_pages - 1)) == 0) {
+ set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ __free_pages_core(page, pageblock_order);
+ return;
+ }
+
+ for (i = 0; i < nr_pages; i++, page++, pfn++) {
+ if ((pfn & (pageblock_nr_pages - 1)) == 0)
+ set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ __free_pages_core(page, 0);
+ }
+}
+
+/* Completion tracking for deferred_init_memmap() threads */
+static atomic_t pgdat_init_n_undone __initdata;
+static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
+
+static inline void __init pgdat_init_report_one_done(void)
+{
+ if (atomic_dec_and_test(&pgdat_init_n_undone))
+ complete(&pgdat_init_all_done_comp);
+}
+
+/*
+ * Returns true if page needs to be initialized or freed to buddy allocator.
+ *
+ * First we check if pfn is valid on architectures where it is possible to have
+ * holes within pageblock_nr_pages. On systems where it is not possible, this
+ * function is optimized out.
+ *
+ * Then, we check if a current large page is valid by only checking the validity
+ * of the head pfn.
+ */
+static inline bool __init deferred_pfn_valid(unsigned long pfn)
+{
+ if (!pfn_valid_within(pfn))
+ return false;
+ if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn))
+ return false;
+ return true;
+}
+
+/*
+ * Free pages to buddy allocator. Try to free aligned pages in
+ * pageblock_nr_pages sizes.
+ */
+static void __init deferred_free_pages(unsigned long pfn,
+ unsigned long end_pfn)
+{
+ unsigned long nr_pgmask = pageblock_nr_pages - 1;
+ unsigned long nr_free = 0;
+
+ for (; pfn < end_pfn; pfn++) {
+ if (!deferred_pfn_valid(pfn)) {
+ deferred_free_range(pfn - nr_free, nr_free);
+ nr_free = 0;
+ } else if (!(pfn & nr_pgmask)) {
+ deferred_free_range(pfn - nr_free, nr_free);
+ nr_free = 1;
+ } else {
+ nr_free++;
+ }
+ }
+ /* Free the last block of pages to allocator */
+ deferred_free_range(pfn - nr_free, nr_free);
+}
+
+/*
+ * Initialize struct pages. We minimize pfn page lookups and scheduler checks
+ * by performing it only once every pageblock_nr_pages.
+ * Return number of pages initialized.
+ */
+static unsigned long __init deferred_init_pages(struct zone *zone,
+ unsigned long pfn,
+ unsigned long end_pfn)
+{
+ unsigned long nr_pgmask = pageblock_nr_pages - 1;
+ int nid = zone_to_nid(zone);
+ unsigned long nr_pages = 0;
+ int zid = zone_idx(zone);
+ struct page *page = NULL;
+
+ for (; pfn < end_pfn; pfn++) {
+ if (!deferred_pfn_valid(pfn)) {
+ page = NULL;
+ continue;
+ } else if (!page || !(pfn & nr_pgmask)) {
+ page = pfn_to_page(pfn);
+ } else {
+ page++;
+ }
+ __init_single_page(page, pfn, zid, nid);
+ nr_pages++;
+ }
+ return (nr_pages);
+}
+
+/*
+ * This function is meant to pre-load the iterator for the zone init.
+ * Specifically it walks through the ranges until we are caught up to the
+ * first_init_pfn value and exits there. If we never encounter the value we
+ * return false indicating there are no valid ranges left.
+ */
+static bool __init
+deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
+ unsigned long *spfn, unsigned long *epfn,
+ unsigned long first_init_pfn)
+{
+ u64 j;
+
+ /*
+ * Start out by walking through the ranges in this zone that have
+ * already been initialized. We don't need to do anything with them
+ * so we just need to flush them out of the system.
+ */
+ for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
+ if (*epfn <= first_init_pfn)
+ continue;
+ if (*spfn < first_init_pfn)
+ *spfn = first_init_pfn;
+ *i = j;
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Initialize and free pages. We do it in two loops: first we initialize
+ * struct page, then free to buddy allocator, because while we are
+ * freeing pages we can access pages that are ahead (computing buddy
+ * page in __free_one_page()).
+ *
+ * In order to try and keep some memory in the cache we have the loop
+ * broken along max page order boundaries. This way we will not cause
+ * any issues with the buddy page computation.
+ */
+static unsigned long __init
+deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
+ unsigned long *end_pfn)
+{
+ unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
+ unsigned long spfn = *start_pfn, epfn = *end_pfn;
+ unsigned long nr_pages = 0;
+ u64 j = *i;
+
+ /* First we loop through and initialize the page values */
+ for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
+ unsigned long t;
+
+ if (mo_pfn <= *start_pfn)
+ break;
+
+ t = min(mo_pfn, *end_pfn);
+ nr_pages += deferred_init_pages(zone, *start_pfn, t);
+
+ if (mo_pfn < *end_pfn) {
+ *start_pfn = mo_pfn;
+ break;
+ }
+ }
+
+ /* Reset values and now loop through freeing pages as needed */
+ swap(j, *i);
+
+ for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
+ unsigned long t;
+
+ if (mo_pfn <= spfn)
+ break;
+
+ t = min(mo_pfn, epfn);
+ deferred_free_pages(spfn, t);
+
+ if (mo_pfn <= epfn)
+ break;
+ }
+
+ return nr_pages;
+}
+
+static void __init
+deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
+ void *arg)
+{
+ unsigned long spfn, epfn;
+ struct zone *zone = arg;
+ u64 i;
+
+ deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
+
+ /*
+ * Initialize and free pages in MAX_ORDER sized increments so that we
+ * can avoid introducing any issues with the buddy allocator.
+ */
+ while (spfn < end_pfn) {
+ deferred_init_maxorder(&i, zone, &spfn, &epfn);
+ cond_resched();
+ }
+}
+
+/* An arch may override for more concurrency. */
+__weak int __init
+deferred_page_init_max_threads(const struct cpumask *node_cpumask)
+{
+ return 1;
+}
+
+/* Initialise remaining memory on a node */
+static int __init deferred_init_memmap(void *data)
+{
+ pg_data_t *pgdat = data;
+ const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+ unsigned long spfn = 0, epfn = 0;
+ unsigned long first_init_pfn, flags;
+ unsigned long start = jiffies;
+ struct zone *zone;
+ int zid, max_threads;
+ u64 i;
+
+ /* Bind memory initialisation thread to a local node if possible */
+ if (!cpumask_empty(cpumask))
+ set_cpus_allowed_ptr(current, cpumask);
+
+ pgdat_resize_lock(pgdat, &flags);
+ first_init_pfn = pgdat->first_deferred_pfn;
+ if (first_init_pfn == ULONG_MAX) {
+ pgdat_resize_unlock(pgdat, &flags);
+ pgdat_init_report_one_done();
+ return 0;
+ }
+
+ /* Sanity check boundaries */
+ BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
+ BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
+ pgdat->first_deferred_pfn = ULONG_MAX;
+
+ /*
+ * Once we unlock here, the zone cannot be grown anymore, thus if an
+ * interrupt thread must allocate this early in boot, zone must be
+ * pre-grown prior to start of deferred page initialization.
+ */
+ pgdat_resize_unlock(pgdat, &flags);
+
+ /* Only the highest zone is deferred so find it */
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ zone = pgdat->node_zones + zid;
+ if (first_init_pfn < zone_end_pfn(zone))
+ break;
+ }
+
+ /* If the zone is empty somebody else may have cleared out the zone */
+ if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
+ first_init_pfn))
+ goto zone_empty;
+
+ max_threads = deferred_page_init_max_threads(cpumask);
+
+ while (spfn < epfn) {
+ unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION);
+ struct padata_mt_job job = {
+ .thread_fn = deferred_init_memmap_chunk,
+ .fn_arg = zone,
+ .start = spfn,
+ .size = epfn_align - spfn,
+ .align = PAGES_PER_SECTION,
+ .min_chunk = PAGES_PER_SECTION,
+ .max_threads = max_threads,
+ };
+
+ padata_do_multithreaded(&job);
+ deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
+ epfn_align);
+ }
+zone_empty:
+ /* Sanity check that the next zone really is unpopulated */
+ WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
+
+ pr_info("node %d deferred pages initialised in %ums\n",
+ pgdat->node_id, jiffies_to_msecs(jiffies - start));
+
+ pgdat_init_report_one_done();
+ return 0;
+}
+
+/*
+ * If this zone has deferred pages, try to grow it by initializing enough
+ * deferred pages to satisfy the allocation specified by order, rounded up to
+ * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments
+ * of SECTION_SIZE bytes by initializing struct pages in increments of
+ * PAGES_PER_SECTION * sizeof(struct page) bytes.
+ *
+ * Return true when zone was grown, otherwise return false. We return true even
+ * when we grow less than requested, to let the caller decide if there are
+ * enough pages to satisfy the allocation.
+ *
+ * Note: We use noinline because this function is needed only during boot, and
+ * it is called from a __ref function _deferred_grow_zone. This way we are
+ * making sure that it is not inlined into permanent text section.
+ */
+static noinline bool __init
+deferred_grow_zone(struct zone *zone, unsigned int order)
+{
+ unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
+ pg_data_t *pgdat = zone->zone_pgdat;
+ unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
+ unsigned long spfn, epfn, flags;
+ unsigned long nr_pages = 0;
+ u64 i;
+
+ /* Only the last zone may have deferred pages */
+ if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
+ return false;
+
+ pgdat_resize_lock(pgdat, &flags);
+
+ /*
+ * If someone grew this zone while we were waiting for spinlock, return
+ * true, as there might be enough pages already.
+ */
+ if (first_deferred_pfn != pgdat->first_deferred_pfn) {
+ pgdat_resize_unlock(pgdat, &flags);
+ return true;
+ }
+
+ /* If the zone is empty somebody else may have cleared out the zone */
+ if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
+ first_deferred_pfn)) {
+ pgdat->first_deferred_pfn = ULONG_MAX;
+ pgdat_resize_unlock(pgdat, &flags);
+ /* Retry only once. */
+ return first_deferred_pfn != ULONG_MAX;
+ }
+
+ /*
+ * Initialize and free pages in MAX_ORDER sized increments so
+ * that we can avoid introducing any issues with the buddy
+ * allocator.
+ */
+ while (spfn < epfn) {
+ /* update our first deferred PFN for this section */
+ first_deferred_pfn = spfn;
+
+ nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
+ touch_nmi_watchdog();
+
+ /* We should only stop along section boundaries */
+ if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
+ continue;
+
+ /* If our quota has been met we can stop here */
+ if (nr_pages >= nr_pages_needed)
+ break;
+ }
+
+ pgdat->first_deferred_pfn = spfn;
+ pgdat_resize_unlock(pgdat, &flags);
+
+ return nr_pages > 0;
+}
+
+/*
+ * deferred_grow_zone() is __init, but it is called from
+ * get_page_from_freelist() during early boot until deferred_pages permanently
+ * disables this call. This is why we have refdata wrapper to avoid warning,
+ * and to ensure that the function body gets unloaded.
+ */
+static bool __ref
+_deferred_grow_zone(struct zone *zone, unsigned int order)
+{
+ return deferred_grow_zone(zone, order);
+}
+
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+
+void __init page_alloc_init_late(void)
+{
+ struct zone *zone;
+ int nid;
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+
+ /* There will be num_node_state(N_MEMORY) threads */
+ atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
+ for_each_node_state(nid, N_MEMORY) {
+ kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
+ }
+
+ /* Block until all are initialised */
+ wait_for_completion(&pgdat_init_all_done_comp);
+
+ /*
+ * The number of managed pages has changed due to the initialisation
+ * so the pcpu batch and high limits needs to be updated or the limits
+ * will be artificially small.
+ */
+ for_each_populated_zone(zone)
+ zone_pcp_update(zone);
+
+ /*
+ * We initialized the rest of the deferred pages. Permanently disable
+ * on-demand struct page initialization.
+ */
+ static_branch_disable(&deferred_pages);
+
+ /* Reinit limits that are based on free pages after the kernel is up */
+ files_maxfiles_init();
+#endif
+
+ /* Discard memblock private memory */
+ memblock_discard();
+
+ for_each_node_state(nid, N_MEMORY)
+ shuffle_free_memory(NODE_DATA(nid));
+
+ for_each_populated_zone(zone)
+ set_zone_contiguous(zone);
+}
+
+#ifdef CONFIG_CMA
+/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
+void __init init_cma_reserved_pageblock(struct page *page)
+{
+ unsigned i = pageblock_nr_pages;
+ struct page *p = page;
+
+ do {
+ __ClearPageReserved(p);
+ set_page_count(p, 0);
+ } while (++p, --i);
+
+ set_pageblock_migratetype(page, MIGRATE_CMA);
+
+ if (pageblock_order >= MAX_ORDER) {
+ i = pageblock_nr_pages;
+ p = page;
+ do {
+ set_page_refcounted(p);
+ __free_pages(p, MAX_ORDER - 1);
+ p += MAX_ORDER_NR_PAGES;
+ } while (i -= MAX_ORDER_NR_PAGES);
+ } else {
+ set_page_refcounted(page);
+ __free_pages(page, pageblock_order);
+ }
+
+ adjust_managed_page_count(page, pageblock_nr_pages);
+}
+#endif
+
+/*
+ * The order of subdivision here is critical for the IO subsystem.
+ * Please do not alter this order without good reasons and regression
+ * testing. Specifically, as large blocks of memory are subdivided,
+ * the order in which smaller blocks are delivered depends on the order
+ * they're subdivided in this function. This is the primary factor
+ * influencing the order in which pages are delivered to the IO
+ * subsystem according to empirical testing, and this is also justified
+ * by considering the behavior of a buddy system containing a single
+ * large block of memory acted on by a series of small allocations.
+ * This behavior is a critical factor in sglist merging's success.
+ *
+ * -- nyc
+ */
+static inline void expand(struct zone *zone, struct page *page,
+ int low, int high, int migratetype)
+{
+ unsigned long size = 1 << high;
+
+ while (high > low) {
+ high--;
+ size >>= 1;
+ VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
+
+ /*
+ * Mark as guard pages (or page), that will allow to
+ * merge back to allocator when buddy will be freed.
+ * Corresponding page table entries will not be touched,
+ * pages will stay not present in virtual address space
+ */
+ if (set_page_guard(zone, &page[size], high, migratetype))
+ continue;
+
+ add_to_free_list(&page[size], zone, high, migratetype);
+ set_buddy_order(&page[size], high);
+ }
+}
+
+static void check_new_page_bad(struct page *page)
+{
+ if (unlikely(page->flags & __PG_HWPOISON)) {
+ /* Don't complain about hwpoisoned pages */
+ page_mapcount_reset(page); /* remove PageBuddy */
+ return;
+ }
+
+ bad_page(page,
+ page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP));
+}
+
+/*
+ * This page is about to be returned from the page allocator
+ */
+static inline int check_new_page(struct page *page)
+{
+ if (likely(page_expected_state(page,
+ PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
+ return 0;
+
+ check_new_page_bad(page);
+ return 1;
+}
+
+static inline bool free_pages_prezeroed(void)
+{
+ return (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
+ page_poisoning_enabled()) || want_init_on_free();
+}
+
+#ifdef CONFIG_DEBUG_VM
+/*
+ * With DEBUG_VM enabled, order-0 pages are checked for expected state when
+ * being allocated from pcp lists. With debug_pagealloc also enabled, they are
+ * also checked when pcp lists are refilled from the free lists.
+ */
+static inline bool check_pcp_refill(struct page *page)
+{
+ if (debug_pagealloc_enabled_static())
+ return check_new_page(page);
+ else
+ return false;
+}
+
+static inline bool check_new_pcp(struct page *page)
+{
+ return check_new_page(page);
+}
+#else
+/*
+ * With DEBUG_VM disabled, free order-0 pages are checked for expected state
+ * when pcp lists are being refilled from the free lists. With debug_pagealloc
+ * enabled, they are also checked when being allocated from the pcp lists.
+ */
+static inline bool check_pcp_refill(struct page *page)
+{
+ return check_new_page(page);
+}
+static inline bool check_new_pcp(struct page *page)
+{
+ if (debug_pagealloc_enabled_static())
+ return check_new_page(page);
+ else
+ return false;
+}
+#endif /* CONFIG_DEBUG_VM */
+
+static bool check_new_pages(struct page *page, unsigned int order)
+{
+ int i;
+ for (i = 0; i < (1 << order); i++) {
+ struct page *p = page + i;
+
+ if (unlikely(check_new_page(p)))
+ return true;
+ }
+
+ return false;
+}
+
+inline void post_alloc_hook(struct page *page, unsigned int order,
+ gfp_t gfp_flags)
+{
+ set_page_private(page, 0);
+ set_page_refcounted(page);
+
+ arch_alloc_page(page, order);
+ if (debug_pagealloc_enabled_static())
+ kernel_map_pages(page, 1 << order, 1);
+ kasan_alloc_pages(page, order);
+ kernel_poison_pages(page, 1 << order, 1);
+ set_page_owner(page, order, gfp_flags);
+}
+
+static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
+ unsigned int alloc_flags)
+{
+ post_alloc_hook(page, order, gfp_flags);
+
+ if (!free_pages_prezeroed() && want_init_on_alloc(gfp_flags))
+ kernel_init_free_pages(page, 1 << order);
+
+ if (order && (gfp_flags & __GFP_COMP))
+ prep_compound_page(page, order);
+
+ /*
+ * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
+ * allocate the page. The expectation is that the caller is taking
+ * steps that will free more memory. The caller should avoid the page
+ * being used for !PFMEMALLOC purposes.
+ */
+ if (alloc_flags & ALLOC_NO_WATERMARKS)
+ set_page_pfmemalloc(page);
+ else
+ clear_page_pfmemalloc(page);
+}
+
+/*
+ * Go through the free lists for the given migratetype and remove
+ * the smallest available page from the freelists
+ */
+static __always_inline
+struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
+ int migratetype)
+{
+ unsigned int current_order;
+ struct free_area *area;
+ struct page *page;
+
+ /* Find a page of the appropriate size in the preferred list */
+ for (current_order = order; current_order < MAX_ORDER; ++current_order) {
+ area = &(zone->free_area[current_order]);
+ page = get_page_from_free_area(area, migratetype);
+ if (!page)
+ continue;
+ del_page_from_free_list(page, zone, current_order);
+ expand(zone, page, order, current_order, migratetype);
+ set_pcppage_migratetype(page, migratetype);
+ return page;
+ }
+
+ return NULL;
+}
+
+
+/*
+ * This array describes the order lists are fallen back to when
+ * the free lists for the desirable migrate type are depleted
+ */
+static int fallbacks[MIGRATE_TYPES][3] = {
+ [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
+ [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
+ [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
+#ifdef CONFIG_CMA
+ [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */
+#endif
+#ifdef CONFIG_MEMORY_ISOLATION
+ [MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */
+#endif
+};
+
+#ifdef CONFIG_CMA
+static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone,
+ unsigned int order)
+{
+ return __rmqueue_smallest(zone, order, MIGRATE_CMA);
+}
+#else
+static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
+ unsigned int order) { return NULL; }
+#endif
+
+/*
+ * Move the free pages in a range to the freelist tail of the requested type.
+ * Note that start_page and end_pages are not aligned on a pageblock
+ * boundary. If alignment is required, use move_freepages_block()
+ */
+static int move_freepages(struct zone *zone,
+ struct page *start_page, struct page *end_page,
+ int migratetype, int *num_movable)
+{
+ struct page *page;
+ unsigned int order;
+ int pages_moved = 0;
+
+ for (page = start_page; page <= end_page;) {
+ if (!pfn_valid_within(page_to_pfn(page))) {
+ page++;
+ continue;
+ }
+
+ if (!PageBuddy(page)) {
+ /*
+ * We assume that pages that could be isolated for
+ * migration are movable. But we don't actually try
+ * isolating, as that would be expensive.
+ */
+ if (num_movable &&
+ (PageLRU(page) || __PageMovable(page)))
+ (*num_movable)++;
+
+ page++;
+ continue;
+ }
+
+ /* Make sure we are not inadvertently changing nodes */
+ VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
+ VM_BUG_ON_PAGE(page_zone(page) != zone, page);
+
+ order = buddy_order(page);
+ move_to_free_list(page, zone, order, migratetype);
+ page += 1 << order;
+ pages_moved += 1 << order;
+ }
+
+ return pages_moved;
+}
+
+int move_freepages_block(struct zone *zone, struct page *page,
+ int migratetype, int *num_movable)
+{
+ unsigned long start_pfn, end_pfn;
+ struct page *start_page, *end_page;
+
+ if (num_movable)
+ *num_movable = 0;
+
+ start_pfn = page_to_pfn(page);
+ start_pfn = start_pfn & ~(pageblock_nr_pages-1);
+ start_page = pfn_to_page(start_pfn);
+ end_page = start_page + pageblock_nr_pages - 1;
+ end_pfn = start_pfn + pageblock_nr_pages - 1;
+
+ /* Do not cross zone boundaries */
+ if (!zone_spans_pfn(zone, start_pfn))
+ start_page = page;
+ if (!zone_spans_pfn(zone, end_pfn))
+ return 0;
+
+ return move_freepages(zone, start_page, end_page, migratetype,
+ num_movable);
+}
+
+static void change_pageblock_range(struct page *pageblock_page,
+ int start_order, int migratetype)
+{
+ int nr_pageblocks = 1 << (start_order - pageblock_order);
+
+ while (nr_pageblocks--) {
+ set_pageblock_migratetype(pageblock_page, migratetype);
+ pageblock_page += pageblock_nr_pages;
+ }
+}
+
+/*
+ * When we are falling back to another migratetype during allocation, try to
+ * steal extra free pages from the same pageblocks to satisfy further
+ * allocations, instead of polluting multiple pageblocks.
+ *
+ * If we are stealing a relatively large buddy page, it is likely there will
+ * be more free pages in the pageblock, so try to steal them all. For
+ * reclaimable and unmovable allocations, we steal regardless of page size,
+ * as fragmentation caused by those allocations polluting movable pageblocks
+ * is worse than movable allocations stealing from unmovable and reclaimable
+ * pageblocks.
+ */
+static bool can_steal_fallback(unsigned int order, int start_mt)
+{
+ /*
+ * Leaving this order check is intended, although there is
+ * relaxed order check in next check. The reason is that
+ * we can actually steal whole pageblock if this condition met,
+ * but, below check doesn't guarantee it and that is just heuristic
+ * so could be changed anytime.
+ */
+ if (order >= pageblock_order)
+ return true;
+
+ if (order >= pageblock_order / 2 ||
+ start_mt == MIGRATE_RECLAIMABLE ||
+ start_mt == MIGRATE_UNMOVABLE ||
+ page_group_by_mobility_disabled)
+ return true;
+
+ return false;
+}
+
+static inline bool boost_watermark(struct zone *zone)
+{
+ unsigned long max_boost;
+
+ if (!watermark_boost_factor)
+ return false;
+ /*
+ * Don't bother in zones that are unlikely to produce results.
+ * On small machines, including kdump capture kernels running
+ * in a small area, boosting the watermark can cause an out of
+ * memory situation immediately.
+ */
+ if ((pageblock_nr_pages * 4) > zone_managed_pages(zone))
+ return false;
+
+ max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
+ watermark_boost_factor, 10000);
+
+ /*
+ * high watermark may be uninitialised if fragmentation occurs
+ * very early in boot so do not boost. We do not fall
+ * through and boost by pageblock_nr_pages as failing
+ * allocations that early means that reclaim is not going
+ * to help and it may even be impossible to reclaim the
+ * boosted watermark resulting in a hang.
+ */
+ if (!max_boost)
+ return false;
+
+ max_boost = max(pageblock_nr_pages, max_boost);
+
+ zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
+ max_boost);
+
+ return true;
+}
+
+/*
+ * This function implements actual steal behaviour. If order is large enough,
+ * we can steal whole pageblock. If not, we first move freepages in this
+ * pageblock to our migratetype and determine how many already-allocated pages
+ * are there in the pageblock with a compatible migratetype. If at least half
+ * of pages are free or compatible, we can change migratetype of the pageblock
+ * itself, so pages freed in the future will be put on the correct free list.
+ */
+static void steal_suitable_fallback(struct zone *zone, struct page *page,
+ unsigned int alloc_flags, int start_type, bool whole_block)
+{
+ unsigned int current_order = buddy_order(page);
+ int free_pages, movable_pages, alike_pages;
+ int old_block_type;
+
+ old_block_type = get_pageblock_migratetype(page);
+
+ /*
+ * This can happen due to races and we want to prevent broken
+ * highatomic accounting.
+ */
+ if (is_migrate_highatomic(old_block_type))
+ goto single_page;
+
+ /* Take ownership for orders >= pageblock_order */
+ if (current_order >= pageblock_order) {
+ change_pageblock_range(page, current_order, start_type);
+ goto single_page;
+ }
+
+ /*
+ * Boost watermarks to increase reclaim pressure to reduce the
+ * likelihood of future fallbacks. Wake kswapd now as the node
+ * may be balanced overall and kswapd will not wake naturally.
+ */
+ if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
+ set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
+
+ /* We are not allowed to try stealing from the whole block */
+ if (!whole_block)
+ goto single_page;
+
+ free_pages = move_freepages_block(zone, page, start_type,
+ &movable_pages);
+ /*
+ * Determine how many pages are compatible with our allocation.
+ * For movable allocation, it's the number of movable pages which
+ * we just obtained. For other types it's a bit more tricky.
+ */
+ if (start_type == MIGRATE_MOVABLE) {
+ alike_pages = movable_pages;
+ } else {
+ /*
+ * If we are falling back a RECLAIMABLE or UNMOVABLE allocation
+ * to MOVABLE pageblock, consider all non-movable pages as
+ * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
+ * vice versa, be conservative since we can't distinguish the
+ * exact migratetype of non-movable pages.
+ */
+ if (old_block_type == MIGRATE_MOVABLE)
+ alike_pages = pageblock_nr_pages
+ - (free_pages + movable_pages);
+ else
+ alike_pages = 0;
+ }
+
+ /* moving whole block can fail due to zone boundary conditions */
+ if (!free_pages)
+ goto single_page;
+
+ /*
+ * If a sufficient number of pages in the block are either free or of
+ * comparable migratability as our allocation, claim the whole block.
+ */
+ if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
+ page_group_by_mobility_disabled)
+ set_pageblock_migratetype(page, start_type);
+
+ return;
+
+single_page:
+ move_to_free_list(page, zone, current_order, start_type);
+}
+
+/*
+ * Check whether there is a suitable fallback freepage with requested order.
+ * If only_stealable is true, this function returns fallback_mt only if
+ * we can steal other freepages all together. This would help to reduce
+ * fragmentation due to mixed migratetype pages in one pageblock.
+ */
+int find_suitable_fallback(struct free_area *area, unsigned int order,
+ int migratetype, bool only_stealable, bool *can_steal)
+{
+ int i;
+ int fallback_mt;
+
+ if (area->nr_free == 0)
+ return -1;
+
+ *can_steal = false;
+ for (i = 0;; i++) {
+ fallback_mt = fallbacks[migratetype][i];
+ if (fallback_mt == MIGRATE_TYPES)
+ break;
+
+ if (free_area_empty(area, fallback_mt))
+ continue;
+
+ if (can_steal_fallback(order, migratetype))
+ *can_steal = true;
+
+ if (!only_stealable)
+ return fallback_mt;
+
+ if (*can_steal)
+ return fallback_mt;
+ }
+
+ return -1;
+}
+
+/*
+ * Reserve a pageblock for exclusive use of high-order atomic allocations if
+ * there are no empty page blocks that contain a page with a suitable order
+ */
+static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
+ unsigned int alloc_order)
+{
+ int mt;
+ unsigned long max_managed, flags;
+
+ /*
+ * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
+ * Check is race-prone but harmless.
+ */
+ max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages;
+ if (zone->nr_reserved_highatomic >= max_managed)
+ return;
+
+ spin_lock_irqsave(&zone->lock, flags);
+
+ /* Recheck the nr_reserved_highatomic limit under the lock */
+ if (zone->nr_reserved_highatomic >= max_managed)
+ goto out_unlock;
+
+ /* Yoink! */
+ mt = get_pageblock_migratetype(page);
+ if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt)
+ && !is_migrate_cma(mt)) {
+ zone->nr_reserved_highatomic += pageblock_nr_pages;
+ set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
+ move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
+ }
+
+out_unlock:
+ spin_unlock_irqrestore(&zone->lock, flags);
+}
+
+/*
+ * Used when an allocation is about to fail under memory pressure. This
+ * potentially hurts the reliability of high-order allocations when under
+ * intense memory pressure but failed atomic allocations should be easier
+ * to recover from than an OOM.
+ *
+ * If @force is true, try to unreserve a pageblock even though highatomic
+ * pageblock is exhausted.
+ */
+static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
+ bool force)
+{
+ struct zonelist *zonelist = ac->zonelist;
+ unsigned long flags;
+ struct zoneref *z;
+ struct zone *zone;
+ struct page *page;
+ int order;
+ bool ret;
+
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
+ ac->nodemask) {
+ /*
+ * Preserve at least one pageblock unless memory pressure
+ * is really high.
+ */
+ if (!force && zone->nr_reserved_highatomic <=
+ pageblock_nr_pages)
+ continue;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ for (order = 0; order < MAX_ORDER; order++) {
+ struct free_area *area = &(zone->free_area[order]);
+
+ page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
+ if (!page)
+ continue;
+
+ /*
+ * In page freeing path, migratetype change is racy so
+ * we can counter several free pages in a pageblock
+ * in this loop althoug we changed the pageblock type
+ * from highatomic to ac->migratetype. So we should
+ * adjust the count once.
+ */
+ if (is_migrate_highatomic_page(page)) {
+ /*
+ * It should never happen but changes to
+ * locking could inadvertently allow a per-cpu
+ * drain to add pages to MIGRATE_HIGHATOMIC
+ * while unreserving so be safe and watch for
+ * underflows.
+ */
+ zone->nr_reserved_highatomic -= min(
+ pageblock_nr_pages,
+ zone->nr_reserved_highatomic);
+ }
+
+ /*
+ * Convert to ac->migratetype and avoid the normal
+ * pageblock stealing heuristics. Minimally, the caller
+ * is doing the work and needs the pages. More
+ * importantly, if the block was always converted to
+ * MIGRATE_UNMOVABLE or another type then the number
+ * of pageblocks that cannot be completely freed
+ * may increase.
+ */
+ set_pageblock_migratetype(page, ac->migratetype);
+ ret = move_freepages_block(zone, page, ac->migratetype,
+ NULL);
+ if (ret) {
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return ret;
+ }
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+ }
+
+ return false;
+}
+
+/*
+ * Try finding a free buddy page on the fallback list and put it on the free
+ * list of requested migratetype, possibly along with other pages from the same
+ * block, depending on fragmentation avoidance heuristics. Returns true if
+ * fallback was found so that __rmqueue_smallest() can grab it.
+ *
+ * The use of signed ints for order and current_order is a deliberate
+ * deviation from the rest of this file, to make the for loop
+ * condition simpler.
+ */
+static __always_inline bool
+__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
+ unsigned int alloc_flags)
+{
+ struct free_area *area;
+ int current_order;
+ int min_order = order;
+ struct page *page;
+ int fallback_mt;
+ bool can_steal;
+
+ /*
+ * Do not steal pages from freelists belonging to other pageblocks
+ * i.e. orders < pageblock_order. If there are no local zones free,
+ * the zonelists will be reiterated without ALLOC_NOFRAGMENT.
+ */
+ if (alloc_flags & ALLOC_NOFRAGMENT)
+ min_order = pageblock_order;
+
+ /*
+ * Find the largest available free page in the other list. This roughly
+ * approximates finding the pageblock with the most free pages, which
+ * would be too costly to do exactly.
+ */
+ for (current_order = MAX_ORDER - 1; current_order >= min_order;
+ --current_order) {
+ area = &(zone->free_area[current_order]);
+ fallback_mt = find_suitable_fallback(area, current_order,
+ start_migratetype, false, &can_steal);
+ if (fallback_mt == -1)
+ continue;
+
+ /*
+ * We cannot steal all free pages from the pageblock and the
+ * requested migratetype is movable. In that case it's better to
+ * steal and split the smallest available page instead of the
+ * largest available page, because even if the next movable
+ * allocation falls back into a different pageblock than this
+ * one, it won't cause permanent fragmentation.
+ */
+ if (!can_steal && start_migratetype == MIGRATE_MOVABLE
+ && current_order > order)
+ goto find_smallest;
+
+ goto do_steal;
+ }
+
+ return false;
+
+find_smallest:
+ for (current_order = order; current_order < MAX_ORDER;
+ current_order++) {
+ area = &(zone->free_area[current_order]);
+ fallback_mt = find_suitable_fallback(area, current_order,
+ start_migratetype, false, &can_steal);
+ if (fallback_mt != -1)
+ break;
+ }
+
+ /*
+ * This should not happen - we already found a suitable fallback
+ * when looking for the largest page.
+ */
+ VM_BUG_ON(current_order == MAX_ORDER);
+
+do_steal:
+ page = get_page_from_free_area(area, fallback_mt);
+
+ steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
+ can_steal);
+
+ trace_mm_page_alloc_extfrag(page, order, current_order,
+ start_migratetype, fallback_mt);
+
+ return true;
+
+}
+
+/*
+ * Do the hard work of removing an element from the buddy allocator.
+ * Call me with the zone->lock already held.
+ */
+static __always_inline struct page *
+__rmqueue(struct zone *zone, unsigned int order, int migratetype,
+ unsigned int alloc_flags)
+{
+ struct page *page;
+
+ if (IS_ENABLED(CONFIG_CMA)) {
+ /*
+ * Balance movable allocations between regular and CMA areas by
+ * allocating from CMA when over half of the zone's free memory
+ * is in the CMA area.
+ */
+ if (alloc_flags & ALLOC_CMA &&
+ zone_page_state(zone, NR_FREE_CMA_PAGES) >
+ zone_page_state(zone, NR_FREE_PAGES) / 2) {
+ page = __rmqueue_cma_fallback(zone, order);
+ if (page)
+ goto out;
+ }
+ }
+retry:
+ page = __rmqueue_smallest(zone, order, migratetype);
+ if (unlikely(!page)) {
+ if (alloc_flags & ALLOC_CMA)
+ page = __rmqueue_cma_fallback(zone, order);
+
+ if (!page && __rmqueue_fallback(zone, order, migratetype,
+ alloc_flags))
+ goto retry;
+ }
+out:
+ if (page)
+ trace_mm_page_alloc_zone_locked(page, order, migratetype);
+ return page;
+}
+
+/*
+ * Obtain a specified number of elements from the buddy allocator, all under
+ * a single hold of the lock, for efficiency. Add them to the supplied list.
+ * Returns the number of new pages which were placed at *list.
+ */
+static int rmqueue_bulk(struct zone *zone, unsigned int order,
+ unsigned long count, struct list_head *list,
+ int migratetype, unsigned int alloc_flags)
+{
+ int i, alloced = 0;
+
+ spin_lock(&zone->lock);
+ for (i = 0; i < count; ++i) {
+ struct page *page = __rmqueue(zone, order, migratetype,
+ alloc_flags);
+ if (unlikely(page == NULL))
+ break;
+
+ if (unlikely(check_pcp_refill(page)))
+ continue;
+
+ /*
+ * Split buddy pages returned by expand() are received here in
+ * physical page order. The page is added to the tail of
+ * caller's list. From the callers perspective, the linked list
+ * is ordered by page number under some conditions. This is
+ * useful for IO devices that can forward direction from the
+ * head, thus also in the physical page order. This is useful
+ * for IO devices that can merge IO requests if the physical
+ * pages are ordered properly.
+ */
+ list_add_tail(&page->lru, list);
+ alloced++;
+ if (is_migrate_cma(get_pcppage_migratetype(page)))
+ __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
+ -(1 << order));
+ }
+
+ /*
+ * i pages were removed from the buddy list even if some leak due
+ * to check_pcp_refill failing so adjust NR_FREE_PAGES based
+ * on i. Do not confuse with 'alloced' which is the number of
+ * pages added to the pcp list.
+ */
+ __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
+ spin_unlock(&zone->lock);
+ return alloced;
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * Called from the vmstat counter updater to drain pagesets of this
+ * currently executing processor on remote nodes after they have
+ * expired.
+ *
+ * Note that this function must be called with the thread pinned to
+ * a single processor.
+ */
+void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
+{
+ unsigned long flags;
+ int to_drain, batch;
+
+ local_irq_save(flags);
+ batch = READ_ONCE(pcp->batch);
+ to_drain = min(pcp->count, batch);
+ if (to_drain > 0)
+ free_pcppages_bulk(zone, to_drain, pcp);
+ local_irq_restore(flags);
+}
+#endif
+
+/*
+ * Drain pcplists of the indicated processor and zone.
+ *
+ * The processor must either be the current processor and the
+ * thread pinned to the current processor or a processor that
+ * is not online.
+ */
+static void drain_pages_zone(unsigned int cpu, struct zone *zone)
+{
+ unsigned long flags;
+ struct per_cpu_pageset *pset;
+ struct per_cpu_pages *pcp;
+
+ local_irq_save(flags);
+ pset = per_cpu_ptr(zone->pageset, cpu);
+
+ pcp = &pset->pcp;
+ if (pcp->count)
+ free_pcppages_bulk(zone, pcp->count, pcp);
+ local_irq_restore(flags);
+}
+
+/*
+ * Drain pcplists of all zones on the indicated processor.
+ *
+ * The processor must either be the current processor and the
+ * thread pinned to the current processor or a processor that
+ * is not online.
+ */
+static void drain_pages(unsigned int cpu)
+{
+ struct zone *zone;
+
+ for_each_populated_zone(zone) {
+ drain_pages_zone(cpu, zone);
+ }
+}
+
+/*
+ * Spill all of this CPU's per-cpu pages back into the buddy allocator.
+ *
+ * The CPU has to be pinned. When zone parameter is non-NULL, spill just
+ * the single zone's pages.
+ */
+void drain_local_pages(struct zone *zone)
+{
+ int cpu = smp_processor_id();
+
+ if (zone)
+ drain_pages_zone(cpu, zone);
+ else
+ drain_pages(cpu);
+}
+
+static void drain_local_pages_wq(struct work_struct *work)
+{
+ struct pcpu_drain *drain;
+
+ drain = container_of(work, struct pcpu_drain, work);
+
+ /*
+ * drain_all_pages doesn't use proper cpu hotplug protection so
+ * we can race with cpu offline when the WQ can move this from
+ * a cpu pinned worker to an unbound one. We can operate on a different
+ * cpu which is allright but we also have to make sure to not move to
+ * a different one.
+ */
+ preempt_disable();
+ drain_local_pages(drain->zone);
+ preempt_enable();
+}
+
+/*
+ * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
+ *
+ * When zone parameter is non-NULL, spill just the single zone's pages.
+ *
+ * Note that this can be extremely slow as the draining happens in a workqueue.
+ */
+void drain_all_pages(struct zone *zone)
+{
+ int cpu;
+
+ /*
+ * Allocate in the BSS so we wont require allocation in
+ * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
+ */
+ static cpumask_t cpus_with_pcps;
+
+ /*
+ * Make sure nobody triggers this path before mm_percpu_wq is fully
+ * initialized.
+ */
+ if (WARN_ON_ONCE(!mm_percpu_wq))
+ return;
+
+ /*
+ * Do not drain if one is already in progress unless it's specific to
+ * a zone. Such callers are primarily CMA and memory hotplug and need
+ * the drain to be complete when the call returns.
+ */
+ if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) {
+ if (!zone)
+ return;
+ mutex_lock(&pcpu_drain_mutex);
+ }
+
+ /*
+ * We don't care about racing with CPU hotplug event
+ * as offline notification will cause the notified
+ * cpu to drain that CPU pcps and on_each_cpu_mask
+ * disables preemption as part of its processing
+ */
+ for_each_online_cpu(cpu) {
+ struct per_cpu_pageset *pcp;
+ struct zone *z;
+ bool has_pcps = false;
+
+ if (zone) {
+ pcp = per_cpu_ptr(zone->pageset, cpu);
+ if (pcp->pcp.count)
+ has_pcps = true;
+ } else {
+ for_each_populated_zone(z) {
+ pcp = per_cpu_ptr(z->pageset, cpu);
+ if (pcp->pcp.count) {
+ has_pcps = true;
+ break;
+ }
+ }
+ }
+
+ if (has_pcps)
+ cpumask_set_cpu(cpu, &cpus_with_pcps);
+ else
+ cpumask_clear_cpu(cpu, &cpus_with_pcps);
+ }
+
+ for_each_cpu(cpu, &cpus_with_pcps) {
+ struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu);
+
+ drain->zone = zone;
+ INIT_WORK(&drain->work, drain_local_pages_wq);
+ queue_work_on(cpu, mm_percpu_wq, &drain->work);
+ }
+ for_each_cpu(cpu, &cpus_with_pcps)
+ flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work);
+
+ mutex_unlock(&pcpu_drain_mutex);
+}
+
+#ifdef CONFIG_HIBERNATION
+
+/*
+ * Touch the watchdog for every WD_PAGE_COUNT pages.
+ */
+#define WD_PAGE_COUNT (128*1024)
+
+void mark_free_pages(struct zone *zone)
+{
+ unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT;
+ unsigned long flags;
+ unsigned int order, t;
+ struct page *page;
+
+ if (zone_is_empty(zone))
+ return;
+
+ spin_lock_irqsave(&zone->lock, flags);
+
+ max_zone_pfn = zone_end_pfn(zone);
+ for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+ if (pfn_valid(pfn)) {
+ page = pfn_to_page(pfn);
+
+ if (!--page_count) {
+ touch_nmi_watchdog();
+ page_count = WD_PAGE_COUNT;
+ }
+
+ if (page_zone(page) != zone)
+ continue;
+
+ if (!swsusp_page_is_forbidden(page))
+ swsusp_unset_page_free(page);
+ }
+
+ for_each_migratetype_order(order, t) {
+ list_for_each_entry(page,
+ &zone->free_area[order].free_list[t], lru) {
+ unsigned long i;
+
+ pfn = page_to_pfn(page);
+ for (i = 0; i < (1UL << order); i++) {
+ if (!--page_count) {
+ touch_nmi_watchdog();
+ page_count = WD_PAGE_COUNT;
+ }
+ swsusp_set_page_free(pfn_to_page(pfn + i));
+ }
+ }
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+}
+#endif /* CONFIG_PM */
+
+static bool free_unref_page_prepare(struct page *page, unsigned long pfn)
+{
+ int migratetype;
+
+ if (!free_pcp_prepare(page))
+ return false;
+
+ migratetype = get_pfnblock_migratetype(page, pfn);
+ set_pcppage_migratetype(page, migratetype);
+ return true;
+}
+
+static void free_unref_page_commit(struct page *page, unsigned long pfn)
+{
+ struct zone *zone = page_zone(page);
+ struct per_cpu_pages *pcp;
+ int migratetype;
+
+ migratetype = get_pcppage_migratetype(page);
+ __count_vm_event(PGFREE);
+
+ /*
+ * We only track unmovable, reclaimable and movable on pcp lists.
+ * Free ISOLATE pages back to the allocator because they are being
+ * offlined but treat HIGHATOMIC as movable pages so we can get those
+ * areas back if necessary. Otherwise, we may have to free
+ * excessively into the page allocator
+ */
+ if (migratetype >= MIGRATE_PCPTYPES) {
+ if (unlikely(is_migrate_isolate(migratetype))) {
+ free_one_page(zone, page, pfn, 0, migratetype,
+ FPI_NONE);
+ return;
+ }
+ migratetype = MIGRATE_MOVABLE;
+ }
+
+ pcp = &this_cpu_ptr(zone->pageset)->pcp;
+ list_add(&page->lru, &pcp->lists[migratetype]);
+ pcp->count++;
+ if (pcp->count >= pcp->high) {
+ unsigned long batch = READ_ONCE(pcp->batch);
+ free_pcppages_bulk(zone, batch, pcp);
+ }
+}
+
+/*
+ * Free a 0-order page
+ */
+void free_unref_page(struct page *page)
+{
+ unsigned long flags;
+ unsigned long pfn = page_to_pfn(page);
+
+ if (!free_unref_page_prepare(page, pfn))
+ return;
+
+ local_irq_save(flags);
+ free_unref_page_commit(page, pfn);
+ local_irq_restore(flags);
+}
+
+/*
+ * Free a list of 0-order pages
+ */
+void free_unref_page_list(struct list_head *list)
+{
+ struct page *page, *next;
+ unsigned long flags, pfn;
+ int batch_count = 0;
+
+ /* Prepare pages for freeing */
+ list_for_each_entry_safe(page, next, list, lru) {
+ pfn = page_to_pfn(page);
+ if (!free_unref_page_prepare(page, pfn))
+ list_del(&page->lru);
+ set_page_private(page, pfn);
+ }
+
+ local_irq_save(flags);
+ list_for_each_entry_safe(page, next, list, lru) {
+ unsigned long pfn = page_private(page);
+
+ set_page_private(page, 0);
+ trace_mm_page_free_batched(page);
+ free_unref_page_commit(page, pfn);
+
+ /*
+ * Guard against excessive IRQ disabled times when we get
+ * a large list of pages to free.
+ */
+ if (++batch_count == SWAP_CLUSTER_MAX) {
+ local_irq_restore(flags);
+ batch_count = 0;
+ local_irq_save(flags);
+ }
+ }
+ local_irq_restore(flags);
+}
+
+/*
+ * split_page takes a non-compound higher-order page, and splits it into
+ * n (1<<order) sub-pages: page[0..n]
+ * Each sub-page must be freed individually.
+ *
+ * Note: this is probably too low level an operation for use in drivers.
+ * Please consult with lkml before using this in your driver.
+ */
+void split_page(struct page *page, unsigned int order)
+{
+ int i;
+
+ VM_BUG_ON_PAGE(PageCompound(page), page);
+ VM_BUG_ON_PAGE(!page_count(page), page);
+
+ for (i = 1; i < (1 << order); i++)
+ set_page_refcounted(page + i);
+ split_page_owner(page, 1 << order);
+ split_page_memcg(page, 1 << order);
+}
+EXPORT_SYMBOL_GPL(split_page);
+
+int __isolate_free_page(struct page *page, unsigned int order)
+{
+ unsigned long watermark;
+ struct zone *zone;
+ int mt;
+
+ BUG_ON(!PageBuddy(page));
+
+ zone = page_zone(page);
+ mt = get_pageblock_migratetype(page);
+
+ if (!is_migrate_isolate(mt)) {
+ /*
+ * Obey watermarks as if the page was being allocated. We can
+ * emulate a high-order watermark check with a raised order-0
+ * watermark, because we already know our high-order page
+ * exists.
+ */
+ watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
+ if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
+ return 0;
+
+ __mod_zone_freepage_state(zone, -(1UL << order), mt);
+ }
+
+ /* Remove page from free list */
+
+ del_page_from_free_list(page, zone, order);
+
+ /*
+ * Set the pageblock if the isolated page is at least half of a
+ * pageblock
+ */
+ if (order >= pageblock_order - 1) {
+ struct page *endpage = page + (1 << order) - 1;
+ for (; page < endpage; page += pageblock_nr_pages) {
+ int mt = get_pageblock_migratetype(page);
+ if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)
+ && !is_migrate_highatomic(mt))
+ set_pageblock_migratetype(page,
+ MIGRATE_MOVABLE);
+ }
+ }
+
+
+ return 1UL << order;
+}
+
+/**
+ * __putback_isolated_page - Return a now-isolated page back where we got it
+ * @page: Page that was isolated
+ * @order: Order of the isolated page
+ * @mt: The page's pageblock's migratetype
+ *
+ * This function is meant to return a page pulled from the free lists via
+ * __isolate_free_page back to the free lists they were pulled from.
+ */
+void __putback_isolated_page(struct page *page, unsigned int order, int mt)
+{
+ struct zone *zone = page_zone(page);
+
+ /* zone lock should be held when this function is called */
+ lockdep_assert_held(&zone->lock);
+
+ /* Return isolated page to tail of freelist. */
+ __free_one_page(page, page_to_pfn(page), zone, order, mt,
+ FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL);
+}
+
+/*
+ * Update NUMA hit/miss statistics
+ *
+ * Must be called with interrupts disabled.
+ */
+static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
+{
+#ifdef CONFIG_NUMA
+ enum numa_stat_item local_stat = NUMA_LOCAL;
+
+ /* skip numa counters update if numa stats is disabled */
+ if (!static_branch_likely(&vm_numa_stat_key))
+ return;
+
+ if (zone_to_nid(z) != numa_node_id())
+ local_stat = NUMA_OTHER;
+
+ if (zone_to_nid(z) == zone_to_nid(preferred_zone))
+ __inc_numa_state(z, NUMA_HIT);
+ else {
+ __inc_numa_state(z, NUMA_MISS);
+ __inc_numa_state(preferred_zone, NUMA_FOREIGN);
+ }
+ __inc_numa_state(z, local_stat);
+#endif
+}
+
+/* Remove page from the per-cpu list, caller must protect the list */
+static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
+ unsigned int alloc_flags,
+ struct per_cpu_pages *pcp,
+ struct list_head *list)
+{
+ struct page *page;
+
+ do {
+ if (list_empty(list)) {
+ pcp->count += rmqueue_bulk(zone, 0,
+ pcp->batch, list,
+ migratetype, alloc_flags);
+ if (unlikely(list_empty(list)))
+ return NULL;
+ }
+
+ page = list_first_entry(list, struct page, lru);
+ list_del(&page->lru);
+ pcp->count--;
+ } while (check_new_pcp(page));
+
+ return page;
+}
+
+/* Lock and remove page from the per-cpu list */
+static struct page *rmqueue_pcplist(struct zone *preferred_zone,
+ struct zone *zone, gfp_t gfp_flags,
+ int migratetype, unsigned int alloc_flags)
+{
+ struct per_cpu_pages *pcp;
+ struct list_head *list;
+ struct page *page;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ pcp = &this_cpu_ptr(zone->pageset)->pcp;
+ list = &pcp->lists[migratetype];
+ page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list);
+ if (page) {
+ __count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
+ zone_statistics(preferred_zone, zone);
+ }
+ local_irq_restore(flags);
+ return page;
+}
+
+/*
+ * Allocate a page from the given zone. Use pcplists for order-0 allocations.
+ */
+static inline
+struct page *rmqueue(struct zone *preferred_zone,
+ struct zone *zone, unsigned int order,
+ gfp_t gfp_flags, unsigned int alloc_flags,
+ int migratetype)
+{
+ unsigned long flags;
+ struct page *page;
+
+ if (likely(order == 0)) {
+ /*
+ * MIGRATE_MOVABLE pcplist could have the pages on CMA area and
+ * we need to skip it when CMA area isn't allowed.
+ */
+ if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA ||
+ migratetype != MIGRATE_MOVABLE) {
+ page = rmqueue_pcplist(preferred_zone, zone, gfp_flags,
+ migratetype, alloc_flags);
+ goto out;
+ }
+ }
+
+ /*
+ * We most definitely don't want callers attempting to
+ * allocate greater than order-1 page units with __GFP_NOFAIL.
+ */
+ WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
+ spin_lock_irqsave(&zone->lock, flags);
+
+ do {
+ page = NULL;
+ /*
+ * order-0 request can reach here when the pcplist is skipped
+ * due to non-CMA allocation context. HIGHATOMIC area is
+ * reserved for high-order atomic allocation, so order-0
+ * request should skip it.
+ */
+ if (order > 0 && alloc_flags & ALLOC_HARDER) {
+ page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
+ if (page)
+ trace_mm_page_alloc_zone_locked(page, order, migratetype);
+ }
+ if (!page)
+ page = __rmqueue(zone, order, migratetype, alloc_flags);
+ } while (page && check_new_pages(page, order));
+ spin_unlock(&zone->lock);
+ if (!page)
+ goto failed;
+ __mod_zone_freepage_state(zone, -(1 << order),
+ get_pcppage_migratetype(page));
+
+ __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
+ zone_statistics(preferred_zone, zone);
+ local_irq_restore(flags);
+
+out:
+ /* Separate test+clear to avoid unnecessary atomics */
+ if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
+ clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
+ wakeup_kswapd(zone, 0, 0, zone_idx(zone));
+ }
+
+ VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
+ return page;
+
+failed:
+ local_irq_restore(flags);
+ return NULL;
+}
+
+#ifdef CONFIG_FAIL_PAGE_ALLOC
+
+static struct {
+ struct fault_attr attr;
+
+ bool ignore_gfp_highmem;
+ bool ignore_gfp_reclaim;
+ u32 min_order;
+} fail_page_alloc = {
+ .attr = FAULT_ATTR_INITIALIZER,
+ .ignore_gfp_reclaim = true,
+ .ignore_gfp_highmem = true,
+ .min_order = 1,
+};
+
+static int __init setup_fail_page_alloc(char *str)
+{
+ return setup_fault_attr(&fail_page_alloc.attr, str);
+}
+__setup("fail_page_alloc=", setup_fail_page_alloc);
+
+static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+{
+ if (order < fail_page_alloc.min_order)
+ return false;
+ if (gfp_mask & __GFP_NOFAIL)
+ return false;
+ if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
+ return false;
+ if (fail_page_alloc.ignore_gfp_reclaim &&
+ (gfp_mask & __GFP_DIRECT_RECLAIM))
+ return false;
+
+ return should_fail(&fail_page_alloc.attr, 1 << order);
+}
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+static int __init fail_page_alloc_debugfs(void)
+{
+ umode_t mode = S_IFREG | 0600;
+ struct dentry *dir;
+
+ dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
+ &fail_page_alloc.attr);
+
+ debugfs_create_bool("ignore-gfp-wait", mode, dir,
+ &fail_page_alloc.ignore_gfp_reclaim);
+ debugfs_create_bool("ignore-gfp-highmem", mode, dir,
+ &fail_page_alloc.ignore_gfp_highmem);
+ debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order);
+
+ return 0;
+}
+
+late_initcall(fail_page_alloc_debugfs);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+#else /* CONFIG_FAIL_PAGE_ALLOC */
+
+static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+{
+ return false;
+}
+
+#endif /* CONFIG_FAIL_PAGE_ALLOC */
+
+noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+{
+ return __should_fail_alloc_page(gfp_mask, order);
+}
+ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
+
+static inline long __zone_watermark_unusable_free(struct zone *z,
+ unsigned int order, unsigned int alloc_flags)
+{
+ const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
+ long unusable_free = (1 << order) - 1;
+
+ /*
+ * If the caller does not have rights to ALLOC_HARDER then subtract
+ * the high-atomic reserves. This will over-estimate the size of the
+ * atomic reserve but it avoids a search.
+ */
+ if (likely(!alloc_harder))
+ unusable_free += z->nr_reserved_highatomic;
+
+#ifdef CONFIG_CMA
+ /* If allocation can't use CMA areas don't use free CMA pages */
+ if (!(alloc_flags & ALLOC_CMA))
+ unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
+#endif
+
+ return unusable_free;
+}
+
+/*
+ * Return true if free base pages are above 'mark'. For high-order checks it
+ * will return true of the order-0 watermark is reached and there is at least
+ * one free page of a suitable size. Checking now avoids taking the zone lock
+ * to check in the allocation paths if no pages are free.
+ */
+bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
+ int highest_zoneidx, unsigned int alloc_flags,
+ long free_pages)
+{
+ long min = mark;
+ int o;
+ const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
+
+ /* free_pages may go negative - that's OK */
+ free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);
+
+ if (alloc_flags & ALLOC_HIGH)
+ min -= min / 2;
+
+ if (unlikely(alloc_harder)) {
+ /*
+ * OOM victims can try even harder than normal ALLOC_HARDER
+ * users on the grounds that it's definitely going to be in
+ * the exit path shortly and free memory. Any allocation it
+ * makes during the free path will be small and short-lived.
+ */
+ if (alloc_flags & ALLOC_OOM)
+ min -= min / 2;
+ else
+ min -= min / 4;
+ }
+
+ /*
+ * Check watermarks for an order-0 allocation request. If these
+ * are not met, then a high-order request also cannot go ahead
+ * even if a suitable page happened to be free.
+ */
+ if (free_pages <= min + z->lowmem_reserve[highest_zoneidx])
+ return false;
+
+ /* If this is an order-0 request then the watermark is fine */
+ if (!order)
+ return true;
+
+ /* For a high-order request, check at least one suitable page is free */
+ for (o = order; o < MAX_ORDER; o++) {
+ struct free_area *area = &z->free_area[o];
+ int mt;
+
+ if (!area->nr_free)
+ continue;
+
+ for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
+ if (!free_area_empty(area, mt))
+ return true;
+ }
+
+#ifdef CONFIG_CMA
+ if ((alloc_flags & ALLOC_CMA) &&
+ !free_area_empty(area, MIGRATE_CMA)) {
+ return true;
+ }
+#endif
+ if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC))
+ return true;
+ }
+ return false;
+}
+
+bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
+ int highest_zoneidx, unsigned int alloc_flags)
+{
+ return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
+ zone_page_state(z, NR_FREE_PAGES));
+}
+
+static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
+ unsigned long mark, int highest_zoneidx,
+ unsigned int alloc_flags, gfp_t gfp_mask)
+{
+ long free_pages;
+
+ free_pages = zone_page_state(z, NR_FREE_PAGES);
+
+ /*
+ * Fast check for order-0 only. If this fails then the reserves
+ * need to be calculated.
+ */
+ if (!order) {
+ long usable_free;
+ long reserved;
+
+ usable_free = free_pages;
+ reserved = __zone_watermark_unusable_free(z, 0, alloc_flags);
+
+ /* reserved may over estimate high-atomic reserves. */
+ usable_free -= min(usable_free, reserved);
+ if (usable_free > mark + z->lowmem_reserve[highest_zoneidx])
+ return true;
+ }
+
+ if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
+ free_pages))
+ return true;
+ /*
+ * Ignore watermark boosting for GFP_ATOMIC order-0 allocations
+ * when checking the min watermark. The min watermark is the
+ * point where boosting is ignored so that kswapd is woken up
+ * when below the low watermark.
+ */
+ if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost
+ && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) {
+ mark = z->_watermark[WMARK_MIN];
+ return __zone_watermark_ok(z, order, mark, highest_zoneidx,
+ alloc_flags, free_pages);
+ }
+
+ return false;
+}
+
+bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
+ unsigned long mark, int highest_zoneidx)
+{
+ long free_pages = zone_page_state(z, NR_FREE_PAGES);
+
+ if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
+ free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
+
+ return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
+ free_pages);
+}
+
+#ifdef CONFIG_NUMA
+static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
+{
+ return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
+ node_reclaim_distance;
+}
+#else /* CONFIG_NUMA */
+static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
+{
+ return true;
+}
+#endif /* CONFIG_NUMA */
+
+/*
+ * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid
+ * fragmentation is subtle. If the preferred zone was HIGHMEM then
+ * premature use of a lower zone may cause lowmem pressure problems that
+ * are worse than fragmentation. If the next zone is ZONE_DMA then it is
+ * probably too small. It only makes sense to spread allocations to avoid
+ * fragmentation between the Normal and DMA32 zones.
+ */
+static inline unsigned int
+alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
+{
+ unsigned int alloc_flags;
+
+ /*
+ * __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
+ * to save a branch.
+ */
+ alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM);
+
+#ifdef CONFIG_ZONE_DMA32
+ if (!zone)
+ return alloc_flags;
+
+ if (zone_idx(zone) != ZONE_NORMAL)
+ return alloc_flags;
+
+ /*
+ * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
+ * the pointer is within zone->zone_pgdat->node_zones[]. Also assume
+ * on UMA that if Normal is populated then so is DMA32.
+ */
+ BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1);
+ if (nr_online_nodes > 1 && !populated_zone(--zone))
+ return alloc_flags;
+
+ alloc_flags |= ALLOC_NOFRAGMENT;
+#endif /* CONFIG_ZONE_DMA32 */
+ return alloc_flags;
+}
+
+static inline unsigned int current_alloc_flags(gfp_t gfp_mask,
+ unsigned int alloc_flags)
+{
+#ifdef CONFIG_CMA
+ unsigned int pflags = current->flags;
+
+ if (!(pflags & PF_MEMALLOC_NOCMA) &&
+ gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+ alloc_flags |= ALLOC_CMA;
+
+#endif
+ return alloc_flags;
+}
+
+/*
+ * get_page_from_freelist goes through the zonelist trying to allocate
+ * a page.
+ */
+static struct page *
+get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
+ const struct alloc_context *ac)
+{
+ struct zoneref *z;
+ struct zone *zone;
+ struct pglist_data *last_pgdat_dirty_limit = NULL;
+ bool no_fallback;
+
+retry:
+ /*
+ * Scan zonelist, looking for a zone with enough free.
+ * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
+ */
+ no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
+ z = ac->preferred_zoneref;
+ for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx,
+ ac->nodemask) {
+ struct page *page;
+ unsigned long mark;
+
+ if (cpusets_enabled() &&
+ (alloc_flags & ALLOC_CPUSET) &&
+ !__cpuset_zone_allowed(zone, gfp_mask))
+ continue;
+ /*
+ * When allocating a page cache page for writing, we
+ * want to get it from a node that is within its dirty
+ * limit, such that no single node holds more than its
+ * proportional share of globally allowed dirty pages.
+ * The dirty limits take into account the node's
+ * lowmem reserves and high watermark so that kswapd
+ * should be able to balance it without having to
+ * write pages from its LRU list.
+ *
+ * XXX: For now, allow allocations to potentially
+ * exceed the per-node dirty limit in the slowpath
+ * (spread_dirty_pages unset) before going into reclaim,
+ * which is important when on a NUMA setup the allowed
+ * nodes are together not big enough to reach the
+ * global limit. The proper fix for these situations
+ * will require awareness of nodes in the
+ * dirty-throttling and the flusher threads.
+ */
+ if (ac->spread_dirty_pages) {
+ if (last_pgdat_dirty_limit == zone->zone_pgdat)
+ continue;
+
+ if (!node_dirty_ok(zone->zone_pgdat)) {
+ last_pgdat_dirty_limit = zone->zone_pgdat;
+ continue;
+ }
+ }
+
+ if (no_fallback && nr_online_nodes > 1 &&
+ zone != ac->preferred_zoneref->zone) {
+ int local_nid;
+
+ /*
+ * If moving to a remote node, retry but allow
+ * fragmenting fallbacks. Locality is more important
+ * than fragmentation avoidance.
+ */
+ local_nid = zone_to_nid(ac->preferred_zoneref->zone);
+ if (zone_to_nid(zone) != local_nid) {
+ alloc_flags &= ~ALLOC_NOFRAGMENT;
+ goto retry;
+ }
+ }
+
+ mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
+ if (!zone_watermark_fast(zone, order, mark,
+ ac->highest_zoneidx, alloc_flags,
+ gfp_mask)) {
+ int ret;
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+ /*
+ * Watermark failed for this zone, but see if we can
+ * grow this zone if it contains deferred pages.
+ */
+ if (static_branch_unlikely(&deferred_pages)) {
+ if (_deferred_grow_zone(zone, order))
+ goto try_this_zone;
+ }
+#endif
+ /* Checked here to keep the fast path fast */
+ BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
+ if (alloc_flags & ALLOC_NO_WATERMARKS)
+ goto try_this_zone;
+
+ if (node_reclaim_mode == 0 ||
+ !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
+ continue;
+
+ ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
+ switch (ret) {
+ case NODE_RECLAIM_NOSCAN:
+ /* did not scan */
+ continue;
+ case NODE_RECLAIM_FULL:
+ /* scanned but unreclaimable */
+ continue;
+ default:
+ /* did we reclaim enough */
+ if (zone_watermark_ok(zone, order, mark,
+ ac->highest_zoneidx, alloc_flags))
+ goto try_this_zone;
+
+ continue;
+ }
+ }
+
+try_this_zone:
+ page = rmqueue(ac->preferred_zoneref->zone, zone, order,
+ gfp_mask, alloc_flags, ac->migratetype);
+ if (page) {
+ prep_new_page(page, order, gfp_mask, alloc_flags);
+
+ /*
+ * If this is a high-order atomic allocation then check
+ * if the pageblock should be reserved for the future
+ */
+ if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
+ reserve_highatomic_pageblock(page, zone, order);
+
+ return page;
+ } else {
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+ /* Try again if zone has deferred pages */
+ if (static_branch_unlikely(&deferred_pages)) {
+ if (_deferred_grow_zone(zone, order))
+ goto try_this_zone;
+ }
+#endif
+ }
+ }
+
+ /*
+ * It's possible on a UMA machine to get through all zones that are
+ * fragmented. If avoiding fragmentation, reset and try again.
+ */
+ if (no_fallback) {
+ alloc_flags &= ~ALLOC_NOFRAGMENT;
+ goto retry;
+ }
+
+ return NULL;
+}
+
+static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
+{
+ unsigned int filter = SHOW_MEM_FILTER_NODES;
+
+ /*
+ * This documents exceptions given to allocations in certain
+ * contexts that are allowed to allocate outside current's set
+ * of allowed nodes.
+ */
+ if (!(gfp_mask & __GFP_NOMEMALLOC))
+ if (tsk_is_oom_victim(current) ||
+ (current->flags & (PF_MEMALLOC | PF_EXITING)))
+ filter &= ~SHOW_MEM_FILTER_NODES;
+ if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
+ filter &= ~SHOW_MEM_FILTER_NODES;
+
+ show_mem(filter, nodemask);
+}
+
+void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+ static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1);
+
+ if ((gfp_mask & __GFP_NOWARN) ||
+ !__ratelimit(&nopage_rs) ||
+ ((gfp_mask & __GFP_DMA) && !has_managed_dma()))
+ return;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl",
+ current->comm, &vaf, gfp_mask, &gfp_mask,
+ nodemask_pr_args(nodemask));
+ va_end(args);
+
+ cpuset_print_current_mems_allowed();
+ pr_cont("\n");
+ dump_stack();
+ warn_alloc_show_mem(gfp_mask, nodemask);
+}
+
+static inline struct page *
+__alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
+ unsigned int alloc_flags,
+ const struct alloc_context *ac)
+{
+ struct page *page;
+
+ page = get_page_from_freelist(gfp_mask, order,
+ alloc_flags|ALLOC_CPUSET, ac);
+ /*
+ * fallback to ignore cpuset restriction if our nodes
+ * are depleted
+ */
+ if (!page)
+ page = get_page_from_freelist(gfp_mask, order,
+ alloc_flags, ac);
+
+ return page;
+}
+
+static inline struct page *
+__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
+ const struct alloc_context *ac, unsigned long *did_some_progress)
+{
+ struct oom_control oc = {
+ .zonelist = ac->zonelist,
+ .nodemask = ac->nodemask,
+ .memcg = NULL,
+ .gfp_mask = gfp_mask,
+ .order = order,
+ };
+ struct page *page;
+
+ *did_some_progress = 0;
+
+ /*
+ * Acquire the oom lock. If that fails, somebody else is
+ * making progress for us.
+ */
+ if (!mutex_trylock(&oom_lock)) {
+ *did_some_progress = 1;
+ schedule_timeout_uninterruptible(1);
+ return NULL;
+ }
+
+ /*
+ * Go through the zonelist yet one more time, keep very high watermark
+ * here, this is only to catch a parallel oom killing, we must fail if
+ * we're still under heavy pressure. But make sure that this reclaim
+ * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
+ * allocation which will never fail due to oom_lock already held.
+ */
+ page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
+ ~__GFP_DIRECT_RECLAIM, order,
+ ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
+ if (page)
+ goto out;
+
+ /* Coredumps can quickly deplete all memory reserves */
+ if (current->flags & PF_DUMPCORE)
+ goto out;
+ /* The OOM killer will not help higher order allocs */
+ if (order > PAGE_ALLOC_COSTLY_ORDER)
+ goto out;
+ /*
+ * We have already exhausted all our reclaim opportunities without any
+ * success so it is time to admit defeat. We will skip the OOM killer
+ * because it is very likely that the caller has a more reasonable
+ * fallback than shooting a random task.
+ *
+ * The OOM killer may not free memory on a specific node.
+ */
+ if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE))
+ goto out;
+ /* The OOM killer does not needlessly kill tasks for lowmem */
+ if (ac->highest_zoneidx < ZONE_NORMAL)
+ goto out;
+ if (pm_suspended_storage())
+ goto out;
+ /*
+ * XXX: GFP_NOFS allocations should rather fail than rely on
+ * other request to make a forward progress.
+ * We are in an unfortunate situation where out_of_memory cannot
+ * do much for this context but let's try it to at least get
+ * access to memory reserved if the current task is killed (see
+ * out_of_memory). Once filesystems are ready to handle allocation
+ * failures more gracefully we should just bail out here.
+ */
+
+ /* Exhausted what can be done so it's blame time */
+ if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
+ *did_some_progress = 1;
+
+ /*
+ * Help non-failing allocations by giving them access to memory
+ * reserves
+ */
+ if (gfp_mask & __GFP_NOFAIL)
+ page = __alloc_pages_cpuset_fallback(gfp_mask, order,
+ ALLOC_NO_WATERMARKS, ac);
+ }
+out:
+ mutex_unlock(&oom_lock);
+ return page;
+}
+
+/*
+ * Maximum number of compaction retries wit a progress before OOM
+ * killer is consider as the only way to move forward.
+ */
+#define MAX_COMPACT_RETRIES 16
+
+#ifdef CONFIG_COMPACTION
+/* Try memory compaction for high-order allocations before reclaim */
+static struct page *
+__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+ unsigned int alloc_flags, const struct alloc_context *ac,
+ enum compact_priority prio, enum compact_result *compact_result)
+{
+ struct page *page = NULL;
+ unsigned long pflags;
+ unsigned int noreclaim_flag;
+
+ if (!order)
+ return NULL;
+
+ psi_memstall_enter(&pflags);
+ noreclaim_flag = memalloc_noreclaim_save();
+
+ *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
+ prio, &page);
+
+ memalloc_noreclaim_restore(noreclaim_flag);
+ psi_memstall_leave(&pflags);
+
+ /*
+ * At least in one zone compaction wasn't deferred or skipped, so let's
+ * count a compaction stall
+ */
+ count_vm_event(COMPACTSTALL);
+
+ /* Prep a captured page if available */
+ if (page)
+ prep_new_page(page, order, gfp_mask, alloc_flags);
+
+ /* Try get a page from the freelist if available */
+ if (!page)
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
+
+ if (page) {
+ struct zone *zone = page_zone(page);
+
+ zone->compact_blockskip_flush = false;
+ compaction_defer_reset(zone, order, true);
+ count_vm_event(COMPACTSUCCESS);
+ return page;
+ }
+
+ /*
+ * It's bad if compaction run occurs and fails. The most likely reason
+ * is that pages exist, but not enough to satisfy watermarks.
+ */
+ count_vm_event(COMPACTFAIL);
+
+ cond_resched();
+
+ return NULL;
+}
+
+static inline bool
+should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
+ enum compact_result compact_result,
+ enum compact_priority *compact_priority,
+ int *compaction_retries)
+{
+ int max_retries = MAX_COMPACT_RETRIES;
+ int min_priority;
+ bool ret = false;
+ int retries = *compaction_retries;
+ enum compact_priority priority = *compact_priority;
+
+ if (!order)
+ return false;
+
+ if (compaction_made_progress(compact_result))
+ (*compaction_retries)++;
+
+ /*
+ * compaction considers all the zone as desperately out of memory
+ * so it doesn't really make much sense to retry except when the
+ * failure could be caused by insufficient priority
+ */
+ if (compaction_failed(compact_result))
+ goto check_priority;
+
+ /*
+ * compaction was skipped because there are not enough order-0 pages
+ * to work with, so we retry only if it looks like reclaim can help.
+ */
+ if (compaction_needs_reclaim(compact_result)) {
+ ret = compaction_zonelist_suitable(ac, order, alloc_flags);
+ goto out;
+ }
+
+ /*
+ * make sure the compaction wasn't deferred or didn't bail out early
+ * due to locks contention before we declare that we should give up.
+ * But the next retry should use a higher priority if allowed, so
+ * we don't just keep bailing out endlessly.
+ */
+ if (compaction_withdrawn(compact_result)) {
+ goto check_priority;
+ }
+
+ /*
+ * !costly requests are much more important than __GFP_RETRY_MAYFAIL
+ * costly ones because they are de facto nofail and invoke OOM
+ * killer to move on while costly can fail and users are ready
+ * to cope with that. 1/4 retries is rather arbitrary but we
+ * would need much more detailed feedback from compaction to
+ * make a better decision.
+ */
+ if (order > PAGE_ALLOC_COSTLY_ORDER)
+ max_retries /= 4;
+ if (*compaction_retries <= max_retries) {
+ ret = true;
+ goto out;
+ }
+
+ /*
+ * Make sure there are attempts at the highest priority if we exhausted
+ * all retries or failed at the lower priorities.
+ */
+check_priority:
+ min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
+ MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
+
+ if (*compact_priority > min_priority) {
+ (*compact_priority)--;
+ *compaction_retries = 0;
+ ret = true;
+ }
+out:
+ trace_compact_retry(order, priority, compact_result, retries, max_retries, ret);
+ return ret;
+}
+#else
+static inline struct page *
+__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+ unsigned int alloc_flags, const struct alloc_context *ac,
+ enum compact_priority prio, enum compact_result *compact_result)
+{
+ *compact_result = COMPACT_SKIPPED;
+ return NULL;
+}
+
+static inline bool
+should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
+ enum compact_result compact_result,
+ enum compact_priority *compact_priority,
+ int *compaction_retries)
+{
+ struct zone *zone;
+ struct zoneref *z;
+
+ if (!order || order > PAGE_ALLOC_COSTLY_ORDER)
+ return false;
+
+ /*
+ * There are setups with compaction disabled which would prefer to loop
+ * inside the allocator rather than hit the oom killer prematurely.
+ * Let's give them a good hope and keep retrying while the order-0
+ * watermarks are OK.
+ */
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
+ ac->highest_zoneidx, ac->nodemask) {
+ if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
+ ac->highest_zoneidx, alloc_flags))
+ return true;
+ }
+ return false;
+}
+#endif /* CONFIG_COMPACTION */
+
+#ifdef CONFIG_LOCKDEP
+static struct lockdep_map __fs_reclaim_map =
+ STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
+
+static bool __need_fs_reclaim(gfp_t gfp_mask)
+{
+ gfp_mask = current_gfp_context(gfp_mask);
+
+ /* no reclaim without waiting on it */
+ if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
+ return false;
+
+ /* this guy won't enter reclaim */
+ if (current->flags & PF_MEMALLOC)
+ return false;
+
+ /* We're only interested __GFP_FS allocations for now */
+ if (!(gfp_mask & __GFP_FS))
+ return false;
+
+ if (gfp_mask & __GFP_NOLOCKDEP)
+ return false;
+
+ return true;
+}
+
+void __fs_reclaim_acquire(void)
+{
+ lock_map_acquire(&__fs_reclaim_map);
+}
+
+void __fs_reclaim_release(void)
+{
+ lock_map_release(&__fs_reclaim_map);
+}
+
+void fs_reclaim_acquire(gfp_t gfp_mask)
+{
+ if (__need_fs_reclaim(gfp_mask))
+ __fs_reclaim_acquire();
+}
+EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
+
+void fs_reclaim_release(gfp_t gfp_mask)
+{
+ if (__need_fs_reclaim(gfp_mask))
+ __fs_reclaim_release();
+}
+EXPORT_SYMBOL_GPL(fs_reclaim_release);
+#endif
+
+/*
+ * Zonelists may change due to hotplug during allocation. Detect when zonelists
+ * have been rebuilt so allocation retries. Reader side does not lock and
+ * retries the allocation if zonelist changes. Writer side is protected by the
+ * embedded spin_lock.
+ */
+static DEFINE_SEQLOCK(zonelist_update_seq);
+
+static unsigned int zonelist_iter_begin(void)
+{
+ if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
+ return read_seqbegin(&zonelist_update_seq);
+
+ return 0;
+}
+
+static unsigned int check_retry_zonelist(unsigned int seq)
+{
+ if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
+ return read_seqretry(&zonelist_update_seq, seq);
+
+ return seq;
+}
+
+/* Perform direct synchronous page reclaim */
+static unsigned long
+__perform_reclaim(gfp_t gfp_mask, unsigned int order,
+ const struct alloc_context *ac)
+{
+ unsigned int noreclaim_flag;
+ unsigned long pflags, progress;
+
+ cond_resched();
+
+ /* We now go into synchronous reclaim */
+ cpuset_memory_pressure_bump();
+ psi_memstall_enter(&pflags);
+ fs_reclaim_acquire(gfp_mask);
+ noreclaim_flag = memalloc_noreclaim_save();
+
+ progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
+ ac->nodemask);
+
+ memalloc_noreclaim_restore(noreclaim_flag);
+ fs_reclaim_release(gfp_mask);
+ psi_memstall_leave(&pflags);
+
+ cond_resched();
+
+ return progress;
+}
+
+/* The really slow allocator path where we enter direct reclaim */
+static inline struct page *
+__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
+ unsigned int alloc_flags, const struct alloc_context *ac,
+ unsigned long *did_some_progress)
+{
+ struct page *page = NULL;
+ bool drained = false;
+
+ *did_some_progress = __perform_reclaim(gfp_mask, order, ac);
+ if (unlikely(!(*did_some_progress)))
+ return NULL;
+
+retry:
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
+
+ /*
+ * If an allocation failed after direct reclaim, it could be because
+ * pages are pinned on the per-cpu lists or in high alloc reserves.
+ * Shrink them and try again
+ */
+ if (!page && !drained) {
+ unreserve_highatomic_pageblock(ac, false);
+ drain_all_pages(NULL);
+ drained = true;
+ goto retry;
+ }
+
+ return page;
+}
+
+static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
+ const struct alloc_context *ac)
+{
+ struct zoneref *z;
+ struct zone *zone;
+ pg_data_t *last_pgdat = NULL;
+ enum zone_type highest_zoneidx = ac->highest_zoneidx;
+
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
+ ac->nodemask) {
+ if (last_pgdat != zone->zone_pgdat)
+ wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
+ last_pgdat = zone->zone_pgdat;
+ }
+}
+
+static inline unsigned int
+gfp_to_alloc_flags(gfp_t gfp_mask)
+{
+ unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
+
+ /*
+ * __GFP_HIGH is assumed to be the same as ALLOC_HIGH
+ * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
+ * to save two branches.
+ */
+ BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
+ BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD);
+
+ /*
+ * The caller may dip into page reserves a bit more if the caller
+ * cannot run direct reclaim, or if the caller has realtime scheduling
+ * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
+ * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
+ */
+ alloc_flags |= (__force int)
+ (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));
+
+ if (gfp_mask & __GFP_ATOMIC) {
+ /*
+ * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
+ * if it can't schedule.
+ */
+ if (!(gfp_mask & __GFP_NOMEMALLOC))
+ alloc_flags |= ALLOC_HARDER;
+ /*
+ * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
+ * comment for __cpuset_node_allowed().
+ */
+ alloc_flags &= ~ALLOC_CPUSET;
+ } else if (unlikely(rt_task(current)) && !in_interrupt())
+ alloc_flags |= ALLOC_HARDER;
+
+ alloc_flags = current_alloc_flags(gfp_mask, alloc_flags);
+
+ return alloc_flags;
+}
+
+static bool oom_reserves_allowed(struct task_struct *tsk)
+{
+ if (!tsk_is_oom_victim(tsk))
+ return false;
+
+ /*
+ * !MMU doesn't have oom reaper so give access to memory reserves
+ * only to the thread with TIF_MEMDIE set
+ */
+ if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
+ return false;
+
+ return true;
+}
+
+/*
+ * Distinguish requests which really need access to full memory
+ * reserves from oom victims which can live with a portion of it
+ */
+static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
+{
+ if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
+ return 0;
+ if (gfp_mask & __GFP_MEMALLOC)
+ return ALLOC_NO_WATERMARKS;
+ if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
+ return ALLOC_NO_WATERMARKS;
+ if (!in_interrupt()) {
+ if (current->flags & PF_MEMALLOC)
+ return ALLOC_NO_WATERMARKS;
+ else if (oom_reserves_allowed(current))
+ return ALLOC_OOM;
+ }
+
+ return 0;
+}
+
+bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
+{
+ return !!__gfp_pfmemalloc_flags(gfp_mask);
+}
+
+/*
+ * Checks whether it makes sense to retry the reclaim to make a forward progress
+ * for the given allocation request.
+ *
+ * We give up when we either have tried MAX_RECLAIM_RETRIES in a row
+ * without success, or when we couldn't even meet the watermark if we
+ * reclaimed all remaining pages on the LRU lists.
+ *
+ * Returns true if a retry is viable or false to enter the oom path.
+ */
+static inline bool
+should_reclaim_retry(gfp_t gfp_mask, unsigned order,
+ struct alloc_context *ac, int alloc_flags,
+ bool did_some_progress, int *no_progress_loops)
+{
+ struct zone *zone;
+ struct zoneref *z;
+ bool ret = false;
+
+ /*
+ * Costly allocations might have made a progress but this doesn't mean
+ * their order will become available due to high fragmentation so
+ * always increment the no progress counter for them
+ */
+ if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
+ *no_progress_loops = 0;
+ else
+ (*no_progress_loops)++;
+
+ /*
+ * Make sure we converge to OOM if we cannot make any progress
+ * several times in the row.
+ */
+ if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
+ /* Before OOM, exhaust highatomic_reserve */
+ return unreserve_highatomic_pageblock(ac, true);
+ }
+
+ /*
+ * Keep reclaiming pages while there is a chance this will lead
+ * somewhere. If none of the target zones can satisfy our allocation
+ * request even if all reclaimable pages are considered then we are
+ * screwed and have to go OOM.
+ */
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
+ ac->highest_zoneidx, ac->nodemask) {
+ unsigned long available;
+ unsigned long reclaimable;
+ unsigned long min_wmark = min_wmark_pages(zone);
+ bool wmark;
+
+ available = reclaimable = zone_reclaimable_pages(zone);
+ available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
+
+ /*
+ * Would the allocation succeed if we reclaimed all
+ * reclaimable pages?
+ */
+ wmark = __zone_watermark_ok(zone, order, min_wmark,
+ ac->highest_zoneidx, alloc_flags, available);
+ trace_reclaim_retry_zone(z, order, reclaimable,
+ available, min_wmark, *no_progress_loops, wmark);
+ if (wmark) {
+ /*
+ * If we didn't make any progress and have a lot of
+ * dirty + writeback pages then we should wait for
+ * an IO to complete to slow down the reclaim and
+ * prevent from pre mature OOM
+ */
+ if (!did_some_progress) {
+ unsigned long write_pending;
+
+ write_pending = zone_page_state_snapshot(zone,
+ NR_ZONE_WRITE_PENDING);
+
+ if (2 * write_pending > reclaimable) {
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
+ return true;
+ }
+ }
+
+ ret = true;
+ goto out;
+ }
+ }
+
+out:
+ /*
+ * Memory allocation/reclaim might be called from a WQ context and the
+ * current implementation of the WQ concurrency control doesn't
+ * recognize that a particular WQ is congested if the worker thread is
+ * looping without ever sleeping. Therefore we have to do a short sleep
+ * here rather than calling cond_resched().
+ */
+ if (current->flags & PF_WQ_WORKER)
+ schedule_timeout_uninterruptible(1);
+ else
+ cond_resched();
+ return ret;
+}
+
+static inline bool
+check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
+{
+ /*
+ * It's possible that cpuset's mems_allowed and the nodemask from
+ * mempolicy don't intersect. This should be normally dealt with by
+ * policy_nodemask(), but it's possible to race with cpuset update in
+ * such a way the check therein was true, and then it became false
+ * before we got our cpuset_mems_cookie here.
+ * This assumes that for all allocations, ac->nodemask can come only
+ * from MPOL_BIND mempolicy (whose documented semantics is to be ignored
+ * when it does not intersect with the cpuset restrictions) or the
+ * caller can deal with a violated nodemask.
+ */
+ if (cpusets_enabled() && ac->nodemask &&
+ !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) {
+ ac->nodemask = NULL;
+ return true;
+ }
+
+ /*
+ * When updating a task's mems_allowed or mempolicy nodemask, it is
+ * possible to race with parallel threads in such a way that our
+ * allocation can fail while the mask is being updated. If we are about
+ * to fail, check if the cpuset changed during allocation and if so,
+ * retry.
+ */
+ if (read_mems_allowed_retry(cpuset_mems_cookie))
+ return true;
+
+ return false;
+}
+
+static inline struct page *
+__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
+ struct alloc_context *ac)
+{
+ bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
+ const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
+ struct page *page = NULL;
+ unsigned int alloc_flags;
+ unsigned long did_some_progress;
+ enum compact_priority compact_priority;
+ enum compact_result compact_result;
+ int compaction_retries;
+ int no_progress_loops;
+ unsigned int cpuset_mems_cookie;
+ unsigned int zonelist_iter_cookie;
+ int reserve_flags;
+
+ /*
+ * We also sanity check to catch abuse of atomic reserves being used by
+ * callers that are not in atomic context.
+ */
+ if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
+ (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
+ gfp_mask &= ~__GFP_ATOMIC;
+
+restart:
+ compaction_retries = 0;
+ no_progress_loops = 0;
+ compact_priority = DEF_COMPACT_PRIORITY;
+ cpuset_mems_cookie = read_mems_allowed_begin();
+ zonelist_iter_cookie = zonelist_iter_begin();
+
+ /*
+ * The fast path uses conservative alloc_flags to succeed only until
+ * kswapd needs to be woken up, and to avoid the cost of setting up
+ * alloc_flags precisely. So we do that now.
+ */
+ alloc_flags = gfp_to_alloc_flags(gfp_mask);
+
+ /*
+ * We need to recalculate the starting point for the zonelist iterator
+ * because we might have used different nodemask in the fast path, or
+ * there was a cpuset modification and we are retrying - otherwise we
+ * could end up iterating over non-eligible zones endlessly.
+ */
+ ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
+ ac->highest_zoneidx, ac->nodemask);
+ if (!ac->preferred_zoneref->zone)
+ goto nopage;
+
+ if (alloc_flags & ALLOC_KSWAPD)
+ wake_all_kswapds(order, gfp_mask, ac);
+
+ /*
+ * The adjusted alloc_flags might result in immediate success, so try
+ * that first
+ */
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
+ if (page)
+ goto got_pg;
+
+ /*
+ * For costly allocations, try direct compaction first, as it's likely
+ * that we have enough base pages and don't need to reclaim. For non-
+ * movable high-order allocations, do that as well, as compaction will
+ * try prevent permanent fragmentation by migrating from blocks of the
+ * same migratetype.
+ * Don't try this for allocations that are allowed to ignore
+ * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
+ */
+ if (can_direct_reclaim &&
+ (costly_order ||
+ (order > 0 && ac->migratetype != MIGRATE_MOVABLE))
+ && !gfp_pfmemalloc_allowed(gfp_mask)) {
+ page = __alloc_pages_direct_compact(gfp_mask, order,
+ alloc_flags, ac,
+ INIT_COMPACT_PRIORITY,
+ &compact_result);
+ if (page)
+ goto got_pg;
+
+ /*
+ * Checks for costly allocations with __GFP_NORETRY, which
+ * includes some THP page fault allocations
+ */
+ if (costly_order && (gfp_mask & __GFP_NORETRY)) {
+ /*
+ * If allocating entire pageblock(s) and compaction
+ * failed because all zones are below low watermarks
+ * or is prohibited because it recently failed at this
+ * order, fail immediately unless the allocator has
+ * requested compaction and reclaim retry.
+ *
+ * Reclaim is
+ * - potentially very expensive because zones are far
+ * below their low watermarks or this is part of very
+ * bursty high order allocations,
+ * - not guaranteed to help because isolate_freepages()
+ * may not iterate over freed pages as part of its
+ * linear scan, and
+ * - unlikely to make entire pageblocks free on its
+ * own.
+ */
+ if (compact_result == COMPACT_SKIPPED ||
+ compact_result == COMPACT_DEFERRED)
+ goto nopage;
+
+ /*
+ * Looks like reclaim/compaction is worth trying, but
+ * sync compaction could be very expensive, so keep
+ * using async compaction.
+ */
+ compact_priority = INIT_COMPACT_PRIORITY;
+ }
+ }
+
+retry:
+ /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
+ if (alloc_flags & ALLOC_KSWAPD)
+ wake_all_kswapds(order, gfp_mask, ac);
+
+ reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
+ if (reserve_flags)
+ alloc_flags = current_alloc_flags(gfp_mask, reserve_flags);
+
+ /*
+ * Reset the nodemask and zonelist iterators if memory policies can be
+ * ignored. These allocations are high priority and system rather than
+ * user oriented.
+ */
+ if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
+ ac->nodemask = NULL;
+ ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
+ ac->highest_zoneidx, ac->nodemask);
+ }
+
+ /* Attempt with potentially adjusted zonelist and alloc_flags */
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
+ if (page)
+ goto got_pg;
+
+ /* Caller is not willing to reclaim, we can't balance anything */
+ if (!can_direct_reclaim)
+ goto nopage;
+
+ /* Avoid recursion of direct reclaim */
+ if (current->flags & PF_MEMALLOC)
+ goto nopage;
+
+ /* Try direct reclaim and then allocating */
+ page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
+ &did_some_progress);
+ if (page)
+ goto got_pg;
+
+ /* Try direct compaction and then allocating */
+ page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
+ compact_priority, &compact_result);
+ if (page)
+ goto got_pg;
+
+ /* Do not loop if specifically requested */
+ if (gfp_mask & __GFP_NORETRY)
+ goto nopage;
+
+ /*
+ * Do not retry costly high order allocations unless they are
+ * __GFP_RETRY_MAYFAIL
+ */
+ if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
+ goto nopage;
+
+ if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
+ did_some_progress > 0, &no_progress_loops))
+ goto retry;
+
+ /*
+ * It doesn't make any sense to retry for the compaction if the order-0
+ * reclaim is not able to make any progress because the current
+ * implementation of the compaction depends on the sufficient amount
+ * of free memory (see __compaction_suitable)
+ */
+ if (did_some_progress > 0 &&
+ should_compact_retry(ac, order, alloc_flags,
+ compact_result, &compact_priority,
+ &compaction_retries))
+ goto retry;
+
+
+ /*
+ * Deal with possible cpuset update races or zonelist updates to avoid
+ * a unnecessary OOM kill.
+ */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
+ check_retry_zonelist(zonelist_iter_cookie))
+ goto restart;
+
+ /* Reclaim has failed us, start killing things */
+ page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
+ if (page)
+ goto got_pg;
+
+ /* Avoid allocations with no watermarks from looping endlessly */
+ if (tsk_is_oom_victim(current) &&
+ (alloc_flags & ALLOC_OOM ||
+ (gfp_mask & __GFP_NOMEMALLOC)))
+ goto nopage;
+
+ /* Retry as long as the OOM killer is making progress */
+ if (did_some_progress) {
+ no_progress_loops = 0;
+ goto retry;
+ }
+
+nopage:
+ /*
+ * Deal with possible cpuset update races or zonelist updates to avoid
+ * a unnecessary OOM kill.
+ */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
+ check_retry_zonelist(zonelist_iter_cookie))
+ goto restart;
+
+ /*
+ * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
+ * we always retry
+ */
+ if (gfp_mask & __GFP_NOFAIL) {
+ /*
+ * All existing users of the __GFP_NOFAIL are blockable, so warn
+ * of any new users that actually require GFP_NOWAIT
+ */
+ if (WARN_ON_ONCE(!can_direct_reclaim))
+ goto fail;
+
+ /*
+ * PF_MEMALLOC request from this context is rather bizarre
+ * because we cannot reclaim anything and only can loop waiting
+ * for somebody to do a work for us
+ */
+ WARN_ON_ONCE(current->flags & PF_MEMALLOC);
+
+ /*
+ * non failing costly orders are a hard requirement which we
+ * are not prepared for much so let's warn about these users
+ * so that we can identify them and convert them to something
+ * else.
+ */
+ WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
+
+ /*
+ * Help non-failing allocations by giving them access to memory
+ * reserves but do not use ALLOC_NO_WATERMARKS because this
+ * could deplete whole memory reserves which would just make
+ * the situation worse
+ */
+ page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
+ if (page)
+ goto got_pg;
+
+ cond_resched();
+ goto retry;
+ }
+fail:
+ warn_alloc(gfp_mask, ac->nodemask,
+ "page allocation failure: order:%u", order);
+got_pg:
+ return page;
+}
+
+static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
+ int preferred_nid, nodemask_t *nodemask,
+ struct alloc_context *ac, gfp_t *alloc_mask,
+ unsigned int *alloc_flags)
+{
+ ac->highest_zoneidx = gfp_zone(gfp_mask);
+ ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
+ ac->nodemask = nodemask;
+ ac->migratetype = gfp_migratetype(gfp_mask);
+
+ if (cpusets_enabled()) {
+ *alloc_mask |= __GFP_HARDWALL;
+ /*
+ * When we are in the interrupt context, it is irrelevant
+ * to the current task context. It means that any node ok.
+ */
+ if (!in_interrupt() && !ac->nodemask)
+ ac->nodemask = &cpuset_current_mems_allowed;
+ else
+ *alloc_flags |= ALLOC_CPUSET;
+ }
+
+ fs_reclaim_acquire(gfp_mask);
+ fs_reclaim_release(gfp_mask);
+
+ might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
+
+ if (should_fail_alloc_page(gfp_mask, order))
+ return false;
+
+ *alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags);
+
+ /* Dirty zone balancing only done in the fast path */
+ ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
+
+ /*
+ * The preferred zone is used for statistics but crucially it is
+ * also used as the starting point for the zonelist iterator. It
+ * may get reset for allocations that ignore memory policies.
+ */
+ ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
+ ac->highest_zoneidx, ac->nodemask);
+
+ return true;
+}
+
+/*
+ * This is the 'heart' of the zoned buddy allocator.
+ */
+struct page *
+__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
+ nodemask_t *nodemask)
+{
+ struct page *page;
+ unsigned int alloc_flags = ALLOC_WMARK_LOW;
+ gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
+ struct alloc_context ac = { };
+
+ /*
+ * There are several places where we assume that the order value is sane
+ * so bail out early if the request is out of bound.
+ */
+ if (unlikely(order >= MAX_ORDER)) {
+ WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
+ return NULL;
+ }
+
+ gfp_mask &= gfp_allowed_mask;
+ alloc_mask = gfp_mask;
+ if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
+ return NULL;
+
+ /*
+ * Forbid the first pass from falling back to types that fragment
+ * memory until all local zones are considered.
+ */
+ alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);
+
+ /* First allocation attempt */
+ page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
+ if (likely(page))
+ goto out;
+
+ /*
+ * Apply scoped allocation constraints. This is mainly about GFP_NOFS
+ * resp. GFP_NOIO which has to be inherited for all allocation requests
+ * from a particular context which has been marked by
+ * memalloc_no{fs,io}_{save,restore}.
+ */
+ alloc_mask = current_gfp_context(gfp_mask);
+ ac.spread_dirty_pages = false;
+
+ /*
+ * Restore the original nodemask if it was potentially replaced with
+ * &cpuset_current_mems_allowed to optimize the fast-path attempt.
+ */
+ ac.nodemask = nodemask;
+
+ page = __alloc_pages_slowpath(alloc_mask, order, &ac);
+
+out:
+ if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
+ unlikely(__memcg_kmem_charge_page(page, gfp_mask, order) != 0)) {
+ __free_pages(page, order);
+ page = NULL;
+ }
+
+ trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
+
+ return page;
+}
+EXPORT_SYMBOL(__alloc_pages_nodemask);
+
+/*
+ * Common helper functions. Never use with __GFP_HIGHMEM because the returned
+ * address cannot represent highmem pages. Use alloc_pages and then kmap if
+ * you need to access high mem.
+ */
+unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
+{
+ struct page *page;
+
+ page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order);
+ if (!page)
+ return 0;
+ return (unsigned long) page_address(page);
+}
+EXPORT_SYMBOL(__get_free_pages);
+
+unsigned long get_zeroed_page(gfp_t gfp_mask)
+{
+ return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
+}
+EXPORT_SYMBOL(get_zeroed_page);
+
+static inline void free_the_page(struct page *page, unsigned int order)
+{
+ if (order == 0) /* Via pcp? */
+ free_unref_page(page);
+ else
+ __free_pages_ok(page, order, FPI_NONE);
+}
+
+void __free_pages(struct page *page, unsigned int order)
+{
+ /* get PageHead before we drop reference */
+ int head = PageHead(page);
+
+ if (put_page_testzero(page))
+ free_the_page(page, order);
+ else if (!head)
+ while (order-- > 0)
+ free_the_page(page + (1 << order), order);
+}
+EXPORT_SYMBOL(__free_pages);
+
+void free_pages(unsigned long addr, unsigned int order)
+{
+ if (addr != 0) {
+ VM_BUG_ON(!virt_addr_valid((void *)addr));
+ __free_pages(virt_to_page((void *)addr), order);
+ }
+}
+
+EXPORT_SYMBOL(free_pages);
+
+/*
+ * Page Fragment:
+ * An arbitrary-length arbitrary-offset area of memory which resides
+ * within a 0 or higher order page. Multiple fragments within that page
+ * are individually refcounted, in the page's reference counter.
+ *
+ * The page_frag functions below provide a simple allocation framework for
+ * page fragments. This is used by the network stack and network device
+ * drivers to provide a backing region of memory for use as either an
+ * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
+ */
+static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
+ gfp_t gfp_mask)
+{
+ struct page *page = NULL;
+ gfp_t gfp = gfp_mask;
+
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+ gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
+ __GFP_NOMEMALLOC;
+ page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
+ PAGE_FRAG_CACHE_MAX_ORDER);
+ nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
+#endif
+ if (unlikely(!page))
+ page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
+
+ nc->va = page ? page_address(page) : NULL;
+
+ return page;
+}
+
+void __page_frag_cache_drain(struct page *page, unsigned int count)
+{
+ VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
+
+ if (page_ref_sub_and_test(page, count))
+ free_the_page(page, compound_order(page));
+}
+EXPORT_SYMBOL(__page_frag_cache_drain);
+
+void *page_frag_alloc(struct page_frag_cache *nc,
+ unsigned int fragsz, gfp_t gfp_mask)
+{
+ unsigned int size = PAGE_SIZE;
+ struct page *page;
+ int offset;
+
+ if (unlikely(!nc->va)) {
+refill:
+ page = __page_frag_cache_refill(nc, gfp_mask);
+ if (!page)
+ return NULL;
+
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+ /* if size can vary use size else just use PAGE_SIZE */
+ size = nc->size;
+#endif
+ /* Even if we own the page, we do not use atomic_set().
+ * This would break get_page_unless_zero() users.
+ */
+ page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
+
+ /* reset page count bias and offset to start of new frag */
+ nc->pfmemalloc = page_is_pfmemalloc(page);
+ nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
+ nc->offset = size;
+ }
+
+ offset = nc->offset - fragsz;
+ if (unlikely(offset < 0)) {
+ page = virt_to_page(nc->va);
+
+ if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
+ goto refill;
+
+ if (unlikely(nc->pfmemalloc)) {
+ free_the_page(page, compound_order(page));
+ goto refill;
+ }
+
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+ /* if size can vary use size else just use PAGE_SIZE */
+ size = nc->size;
+#endif
+ /* OK, page count is 0, we can safely set it */
+ set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
+
+ /* reset page count bias and offset to start of new frag */
+ nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
+ offset = size - fragsz;
+ if (unlikely(offset < 0)) {
+ /*
+ * The caller is trying to allocate a fragment
+ * with fragsz > PAGE_SIZE but the cache isn't big
+ * enough to satisfy the request, this may
+ * happen in low memory conditions.
+ * We don't release the cache page because
+ * it could make memory pressure worse
+ * so we simply return NULL here.
+ */
+ return NULL;
+ }
+ }
+
+ nc->pagecnt_bias--;
+ nc->offset = offset;
+
+ return nc->va + offset;
+}
+EXPORT_SYMBOL(page_frag_alloc);
+
+/*
+ * Frees a page fragment allocated out of either a compound or order 0 page.
+ */
+void page_frag_free(void *addr)
+{
+ struct page *page = virt_to_head_page(addr);
+
+ if (unlikely(put_page_testzero(page)))
+ free_the_page(page, compound_order(page));
+}
+EXPORT_SYMBOL(page_frag_free);
+
+static void *make_alloc_exact(unsigned long addr, unsigned int order,
+ size_t size)
+{
+ if (addr) {
+ unsigned long alloc_end = addr + (PAGE_SIZE << order);
+ unsigned long used = addr + PAGE_ALIGN(size);
+
+ split_page(virt_to_page((void *)addr), order);
+ while (used < alloc_end) {
+ free_page(used);
+ used += PAGE_SIZE;
+ }
+ }
+ return (void *)addr;
+}
+
+/**
+ * alloc_pages_exact - allocate an exact number physically-contiguous pages.
+ * @size: the number of bytes to allocate
+ * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
+ *
+ * This function is similar to alloc_pages(), except that it allocates the
+ * minimum number of pages to satisfy the request. alloc_pages() can only
+ * allocate memory in power-of-two pages.
+ *
+ * This function is also limited by MAX_ORDER.
+ *
+ * Memory allocated by this function must be released by free_pages_exact().
+ *
+ * Return: pointer to the allocated area or %NULL in case of error.
+ */
+void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
+{
+ unsigned int order = get_order(size);
+ unsigned long addr;
+
+ if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
+ gfp_mask &= ~__GFP_COMP;
+
+ addr = __get_free_pages(gfp_mask, order);
+ return make_alloc_exact(addr, order, size);
+}
+EXPORT_SYMBOL(alloc_pages_exact);
+
+/**
+ * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
+ * pages on a node.
+ * @nid: the preferred node ID where memory should be allocated
+ * @size: the number of bytes to allocate
+ * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
+ *
+ * Like alloc_pages_exact(), but try to allocate on node nid first before falling
+ * back.
+ *
+ * Return: pointer to the allocated area or %NULL in case of error.
+ */
+void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
+{
+ unsigned int order = get_order(size);
+ struct page *p;
+
+ if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
+ gfp_mask &= ~__GFP_COMP;
+
+ p = alloc_pages_node(nid, gfp_mask, order);
+ if (!p)
+ return NULL;
+ return make_alloc_exact((unsigned long)page_address(p), order, size);
+}
+
+/**
+ * free_pages_exact - release memory allocated via alloc_pages_exact()
+ * @virt: the value returned by alloc_pages_exact.
+ * @size: size of allocation, same value as passed to alloc_pages_exact().
+ *
+ * Release the memory allocated by a previous call to alloc_pages_exact.
+ */
+void free_pages_exact(void *virt, size_t size)
+{
+ unsigned long addr = (unsigned long)virt;
+ unsigned long end = addr + PAGE_ALIGN(size);
+
+ while (addr < end) {
+ free_page(addr);
+ addr += PAGE_SIZE;
+ }
+}
+EXPORT_SYMBOL(free_pages_exact);
+
+/**
+ * nr_free_zone_pages - count number of pages beyond high watermark
+ * @offset: The zone index of the highest zone
+ *
+ * nr_free_zone_pages() counts the number of pages which are beyond the
+ * high watermark within all zones at or below a given zone index. For each
+ * zone, the number of pages is calculated as:
+ *
+ * nr_free_zone_pages = managed_pages - high_pages
+ *
+ * Return: number of pages beyond high watermark.
+ */
+static unsigned long nr_free_zone_pages(int offset)
+{
+ struct zoneref *z;
+ struct zone *zone;
+
+ /* Just pick one node, since fallback list is circular */
+ unsigned long sum = 0;
+
+ struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
+
+ for_each_zone_zonelist(zone, z, zonelist, offset) {
+ unsigned long size = zone_managed_pages(zone);
+ unsigned long high = high_wmark_pages(zone);
+ if (size > high)
+ sum += size - high;
+ }
+
+ return sum;
+}
+
+/**
+ * nr_free_buffer_pages - count number of pages beyond high watermark
+ *
+ * nr_free_buffer_pages() counts the number of pages which are beyond the high
+ * watermark within ZONE_DMA and ZONE_NORMAL.
+ *
+ * Return: number of pages beyond high watermark within ZONE_DMA and
+ * ZONE_NORMAL.
+ */
+unsigned long nr_free_buffer_pages(void)
+{
+ return nr_free_zone_pages(gfp_zone(GFP_USER));
+}
+EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
+
+static inline void show_node(struct zone *zone)
+{
+ if (IS_ENABLED(CONFIG_NUMA))
+ printk("Node %d ", zone_to_nid(zone));
+}
+
+long si_mem_available(void)
+{
+ long available;
+ unsigned long pagecache;
+ unsigned long wmark_low = 0;
+ unsigned long pages[NR_LRU_LISTS];
+ unsigned long reclaimable;
+ struct zone *zone;
+ int lru;
+
+ for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
+ pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
+
+ for_each_zone(zone)
+ wmark_low += low_wmark_pages(zone);
+
+ /*
+ * Estimate the amount of memory available for userspace allocations,
+ * without causing swapping.
+ */
+ available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
+
+ /*
+ * Not all the page cache can be freed, otherwise the system will
+ * start swapping. Assume at least half of the page cache, or the
+ * low watermark worth of cache, needs to stay.
+ */
+ pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
+ pagecache -= min(pagecache / 2, wmark_low);
+ available += pagecache;
+
+ /*
+ * Part of the reclaimable slab and other kernel memory consists of
+ * items that are in use, and cannot be freed. Cap this estimate at the
+ * low watermark.
+ */
+ reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) +
+ global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
+ available += reclaimable - min(reclaimable / 2, wmark_low);
+
+ if (available < 0)
+ available = 0;
+ return available;
+}
+EXPORT_SYMBOL_GPL(si_mem_available);
+
+void si_meminfo(struct sysinfo *val)
+{
+ val->totalram = totalram_pages();
+ val->sharedram = global_node_page_state(NR_SHMEM);
+ val->freeram = global_zone_page_state(NR_FREE_PAGES);
+ val->bufferram = nr_blockdev_pages();
+ val->totalhigh = totalhigh_pages();
+ val->freehigh = nr_free_highpages();
+ val->mem_unit = PAGE_SIZE;
+}
+
+EXPORT_SYMBOL(si_meminfo);
+
+#ifdef CONFIG_NUMA
+void si_meminfo_node(struct sysinfo *val, int nid)
+{
+ int zone_type; /* needs to be signed */
+ unsigned long managed_pages = 0;
+ unsigned long managed_highpages = 0;
+ unsigned long free_highpages = 0;
+ pg_data_t *pgdat = NODE_DATA(nid);
+
+ for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
+ managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
+ val->totalram = managed_pages;
+ val->sharedram = node_page_state(pgdat, NR_SHMEM);
+ val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
+#ifdef CONFIG_HIGHMEM
+ for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
+ struct zone *zone = &pgdat->node_zones[zone_type];
+
+ if (is_highmem(zone)) {
+ managed_highpages += zone_managed_pages(zone);
+ free_highpages += zone_page_state(zone, NR_FREE_PAGES);
+ }
+ }
+ val->totalhigh = managed_highpages;
+ val->freehigh = free_highpages;
+#else
+ val->totalhigh = managed_highpages;
+ val->freehigh = free_highpages;
+#endif
+ val->mem_unit = PAGE_SIZE;
+}
+#endif
+
+/*
+ * Determine whether the node should be displayed or not, depending on whether
+ * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
+ */
+static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask)
+{
+ if (!(flags & SHOW_MEM_FILTER_NODES))
+ return false;
+
+ /*
+ * no node mask - aka implicit memory numa policy. Do not bother with
+ * the synchronization - read_mems_allowed_begin - because we do not
+ * have to be precise here.
+ */
+ if (!nodemask)
+ nodemask = &cpuset_current_mems_allowed;
+
+ return !node_isset(nid, *nodemask);
+}
+
+#define K(x) ((x) << (PAGE_SHIFT-10))
+
+static void show_migration_types(unsigned char type)
+{
+ static const char types[MIGRATE_TYPES] = {
+ [MIGRATE_UNMOVABLE] = 'U',
+ [MIGRATE_MOVABLE] = 'M',
+ [MIGRATE_RECLAIMABLE] = 'E',
+ [MIGRATE_HIGHATOMIC] = 'H',
+#ifdef CONFIG_CMA
+ [MIGRATE_CMA] = 'C',
+#endif
+#ifdef CONFIG_MEMORY_ISOLATION
+ [MIGRATE_ISOLATE] = 'I',
+#endif
+ };
+ char tmp[MIGRATE_TYPES + 1];
+ char *p = tmp;
+ int i;
+
+ for (i = 0; i < MIGRATE_TYPES; i++) {
+ if (type & (1 << i))
+ *p++ = types[i];
+ }
+
+ *p = '\0';
+ printk(KERN_CONT "(%s) ", tmp);
+}
+
+/*
+ * Show free area list (used inside shift_scroll-lock stuff)
+ * We also calculate the percentage fragmentation. We do this by counting the
+ * memory on each free list with the exception of the first item on the list.
+ *
+ * Bits in @filter:
+ * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
+ * cpuset.
+ */
+void show_free_areas(unsigned int filter, nodemask_t *nodemask)
+{
+ unsigned long free_pcp = 0;
+ int cpu;
+ struct zone *zone;
+ pg_data_t *pgdat;
+
+ for_each_populated_zone(zone) {
+ if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
+ continue;
+
+ for_each_online_cpu(cpu)
+ free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
+ }
+
+ printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
+ " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
+ " unevictable:%lu dirty:%lu writeback:%lu\n"
+ " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
+ " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
+ " free:%lu free_pcp:%lu free_cma:%lu\n",
+ global_node_page_state(NR_ACTIVE_ANON),
+ global_node_page_state(NR_INACTIVE_ANON),
+ global_node_page_state(NR_ISOLATED_ANON),
+ global_node_page_state(NR_ACTIVE_FILE),
+ global_node_page_state(NR_INACTIVE_FILE),
+ global_node_page_state(NR_ISOLATED_FILE),
+ global_node_page_state(NR_UNEVICTABLE),
+ global_node_page_state(NR_FILE_DIRTY),
+ global_node_page_state(NR_WRITEBACK),
+ global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B),
+ global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B),
+ global_node_page_state(NR_FILE_MAPPED),
+ global_node_page_state(NR_SHMEM),
+ global_zone_page_state(NR_PAGETABLE),
+ global_zone_page_state(NR_BOUNCE),
+ global_zone_page_state(NR_FREE_PAGES),
+ free_pcp,
+ global_zone_page_state(NR_FREE_CMA_PAGES));
+
+ for_each_online_pgdat(pgdat) {
+ if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
+ continue;
+
+ printk("Node %d"
+ " active_anon:%lukB"
+ " inactive_anon:%lukB"
+ " active_file:%lukB"
+ " inactive_file:%lukB"
+ " unevictable:%lukB"
+ " isolated(anon):%lukB"
+ " isolated(file):%lukB"
+ " mapped:%lukB"
+ " dirty:%lukB"
+ " writeback:%lukB"
+ " shmem:%lukB"
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ " shmem_thp: %lukB"
+ " shmem_pmdmapped: %lukB"
+ " anon_thp: %lukB"
+#endif
+ " writeback_tmp:%lukB"
+ " kernel_stack:%lukB"
+#ifdef CONFIG_SHADOW_CALL_STACK
+ " shadow_call_stack:%lukB"
+#endif
+ " all_unreclaimable? %s"
+ "\n",
+ pgdat->node_id,
+ K(node_page_state(pgdat, NR_ACTIVE_ANON)),
+ K(node_page_state(pgdat, NR_INACTIVE_ANON)),
+ K(node_page_state(pgdat, NR_ACTIVE_FILE)),
+ K(node_page_state(pgdat, NR_INACTIVE_FILE)),
+ K(node_page_state(pgdat, NR_UNEVICTABLE)),
+ K(node_page_state(pgdat, NR_ISOLATED_ANON)),
+ K(node_page_state(pgdat, NR_ISOLATED_FILE)),
+ K(node_page_state(pgdat, NR_FILE_MAPPED)),
+ K(node_page_state(pgdat, NR_FILE_DIRTY)),
+ K(node_page_state(pgdat, NR_WRITEBACK)),
+ K(node_page_state(pgdat, NR_SHMEM)),
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR),
+ K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)
+ * HPAGE_PMD_NR),
+ K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
+#endif
+ K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
+ node_page_state(pgdat, NR_KERNEL_STACK_KB),
+#ifdef CONFIG_SHADOW_CALL_STACK
+ node_page_state(pgdat, NR_KERNEL_SCS_KB),
+#endif
+ pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
+ "yes" : "no");
+ }
+
+ for_each_populated_zone(zone) {
+ int i;
+
+ if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
+ continue;
+
+ free_pcp = 0;
+ for_each_online_cpu(cpu)
+ free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
+
+ show_node(zone);
+ printk(KERN_CONT
+ "%s"
+ " free:%lukB"
+ " min:%lukB"
+ " low:%lukB"
+ " high:%lukB"
+ " reserved_highatomic:%luKB"
+ " active_anon:%lukB"
+ " inactive_anon:%lukB"
+ " active_file:%lukB"
+ " inactive_file:%lukB"
+ " unevictable:%lukB"
+ " writepending:%lukB"
+ " present:%lukB"
+ " managed:%lukB"
+ " mlocked:%lukB"
+ " pagetables:%lukB"
+ " bounce:%lukB"
+ " free_pcp:%lukB"
+ " local_pcp:%ukB"
+ " free_cma:%lukB"
+ "\n",
+ zone->name,
+ K(zone_page_state(zone, NR_FREE_PAGES)),
+ K(min_wmark_pages(zone)),
+ K(low_wmark_pages(zone)),
+ K(high_wmark_pages(zone)),
+ K(zone->nr_reserved_highatomic),
+ K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
+ K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
+ K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
+ K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
+ K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
+ K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
+ K(zone->present_pages),
+ K(zone_managed_pages(zone)),
+ K(zone_page_state(zone, NR_MLOCK)),
+ K(zone_page_state(zone, NR_PAGETABLE)),
+ K(zone_page_state(zone, NR_BOUNCE)),
+ K(free_pcp),
+ K(this_cpu_read(zone->pageset->pcp.count)),
+ K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
+ printk("lowmem_reserve[]:");
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ printk(KERN_CONT " %ld", zone->lowmem_reserve[i]);
+ printk(KERN_CONT "\n");
+ }
+
+ for_each_populated_zone(zone) {
+ unsigned int order;
+ unsigned long nr[MAX_ORDER], flags, total = 0;
+ unsigned char types[MAX_ORDER];
+
+ if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
+ continue;
+ show_node(zone);
+ printk(KERN_CONT "%s: ", zone->name);
+
+ spin_lock_irqsave(&zone->lock, flags);
+ for (order = 0; order < MAX_ORDER; order++) {
+ struct free_area *area = &zone->free_area[order];
+ int type;
+
+ nr[order] = area->nr_free;
+ total += nr[order] << order;
+
+ types[order] = 0;
+ for (type = 0; type < MIGRATE_TYPES; type++) {
+ if (!free_area_empty(area, type))
+ types[order] |= 1 << type;
+ }
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+ for (order = 0; order < MAX_ORDER; order++) {
+ printk(KERN_CONT "%lu*%lukB ",
+ nr[order], K(1UL) << order);
+ if (nr[order])
+ show_migration_types(types[order]);
+ }
+ printk(KERN_CONT "= %lukB\n", K(total));
+ }
+
+ hugetlb_show_meminfo();
+
+ printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
+
+ show_swap_cache_info();
+}
+
+static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
+{
+ zoneref->zone = zone;
+ zoneref->zone_idx = zone_idx(zone);
+}
+
+/*
+ * Builds allocation fallback zone lists.
+ *
+ * Add all populated zones of a node to the zonelist.
+ */
+static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
+{
+ struct zone *zone;
+ enum zone_type zone_type = MAX_NR_ZONES;
+ int nr_zones = 0;
+
+ do {
+ zone_type--;
+ zone = pgdat->node_zones + zone_type;
+ if (populated_zone(zone)) {
+ zoneref_set_zone(zone, &zonerefs[nr_zones++]);
+ check_highest_zone(zone_type);
+ }
+ } while (zone_type);
+
+ return nr_zones;
+}
+
+#ifdef CONFIG_NUMA
+
+static int __parse_numa_zonelist_order(char *s)
+{
+ /*
+ * We used to support different zonlists modes but they turned
+ * out to be just not useful. Let's keep the warning in place
+ * if somebody still use the cmd line parameter so that we do
+ * not fail it silently
+ */
+ if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) {
+ pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+char numa_zonelist_order[] = "Node";
+
+/*
+ * sysctl handler for numa_zonelist_order
+ */
+int numa_zonelist_order_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *length, loff_t *ppos)
+{
+ if (write)
+ return __parse_numa_zonelist_order(buffer);
+ return proc_dostring(table, write, buffer, length, ppos);
+}
+
+
+#define MAX_NODE_LOAD (nr_online_nodes)
+static int node_load[MAX_NUMNODES];
+
+/**
+ * find_next_best_node - find the next node that should appear in a given node's fallback list
+ * @node: node whose fallback list we're appending
+ * @used_node_mask: nodemask_t of already used nodes
+ *
+ * We use a number of factors to determine which is the next node that should
+ * appear on a given node's fallback list. The node should not have appeared
+ * already in @node's fallback list, and it should be the next closest node
+ * according to the distance array (which contains arbitrary distance values
+ * from each node to each node in the system), and should also prefer nodes
+ * with no CPUs, since presumably they'll have very little allocation pressure
+ * on them otherwise.
+ *
+ * Return: node id of the found node or %NUMA_NO_NODE if no node is found.
+ */
+static int find_next_best_node(int node, nodemask_t *used_node_mask)
+{
+ int n, val;
+ int min_val = INT_MAX;
+ int best_node = NUMA_NO_NODE;
+
+ /* Use the local node if we haven't already */
+ if (!node_isset(node, *used_node_mask)) {
+ node_set(node, *used_node_mask);
+ return node;
+ }
+
+ for_each_node_state(n, N_MEMORY) {
+
+ /* Don't want a node to appear more than once */
+ if (node_isset(n, *used_node_mask))
+ continue;
+
+ /* Use the distance array to find the distance */
+ val = node_distance(node, n);
+
+ /* Penalize nodes under us ("prefer the next node") */
+ val += (n < node);
+
+ /* Give preference to headless and unused nodes */
+ if (!cpumask_empty(cpumask_of_node(n)))
+ val += PENALTY_FOR_NODE_WITH_CPUS;
+
+ /* Slight preference for less loaded node */
+ val *= (MAX_NODE_LOAD*MAX_NUMNODES);
+ val += node_load[n];
+
+ if (val < min_val) {
+ min_val = val;
+ best_node = n;
+ }
+ }
+
+ if (best_node >= 0)
+ node_set(best_node, *used_node_mask);
+
+ return best_node;
+}
+
+
+/*
+ * Build zonelists ordered by node and zones within node.
+ * This results in maximum locality--normal zone overflows into local
+ * DMA zone, if any--but risks exhausting DMA zone.
+ */
+static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
+ unsigned nr_nodes)
+{
+ struct zoneref *zonerefs;
+ int i;
+
+ zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
+
+ for (i = 0; i < nr_nodes; i++) {
+ int nr_zones;
+
+ pg_data_t *node = NODE_DATA(node_order[i]);
+
+ nr_zones = build_zonerefs_node(node, zonerefs);
+ zonerefs += nr_zones;
+ }
+ zonerefs->zone = NULL;
+ zonerefs->zone_idx = 0;
+}
+
+/*
+ * Build gfp_thisnode zonelists
+ */
+static void build_thisnode_zonelists(pg_data_t *pgdat)
+{
+ struct zoneref *zonerefs;
+ int nr_zones;
+
+ zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
+ nr_zones = build_zonerefs_node(pgdat, zonerefs);
+ zonerefs += nr_zones;
+ zonerefs->zone = NULL;
+ zonerefs->zone_idx = 0;
+}
+
+/*
+ * Build zonelists ordered by zone and nodes within zones.
+ * This results in conserving DMA zone[s] until all Normal memory is
+ * exhausted, but results in overflowing to remote node while memory
+ * may still exist in local DMA zone.
+ */
+
+static void build_zonelists(pg_data_t *pgdat)
+{
+ static int node_order[MAX_NUMNODES];
+ int node, load, nr_nodes = 0;
+ nodemask_t used_mask = NODE_MASK_NONE;
+ int local_node, prev_node;
+
+ /* NUMA-aware ordering of nodes */
+ local_node = pgdat->node_id;
+ load = nr_online_nodes;
+ prev_node = local_node;
+
+ memset(node_order, 0, sizeof(node_order));
+ while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
+ /*
+ * We don't want to pressure a particular node.
+ * So adding penalty to the first node in same
+ * distance group to make it round-robin.
+ */
+ if (node_distance(local_node, node) !=
+ node_distance(local_node, prev_node))
+ node_load[node] = load;
+
+ node_order[nr_nodes++] = node;
+ prev_node = node;
+ load--;
+ }
+
+ build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
+ build_thisnode_zonelists(pgdat);
+}
+
+#ifdef CONFIG_HAVE_MEMORYLESS_NODES
+/*
+ * Return node id of node used for "local" allocations.
+ * I.e., first node id of first zone in arg node's generic zonelist.
+ * Used for initializing percpu 'numa_mem', which is used primarily
+ * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
+ */
+int local_memory_node(int node)
+{
+ struct zoneref *z;
+
+ z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
+ gfp_zone(GFP_KERNEL),
+ NULL);
+ return zone_to_nid(z->zone);
+}
+#endif
+
+static void setup_min_unmapped_ratio(void);
+static void setup_min_slab_ratio(void);
+#else /* CONFIG_NUMA */
+
+static void build_zonelists(pg_data_t *pgdat)
+{
+ int node, local_node;
+ struct zoneref *zonerefs;
+ int nr_zones;
+
+ local_node = pgdat->node_id;
+
+ zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
+ nr_zones = build_zonerefs_node(pgdat, zonerefs);
+ zonerefs += nr_zones;
+
+ /*
+ * Now we build the zonelist so that it contains the zones
+ * of all the other nodes.
+ * We don't want to pressure a particular node, so when
+ * building the zones for node N, we make sure that the
+ * zones coming right after the local ones are those from
+ * node N+1 (modulo N)
+ */
+ for (node = local_node + 1; node < MAX_NUMNODES; node++) {
+ if (!node_online(node))
+ continue;
+ nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
+ zonerefs += nr_zones;
+ }
+ for (node = 0; node < local_node; node++) {
+ if (!node_online(node))
+ continue;
+ nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
+ zonerefs += nr_zones;
+ }
+
+ zonerefs->zone = NULL;
+ zonerefs->zone_idx = 0;
+}
+
+#endif /* CONFIG_NUMA */
+
+/*
+ * Boot pageset table. One per cpu which is going to be used for all
+ * zones and all nodes. The parameters will be set in such a way
+ * that an item put on a list will immediately be handed over to
+ * the buddy list. This is safe since pageset manipulation is done
+ * with interrupts disabled.
+ *
+ * The boot_pagesets must be kept even after bootup is complete for
+ * unused processors and/or zones. They do play a role for bootstrapping
+ * hotplugged processors.
+ *
+ * zoneinfo_show() and maybe other functions do
+ * not check if the processor is online before following the pageset pointer.
+ * Other parts of the kernel may not check if the zone is available.
+ */
+static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
+static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
+static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
+
+static void __build_all_zonelists(void *data)
+{
+ int nid;
+ int __maybe_unused cpu;
+ pg_data_t *self = data;
+ unsigned long flags;
+
+ /*
+ * Explicitly disable this CPU's interrupts before taking seqlock
+ * to prevent any IRQ handler from calling into the page allocator
+ * (e.g. GFP_ATOMIC) that could hit zonelist_iter_begin and livelock.
+ */
+ local_irq_save(flags);
+ /*
+ * Explicitly disable this CPU's synchronous printk() before taking
+ * seqlock to prevent any printk() from trying to hold port->lock, for
+ * tty_insert_flip_string_and_push_buffer() on other CPU might be
+ * calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held.
+ */
+ printk_deferred_enter();
+ write_seqlock(&zonelist_update_seq);
+
+#ifdef CONFIG_NUMA
+ memset(node_load, 0, sizeof(node_load));
+#endif
+
+ /*
+ * This node is hotadded and no memory is yet present. So just
+ * building zonelists is fine - no need to touch other nodes.
+ */
+ if (self && !node_online(self->node_id)) {
+ build_zonelists(self);
+ } else {
+ for_each_online_node(nid) {
+ pg_data_t *pgdat = NODE_DATA(nid);
+
+ build_zonelists(pgdat);
+ }
+
+#ifdef CONFIG_HAVE_MEMORYLESS_NODES
+ /*
+ * We now know the "local memory node" for each node--
+ * i.e., the node of the first zone in the generic zonelist.
+ * Set up numa_mem percpu variable for on-line cpus. During
+ * boot, only the boot cpu should be on-line; we'll init the
+ * secondary cpus' numa_mem as they come on-line. During
+ * node/memory hotplug, we'll fixup all on-line cpus.
+ */
+ for_each_online_cpu(cpu)
+ set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
+#endif
+ }
+
+ write_sequnlock(&zonelist_update_seq);
+ printk_deferred_exit();
+ local_irq_restore(flags);
+}
+
+static noinline void __init
+build_all_zonelists_init(void)
+{
+ int cpu;
+
+ __build_all_zonelists(NULL);
+
+ /*
+ * Initialize the boot_pagesets that are going to be used
+ * for bootstrapping processors. The real pagesets for
+ * each zone will be allocated later when the per cpu
+ * allocator is available.
+ *
+ * boot_pagesets are used also for bootstrapping offline
+ * cpus if the system is already booted because the pagesets
+ * are needed to initialize allocators on a specific cpu too.
+ * F.e. the percpu allocator needs the page allocator which
+ * needs the percpu allocator in order to allocate its pagesets
+ * (a chicken-egg dilemma).
+ */
+ for_each_possible_cpu(cpu)
+ setup_pageset(&per_cpu(boot_pageset, cpu), 0);
+
+ mminit_verify_zonelist();
+ cpuset_init_current_mems_allowed();
+}
+
+/*
+ * unless system_state == SYSTEM_BOOTING.
+ *
+ * __ref due to call of __init annotated helper build_all_zonelists_init
+ * [protected by SYSTEM_BOOTING].
+ */
+void __ref build_all_zonelists(pg_data_t *pgdat)
+{
+ unsigned long vm_total_pages;
+
+ if (system_state == SYSTEM_BOOTING) {
+ build_all_zonelists_init();
+ } else {
+ __build_all_zonelists(pgdat);
+ /* cpuset refresh routine should be here */
+ }
+ /* Get the number of free pages beyond high watermark in all zones. */
+ vm_total_pages = nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
+ /*
+ * Disable grouping by mobility if the number of pages in the
+ * system is too low to allow the mechanism to work. It would be
+ * more accurate, but expensive to check per-zone. This check is
+ * made on memory-hotadd so a system can start with mobility
+ * disabled and enable it later
+ */
+ if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
+ page_group_by_mobility_disabled = 1;
+ else
+ page_group_by_mobility_disabled = 0;
+
+ pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n",
+ nr_online_nodes,
+ page_group_by_mobility_disabled ? "off" : "on",
+ vm_total_pages);
+#ifdef CONFIG_NUMA
+ pr_info("Policy zone: %s\n", zone_names[policy_zone]);
+#endif
+}
+
+/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
+static bool __meminit
+overlap_memmap_init(unsigned long zone, unsigned long *pfn)
+{
+ static struct memblock_region *r;
+
+ if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
+ if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
+ for_each_mem_region(r) {
+ if (*pfn < memblock_region_memory_end_pfn(r))
+ break;
+ }
+ }
+ if (*pfn >= memblock_region_memory_base_pfn(r) &&
+ memblock_is_mirror(r)) {
+ *pfn = memblock_region_memory_end_pfn(r);
+ return true;
+ }
+ }
+ return false;
+}
+
+/*
+ * Initially all pages are reserved - free ones are freed
+ * up by memblock_free_all() once the early boot process is
+ * done. Non-atomic initialization, single-pass.
+ *
+ * All aligned pageblocks are initialized to the specified migratetype
+ * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
+ * zone stats (e.g., nr_isolate_pageblock) are touched.
+ */
+void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
+ unsigned long start_pfn, unsigned long zone_end_pfn,
+ enum meminit_context context,
+ struct vmem_altmap *altmap, int migratetype)
+{
+ unsigned long pfn, end_pfn = start_pfn + size;
+ struct page *page;
+
+ if (highest_memmap_pfn < end_pfn - 1)
+ highest_memmap_pfn = end_pfn - 1;
+
+#ifdef CONFIG_ZONE_DEVICE
+ /*
+ * Honor reservation requested by the driver for this ZONE_DEVICE
+ * memory. We limit the total number of pages to initialize to just
+ * those that might contain the memory mapping. We will defer the
+ * ZONE_DEVICE page initialization until after we have released
+ * the hotplug lock.
+ */
+ if (zone == ZONE_DEVICE) {
+ if (!altmap)
+ return;
+
+ if (start_pfn == altmap->base_pfn)
+ start_pfn += altmap->reserve;
+ end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
+ }
+#endif
+
+ for (pfn = start_pfn; pfn < end_pfn; ) {
+ /*
+ * There can be holes in boot-time mem_map[]s handed to this
+ * function. They do not exist on hotplugged memory.
+ */
+ if (context == MEMINIT_EARLY) {
+ if (overlap_memmap_init(zone, &pfn))
+ continue;
+ if (defer_init(nid, pfn, zone_end_pfn))
+ break;
+ }
+
+ page = pfn_to_page(pfn);
+ __init_single_page(page, pfn, zone, nid);
+ if (context == MEMINIT_HOTPLUG)
+ __SetPageReserved(page);
+
+ /*
+ * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
+ * such that unmovable allocations won't be scattered all
+ * over the place during system boot.
+ */
+ if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
+ set_pageblock_migratetype(page, migratetype);
+ cond_resched();
+ }
+ pfn++;
+ }
+}
+
+#ifdef CONFIG_ZONE_DEVICE
+void __ref memmap_init_zone_device(struct zone *zone,
+ unsigned long start_pfn,
+ unsigned long nr_pages,
+ struct dev_pagemap *pgmap)
+{
+ unsigned long pfn, end_pfn = start_pfn + nr_pages;
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ struct vmem_altmap *altmap = pgmap_altmap(pgmap);
+ unsigned long zone_idx = zone_idx(zone);
+ unsigned long start = jiffies;
+ int nid = pgdat->node_id;
+
+ if (WARN_ON_ONCE(!pgmap || zone_idx(zone) != ZONE_DEVICE))
+ return;
+
+ /*
+ * The call to memmap_init should have already taken care
+ * of the pages reserved for the memmap, so we can just jump to
+ * the end of that region and start processing the device pages.
+ */
+ if (altmap) {
+ start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
+ nr_pages = end_pfn - start_pfn;
+ }
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ struct page *page = pfn_to_page(pfn);
+
+ __init_single_page(page, pfn, zone_idx, nid);
+
+ /*
+ * Mark page reserved as it will need to wait for onlining
+ * phase for it to be fully associated with a zone.
+ *
+ * We can use the non-atomic __set_bit operation for setting
+ * the flag as we are still initializing the pages.
+ */
+ __SetPageReserved(page);
+
+ /*
+ * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
+ * and zone_device_data. It is a bug if a ZONE_DEVICE page is
+ * ever freed or placed on a driver-private list.
+ */
+ page->pgmap = pgmap;
+ page->zone_device_data = NULL;
+
+ /*
+ * Mark the block movable so that blocks are reserved for
+ * movable at startup. This will force kernel allocations
+ * to reserve their blocks rather than leaking throughout
+ * the address space during boot when many long-lived
+ * kernel allocations are made.
+ *
+ * Please note that MEMINIT_HOTPLUG path doesn't clear memmap
+ * because this is done early in section_activate()
+ */
+ if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
+ set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ cond_resched();
+ }
+ }
+
+ pr_info("%s initialised %lu pages in %ums\n", __func__,
+ nr_pages, jiffies_to_msecs(jiffies - start));
+}
+
+#endif
+static void __meminit zone_init_free_lists(struct zone *zone)
+{
+ unsigned int order, t;
+ for_each_migratetype_order(order, t) {
+ INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
+ zone->free_area[order].nr_free = 0;
+ }
+}
+
+#if !defined(CONFIG_FLAT_NODE_MEM_MAP)
+/*
+ * Only struct pages that correspond to ranges defined by memblock.memory
+ * are zeroed and initialized by going through __init_single_page() during
+ * memmap_init_zone_range().
+ *
+ * But, there could be struct pages that correspond to holes in
+ * memblock.memory. This can happen because of the following reasons:
+ * - physical memory bank size is not necessarily the exact multiple of the
+ * arbitrary section size
+ * - early reserved memory may not be listed in memblock.memory
+ * - memory layouts defined with memmap= kernel parameter may not align
+ * nicely with memmap sections
+ *
+ * Explicitly initialize those struct pages so that:
+ * - PG_Reserved is set
+ * - zone and node links point to zone and node that span the page if the
+ * hole is in the middle of a zone
+ * - zone and node links point to adjacent zone/node if the hole falls on
+ * the zone boundary; the pages in such holes will be prepended to the
+ * zone/node above the hole except for the trailing pages in the last
+ * section that will be appended to the zone/node below.
+ */
+static void __init init_unavailable_range(unsigned long spfn,
+ unsigned long epfn,
+ int zone, int node)
+{
+ unsigned long pfn;
+ u64 pgcnt = 0;
+
+ for (pfn = spfn; pfn < epfn; pfn++) {
+ if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
+ pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
+ + pageblock_nr_pages - 1;
+ continue;
+ }
+ __init_single_page(pfn_to_page(pfn), pfn, zone, node);
+ __SetPageReserved(pfn_to_page(pfn));
+ pgcnt++;
+ }
+
+ if (pgcnt)
+ pr_info("On node %d, zone %s: %lld pages in unavailable ranges",
+ node, zone_names[zone], pgcnt);
+}
+#else
+static inline void init_unavailable_range(unsigned long spfn,
+ unsigned long epfn,
+ int zone, int node)
+{
+}
+#endif
+
+static void __init memmap_init_zone_range(struct zone *zone,
+ unsigned long start_pfn,
+ unsigned long end_pfn,
+ unsigned long *hole_pfn)
+{
+ unsigned long zone_start_pfn = zone->zone_start_pfn;
+ unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
+ int nid = zone_to_nid(zone), zone_id = zone_idx(zone);
+
+ start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
+ end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);
+
+ if (start_pfn >= end_pfn)
+ return;
+
+ memmap_init_zone(end_pfn - start_pfn, nid, zone_id, start_pfn,
+ zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
+
+ if (*hole_pfn < start_pfn)
+ init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid);
+
+ *hole_pfn = end_pfn;
+}
+
+void __init __weak memmap_init(void)
+{
+ unsigned long start_pfn, end_pfn;
+ unsigned long hole_pfn = 0;
+ int i, j, zone_id, nid;
+
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+ struct pglist_data *node = NODE_DATA(nid);
+
+ for (j = 0; j < MAX_NR_ZONES; j++) {
+ struct zone *zone = node->node_zones + j;
+
+ if (!populated_zone(zone))
+ continue;
+
+ memmap_init_zone_range(zone, start_pfn, end_pfn,
+ &hole_pfn);
+ zone_id = j;
+ }
+ }
+
+#ifdef CONFIG_SPARSEMEM
+ /*
+ * Initialize the memory map for hole in the range [memory_end,
+ * section_end].
+ * Append the pages in this hole to the highest zone in the last
+ * node.
+ * The call to init_unavailable_range() is outside the ifdef to
+ * silence the compiler warining about zone_id set but not used;
+ * for FLATMEM it is a nop anyway
+ */
+ end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
+ if (hole_pfn < end_pfn)
+#endif
+ init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
+}
+
+/* A stub for backwards compatibility with custom implementatin on IA-64 */
+void __meminit __weak arch_memmap_init(unsigned long size, int nid,
+ unsigned long zone,
+ unsigned long range_start_pfn)
+{
+}
+
+static int zone_batchsize(struct zone *zone)
+{
+#ifdef CONFIG_MMU
+ int batch;
+
+ /*
+ * The per-cpu-pages pools are set to around 1000th of the
+ * size of the zone.
+ */
+ batch = zone_managed_pages(zone) / 1024;
+ /* But no more than a meg. */
+ if (batch * PAGE_SIZE > 1024 * 1024)
+ batch = (1024 * 1024) / PAGE_SIZE;
+ batch /= 4; /* We effectively *= 4 below */
+ if (batch < 1)
+ batch = 1;
+
+ /*
+ * Clamp the batch to a 2^n - 1 value. Having a power
+ * of 2 value was found to be more likely to have
+ * suboptimal cache aliasing properties in some cases.
+ *
+ * For example if 2 tasks are alternately allocating
+ * batches of pages, one task can end up with a lot
+ * of pages of one half of the possible page colors
+ * and the other with pages of the other colors.
+ */
+ batch = rounddown_pow_of_two(batch + batch/2) - 1;
+
+ return batch;
+
+#else
+ /* The deferral and batching of frees should be suppressed under NOMMU
+ * conditions.
+ *
+ * The problem is that NOMMU needs to be able to allocate large chunks
+ * of contiguous memory as there's no hardware page translation to
+ * assemble apparent contiguous memory from discontiguous pages.
+ *
+ * Queueing large contiguous runs of pages for batching, however,
+ * causes the pages to actually be freed in smaller chunks. As there
+ * can be a significant delay between the individual batches being
+ * recycled, this leads to the once large chunks of space being
+ * fragmented and becoming unavailable for high-order allocations.
+ */
+ return 0;
+#endif
+}
+
+/*
+ * pcp->high and pcp->batch values are related and dependent on one another:
+ * ->batch must never be higher then ->high.
+ * The following function updates them in a safe manner without read side
+ * locking.
+ *
+ * Any new users of pcp->batch and pcp->high should ensure they can cope with
+ * those fields changing asynchronously (acording to the above rule).
+ *
+ * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
+ * outside of boot time (or some other assurance that no concurrent updaters
+ * exist).
+ */
+static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
+ unsigned long batch)
+{
+ /* start with a fail safe value for batch */
+ pcp->batch = 1;
+ smp_wmb();
+
+ /* Update high, then batch, in order */
+ pcp->high = high;
+ smp_wmb();
+
+ pcp->batch = batch;
+}
+
+/* a companion to pageset_set_high() */
+static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
+{
+ pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
+}
+
+static void pageset_init(struct per_cpu_pageset *p)
+{
+ struct per_cpu_pages *pcp;
+ int migratetype;
+
+ memset(p, 0, sizeof(*p));
+
+ pcp = &p->pcp;
+ for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
+ INIT_LIST_HEAD(&pcp->lists[migratetype]);
+}
+
+static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
+{
+ pageset_init(p);
+ pageset_set_batch(p, batch);
+}
+
+/*
+ * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
+ * to the value high for the pageset p.
+ */
+static void pageset_set_high(struct per_cpu_pageset *p,
+ unsigned long high)
+{
+ unsigned long batch = max(1UL, high / 4);
+ if ((high / 4) > (PAGE_SHIFT * 8))
+ batch = PAGE_SHIFT * 8;
+
+ pageset_update(&p->pcp, high, batch);
+}
+
+static void pageset_set_high_and_batch(struct zone *zone,
+ struct per_cpu_pageset *pcp)
+{
+ if (percpu_pagelist_fraction)
+ pageset_set_high(pcp,
+ (zone_managed_pages(zone) /
+ percpu_pagelist_fraction));
+ else
+ pageset_set_batch(pcp, zone_batchsize(zone));
+}
+
+static void __meminit zone_pageset_init(struct zone *zone, int cpu)
+{
+ struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
+
+ pageset_init(pcp);
+ pageset_set_high_and_batch(zone, pcp);
+}
+
+void __meminit setup_zone_pageset(struct zone *zone)
+{
+ int cpu;
+ zone->pageset = alloc_percpu(struct per_cpu_pageset);
+ for_each_possible_cpu(cpu)
+ zone_pageset_init(zone, cpu);
+}
+
+/*
+ * Allocate per cpu pagesets and initialize them.
+ * Before this call only boot pagesets were available.
+ */
+void __init setup_per_cpu_pageset(void)
+{
+ struct pglist_data *pgdat;
+ struct zone *zone;
+ int __maybe_unused cpu;
+
+ for_each_populated_zone(zone)
+ setup_zone_pageset(zone);
+
+#ifdef CONFIG_NUMA
+ /*
+ * Unpopulated zones continue using the boot pagesets.
+ * The numa stats for these pagesets need to be reset.
+ * Otherwise, they will end up skewing the stats of
+ * the nodes these zones are associated with.
+ */
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_pageset *pcp = &per_cpu(boot_pageset, cpu);
+ memset(pcp->vm_numa_stat_diff, 0,
+ sizeof(pcp->vm_numa_stat_diff));
+ }
+#endif
+
+ for_each_online_pgdat(pgdat)
+ pgdat->per_cpu_nodestats =
+ alloc_percpu(struct per_cpu_nodestat);
+}
+
+static __meminit void zone_pcp_init(struct zone *zone)
+{
+ /*
+ * per cpu subsystem is not up at this point. The following code
+ * relies on the ability of the linker to provide the
+ * offset of a (static) per cpu variable into the per cpu area.
+ */
+ zone->pageset = &boot_pageset;
+
+ if (populated_zone(zone))
+ printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
+ zone->name, zone->present_pages,
+ zone_batchsize(zone));
+}
+
+void __meminit init_currently_empty_zone(struct zone *zone,
+ unsigned long zone_start_pfn,
+ unsigned long size)
+{
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ int zone_idx = zone_idx(zone) + 1;
+
+ if (zone_idx > pgdat->nr_zones)
+ pgdat->nr_zones = zone_idx;
+
+ zone->zone_start_pfn = zone_start_pfn;
+
+ mminit_dprintk(MMINIT_TRACE, "memmap_init",
+ "Initialising map node %d zone %lu pfns %lu -> %lu\n",
+ pgdat->node_id,
+ (unsigned long)zone_idx(zone),
+ zone_start_pfn, (zone_start_pfn + size));
+
+ zone_init_free_lists(zone);
+ zone->initialized = 1;
+}
+
+/**
+ * get_pfn_range_for_nid - Return the start and end page frames for a node
+ * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
+ * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
+ * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
+ *
+ * It returns the start and end page frame of a node based on information
+ * provided by memblock_set_node(). If called for a node
+ * with no available memory, a warning is printed and the start and end
+ * PFNs will be 0.
+ */
+void __init get_pfn_range_for_nid(unsigned int nid,
+ unsigned long *start_pfn, unsigned long *end_pfn)
+{
+ unsigned long this_start_pfn, this_end_pfn;
+ int i;
+
+ *start_pfn = -1UL;
+ *end_pfn = 0;
+
+ for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
+ *start_pfn = min(*start_pfn, this_start_pfn);
+ *end_pfn = max(*end_pfn, this_end_pfn);
+ }
+
+ if (*start_pfn == -1UL)
+ *start_pfn = 0;
+}
+
+/*
+ * This finds a zone that can be used for ZONE_MOVABLE pages. The
+ * assumption is made that zones within a node are ordered in monotonic
+ * increasing memory addresses so that the "highest" populated zone is used
+ */
+static void __init find_usable_zone_for_movable(void)
+{
+ int zone_index;
+ for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
+ if (zone_index == ZONE_MOVABLE)
+ continue;
+
+ if (arch_zone_highest_possible_pfn[zone_index] >
+ arch_zone_lowest_possible_pfn[zone_index])
+ break;
+ }
+
+ VM_BUG_ON(zone_index == -1);
+ movable_zone = zone_index;
+}
+
+/*
+ * The zone ranges provided by the architecture do not include ZONE_MOVABLE
+ * because it is sized independent of architecture. Unlike the other zones,
+ * the starting point for ZONE_MOVABLE is not fixed. It may be different
+ * in each node depending on the size of each node and how evenly kernelcore
+ * is distributed. This helper function adjusts the zone ranges
+ * provided by the architecture for a given node by using the end of the
+ * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
+ * zones within a node are in order of monotonic increases memory addresses
+ */
+static void __init adjust_zone_range_for_zone_movable(int nid,
+ unsigned long zone_type,
+ unsigned long node_start_pfn,
+ unsigned long node_end_pfn,
+ unsigned long *zone_start_pfn,
+ unsigned long *zone_end_pfn)
+{
+ /* Only adjust if ZONE_MOVABLE is on this node */
+ if (zone_movable_pfn[nid]) {
+ /* Size ZONE_MOVABLE */
+ if (zone_type == ZONE_MOVABLE) {
+ *zone_start_pfn = zone_movable_pfn[nid];
+ *zone_end_pfn = min(node_end_pfn,
+ arch_zone_highest_possible_pfn[movable_zone]);
+
+ /* Adjust for ZONE_MOVABLE starting within this range */
+ } else if (!mirrored_kernelcore &&
+ *zone_start_pfn < zone_movable_pfn[nid] &&
+ *zone_end_pfn > zone_movable_pfn[nid]) {
+ *zone_end_pfn = zone_movable_pfn[nid];
+
+ /* Check if this whole range is within ZONE_MOVABLE */
+ } else if (*zone_start_pfn >= zone_movable_pfn[nid])
+ *zone_start_pfn = *zone_end_pfn;
+ }
+}
+
+/*
+ * Return the number of pages a zone spans in a node, including holes
+ * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
+ */
+static unsigned long __init zone_spanned_pages_in_node(int nid,
+ unsigned long zone_type,
+ unsigned long node_start_pfn,
+ unsigned long node_end_pfn,
+ unsigned long *zone_start_pfn,
+ unsigned long *zone_end_pfn)
+{
+ unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
+ unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
+ /* When hotadd a new node from cpu_up(), the node should be empty */
+ if (!node_start_pfn && !node_end_pfn)
+ return 0;
+
+ /* Get the start and end of the zone */
+ *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
+ *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
+ adjust_zone_range_for_zone_movable(nid, zone_type,
+ node_start_pfn, node_end_pfn,
+ zone_start_pfn, zone_end_pfn);
+
+ /* Check that this node has pages within the zone's required range */
+ if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
+ return 0;
+
+ /* Move the zone boundaries inside the node if necessary */
+ *zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
+ *zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
+
+ /* Return the spanned pages */
+ return *zone_end_pfn - *zone_start_pfn;
+}
+
+/*
+ * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
+ * then all holes in the requested range will be accounted for.
+ */
+unsigned long __init __absent_pages_in_range(int nid,
+ unsigned long range_start_pfn,
+ unsigned long range_end_pfn)
+{
+ unsigned long nr_absent = range_end_pfn - range_start_pfn;
+ unsigned long start_pfn, end_pfn;
+ int i;
+
+ for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
+ start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
+ end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
+ nr_absent -= end_pfn - start_pfn;
+ }
+ return nr_absent;
+}
+
+/**
+ * absent_pages_in_range - Return number of page frames in holes within a range
+ * @start_pfn: The start PFN to start searching for holes
+ * @end_pfn: The end PFN to stop searching for holes
+ *
+ * Return: the number of pages frames in memory holes within a range.
+ */
+unsigned long __init absent_pages_in_range(unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
+}
+
+/* Return the number of page frames in holes in a zone on a node */
+static unsigned long __init zone_absent_pages_in_node(int nid,
+ unsigned long zone_type,
+ unsigned long node_start_pfn,
+ unsigned long node_end_pfn)
+{
+ unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
+ unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
+ unsigned long zone_start_pfn, zone_end_pfn;
+ unsigned long nr_absent;
+
+ /* When hotadd a new node from cpu_up(), the node should be empty */
+ if (!node_start_pfn && !node_end_pfn)
+ return 0;
+
+ zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
+ zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
+
+ adjust_zone_range_for_zone_movable(nid, zone_type,
+ node_start_pfn, node_end_pfn,
+ &zone_start_pfn, &zone_end_pfn);
+ nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
+
+ /*
+ * ZONE_MOVABLE handling.
+ * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
+ * and vice versa.
+ */
+ if (mirrored_kernelcore && zone_movable_pfn[nid]) {
+ unsigned long start_pfn, end_pfn;
+ struct memblock_region *r;
+
+ for_each_mem_region(r) {
+ start_pfn = clamp(memblock_region_memory_base_pfn(r),
+ zone_start_pfn, zone_end_pfn);
+ end_pfn = clamp(memblock_region_memory_end_pfn(r),
+ zone_start_pfn, zone_end_pfn);
+
+ if (zone_type == ZONE_MOVABLE &&
+ memblock_is_mirror(r))
+ nr_absent += end_pfn - start_pfn;
+
+ if (zone_type == ZONE_NORMAL &&
+ !memblock_is_mirror(r))
+ nr_absent += end_pfn - start_pfn;
+ }
+ }
+
+ return nr_absent;
+}
+
+static void __init calculate_node_totalpages(struct pglist_data *pgdat,
+ unsigned long node_start_pfn,
+ unsigned long node_end_pfn)
+{
+ unsigned long realtotalpages = 0, totalpages = 0;
+ enum zone_type i;
+
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ struct zone *zone = pgdat->node_zones + i;
+ unsigned long zone_start_pfn, zone_end_pfn;
+ unsigned long spanned, absent;
+ unsigned long size, real_size;
+
+ spanned = zone_spanned_pages_in_node(pgdat->node_id, i,
+ node_start_pfn,
+ node_end_pfn,
+ &zone_start_pfn,
+ &zone_end_pfn);
+ absent = zone_absent_pages_in_node(pgdat->node_id, i,
+ node_start_pfn,
+ node_end_pfn);
+
+ size = spanned;
+ real_size = size - absent;
+
+ if (size)
+ zone->zone_start_pfn = zone_start_pfn;
+ else
+ zone->zone_start_pfn = 0;
+ zone->spanned_pages = size;
+ zone->present_pages = real_size;
+
+ totalpages += size;
+ realtotalpages += real_size;
+ }
+
+ pgdat->node_spanned_pages = totalpages;
+ pgdat->node_present_pages = realtotalpages;
+ printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
+ realtotalpages);
+}
+
+#ifndef CONFIG_SPARSEMEM
+/*
+ * Calculate the size of the zone->blockflags rounded to an unsigned long
+ * Start by making sure zonesize is a multiple of pageblock_order by rounding
+ * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
+ * round what is now in bits to nearest long in bits, then return it in
+ * bytes.
+ */
+static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
+{
+ unsigned long usemapsize;
+
+ zonesize += zone_start_pfn & (pageblock_nr_pages-1);
+ usemapsize = roundup(zonesize, pageblock_nr_pages);
+ usemapsize = usemapsize >> pageblock_order;
+ usemapsize *= NR_PAGEBLOCK_BITS;
+ usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
+
+ return usemapsize / 8;
+}
+
+static void __ref setup_usemap(struct pglist_data *pgdat,
+ struct zone *zone,
+ unsigned long zone_start_pfn,
+ unsigned long zonesize)
+{
+ unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
+ zone->pageblock_flags = NULL;
+ if (usemapsize) {
+ zone->pageblock_flags =
+ memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
+ pgdat->node_id);
+ if (!zone->pageblock_flags)
+ panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
+ usemapsize, zone->name, pgdat->node_id);
+ }
+}
+#else
+static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
+ unsigned long zone_start_pfn, unsigned long zonesize) {}
+#endif /* CONFIG_SPARSEMEM */
+
+#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
+
+/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
+void __init set_pageblock_order(void)
+{
+ unsigned int order;
+
+ /* Check that pageblock_nr_pages has not already been setup */
+ if (pageblock_order)
+ return;
+
+ if (HPAGE_SHIFT > PAGE_SHIFT)
+ order = HUGETLB_PAGE_ORDER;
+ else
+ order = MAX_ORDER - 1;
+
+ /*
+ * Assume the largest contiguous order of interest is a huge page.
+ * This value may be variable depending on boot parameters on IA64 and
+ * powerpc.
+ */
+ pageblock_order = order;
+}
+#else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
+
+/*
+ * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
+ * is unused as pageblock_order is set at compile-time. See
+ * include/linux/pageblock-flags.h for the values of pageblock_order based on
+ * the kernel config
+ */
+void __init set_pageblock_order(void)
+{
+}
+
+#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
+
+static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
+ unsigned long present_pages)
+{
+ unsigned long pages = spanned_pages;
+
+ /*
+ * Provide a more accurate estimation if there are holes within
+ * the zone and SPARSEMEM is in use. If there are holes within the
+ * zone, each populated memory region may cost us one or two extra
+ * memmap pages due to alignment because memmap pages for each
+ * populated regions may not be naturally aligned on page boundary.
+ * So the (present_pages >> 4) heuristic is a tradeoff for that.
+ */
+ if (spanned_pages > present_pages + (present_pages >> 4) &&
+ IS_ENABLED(CONFIG_SPARSEMEM))
+ pages = present_pages;
+
+ return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void pgdat_init_split_queue(struct pglist_data *pgdat)
+{
+ struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
+
+ spin_lock_init(&ds_queue->split_queue_lock);
+ INIT_LIST_HEAD(&ds_queue->split_queue);
+ ds_queue->split_queue_len = 0;
+}
+#else
+static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
+#endif
+
+#ifdef CONFIG_COMPACTION
+static void pgdat_init_kcompactd(struct pglist_data *pgdat)
+{
+ init_waitqueue_head(&pgdat->kcompactd_wait);
+}
+#else
+static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
+#endif
+
+static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
+{
+ pgdat_resize_init(pgdat);
+
+ pgdat_init_split_queue(pgdat);
+ pgdat_init_kcompactd(pgdat);
+
+ init_waitqueue_head(&pgdat->kswapd_wait);
+ init_waitqueue_head(&pgdat->pfmemalloc_wait);
+
+ pgdat_page_ext_init(pgdat);
+ spin_lock_init(&pgdat->lru_lock);
+ lruvec_init(&pgdat->__lruvec);
+}
+
+static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
+ unsigned long remaining_pages)
+{
+ atomic_long_set(&zone->managed_pages, remaining_pages);
+ zone_set_nid(zone, nid);
+ zone->name = zone_names[idx];
+ zone->zone_pgdat = NODE_DATA(nid);
+ spin_lock_init(&zone->lock);
+ zone_seqlock_init(zone);
+ zone_pcp_init(zone);
+}
+
+/*
+ * Set up the zone data structures
+ * - init pgdat internals
+ * - init all zones belonging to this node
+ *
+ * NOTE: this function is only called during memory hotplug
+ */
+#ifdef CONFIG_MEMORY_HOTPLUG
+void __ref free_area_init_core_hotplug(int nid)
+{
+ enum zone_type z;
+ pg_data_t *pgdat = NODE_DATA(nid);
+
+ pgdat_init_internals(pgdat);
+ for (z = 0; z < MAX_NR_ZONES; z++)
+ zone_init_internals(&pgdat->node_zones[z], z, nid, 0);
+}
+#endif
+
+/*
+ * Set up the zone data structures:
+ * - mark all pages reserved
+ * - mark all memory queues empty
+ * - clear the memory bitmaps
+ *
+ * NOTE: pgdat should get zeroed by caller.
+ * NOTE: this function is only called during early init.
+ */
+static void __init free_area_init_core(struct pglist_data *pgdat)
+{
+ enum zone_type j;
+ int nid = pgdat->node_id;
+
+ pgdat_init_internals(pgdat);
+ pgdat->per_cpu_nodestats = &boot_nodestats;
+
+ for (j = 0; j < MAX_NR_ZONES; j++) {
+ struct zone *zone = pgdat->node_zones + j;
+ unsigned long size, freesize, memmap_pages;
+ unsigned long zone_start_pfn = zone->zone_start_pfn;
+
+ size = zone->spanned_pages;
+ freesize = zone->present_pages;
+
+ /*
+ * Adjust freesize so that it accounts for how much memory
+ * is used by this zone for memmap. This affects the watermark
+ * and per-cpu initialisations
+ */
+ memmap_pages = calc_memmap_size(size, freesize);
+ if (!is_highmem_idx(j)) {
+ if (freesize >= memmap_pages) {
+ freesize -= memmap_pages;
+ if (memmap_pages)
+ printk(KERN_DEBUG
+ " %s zone: %lu pages used for memmap\n",
+ zone_names[j], memmap_pages);
+ } else
+ pr_warn(" %s zone: %lu pages exceeds freesize %lu\n",
+ zone_names[j], memmap_pages, freesize);
+ }
+
+ /* Account for reserved pages */
+ if (j == 0 && freesize > dma_reserve) {
+ freesize -= dma_reserve;
+ printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
+ zone_names[0], dma_reserve);
+ }
+
+ if (!is_highmem_idx(j))
+ nr_kernel_pages += freesize;
+ /* Charge for highmem memmap if there are enough kernel pages */
+ else if (nr_kernel_pages > memmap_pages * 2)
+ nr_kernel_pages -= memmap_pages;
+ nr_all_pages += freesize;
+
+ /*
+ * Set an approximate value for lowmem here, it will be adjusted
+ * when the bootmem allocator frees pages into the buddy system.
+ * And all highmem pages will be managed by the buddy system.
+ */
+ zone_init_internals(zone, j, nid, freesize);
+
+ if (!size)
+ continue;
+
+ set_pageblock_order();
+ setup_usemap(pgdat, zone, zone_start_pfn, size);
+ init_currently_empty_zone(zone, zone_start_pfn, size);
+ arch_memmap_init(size, nid, j, zone_start_pfn);
+ }
+}
+
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
+static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
+{
+ unsigned long __maybe_unused start = 0;
+ unsigned long __maybe_unused offset = 0;
+
+ /* Skip empty nodes */
+ if (!pgdat->node_spanned_pages)
+ return;
+
+ start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
+ offset = pgdat->node_start_pfn - start;
+ /* ia64 gets its own node_mem_map, before this, without bootmem */
+ if (!pgdat->node_mem_map) {
+ unsigned long size, end;
+ struct page *map;
+
+ /*
+ * The zone's endpoints aren't required to be MAX_ORDER
+ * aligned but the node_mem_map endpoints must be in order
+ * for the buddy allocator to function correctly.
+ */
+ end = pgdat_end_pfn(pgdat);
+ end = ALIGN(end, MAX_ORDER_NR_PAGES);
+ size = (end - start) * sizeof(struct page);
+ map = memblock_alloc_node(size, SMP_CACHE_BYTES,
+ pgdat->node_id);
+ if (!map)
+ panic("Failed to allocate %ld bytes for node %d memory map\n",
+ size, pgdat->node_id);
+ pgdat->node_mem_map = map + offset;
+ }
+ pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
+ __func__, pgdat->node_id, (unsigned long)pgdat,
+ (unsigned long)pgdat->node_mem_map);
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+ /*
+ * With no DISCONTIG, the global mem_map is just set as node 0's
+ */
+ if (pgdat == NODE_DATA(0)) {
+ mem_map = NODE_DATA(0)->node_mem_map;
+ if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
+ mem_map -= offset;
+ }
+#endif
+}
+#else
+static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { }
+#endif /* CONFIG_FLAT_NODE_MEM_MAP */
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
+{
+ pgdat->first_deferred_pfn = ULONG_MAX;
+}
+#else
+static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
+#endif
+
+static void __init free_area_init_node(int nid)
+{
+ pg_data_t *pgdat = NODE_DATA(nid);
+ unsigned long start_pfn = 0;
+ unsigned long end_pfn = 0;
+
+ /* pg_data_t should be reset to zero when it's allocated */
+ WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx);
+
+ get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+
+ pgdat->node_id = nid;
+ pgdat->node_start_pfn = start_pfn;
+ pgdat->per_cpu_nodestats = NULL;
+
+ pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
+ (u64)start_pfn << PAGE_SHIFT,
+ end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
+ calculate_node_totalpages(pgdat, start_pfn, end_pfn);
+
+ alloc_node_mem_map(pgdat);
+ pgdat_set_deferred_range(pgdat);
+
+ free_area_init_core(pgdat);
+}
+
+void __init free_area_init_memoryless_node(int nid)
+{
+ free_area_init_node(nid);
+}
+
+#if MAX_NUMNODES > 1
+/*
+ * Figure out the number of possible node ids.
+ */
+void __init setup_nr_node_ids(void)
+{
+ unsigned int highest;
+
+ highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
+ nr_node_ids = highest + 1;
+}
+#endif
+
+/**
+ * node_map_pfn_alignment - determine the maximum internode alignment
+ *
+ * This function should be called after node map is populated and sorted.
+ * It calculates the maximum power of two alignment which can distinguish
+ * all the nodes.
+ *
+ * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
+ * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
+ * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
+ * shifted, 1GiB is enough and this function will indicate so.
+ *
+ * This is used to test whether pfn -> nid mapping of the chosen memory
+ * model has fine enough granularity to avoid incorrect mapping for the
+ * populated node map.
+ *
+ * Return: the determined alignment in pfn's. 0 if there is no alignment
+ * requirement (single node).
+ */
+unsigned long __init node_map_pfn_alignment(void)
+{
+ unsigned long accl_mask = 0, last_end = 0;
+ unsigned long start, end, mask;
+ int last_nid = NUMA_NO_NODE;
+ int i, nid;
+
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
+ if (!start || last_nid < 0 || last_nid == nid) {
+ last_nid = nid;
+ last_end = end;
+ continue;
+ }
+
+ /*
+ * Start with a mask granular enough to pin-point to the
+ * start pfn and tick off bits one-by-one until it becomes
+ * too coarse to separate the current node from the last.
+ */
+ mask = ~((1 << __ffs(start)) - 1);
+ while (mask && last_end <= (start & (mask << 1)))
+ mask <<= 1;
+
+ /* accumulate all internode masks */
+ accl_mask |= mask;
+ }
+
+ /* convert mask to number of pages */
+ return ~accl_mask + 1;
+}
+
+/**
+ * find_min_pfn_with_active_regions - Find the minimum PFN registered
+ *
+ * Return: the minimum PFN based on information provided via
+ * memblock_set_node().
+ */
+unsigned long __init find_min_pfn_with_active_regions(void)
+{
+ return PHYS_PFN(memblock_start_of_DRAM());
+}
+
+/*
+ * early_calculate_totalpages()
+ * Sum pages in active regions for movable zone.
+ * Populate N_MEMORY for calculating usable_nodes.
+ */
+static unsigned long __init early_calculate_totalpages(void)
+{
+ unsigned long totalpages = 0;
+ unsigned long start_pfn, end_pfn;
+ int i, nid;
+
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+ unsigned long pages = end_pfn - start_pfn;
+
+ totalpages += pages;
+ if (pages)
+ node_set_state(nid, N_MEMORY);
+ }
+ return totalpages;
+}
+
+/*
+ * Find the PFN the Movable zone begins in each node. Kernel memory
+ * is spread evenly between nodes as long as the nodes have enough
+ * memory. When they don't, some nodes will have more kernelcore than
+ * others
+ */
+static void __init find_zone_movable_pfns_for_nodes(void)
+{
+ int i, nid;
+ unsigned long usable_startpfn;
+ unsigned long kernelcore_node, kernelcore_remaining;
+ /* save the state before borrow the nodemask */
+ nodemask_t saved_node_state = node_states[N_MEMORY];
+ unsigned long totalpages = early_calculate_totalpages();
+ int usable_nodes = nodes_weight(node_states[N_MEMORY]);
+ struct memblock_region *r;
+
+ /* Need to find movable_zone earlier when movable_node is specified. */
+ find_usable_zone_for_movable();
+
+ /*
+ * If movable_node is specified, ignore kernelcore and movablecore
+ * options.
+ */
+ if (movable_node_is_enabled()) {
+ for_each_mem_region(r) {
+ if (!memblock_is_hotpluggable(r))
+ continue;
+
+ nid = memblock_get_region_node(r);
+
+ usable_startpfn = PFN_DOWN(r->base);
+ zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
+ min(usable_startpfn, zone_movable_pfn[nid]) :
+ usable_startpfn;
+ }
+
+ goto out2;
+ }
+
+ /*
+ * If kernelcore=mirror is specified, ignore movablecore option
+ */
+ if (mirrored_kernelcore) {
+ bool mem_below_4gb_not_mirrored = false;
+
+ for_each_mem_region(r) {
+ if (memblock_is_mirror(r))
+ continue;
+
+ nid = memblock_get_region_node(r);
+
+ usable_startpfn = memblock_region_memory_base_pfn(r);
+
+ if (usable_startpfn < 0x100000) {
+ mem_below_4gb_not_mirrored = true;
+ continue;
+ }
+
+ zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
+ min(usable_startpfn, zone_movable_pfn[nid]) :
+ usable_startpfn;
+ }
+
+ if (mem_below_4gb_not_mirrored)
+ pr_warn("This configuration results in unmirrored kernel memory.\n");
+
+ goto out2;
+ }
+
+ /*
+ * If kernelcore=nn% or movablecore=nn% was specified, calculate the
+ * amount of necessary memory.
+ */
+ if (required_kernelcore_percent)
+ required_kernelcore = (totalpages * 100 * required_kernelcore_percent) /
+ 10000UL;
+ if (required_movablecore_percent)
+ required_movablecore = (totalpages * 100 * required_movablecore_percent) /
+ 10000UL;
+
+ /*
+ * If movablecore= was specified, calculate what size of
+ * kernelcore that corresponds so that memory usable for
+ * any allocation type is evenly spread. If both kernelcore
+ * and movablecore are specified, then the value of kernelcore
+ * will be used for required_kernelcore if it's greater than
+ * what movablecore would have allowed.
+ */
+ if (required_movablecore) {
+ unsigned long corepages;
+
+ /*
+ * Round-up so that ZONE_MOVABLE is at least as large as what
+ * was requested by the user
+ */
+ required_movablecore =
+ roundup(required_movablecore, MAX_ORDER_NR_PAGES);
+ required_movablecore = min(totalpages, required_movablecore);
+ corepages = totalpages - required_movablecore;
+
+ required_kernelcore = max(required_kernelcore, corepages);
+ }
+
+ /*
+ * If kernelcore was not specified or kernelcore size is larger
+ * than totalpages, there is no ZONE_MOVABLE.
+ */
+ if (!required_kernelcore || required_kernelcore >= totalpages)
+ goto out;
+
+ /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
+ usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
+
+restart:
+ /* Spread kernelcore memory as evenly as possible throughout nodes */
+ kernelcore_node = required_kernelcore / usable_nodes;
+ for_each_node_state(nid, N_MEMORY) {
+ unsigned long start_pfn, end_pfn;
+
+ /*
+ * Recalculate kernelcore_node if the division per node
+ * now exceeds what is necessary to satisfy the requested
+ * amount of memory for the kernel
+ */
+ if (required_kernelcore < kernelcore_node)
+ kernelcore_node = required_kernelcore / usable_nodes;
+
+ /*
+ * As the map is walked, we track how much memory is usable
+ * by the kernel using kernelcore_remaining. When it is
+ * 0, the rest of the node is usable by ZONE_MOVABLE
+ */
+ kernelcore_remaining = kernelcore_node;
+
+ /* Go through each range of PFNs within this node */
+ for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
+ unsigned long size_pages;
+
+ start_pfn = max(start_pfn, zone_movable_pfn[nid]);
+ if (start_pfn >= end_pfn)
+ continue;
+
+ /* Account for what is only usable for kernelcore */
+ if (start_pfn < usable_startpfn) {
+ unsigned long kernel_pages;
+ kernel_pages = min(end_pfn, usable_startpfn)
+ - start_pfn;
+
+ kernelcore_remaining -= min(kernel_pages,
+ kernelcore_remaining);
+ required_kernelcore -= min(kernel_pages,
+ required_kernelcore);
+
+ /* Continue if range is now fully accounted */
+ if (end_pfn <= usable_startpfn) {
+
+ /*
+ * Push zone_movable_pfn to the end so
+ * that if we have to rebalance
+ * kernelcore across nodes, we will
+ * not double account here
+ */
+ zone_movable_pfn[nid] = end_pfn;
+ continue;
+ }
+ start_pfn = usable_startpfn;
+ }
+
+ /*
+ * The usable PFN range for ZONE_MOVABLE is from
+ * start_pfn->end_pfn. Calculate size_pages as the
+ * number of pages used as kernelcore
+ */
+ size_pages = end_pfn - start_pfn;
+ if (size_pages > kernelcore_remaining)
+ size_pages = kernelcore_remaining;
+ zone_movable_pfn[nid] = start_pfn + size_pages;
+
+ /*
+ * Some kernelcore has been met, update counts and
+ * break if the kernelcore for this node has been
+ * satisfied
+ */
+ required_kernelcore -= min(required_kernelcore,
+ size_pages);
+ kernelcore_remaining -= size_pages;
+ if (!kernelcore_remaining)
+ break;
+ }
+ }
+
+ /*
+ * If there is still required_kernelcore, we do another pass with one
+ * less node in the count. This will push zone_movable_pfn[nid] further
+ * along on the nodes that still have memory until kernelcore is
+ * satisfied
+ */
+ usable_nodes--;
+ if (usable_nodes && required_kernelcore > usable_nodes)
+ goto restart;
+
+out2:
+ /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
+ for (nid = 0; nid < MAX_NUMNODES; nid++) {
+ unsigned long start_pfn, end_pfn;
+
+ zone_movable_pfn[nid] =
+ roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
+
+ get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+ if (zone_movable_pfn[nid] >= end_pfn)
+ zone_movable_pfn[nid] = 0;
+ }
+
+out:
+ /* restore the node_state */
+ node_states[N_MEMORY] = saved_node_state;
+}
+
+/* Any regular or high memory on that node ? */
+static void check_for_memory(pg_data_t *pgdat, int nid)
+{
+ enum zone_type zone_type;
+
+ for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
+ struct zone *zone = &pgdat->node_zones[zone_type];
+ if (populated_zone(zone)) {
+ if (IS_ENABLED(CONFIG_HIGHMEM))
+ node_set_state(nid, N_HIGH_MEMORY);
+ if (zone_type <= ZONE_NORMAL)
+ node_set_state(nid, N_NORMAL_MEMORY);
+ break;
+ }
+ }
+}
+
+/*
+ * Some architecturs, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For
+ * such cases we allow max_zone_pfn sorted in the descending order
+ */
+bool __weak arch_has_descending_max_zone_pfns(void)
+{
+ return false;
+}
+
+/**
+ * free_area_init - Initialise all pg_data_t and zone data
+ * @max_zone_pfn: an array of max PFNs for each zone
+ *
+ * This will call free_area_init_node() for each active node in the system.
+ * Using the page ranges provided by memblock_set_node(), the size of each
+ * zone in each node and their holes is calculated. If the maximum PFN
+ * between two adjacent zones match, it is assumed that the zone is empty.
+ * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
+ * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
+ * starts where the previous one ended. For example, ZONE_DMA32 starts
+ * at arch_max_dma_pfn.
+ */
+void __init free_area_init(unsigned long *max_zone_pfn)
+{
+ unsigned long start_pfn, end_pfn;
+ int i, nid, zone;
+ bool descending;
+
+ /* Record where the zone boundaries are */
+ memset(arch_zone_lowest_possible_pfn, 0,
+ sizeof(arch_zone_lowest_possible_pfn));
+ memset(arch_zone_highest_possible_pfn, 0,
+ sizeof(arch_zone_highest_possible_pfn));
+
+ start_pfn = find_min_pfn_with_active_regions();
+ descending = arch_has_descending_max_zone_pfns();
+
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ if (descending)
+ zone = MAX_NR_ZONES - i - 1;
+ else
+ zone = i;
+
+ if (zone == ZONE_MOVABLE)
+ continue;
+
+ end_pfn = max(max_zone_pfn[zone], start_pfn);
+ arch_zone_lowest_possible_pfn[zone] = start_pfn;
+ arch_zone_highest_possible_pfn[zone] = end_pfn;
+
+ start_pfn = end_pfn;
+ }
+
+ /* Find the PFNs that ZONE_MOVABLE begins at in each node */
+ memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
+ find_zone_movable_pfns_for_nodes();
+
+ /* Print out the zone ranges */
+ pr_info("Zone ranges:\n");
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ if (i == ZONE_MOVABLE)
+ continue;
+ pr_info(" %-8s ", zone_names[i]);
+ if (arch_zone_lowest_possible_pfn[i] ==
+ arch_zone_highest_possible_pfn[i])
+ pr_cont("empty\n");
+ else
+ pr_cont("[mem %#018Lx-%#018Lx]\n",
+ (u64)arch_zone_lowest_possible_pfn[i]
+ << PAGE_SHIFT,
+ ((u64)arch_zone_highest_possible_pfn[i]
+ << PAGE_SHIFT) - 1);
+ }
+
+ /* Print out the PFNs ZONE_MOVABLE begins at in each node */
+ pr_info("Movable zone start for each node\n");
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ if (zone_movable_pfn[i])
+ pr_info(" Node %d: %#018Lx\n", i,
+ (u64)zone_movable_pfn[i] << PAGE_SHIFT);
+ }
+
+ /*
+ * Print out the early node map, and initialize the
+ * subsection-map relative to active online memory ranges to
+ * enable future "sub-section" extensions of the memory map.
+ */
+ pr_info("Early memory node ranges\n");
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+ pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
+ (u64)start_pfn << PAGE_SHIFT,
+ ((u64)end_pfn << PAGE_SHIFT) - 1);
+ subsection_map_init(start_pfn, end_pfn - start_pfn);
+ }
+
+ /* Initialise every node */
+ mminit_verify_pageflags_layout();
+ setup_nr_node_ids();
+ for_each_online_node(nid) {
+ pg_data_t *pgdat = NODE_DATA(nid);
+ free_area_init_node(nid);
+
+ /* Any memory on that node */
+ if (pgdat->node_present_pages)
+ node_set_state(nid, N_MEMORY);
+ check_for_memory(pgdat, nid);
+ }
+
+ memmap_init();
+}
+
+static int __init cmdline_parse_core(char *p, unsigned long *core,
+ unsigned long *percent)
+{
+ unsigned long long coremem;
+ char *endptr;
+
+ if (!p)
+ return -EINVAL;
+
+ /* Value may be a percentage of total memory, otherwise bytes */
+ coremem = simple_strtoull(p, &endptr, 0);
+ if (*endptr == '%') {
+ /* Paranoid check for percent values greater than 100 */
+ WARN_ON(coremem > 100);
+
+ *percent = coremem;
+ } else {
+ coremem = memparse(p, &p);
+ /* Paranoid check that UL is enough for the coremem value */
+ WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
+
+ *core = coremem >> PAGE_SHIFT;
+ *percent = 0UL;
+ }
+ return 0;
+}
+
+/*
+ * kernelcore=size sets the amount of memory for use for allocations that
+ * cannot be reclaimed or migrated.
+ */
+static int __init cmdline_parse_kernelcore(char *p)
+{
+ /* parse kernelcore=mirror */
+ if (parse_option_str(p, "mirror")) {
+ mirrored_kernelcore = true;
+ return 0;
+ }
+
+ return cmdline_parse_core(p, &required_kernelcore,
+ &required_kernelcore_percent);
+}
+
+/*
+ * movablecore=size sets the amount of memory for use for allocations that
+ * can be reclaimed or migrated.
+ */
+static int __init cmdline_parse_movablecore(char *p)
+{
+ return cmdline_parse_core(p, &required_movablecore,
+ &required_movablecore_percent);
+}
+
+early_param("kernelcore", cmdline_parse_kernelcore);
+early_param("movablecore", cmdline_parse_movablecore);
+
+void adjust_managed_page_count(struct page *page, long count)
+{
+ atomic_long_add(count, &page_zone(page)->managed_pages);
+ totalram_pages_add(count);
+#ifdef CONFIG_HIGHMEM
+ if (PageHighMem(page))
+ totalhigh_pages_add(count);
+#endif
+}
+EXPORT_SYMBOL(adjust_managed_page_count);
+
+unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
+{
+ void *pos;
+ unsigned long pages = 0;
+
+ start = (void *)PAGE_ALIGN((unsigned long)start);
+ end = (void *)((unsigned long)end & PAGE_MASK);
+ for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
+ struct page *page = virt_to_page(pos);
+ void *direct_map_addr;
+
+ /*
+ * 'direct_map_addr' might be different from 'pos'
+ * because some architectures' virt_to_page()
+ * work with aliases. Getting the direct map
+ * address ensures that we get a _writeable_
+ * alias for the memset().
+ */
+ direct_map_addr = page_address(page);
+ if ((unsigned int)poison <= 0xFF)
+ memset(direct_map_addr, poison, PAGE_SIZE);
+
+ free_reserved_page(page);
+ }
+
+ if (pages && s)
+ pr_info("Freeing %s memory: %ldK\n",
+ s, pages << (PAGE_SHIFT - 10));
+
+ return pages;
+}
+
+#ifdef CONFIG_HIGHMEM
+void free_highmem_page(struct page *page)
+{
+ __free_reserved_page(page);
+ totalram_pages_inc();
+ atomic_long_inc(&page_zone(page)->managed_pages);
+ totalhigh_pages_inc();
+}
+#endif
+
+
+void __init mem_init_print_info(const char *str)
+{
+ unsigned long physpages, codesize, datasize, rosize, bss_size;
+ unsigned long init_code_size, init_data_size;
+
+ physpages = get_num_physpages();
+ codesize = _etext - _stext;
+ datasize = _edata - _sdata;
+ rosize = __end_rodata - __start_rodata;
+ bss_size = __bss_stop - __bss_start;
+ init_data_size = __init_end - __init_begin;
+ init_code_size = _einittext - _sinittext;
+
+ /*
+ * Detect special cases and adjust section sizes accordingly:
+ * 1) .init.* may be embedded into .data sections
+ * 2) .init.text.* may be out of [__init_begin, __init_end],
+ * please refer to arch/tile/kernel/vmlinux.lds.S.
+ * 3) .rodata.* may be embedded into .text or .data sections.
+ */
+#define adj_init_size(start, end, size, pos, adj) \
+ do { \
+ if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \
+ size -= adj; \
+ } while (0)
+
+ adj_init_size(__init_begin, __init_end, init_data_size,
+ _sinittext, init_code_size);
+ adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
+ adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
+ adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
+ adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
+
+#undef adj_init_size
+
+ pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
+#ifdef CONFIG_HIGHMEM
+ ", %luK highmem"
+#endif
+ "%s%s)\n",
+ nr_free_pages() << (PAGE_SHIFT - 10),
+ physpages << (PAGE_SHIFT - 10),
+ codesize >> 10, datasize >> 10, rosize >> 10,
+ (init_data_size + init_code_size) >> 10, bss_size >> 10,
+ (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10),
+ totalcma_pages << (PAGE_SHIFT - 10),
+#ifdef CONFIG_HIGHMEM
+ totalhigh_pages() << (PAGE_SHIFT - 10),
+#endif
+ str ? ", " : "", str ? str : "");
+}
+
+/**
+ * set_dma_reserve - set the specified number of pages reserved in the first zone
+ * @new_dma_reserve: The number of pages to mark reserved
+ *
+ * The per-cpu batchsize and zone watermarks are determined by managed_pages.
+ * In the DMA zone, a significant percentage may be consumed by kernel image
+ * and other unfreeable allocations which can skew the watermarks badly. This
+ * function may optionally be used to account for unfreeable pages in the
+ * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
+ * smaller per-cpu batchsize.
+ */
+void __init set_dma_reserve(unsigned long new_dma_reserve)
+{
+ dma_reserve = new_dma_reserve;
+}
+
+static int page_alloc_cpu_dead(unsigned int cpu)
+{
+
+ lru_add_drain_cpu(cpu);
+ drain_pages(cpu);
+
+ /*
+ * Spill the event counters of the dead processor
+ * into the current processors event counters.
+ * This artificially elevates the count of the current
+ * processor.
+ */
+ vm_events_fold_cpu(cpu);
+
+ /*
+ * Zero the differential counters of the dead processor
+ * so that the vm statistics are consistent.
+ *
+ * This is only okay since the processor is dead and cannot
+ * race with what we are doing.
+ */
+ cpu_vm_stats_fold(cpu);
+ return 0;
+}
+
+#ifdef CONFIG_NUMA
+int hashdist = HASHDIST_DEFAULT;
+
+static int __init set_hashdist(char *str)
+{
+ if (!str)
+ return 0;
+ hashdist = simple_strtoul(str, &str, 0);
+ return 1;
+}
+__setup("hashdist=", set_hashdist);
+#endif
+
+void __init page_alloc_init(void)
+{
+ int ret;
+
+#ifdef CONFIG_NUMA
+ if (num_node_state(N_MEMORY) == 1)
+ hashdist = 0;
+#endif
+
+ ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD,
+ "mm/page_alloc:dead", NULL,
+ page_alloc_cpu_dead);
+ WARN_ON(ret < 0);
+}
+
+/*
+ * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
+ * or min_free_kbytes changes.
+ */
+static void calculate_totalreserve_pages(void)
+{
+ struct pglist_data *pgdat;
+ unsigned long reserve_pages = 0;
+ enum zone_type i, j;
+
+ for_each_online_pgdat(pgdat) {
+
+ pgdat->totalreserve_pages = 0;
+
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ struct zone *zone = pgdat->node_zones + i;
+ long max = 0;
+ unsigned long managed_pages = zone_managed_pages(zone);
+
+ /* Find valid and maximum lowmem_reserve in the zone */
+ for (j = i; j < MAX_NR_ZONES; j++) {
+ if (zone->lowmem_reserve[j] > max)
+ max = zone->lowmem_reserve[j];
+ }
+
+ /* we treat the high watermark as reserved pages. */
+ max += high_wmark_pages(zone);
+
+ if (max > managed_pages)
+ max = managed_pages;
+
+ pgdat->totalreserve_pages += max;
+
+ reserve_pages += max;
+ }
+ }
+ totalreserve_pages = reserve_pages;
+}
+
+/*
+ * setup_per_zone_lowmem_reserve - called whenever
+ * sysctl_lowmem_reserve_ratio changes. Ensures that each zone
+ * has a correct pages reserved value, so an adequate number of
+ * pages are left in the zone after a successful __alloc_pages().
+ */
+static void setup_per_zone_lowmem_reserve(void)
+{
+ struct pglist_data *pgdat;
+ enum zone_type i, j;
+
+ for_each_online_pgdat(pgdat) {
+ for (i = 0; i < MAX_NR_ZONES - 1; i++) {
+ struct zone *zone = &pgdat->node_zones[i];
+ int ratio = sysctl_lowmem_reserve_ratio[i];
+ bool clear = !ratio || !zone_managed_pages(zone);
+ unsigned long managed_pages = 0;
+
+ for (j = i + 1; j < MAX_NR_ZONES; j++) {
+ struct zone *upper_zone = &pgdat->node_zones[j];
+
+ managed_pages += zone_managed_pages(upper_zone);
+
+ if (clear)
+ zone->lowmem_reserve[j] = 0;
+ else
+ zone->lowmem_reserve[j] = managed_pages / ratio;
+ }
+ }
+ }
+
+ /* update totalreserve_pages */
+ calculate_totalreserve_pages();
+}
+
+static void __setup_per_zone_wmarks(void)
+{
+ unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
+ unsigned long lowmem_pages = 0;
+ struct zone *zone;
+ unsigned long flags;
+
+ /* Calculate total number of !ZONE_HIGHMEM pages */
+ for_each_zone(zone) {
+ if (!is_highmem(zone))
+ lowmem_pages += zone_managed_pages(zone);
+ }
+
+ for_each_zone(zone) {
+ u64 tmp;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ tmp = (u64)pages_min * zone_managed_pages(zone);
+ do_div(tmp, lowmem_pages);
+ if (is_highmem(zone)) {
+ /*
+ * __GFP_HIGH and PF_MEMALLOC allocations usually don't
+ * need highmem pages, so cap pages_min to a small
+ * value here.
+ *
+ * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
+ * deltas control async page reclaim, and so should
+ * not be capped for highmem.
+ */
+ unsigned long min_pages;
+
+ min_pages = zone_managed_pages(zone) / 1024;
+ min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
+ zone->_watermark[WMARK_MIN] = min_pages;
+ } else {
+ /*
+ * If it's a lowmem zone, reserve a number of pages
+ * proportionate to the zone's size.
+ */
+ zone->_watermark[WMARK_MIN] = tmp;
+ }
+
+ /*
+ * Set the kswapd watermarks distance according to the
+ * scale factor in proportion to available memory, but
+ * ensure a minimum size on small systems.
+ */
+ tmp = max_t(u64, tmp >> 2,
+ mult_frac(zone_managed_pages(zone),
+ watermark_scale_factor, 10000));
+
+ zone->watermark_boost = 0;
+ zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
+ zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
+
+ spin_unlock_irqrestore(&zone->lock, flags);
+ }
+
+ /* update totalreserve_pages */
+ calculate_totalreserve_pages();
+}
+
+/**
+ * setup_per_zone_wmarks - called when min_free_kbytes changes
+ * or when memory is hot-{added|removed}
+ *
+ * Ensures that the watermark[min,low,high] values for each zone are set
+ * correctly with respect to min_free_kbytes.
+ */
+void setup_per_zone_wmarks(void)
+{
+ static DEFINE_SPINLOCK(lock);
+
+ spin_lock(&lock);
+ __setup_per_zone_wmarks();
+ spin_unlock(&lock);
+}
+
+/*
+ * Initialise min_free_kbytes.
+ *
+ * For small machines we want it small (128k min). For large machines
+ * we want it large (256MB max). But it is not linear, because network
+ * bandwidth does not increase linearly with machine size. We use
+ *
+ * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
+ * min_free_kbytes = sqrt(lowmem_kbytes * 16)
+ *
+ * which yields
+ *
+ * 16MB: 512k
+ * 32MB: 724k
+ * 64MB: 1024k
+ * 128MB: 1448k
+ * 256MB: 2048k
+ * 512MB: 2896k
+ * 1024MB: 4096k
+ * 2048MB: 5792k
+ * 4096MB: 8192k
+ * 8192MB: 11584k
+ * 16384MB: 16384k
+ */
+int __meminit init_per_zone_wmark_min(void)
+{
+ unsigned long lowmem_kbytes;
+ int new_min_free_kbytes;
+
+ lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
+ new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
+
+ if (new_min_free_kbytes > user_min_free_kbytes) {
+ min_free_kbytes = new_min_free_kbytes;
+ if (min_free_kbytes < 128)
+ min_free_kbytes = 128;
+ if (min_free_kbytes > 262144)
+ min_free_kbytes = 262144;
+ } else {
+ pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
+ new_min_free_kbytes, user_min_free_kbytes);
+ }
+ setup_per_zone_wmarks();
+ refresh_zone_stat_thresholds();
+ setup_per_zone_lowmem_reserve();
+
+#ifdef CONFIG_NUMA
+ setup_min_unmapped_ratio();
+ setup_min_slab_ratio();
+#endif
+
+ khugepaged_min_free_kbytes_update();
+
+ return 0;
+}
+postcore_initcall(init_per_zone_wmark_min)
+
+/*
+ * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
+ * that we can call two helper functions whenever min_free_kbytes
+ * changes.
+ */
+int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *length, loff_t *ppos)
+{
+ int rc;
+
+ rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+ if (rc)
+ return rc;
+
+ if (write) {
+ user_min_free_kbytes = min_free_kbytes;
+ setup_per_zone_wmarks();
+ }
+ return 0;
+}
+
+int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *length, loff_t *ppos)
+{
+ int rc;
+
+ rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+ if (rc)
+ return rc;
+
+ if (write)
+ setup_per_zone_wmarks();
+
+ return 0;
+}
+
+#ifdef CONFIG_NUMA
+static void setup_min_unmapped_ratio(void)
+{
+ pg_data_t *pgdat;
+ struct zone *zone;
+
+ for_each_online_pgdat(pgdat)
+ pgdat->min_unmapped_pages = 0;
+
+ for_each_zone(zone)
+ zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) *
+ sysctl_min_unmapped_ratio) / 100;
+}
+
+
+int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *length, loff_t *ppos)
+{
+ int rc;
+
+ rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+ if (rc)
+ return rc;
+
+ setup_min_unmapped_ratio();
+
+ return 0;
+}
+
+static void setup_min_slab_ratio(void)
+{
+ pg_data_t *pgdat;
+ struct zone *zone;
+
+ for_each_online_pgdat(pgdat)
+ pgdat->min_slab_pages = 0;
+
+ for_each_zone(zone)
+ zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) *
+ sysctl_min_slab_ratio) / 100;
+}
+
+int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *length, loff_t *ppos)
+{
+ int rc;
+
+ rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+ if (rc)
+ return rc;
+
+ setup_min_slab_ratio();
+
+ return 0;
+}
+#endif
+
+/*
+ * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
+ * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
+ * whenever sysctl_lowmem_reserve_ratio changes.
+ *
+ * The reserve ratio obviously has absolutely no relation with the
+ * minimum watermarks. The lowmem reserve ratio can only make sense
+ * if in function of the boot time zone sizes.
+ */
+int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *length, loff_t *ppos)
+{
+ int i;
+
+ proc_dointvec_minmax(table, write, buffer, length, ppos);
+
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ if (sysctl_lowmem_reserve_ratio[i] < 1)
+ sysctl_lowmem_reserve_ratio[i] = 0;
+ }
+
+ setup_per_zone_lowmem_reserve();
+ return 0;
+}
+
+static void __zone_pcp_update(struct zone *zone)
+{
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu)
+ pageset_set_high_and_batch(zone,
+ per_cpu_ptr(zone->pageset, cpu));
+}
+
+/*
+ * percpu_pagelist_fraction - changes the pcp->high for each zone on each
+ * cpu. It is the fraction of total pages in each zone that a hot per cpu
+ * pagelist can have before it gets flushed back to buddy allocator.
+ */
+int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *length, loff_t *ppos)
+{
+ struct zone *zone;
+ int old_percpu_pagelist_fraction;
+ int ret;
+
+ mutex_lock(&pcp_batch_high_lock);
+ old_percpu_pagelist_fraction = percpu_pagelist_fraction;
+
+ ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
+ if (!write || ret < 0)
+ goto out;
+
+ /* Sanity checking to avoid pcp imbalance */
+ if (percpu_pagelist_fraction &&
+ percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
+ percpu_pagelist_fraction = old_percpu_pagelist_fraction;
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* No change? */
+ if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
+ goto out;
+
+ for_each_populated_zone(zone)
+ __zone_pcp_update(zone);
+out:
+ mutex_unlock(&pcp_batch_high_lock);
+ return ret;
+}
+
+#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
+/*
+ * Returns the number of pages that arch has reserved but
+ * is not known to alloc_large_system_hash().
+ */
+static unsigned long __init arch_reserved_kernel_pages(void)
+{
+ return 0;
+}
+#endif
+
+/*
+ * Adaptive scale is meant to reduce sizes of hash tables on large memory
+ * machines. As memory size is increased the scale is also increased but at
+ * slower pace. Starting from ADAPT_SCALE_BASE (64G), every time memory
+ * quadruples the scale is increased by one, which means the size of hash table
+ * only doubles, instead of quadrupling as well.
+ * Because 32-bit systems cannot have large physical memory, where this scaling
+ * makes sense, it is disabled on such platforms.
+ */
+#if __BITS_PER_LONG > 32
+#define ADAPT_SCALE_BASE (64ul << 30)
+#define ADAPT_SCALE_SHIFT 2
+#define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT)
+#endif
+
+/*
+ * allocate a large system hash table from bootmem
+ * - it is assumed that the hash table must contain an exact power-of-2
+ * quantity of entries
+ * - limit is the number of hash buckets, not the total allocation size
+ */
+void *__init alloc_large_system_hash(const char *tablename,
+ unsigned long bucketsize,
+ unsigned long numentries,
+ int scale,
+ int flags,
+ unsigned int *_hash_shift,
+ unsigned int *_hash_mask,
+ unsigned long low_limit,
+ unsigned long high_limit)
+{
+ unsigned long long max = high_limit;
+ unsigned long log2qty, size;
+ void *table = NULL;
+ gfp_t gfp_flags;
+ bool virt;
+
+ /* allow the kernel cmdline to have a say */
+ if (!numentries) {
+ /* round applicable memory size up to nearest megabyte */
+ numentries = nr_kernel_pages;
+ numentries -= arch_reserved_kernel_pages();
+
+ /* It isn't necessary when PAGE_SIZE >= 1MB */
+ if (PAGE_SHIFT < 20)
+ numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
+
+#if __BITS_PER_LONG > 32
+ if (!high_limit) {
+ unsigned long adapt;
+
+ for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
+ adapt <<= ADAPT_SCALE_SHIFT)
+ scale++;
+ }
+#endif
+
+ /* limit to 1 bucket per 2^scale bytes of low memory */
+ if (scale > PAGE_SHIFT)
+ numentries >>= (scale - PAGE_SHIFT);
+ else
+ numentries <<= (PAGE_SHIFT - scale);
+
+ /* Make sure we've got at least a 0-order allocation.. */
+ if (unlikely(flags & HASH_SMALL)) {
+ /* Makes no sense without HASH_EARLY */
+ WARN_ON(!(flags & HASH_EARLY));
+ if (!(numentries >> *_hash_shift)) {
+ numentries = 1UL << *_hash_shift;
+ BUG_ON(!numentries);
+ }
+ } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
+ numentries = PAGE_SIZE / bucketsize;
+ }
+ numentries = roundup_pow_of_two(numentries);
+
+ /* limit allocation size to 1/16 total memory by default */
+ if (max == 0) {
+ max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
+ do_div(max, bucketsize);
+ }
+ max = min(max, 0x80000000ULL);
+
+ if (numentries < low_limit)
+ numentries = low_limit;
+ if (numentries > max)
+ numentries = max;
+
+ log2qty = ilog2(numentries);
+
+ gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
+ do {
+ virt = false;
+ size = bucketsize << log2qty;
+ if (flags & HASH_EARLY) {
+ if (flags & HASH_ZERO)
+ table = memblock_alloc(size, SMP_CACHE_BYTES);
+ else
+ table = memblock_alloc_raw(size,
+ SMP_CACHE_BYTES);
+ } else if (get_order(size) >= MAX_ORDER || hashdist) {
+ table = __vmalloc(size, gfp_flags);
+ virt = true;
+ } else {
+ /*
+ * If bucketsize is not a power-of-two, we may free
+ * some pages at the end of hash table which
+ * alloc_pages_exact() automatically does
+ */
+ table = alloc_pages_exact(size, gfp_flags);
+ kmemleak_alloc(table, size, 1, gfp_flags);
+ }
+ } while (!table && size > PAGE_SIZE && --log2qty);
+
+ if (!table)
+ panic("Failed to allocate %s hash table\n", tablename);
+
+ pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
+ tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
+ virt ? "vmalloc" : "linear");
+
+ if (_hash_shift)
+ *_hash_shift = log2qty;
+ if (_hash_mask)
+ *_hash_mask = (1 << log2qty) - 1;
+
+ return table;
+}
+
+/*
+ * This function checks whether pageblock includes unmovable pages or not.
+ *
+ * PageLRU check without isolation or lru_lock could race so that
+ * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
+ * check without lock_page also may miss some movable non-lru pages at
+ * race condition. So you can't expect this function should be exact.
+ *
+ * Returns a page without holding a reference. If the caller wants to
+ * dereference that page (e.g., dumping), it has to make sure that it
+ * cannot get removed (e.g., via memory unplug) concurrently.
+ *
+ */
+struct page *has_unmovable_pages(struct zone *zone, struct page *page,
+ int migratetype, int flags)
+{
+ unsigned long iter = 0;
+ unsigned long pfn = page_to_pfn(page);
+ unsigned long offset = pfn % pageblock_nr_pages;
+
+ if (is_migrate_cma_page(page)) {
+ /*
+ * CMA allocations (alloc_contig_range) really need to mark
+ * isolate CMA pageblocks even when they are not movable in fact
+ * so consider them movable here.
+ */
+ if (is_migrate_cma(migratetype))
+ return NULL;
+
+ return page;
+ }
+
+ for (; iter < pageblock_nr_pages - offset; iter++) {
+ if (!pfn_valid_within(pfn + iter))
+ continue;
+
+ page = pfn_to_page(pfn + iter);
+
+ /*
+ * Both, bootmem allocations and memory holes are marked
+ * PG_reserved and are unmovable. We can even have unmovable
+ * allocations inside ZONE_MOVABLE, for example when
+ * specifying "movablecore".
+ */
+ if (PageReserved(page))
+ return page;
+
+ /*
+ * If the zone is movable and we have ruled out all reserved
+ * pages then it should be reasonably safe to assume the rest
+ * is movable.
+ */
+ if (zone_idx(zone) == ZONE_MOVABLE)
+ continue;
+
+ /*
+ * Hugepages are not in LRU lists, but they're movable.
+ * THPs are on the LRU, but need to be counted as #small pages.
+ * We need not scan over tail pages because we don't
+ * handle each tail page individually in migration.
+ */
+ if (PageHuge(page) || PageTransCompound(page)) {
+ struct page *head = compound_head(page);
+ unsigned int skip_pages;
+
+ if (PageHuge(page)) {
+ if (!hugepage_migration_supported(page_hstate(head)))
+ return page;
+ } else if (!PageLRU(head) && !__PageMovable(head)) {
+ return page;
+ }
+
+ skip_pages = compound_nr(head) - (page - head);
+ iter += skip_pages - 1;
+ continue;
+ }
+
+ /*
+ * We can't use page_count without pin a page
+ * because another CPU can free compound page.
+ * This check already skips compound tails of THP
+ * because their page->_refcount is zero at all time.
+ */
+ if (!page_ref_count(page)) {
+ if (PageBuddy(page))
+ iter += (1 << buddy_order(page)) - 1;
+ continue;
+ }
+
+ /*
+ * The HWPoisoned page may be not in buddy system, and
+ * page_count() is not 0.
+ */
+ if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
+ continue;
+
+ /*
+ * We treat all PageOffline() pages as movable when offlining
+ * to give drivers a chance to decrement their reference count
+ * in MEM_GOING_OFFLINE in order to indicate that these pages
+ * can be offlined as there are no direct references anymore.
+ * For actually unmovable PageOffline() where the driver does
+ * not support this, we will fail later when trying to actually
+ * move these pages that still have a reference count > 0.
+ * (false negatives in this function only)
+ */
+ if ((flags & MEMORY_OFFLINE) && PageOffline(page))
+ continue;
+
+ if (__PageMovable(page) || PageLRU(page))
+ continue;
+
+ /*
+ * If there are RECLAIMABLE pages, we need to check
+ * it. But now, memory offline itself doesn't call
+ * shrink_node_slabs() and it still to be fixed.
+ */
+ return page;
+ }
+ return NULL;
+}
+
+#ifdef CONFIG_CONTIG_ALLOC
+static unsigned long pfn_max_align_down(unsigned long pfn)
+{
+ return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
+ pageblock_nr_pages) - 1);
+}
+
+static unsigned long pfn_max_align_up(unsigned long pfn)
+{
+ return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
+ pageblock_nr_pages));
+}
+
+/* [start, end) must belong to a single zone. */
+static int __alloc_contig_migrate_range(struct compact_control *cc,
+ unsigned long start, unsigned long end)
+{
+ /* This function is based on compact_zone() from compaction.c. */
+ unsigned int nr_reclaimed;
+ unsigned long pfn = start;
+ unsigned int tries = 0;
+ int ret = 0;
+ struct migration_target_control mtc = {
+ .nid = zone_to_nid(cc->zone),
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ };
+
+ migrate_prep();
+
+ while (pfn < end || !list_empty(&cc->migratepages)) {
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+
+ if (list_empty(&cc->migratepages)) {
+ cc->nr_migratepages = 0;
+ pfn = isolate_migratepages_range(cc, pfn, end);
+ if (!pfn) {
+ ret = -EINTR;
+ break;
+ }
+ tries = 0;
+ } else if (++tries == 5) {
+ ret = ret < 0 ? ret : -EBUSY;
+ break;
+ }
+
+ nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
+ &cc->migratepages);
+ cc->nr_migratepages -= nr_reclaimed;
+
+ ret = migrate_pages(&cc->migratepages, alloc_migration_target,
+ NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE);
+ }
+ if (ret < 0) {
+ putback_movable_pages(&cc->migratepages);
+ return ret;
+ }
+ return 0;
+}
+
+/**
+ * alloc_contig_range() -- tries to allocate given range of pages
+ * @start: start PFN to allocate
+ * @end: one-past-the-last PFN to allocate
+ * @migratetype: migratetype of the underlaying pageblocks (either
+ * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
+ * in range must have the same migratetype and it must
+ * be either of the two.
+ * @gfp_mask: GFP mask to use during compaction
+ *
+ * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
+ * aligned. The PFN range must belong to a single zone.
+ *
+ * The first thing this routine does is attempt to MIGRATE_ISOLATE all
+ * pageblocks in the range. Once isolated, the pageblocks should not
+ * be modified by others.
+ *
+ * Return: zero on success or negative error code. On success all
+ * pages which PFN is in [start, end) are allocated for the caller and
+ * need to be freed with free_contig_range().
+ */
+int alloc_contig_range(unsigned long start, unsigned long end,
+ unsigned migratetype, gfp_t gfp_mask)
+{
+ unsigned long outer_start, outer_end;
+ unsigned int order;
+ int ret = 0;
+
+ struct compact_control cc = {
+ .nr_migratepages = 0,
+ .order = -1,
+ .zone = page_zone(pfn_to_page(start)),
+ .mode = MIGRATE_SYNC,
+ .ignore_skip_hint = true,
+ .no_set_skip_hint = true,
+ .gfp_mask = current_gfp_context(gfp_mask),
+ .alloc_contig = true,
+ };
+ INIT_LIST_HEAD(&cc.migratepages);
+
+ /*
+ * What we do here is we mark all pageblocks in range as
+ * MIGRATE_ISOLATE. Because pageblock and max order pages may
+ * have different sizes, and due to the way page allocator
+ * work, we align the range to biggest of the two pages so
+ * that page allocator won't try to merge buddies from
+ * different pageblocks and change MIGRATE_ISOLATE to some
+ * other migration type.
+ *
+ * Once the pageblocks are marked as MIGRATE_ISOLATE, we
+ * migrate the pages from an unaligned range (ie. pages that
+ * we are interested in). This will put all the pages in
+ * range back to page allocator as MIGRATE_ISOLATE.
+ *
+ * When this is done, we take the pages in range from page
+ * allocator removing them from the buddy system. This way
+ * page allocator will never consider using them.
+ *
+ * This lets us mark the pageblocks back as
+ * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
+ * aligned range but not in the unaligned, original range are
+ * put back to page allocator so that buddy can use them.
+ */
+
+ ret = start_isolate_page_range(pfn_max_align_down(start),
+ pfn_max_align_up(end), migratetype, 0);
+ if (ret)
+ return ret;
+
+ /*
+ * In case of -EBUSY, we'd like to know which page causes problem.
+ * So, just fall through. test_pages_isolated() has a tracepoint
+ * which will report the busy page.
+ *
+ * It is possible that busy pages could become available before
+ * the call to test_pages_isolated, and the range will actually be
+ * allocated. So, if we fall through be sure to clear ret so that
+ * -EBUSY is not accidentally used or returned to caller.
+ */
+ ret = __alloc_contig_migrate_range(&cc, start, end);
+ if (ret && ret != -EBUSY)
+ goto done;
+ ret =0;
+
+ /*
+ * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
+ * aligned blocks that are marked as MIGRATE_ISOLATE. What's
+ * more, all pages in [start, end) are free in page allocator.
+ * What we are going to do is to allocate all pages from
+ * [start, end) (that is remove them from page allocator).
+ *
+ * The only problem is that pages at the beginning and at the
+ * end of interesting range may be not aligned with pages that
+ * page allocator holds, ie. they can be part of higher order
+ * pages. Because of this, we reserve the bigger range and
+ * once this is done free the pages we are not interested in.
+ *
+ * We don't have to hold zone->lock here because the pages are
+ * isolated thus they won't get removed from buddy.
+ */
+
+ lru_add_drain_all();
+
+ order = 0;
+ outer_start = start;
+ while (!PageBuddy(pfn_to_page(outer_start))) {
+ if (++order >= MAX_ORDER) {
+ outer_start = start;
+ break;
+ }
+ outer_start &= ~0UL << order;
+ }
+
+ if (outer_start != start) {
+ order = buddy_order(pfn_to_page(outer_start));
+
+ /*
+ * outer_start page could be small order buddy page and
+ * it doesn't include start page. Adjust outer_start
+ * in this case to report failed page properly
+ * on tracepoint in test_pages_isolated()
+ */
+ if (outer_start + (1UL << order) <= start)
+ outer_start = start;
+ }
+
+ /* Make sure the range is really isolated. */
+ if (test_pages_isolated(outer_start, end, 0)) {
+ pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
+ __func__, outer_start, end);
+ ret = -EBUSY;
+ goto done;
+ }
+
+ /* Grab isolated pages from freelists. */
+ outer_end = isolate_freepages_range(&cc, outer_start, end);
+ if (!outer_end) {
+ ret = -EBUSY;
+ goto done;
+ }
+
+ /* Free head and tail (if any) */
+ if (start != outer_start)
+ free_contig_range(outer_start, start - outer_start);
+ if (end != outer_end)
+ free_contig_range(end, outer_end - end);
+
+done:
+ undo_isolate_page_range(pfn_max_align_down(start),
+ pfn_max_align_up(end), migratetype);
+ return ret;
+}
+EXPORT_SYMBOL(alloc_contig_range);
+
+static int __alloc_contig_pages(unsigned long start_pfn,
+ unsigned long nr_pages, gfp_t gfp_mask)
+{
+ unsigned long end_pfn = start_pfn + nr_pages;
+
+ return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
+ gfp_mask);
+}
+
+static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
+ unsigned long nr_pages)
+{
+ unsigned long i, end_pfn = start_pfn + nr_pages;
+ struct page *page;
+
+ for (i = start_pfn; i < end_pfn; i++) {
+ page = pfn_to_online_page(i);
+ if (!page)
+ return false;
+
+ if (page_zone(page) != z)
+ return false;
+
+ if (PageReserved(page))
+ return false;
+
+ if (page_count(page) > 0)
+ return false;
+
+ if (PageHuge(page))
+ return false;
+ }
+ return true;
+}
+
+static bool zone_spans_last_pfn(const struct zone *zone,
+ unsigned long start_pfn, unsigned long nr_pages)
+{
+ unsigned long last_pfn = start_pfn + nr_pages - 1;
+
+ return zone_spans_pfn(zone, last_pfn);
+}
+
+/**
+ * alloc_contig_pages() -- tries to find and allocate contiguous range of pages
+ * @nr_pages: Number of contiguous pages to allocate
+ * @gfp_mask: GFP mask to limit search and used during compaction
+ * @nid: Target node
+ * @nodemask: Mask for other possible nodes
+ *
+ * This routine is a wrapper around alloc_contig_range(). It scans over zones
+ * on an applicable zonelist to find a contiguous pfn range which can then be
+ * tried for allocation with alloc_contig_range(). This routine is intended
+ * for allocation requests which can not be fulfilled with the buddy allocator.
+ *
+ * The allocated memory is always aligned to a page boundary. If nr_pages is a
+ * power of two then the alignment is guaranteed to be to the given nr_pages
+ * (e.g. 1GB request would be aligned to 1GB).
+ *
+ * Allocated pages can be freed with free_contig_range() or by manually calling
+ * __free_page() on each allocated page.
+ *
+ * Return: pointer to contiguous pages on success, or NULL if not successful.
+ */
+struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
+ int nid, nodemask_t *nodemask)
+{
+ unsigned long ret, pfn, flags;
+ struct zonelist *zonelist;
+ struct zone *zone;
+ struct zoneref *z;
+
+ zonelist = node_zonelist(nid, gfp_mask);
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ gfp_zone(gfp_mask), nodemask) {
+ spin_lock_irqsave(&zone->lock, flags);
+
+ pfn = ALIGN(zone->zone_start_pfn, nr_pages);
+ while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
+ if (pfn_range_valid_contig(zone, pfn, nr_pages)) {
+ /*
+ * We release the zone lock here because
+ * alloc_contig_range() will also lock the zone
+ * at some point. If there's an allocation
+ * spinning on this lock, it may win the race
+ * and cause alloc_contig_range() to fail...
+ */
+ spin_unlock_irqrestore(&zone->lock, flags);
+ ret = __alloc_contig_pages(pfn, nr_pages,
+ gfp_mask);
+ if (!ret)
+ return pfn_to_page(pfn);
+ spin_lock_irqsave(&zone->lock, flags);
+ }
+ pfn += nr_pages;
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+ }
+ return NULL;
+}
+#endif /* CONFIG_CONTIG_ALLOC */
+
+void free_contig_range(unsigned long pfn, unsigned int nr_pages)
+{
+ unsigned int count = 0;
+
+ for (; nr_pages--; pfn++) {
+ struct page *page = pfn_to_page(pfn);
+
+ count += page_count(page) != 1;
+ __free_page(page);
+ }
+ WARN(count != 0, "%d pages are still in use!\n", count);
+}
+EXPORT_SYMBOL(free_contig_range);
+
+/*
+ * The zone indicated has a new number of managed_pages; batch sizes and percpu
+ * page high values need to be recalulated.
+ */
+void __meminit zone_pcp_update(struct zone *zone)
+{
+ mutex_lock(&pcp_batch_high_lock);
+ __zone_pcp_update(zone);
+ mutex_unlock(&pcp_batch_high_lock);
+}
+
+void zone_pcp_reset(struct zone *zone)
+{
+ unsigned long flags;
+ int cpu;
+ struct per_cpu_pageset *pset;
+
+ /* avoid races with drain_pages() */
+ local_irq_save(flags);
+ if (zone->pageset != &boot_pageset) {
+ for_each_online_cpu(cpu) {
+ pset = per_cpu_ptr(zone->pageset, cpu);
+ drain_zonestat(zone, pset);
+ }
+ free_percpu(zone->pageset);
+ zone->pageset = &boot_pageset;
+ }
+ local_irq_restore(flags);
+}
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+/*
+ * All pages in the range must be in a single zone, must not contain holes,
+ * must span full sections, and must be isolated before calling this function.
+ */
+void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned long pfn = start_pfn;
+ struct page *page;
+ struct zone *zone;
+ unsigned int order;
+ unsigned long flags;
+
+ offline_mem_sections(pfn, end_pfn);
+ zone = page_zone(pfn_to_page(pfn));
+ spin_lock_irqsave(&zone->lock, flags);
+ while (pfn < end_pfn) {
+ page = pfn_to_page(pfn);
+ /*
+ * The HWPoisoned page may be not in buddy system, and
+ * page_count() is not 0.
+ */
+ if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
+ pfn++;
+ continue;
+ }
+ /*
+ * At this point all remaining PageOffline() pages have a
+ * reference count of 0 and can simply be skipped.
+ */
+ if (PageOffline(page)) {
+ BUG_ON(page_count(page));
+ BUG_ON(PageBuddy(page));
+ pfn++;
+ continue;
+ }
+
+ BUG_ON(page_count(page));
+ BUG_ON(!PageBuddy(page));
+ order = buddy_order(page);
+ del_page_from_free_list(page, zone, order);
+ pfn += (1 << order);
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+}
+#endif
+
+bool is_free_buddy_page(struct page *page)
+{
+ struct zone *zone = page_zone(page);
+ unsigned long pfn = page_to_pfn(page);
+ unsigned long flags;
+ unsigned int order;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ for (order = 0; order < MAX_ORDER; order++) {
+ struct page *page_head = page - (pfn & ((1 << order) - 1));
+
+ if (PageBuddy(page_head) && buddy_order(page_head) >= order)
+ break;
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+
+ return order < MAX_ORDER;
+}
+
+#ifdef CONFIG_MEMORY_FAILURE
+/*
+ * Break down a higher-order page in sub-pages, and keep our target out of
+ * buddy allocator.
+ */
+static void break_down_buddy_pages(struct zone *zone, struct page *page,
+ struct page *target, int low, int high,
+ int migratetype)
+{
+ unsigned long size = 1 << high;
+ struct page *current_buddy, *next_page;
+
+ while (high > low) {
+ high--;
+ size >>= 1;
+
+ if (target >= &page[size]) {
+ next_page = page + size;
+ current_buddy = page;
+ } else {
+ next_page = page;
+ current_buddy = page + size;
+ }
+ page = next_page;
+
+ if (set_page_guard(zone, current_buddy, high, migratetype))
+ continue;
+
+ if (current_buddy != target) {
+ add_to_free_list(current_buddy, zone, high, migratetype);
+ set_buddy_order(current_buddy, high);
+ }
+ }
+}
+
+/*
+ * Take a page that will be marked as poisoned off the buddy allocator.
+ */
+bool take_page_off_buddy(struct page *page)
+{
+ struct zone *zone = page_zone(page);
+ unsigned long pfn = page_to_pfn(page);
+ unsigned long flags;
+ unsigned int order;
+ bool ret = false;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ for (order = 0; order < MAX_ORDER; order++) {
+ struct page *page_head = page - (pfn & ((1 << order) - 1));
+ int page_order = buddy_order(page_head);
+
+ if (PageBuddy(page_head) && page_order >= order) {
+ unsigned long pfn_head = page_to_pfn(page_head);
+ int migratetype = get_pfnblock_migratetype(page_head,
+ pfn_head);
+
+ del_page_from_free_list(page_head, zone, page_order);
+ break_down_buddy_pages(zone, page_head, page, 0,
+ page_order, migratetype);
+ if (!is_migrate_isolate(migratetype))
+ __mod_zone_freepage_state(zone, -1, migratetype);
+ ret = true;
+ break;
+ }
+ if (page_count(page_head) > 0)
+ break;
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return ret;
+}
+#endif
+
+#ifdef CONFIG_ZONE_DMA
+bool has_managed_dma(void)
+{
+ struct pglist_data *pgdat;
+
+ for_each_online_pgdat(pgdat) {
+ struct zone *zone = &pgdat->node_zones[ZONE_DMA];
+
+ if (managed_zone(zone))
+ return true;
+ }
+ return false;
+}
+#endif /* CONFIG_ZONE_DMA */
diff --git a/mm/page_counter.c b/mm/page_counter.c
new file mode 100644
index 000000000..b24a60b28
--- /dev/null
+++ b/mm/page_counter.c
@@ -0,0 +1,262 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Lockless hierarchical page accounting & limiting
+ *
+ * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner
+ */
+
+#include <linux/page_counter.h>
+#include <linux/atomic.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/sched.h>
+#include <linux/bug.h>
+#include <asm/page.h>
+
+static void propagate_protected_usage(struct page_counter *c,
+ unsigned long usage)
+{
+ unsigned long protected, old_protected;
+ unsigned long low, min;
+ long delta;
+
+ if (!c->parent)
+ return;
+
+ min = READ_ONCE(c->min);
+ if (min || atomic_long_read(&c->min_usage)) {
+ protected = min(usage, min);
+ old_protected = atomic_long_xchg(&c->min_usage, protected);
+ delta = protected - old_protected;
+ if (delta)
+ atomic_long_add(delta, &c->parent->children_min_usage);
+ }
+
+ low = READ_ONCE(c->low);
+ if (low || atomic_long_read(&c->low_usage)) {
+ protected = min(usage, low);
+ old_protected = atomic_long_xchg(&c->low_usage, protected);
+ delta = protected - old_protected;
+ if (delta)
+ atomic_long_add(delta, &c->parent->children_low_usage);
+ }
+}
+
+/**
+ * page_counter_cancel - take pages out of the local counter
+ * @counter: counter
+ * @nr_pages: number of pages to cancel
+ */
+void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
+{
+ long new;
+
+ new = atomic_long_sub_return(nr_pages, &counter->usage);
+ propagate_protected_usage(counter, new);
+ /* More uncharges than charges? */
+ WARN_ON_ONCE(new < 0);
+}
+
+/**
+ * page_counter_charge - hierarchically charge pages
+ * @counter: counter
+ * @nr_pages: number of pages to charge
+ *
+ * NOTE: This does not consider any configured counter limits.
+ */
+void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
+{
+ struct page_counter *c;
+
+ for (c = counter; c; c = c->parent) {
+ long new;
+
+ new = atomic_long_add_return(nr_pages, &c->usage);
+ propagate_protected_usage(c, new);
+ /*
+ * This is indeed racy, but we can live with some
+ * inaccuracy in the watermark.
+ */
+ if (new > READ_ONCE(c->watermark))
+ WRITE_ONCE(c->watermark, new);
+ }
+}
+
+/**
+ * page_counter_try_charge - try to hierarchically charge pages
+ * @counter: counter
+ * @nr_pages: number of pages to charge
+ * @fail: points first counter to hit its limit, if any
+ *
+ * Returns %true on success, or %false and @fail if the counter or one
+ * of its ancestors has hit its configured limit.
+ */
+bool page_counter_try_charge(struct page_counter *counter,
+ unsigned long nr_pages,
+ struct page_counter **fail)
+{
+ struct page_counter *c;
+
+ for (c = counter; c; c = c->parent) {
+ long new;
+ /*
+ * Charge speculatively to avoid an expensive CAS. If
+ * a bigger charge fails, it might falsely lock out a
+ * racing smaller charge and send it into reclaim
+ * early, but the error is limited to the difference
+ * between the two sizes, which is less than 2M/4M in
+ * case of a THP locking out a regular page charge.
+ *
+ * The atomic_long_add_return() implies a full memory
+ * barrier between incrementing the count and reading
+ * the limit. When racing with page_counter_set_max(),
+ * we either see the new limit or the setter sees the
+ * counter has changed and retries.
+ */
+ new = atomic_long_add_return(nr_pages, &c->usage);
+ if (new > c->max) {
+ atomic_long_sub(nr_pages, &c->usage);
+ propagate_protected_usage(c, new);
+ /*
+ * This is racy, but we can live with some
+ * inaccuracy in the failcnt which is only used
+ * to report stats.
+ */
+ data_race(c->failcnt++);
+ *fail = c;
+ goto failed;
+ }
+ propagate_protected_usage(c, new);
+ /*
+ * Just like with failcnt, we can live with some
+ * inaccuracy in the watermark.
+ */
+ if (new > READ_ONCE(c->watermark))
+ WRITE_ONCE(c->watermark, new);
+ }
+ return true;
+
+failed:
+ for (c = counter; c != *fail; c = c->parent)
+ page_counter_cancel(c, nr_pages);
+
+ return false;
+}
+
+/**
+ * page_counter_uncharge - hierarchically uncharge pages
+ * @counter: counter
+ * @nr_pages: number of pages to uncharge
+ */
+void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages)
+{
+ struct page_counter *c;
+
+ for (c = counter; c; c = c->parent)
+ page_counter_cancel(c, nr_pages);
+}
+
+/**
+ * page_counter_set_max - set the maximum number of pages allowed
+ * @counter: counter
+ * @nr_pages: limit to set
+ *
+ * Returns 0 on success, -EBUSY if the current number of pages on the
+ * counter already exceeds the specified limit.
+ *
+ * The caller must serialize invocations on the same counter.
+ */
+int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
+{
+ for (;;) {
+ unsigned long old;
+ long usage;
+
+ /*
+ * Update the limit while making sure that it's not
+ * below the concurrently-changing counter value.
+ *
+ * The xchg implies two full memory barriers before
+ * and after, so the read-swap-read is ordered and
+ * ensures coherency with page_counter_try_charge():
+ * that function modifies the count before checking
+ * the limit, so if it sees the old limit, we see the
+ * modified counter and retry.
+ */
+ usage = atomic_long_read(&counter->usage);
+
+ if (usage > nr_pages)
+ return -EBUSY;
+
+ old = xchg(&counter->max, nr_pages);
+
+ if (atomic_long_read(&counter->usage) <= usage)
+ return 0;
+
+ counter->max = old;
+ cond_resched();
+ }
+}
+
+/**
+ * page_counter_set_min - set the amount of protected memory
+ * @counter: counter
+ * @nr_pages: value to set
+ *
+ * The caller must serialize invocations on the same counter.
+ */
+void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages)
+{
+ struct page_counter *c;
+
+ WRITE_ONCE(counter->min, nr_pages);
+
+ for (c = counter; c; c = c->parent)
+ propagate_protected_usage(c, atomic_long_read(&c->usage));
+}
+
+/**
+ * page_counter_set_low - set the amount of protected memory
+ * @counter: counter
+ * @nr_pages: value to set
+ *
+ * The caller must serialize invocations on the same counter.
+ */
+void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
+{
+ struct page_counter *c;
+
+ WRITE_ONCE(counter->low, nr_pages);
+
+ for (c = counter; c; c = c->parent)
+ propagate_protected_usage(c, atomic_long_read(&c->usage));
+}
+
+/**
+ * page_counter_memparse - memparse() for page counter limits
+ * @buf: string to parse
+ * @max: string meaning maximum possible value
+ * @nr_pages: returns the result in number of pages
+ *
+ * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be
+ * limited to %PAGE_COUNTER_MAX.
+ */
+int page_counter_memparse(const char *buf, const char *max,
+ unsigned long *nr_pages)
+{
+ char *end;
+ u64 bytes;
+
+ if (!strcmp(buf, max)) {
+ *nr_pages = PAGE_COUNTER_MAX;
+ return 0;
+ }
+
+ bytes = memparse(buf, &end);
+ if (*end != '\0')
+ return -EINVAL;
+
+ *nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX);
+
+ return 0;
+}
diff --git a/mm/page_ext.c b/mm/page_ext.c
new file mode 100644
index 000000000..a3616f7a0
--- /dev/null
+++ b/mm/page_ext.c
@@ -0,0 +1,412 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/memblock.h>
+#include <linux/page_ext.h>
+#include <linux/memory.h>
+#include <linux/vmalloc.h>
+#include <linux/kmemleak.h>
+#include <linux/page_owner.h>
+#include <linux/page_idle.h>
+
+/*
+ * struct page extension
+ *
+ * This is the feature to manage memory for extended data per page.
+ *
+ * Until now, we must modify struct page itself to store extra data per page.
+ * This requires rebuilding the kernel and it is really time consuming process.
+ * And, sometimes, rebuild is impossible due to third party module dependency.
+ * At last, enlarging struct page could cause un-wanted system behaviour change.
+ *
+ * This feature is intended to overcome above mentioned problems. This feature
+ * allocates memory for extended data per page in certain place rather than
+ * the struct page itself. This memory can be accessed by the accessor
+ * functions provided by this code. During the boot process, it checks whether
+ * allocation of huge chunk of memory is needed or not. If not, it avoids
+ * allocating memory at all. With this advantage, we can include this feature
+ * into the kernel in default and can avoid rebuild and solve related problems.
+ *
+ * To help these things to work well, there are two callbacks for clients. One
+ * is the need callback which is mandatory if user wants to avoid useless
+ * memory allocation at boot-time. The other is optional, init callback, which
+ * is used to do proper initialization after memory is allocated.
+ *
+ * The need callback is used to decide whether extended memory allocation is
+ * needed or not. Sometimes users want to deactivate some features in this
+ * boot and extra memory would be unneccessary. In this case, to avoid
+ * allocating huge chunk of memory, each clients represent their need of
+ * extra memory through the need callback. If one of the need callbacks
+ * returns true, it means that someone needs extra memory so that
+ * page extension core should allocates memory for page extension. If
+ * none of need callbacks return true, memory isn't needed at all in this boot
+ * and page extension core can skip to allocate memory. As result,
+ * none of memory is wasted.
+ *
+ * When need callback returns true, page_ext checks if there is a request for
+ * extra memory through size in struct page_ext_operations. If it is non-zero,
+ * extra space is allocated for each page_ext entry and offset is returned to
+ * user through offset in struct page_ext_operations.
+ *
+ * The init callback is used to do proper initialization after page extension
+ * is completely initialized. In sparse memory system, extra memory is
+ * allocated some time later than memmap is allocated. In other words, lifetime
+ * of memory for page extension isn't same with memmap for struct page.
+ * Therefore, clients can't store extra data until page extension is
+ * initialized, even if pages are allocated and used freely. This could
+ * cause inadequate state of extra data per page, so, to prevent it, client
+ * can utilize this callback to initialize the state of it correctly.
+ */
+
+static struct page_ext_operations *page_ext_ops[] = {
+#ifdef CONFIG_PAGE_OWNER
+ &page_owner_ops,
+#endif
+#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT)
+ &page_idle_ops,
+#endif
+};
+
+unsigned long page_ext_size = sizeof(struct page_ext);
+
+static unsigned long total_usage;
+
+static bool __init invoke_need_callbacks(void)
+{
+ int i;
+ int entries = ARRAY_SIZE(page_ext_ops);
+ bool need = false;
+
+ for (i = 0; i < entries; i++) {
+ if (page_ext_ops[i]->need && page_ext_ops[i]->need()) {
+ page_ext_ops[i]->offset = page_ext_size;
+ page_ext_size += page_ext_ops[i]->size;
+ need = true;
+ }
+ }
+
+ return need;
+}
+
+static void __init invoke_init_callbacks(void)
+{
+ int i;
+ int entries = ARRAY_SIZE(page_ext_ops);
+
+ for (i = 0; i < entries; i++) {
+ if (page_ext_ops[i]->init)
+ page_ext_ops[i]->init();
+ }
+}
+
+static inline struct page_ext *get_entry(void *base, unsigned long index)
+{
+ return base + page_ext_size * index;
+}
+
+#if !defined(CONFIG_SPARSEMEM)
+
+
+void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
+{
+ pgdat->node_page_ext = NULL;
+}
+
+struct page_ext *lookup_page_ext(const struct page *page)
+{
+ unsigned long pfn = page_to_pfn(page);
+ unsigned long index;
+ struct page_ext *base;
+
+ base = NODE_DATA(page_to_nid(page))->node_page_ext;
+ /*
+ * The sanity checks the page allocator does upon freeing a
+ * page can reach here before the page_ext arrays are
+ * allocated when feeding a range of pages to the allocator
+ * for the first time during bootup or memory hotplug.
+ */
+ if (unlikely(!base))
+ return NULL;
+ index = pfn - round_down(node_start_pfn(page_to_nid(page)),
+ MAX_ORDER_NR_PAGES);
+ return get_entry(base, index);
+}
+
+static int __init alloc_node_page_ext(int nid)
+{
+ struct page_ext *base;
+ unsigned long table_size;
+ unsigned long nr_pages;
+
+ nr_pages = NODE_DATA(nid)->node_spanned_pages;
+ if (!nr_pages)
+ return 0;
+
+ /*
+ * Need extra space if node range is not aligned with
+ * MAX_ORDER_NR_PAGES. When page allocator's buddy algorithm
+ * checks buddy's status, range could be out of exact node range.
+ */
+ if (!IS_ALIGNED(node_start_pfn(nid), MAX_ORDER_NR_PAGES) ||
+ !IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES))
+ nr_pages += MAX_ORDER_NR_PAGES;
+
+ table_size = page_ext_size * nr_pages;
+
+ base = memblock_alloc_try_nid(
+ table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
+ MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ if (!base)
+ return -ENOMEM;
+ NODE_DATA(nid)->node_page_ext = base;
+ total_usage += table_size;
+ return 0;
+}
+
+void __init page_ext_init_flatmem(void)
+{
+
+ int nid, fail;
+
+ if (!invoke_need_callbacks())
+ return;
+
+ for_each_online_node(nid) {
+ fail = alloc_node_page_ext(nid);
+ if (fail)
+ goto fail;
+ }
+ pr_info("allocated %ld bytes of page_ext\n", total_usage);
+ invoke_init_callbacks();
+ return;
+
+fail:
+ pr_crit("allocation of page_ext failed.\n");
+ panic("Out of memory");
+}
+
+#else /* CONFIG_FLAT_NODE_MEM_MAP */
+
+struct page_ext *lookup_page_ext(const struct page *page)
+{
+ unsigned long pfn = page_to_pfn(page);
+ struct mem_section *section = __pfn_to_section(pfn);
+ /*
+ * The sanity checks the page allocator does upon freeing a
+ * page can reach here before the page_ext arrays are
+ * allocated when feeding a range of pages to the allocator
+ * for the first time during bootup or memory hotplug.
+ */
+ if (!section->page_ext)
+ return NULL;
+ return get_entry(section->page_ext, pfn);
+}
+
+static void *__meminit alloc_page_ext(size_t size, int nid)
+{
+ gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
+ void *addr = NULL;
+
+ addr = alloc_pages_exact_nid(nid, size, flags);
+ if (addr) {
+ kmemleak_alloc(addr, size, 1, flags);
+ return addr;
+ }
+
+ addr = vzalloc_node(size, nid);
+
+ return addr;
+}
+
+static int __meminit init_section_page_ext(unsigned long pfn, int nid)
+{
+ struct mem_section *section;
+ struct page_ext *base;
+ unsigned long table_size;
+
+ section = __pfn_to_section(pfn);
+
+ if (section->page_ext)
+ return 0;
+
+ table_size = page_ext_size * PAGES_PER_SECTION;
+ base = alloc_page_ext(table_size, nid);
+
+ /*
+ * The value stored in section->page_ext is (base - pfn)
+ * and it does not point to the memory block allocated above,
+ * causing kmemleak false positives.
+ */
+ kmemleak_not_leak(base);
+
+ if (!base) {
+ pr_err("page ext allocation failure\n");
+ return -ENOMEM;
+ }
+
+ /*
+ * The passed "pfn" may not be aligned to SECTION. For the calculation
+ * we need to apply a mask.
+ */
+ pfn &= PAGE_SECTION_MASK;
+ section->page_ext = (void *)base - page_ext_size * pfn;
+ total_usage += table_size;
+ return 0;
+}
+#ifdef CONFIG_MEMORY_HOTPLUG
+static void free_page_ext(void *addr)
+{
+ if (is_vmalloc_addr(addr)) {
+ vfree(addr);
+ } else {
+ struct page *page = virt_to_page(addr);
+ size_t table_size;
+
+ table_size = page_ext_size * PAGES_PER_SECTION;
+
+ BUG_ON(PageReserved(page));
+ kmemleak_free(addr);
+ free_pages_exact(addr, table_size);
+ }
+}
+
+static void __free_page_ext(unsigned long pfn)
+{
+ struct mem_section *ms;
+ struct page_ext *base;
+
+ ms = __pfn_to_section(pfn);
+ if (!ms || !ms->page_ext)
+ return;
+ base = get_entry(ms->page_ext, pfn);
+ free_page_ext(base);
+ ms->page_ext = NULL;
+}
+
+static int __meminit online_page_ext(unsigned long start_pfn,
+ unsigned long nr_pages,
+ int nid)
+{
+ unsigned long start, end, pfn;
+ int fail = 0;
+
+ start = SECTION_ALIGN_DOWN(start_pfn);
+ end = SECTION_ALIGN_UP(start_pfn + nr_pages);
+
+ if (nid == NUMA_NO_NODE) {
+ /*
+ * In this case, "nid" already exists and contains valid memory.
+ * "start_pfn" passed to us is a pfn which is an arg for
+ * online__pages(), and start_pfn should exist.
+ */
+ nid = pfn_to_nid(start_pfn);
+ VM_BUG_ON(!node_state(nid, N_ONLINE));
+ }
+
+ for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION)
+ fail = init_section_page_ext(pfn, nid);
+ if (!fail)
+ return 0;
+
+ /* rollback */
+ for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
+ __free_page_ext(pfn);
+
+ return -ENOMEM;
+}
+
+static int __meminit offline_page_ext(unsigned long start_pfn,
+ unsigned long nr_pages, int nid)
+{
+ unsigned long start, end, pfn;
+
+ start = SECTION_ALIGN_DOWN(start_pfn);
+ end = SECTION_ALIGN_UP(start_pfn + nr_pages);
+
+ for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
+ __free_page_ext(pfn);
+ return 0;
+
+}
+
+static int __meminit page_ext_callback(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ struct memory_notify *mn = arg;
+ int ret = 0;
+
+ switch (action) {
+ case MEM_GOING_ONLINE:
+ ret = online_page_ext(mn->start_pfn,
+ mn->nr_pages, mn->status_change_nid);
+ break;
+ case MEM_OFFLINE:
+ offline_page_ext(mn->start_pfn,
+ mn->nr_pages, mn->status_change_nid);
+ break;
+ case MEM_CANCEL_ONLINE:
+ offline_page_ext(mn->start_pfn,
+ mn->nr_pages, mn->status_change_nid);
+ break;
+ case MEM_GOING_OFFLINE:
+ break;
+ case MEM_ONLINE:
+ case MEM_CANCEL_OFFLINE:
+ break;
+ }
+
+ return notifier_from_errno(ret);
+}
+
+#endif
+
+void __init page_ext_init(void)
+{
+ unsigned long pfn;
+ int nid;
+
+ if (!invoke_need_callbacks())
+ return;
+
+ for_each_node_state(nid, N_MEMORY) {
+ unsigned long start_pfn, end_pfn;
+
+ start_pfn = node_start_pfn(nid);
+ end_pfn = node_end_pfn(nid);
+ /*
+ * start_pfn and end_pfn may not be aligned to SECTION and the
+ * page->flags of out of node pages are not initialized. So we
+ * scan [start_pfn, the biggest section's pfn < end_pfn) here.
+ */
+ for (pfn = start_pfn; pfn < end_pfn;
+ pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
+
+ if (!pfn_valid(pfn))
+ continue;
+ /*
+ * Nodes's pfns can be overlapping.
+ * We know some arch can have a nodes layout such as
+ * -------------pfn-------------->
+ * N0 | N1 | N2 | N0 | N1 | N2|....
+ */
+ if (pfn_to_nid(pfn) != nid)
+ continue;
+ if (init_section_page_ext(pfn, nid))
+ goto oom;
+ cond_resched();
+ }
+ }
+ hotplug_memory_notifier(page_ext_callback, 0);
+ pr_info("allocated %ld bytes of page_ext\n", total_usage);
+ invoke_init_callbacks();
+ return;
+
+oom:
+ panic("Out of memory");
+}
+
+void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
+{
+}
+
+#endif
diff --git a/mm/page_idle.c b/mm/page_idle.c
new file mode 100644
index 000000000..057c61df1
--- /dev/null
+++ b/mm/page_idle.c
@@ -0,0 +1,235 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/init.h>
+#include <linux/memblock.h>
+#include <linux/fs.h>
+#include <linux/sysfs.h>
+#include <linux/kobject.h>
+#include <linux/memory_hotplug.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
+#include <linux/page_ext.h>
+#include <linux/page_idle.h>
+
+#define BITMAP_CHUNK_SIZE sizeof(u64)
+#define BITMAP_CHUNK_BITS (BITMAP_CHUNK_SIZE * BITS_PER_BYTE)
+
+/*
+ * Idle page tracking only considers user memory pages, for other types of
+ * pages the idle flag is always unset and an attempt to set it is silently
+ * ignored.
+ *
+ * We treat a page as a user memory page if it is on an LRU list, because it is
+ * always safe to pass such a page to rmap_walk(), which is essential for idle
+ * page tracking. With such an indicator of user pages we can skip isolated
+ * pages, but since there are not usually many of them, it will hardly affect
+ * the overall result.
+ *
+ * This function tries to get a user memory page by pfn as described above.
+ */
+static struct page *page_idle_get_page(unsigned long pfn)
+{
+ struct page *page = pfn_to_online_page(pfn);
+ pg_data_t *pgdat;
+
+ if (!page || !PageLRU(page) ||
+ !get_page_unless_zero(page))
+ return NULL;
+
+ pgdat = page_pgdat(page);
+ spin_lock_irq(&pgdat->lru_lock);
+ if (unlikely(!PageLRU(page))) {
+ put_page(page);
+ page = NULL;
+ }
+ spin_unlock_irq(&pgdat->lru_lock);
+ return page;
+}
+
+static bool page_idle_clear_pte_refs_one(struct page *page,
+ struct vm_area_struct *vma,
+ unsigned long addr, void *arg)
+{
+ struct page_vma_mapped_walk pvmw = {
+ .page = page,
+ .vma = vma,
+ .address = addr,
+ };
+ bool referenced = false;
+
+ while (page_vma_mapped_walk(&pvmw)) {
+ addr = pvmw.address;
+ if (pvmw.pte) {
+ /*
+ * For PTE-mapped THP, one sub page is referenced,
+ * the whole THP is referenced.
+ */
+ if (ptep_clear_young_notify(vma, addr, pvmw.pte))
+ referenced = true;
+ } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
+ if (pmdp_clear_young_notify(vma, addr, pvmw.pmd))
+ referenced = true;
+ } else {
+ /* unexpected pmd-mapped page? */
+ WARN_ON_ONCE(1);
+ }
+ }
+
+ if (referenced) {
+ clear_page_idle(page);
+ /*
+ * We cleared the referenced bit in a mapping to this page. To
+ * avoid interference with page reclaim, mark it young so that
+ * page_referenced() will return > 0.
+ */
+ set_page_young(page);
+ }
+ return true;
+}
+
+static void page_idle_clear_pte_refs(struct page *page)
+{
+ /*
+ * Since rwc.arg is unused, rwc is effectively immutable, so we
+ * can make it static const to save some cycles and stack.
+ */
+ static const struct rmap_walk_control rwc = {
+ .rmap_one = page_idle_clear_pte_refs_one,
+ .anon_lock = page_lock_anon_vma_read,
+ };
+ bool need_lock;
+
+ if (!page_mapped(page) ||
+ !page_rmapping(page))
+ return;
+
+ need_lock = !PageAnon(page) || PageKsm(page);
+ if (need_lock && !trylock_page(page))
+ return;
+
+ rmap_walk(page, (struct rmap_walk_control *)&rwc);
+
+ if (need_lock)
+ unlock_page(page);
+}
+
+static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
+ struct bin_attribute *attr, char *buf,
+ loff_t pos, size_t count)
+{
+ u64 *out = (u64 *)buf;
+ struct page *page;
+ unsigned long pfn, end_pfn;
+ int bit;
+
+ if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
+ return -EINVAL;
+
+ pfn = pos * BITS_PER_BYTE;
+ if (pfn >= max_pfn)
+ return 0;
+
+ end_pfn = pfn + count * BITS_PER_BYTE;
+ if (end_pfn > max_pfn)
+ end_pfn = max_pfn;
+
+ for (; pfn < end_pfn; pfn++) {
+ bit = pfn % BITMAP_CHUNK_BITS;
+ if (!bit)
+ *out = 0ULL;
+ page = page_idle_get_page(pfn);
+ if (page) {
+ if (page_is_idle(page)) {
+ /*
+ * The page might have been referenced via a
+ * pte, in which case it is not idle. Clear
+ * refs and recheck.
+ */
+ page_idle_clear_pte_refs(page);
+ if (page_is_idle(page))
+ *out |= 1ULL << bit;
+ }
+ put_page(page);
+ }
+ if (bit == BITMAP_CHUNK_BITS - 1)
+ out++;
+ cond_resched();
+ }
+ return (char *)out - buf;
+}
+
+static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
+ struct bin_attribute *attr, char *buf,
+ loff_t pos, size_t count)
+{
+ const u64 *in = (u64 *)buf;
+ struct page *page;
+ unsigned long pfn, end_pfn;
+ int bit;
+
+ if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
+ return -EINVAL;
+
+ pfn = pos * BITS_PER_BYTE;
+ if (pfn >= max_pfn)
+ return -ENXIO;
+
+ end_pfn = pfn + count * BITS_PER_BYTE;
+ if (end_pfn > max_pfn)
+ end_pfn = max_pfn;
+
+ for (; pfn < end_pfn; pfn++) {
+ bit = pfn % BITMAP_CHUNK_BITS;
+ if ((*in >> bit) & 1) {
+ page = page_idle_get_page(pfn);
+ if (page) {
+ page_idle_clear_pte_refs(page);
+ set_page_idle(page);
+ put_page(page);
+ }
+ }
+ if (bit == BITMAP_CHUNK_BITS - 1)
+ in++;
+ cond_resched();
+ }
+ return (char *)in - buf;
+}
+
+static struct bin_attribute page_idle_bitmap_attr =
+ __BIN_ATTR(bitmap, 0600,
+ page_idle_bitmap_read, page_idle_bitmap_write, 0);
+
+static struct bin_attribute *page_idle_bin_attrs[] = {
+ &page_idle_bitmap_attr,
+ NULL,
+};
+
+static const struct attribute_group page_idle_attr_group = {
+ .bin_attrs = page_idle_bin_attrs,
+ .name = "page_idle",
+};
+
+#ifndef CONFIG_64BIT
+static bool need_page_idle(void)
+{
+ return true;
+}
+struct page_ext_operations page_idle_ops = {
+ .need = need_page_idle,
+};
+#endif
+
+static int __init page_idle_init(void)
+{
+ int err;
+
+ err = sysfs_create_group(mm_kobj, &page_idle_attr_group);
+ if (err) {
+ pr_err("page_idle: register sysfs failed\n");
+ return err;
+ }
+ return 0;
+}
+subsys_initcall(page_idle_init);
diff --git a/mm/page_io.c b/mm/page_io.c
new file mode 100644
index 000000000..f0ada4455
--- /dev/null
+++ b/mm/page_io.c
@@ -0,0 +1,417 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/mm/page_io.c
+ *
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ *
+ * Swap reorganised 29.12.95,
+ * Asynchronous swapping added 30.12.95. Stephen Tweedie
+ * Removed race in async swapping. 14.4.1996. Bruno Haible
+ * Add swap of shared pages through the page cache. 20.2.1998. Stephen Tweedie
+ * Always use brw_page, life becomes simpler. 12 May 1998 Eric Biederman
+ */
+
+#include <linux/mm.h>
+#include <linux/kernel_stat.h>
+#include <linux/gfp.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/bio.h>
+#include <linux/swapops.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include <linux/frontswap.h>
+#include <linux/blkdev.h>
+#include <linux/psi.h>
+#include <linux/uio.h>
+#include <linux/sched/task.h>
+
+static struct bio *get_swap_bio(gfp_t gfp_flags,
+ struct page *page, bio_end_io_t end_io)
+{
+ struct bio *bio;
+
+ bio = bio_alloc(gfp_flags, 1);
+ if (bio) {
+ struct block_device *bdev;
+
+ bio->bi_iter.bi_sector = map_swap_page(page, &bdev);
+ bio_set_dev(bio, bdev);
+ bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9;
+ bio->bi_end_io = end_io;
+
+ bio_add_page(bio, page, thp_size(page), 0);
+ }
+ return bio;
+}
+
+void end_swap_bio_write(struct bio *bio)
+{
+ struct page *page = bio_first_page_all(bio);
+
+ if (bio->bi_status) {
+ SetPageError(page);
+ /*
+ * We failed to write the page out to swap-space.
+ * Re-dirty the page in order to avoid it being reclaimed.
+ * Also print a dire warning that things will go BAD (tm)
+ * very quickly.
+ *
+ * Also clear PG_reclaim to avoid rotate_reclaimable_page()
+ */
+ set_page_dirty(page);
+ pr_alert("Write-error on swap-device (%u:%u:%llu)\n",
+ MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
+ (unsigned long long)bio->bi_iter.bi_sector);
+ ClearPageReclaim(page);
+ }
+ end_page_writeback(page);
+ bio_put(bio);
+}
+
+static void end_swap_bio_read(struct bio *bio)
+{
+ struct page *page = bio_first_page_all(bio);
+ struct task_struct *waiter = bio->bi_private;
+
+ if (bio->bi_status) {
+ SetPageError(page);
+ ClearPageUptodate(page);
+ pr_alert("Read-error on swap-device (%u:%u:%llu)\n",
+ MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
+ (unsigned long long)bio->bi_iter.bi_sector);
+ goto out;
+ }
+
+ SetPageUptodate(page);
+out:
+ unlock_page(page);
+ WRITE_ONCE(bio->bi_private, NULL);
+ bio_put(bio);
+ if (waiter) {
+ blk_wake_io_task(waiter);
+ put_task_struct(waiter);
+ }
+}
+
+int generic_swapfile_activate(struct swap_info_struct *sis,
+ struct file *swap_file,
+ sector_t *span)
+{
+ struct address_space *mapping = swap_file->f_mapping;
+ struct inode *inode = mapping->host;
+ unsigned blocks_per_page;
+ unsigned long page_no;
+ unsigned blkbits;
+ sector_t probe_block;
+ sector_t last_block;
+ sector_t lowest_block = -1;
+ sector_t highest_block = 0;
+ int nr_extents = 0;
+ int ret;
+
+ blkbits = inode->i_blkbits;
+ blocks_per_page = PAGE_SIZE >> blkbits;
+
+ /*
+ * Map all the blocks into the extent tree. This code doesn't try
+ * to be very smart.
+ */
+ probe_block = 0;
+ page_no = 0;
+ last_block = i_size_read(inode) >> blkbits;
+ while ((probe_block + blocks_per_page) <= last_block &&
+ page_no < sis->max) {
+ unsigned block_in_page;
+ sector_t first_block;
+
+ cond_resched();
+
+ first_block = probe_block;
+ ret = bmap(inode, &first_block);
+ if (ret || !first_block)
+ goto bad_bmap;
+
+ /*
+ * It must be PAGE_SIZE aligned on-disk
+ */
+ if (first_block & (blocks_per_page - 1)) {
+ probe_block++;
+ goto reprobe;
+ }
+
+ for (block_in_page = 1; block_in_page < blocks_per_page;
+ block_in_page++) {
+ sector_t block;
+
+ block = probe_block + block_in_page;
+ ret = bmap(inode, &block);
+ if (ret || !block)
+ goto bad_bmap;
+
+ if (block != first_block + block_in_page) {
+ /* Discontiguity */
+ probe_block++;
+ goto reprobe;
+ }
+ }
+
+ first_block >>= (PAGE_SHIFT - blkbits);
+ if (page_no) { /* exclude the header page */
+ if (first_block < lowest_block)
+ lowest_block = first_block;
+ if (first_block > highest_block)
+ highest_block = first_block;
+ }
+
+ /*
+ * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
+ */
+ ret = add_swap_extent(sis, page_no, 1, first_block);
+ if (ret < 0)
+ goto out;
+ nr_extents += ret;
+ page_no++;
+ probe_block += blocks_per_page;
+reprobe:
+ continue;
+ }
+ ret = nr_extents;
+ *span = 1 + highest_block - lowest_block;
+ if (page_no == 0)
+ page_no = 1; /* force Empty message */
+ sis->max = page_no;
+ sis->pages = page_no - 1;
+ sis->highest_bit = page_no - 1;
+out:
+ return ret;
+bad_bmap:
+ pr_err("swapon: swapfile has holes\n");
+ ret = -EINVAL;
+ goto out;
+}
+
+/*
+ * We may have stale swap cache pages in memory: notice
+ * them here and get rid of the unnecessary final write.
+ */
+int swap_writepage(struct page *page, struct writeback_control *wbc)
+{
+ int ret = 0;
+
+ if (try_to_free_swap(page)) {
+ unlock_page(page);
+ goto out;
+ }
+ /*
+ * Arch code may have to preserve more data than just the page
+ * contents, e.g. memory tags.
+ */
+ ret = arch_prepare_to_swap(page);
+ if (ret) {
+ set_page_dirty(page);
+ unlock_page(page);
+ goto out;
+ }
+ if (frontswap_store(page) == 0) {
+ set_page_writeback(page);
+ unlock_page(page);
+ end_page_writeback(page);
+ goto out;
+ }
+ ret = __swap_writepage(page, wbc, end_swap_bio_write);
+out:
+ return ret;
+}
+
+static inline void count_swpout_vm_event(struct page *page)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (unlikely(PageTransHuge(page)))
+ count_vm_event(THP_SWPOUT);
+#endif
+ count_vm_events(PSWPOUT, thp_nr_pages(page));
+}
+
+#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
+static void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
+{
+ struct cgroup_subsys_state *css;
+
+ if (!page->mem_cgroup)
+ return;
+
+ rcu_read_lock();
+ css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys);
+ bio_associate_blkg_from_css(bio, css);
+ rcu_read_unlock();
+}
+#else
+#define bio_associate_blkg_from_page(bio, page) do { } while (0)
+#endif /* CONFIG_MEMCG && CONFIG_BLK_CGROUP */
+
+int __swap_writepage(struct page *page, struct writeback_control *wbc,
+ bio_end_io_t end_write_func)
+{
+ struct bio *bio;
+ int ret;
+ struct swap_info_struct *sis = page_swap_info(page);
+
+ VM_BUG_ON_PAGE(!PageSwapCache(page), page);
+ if (data_race(sis->flags & SWP_FS_OPS)) {
+ struct kiocb kiocb;
+ struct file *swap_file = sis->swap_file;
+ struct address_space *mapping = swap_file->f_mapping;
+ struct bio_vec bv = {
+ .bv_page = page,
+ .bv_len = PAGE_SIZE,
+ .bv_offset = 0
+ };
+ struct iov_iter from;
+
+ iov_iter_bvec(&from, WRITE, &bv, 1, PAGE_SIZE);
+ init_sync_kiocb(&kiocb, swap_file);
+ kiocb.ki_pos = page_file_offset(page);
+
+ set_page_writeback(page);
+ unlock_page(page);
+ ret = mapping->a_ops->direct_IO(&kiocb, &from);
+ if (ret == PAGE_SIZE) {
+ count_vm_event(PSWPOUT);
+ ret = 0;
+ } else {
+ /*
+ * In the case of swap-over-nfs, this can be a
+ * temporary failure if the system has limited
+ * memory for allocating transmit buffers.
+ * Mark the page dirty and avoid
+ * rotate_reclaimable_page but rate-limit the
+ * messages but do not flag PageError like
+ * the normal direct-to-bio case as it could
+ * be temporary.
+ */
+ set_page_dirty(page);
+ ClearPageReclaim(page);
+ pr_err_ratelimited("Write error on dio swapfile (%llu)\n",
+ page_file_offset(page));
+ }
+ end_page_writeback(page);
+ return ret;
+ }
+
+ ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc);
+ if (!ret) {
+ count_swpout_vm_event(page);
+ return 0;
+ }
+
+ bio = get_swap_bio(GFP_NOIO, page, end_write_func);
+ if (bio == NULL) {
+ set_page_dirty(page);
+ unlock_page(page);
+ return -ENOMEM;
+ }
+ bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc);
+ bio_associate_blkg_from_page(bio, page);
+ count_swpout_vm_event(page);
+ set_page_writeback(page);
+ unlock_page(page);
+ submit_bio(bio);
+
+ return 0;
+}
+
+int swap_readpage(struct page *page, bool synchronous)
+{
+ struct bio *bio;
+ int ret = 0;
+ struct swap_info_struct *sis = page_swap_info(page);
+ blk_qc_t qc;
+ struct gendisk *disk;
+ unsigned long pflags;
+
+ VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(PageUptodate(page), page);
+
+ /*
+ * Count submission time as memory stall. When the device is congested,
+ * or the submitting cgroup IO-throttled, submission can be a
+ * significant part of overall IO time.
+ */
+ psi_memstall_enter(&pflags);
+
+ if (frontswap_load(page) == 0) {
+ SetPageUptodate(page);
+ unlock_page(page);
+ goto out;
+ }
+
+ if (data_race(sis->flags & SWP_FS_OPS)) {
+ struct file *swap_file = sis->swap_file;
+ struct address_space *mapping = swap_file->f_mapping;
+
+ ret = mapping->a_ops->readpage(swap_file, page);
+ if (!ret)
+ count_vm_event(PSWPIN);
+ goto out;
+ }
+
+ if (sis->flags & SWP_SYNCHRONOUS_IO) {
+ ret = bdev_read_page(sis->bdev, swap_page_sector(page), page);
+ if (!ret) {
+ count_vm_event(PSWPIN);
+ goto out;
+ }
+ }
+
+ ret = 0;
+ bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
+ if (bio == NULL) {
+ unlock_page(page);
+ ret = -ENOMEM;
+ goto out;
+ }
+ disk = bio->bi_disk;
+ /*
+ * Keep this task valid during swap readpage because the oom killer may
+ * attempt to access it in the page fault retry time check.
+ */
+ bio_set_op_attrs(bio, REQ_OP_READ, 0);
+ if (synchronous) {
+ bio->bi_opf |= REQ_HIPRI;
+ get_task_struct(current);
+ bio->bi_private = current;
+ }
+ count_vm_event(PSWPIN);
+ bio_get(bio);
+ qc = submit_bio(bio);
+ while (synchronous) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (!READ_ONCE(bio->bi_private))
+ break;
+
+ if (!blk_poll(disk->queue, qc, true))
+ blk_io_schedule();
+ }
+ __set_current_state(TASK_RUNNING);
+ bio_put(bio);
+
+out:
+ psi_memstall_leave(&pflags);
+ return ret;
+}
+
+int swap_set_page_dirty(struct page *page)
+{
+ struct swap_info_struct *sis = page_swap_info(page);
+
+ if (data_race(sis->flags & SWP_FS_OPS)) {
+ struct address_space *mapping = sis->swap_file->f_mapping;
+
+ VM_BUG_ON_PAGE(!PageSwapCache(page), page);
+ return mapping->a_ops->set_page_dirty(page);
+ } else {
+ return __set_page_dirty_no_writeback(page);
+ }
+}
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
new file mode 100644
index 000000000..abbf42214
--- /dev/null
+++ b/mm/page_isolation.c
@@ -0,0 +1,313 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/mm/page_isolation.c
+ */
+
+#include <linux/mm.h>
+#include <linux/page-isolation.h>
+#include <linux/pageblock-flags.h>
+#include <linux/memory.h>
+#include <linux/hugetlb.h>
+#include <linux/page_owner.h>
+#include <linux/migrate.h>
+#include "internal.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/page_isolation.h>
+
+static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags)
+{
+ struct zone *zone = page_zone(page);
+ struct page *unmovable;
+ unsigned long flags;
+
+ spin_lock_irqsave(&zone->lock, flags);
+
+ /*
+ * We assume the caller intended to SET migrate type to isolate.
+ * If it is already set, then someone else must have raced and
+ * set it before us.
+ */
+ if (is_migrate_isolate_page(page)) {
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return -EBUSY;
+ }
+
+ /*
+ * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
+ * We just check MOVABLE pages.
+ */
+ unmovable = has_unmovable_pages(zone, page, migratetype, isol_flags);
+ if (!unmovable) {
+ unsigned long nr_pages;
+ int mt = get_pageblock_migratetype(page);
+
+ set_pageblock_migratetype(page, MIGRATE_ISOLATE);
+ zone->nr_isolate_pageblock++;
+ nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE,
+ NULL);
+
+ __mod_zone_freepage_state(zone, -nr_pages, mt);
+ spin_unlock_irqrestore(&zone->lock, flags);
+ drain_all_pages(zone);
+ return 0;
+ }
+
+ spin_unlock_irqrestore(&zone->lock, flags);
+ if (isol_flags & REPORT_FAILURE) {
+ /*
+ * printk() with zone->lock held will likely trigger a
+ * lockdep splat, so defer it here.
+ */
+ dump_page(unmovable, "unmovable page");
+ }
+
+ return -EBUSY;
+}
+
+static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
+{
+ struct zone *zone;
+ unsigned long flags, nr_pages;
+ bool isolated_page = false;
+ unsigned int order;
+ unsigned long pfn, buddy_pfn;
+ struct page *buddy;
+
+ zone = page_zone(page);
+ spin_lock_irqsave(&zone->lock, flags);
+ if (!is_migrate_isolate_page(page))
+ goto out;
+
+ /*
+ * Because freepage with more than pageblock_order on isolated
+ * pageblock is restricted to merge due to freepage counting problem,
+ * it is possible that there is free buddy page.
+ * move_freepages_block() doesn't care of merge so we need other
+ * approach in order to merge them. Isolation and free will make
+ * these pages to be merged.
+ */
+ if (PageBuddy(page)) {
+ order = buddy_order(page);
+ if (order >= pageblock_order) {
+ pfn = page_to_pfn(page);
+ buddy_pfn = __find_buddy_pfn(pfn, order);
+ buddy = page + (buddy_pfn - pfn);
+
+ if (pfn_valid_within(buddy_pfn) &&
+ !is_migrate_isolate_page(buddy)) {
+ __isolate_free_page(page, order);
+ isolated_page = true;
+ }
+ }
+ }
+
+ /*
+ * If we isolate freepage with more than pageblock_order, there
+ * should be no freepage in the range, so we could avoid costly
+ * pageblock scanning for freepage moving.
+ *
+ * We didn't actually touch any of the isolated pages, so place them
+ * to the tail of the freelist. This is an optimization for memory
+ * onlining - just onlined memory won't immediately be considered for
+ * allocation.
+ */
+ if (!isolated_page) {
+ nr_pages = move_freepages_block(zone, page, migratetype, NULL);
+ __mod_zone_freepage_state(zone, nr_pages, migratetype);
+ }
+ set_pageblock_migratetype(page, migratetype);
+ if (isolated_page)
+ __putback_isolated_page(page, order, migratetype);
+ zone->nr_isolate_pageblock--;
+out:
+ spin_unlock_irqrestore(&zone->lock, flags);
+}
+
+static inline struct page *
+__first_valid_page(unsigned long pfn, unsigned long nr_pages)
+{
+ int i;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page;
+
+ page = pfn_to_online_page(pfn + i);
+ if (!page)
+ continue;
+ return page;
+ }
+ return NULL;
+}
+
+/**
+ * start_isolate_page_range() - make page-allocation-type of range of pages to
+ * be MIGRATE_ISOLATE.
+ * @start_pfn: The lower PFN of the range to be isolated.
+ * @end_pfn: The upper PFN of the range to be isolated.
+ * start_pfn/end_pfn must be aligned to pageblock_order.
+ * @migratetype: Migrate type to set in error recovery.
+ * @flags: The following flags are allowed (they can be combined in
+ * a bit mask)
+ * MEMORY_OFFLINE - isolate to offline (!allocate) memory
+ * e.g., skip over PageHWPoison() pages
+ * and PageOffline() pages.
+ * REPORT_FAILURE - report details about the failure to
+ * isolate the range
+ *
+ * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in
+ * the range will never be allocated. Any free pages and pages freed in the
+ * future will not be allocated again. If specified range includes migrate types
+ * other than MOVABLE or CMA, this will fail with -EBUSY. For isolating all
+ * pages in the range finally, the caller have to free all pages in the range.
+ * test_page_isolated() can be used for test it.
+ *
+ * There is no high level synchronization mechanism that prevents two threads
+ * from trying to isolate overlapping ranges. If this happens, one thread
+ * will notice pageblocks in the overlapping range already set to isolate.
+ * This happens in set_migratetype_isolate, and set_migratetype_isolate
+ * returns an error. We then clean up by restoring the migration type on
+ * pageblocks we may have modified and return -EBUSY to caller. This
+ * prevents two threads from simultaneously working on overlapping ranges.
+ *
+ * Please note that there is no strong synchronization with the page allocator
+ * either. Pages might be freed while their page blocks are marked ISOLATED.
+ * In some cases pages might still end up on pcp lists and that would allow
+ * for their allocation even when they are in fact isolated already. Depending
+ * on how strong of a guarantee the caller needs drain_all_pages might be needed
+ * (e.g. __offline_pages will need to call it after check for isolated range for
+ * a next retry).
+ *
+ * Return: 0 on success and -EBUSY if any part of range cannot be isolated.
+ */
+int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
+ unsigned migratetype, int flags)
+{
+ unsigned long pfn;
+ unsigned long undo_pfn;
+ struct page *page;
+
+ BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
+ BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));
+
+ for (pfn = start_pfn;
+ pfn < end_pfn;
+ pfn += pageblock_nr_pages) {
+ page = __first_valid_page(pfn, pageblock_nr_pages);
+ if (page) {
+ if (set_migratetype_isolate(page, migratetype, flags)) {
+ undo_pfn = pfn;
+ goto undo;
+ }
+ }
+ }
+ return 0;
+undo:
+ for (pfn = start_pfn;
+ pfn < undo_pfn;
+ pfn += pageblock_nr_pages) {
+ struct page *page = pfn_to_online_page(pfn);
+ if (!page)
+ continue;
+ unset_migratetype_isolate(page, migratetype);
+ }
+
+ return -EBUSY;
+}
+
+/*
+ * Make isolated pages available again.
+ */
+void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
+ unsigned migratetype)
+{
+ unsigned long pfn;
+ struct page *page;
+
+ BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
+ BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));
+
+ for (pfn = start_pfn;
+ pfn < end_pfn;
+ pfn += pageblock_nr_pages) {
+ page = __first_valid_page(pfn, pageblock_nr_pages);
+ if (!page || !is_migrate_isolate_page(page))
+ continue;
+ unset_migratetype_isolate(page, migratetype);
+ }
+}
+/*
+ * Test all pages in the range is free(means isolated) or not.
+ * all pages in [start_pfn...end_pfn) must be in the same zone.
+ * zone->lock must be held before call this.
+ *
+ * Returns the last tested pfn.
+ */
+static unsigned long
+__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
+ int flags)
+{
+ struct page *page;
+
+ while (pfn < end_pfn) {
+ if (!pfn_valid_within(pfn)) {
+ pfn++;
+ continue;
+ }
+ page = pfn_to_page(pfn);
+ if (PageBuddy(page))
+ /*
+ * If the page is on a free list, it has to be on
+ * the correct MIGRATE_ISOLATE freelist. There is no
+ * simple way to verify that as VM_BUG_ON(), though.
+ */
+ pfn += 1 << buddy_order(page);
+ else if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
+ /* A HWPoisoned page cannot be also PageBuddy */
+ pfn++;
+ else if ((flags & MEMORY_OFFLINE) && PageOffline(page) &&
+ !page_count(page))
+ /*
+ * The responsible driver agreed to skip PageOffline()
+ * pages when offlining memory by dropping its
+ * reference in MEM_GOING_OFFLINE.
+ */
+ pfn++;
+ else
+ break;
+ }
+
+ return pfn;
+}
+
+/* Caller should ensure that requested range is in a single zone */
+int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
+ int isol_flags)
+{
+ unsigned long pfn, flags;
+ struct page *page;
+ struct zone *zone;
+
+ /*
+ * Note: pageblock_nr_pages != MAX_ORDER. Then, chunks of free pages
+ * are not aligned to pageblock_nr_pages.
+ * Then we just check migratetype first.
+ */
+ for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
+ page = __first_valid_page(pfn, pageblock_nr_pages);
+ if (page && !is_migrate_isolate_page(page))
+ break;
+ }
+ page = __first_valid_page(start_pfn, end_pfn - start_pfn);
+ if ((pfn < end_pfn) || !page)
+ return -EBUSY;
+ /* Check all pages are free or marked as ISOLATED */
+ zone = page_zone(page);
+ spin_lock_irqsave(&zone->lock, flags);
+ pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn, isol_flags);
+ spin_unlock_irqrestore(&zone->lock, flags);
+
+ trace_test_pages_isolated(start_pfn, end_pfn, pfn);
+
+ return pfn < end_pfn ? -EBUSY : 0;
+}
diff --git a/mm/page_owner.c b/mm/page_owner.c
new file mode 100644
index 000000000..b735a8eaf
--- /dev/null
+++ b/mm/page_owner.c
@@ -0,0 +1,654 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/debugfs.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/memblock.h>
+#include <linux/stacktrace.h>
+#include <linux/page_owner.h>
+#include <linux/jump_label.h>
+#include <linux/migrate.h>
+#include <linux/stackdepot.h>
+#include <linux/seq_file.h>
+
+#include "internal.h"
+
+/*
+ * TODO: teach PAGE_OWNER_STACK_DEPTH (__dump_page_owner and save_stack)
+ * to use off stack temporal storage
+ */
+#define PAGE_OWNER_STACK_DEPTH (16)
+
+struct page_owner {
+ unsigned short order;
+ short last_migrate_reason;
+ gfp_t gfp_mask;
+ depot_stack_handle_t handle;
+ depot_stack_handle_t free_handle;
+};
+
+static bool page_owner_enabled = false;
+DEFINE_STATIC_KEY_FALSE(page_owner_inited);
+
+static depot_stack_handle_t dummy_handle;
+static depot_stack_handle_t failure_handle;
+static depot_stack_handle_t early_handle;
+
+static void init_early_allocated_pages(void);
+
+static int __init early_page_owner_param(char *buf)
+{
+ if (!buf)
+ return -EINVAL;
+
+ if (strcmp(buf, "on") == 0)
+ page_owner_enabled = true;
+
+ return 0;
+}
+early_param("page_owner", early_page_owner_param);
+
+static bool need_page_owner(void)
+{
+ return page_owner_enabled;
+}
+
+static __always_inline depot_stack_handle_t create_dummy_stack(void)
+{
+ unsigned long entries[4];
+ unsigned int nr_entries;
+
+ nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
+ return stack_depot_save(entries, nr_entries, GFP_KERNEL);
+}
+
+static noinline void register_dummy_stack(void)
+{
+ dummy_handle = create_dummy_stack();
+}
+
+static noinline void register_failure_stack(void)
+{
+ failure_handle = create_dummy_stack();
+}
+
+static noinline void register_early_stack(void)
+{
+ early_handle = create_dummy_stack();
+}
+
+static void init_page_owner(void)
+{
+ if (!page_owner_enabled)
+ return;
+
+ register_dummy_stack();
+ register_failure_stack();
+ register_early_stack();
+ static_branch_enable(&page_owner_inited);
+ init_early_allocated_pages();
+}
+
+struct page_ext_operations page_owner_ops = {
+ .size = sizeof(struct page_owner),
+ .need = need_page_owner,
+ .init = init_page_owner,
+};
+
+static inline struct page_owner *get_page_owner(struct page_ext *page_ext)
+{
+ return (void *)page_ext + page_owner_ops.offset;
+}
+
+static inline bool check_recursive_alloc(unsigned long *entries,
+ unsigned int nr_entries,
+ unsigned long ip)
+{
+ unsigned int i;
+
+ for (i = 0; i < nr_entries; i++) {
+ if (entries[i] == ip)
+ return true;
+ }
+ return false;
+}
+
+static noinline depot_stack_handle_t save_stack(gfp_t flags)
+{
+ unsigned long entries[PAGE_OWNER_STACK_DEPTH];
+ depot_stack_handle_t handle;
+ unsigned int nr_entries;
+
+ nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
+
+ /*
+ * We need to check recursion here because our request to
+ * stackdepot could trigger memory allocation to save new
+ * entry. New memory allocation would reach here and call
+ * stack_depot_save_entries() again if we don't catch it. There is
+ * still not enough memory in stackdepot so it would try to
+ * allocate memory again and loop forever.
+ */
+ if (check_recursive_alloc(entries, nr_entries, _RET_IP_))
+ return dummy_handle;
+
+ handle = stack_depot_save(entries, nr_entries, flags);
+ if (!handle)
+ handle = failure_handle;
+
+ return handle;
+}
+
+void __reset_page_owner(struct page *page, unsigned int order)
+{
+ int i;
+ struct page_ext *page_ext;
+ depot_stack_handle_t handle = 0;
+ struct page_owner *page_owner;
+
+ handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
+
+ page_ext = lookup_page_ext(page);
+ if (unlikely(!page_ext))
+ return;
+ for (i = 0; i < (1 << order); i++) {
+ __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
+ page_owner = get_page_owner(page_ext);
+ page_owner->free_handle = handle;
+ page_ext = page_ext_next(page_ext);
+ }
+}
+
+static inline void __set_page_owner_handle(struct page *page,
+ struct page_ext *page_ext, depot_stack_handle_t handle,
+ unsigned int order, gfp_t gfp_mask)
+{
+ struct page_owner *page_owner;
+ int i;
+
+ for (i = 0; i < (1 << order); i++) {
+ page_owner = get_page_owner(page_ext);
+ page_owner->handle = handle;
+ page_owner->order = order;
+ page_owner->gfp_mask = gfp_mask;
+ page_owner->last_migrate_reason = -1;
+ __set_bit(PAGE_EXT_OWNER, &page_ext->flags);
+ __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
+
+ page_ext = page_ext_next(page_ext);
+ }
+}
+
+noinline void __set_page_owner(struct page *page, unsigned int order,
+ gfp_t gfp_mask)
+{
+ struct page_ext *page_ext = lookup_page_ext(page);
+ depot_stack_handle_t handle;
+
+ if (unlikely(!page_ext))
+ return;
+
+ handle = save_stack(gfp_mask);
+ __set_page_owner_handle(page, page_ext, handle, order, gfp_mask);
+}
+
+void __set_page_owner_migrate_reason(struct page *page, int reason)
+{
+ struct page_ext *page_ext = lookup_page_ext(page);
+ struct page_owner *page_owner;
+
+ if (unlikely(!page_ext))
+ return;
+
+ page_owner = get_page_owner(page_ext);
+ page_owner->last_migrate_reason = reason;
+}
+
+void __split_page_owner(struct page *page, unsigned int nr)
+{
+ int i;
+ struct page_ext *page_ext = lookup_page_ext(page);
+ struct page_owner *page_owner;
+
+ if (unlikely(!page_ext))
+ return;
+
+ for (i = 0; i < nr; i++) {
+ page_owner = get_page_owner(page_ext);
+ page_owner->order = 0;
+ page_ext = page_ext_next(page_ext);
+ }
+}
+
+void __copy_page_owner(struct page *oldpage, struct page *newpage)
+{
+ struct page_ext *old_ext = lookup_page_ext(oldpage);
+ struct page_ext *new_ext = lookup_page_ext(newpage);
+ struct page_owner *old_page_owner, *new_page_owner;
+
+ if (unlikely(!old_ext || !new_ext))
+ return;
+
+ old_page_owner = get_page_owner(old_ext);
+ new_page_owner = get_page_owner(new_ext);
+ new_page_owner->order = old_page_owner->order;
+ new_page_owner->gfp_mask = old_page_owner->gfp_mask;
+ new_page_owner->last_migrate_reason =
+ old_page_owner->last_migrate_reason;
+ new_page_owner->handle = old_page_owner->handle;
+
+ /*
+ * We don't clear the bit on the oldpage as it's going to be freed
+ * after migration. Until then, the info can be useful in case of
+ * a bug, and the overal stats will be off a bit only temporarily.
+ * Also, migrate_misplaced_transhuge_page() can still fail the
+ * migration and then we want the oldpage to retain the info. But
+ * in that case we also don't need to explicitly clear the info from
+ * the new page, which will be freed.
+ */
+ __set_bit(PAGE_EXT_OWNER, &new_ext->flags);
+ __set_bit(PAGE_EXT_OWNER_ALLOCATED, &new_ext->flags);
+}
+
+void pagetypeinfo_showmixedcount_print(struct seq_file *m,
+ pg_data_t *pgdat, struct zone *zone)
+{
+ struct page *page;
+ struct page_ext *page_ext;
+ struct page_owner *page_owner;
+ unsigned long pfn = zone->zone_start_pfn, block_end_pfn;
+ unsigned long end_pfn = pfn + zone->spanned_pages;
+ unsigned long count[MIGRATE_TYPES] = { 0, };
+ int pageblock_mt, page_mt;
+ int i;
+
+ /* Scan block by block. First and last block may be incomplete */
+ pfn = zone->zone_start_pfn;
+
+ /*
+ * Walk the zone in pageblock_nr_pages steps. If a page block spans
+ * a zone boundary, it will be double counted between zones. This does
+ * not matter as the mixed block count will still be correct
+ */
+ for (; pfn < end_pfn; ) {
+ page = pfn_to_online_page(pfn);
+ if (!page) {
+ pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
+ continue;
+ }
+
+ block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+ block_end_pfn = min(block_end_pfn, end_pfn);
+
+ pageblock_mt = get_pageblock_migratetype(page);
+
+ for (; pfn < block_end_pfn; pfn++) {
+ if (!pfn_valid_within(pfn))
+ continue;
+
+ /* The pageblock is online, no need to recheck. */
+ page = pfn_to_page(pfn);
+
+ if (page_zone(page) != zone)
+ continue;
+
+ if (PageBuddy(page)) {
+ unsigned long freepage_order;
+
+ freepage_order = buddy_order_unsafe(page);
+ if (freepage_order < MAX_ORDER)
+ pfn += (1UL << freepage_order) - 1;
+ continue;
+ }
+
+ if (PageReserved(page))
+ continue;
+
+ page_ext = lookup_page_ext(page);
+ if (unlikely(!page_ext))
+ continue;
+
+ if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
+ continue;
+
+ page_owner = get_page_owner(page_ext);
+ page_mt = gfp_migratetype(page_owner->gfp_mask);
+ if (pageblock_mt != page_mt) {
+ if (is_migrate_cma(pageblock_mt))
+ count[MIGRATE_MOVABLE]++;
+ else
+ count[pageblock_mt]++;
+
+ pfn = block_end_pfn;
+ break;
+ }
+ pfn += (1UL << page_owner->order) - 1;
+ }
+ }
+
+ /* Print counts */
+ seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
+ for (i = 0; i < MIGRATE_TYPES; i++)
+ seq_printf(m, "%12lu ", count[i]);
+ seq_putc(m, '\n');
+}
+
+static ssize_t
+print_page_owner(char __user *buf, size_t count, unsigned long pfn,
+ struct page *page, struct page_owner *page_owner,
+ depot_stack_handle_t handle)
+{
+ int ret, pageblock_mt, page_mt;
+ unsigned long *entries;
+ unsigned int nr_entries;
+ char *kbuf;
+
+ count = min_t(size_t, count, PAGE_SIZE);
+ kbuf = kmalloc(count, GFP_KERNEL);
+ if (!kbuf)
+ return -ENOMEM;
+
+ ret = snprintf(kbuf, count,
+ "Page allocated via order %u, mask %#x(%pGg)\n",
+ page_owner->order, page_owner->gfp_mask,
+ &page_owner->gfp_mask);
+
+ if (ret >= count)
+ goto err;
+
+ /* Print information relevant to grouping pages by mobility */
+ pageblock_mt = get_pageblock_migratetype(page);
+ page_mt = gfp_migratetype(page_owner->gfp_mask);
+ ret += snprintf(kbuf + ret, count - ret,
+ "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n",
+ pfn,
+ migratetype_names[page_mt],
+ pfn >> pageblock_order,
+ migratetype_names[pageblock_mt],
+ page->flags, &page->flags);
+
+ if (ret >= count)
+ goto err;
+
+ nr_entries = stack_depot_fetch(handle, &entries);
+ ret += stack_trace_snprint(kbuf + ret, count - ret, entries, nr_entries, 0);
+ if (ret >= count)
+ goto err;
+
+ if (page_owner->last_migrate_reason != -1) {
+ ret += snprintf(kbuf + ret, count - ret,
+ "Page has been migrated, last migrate reason: %s\n",
+ migrate_reason_names[page_owner->last_migrate_reason]);
+ if (ret >= count)
+ goto err;
+ }
+
+ ret += snprintf(kbuf + ret, count - ret, "\n");
+ if (ret >= count)
+ goto err;
+
+ if (copy_to_user(buf, kbuf, ret))
+ ret = -EFAULT;
+
+ kfree(kbuf);
+ return ret;
+
+err:
+ kfree(kbuf);
+ return -ENOMEM;
+}
+
+void __dump_page_owner(struct page *page)
+{
+ struct page_ext *page_ext = lookup_page_ext(page);
+ struct page_owner *page_owner;
+ depot_stack_handle_t handle;
+ unsigned long *entries;
+ unsigned int nr_entries;
+ gfp_t gfp_mask;
+ int mt;
+
+ if (unlikely(!page_ext)) {
+ pr_alert("There is not page extension available.\n");
+ return;
+ }
+
+ page_owner = get_page_owner(page_ext);
+ gfp_mask = page_owner->gfp_mask;
+ mt = gfp_migratetype(gfp_mask);
+
+ if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
+ pr_alert("page_owner info is not present (never set?)\n");
+ return;
+ }
+
+ if (test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
+ pr_alert("page_owner tracks the page as allocated\n");
+ else
+ pr_alert("page_owner tracks the page as freed\n");
+
+ pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n",
+ page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask);
+
+ handle = READ_ONCE(page_owner->handle);
+ if (!handle) {
+ pr_alert("page_owner allocation stack trace missing\n");
+ } else {
+ nr_entries = stack_depot_fetch(handle, &entries);
+ stack_trace_print(entries, nr_entries, 0);
+ }
+
+ handle = READ_ONCE(page_owner->free_handle);
+ if (!handle) {
+ pr_alert("page_owner free stack trace missing\n");
+ } else {
+ nr_entries = stack_depot_fetch(handle, &entries);
+ pr_alert("page last free stack trace:\n");
+ stack_trace_print(entries, nr_entries, 0);
+ }
+
+ if (page_owner->last_migrate_reason != -1)
+ pr_alert("page has been migrated, last migrate reason: %s\n",
+ migrate_reason_names[page_owner->last_migrate_reason]);
+}
+
+static ssize_t
+read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+ unsigned long pfn;
+ struct page *page;
+ struct page_ext *page_ext;
+ struct page_owner *page_owner;
+ depot_stack_handle_t handle;
+
+ if (!static_branch_unlikely(&page_owner_inited))
+ return -EINVAL;
+
+ page = NULL;
+ pfn = min_low_pfn + *ppos;
+
+ /* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */
+ while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0)
+ pfn++;
+
+ drain_all_pages(NULL);
+
+ /* Find an allocated page */
+ for (; pfn < max_pfn; pfn++) {
+ /*
+ * If the new page is in a new MAX_ORDER_NR_PAGES area,
+ * validate the area as existing, skip it if not
+ */
+ if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0 && !pfn_valid(pfn)) {
+ pfn += MAX_ORDER_NR_PAGES - 1;
+ continue;
+ }
+
+ /* Check for holes within a MAX_ORDER area */
+ if (!pfn_valid_within(pfn))
+ continue;
+
+ page = pfn_to_page(pfn);
+ if (PageBuddy(page)) {
+ unsigned long freepage_order = buddy_order_unsafe(page);
+
+ if (freepage_order < MAX_ORDER)
+ pfn += (1UL << freepage_order) - 1;
+ continue;
+ }
+
+ page_ext = lookup_page_ext(page);
+ if (unlikely(!page_ext))
+ continue;
+
+ /*
+ * Some pages could be missed by concurrent allocation or free,
+ * because we don't hold the zone lock.
+ */
+ if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
+ continue;
+
+ /*
+ * Although we do have the info about past allocation of free
+ * pages, it's not relevant for current memory usage.
+ */
+ if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
+ continue;
+
+ page_owner = get_page_owner(page_ext);
+
+ /*
+ * Don't print "tail" pages of high-order allocations as that
+ * would inflate the stats.
+ */
+ if (!IS_ALIGNED(pfn, 1 << page_owner->order))
+ continue;
+
+ /*
+ * Access to page_ext->handle isn't synchronous so we should
+ * be careful to access it.
+ */
+ handle = READ_ONCE(page_owner->handle);
+ if (!handle)
+ continue;
+
+ /* Record the next PFN to read in the file offset */
+ *ppos = (pfn - min_low_pfn) + 1;
+
+ return print_page_owner(buf, count, pfn, page,
+ page_owner, handle);
+ }
+
+ return 0;
+}
+
+static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
+{
+ unsigned long pfn = zone->zone_start_pfn;
+ unsigned long end_pfn = zone_end_pfn(zone);
+ unsigned long count = 0;
+
+ /*
+ * Walk the zone in pageblock_nr_pages steps. If a page block spans
+ * a zone boundary, it will be double counted between zones. This does
+ * not matter as the mixed block count will still be correct
+ */
+ for (; pfn < end_pfn; ) {
+ unsigned long block_end_pfn;
+
+ if (!pfn_valid(pfn)) {
+ pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
+ continue;
+ }
+
+ block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+ block_end_pfn = min(block_end_pfn, end_pfn);
+
+ for (; pfn < block_end_pfn; pfn++) {
+ struct page *page;
+ struct page_ext *page_ext;
+
+ if (!pfn_valid_within(pfn))
+ continue;
+
+ page = pfn_to_page(pfn);
+
+ if (page_zone(page) != zone)
+ continue;
+
+ /*
+ * To avoid having to grab zone->lock, be a little
+ * careful when reading buddy page order. The only
+ * danger is that we skip too much and potentially miss
+ * some early allocated pages, which is better than
+ * heavy lock contention.
+ */
+ if (PageBuddy(page)) {
+ unsigned long order = buddy_order_unsafe(page);
+
+ if (order > 0 && order < MAX_ORDER)
+ pfn += (1UL << order) - 1;
+ continue;
+ }
+
+ if (PageReserved(page))
+ continue;
+
+ page_ext = lookup_page_ext(page);
+ if (unlikely(!page_ext))
+ continue;
+
+ /* Maybe overlapping zone */
+ if (test_bit(PAGE_EXT_OWNER, &page_ext->flags))
+ continue;
+
+ /* Found early allocated page */
+ __set_page_owner_handle(page, page_ext, early_handle,
+ 0, 0);
+ count++;
+ }
+ cond_resched();
+ }
+
+ pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n",
+ pgdat->node_id, zone->name, count);
+}
+
+static void init_zones_in_node(pg_data_t *pgdat)
+{
+ struct zone *zone;
+ struct zone *node_zones = pgdat->node_zones;
+
+ for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
+ if (!populated_zone(zone))
+ continue;
+
+ init_pages_in_zone(pgdat, zone);
+ }
+}
+
+static void init_early_allocated_pages(void)
+{
+ pg_data_t *pgdat;
+
+ for_each_online_pgdat(pgdat)
+ init_zones_in_node(pgdat);
+}
+
+static const struct file_operations proc_page_owner_operations = {
+ .read = read_page_owner,
+};
+
+static int __init pageowner_init(void)
+{
+ if (!static_branch_unlikely(&page_owner_inited)) {
+ pr_info("page_owner is disabled\n");
+ return 0;
+ }
+
+ debugfs_create_file("page_owner", 0400, NULL, NULL,
+ &proc_page_owner_operations);
+
+ return 0;
+}
+late_initcall(pageowner_init)
diff --git a/mm/page_poison.c b/mm/page_poison.c
new file mode 100644
index 000000000..ae0482cde
--- /dev/null
+++ b/mm/page_poison.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/page_ext.h>
+#include <linux/poison.h>
+#include <linux/ratelimit.h>
+#include <linux/kasan.h>
+
+static DEFINE_STATIC_KEY_FALSE_RO(want_page_poisoning);
+
+static int __init early_page_poison_param(char *buf)
+{
+ int ret;
+ bool tmp;
+
+ ret = strtobool(buf, &tmp);
+ if (ret)
+ return ret;
+
+ if (tmp)
+ static_branch_enable(&want_page_poisoning);
+ else
+ static_branch_disable(&want_page_poisoning);
+
+ return 0;
+}
+early_param("page_poison", early_page_poison_param);
+
+/**
+ * page_poisoning_enabled - check if page poisoning is enabled
+ *
+ * Return true if page poisoning is enabled, or false if not.
+ */
+bool page_poisoning_enabled(void)
+{
+ /*
+ * Assumes that debug_pagealloc_enabled is set before
+ * memblock_free_all.
+ * Page poisoning is debug page alloc for some arches. If
+ * either of those options are enabled, enable poisoning.
+ */
+ return (static_branch_unlikely(&want_page_poisoning) ||
+ (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
+ debug_pagealloc_enabled()));
+}
+EXPORT_SYMBOL_GPL(page_poisoning_enabled);
+
+static void poison_page(struct page *page)
+{
+ void *addr = kmap_atomic(page);
+
+ /* KASAN still think the page is in-use, so skip it. */
+ kasan_disable_current();
+ memset(addr, PAGE_POISON, PAGE_SIZE);
+ kasan_enable_current();
+ kunmap_atomic(addr);
+}
+
+static void poison_pages(struct page *page, int n)
+{
+ int i;
+
+ for (i = 0; i < n; i++)
+ poison_page(page + i);
+}
+
+static bool single_bit_flip(unsigned char a, unsigned char b)
+{
+ unsigned char error = a ^ b;
+
+ return error && !(error & (error - 1));
+}
+
+static void check_poison_mem(unsigned char *mem, size_t bytes)
+{
+ static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 10);
+ unsigned char *start;
+ unsigned char *end;
+
+ if (IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY))
+ return;
+
+ start = memchr_inv(mem, PAGE_POISON, bytes);
+ if (!start)
+ return;
+
+ for (end = mem + bytes - 1; end > start; end--) {
+ if (*end != PAGE_POISON)
+ break;
+ }
+
+ if (!__ratelimit(&ratelimit))
+ return;
+ else if (start == end && single_bit_flip(*start, PAGE_POISON))
+ pr_err("pagealloc: single bit error\n");
+ else
+ pr_err("pagealloc: memory corruption\n");
+
+ print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start,
+ end - start + 1, 1);
+ dump_stack();
+}
+
+static void unpoison_page(struct page *page)
+{
+ void *addr;
+
+ addr = kmap_atomic(page);
+ /*
+ * Page poisoning when enabled poisons each and every page
+ * that is freed to buddy. Thus no extra check is done to
+ * see if a page was poisoned.
+ */
+ check_poison_mem(addr, PAGE_SIZE);
+ kunmap_atomic(addr);
+}
+
+static void unpoison_pages(struct page *page, int n)
+{
+ int i;
+
+ for (i = 0; i < n; i++)
+ unpoison_page(page + i);
+}
+
+void kernel_poison_pages(struct page *page, int numpages, int enable)
+{
+ if (!page_poisoning_enabled())
+ return;
+
+ if (enable)
+ unpoison_pages(page, numpages);
+ else
+ poison_pages(page, numpages);
+}
+
+#ifndef CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC
+void __kernel_map_pages(struct page *page, int numpages, int enable)
+{
+ /* This function does nothing, all work is done via poison pages */
+}
+#endif
diff --git a/mm/page_reporting.c b/mm/page_reporting.c
new file mode 100644
index 000000000..cd8e13d41
--- /dev/null
+++ b/mm/page_reporting.c
@@ -0,0 +1,364 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/page_reporting.h>
+#include <linux/gfp.h>
+#include <linux/export.h>
+#include <linux/delay.h>
+#include <linux/scatterlist.h>
+
+#include "page_reporting.h"
+#include "internal.h"
+
+#define PAGE_REPORTING_DELAY (2 * HZ)
+static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;
+
+enum {
+ PAGE_REPORTING_IDLE = 0,
+ PAGE_REPORTING_REQUESTED,
+ PAGE_REPORTING_ACTIVE
+};
+
+/* request page reporting */
+static void
+__page_reporting_request(struct page_reporting_dev_info *prdev)
+{
+ unsigned int state;
+
+ /* Check to see if we are in desired state */
+ state = atomic_read(&prdev->state);
+ if (state == PAGE_REPORTING_REQUESTED)
+ return;
+
+ /*
+ * If reporting is already active there is nothing we need to do.
+ * Test against 0 as that represents PAGE_REPORTING_IDLE.
+ */
+ state = atomic_xchg(&prdev->state, PAGE_REPORTING_REQUESTED);
+ if (state != PAGE_REPORTING_IDLE)
+ return;
+
+ /*
+ * Delay the start of work to allow a sizable queue to build. For
+ * now we are limiting this to running no more than once every
+ * couple of seconds.
+ */
+ schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
+}
+
+/* notify prdev of free page reporting request */
+void __page_reporting_notify(void)
+{
+ struct page_reporting_dev_info *prdev;
+
+ /*
+ * We use RCU to protect the pr_dev_info pointer. In almost all
+ * cases this should be present, however in the unlikely case of
+ * a shutdown this will be NULL and we should exit.
+ */
+ rcu_read_lock();
+ prdev = rcu_dereference(pr_dev_info);
+ if (likely(prdev))
+ __page_reporting_request(prdev);
+
+ rcu_read_unlock();
+}
+
+static void
+page_reporting_drain(struct page_reporting_dev_info *prdev,
+ struct scatterlist *sgl, unsigned int nents, bool reported)
+{
+ struct scatterlist *sg = sgl;
+
+ /*
+ * Drain the now reported pages back into their respective
+ * free lists/areas. We assume at least one page is populated.
+ */
+ do {
+ struct page *page = sg_page(sg);
+ int mt = get_pageblock_migratetype(page);
+ unsigned int order = get_order(sg->length);
+
+ __putback_isolated_page(page, order, mt);
+
+ /* If the pages were not reported due to error skip flagging */
+ if (!reported)
+ continue;
+
+ /*
+ * If page was not comingled with another page we can
+ * consider the result to be "reported" since the page
+ * hasn't been modified, otherwise we will need to
+ * report on the new larger page when we make our way
+ * up to that higher order.
+ */
+ if (PageBuddy(page) && buddy_order(page) == order)
+ __SetPageReported(page);
+ } while ((sg = sg_next(sg)));
+
+ /* reinitialize scatterlist now that it is empty */
+ sg_init_table(sgl, nents);
+}
+
+/*
+ * The page reporting cycle consists of 4 stages, fill, report, drain, and
+ * idle. We will cycle through the first 3 stages until we cannot obtain a
+ * full scatterlist of pages, in that case we will switch to idle.
+ */
+static int
+page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone,
+ unsigned int order, unsigned int mt,
+ struct scatterlist *sgl, unsigned int *offset)
+{
+ struct free_area *area = &zone->free_area[order];
+ struct list_head *list = &area->free_list[mt];
+ unsigned int page_len = PAGE_SIZE << order;
+ struct page *page, *next;
+ long budget;
+ int err = 0;
+
+ /*
+ * Perform early check, if free area is empty there is
+ * nothing to process so we can skip this free_list.
+ */
+ if (list_empty(list))
+ return err;
+
+ spin_lock_irq(&zone->lock);
+
+ /*
+ * Limit how many calls we will be making to the page reporting
+ * device for this list. By doing this we avoid processing any
+ * given list for too long.
+ *
+ * The current value used allows us enough calls to process over a
+ * sixteenth of the current list plus one additional call to handle
+ * any pages that may have already been present from the previous
+ * list processed. This should result in us reporting all pages on
+ * an idle system in about 30 seconds.
+ *
+ * The division here should be cheap since PAGE_REPORTING_CAPACITY
+ * should always be a power of 2.
+ */
+ budget = DIV_ROUND_UP(area->nr_free, PAGE_REPORTING_CAPACITY * 16);
+
+ /* loop through free list adding unreported pages to sg list */
+ list_for_each_entry_safe(page, next, list, lru) {
+ /* We are going to skip over the reported pages. */
+ if (PageReported(page))
+ continue;
+
+ /*
+ * If we fully consumed our budget then update our
+ * state to indicate that we are requesting additional
+ * processing and exit this list.
+ */
+ if (budget < 0) {
+ atomic_set(&prdev->state, PAGE_REPORTING_REQUESTED);
+ next = page;
+ break;
+ }
+
+ /* Attempt to pull page from list and place in scatterlist */
+ if (*offset) {
+ if (!__isolate_free_page(page, order)) {
+ next = page;
+ break;
+ }
+
+ /* Add page to scatter list */
+ --(*offset);
+ sg_set_page(&sgl[*offset], page, page_len, 0);
+
+ continue;
+ }
+
+ /*
+ * Make the first non-reported page in the free list
+ * the new head of the free list before we release the
+ * zone lock.
+ */
+ if (!list_is_first(&page->lru, list))
+ list_rotate_to_front(&page->lru, list);
+
+ /* release lock before waiting on report processing */
+ spin_unlock_irq(&zone->lock);
+
+ /* begin processing pages in local list */
+ err = prdev->report(prdev, sgl, PAGE_REPORTING_CAPACITY);
+
+ /* reset offset since the full list was reported */
+ *offset = PAGE_REPORTING_CAPACITY;
+
+ /* update budget to reflect call to report function */
+ budget--;
+
+ /* reacquire zone lock and resume processing */
+ spin_lock_irq(&zone->lock);
+
+ /* flush reported pages from the sg list */
+ page_reporting_drain(prdev, sgl, PAGE_REPORTING_CAPACITY, !err);
+
+ /*
+ * Reset next to first entry, the old next isn't valid
+ * since we dropped the lock to report the pages
+ */
+ next = list_first_entry(list, struct page, lru);
+
+ /* exit on error */
+ if (err)
+ break;
+ }
+
+ /* Rotate any leftover pages to the head of the freelist */
+ if (&next->lru != list && !list_is_first(&next->lru, list))
+ list_rotate_to_front(&next->lru, list);
+
+ spin_unlock_irq(&zone->lock);
+
+ return err;
+}
+
+static int
+page_reporting_process_zone(struct page_reporting_dev_info *prdev,
+ struct scatterlist *sgl, struct zone *zone)
+{
+ unsigned int order, mt, leftover, offset = PAGE_REPORTING_CAPACITY;
+ unsigned long watermark;
+ int err = 0;
+
+ /* Generate minimum watermark to be able to guarantee progress */
+ watermark = low_wmark_pages(zone) +
+ (PAGE_REPORTING_CAPACITY << PAGE_REPORTING_MIN_ORDER);
+
+ /*
+ * Cancel request if insufficient free memory or if we failed
+ * to allocate page reporting statistics for the zone.
+ */
+ if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
+ return err;
+
+ /* Process each free list starting from lowest order/mt */
+ for (order = PAGE_REPORTING_MIN_ORDER; order < MAX_ORDER; order++) {
+ for (mt = 0; mt < MIGRATE_TYPES; mt++) {
+ /* We do not pull pages from the isolate free list */
+ if (is_migrate_isolate(mt))
+ continue;
+
+ err = page_reporting_cycle(prdev, zone, order, mt,
+ sgl, &offset);
+ if (err)
+ return err;
+ }
+ }
+
+ /* report the leftover pages before going idle */
+ leftover = PAGE_REPORTING_CAPACITY - offset;
+ if (leftover) {
+ sgl = &sgl[offset];
+ err = prdev->report(prdev, sgl, leftover);
+
+ /* flush any remaining pages out from the last report */
+ spin_lock_irq(&zone->lock);
+ page_reporting_drain(prdev, sgl, leftover, !err);
+ spin_unlock_irq(&zone->lock);
+ }
+
+ return err;
+}
+
+static void page_reporting_process(struct work_struct *work)
+{
+ struct delayed_work *d_work = to_delayed_work(work);
+ struct page_reporting_dev_info *prdev =
+ container_of(d_work, struct page_reporting_dev_info, work);
+ int err = 0, state = PAGE_REPORTING_ACTIVE;
+ struct scatterlist *sgl;
+ struct zone *zone;
+
+ /*
+ * Change the state to "Active" so that we can track if there is
+ * anyone requests page reporting after we complete our pass. If
+ * the state is not altered by the end of the pass we will switch
+ * to idle and quit scheduling reporting runs.
+ */
+ atomic_set(&prdev->state, state);
+
+ /* allocate scatterlist to store pages being reported on */
+ sgl = kmalloc_array(PAGE_REPORTING_CAPACITY, sizeof(*sgl), GFP_KERNEL);
+ if (!sgl)
+ goto err_out;
+
+ sg_init_table(sgl, PAGE_REPORTING_CAPACITY);
+
+ for_each_zone(zone) {
+ err = page_reporting_process_zone(prdev, sgl, zone);
+ if (err)
+ break;
+ }
+
+ kfree(sgl);
+err_out:
+ /*
+ * If the state has reverted back to requested then there may be
+ * additional pages to be processed. We will defer for 2s to allow
+ * more pages to accumulate.
+ */
+ state = atomic_cmpxchg(&prdev->state, state, PAGE_REPORTING_IDLE);
+ if (state == PAGE_REPORTING_REQUESTED)
+ schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
+}
+
+static DEFINE_MUTEX(page_reporting_mutex);
+DEFINE_STATIC_KEY_FALSE(page_reporting_enabled);
+
+int page_reporting_register(struct page_reporting_dev_info *prdev)
+{
+ int err = 0;
+
+ mutex_lock(&page_reporting_mutex);
+
+ /* nothing to do if already in use */
+ if (rcu_access_pointer(pr_dev_info)) {
+ err = -EBUSY;
+ goto err_out;
+ }
+
+ /* initialize state and work structures */
+ atomic_set(&prdev->state, PAGE_REPORTING_IDLE);
+ INIT_DELAYED_WORK(&prdev->work, &page_reporting_process);
+
+ /* Begin initial flush of zones */
+ __page_reporting_request(prdev);
+
+ /* Assign device to allow notifications */
+ rcu_assign_pointer(pr_dev_info, prdev);
+
+ /* enable page reporting notification */
+ if (!static_key_enabled(&page_reporting_enabled)) {
+ static_branch_enable(&page_reporting_enabled);
+ pr_info("Free page reporting enabled\n");
+ }
+err_out:
+ mutex_unlock(&page_reporting_mutex);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(page_reporting_register);
+
+void page_reporting_unregister(struct page_reporting_dev_info *prdev)
+{
+ mutex_lock(&page_reporting_mutex);
+
+ if (rcu_access_pointer(pr_dev_info) == prdev) {
+ /* Disable page reporting notification */
+ RCU_INIT_POINTER(pr_dev_info, NULL);
+ synchronize_rcu();
+
+ /* Flush any existing work, and lock it out */
+ cancel_delayed_work_sync(&prdev->work);
+ }
+
+ mutex_unlock(&page_reporting_mutex);
+}
+EXPORT_SYMBOL_GPL(page_reporting_unregister);
diff --git a/mm/page_reporting.h b/mm/page_reporting.h
new file mode 100644
index 000000000..2c385dd4d
--- /dev/null
+++ b/mm/page_reporting.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _MM_PAGE_REPORTING_H
+#define _MM_PAGE_REPORTING_H
+
+#include <linux/mmzone.h>
+#include <linux/pageblock-flags.h>
+#include <linux/page-isolation.h>
+#include <linux/jump_label.h>
+#include <linux/slab.h>
+#include <linux/pgtable.h>
+#include <linux/scatterlist.h>
+
+#define PAGE_REPORTING_MIN_ORDER pageblock_order
+
+#ifdef CONFIG_PAGE_REPORTING
+DECLARE_STATIC_KEY_FALSE(page_reporting_enabled);
+void __page_reporting_notify(void);
+
+static inline bool page_reported(struct page *page)
+{
+ return static_branch_unlikely(&page_reporting_enabled) &&
+ PageReported(page);
+}
+
+/**
+ * page_reporting_notify_free - Free page notification to start page processing
+ *
+ * This function is meant to act as a screener for __page_reporting_notify
+ * which will determine if a give zone has crossed over the high-water mark
+ * that will justify us beginning page treatment. If we have crossed that
+ * threshold then it will start the process of pulling some pages and
+ * placing them in the batch list for treatment.
+ */
+static inline void page_reporting_notify_free(unsigned int order)
+{
+ /* Called from hot path in __free_one_page() */
+ if (!static_branch_unlikely(&page_reporting_enabled))
+ return;
+
+ /* Determine if we have crossed reporting threshold */
+ if (order < PAGE_REPORTING_MIN_ORDER)
+ return;
+
+ /* This will add a few cycles, but should be called infrequently */
+ __page_reporting_notify();
+}
+#else /* CONFIG_PAGE_REPORTING */
+#define page_reported(_page) false
+
+static inline void page_reporting_notify_free(unsigned int order)
+{
+}
+#endif /* CONFIG_PAGE_REPORTING */
+#endif /*_MM_PAGE_REPORTING_H */
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
new file mode 100644
index 000000000..610ebbee7
--- /dev/null
+++ b/mm/page_vma_mapped.c
@@ -0,0 +1,318 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/mm.h>
+#include <linux/rmap.h>
+#include <linux/hugetlb.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+
+#include "internal.h"
+
+static inline bool not_found(struct page_vma_mapped_walk *pvmw)
+{
+ page_vma_mapped_walk_done(pvmw);
+ return false;
+}
+
+static bool map_pte(struct page_vma_mapped_walk *pvmw)
+{
+ pvmw->pte = pte_offset_map(pvmw->pmd, pvmw->address);
+ if (!(pvmw->flags & PVMW_SYNC)) {
+ if (pvmw->flags & PVMW_MIGRATION) {
+ if (!is_swap_pte(*pvmw->pte))
+ return false;
+ } else {
+ /*
+ * We get here when we are trying to unmap a private
+ * device page from the process address space. Such
+ * page is not CPU accessible and thus is mapped as
+ * a special swap entry, nonetheless it still does
+ * count as a valid regular mapping for the page (and
+ * is accounted as such in page maps count).
+ *
+ * So handle this special case as if it was a normal
+ * page mapping ie lock CPU page table and returns
+ * true.
+ *
+ * For more details on device private memory see HMM
+ * (include/linux/hmm.h or mm/hmm.c).
+ */
+ if (is_swap_pte(*pvmw->pte)) {
+ swp_entry_t entry;
+
+ /* Handle un-addressable ZONE_DEVICE memory */
+ entry = pte_to_swp_entry(*pvmw->pte);
+ if (!is_device_private_entry(entry))
+ return false;
+ } else if (!pte_present(*pvmw->pte))
+ return false;
+ }
+ }
+ pvmw->ptl = pte_lockptr(pvmw->vma->vm_mm, pvmw->pmd);
+ spin_lock(pvmw->ptl);
+ return true;
+}
+
+static inline bool pfn_is_match(struct page *page, unsigned long pfn)
+{
+ unsigned long page_pfn = page_to_pfn(page);
+
+ /* normal page and hugetlbfs page */
+ if (!PageTransCompound(page) || PageHuge(page))
+ return page_pfn == pfn;
+
+ /* THP can be referenced by any subpage */
+ return pfn >= page_pfn && pfn - page_pfn < thp_nr_pages(page);
+}
+
+/**
+ * check_pte - check if @pvmw->page is mapped at the @pvmw->pte
+ *
+ * page_vma_mapped_walk() found a place where @pvmw->page is *potentially*
+ * mapped. check_pte() has to validate this.
+ *
+ * @pvmw->pte may point to empty PTE, swap PTE or PTE pointing to arbitrary
+ * page.
+ *
+ * If PVMW_MIGRATION flag is set, returns true if @pvmw->pte contains migration
+ * entry that points to @pvmw->page or any subpage in case of THP.
+ *
+ * If PVMW_MIGRATION flag is not set, returns true if @pvmw->pte points to
+ * @pvmw->page or any subpage in case of THP.
+ *
+ * Otherwise, return false.
+ *
+ */
+static bool check_pte(struct page_vma_mapped_walk *pvmw)
+{
+ unsigned long pfn;
+
+ if (pvmw->flags & PVMW_MIGRATION) {
+ swp_entry_t entry;
+ if (!is_swap_pte(*pvmw->pte))
+ return false;
+ entry = pte_to_swp_entry(*pvmw->pte);
+
+ if (!is_migration_entry(entry))
+ return false;
+
+ pfn = migration_entry_to_pfn(entry);
+ } else if (is_swap_pte(*pvmw->pte)) {
+ swp_entry_t entry;
+
+ /* Handle un-addressable ZONE_DEVICE memory */
+ entry = pte_to_swp_entry(*pvmw->pte);
+ if (!is_device_private_entry(entry))
+ return false;
+
+ pfn = device_private_entry_to_pfn(entry);
+ } else {
+ if (!pte_present(*pvmw->pte))
+ return false;
+
+ pfn = pte_pfn(*pvmw->pte);
+ }
+
+ return pfn_is_match(pvmw->page, pfn);
+}
+
+static void step_forward(struct page_vma_mapped_walk *pvmw, unsigned long size)
+{
+ pvmw->address = (pvmw->address + size) & ~(size - 1);
+ if (!pvmw->address)
+ pvmw->address = ULONG_MAX;
+}
+
+/**
+ * page_vma_mapped_walk - check if @pvmw->page is mapped in @pvmw->vma at
+ * @pvmw->address
+ * @pvmw: pointer to struct page_vma_mapped_walk. page, vma, address and flags
+ * must be set. pmd, pte and ptl must be NULL.
+ *
+ * Returns true if the page is mapped in the vma. @pvmw->pmd and @pvmw->pte point
+ * to relevant page table entries. @pvmw->ptl is locked. @pvmw->address is
+ * adjusted if needed (for PTE-mapped THPs).
+ *
+ * If @pvmw->pmd is set but @pvmw->pte is not, you have found PMD-mapped page
+ * (usually THP). For PTE-mapped THP, you should run page_vma_mapped_walk() in
+ * a loop to find all PTEs that map the THP.
+ *
+ * For HugeTLB pages, @pvmw->pte is set to the relevant page table entry
+ * regardless of which page table level the page is mapped at. @pvmw->pmd is
+ * NULL.
+ *
+ * Retruns false if there are no more page table entries for the page in
+ * the vma. @pvmw->ptl is unlocked and @pvmw->pte is unmapped.
+ *
+ * If you need to stop the walk before page_vma_mapped_walk() returned false,
+ * use page_vma_mapped_walk_done(). It will do the housekeeping.
+ */
+bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
+{
+ struct mm_struct *mm = pvmw->vma->vm_mm;
+ struct page *page = pvmw->page;
+ unsigned long end;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t pmde;
+
+ /* The only possible pmd mapping has been handled on last iteration */
+ if (pvmw->pmd && !pvmw->pte)
+ return not_found(pvmw);
+
+ if (unlikely(PageHuge(page))) {
+ /* The only possible mapping was handled on last iteration */
+ if (pvmw->pte)
+ return not_found(pvmw);
+
+ /* when pud is not present, pte will be NULL */
+ pvmw->pte = huge_pte_offset(mm, pvmw->address, page_size(page));
+ if (!pvmw->pte)
+ return false;
+
+ pvmw->ptl = huge_pte_lockptr(page_hstate(page), mm, pvmw->pte);
+ spin_lock(pvmw->ptl);
+ if (!check_pte(pvmw))
+ return not_found(pvmw);
+ return true;
+ }
+
+ /*
+ * Seek to next pte only makes sense for THP.
+ * But more important than that optimization, is to filter out
+ * any PageKsm page: whose page->index misleads vma_address()
+ * and vma_address_end() to disaster.
+ */
+ end = PageTransCompound(page) ?
+ vma_address_end(page, pvmw->vma) :
+ pvmw->address + PAGE_SIZE;
+ if (pvmw->pte)
+ goto next_pte;
+restart:
+ do {
+ pgd = pgd_offset(mm, pvmw->address);
+ if (!pgd_present(*pgd)) {
+ step_forward(pvmw, PGDIR_SIZE);
+ continue;
+ }
+ p4d = p4d_offset(pgd, pvmw->address);
+ if (!p4d_present(*p4d)) {
+ step_forward(pvmw, P4D_SIZE);
+ continue;
+ }
+ pud = pud_offset(p4d, pvmw->address);
+ if (!pud_present(*pud)) {
+ step_forward(pvmw, PUD_SIZE);
+ continue;
+ }
+
+ pvmw->pmd = pmd_offset(pud, pvmw->address);
+ /*
+ * Make sure the pmd value isn't cached in a register by the
+ * compiler and used as a stale value after we've observed a
+ * subsequent update.
+ */
+ pmde = READ_ONCE(*pvmw->pmd);
+
+ if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) {
+ pvmw->ptl = pmd_lock(mm, pvmw->pmd);
+ pmde = *pvmw->pmd;
+ if (likely(pmd_trans_huge(pmde))) {
+ if (pvmw->flags & PVMW_MIGRATION)
+ return not_found(pvmw);
+ if (pmd_page(pmde) != page)
+ return not_found(pvmw);
+ return true;
+ }
+ if (!pmd_present(pmde)) {
+ swp_entry_t entry;
+
+ if (!thp_migration_supported() ||
+ !(pvmw->flags & PVMW_MIGRATION))
+ return not_found(pvmw);
+ entry = pmd_to_swp_entry(pmde);
+ if (!is_migration_entry(entry) ||
+ migration_entry_to_page(entry) != page)
+ return not_found(pvmw);
+ return true;
+ }
+ /* THP pmd was split under us: handle on pte level */
+ spin_unlock(pvmw->ptl);
+ pvmw->ptl = NULL;
+ } else if (!pmd_present(pmde)) {
+ /*
+ * If PVMW_SYNC, take and drop THP pmd lock so that we
+ * cannot return prematurely, while zap_huge_pmd() has
+ * cleared *pmd but not decremented compound_mapcount().
+ */
+ if ((pvmw->flags & PVMW_SYNC) &&
+ PageTransCompound(page)) {
+ spinlock_t *ptl = pmd_lock(mm, pvmw->pmd);
+
+ spin_unlock(ptl);
+ }
+ step_forward(pvmw, PMD_SIZE);
+ continue;
+ }
+ if (!map_pte(pvmw))
+ goto next_pte;
+this_pte:
+ if (check_pte(pvmw))
+ return true;
+next_pte:
+ do {
+ pvmw->address += PAGE_SIZE;
+ if (pvmw->address >= end)
+ return not_found(pvmw);
+ /* Did we cross page table boundary? */
+ if ((pvmw->address & (PMD_SIZE - PAGE_SIZE)) == 0) {
+ if (pvmw->ptl) {
+ spin_unlock(pvmw->ptl);
+ pvmw->ptl = NULL;
+ }
+ pte_unmap(pvmw->pte);
+ pvmw->pte = NULL;
+ goto restart;
+ }
+ pvmw->pte++;
+ if ((pvmw->flags & PVMW_SYNC) && !pvmw->ptl) {
+ pvmw->ptl = pte_lockptr(mm, pvmw->pmd);
+ spin_lock(pvmw->ptl);
+ }
+ } while (pte_none(*pvmw->pte));
+
+ if (!pvmw->ptl) {
+ pvmw->ptl = pte_lockptr(mm, pvmw->pmd);
+ spin_lock(pvmw->ptl);
+ }
+ goto this_pte;
+ } while (pvmw->address < end);
+
+ return false;
+}
+
+/**
+ * page_mapped_in_vma - check whether a page is really mapped in a VMA
+ * @page: the page to test
+ * @vma: the VMA to test
+ *
+ * Returns 1 if the page is mapped into the page tables of the VMA, 0
+ * if the page is not mapped into the page tables of this VMA. Only
+ * valid for normal file or anonymous VMAs.
+ */
+int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
+{
+ struct page_vma_mapped_walk pvmw = {
+ .page = page,
+ .vma = vma,
+ .flags = PVMW_SYNC,
+ };
+
+ pvmw.address = vma_address(page, vma);
+ if (pvmw.address == -EFAULT)
+ return 0;
+ if (!page_vma_mapped_walk(&pvmw))
+ return 0;
+ page_vma_mapped_walk_done(&pvmw);
+ return 1;
+}
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
new file mode 100644
index 000000000..371ec21a1
--- /dev/null
+++ b/mm/pagewalk.c
@@ -0,0 +1,563 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/pagewalk.h>
+#include <linux/highmem.h>
+#include <linux/sched.h>
+#include <linux/hugetlb.h>
+
+/*
+ * We want to know the real level where a entry is located ignoring any
+ * folding of levels which may be happening. For example if p4d is folded then
+ * a missing entry found at level 1 (p4d) is actually at level 0 (pgd).
+ */
+static int real_depth(int depth)
+{
+ if (depth == 3 && PTRS_PER_PMD == 1)
+ depth = 2;
+ if (depth == 2 && PTRS_PER_PUD == 1)
+ depth = 1;
+ if (depth == 1 && PTRS_PER_P4D == 1)
+ depth = 0;
+ return depth;
+}
+
+static int walk_pte_range_inner(pte_t *pte, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ const struct mm_walk_ops *ops = walk->ops;
+ int err = 0;
+
+ for (;;) {
+ err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
+ if (err)
+ break;
+ if (addr >= end - PAGE_SIZE)
+ break;
+ addr += PAGE_SIZE;
+ pte++;
+ }
+ return err;
+}
+
+static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ pte_t *pte;
+ int err = 0;
+ spinlock_t *ptl;
+
+ if (walk->no_vma) {
+ pte = pte_offset_map(pmd, addr);
+ err = walk_pte_range_inner(pte, addr, end, walk);
+ pte_unmap(pte);
+ } else {
+ pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ err = walk_pte_range_inner(pte, addr, end, walk);
+ pte_unmap_unlock(pte, ptl);
+ }
+
+ return err;
+}
+
+static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ pmd_t *pmd;
+ unsigned long next;
+ const struct mm_walk_ops *ops = walk->ops;
+ int err = 0;
+ int depth = real_depth(3);
+
+ pmd = pmd_offset(pud, addr);
+ do {
+again:
+ next = pmd_addr_end(addr, end);
+ if (pmd_none(*pmd)) {
+ if (ops->pte_hole)
+ err = ops->pte_hole(addr, next, depth, walk);
+ if (err)
+ break;
+ continue;
+ }
+
+ walk->action = ACTION_SUBTREE;
+
+ /*
+ * This implies that each ->pmd_entry() handler
+ * needs to know about pmd_trans_huge() pmds
+ */
+ if (ops->pmd_entry)
+ err = ops->pmd_entry(pmd, addr, next, walk);
+ if (err)
+ break;
+
+ if (walk->action == ACTION_AGAIN)
+ goto again;
+
+ /*
+ * Check this here so we only break down trans_huge
+ * pages when we _need_ to
+ */
+ if ((!walk->vma && (pmd_leaf(*pmd) || !pmd_present(*pmd))) ||
+ walk->action == ACTION_CONTINUE ||
+ !(ops->pte_entry))
+ continue;
+
+ if (walk->vma) {
+ split_huge_pmd(walk->vma, pmd, addr);
+ if (pmd_trans_unstable(pmd))
+ goto again;
+ }
+
+ err = walk_pte_range(pmd, addr, next, walk);
+ if (err)
+ break;
+ } while (pmd++, addr = next, addr != end);
+
+ return err;
+}
+
+static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ pud_t *pud;
+ unsigned long next;
+ const struct mm_walk_ops *ops = walk->ops;
+ int err = 0;
+ int depth = real_depth(2);
+
+ pud = pud_offset(p4d, addr);
+ do {
+ again:
+ next = pud_addr_end(addr, end);
+ if (pud_none(*pud)) {
+ if (ops->pte_hole)
+ err = ops->pte_hole(addr, next, depth, walk);
+ if (err)
+ break;
+ continue;
+ }
+
+ walk->action = ACTION_SUBTREE;
+
+ if (ops->pud_entry)
+ err = ops->pud_entry(pud, addr, next, walk);
+ if (err)
+ break;
+
+ if (walk->action == ACTION_AGAIN)
+ goto again;
+
+ if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) ||
+ walk->action == ACTION_CONTINUE ||
+ !(ops->pmd_entry || ops->pte_entry))
+ continue;
+
+ if (walk->vma)
+ split_huge_pud(walk->vma, pud, addr);
+ if (pud_none(*pud))
+ goto again;
+
+ err = walk_pmd_range(pud, addr, next, walk);
+ if (err)
+ break;
+ } while (pud++, addr = next, addr != end);
+
+ return err;
+}
+
+static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ p4d_t *p4d;
+ unsigned long next;
+ const struct mm_walk_ops *ops = walk->ops;
+ int err = 0;
+ int depth = real_depth(1);
+
+ p4d = p4d_offset(pgd, addr);
+ do {
+ next = p4d_addr_end(addr, end);
+ if (p4d_none_or_clear_bad(p4d)) {
+ if (ops->pte_hole)
+ err = ops->pte_hole(addr, next, depth, walk);
+ if (err)
+ break;
+ continue;
+ }
+ if (ops->p4d_entry) {
+ err = ops->p4d_entry(p4d, addr, next, walk);
+ if (err)
+ break;
+ }
+ if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
+ err = walk_pud_range(p4d, addr, next, walk);
+ if (err)
+ break;
+ } while (p4d++, addr = next, addr != end);
+
+ return err;
+}
+
+static int walk_pgd_range(unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ pgd_t *pgd;
+ unsigned long next;
+ const struct mm_walk_ops *ops = walk->ops;
+ int err = 0;
+
+ if (walk->pgd)
+ pgd = walk->pgd + pgd_index(addr);
+ else
+ pgd = pgd_offset(walk->mm, addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (pgd_none_or_clear_bad(pgd)) {
+ if (ops->pte_hole)
+ err = ops->pte_hole(addr, next, 0, walk);
+ if (err)
+ break;
+ continue;
+ }
+ if (ops->pgd_entry) {
+ err = ops->pgd_entry(pgd, addr, next, walk);
+ if (err)
+ break;
+ }
+ if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry ||
+ ops->pte_entry)
+ err = walk_p4d_range(pgd, addr, next, walk);
+ if (err)
+ break;
+ } while (pgd++, addr = next, addr != end);
+
+ return err;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
+ unsigned long end)
+{
+ unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
+ return boundary < end ? boundary : end;
+}
+
+static int walk_hugetlb_range(unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct vm_area_struct *vma = walk->vma;
+ struct hstate *h = hstate_vma(vma);
+ unsigned long next;
+ unsigned long hmask = huge_page_mask(h);
+ unsigned long sz = huge_page_size(h);
+ pte_t *pte;
+ const struct mm_walk_ops *ops = walk->ops;
+ int err = 0;
+
+ do {
+ next = hugetlb_entry_end(h, addr, end);
+ pte = huge_pte_offset(walk->mm, addr & hmask, sz);
+
+ if (pte)
+ err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
+ else if (ops->pte_hole)
+ err = ops->pte_hole(addr, next, -1, walk);
+
+ if (err)
+ break;
+ } while (addr = next, addr != end);
+
+ return err;
+}
+
+#else /* CONFIG_HUGETLB_PAGE */
+static int walk_hugetlb_range(unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ return 0;
+}
+
+#endif /* CONFIG_HUGETLB_PAGE */
+
+/*
+ * Decide whether we really walk over the current vma on [@start, @end)
+ * or skip it via the returned value. Return 0 if we do walk over the
+ * current vma, and return 1 if we skip the vma. Negative values means
+ * error, where we abort the current walk.
+ */
+static int walk_page_test(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct vm_area_struct *vma = walk->vma;
+ const struct mm_walk_ops *ops = walk->ops;
+
+ if (ops->test_walk)
+ return ops->test_walk(start, end, walk);
+
+ /*
+ * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP
+ * range, so we don't walk over it as we do for normal vmas. However,
+ * Some callers are interested in handling hole range and they don't
+ * want to just ignore any single address range. Such users certainly
+ * define their ->pte_hole() callbacks, so let's delegate them to handle
+ * vma(VM_PFNMAP).
+ */
+ if (vma->vm_flags & VM_PFNMAP) {
+ int err = 1;
+ if (ops->pte_hole)
+ err = ops->pte_hole(start, end, -1, walk);
+ return err ? err : 1;
+ }
+ return 0;
+}
+
+static int __walk_page_range(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ int err = 0;
+ struct vm_area_struct *vma = walk->vma;
+ const struct mm_walk_ops *ops = walk->ops;
+
+ if (ops->pre_vma) {
+ err = ops->pre_vma(start, end, walk);
+ if (err)
+ return err;
+ }
+
+ if (is_vm_hugetlb_page(vma)) {
+ if (ops->hugetlb_entry)
+ err = walk_hugetlb_range(start, end, walk);
+ } else
+ err = walk_pgd_range(start, end, walk);
+
+ if (ops->post_vma)
+ ops->post_vma(walk);
+
+ return err;
+}
+
+/**
+ * walk_page_range - walk page table with caller specific callbacks
+ * @mm: mm_struct representing the target process of page table walk
+ * @start: start address of the virtual address range
+ * @end: end address of the virtual address range
+ * @ops: operation to call during the walk
+ * @private: private data for callbacks' usage
+ *
+ * Recursively walk the page table tree of the process represented by @mm
+ * within the virtual address range [@start, @end). During walking, we can do
+ * some caller-specific works for each entry, by setting up pmd_entry(),
+ * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these
+ * callbacks, the associated entries/pages are just ignored.
+ * The return values of these callbacks are commonly defined like below:
+ *
+ * - 0 : succeeded to handle the current entry, and if you don't reach the
+ * end address yet, continue to walk.
+ * - >0 : succeeded to handle the current entry, and return to the caller
+ * with caller specific value.
+ * - <0 : failed to handle the current entry, and return to the caller
+ * with error code.
+ *
+ * Before starting to walk page table, some callers want to check whether
+ * they really want to walk over the current vma, typically by checking
+ * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
+ * purpose.
+ *
+ * If operations need to be staged before and committed after a vma is walked,
+ * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
+ * since it is intended to handle commit-type operations, can't return any
+ * errors.
+ *
+ * struct mm_walk keeps current values of some common data like vma and pmd,
+ * which are useful for the access from callbacks. If you want to pass some
+ * caller-specific data to callbacks, @private should be helpful.
+ *
+ * Locking:
+ * Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock,
+ * because these function traverse vma list and/or access to vma's data.
+ */
+int walk_page_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, const struct mm_walk_ops *ops,
+ void *private)
+{
+ int err = 0;
+ unsigned long next;
+ struct vm_area_struct *vma;
+ struct mm_walk walk = {
+ .ops = ops,
+ .mm = mm,
+ .private = private,
+ };
+
+ if (start >= end)
+ return -EINVAL;
+
+ if (!walk.mm)
+ return -EINVAL;
+
+ mmap_assert_locked(walk.mm);
+
+ vma = find_vma(walk.mm, start);
+ do {
+ if (!vma) { /* after the last vma */
+ walk.vma = NULL;
+ next = end;
+ if (ops->pte_hole)
+ err = ops->pte_hole(start, next, -1, &walk);
+ } else if (start < vma->vm_start) { /* outside vma */
+ walk.vma = NULL;
+ next = min(end, vma->vm_start);
+ if (ops->pte_hole)
+ err = ops->pte_hole(start, next, -1, &walk);
+ } else { /* inside vma */
+ walk.vma = vma;
+ next = min(end, vma->vm_end);
+ vma = vma->vm_next;
+
+ err = walk_page_test(start, next, &walk);
+ if (err > 0) {
+ /*
+ * positive return values are purely for
+ * controlling the pagewalk, so should never
+ * be passed to the callers.
+ */
+ err = 0;
+ continue;
+ }
+ if (err < 0)
+ break;
+ err = __walk_page_range(start, next, &walk);
+ }
+ if (err)
+ break;
+ } while (start = next, start < end);
+ return err;
+}
+
+/*
+ * Similar to walk_page_range() but can walk any page tables even if they are
+ * not backed by VMAs. Because 'unusual' entries may be walked this function
+ * will also not lock the PTEs for the pte_entry() callback. This is useful for
+ * walking the kernel pages tables or page tables for firmware.
+ */
+int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
+ unsigned long end, const struct mm_walk_ops *ops,
+ pgd_t *pgd,
+ void *private)
+{
+ struct mm_walk walk = {
+ .ops = ops,
+ .mm = mm,
+ .pgd = pgd,
+ .private = private,
+ .no_vma = true
+ };
+
+ if (start >= end || !walk.mm)
+ return -EINVAL;
+
+ mmap_assert_write_locked(walk.mm);
+
+ return walk_pgd_range(start, end, &walk);
+}
+
+int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
+ void *private)
+{
+ struct mm_walk walk = {
+ .ops = ops,
+ .mm = vma->vm_mm,
+ .vma = vma,
+ .private = private,
+ };
+ int err;
+
+ if (!walk.mm)
+ return -EINVAL;
+
+ mmap_assert_locked(walk.mm);
+
+ err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
+ if (err > 0)
+ return 0;
+ if (err < 0)
+ return err;
+ return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
+}
+
+/**
+ * walk_page_mapping - walk all memory areas mapped into a struct address_space.
+ * @mapping: Pointer to the struct address_space
+ * @first_index: First page offset in the address_space
+ * @nr: Number of incremental page offsets to cover
+ * @ops: operation to call during the walk
+ * @private: private data for callbacks' usage
+ *
+ * This function walks all memory areas mapped into a struct address_space.
+ * The walk is limited to only the given page-size index range, but if
+ * the index boundaries cross a huge page-table entry, that entry will be
+ * included.
+ *
+ * Also see walk_page_range() for additional information.
+ *
+ * Locking:
+ * This function can't require that the struct mm_struct::mmap_lock is held,
+ * since @mapping may be mapped by multiple processes. Instead
+ * @mapping->i_mmap_rwsem must be held. This might have implications in the
+ * callbacks, and it's up tho the caller to ensure that the
+ * struct mm_struct::mmap_lock is not needed.
+ *
+ * Also this means that a caller can't rely on the struct
+ * vm_area_struct::vm_flags to be constant across a call,
+ * except for immutable flags. Callers requiring this shouldn't use
+ * this function.
+ *
+ * Return: 0 on success, negative error code on failure, positive number on
+ * caller defined premature termination.
+ */
+int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
+ pgoff_t nr, const struct mm_walk_ops *ops,
+ void *private)
+{
+ struct mm_walk walk = {
+ .ops = ops,
+ .private = private,
+ };
+ struct vm_area_struct *vma;
+ pgoff_t vba, vea, cba, cea;
+ unsigned long start_addr, end_addr;
+ int err = 0;
+
+ lockdep_assert_held(&mapping->i_mmap_rwsem);
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
+ first_index + nr - 1) {
+ /* Clip to the vma */
+ vba = vma->vm_pgoff;
+ vea = vba + vma_pages(vma);
+ cba = first_index;
+ cba = max(cba, vba);
+ cea = first_index + nr;
+ cea = min(cea, vea);
+
+ start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
+ end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
+ if (start_addr >= end_addr)
+ continue;
+
+ walk.vma = vma;
+ walk.mm = vma->vm_mm;
+
+ err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
+ if (err > 0) {
+ err = 0;
+ break;
+ } else if (err < 0)
+ break;
+
+ err = __walk_page_range(start_addr, end_addr, &walk);
+ if (err)
+ break;
+ }
+
+ return err;
+}
diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
new file mode 100644
index 000000000..095d7eaa0
--- /dev/null
+++ b/mm/percpu-internal.h
@@ -0,0 +1,285 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _MM_PERCPU_INTERNAL_H
+#define _MM_PERCPU_INTERNAL_H
+
+#include <linux/types.h>
+#include <linux/percpu.h>
+
+/*
+ * There are two chunk types: root and memcg-aware.
+ * Chunks of each type have separate slots list.
+ *
+ * Memcg-aware chunks have an attached vector of obj_cgroup pointers, which is
+ * used to store memcg membership data of a percpu object. Obj_cgroups are
+ * ref-counted pointers to a memory cgroup with an ability to switch dynamically
+ * to the parent memory cgroup. This allows to reclaim a deleted memory cgroup
+ * without reclaiming of all outstanding objects, which hold a reference at it.
+ */
+enum pcpu_chunk_type {
+ PCPU_CHUNK_ROOT,
+#ifdef CONFIG_MEMCG_KMEM
+ PCPU_CHUNK_MEMCG,
+#endif
+ PCPU_NR_CHUNK_TYPES,
+ PCPU_FAIL_ALLOC = PCPU_NR_CHUNK_TYPES
+};
+
+/*
+ * pcpu_block_md is the metadata block struct.
+ * Each chunk's bitmap is split into a number of full blocks.
+ * All units are in terms of bits.
+ *
+ * The scan hint is the largest known contiguous area before the contig hint.
+ * It is not necessarily the actual largest contig hint though. There is an
+ * invariant that the scan_hint_start > contig_hint_start iff
+ * scan_hint == contig_hint. This is necessary because when scanning forward,
+ * we don't know if a new contig hint would be better than the current one.
+ */
+struct pcpu_block_md {
+ int scan_hint; /* scan hint for block */
+ int scan_hint_start; /* block relative starting
+ position of the scan hint */
+ int contig_hint; /* contig hint for block */
+ int contig_hint_start; /* block relative starting
+ position of the contig hint */
+ int left_free; /* size of free space along
+ the left side of the block */
+ int right_free; /* size of free space along
+ the right side of the block */
+ int first_free; /* block position of first free */
+ int nr_bits; /* total bits responsible for */
+};
+
+struct pcpu_chunk {
+#ifdef CONFIG_PERCPU_STATS
+ int nr_alloc; /* # of allocations */
+ size_t max_alloc_size; /* largest allocation size */
+#endif
+
+ struct list_head list; /* linked to pcpu_slot lists */
+ int free_bytes; /* free bytes in the chunk */
+ struct pcpu_block_md chunk_md;
+ void *base_addr; /* base address of this chunk */
+
+ unsigned long *alloc_map; /* allocation map */
+ unsigned long *bound_map; /* boundary map */
+ struct pcpu_block_md *md_blocks; /* metadata blocks */
+
+ void *data; /* chunk data */
+ bool immutable; /* no [de]population allowed */
+ int start_offset; /* the overlap with the previous
+ region to have a page aligned
+ base_addr */
+ int end_offset; /* additional area required to
+ have the region end page
+ aligned */
+#ifdef CONFIG_MEMCG_KMEM
+ struct obj_cgroup **obj_cgroups; /* vector of object cgroups */
+#endif
+
+ int nr_pages; /* # of pages served by this chunk */
+ int nr_populated; /* # of populated pages */
+ int nr_empty_pop_pages; /* # of empty populated pages */
+ unsigned long populated[]; /* populated bitmap */
+};
+
+extern spinlock_t pcpu_lock;
+
+extern struct list_head *pcpu_chunk_lists;
+extern int pcpu_nr_slots;
+extern int pcpu_nr_empty_pop_pages[];
+
+extern struct pcpu_chunk *pcpu_first_chunk;
+extern struct pcpu_chunk *pcpu_reserved_chunk;
+
+/**
+ * pcpu_chunk_nr_blocks - converts nr_pages to # of md_blocks
+ * @chunk: chunk of interest
+ *
+ * This conversion is from the number of physical pages that the chunk
+ * serves to the number of bitmap blocks used.
+ */
+static inline int pcpu_chunk_nr_blocks(struct pcpu_chunk *chunk)
+{
+ return chunk->nr_pages * PAGE_SIZE / PCPU_BITMAP_BLOCK_SIZE;
+}
+
+/**
+ * pcpu_nr_pages_to_map_bits - converts the pages to size of bitmap
+ * @pages: number of physical pages
+ *
+ * This conversion is from physical pages to the number of bits
+ * required in the bitmap.
+ */
+static inline int pcpu_nr_pages_to_map_bits(int pages)
+{
+ return pages * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
+}
+
+/**
+ * pcpu_chunk_map_bits - helper to convert nr_pages to size of bitmap
+ * @chunk: chunk of interest
+ *
+ * This conversion is from the number of physical pages that the chunk
+ * serves to the number of bits in the bitmap.
+ */
+static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk)
+{
+ return pcpu_nr_pages_to_map_bits(chunk->nr_pages);
+}
+
+#ifdef CONFIG_MEMCG_KMEM
+static inline enum pcpu_chunk_type pcpu_chunk_type(struct pcpu_chunk *chunk)
+{
+ if (chunk->obj_cgroups)
+ return PCPU_CHUNK_MEMCG;
+ return PCPU_CHUNK_ROOT;
+}
+
+static inline bool pcpu_is_memcg_chunk(enum pcpu_chunk_type chunk_type)
+{
+ return chunk_type == PCPU_CHUNK_MEMCG;
+}
+
+#else
+static inline enum pcpu_chunk_type pcpu_chunk_type(struct pcpu_chunk *chunk)
+{
+ return PCPU_CHUNK_ROOT;
+}
+
+static inline bool pcpu_is_memcg_chunk(enum pcpu_chunk_type chunk_type)
+{
+ return false;
+}
+#endif
+
+static inline struct list_head *pcpu_chunk_list(enum pcpu_chunk_type chunk_type)
+{
+ return &pcpu_chunk_lists[pcpu_nr_slots *
+ pcpu_is_memcg_chunk(chunk_type)];
+}
+
+#ifdef CONFIG_PERCPU_STATS
+
+#include <linux/spinlock.h>
+
+struct percpu_stats {
+ u64 nr_alloc; /* lifetime # of allocations */
+ u64 nr_dealloc; /* lifetime # of deallocations */
+ u64 nr_cur_alloc; /* current # of allocations */
+ u64 nr_max_alloc; /* max # of live allocations */
+ u32 nr_chunks; /* current # of live chunks */
+ u32 nr_max_chunks; /* max # of live chunks */
+ size_t min_alloc_size; /* min allocaiton size */
+ size_t max_alloc_size; /* max allocation size */
+};
+
+extern struct percpu_stats pcpu_stats;
+extern struct pcpu_alloc_info pcpu_stats_ai;
+
+/*
+ * For debug purposes. We don't care about the flexible array.
+ */
+static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai)
+{
+ memcpy(&pcpu_stats_ai, ai, sizeof(struct pcpu_alloc_info));
+
+ /* initialize min_alloc_size to unit_size */
+ pcpu_stats.min_alloc_size = pcpu_stats_ai.unit_size;
+}
+
+/*
+ * pcpu_stats_area_alloc - increment area allocation stats
+ * @chunk: the location of the area being allocated
+ * @size: size of area to allocate in bytes
+ *
+ * CONTEXT:
+ * pcpu_lock.
+ */
+static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size)
+{
+ lockdep_assert_held(&pcpu_lock);
+
+ pcpu_stats.nr_alloc++;
+ pcpu_stats.nr_cur_alloc++;
+ pcpu_stats.nr_max_alloc =
+ max(pcpu_stats.nr_max_alloc, pcpu_stats.nr_cur_alloc);
+ pcpu_stats.min_alloc_size =
+ min(pcpu_stats.min_alloc_size, size);
+ pcpu_stats.max_alloc_size =
+ max(pcpu_stats.max_alloc_size, size);
+
+ chunk->nr_alloc++;
+ chunk->max_alloc_size = max(chunk->max_alloc_size, size);
+}
+
+/*
+ * pcpu_stats_area_dealloc - decrement allocation stats
+ * @chunk: the location of the area being deallocated
+ *
+ * CONTEXT:
+ * pcpu_lock.
+ */
+static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk)
+{
+ lockdep_assert_held(&pcpu_lock);
+
+ pcpu_stats.nr_dealloc++;
+ pcpu_stats.nr_cur_alloc--;
+
+ chunk->nr_alloc--;
+}
+
+/*
+ * pcpu_stats_chunk_alloc - increment chunk stats
+ */
+static inline void pcpu_stats_chunk_alloc(void)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&pcpu_lock, flags);
+
+ pcpu_stats.nr_chunks++;
+ pcpu_stats.nr_max_chunks =
+ max(pcpu_stats.nr_max_chunks, pcpu_stats.nr_chunks);
+
+ spin_unlock_irqrestore(&pcpu_lock, flags);
+}
+
+/*
+ * pcpu_stats_chunk_dealloc - decrement chunk stats
+ */
+static inline void pcpu_stats_chunk_dealloc(void)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&pcpu_lock, flags);
+
+ pcpu_stats.nr_chunks--;
+
+ spin_unlock_irqrestore(&pcpu_lock, flags);
+}
+
+#else
+
+static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai)
+{
+}
+
+static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size)
+{
+}
+
+static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk)
+{
+}
+
+static inline void pcpu_stats_chunk_alloc(void)
+{
+}
+
+static inline void pcpu_stats_chunk_dealloc(void)
+{
+}
+
+#endif /* !CONFIG_PERCPU_STATS */
+
+#endif
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
new file mode 100644
index 000000000..35c994107
--- /dev/null
+++ b/mm/percpu-km.c
@@ -0,0 +1,120 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * mm/percpu-km.c - kernel memory based chunk allocation
+ *
+ * Copyright (C) 2010 SUSE Linux Products GmbH
+ * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
+ *
+ * Chunks are allocated as a contiguous kernel memory using gfp
+ * allocation. This is to be used on nommu architectures.
+ *
+ * To use percpu-km,
+ *
+ * - define CONFIG_NEED_PER_CPU_KM from the arch Kconfig.
+ *
+ * - CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK must not be defined. It's
+ * not compatible with PER_CPU_KM. EMBED_FIRST_CHUNK should work
+ * fine.
+ *
+ * - NUMA is not supported. When setting up the first chunk,
+ * @cpu_distance_fn should be NULL or report all CPUs to be nearer
+ * than or at LOCAL_DISTANCE.
+ *
+ * - It's best if the chunk size is power of two multiple of
+ * PAGE_SIZE. Because each chunk is allocated as a contiguous
+ * kernel memory block using alloc_pages(), memory will be wasted if
+ * chunk size is not aligned. percpu-km code will whine about it.
+ */
+
+#if defined(CONFIG_SMP) && defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
+#error "contiguous percpu allocation is incompatible with paged first chunk"
+#endif
+
+#include <linux/log2.h>
+
+static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
+ int page_start, int page_end, gfp_t gfp)
+{
+ return 0;
+}
+
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
+ int page_start, int page_end)
+{
+ /* nada */
+}
+
+static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type,
+ gfp_t gfp)
+{
+ const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
+ struct pcpu_chunk *chunk;
+ struct page *pages;
+ unsigned long flags;
+ int i;
+
+ chunk = pcpu_alloc_chunk(type, gfp);
+ if (!chunk)
+ return NULL;
+
+ pages = alloc_pages(gfp, order_base_2(nr_pages));
+ if (!pages) {
+ pcpu_free_chunk(chunk);
+ return NULL;
+ }
+
+ for (i = 0; i < nr_pages; i++)
+ pcpu_set_page_chunk(nth_page(pages, i), chunk);
+
+ chunk->data = pages;
+ chunk->base_addr = page_address(pages);
+
+ spin_lock_irqsave(&pcpu_lock, flags);
+ pcpu_chunk_populated(chunk, 0, nr_pages);
+ spin_unlock_irqrestore(&pcpu_lock, flags);
+
+ pcpu_stats_chunk_alloc();
+ trace_percpu_create_chunk(chunk->base_addr);
+
+ return chunk;
+}
+
+static void pcpu_destroy_chunk(struct pcpu_chunk *chunk)
+{
+ const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
+
+ if (!chunk)
+ return;
+
+ pcpu_stats_chunk_dealloc();
+ trace_percpu_destroy_chunk(chunk->base_addr);
+
+ if (chunk->data)
+ __free_pages(chunk->data, order_base_2(nr_pages));
+ pcpu_free_chunk(chunk);
+}
+
+static struct page *pcpu_addr_to_page(void *addr)
+{
+ return virt_to_page(addr);
+}
+
+static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
+{
+ size_t nr_pages, alloc_pages;
+
+ /* all units must be in a single group */
+ if (ai->nr_groups != 1) {
+ pr_crit("can't handle more than one group\n");
+ return -EINVAL;
+ }
+
+ nr_pages = (ai->groups[0].nr_units * ai->unit_size) >> PAGE_SHIFT;
+ alloc_pages = roundup_pow_of_two(nr_pages);
+
+ if (alloc_pages > nr_pages)
+ pr_warn("wasting %zu pages per chunk\n",
+ alloc_pages - nr_pages);
+
+ return 0;
+}
diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c
new file mode 100644
index 000000000..f6026dbcd
--- /dev/null
+++ b/mm/percpu-stats.c
@@ -0,0 +1,249 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * mm/percpu-debug.c
+ *
+ * Copyright (C) 2017 Facebook Inc.
+ * Copyright (C) 2017 Dennis Zhou <dennis@kernel.org>
+ *
+ * Prints statistics about the percpu allocator and backing chunks.
+ */
+#include <linux/debugfs.h>
+#include <linux/list.h>
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/sort.h>
+#include <linux/vmalloc.h>
+
+#include "percpu-internal.h"
+
+#define P(X, Y) \
+ seq_printf(m, " %-20s: %12lld\n", X, (long long int)Y)
+
+struct percpu_stats pcpu_stats;
+struct pcpu_alloc_info pcpu_stats_ai;
+
+static int cmpint(const void *a, const void *b)
+{
+ return *(int *)a - *(int *)b;
+}
+
+/*
+ * Iterates over all chunks to find the max nr_alloc entries.
+ */
+static int find_max_nr_alloc(void)
+{
+ struct pcpu_chunk *chunk;
+ int slot, max_nr_alloc;
+ enum pcpu_chunk_type type;
+
+ max_nr_alloc = 0;
+ for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
+ for (slot = 0; slot < pcpu_nr_slots; slot++)
+ list_for_each_entry(chunk, &pcpu_chunk_list(type)[slot],
+ list)
+ max_nr_alloc = max(max_nr_alloc,
+ chunk->nr_alloc);
+
+ return max_nr_alloc;
+}
+
+/*
+ * Prints out chunk state. Fragmentation is considered between
+ * the beginning of the chunk to the last allocation.
+ *
+ * All statistics are in bytes unless stated otherwise.
+ */
+static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk,
+ int *buffer)
+{
+ struct pcpu_block_md *chunk_md = &chunk->chunk_md;
+ int i, last_alloc, as_len, start, end;
+ int *alloc_sizes, *p;
+ /* statistics */
+ int sum_frag = 0, max_frag = 0;
+ int cur_min_alloc = 0, cur_med_alloc = 0, cur_max_alloc = 0;
+
+ alloc_sizes = buffer;
+
+ /*
+ * find_last_bit returns the start value if nothing found.
+ * Therefore, we must determine if it is a failure of find_last_bit
+ * and set the appropriate value.
+ */
+ last_alloc = find_last_bit(chunk->alloc_map,
+ pcpu_chunk_map_bits(chunk) -
+ chunk->end_offset / PCPU_MIN_ALLOC_SIZE - 1);
+ last_alloc = test_bit(last_alloc, chunk->alloc_map) ?
+ last_alloc + 1 : 0;
+
+ as_len = 0;
+ start = chunk->start_offset / PCPU_MIN_ALLOC_SIZE;
+
+ /*
+ * If a bit is set in the allocation map, the bound_map identifies
+ * where the allocation ends. If the allocation is not set, the
+ * bound_map does not identify free areas as it is only kept accurate
+ * on allocation, not free.
+ *
+ * Positive values are allocations and negative values are free
+ * fragments.
+ */
+ while (start < last_alloc) {
+ if (test_bit(start, chunk->alloc_map)) {
+ end = find_next_bit(chunk->bound_map, last_alloc,
+ start + 1);
+ alloc_sizes[as_len] = 1;
+ } else {
+ end = find_next_bit(chunk->alloc_map, last_alloc,
+ start + 1);
+ alloc_sizes[as_len] = -1;
+ }
+
+ alloc_sizes[as_len++] *= (end - start) * PCPU_MIN_ALLOC_SIZE;
+
+ start = end;
+ }
+
+ /*
+ * The negative values are free fragments and thus sorting gives the
+ * free fragments at the beginning in largest first order.
+ */
+ if (as_len > 0) {
+ sort(alloc_sizes, as_len, sizeof(int), cmpint, NULL);
+
+ /* iterate through the unallocated fragments */
+ for (i = 0, p = alloc_sizes; *p < 0 && i < as_len; i++, p++) {
+ sum_frag -= *p;
+ max_frag = max(max_frag, -1 * (*p));
+ }
+
+ cur_min_alloc = alloc_sizes[i];
+ cur_med_alloc = alloc_sizes[(i + as_len - 1) / 2];
+ cur_max_alloc = alloc_sizes[as_len - 1];
+ }
+
+ P("nr_alloc", chunk->nr_alloc);
+ P("max_alloc_size", chunk->max_alloc_size);
+ P("empty_pop_pages", chunk->nr_empty_pop_pages);
+ P("first_bit", chunk_md->first_free);
+ P("free_bytes", chunk->free_bytes);
+ P("contig_bytes", chunk_md->contig_hint * PCPU_MIN_ALLOC_SIZE);
+ P("sum_frag", sum_frag);
+ P("max_frag", max_frag);
+ P("cur_min_alloc", cur_min_alloc);
+ P("cur_med_alloc", cur_med_alloc);
+ P("cur_max_alloc", cur_max_alloc);
+#ifdef CONFIG_MEMCG_KMEM
+ P("memcg_aware", pcpu_is_memcg_chunk(pcpu_chunk_type(chunk)));
+#endif
+ seq_putc(m, '\n');
+}
+
+static int percpu_stats_show(struct seq_file *m, void *v)
+{
+ struct pcpu_chunk *chunk;
+ int slot, max_nr_alloc;
+ int *buffer;
+ enum pcpu_chunk_type type;
+ int nr_empty_pop_pages;
+
+alloc_buffer:
+ spin_lock_irq(&pcpu_lock);
+ max_nr_alloc = find_max_nr_alloc();
+ spin_unlock_irq(&pcpu_lock);
+
+ /* there can be at most this many free and allocated fragments */
+ buffer = vmalloc(array_size(sizeof(int), (2 * max_nr_alloc + 1)));
+ if (!buffer)
+ return -ENOMEM;
+
+ spin_lock_irq(&pcpu_lock);
+
+ /* if the buffer allocated earlier is too small */
+ if (max_nr_alloc < find_max_nr_alloc()) {
+ spin_unlock_irq(&pcpu_lock);
+ vfree(buffer);
+ goto alloc_buffer;
+ }
+
+ nr_empty_pop_pages = 0;
+ for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
+ nr_empty_pop_pages += pcpu_nr_empty_pop_pages[type];
+
+#define PL(X) \
+ seq_printf(m, " %-20s: %12lld\n", #X, (long long int)pcpu_stats_ai.X)
+
+ seq_printf(m,
+ "Percpu Memory Statistics\n"
+ "Allocation Info:\n"
+ "----------------------------------------\n");
+ PL(unit_size);
+ PL(static_size);
+ PL(reserved_size);
+ PL(dyn_size);
+ PL(atom_size);
+ PL(alloc_size);
+ seq_putc(m, '\n');
+
+#undef PL
+
+#define PU(X) \
+ seq_printf(m, " %-20s: %12llu\n", #X, (unsigned long long)pcpu_stats.X)
+
+ seq_printf(m,
+ "Global Stats:\n"
+ "----------------------------------------\n");
+ PU(nr_alloc);
+ PU(nr_dealloc);
+ PU(nr_cur_alloc);
+ PU(nr_max_alloc);
+ PU(nr_chunks);
+ PU(nr_max_chunks);
+ PU(min_alloc_size);
+ PU(max_alloc_size);
+ P("empty_pop_pages", nr_empty_pop_pages);
+ seq_putc(m, '\n');
+
+#undef PU
+
+ seq_printf(m,
+ "Per Chunk Stats:\n"
+ "----------------------------------------\n");
+
+ if (pcpu_reserved_chunk) {
+ seq_puts(m, "Chunk: <- Reserved Chunk\n");
+ chunk_map_stats(m, pcpu_reserved_chunk, buffer);
+ }
+
+ for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++) {
+ for (slot = 0; slot < pcpu_nr_slots; slot++) {
+ list_for_each_entry(chunk, &pcpu_chunk_list(type)[slot],
+ list) {
+ if (chunk == pcpu_first_chunk) {
+ seq_puts(m, "Chunk: <- First Chunk\n");
+ chunk_map_stats(m, chunk, buffer);
+ } else {
+ seq_puts(m, "Chunk:\n");
+ chunk_map_stats(m, chunk, buffer);
+ }
+ }
+ }
+ }
+
+ spin_unlock_irq(&pcpu_lock);
+
+ vfree(buffer);
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(percpu_stats);
+
+static int __init init_percpu_stats_debugfs(void)
+{
+ debugfs_create_file("percpu_stats", 0444, NULL, NULL,
+ &percpu_stats_fops);
+
+ return 0;
+}
+
+late_initcall(init_percpu_stats_debugfs);
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
new file mode 100644
index 000000000..e46f7a691
--- /dev/null
+++ b/mm/percpu-vm.c
@@ -0,0 +1,379 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * mm/percpu-vm.c - vmalloc area based chunk allocation
+ *
+ * Copyright (C) 2010 SUSE Linux Products GmbH
+ * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
+ *
+ * Chunks are mapped into vmalloc areas and populated page by page.
+ * This is the default chunk allocator.
+ */
+
+static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
+ unsigned int cpu, int page_idx)
+{
+ /* must not be used on pre-mapped chunk */
+ WARN_ON(chunk->immutable);
+
+ return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
+}
+
+/**
+ * pcpu_get_pages - get temp pages array
+ *
+ * Returns pointer to array of pointers to struct page which can be indexed
+ * with pcpu_page_idx(). Note that there is only one array and accesses
+ * should be serialized by pcpu_alloc_mutex.
+ *
+ * RETURNS:
+ * Pointer to temp pages array on success.
+ */
+static struct page **pcpu_get_pages(void)
+{
+ static struct page **pages;
+ size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
+
+ lockdep_assert_held(&pcpu_alloc_mutex);
+
+ if (!pages)
+ pages = pcpu_mem_zalloc(pages_size, GFP_KERNEL);
+ return pages;
+}
+
+/**
+ * pcpu_free_pages - free pages which were allocated for @chunk
+ * @chunk: chunk pages were allocated for
+ * @pages: array of pages to be freed, indexed by pcpu_page_idx()
+ * @page_start: page index of the first page to be freed
+ * @page_end: page index of the last page to be freed + 1
+ *
+ * Free pages [@page_start and @page_end) in @pages for all units.
+ * The pages were allocated for @chunk.
+ */
+static void pcpu_free_pages(struct pcpu_chunk *chunk,
+ struct page **pages, int page_start, int page_end)
+{
+ unsigned int cpu;
+ int i;
+
+ for_each_possible_cpu(cpu) {
+ for (i = page_start; i < page_end; i++) {
+ struct page *page = pages[pcpu_page_idx(cpu, i)];
+
+ if (page)
+ __free_page(page);
+ }
+ }
+}
+
+/**
+ * pcpu_alloc_pages - allocates pages for @chunk
+ * @chunk: target chunk
+ * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
+ * @page_start: page index of the first page to be allocated
+ * @page_end: page index of the last page to be allocated + 1
+ * @gfp: allocation flags passed to the underlying allocator
+ *
+ * Allocate pages [@page_start,@page_end) into @pages for all units.
+ * The allocation is for @chunk. Percpu core doesn't care about the
+ * content of @pages and will pass it verbatim to pcpu_map_pages().
+ */
+static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
+ struct page **pages, int page_start, int page_end,
+ gfp_t gfp)
+{
+ unsigned int cpu, tcpu;
+ int i;
+
+ gfp |= __GFP_HIGHMEM;
+
+ for_each_possible_cpu(cpu) {
+ for (i = page_start; i < page_end; i++) {
+ struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
+
+ *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
+ if (!*pagep)
+ goto err;
+ }
+ }
+ return 0;
+
+err:
+ while (--i >= page_start)
+ __free_page(pages[pcpu_page_idx(cpu, i)]);
+
+ for_each_possible_cpu(tcpu) {
+ if (tcpu == cpu)
+ break;
+ for (i = page_start; i < page_end; i++)
+ __free_page(pages[pcpu_page_idx(tcpu, i)]);
+ }
+ return -ENOMEM;
+}
+
+/**
+ * pcpu_pre_unmap_flush - flush cache prior to unmapping
+ * @chunk: chunk the regions to be flushed belongs to
+ * @page_start: page index of the first page to be flushed
+ * @page_end: page index of the last page to be flushed + 1
+ *
+ * Pages in [@page_start,@page_end) of @chunk are about to be
+ * unmapped. Flush cache. As each flushing trial can be very
+ * expensive, issue flush on the whole region at once rather than
+ * doing it for each cpu. This could be an overkill but is more
+ * scalable.
+ */
+static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
+ int page_start, int page_end)
+{
+ flush_cache_vunmap(
+ pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
+ pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
+}
+
+static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
+{
+ unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
+}
+
+/**
+ * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
+ * @chunk: chunk of interest
+ * @pages: pages array which can be used to pass information to free
+ * @page_start: page index of the first page to unmap
+ * @page_end: page index of the last page to unmap + 1
+ *
+ * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
+ * Corresponding elements in @pages were cleared by the caller and can
+ * be used to carry information to pcpu_free_pages() which will be
+ * called after all unmaps are finished. The caller should call
+ * proper pre/post flush functions.
+ */
+static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
+ struct page **pages, int page_start, int page_end)
+{
+ unsigned int cpu;
+ int i;
+
+ for_each_possible_cpu(cpu) {
+ for (i = page_start; i < page_end; i++) {
+ struct page *page;
+
+ page = pcpu_chunk_page(chunk, cpu, i);
+ WARN_ON(!page);
+ pages[pcpu_page_idx(cpu, i)] = page;
+ }
+ __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
+ page_end - page_start);
+ }
+}
+
+/**
+ * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
+ * @chunk: pcpu_chunk the regions to be flushed belong to
+ * @page_start: page index of the first page to be flushed
+ * @page_end: page index of the last page to be flushed + 1
+ *
+ * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush
+ * TLB for the regions. This can be skipped if the area is to be
+ * returned to vmalloc as vmalloc will handle TLB flushing lazily.
+ *
+ * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
+ * for the whole region.
+ */
+static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
+ int page_start, int page_end)
+{
+ flush_tlb_kernel_range(
+ pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
+ pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
+}
+
+static int __pcpu_map_pages(unsigned long addr, struct page **pages,
+ int nr_pages)
+{
+ return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT,
+ PAGE_KERNEL, pages);
+}
+
+/**
+ * pcpu_map_pages - map pages into a pcpu_chunk
+ * @chunk: chunk of interest
+ * @pages: pages array containing pages to be mapped
+ * @page_start: page index of the first page to map
+ * @page_end: page index of the last page to map + 1
+ *
+ * For each cpu, map pages [@page_start,@page_end) into @chunk. The
+ * caller is responsible for calling pcpu_post_map_flush() after all
+ * mappings are complete.
+ *
+ * This function is responsible for setting up whatever is necessary for
+ * reverse lookup (addr -> chunk).
+ */
+static int pcpu_map_pages(struct pcpu_chunk *chunk,
+ struct page **pages, int page_start, int page_end)
+{
+ unsigned int cpu, tcpu;
+ int i, err;
+
+ for_each_possible_cpu(cpu) {
+ err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
+ &pages[pcpu_page_idx(cpu, page_start)],
+ page_end - page_start);
+ if (err < 0)
+ goto err;
+
+ for (i = page_start; i < page_end; i++)
+ pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
+ chunk);
+ }
+ return 0;
+err:
+ for_each_possible_cpu(tcpu) {
+ if (tcpu == cpu)
+ break;
+ __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
+ page_end - page_start);
+ }
+ pcpu_post_unmap_tlb_flush(chunk, page_start, page_end);
+ return err;
+}
+
+/**
+ * pcpu_post_map_flush - flush cache after mapping
+ * @chunk: pcpu_chunk the regions to be flushed belong to
+ * @page_start: page index of the first page to be flushed
+ * @page_end: page index of the last page to be flushed + 1
+ *
+ * Pages [@page_start,@page_end) of @chunk have been mapped. Flush
+ * cache.
+ *
+ * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
+ * for the whole region.
+ */
+static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
+ int page_start, int page_end)
+{
+ flush_cache_vmap(
+ pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
+ pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
+}
+
+/**
+ * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
+ * @chunk: chunk of interest
+ * @page_start: the start page
+ * @page_end: the end page
+ * @gfp: allocation flags passed to the underlying memory allocator
+ *
+ * For each cpu, populate and map pages [@page_start,@page_end) into
+ * @chunk.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex, does GFP_KERNEL allocation.
+ */
+static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
+ int page_start, int page_end, gfp_t gfp)
+{
+ struct page **pages;
+
+ pages = pcpu_get_pages();
+ if (!pages)
+ return -ENOMEM;
+
+ if (pcpu_alloc_pages(chunk, pages, page_start, page_end, gfp))
+ return -ENOMEM;
+
+ if (pcpu_map_pages(chunk, pages, page_start, page_end)) {
+ pcpu_free_pages(chunk, pages, page_start, page_end);
+ return -ENOMEM;
+ }
+ pcpu_post_map_flush(chunk, page_start, page_end);
+
+ return 0;
+}
+
+/**
+ * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
+ * @chunk: chunk to depopulate
+ * @page_start: the start page
+ * @page_end: the end page
+ *
+ * For each cpu, depopulate and unmap pages [@page_start,@page_end)
+ * from @chunk.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex.
+ */
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
+ int page_start, int page_end)
+{
+ struct page **pages;
+
+ /*
+ * If control reaches here, there must have been at least one
+ * successful population attempt so the temp pages array must
+ * be available now.
+ */
+ pages = pcpu_get_pages();
+ BUG_ON(!pages);
+
+ /* unmap and free */
+ pcpu_pre_unmap_flush(chunk, page_start, page_end);
+
+ pcpu_unmap_pages(chunk, pages, page_start, page_end);
+
+ /* no need to flush tlb, vmalloc will handle it lazily */
+
+ pcpu_free_pages(chunk, pages, page_start, page_end);
+}
+
+static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type,
+ gfp_t gfp)
+{
+ struct pcpu_chunk *chunk;
+ struct vm_struct **vms;
+
+ chunk = pcpu_alloc_chunk(type, gfp);
+ if (!chunk)
+ return NULL;
+
+ vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
+ pcpu_nr_groups, pcpu_atom_size);
+ if (!vms) {
+ pcpu_free_chunk(chunk);
+ return NULL;
+ }
+
+ chunk->data = vms;
+ chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0];
+
+ pcpu_stats_chunk_alloc();
+ trace_percpu_create_chunk(chunk->base_addr);
+
+ return chunk;
+}
+
+static void pcpu_destroy_chunk(struct pcpu_chunk *chunk)
+{
+ if (!chunk)
+ return;
+
+ pcpu_stats_chunk_dealloc();
+ trace_percpu_destroy_chunk(chunk->base_addr);
+
+ if (chunk->data)
+ pcpu_free_vm_areas(chunk->data, pcpu_nr_groups);
+ pcpu_free_chunk(chunk);
+}
+
+static struct page *pcpu_addr_to_page(void *addr)
+{
+ return vmalloc_to_page(addr);
+}
+
+static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
+{
+ /* no extra restriction */
+ return 0;
+}
diff --git a/mm/percpu.c b/mm/percpu.c
new file mode 100644
index 000000000..e12ab708f
--- /dev/null
+++ b/mm/percpu.c
@@ -0,0 +1,3181 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * mm/percpu.c - percpu memory allocator
+ *
+ * Copyright (C) 2009 SUSE Linux Products GmbH
+ * Copyright (C) 2009 Tejun Heo <tj@kernel.org>
+ *
+ * Copyright (C) 2017 Facebook Inc.
+ * Copyright (C) 2017 Dennis Zhou <dennis@kernel.org>
+ *
+ * The percpu allocator handles both static and dynamic areas. Percpu
+ * areas are allocated in chunks which are divided into units. There is
+ * a 1-to-1 mapping for units to possible cpus. These units are grouped
+ * based on NUMA properties of the machine.
+ *
+ * c0 c1 c2
+ * ------------------- ------------------- ------------
+ * | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u
+ * ------------------- ...... ------------------- .... ------------
+ *
+ * Allocation is done by offsets into a unit's address space. Ie., an
+ * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0,
+ * c1:u1, c1:u2, etc. On NUMA machines, the mapping may be non-linear
+ * and even sparse. Access is handled by configuring percpu base
+ * registers according to the cpu to unit mappings and offsetting the
+ * base address using pcpu_unit_size.
+ *
+ * There is special consideration for the first chunk which must handle
+ * the static percpu variables in the kernel image as allocation services
+ * are not online yet. In short, the first chunk is structured like so:
+ *
+ * <Static | [Reserved] | Dynamic>
+ *
+ * The static data is copied from the original section managed by the
+ * linker. The reserved section, if non-zero, primarily manages static
+ * percpu variables from kernel modules. Finally, the dynamic section
+ * takes care of normal allocations.
+ *
+ * The allocator organizes chunks into lists according to free size and
+ * memcg-awareness. To make a percpu allocation memcg-aware the __GFP_ACCOUNT
+ * flag should be passed. All memcg-aware allocations are sharing one set
+ * of chunks and all unaccounted allocations and allocations performed
+ * by processes belonging to the root memory cgroup are using the second set.
+ *
+ * The allocator tries to allocate from the fullest chunk first. Each chunk
+ * is managed by a bitmap with metadata blocks. The allocation map is updated
+ * on every allocation and free to reflect the current state while the boundary
+ * map is only updated on allocation. Each metadata block contains
+ * information to help mitigate the need to iterate over large portions
+ * of the bitmap. The reverse mapping from page to chunk is stored in
+ * the page's index. Lastly, units are lazily backed and grow in unison.
+ *
+ * There is a unique conversion that goes on here between bytes and bits.
+ * Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE. The chunk
+ * tracks the number of pages it is responsible for in nr_pages. Helper
+ * functions are used to convert from between the bytes, bits, and blocks.
+ * All hints are managed in bits unless explicitly stated.
+ *
+ * To use this allocator, arch code should do the following:
+ *
+ * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
+ * regular address to percpu pointer and back if they need to be
+ * different from the default
+ *
+ * - use pcpu_setup_first_chunk() during percpu area initialization to
+ * setup the first chunk containing the kernel static percpu area
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/bitmap.h>
+#include <linux/memblock.h>
+#include <linux/err.h>
+#include <linux/lcm.h>
+#include <linux/list.h>
+#include <linux/log2.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/pfn.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <linux/workqueue.h>
+#include <linux/kmemleak.h>
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/memcontrol.h>
+
+#include <asm/cacheflush.h>
+#include <asm/sections.h>
+#include <asm/tlbflush.h>
+#include <asm/io.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/percpu.h>
+
+#include "percpu-internal.h"
+
+/* the slots are sorted by free bytes left, 1-31 bytes share the same slot */
+#define PCPU_SLOT_BASE_SHIFT 5
+/* chunks in slots below this are subject to being sidelined on failed alloc */
+#define PCPU_SLOT_FAIL_THRESHOLD 3
+
+#define PCPU_EMPTY_POP_PAGES_LOW 2
+#define PCPU_EMPTY_POP_PAGES_HIGH 4
+
+#ifdef CONFIG_SMP
+/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
+#ifndef __addr_to_pcpu_ptr
+#define __addr_to_pcpu_ptr(addr) \
+ (void __percpu *)((unsigned long)(addr) - \
+ (unsigned long)pcpu_base_addr + \
+ (unsigned long)__per_cpu_start)
+#endif
+#ifndef __pcpu_ptr_to_addr
+#define __pcpu_ptr_to_addr(ptr) \
+ (void __force *)((unsigned long)(ptr) + \
+ (unsigned long)pcpu_base_addr - \
+ (unsigned long)__per_cpu_start)
+#endif
+#else /* CONFIG_SMP */
+/* on UP, it's always identity mapped */
+#define __addr_to_pcpu_ptr(addr) (void __percpu *)(addr)
+#define __pcpu_ptr_to_addr(ptr) (void __force *)(ptr)
+#endif /* CONFIG_SMP */
+
+static int pcpu_unit_pages __ro_after_init;
+static int pcpu_unit_size __ro_after_init;
+static int pcpu_nr_units __ro_after_init;
+static int pcpu_atom_size __ro_after_init;
+int pcpu_nr_slots __ro_after_init;
+static size_t pcpu_chunk_struct_size __ro_after_init;
+
+/* cpus with the lowest and highest unit addresses */
+static unsigned int pcpu_low_unit_cpu __ro_after_init;
+static unsigned int pcpu_high_unit_cpu __ro_after_init;
+
+/* the address of the first chunk which starts with the kernel static area */
+void *pcpu_base_addr __ro_after_init;
+EXPORT_SYMBOL_GPL(pcpu_base_addr);
+
+static const int *pcpu_unit_map __ro_after_init; /* cpu -> unit */
+const unsigned long *pcpu_unit_offsets __ro_after_init; /* cpu -> unit offset */
+
+/* group information, used for vm allocation */
+static int pcpu_nr_groups __ro_after_init;
+static const unsigned long *pcpu_group_offsets __ro_after_init;
+static const size_t *pcpu_group_sizes __ro_after_init;
+
+/*
+ * The first chunk which always exists. Note that unlike other
+ * chunks, this one can be allocated and mapped in several different
+ * ways and thus often doesn't live in the vmalloc area.
+ */
+struct pcpu_chunk *pcpu_first_chunk __ro_after_init;
+
+/*
+ * Optional reserved chunk. This chunk reserves part of the first
+ * chunk and serves it for reserved allocations. When the reserved
+ * region doesn't exist, the following variable is NULL.
+ */
+struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;
+
+DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */
+static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */
+
+struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */
+
+/* chunks which need their map areas extended, protected by pcpu_lock */
+static LIST_HEAD(pcpu_map_extend_chunks);
+
+/*
+ * The number of empty populated pages by chunk type, protected by pcpu_lock.
+ * The reserved chunk doesn't contribute to the count.
+ */
+int pcpu_nr_empty_pop_pages[PCPU_NR_CHUNK_TYPES];
+
+/*
+ * The number of populated pages in use by the allocator, protected by
+ * pcpu_lock. This number is kept per a unit per chunk (i.e. when a page gets
+ * allocated/deallocated, it is allocated/deallocated in all units of a chunk
+ * and increments/decrements this count by 1).
+ */
+static unsigned long pcpu_nr_populated;
+
+/*
+ * Balance work is used to populate or destroy chunks asynchronously. We
+ * try to keep the number of populated free pages between
+ * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
+ * empty chunk.
+ */
+static void pcpu_balance_workfn(struct work_struct *work);
+static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
+static bool pcpu_async_enabled __read_mostly;
+static bool pcpu_atomic_alloc_failed;
+
+static void pcpu_schedule_balance_work(void)
+{
+ if (pcpu_async_enabled)
+ schedule_work(&pcpu_balance_work);
+}
+
+/**
+ * pcpu_addr_in_chunk - check if the address is served from this chunk
+ * @chunk: chunk of interest
+ * @addr: percpu address
+ *
+ * RETURNS:
+ * True if the address is served from this chunk.
+ */
+static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr)
+{
+ void *start_addr, *end_addr;
+
+ if (!chunk)
+ return false;
+
+ start_addr = chunk->base_addr + chunk->start_offset;
+ end_addr = chunk->base_addr + chunk->nr_pages * PAGE_SIZE -
+ chunk->end_offset;
+
+ return addr >= start_addr && addr < end_addr;
+}
+
+static int __pcpu_size_to_slot(int size)
+{
+ int highbit = fls(size); /* size is in bytes */
+ return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
+}
+
+static int pcpu_size_to_slot(int size)
+{
+ if (size == pcpu_unit_size)
+ return pcpu_nr_slots - 1;
+ return __pcpu_size_to_slot(size);
+}
+
+static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
+{
+ const struct pcpu_block_md *chunk_md = &chunk->chunk_md;
+
+ if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE ||
+ chunk_md->contig_hint == 0)
+ return 0;
+
+ return pcpu_size_to_slot(chunk_md->contig_hint * PCPU_MIN_ALLOC_SIZE);
+}
+
+/* set the pointer to a chunk in a page struct */
+static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
+{
+ page->index = (unsigned long)pcpu;
+}
+
+/* obtain pointer to a chunk from a page struct */
+static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
+{
+ return (struct pcpu_chunk *)page->index;
+}
+
+static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
+{
+ return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
+}
+
+static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx)
+{
+ return pcpu_unit_offsets[cpu] + (page_idx << PAGE_SHIFT);
+}
+
+static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
+ unsigned int cpu, int page_idx)
+{
+ return (unsigned long)chunk->base_addr +
+ pcpu_unit_page_offset(cpu, page_idx);
+}
+
+/*
+ * The following are helper functions to help access bitmaps and convert
+ * between bitmap offsets to address offsets.
+ */
+static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index)
+{
+ return chunk->alloc_map +
+ (index * PCPU_BITMAP_BLOCK_BITS / BITS_PER_LONG);
+}
+
+static unsigned long pcpu_off_to_block_index(int off)
+{
+ return off / PCPU_BITMAP_BLOCK_BITS;
+}
+
+static unsigned long pcpu_off_to_block_off(int off)
+{
+ return off & (PCPU_BITMAP_BLOCK_BITS - 1);
+}
+
+static unsigned long pcpu_block_off_to_off(int index, int off)
+{
+ return index * PCPU_BITMAP_BLOCK_BITS + off;
+}
+
+/*
+ * pcpu_next_hint - determine which hint to use
+ * @block: block of interest
+ * @alloc_bits: size of allocation
+ *
+ * This determines if we should scan based on the scan_hint or first_free.
+ * In general, we want to scan from first_free to fulfill allocations by
+ * first fit. However, if we know a scan_hint at position scan_hint_start
+ * cannot fulfill an allocation, we can begin scanning from there knowing
+ * the contig_hint will be our fallback.
+ */
+static int pcpu_next_hint(struct pcpu_block_md *block, int alloc_bits)
+{
+ /*
+ * The three conditions below determine if we can skip past the
+ * scan_hint. First, does the scan hint exist. Second, is the
+ * contig_hint after the scan_hint (possibly not true iff
+ * contig_hint == scan_hint). Third, is the allocation request
+ * larger than the scan_hint.
+ */
+ if (block->scan_hint &&
+ block->contig_hint_start > block->scan_hint_start &&
+ alloc_bits > block->scan_hint)
+ return block->scan_hint_start + block->scan_hint;
+
+ return block->first_free;
+}
+
+/**
+ * pcpu_next_md_free_region - finds the next hint free area
+ * @chunk: chunk of interest
+ * @bit_off: chunk offset
+ * @bits: size of free area
+ *
+ * Helper function for pcpu_for_each_md_free_region. It checks
+ * block->contig_hint and performs aggregation across blocks to find the
+ * next hint. It modifies bit_off and bits in-place to be consumed in the
+ * loop.
+ */
+static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off,
+ int *bits)
+{
+ int i = pcpu_off_to_block_index(*bit_off);
+ int block_off = pcpu_off_to_block_off(*bit_off);
+ struct pcpu_block_md *block;
+
+ *bits = 0;
+ for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
+ block++, i++) {
+ /* handles contig area across blocks */
+ if (*bits) {
+ *bits += block->left_free;
+ if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
+ continue;
+ return;
+ }
+
+ /*
+ * This checks three things. First is there a contig_hint to
+ * check. Second, have we checked this hint before by
+ * comparing the block_off. Third, is this the same as the
+ * right contig hint. In the last case, it spills over into
+ * the next block and should be handled by the contig area
+ * across blocks code.
+ */
+ *bits = block->contig_hint;
+ if (*bits && block->contig_hint_start >= block_off &&
+ *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) {
+ *bit_off = pcpu_block_off_to_off(i,
+ block->contig_hint_start);
+ return;
+ }
+ /* reset to satisfy the second predicate above */
+ block_off = 0;
+
+ *bits = block->right_free;
+ *bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free;
+ }
+}
+
+/**
+ * pcpu_next_fit_region - finds fit areas for a given allocation request
+ * @chunk: chunk of interest
+ * @alloc_bits: size of allocation
+ * @align: alignment of area (max PAGE_SIZE)
+ * @bit_off: chunk offset
+ * @bits: size of free area
+ *
+ * Finds the next free region that is viable for use with a given size and
+ * alignment. This only returns if there is a valid area to be used for this
+ * allocation. block->first_free is returned if the allocation request fits
+ * within the block to see if the request can be fulfilled prior to the contig
+ * hint.
+ */
+static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
+ int align, int *bit_off, int *bits)
+{
+ int i = pcpu_off_to_block_index(*bit_off);
+ int block_off = pcpu_off_to_block_off(*bit_off);
+ struct pcpu_block_md *block;
+
+ *bits = 0;
+ for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
+ block++, i++) {
+ /* handles contig area across blocks */
+ if (*bits) {
+ *bits += block->left_free;
+ if (*bits >= alloc_bits)
+ return;
+ if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
+ continue;
+ }
+
+ /* check block->contig_hint */
+ *bits = ALIGN(block->contig_hint_start, align) -
+ block->contig_hint_start;
+ /*
+ * This uses the block offset to determine if this has been
+ * checked in the prior iteration.
+ */
+ if (block->contig_hint &&
+ block->contig_hint_start >= block_off &&
+ block->contig_hint >= *bits + alloc_bits) {
+ int start = pcpu_next_hint(block, alloc_bits);
+
+ *bits += alloc_bits + block->contig_hint_start -
+ start;
+ *bit_off = pcpu_block_off_to_off(i, start);
+ return;
+ }
+ /* reset to satisfy the second predicate above */
+ block_off = 0;
+
+ *bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free,
+ align);
+ *bits = PCPU_BITMAP_BLOCK_BITS - *bit_off;
+ *bit_off = pcpu_block_off_to_off(i, *bit_off);
+ if (*bits >= alloc_bits)
+ return;
+ }
+
+ /* no valid offsets were found - fail condition */
+ *bit_off = pcpu_chunk_map_bits(chunk);
+}
+
+/*
+ * Metadata free area iterators. These perform aggregation of free areas
+ * based on the metadata blocks and return the offset @bit_off and size in
+ * bits of the free area @bits. pcpu_for_each_fit_region only returns when
+ * a fit is found for the allocation request.
+ */
+#define pcpu_for_each_md_free_region(chunk, bit_off, bits) \
+ for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits)); \
+ (bit_off) < pcpu_chunk_map_bits((chunk)); \
+ (bit_off) += (bits) + 1, \
+ pcpu_next_md_free_region((chunk), &(bit_off), &(bits)))
+
+#define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) \
+ for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
+ &(bits)); \
+ (bit_off) < pcpu_chunk_map_bits((chunk)); \
+ (bit_off) += (bits), \
+ pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
+ &(bits)))
+
+/**
+ * pcpu_mem_zalloc - allocate memory
+ * @size: bytes to allocate
+ * @gfp: allocation flags
+ *
+ * Allocate @size bytes. If @size is smaller than PAGE_SIZE,
+ * kzalloc() is used; otherwise, the equivalent of vzalloc() is used.
+ * This is to facilitate passing through whitelisted flags. The
+ * returned memory is always zeroed.
+ *
+ * RETURNS:
+ * Pointer to the allocated area on success, NULL on failure.
+ */
+static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
+{
+ if (WARN_ON_ONCE(!slab_is_available()))
+ return NULL;
+
+ if (size <= PAGE_SIZE)
+ return kzalloc(size, gfp);
+ else
+ return __vmalloc(size, gfp | __GFP_ZERO);
+}
+
+/**
+ * pcpu_mem_free - free memory
+ * @ptr: memory to free
+ *
+ * Free @ptr. @ptr should have been allocated using pcpu_mem_zalloc().
+ */
+static void pcpu_mem_free(void *ptr)
+{
+ kvfree(ptr);
+}
+
+static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot,
+ bool move_front)
+{
+ if (chunk != pcpu_reserved_chunk) {
+ struct list_head *pcpu_slot;
+
+ pcpu_slot = pcpu_chunk_list(pcpu_chunk_type(chunk));
+ if (move_front)
+ list_move(&chunk->list, &pcpu_slot[slot]);
+ else
+ list_move_tail(&chunk->list, &pcpu_slot[slot]);
+ }
+}
+
+static void pcpu_chunk_move(struct pcpu_chunk *chunk, int slot)
+{
+ __pcpu_chunk_move(chunk, slot, true);
+}
+
+/**
+ * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
+ * @chunk: chunk of interest
+ * @oslot: the previous slot it was on
+ *
+ * This function is called after an allocation or free changed @chunk.
+ * New slot according to the changed state is determined and @chunk is
+ * moved to the slot. Note that the reserved chunk is never put on
+ * chunk slots.
+ *
+ * CONTEXT:
+ * pcpu_lock.
+ */
+static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
+{
+ int nslot = pcpu_chunk_slot(chunk);
+
+ if (oslot != nslot)
+ __pcpu_chunk_move(chunk, nslot, oslot < nslot);
+}
+
+/*
+ * pcpu_update_empty_pages - update empty page counters
+ * @chunk: chunk of interest
+ * @nr: nr of empty pages
+ *
+ * This is used to keep track of the empty pages now based on the premise
+ * a md_block covers a page. The hint update functions recognize if a block
+ * is made full or broken to calculate deltas for keeping track of free pages.
+ */
+static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr)
+{
+ chunk->nr_empty_pop_pages += nr;
+ if (chunk != pcpu_reserved_chunk)
+ pcpu_nr_empty_pop_pages[pcpu_chunk_type(chunk)] += nr;
+}
+
+/*
+ * pcpu_region_overlap - determines if two regions overlap
+ * @a: start of first region, inclusive
+ * @b: end of first region, exclusive
+ * @x: start of second region, inclusive
+ * @y: end of second region, exclusive
+ *
+ * This is used to determine if the hint region [a, b) overlaps with the
+ * allocated region [x, y).
+ */
+static inline bool pcpu_region_overlap(int a, int b, int x, int y)
+{
+ return (a < y) && (x < b);
+}
+
+/**
+ * pcpu_block_update - updates a block given a free area
+ * @block: block of interest
+ * @start: start offset in block
+ * @end: end offset in block
+ *
+ * Updates a block given a known free area. The region [start, end) is
+ * expected to be the entirety of the free area within a block. Chooses
+ * the best starting offset if the contig hints are equal.
+ */
+static void pcpu_block_update(struct pcpu_block_md *block, int start, int end)
+{
+ int contig = end - start;
+
+ block->first_free = min(block->first_free, start);
+ if (start == 0)
+ block->left_free = contig;
+
+ if (end == block->nr_bits)
+ block->right_free = contig;
+
+ if (contig > block->contig_hint) {
+ /* promote the old contig_hint to be the new scan_hint */
+ if (start > block->contig_hint_start) {
+ if (block->contig_hint > block->scan_hint) {
+ block->scan_hint_start =
+ block->contig_hint_start;
+ block->scan_hint = block->contig_hint;
+ } else if (start < block->scan_hint_start) {
+ /*
+ * The old contig_hint == scan_hint. But, the
+ * new contig is larger so hold the invariant
+ * scan_hint_start < contig_hint_start.
+ */
+ block->scan_hint = 0;
+ }
+ } else {
+ block->scan_hint = 0;
+ }
+ block->contig_hint_start = start;
+ block->contig_hint = contig;
+ } else if (contig == block->contig_hint) {
+ if (block->contig_hint_start &&
+ (!start ||
+ __ffs(start) > __ffs(block->contig_hint_start))) {
+ /* start has a better alignment so use it */
+ block->contig_hint_start = start;
+ if (start < block->scan_hint_start &&
+ block->contig_hint > block->scan_hint)
+ block->scan_hint = 0;
+ } else if (start > block->scan_hint_start ||
+ block->contig_hint > block->scan_hint) {
+ /*
+ * Knowing contig == contig_hint, update the scan_hint
+ * if it is farther than or larger than the current
+ * scan_hint.
+ */
+ block->scan_hint_start = start;
+ block->scan_hint = contig;
+ }
+ } else {
+ /*
+ * The region is smaller than the contig_hint. So only update
+ * the scan_hint if it is larger than or equal and farther than
+ * the current scan_hint.
+ */
+ if ((start < block->contig_hint_start &&
+ (contig > block->scan_hint ||
+ (contig == block->scan_hint &&
+ start > block->scan_hint_start)))) {
+ block->scan_hint_start = start;
+ block->scan_hint = contig;
+ }
+ }
+}
+
+/*
+ * pcpu_block_update_scan - update a block given a free area from a scan
+ * @chunk: chunk of interest
+ * @bit_off: chunk offset
+ * @bits: size of free area
+ *
+ * Finding the final allocation spot first goes through pcpu_find_block_fit()
+ * to find a block that can hold the allocation and then pcpu_alloc_area()
+ * where a scan is used. When allocations require specific alignments,
+ * we can inadvertently create holes which will not be seen in the alloc
+ * or free paths.
+ *
+ * This takes a given free area hole and updates a block as it may change the
+ * scan_hint. We need to scan backwards to ensure we don't miss free bits
+ * from alignment.
+ */
+static void pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off,
+ int bits)
+{
+ int s_off = pcpu_off_to_block_off(bit_off);
+ int e_off = s_off + bits;
+ int s_index, l_bit;
+ struct pcpu_block_md *block;
+
+ if (e_off > PCPU_BITMAP_BLOCK_BITS)
+ return;
+
+ s_index = pcpu_off_to_block_index(bit_off);
+ block = chunk->md_blocks + s_index;
+
+ /* scan backwards in case of alignment skipping free bits */
+ l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index), s_off);
+ s_off = (s_off == l_bit) ? 0 : l_bit + 1;
+
+ pcpu_block_update(block, s_off, e_off);
+}
+
+/**
+ * pcpu_chunk_refresh_hint - updates metadata about a chunk
+ * @chunk: chunk of interest
+ * @full_scan: if we should scan from the beginning
+ *
+ * Iterates over the metadata blocks to find the largest contig area.
+ * A full scan can be avoided on the allocation path as this is triggered
+ * if we broke the contig_hint. In doing so, the scan_hint will be before
+ * the contig_hint or after if the scan_hint == contig_hint. This cannot
+ * be prevented on freeing as we want to find the largest area possibly
+ * spanning blocks.
+ */
+static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan)
+{
+ struct pcpu_block_md *chunk_md = &chunk->chunk_md;
+ int bit_off, bits;
+
+ /* promote scan_hint to contig_hint */
+ if (!full_scan && chunk_md->scan_hint) {
+ bit_off = chunk_md->scan_hint_start + chunk_md->scan_hint;
+ chunk_md->contig_hint_start = chunk_md->scan_hint_start;
+ chunk_md->contig_hint = chunk_md->scan_hint;
+ chunk_md->scan_hint = 0;
+ } else {
+ bit_off = chunk_md->first_free;
+ chunk_md->contig_hint = 0;
+ }
+
+ bits = 0;
+ pcpu_for_each_md_free_region(chunk, bit_off, bits)
+ pcpu_block_update(chunk_md, bit_off, bit_off + bits);
+}
+
+/**
+ * pcpu_block_refresh_hint
+ * @chunk: chunk of interest
+ * @index: index of the metadata block
+ *
+ * Scans over the block beginning at first_free and updates the block
+ * metadata accordingly.
+ */
+static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
+{
+ struct pcpu_block_md *block = chunk->md_blocks + index;
+ unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
+ unsigned int rs, re, start; /* region start, region end */
+
+ /* promote scan_hint to contig_hint */
+ if (block->scan_hint) {
+ start = block->scan_hint_start + block->scan_hint;
+ block->contig_hint_start = block->scan_hint_start;
+ block->contig_hint = block->scan_hint;
+ block->scan_hint = 0;
+ } else {
+ start = block->first_free;
+ block->contig_hint = 0;
+ }
+
+ block->right_free = 0;
+
+ /* iterate over free areas and update the contig hints */
+ bitmap_for_each_clear_region(alloc_map, rs, re, start,
+ PCPU_BITMAP_BLOCK_BITS)
+ pcpu_block_update(block, rs, re);
+}
+
+/**
+ * pcpu_block_update_hint_alloc - update hint on allocation path
+ * @chunk: chunk of interest
+ * @bit_off: chunk offset
+ * @bits: size of request
+ *
+ * Updates metadata for the allocation path. The metadata only has to be
+ * refreshed by a full scan iff the chunk's contig hint is broken. Block level
+ * scans are required if the block's contig hint is broken.
+ */
+static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
+ int bits)
+{
+ struct pcpu_block_md *chunk_md = &chunk->chunk_md;
+ int nr_empty_pages = 0;
+ struct pcpu_block_md *s_block, *e_block, *block;
+ int s_index, e_index; /* block indexes of the freed allocation */
+ int s_off, e_off; /* block offsets of the freed allocation */
+
+ /*
+ * Calculate per block offsets.
+ * The calculation uses an inclusive range, but the resulting offsets
+ * are [start, end). e_index always points to the last block in the
+ * range.
+ */
+ s_index = pcpu_off_to_block_index(bit_off);
+ e_index = pcpu_off_to_block_index(bit_off + bits - 1);
+ s_off = pcpu_off_to_block_off(bit_off);
+ e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
+
+ s_block = chunk->md_blocks + s_index;
+ e_block = chunk->md_blocks + e_index;
+
+ /*
+ * Update s_block.
+ * block->first_free must be updated if the allocation takes its place.
+ * If the allocation breaks the contig_hint, a scan is required to
+ * restore this hint.
+ */
+ if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
+ nr_empty_pages++;
+
+ if (s_off == s_block->first_free)
+ s_block->first_free = find_next_zero_bit(
+ pcpu_index_alloc_map(chunk, s_index),
+ PCPU_BITMAP_BLOCK_BITS,
+ s_off + bits);
+
+ if (pcpu_region_overlap(s_block->scan_hint_start,
+ s_block->scan_hint_start + s_block->scan_hint,
+ s_off,
+ s_off + bits))
+ s_block->scan_hint = 0;
+
+ if (pcpu_region_overlap(s_block->contig_hint_start,
+ s_block->contig_hint_start +
+ s_block->contig_hint,
+ s_off,
+ s_off + bits)) {
+ /* block contig hint is broken - scan to fix it */
+ if (!s_off)
+ s_block->left_free = 0;
+ pcpu_block_refresh_hint(chunk, s_index);
+ } else {
+ /* update left and right contig manually */
+ s_block->left_free = min(s_block->left_free, s_off);
+ if (s_index == e_index)
+ s_block->right_free = min_t(int, s_block->right_free,
+ PCPU_BITMAP_BLOCK_BITS - e_off);
+ else
+ s_block->right_free = 0;
+ }
+
+ /*
+ * Update e_block.
+ */
+ if (s_index != e_index) {
+ if (e_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
+ nr_empty_pages++;
+
+ /*
+ * When the allocation is across blocks, the end is along
+ * the left part of the e_block.
+ */
+ e_block->first_free = find_next_zero_bit(
+ pcpu_index_alloc_map(chunk, e_index),
+ PCPU_BITMAP_BLOCK_BITS, e_off);
+
+ if (e_off == PCPU_BITMAP_BLOCK_BITS) {
+ /* reset the block */
+ e_block++;
+ } else {
+ if (e_off > e_block->scan_hint_start)
+ e_block->scan_hint = 0;
+
+ e_block->left_free = 0;
+ if (e_off > e_block->contig_hint_start) {
+ /* contig hint is broken - scan to fix it */
+ pcpu_block_refresh_hint(chunk, e_index);
+ } else {
+ e_block->right_free =
+ min_t(int, e_block->right_free,
+ PCPU_BITMAP_BLOCK_BITS - e_off);
+ }
+ }
+
+ /* update in-between md_blocks */
+ nr_empty_pages += (e_index - s_index - 1);
+ for (block = s_block + 1; block < e_block; block++) {
+ block->scan_hint = 0;
+ block->contig_hint = 0;
+ block->left_free = 0;
+ block->right_free = 0;
+ }
+ }
+
+ if (nr_empty_pages)
+ pcpu_update_empty_pages(chunk, -nr_empty_pages);
+
+ if (pcpu_region_overlap(chunk_md->scan_hint_start,
+ chunk_md->scan_hint_start +
+ chunk_md->scan_hint,
+ bit_off,
+ bit_off + bits))
+ chunk_md->scan_hint = 0;
+
+ /*
+ * The only time a full chunk scan is required is if the chunk
+ * contig hint is broken. Otherwise, it means a smaller space
+ * was used and therefore the chunk contig hint is still correct.
+ */
+ if (pcpu_region_overlap(chunk_md->contig_hint_start,
+ chunk_md->contig_hint_start +
+ chunk_md->contig_hint,
+ bit_off,
+ bit_off + bits))
+ pcpu_chunk_refresh_hint(chunk, false);
+}
+
+/**
+ * pcpu_block_update_hint_free - updates the block hints on the free path
+ * @chunk: chunk of interest
+ * @bit_off: chunk offset
+ * @bits: size of request
+ *
+ * Updates metadata for the allocation path. This avoids a blind block
+ * refresh by making use of the block contig hints. If this fails, it scans
+ * forward and backward to determine the extent of the free area. This is
+ * capped at the boundary of blocks.
+ *
+ * A chunk update is triggered if a page becomes free, a block becomes free,
+ * or the free spans across blocks. This tradeoff is to minimize iterating
+ * over the block metadata to update chunk_md->contig_hint.
+ * chunk_md->contig_hint may be off by up to a page, but it will never be more
+ * than the available space. If the contig hint is contained in one block, it
+ * will be accurate.
+ */
+static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
+ int bits)
+{
+ int nr_empty_pages = 0;
+ struct pcpu_block_md *s_block, *e_block, *block;
+ int s_index, e_index; /* block indexes of the freed allocation */
+ int s_off, e_off; /* block offsets of the freed allocation */
+ int start, end; /* start and end of the whole free area */
+
+ /*
+ * Calculate per block offsets.
+ * The calculation uses an inclusive range, but the resulting offsets
+ * are [start, end). e_index always points to the last block in the
+ * range.
+ */
+ s_index = pcpu_off_to_block_index(bit_off);
+ e_index = pcpu_off_to_block_index(bit_off + bits - 1);
+ s_off = pcpu_off_to_block_off(bit_off);
+ e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
+
+ s_block = chunk->md_blocks + s_index;
+ e_block = chunk->md_blocks + e_index;
+
+ /*
+ * Check if the freed area aligns with the block->contig_hint.
+ * If it does, then the scan to find the beginning/end of the
+ * larger free area can be avoided.
+ *
+ * start and end refer to beginning and end of the free area
+ * within each their respective blocks. This is not necessarily
+ * the entire free area as it may span blocks past the beginning
+ * or end of the block.
+ */
+ start = s_off;
+ if (s_off == s_block->contig_hint + s_block->contig_hint_start) {
+ start = s_block->contig_hint_start;
+ } else {
+ /*
+ * Scan backwards to find the extent of the free area.
+ * find_last_bit returns the starting bit, so if the start bit
+ * is returned, that means there was no last bit and the
+ * remainder of the chunk is free.
+ */
+ int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index),
+ start);
+ start = (start == l_bit) ? 0 : l_bit + 1;
+ }
+
+ end = e_off;
+ if (e_off == e_block->contig_hint_start)
+ end = e_block->contig_hint_start + e_block->contig_hint;
+ else
+ end = find_next_bit(pcpu_index_alloc_map(chunk, e_index),
+ PCPU_BITMAP_BLOCK_BITS, end);
+
+ /* update s_block */
+ e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS;
+ if (!start && e_off == PCPU_BITMAP_BLOCK_BITS)
+ nr_empty_pages++;
+ pcpu_block_update(s_block, start, e_off);
+
+ /* freeing in the same block */
+ if (s_index != e_index) {
+ /* update e_block */
+ if (end == PCPU_BITMAP_BLOCK_BITS)
+ nr_empty_pages++;
+ pcpu_block_update(e_block, 0, end);
+
+ /* reset md_blocks in the middle */
+ nr_empty_pages += (e_index - s_index - 1);
+ for (block = s_block + 1; block < e_block; block++) {
+ block->first_free = 0;
+ block->scan_hint = 0;
+ block->contig_hint_start = 0;
+ block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
+ block->left_free = PCPU_BITMAP_BLOCK_BITS;
+ block->right_free = PCPU_BITMAP_BLOCK_BITS;
+ }
+ }
+
+ if (nr_empty_pages)
+ pcpu_update_empty_pages(chunk, nr_empty_pages);
+
+ /*
+ * Refresh chunk metadata when the free makes a block free or spans
+ * across blocks. The contig_hint may be off by up to a page, but if
+ * the contig_hint is contained in a block, it will be accurate with
+ * the else condition below.
+ */
+ if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index)
+ pcpu_chunk_refresh_hint(chunk, true);
+ else
+ pcpu_block_update(&chunk->chunk_md,
+ pcpu_block_off_to_off(s_index, start),
+ end);
+}
+
+/**
+ * pcpu_is_populated - determines if the region is populated
+ * @chunk: chunk of interest
+ * @bit_off: chunk offset
+ * @bits: size of area
+ * @next_off: return value for the next offset to start searching
+ *
+ * For atomic allocations, check if the backing pages are populated.
+ *
+ * RETURNS:
+ * Bool if the backing pages are populated.
+ * next_index is to skip over unpopulated blocks in pcpu_find_block_fit.
+ */
+static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
+ int *next_off)
+{
+ unsigned int page_start, page_end, rs, re;
+
+ page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
+ page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
+
+ rs = page_start;
+ bitmap_next_clear_region(chunk->populated, &rs, &re, page_end);
+ if (rs >= page_end)
+ return true;
+
+ *next_off = re * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
+ return false;
+}
+
+/**
+ * pcpu_find_block_fit - finds the block index to start searching
+ * @chunk: chunk of interest
+ * @alloc_bits: size of request in allocation units
+ * @align: alignment of area (max PAGE_SIZE bytes)
+ * @pop_only: use populated regions only
+ *
+ * Given a chunk and an allocation spec, find the offset to begin searching
+ * for a free region. This iterates over the bitmap metadata blocks to
+ * find an offset that will be guaranteed to fit the requirements. It is
+ * not quite first fit as if the allocation does not fit in the contig hint
+ * of a block or chunk, it is skipped. This errs on the side of caution
+ * to prevent excess iteration. Poor alignment can cause the allocator to
+ * skip over blocks and chunks that have valid free areas.
+ *
+ * RETURNS:
+ * The offset in the bitmap to begin searching.
+ * -1 if no offset is found.
+ */
+static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
+ size_t align, bool pop_only)
+{
+ struct pcpu_block_md *chunk_md = &chunk->chunk_md;
+ int bit_off, bits, next_off;
+
+ /*
+ * Check to see if the allocation can fit in the chunk's contig hint.
+ * This is an optimization to prevent scanning by assuming if it
+ * cannot fit in the global hint, there is memory pressure and creating
+ * a new chunk would happen soon.
+ */
+ bit_off = ALIGN(chunk_md->contig_hint_start, align) -
+ chunk_md->contig_hint_start;
+ if (bit_off + alloc_bits > chunk_md->contig_hint)
+ return -1;
+
+ bit_off = pcpu_next_hint(chunk_md, alloc_bits);
+ bits = 0;
+ pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) {
+ if (!pop_only || pcpu_is_populated(chunk, bit_off, bits,
+ &next_off))
+ break;
+
+ bit_off = next_off;
+ bits = 0;
+ }
+
+ if (bit_off == pcpu_chunk_map_bits(chunk))
+ return -1;
+
+ return bit_off;
+}
+
+/*
+ * pcpu_find_zero_area - modified from bitmap_find_next_zero_area_off()
+ * @map: the address to base the search on
+ * @size: the bitmap size in bits
+ * @start: the bitnumber to start searching at
+ * @nr: the number of zeroed bits we're looking for
+ * @align_mask: alignment mask for zero area
+ * @largest_off: offset of the largest area skipped
+ * @largest_bits: size of the largest area skipped
+ *
+ * The @align_mask should be one less than a power of 2.
+ *
+ * This is a modified version of bitmap_find_next_zero_area_off() to remember
+ * the largest area that was skipped. This is imperfect, but in general is
+ * good enough. The largest remembered region is the largest failed region
+ * seen. This does not include anything we possibly skipped due to alignment.
+ * pcpu_block_update_scan() does scan backwards to try and recover what was
+ * lost to alignment. While this can cause scanning to miss earlier possible
+ * free areas, smaller allocations will eventually fill those holes.
+ */
+static unsigned long pcpu_find_zero_area(unsigned long *map,
+ unsigned long size,
+ unsigned long start,
+ unsigned long nr,
+ unsigned long align_mask,
+ unsigned long *largest_off,
+ unsigned long *largest_bits)
+{
+ unsigned long index, end, i, area_off, area_bits;
+again:
+ index = find_next_zero_bit(map, size, start);
+
+ /* Align allocation */
+ index = __ALIGN_MASK(index, align_mask);
+ area_off = index;
+
+ end = index + nr;
+ if (end > size)
+ return end;
+ i = find_next_bit(map, end, index);
+ if (i < end) {
+ area_bits = i - area_off;
+ /* remember largest unused area with best alignment */
+ if (area_bits > *largest_bits ||
+ (area_bits == *largest_bits && *largest_off &&
+ (!area_off || __ffs(area_off) > __ffs(*largest_off)))) {
+ *largest_off = area_off;
+ *largest_bits = area_bits;
+ }
+
+ start = i + 1;
+ goto again;
+ }
+ return index;
+}
+
+/**
+ * pcpu_alloc_area - allocates an area from a pcpu_chunk
+ * @chunk: chunk of interest
+ * @alloc_bits: size of request in allocation units
+ * @align: alignment of area (max PAGE_SIZE)
+ * @start: bit_off to start searching
+ *
+ * This function takes in a @start offset to begin searching to fit an
+ * allocation of @alloc_bits with alignment @align. It needs to scan
+ * the allocation map because if it fits within the block's contig hint,
+ * @start will be block->first_free. This is an attempt to fill the
+ * allocation prior to breaking the contig hint. The allocation and
+ * boundary maps are updated accordingly if it confirms a valid
+ * free area.
+ *
+ * RETURNS:
+ * Allocated addr offset in @chunk on success.
+ * -1 if no matching area is found.
+ */
+static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
+ size_t align, int start)
+{
+ struct pcpu_block_md *chunk_md = &chunk->chunk_md;
+ size_t align_mask = (align) ? (align - 1) : 0;
+ unsigned long area_off = 0, area_bits = 0;
+ int bit_off, end, oslot;
+
+ lockdep_assert_held(&pcpu_lock);
+
+ oslot = pcpu_chunk_slot(chunk);
+
+ /*
+ * Search to find a fit.
+ */
+ end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS,
+ pcpu_chunk_map_bits(chunk));
+ bit_off = pcpu_find_zero_area(chunk->alloc_map, end, start, alloc_bits,
+ align_mask, &area_off, &area_bits);
+ if (bit_off >= end)
+ return -1;
+
+ if (area_bits)
+ pcpu_block_update_scan(chunk, area_off, area_bits);
+
+ /* update alloc map */
+ bitmap_set(chunk->alloc_map, bit_off, alloc_bits);
+
+ /* update boundary map */
+ set_bit(bit_off, chunk->bound_map);
+ bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1);
+ set_bit(bit_off + alloc_bits, chunk->bound_map);
+
+ chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE;
+
+ /* update first free bit */
+ if (bit_off == chunk_md->first_free)
+ chunk_md->first_free = find_next_zero_bit(
+ chunk->alloc_map,
+ pcpu_chunk_map_bits(chunk),
+ bit_off + alloc_bits);
+
+ pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits);
+
+ pcpu_chunk_relocate(chunk, oslot);
+
+ return bit_off * PCPU_MIN_ALLOC_SIZE;
+}
+
+/**
+ * pcpu_free_area - frees the corresponding offset
+ * @chunk: chunk of interest
+ * @off: addr offset into chunk
+ *
+ * This function determines the size of an allocation to free using
+ * the boundary bitmap and clears the allocation map.
+ *
+ * RETURNS:
+ * Number of freed bytes.
+ */
+static int pcpu_free_area(struct pcpu_chunk *chunk, int off)
+{
+ struct pcpu_block_md *chunk_md = &chunk->chunk_md;
+ int bit_off, bits, end, oslot, freed;
+
+ lockdep_assert_held(&pcpu_lock);
+ pcpu_stats_area_dealloc(chunk);
+
+ oslot = pcpu_chunk_slot(chunk);
+
+ bit_off = off / PCPU_MIN_ALLOC_SIZE;
+
+ /* find end index */
+ end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
+ bit_off + 1);
+ bits = end - bit_off;
+ bitmap_clear(chunk->alloc_map, bit_off, bits);
+
+ freed = bits * PCPU_MIN_ALLOC_SIZE;
+
+ /* update metadata */
+ chunk->free_bytes += freed;
+
+ /* update first free bit */
+ chunk_md->first_free = min(chunk_md->first_free, bit_off);
+
+ pcpu_block_update_hint_free(chunk, bit_off, bits);
+
+ pcpu_chunk_relocate(chunk, oslot);
+
+ return freed;
+}
+
+static void pcpu_init_md_block(struct pcpu_block_md *block, int nr_bits)
+{
+ block->scan_hint = 0;
+ block->contig_hint = nr_bits;
+ block->left_free = nr_bits;
+ block->right_free = nr_bits;
+ block->first_free = 0;
+ block->nr_bits = nr_bits;
+}
+
+static void pcpu_init_md_blocks(struct pcpu_chunk *chunk)
+{
+ struct pcpu_block_md *md_block;
+
+ /* init the chunk's block */
+ pcpu_init_md_block(&chunk->chunk_md, pcpu_chunk_map_bits(chunk));
+
+ for (md_block = chunk->md_blocks;
+ md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk);
+ md_block++)
+ pcpu_init_md_block(md_block, PCPU_BITMAP_BLOCK_BITS);
+}
+
+/**
+ * pcpu_alloc_first_chunk - creates chunks that serve the first chunk
+ * @tmp_addr: the start of the region served
+ * @map_size: size of the region served
+ *
+ * This is responsible for creating the chunks that serve the first chunk. The
+ * base_addr is page aligned down of @tmp_addr while the region end is page
+ * aligned up. Offsets are kept track of to determine the region served. All
+ * this is done to appease the bitmap allocator in avoiding partial blocks.
+ *
+ * RETURNS:
+ * Chunk serving the region at @tmp_addr of @map_size.
+ */
+static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
+ int map_size)
+{
+ struct pcpu_chunk *chunk;
+ unsigned long aligned_addr, lcm_align;
+ int start_offset, offset_bits, region_size, region_bits;
+ size_t alloc_size;
+
+ /* region calculations */
+ aligned_addr = tmp_addr & PAGE_MASK;
+
+ start_offset = tmp_addr - aligned_addr;
+
+ /*
+ * Align the end of the region with the LCM of PAGE_SIZE and
+ * PCPU_BITMAP_BLOCK_SIZE. One of these constants is a multiple of
+ * the other.
+ */
+ lcm_align = lcm(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE);
+ region_size = ALIGN(start_offset + map_size, lcm_align);
+
+ /* allocate chunk */
+ alloc_size = struct_size(chunk, populated,
+ BITS_TO_LONGS(region_size >> PAGE_SHIFT));
+ chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
+ if (!chunk)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ alloc_size);
+
+ INIT_LIST_HEAD(&chunk->list);
+
+ chunk->base_addr = (void *)aligned_addr;
+ chunk->start_offset = start_offset;
+ chunk->end_offset = region_size - chunk->start_offset - map_size;
+
+ chunk->nr_pages = region_size >> PAGE_SHIFT;
+ region_bits = pcpu_chunk_map_bits(chunk);
+
+ alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]);
+ chunk->alloc_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
+ if (!chunk->alloc_map)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ alloc_size);
+
+ alloc_size =
+ BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]);
+ chunk->bound_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
+ if (!chunk->bound_map)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ alloc_size);
+
+ alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]);
+ chunk->md_blocks = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
+ if (!chunk->md_blocks)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ alloc_size);
+
+#ifdef CONFIG_MEMCG_KMEM
+ /* first chunk isn't memcg-aware */
+ chunk->obj_cgroups = NULL;
+#endif
+ pcpu_init_md_blocks(chunk);
+
+ /* manage populated page bitmap */
+ chunk->immutable = true;
+ bitmap_fill(chunk->populated, chunk->nr_pages);
+ chunk->nr_populated = chunk->nr_pages;
+ chunk->nr_empty_pop_pages = chunk->nr_pages;
+
+ chunk->free_bytes = map_size;
+
+ if (chunk->start_offset) {
+ /* hide the beginning of the bitmap */
+ offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE;
+ bitmap_set(chunk->alloc_map, 0, offset_bits);
+ set_bit(0, chunk->bound_map);
+ set_bit(offset_bits, chunk->bound_map);
+
+ chunk->chunk_md.first_free = offset_bits;
+
+ pcpu_block_update_hint_alloc(chunk, 0, offset_bits);
+ }
+
+ if (chunk->end_offset) {
+ /* hide the end of the bitmap */
+ offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE;
+ bitmap_set(chunk->alloc_map,
+ pcpu_chunk_map_bits(chunk) - offset_bits,
+ offset_bits);
+ set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE,
+ chunk->bound_map);
+ set_bit(region_bits, chunk->bound_map);
+
+ pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk)
+ - offset_bits, offset_bits);
+ }
+
+ return chunk;
+}
+
+static struct pcpu_chunk *pcpu_alloc_chunk(enum pcpu_chunk_type type, gfp_t gfp)
+{
+ struct pcpu_chunk *chunk;
+ int region_bits;
+
+ chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp);
+ if (!chunk)
+ return NULL;
+
+ INIT_LIST_HEAD(&chunk->list);
+ chunk->nr_pages = pcpu_unit_pages;
+ region_bits = pcpu_chunk_map_bits(chunk);
+
+ chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) *
+ sizeof(chunk->alloc_map[0]), gfp);
+ if (!chunk->alloc_map)
+ goto alloc_map_fail;
+
+ chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) *
+ sizeof(chunk->bound_map[0]), gfp);
+ if (!chunk->bound_map)
+ goto bound_map_fail;
+
+ chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) *
+ sizeof(chunk->md_blocks[0]), gfp);
+ if (!chunk->md_blocks)
+ goto md_blocks_fail;
+
+#ifdef CONFIG_MEMCG_KMEM
+ if (pcpu_is_memcg_chunk(type)) {
+ chunk->obj_cgroups =
+ pcpu_mem_zalloc(pcpu_chunk_map_bits(chunk) *
+ sizeof(struct obj_cgroup *), gfp);
+ if (!chunk->obj_cgroups)
+ goto objcg_fail;
+ }
+#endif
+
+ pcpu_init_md_blocks(chunk);
+
+ /* init metadata */
+ chunk->free_bytes = chunk->nr_pages * PAGE_SIZE;
+
+ return chunk;
+
+#ifdef CONFIG_MEMCG_KMEM
+objcg_fail:
+ pcpu_mem_free(chunk->md_blocks);
+#endif
+md_blocks_fail:
+ pcpu_mem_free(chunk->bound_map);
+bound_map_fail:
+ pcpu_mem_free(chunk->alloc_map);
+alloc_map_fail:
+ pcpu_mem_free(chunk);
+
+ return NULL;
+}
+
+static void pcpu_free_chunk(struct pcpu_chunk *chunk)
+{
+ if (!chunk)
+ return;
+#ifdef CONFIG_MEMCG_KMEM
+ pcpu_mem_free(chunk->obj_cgroups);
+#endif
+ pcpu_mem_free(chunk->md_blocks);
+ pcpu_mem_free(chunk->bound_map);
+ pcpu_mem_free(chunk->alloc_map);
+ pcpu_mem_free(chunk);
+}
+
+/**
+ * pcpu_chunk_populated - post-population bookkeeping
+ * @chunk: pcpu_chunk which got populated
+ * @page_start: the start page
+ * @page_end: the end page
+ *
+ * Pages in [@page_start,@page_end) have been populated to @chunk. Update
+ * the bookkeeping information accordingly. Must be called after each
+ * successful population.
+ *
+ * If this is @for_alloc, do not increment pcpu_nr_empty_pop_pages because it
+ * is to serve an allocation in that area.
+ */
+static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
+ int page_end)
+{
+ int nr = page_end - page_start;
+
+ lockdep_assert_held(&pcpu_lock);
+
+ bitmap_set(chunk->populated, page_start, nr);
+ chunk->nr_populated += nr;
+ pcpu_nr_populated += nr;
+
+ pcpu_update_empty_pages(chunk, nr);
+}
+
+/**
+ * pcpu_chunk_depopulated - post-depopulation bookkeeping
+ * @chunk: pcpu_chunk which got depopulated
+ * @page_start: the start page
+ * @page_end: the end page
+ *
+ * Pages in [@page_start,@page_end) have been depopulated from @chunk.
+ * Update the bookkeeping information accordingly. Must be called after
+ * each successful depopulation.
+ */
+static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
+ int page_start, int page_end)
+{
+ int nr = page_end - page_start;
+
+ lockdep_assert_held(&pcpu_lock);
+
+ bitmap_clear(chunk->populated, page_start, nr);
+ chunk->nr_populated -= nr;
+ pcpu_nr_populated -= nr;
+
+ pcpu_update_empty_pages(chunk, -nr);
+}
+
+/*
+ * Chunk management implementation.
+ *
+ * To allow different implementations, chunk alloc/free and
+ * [de]population are implemented in a separate file which is pulled
+ * into this file and compiled together. The following functions
+ * should be implemented.
+ *
+ * pcpu_populate_chunk - populate the specified range of a chunk
+ * pcpu_depopulate_chunk - depopulate the specified range of a chunk
+ * pcpu_create_chunk - create a new chunk
+ * pcpu_destroy_chunk - destroy a chunk, always preceded by full depop
+ * pcpu_addr_to_page - translate address to physical address
+ * pcpu_verify_alloc_info - check alloc_info is acceptable during init
+ */
+static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
+ int page_start, int page_end, gfp_t gfp);
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
+ int page_start, int page_end);
+static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type,
+ gfp_t gfp);
+static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
+static struct page *pcpu_addr_to_page(void *addr);
+static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
+
+#ifdef CONFIG_NEED_PER_CPU_KM
+#include "percpu-km.c"
+#else
+#include "percpu-vm.c"
+#endif
+
+/**
+ * pcpu_chunk_addr_search - determine chunk containing specified address
+ * @addr: address for which the chunk needs to be determined.
+ *
+ * This is an internal function that handles all but static allocations.
+ * Static percpu address values should never be passed into the allocator.
+ *
+ * RETURNS:
+ * The address of the found chunk.
+ */
+static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
+{
+ /* is it in the dynamic region (first chunk)? */
+ if (pcpu_addr_in_chunk(pcpu_first_chunk, addr))
+ return pcpu_first_chunk;
+
+ /* is it in the reserved region? */
+ if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr))
+ return pcpu_reserved_chunk;
+
+ /*
+ * The address is relative to unit0 which might be unused and
+ * thus unmapped. Offset the address to the unit space of the
+ * current processor before looking it up in the vmalloc
+ * space. Note that any possible cpu id can be used here, so
+ * there's no need to worry about preemption or cpu hotplug.
+ */
+ addr += pcpu_unit_offsets[raw_smp_processor_id()];
+ return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
+}
+
+#ifdef CONFIG_MEMCG_KMEM
+static enum pcpu_chunk_type pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
+ struct obj_cgroup **objcgp)
+{
+ struct obj_cgroup *objcg;
+
+ if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT))
+ return PCPU_CHUNK_ROOT;
+
+ objcg = get_obj_cgroup_from_current();
+ if (!objcg)
+ return PCPU_CHUNK_ROOT;
+
+ if (obj_cgroup_charge(objcg, gfp, size * num_possible_cpus())) {
+ obj_cgroup_put(objcg);
+ return PCPU_FAIL_ALLOC;
+ }
+
+ *objcgp = objcg;
+ return PCPU_CHUNK_MEMCG;
+}
+
+static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
+ struct pcpu_chunk *chunk, int off,
+ size_t size)
+{
+ if (!objcg)
+ return;
+
+ if (chunk) {
+ chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg;
+
+ rcu_read_lock();
+ mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
+ size * num_possible_cpus());
+ rcu_read_unlock();
+ } else {
+ obj_cgroup_uncharge(objcg, size * num_possible_cpus());
+ obj_cgroup_put(objcg);
+ }
+}
+
+static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
+{
+ struct obj_cgroup *objcg;
+
+ if (!pcpu_is_memcg_chunk(pcpu_chunk_type(chunk)))
+ return;
+
+ objcg = chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT];
+ chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL;
+
+ obj_cgroup_uncharge(objcg, size * num_possible_cpus());
+
+ rcu_read_lock();
+ mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
+ -(size * num_possible_cpus()));
+ rcu_read_unlock();
+
+ obj_cgroup_put(objcg);
+}
+
+#else /* CONFIG_MEMCG_KMEM */
+static enum pcpu_chunk_type
+pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp)
+{
+ return PCPU_CHUNK_ROOT;
+}
+
+static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
+ struct pcpu_chunk *chunk, int off,
+ size_t size)
+{
+}
+
+static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
+/**
+ * pcpu_alloc - the percpu allocator
+ * @size: size of area to allocate in bytes
+ * @align: alignment of area (max PAGE_SIZE)
+ * @reserved: allocate from the reserved chunk if available
+ * @gfp: allocation flags
+ *
+ * Allocate percpu area of @size bytes aligned at @align. If @gfp doesn't
+ * contain %GFP_KERNEL, the allocation is atomic. If @gfp has __GFP_NOWARN
+ * then no warning will be triggered on invalid or failed allocation
+ * requests.
+ *
+ * RETURNS:
+ * Percpu pointer to the allocated area on success, NULL on failure.
+ */
+static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
+ gfp_t gfp)
+{
+ gfp_t pcpu_gfp;
+ bool is_atomic;
+ bool do_warn;
+ enum pcpu_chunk_type type;
+ struct list_head *pcpu_slot;
+ struct obj_cgroup *objcg = NULL;
+ static int warn_limit = 10;
+ struct pcpu_chunk *chunk, *next;
+ const char *err;
+ int slot, off, cpu, ret;
+ unsigned long flags;
+ void __percpu *ptr;
+ size_t bits, bit_align;
+
+ gfp = current_gfp_context(gfp);
+ /* whitelisted flags that can be passed to the backing allocators */
+ pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
+ is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
+ do_warn = !(gfp & __GFP_NOWARN);
+
+ /*
+ * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE,
+ * therefore alignment must be a minimum of that many bytes.
+ * An allocation may have internal fragmentation from rounding up
+ * of up to PCPU_MIN_ALLOC_SIZE - 1 bytes.
+ */
+ if (unlikely(align < PCPU_MIN_ALLOC_SIZE))
+ align = PCPU_MIN_ALLOC_SIZE;
+
+ size = ALIGN(size, PCPU_MIN_ALLOC_SIZE);
+ bits = size >> PCPU_MIN_ALLOC_SHIFT;
+ bit_align = align >> PCPU_MIN_ALLOC_SHIFT;
+
+ if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
+ !is_power_of_2(align))) {
+ WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation\n",
+ size, align);
+ return NULL;
+ }
+
+ type = pcpu_memcg_pre_alloc_hook(size, gfp, &objcg);
+ if (unlikely(type == PCPU_FAIL_ALLOC))
+ return NULL;
+ pcpu_slot = pcpu_chunk_list(type);
+
+ if (!is_atomic) {
+ /*
+ * pcpu_balance_workfn() allocates memory under this mutex,
+ * and it may wait for memory reclaim. Allow current task
+ * to become OOM victim, in case of memory pressure.
+ */
+ if (gfp & __GFP_NOFAIL) {
+ mutex_lock(&pcpu_alloc_mutex);
+ } else if (mutex_lock_killable(&pcpu_alloc_mutex)) {
+ pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
+ return NULL;
+ }
+ }
+
+ spin_lock_irqsave(&pcpu_lock, flags);
+
+ /* serve reserved allocations from the reserved chunk if available */
+ if (reserved && pcpu_reserved_chunk) {
+ chunk = pcpu_reserved_chunk;
+
+ off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic);
+ if (off < 0) {
+ err = "alloc from reserved chunk failed";
+ goto fail_unlock;
+ }
+
+ off = pcpu_alloc_area(chunk, bits, bit_align, off);
+ if (off >= 0)
+ goto area_found;
+
+ err = "alloc from reserved chunk failed";
+ goto fail_unlock;
+ }
+
+restart:
+ /* search through normal chunks */
+ for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
+ list_for_each_entry_safe(chunk, next, &pcpu_slot[slot], list) {
+ off = pcpu_find_block_fit(chunk, bits, bit_align,
+ is_atomic);
+ if (off < 0) {
+ if (slot < PCPU_SLOT_FAIL_THRESHOLD)
+ pcpu_chunk_move(chunk, 0);
+ continue;
+ }
+
+ off = pcpu_alloc_area(chunk, bits, bit_align, off);
+ if (off >= 0)
+ goto area_found;
+
+ }
+ }
+
+ spin_unlock_irqrestore(&pcpu_lock, flags);
+
+ /*
+ * No space left. Create a new chunk. We don't want multiple
+ * tasks to create chunks simultaneously. Serialize and create iff
+ * there's still no empty chunk after grabbing the mutex.
+ */
+ if (is_atomic) {
+ err = "atomic alloc failed, no space left";
+ goto fail;
+ }
+
+ if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
+ chunk = pcpu_create_chunk(type, pcpu_gfp);
+ if (!chunk) {
+ err = "failed to allocate new chunk";
+ goto fail;
+ }
+
+ spin_lock_irqsave(&pcpu_lock, flags);
+ pcpu_chunk_relocate(chunk, -1);
+ } else {
+ spin_lock_irqsave(&pcpu_lock, flags);
+ }
+
+ goto restart;
+
+area_found:
+ pcpu_stats_area_alloc(chunk, size);
+ spin_unlock_irqrestore(&pcpu_lock, flags);
+
+ /* populate if not all pages are already there */
+ if (!is_atomic) {
+ unsigned int page_start, page_end, rs, re;
+
+ page_start = PFN_DOWN(off);
+ page_end = PFN_UP(off + size);
+
+ bitmap_for_each_clear_region(chunk->populated, rs, re,
+ page_start, page_end) {
+ WARN_ON(chunk->immutable);
+
+ ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp);
+
+ spin_lock_irqsave(&pcpu_lock, flags);
+ if (ret) {
+ pcpu_free_area(chunk, off);
+ err = "failed to populate";
+ goto fail_unlock;
+ }
+ pcpu_chunk_populated(chunk, rs, re);
+ spin_unlock_irqrestore(&pcpu_lock, flags);
+ }
+
+ mutex_unlock(&pcpu_alloc_mutex);
+ }
+
+ if (pcpu_nr_empty_pop_pages[type] < PCPU_EMPTY_POP_PAGES_LOW)
+ pcpu_schedule_balance_work();
+
+ /* clear the areas and return address relative to base address */
+ for_each_possible_cpu(cpu)
+ memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
+
+ ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
+ kmemleak_alloc_percpu(ptr, size, gfp);
+
+ trace_percpu_alloc_percpu(reserved, is_atomic, size, align,
+ chunk->base_addr, off, ptr);
+
+ pcpu_memcg_post_alloc_hook(objcg, chunk, off, size);
+
+ return ptr;
+
+fail_unlock:
+ spin_unlock_irqrestore(&pcpu_lock, flags);
+fail:
+ trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);
+
+ if (!is_atomic && do_warn && warn_limit) {
+ pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
+ size, align, is_atomic, err);
+ dump_stack();
+ if (!--warn_limit)
+ pr_info("limit reached, disable warning\n");
+ }
+ if (is_atomic) {
+ /* see the flag handling in pcpu_blance_workfn() */
+ pcpu_atomic_alloc_failed = true;
+ pcpu_schedule_balance_work();
+ } else {
+ mutex_unlock(&pcpu_alloc_mutex);
+ }
+
+ pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
+
+ return NULL;
+}
+
+/**
+ * __alloc_percpu_gfp - allocate dynamic percpu area
+ * @size: size of area to allocate in bytes
+ * @align: alignment of area (max PAGE_SIZE)
+ * @gfp: allocation flags
+ *
+ * Allocate zero-filled percpu area of @size bytes aligned at @align. If
+ * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can
+ * be called from any context but is a lot more likely to fail. If @gfp
+ * has __GFP_NOWARN then no warning will be triggered on invalid or failed
+ * allocation requests.
+ *
+ * RETURNS:
+ * Percpu pointer to the allocated area on success, NULL on failure.
+ */
+void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp)
+{
+ return pcpu_alloc(size, align, false, gfp);
+}
+EXPORT_SYMBOL_GPL(__alloc_percpu_gfp);
+
+/**
+ * __alloc_percpu - allocate dynamic percpu area
+ * @size: size of area to allocate in bytes
+ * @align: alignment of area (max PAGE_SIZE)
+ *
+ * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL).
+ */
+void __percpu *__alloc_percpu(size_t size, size_t align)
+{
+ return pcpu_alloc(size, align, false, GFP_KERNEL);
+}
+EXPORT_SYMBOL_GPL(__alloc_percpu);
+
+/**
+ * __alloc_reserved_percpu - allocate reserved percpu area
+ * @size: size of area to allocate in bytes
+ * @align: alignment of area (max PAGE_SIZE)
+ *
+ * Allocate zero-filled percpu area of @size bytes aligned at @align
+ * from reserved percpu area if arch has set it up; otherwise,
+ * allocation is served from the same dynamic area. Might sleep.
+ * Might trigger writeouts.
+ *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
+ * RETURNS:
+ * Percpu pointer to the allocated area on success, NULL on failure.
+ */
+void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
+{
+ return pcpu_alloc(size, align, true, GFP_KERNEL);
+}
+
+/**
+ * __pcpu_balance_workfn - manage the amount of free chunks and populated pages
+ * @type: chunk type
+ *
+ * Reclaim all fully free chunks except for the first one. This is also
+ * responsible for maintaining the pool of empty populated pages. However,
+ * it is possible that this is called when physical memory is scarce causing
+ * OOM killer to be triggered. We should avoid doing so until an actual
+ * allocation causes the failure as it is possible that requests can be
+ * serviced from already backed regions.
+ */
+static void __pcpu_balance_workfn(enum pcpu_chunk_type type)
+{
+ /* gfp flags passed to underlying allocators */
+ const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
+ LIST_HEAD(to_free);
+ struct list_head *pcpu_slot = pcpu_chunk_list(type);
+ struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
+ struct pcpu_chunk *chunk, *next;
+ int slot, nr_to_pop, ret;
+
+ /*
+ * There's no reason to keep around multiple unused chunks and VM
+ * areas can be scarce. Destroy all free chunks except for one.
+ */
+ mutex_lock(&pcpu_alloc_mutex);
+ spin_lock_irq(&pcpu_lock);
+
+ list_for_each_entry_safe(chunk, next, free_head, list) {
+ WARN_ON(chunk->immutable);
+
+ /* spare the first one */
+ if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
+ continue;
+
+ list_move(&chunk->list, &to_free);
+ }
+
+ spin_unlock_irq(&pcpu_lock);
+
+ list_for_each_entry_safe(chunk, next, &to_free, list) {
+ unsigned int rs, re;
+
+ bitmap_for_each_set_region(chunk->populated, rs, re, 0,
+ chunk->nr_pages) {
+ pcpu_depopulate_chunk(chunk, rs, re);
+ spin_lock_irq(&pcpu_lock);
+ pcpu_chunk_depopulated(chunk, rs, re);
+ spin_unlock_irq(&pcpu_lock);
+ }
+ pcpu_destroy_chunk(chunk);
+ cond_resched();
+ }
+
+ /*
+ * Ensure there are certain number of free populated pages for
+ * atomic allocs. Fill up from the most packed so that atomic
+ * allocs don't increase fragmentation. If atomic allocation
+ * failed previously, always populate the maximum amount. This
+ * should prevent atomic allocs larger than PAGE_SIZE from keeping
+ * failing indefinitely; however, large atomic allocs are not
+ * something we support properly and can be highly unreliable and
+ * inefficient.
+ */
+retry_pop:
+ if (pcpu_atomic_alloc_failed) {
+ nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
+ /* best effort anyway, don't worry about synchronization */
+ pcpu_atomic_alloc_failed = false;
+ } else {
+ nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
+ pcpu_nr_empty_pop_pages[type],
+ 0, PCPU_EMPTY_POP_PAGES_HIGH);
+ }
+
+ for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
+ unsigned int nr_unpop = 0, rs, re;
+
+ if (!nr_to_pop)
+ break;
+
+ spin_lock_irq(&pcpu_lock);
+ list_for_each_entry(chunk, &pcpu_slot[slot], list) {
+ nr_unpop = chunk->nr_pages - chunk->nr_populated;
+ if (nr_unpop)
+ break;
+ }
+ spin_unlock_irq(&pcpu_lock);
+
+ if (!nr_unpop)
+ continue;
+
+ /* @chunk can't go away while pcpu_alloc_mutex is held */
+ bitmap_for_each_clear_region(chunk->populated, rs, re, 0,
+ chunk->nr_pages) {
+ int nr = min_t(int, re - rs, nr_to_pop);
+
+ ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
+ if (!ret) {
+ nr_to_pop -= nr;
+ spin_lock_irq(&pcpu_lock);
+ pcpu_chunk_populated(chunk, rs, rs + nr);
+ spin_unlock_irq(&pcpu_lock);
+ } else {
+ nr_to_pop = 0;
+ }
+
+ if (!nr_to_pop)
+ break;
+ }
+ }
+
+ if (nr_to_pop) {
+ /* ran out of chunks to populate, create a new one and retry */
+ chunk = pcpu_create_chunk(type, gfp);
+ if (chunk) {
+ spin_lock_irq(&pcpu_lock);
+ pcpu_chunk_relocate(chunk, -1);
+ spin_unlock_irq(&pcpu_lock);
+ goto retry_pop;
+ }
+ }
+
+ mutex_unlock(&pcpu_alloc_mutex);
+}
+
+/**
+ * pcpu_balance_workfn - manage the amount of free chunks and populated pages
+ * @work: unused
+ *
+ * Call __pcpu_balance_workfn() for each chunk type.
+ */
+static void pcpu_balance_workfn(struct work_struct *work)
+{
+ enum pcpu_chunk_type type;
+
+ for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
+ __pcpu_balance_workfn(type);
+}
+
+/**
+ * free_percpu - free percpu area
+ * @ptr: pointer to area to free
+ *
+ * Free percpu area @ptr.
+ *
+ * CONTEXT:
+ * Can be called from atomic context.
+ */
+void free_percpu(void __percpu *ptr)
+{
+ void *addr;
+ struct pcpu_chunk *chunk;
+ unsigned long flags;
+ int size, off;
+ bool need_balance = false;
+ struct list_head *pcpu_slot;
+
+ if (!ptr)
+ return;
+
+ kmemleak_free_percpu(ptr);
+
+ addr = __pcpu_ptr_to_addr(ptr);
+
+ spin_lock_irqsave(&pcpu_lock, flags);
+
+ chunk = pcpu_chunk_addr_search(addr);
+ off = addr - chunk->base_addr;
+
+ size = pcpu_free_area(chunk, off);
+
+ pcpu_slot = pcpu_chunk_list(pcpu_chunk_type(chunk));
+
+ pcpu_memcg_free_hook(chunk, off, size);
+
+ /* if there are more than one fully free chunks, wake up grim reaper */
+ if (chunk->free_bytes == pcpu_unit_size) {
+ struct pcpu_chunk *pos;
+
+ list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
+ if (pos != chunk) {
+ need_balance = true;
+ break;
+ }
+ }
+
+ trace_percpu_free_percpu(chunk->base_addr, off, ptr);
+
+ spin_unlock_irqrestore(&pcpu_lock, flags);
+
+ if (need_balance)
+ pcpu_schedule_balance_work();
+}
+EXPORT_SYMBOL_GPL(free_percpu);
+
+bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
+{
+#ifdef CONFIG_SMP
+ const size_t static_size = __per_cpu_end - __per_cpu_start;
+ void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu) {
+ void *start = per_cpu_ptr(base, cpu);
+ void *va = (void *)addr;
+
+ if (va >= start && va < start + static_size) {
+ if (can_addr) {
+ *can_addr = (unsigned long) (va - start);
+ *can_addr += (unsigned long)
+ per_cpu_ptr(base, get_boot_cpu_id());
+ }
+ return true;
+ }
+ }
+#endif
+ /* on UP, can't distinguish from other static vars, always false */
+ return false;
+}
+
+/**
+ * is_kernel_percpu_address - test whether address is from static percpu area
+ * @addr: address to test
+ *
+ * Test whether @addr belongs to in-kernel static percpu area. Module
+ * static percpu areas are not considered. For those, use
+ * is_module_percpu_address().
+ *
+ * RETURNS:
+ * %true if @addr is from in-kernel static percpu area, %false otherwise.
+ */
+bool is_kernel_percpu_address(unsigned long addr)
+{
+ return __is_kernel_percpu_address(addr, NULL);
+}
+
+/**
+ * per_cpu_ptr_to_phys - convert translated percpu address to physical address
+ * @addr: the address to be converted to physical address
+ *
+ * Given @addr which is dereferenceable address obtained via one of
+ * percpu access macros, this function translates it into its physical
+ * address. The caller is responsible for ensuring @addr stays valid
+ * until this function finishes.
+ *
+ * percpu allocator has special setup for the first chunk, which currently
+ * supports either embedding in linear address space or vmalloc mapping,
+ * and, from the second one, the backing allocator (currently either vm or
+ * km) provides translation.
+ *
+ * The addr can be translated simply without checking if it falls into the
+ * first chunk. But the current code reflects better how percpu allocator
+ * actually works, and the verification can discover both bugs in percpu
+ * allocator itself and per_cpu_ptr_to_phys() callers. So we keep current
+ * code.
+ *
+ * RETURNS:
+ * The physical address for @addr.
+ */
+phys_addr_t per_cpu_ptr_to_phys(void *addr)
+{
+ void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
+ bool in_first_chunk = false;
+ unsigned long first_low, first_high;
+ unsigned int cpu;
+
+ /*
+ * The following test on unit_low/high isn't strictly
+ * necessary but will speed up lookups of addresses which
+ * aren't in the first chunk.
+ *
+ * The address check is against full chunk sizes. pcpu_base_addr
+ * points to the beginning of the first chunk including the
+ * static region. Assumes good intent as the first chunk may
+ * not be full (ie. < pcpu_unit_pages in size).
+ */
+ first_low = (unsigned long)pcpu_base_addr +
+ pcpu_unit_page_offset(pcpu_low_unit_cpu, 0);
+ first_high = (unsigned long)pcpu_base_addr +
+ pcpu_unit_page_offset(pcpu_high_unit_cpu, pcpu_unit_pages);
+ if ((unsigned long)addr >= first_low &&
+ (unsigned long)addr < first_high) {
+ for_each_possible_cpu(cpu) {
+ void *start = per_cpu_ptr(base, cpu);
+
+ if (addr >= start && addr < start + pcpu_unit_size) {
+ in_first_chunk = true;
+ break;
+ }
+ }
+ }
+
+ if (in_first_chunk) {
+ if (!is_vmalloc_addr(addr))
+ return __pa(addr);
+ else
+ return page_to_phys(vmalloc_to_page(addr)) +
+ offset_in_page(addr);
+ } else
+ return page_to_phys(pcpu_addr_to_page(addr)) +
+ offset_in_page(addr);
+}
+
+/**
+ * pcpu_alloc_alloc_info - allocate percpu allocation info
+ * @nr_groups: the number of groups
+ * @nr_units: the number of units
+ *
+ * Allocate ai which is large enough for @nr_groups groups containing
+ * @nr_units units. The returned ai's groups[0].cpu_map points to the
+ * cpu_map array which is long enough for @nr_units and filled with
+ * NR_CPUS. It's the caller's responsibility to initialize cpu_map
+ * pointer of other groups.
+ *
+ * RETURNS:
+ * Pointer to the allocated pcpu_alloc_info on success, NULL on
+ * failure.
+ */
+struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
+ int nr_units)
+{
+ struct pcpu_alloc_info *ai;
+ size_t base_size, ai_size;
+ void *ptr;
+ int unit;
+
+ base_size = ALIGN(struct_size(ai, groups, nr_groups),
+ __alignof__(ai->groups[0].cpu_map[0]));
+ ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
+
+ ptr = memblock_alloc(PFN_ALIGN(ai_size), PAGE_SIZE);
+ if (!ptr)
+ return NULL;
+ ai = ptr;
+ ptr += base_size;
+
+ ai->groups[0].cpu_map = ptr;
+
+ for (unit = 0; unit < nr_units; unit++)
+ ai->groups[0].cpu_map[unit] = NR_CPUS;
+
+ ai->nr_groups = nr_groups;
+ ai->__ai_size = PFN_ALIGN(ai_size);
+
+ return ai;
+}
+
+/**
+ * pcpu_free_alloc_info - free percpu allocation info
+ * @ai: pcpu_alloc_info to free
+ *
+ * Free @ai which was allocated by pcpu_alloc_alloc_info().
+ */
+void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
+{
+ memblock_free_early(__pa(ai), ai->__ai_size);
+}
+
+/**
+ * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
+ * @lvl: loglevel
+ * @ai: allocation info to dump
+ *
+ * Print out information about @ai using loglevel @lvl.
+ */
+static void pcpu_dump_alloc_info(const char *lvl,
+ const struct pcpu_alloc_info *ai)
+{
+ int group_width = 1, cpu_width = 1, width;
+ char empty_str[] = "--------";
+ int alloc = 0, alloc_end = 0;
+ int group, v;
+ int upa, apl; /* units per alloc, allocs per line */
+
+ v = ai->nr_groups;
+ while (v /= 10)
+ group_width++;
+
+ v = num_possible_cpus();
+ while (v /= 10)
+ cpu_width++;
+ empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';
+
+ upa = ai->alloc_size / ai->unit_size;
+ width = upa * (cpu_width + 1) + group_width + 3;
+ apl = rounddown_pow_of_two(max(60 / width, 1));
+
+ printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
+ lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
+ ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);
+
+ for (group = 0; group < ai->nr_groups; group++) {
+ const struct pcpu_group_info *gi = &ai->groups[group];
+ int unit = 0, unit_end = 0;
+
+ BUG_ON(gi->nr_units % upa);
+ for (alloc_end += gi->nr_units / upa;
+ alloc < alloc_end; alloc++) {
+ if (!(alloc % apl)) {
+ pr_cont("\n");
+ printk("%spcpu-alloc: ", lvl);
+ }
+ pr_cont("[%0*d] ", group_width, group);
+
+ for (unit_end += upa; unit < unit_end; unit++)
+ if (gi->cpu_map[unit] != NR_CPUS)
+ pr_cont("%0*d ",
+ cpu_width, gi->cpu_map[unit]);
+ else
+ pr_cont("%s ", empty_str);
+ }
+ }
+ pr_cont("\n");
+}
+
+/**
+ * pcpu_setup_first_chunk - initialize the first percpu chunk
+ * @ai: pcpu_alloc_info describing how to percpu area is shaped
+ * @base_addr: mapped address
+ *
+ * Initialize the first percpu chunk which contains the kernel static
+ * percpu area. This function is to be called from arch percpu area
+ * setup path.
+ *
+ * @ai contains all information necessary to initialize the first
+ * chunk and prime the dynamic percpu allocator.
+ *
+ * @ai->static_size is the size of static percpu area.
+ *
+ * @ai->reserved_size, if non-zero, specifies the amount of bytes to
+ * reserve after the static area in the first chunk. This reserves
+ * the first chunk such that it's available only through reserved
+ * percpu allocation. This is primarily used to serve module percpu
+ * static areas on architectures where the addressing model has
+ * limited offset range for symbol relocations to guarantee module
+ * percpu symbols fall inside the relocatable range.
+ *
+ * @ai->dyn_size determines the number of bytes available for dynamic
+ * allocation in the first chunk. The area between @ai->static_size +
+ * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.
+ *
+ * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE
+ * and equal to or larger than @ai->static_size + @ai->reserved_size +
+ * @ai->dyn_size.
+ *
+ * @ai->atom_size is the allocation atom size and used as alignment
+ * for vm areas.
+ *
+ * @ai->alloc_size is the allocation size and always multiple of
+ * @ai->atom_size. This is larger than @ai->atom_size if
+ * @ai->unit_size is larger than @ai->atom_size.
+ *
+ * @ai->nr_groups and @ai->groups describe virtual memory layout of
+ * percpu areas. Units which should be colocated are put into the
+ * same group. Dynamic VM areas will be allocated according to these
+ * groupings. If @ai->nr_groups is zero, a single group containing
+ * all units is assumed.
+ *
+ * The caller should have mapped the first chunk at @base_addr and
+ * copied static data to each unit.
+ *
+ * The first chunk will always contain a static and a dynamic region.
+ * However, the static region is not managed by any chunk. If the first
+ * chunk also contains a reserved region, it is served by two chunks -
+ * one for the reserved region and one for the dynamic region. They
+ * share the same vm, but use offset regions in the area allocation map.
+ * The chunk serving the dynamic region is circulated in the chunk slots
+ * and available for dynamic allocation like any other chunk.
+ */
+void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
+ void *base_addr)
+{
+ size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
+ size_t static_size, dyn_size;
+ struct pcpu_chunk *chunk;
+ unsigned long *group_offsets;
+ size_t *group_sizes;
+ unsigned long *unit_off;
+ unsigned int cpu;
+ int *unit_map;
+ int group, unit, i;
+ int map_size;
+ unsigned long tmp_addr;
+ size_t alloc_size;
+ enum pcpu_chunk_type type;
+
+#define PCPU_SETUP_BUG_ON(cond) do { \
+ if (unlikely(cond)) { \
+ pr_emerg("failed to initialize, %s\n", #cond); \
+ pr_emerg("cpu_possible_mask=%*pb\n", \
+ cpumask_pr_args(cpu_possible_mask)); \
+ pcpu_dump_alloc_info(KERN_EMERG, ai); \
+ BUG(); \
+ } \
+} while (0)
+
+ /* sanity checks */
+ PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
+#ifdef CONFIG_SMP
+ PCPU_SETUP_BUG_ON(!ai->static_size);
+ PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start));
+#endif
+ PCPU_SETUP_BUG_ON(!base_addr);
+ PCPU_SETUP_BUG_ON(offset_in_page(base_addr));
+ PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
+ PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size));
+ PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
+ PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE));
+ PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
+ PCPU_SETUP_BUG_ON(!ai->dyn_size);
+ PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE));
+ PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) ||
+ IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE)));
+ PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
+
+ /* process group information and build config tables accordingly */
+ alloc_size = ai->nr_groups * sizeof(group_offsets[0]);
+ group_offsets = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
+ if (!group_offsets)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ alloc_size);
+
+ alloc_size = ai->nr_groups * sizeof(group_sizes[0]);
+ group_sizes = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
+ if (!group_sizes)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ alloc_size);
+
+ alloc_size = nr_cpu_ids * sizeof(unit_map[0]);
+ unit_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
+ if (!unit_map)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ alloc_size);
+
+ alloc_size = nr_cpu_ids * sizeof(unit_off[0]);
+ unit_off = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
+ if (!unit_off)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ alloc_size);
+
+ for (cpu = 0; cpu < nr_cpu_ids; cpu++)
+ unit_map[cpu] = UINT_MAX;
+
+ pcpu_low_unit_cpu = NR_CPUS;
+ pcpu_high_unit_cpu = NR_CPUS;
+
+ for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
+ const struct pcpu_group_info *gi = &ai->groups[group];
+
+ group_offsets[group] = gi->base_offset;
+ group_sizes[group] = gi->nr_units * ai->unit_size;
+
+ for (i = 0; i < gi->nr_units; i++) {
+ cpu = gi->cpu_map[i];
+ if (cpu == NR_CPUS)
+ continue;
+
+ PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
+ PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
+ PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);
+
+ unit_map[cpu] = unit + i;
+ unit_off[cpu] = gi->base_offset + i * ai->unit_size;
+
+ /* determine low/high unit_cpu */
+ if (pcpu_low_unit_cpu == NR_CPUS ||
+ unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
+ pcpu_low_unit_cpu = cpu;
+ if (pcpu_high_unit_cpu == NR_CPUS ||
+ unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
+ pcpu_high_unit_cpu = cpu;
+ }
+ }
+ pcpu_nr_units = unit;
+
+ for_each_possible_cpu(cpu)
+ PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);
+
+ /* we're done parsing the input, undefine BUG macro and dump config */
+#undef PCPU_SETUP_BUG_ON
+ pcpu_dump_alloc_info(KERN_DEBUG, ai);
+
+ pcpu_nr_groups = ai->nr_groups;
+ pcpu_group_offsets = group_offsets;
+ pcpu_group_sizes = group_sizes;
+ pcpu_unit_map = unit_map;
+ pcpu_unit_offsets = unit_off;
+
+ /* determine basic parameters */
+ pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
+ pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
+ pcpu_atom_size = ai->atom_size;
+ pcpu_chunk_struct_size = struct_size(chunk, populated,
+ BITS_TO_LONGS(pcpu_unit_pages));
+
+ pcpu_stats_save_ai(ai);
+
+ /*
+ * Allocate chunk slots. The additional last slot is for
+ * empty chunks.
+ */
+ pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
+ pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots *
+ sizeof(pcpu_chunk_lists[0]) *
+ PCPU_NR_CHUNK_TYPES,
+ SMP_CACHE_BYTES);
+ if (!pcpu_chunk_lists)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]) *
+ PCPU_NR_CHUNK_TYPES);
+
+ for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
+ for (i = 0; i < pcpu_nr_slots; i++)
+ INIT_LIST_HEAD(&pcpu_chunk_list(type)[i]);
+
+ /*
+ * The end of the static region needs to be aligned with the
+ * minimum allocation size as this offsets the reserved and
+ * dynamic region. The first chunk ends page aligned by
+ * expanding the dynamic region, therefore the dynamic region
+ * can be shrunk to compensate while still staying above the
+ * configured sizes.
+ */
+ static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE);
+ dyn_size = ai->dyn_size - (static_size - ai->static_size);
+
+ /*
+ * Initialize first chunk.
+ * If the reserved_size is non-zero, this initializes the reserved
+ * chunk. If the reserved_size is zero, the reserved chunk is NULL
+ * and the dynamic region is initialized here. The first chunk,
+ * pcpu_first_chunk, will always point to the chunk that serves
+ * the dynamic region.
+ */
+ tmp_addr = (unsigned long)base_addr + static_size;
+ map_size = ai->reserved_size ?: dyn_size;
+ chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
+
+ /* init dynamic chunk if necessary */
+ if (ai->reserved_size) {
+ pcpu_reserved_chunk = chunk;
+
+ tmp_addr = (unsigned long)base_addr + static_size +
+ ai->reserved_size;
+ map_size = dyn_size;
+ chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
+ }
+
+ /* link the first chunk in */
+ pcpu_first_chunk = chunk;
+ pcpu_nr_empty_pop_pages[PCPU_CHUNK_ROOT] = pcpu_first_chunk->nr_empty_pop_pages;
+ pcpu_chunk_relocate(pcpu_first_chunk, -1);
+
+ /* include all regions of the first chunk */
+ pcpu_nr_populated += PFN_DOWN(size_sum);
+
+ pcpu_stats_chunk_alloc();
+ trace_percpu_create_chunk(base_addr);
+
+ /* we're done */
+ pcpu_base_addr = base_addr;
+}
+
+#ifdef CONFIG_SMP
+
+const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = {
+ [PCPU_FC_AUTO] = "auto",
+ [PCPU_FC_EMBED] = "embed",
+ [PCPU_FC_PAGE] = "page",
+};
+
+enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
+
+static int __init percpu_alloc_setup(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (0)
+ /* nada */;
+#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
+ else if (!strcmp(str, "embed"))
+ pcpu_chosen_fc = PCPU_FC_EMBED;
+#endif
+#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
+ else if (!strcmp(str, "page"))
+ pcpu_chosen_fc = PCPU_FC_PAGE;
+#endif
+ else
+ pr_warn("unknown allocator %s specified\n", str);
+
+ return 0;
+}
+early_param("percpu_alloc", percpu_alloc_setup);
+
+/*
+ * pcpu_embed_first_chunk() is used by the generic percpu setup.
+ * Build it if needed by the arch config or the generic setup is going
+ * to be used.
+ */
+#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
+ !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
+#define BUILD_EMBED_FIRST_CHUNK
+#endif
+
+/* build pcpu_page_first_chunk() iff needed by the arch config */
+#if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
+#define BUILD_PAGE_FIRST_CHUNK
+#endif
+
+/* pcpu_build_alloc_info() is used by both embed and page first chunk */
+#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)
+/**
+ * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
+ * @reserved_size: the size of reserved percpu area in bytes
+ * @dyn_size: minimum free size for dynamic allocation in bytes
+ * @atom_size: allocation atom size
+ * @cpu_distance_fn: callback to determine distance between cpus, optional
+ *
+ * This function determines grouping of units, their mappings to cpus
+ * and other parameters considering needed percpu size, allocation
+ * atom size and distances between CPUs.
+ *
+ * Groups are always multiples of atom size and CPUs which are of
+ * LOCAL_DISTANCE both ways are grouped together and share space for
+ * units in the same group. The returned configuration is guaranteed
+ * to have CPUs on different nodes on different groups and >=75% usage
+ * of allocated virtual address space.
+ *
+ * RETURNS:
+ * On success, pointer to the new allocation_info is returned. On
+ * failure, ERR_PTR value is returned.
+ */
+static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
+ size_t reserved_size, size_t dyn_size,
+ size_t atom_size,
+ pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
+{
+ static int group_map[NR_CPUS] __initdata;
+ static int group_cnt[NR_CPUS] __initdata;
+ const size_t static_size = __per_cpu_end - __per_cpu_start;
+ int nr_groups = 1, nr_units = 0;
+ size_t size_sum, min_unit_size, alloc_size;
+ int upa, max_upa, best_upa; /* units_per_alloc */
+ int last_allocs, group, unit;
+ unsigned int cpu, tcpu;
+ struct pcpu_alloc_info *ai;
+ unsigned int *cpu_map;
+
+ /* this function may be called multiple times */
+ memset(group_map, 0, sizeof(group_map));
+ memset(group_cnt, 0, sizeof(group_cnt));
+
+ /* calculate size_sum and ensure dyn_size is enough for early alloc */
+ size_sum = PFN_ALIGN(static_size + reserved_size +
+ max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
+ dyn_size = size_sum - static_size - reserved_size;
+
+ /*
+ * Determine min_unit_size, alloc_size and max_upa such that
+ * alloc_size is multiple of atom_size and is the smallest
+ * which can accommodate 4k aligned segments which are equal to
+ * or larger than min_unit_size.
+ */
+ min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
+
+ /* determine the maximum # of units that can fit in an allocation */
+ alloc_size = roundup(min_unit_size, atom_size);
+ upa = alloc_size / min_unit_size;
+ while (alloc_size % upa || (offset_in_page(alloc_size / upa)))
+ upa--;
+ max_upa = upa;
+
+ /* group cpus according to their proximity */
+ for_each_possible_cpu(cpu) {
+ group = 0;
+ next_group:
+ for_each_possible_cpu(tcpu) {
+ if (cpu == tcpu)
+ break;
+ if (group_map[tcpu] == group && cpu_distance_fn &&
+ (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
+ cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
+ group++;
+ nr_groups = max(nr_groups, group + 1);
+ goto next_group;
+ }
+ }
+ group_map[cpu] = group;
+ group_cnt[group]++;
+ }
+
+ /*
+ * Wasted space is caused by a ratio imbalance of upa to group_cnt.
+ * Expand the unit_size until we use >= 75% of the units allocated.
+ * Related to atom_size, which could be much larger than the unit_size.
+ */
+ last_allocs = INT_MAX;
+ for (upa = max_upa; upa; upa--) {
+ int allocs = 0, wasted = 0;
+
+ if (alloc_size % upa || (offset_in_page(alloc_size / upa)))
+ continue;
+
+ for (group = 0; group < nr_groups; group++) {
+ int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
+ allocs += this_allocs;
+ wasted += this_allocs * upa - group_cnt[group];
+ }
+
+ /*
+ * Don't accept if wastage is over 1/3. The
+ * greater-than comparison ensures upa==1 always
+ * passes the following check.
+ */
+ if (wasted > num_possible_cpus() / 3)
+ continue;
+
+ /* and then don't consume more memory */
+ if (allocs > last_allocs)
+ break;
+ last_allocs = allocs;
+ best_upa = upa;
+ }
+ upa = best_upa;
+
+ /* allocate and fill alloc_info */
+ for (group = 0; group < nr_groups; group++)
+ nr_units += roundup(group_cnt[group], upa);
+
+ ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
+ if (!ai)
+ return ERR_PTR(-ENOMEM);
+ cpu_map = ai->groups[0].cpu_map;
+
+ for (group = 0; group < nr_groups; group++) {
+ ai->groups[group].cpu_map = cpu_map;
+ cpu_map += roundup(group_cnt[group], upa);
+ }
+
+ ai->static_size = static_size;
+ ai->reserved_size = reserved_size;
+ ai->dyn_size = dyn_size;
+ ai->unit_size = alloc_size / upa;
+ ai->atom_size = atom_size;
+ ai->alloc_size = alloc_size;
+
+ for (group = 0, unit = 0; group < nr_groups; group++) {
+ struct pcpu_group_info *gi = &ai->groups[group];
+
+ /*
+ * Initialize base_offset as if all groups are located
+ * back-to-back. The caller should update this to
+ * reflect actual allocation.
+ */
+ gi->base_offset = unit * ai->unit_size;
+
+ for_each_possible_cpu(cpu)
+ if (group_map[cpu] == group)
+ gi->cpu_map[gi->nr_units++] = cpu;
+ gi->nr_units = roundup(gi->nr_units, upa);
+ unit += gi->nr_units;
+ }
+ BUG_ON(unit != nr_units);
+
+ return ai;
+}
+#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */
+
+#if defined(BUILD_EMBED_FIRST_CHUNK)
+/**
+ * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
+ * @reserved_size: the size of reserved percpu area in bytes
+ * @dyn_size: minimum free size for dynamic allocation in bytes
+ * @atom_size: allocation atom size
+ * @cpu_distance_fn: callback to determine distance between cpus, optional
+ * @alloc_fn: function to allocate percpu page
+ * @free_fn: function to free percpu page
+ *
+ * This is a helper to ease setting up embedded first percpu chunk and
+ * can be called where pcpu_setup_first_chunk() is expected.
+ *
+ * If this function is used to setup the first chunk, it is allocated
+ * by calling @alloc_fn and used as-is without being mapped into
+ * vmalloc area. Allocations are always whole multiples of @atom_size
+ * aligned to @atom_size.
+ *
+ * This enables the first chunk to piggy back on the linear physical
+ * mapping which often uses larger page size. Please note that this
+ * can result in very sparse cpu->unit mapping on NUMA machines thus
+ * requiring large vmalloc address space. Don't use this allocator if
+ * vmalloc space is not orders of magnitude larger than distances
+ * between node memory addresses (ie. 32bit NUMA machines).
+ *
+ * @dyn_size specifies the minimum dynamic area size.
+ *
+ * If the needed size is smaller than the minimum or specified unit
+ * size, the leftover is returned using @free_fn.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
+ size_t atom_size,
+ pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
+ pcpu_fc_alloc_fn_t alloc_fn,
+ pcpu_fc_free_fn_t free_fn)
+{
+ void *base = (void *)ULONG_MAX;
+ void **areas = NULL;
+ struct pcpu_alloc_info *ai;
+ size_t size_sum, areas_size;
+ unsigned long max_distance;
+ int group, i, highest_group, rc = 0;
+
+ ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
+ cpu_distance_fn);
+ if (IS_ERR(ai))
+ return PTR_ERR(ai);
+
+ size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
+ areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
+
+ areas = memblock_alloc(areas_size, SMP_CACHE_BYTES);
+ if (!areas) {
+ rc = -ENOMEM;
+ goto out_free;
+ }
+
+ /* allocate, copy and determine base address & max_distance */
+ highest_group = 0;
+ for (group = 0; group < ai->nr_groups; group++) {
+ struct pcpu_group_info *gi = &ai->groups[group];
+ unsigned int cpu = NR_CPUS;
+ void *ptr;
+
+ for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
+ cpu = gi->cpu_map[i];
+ BUG_ON(cpu == NR_CPUS);
+
+ /* allocate space for the whole group */
+ ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
+ if (!ptr) {
+ rc = -ENOMEM;
+ goto out_free_areas;
+ }
+ /* kmemleak tracks the percpu allocations separately */
+ kmemleak_free(ptr);
+ areas[group] = ptr;
+
+ base = min(ptr, base);
+ if (ptr > areas[highest_group])
+ highest_group = group;
+ }
+ max_distance = areas[highest_group] - base;
+ max_distance += ai->unit_size * ai->groups[highest_group].nr_units;
+
+ /* warn if maximum distance is further than 75% of vmalloc space */
+ if (max_distance > VMALLOC_TOTAL * 3 / 4) {
+ pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n",
+ max_distance, VMALLOC_TOTAL);
+#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
+ /* and fail if we have fallback */
+ rc = -EINVAL;
+ goto out_free_areas;
+#endif
+ }
+
+ /*
+ * Copy data and free unused parts. This should happen after all
+ * allocations are complete; otherwise, we may end up with
+ * overlapping groups.
+ */
+ for (group = 0; group < ai->nr_groups; group++) {
+ struct pcpu_group_info *gi = &ai->groups[group];
+ void *ptr = areas[group];
+
+ for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
+ if (gi->cpu_map[i] == NR_CPUS) {
+ /* unused unit, free whole */
+ free_fn(ptr, ai->unit_size);
+ continue;
+ }
+ /* copy and return the unused part */
+ memcpy(ptr, __per_cpu_load, ai->static_size);
+ free_fn(ptr + size_sum, ai->unit_size - size_sum);
+ }
+ }
+
+ /* base address is now known, determine group base offsets */
+ for (group = 0; group < ai->nr_groups; group++) {
+ ai->groups[group].base_offset = areas[group] - base;
+ }
+
+ pr_info("Embedded %zu pages/cpu s%zu r%zu d%zu u%zu\n",
+ PFN_DOWN(size_sum), ai->static_size, ai->reserved_size,
+ ai->dyn_size, ai->unit_size);
+
+ pcpu_setup_first_chunk(ai, base);
+ goto out_free;
+
+out_free_areas:
+ for (group = 0; group < ai->nr_groups; group++)
+ if (areas[group])
+ free_fn(areas[group],
+ ai->groups[group].nr_units * ai->unit_size);
+out_free:
+ pcpu_free_alloc_info(ai);
+ if (areas)
+ memblock_free_early(__pa(areas), areas_size);
+ return rc;
+}
+#endif /* BUILD_EMBED_FIRST_CHUNK */
+
+#ifdef BUILD_PAGE_FIRST_CHUNK
+/**
+ * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
+ * @reserved_size: the size of reserved percpu area in bytes
+ * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
+ * @free_fn: function to free percpu page, always called with PAGE_SIZE
+ * @populate_pte_fn: function to populate pte
+ *
+ * This is a helper to ease setting up page-remapped first percpu
+ * chunk and can be called where pcpu_setup_first_chunk() is expected.
+ *
+ * This is the basic allocator. Static percpu area is allocated
+ * page-by-page into vmalloc area.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init pcpu_page_first_chunk(size_t reserved_size,
+ pcpu_fc_alloc_fn_t alloc_fn,
+ pcpu_fc_free_fn_t free_fn,
+ pcpu_fc_populate_pte_fn_t populate_pte_fn)
+{
+ static struct vm_struct vm;
+ struct pcpu_alloc_info *ai;
+ char psize_str[16];
+ int unit_pages;
+ size_t pages_size;
+ struct page **pages;
+ int unit, i, j, rc = 0;
+ int upa;
+ int nr_g0_units;
+
+ snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
+
+ ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
+ if (IS_ERR(ai))
+ return PTR_ERR(ai);
+ BUG_ON(ai->nr_groups != 1);
+ upa = ai->alloc_size/ai->unit_size;
+ nr_g0_units = roundup(num_possible_cpus(), upa);
+ if (WARN_ON(ai->groups[0].nr_units != nr_g0_units)) {
+ pcpu_free_alloc_info(ai);
+ return -EINVAL;
+ }
+
+ unit_pages = ai->unit_size >> PAGE_SHIFT;
+
+ /* unaligned allocations can't be freed, round up to page size */
+ pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
+ sizeof(pages[0]));
+ pages = memblock_alloc(pages_size, SMP_CACHE_BYTES);
+ if (!pages)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ pages_size);
+
+ /* allocate pages */
+ j = 0;
+ for (unit = 0; unit < num_possible_cpus(); unit++) {
+ unsigned int cpu = ai->groups[0].cpu_map[unit];
+ for (i = 0; i < unit_pages; i++) {
+ void *ptr;
+
+ ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
+ if (!ptr) {
+ pr_warn("failed to allocate %s page for cpu%u\n",
+ psize_str, cpu);
+ goto enomem;
+ }
+ /* kmemleak tracks the percpu allocations separately */
+ kmemleak_free(ptr);
+ pages[j++] = virt_to_page(ptr);
+ }
+ }
+
+ /* allocate vm area, map the pages and copy static data */
+ vm.flags = VM_ALLOC;
+ vm.size = num_possible_cpus() * ai->unit_size;
+ vm_area_register_early(&vm, PAGE_SIZE);
+
+ for (unit = 0; unit < num_possible_cpus(); unit++) {
+ unsigned long unit_addr =
+ (unsigned long)vm.addr + unit * ai->unit_size;
+
+ for (i = 0; i < unit_pages; i++)
+ populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
+
+ /* pte already populated, the following shouldn't fail */
+ rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
+ unit_pages);
+ if (rc < 0)
+ panic("failed to map percpu area, err=%d\n", rc);
+
+ /*
+ * FIXME: Archs with virtual cache should flush local
+ * cache for the linear mapping here - something
+ * equivalent to flush_cache_vmap() on the local cpu.
+ * flush_cache_vmap() can't be used as most supporting
+ * data structures are not set up yet.
+ */
+
+ /* copy static data */
+ memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
+ }
+
+ /* we're ready, commit */
+ pr_info("%d %s pages/cpu s%zu r%zu d%zu\n",
+ unit_pages, psize_str, ai->static_size,
+ ai->reserved_size, ai->dyn_size);
+
+ pcpu_setup_first_chunk(ai, vm.addr);
+ goto out_free_ar;
+
+enomem:
+ while (--j >= 0)
+ free_fn(page_address(pages[j]), PAGE_SIZE);
+ rc = -ENOMEM;
+out_free_ar:
+ memblock_free_early(__pa(pages), pages_size);
+ pcpu_free_alloc_info(ai);
+ return rc;
+}
+#endif /* BUILD_PAGE_FIRST_CHUNK */
+
+#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
+/*
+ * Generic SMP percpu area setup.
+ *
+ * The embedding helper is used because its behavior closely resembles
+ * the original non-dynamic generic percpu area setup. This is
+ * important because many archs have addressing restrictions and might
+ * fail if the percpu area is located far away from the previous
+ * location. As an added bonus, in non-NUMA cases, embedding is
+ * generally a good idea TLB-wise because percpu area can piggy back
+ * on the physical linear memory mapping which uses large page
+ * mappings on applicable archs.
+ */
+unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
+EXPORT_SYMBOL(__per_cpu_offset);
+
+static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
+ size_t align)
+{
+ return memblock_alloc_from(size, align, __pa(MAX_DMA_ADDRESS));
+}
+
+static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
+{
+ memblock_free_early(__pa(ptr), size);
+}
+
+void __init setup_per_cpu_areas(void)
+{
+ unsigned long delta;
+ unsigned int cpu;
+ int rc;
+
+ /*
+ * Always reserve area for module percpu variables. That's
+ * what the legacy allocator did.
+ */
+ rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
+ PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
+ pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
+ if (rc < 0)
+ panic("Failed to initialize percpu areas.");
+
+ delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
+ for_each_possible_cpu(cpu)
+ __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
+}
+#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
+
+#else /* CONFIG_SMP */
+
+/*
+ * UP percpu area setup.
+ *
+ * UP always uses km-based percpu allocator with identity mapping.
+ * Static percpu variables are indistinguishable from the usual static
+ * variables and don't require any special preparation.
+ */
+void __init setup_per_cpu_areas(void)
+{
+ const size_t unit_size =
+ roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
+ PERCPU_DYNAMIC_RESERVE));
+ struct pcpu_alloc_info *ai;
+ void *fc;
+
+ ai = pcpu_alloc_alloc_info(1, 1);
+ fc = memblock_alloc_from(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+ if (!ai || !fc)
+ panic("Failed to allocate memory for percpu areas.");
+ /* kmemleak tracks the percpu allocations separately */
+ kmemleak_free(fc);
+
+ ai->dyn_size = unit_size;
+ ai->unit_size = unit_size;
+ ai->atom_size = unit_size;
+ ai->alloc_size = unit_size;
+ ai->groups[0].nr_units = 1;
+ ai->groups[0].cpu_map[0] = 0;
+
+ pcpu_setup_first_chunk(ai, fc);
+ pcpu_free_alloc_info(ai);
+}
+
+#endif /* CONFIG_SMP */
+
+/*
+ * pcpu_nr_pages - calculate total number of populated backing pages
+ *
+ * This reflects the number of pages populated to back chunks. Metadata is
+ * excluded in the number exposed in meminfo as the number of backing pages
+ * scales with the number of cpus and can quickly outweigh the memory used for
+ * metadata. It also keeps this calculation nice and simple.
+ *
+ * RETURNS:
+ * Total number of populated backing pages in use by the allocator.
+ */
+unsigned long pcpu_nr_pages(void)
+{
+ return pcpu_nr_populated * pcpu_nr_units;
+}
+
+/*
+ * Percpu allocator is initialized early during boot when neither slab or
+ * workqueue is available. Plug async management until everything is up
+ * and running.
+ */
+static int __init percpu_enable_async(void)
+{
+ pcpu_async_enabled = true;
+ return 0;
+}
+subsys_initcall(percpu_enable_async);
diff --git a/mm/pgalloc-track.h b/mm/pgalloc-track.h
new file mode 100644
index 000000000..1dcc86502
--- /dev/null
+++ b/mm/pgalloc-track.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_PGALLLC_TRACK_H
+#define _LINUX_PGALLLC_TRACK_H
+
+#if defined(CONFIG_MMU)
+static inline p4d_t *p4d_alloc_track(struct mm_struct *mm, pgd_t *pgd,
+ unsigned long address,
+ pgtbl_mod_mask *mod_mask)
+{
+ if (unlikely(pgd_none(*pgd))) {
+ if (__p4d_alloc(mm, pgd, address))
+ return NULL;
+ *mod_mask |= PGTBL_PGD_MODIFIED;
+ }
+
+ return p4d_offset(pgd, address);
+}
+
+static inline pud_t *pud_alloc_track(struct mm_struct *mm, p4d_t *p4d,
+ unsigned long address,
+ pgtbl_mod_mask *mod_mask)
+{
+ if (unlikely(p4d_none(*p4d))) {
+ if (__pud_alloc(mm, p4d, address))
+ return NULL;
+ *mod_mask |= PGTBL_P4D_MODIFIED;
+ }
+
+ return pud_offset(p4d, address);
+}
+
+static inline pmd_t *pmd_alloc_track(struct mm_struct *mm, pud_t *pud,
+ unsigned long address,
+ pgtbl_mod_mask *mod_mask)
+{
+ if (unlikely(pud_none(*pud))) {
+ if (__pmd_alloc(mm, pud, address))
+ return NULL;
+ *mod_mask |= PGTBL_PUD_MODIFIED;
+ }
+
+ return pmd_offset(pud, address);
+}
+#endif /* CONFIG_MMU */
+
+#define pte_alloc_kernel_track(pmd, address, mask) \
+ ((unlikely(pmd_none(*(pmd))) && \
+ (__pte_alloc_kernel(pmd) || ({*(mask)|=PGTBL_PMD_MODIFIED;0;})))?\
+ NULL: pte_offset_kernel(pmd, address))
+
+#endif /* _LINUX_PGALLLC_TRACK_H */
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
new file mode 100644
index 000000000..4e640baf9
--- /dev/null
+++ b/mm/pgtable-generic.c
@@ -0,0 +1,222 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * mm/pgtable-generic.c
+ *
+ * Generic pgtable methods declared in linux/pgtable.h
+ *
+ * Copyright (C) 2010 Linus Torvalds
+ */
+
+#include <linux/pagemap.h>
+#include <linux/hugetlb.h>
+#include <linux/pgtable.h>
+#include <asm/tlb.h>
+
+/*
+ * If a p?d_bad entry is found while walking page tables, report
+ * the error, before resetting entry to p?d_none. Usually (but
+ * very seldom) called out from the p?d_none_or_clear_bad macros.
+ */
+
+void pgd_clear_bad(pgd_t *pgd)
+{
+ pgd_ERROR(*pgd);
+ pgd_clear(pgd);
+}
+
+#ifndef __PAGETABLE_P4D_FOLDED
+void p4d_clear_bad(p4d_t *p4d)
+{
+ p4d_ERROR(*p4d);
+ p4d_clear(p4d);
+}
+#endif
+
+#ifndef __PAGETABLE_PUD_FOLDED
+void pud_clear_bad(pud_t *pud)
+{
+ pud_ERROR(*pud);
+ pud_clear(pud);
+}
+#endif
+
+/*
+ * Note that the pmd variant below can't be stub'ed out just as for p4d/pud
+ * above. pmd folding is special and typically pmd_* macros refer to upper
+ * level even when folded
+ */
+void pmd_clear_bad(pmd_t *pmd)
+{
+ pmd_ERROR(*pmd);
+ pmd_clear(pmd);
+}
+
+#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+/*
+ * Only sets the access flags (dirty, accessed), as well as write
+ * permission. Furthermore, we know it always gets set to a "more
+ * permissive" setting, which allows most architectures to optimize
+ * this. We return whether the PTE actually changed, which in turn
+ * instructs the caller to do things like update__mmu_cache. This
+ * used to be done in the caller, but sparc needs minor faults to
+ * force that call on sun4c so we changed this macro slightly
+ */
+int ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ pte_t entry, int dirty)
+{
+ int changed = !pte_same(*ptep, entry);
+ if (changed) {
+ set_pte_at(vma->vm_mm, address, ptep, entry);
+ flush_tlb_fix_spurious_fault(vma, address);
+ }
+ return changed;
+}
+#endif
+
+#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
+int ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
+{
+ int young;
+ young = ptep_test_and_clear_young(vma, address, ptep);
+ if (young)
+ flush_tlb_page(vma, address);
+ return young;
+}
+#endif
+
+#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
+pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
+ pte_t *ptep)
+{
+ struct mm_struct *mm = (vma)->vm_mm;
+ pte_t pte;
+ pte = ptep_get_and_clear(mm, address, ptep);
+ if (pte_accessible(mm, pte))
+ flush_tlb_page(vma, address);
+ return pte;
+}
+#endif
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
+#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
+int pmdp_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp,
+ pmd_t entry, int dirty)
+{
+ int changed = !pmd_same(*pmdp, entry);
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ if (changed) {
+ set_pmd_at(vma->vm_mm, address, pmdp, entry);
+ flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ }
+ return changed;
+}
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
+int pmdp_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ int young;
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ young = pmdp_test_and_clear_young(vma, address, pmdp);
+ if (young)
+ flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ return young;
+}
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
+pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+ pmd_t pmd;
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
+ !pmd_devmap(*pmdp));
+ pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
+ flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ return pmd;
+}
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
+ pud_t *pudp)
+{
+ pud_t pud;
+
+ VM_BUG_ON(address & ~HPAGE_PUD_MASK);
+ VM_BUG_ON(!pud_trans_huge(*pudp) && !pud_devmap(*pudp));
+ pud = pudp_huge_get_and_clear(vma->vm_mm, address, pudp);
+ flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE);
+ return pud;
+}
+#endif
+#endif
+
+#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
+void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+ pgtable_t pgtable)
+{
+ assert_spin_locked(pmd_lockptr(mm, pmdp));
+
+ /* FIFO */
+ if (!pmd_huge_pte(mm, pmdp))
+ INIT_LIST_HEAD(&pgtable->lru);
+ else
+ list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru);
+ pmd_huge_pte(mm, pmdp) = pgtable;
+}
+#endif
+
+#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
+/* no "address" argument so destroys page coloring of some arch */
+pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
+{
+ pgtable_t pgtable;
+
+ assert_spin_locked(pmd_lockptr(mm, pmdp));
+
+ /* FIFO */
+ pgtable = pmd_huge_pte(mm, pmdp);
+ pmd_huge_pte(mm, pmdp) = list_first_entry_or_null(&pgtable->lru,
+ struct page, lru);
+ if (pmd_huge_pte(mm, pmdp))
+ list_del(&pgtable->lru);
+ return pgtable;
+}
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_INVALIDATE
+pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+ pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
+ flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ return old;
+}
+#endif
+
+#ifndef pmdp_collapse_flush
+pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+ /*
+ * pmd and hugepage pte format are same. So we could
+ * use the same function.
+ */
+ pmd_t pmd;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ VM_BUG_ON(pmd_trans_huge(*pmdp));
+ pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
+
+ /* collapse entails shooting down ptes not pmd */
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ return pmd;
+}
+#endif
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
new file mode 100644
index 000000000..c90d722c6
--- /dev/null
+++ b/mm/process_vm_access.c
@@ -0,0 +1,305 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * linux/mm/process_vm_access.c
+ *
+ * Copyright (C) 2010-2011 Christopher Yeoh <cyeoh@au1.ibm.com>, IBM Corp.
+ */
+
+#include <linux/compat.h>
+#include <linux/mm.h>
+#include <linux/uio.h>
+#include <linux/sched.h>
+#include <linux/compat.h>
+#include <linux/sched/mm.h>
+#include <linux/highmem.h>
+#include <linux/ptrace.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+
+/**
+ * process_vm_rw_pages - read/write pages from task specified
+ * @pages: array of pointers to pages we want to copy
+ * @offset: offset in page to start copying from/to
+ * @len: number of bytes to copy
+ * @iter: where to copy to/from locally
+ * @vm_write: 0 means copy from, 1 means copy to
+ * Returns 0 on success, error code otherwise
+ */
+static int process_vm_rw_pages(struct page **pages,
+ unsigned offset,
+ size_t len,
+ struct iov_iter *iter,
+ int vm_write)
+{
+ /* Do the copy for each page */
+ while (len && iov_iter_count(iter)) {
+ struct page *page = *pages++;
+ size_t copy = PAGE_SIZE - offset;
+ size_t copied;
+
+ if (copy > len)
+ copy = len;
+
+ if (vm_write)
+ copied = copy_page_from_iter(page, offset, copy, iter);
+ else
+ copied = copy_page_to_iter(page, offset, copy, iter);
+
+ len -= copied;
+ if (copied < copy && iov_iter_count(iter))
+ return -EFAULT;
+ offset = 0;
+ }
+ return 0;
+}
+
+/* Maximum number of pages kmalloc'd to hold struct page's during copy */
+#define PVM_MAX_KMALLOC_PAGES (PAGE_SIZE * 2)
+
+/**
+ * process_vm_rw_single_vec - read/write pages from task specified
+ * @addr: start memory address of target process
+ * @len: size of area to copy to/from
+ * @iter: where to copy to/from locally
+ * @process_pages: struct pages area that can store at least
+ * nr_pages_to_copy struct page pointers
+ * @mm: mm for task
+ * @task: task to read/write from
+ * @vm_write: 0 means copy from, 1 means copy to
+ * Returns 0 on success or on failure error code
+ */
+static int process_vm_rw_single_vec(unsigned long addr,
+ unsigned long len,
+ struct iov_iter *iter,
+ struct page **process_pages,
+ struct mm_struct *mm,
+ struct task_struct *task,
+ int vm_write)
+{
+ unsigned long pa = addr & PAGE_MASK;
+ unsigned long start_offset = addr - pa;
+ unsigned long nr_pages;
+ ssize_t rc = 0;
+ unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES
+ / sizeof(struct pages *);
+ unsigned int flags = 0;
+
+ /* Work out address and page range required */
+ if (len == 0)
+ return 0;
+ nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1;
+
+ if (vm_write)
+ flags |= FOLL_WRITE;
+
+ while (!rc && nr_pages && iov_iter_count(iter)) {
+ int pinned_pages = min(nr_pages, max_pages_per_loop);
+ int locked = 1;
+ size_t bytes;
+
+ /*
+ * Get the pages we're interested in. We must
+ * access remotely because task/mm might not
+ * current/current->mm
+ */
+ mmap_read_lock(mm);
+ pinned_pages = pin_user_pages_remote(mm, pa, pinned_pages,
+ flags, process_pages,
+ NULL, &locked);
+ if (locked)
+ mmap_read_unlock(mm);
+ if (pinned_pages <= 0)
+ return -EFAULT;
+
+ bytes = pinned_pages * PAGE_SIZE - start_offset;
+ if (bytes > len)
+ bytes = len;
+
+ rc = process_vm_rw_pages(process_pages,
+ start_offset, bytes, iter,
+ vm_write);
+ len -= bytes;
+ start_offset = 0;
+ nr_pages -= pinned_pages;
+ pa += pinned_pages * PAGE_SIZE;
+
+ /* If vm_write is set, the pages need to be made dirty: */
+ unpin_user_pages_dirty_lock(process_pages, pinned_pages,
+ vm_write);
+ }
+
+ return rc;
+}
+
+/* Maximum number of entries for process pages array
+ which lives on stack */
+#define PVM_MAX_PP_ARRAY_COUNT 16
+
+/**
+ * process_vm_rw_core - core of reading/writing pages from task specified
+ * @pid: PID of process to read/write from/to
+ * @iter: where to copy to/from locally
+ * @rvec: iovec array specifying where to copy to/from in the other process
+ * @riovcnt: size of rvec array
+ * @flags: currently unused
+ * @vm_write: 0 if reading from other process, 1 if writing to other process
+ *
+ * Returns the number of bytes read/written or error code. May
+ * return less bytes than expected if an error occurs during the copying
+ * process.
+ */
+static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
+ const struct iovec *rvec,
+ unsigned long riovcnt,
+ unsigned long flags, int vm_write)
+{
+ struct task_struct *task;
+ struct page *pp_stack[PVM_MAX_PP_ARRAY_COUNT];
+ struct page **process_pages = pp_stack;
+ struct mm_struct *mm;
+ unsigned long i;
+ ssize_t rc = 0;
+ unsigned long nr_pages = 0;
+ unsigned long nr_pages_iov;
+ ssize_t iov_len;
+ size_t total_len = iov_iter_count(iter);
+
+ /*
+ * Work out how many pages of struct pages we're going to need
+ * when eventually calling get_user_pages
+ */
+ for (i = 0; i < riovcnt; i++) {
+ iov_len = rvec[i].iov_len;
+ if (iov_len > 0) {
+ nr_pages_iov = ((unsigned long)rvec[i].iov_base
+ + iov_len)
+ / PAGE_SIZE - (unsigned long)rvec[i].iov_base
+ / PAGE_SIZE + 1;
+ nr_pages = max(nr_pages, nr_pages_iov);
+ }
+ }
+
+ if (nr_pages == 0)
+ return 0;
+
+ if (nr_pages > PVM_MAX_PP_ARRAY_COUNT) {
+ /* For reliability don't try to kmalloc more than
+ 2 pages worth */
+ process_pages = kmalloc(min_t(size_t, PVM_MAX_KMALLOC_PAGES,
+ sizeof(struct pages *)*nr_pages),
+ GFP_KERNEL);
+
+ if (!process_pages)
+ return -ENOMEM;
+ }
+
+ /* Get process information */
+ task = find_get_task_by_vpid(pid);
+ if (!task) {
+ rc = -ESRCH;
+ goto free_proc_pages;
+ }
+
+ mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS);
+ if (!mm || IS_ERR(mm)) {
+ rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
+ /*
+ * Explicitly map EACCES to EPERM as EPERM is a more
+ * appropriate error code for process_vw_readv/writev
+ */
+ if (rc == -EACCES)
+ rc = -EPERM;
+ goto put_task_struct;
+ }
+
+ for (i = 0; i < riovcnt && iov_iter_count(iter) && !rc; i++)
+ rc = process_vm_rw_single_vec(
+ (unsigned long)rvec[i].iov_base, rvec[i].iov_len,
+ iter, process_pages, mm, task, vm_write);
+
+ /* copied = space before - space after */
+ total_len -= iov_iter_count(iter);
+
+ /* If we have managed to copy any data at all then
+ we return the number of bytes copied. Otherwise
+ we return the error code */
+ if (total_len)
+ rc = total_len;
+
+ mmput(mm);
+
+put_task_struct:
+ put_task_struct(task);
+
+free_proc_pages:
+ if (process_pages != pp_stack)
+ kfree(process_pages);
+ return rc;
+}
+
+/**
+ * process_vm_rw - check iovecs before calling core routine
+ * @pid: PID of process to read/write from/to
+ * @lvec: iovec array specifying where to copy to/from locally
+ * @liovcnt: size of lvec array
+ * @rvec: iovec array specifying where to copy to/from in the other process
+ * @riovcnt: size of rvec array
+ * @flags: currently unused
+ * @vm_write: 0 if reading from other process, 1 if writing to other process
+ *
+ * Returns the number of bytes read/written or error code. May
+ * return less bytes than expected if an error occurs during the copying
+ * process.
+ */
+static ssize_t process_vm_rw(pid_t pid,
+ const struct iovec __user *lvec,
+ unsigned long liovcnt,
+ const struct iovec __user *rvec,
+ unsigned long riovcnt,
+ unsigned long flags, int vm_write)
+{
+ struct iovec iovstack_l[UIO_FASTIOV];
+ struct iovec iovstack_r[UIO_FASTIOV];
+ struct iovec *iov_l = iovstack_l;
+ struct iovec *iov_r = iovstack_r;
+ struct iov_iter iter;
+ ssize_t rc;
+ int dir = vm_write ? WRITE : READ;
+
+ if (flags != 0)
+ return -EINVAL;
+
+ /* Check iovecs */
+ rc = import_iovec(dir, lvec, liovcnt, UIO_FASTIOV, &iov_l, &iter);
+ if (rc < 0)
+ return rc;
+ if (!iov_iter_count(&iter))
+ goto free_iov_l;
+ iov_r = iovec_from_user(rvec, riovcnt, UIO_FASTIOV, iovstack_r,
+ in_compat_syscall());
+ if (IS_ERR(iov_r)) {
+ rc = PTR_ERR(iov_r);
+ goto free_iov_l;
+ }
+ rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write);
+ if (iov_r != iovstack_r)
+ kfree(iov_r);
+free_iov_l:
+ kfree(iov_l);
+ return rc;
+}
+
+SYSCALL_DEFINE6(process_vm_readv, pid_t, pid, const struct iovec __user *, lvec,
+ unsigned long, liovcnt, const struct iovec __user *, rvec,
+ unsigned long, riovcnt, unsigned long, flags)
+{
+ return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 0);
+}
+
+SYSCALL_DEFINE6(process_vm_writev, pid_t, pid,
+ const struct iovec __user *, lvec,
+ unsigned long, liovcnt, const struct iovec __user *, rvec,
+ unsigned long, riovcnt, unsigned long, flags)
+{
+ return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1);
+}
diff --git a/mm/ptdump.c b/mm/ptdump.c
new file mode 100644
index 000000000..a917bf55c
--- /dev/null
+++ b/mm/ptdump.c
@@ -0,0 +1,154 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/pagewalk.h>
+#include <linux/ptdump.h>
+#include <linux/kasan.h>
+
+#ifdef CONFIG_KASAN
+/*
+ * This is an optimization for KASAN=y case. Since all kasan page tables
+ * eventually point to the kasan_early_shadow_page we could call note_page()
+ * right away without walking through lower level page tables. This saves
+ * us dozens of seconds (minutes for 5-level config) while checking for
+ * W+X mapping or reading kernel_page_tables debugfs file.
+ */
+static inline int note_kasan_page_table(struct mm_walk *walk,
+ unsigned long addr)
+{
+ struct ptdump_state *st = walk->private;
+
+ st->note_page(st, addr, 4, pte_val(kasan_early_shadow_pte[0]));
+
+ walk->action = ACTION_CONTINUE;
+
+ return 0;
+}
+#endif
+
+static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ struct ptdump_state *st = walk->private;
+ pgd_t val = READ_ONCE(*pgd);
+
+#if CONFIG_PGTABLE_LEVELS > 4 && defined(CONFIG_KASAN)
+ if (pgd_page(val) == virt_to_page(lm_alias(kasan_early_shadow_p4d)))
+ return note_kasan_page_table(walk, addr);
+#endif
+
+ if (st->effective_prot)
+ st->effective_prot(st, 0, pgd_val(val));
+
+ if (pgd_leaf(val))
+ st->note_page(st, addr, 0, pgd_val(val));
+
+ return 0;
+}
+
+static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ struct ptdump_state *st = walk->private;
+ p4d_t val = READ_ONCE(*p4d);
+
+#if CONFIG_PGTABLE_LEVELS > 3 && defined(CONFIG_KASAN)
+ if (p4d_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pud)))
+ return note_kasan_page_table(walk, addr);
+#endif
+
+ if (st->effective_prot)
+ st->effective_prot(st, 1, p4d_val(val));
+
+ if (p4d_leaf(val))
+ st->note_page(st, addr, 1, p4d_val(val));
+
+ return 0;
+}
+
+static int ptdump_pud_entry(pud_t *pud, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ struct ptdump_state *st = walk->private;
+ pud_t val = READ_ONCE(*pud);
+
+#if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_KASAN)
+ if (pud_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pmd)))
+ return note_kasan_page_table(walk, addr);
+#endif
+
+ if (st->effective_prot)
+ st->effective_prot(st, 2, pud_val(val));
+
+ if (pud_leaf(val))
+ st->note_page(st, addr, 2, pud_val(val));
+
+ return 0;
+}
+
+static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ struct ptdump_state *st = walk->private;
+ pmd_t val = READ_ONCE(*pmd);
+
+#if defined(CONFIG_KASAN)
+ if (pmd_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pte)))
+ return note_kasan_page_table(walk, addr);
+#endif
+
+ if (st->effective_prot)
+ st->effective_prot(st, 3, pmd_val(val));
+ if (pmd_leaf(val))
+ st->note_page(st, addr, 3, pmd_val(val));
+
+ return 0;
+}
+
+static int ptdump_pte_entry(pte_t *pte, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ struct ptdump_state *st = walk->private;
+ pte_t val = ptep_get(pte);
+
+ if (st->effective_prot)
+ st->effective_prot(st, 4, pte_val(val));
+
+ st->note_page(st, addr, 4, pte_val(val));
+
+ return 0;
+}
+
+static int ptdump_hole(unsigned long addr, unsigned long next,
+ int depth, struct mm_walk *walk)
+{
+ struct ptdump_state *st = walk->private;
+
+ st->note_page(st, addr, depth, 0);
+
+ return 0;
+}
+
+static const struct mm_walk_ops ptdump_ops = {
+ .pgd_entry = ptdump_pgd_entry,
+ .p4d_entry = ptdump_p4d_entry,
+ .pud_entry = ptdump_pud_entry,
+ .pmd_entry = ptdump_pmd_entry,
+ .pte_entry = ptdump_pte_entry,
+ .pte_hole = ptdump_hole,
+};
+
+void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd)
+{
+ const struct ptdump_range *range = st->range;
+
+ mmap_write_lock(mm);
+ while (range->start != range->end) {
+ walk_page_range_novma(mm, range->start, range->end,
+ &ptdump_ops, pgd, st);
+ range++;
+ }
+ mmap_write_unlock(mm);
+
+ /* Flush out the last page */
+ st->note_page(st, 0, -1, 0);
+}
diff --git a/mm/readahead.c b/mm/readahead.c
new file mode 100644
index 000000000..d30bcf4bc
--- /dev/null
+++ b/mm/readahead.c
@@ -0,0 +1,641 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * mm/readahead.c - address_space-level file readahead.
+ *
+ * Copyright (C) 2002, Linus Torvalds
+ *
+ * 09Apr2002 Andrew Morton
+ * Initial version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/dax.h>
+#include <linux/gfp.h>
+#include <linux/export.h>
+#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/pagevec.h>
+#include <linux/pagemap.h>
+#include <linux/syscalls.h>
+#include <linux/file.h>
+#include <linux/mm_inline.h>
+#include <linux/blk-cgroup.h>
+#include <linux/fadvise.h>
+#include <linux/sched/mm.h>
+
+#include "internal.h"
+
+/*
+ * Initialise a struct file's readahead state. Assumes that the caller has
+ * memset *ra to zero.
+ */
+void
+file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
+{
+ ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages;
+ ra->prev_pos = -1;
+}
+EXPORT_SYMBOL_GPL(file_ra_state_init);
+
+/*
+ * see if a page needs releasing upon read_cache_pages() failure
+ * - the caller of read_cache_pages() may have set PG_private or PG_fscache
+ * before calling, such as the NFS fs marking pages that are cached locally
+ * on disk, thus we need to give the fs a chance to clean up in the event of
+ * an error
+ */
+static void read_cache_pages_invalidate_page(struct address_space *mapping,
+ struct page *page)
+{
+ if (page_has_private(page)) {
+ if (!trylock_page(page))
+ BUG();
+ page->mapping = mapping;
+ do_invalidatepage(page, 0, PAGE_SIZE);
+ page->mapping = NULL;
+ unlock_page(page);
+ }
+ put_page(page);
+}
+
+/*
+ * release a list of pages, invalidating them first if need be
+ */
+static void read_cache_pages_invalidate_pages(struct address_space *mapping,
+ struct list_head *pages)
+{
+ struct page *victim;
+
+ while (!list_empty(pages)) {
+ victim = lru_to_page(pages);
+ list_del(&victim->lru);
+ read_cache_pages_invalidate_page(mapping, victim);
+ }
+}
+
+/**
+ * read_cache_pages - populate an address space with some pages & start reads against them
+ * @mapping: the address_space
+ * @pages: The address of a list_head which contains the target pages. These
+ * pages have their ->index populated and are otherwise uninitialised.
+ * @filler: callback routine for filling a single page.
+ * @data: private data for the callback routine.
+ *
+ * Hides the details of the LRU cache etc from the filesystems.
+ *
+ * Returns: %0 on success, error return by @filler otherwise
+ */
+int read_cache_pages(struct address_space *mapping, struct list_head *pages,
+ int (*filler)(void *, struct page *), void *data)
+{
+ struct page *page;
+ int ret = 0;
+
+ while (!list_empty(pages)) {
+ page = lru_to_page(pages);
+ list_del(&page->lru);
+ if (add_to_page_cache_lru(page, mapping, page->index,
+ readahead_gfp_mask(mapping))) {
+ read_cache_pages_invalidate_page(mapping, page);
+ continue;
+ }
+ put_page(page);
+
+ ret = filler(data, page);
+ if (unlikely(ret)) {
+ read_cache_pages_invalidate_pages(mapping, pages);
+ break;
+ }
+ task_io_account_read(PAGE_SIZE);
+ }
+ return ret;
+}
+
+EXPORT_SYMBOL(read_cache_pages);
+
+static void read_pages(struct readahead_control *rac, struct list_head *pages,
+ bool skip_page)
+{
+ const struct address_space_operations *aops = rac->mapping->a_ops;
+ struct page *page;
+ struct blk_plug plug;
+
+ if (!readahead_count(rac))
+ goto out;
+
+ blk_start_plug(&plug);
+
+ if (aops->readahead) {
+ aops->readahead(rac);
+ /* Clean up the remaining pages */
+ while ((page = readahead_page(rac))) {
+ unlock_page(page);
+ put_page(page);
+ }
+ } else if (aops->readpages) {
+ aops->readpages(rac->file, rac->mapping, pages,
+ readahead_count(rac));
+ /* Clean up the remaining pages */
+ put_pages_list(pages);
+ rac->_index += rac->_nr_pages;
+ rac->_nr_pages = 0;
+ } else {
+ while ((page = readahead_page(rac))) {
+ aops->readpage(rac->file, page);
+ put_page(page);
+ }
+ }
+
+ blk_finish_plug(&plug);
+
+ BUG_ON(!list_empty(pages));
+ BUG_ON(readahead_count(rac));
+
+out:
+ if (skip_page)
+ rac->_index++;
+}
+
+/**
+ * page_cache_ra_unbounded - Start unchecked readahead.
+ * @ractl: Readahead control.
+ * @nr_to_read: The number of pages to read.
+ * @lookahead_size: Where to start the next readahead.
+ *
+ * This function is for filesystems to call when they want to start
+ * readahead beyond a file's stated i_size. This is almost certainly
+ * not the function you want to call. Use page_cache_async_readahead()
+ * or page_cache_sync_readahead() instead.
+ *
+ * Context: File is referenced by caller. Mutexes may be held by caller.
+ * May sleep, but will not reenter filesystem to reclaim memory.
+ */
+void page_cache_ra_unbounded(struct readahead_control *ractl,
+ unsigned long nr_to_read, unsigned long lookahead_size)
+{
+ struct address_space *mapping = ractl->mapping;
+ unsigned long index = readahead_index(ractl);
+ LIST_HEAD(page_pool);
+ gfp_t gfp_mask = readahead_gfp_mask(mapping);
+ unsigned long i;
+
+ /*
+ * Partway through the readahead operation, we will have added
+ * locked pages to the page cache, but will not yet have submitted
+ * them for I/O. Adding another page may need to allocate memory,
+ * which can trigger memory reclaim. Telling the VM we're in
+ * the middle of a filesystem operation will cause it to not
+ * touch file-backed pages, preventing a deadlock. Most (all?)
+ * filesystems already specify __GFP_NOFS in their mapping's
+ * gfp_mask, but let's be explicit here.
+ */
+ unsigned int nofs = memalloc_nofs_save();
+
+ /*
+ * Preallocate as many pages as we will need.
+ */
+ for (i = 0; i < nr_to_read; i++) {
+ struct page *page = xa_load(&mapping->i_pages, index + i);
+
+ BUG_ON(index + i != ractl->_index + ractl->_nr_pages);
+
+ if (page && !xa_is_value(page)) {
+ /*
+ * Page already present? Kick off the current batch
+ * of contiguous pages before continuing with the
+ * next batch. This page may be the one we would
+ * have intended to mark as Readahead, but we don't
+ * have a stable reference to this page, and it's
+ * not worth getting one just for that.
+ */
+ read_pages(ractl, &page_pool, true);
+ continue;
+ }
+
+ page = __page_cache_alloc(gfp_mask);
+ if (!page)
+ break;
+ if (mapping->a_ops->readpages) {
+ page->index = index + i;
+ list_add(&page->lru, &page_pool);
+ } else if (add_to_page_cache_lru(page, mapping, index + i,
+ gfp_mask) < 0) {
+ put_page(page);
+ read_pages(ractl, &page_pool, true);
+ continue;
+ }
+ if (i == nr_to_read - lookahead_size)
+ SetPageReadahead(page);
+ ractl->_nr_pages++;
+ }
+
+ /*
+ * Now start the IO. We ignore I/O errors - if the page is not
+ * uptodate then the caller will launch readpage again, and
+ * will then handle the error.
+ */
+ read_pages(ractl, &page_pool, false);
+ memalloc_nofs_restore(nofs);
+}
+EXPORT_SYMBOL_GPL(page_cache_ra_unbounded);
+
+/*
+ * do_page_cache_ra() actually reads a chunk of disk. It allocates
+ * the pages first, then submits them for I/O. This avoids the very bad
+ * behaviour which would occur if page allocations are causing VM writeback.
+ * We really don't want to intermingle reads and writes like that.
+ */
+void do_page_cache_ra(struct readahead_control *ractl,
+ unsigned long nr_to_read, unsigned long lookahead_size)
+{
+ struct inode *inode = ractl->mapping->host;
+ unsigned long index = readahead_index(ractl);
+ loff_t isize = i_size_read(inode);
+ pgoff_t end_index; /* The last page we want to read */
+
+ if (isize == 0)
+ return;
+
+ end_index = (isize - 1) >> PAGE_SHIFT;
+ if (index > end_index)
+ return;
+ /* Don't read past the page containing the last byte of the file */
+ if (nr_to_read > end_index - index)
+ nr_to_read = end_index - index + 1;
+
+ page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size);
+}
+
+/*
+ * Chunk the readahead into 2 megabyte units, so that we don't pin too much
+ * memory at once.
+ */
+void force_page_cache_ra(struct readahead_control *ractl,
+ struct file_ra_state *ra, unsigned long nr_to_read)
+{
+ struct address_space *mapping = ractl->mapping;
+ struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+ unsigned long max_pages, index;
+
+ if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages &&
+ !mapping->a_ops->readahead))
+ return;
+
+ /*
+ * If the request exceeds the readahead window, allow the read to
+ * be up to the optimal hardware IO size
+ */
+ index = readahead_index(ractl);
+ max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
+ nr_to_read = min_t(unsigned long, nr_to_read, max_pages);
+ while (nr_to_read) {
+ unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE;
+
+ if (this_chunk > nr_to_read)
+ this_chunk = nr_to_read;
+ ractl->_index = index;
+ do_page_cache_ra(ractl, this_chunk, 0);
+
+ index += this_chunk;
+ nr_to_read -= this_chunk;
+ }
+}
+
+/*
+ * Set the initial window size, round to next power of 2 and square
+ * for small size, x 4 for medium, and x 2 for large
+ * for 128k (32 page) max ra
+ * 1-8 page = 32k initial, > 8 page = 128k initial
+ */
+static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
+{
+ unsigned long newsize = roundup_pow_of_two(size);
+
+ if (newsize <= max / 32)
+ newsize = newsize * 4;
+ else if (newsize <= max / 4)
+ newsize = newsize * 2;
+ else
+ newsize = max;
+
+ return newsize;
+}
+
+/*
+ * Get the previous window size, ramp it up, and
+ * return it as the new window size.
+ */
+static unsigned long get_next_ra_size(struct file_ra_state *ra,
+ unsigned long max)
+{
+ unsigned long cur = ra->size;
+
+ if (cur < max / 16)
+ return 4 * cur;
+ if (cur <= max / 2)
+ return 2 * cur;
+ return max;
+}
+
+/*
+ * On-demand readahead design.
+ *
+ * The fields in struct file_ra_state represent the most-recently-executed
+ * readahead attempt:
+ *
+ * |<----- async_size ---------|
+ * |------------------- size -------------------->|
+ * |==================#===========================|
+ * ^start ^page marked with PG_readahead
+ *
+ * To overlap application thinking time and disk I/O time, we do
+ * `readahead pipelining': Do not wait until the application consumed all
+ * readahead pages and stalled on the missing page at readahead_index;
+ * Instead, submit an asynchronous readahead I/O as soon as there are
+ * only async_size pages left in the readahead window. Normally async_size
+ * will be equal to size, for maximum pipelining.
+ *
+ * In interleaved sequential reads, concurrent streams on the same fd can
+ * be invalidating each other's readahead state. So we flag the new readahead
+ * page at (start+size-async_size) with PG_readahead, and use it as readahead
+ * indicator. The flag won't be set on already cached pages, to avoid the
+ * readahead-for-nothing fuss, saving pointless page cache lookups.
+ *
+ * prev_pos tracks the last visited byte in the _previous_ read request.
+ * It should be maintained by the caller, and will be used for detecting
+ * small random reads. Note that the readahead algorithm checks loosely
+ * for sequential patterns. Hence interleaved reads might be served as
+ * sequential ones.
+ *
+ * There is a special-case: if the first page which the application tries to
+ * read happens to be the first page of the file, it is assumed that a linear
+ * read is about to happen and the window is immediately set to the initial size
+ * based on I/O request size and the max_readahead.
+ *
+ * The code ramps up the readahead size aggressively at first, but slow down as
+ * it approaches max_readhead.
+ */
+
+/*
+ * Count contiguously cached pages from @index-1 to @index-@max,
+ * this count is a conservative estimation of
+ * - length of the sequential read sequence, or
+ * - thrashing threshold in memory tight systems
+ */
+static pgoff_t count_history_pages(struct address_space *mapping,
+ pgoff_t index, unsigned long max)
+{
+ pgoff_t head;
+
+ rcu_read_lock();
+ head = page_cache_prev_miss(mapping, index - 1, max);
+ rcu_read_unlock();
+
+ return index - 1 - head;
+}
+
+/*
+ * page cache context based read-ahead
+ */
+static int try_context_readahead(struct address_space *mapping,
+ struct file_ra_state *ra,
+ pgoff_t index,
+ unsigned long req_size,
+ unsigned long max)
+{
+ pgoff_t size;
+
+ size = count_history_pages(mapping, index, max);
+
+ /*
+ * not enough history pages:
+ * it could be a random read
+ */
+ if (size <= req_size)
+ return 0;
+
+ /*
+ * starts from beginning of file:
+ * it is a strong indication of long-run stream (or whole-file-read)
+ */
+ if (size >= index)
+ size *= 2;
+
+ ra->start = index;
+ ra->size = min(size + req_size, max);
+ ra->async_size = 1;
+
+ return 1;
+}
+
+/*
+ * A minimal readahead algorithm for trivial sequential/random reads.
+ */
+static void ondemand_readahead(struct readahead_control *ractl,
+ struct file_ra_state *ra, bool hit_readahead_marker,
+ unsigned long req_size)
+{
+ struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host);
+ unsigned long max_pages = ra->ra_pages;
+ unsigned long add_pages;
+ unsigned long index = readahead_index(ractl);
+ pgoff_t prev_index;
+
+ /*
+ * If the request exceeds the readahead window, allow the read to
+ * be up to the optimal hardware IO size
+ */
+ if (req_size > max_pages && bdi->io_pages > max_pages)
+ max_pages = min(req_size, bdi->io_pages);
+
+ /*
+ * start of file
+ */
+ if (!index)
+ goto initial_readahead;
+
+ /*
+ * It's the expected callback index, assume sequential access.
+ * Ramp up sizes, and push forward the readahead window.
+ */
+ if ((index == (ra->start + ra->size - ra->async_size) ||
+ index == (ra->start + ra->size))) {
+ ra->start += ra->size;
+ ra->size = get_next_ra_size(ra, max_pages);
+ ra->async_size = ra->size;
+ goto readit;
+ }
+
+ /*
+ * Hit a marked page without valid readahead state.
+ * E.g. interleaved reads.
+ * Query the pagecache for async_size, which normally equals to
+ * readahead size. Ramp it up and use it as the new readahead size.
+ */
+ if (hit_readahead_marker) {
+ pgoff_t start;
+
+ rcu_read_lock();
+ start = page_cache_next_miss(ractl->mapping, index + 1,
+ max_pages);
+ rcu_read_unlock();
+
+ if (!start || start - index > max_pages)
+ return;
+
+ ra->start = start;
+ ra->size = start - index; /* old async_size */
+ ra->size += req_size;
+ ra->size = get_next_ra_size(ra, max_pages);
+ ra->async_size = ra->size;
+ goto readit;
+ }
+
+ /*
+ * oversize read
+ */
+ if (req_size > max_pages)
+ goto initial_readahead;
+
+ /*
+ * sequential cache miss
+ * trivial case: (index - prev_index) == 1
+ * unaligned reads: (index - prev_index) == 0
+ */
+ prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
+ if (index - prev_index <= 1UL)
+ goto initial_readahead;
+
+ /*
+ * Query the page cache and look for the traces(cached history pages)
+ * that a sequential stream would leave behind.
+ */
+ if (try_context_readahead(ractl->mapping, ra, index, req_size,
+ max_pages))
+ goto readit;
+
+ /*
+ * standalone, small random read
+ * Read as is, and do not pollute the readahead state.
+ */
+ do_page_cache_ra(ractl, req_size, 0);
+ return;
+
+initial_readahead:
+ ra->start = index;
+ ra->size = get_init_ra_size(req_size, max_pages);
+ ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
+
+readit:
+ /*
+ * Will this read hit the readahead marker made by itself?
+ * If so, trigger the readahead marker hit now, and merge
+ * the resulted next readahead window into the current one.
+ * Take care of maximum IO pages as above.
+ */
+ if (index == ra->start && ra->size == ra->async_size) {
+ add_pages = get_next_ra_size(ra, max_pages);
+ if (ra->size + add_pages <= max_pages) {
+ ra->async_size = add_pages;
+ ra->size += add_pages;
+ } else {
+ ra->size = max_pages;
+ ra->async_size = max_pages >> 1;
+ }
+ }
+
+ ractl->_index = ra->start;
+ do_page_cache_ra(ractl, ra->size, ra->async_size);
+}
+
+void page_cache_sync_ra(struct readahead_control *ractl,
+ struct file_ra_state *ra, unsigned long req_count)
+{
+ bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM);
+
+ /*
+ * Even if read-ahead is disabled, issue this request as read-ahead
+ * as we'll need it to satisfy the requested range. The forced
+ * read-ahead will do the right thing and limit the read to just the
+ * requested range, which we'll set to 1 page for this case.
+ */
+ if (!ra->ra_pages || blk_cgroup_congested()) {
+ if (!ractl->file)
+ return;
+ req_count = 1;
+ do_forced_ra = true;
+ }
+
+ /* be dumb */
+ if (do_forced_ra) {
+ force_page_cache_ra(ractl, ra, req_count);
+ return;
+ }
+
+ /* do read-ahead */
+ ondemand_readahead(ractl, ra, false, req_count);
+}
+EXPORT_SYMBOL_GPL(page_cache_sync_ra);
+
+void page_cache_async_ra(struct readahead_control *ractl,
+ struct file_ra_state *ra, struct page *page,
+ unsigned long req_count)
+{
+ /* no read-ahead */
+ if (!ra->ra_pages)
+ return;
+
+ /*
+ * Same bit is used for PG_readahead and PG_reclaim.
+ */
+ if (PageWriteback(page))
+ return;
+
+ ClearPageReadahead(page);
+
+ /*
+ * Defer asynchronous read-ahead on IO congestion.
+ */
+ if (inode_read_congested(ractl->mapping->host))
+ return;
+
+ if (blk_cgroup_congested())
+ return;
+
+ /* do read-ahead */
+ ondemand_readahead(ractl, ra, true, req_count);
+}
+EXPORT_SYMBOL_GPL(page_cache_async_ra);
+
+ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
+{
+ ssize_t ret;
+ struct fd f;
+
+ ret = -EBADF;
+ f = fdget(fd);
+ if (!f.file || !(f.file->f_mode & FMODE_READ))
+ goto out;
+
+ /*
+ * The readahead() syscall is intended to run only on files
+ * that can execute readahead. If readahead is not possible
+ * on this file, then we must return -EINVAL.
+ */
+ ret = -EINVAL;
+ if (!f.file->f_mapping || !f.file->f_mapping->a_ops ||
+ (!S_ISREG(file_inode(f.file)->i_mode) &&
+ !S_ISBLK(file_inode(f.file)->i_mode)))
+ goto out;
+
+ ret = vfs_fadvise(f.file, offset, count, POSIX_FADV_WILLNEED);
+out:
+ fdput(f);
+ return ret;
+}
+
+SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
+{
+ return ksys_readahead(fd, offset, count);
+}
diff --git a/mm/rmap.c b/mm/rmap.c
new file mode 100644
index 000000000..e6f840be1
--- /dev/null
+++ b/mm/rmap.c
@@ -0,0 +1,2019 @@
+/*
+ * mm/rmap.c - physical to virtual reverse mappings
+ *
+ * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
+ * Released under the General Public License (GPL).
+ *
+ * Simple, low overhead reverse mapping scheme.
+ * Please try to keep this thing as modular as possible.
+ *
+ * Provides methods for unmapping each kind of mapped page:
+ * the anon methods track anonymous pages, and
+ * the file methods track pages belonging to an inode.
+ *
+ * Original design by Rik van Riel <riel@conectiva.com.br> 2001
+ * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
+ * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
+ * Contributions by Hugh Dickins 2003, 2004
+ */
+
+/*
+ * Lock ordering in mm:
+ *
+ * inode->i_mutex (while writing or truncating, not reading or faulting)
+ * mm->mmap_lock
+ * page->flags PG_locked (lock_page) * (see huegtlbfs below)
+ * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
+ * mapping->i_mmap_rwsem
+ * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
+ * anon_vma->rwsem
+ * mm->page_table_lock or pte_lock
+ * pgdat->lru_lock (in mark_page_accessed, isolate_lru_page)
+ * swap_lock (in swap_duplicate, swap_info_get)
+ * mmlist_lock (in mmput, drain_mmlist and others)
+ * mapping->private_lock (in __set_page_dirty_buffers)
+ * mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
+ * i_pages lock (widely used)
+ * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
+ * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
+ * sb_lock (within inode_lock in fs/fs-writeback.c)
+ * i_pages lock (widely used, in set_page_dirty,
+ * in arch-dependent flush_dcache_mmap_lock,
+ * within bdi.wb->list_lock in __sync_single_inode)
+ *
+ * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon)
+ * ->tasklist_lock
+ * pte map lock
+ *
+ * * hugetlbfs PageHuge() pages take locks in this order:
+ * mapping->i_mmap_rwsem
+ * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
+ * page->flags PG_locked (lock_page)
+ */
+
+#include <linux/mm.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/task.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/ksm.h>
+#include <linux/rmap.h>
+#include <linux/rcupdate.h>
+#include <linux/export.h>
+#include <linux/memcontrol.h>
+#include <linux/mmu_notifier.h>
+#include <linux/migrate.h>
+#include <linux/hugetlb.h>
+#include <linux/huge_mm.h>
+#include <linux/backing-dev.h>
+#include <linux/page_idle.h>
+#include <linux/memremap.h>
+#include <linux/userfaultfd_k.h>
+
+#include <asm/tlbflush.h>
+
+#include <trace/events/tlb.h>
+
+#include "internal.h"
+
+static struct kmem_cache *anon_vma_cachep;
+static struct kmem_cache *anon_vma_chain_cachep;
+
+static inline struct anon_vma *anon_vma_alloc(void)
+{
+ struct anon_vma *anon_vma;
+
+ anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
+ if (anon_vma) {
+ atomic_set(&anon_vma->refcount, 1);
+ anon_vma->num_children = 0;
+ anon_vma->num_active_vmas = 0;
+ anon_vma->parent = anon_vma;
+ /*
+ * Initialise the anon_vma root to point to itself. If called
+ * from fork, the root will be reset to the parents anon_vma.
+ */
+ anon_vma->root = anon_vma;
+ }
+
+ return anon_vma;
+}
+
+static inline void anon_vma_free(struct anon_vma *anon_vma)
+{
+ VM_BUG_ON(atomic_read(&anon_vma->refcount));
+
+ /*
+ * Synchronize against page_lock_anon_vma_read() such that
+ * we can safely hold the lock without the anon_vma getting
+ * freed.
+ *
+ * Relies on the full mb implied by the atomic_dec_and_test() from
+ * put_anon_vma() against the acquire barrier implied by
+ * down_read_trylock() from page_lock_anon_vma_read(). This orders:
+ *
+ * page_lock_anon_vma_read() VS put_anon_vma()
+ * down_read_trylock() atomic_dec_and_test()
+ * LOCK MB
+ * atomic_read() rwsem_is_locked()
+ *
+ * LOCK should suffice since the actual taking of the lock must
+ * happen _before_ what follows.
+ */
+ might_sleep();
+ if (rwsem_is_locked(&anon_vma->root->rwsem)) {
+ anon_vma_lock_write(anon_vma);
+ anon_vma_unlock_write(anon_vma);
+ }
+
+ kmem_cache_free(anon_vma_cachep, anon_vma);
+}
+
+static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
+{
+ return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
+}
+
+static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
+{
+ kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
+}
+
+static void anon_vma_chain_link(struct vm_area_struct *vma,
+ struct anon_vma_chain *avc,
+ struct anon_vma *anon_vma)
+{
+ avc->vma = vma;
+ avc->anon_vma = anon_vma;
+ list_add(&avc->same_vma, &vma->anon_vma_chain);
+ anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
+}
+
+/**
+ * __anon_vma_prepare - attach an anon_vma to a memory region
+ * @vma: the memory region in question
+ *
+ * This makes sure the memory mapping described by 'vma' has
+ * an 'anon_vma' attached to it, so that we can associate the
+ * anonymous pages mapped into it with that anon_vma.
+ *
+ * The common case will be that we already have one, which
+ * is handled inline by anon_vma_prepare(). But if
+ * not we either need to find an adjacent mapping that we
+ * can re-use the anon_vma from (very common when the only
+ * reason for splitting a vma has been mprotect()), or we
+ * allocate a new one.
+ *
+ * Anon-vma allocations are very subtle, because we may have
+ * optimistically looked up an anon_vma in page_lock_anon_vma_read()
+ * and that may actually touch the spinlock even in the newly
+ * allocated vma (it depends on RCU to make sure that the
+ * anon_vma isn't actually destroyed).
+ *
+ * As a result, we need to do proper anon_vma locking even
+ * for the new allocation. At the same time, we do not want
+ * to do any locking for the common case of already having
+ * an anon_vma.
+ *
+ * This must be called with the mmap_lock held for reading.
+ */
+int __anon_vma_prepare(struct vm_area_struct *vma)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct anon_vma *anon_vma, *allocated;
+ struct anon_vma_chain *avc;
+
+ might_sleep();
+
+ avc = anon_vma_chain_alloc(GFP_KERNEL);
+ if (!avc)
+ goto out_enomem;
+
+ anon_vma = find_mergeable_anon_vma(vma);
+ allocated = NULL;
+ if (!anon_vma) {
+ anon_vma = anon_vma_alloc();
+ if (unlikely(!anon_vma))
+ goto out_enomem_free_avc;
+ anon_vma->num_children++; /* self-parent link for new root */
+ allocated = anon_vma;
+ }
+
+ anon_vma_lock_write(anon_vma);
+ /* page_table_lock to protect against threads */
+ spin_lock(&mm->page_table_lock);
+ if (likely(!vma->anon_vma)) {
+ vma->anon_vma = anon_vma;
+ anon_vma_chain_link(vma, avc, anon_vma);
+ anon_vma->num_active_vmas++;
+ allocated = NULL;
+ avc = NULL;
+ }
+ spin_unlock(&mm->page_table_lock);
+ anon_vma_unlock_write(anon_vma);
+
+ if (unlikely(allocated))
+ put_anon_vma(allocated);
+ if (unlikely(avc))
+ anon_vma_chain_free(avc);
+
+ return 0;
+
+ out_enomem_free_avc:
+ anon_vma_chain_free(avc);
+ out_enomem:
+ return -ENOMEM;
+}
+
+/*
+ * This is a useful helper function for locking the anon_vma root as
+ * we traverse the vma->anon_vma_chain, looping over anon_vma's that
+ * have the same vma.
+ *
+ * Such anon_vma's should have the same root, so you'd expect to see
+ * just a single mutex_lock for the whole traversal.
+ */
+static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
+{
+ struct anon_vma *new_root = anon_vma->root;
+ if (new_root != root) {
+ if (WARN_ON_ONCE(root))
+ up_write(&root->rwsem);
+ root = new_root;
+ down_write(&root->rwsem);
+ }
+ return root;
+}
+
+static inline void unlock_anon_vma_root(struct anon_vma *root)
+{
+ if (root)
+ up_write(&root->rwsem);
+}
+
+/*
+ * Attach the anon_vmas from src to dst.
+ * Returns 0 on success, -ENOMEM on failure.
+ *
+ * anon_vma_clone() is called by __vma_split(), __split_vma(), copy_vma() and
+ * anon_vma_fork(). The first three want an exact copy of src, while the last
+ * one, anon_vma_fork(), may try to reuse an existing anon_vma to prevent
+ * endless growth of anon_vma. Since dst->anon_vma is set to NULL before call,
+ * we can identify this case by checking (!dst->anon_vma && src->anon_vma).
+ *
+ * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
+ * and reuse existing anon_vma which has no vmas and only one child anon_vma.
+ * This prevents degradation of anon_vma hierarchy to endless linear chain in
+ * case of constantly forking task. On the other hand, an anon_vma with more
+ * than one child isn't reused even if there was no alive vma, thus rmap
+ * walker has a good chance of avoiding scanning the whole hierarchy when it
+ * searches where page is mapped.
+ */
+int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
+{
+ struct anon_vma_chain *avc, *pavc;
+ struct anon_vma *root = NULL;
+
+ list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
+ struct anon_vma *anon_vma;
+
+ avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
+ if (unlikely(!avc)) {
+ unlock_anon_vma_root(root);
+ root = NULL;
+ avc = anon_vma_chain_alloc(GFP_KERNEL);
+ if (!avc)
+ goto enomem_failure;
+ }
+ anon_vma = pavc->anon_vma;
+ root = lock_anon_vma_root(root, anon_vma);
+ anon_vma_chain_link(dst, avc, anon_vma);
+
+ /*
+ * Reuse existing anon_vma if it has no vma and only one
+ * anon_vma child.
+ *
+ * Root anon_vma is never reused:
+ * it has self-parent reference and at least one child.
+ */
+ if (!dst->anon_vma && src->anon_vma &&
+ anon_vma->num_children < 2 &&
+ anon_vma->num_active_vmas == 0)
+ dst->anon_vma = anon_vma;
+ }
+ if (dst->anon_vma)
+ dst->anon_vma->num_active_vmas++;
+ unlock_anon_vma_root(root);
+ return 0;
+
+ enomem_failure:
+ /*
+ * dst->anon_vma is dropped here otherwise its degree can be incorrectly
+ * decremented in unlink_anon_vmas().
+ * We can safely do this because callers of anon_vma_clone() don't care
+ * about dst->anon_vma if anon_vma_clone() failed.
+ */
+ dst->anon_vma = NULL;
+ unlink_anon_vmas(dst);
+ return -ENOMEM;
+}
+
+/*
+ * Attach vma to its own anon_vma, as well as to the anon_vmas that
+ * the corresponding VMA in the parent process is attached to.
+ * Returns 0 on success, non-zero on failure.
+ */
+int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
+{
+ struct anon_vma_chain *avc;
+ struct anon_vma *anon_vma;
+ int error;
+
+ /* Don't bother if the parent process has no anon_vma here. */
+ if (!pvma->anon_vma)
+ return 0;
+
+ /* Drop inherited anon_vma, we'll reuse existing or allocate new. */
+ vma->anon_vma = NULL;
+
+ /*
+ * First, attach the new VMA to the parent VMA's anon_vmas,
+ * so rmap can find non-COWed pages in child processes.
+ */
+ error = anon_vma_clone(vma, pvma);
+ if (error)
+ return error;
+
+ /* An existing anon_vma has been reused, all done then. */
+ if (vma->anon_vma)
+ return 0;
+
+ /* Then add our own anon_vma. */
+ anon_vma = anon_vma_alloc();
+ if (!anon_vma)
+ goto out_error;
+ anon_vma->num_active_vmas++;
+ avc = anon_vma_chain_alloc(GFP_KERNEL);
+ if (!avc)
+ goto out_error_free_anon_vma;
+
+ /*
+ * The root anon_vma's spinlock is the lock actually used when we
+ * lock any of the anon_vmas in this anon_vma tree.
+ */
+ anon_vma->root = pvma->anon_vma->root;
+ anon_vma->parent = pvma->anon_vma;
+ /*
+ * With refcounts, an anon_vma can stay around longer than the
+ * process it belongs to. The root anon_vma needs to be pinned until
+ * this anon_vma is freed, because the lock lives in the root.
+ */
+ get_anon_vma(anon_vma->root);
+ /* Mark this anon_vma as the one where our new (COWed) pages go. */
+ vma->anon_vma = anon_vma;
+ anon_vma_lock_write(anon_vma);
+ anon_vma_chain_link(vma, avc, anon_vma);
+ anon_vma->parent->num_children++;
+ anon_vma_unlock_write(anon_vma);
+
+ return 0;
+
+ out_error_free_anon_vma:
+ put_anon_vma(anon_vma);
+ out_error:
+ unlink_anon_vmas(vma);
+ return -ENOMEM;
+}
+
+void unlink_anon_vmas(struct vm_area_struct *vma)
+{
+ struct anon_vma_chain *avc, *next;
+ struct anon_vma *root = NULL;
+
+ /*
+ * Unlink each anon_vma chained to the VMA. This list is ordered
+ * from newest to oldest, ensuring the root anon_vma gets freed last.
+ */
+ list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
+ struct anon_vma *anon_vma = avc->anon_vma;
+
+ root = lock_anon_vma_root(root, anon_vma);
+ anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
+
+ /*
+ * Leave empty anon_vmas on the list - we'll need
+ * to free them outside the lock.
+ */
+ if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) {
+ anon_vma->parent->num_children--;
+ continue;
+ }
+
+ list_del(&avc->same_vma);
+ anon_vma_chain_free(avc);
+ }
+ if (vma->anon_vma)
+ vma->anon_vma->num_active_vmas--;
+ unlock_anon_vma_root(root);
+
+ /*
+ * Iterate the list once more, it now only contains empty and unlinked
+ * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
+ * needing to write-acquire the anon_vma->root->rwsem.
+ */
+ list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
+ struct anon_vma *anon_vma = avc->anon_vma;
+
+ VM_WARN_ON(anon_vma->num_children);
+ VM_WARN_ON(anon_vma->num_active_vmas);
+ put_anon_vma(anon_vma);
+
+ list_del(&avc->same_vma);
+ anon_vma_chain_free(avc);
+ }
+}
+
+static void anon_vma_ctor(void *data)
+{
+ struct anon_vma *anon_vma = data;
+
+ init_rwsem(&anon_vma->rwsem);
+ atomic_set(&anon_vma->refcount, 0);
+ anon_vma->rb_root = RB_ROOT_CACHED;
+}
+
+void __init anon_vma_init(void)
+{
+ anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
+ 0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT,
+ anon_vma_ctor);
+ anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
+ SLAB_PANIC|SLAB_ACCOUNT);
+}
+
+/*
+ * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
+ *
+ * Since there is no serialization what so ever against page_remove_rmap()
+ * the best this function can do is return a locked anon_vma that might
+ * have been relevant to this page.
+ *
+ * The page might have been remapped to a different anon_vma or the anon_vma
+ * returned may already be freed (and even reused).
+ *
+ * In case it was remapped to a different anon_vma, the new anon_vma will be a
+ * child of the old anon_vma, and the anon_vma lifetime rules will therefore
+ * ensure that any anon_vma obtained from the page will still be valid for as
+ * long as we observe page_mapped() [ hence all those page_mapped() tests ].
+ *
+ * All users of this function must be very careful when walking the anon_vma
+ * chain and verify that the page in question is indeed mapped in it
+ * [ something equivalent to page_mapped_in_vma() ].
+ *
+ * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from
+ * page_remove_rmap() that the anon_vma pointer from page->mapping is valid
+ * if there is a mapcount, we can dereference the anon_vma after observing
+ * those.
+ */
+struct anon_vma *page_get_anon_vma(struct page *page)
+{
+ struct anon_vma *anon_vma = NULL;
+ unsigned long anon_mapping;
+
+ rcu_read_lock();
+ anon_mapping = (unsigned long)READ_ONCE(page->mapping);
+ if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
+ goto out;
+ if (!page_mapped(page))
+ goto out;
+
+ anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
+ if (!atomic_inc_not_zero(&anon_vma->refcount)) {
+ anon_vma = NULL;
+ goto out;
+ }
+
+ /*
+ * If this page is still mapped, then its anon_vma cannot have been
+ * freed. But if it has been unmapped, we have no security against the
+ * anon_vma structure being freed and reused (for another anon_vma:
+ * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero()
+ * above cannot corrupt).
+ */
+ if (!page_mapped(page)) {
+ rcu_read_unlock();
+ put_anon_vma(anon_vma);
+ return NULL;
+ }
+out:
+ rcu_read_unlock();
+
+ return anon_vma;
+}
+
+/*
+ * Similar to page_get_anon_vma() except it locks the anon_vma.
+ *
+ * Its a little more complex as it tries to keep the fast path to a single
+ * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
+ * reference like with page_get_anon_vma() and then block on the mutex.
+ */
+struct anon_vma *page_lock_anon_vma_read(struct page *page)
+{
+ struct anon_vma *anon_vma = NULL;
+ struct anon_vma *root_anon_vma;
+ unsigned long anon_mapping;
+
+ rcu_read_lock();
+ anon_mapping = (unsigned long)READ_ONCE(page->mapping);
+ if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
+ goto out;
+ if (!page_mapped(page))
+ goto out;
+
+ anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
+ root_anon_vma = READ_ONCE(anon_vma->root);
+ if (down_read_trylock(&root_anon_vma->rwsem)) {
+ /*
+ * If the page is still mapped, then this anon_vma is still
+ * its anon_vma, and holding the mutex ensures that it will
+ * not go away, see anon_vma_free().
+ */
+ if (!page_mapped(page)) {
+ up_read(&root_anon_vma->rwsem);
+ anon_vma = NULL;
+ }
+ goto out;
+ }
+
+ /* trylock failed, we got to sleep */
+ if (!atomic_inc_not_zero(&anon_vma->refcount)) {
+ anon_vma = NULL;
+ goto out;
+ }
+
+ if (!page_mapped(page)) {
+ rcu_read_unlock();
+ put_anon_vma(anon_vma);
+ return NULL;
+ }
+
+ /* we pinned the anon_vma, its safe to sleep */
+ rcu_read_unlock();
+ anon_vma_lock_read(anon_vma);
+
+ if (atomic_dec_and_test(&anon_vma->refcount)) {
+ /*
+ * Oops, we held the last refcount, release the lock
+ * and bail -- can't simply use put_anon_vma() because
+ * we'll deadlock on the anon_vma_lock_write() recursion.
+ */
+ anon_vma_unlock_read(anon_vma);
+ __put_anon_vma(anon_vma);
+ anon_vma = NULL;
+ }
+
+ return anon_vma;
+
+out:
+ rcu_read_unlock();
+ return anon_vma;
+}
+
+void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
+{
+ anon_vma_unlock_read(anon_vma);
+}
+
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+/*
+ * Flush TLB entries for recently unmapped pages from remote CPUs. It is
+ * important if a PTE was dirty when it was unmapped that it's flushed
+ * before any IO is initiated on the page to prevent lost writes. Similarly,
+ * it must be flushed before freeing to prevent data leakage.
+ */
+void try_to_unmap_flush(void)
+{
+ struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
+
+ if (!tlb_ubc->flush_required)
+ return;
+
+ arch_tlbbatch_flush(&tlb_ubc->arch);
+ tlb_ubc->flush_required = false;
+ tlb_ubc->writable = false;
+}
+
+/* Flush iff there are potentially writable TLB entries that can race with IO */
+void try_to_unmap_flush_dirty(void)
+{
+ struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
+
+ if (tlb_ubc->writable)
+ try_to_unmap_flush();
+}
+
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
+{
+ struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
+
+ arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
+ tlb_ubc->flush_required = true;
+
+ /*
+ * Ensure compiler does not re-order the setting of tlb_flush_batched
+ * before the PTE is cleared.
+ */
+ barrier();
+ mm->tlb_flush_batched = true;
+
+ /*
+ * If the PTE was dirty then it's best to assume it's writable. The
+ * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
+ * before the page is queued for IO.
+ */
+ if (writable)
+ tlb_ubc->writable = true;
+}
+
+/*
+ * Returns true if the TLB flush should be deferred to the end of a batch of
+ * unmap operations to reduce IPIs.
+ */
+static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
+{
+ bool should_defer = false;
+
+ if (!(flags & TTU_BATCH_FLUSH))
+ return false;
+
+ /* If remote CPUs need to be flushed then defer batch the flush */
+ if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
+ should_defer = true;
+ put_cpu();
+
+ return should_defer;
+}
+
+/*
+ * Reclaim unmaps pages under the PTL but do not flush the TLB prior to
+ * releasing the PTL if TLB flushes are batched. It's possible for a parallel
+ * operation such as mprotect or munmap to race between reclaim unmapping
+ * the page and flushing the page. If this race occurs, it potentially allows
+ * access to data via a stale TLB entry. Tracking all mm's that have TLB
+ * batching in flight would be expensive during reclaim so instead track
+ * whether TLB batching occurred in the past and if so then do a flush here
+ * if required. This will cost one additional flush per reclaim cycle paid
+ * by the first operation at risk such as mprotect and mumap.
+ *
+ * This must be called under the PTL so that an access to tlb_flush_batched
+ * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
+ * via the PTL.
+ */
+void flush_tlb_batched_pending(struct mm_struct *mm)
+{
+ if (data_race(mm->tlb_flush_batched)) {
+ flush_tlb_mm(mm);
+
+ /*
+ * Do not allow the compiler to re-order the clearing of
+ * tlb_flush_batched before the tlb is flushed.
+ */
+ barrier();
+ mm->tlb_flush_batched = false;
+ }
+}
+#else
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
+{
+}
+
+static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
+{
+ return false;
+}
+#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
+
+/*
+ * At what user virtual address is page expected in vma?
+ * Caller should check the page is actually part of the vma.
+ */
+unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
+{
+ if (PageAnon(page)) {
+ struct anon_vma *page__anon_vma = page_anon_vma(page);
+ /*
+ * Note: swapoff's unuse_vma() is more efficient with this
+ * check, and needs it to match anon_vma when KSM is active.
+ */
+ if (!vma->anon_vma || !page__anon_vma ||
+ vma->anon_vma->root != page__anon_vma->root)
+ return -EFAULT;
+ } else if (!vma->vm_file) {
+ return -EFAULT;
+ } else if (vma->vm_file->f_mapping != compound_head(page)->mapping) {
+ return -EFAULT;
+ }
+
+ return vma_address(page, vma);
+}
+
+pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd = NULL;
+ pmd_t pmde;
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ goto out;
+
+ p4d = p4d_offset(pgd, address);
+ if (!p4d_present(*p4d))
+ goto out;
+
+ pud = pud_offset(p4d, address);
+ if (!pud_present(*pud))
+ goto out;
+
+ pmd = pmd_offset(pud, address);
+ /*
+ * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
+ * without holding anon_vma lock for write. So when looking for a
+ * genuine pmde (in which to find pte), test present and !THP together.
+ */
+ pmde = *pmd;
+ barrier();
+ if (!pmd_present(pmde) || pmd_trans_huge(pmde))
+ pmd = NULL;
+out:
+ return pmd;
+}
+
+struct page_referenced_arg {
+ int mapcount;
+ int referenced;
+ unsigned long vm_flags;
+ struct mem_cgroup *memcg;
+};
+/*
+ * arg: page_referenced_arg will be passed
+ */
+static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
+ unsigned long address, void *arg)
+{
+ struct page_referenced_arg *pra = arg;
+ struct page_vma_mapped_walk pvmw = {
+ .page = page,
+ .vma = vma,
+ .address = address,
+ };
+ int referenced = 0;
+
+ while (page_vma_mapped_walk(&pvmw)) {
+ address = pvmw.address;
+
+ if (vma->vm_flags & VM_LOCKED) {
+ page_vma_mapped_walk_done(&pvmw);
+ pra->vm_flags |= VM_LOCKED;
+ return false; /* To break the loop */
+ }
+
+ if (pvmw.pte) {
+ if (ptep_clear_flush_young_notify(vma, address,
+ pvmw.pte)) {
+ /*
+ * Don't treat a reference through
+ * a sequentially read mapping as such.
+ * If the page has been used in another mapping,
+ * we will catch it; if this other mapping is
+ * already gone, the unmap path will have set
+ * PG_referenced or activated the page.
+ */
+ if (likely(!(vma->vm_flags & VM_SEQ_READ)))
+ referenced++;
+ }
+ } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
+ if (pmdp_clear_flush_young_notify(vma, address,
+ pvmw.pmd))
+ referenced++;
+ } else {
+ /* unexpected pmd-mapped page? */
+ WARN_ON_ONCE(1);
+ }
+
+ pra->mapcount--;
+ }
+
+ if (referenced)
+ clear_page_idle(page);
+ if (test_and_clear_page_young(page))
+ referenced++;
+
+ if (referenced) {
+ pra->referenced++;
+ pra->vm_flags |= vma->vm_flags;
+ }
+
+ if (!pra->mapcount)
+ return false; /* To break the loop */
+
+ return true;
+}
+
+static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
+{
+ struct page_referenced_arg *pra = arg;
+ struct mem_cgroup *memcg = pra->memcg;
+
+ if (!mm_match_cgroup(vma->vm_mm, memcg))
+ return true;
+
+ return false;
+}
+
+/**
+ * page_referenced - test if the page was referenced
+ * @page: the page to test
+ * @is_locked: caller holds lock on the page
+ * @memcg: target memory cgroup
+ * @vm_flags: collect encountered vma->vm_flags who actually referenced the page
+ *
+ * Quick test_and_clear_referenced for all mappings to a page,
+ * returns the number of ptes which referenced the page.
+ */
+int page_referenced(struct page *page,
+ int is_locked,
+ struct mem_cgroup *memcg,
+ unsigned long *vm_flags)
+{
+ int we_locked = 0;
+ struct page_referenced_arg pra = {
+ .mapcount = total_mapcount(page),
+ .memcg = memcg,
+ };
+ struct rmap_walk_control rwc = {
+ .rmap_one = page_referenced_one,
+ .arg = (void *)&pra,
+ .anon_lock = page_lock_anon_vma_read,
+ };
+
+ *vm_flags = 0;
+ if (!pra.mapcount)
+ return 0;
+
+ if (!page_rmapping(page))
+ return 0;
+
+ if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
+ we_locked = trylock_page(page);
+ if (!we_locked)
+ return 1;
+ }
+
+ /*
+ * If we are reclaiming on behalf of a cgroup, skip
+ * counting on behalf of references from different
+ * cgroups
+ */
+ if (memcg) {
+ rwc.invalid_vma = invalid_page_referenced_vma;
+ }
+
+ rmap_walk(page, &rwc);
+ *vm_flags = pra.vm_flags;
+
+ if (we_locked)
+ unlock_page(page);
+
+ return pra.referenced;
+}
+
+static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
+ unsigned long address, void *arg)
+{
+ struct page_vma_mapped_walk pvmw = {
+ .page = page,
+ .vma = vma,
+ .address = address,
+ .flags = PVMW_SYNC,
+ };
+ struct mmu_notifier_range range;
+ int *cleaned = arg;
+
+ /*
+ * We have to assume the worse case ie pmd for invalidation. Note that
+ * the page can not be free from this function.
+ */
+ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
+ 0, vma, vma->vm_mm, address,
+ vma_address_end(page, vma));
+ mmu_notifier_invalidate_range_start(&range);
+
+ while (page_vma_mapped_walk(&pvmw)) {
+ int ret = 0;
+
+ address = pvmw.address;
+ if (pvmw.pte) {
+ pte_t entry;
+ pte_t *pte = pvmw.pte;
+
+ if (!pte_dirty(*pte) && !pte_write(*pte))
+ continue;
+
+ flush_cache_page(vma, address, pte_pfn(*pte));
+ entry = ptep_clear_flush(vma, address, pte);
+ entry = pte_wrprotect(entry);
+ entry = pte_mkclean(entry);
+ set_pte_at(vma->vm_mm, address, pte, entry);
+ ret = 1;
+ } else {
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ pmd_t *pmd = pvmw.pmd;
+ pmd_t entry;
+
+ if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
+ continue;
+
+ flush_cache_page(vma, address, page_to_pfn(page));
+ entry = pmdp_invalidate(vma, address, pmd);
+ entry = pmd_wrprotect(entry);
+ entry = pmd_mkclean(entry);
+ set_pmd_at(vma->vm_mm, address, pmd, entry);
+ ret = 1;
+#else
+ /* unexpected pmd-mapped page? */
+ WARN_ON_ONCE(1);
+#endif
+ }
+
+ /*
+ * No need to call mmu_notifier_invalidate_range() as we are
+ * downgrading page table protection not changing it to point
+ * to a new page.
+ *
+ * See Documentation/vm/mmu_notifier.rst
+ */
+ if (ret)
+ (*cleaned)++;
+ }
+
+ mmu_notifier_invalidate_range_end(&range);
+
+ return true;
+}
+
+static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
+{
+ if (vma->vm_flags & VM_SHARED)
+ return false;
+
+ return true;
+}
+
+int page_mkclean(struct page *page)
+{
+ int cleaned = 0;
+ struct address_space *mapping;
+ struct rmap_walk_control rwc = {
+ .arg = (void *)&cleaned,
+ .rmap_one = page_mkclean_one,
+ .invalid_vma = invalid_mkclean_vma,
+ };
+
+ BUG_ON(!PageLocked(page));
+
+ if (!page_mapped(page))
+ return 0;
+
+ mapping = page_mapping(page);
+ if (!mapping)
+ return 0;
+
+ rmap_walk(page, &rwc);
+
+ return cleaned;
+}
+EXPORT_SYMBOL_GPL(page_mkclean);
+
+/**
+ * page_move_anon_rmap - move a page to our anon_vma
+ * @page: the page to move to our anon_vma
+ * @vma: the vma the page belongs to
+ *
+ * When a page belongs exclusively to one process after a COW event,
+ * that page can be moved into the anon_vma that belongs to just that
+ * process, so the rmap code will not search the parent or sibling
+ * processes.
+ */
+void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
+{
+ struct anon_vma *anon_vma = vma->anon_vma;
+
+ page = compound_head(page);
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_VMA(!anon_vma, vma);
+
+ anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
+ /*
+ * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
+ * simultaneously, so a concurrent reader (eg page_referenced()'s
+ * PageAnon()) will not see one without the other.
+ */
+ WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
+}
+
+/**
+ * __page_set_anon_rmap - set up new anonymous rmap
+ * @page: Page or Hugepage to add to rmap
+ * @vma: VM area to add page to.
+ * @address: User virtual address of the mapping
+ * @exclusive: the page is exclusively owned by the current process
+ */
+static void __page_set_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address, int exclusive)
+{
+ struct anon_vma *anon_vma = vma->anon_vma;
+
+ BUG_ON(!anon_vma);
+
+ if (PageAnon(page))
+ return;
+
+ /*
+ * If the page isn't exclusively mapped into this vma,
+ * we must use the _oldest_ possible anon_vma for the
+ * page mapping!
+ */
+ if (!exclusive)
+ anon_vma = anon_vma->root;
+
+ anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
+ page->mapping = (struct address_space *) anon_vma;
+ page->index = linear_page_index(vma, address);
+}
+
+/**
+ * __page_check_anon_rmap - sanity check anonymous rmap addition
+ * @page: the page to add the mapping to
+ * @vma: the vm area in which the mapping is added
+ * @address: the user virtual address mapped
+ */
+static void __page_check_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ /*
+ * The page's anon-rmap details (mapping and index) are guaranteed to
+ * be set up correctly at this point.
+ *
+ * We have exclusion against page_add_anon_rmap because the caller
+ * always holds the page locked, except if called from page_dup_rmap,
+ * in which case the page is already known to be setup.
+ *
+ * We have exclusion against page_add_new_anon_rmap because those pages
+ * are initially only visible via the pagetables, and the pte is locked
+ * over the call to page_add_new_anon_rmap.
+ */
+ VM_BUG_ON_PAGE(page_anon_vma(page)->root != vma->anon_vma->root, page);
+ VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address),
+ page);
+}
+
+/**
+ * page_add_anon_rmap - add pte mapping to an anonymous page
+ * @page: the page to add the mapping to
+ * @vma: the vm area in which the mapping is added
+ * @address: the user virtual address mapped
+ * @compound: charge the page as compound or small page
+ *
+ * The caller needs to hold the pte lock, and the page must be locked in
+ * the anon_vma case: to serialize mapping,index checking after setting,
+ * and to ensure that PageAnon is not being upgraded racily to PageKsm
+ * (but PageKsm is never downgraded to PageAnon).
+ */
+void page_add_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address, bool compound)
+{
+ do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0);
+}
+
+/*
+ * Special version of the above for do_swap_page, which often runs
+ * into pages that are exclusively owned by the current process.
+ * Everybody else should continue to use page_add_anon_rmap above.
+ */
+void do_page_add_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address, int flags)
+{
+ bool compound = flags & RMAP_COMPOUND;
+ bool first;
+
+ if (unlikely(PageKsm(page)))
+ lock_page_memcg(page);
+ else
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+
+ if (compound) {
+ atomic_t *mapcount;
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+ mapcount = compound_mapcount_ptr(page);
+ first = atomic_inc_and_test(mapcount);
+ } else {
+ first = atomic_inc_and_test(&page->_mapcount);
+ }
+
+ if (first) {
+ int nr = compound ? thp_nr_pages(page) : 1;
+ /*
+ * We use the irq-unsafe __{inc|mod}_zone_page_stat because
+ * these counters are not modified in interrupt context, and
+ * pte lock(a spinlock) is held, which implies preemption
+ * disabled.
+ */
+ if (compound)
+ __inc_lruvec_page_state(page, NR_ANON_THPS);
+ __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
+ }
+
+ if (unlikely(PageKsm(page))) {
+ unlock_page_memcg(page);
+ return;
+ }
+
+ /* address might be in next vma when migration races vma_adjust */
+ if (first)
+ __page_set_anon_rmap(page, vma, address,
+ flags & RMAP_EXCLUSIVE);
+ else
+ __page_check_anon_rmap(page, vma, address);
+}
+
+/**
+ * page_add_new_anon_rmap - add pte mapping to a new anonymous page
+ * @page: the page to add the mapping to
+ * @vma: the vm area in which the mapping is added
+ * @address: the user virtual address mapped
+ * @compound: charge the page as compound or small page
+ *
+ * Same as page_add_anon_rmap but must only be called on *new* pages.
+ * This means the inc-and-test can be bypassed.
+ * Page does not have to be locked.
+ */
+void page_add_new_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address, bool compound)
+{
+ int nr = compound ? thp_nr_pages(page) : 1;
+
+ VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
+ __SetPageSwapBacked(page);
+ if (compound) {
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+ /* increment count (starts at -1) */
+ atomic_set(compound_mapcount_ptr(page), 0);
+ if (hpage_pincount_available(page))
+ atomic_set(compound_pincount_ptr(page), 0);
+
+ __inc_lruvec_page_state(page, NR_ANON_THPS);
+ } else {
+ /* Anon THP always mapped first with PMD */
+ VM_BUG_ON_PAGE(PageTransCompound(page), page);
+ /* increment count (starts at -1) */
+ atomic_set(&page->_mapcount, 0);
+ }
+ __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
+ __page_set_anon_rmap(page, vma, address, 1);
+}
+
+/**
+ * page_add_file_rmap - add pte mapping to a file page
+ * @page: the page to add the mapping to
+ * @compound: charge the page as compound or small page
+ *
+ * The caller needs to hold the pte lock.
+ */
+void page_add_file_rmap(struct page *page, bool compound)
+{
+ int i, nr = 1;
+
+ VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
+ lock_page_memcg(page);
+ if (compound && PageTransHuge(page)) {
+ for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
+ if (atomic_inc_and_test(&page[i]._mapcount))
+ nr++;
+ }
+ if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
+ goto out;
+ if (PageSwapBacked(page))
+ __inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
+ else
+ __inc_node_page_state(page, NR_FILE_PMDMAPPED);
+ } else {
+ if (PageTransCompound(page) && page_mapping(page)) {
+ VM_WARN_ON_ONCE(!PageLocked(page));
+
+ SetPageDoubleMap(compound_head(page));
+ if (PageMlocked(page))
+ clear_page_mlock(compound_head(page));
+ }
+ if (!atomic_inc_and_test(&page->_mapcount))
+ goto out;
+ }
+ __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr);
+out:
+ unlock_page_memcg(page);
+}
+
+static void page_remove_file_rmap(struct page *page, bool compound)
+{
+ int i, nr = 1;
+
+ VM_BUG_ON_PAGE(compound && !PageHead(page), page);
+
+ /* Hugepages are not counted in NR_FILE_MAPPED for now. */
+ if (unlikely(PageHuge(page))) {
+ /* hugetlb pages are always mapped with pmds */
+ atomic_dec(compound_mapcount_ptr(page));
+ return;
+ }
+
+ /* page still mapped by someone else? */
+ if (compound && PageTransHuge(page)) {
+ for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
+ if (atomic_add_negative(-1, &page[i]._mapcount))
+ nr++;
+ }
+ if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
+ return;
+ if (PageSwapBacked(page))
+ __dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
+ else
+ __dec_node_page_state(page, NR_FILE_PMDMAPPED);
+ } else {
+ if (!atomic_add_negative(-1, &page->_mapcount))
+ return;
+ }
+
+ /*
+ * We use the irq-unsafe __{inc|mod}_lruvec_page_state because
+ * these counters are not modified in interrupt context, and
+ * pte lock(a spinlock) is held, which implies preemption disabled.
+ */
+ __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr);
+
+ if (unlikely(PageMlocked(page)))
+ clear_page_mlock(page);
+}
+
+static void page_remove_anon_compound_rmap(struct page *page)
+{
+ int i, nr;
+
+ if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
+ return;
+
+ /* Hugepages are not counted in NR_ANON_PAGES for now. */
+ if (unlikely(PageHuge(page)))
+ return;
+
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+ return;
+
+ __dec_lruvec_page_state(page, NR_ANON_THPS);
+
+ if (TestClearPageDoubleMap(page)) {
+ /*
+ * Subpages can be mapped with PTEs too. Check how many of
+ * them are still mapped.
+ */
+ for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
+ if (atomic_add_negative(-1, &page[i]._mapcount))
+ nr++;
+ }
+
+ /*
+ * Queue the page for deferred split if at least one small
+ * page of the compound page is unmapped, but at least one
+ * small page is still mapped.
+ */
+ if (nr && nr < thp_nr_pages(page))
+ deferred_split_huge_page(page);
+ } else {
+ nr = thp_nr_pages(page);
+ }
+
+ if (unlikely(PageMlocked(page)))
+ clear_page_mlock(page);
+
+ if (nr)
+ __mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr);
+}
+
+/**
+ * page_remove_rmap - take down pte mapping from a page
+ * @page: page to remove mapping from
+ * @compound: uncharge the page as compound or small page
+ *
+ * The caller needs to hold the pte lock.
+ */
+void page_remove_rmap(struct page *page, bool compound)
+{
+ lock_page_memcg(page);
+
+ if (!PageAnon(page)) {
+ page_remove_file_rmap(page, compound);
+ goto out;
+ }
+
+ if (compound) {
+ page_remove_anon_compound_rmap(page);
+ goto out;
+ }
+
+ /* page still mapped by someone else? */
+ if (!atomic_add_negative(-1, &page->_mapcount))
+ goto out;
+
+ /*
+ * We use the irq-unsafe __{inc|mod}_zone_page_stat because
+ * these counters are not modified in interrupt context, and
+ * pte lock(a spinlock) is held, which implies preemption disabled.
+ */
+ __dec_lruvec_page_state(page, NR_ANON_MAPPED);
+
+ if (unlikely(PageMlocked(page)))
+ clear_page_mlock(page);
+
+ if (PageTransCompound(page))
+ deferred_split_huge_page(compound_head(page));
+
+ /*
+ * It would be tidy to reset the PageAnon mapping here,
+ * but that might overwrite a racing page_add_anon_rmap
+ * which increments mapcount after us but sets mapping
+ * before us: so leave the reset to free_unref_page,
+ * and remember that it's only reliable while mapped.
+ * Leaving it set also helps swapoff to reinstate ptes
+ * faster for those pages still in swapcache.
+ */
+out:
+ unlock_page_memcg(page);
+}
+
+/*
+ * @arg: enum ttu_flags will be passed to this argument
+ */
+static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
+ unsigned long address, void *arg)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct page_vma_mapped_walk pvmw = {
+ .page = page,
+ .vma = vma,
+ .address = address,
+ };
+ pte_t pteval;
+ struct page *subpage;
+ bool ret = true;
+ struct mmu_notifier_range range;
+ enum ttu_flags flags = (enum ttu_flags)(long)arg;
+
+ /*
+ * When racing against e.g. zap_pte_range() on another cpu,
+ * in between its ptep_get_and_clear_full() and page_remove_rmap(),
+ * try_to_unmap() may return false when it is about to become true,
+ * if page table locking is skipped: use TTU_SYNC to wait for that.
+ */
+ if (flags & TTU_SYNC)
+ pvmw.flags = PVMW_SYNC;
+
+ /* munlock has nothing to gain from examining un-locked vmas */
+ if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
+ return true;
+
+ if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
+ is_zone_device_page(page) && !is_device_private_page(page))
+ return true;
+
+ if (flags & TTU_SPLIT_HUGE_PMD) {
+ split_huge_pmd_address(vma, address,
+ flags & TTU_SPLIT_FREEZE, page);
+ }
+
+ /*
+ * For THP, we have to assume the worse case ie pmd for invalidation.
+ * For hugetlb, it could be much worse if we need to do pud
+ * invalidation in the case of pmd sharing.
+ *
+ * Note that the page can not be free in this function as call of
+ * try_to_unmap() must hold a reference on the page.
+ */
+ range.end = PageKsm(page) ?
+ address + PAGE_SIZE : vma_address_end(page, vma);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ address, range.end);
+ if (PageHuge(page)) {
+ /*
+ * If sharing is possible, start and end will be adjusted
+ * accordingly.
+ */
+ adjust_range_if_pmd_sharing_possible(vma, &range.start,
+ &range.end);
+ }
+ mmu_notifier_invalidate_range_start(&range);
+
+ while (page_vma_mapped_walk(&pvmw)) {
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+ /* PMD-mapped THP migration entry */
+ if (!pvmw.pte && (flags & TTU_MIGRATION)) {
+ VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);
+
+ set_pmd_migration_entry(&pvmw, page);
+ continue;
+ }
+#endif
+
+ /*
+ * If the page is mlock()d, we cannot swap it out.
+ * If it's recently referenced (perhaps page_referenced
+ * skipped over this mm) then we should reactivate it.
+ */
+ if (!(flags & TTU_IGNORE_MLOCK)) {
+ if (vma->vm_flags & VM_LOCKED) {
+ /* PTE-mapped THP are never mlocked */
+ if (!PageTransCompound(page)) {
+ /*
+ * Holding pte lock, we do *not* need
+ * mmap_lock here
+ */
+ mlock_vma_page(page);
+ }
+ ret = false;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+ if (flags & TTU_MUNLOCK)
+ continue;
+ }
+
+ /* Unexpected PMD-mapped THP? */
+ VM_BUG_ON_PAGE(!pvmw.pte, page);
+
+ subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
+ address = pvmw.address;
+
+ if (PageHuge(page) && !PageAnon(page)) {
+ /*
+ * To call huge_pmd_unshare, i_mmap_rwsem must be
+ * held in write mode. Caller needs to explicitly
+ * do this outside rmap routines.
+ */
+ VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+ if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
+ /*
+ * huge_pmd_unshare unmapped an entire PMD
+ * page. There is no way of knowing exactly
+ * which PMDs may be cached for this mm, so
+ * we must flush them all. start/end were
+ * already adjusted above to cover this range.
+ */
+ flush_cache_range(vma, range.start, range.end);
+ flush_tlb_range(vma, range.start, range.end);
+ mmu_notifier_invalidate_range(mm, range.start,
+ range.end);
+
+ /*
+ * The ref count of the PMD page was dropped
+ * which is part of the way map counting
+ * is done for shared PMDs. Return 'true'
+ * here. When there is no other sharing,
+ * huge_pmd_unshare returns false and we will
+ * unmap the actual page and drop map count
+ * to zero.
+ */
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+ }
+
+ if (IS_ENABLED(CONFIG_MIGRATION) &&
+ (flags & TTU_MIGRATION) &&
+ is_zone_device_page(page)) {
+ swp_entry_t entry;
+ pte_t swp_pte;
+
+ pteval = ptep_get_and_clear(mm, pvmw.address, pvmw.pte);
+
+ /*
+ * Store the pfn of the page in a special migration
+ * pte. do_swap_page() will wait until the migration
+ * pte is removed and then restart fault handling.
+ */
+ entry = make_migration_entry(page, 0);
+ swp_pte = swp_entry_to_pte(entry);
+
+ /*
+ * pteval maps a zone device page and is therefore
+ * a swap pte.
+ */
+ if (pte_swp_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_swp_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
+ /*
+ * No need to invalidate here it will synchronize on
+ * against the special swap migration pte.
+ *
+ * The assignment to subpage above was computed from a
+ * swap PTE which results in an invalid pointer.
+ * Since only PAGE_SIZE pages can currently be
+ * migrated, just set it to page. This will need to be
+ * changed when hugepage migrations to device private
+ * memory are supported.
+ */
+ subpage = page;
+ goto discard;
+ }
+
+ /* Nuke the page table entry. */
+ flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
+ if (should_defer_flush(mm, flags)) {
+ /*
+ * We clear the PTE but do not flush so potentially
+ * a remote CPU could still be writing to the page.
+ * If the entry was previously clean then the
+ * architecture must guarantee that a clear->dirty
+ * transition on a cached TLB entry is written through
+ * and traps if the PTE is unmapped.
+ */
+ pteval = ptep_get_and_clear(mm, address, pvmw.pte);
+
+ set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
+ } else {
+ pteval = ptep_clear_flush(vma, address, pvmw.pte);
+ }
+
+ /* Move the dirty bit to the page. Now the pte is gone. */
+ if (pte_dirty(pteval))
+ set_page_dirty(page);
+
+ /* Update high watermark before we lower rss */
+ update_hiwater_rss(mm);
+
+ if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
+ pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
+ if (PageHuge(page)) {
+ hugetlb_count_sub(compound_nr(page), mm);
+ set_huge_swap_pte_at(mm, address,
+ pvmw.pte, pteval,
+ vma_mmu_pagesize(vma));
+ } else {
+ dec_mm_counter(mm, mm_counter(page));
+ set_pte_at(mm, address, pvmw.pte, pteval);
+ }
+
+ } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
+ /*
+ * The guest indicated that the page content is of no
+ * interest anymore. Simply discard the pte, vmscan
+ * will take care of the rest.
+ * A future reference will then fault in a new zero
+ * page. When userfaultfd is active, we must not drop
+ * this page though, as its main user (postcopy
+ * migration) will not expect userfaults on already
+ * copied pages.
+ */
+ dec_mm_counter(mm, mm_counter(page));
+ /* We have to invalidate as we cleared the pte */
+ mmu_notifier_invalidate_range(mm, address,
+ address + PAGE_SIZE);
+ } else if (IS_ENABLED(CONFIG_MIGRATION) &&
+ (flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) {
+ swp_entry_t entry;
+ pte_t swp_pte;
+
+ if (arch_unmap_one(mm, vma, address, pteval) < 0) {
+ set_pte_at(mm, address, pvmw.pte, pteval);
+ ret = false;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+
+ /*
+ * Store the pfn of the page in a special migration
+ * pte. do_swap_page() will wait until the migration
+ * pte is removed and then restart fault handling.
+ */
+ entry = make_migration_entry(subpage,
+ pte_write(pteval));
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ set_pte_at(mm, address, pvmw.pte, swp_pte);
+ /*
+ * No need to invalidate here it will synchronize on
+ * against the special swap migration pte.
+ */
+ } else if (PageAnon(page)) {
+ swp_entry_t entry = { .val = page_private(subpage) };
+ pte_t swp_pte;
+ /*
+ * Store the swap location in the pte.
+ * See handle_pte_fault() ...
+ */
+ if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) {
+ WARN_ON_ONCE(1);
+ ret = false;
+ /* We have to invalidate as we cleared the pte */
+ mmu_notifier_invalidate_range(mm, address,
+ address + PAGE_SIZE);
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+
+ /* MADV_FREE page check */
+ if (!PageSwapBacked(page)) {
+ int ref_count, map_count;
+
+ /*
+ * Synchronize with gup_pte_range():
+ * - clear PTE; barrier; read refcount
+ * - inc refcount; barrier; read PTE
+ */
+ smp_mb();
+
+ ref_count = page_ref_count(page);
+ map_count = page_mapcount(page);
+
+ /*
+ * Order reads for page refcount and dirty flag
+ * (see comments in __remove_mapping()).
+ */
+ smp_rmb();
+
+ /*
+ * The only page refs must be one from isolation
+ * plus the rmap(s) (dropped by discard:).
+ */
+ if (ref_count == 1 + map_count &&
+ !PageDirty(page)) {
+ /* Invalidate as we cleared the pte */
+ mmu_notifier_invalidate_range(mm,
+ address, address + PAGE_SIZE);
+ dec_mm_counter(mm, MM_ANONPAGES);
+ goto discard;
+ }
+
+ /*
+ * If the page was redirtied, it cannot be
+ * discarded. Remap the page to page table.
+ */
+ set_pte_at(mm, address, pvmw.pte, pteval);
+ SetPageSwapBacked(page);
+ ret = false;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+
+ if (swap_duplicate(entry) < 0) {
+ set_pte_at(mm, address, pvmw.pte, pteval);
+ ret = false;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+ if (arch_unmap_one(mm, vma, address, pteval) < 0) {
+ set_pte_at(mm, address, pvmw.pte, pteval);
+ ret = false;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+ if (list_empty(&mm->mmlist)) {
+ spin_lock(&mmlist_lock);
+ if (list_empty(&mm->mmlist))
+ list_add(&mm->mmlist, &init_mm.mmlist);
+ spin_unlock(&mmlist_lock);
+ }
+ dec_mm_counter(mm, MM_ANONPAGES);
+ inc_mm_counter(mm, MM_SWAPENTS);
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ set_pte_at(mm, address, pvmw.pte, swp_pte);
+ /* Invalidate as we cleared the pte */
+ mmu_notifier_invalidate_range(mm, address,
+ address + PAGE_SIZE);
+ } else {
+ /*
+ * This is a locked file-backed page, thus it cannot
+ * be removed from the page cache and replaced by a new
+ * page before mmu_notifier_invalidate_range_end, so no
+ * concurrent thread might update its page table to
+ * point at new page while a device still is using this
+ * page.
+ *
+ * See Documentation/vm/mmu_notifier.rst
+ */
+ dec_mm_counter(mm, mm_counter_file(page));
+ }
+discard:
+ /*
+ * No need to call mmu_notifier_invalidate_range() it has be
+ * done above for all cases requiring it to happen under page
+ * table lock before mmu_notifier_invalidate_range_end()
+ *
+ * See Documentation/vm/mmu_notifier.rst
+ */
+ page_remove_rmap(subpage, PageHuge(page));
+ put_page(page);
+ }
+
+ mmu_notifier_invalidate_range_end(&range);
+
+ return ret;
+}
+
+static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
+{
+ return vma_is_temporary_stack(vma);
+}
+
+static int page_not_mapped(struct page *page)
+{
+ return !page_mapped(page);
+}
+
+/**
+ * try_to_unmap - try to remove all page table mappings to a page
+ * @page: the page to get unmapped
+ * @flags: action and flags
+ *
+ * Tries to remove all the page table entries which are mapping this
+ * page, used in the pageout path. Caller must hold the page lock.
+ *
+ * If unmap is successful, return true. Otherwise, false.
+ */
+bool try_to_unmap(struct page *page, enum ttu_flags flags)
+{
+ struct rmap_walk_control rwc = {
+ .rmap_one = try_to_unmap_one,
+ .arg = (void *)flags,
+ .done = page_not_mapped,
+ .anon_lock = page_lock_anon_vma_read,
+ };
+
+ /*
+ * During exec, a temporary VMA is setup and later moved.
+ * The VMA is moved under the anon_vma lock but not the
+ * page tables leading to a race where migration cannot
+ * find the migration ptes. Rather than increasing the
+ * locking requirements of exec(), migration skips
+ * temporary VMAs until after exec() completes.
+ */
+ if ((flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))
+ && !PageKsm(page) && PageAnon(page))
+ rwc.invalid_vma = invalid_migration_vma;
+
+ if (flags & TTU_RMAP_LOCKED)
+ rmap_walk_locked(page, &rwc);
+ else
+ rmap_walk(page, &rwc);
+
+ /*
+ * When racing against e.g. zap_pte_range() on another cpu,
+ * in between its ptep_get_and_clear_full() and page_remove_rmap(),
+ * try_to_unmap() may return false when it is about to become true,
+ * if page table locking is skipped: use TTU_SYNC to wait for that.
+ */
+ return !page_mapcount(page);
+}
+
+/**
+ * try_to_munlock - try to munlock a page
+ * @page: the page to be munlocked
+ *
+ * Called from munlock code. Checks all of the VMAs mapping the page
+ * to make sure nobody else has this page mlocked. The page will be
+ * returned with PG_mlocked cleared if no other vmas have it mlocked.
+ */
+
+void try_to_munlock(struct page *page)
+{
+ struct rmap_walk_control rwc = {
+ .rmap_one = try_to_unmap_one,
+ .arg = (void *)TTU_MUNLOCK,
+ .done = page_not_mapped,
+ .anon_lock = page_lock_anon_vma_read,
+
+ };
+
+ VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page);
+ VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
+
+ rmap_walk(page, &rwc);
+}
+
+void __put_anon_vma(struct anon_vma *anon_vma)
+{
+ struct anon_vma *root = anon_vma->root;
+
+ anon_vma_free(anon_vma);
+ if (root != anon_vma && atomic_dec_and_test(&root->refcount))
+ anon_vma_free(root);
+}
+
+static struct anon_vma *rmap_walk_anon_lock(struct page *page,
+ struct rmap_walk_control *rwc)
+{
+ struct anon_vma *anon_vma;
+
+ if (rwc->anon_lock)
+ return rwc->anon_lock(page);
+
+ /*
+ * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
+ * because that depends on page_mapped(); but not all its usages
+ * are holding mmap_lock. Users without mmap_lock are required to
+ * take a reference count to prevent the anon_vma disappearing
+ */
+ anon_vma = page_anon_vma(page);
+ if (!anon_vma)
+ return NULL;
+
+ anon_vma_lock_read(anon_vma);
+ return anon_vma;
+}
+
+/*
+ * rmap_walk_anon - do something to anonymous page using the object-based
+ * rmap method
+ * @page: the page to be handled
+ * @rwc: control variable according to each walk type
+ *
+ * Find all the mappings of a page using the mapping pointer and the vma chains
+ * contained in the anon_vma struct it points to.
+ *
+ * When called from try_to_munlock(), the mmap_lock of the mm containing the vma
+ * where the page was found will be held for write. So, we won't recheck
+ * vm_flags for that VMA. That should be OK, because that vma shouldn't be
+ * LOCKED.
+ */
+static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
+ bool locked)
+{
+ struct anon_vma *anon_vma;
+ pgoff_t pgoff_start, pgoff_end;
+ struct anon_vma_chain *avc;
+
+ if (locked) {
+ anon_vma = page_anon_vma(page);
+ /* anon_vma disappear under us? */
+ VM_BUG_ON_PAGE(!anon_vma, page);
+ } else {
+ anon_vma = rmap_walk_anon_lock(page, rwc);
+ }
+ if (!anon_vma)
+ return;
+
+ pgoff_start = page_to_pgoff(page);
+ pgoff_end = pgoff_start + thp_nr_pages(page) - 1;
+ anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
+ pgoff_start, pgoff_end) {
+ struct vm_area_struct *vma = avc->vma;
+ unsigned long address = vma_address(page, vma);
+
+ VM_BUG_ON_VMA(address == -EFAULT, vma);
+ cond_resched();
+
+ if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
+ continue;
+
+ if (!rwc->rmap_one(page, vma, address, rwc->arg))
+ break;
+ if (rwc->done && rwc->done(page))
+ break;
+ }
+
+ if (!locked)
+ anon_vma_unlock_read(anon_vma);
+}
+
+/*
+ * rmap_walk_file - do something to file page using the object-based rmap method
+ * @page: the page to be handled
+ * @rwc: control variable according to each walk type
+ *
+ * Find all the mappings of a page using the mapping pointer and the vma chains
+ * contained in the address_space struct it points to.
+ *
+ * When called from try_to_munlock(), the mmap_lock of the mm containing the vma
+ * where the page was found will be held for write. So, we won't recheck
+ * vm_flags for that VMA. That should be OK, because that vma shouldn't be
+ * LOCKED.
+ */
+static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
+ bool locked)
+{
+ struct address_space *mapping = page_mapping(page);
+ pgoff_t pgoff_start, pgoff_end;
+ struct vm_area_struct *vma;
+
+ /*
+ * The page lock not only makes sure that page->mapping cannot
+ * suddenly be NULLified by truncation, it makes sure that the
+ * structure at mapping cannot be freed and reused yet,
+ * so we can safely take mapping->i_mmap_rwsem.
+ */
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+
+ if (!mapping)
+ return;
+
+ pgoff_start = page_to_pgoff(page);
+ pgoff_end = pgoff_start + thp_nr_pages(page) - 1;
+ if (!locked)
+ i_mmap_lock_read(mapping);
+ vma_interval_tree_foreach(vma, &mapping->i_mmap,
+ pgoff_start, pgoff_end) {
+ unsigned long address = vma_address(page, vma);
+
+ VM_BUG_ON_VMA(address == -EFAULT, vma);
+ cond_resched();
+
+ if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
+ continue;
+
+ if (!rwc->rmap_one(page, vma, address, rwc->arg))
+ goto done;
+ if (rwc->done && rwc->done(page))
+ goto done;
+ }
+
+done:
+ if (!locked)
+ i_mmap_unlock_read(mapping);
+}
+
+void rmap_walk(struct page *page, struct rmap_walk_control *rwc)
+{
+ if (unlikely(PageKsm(page)))
+ rmap_walk_ksm(page, rwc);
+ else if (PageAnon(page))
+ rmap_walk_anon(page, rwc, false);
+ else
+ rmap_walk_file(page, rwc, false);
+}
+
+/* Like rmap_walk, but caller holds relevant rmap lock */
+void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc)
+{
+ /* no ksm support for now */
+ VM_BUG_ON_PAGE(PageKsm(page), page);
+ if (PageAnon(page))
+ rmap_walk_anon(page, rwc, true);
+ else
+ rmap_walk_file(page, rwc, true);
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * The following two functions are for anonymous (private mapped) hugepages.
+ * Unlike common anonymous pages, anonymous hugepages have no accounting code
+ * and no lru code, because we handle hugepages differently from common pages.
+ */
+void hugepage_add_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ struct anon_vma *anon_vma = vma->anon_vma;
+ int first;
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(!anon_vma);
+ /* address might be in next vma when migration races vma_adjust */
+ first = atomic_inc_and_test(compound_mapcount_ptr(page));
+ if (first)
+ __page_set_anon_rmap(page, vma, address, 0);
+}
+
+void hugepage_add_new_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+ atomic_set(compound_mapcount_ptr(page), 0);
+ if (hpage_pincount_available(page))
+ atomic_set(compound_pincount_ptr(page), 0);
+
+ __page_set_anon_rmap(page, vma, address, 1);
+}
+#endif /* CONFIG_HUGETLB_PAGE */
diff --git a/mm/rodata_test.c b/mm/rodata_test.c
new file mode 100644
index 000000000..261337194
--- /dev/null
+++ b/mm/rodata_test.c
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * rodata_test.c: functional test for mark_rodata_ro function
+ *
+ * (C) Copyright 2008 Intel Corporation
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ */
+#define pr_fmt(fmt) "rodata_test: " fmt
+
+#include <linux/rodata_test.h>
+#include <linux/uaccess.h>
+#include <asm/sections.h>
+
+static const int rodata_test_data = 0xC3;
+
+void rodata_test(void)
+{
+ unsigned long start, end;
+ int zero = 0;
+
+ /* test 1: read the value */
+ /* If this test fails, some previous testrun has clobbered the state */
+ if (!rodata_test_data) {
+ pr_err("test 1 fails (start data)\n");
+ return;
+ }
+
+ /* test 2: write to the variable; this should fault */
+ if (!copy_to_kernel_nofault((void *)&rodata_test_data,
+ (void *)&zero, sizeof(zero))) {
+ pr_err("test data was not read only\n");
+ return;
+ }
+
+ /* test 3: check the value hasn't changed */
+ if (rodata_test_data == zero) {
+ pr_err("test data was changed\n");
+ return;
+ }
+
+ /* test 4: check if the rodata section is PAGE_SIZE aligned */
+ start = (unsigned long)__start_rodata;
+ end = (unsigned long)__end_rodata;
+ if (start & (PAGE_SIZE - 1)) {
+ pr_err("start of .rodata is not page size aligned\n");
+ return;
+ }
+ if (end & (PAGE_SIZE - 1)) {
+ pr_err("end of .rodata is not page size aligned\n");
+ return;
+ }
+
+ pr_info("all tests were successful\n");
+}
diff --git a/mm/shmem.c b/mm/shmem.c
new file mode 100644
index 000000000..e173d83b4
--- /dev/null
+++ b/mm/shmem.c
@@ -0,0 +1,4350 @@
+/*
+ * Resizable virtual memory filesystem for Linux.
+ *
+ * Copyright (C) 2000 Linus Torvalds.
+ * 2000 Transmeta Corp.
+ * 2000-2001 Christoph Rohland
+ * 2000-2001 SAP AG
+ * 2002 Red Hat Inc.
+ * Copyright (C) 2002-2011 Hugh Dickins.
+ * Copyright (C) 2011 Google Inc.
+ * Copyright (C) 2002-2005 VERITAS Software Corporation.
+ * Copyright (C) 2004 Andi Kleen, SuSE Labs
+ *
+ * Extended attribute support for tmpfs:
+ * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+ * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
+ *
+ * tiny-shmem:
+ * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/vfs.h>
+#include <linux/mount.h>
+#include <linux/ramfs.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/random.h>
+#include <linux/sched/signal.h>
+#include <linux/export.h>
+#include <linux/swap.h>
+#include <linux/uio.h>
+#include <linux/khugepaged.h>
+#include <linux/hugetlb.h>
+#include <linux/frontswap.h>
+#include <linux/fs_parser.h>
+
+#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */
+
+static struct vfsmount *shm_mnt;
+
+#ifdef CONFIG_SHMEM
+/*
+ * This virtual memory filesystem is heavily based on the ramfs. It
+ * extends ramfs by the ability to use swap and honor resource limits
+ * which makes it a completely usable filesystem.
+ */
+
+#include <linux/xattr.h>
+#include <linux/exportfs.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/mman.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/backing-dev.h>
+#include <linux/shmem_fs.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/pagevec.h>
+#include <linux/percpu_counter.h>
+#include <linux/falloc.h>
+#include <linux/splice.h>
+#include <linux/security.h>
+#include <linux/swapops.h>
+#include <linux/mempolicy.h>
+#include <linux/namei.h>
+#include <linux/ctype.h>
+#include <linux/migrate.h>
+#include <linux/highmem.h>
+#include <linux/seq_file.h>
+#include <linux/magic.h>
+#include <linux/syscalls.h>
+#include <linux/fcntl.h>
+#include <uapi/linux/memfd.h>
+#include <linux/userfaultfd_k.h>
+#include <linux/rmap.h>
+#include <linux/uuid.h>
+
+#include <linux/uaccess.h>
+
+#include "internal.h"
+
+#define BLOCKS_PER_PAGE (PAGE_SIZE/512)
+#define VM_ACCT(size) (PAGE_ALIGN(size) >> PAGE_SHIFT)
+
+/* Pretend that each entry is of this size in directory's i_size */
+#define BOGO_DIRENT_SIZE 20
+
+/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
+#define SHORT_SYMLINK_LEN 128
+
+/*
+ * shmem_fallocate communicates with shmem_fault or shmem_writepage via
+ * inode->i_private (with i_mutex making sure that it has only one user at
+ * a time): we would prefer not to enlarge the shmem inode just for that.
+ */
+struct shmem_falloc {
+ wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
+ pgoff_t start; /* start of range currently being fallocated */
+ pgoff_t next; /* the next page offset to be fallocated */
+ pgoff_t nr_falloced; /* how many new pages have been fallocated */
+ pgoff_t nr_unswapped; /* how often writepage refused to swap out */
+};
+
+struct shmem_options {
+ unsigned long long blocks;
+ unsigned long long inodes;
+ struct mempolicy *mpol;
+ kuid_t uid;
+ kgid_t gid;
+ umode_t mode;
+ bool full_inums;
+ int huge;
+ int seen;
+#define SHMEM_SEEN_BLOCKS 1
+#define SHMEM_SEEN_INODES 2
+#define SHMEM_SEEN_HUGE 4
+#define SHMEM_SEEN_INUMS 8
+};
+
+#ifdef CONFIG_TMPFS
+static unsigned long shmem_default_max_blocks(void)
+{
+ return totalram_pages() / 2;
+}
+
+static unsigned long shmem_default_max_inodes(void)
+{
+ unsigned long nr_pages = totalram_pages();
+
+ return min(nr_pages - totalhigh_pages(), nr_pages / 2);
+}
+#endif
+
+static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
+static int shmem_replace_page(struct page **pagep, gfp_t gfp,
+ struct shmem_inode_info *info, pgoff_t index);
+static int shmem_swapin_page(struct inode *inode, pgoff_t index,
+ struct page **pagep, enum sgp_type sgp,
+ gfp_t gfp, struct vm_area_struct *vma,
+ vm_fault_t *fault_type);
+static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
+ struct page **pagep, enum sgp_type sgp,
+ gfp_t gfp, struct vm_area_struct *vma,
+ struct vm_fault *vmf, vm_fault_t *fault_type);
+
+int shmem_getpage(struct inode *inode, pgoff_t index,
+ struct page **pagep, enum sgp_type sgp)
+{
+ return shmem_getpage_gfp(inode, index, pagep, sgp,
+ mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL);
+}
+
+static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
+{
+ return sb->s_fs_info;
+}
+
+/*
+ * shmem_file_setup pre-accounts the whole fixed size of a VM object,
+ * for shared memory and for shared anonymous (/dev/zero) mappings
+ * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
+ * consistent with the pre-accounting of private mappings ...
+ */
+static inline int shmem_acct_size(unsigned long flags, loff_t size)
+{
+ return (flags & VM_NORESERVE) ?
+ 0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
+}
+
+static inline void shmem_unacct_size(unsigned long flags, loff_t size)
+{
+ if (!(flags & VM_NORESERVE))
+ vm_unacct_memory(VM_ACCT(size));
+}
+
+static inline int shmem_reacct_size(unsigned long flags,
+ loff_t oldsize, loff_t newsize)
+{
+ if (!(flags & VM_NORESERVE)) {
+ if (VM_ACCT(newsize) > VM_ACCT(oldsize))
+ return security_vm_enough_memory_mm(current->mm,
+ VM_ACCT(newsize) - VM_ACCT(oldsize));
+ else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
+ vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
+ }
+ return 0;
+}
+
+/*
+ * ... whereas tmpfs objects are accounted incrementally as
+ * pages are allocated, in order to allow large sparse files.
+ * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
+ * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
+ */
+static inline int shmem_acct_block(unsigned long flags, long pages)
+{
+ if (!(flags & VM_NORESERVE))
+ return 0;
+
+ return security_vm_enough_memory_mm(current->mm,
+ pages * VM_ACCT(PAGE_SIZE));
+}
+
+static inline void shmem_unacct_blocks(unsigned long flags, long pages)
+{
+ if (flags & VM_NORESERVE)
+ vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
+}
+
+static inline bool shmem_inode_acct_block(struct inode *inode, long pages)
+{
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+
+ if (shmem_acct_block(info->flags, pages))
+ return false;
+
+ if (sbinfo->max_blocks) {
+ if (percpu_counter_compare(&sbinfo->used_blocks,
+ sbinfo->max_blocks - pages) > 0)
+ goto unacct;
+ percpu_counter_add(&sbinfo->used_blocks, pages);
+ }
+
+ return true;
+
+unacct:
+ shmem_unacct_blocks(info->flags, pages);
+ return false;
+}
+
+static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages)
+{
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+
+ if (sbinfo->max_blocks)
+ percpu_counter_sub(&sbinfo->used_blocks, pages);
+ shmem_unacct_blocks(info->flags, pages);
+}
+
+static const struct super_operations shmem_ops;
+static const struct address_space_operations shmem_aops;
+static const struct file_operations shmem_file_operations;
+static const struct inode_operations shmem_inode_operations;
+static const struct inode_operations shmem_dir_inode_operations;
+static const struct inode_operations shmem_special_inode_operations;
+static const struct vm_operations_struct shmem_vm_ops;
+static struct file_system_type shmem_fs_type;
+
+bool vma_is_shmem(struct vm_area_struct *vma)
+{
+ return vma->vm_ops == &shmem_vm_ops;
+}
+
+static LIST_HEAD(shmem_swaplist);
+static DEFINE_MUTEX(shmem_swaplist_mutex);
+
+/*
+ * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
+ * produces a novel ino for the newly allocated inode.
+ *
+ * It may also be called when making a hard link to permit the space needed by
+ * each dentry. However, in that case, no new inode number is needed since that
+ * internally draws from another pool of inode numbers (currently global
+ * get_next_ino()). This case is indicated by passing NULL as inop.
+ */
+#define SHMEM_INO_BATCH 1024
+static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
+{
+ struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+ ino_t ino;
+
+ if (!(sb->s_flags & SB_KERNMOUNT)) {
+ spin_lock(&sbinfo->stat_lock);
+ if (sbinfo->max_inodes) {
+ if (!sbinfo->free_inodes) {
+ spin_unlock(&sbinfo->stat_lock);
+ return -ENOSPC;
+ }
+ sbinfo->free_inodes--;
+ }
+ if (inop) {
+ ino = sbinfo->next_ino++;
+ if (unlikely(is_zero_ino(ino)))
+ ino = sbinfo->next_ino++;
+ if (unlikely(!sbinfo->full_inums &&
+ ino > UINT_MAX)) {
+ /*
+ * Emulate get_next_ino uint wraparound for
+ * compatibility
+ */
+ if (IS_ENABLED(CONFIG_64BIT))
+ pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n",
+ __func__, MINOR(sb->s_dev));
+ sbinfo->next_ino = 1;
+ ino = sbinfo->next_ino++;
+ }
+ *inop = ino;
+ }
+ spin_unlock(&sbinfo->stat_lock);
+ } else if (inop) {
+ /*
+ * __shmem_file_setup, one of our callers, is lock-free: it
+ * doesn't hold stat_lock in shmem_reserve_inode since
+ * max_inodes is always 0, and is called from potentially
+ * unknown contexts. As such, use a per-cpu batched allocator
+ * which doesn't require the per-sb stat_lock unless we are at
+ * the batch boundary.
+ *
+ * We don't need to worry about inode{32,64} since SB_KERNMOUNT
+ * shmem mounts are not exposed to userspace, so we don't need
+ * to worry about things like glibc compatibility.
+ */
+ ino_t *next_ino;
+ next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
+ ino = *next_ino;
+ if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
+ spin_lock(&sbinfo->stat_lock);
+ ino = sbinfo->next_ino;
+ sbinfo->next_ino += SHMEM_INO_BATCH;
+ spin_unlock(&sbinfo->stat_lock);
+ if (unlikely(is_zero_ino(ino)))
+ ino++;
+ }
+ *inop = ino;
+ *next_ino = ++ino;
+ put_cpu();
+ }
+
+ return 0;
+}
+
+static void shmem_free_inode(struct super_block *sb)
+{
+ struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+ if (sbinfo->max_inodes) {
+ spin_lock(&sbinfo->stat_lock);
+ sbinfo->free_inodes++;
+ spin_unlock(&sbinfo->stat_lock);
+ }
+}
+
+/**
+ * shmem_recalc_inode - recalculate the block usage of an inode
+ * @inode: inode to recalc
+ *
+ * We have to calculate the free blocks since the mm can drop
+ * undirtied hole pages behind our back.
+ *
+ * But normally info->alloced == inode->i_mapping->nrpages + info->swapped
+ * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
+ *
+ * It has to be called with the spinlock held.
+ */
+static void shmem_recalc_inode(struct inode *inode)
+{
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ long freed;
+
+ freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
+ if (freed > 0) {
+ info->alloced -= freed;
+ inode->i_blocks -= freed * BLOCKS_PER_PAGE;
+ shmem_inode_unacct_blocks(inode, freed);
+ }
+}
+
+bool shmem_charge(struct inode *inode, long pages)
+{
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ unsigned long flags;
+
+ if (!shmem_inode_acct_block(inode, pages))
+ return false;
+
+ /* nrpages adjustment first, then shmem_recalc_inode() when balanced */
+ inode->i_mapping->nrpages += pages;
+
+ spin_lock_irqsave(&info->lock, flags);
+ info->alloced += pages;
+ inode->i_blocks += pages * BLOCKS_PER_PAGE;
+ shmem_recalc_inode(inode);
+ spin_unlock_irqrestore(&info->lock, flags);
+
+ return true;
+}
+
+void shmem_uncharge(struct inode *inode, long pages)
+{
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ unsigned long flags;
+
+ /* nrpages adjustment done by __delete_from_page_cache() or caller */
+
+ spin_lock_irqsave(&info->lock, flags);
+ info->alloced -= pages;
+ inode->i_blocks -= pages * BLOCKS_PER_PAGE;
+ shmem_recalc_inode(inode);
+ spin_unlock_irqrestore(&info->lock, flags);
+
+ shmem_inode_unacct_blocks(inode, pages);
+}
+
+/*
+ * Replace item expected in xarray by a new item, while holding xa_lock.
+ */
+static int shmem_replace_entry(struct address_space *mapping,
+ pgoff_t index, void *expected, void *replacement)
+{
+ XA_STATE(xas, &mapping->i_pages, index);
+ void *item;
+
+ VM_BUG_ON(!expected);
+ VM_BUG_ON(!replacement);
+ item = xas_load(&xas);
+ if (item != expected)
+ return -ENOENT;
+ xas_store(&xas, replacement);
+ return 0;
+}
+
+/*
+ * Sometimes, before we decide whether to proceed or to fail, we must check
+ * that an entry was not already brought back from swap by a racing thread.
+ *
+ * Checking page is not enough: by the time a SwapCache page is locked, it
+ * might be reused, and again be SwapCache, using the same swap as before.
+ */
+static bool shmem_confirm_swap(struct address_space *mapping,
+ pgoff_t index, swp_entry_t swap)
+{
+ return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
+}
+
+/*
+ * Definitions for "huge tmpfs": tmpfs mounted with the huge= option
+ *
+ * SHMEM_HUGE_NEVER:
+ * disables huge pages for the mount;
+ * SHMEM_HUGE_ALWAYS:
+ * enables huge pages for the mount;
+ * SHMEM_HUGE_WITHIN_SIZE:
+ * only allocate huge pages if the page will be fully within i_size,
+ * also respect fadvise()/madvise() hints;
+ * SHMEM_HUGE_ADVISE:
+ * only allocate huge pages if requested with fadvise()/madvise();
+ */
+
+#define SHMEM_HUGE_NEVER 0
+#define SHMEM_HUGE_ALWAYS 1
+#define SHMEM_HUGE_WITHIN_SIZE 2
+#define SHMEM_HUGE_ADVISE 3
+
+/*
+ * Special values.
+ * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
+ *
+ * SHMEM_HUGE_DENY:
+ * disables huge on shm_mnt and all mounts, for emergency use;
+ * SHMEM_HUGE_FORCE:
+ * enables huge on shm_mnt and all mounts, w/o needing option, for testing;
+ *
+ */
+#define SHMEM_HUGE_DENY (-1)
+#define SHMEM_HUGE_FORCE (-2)
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/* ifdef here to avoid bloating shmem.o when not necessary */
+
+static int shmem_huge __read_mostly;
+
+#if defined(CONFIG_SYSFS)
+static int shmem_parse_huge(const char *str)
+{
+ if (!strcmp(str, "never"))
+ return SHMEM_HUGE_NEVER;
+ if (!strcmp(str, "always"))
+ return SHMEM_HUGE_ALWAYS;
+ if (!strcmp(str, "within_size"))
+ return SHMEM_HUGE_WITHIN_SIZE;
+ if (!strcmp(str, "advise"))
+ return SHMEM_HUGE_ADVISE;
+ if (!strcmp(str, "deny"))
+ return SHMEM_HUGE_DENY;
+ if (!strcmp(str, "force"))
+ return SHMEM_HUGE_FORCE;
+ return -EINVAL;
+}
+#endif
+
+#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
+static const char *shmem_format_huge(int huge)
+{
+ switch (huge) {
+ case SHMEM_HUGE_NEVER:
+ return "never";
+ case SHMEM_HUGE_ALWAYS:
+ return "always";
+ case SHMEM_HUGE_WITHIN_SIZE:
+ return "within_size";
+ case SHMEM_HUGE_ADVISE:
+ return "advise";
+ case SHMEM_HUGE_DENY:
+ return "deny";
+ case SHMEM_HUGE_FORCE:
+ return "force";
+ default:
+ VM_BUG_ON(1);
+ return "bad_val";
+ }
+}
+#endif
+
+static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
+ struct shrink_control *sc, unsigned long nr_to_split)
+{
+ LIST_HEAD(list), *pos, *next;
+ LIST_HEAD(to_remove);
+ struct inode *inode;
+ struct shmem_inode_info *info;
+ struct page *page;
+ unsigned long batch = sc ? sc->nr_to_scan : 128;
+ int split = 0;
+
+ if (list_empty(&sbinfo->shrinklist))
+ return SHRINK_STOP;
+
+ spin_lock(&sbinfo->shrinklist_lock);
+ list_for_each_safe(pos, next, &sbinfo->shrinklist) {
+ info = list_entry(pos, struct shmem_inode_info, shrinklist);
+
+ /* pin the inode */
+ inode = igrab(&info->vfs_inode);
+
+ /* inode is about to be evicted */
+ if (!inode) {
+ list_del_init(&info->shrinklist);
+ goto next;
+ }
+
+ /* Check if there's anything to gain */
+ if (round_up(inode->i_size, PAGE_SIZE) ==
+ round_up(inode->i_size, HPAGE_PMD_SIZE)) {
+ list_move(&info->shrinklist, &to_remove);
+ goto next;
+ }
+
+ list_move(&info->shrinklist, &list);
+next:
+ sbinfo->shrinklist_len--;
+ if (!--batch)
+ break;
+ }
+ spin_unlock(&sbinfo->shrinklist_lock);
+
+ list_for_each_safe(pos, next, &to_remove) {
+ info = list_entry(pos, struct shmem_inode_info, shrinklist);
+ inode = &info->vfs_inode;
+ list_del_init(&info->shrinklist);
+ iput(inode);
+ }
+
+ list_for_each_safe(pos, next, &list) {
+ int ret;
+
+ info = list_entry(pos, struct shmem_inode_info, shrinklist);
+ inode = &info->vfs_inode;
+
+ if (nr_to_split && split >= nr_to_split)
+ goto move_back;
+
+ page = find_get_page(inode->i_mapping,
+ (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
+ if (!page)
+ goto drop;
+
+ /* No huge page at the end of the file: nothing to split */
+ if (!PageTransHuge(page)) {
+ put_page(page);
+ goto drop;
+ }
+
+ /*
+ * Move the inode on the list back to shrinklist if we failed
+ * to lock the page at this time.
+ *
+ * Waiting for the lock may lead to deadlock in the
+ * reclaim path.
+ */
+ if (!trylock_page(page)) {
+ put_page(page);
+ goto move_back;
+ }
+
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+
+ /* If split failed move the inode on the list back to shrinklist */
+ if (ret)
+ goto move_back;
+
+ split++;
+drop:
+ list_del_init(&info->shrinklist);
+ goto put;
+move_back:
+ /*
+ * Make sure the inode is either on the global list or deleted
+ * from any local list before iput() since it could be deleted
+ * in another thread once we put the inode (then the local list
+ * is corrupted).
+ */
+ spin_lock(&sbinfo->shrinklist_lock);
+ list_move(&info->shrinklist, &sbinfo->shrinklist);
+ sbinfo->shrinklist_len++;
+ spin_unlock(&sbinfo->shrinklist_lock);
+put:
+ iput(inode);
+ }
+
+ return split;
+}
+
+static long shmem_unused_huge_scan(struct super_block *sb,
+ struct shrink_control *sc)
+{
+ struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+
+ if (!READ_ONCE(sbinfo->shrinklist_len))
+ return SHRINK_STOP;
+
+ return shmem_unused_huge_shrink(sbinfo, sc, 0);
+}
+
+static long shmem_unused_huge_count(struct super_block *sb,
+ struct shrink_control *sc)
+{
+ struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+ return READ_ONCE(sbinfo->shrinklist_len);
+}
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+
+#define shmem_huge SHMEM_HUGE_DENY
+
+static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
+ struct shrink_control *sc, unsigned long nr_to_split)
+{
+ return 0;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo)
+{
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+ (shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) &&
+ shmem_huge != SHMEM_HUGE_DENY)
+ return true;
+ return false;
+}
+
+/*
+ * Like add_to_page_cache_locked, but error if expected item has gone.
+ */
+static int shmem_add_to_page_cache(struct page *page,
+ struct address_space *mapping,
+ pgoff_t index, void *expected, gfp_t gfp,
+ struct mm_struct *charge_mm)
+{
+ XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
+ unsigned long i = 0;
+ unsigned long nr = compound_nr(page);
+ int error;
+
+ VM_BUG_ON_PAGE(PageTail(page), page);
+ VM_BUG_ON_PAGE(index != round_down(index, nr), page);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
+ VM_BUG_ON(expected && PageTransHuge(page));
+
+ page_ref_add(page, nr);
+ page->mapping = mapping;
+ page->index = index;
+
+ if (!PageSwapCache(page)) {
+ error = mem_cgroup_charge(page, charge_mm, gfp);
+ if (error) {
+ if (PageTransHuge(page)) {
+ count_vm_event(THP_FILE_FALLBACK);
+ count_vm_event(THP_FILE_FALLBACK_CHARGE);
+ }
+ goto error;
+ }
+ }
+ cgroup_throttle_swaprate(page, gfp);
+
+ do {
+ void *entry;
+ xas_lock_irq(&xas);
+ entry = xas_find_conflict(&xas);
+ if (entry != expected)
+ xas_set_err(&xas, -EEXIST);
+ xas_create_range(&xas);
+ if (xas_error(&xas))
+ goto unlock;
+next:
+ xas_store(&xas, page);
+ if (++i < nr) {
+ xas_next(&xas);
+ goto next;
+ }
+ if (PageTransHuge(page)) {
+ count_vm_event(THP_FILE_ALLOC);
+ __inc_node_page_state(page, NR_SHMEM_THPS);
+ }
+ mapping->nrpages += nr;
+ __mod_lruvec_page_state(page, NR_FILE_PAGES, nr);
+ __mod_lruvec_page_state(page, NR_SHMEM, nr);
+unlock:
+ xas_unlock_irq(&xas);
+ } while (xas_nomem(&xas, gfp));
+
+ if (xas_error(&xas)) {
+ error = xas_error(&xas);
+ goto error;
+ }
+
+ return 0;
+error:
+ page->mapping = NULL;
+ page_ref_sub(page, nr);
+ return error;
+}
+
+/*
+ * Like delete_from_page_cache, but substitutes swap for page.
+ */
+static void shmem_delete_from_page_cache(struct page *page, void *radswap)
+{
+ struct address_space *mapping = page->mapping;
+ int error;
+
+ VM_BUG_ON_PAGE(PageCompound(page), page);
+
+ xa_lock_irq(&mapping->i_pages);
+ error = shmem_replace_entry(mapping, page->index, page, radswap);
+ page->mapping = NULL;
+ mapping->nrpages--;
+ __dec_lruvec_page_state(page, NR_FILE_PAGES);
+ __dec_lruvec_page_state(page, NR_SHMEM);
+ xa_unlock_irq(&mapping->i_pages);
+ put_page(page);
+ BUG_ON(error);
+}
+
+/*
+ * Remove swap entry from page cache, free the swap and its page cache.
+ */
+static int shmem_free_swap(struct address_space *mapping,
+ pgoff_t index, void *radswap)
+{
+ void *old;
+
+ old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
+ if (old != radswap)
+ return -ENOENT;
+ free_swap_and_cache(radix_to_swp_entry(radswap));
+ return 0;
+}
+
+/*
+ * Determine (in bytes) how many of the shmem object's pages mapped by the
+ * given offsets are swapped out.
+ *
+ * This is safe to call without i_mutex or the i_pages lock thanks to RCU,
+ * as long as the inode doesn't go away and racy results are not a problem.
+ */
+unsigned long shmem_partial_swap_usage(struct address_space *mapping,
+ pgoff_t start, pgoff_t end)
+{
+ XA_STATE(xas, &mapping->i_pages, start);
+ struct page *page;
+ unsigned long swapped = 0;
+
+ rcu_read_lock();
+ xas_for_each(&xas, page, end - 1) {
+ if (xas_retry(&xas, page))
+ continue;
+ if (xa_is_value(page))
+ swapped++;
+
+ if (need_resched()) {
+ xas_pause(&xas);
+ cond_resched_rcu();
+ }
+ }
+
+ rcu_read_unlock();
+
+ return swapped << PAGE_SHIFT;
+}
+
+/*
+ * Determine (in bytes) how many of the shmem object's pages mapped by the
+ * given vma is swapped out.
+ *
+ * This is safe to call without i_mutex or the i_pages lock thanks to RCU,
+ * as long as the inode doesn't go away and racy results are not a problem.
+ */
+unsigned long shmem_swap_usage(struct vm_area_struct *vma)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ struct address_space *mapping = inode->i_mapping;
+ unsigned long swapped;
+
+ /* Be careful as we don't hold info->lock */
+ swapped = READ_ONCE(info->swapped);
+
+ /*
+ * The easier cases are when the shmem object has nothing in swap, or
+ * the vma maps it whole. Then we can simply use the stats that we
+ * already track.
+ */
+ if (!swapped)
+ return 0;
+
+ if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
+ return swapped << PAGE_SHIFT;
+
+ /* Here comes the more involved part */
+ return shmem_partial_swap_usage(mapping,
+ linear_page_index(vma, vma->vm_start),
+ linear_page_index(vma, vma->vm_end));
+}
+
+/*
+ * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
+ */
+void shmem_unlock_mapping(struct address_space *mapping)
+{
+ struct pagevec pvec;
+ pgoff_t indices[PAGEVEC_SIZE];
+ pgoff_t index = 0;
+
+ pagevec_init(&pvec);
+ /*
+ * Minor point, but we might as well stop if someone else SHM_LOCKs it.
+ */
+ while (!mapping_unevictable(mapping)) {
+ /*
+ * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it
+ * has finished, if it hits a row of PAGEVEC_SIZE swap entries.
+ */
+ pvec.nr = find_get_entries(mapping, index,
+ PAGEVEC_SIZE, pvec.pages, indices);
+ if (!pvec.nr)
+ break;
+ index = indices[pvec.nr - 1] + 1;
+ pagevec_remove_exceptionals(&pvec);
+ check_move_unevictable_pages(&pvec);
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+}
+
+/*
+ * Check whether a hole-punch or truncation needs to split a huge page,
+ * returning true if no split was required, or the split has been successful.
+ *
+ * Eviction (or truncation to 0 size) should never need to split a huge page;
+ * but in rare cases might do so, if shmem_undo_range() failed to trylock on
+ * head, and then succeeded to trylock on tail.
+ *
+ * A split can only succeed when there are no additional references on the
+ * huge page: so the split below relies upon find_get_entries() having stopped
+ * when it found a subpage of the huge page, without getting further references.
+ */
+static bool shmem_punch_compound(struct page *page, pgoff_t start, pgoff_t end)
+{
+ if (!PageTransCompound(page))
+ return true;
+
+ /* Just proceed to delete a huge page wholly within the range punched */
+ if (PageHead(page) &&
+ page->index >= start && page->index + HPAGE_PMD_NR <= end)
+ return true;
+
+ /* Try to split huge page, so we can truly punch the hole or truncate */
+ return split_huge_page(page) >= 0;
+}
+
+/*
+ * Remove range of pages and swap entries from page cache, and free them.
+ * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
+ */
+static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
+ bool unfalloc)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ pgoff_t end = (lend + 1) >> PAGE_SHIFT;
+ unsigned int partial_start = lstart & (PAGE_SIZE - 1);
+ unsigned int partial_end = (lend + 1) & (PAGE_SIZE - 1);
+ struct pagevec pvec;
+ pgoff_t indices[PAGEVEC_SIZE];
+ long nr_swaps_freed = 0;
+ pgoff_t index;
+ int i;
+
+ if (lend == -1)
+ end = -1; /* unsigned, so actually very big */
+
+ pagevec_init(&pvec);
+ index = start;
+ while (index < end) {
+ pvec.nr = find_get_entries(mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE),
+ pvec.pages, indices);
+ if (!pvec.nr)
+ break;
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
+
+ index = indices[i];
+ if (index >= end)
+ break;
+
+ if (xa_is_value(page)) {
+ if (unfalloc)
+ continue;
+ nr_swaps_freed += !shmem_free_swap(mapping,
+ index, page);
+ continue;
+ }
+
+ VM_BUG_ON_PAGE(page_to_pgoff(page) != index, page);
+
+ if (!trylock_page(page))
+ continue;
+
+ if ((!unfalloc || !PageUptodate(page)) &&
+ page_mapping(page) == mapping) {
+ VM_BUG_ON_PAGE(PageWriteback(page), page);
+ if (shmem_punch_compound(page, start, end))
+ truncate_inode_page(mapping, page);
+ }
+ unlock_page(page);
+ }
+ pagevec_remove_exceptionals(&pvec);
+ pagevec_release(&pvec);
+ cond_resched();
+ index++;
+ }
+
+ if (partial_start) {
+ struct page *page = NULL;
+ shmem_getpage(inode, start - 1, &page, SGP_READ);
+ if (page) {
+ unsigned int top = PAGE_SIZE;
+ if (start > end) {
+ top = partial_end;
+ partial_end = 0;
+ }
+ zero_user_segment(page, partial_start, top);
+ set_page_dirty(page);
+ unlock_page(page);
+ put_page(page);
+ }
+ }
+ if (partial_end) {
+ struct page *page = NULL;
+ shmem_getpage(inode, end, &page, SGP_READ);
+ if (page) {
+ zero_user_segment(page, 0, partial_end);
+ set_page_dirty(page);
+ unlock_page(page);
+ put_page(page);
+ }
+ }
+ if (start >= end)
+ return;
+
+ index = start;
+ while (index < end) {
+ cond_resched();
+
+ pvec.nr = find_get_entries(mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE),
+ pvec.pages, indices);
+ if (!pvec.nr) {
+ /* If all gone or hole-punch or unfalloc, we're done */
+ if (index == start || end != -1)
+ break;
+ /* But if truncating, restart to make sure all gone */
+ index = start;
+ continue;
+ }
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
+
+ index = indices[i];
+ if (index >= end)
+ break;
+
+ if (xa_is_value(page)) {
+ if (unfalloc)
+ continue;
+ if (shmem_free_swap(mapping, index, page)) {
+ /* Swap was replaced by page: retry */
+ index--;
+ break;
+ }
+ nr_swaps_freed++;
+ continue;
+ }
+
+ lock_page(page);
+
+ if (!unfalloc || !PageUptodate(page)) {
+ if (page_mapping(page) != mapping) {
+ /* Page was replaced by swap: retry */
+ unlock_page(page);
+ index--;
+ break;
+ }
+ VM_BUG_ON_PAGE(PageWriteback(page), page);
+ if (shmem_punch_compound(page, start, end))
+ truncate_inode_page(mapping, page);
+ else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
+ /* Wipe the page and don't get stuck */
+ clear_highpage(page);
+ flush_dcache_page(page);
+ set_page_dirty(page);
+ if (index <
+ round_up(start, HPAGE_PMD_NR))
+ start = index + 1;
+ }
+ }
+ unlock_page(page);
+ }
+ pagevec_remove_exceptionals(&pvec);
+ pagevec_release(&pvec);
+ index++;
+ }
+
+ spin_lock_irq(&info->lock);
+ info->swapped -= nr_swaps_freed;
+ shmem_recalc_inode(inode);
+ spin_unlock_irq(&info->lock);
+}
+
+void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
+{
+ shmem_undo_range(inode, lstart, lend, false);
+ inode->i_ctime = inode->i_mtime = current_time(inode);
+}
+EXPORT_SYMBOL_GPL(shmem_truncate_range);
+
+static int shmem_getattr(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_flags)
+{
+ struct inode *inode = path->dentry->d_inode;
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ struct shmem_sb_info *sb_info = SHMEM_SB(inode->i_sb);
+
+ if (info->alloced - info->swapped != inode->i_mapping->nrpages) {
+ spin_lock_irq(&info->lock);
+ shmem_recalc_inode(inode);
+ spin_unlock_irq(&info->lock);
+ }
+ generic_fillattr(inode, stat);
+
+ if (is_huge_enabled(sb_info))
+ stat->blksize = HPAGE_PMD_SIZE;
+
+ return 0;
+}
+
+static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
+{
+ struct inode *inode = d_inode(dentry);
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ int error;
+
+ error = setattr_prepare(dentry, attr);
+ if (error)
+ return error;
+
+ if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
+ loff_t oldsize = inode->i_size;
+ loff_t newsize = attr->ia_size;
+
+ /* protected by i_mutex */
+ if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
+ (newsize > oldsize && (info->seals & F_SEAL_GROW)))
+ return -EPERM;
+
+ if (newsize != oldsize) {
+ error = shmem_reacct_size(SHMEM_I(inode)->flags,
+ oldsize, newsize);
+ if (error)
+ return error;
+ i_size_write(inode, newsize);
+ inode->i_ctime = inode->i_mtime = current_time(inode);
+ }
+ if (newsize <= oldsize) {
+ loff_t holebegin = round_up(newsize, PAGE_SIZE);
+ if (oldsize > holebegin)
+ unmap_mapping_range(inode->i_mapping,
+ holebegin, 0, 1);
+ if (info->alloced)
+ shmem_truncate_range(inode,
+ newsize, (loff_t)-1);
+ /* unmap again to remove racily COWed private pages */
+ if (oldsize > holebegin)
+ unmap_mapping_range(inode->i_mapping,
+ holebegin, 0, 1);
+
+ /*
+ * Part of the huge page can be beyond i_size: subject
+ * to shrink under memory pressure.
+ */
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
+ spin_lock(&sbinfo->shrinklist_lock);
+ /*
+ * _careful to defend against unlocked access to
+ * ->shrink_list in shmem_unused_huge_shrink()
+ */
+ if (list_empty_careful(&info->shrinklist)) {
+ list_add_tail(&info->shrinklist,
+ &sbinfo->shrinklist);
+ sbinfo->shrinklist_len++;
+ }
+ spin_unlock(&sbinfo->shrinklist_lock);
+ }
+ }
+ }
+
+ setattr_copy(inode, attr);
+ if (attr->ia_valid & ATTR_MODE)
+ error = posix_acl_chmod(inode, inode->i_mode);
+ return error;
+}
+
+static void shmem_evict_inode(struct inode *inode)
+{
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+
+ if (inode->i_mapping->a_ops == &shmem_aops) {
+ shmem_unacct_size(info->flags, inode->i_size);
+ inode->i_size = 0;
+ shmem_truncate_range(inode, 0, (loff_t)-1);
+ if (!list_empty(&info->shrinklist)) {
+ spin_lock(&sbinfo->shrinklist_lock);
+ if (!list_empty(&info->shrinklist)) {
+ list_del_init(&info->shrinklist);
+ sbinfo->shrinklist_len--;
+ }
+ spin_unlock(&sbinfo->shrinklist_lock);
+ }
+ while (!list_empty(&info->swaplist)) {
+ /* Wait while shmem_unuse() is scanning this inode... */
+ wait_var_event(&info->stop_eviction,
+ !atomic_read(&info->stop_eviction));
+ mutex_lock(&shmem_swaplist_mutex);
+ /* ...but beware of the race if we peeked too early */
+ if (!atomic_read(&info->stop_eviction))
+ list_del_init(&info->swaplist);
+ mutex_unlock(&shmem_swaplist_mutex);
+ }
+ }
+
+ simple_xattrs_free(&info->xattrs);
+ WARN_ON(inode->i_blocks);
+ shmem_free_inode(inode->i_sb);
+ clear_inode(inode);
+}
+
+extern struct swap_info_struct *swap_info[];
+
+static int shmem_find_swap_entries(struct address_space *mapping,
+ pgoff_t start, unsigned int nr_entries,
+ struct page **entries, pgoff_t *indices,
+ unsigned int type, bool frontswap)
+{
+ XA_STATE(xas, &mapping->i_pages, start);
+ struct page *page;
+ swp_entry_t entry;
+ unsigned int ret = 0;
+
+ if (!nr_entries)
+ return 0;
+
+ rcu_read_lock();
+ xas_for_each(&xas, page, ULONG_MAX) {
+ if (xas_retry(&xas, page))
+ continue;
+
+ if (!xa_is_value(page))
+ continue;
+
+ entry = radix_to_swp_entry(page);
+ if (swp_type(entry) != type)
+ continue;
+ if (frontswap &&
+ !frontswap_test(swap_info[type], swp_offset(entry)))
+ continue;
+
+ indices[ret] = xas.xa_index;
+ entries[ret] = page;
+
+ if (need_resched()) {
+ xas_pause(&xas);
+ cond_resched_rcu();
+ }
+ if (++ret == nr_entries)
+ break;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/*
+ * Move the swapped pages for an inode to page cache. Returns the count
+ * of pages swapped in, or the error in case of failure.
+ */
+static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec,
+ pgoff_t *indices)
+{
+ int i = 0;
+ int ret = 0;
+ int error = 0;
+ struct address_space *mapping = inode->i_mapping;
+
+ for (i = 0; i < pvec.nr; i++) {
+ struct page *page = pvec.pages[i];
+
+ if (!xa_is_value(page))
+ continue;
+ error = shmem_swapin_page(inode, indices[i],
+ &page, SGP_CACHE,
+ mapping_gfp_mask(mapping),
+ NULL, NULL);
+ if (error == 0) {
+ unlock_page(page);
+ put_page(page);
+ ret++;
+ }
+ if (error == -ENOMEM)
+ break;
+ error = 0;
+ }
+ return error ? error : ret;
+}
+
+/*
+ * If swap found in inode, free it and move page from swapcache to filecache.
+ */
+static int shmem_unuse_inode(struct inode *inode, unsigned int type,
+ bool frontswap, unsigned long *fs_pages_to_unuse)
+{
+ struct address_space *mapping = inode->i_mapping;
+ pgoff_t start = 0;
+ struct pagevec pvec;
+ pgoff_t indices[PAGEVEC_SIZE];
+ bool frontswap_partial = (frontswap && *fs_pages_to_unuse > 0);
+ int ret = 0;
+
+ pagevec_init(&pvec);
+ do {
+ unsigned int nr_entries = PAGEVEC_SIZE;
+
+ if (frontswap_partial && *fs_pages_to_unuse < PAGEVEC_SIZE)
+ nr_entries = *fs_pages_to_unuse;
+
+ pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries,
+ pvec.pages, indices,
+ type, frontswap);
+ if (pvec.nr == 0) {
+ ret = 0;
+ break;
+ }
+
+ ret = shmem_unuse_swap_entries(inode, pvec, indices);
+ if (ret < 0)
+ break;
+
+ if (frontswap_partial) {
+ *fs_pages_to_unuse -= ret;
+ if (*fs_pages_to_unuse == 0) {
+ ret = FRONTSWAP_PAGES_UNUSED;
+ break;
+ }
+ }
+
+ start = indices[pvec.nr - 1];
+ } while (true);
+
+ return ret;
+}
+
+/*
+ * Read all the shared memory data that resides in the swap
+ * device 'type' back into memory, so the swap device can be
+ * unused.
+ */
+int shmem_unuse(unsigned int type, bool frontswap,
+ unsigned long *fs_pages_to_unuse)
+{
+ struct shmem_inode_info *info, *next;
+ int error = 0;
+
+ if (list_empty(&shmem_swaplist))
+ return 0;
+
+ mutex_lock(&shmem_swaplist_mutex);
+ list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
+ if (!info->swapped) {
+ list_del_init(&info->swaplist);
+ continue;
+ }
+ /*
+ * Drop the swaplist mutex while searching the inode for swap;
+ * but before doing so, make sure shmem_evict_inode() will not
+ * remove placeholder inode from swaplist, nor let it be freed
+ * (igrab() would protect from unlink, but not from unmount).
+ */
+ atomic_inc(&info->stop_eviction);
+ mutex_unlock(&shmem_swaplist_mutex);
+
+ error = shmem_unuse_inode(&info->vfs_inode, type, frontswap,
+ fs_pages_to_unuse);
+ cond_resched();
+
+ mutex_lock(&shmem_swaplist_mutex);
+ next = list_next_entry(info, swaplist);
+ if (!info->swapped)
+ list_del_init(&info->swaplist);
+ if (atomic_dec_and_test(&info->stop_eviction))
+ wake_up_var(&info->stop_eviction);
+ if (error)
+ break;
+ }
+ mutex_unlock(&shmem_swaplist_mutex);
+
+ return error;
+}
+
+/*
+ * Move the page from the page cache to the swap cache.
+ */
+static int shmem_writepage(struct page *page, struct writeback_control *wbc)
+{
+ struct shmem_inode_info *info;
+ struct address_space *mapping;
+ struct inode *inode;
+ swp_entry_t swap;
+ pgoff_t index;
+
+ VM_BUG_ON_PAGE(PageCompound(page), page);
+ BUG_ON(!PageLocked(page));
+ mapping = page->mapping;
+ index = page->index;
+ inode = mapping->host;
+ info = SHMEM_I(inode);
+ if (info->flags & VM_LOCKED)
+ goto redirty;
+ if (!total_swap_pages)
+ goto redirty;
+
+ /*
+ * Our capabilities prevent regular writeback or sync from ever calling
+ * shmem_writepage; but a stacking filesystem might use ->writepage of
+ * its underlying filesystem, in which case tmpfs should write out to
+ * swap only in response to memory pressure, and not for the writeback
+ * threads or sync.
+ */
+ if (!wbc->for_reclaim) {
+ WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
+ goto redirty;
+ }
+
+ /*
+ * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
+ * value into swapfile.c, the only way we can correctly account for a
+ * fallocated page arriving here is now to initialize it and write it.
+ *
+ * That's okay for a page already fallocated earlier, but if we have
+ * not yet completed the fallocation, then (a) we want to keep track
+ * of this page in case we have to undo it, and (b) it may not be a
+ * good idea to continue anyway, once we're pushing into swap. So
+ * reactivate the page, and let shmem_fallocate() quit when too many.
+ */
+ if (!PageUptodate(page)) {
+ if (inode->i_private) {
+ struct shmem_falloc *shmem_falloc;
+ spin_lock(&inode->i_lock);
+ shmem_falloc = inode->i_private;
+ if (shmem_falloc &&
+ !shmem_falloc->waitq &&
+ index >= shmem_falloc->start &&
+ index < shmem_falloc->next)
+ shmem_falloc->nr_unswapped++;
+ else
+ shmem_falloc = NULL;
+ spin_unlock(&inode->i_lock);
+ if (shmem_falloc)
+ goto redirty;
+ }
+ clear_highpage(page);
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ }
+
+ swap = get_swap_page(page);
+ if (!swap.val)
+ goto redirty;
+
+ /*
+ * Add inode to shmem_unuse()'s list of swapped-out inodes,
+ * if it's not already there. Do it now before the page is
+ * moved to swap cache, when its pagelock no longer protects
+ * the inode from eviction. But don't unlock the mutex until
+ * we've incremented swapped, because shmem_unuse_inode() will
+ * prune a !swapped inode from the swaplist under this mutex.
+ */
+ mutex_lock(&shmem_swaplist_mutex);
+ if (list_empty(&info->swaplist))
+ list_add(&info->swaplist, &shmem_swaplist);
+
+ if (add_to_swap_cache(page, swap,
+ __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
+ NULL) == 0) {
+ spin_lock_irq(&info->lock);
+ shmem_recalc_inode(inode);
+ info->swapped++;
+ spin_unlock_irq(&info->lock);
+
+ swap_shmem_alloc(swap);
+ shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
+
+ mutex_unlock(&shmem_swaplist_mutex);
+ BUG_ON(page_mapped(page));
+ swap_writepage(page, wbc);
+ return 0;
+ }
+
+ mutex_unlock(&shmem_swaplist_mutex);
+ put_swap_page(page, swap);
+redirty:
+ set_page_dirty(page);
+ if (wbc->for_reclaim)
+ return AOP_WRITEPAGE_ACTIVATE; /* Return with page locked */
+ unlock_page(page);
+ return 0;
+}
+
+#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
+static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
+{
+ char buffer[64];
+
+ if (!mpol || mpol->mode == MPOL_DEFAULT)
+ return; /* show nothing */
+
+ mpol_to_str(buffer, sizeof(buffer), mpol);
+
+ seq_printf(seq, ",mpol=%s", buffer);
+}
+
+static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
+{
+ struct mempolicy *mpol = NULL;
+ if (sbinfo->mpol) {
+ spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */
+ mpol = sbinfo->mpol;
+ mpol_get(mpol);
+ spin_unlock(&sbinfo->stat_lock);
+ }
+ return mpol;
+}
+#else /* !CONFIG_NUMA || !CONFIG_TMPFS */
+static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
+{
+}
+static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
+{
+ return NULL;
+}
+#endif /* CONFIG_NUMA && CONFIG_TMPFS */
+#ifndef CONFIG_NUMA
+#define vm_policy vm_private_data
+#endif
+
+static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
+ struct shmem_inode_info *info, pgoff_t index)
+{
+ /* Create a pseudo vma that just contains the policy */
+ vma_init(vma, NULL);
+ /* Bias interleave by inode number to distribute better across nodes */
+ vma->vm_pgoff = index + info->vfs_inode.i_ino;
+ vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
+}
+
+static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
+{
+ /* Drop reference taken by mpol_shared_policy_lookup() */
+ mpol_cond_put(vma->vm_policy);
+}
+
+static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
+ struct shmem_inode_info *info, pgoff_t index)
+{
+ struct vm_area_struct pvma;
+ struct page *page;
+ struct vm_fault vmf;
+
+ shmem_pseudo_vma_init(&pvma, info, index);
+ vmf.vma = &pvma;
+ vmf.address = 0;
+ page = swap_cluster_readahead(swap, gfp, &vmf);
+ shmem_pseudo_vma_destroy(&pvma);
+
+ return page;
+}
+
+static struct page *shmem_alloc_hugepage(gfp_t gfp,
+ struct shmem_inode_info *info, pgoff_t index)
+{
+ struct vm_area_struct pvma;
+ struct address_space *mapping = info->vfs_inode.i_mapping;
+ pgoff_t hindex;
+ struct page *page;
+
+ hindex = round_down(index, HPAGE_PMD_NR);
+ if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1,
+ XA_PRESENT))
+ return NULL;
+
+ shmem_pseudo_vma_init(&pvma, info, hindex);
+ page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
+ HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
+ shmem_pseudo_vma_destroy(&pvma);
+ if (page)
+ prep_transhuge_page(page);
+ else
+ count_vm_event(THP_FILE_FALLBACK);
+ return page;
+}
+
+static struct page *shmem_alloc_page(gfp_t gfp,
+ struct shmem_inode_info *info, pgoff_t index)
+{
+ struct vm_area_struct pvma;
+ struct page *page;
+
+ shmem_pseudo_vma_init(&pvma, info, index);
+ page = alloc_page_vma(gfp, &pvma, 0);
+ shmem_pseudo_vma_destroy(&pvma);
+
+ return page;
+}
+
+static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
+ struct inode *inode,
+ pgoff_t index, bool huge)
+{
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ struct page *page;
+ int nr;
+ int err = -ENOSPC;
+
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+ huge = false;
+ nr = huge ? HPAGE_PMD_NR : 1;
+
+ if (!shmem_inode_acct_block(inode, nr))
+ goto failed;
+
+ if (huge)
+ page = shmem_alloc_hugepage(gfp, info, index);
+ else
+ page = shmem_alloc_page(gfp, info, index);
+ if (page) {
+ __SetPageLocked(page);
+ __SetPageSwapBacked(page);
+ return page;
+ }
+
+ err = -ENOMEM;
+ shmem_inode_unacct_blocks(inode, nr);
+failed:
+ return ERR_PTR(err);
+}
+
+/*
+ * When a page is moved from swapcache to shmem filecache (either by the
+ * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of
+ * shmem_unuse_inode()), it may have been read in earlier from swap, in
+ * ignorance of the mapping it belongs to. If that mapping has special
+ * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
+ * we may need to copy to a suitable page before moving to filecache.
+ *
+ * In a future release, this may well be extended to respect cpuset and
+ * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
+ * but for now it is a simple matter of zone.
+ */
+static bool shmem_should_replace_page(struct page *page, gfp_t gfp)
+{
+ return page_zonenum(page) > gfp_zone(gfp);
+}
+
+static int shmem_replace_page(struct page **pagep, gfp_t gfp,
+ struct shmem_inode_info *info, pgoff_t index)
+{
+ struct page *oldpage, *newpage;
+ struct address_space *swap_mapping;
+ swp_entry_t entry;
+ pgoff_t swap_index;
+ int error;
+
+ oldpage = *pagep;
+ entry.val = page_private(oldpage);
+ swap_index = swp_offset(entry);
+ swap_mapping = page_mapping(oldpage);
+
+ /*
+ * We have arrived here because our zones are constrained, so don't
+ * limit chance of success by further cpuset and node constraints.
+ */
+ gfp &= ~GFP_CONSTRAINT_MASK;
+ newpage = shmem_alloc_page(gfp, info, index);
+ if (!newpage)
+ return -ENOMEM;
+
+ get_page(newpage);
+ copy_highpage(newpage, oldpage);
+ flush_dcache_page(newpage);
+
+ __SetPageLocked(newpage);
+ __SetPageSwapBacked(newpage);
+ SetPageUptodate(newpage);
+ set_page_private(newpage, entry.val);
+ SetPageSwapCache(newpage);
+
+ /*
+ * Our caller will very soon move newpage out of swapcache, but it's
+ * a nice clean interface for us to replace oldpage by newpage there.
+ */
+ xa_lock_irq(&swap_mapping->i_pages);
+ error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage);
+ if (!error) {
+ mem_cgroup_migrate(oldpage, newpage);
+ __inc_lruvec_page_state(newpage, NR_FILE_PAGES);
+ __dec_lruvec_page_state(oldpage, NR_FILE_PAGES);
+ }
+ xa_unlock_irq(&swap_mapping->i_pages);
+
+ if (unlikely(error)) {
+ /*
+ * Is this possible? I think not, now that our callers check
+ * both PageSwapCache and page_private after getting page lock;
+ * but be defensive. Reverse old to newpage for clear and free.
+ */
+ oldpage = newpage;
+ } else {
+ lru_cache_add(newpage);
+ *pagep = newpage;
+ }
+
+ ClearPageSwapCache(oldpage);
+ set_page_private(oldpage, 0);
+
+ unlock_page(oldpage);
+ put_page(oldpage);
+ put_page(oldpage);
+ return error;
+}
+
+/*
+ * Swap in the page pointed to by *pagep.
+ * Caller has to make sure that *pagep contains a valid swapped page.
+ * Returns 0 and the page in pagep if success. On failure, returns the
+ * error code and NULL in *pagep.
+ */
+static int shmem_swapin_page(struct inode *inode, pgoff_t index,
+ struct page **pagep, enum sgp_type sgp,
+ gfp_t gfp, struct vm_area_struct *vma,
+ vm_fault_t *fault_type)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm;
+ struct page *page;
+ swp_entry_t swap;
+ int error;
+
+ VM_BUG_ON(!*pagep || !xa_is_value(*pagep));
+ swap = radix_to_swp_entry(*pagep);
+ *pagep = NULL;
+
+ /* Look it up and read it in.. */
+ page = lookup_swap_cache(swap, NULL, 0);
+ if (!page) {
+ /* Or update major stats only when swapin succeeds?? */
+ if (fault_type) {
+ *fault_type |= VM_FAULT_MAJOR;
+ count_vm_event(PGMAJFAULT);
+ count_memcg_event_mm(charge_mm, PGMAJFAULT);
+ }
+ /* Here we actually start the io */
+ page = shmem_swapin(swap, gfp, info, index);
+ if (!page) {
+ error = -ENOMEM;
+ goto failed;
+ }
+ }
+
+ /* We have to do this with page locked to prevent races */
+ lock_page(page);
+ if (!PageSwapCache(page) || page_private(page) != swap.val ||
+ !shmem_confirm_swap(mapping, index, swap)) {
+ error = -EEXIST;
+ goto unlock;
+ }
+ if (!PageUptodate(page)) {
+ error = -EIO;
+ goto failed;
+ }
+ wait_on_page_writeback(page);
+
+ /*
+ * Some architectures may have to restore extra metadata to the
+ * physical page after reading from swap.
+ */
+ arch_swap_restore(swap, page);
+
+ if (shmem_should_replace_page(page, gfp)) {
+ error = shmem_replace_page(&page, gfp, info, index);
+ if (error)
+ goto failed;
+ }
+
+ error = shmem_add_to_page_cache(page, mapping, index,
+ swp_to_radix_entry(swap), gfp,
+ charge_mm);
+ if (error)
+ goto failed;
+
+ spin_lock_irq(&info->lock);
+ info->swapped--;
+ shmem_recalc_inode(inode);
+ spin_unlock_irq(&info->lock);
+
+ if (sgp == SGP_WRITE)
+ mark_page_accessed(page);
+
+ delete_from_swap_cache(page);
+ set_page_dirty(page);
+ swap_free(swap);
+
+ *pagep = page;
+ return 0;
+failed:
+ if (!shmem_confirm_swap(mapping, index, swap))
+ error = -EEXIST;
+unlock:
+ if (page) {
+ unlock_page(page);
+ put_page(page);
+ }
+
+ return error;
+}
+
+/*
+ * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
+ *
+ * If we allocate a new one we do not mark it dirty. That's up to the
+ * vm. If we swap it in we mark it dirty since we also free the swap
+ * entry since a page cannot live in both the swap and page cache.
+ *
+ * vmf and fault_type are only supplied by shmem_fault:
+ * otherwise they are NULL.
+ */
+static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
+ struct page **pagep, enum sgp_type sgp, gfp_t gfp,
+ struct vm_area_struct *vma, struct vm_fault *vmf,
+ vm_fault_t *fault_type)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ struct shmem_sb_info *sbinfo;
+ struct mm_struct *charge_mm;
+ struct page *page;
+ enum sgp_type sgp_huge = sgp;
+ pgoff_t hindex = index;
+ int error;
+ int once = 0;
+ int alloced = 0;
+
+ if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
+ return -EFBIG;
+ if (sgp == SGP_NOHUGE || sgp == SGP_HUGE)
+ sgp = SGP_CACHE;
+repeat:
+ if (sgp <= SGP_CACHE &&
+ ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
+ return -EINVAL;
+ }
+
+ sbinfo = SHMEM_SB(inode->i_sb);
+ charge_mm = vma ? vma->vm_mm : current->mm;
+
+ page = find_lock_entry(mapping, index);
+ if (xa_is_value(page)) {
+ error = shmem_swapin_page(inode, index, &page,
+ sgp, gfp, vma, fault_type);
+ if (error == -EEXIST)
+ goto repeat;
+
+ *pagep = page;
+ return error;
+ }
+
+ if (page)
+ hindex = page->index;
+ if (page && sgp == SGP_WRITE)
+ mark_page_accessed(page);
+
+ /* fallocated page? */
+ if (page && !PageUptodate(page)) {
+ if (sgp != SGP_READ)
+ goto clear;
+ unlock_page(page);
+ put_page(page);
+ page = NULL;
+ hindex = index;
+ }
+ if (page || sgp == SGP_READ)
+ goto out;
+
+ /*
+ * Fast cache lookup did not find it:
+ * bring it back from swap or allocate.
+ */
+
+ if (vma && userfaultfd_missing(vma)) {
+ *fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
+ return 0;
+ }
+
+ /* shmem_symlink() */
+ if (mapping->a_ops != &shmem_aops)
+ goto alloc_nohuge;
+ if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE)
+ goto alloc_nohuge;
+ if (shmem_huge == SHMEM_HUGE_FORCE)
+ goto alloc_huge;
+ switch (sbinfo->huge) {
+ case SHMEM_HUGE_NEVER:
+ goto alloc_nohuge;
+ case SHMEM_HUGE_WITHIN_SIZE: {
+ loff_t i_size;
+ pgoff_t off;
+
+ off = round_up(index, HPAGE_PMD_NR);
+ i_size = round_up(i_size_read(inode), PAGE_SIZE);
+ if (i_size >= HPAGE_PMD_SIZE &&
+ i_size >> PAGE_SHIFT >= off)
+ goto alloc_huge;
+
+ fallthrough;
+ }
+ case SHMEM_HUGE_ADVISE:
+ if (sgp_huge == SGP_HUGE)
+ goto alloc_huge;
+ /* TODO: implement fadvise() hints */
+ goto alloc_nohuge;
+ }
+
+alloc_huge:
+ page = shmem_alloc_and_acct_page(gfp, inode, index, true);
+ if (IS_ERR(page)) {
+alloc_nohuge:
+ page = shmem_alloc_and_acct_page(gfp, inode,
+ index, false);
+ }
+ if (IS_ERR(page)) {
+ int retry = 5;
+
+ error = PTR_ERR(page);
+ page = NULL;
+ if (error != -ENOSPC)
+ goto unlock;
+ /*
+ * Try to reclaim some space by splitting a huge page
+ * beyond i_size on the filesystem.
+ */
+ while (retry--) {
+ int ret;
+
+ ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
+ if (ret == SHRINK_STOP)
+ break;
+ if (ret)
+ goto alloc_nohuge;
+ }
+ goto unlock;
+ }
+
+ if (PageTransHuge(page))
+ hindex = round_down(index, HPAGE_PMD_NR);
+ else
+ hindex = index;
+
+ if (sgp == SGP_WRITE)
+ __SetPageReferenced(page);
+
+ error = shmem_add_to_page_cache(page, mapping, hindex,
+ NULL, gfp & GFP_RECLAIM_MASK,
+ charge_mm);
+ if (error)
+ goto unacct;
+ lru_cache_add(page);
+
+ spin_lock_irq(&info->lock);
+ info->alloced += compound_nr(page);
+ inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
+ shmem_recalc_inode(inode);
+ spin_unlock_irq(&info->lock);
+ alloced = true;
+
+ if (PageTransHuge(page) &&
+ DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
+ hindex + HPAGE_PMD_NR - 1) {
+ /*
+ * Part of the huge page is beyond i_size: subject
+ * to shrink under memory pressure.
+ */
+ spin_lock(&sbinfo->shrinklist_lock);
+ /*
+ * _careful to defend against unlocked access to
+ * ->shrink_list in shmem_unused_huge_shrink()
+ */
+ if (list_empty_careful(&info->shrinklist)) {
+ list_add_tail(&info->shrinklist,
+ &sbinfo->shrinklist);
+ sbinfo->shrinklist_len++;
+ }
+ spin_unlock(&sbinfo->shrinklist_lock);
+ }
+
+ /*
+ * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
+ */
+ if (sgp == SGP_FALLOC)
+ sgp = SGP_WRITE;
+clear:
+ /*
+ * Let SGP_WRITE caller clear ends if write does not fill page;
+ * but SGP_FALLOC on a page fallocated earlier must initialize
+ * it now, lest undo on failure cancel our earlier guarantee.
+ */
+ if (sgp != SGP_WRITE && !PageUptodate(page)) {
+ int i;
+
+ for (i = 0; i < compound_nr(page); i++) {
+ clear_highpage(page + i);
+ flush_dcache_page(page + i);
+ }
+ SetPageUptodate(page);
+ }
+
+ /* Perhaps the file has been truncated since we checked */
+ if (sgp <= SGP_CACHE &&
+ ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
+ if (alloced) {
+ ClearPageDirty(page);
+ delete_from_page_cache(page);
+ spin_lock_irq(&info->lock);
+ shmem_recalc_inode(inode);
+ spin_unlock_irq(&info->lock);
+ }
+ error = -EINVAL;
+ goto unlock;
+ }
+out:
+ *pagep = page + index - hindex;
+ return 0;
+
+ /*
+ * Error recovery.
+ */
+unacct:
+ shmem_inode_unacct_blocks(inode, compound_nr(page));
+
+ if (PageTransHuge(page)) {
+ unlock_page(page);
+ put_page(page);
+ goto alloc_nohuge;
+ }
+unlock:
+ if (page) {
+ unlock_page(page);
+ put_page(page);
+ }
+ if (error == -ENOSPC && !once++) {
+ spin_lock_irq(&info->lock);
+ shmem_recalc_inode(inode);
+ spin_unlock_irq(&info->lock);
+ goto repeat;
+ }
+ if (error == -EEXIST)
+ goto repeat;
+ return error;
+}
+
+/*
+ * This is like autoremove_wake_function, but it removes the wait queue
+ * entry unconditionally - even if something else had already woken the
+ * target.
+ */
+static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
+{
+ int ret = default_wake_function(wait, mode, sync, key);
+ list_del_init(&wait->entry);
+ return ret;
+}
+
+static vm_fault_t shmem_fault(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct inode *inode = file_inode(vma->vm_file);
+ gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
+ enum sgp_type sgp;
+ int err;
+ vm_fault_t ret = VM_FAULT_LOCKED;
+
+ /*
+ * Trinity finds that probing a hole which tmpfs is punching can
+ * prevent the hole-punch from ever completing: which in turn
+ * locks writers out with its hold on i_mutex. So refrain from
+ * faulting pages into the hole while it's being punched. Although
+ * shmem_undo_range() does remove the additions, it may be unable to
+ * keep up, as each new page needs its own unmap_mapping_range() call,
+ * and the i_mmap tree grows ever slower to scan if new vmas are added.
+ *
+ * It does not matter if we sometimes reach this check just before the
+ * hole-punch begins, so that one fault then races with the punch:
+ * we just need to make racing faults a rare case.
+ *
+ * The implementation below would be much simpler if we just used a
+ * standard mutex or completion: but we cannot take i_mutex in fault,
+ * and bloating every shmem inode for this unlikely case would be sad.
+ */
+ if (unlikely(inode->i_private)) {
+ struct shmem_falloc *shmem_falloc;
+
+ spin_lock(&inode->i_lock);
+ shmem_falloc = inode->i_private;
+ if (shmem_falloc &&
+ shmem_falloc->waitq &&
+ vmf->pgoff >= shmem_falloc->start &&
+ vmf->pgoff < shmem_falloc->next) {
+ struct file *fpin;
+ wait_queue_head_t *shmem_falloc_waitq;
+ DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
+
+ ret = VM_FAULT_NOPAGE;
+ fpin = maybe_unlock_mmap_for_io(vmf, NULL);
+ if (fpin)
+ ret = VM_FAULT_RETRY;
+
+ shmem_falloc_waitq = shmem_falloc->waitq;
+ prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
+ TASK_UNINTERRUPTIBLE);
+ spin_unlock(&inode->i_lock);
+ schedule();
+
+ /*
+ * shmem_falloc_waitq points into the shmem_fallocate()
+ * stack of the hole-punching task: shmem_falloc_waitq
+ * is usually invalid by the time we reach here, but
+ * finish_wait() does not dereference it in that case;
+ * though i_lock needed lest racing with wake_up_all().
+ */
+ spin_lock(&inode->i_lock);
+ finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
+ spin_unlock(&inode->i_lock);
+
+ if (fpin)
+ fput(fpin);
+ return ret;
+ }
+ spin_unlock(&inode->i_lock);
+ }
+
+ sgp = SGP_CACHE;
+
+ if ((vma->vm_flags & VM_NOHUGEPAGE) ||
+ test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
+ sgp = SGP_NOHUGE;
+ else if (vma->vm_flags & VM_HUGEPAGE)
+ sgp = SGP_HUGE;
+
+ err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp,
+ gfp, vma, vmf, &ret);
+ if (err)
+ return vmf_error(err);
+ return ret;
+}
+
+unsigned long shmem_get_unmapped_area(struct file *file,
+ unsigned long uaddr, unsigned long len,
+ unsigned long pgoff, unsigned long flags)
+{
+ unsigned long (*get_area)(struct file *,
+ unsigned long, unsigned long, unsigned long, unsigned long);
+ unsigned long addr;
+ unsigned long offset;
+ unsigned long inflated_len;
+ unsigned long inflated_addr;
+ unsigned long inflated_offset;
+
+ if (len > TASK_SIZE)
+ return -ENOMEM;
+
+ get_area = current->mm->get_unmapped_area;
+ addr = get_area(file, uaddr, len, pgoff, flags);
+
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+ return addr;
+ if (IS_ERR_VALUE(addr))
+ return addr;
+ if (addr & ~PAGE_MASK)
+ return addr;
+ if (addr > TASK_SIZE - len)
+ return addr;
+
+ if (shmem_huge == SHMEM_HUGE_DENY)
+ return addr;
+ if (len < HPAGE_PMD_SIZE)
+ return addr;
+ if (flags & MAP_FIXED)
+ return addr;
+ /*
+ * Our priority is to support MAP_SHARED mapped hugely;
+ * and support MAP_PRIVATE mapped hugely too, until it is COWed.
+ * But if caller specified an address hint and we allocated area there
+ * successfully, respect that as before.
+ */
+ if (uaddr == addr)
+ return addr;
+
+ if (shmem_huge != SHMEM_HUGE_FORCE) {
+ struct super_block *sb;
+
+ if (file) {
+ VM_BUG_ON(file->f_op != &shmem_file_operations);
+ sb = file_inode(file)->i_sb;
+ } else {
+ /*
+ * Called directly from mm/mmap.c, or drivers/char/mem.c
+ * for "/dev/zero", to create a shared anonymous object.
+ */
+ if (IS_ERR(shm_mnt))
+ return addr;
+ sb = shm_mnt->mnt_sb;
+ }
+ if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER)
+ return addr;
+ }
+
+ offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1);
+ if (offset && offset + len < 2 * HPAGE_PMD_SIZE)
+ return addr;
+ if ((addr & (HPAGE_PMD_SIZE-1)) == offset)
+ return addr;
+
+ inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE;
+ if (inflated_len > TASK_SIZE)
+ return addr;
+ if (inflated_len < len)
+ return addr;
+
+ inflated_addr = get_area(NULL, uaddr, inflated_len, 0, flags);
+ if (IS_ERR_VALUE(inflated_addr))
+ return addr;
+ if (inflated_addr & ~PAGE_MASK)
+ return addr;
+
+ inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1);
+ inflated_addr += offset - inflated_offset;
+ if (inflated_offset > offset)
+ inflated_addr += HPAGE_PMD_SIZE;
+
+ if (inflated_addr > TASK_SIZE - len)
+ return addr;
+ return inflated_addr;
+}
+
+#ifdef CONFIG_NUMA
+static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
+}
+
+static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ pgoff_t index;
+
+ index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+ return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
+}
+#endif
+
+int shmem_lock(struct file *file, int lock, struct user_struct *user)
+{
+ struct inode *inode = file_inode(file);
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ int retval = -ENOMEM;
+
+ /*
+ * What serializes the accesses to info->flags?
+ * ipc_lock_object() when called from shmctl_do_lock(),
+ * no serialization needed when called from shm_destroy().
+ */
+ if (lock && !(info->flags & VM_LOCKED)) {
+ if (!user_shm_lock(inode->i_size, user))
+ goto out_nomem;
+ info->flags |= VM_LOCKED;
+ mapping_set_unevictable(file->f_mapping);
+ }
+ if (!lock && (info->flags & VM_LOCKED) && user) {
+ user_shm_unlock(inode->i_size, user);
+ info->flags &= ~VM_LOCKED;
+ mapping_clear_unevictable(file->f_mapping);
+ }
+ retval = 0;
+
+out_nomem:
+ return retval;
+}
+
+static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct shmem_inode_info *info = SHMEM_I(file_inode(file));
+ int ret;
+
+ ret = seal_check_future_write(info->seals, vma);
+ if (ret)
+ return ret;
+
+ /* arm64 - allow memory tagging on RAM-based files */
+ vma->vm_flags |= VM_MTE_ALLOWED;
+
+ file_accessed(file);
+ vma->vm_ops = &shmem_vm_ops;
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+ ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
+ (vma->vm_end & HPAGE_PMD_MASK)) {
+ khugepaged_enter(vma, vma->vm_flags);
+ }
+ return 0;
+}
+
+static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
+ umode_t mode, dev_t dev, unsigned long flags)
+{
+ struct inode *inode;
+ struct shmem_inode_info *info;
+ struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+ ino_t ino;
+
+ if (shmem_reserve_inode(sb, &ino))
+ return NULL;
+
+ inode = new_inode(sb);
+ if (inode) {
+ inode->i_ino = ino;
+ inode_init_owner(inode, dir, mode);
+ inode->i_blocks = 0;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_generation = prandom_u32();
+ info = SHMEM_I(inode);
+ memset(info, 0, (char *)inode - (char *)info);
+ spin_lock_init(&info->lock);
+ atomic_set(&info->stop_eviction, 0);
+ info->seals = F_SEAL_SEAL;
+ info->flags = flags & VM_NORESERVE;
+ INIT_LIST_HEAD(&info->shrinklist);
+ INIT_LIST_HEAD(&info->swaplist);
+ simple_xattrs_init(&info->xattrs);
+ cache_no_acl(inode);
+
+ switch (mode & S_IFMT) {
+ default:
+ inode->i_op = &shmem_special_inode_operations;
+ init_special_inode(inode, mode, dev);
+ break;
+ case S_IFREG:
+ inode->i_mapping->a_ops = &shmem_aops;
+ inode->i_op = &shmem_inode_operations;
+ inode->i_fop = &shmem_file_operations;
+ mpol_shared_policy_init(&info->policy,
+ shmem_get_sbmpol(sbinfo));
+ break;
+ case S_IFDIR:
+ inc_nlink(inode);
+ /* Some things misbehave if size == 0 on a directory */
+ inode->i_size = 2 * BOGO_DIRENT_SIZE;
+ inode->i_op = &shmem_dir_inode_operations;
+ inode->i_fop = &simple_dir_operations;
+ break;
+ case S_IFLNK:
+ /*
+ * Must not load anything in the rbtree,
+ * mpol_free_shared_policy will not be called.
+ */
+ mpol_shared_policy_init(&info->policy, NULL);
+ break;
+ }
+
+ lockdep_annotate_inode_mutex_key(inode);
+ } else
+ shmem_free_inode(sb);
+ return inode;
+}
+
+bool shmem_mapping(struct address_space *mapping)
+{
+ return mapping->a_ops == &shmem_aops;
+}
+
+static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
+ pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ bool zeropage,
+ struct page **pagep)
+{
+ struct inode *inode = file_inode(dst_vma->vm_file);
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ struct address_space *mapping = inode->i_mapping;
+ gfp_t gfp = mapping_gfp_mask(mapping);
+ pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
+ spinlock_t *ptl;
+ void *page_kaddr;
+ struct page *page;
+ pte_t _dst_pte, *dst_pte;
+ int ret;
+ pgoff_t offset, max_off;
+
+ ret = -ENOMEM;
+ if (!shmem_inode_acct_block(inode, 1)) {
+ /*
+ * We may have got a page, returned -ENOENT triggering a retry,
+ * and now we find ourselves with -ENOMEM. Release the page, to
+ * avoid a BUG_ON in our caller.
+ */
+ if (unlikely(*pagep)) {
+ put_page(*pagep);
+ *pagep = NULL;
+ }
+ goto out;
+ }
+
+ if (!*pagep) {
+ page = shmem_alloc_page(gfp, info, pgoff);
+ if (!page)
+ goto out_unacct_blocks;
+
+ if (!zeropage) { /* mcopy_atomic */
+ page_kaddr = kmap_atomic(page);
+ ret = copy_from_user(page_kaddr,
+ (const void __user *)src_addr,
+ PAGE_SIZE);
+ kunmap_atomic(page_kaddr);
+
+ /* fallback to copy_from_user outside mmap_lock */
+ if (unlikely(ret)) {
+ *pagep = page;
+ shmem_inode_unacct_blocks(inode, 1);
+ /* don't free the page */
+ return -ENOENT;
+ }
+ } else { /* mfill_zeropage_atomic */
+ clear_highpage(page);
+ }
+ } else {
+ page = *pagep;
+ *pagep = NULL;
+ }
+
+ VM_BUG_ON(PageLocked(page) || PageSwapBacked(page));
+ __SetPageLocked(page);
+ __SetPageSwapBacked(page);
+ __SetPageUptodate(page);
+
+ ret = -EFAULT;
+ offset = linear_page_index(dst_vma, dst_addr);
+ max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ if (unlikely(offset >= max_off))
+ goto out_release;
+
+ ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
+ gfp & GFP_RECLAIM_MASK, dst_mm);
+ if (ret)
+ goto out_release;
+
+ _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
+ if (dst_vma->vm_flags & VM_WRITE)
+ _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
+ else {
+ /*
+ * We don't set the pte dirty if the vma has no
+ * VM_WRITE permission, so mark the page dirty or it
+ * could be freed from under us. We could do it
+ * unconditionally before unlock_page(), but doing it
+ * only if VM_WRITE is not set is faster.
+ */
+ set_page_dirty(page);
+ }
+
+ dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+
+ ret = -EFAULT;
+ max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ if (unlikely(offset >= max_off))
+ goto out_release_unlock;
+
+ ret = -EEXIST;
+ if (!pte_none(*dst_pte))
+ goto out_release_unlock;
+
+ lru_cache_add(page);
+
+ spin_lock_irq(&info->lock);
+ info->alloced++;
+ inode->i_blocks += BLOCKS_PER_PAGE;
+ shmem_recalc_inode(inode);
+ spin_unlock_irq(&info->lock);
+
+ inc_mm_counter(dst_mm, mm_counter_file(page));
+ page_add_file_rmap(page, false);
+ set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(dst_vma, dst_addr, dst_pte);
+ pte_unmap_unlock(dst_pte, ptl);
+ unlock_page(page);
+ ret = 0;
+out:
+ return ret;
+out_release_unlock:
+ pte_unmap_unlock(dst_pte, ptl);
+ ClearPageDirty(page);
+ delete_from_page_cache(page);
+out_release:
+ unlock_page(page);
+ put_page(page);
+out_unacct_blocks:
+ shmem_inode_unacct_blocks(inode, 1);
+ goto out;
+}
+
+int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
+ pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ struct page **pagep)
+{
+ return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
+ dst_addr, src_addr, false, pagep);
+}
+
+int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
+ pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr)
+{
+ struct page *page = NULL;
+
+ return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
+ dst_addr, 0, true, &page);
+}
+
+#ifdef CONFIG_TMPFS
+static const struct inode_operations shmem_symlink_inode_operations;
+static const struct inode_operations shmem_short_symlink_operations;
+
+#ifdef CONFIG_TMPFS_XATTR
+static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
+#else
+#define shmem_initxattrs NULL
+#endif
+
+static int
+shmem_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ struct inode *inode = mapping->host;
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ pgoff_t index = pos >> PAGE_SHIFT;
+
+ /* i_mutex is held by caller */
+ if (unlikely(info->seals & (F_SEAL_GROW |
+ F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
+ if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))
+ return -EPERM;
+ if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
+ return -EPERM;
+ }
+
+ return shmem_getpage(inode, index, pagep, SGP_WRITE);
+}
+
+static int
+shmem_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ struct inode *inode = mapping->host;
+
+ if (pos + copied > inode->i_size)
+ i_size_write(inode, pos + copied);
+
+ if (!PageUptodate(page)) {
+ struct page *head = compound_head(page);
+ if (PageTransCompound(page)) {
+ int i;
+
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ if (head + i == page)
+ continue;
+ clear_highpage(head + i);
+ flush_dcache_page(head + i);
+ }
+ }
+ if (copied < PAGE_SIZE) {
+ unsigned from = pos & (PAGE_SIZE - 1);
+ zero_user_segments(page, 0, from,
+ from + copied, PAGE_SIZE);
+ }
+ SetPageUptodate(head);
+ }
+ set_page_dirty(page);
+ unlock_page(page);
+ put_page(page);
+
+ return copied;
+}
+
+static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct address_space *mapping = inode->i_mapping;
+ pgoff_t index;
+ unsigned long offset;
+ enum sgp_type sgp = SGP_READ;
+ int error = 0;
+ ssize_t retval = 0;
+ loff_t *ppos = &iocb->ki_pos;
+
+ /*
+ * Might this read be for a stacking filesystem? Then when reading
+ * holes of a sparse file, we actually need to allocate those pages,
+ * and even mark them dirty, so it cannot exceed the max_blocks limit.
+ */
+ if (!iter_is_iovec(to))
+ sgp = SGP_CACHE;
+
+ index = *ppos >> PAGE_SHIFT;
+ offset = *ppos & ~PAGE_MASK;
+
+ for (;;) {
+ struct page *page = NULL;
+ pgoff_t end_index;
+ unsigned long nr, ret;
+ loff_t i_size = i_size_read(inode);
+
+ end_index = i_size >> PAGE_SHIFT;
+ if (index > end_index)
+ break;
+ if (index == end_index) {
+ nr = i_size & ~PAGE_MASK;
+ if (nr <= offset)
+ break;
+ }
+
+ error = shmem_getpage(inode, index, &page, sgp);
+ if (error) {
+ if (error == -EINVAL)
+ error = 0;
+ break;
+ }
+ if (page) {
+ if (sgp == SGP_CACHE)
+ set_page_dirty(page);
+ unlock_page(page);
+ }
+
+ /*
+ * We must evaluate after, since reads (unlike writes)
+ * are called without i_mutex protection against truncate
+ */
+ nr = PAGE_SIZE;
+ i_size = i_size_read(inode);
+ end_index = i_size >> PAGE_SHIFT;
+ if (index == end_index) {
+ nr = i_size & ~PAGE_MASK;
+ if (nr <= offset) {
+ if (page)
+ put_page(page);
+ break;
+ }
+ }
+ nr -= offset;
+
+ if (page) {
+ /*
+ * If users can be writing to this page using arbitrary
+ * virtual addresses, take care about potential aliasing
+ * before reading the page on the kernel side.
+ */
+ if (mapping_writably_mapped(mapping))
+ flush_dcache_page(page);
+ /*
+ * Mark the page accessed if we read the beginning.
+ */
+ if (!offset)
+ mark_page_accessed(page);
+ } else {
+ page = ZERO_PAGE(0);
+ get_page(page);
+ }
+
+ /*
+ * Ok, we have the page, and it's up-to-date, so
+ * now we can copy it to user space...
+ */
+ ret = copy_page_to_iter(page, offset, nr, to);
+ retval += ret;
+ offset += ret;
+ index += offset >> PAGE_SHIFT;
+ offset &= ~PAGE_MASK;
+
+ put_page(page);
+ if (!iov_iter_count(to))
+ break;
+ if (ret < nr) {
+ error = -EFAULT;
+ break;
+ }
+ cond_resched();
+ }
+
+ *ppos = ((loff_t) index << PAGE_SHIFT) + offset;
+ file_accessed(file);
+ return retval ? retval : error;
+}
+
+/*
+ * llseek SEEK_DATA or SEEK_HOLE through the page cache.
+ */
+static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
+ pgoff_t index, pgoff_t end, int whence)
+{
+ struct page *page;
+ struct pagevec pvec;
+ pgoff_t indices[PAGEVEC_SIZE];
+ bool done = false;
+ int i;
+
+ pagevec_init(&pvec);
+ pvec.nr = 1; /* start small: we may be there already */
+ while (!done) {
+ pvec.nr = find_get_entries(mapping, index,
+ pvec.nr, pvec.pages, indices);
+ if (!pvec.nr) {
+ if (whence == SEEK_DATA)
+ index = end;
+ break;
+ }
+ for (i = 0; i < pvec.nr; i++, index++) {
+ if (index < indices[i]) {
+ if (whence == SEEK_HOLE) {
+ done = true;
+ break;
+ }
+ index = indices[i];
+ }
+ page = pvec.pages[i];
+ if (page && !xa_is_value(page)) {
+ if (!PageUptodate(page))
+ page = NULL;
+ }
+ if (index >= end ||
+ (page && whence == SEEK_DATA) ||
+ (!page && whence == SEEK_HOLE)) {
+ done = true;
+ break;
+ }
+ }
+ pagevec_remove_exceptionals(&pvec);
+ pagevec_release(&pvec);
+ pvec.nr = PAGEVEC_SIZE;
+ cond_resched();
+ }
+ return index;
+}
+
+static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ pgoff_t start, end;
+ loff_t new_offset;
+
+ if (whence != SEEK_DATA && whence != SEEK_HOLE)
+ return generic_file_llseek_size(file, offset, whence,
+ MAX_LFS_FILESIZE, i_size_read(inode));
+ inode_lock(inode);
+ /* We're holding i_mutex so we can access i_size directly */
+
+ if (offset < 0 || offset >= inode->i_size)
+ offset = -ENXIO;
+ else {
+ start = offset >> PAGE_SHIFT;
+ end = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ new_offset = shmem_seek_hole_data(mapping, start, end, whence);
+ new_offset <<= PAGE_SHIFT;
+ if (new_offset > offset) {
+ if (new_offset < inode->i_size)
+ offset = new_offset;
+ else if (whence == SEEK_DATA)
+ offset = -ENXIO;
+ else
+ offset = inode->i_size;
+ }
+ }
+
+ if (offset >= 0)
+ offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
+ inode_unlock(inode);
+ return offset;
+}
+
+static long shmem_fallocate(struct file *file, int mode, loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file_inode(file);
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ struct shmem_falloc shmem_falloc;
+ pgoff_t start, index, end;
+ int error;
+
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ return -EOPNOTSUPP;
+
+ inode_lock(inode);
+
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ struct address_space *mapping = file->f_mapping;
+ loff_t unmap_start = round_up(offset, PAGE_SIZE);
+ loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
+
+ /* protected by i_mutex */
+ if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
+ error = -EPERM;
+ goto out;
+ }
+
+ shmem_falloc.waitq = &shmem_falloc_waitq;
+ shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
+ shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
+ spin_lock(&inode->i_lock);
+ inode->i_private = &shmem_falloc;
+ spin_unlock(&inode->i_lock);
+
+ if ((u64)unmap_end > (u64)unmap_start)
+ unmap_mapping_range(mapping, unmap_start,
+ 1 + unmap_end - unmap_start, 0);
+ shmem_truncate_range(inode, offset, offset + len - 1);
+ /* No need to unmap again: hole-punching leaves COWed pages */
+
+ spin_lock(&inode->i_lock);
+ inode->i_private = NULL;
+ wake_up_all(&shmem_falloc_waitq);
+ WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head));
+ spin_unlock(&inode->i_lock);
+ error = 0;
+ goto out;
+ }
+
+ /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
+ error = inode_newsize_ok(inode, offset + len);
+ if (error)
+ goto out;
+
+ if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
+ error = -EPERM;
+ goto out;
+ }
+
+ start = offset >> PAGE_SHIFT;
+ end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ /* Try to avoid a swapstorm if len is impossible to satisfy */
+ if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
+ error = -ENOSPC;
+ goto out;
+ }
+
+ shmem_falloc.waitq = NULL;
+ shmem_falloc.start = start;
+ shmem_falloc.next = start;
+ shmem_falloc.nr_falloced = 0;
+ shmem_falloc.nr_unswapped = 0;
+ spin_lock(&inode->i_lock);
+ inode->i_private = &shmem_falloc;
+ spin_unlock(&inode->i_lock);
+
+ for (index = start; index < end; index++) {
+ struct page *page;
+
+ /*
+ * Good, the fallocate(2) manpage permits EINTR: we may have
+ * been interrupted because we are using up too much memory.
+ */
+ if (signal_pending(current))
+ error = -EINTR;
+ else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
+ error = -ENOMEM;
+ else
+ error = shmem_getpage(inode, index, &page, SGP_FALLOC);
+ if (error) {
+ /* Remove the !PageUptodate pages we added */
+ if (index > start) {
+ shmem_undo_range(inode,
+ (loff_t)start << PAGE_SHIFT,
+ ((loff_t)index << PAGE_SHIFT) - 1, true);
+ }
+ goto undone;
+ }
+
+ /*
+ * Inform shmem_writepage() how far we have reached.
+ * No need for lock or barrier: we have the page lock.
+ */
+ shmem_falloc.next++;
+ if (!PageUptodate(page))
+ shmem_falloc.nr_falloced++;
+
+ /*
+ * If !PageUptodate, leave it that way so that freeable pages
+ * can be recognized if we need to rollback on error later.
+ * But set_page_dirty so that memory pressure will swap rather
+ * than free the pages we are allocating (and SGP_CACHE pages
+ * might still be clean: we now need to mark those dirty too).
+ */
+ set_page_dirty(page);
+ unlock_page(page);
+ put_page(page);
+ cond_resched();
+ }
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
+ i_size_write(inode, offset + len);
+ inode->i_ctime = current_time(inode);
+undone:
+ spin_lock(&inode->i_lock);
+ inode->i_private = NULL;
+ spin_unlock(&inode->i_lock);
+out:
+ inode_unlock(inode);
+ return error;
+}
+
+static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
+
+ buf->f_type = TMPFS_MAGIC;
+ buf->f_bsize = PAGE_SIZE;
+ buf->f_namelen = NAME_MAX;
+ if (sbinfo->max_blocks) {
+ buf->f_blocks = sbinfo->max_blocks;
+ buf->f_bavail =
+ buf->f_bfree = sbinfo->max_blocks -
+ percpu_counter_sum(&sbinfo->used_blocks);
+ }
+ if (sbinfo->max_inodes) {
+ buf->f_files = sbinfo->max_inodes;
+ buf->f_ffree = sbinfo->free_inodes;
+ }
+ /* else leave those fields 0 like simple_statfs */
+ return 0;
+}
+
+/*
+ * File creation. Allocate an inode, and we're done..
+ */
+static int
+shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+{
+ struct inode *inode;
+ int error = -ENOSPC;
+
+ inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
+ if (inode) {
+ error = simple_acl_create(dir, inode);
+ if (error)
+ goto out_iput;
+ error = security_inode_init_security(inode, dir,
+ &dentry->d_name,
+ shmem_initxattrs, NULL);
+ if (error && error != -EOPNOTSUPP)
+ goto out_iput;
+
+ error = 0;
+ dir->i_size += BOGO_DIRENT_SIZE;
+ dir->i_ctime = dir->i_mtime = current_time(dir);
+ d_instantiate(dentry, inode);
+ dget(dentry); /* Extra count - pin the dentry in core */
+ }
+ return error;
+out_iput:
+ iput(inode);
+ return error;
+}
+
+static int
+shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ struct inode *inode;
+ int error = -ENOSPC;
+
+ inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE);
+ if (inode) {
+ error = security_inode_init_security(inode, dir,
+ NULL,
+ shmem_initxattrs, NULL);
+ if (error && error != -EOPNOTSUPP)
+ goto out_iput;
+ error = simple_acl_create(dir, inode);
+ if (error)
+ goto out_iput;
+ d_tmpfile(dentry, inode);
+ }
+ return error;
+out_iput:
+ iput(inode);
+ return error;
+}
+
+static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ int error;
+
+ if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
+ return error;
+ inc_nlink(dir);
+ return 0;
+}
+
+static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+ bool excl)
+{
+ return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
+}
+
+/*
+ * Link a file..
+ */
+static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
+{
+ struct inode *inode = d_inode(old_dentry);
+ int ret = 0;
+
+ /*
+ * No ordinary (disk based) filesystem counts links as inodes;
+ * but each new link needs a new dentry, pinning lowmem, and
+ * tmpfs dentries cannot be pruned until they are unlinked.
+ * But if an O_TMPFILE file is linked into the tmpfs, the
+ * first link must skip that, to get the accounting right.
+ */
+ if (inode->i_nlink) {
+ ret = shmem_reserve_inode(inode->i_sb, NULL);
+ if (ret)
+ goto out;
+ }
+
+ dir->i_size += BOGO_DIRENT_SIZE;
+ inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
+ inc_nlink(inode);
+ ihold(inode); /* New dentry reference */
+ dget(dentry); /* Extra pinning count for the created dentry */
+ d_instantiate(dentry, inode);
+out:
+ return ret;
+}
+
+static int shmem_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct inode *inode = d_inode(dentry);
+
+ if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
+ shmem_free_inode(inode->i_sb);
+
+ dir->i_size -= BOGO_DIRENT_SIZE;
+ inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
+ drop_nlink(inode);
+ dput(dentry); /* Undo the count from "create" - this does all the work */
+ return 0;
+}
+
+static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ if (!simple_empty(dentry))
+ return -ENOTEMPTY;
+
+ drop_nlink(d_inode(dentry));
+ drop_nlink(dir);
+ return shmem_unlink(dir, dentry);
+}
+
+static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
+{
+ bool old_is_dir = d_is_dir(old_dentry);
+ bool new_is_dir = d_is_dir(new_dentry);
+
+ if (old_dir != new_dir && old_is_dir != new_is_dir) {
+ if (old_is_dir) {
+ drop_nlink(old_dir);
+ inc_nlink(new_dir);
+ } else {
+ drop_nlink(new_dir);
+ inc_nlink(old_dir);
+ }
+ }
+ old_dir->i_ctime = old_dir->i_mtime =
+ new_dir->i_ctime = new_dir->i_mtime =
+ d_inode(old_dentry)->i_ctime =
+ d_inode(new_dentry)->i_ctime = current_time(old_dir);
+
+ return 0;
+}
+
+static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry)
+{
+ struct dentry *whiteout;
+ int error;
+
+ whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name);
+ if (!whiteout)
+ return -ENOMEM;
+
+ error = shmem_mknod(old_dir, whiteout,
+ S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
+ dput(whiteout);
+ if (error)
+ return error;
+
+ /*
+ * Cheat and hash the whiteout while the old dentry is still in
+ * place, instead of playing games with FS_RENAME_DOES_D_MOVE.
+ *
+ * d_lookup() will consistently find one of them at this point,
+ * not sure which one, but that isn't even important.
+ */
+ d_rehash(whiteout);
+ return 0;
+}
+
+/*
+ * The VFS layer already does all the dentry stuff for rename,
+ * we just have to decrement the usage count for the target if
+ * it exists so that the VFS layer correctly free's it when it
+ * gets overwritten.
+ */
+static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags)
+{
+ struct inode *inode = d_inode(old_dentry);
+ int they_are_dirs = S_ISDIR(inode->i_mode);
+
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
+ return -EINVAL;
+
+ if (flags & RENAME_EXCHANGE)
+ return shmem_exchange(old_dir, old_dentry, new_dir, new_dentry);
+
+ if (!simple_empty(new_dentry))
+ return -ENOTEMPTY;
+
+ if (flags & RENAME_WHITEOUT) {
+ int error;
+
+ error = shmem_whiteout(old_dir, old_dentry);
+ if (error)
+ return error;
+ }
+
+ if (d_really_is_positive(new_dentry)) {
+ (void) shmem_unlink(new_dir, new_dentry);
+ if (they_are_dirs) {
+ drop_nlink(d_inode(new_dentry));
+ drop_nlink(old_dir);
+ }
+ } else if (they_are_dirs) {
+ drop_nlink(old_dir);
+ inc_nlink(new_dir);
+ }
+
+ old_dir->i_size -= BOGO_DIRENT_SIZE;
+ new_dir->i_size += BOGO_DIRENT_SIZE;
+ old_dir->i_ctime = old_dir->i_mtime =
+ new_dir->i_ctime = new_dir->i_mtime =
+ inode->i_ctime = current_time(old_dir);
+ return 0;
+}
+
+static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+{
+ int error;
+ int len;
+ struct inode *inode;
+ struct page *page;
+
+ len = strlen(symname) + 1;
+ if (len > PAGE_SIZE)
+ return -ENAMETOOLONG;
+
+ inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK | 0777, 0,
+ VM_NORESERVE);
+ if (!inode)
+ return -ENOSPC;
+
+ error = security_inode_init_security(inode, dir, &dentry->d_name,
+ shmem_initxattrs, NULL);
+ if (error && error != -EOPNOTSUPP) {
+ iput(inode);
+ return error;
+ }
+
+ inode->i_size = len-1;
+ if (len <= SHORT_SYMLINK_LEN) {
+ inode->i_link = kmemdup(symname, len, GFP_KERNEL);
+ if (!inode->i_link) {
+ iput(inode);
+ return -ENOMEM;
+ }
+ inode->i_op = &shmem_short_symlink_operations;
+ } else {
+ inode_nohighmem(inode);
+ error = shmem_getpage(inode, 0, &page, SGP_WRITE);
+ if (error) {
+ iput(inode);
+ return error;
+ }
+ inode->i_mapping->a_ops = &shmem_aops;
+ inode->i_op = &shmem_symlink_inode_operations;
+ memcpy(page_address(page), symname, len);
+ SetPageUptodate(page);
+ set_page_dirty(page);
+ unlock_page(page);
+ put_page(page);
+ }
+ dir->i_size += BOGO_DIRENT_SIZE;
+ dir->i_ctime = dir->i_mtime = current_time(dir);
+ d_instantiate(dentry, inode);
+ dget(dentry);
+ return 0;
+}
+
+static void shmem_put_link(void *arg)
+{
+ mark_page_accessed(arg);
+ put_page(arg);
+}
+
+static const char *shmem_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
+{
+ struct page *page = NULL;
+ int error;
+ if (!dentry) {
+ page = find_get_page(inode->i_mapping, 0);
+ if (!page)
+ return ERR_PTR(-ECHILD);
+ if (!PageUptodate(page)) {
+ put_page(page);
+ return ERR_PTR(-ECHILD);
+ }
+ } else {
+ error = shmem_getpage(inode, 0, &page, SGP_READ);
+ if (error)
+ return ERR_PTR(error);
+ unlock_page(page);
+ }
+ set_delayed_call(done, shmem_put_link, page);
+ return page_address(page);
+}
+
+#ifdef CONFIG_TMPFS_XATTR
+/*
+ * Superblocks without xattr inode operations may get some security.* xattr
+ * support from the LSM "for free". As soon as we have any other xattrs
+ * like ACLs, we also need to implement the security.* handlers at
+ * filesystem level, though.
+ */
+
+/*
+ * Callback for security_inode_init_security() for acquiring xattrs.
+ */
+static int shmem_initxattrs(struct inode *inode,
+ const struct xattr *xattr_array,
+ void *fs_info)
+{
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ const struct xattr *xattr;
+ struct simple_xattr *new_xattr;
+ size_t len;
+
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
+ if (!new_xattr)
+ return -ENOMEM;
+
+ len = strlen(xattr->name) + 1;
+ new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
+ GFP_KERNEL);
+ if (!new_xattr->name) {
+ kvfree(new_xattr);
+ return -ENOMEM;
+ }
+
+ memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
+ XATTR_SECURITY_PREFIX_LEN);
+ memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
+ xattr->name, len);
+
+ simple_xattr_list_add(&info->xattrs, new_xattr);
+ }
+
+ return 0;
+}
+
+static int shmem_xattr_handler_get(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *buffer, size_t size)
+{
+ struct shmem_inode_info *info = SHMEM_I(inode);
+
+ name = xattr_full_name(handler, name);
+ return simple_xattr_get(&info->xattrs, name, buffer, size);
+}
+
+static int shmem_xattr_handler_set(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ struct shmem_inode_info *info = SHMEM_I(inode);
+
+ name = xattr_full_name(handler, name);
+ return simple_xattr_set(&info->xattrs, name, value, size, flags, NULL);
+}
+
+static const struct xattr_handler shmem_security_xattr_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .get = shmem_xattr_handler_get,
+ .set = shmem_xattr_handler_set,
+};
+
+static const struct xattr_handler shmem_trusted_xattr_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .get = shmem_xattr_handler_get,
+ .set = shmem_xattr_handler_set,
+};
+
+static const struct xattr_handler *shmem_xattr_handlers[] = {
+#ifdef CONFIG_TMPFS_POSIX_ACL
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+#endif
+ &shmem_security_xattr_handler,
+ &shmem_trusted_xattr_handler,
+ NULL
+};
+
+static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+ struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
+ return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size);
+}
+#endif /* CONFIG_TMPFS_XATTR */
+
+static const struct inode_operations shmem_short_symlink_operations = {
+ .get_link = simple_get_link,
+#ifdef CONFIG_TMPFS_XATTR
+ .listxattr = shmem_listxattr,
+#endif
+};
+
+static const struct inode_operations shmem_symlink_inode_operations = {
+ .get_link = shmem_get_link,
+#ifdef CONFIG_TMPFS_XATTR
+ .listxattr = shmem_listxattr,
+#endif
+};
+
+static struct dentry *shmem_get_parent(struct dentry *child)
+{
+ return ERR_PTR(-ESTALE);
+}
+
+static int shmem_match(struct inode *ino, void *vfh)
+{
+ __u32 *fh = vfh;
+ __u64 inum = fh[2];
+ inum = (inum << 32) | fh[1];
+ return ino->i_ino == inum && fh[0] == ino->i_generation;
+}
+
+/* Find any alias of inode, but prefer a hashed alias */
+static struct dentry *shmem_find_alias(struct inode *inode)
+{
+ struct dentry *alias = d_find_alias(inode);
+
+ return alias ?: d_find_any_alias(inode);
+}
+
+
+static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
+ struct fid *fid, int fh_len, int fh_type)
+{
+ struct inode *inode;
+ struct dentry *dentry = NULL;
+ u64 inum;
+
+ if (fh_len < 3)
+ return NULL;
+
+ inum = fid->raw[2];
+ inum = (inum << 32) | fid->raw[1];
+
+ inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
+ shmem_match, fid->raw);
+ if (inode) {
+ dentry = shmem_find_alias(inode);
+ iput(inode);
+ }
+
+ return dentry;
+}
+
+static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
+ struct inode *parent)
+{
+ if (*len < 3) {
+ *len = 3;
+ return FILEID_INVALID;
+ }
+
+ if (inode_unhashed(inode)) {
+ /* Unfortunately insert_inode_hash is not idempotent,
+ * so as we hash inodes here rather than at creation
+ * time, we need a lock to ensure we only try
+ * to do it once
+ */
+ static DEFINE_SPINLOCK(lock);
+ spin_lock(&lock);
+ if (inode_unhashed(inode))
+ __insert_inode_hash(inode,
+ inode->i_ino + inode->i_generation);
+ spin_unlock(&lock);
+ }
+
+ fh[0] = inode->i_generation;
+ fh[1] = inode->i_ino;
+ fh[2] = ((__u64)inode->i_ino) >> 32;
+
+ *len = 3;
+ return 1;
+}
+
+static const struct export_operations shmem_export_ops = {
+ .get_parent = shmem_get_parent,
+ .encode_fh = shmem_encode_fh,
+ .fh_to_dentry = shmem_fh_to_dentry,
+};
+
+enum shmem_param {
+ Opt_gid,
+ Opt_huge,
+ Opt_mode,
+ Opt_mpol,
+ Opt_nr_blocks,
+ Opt_nr_inodes,
+ Opt_size,
+ Opt_uid,
+ Opt_inode32,
+ Opt_inode64,
+};
+
+static const struct constant_table shmem_param_enums_huge[] = {
+ {"never", SHMEM_HUGE_NEVER },
+ {"always", SHMEM_HUGE_ALWAYS },
+ {"within_size", SHMEM_HUGE_WITHIN_SIZE },
+ {"advise", SHMEM_HUGE_ADVISE },
+ {}
+};
+
+const struct fs_parameter_spec shmem_fs_parameters[] = {
+ fsparam_u32 ("gid", Opt_gid),
+ fsparam_enum ("huge", Opt_huge, shmem_param_enums_huge),
+ fsparam_u32oct("mode", Opt_mode),
+ fsparam_string("mpol", Opt_mpol),
+ fsparam_string("nr_blocks", Opt_nr_blocks),
+ fsparam_string("nr_inodes", Opt_nr_inodes),
+ fsparam_string("size", Opt_size),
+ fsparam_u32 ("uid", Opt_uid),
+ fsparam_flag ("inode32", Opt_inode32),
+ fsparam_flag ("inode64", Opt_inode64),
+ {}
+};
+
+static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct shmem_options *ctx = fc->fs_private;
+ struct fs_parse_result result;
+ unsigned long long size;
+ char *rest;
+ int opt;
+ kuid_t kuid;
+ kgid_t kgid;
+
+ opt = fs_parse(fc, shmem_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_size:
+ size = memparse(param->string, &rest);
+ if (*rest == '%') {
+ size <<= PAGE_SHIFT;
+ size *= totalram_pages();
+ do_div(size, 100);
+ rest++;
+ }
+ if (*rest)
+ goto bad_value;
+ ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE);
+ ctx->seen |= SHMEM_SEEN_BLOCKS;
+ break;
+ case Opt_nr_blocks:
+ ctx->blocks = memparse(param->string, &rest);
+ if (*rest)
+ goto bad_value;
+ ctx->seen |= SHMEM_SEEN_BLOCKS;
+ break;
+ case Opt_nr_inodes:
+ ctx->inodes = memparse(param->string, &rest);
+ if (*rest)
+ goto bad_value;
+ ctx->seen |= SHMEM_SEEN_INODES;
+ break;
+ case Opt_mode:
+ ctx->mode = result.uint_32 & 07777;
+ break;
+ case Opt_uid:
+ kuid = make_kuid(current_user_ns(), result.uint_32);
+ if (!uid_valid(kuid))
+ goto bad_value;
+
+ /*
+ * The requested uid must be representable in the
+ * filesystem's idmapping.
+ */
+ if (!kuid_has_mapping(fc->user_ns, kuid))
+ goto bad_value;
+
+ ctx->uid = kuid;
+ break;
+ case Opt_gid:
+ kgid = make_kgid(current_user_ns(), result.uint_32);
+ if (!gid_valid(kgid))
+ goto bad_value;
+
+ /*
+ * The requested gid must be representable in the
+ * filesystem's idmapping.
+ */
+ if (!kgid_has_mapping(fc->user_ns, kgid))
+ goto bad_value;
+
+ ctx->gid = kgid;
+ break;
+ case Opt_huge:
+ ctx->huge = result.uint_32;
+ if (ctx->huge != SHMEM_HUGE_NEVER &&
+ !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+ has_transparent_hugepage()))
+ goto unsupported_parameter;
+ ctx->seen |= SHMEM_SEEN_HUGE;
+ break;
+ case Opt_mpol:
+ if (IS_ENABLED(CONFIG_NUMA)) {
+ mpol_put(ctx->mpol);
+ ctx->mpol = NULL;
+ if (mpol_parse_str(param->string, &ctx->mpol))
+ goto bad_value;
+ break;
+ }
+ goto unsupported_parameter;
+ case Opt_inode32:
+ ctx->full_inums = false;
+ ctx->seen |= SHMEM_SEEN_INUMS;
+ break;
+ case Opt_inode64:
+ if (sizeof(ino_t) < 8) {
+ return invalfc(fc,
+ "Cannot use inode64 with <64bit inums in kernel\n");
+ }
+ ctx->full_inums = true;
+ ctx->seen |= SHMEM_SEEN_INUMS;
+ break;
+ }
+ return 0;
+
+unsupported_parameter:
+ return invalfc(fc, "Unsupported parameter '%s'", param->key);
+bad_value:
+ return invalfc(fc, "Bad value for '%s'", param->key);
+}
+
+static int shmem_parse_options(struct fs_context *fc, void *data)
+{
+ char *options = data;
+
+ if (options) {
+ int err = security_sb_eat_lsm_opts(options, &fc->security);
+ if (err)
+ return err;
+ }
+
+ while (options != NULL) {
+ char *this_char = options;
+ for (;;) {
+ /*
+ * NUL-terminate this option: unfortunately,
+ * mount options form a comma-separated list,
+ * but mpol's nodelist may also contain commas.
+ */
+ options = strchr(options, ',');
+ if (options == NULL)
+ break;
+ options++;
+ if (!isdigit(*options)) {
+ options[-1] = '\0';
+ break;
+ }
+ }
+ if (*this_char) {
+ char *value = strchr(this_char,'=');
+ size_t len = 0;
+ int err;
+
+ if (value) {
+ *value++ = '\0';
+ len = strlen(value);
+ }
+ err = vfs_parse_fs_string(fc, this_char, value, len);
+ if (err < 0)
+ return err;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Reconfigure a shmem filesystem.
+ *
+ * Note that we disallow change from limited->unlimited blocks/inodes while any
+ * are in use; but we must separately disallow unlimited->limited, because in
+ * that case we have no record of how much is already in use.
+ */
+static int shmem_reconfigure(struct fs_context *fc)
+{
+ struct shmem_options *ctx = fc->fs_private;
+ struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
+ unsigned long inodes;
+ const char *err;
+
+ spin_lock(&sbinfo->stat_lock);
+ inodes = sbinfo->max_inodes - sbinfo->free_inodes;
+ if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
+ if (!sbinfo->max_blocks) {
+ err = "Cannot retroactively limit size";
+ goto out;
+ }
+ if (percpu_counter_compare(&sbinfo->used_blocks,
+ ctx->blocks) > 0) {
+ err = "Too small a size for current use";
+ goto out;
+ }
+ }
+ if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) {
+ if (!sbinfo->max_inodes) {
+ err = "Cannot retroactively limit inodes";
+ goto out;
+ }
+ if (ctx->inodes < inodes) {
+ err = "Too few inodes for current use";
+ goto out;
+ }
+ }
+
+ if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums &&
+ sbinfo->next_ino > UINT_MAX) {
+ err = "Current inum too high to switch to 32-bit inums";
+ goto out;
+ }
+
+ if (ctx->seen & SHMEM_SEEN_HUGE)
+ sbinfo->huge = ctx->huge;
+ if (ctx->seen & SHMEM_SEEN_INUMS)
+ sbinfo->full_inums = ctx->full_inums;
+ if (ctx->seen & SHMEM_SEEN_BLOCKS)
+ sbinfo->max_blocks = ctx->blocks;
+ if (ctx->seen & SHMEM_SEEN_INODES) {
+ sbinfo->max_inodes = ctx->inodes;
+ sbinfo->free_inodes = ctx->inodes - inodes;
+ }
+
+ /*
+ * Preserve previous mempolicy unless mpol remount option was specified.
+ */
+ if (ctx->mpol) {
+ mpol_put(sbinfo->mpol);
+ sbinfo->mpol = ctx->mpol; /* transfers initial ref */
+ ctx->mpol = NULL;
+ }
+ spin_unlock(&sbinfo->stat_lock);
+ return 0;
+out:
+ spin_unlock(&sbinfo->stat_lock);
+ return invalfc(fc, "%s", err);
+}
+
+static int shmem_show_options(struct seq_file *seq, struct dentry *root)
+{
+ struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);
+
+ if (sbinfo->max_blocks != shmem_default_max_blocks())
+ seq_printf(seq, ",size=%luk",
+ sbinfo->max_blocks << (PAGE_SHIFT - 10));
+ if (sbinfo->max_inodes != shmem_default_max_inodes())
+ seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
+ if (sbinfo->mode != (0777 | S_ISVTX))
+ seq_printf(seq, ",mode=%03ho", sbinfo->mode);
+ if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
+ seq_printf(seq, ",uid=%u",
+ from_kuid_munged(&init_user_ns, sbinfo->uid));
+ if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
+ seq_printf(seq, ",gid=%u",
+ from_kgid_munged(&init_user_ns, sbinfo->gid));
+
+ /*
+ * Showing inode{64,32} might be useful even if it's the system default,
+ * since then people don't have to resort to checking both here and
+ * /proc/config.gz to confirm 64-bit inums were successfully applied
+ * (which may not even exist if IKCONFIG_PROC isn't enabled).
+ *
+ * We hide it when inode64 isn't the default and we are using 32-bit
+ * inodes, since that probably just means the feature isn't even under
+ * consideration.
+ *
+ * As such:
+ *
+ * +-----------------+-----------------+
+ * | TMPFS_INODE64=y | TMPFS_INODE64=n |
+ * +------------------+-----------------+-----------------+
+ * | full_inums=true | show | show |
+ * | full_inums=false | show | hide |
+ * +------------------+-----------------+-----------------+
+ *
+ */
+ if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums)
+ seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32));
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
+ if (sbinfo->huge)
+ seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
+#endif
+ shmem_show_mpol(seq, sbinfo->mpol);
+ return 0;
+}
+
+#endif /* CONFIG_TMPFS */
+
+static void shmem_put_super(struct super_block *sb)
+{
+ struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+
+ free_percpu(sbinfo->ino_batch);
+ percpu_counter_destroy(&sbinfo->used_blocks);
+ mpol_put(sbinfo->mpol);
+ kfree(sbinfo);
+ sb->s_fs_info = NULL;
+}
+
+static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
+{
+ struct shmem_options *ctx = fc->fs_private;
+ struct inode *inode;
+ struct shmem_sb_info *sbinfo;
+ int err = -ENOMEM;
+
+ /* Round up to L1_CACHE_BYTES to resist false sharing */
+ sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
+ L1_CACHE_BYTES), GFP_KERNEL);
+ if (!sbinfo)
+ return -ENOMEM;
+
+ sb->s_fs_info = sbinfo;
+
+#ifdef CONFIG_TMPFS
+ /*
+ * Per default we only allow half of the physical ram per
+ * tmpfs instance, limiting inodes to one per page of lowmem;
+ * but the internal instance is left unlimited.
+ */
+ if (!(sb->s_flags & SB_KERNMOUNT)) {
+ if (!(ctx->seen & SHMEM_SEEN_BLOCKS))
+ ctx->blocks = shmem_default_max_blocks();
+ if (!(ctx->seen & SHMEM_SEEN_INODES))
+ ctx->inodes = shmem_default_max_inodes();
+ if (!(ctx->seen & SHMEM_SEEN_INUMS))
+ ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
+ } else {
+ sb->s_flags |= SB_NOUSER;
+ }
+ sb->s_export_op = &shmem_export_ops;
+ sb->s_flags |= SB_NOSEC;
+#else
+ sb->s_flags |= SB_NOUSER;
+#endif
+ sbinfo->max_blocks = ctx->blocks;
+ sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes;
+ if (sb->s_flags & SB_KERNMOUNT) {
+ sbinfo->ino_batch = alloc_percpu(ino_t);
+ if (!sbinfo->ino_batch)
+ goto failed;
+ }
+ sbinfo->uid = ctx->uid;
+ sbinfo->gid = ctx->gid;
+ sbinfo->full_inums = ctx->full_inums;
+ sbinfo->mode = ctx->mode;
+ sbinfo->huge = ctx->huge;
+ sbinfo->mpol = ctx->mpol;
+ ctx->mpol = NULL;
+
+ spin_lock_init(&sbinfo->stat_lock);
+ if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
+ goto failed;
+ spin_lock_init(&sbinfo->shrinklist_lock);
+ INIT_LIST_HEAD(&sbinfo->shrinklist);
+
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_blocksize = PAGE_SIZE;
+ sb->s_blocksize_bits = PAGE_SHIFT;
+ sb->s_magic = TMPFS_MAGIC;
+ sb->s_op = &shmem_ops;
+ sb->s_time_gran = 1;
+#ifdef CONFIG_TMPFS_XATTR
+ sb->s_xattr = shmem_xattr_handlers;
+#endif
+#ifdef CONFIG_TMPFS_POSIX_ACL
+ sb->s_flags |= SB_POSIXACL;
+#endif
+ uuid_gen(&sb->s_uuid);
+
+ inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
+ if (!inode)
+ goto failed;
+ inode->i_uid = sbinfo->uid;
+ inode->i_gid = sbinfo->gid;
+ sb->s_root = d_make_root(inode);
+ if (!sb->s_root)
+ goto failed;
+ return 0;
+
+failed:
+ shmem_put_super(sb);
+ return err;
+}
+
+static int shmem_get_tree(struct fs_context *fc)
+{
+ return get_tree_nodev(fc, shmem_fill_super);
+}
+
+static void shmem_free_fc(struct fs_context *fc)
+{
+ struct shmem_options *ctx = fc->fs_private;
+
+ if (ctx) {
+ mpol_put(ctx->mpol);
+ kfree(ctx);
+ }
+}
+
+static const struct fs_context_operations shmem_fs_context_ops = {
+ .free = shmem_free_fc,
+ .get_tree = shmem_get_tree,
+#ifdef CONFIG_TMPFS
+ .parse_monolithic = shmem_parse_options,
+ .parse_param = shmem_parse_one,
+ .reconfigure = shmem_reconfigure,
+#endif
+};
+
+static struct kmem_cache *shmem_inode_cachep;
+
+static struct inode *shmem_alloc_inode(struct super_block *sb)
+{
+ struct shmem_inode_info *info;
+ info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
+ if (!info)
+ return NULL;
+ return &info->vfs_inode;
+}
+
+static void shmem_free_in_core_inode(struct inode *inode)
+{
+ if (S_ISLNK(inode->i_mode))
+ kfree(inode->i_link);
+ kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
+}
+
+static void shmem_destroy_inode(struct inode *inode)
+{
+ if (S_ISREG(inode->i_mode))
+ mpol_free_shared_policy(&SHMEM_I(inode)->policy);
+}
+
+static void shmem_init_inode(void *foo)
+{
+ struct shmem_inode_info *info = foo;
+ inode_init_once(&info->vfs_inode);
+}
+
+static void shmem_init_inodecache(void)
+{
+ shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
+ sizeof(struct shmem_inode_info),
+ 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
+}
+
+static void shmem_destroy_inodecache(void)
+{
+ kmem_cache_destroy(shmem_inode_cachep);
+}
+
+static const struct address_space_operations shmem_aops = {
+ .writepage = shmem_writepage,
+ .set_page_dirty = __set_page_dirty_no_writeback,
+#ifdef CONFIG_TMPFS
+ .write_begin = shmem_write_begin,
+ .write_end = shmem_write_end,
+#endif
+#ifdef CONFIG_MIGRATION
+ .migratepage = migrate_page,
+#endif
+ .error_remove_page = generic_error_remove_page,
+};
+
+static const struct file_operations shmem_file_operations = {
+ .mmap = shmem_mmap,
+ .get_unmapped_area = shmem_get_unmapped_area,
+#ifdef CONFIG_TMPFS
+ .llseek = shmem_file_llseek,
+ .read_iter = shmem_file_read_iter,
+ .write_iter = generic_file_write_iter,
+ .fsync = noop_fsync,
+ .splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
+ .fallocate = shmem_fallocate,
+#endif
+};
+
+static const struct inode_operations shmem_inode_operations = {
+ .getattr = shmem_getattr,
+ .setattr = shmem_setattr,
+#ifdef CONFIG_TMPFS_XATTR
+ .listxattr = shmem_listxattr,
+ .set_acl = simple_set_acl,
+#endif
+};
+
+static const struct inode_operations shmem_dir_inode_operations = {
+#ifdef CONFIG_TMPFS
+ .create = shmem_create,
+ .lookup = simple_lookup,
+ .link = shmem_link,
+ .unlink = shmem_unlink,
+ .symlink = shmem_symlink,
+ .mkdir = shmem_mkdir,
+ .rmdir = shmem_rmdir,
+ .mknod = shmem_mknod,
+ .rename = shmem_rename2,
+ .tmpfile = shmem_tmpfile,
+#endif
+#ifdef CONFIG_TMPFS_XATTR
+ .listxattr = shmem_listxattr,
+#endif
+#ifdef CONFIG_TMPFS_POSIX_ACL
+ .setattr = shmem_setattr,
+ .set_acl = simple_set_acl,
+#endif
+};
+
+static const struct inode_operations shmem_special_inode_operations = {
+#ifdef CONFIG_TMPFS_XATTR
+ .listxattr = shmem_listxattr,
+#endif
+#ifdef CONFIG_TMPFS_POSIX_ACL
+ .setattr = shmem_setattr,
+ .set_acl = simple_set_acl,
+#endif
+};
+
+static const struct super_operations shmem_ops = {
+ .alloc_inode = shmem_alloc_inode,
+ .free_inode = shmem_free_in_core_inode,
+ .destroy_inode = shmem_destroy_inode,
+#ifdef CONFIG_TMPFS
+ .statfs = shmem_statfs,
+ .show_options = shmem_show_options,
+#endif
+ .evict_inode = shmem_evict_inode,
+ .drop_inode = generic_delete_inode,
+ .put_super = shmem_put_super,
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ .nr_cached_objects = shmem_unused_huge_count,
+ .free_cached_objects = shmem_unused_huge_scan,
+#endif
+};
+
+static const struct vm_operations_struct shmem_vm_ops = {
+ .fault = shmem_fault,
+ .map_pages = filemap_map_pages,
+#ifdef CONFIG_NUMA
+ .set_policy = shmem_set_policy,
+ .get_policy = shmem_get_policy,
+#endif
+};
+
+int shmem_init_fs_context(struct fs_context *fc)
+{
+ struct shmem_options *ctx;
+
+ ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->mode = 0777 | S_ISVTX;
+ ctx->uid = current_fsuid();
+ ctx->gid = current_fsgid();
+
+ fc->fs_private = ctx;
+ fc->ops = &shmem_fs_context_ops;
+ return 0;
+}
+
+static struct file_system_type shmem_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "tmpfs",
+ .init_fs_context = shmem_init_fs_context,
+#ifdef CONFIG_TMPFS
+ .parameters = shmem_fs_parameters,
+#endif
+ .kill_sb = kill_litter_super,
+ .fs_flags = FS_USERNS_MOUNT | FS_THP_SUPPORT,
+};
+
+int __init shmem_init(void)
+{
+ int error;
+
+ shmem_init_inodecache();
+
+ error = register_filesystem(&shmem_fs_type);
+ if (error) {
+ pr_err("Could not register tmpfs\n");
+ goto out2;
+ }
+
+ shm_mnt = kern_mount(&shmem_fs_type);
+ if (IS_ERR(shm_mnt)) {
+ error = PTR_ERR(shm_mnt);
+ pr_err("Could not kern_mount tmpfs\n");
+ goto out1;
+ }
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
+ SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
+ else
+ shmem_huge = 0; /* just in case it was patched */
+#endif
+ return 0;
+
+out1:
+ unregister_filesystem(&shmem_fs_type);
+out2:
+ shmem_destroy_inodecache();
+ shm_mnt = ERR_PTR(error);
+ return error;
+}
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
+static ssize_t shmem_enabled_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ static const int values[] = {
+ SHMEM_HUGE_ALWAYS,
+ SHMEM_HUGE_WITHIN_SIZE,
+ SHMEM_HUGE_ADVISE,
+ SHMEM_HUGE_NEVER,
+ SHMEM_HUGE_DENY,
+ SHMEM_HUGE_FORCE,
+ };
+ int i, count;
+
+ for (i = 0, count = 0; i < ARRAY_SIZE(values); i++) {
+ const char *fmt = shmem_huge == values[i] ? "[%s] " : "%s ";
+
+ count += sprintf(buf + count, fmt,
+ shmem_format_huge(values[i]));
+ }
+ buf[count - 1] = '\n';
+ return count;
+}
+
+static ssize_t shmem_enabled_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ char tmp[16];
+ int huge;
+
+ if (count + 1 > sizeof(tmp))
+ return -EINVAL;
+ memcpy(tmp, buf, count);
+ tmp[count] = '\0';
+ if (count && tmp[count - 1] == '\n')
+ tmp[count - 1] = '\0';
+
+ huge = shmem_parse_huge(tmp);
+ if (huge == -EINVAL)
+ return -EINVAL;
+ if (!has_transparent_hugepage() &&
+ huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
+ return -EINVAL;
+
+ shmem_huge = huge;
+ if (shmem_huge > SHMEM_HUGE_DENY)
+ SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
+ return count;
+}
+
+struct kobj_attribute shmem_enabled_attr =
+ __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+bool shmem_huge_enabled(struct vm_area_struct *vma)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ loff_t i_size;
+ pgoff_t off;
+
+ if (!transhuge_vma_enabled(vma, vma->vm_flags))
+ return false;
+ if (shmem_huge == SHMEM_HUGE_FORCE)
+ return true;
+ if (shmem_huge == SHMEM_HUGE_DENY)
+ return false;
+ switch (sbinfo->huge) {
+ case SHMEM_HUGE_NEVER:
+ return false;
+ case SHMEM_HUGE_ALWAYS:
+ return true;
+ case SHMEM_HUGE_WITHIN_SIZE:
+ off = round_up(vma->vm_pgoff, HPAGE_PMD_NR);
+ i_size = round_up(i_size_read(inode), PAGE_SIZE);
+ if (i_size >= HPAGE_PMD_SIZE &&
+ i_size >> PAGE_SHIFT >= off)
+ return true;
+ fallthrough;
+ case SHMEM_HUGE_ADVISE:
+ /* TODO: implement fadvise() hints */
+ return (vma->vm_flags & VM_HUGEPAGE);
+ default:
+ VM_BUG_ON(1);
+ return false;
+ }
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+#else /* !CONFIG_SHMEM */
+
+/*
+ * tiny-shmem: simple shmemfs and tmpfs using ramfs code
+ *
+ * This is intended for small system where the benefits of the full
+ * shmem code (swap-backed and resource-limited) are outweighed by
+ * their complexity. On systems without swap this code should be
+ * effectively equivalent, but much lighter weight.
+ */
+
+static struct file_system_type shmem_fs_type = {
+ .name = "tmpfs",
+ .init_fs_context = ramfs_init_fs_context,
+ .parameters = ramfs_fs_parameters,
+ .kill_sb = ramfs_kill_sb,
+ .fs_flags = FS_USERNS_MOUNT,
+};
+
+int __init shmem_init(void)
+{
+ BUG_ON(register_filesystem(&shmem_fs_type) != 0);
+
+ shm_mnt = kern_mount(&shmem_fs_type);
+ BUG_ON(IS_ERR(shm_mnt));
+
+ return 0;
+}
+
+int shmem_unuse(unsigned int type, bool frontswap,
+ unsigned long *fs_pages_to_unuse)
+{
+ return 0;
+}
+
+int shmem_lock(struct file *file, int lock, struct user_struct *user)
+{
+ return 0;
+}
+
+void shmem_unlock_mapping(struct address_space *mapping)
+{
+}
+
+#ifdef CONFIG_MMU
+unsigned long shmem_get_unmapped_area(struct file *file,
+ unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags)
+{
+ return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
+}
+#endif
+
+void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
+{
+ truncate_inode_pages_range(inode->i_mapping, lstart, lend);
+}
+EXPORT_SYMBOL_GPL(shmem_truncate_range);
+
+#define shmem_vm_ops generic_file_vm_ops
+#define shmem_file_operations ramfs_file_operations
+#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
+#define shmem_acct_size(flags, size) 0
+#define shmem_unacct_size(flags, size) do {} while (0)
+
+#endif /* CONFIG_SHMEM */
+
+/* common code */
+
+static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size,
+ unsigned long flags, unsigned int i_flags)
+{
+ struct inode *inode;
+ struct file *res;
+
+ if (IS_ERR(mnt))
+ return ERR_CAST(mnt);
+
+ if (size < 0 || size > MAX_LFS_FILESIZE)
+ return ERR_PTR(-EINVAL);
+
+ if (shmem_acct_size(flags, size))
+ return ERR_PTR(-ENOMEM);
+
+ inode = shmem_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0,
+ flags);
+ if (unlikely(!inode)) {
+ shmem_unacct_size(flags, size);
+ return ERR_PTR(-ENOSPC);
+ }
+ inode->i_flags |= i_flags;
+ inode->i_size = size;
+ clear_nlink(inode); /* It is unlinked */
+ res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
+ if (!IS_ERR(res))
+ res = alloc_file_pseudo(inode, mnt, name, O_RDWR,
+ &shmem_file_operations);
+ if (IS_ERR(res))
+ iput(inode);
+ return res;
+}
+
+/**
+ * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
+ * kernel internal. There will be NO LSM permission checks against the
+ * underlying inode. So users of this interface must do LSM checks at a
+ * higher layer. The users are the big_key and shm implementations. LSM
+ * checks are provided at the key or shm level rather than the inode.
+ * @name: name for dentry (to be seen in /proc/<pid>/maps
+ * @size: size to be set for the file
+ * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
+ */
+struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
+{
+ return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
+}
+
+/**
+ * shmem_file_setup - get an unlinked file living in tmpfs
+ * @name: name for dentry (to be seen in /proc/<pid>/maps
+ * @size: size to be set for the file
+ * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
+ */
+struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
+{
+ return __shmem_file_setup(shm_mnt, name, size, flags, 0);
+}
+EXPORT_SYMBOL_GPL(shmem_file_setup);
+
+/**
+ * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
+ * @mnt: the tmpfs mount where the file will be created
+ * @name: name for dentry (to be seen in /proc/<pid>/maps
+ * @size: size to be set for the file
+ * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
+ */
+struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name,
+ loff_t size, unsigned long flags)
+{
+ return __shmem_file_setup(mnt, name, size, flags, 0);
+}
+EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
+
+/**
+ * shmem_zero_setup - setup a shared anonymous mapping
+ * @vma: the vma to be mmapped is prepared by do_mmap
+ */
+int shmem_zero_setup(struct vm_area_struct *vma)
+{
+ struct file *file;
+ loff_t size = vma->vm_end - vma->vm_start;
+
+ /*
+ * Cloning a new file under mmap_lock leads to a lock ordering conflict
+ * between XFS directory reading and selinux: since this file is only
+ * accessible to the user through its mapping, use S_PRIVATE flag to
+ * bypass file security, in the same way as shmem_kernel_file_setup().
+ */
+ file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ if (vma->vm_file)
+ fput(vma->vm_file);
+ vma->vm_file = file;
+ vma->vm_ops = &shmem_vm_ops;
+
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+ ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
+ (vma->vm_end & HPAGE_PMD_MASK)) {
+ khugepaged_enter(vma, vma->vm_flags);
+ }
+
+ return 0;
+}
+
+/**
+ * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
+ * @mapping: the page's address_space
+ * @index: the page index
+ * @gfp: the page allocator flags to use if allocating
+ *
+ * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
+ * with any new page allocations done using the specified allocation flags.
+ * But read_cache_page_gfp() uses the ->readpage() method: which does not
+ * suit tmpfs, since it may have pages in swapcache, and needs to find those
+ * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
+ *
+ * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
+ * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
+ */
+struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
+ pgoff_t index, gfp_t gfp)
+{
+#ifdef CONFIG_SHMEM
+ struct inode *inode = mapping->host;
+ struct page *page;
+ int error;
+
+ BUG_ON(mapping->a_ops != &shmem_aops);
+ error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE,
+ gfp, NULL, NULL, NULL);
+ if (error)
+ page = ERR_PTR(error);
+ else
+ unlock_page(page);
+ return page;
+#else
+ /*
+ * The tiny !SHMEM case uses ramfs without swap
+ */
+ return read_cache_page_gfp(mapping, index, gfp);
+#endif
+}
+EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
diff --git a/mm/shuffle.c b/mm/shuffle.c
new file mode 100644
index 000000000..9c2e145a7
--- /dev/null
+++ b/mm/shuffle.c
@@ -0,0 +1,183 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright(c) 2018 Intel Corporation. All rights reserved.
+
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/mmzone.h>
+#include <linux/random.h>
+#include <linux/moduleparam.h>
+#include "internal.h"
+#include "shuffle.h"
+
+DEFINE_STATIC_KEY_FALSE(page_alloc_shuffle_key);
+
+static bool shuffle_param;
+static int shuffle_show(char *buffer, const struct kernel_param *kp)
+{
+ return sprintf(buffer, "%c\n", shuffle_param ? 'Y' : 'N');
+}
+
+static __meminit int shuffle_store(const char *val,
+ const struct kernel_param *kp)
+{
+ int rc = param_set_bool(val, kp);
+
+ if (rc < 0)
+ return rc;
+ if (shuffle_param)
+ static_branch_enable(&page_alloc_shuffle_key);
+ return 0;
+}
+module_param_call(shuffle, shuffle_store, shuffle_show, &shuffle_param, 0400);
+
+/*
+ * For two pages to be swapped in the shuffle, they must be free (on a
+ * 'free_area' lru), have the same order, and have the same migratetype.
+ */
+static struct page * __meminit shuffle_valid_page(struct zone *zone,
+ unsigned long pfn, int order)
+{
+ struct page *page = pfn_to_online_page(pfn);
+
+ /*
+ * Given we're dealing with randomly selected pfns in a zone we
+ * need to ask questions like...
+ */
+
+ /* ... is the page managed by the buddy? */
+ if (!page)
+ return NULL;
+
+ /* ... is the page assigned to the same zone? */
+ if (page_zone(page) != zone)
+ return NULL;
+
+ /* ...is the page free and currently on a free_area list? */
+ if (!PageBuddy(page))
+ return NULL;
+
+ /*
+ * ...is the page on the same list as the page we will
+ * shuffle it with?
+ */
+ if (buddy_order(page) != order)
+ return NULL;
+
+ return page;
+}
+
+/*
+ * Fisher-Yates shuffle the freelist which prescribes iterating through an
+ * array, pfns in this case, and randomly swapping each entry with another in
+ * the span, end_pfn - start_pfn.
+ *
+ * To keep the implementation simple it does not attempt to correct for sources
+ * of bias in the distribution, like modulo bias or pseudo-random number
+ * generator bias. I.e. the expectation is that this shuffling raises the bar
+ * for attacks that exploit the predictability of page allocations, but need not
+ * be a perfect shuffle.
+ */
+#define SHUFFLE_RETRY 10
+void __meminit __shuffle_zone(struct zone *z)
+{
+ unsigned long i, flags;
+ unsigned long start_pfn = z->zone_start_pfn;
+ unsigned long end_pfn = zone_end_pfn(z);
+ const int order = SHUFFLE_ORDER;
+ const int order_pages = 1 << order;
+
+ spin_lock_irqsave(&z->lock, flags);
+ start_pfn = ALIGN(start_pfn, order_pages);
+ for (i = start_pfn; i < end_pfn; i += order_pages) {
+ unsigned long j;
+ int migratetype, retry;
+ struct page *page_i, *page_j;
+
+ /*
+ * We expect page_i, in the sub-range of a zone being added
+ * (@start_pfn to @end_pfn), to more likely be valid compared to
+ * page_j randomly selected in the span @zone_start_pfn to
+ * @spanned_pages.
+ */
+ page_i = shuffle_valid_page(z, i, order);
+ if (!page_i)
+ continue;
+
+ for (retry = 0; retry < SHUFFLE_RETRY; retry++) {
+ /*
+ * Pick a random order aligned page in the zone span as
+ * a swap target. If the selected pfn is a hole, retry
+ * up to SHUFFLE_RETRY attempts find a random valid pfn
+ * in the zone.
+ */
+ j = z->zone_start_pfn +
+ ALIGN_DOWN(get_random_long() % z->spanned_pages,
+ order_pages);
+ page_j = shuffle_valid_page(z, j, order);
+ if (page_j && page_j != page_i)
+ break;
+ }
+ if (retry >= SHUFFLE_RETRY) {
+ pr_debug("%s: failed to swap %#lx\n", __func__, i);
+ continue;
+ }
+
+ /*
+ * Each migratetype corresponds to its own list, make sure the
+ * types match otherwise we're moving pages to lists where they
+ * do not belong.
+ */
+ migratetype = get_pageblock_migratetype(page_i);
+ if (get_pageblock_migratetype(page_j) != migratetype) {
+ pr_debug("%s: migratetype mismatch %#lx\n", __func__, i);
+ continue;
+ }
+
+ list_swap(&page_i->lru, &page_j->lru);
+
+ pr_debug("%s: swap: %#lx -> %#lx\n", __func__, i, j);
+
+ /* take it easy on the zone lock */
+ if ((i % (100 * order_pages)) == 0) {
+ spin_unlock_irqrestore(&z->lock, flags);
+ cond_resched();
+ spin_lock_irqsave(&z->lock, flags);
+ }
+ }
+ spin_unlock_irqrestore(&z->lock, flags);
+}
+
+/**
+ * shuffle_free_memory - reduce the predictability of the page allocator
+ * @pgdat: node page data
+ */
+void __meminit __shuffle_free_memory(pg_data_t *pgdat)
+{
+ struct zone *z;
+
+ for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
+ shuffle_zone(z);
+}
+
+bool shuffle_pick_tail(void)
+{
+ static u64 rand;
+ static u8 rand_bits;
+ bool ret;
+
+ /*
+ * The lack of locking is deliberate. If 2 threads race to
+ * update the rand state it just adds to the entropy.
+ */
+ if (rand_bits == 0) {
+ rand_bits = 64;
+ rand = get_random_u64();
+ }
+
+ ret = rand & 1;
+
+ rand_bits--;
+ rand >>= 1;
+
+ return ret;
+}
diff --git a/mm/shuffle.h b/mm/shuffle.h
new file mode 100644
index 000000000..71b784f0b
--- /dev/null
+++ b/mm/shuffle.h
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright(c) 2018 Intel Corporation. All rights reserved.
+#ifndef _MM_SHUFFLE_H
+#define _MM_SHUFFLE_H
+#include <linux/jump_label.h>
+
+#define SHUFFLE_ORDER (MAX_ORDER-1)
+
+#ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR
+DECLARE_STATIC_KEY_FALSE(page_alloc_shuffle_key);
+extern void __shuffle_free_memory(pg_data_t *pgdat);
+extern bool shuffle_pick_tail(void);
+static inline void shuffle_free_memory(pg_data_t *pgdat)
+{
+ if (!static_branch_unlikely(&page_alloc_shuffle_key))
+ return;
+ __shuffle_free_memory(pgdat);
+}
+
+extern void __shuffle_zone(struct zone *z);
+static inline void shuffle_zone(struct zone *z)
+{
+ if (!static_branch_unlikely(&page_alloc_shuffle_key))
+ return;
+ __shuffle_zone(z);
+}
+
+static inline bool is_shuffle_order(int order)
+{
+ if (!static_branch_unlikely(&page_alloc_shuffle_key))
+ return false;
+ return order >= SHUFFLE_ORDER;
+}
+#else
+static inline bool shuffle_pick_tail(void)
+{
+ return false;
+}
+
+static inline void shuffle_free_memory(pg_data_t *pgdat)
+{
+}
+
+static inline void shuffle_zone(struct zone *z)
+{
+}
+
+static inline bool is_shuffle_order(int order)
+{
+ return false;
+}
+#endif
+#endif /* _MM_SHUFFLE_H */
diff --git a/mm/slab.c b/mm/slab.c
new file mode 100644
index 000000000..b2cc2cf7d
--- /dev/null
+++ b/mm/slab.c
@@ -0,0 +1,4201 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/mm/slab.c
+ * Written by Mark Hemment, 1996/97.
+ * (markhe@nextd.demon.co.uk)
+ *
+ * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
+ *
+ * Major cleanup, different bufctl logic, per-cpu arrays
+ * (c) 2000 Manfred Spraul
+ *
+ * Cleanup, make the head arrays unconditional, preparation for NUMA
+ * (c) 2002 Manfred Spraul
+ *
+ * An implementation of the Slab Allocator as described in outline in;
+ * UNIX Internals: The New Frontiers by Uresh Vahalia
+ * Pub: Prentice Hall ISBN 0-13-101908-2
+ * or with a little more detail in;
+ * The Slab Allocator: An Object-Caching Kernel Memory Allocator
+ * Jeff Bonwick (Sun Microsystems).
+ * Presented at: USENIX Summer 1994 Technical Conference
+ *
+ * The memory is organized in caches, one cache for each object type.
+ * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
+ * Each cache consists out of many slabs (they are small (usually one
+ * page long) and always contiguous), and each slab contains multiple
+ * initialized objects.
+ *
+ * This means, that your constructor is used only for newly allocated
+ * slabs and you must pass objects with the same initializations to
+ * kmem_cache_free.
+ *
+ * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
+ * normal). If you need a special memory type, then must create a new
+ * cache for that memory type.
+ *
+ * In order to reduce fragmentation, the slabs are sorted in 3 groups:
+ * full slabs with 0 free objects
+ * partial slabs
+ * empty slabs with no allocated objects
+ *
+ * If partial slabs exist, then new allocations come from these slabs,
+ * otherwise from empty slabs or new slabs are allocated.
+ *
+ * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
+ * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
+ *
+ * Each cache has a short per-cpu head array, most allocs
+ * and frees go into that array, and if that array overflows, then 1/2
+ * of the entries in the array are given back into the global cache.
+ * The head array is strictly LIFO and should improve the cache hit rates.
+ * On SMP, it additionally reduces the spinlock operations.
+ *
+ * The c_cpuarray may not be read with enabled local interrupts -
+ * it's changed with a smp_call_function().
+ *
+ * SMP synchronization:
+ * constructors and destructors are called without any locking.
+ * Several members in struct kmem_cache and struct slab never change, they
+ * are accessed without any locking.
+ * The per-cpu arrays are never accessed from the wrong cpu, no locking,
+ * and local interrupts are disabled so slab code is preempt-safe.
+ * The non-constant members are protected with a per-cache irq spinlock.
+ *
+ * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
+ * in 2000 - many ideas in the current implementation are derived from
+ * his patch.
+ *
+ * Further notes from the original documentation:
+ *
+ * 11 April '97. Started multi-threading - markhe
+ * The global cache-chain is protected by the mutex 'slab_mutex'.
+ * The sem is only needed when accessing/extending the cache-chain, which
+ * can never happen inside an interrupt (kmem_cache_create(),
+ * kmem_cache_shrink() and kmem_cache_reap()).
+ *
+ * At present, each engine can be growing a cache. This should be blocked.
+ *
+ * 15 March 2005. NUMA slab allocator.
+ * Shai Fultheim <shai@scalex86.org>.
+ * Shobhit Dayal <shobhit@calsoftinc.com>
+ * Alok N Kataria <alokk@calsoftinc.com>
+ * Christoph Lameter <christoph@lameter.com>
+ *
+ * Modified the slab allocator to be node aware on NUMA systems.
+ * Each node has its own list of partial, free and full slabs.
+ * All object allocations for a node occur from node specific slab lists.
+ */
+
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/poison.h>
+#include <linux/swap.h>
+#include <linux/cache.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/compiler.h>
+#include <linux/cpuset.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/notifier.h>
+#include <linux/kallsyms.h>
+#include <linux/cpu.h>
+#include <linux/sysctl.h>
+#include <linux/module.h>
+#include <linux/rcupdate.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+#include <linux/nodemask.h>
+#include <linux/kmemleak.h>
+#include <linux/mempolicy.h>
+#include <linux/mutex.h>
+#include <linux/fault-inject.h>
+#include <linux/rtmutex.h>
+#include <linux/reciprocal_div.h>
+#include <linux/debugobjects.h>
+#include <linux/memory.h>
+#include <linux/prefetch.h>
+#include <linux/sched/task_stack.h>
+
+#include <net/sock.h>
+
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/page.h>
+
+#include <trace/events/kmem.h>
+
+#include "internal.h"
+
+#include "slab.h"
+
+/*
+ * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
+ * 0 for faster, smaller code (especially in the critical paths).
+ *
+ * STATS - 1 to collect stats for /proc/slabinfo.
+ * 0 for faster, smaller code (especially in the critical paths).
+ *
+ * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
+ */
+
+#ifdef CONFIG_DEBUG_SLAB
+#define DEBUG 1
+#define STATS 1
+#define FORCED_DEBUG 1
+#else
+#define DEBUG 0
+#define STATS 0
+#define FORCED_DEBUG 0
+#endif
+
+/* Shouldn't this be in a header file somewhere? */
+#define BYTES_PER_WORD sizeof(void *)
+#define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long))
+
+#ifndef ARCH_KMALLOC_FLAGS
+#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
+#endif
+
+#define FREELIST_BYTE_INDEX (((PAGE_SIZE >> BITS_PER_BYTE) \
+ <= SLAB_OBJ_MIN_SIZE) ? 1 : 0)
+
+#if FREELIST_BYTE_INDEX
+typedef unsigned char freelist_idx_t;
+#else
+typedef unsigned short freelist_idx_t;
+#endif
+
+#define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1)
+
+/*
+ * struct array_cache
+ *
+ * Purpose:
+ * - LIFO ordering, to hand out cache-warm objects from _alloc
+ * - reduce the number of linked list operations
+ * - reduce spinlock operations
+ *
+ * The limit is stored in the per-cpu structure to reduce the data cache
+ * footprint.
+ *
+ */
+struct array_cache {
+ unsigned int avail;
+ unsigned int limit;
+ unsigned int batchcount;
+ unsigned int touched;
+ void *entry[]; /*
+ * Must have this definition in here for the proper
+ * alignment of array_cache. Also simplifies accessing
+ * the entries.
+ */
+};
+
+struct alien_cache {
+ spinlock_t lock;
+ struct array_cache ac;
+};
+
+/*
+ * Need this for bootstrapping a per node allocator.
+ */
+#define NUM_INIT_LISTS (2 * MAX_NUMNODES)
+static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS];
+#define CACHE_CACHE 0
+#define SIZE_NODE (MAX_NUMNODES)
+
+static int drain_freelist(struct kmem_cache *cache,
+ struct kmem_cache_node *n, int tofree);
+static void free_block(struct kmem_cache *cachep, void **objpp, int len,
+ int node, struct list_head *list);
+static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list);
+static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
+static void cache_reap(struct work_struct *unused);
+
+static inline void fixup_objfreelist_debug(struct kmem_cache *cachep,
+ void **list);
+static inline void fixup_slab_list(struct kmem_cache *cachep,
+ struct kmem_cache_node *n, struct page *page,
+ void **list);
+static int slab_early_init = 1;
+
+#define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node))
+
+static void kmem_cache_node_init(struct kmem_cache_node *parent)
+{
+ INIT_LIST_HEAD(&parent->slabs_full);
+ INIT_LIST_HEAD(&parent->slabs_partial);
+ INIT_LIST_HEAD(&parent->slabs_free);
+ parent->total_slabs = 0;
+ parent->free_slabs = 0;
+ parent->shared = NULL;
+ parent->alien = NULL;
+ parent->colour_next = 0;
+ spin_lock_init(&parent->list_lock);
+ parent->free_objects = 0;
+ parent->free_touched = 0;
+}
+
+#define MAKE_LIST(cachep, listp, slab, nodeid) \
+ do { \
+ INIT_LIST_HEAD(listp); \
+ list_splice(&get_node(cachep, nodeid)->slab, listp); \
+ } while (0)
+
+#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \
+ do { \
+ MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \
+ MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
+ MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
+ } while (0)
+
+#define CFLGS_OBJFREELIST_SLAB ((slab_flags_t __force)0x40000000U)
+#define CFLGS_OFF_SLAB ((slab_flags_t __force)0x80000000U)
+#define OBJFREELIST_SLAB(x) ((x)->flags & CFLGS_OBJFREELIST_SLAB)
+#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
+
+#define BATCHREFILL_LIMIT 16
+/*
+ * Optimization question: fewer reaps means less probability for unnessary
+ * cpucache drain/refill cycles.
+ *
+ * OTOH the cpuarrays can contain lots of objects,
+ * which could lock up otherwise freeable slabs.
+ */
+#define REAPTIMEOUT_AC (2*HZ)
+#define REAPTIMEOUT_NODE (4*HZ)
+
+#if STATS
+#define STATS_INC_ACTIVE(x) ((x)->num_active++)
+#define STATS_DEC_ACTIVE(x) ((x)->num_active--)
+#define STATS_INC_ALLOCED(x) ((x)->num_allocations++)
+#define STATS_INC_GROWN(x) ((x)->grown++)
+#define STATS_ADD_REAPED(x,y) ((x)->reaped += (y))
+#define STATS_SET_HIGH(x) \
+ do { \
+ if ((x)->num_active > (x)->high_mark) \
+ (x)->high_mark = (x)->num_active; \
+ } while (0)
+#define STATS_INC_ERR(x) ((x)->errors++)
+#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++)
+#define STATS_INC_NODEFREES(x) ((x)->node_frees++)
+#define STATS_INC_ACOVERFLOW(x) ((x)->node_overflow++)
+#define STATS_SET_FREEABLE(x, i) \
+ do { \
+ if ((x)->max_freeable < i) \
+ (x)->max_freeable = i; \
+ } while (0)
+#define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit)
+#define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss)
+#define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit)
+#define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss)
+#else
+#define STATS_INC_ACTIVE(x) do { } while (0)
+#define STATS_DEC_ACTIVE(x) do { } while (0)
+#define STATS_INC_ALLOCED(x) do { } while (0)
+#define STATS_INC_GROWN(x) do { } while (0)
+#define STATS_ADD_REAPED(x,y) do { (void)(y); } while (0)
+#define STATS_SET_HIGH(x) do { } while (0)
+#define STATS_INC_ERR(x) do { } while (0)
+#define STATS_INC_NODEALLOCS(x) do { } while (0)
+#define STATS_INC_NODEFREES(x) do { } while (0)
+#define STATS_INC_ACOVERFLOW(x) do { } while (0)
+#define STATS_SET_FREEABLE(x, i) do { } while (0)
+#define STATS_INC_ALLOCHIT(x) do { } while (0)
+#define STATS_INC_ALLOCMISS(x) do { } while (0)
+#define STATS_INC_FREEHIT(x) do { } while (0)
+#define STATS_INC_FREEMISS(x) do { } while (0)
+#endif
+
+#if DEBUG
+
+/*
+ * memory layout of objects:
+ * 0 : objp
+ * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
+ * the end of an object is aligned with the end of the real
+ * allocation. Catches writes behind the end of the allocation.
+ * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
+ * redzone word.
+ * cachep->obj_offset: The real object.
+ * cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
+ * cachep->size - 1* BYTES_PER_WORD: last caller address
+ * [BYTES_PER_WORD long]
+ */
+static int obj_offset(struct kmem_cache *cachep)
+{
+ return cachep->obj_offset;
+}
+
+static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
+{
+ BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
+ return (unsigned long long*) (objp + obj_offset(cachep) -
+ sizeof(unsigned long long));
+}
+
+static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
+{
+ BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
+ if (cachep->flags & SLAB_STORE_USER)
+ return (unsigned long long *)(objp + cachep->size -
+ sizeof(unsigned long long) -
+ REDZONE_ALIGN);
+ return (unsigned long long *) (objp + cachep->size -
+ sizeof(unsigned long long));
+}
+
+static void **dbg_userword(struct kmem_cache *cachep, void *objp)
+{
+ BUG_ON(!(cachep->flags & SLAB_STORE_USER));
+ return (void **)(objp + cachep->size - BYTES_PER_WORD);
+}
+
+#else
+
+#define obj_offset(x) 0
+#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;})
+#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;})
+#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;})
+
+#endif
+
+/*
+ * Do not go above this order unless 0 objects fit into the slab or
+ * overridden on the command line.
+ */
+#define SLAB_MAX_ORDER_HI 1
+#define SLAB_MAX_ORDER_LO 0
+static int slab_max_order = SLAB_MAX_ORDER_LO;
+static bool slab_max_order_set __initdata;
+
+static inline void *index_to_obj(struct kmem_cache *cache, struct page *page,
+ unsigned int idx)
+{
+ return page->s_mem + cache->size * idx;
+}
+
+#define BOOT_CPUCACHE_ENTRIES 1
+/* internal cache of cache description objs */
+static struct kmem_cache kmem_cache_boot = {
+ .batchcount = 1,
+ .limit = BOOT_CPUCACHE_ENTRIES,
+ .shared = 1,
+ .size = sizeof(struct kmem_cache),
+ .name = "kmem_cache",
+};
+
+static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
+
+static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
+{
+ return this_cpu_ptr(cachep->cpu_cache);
+}
+
+/*
+ * Calculate the number of objects and left-over bytes for a given buffer size.
+ */
+static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size,
+ slab_flags_t flags, size_t *left_over)
+{
+ unsigned int num;
+ size_t slab_size = PAGE_SIZE << gfporder;
+
+ /*
+ * The slab management structure can be either off the slab or
+ * on it. For the latter case, the memory allocated for a
+ * slab is used for:
+ *
+ * - @buffer_size bytes for each object
+ * - One freelist_idx_t for each object
+ *
+ * We don't need to consider alignment of freelist because
+ * freelist will be at the end of slab page. The objects will be
+ * at the correct alignment.
+ *
+ * If the slab management structure is off the slab, then the
+ * alignment will already be calculated into the size. Because
+ * the slabs are all pages aligned, the objects will be at the
+ * correct alignment when allocated.
+ */
+ if (flags & (CFLGS_OBJFREELIST_SLAB | CFLGS_OFF_SLAB)) {
+ num = slab_size / buffer_size;
+ *left_over = slab_size % buffer_size;
+ } else {
+ num = slab_size / (buffer_size + sizeof(freelist_idx_t));
+ *left_over = slab_size %
+ (buffer_size + sizeof(freelist_idx_t));
+ }
+
+ return num;
+}
+
+#if DEBUG
+#define slab_error(cachep, msg) __slab_error(__func__, cachep, msg)
+
+static void __slab_error(const char *function, struct kmem_cache *cachep,
+ char *msg)
+{
+ pr_err("slab error in %s(): cache `%s': %s\n",
+ function, cachep->name, msg);
+ dump_stack();
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+}
+#endif
+
+/*
+ * By default on NUMA we use alien caches to stage the freeing of
+ * objects allocated from other nodes. This causes massive memory
+ * inefficiencies when using fake NUMA setup to split memory into a
+ * large number of small nodes, so it can be disabled on the command
+ * line
+ */
+
+static int use_alien_caches __read_mostly = 1;
+static int __init noaliencache_setup(char *s)
+{
+ use_alien_caches = 0;
+ return 1;
+}
+__setup("noaliencache", noaliencache_setup);
+
+static int __init slab_max_order_setup(char *str)
+{
+ get_option(&str, &slab_max_order);
+ slab_max_order = slab_max_order < 0 ? 0 :
+ min(slab_max_order, MAX_ORDER - 1);
+ slab_max_order_set = true;
+
+ return 1;
+}
+__setup("slab_max_order=", slab_max_order_setup);
+
+#ifdef CONFIG_NUMA
+/*
+ * Special reaping functions for NUMA systems called from cache_reap().
+ * These take care of doing round robin flushing of alien caches (containing
+ * objects freed on different nodes from which they were allocated) and the
+ * flushing of remote pcps by calling drain_node_pages.
+ */
+static DEFINE_PER_CPU(unsigned long, slab_reap_node);
+
+static void init_reap_node(int cpu)
+{
+ per_cpu(slab_reap_node, cpu) = next_node_in(cpu_to_mem(cpu),
+ node_online_map);
+}
+
+static void next_reap_node(void)
+{
+ int node = __this_cpu_read(slab_reap_node);
+
+ node = next_node_in(node, node_online_map);
+ __this_cpu_write(slab_reap_node, node);
+}
+
+#else
+#define init_reap_node(cpu) do { } while (0)
+#define next_reap_node(void) do { } while (0)
+#endif
+
+/*
+ * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz
+ * via the workqueue/eventd.
+ * Add the CPU number into the expiration time to minimize the possibility of
+ * the CPUs getting into lockstep and contending for the global cache chain
+ * lock.
+ */
+static void start_cpu_timer(int cpu)
+{
+ struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu);
+
+ if (reap_work->work.func == NULL) {
+ init_reap_node(cpu);
+ INIT_DEFERRABLE_WORK(reap_work, cache_reap);
+ schedule_delayed_work_on(cpu, reap_work,
+ __round_jiffies_relative(HZ, cpu));
+ }
+}
+
+static void init_arraycache(struct array_cache *ac, int limit, int batch)
+{
+ if (ac) {
+ ac->avail = 0;
+ ac->limit = limit;
+ ac->batchcount = batch;
+ ac->touched = 0;
+ }
+}
+
+static struct array_cache *alloc_arraycache(int node, int entries,
+ int batchcount, gfp_t gfp)
+{
+ size_t memsize = sizeof(void *) * entries + sizeof(struct array_cache);
+ struct array_cache *ac = NULL;
+
+ ac = kmalloc_node(memsize, gfp, node);
+ /*
+ * The array_cache structures contain pointers to free object.
+ * However, when such objects are allocated or transferred to another
+ * cache the pointers are not cleared and they could be counted as
+ * valid references during a kmemleak scan. Therefore, kmemleak must
+ * not scan such objects.
+ */
+ kmemleak_no_scan(ac);
+ init_arraycache(ac, entries, batchcount);
+ return ac;
+}
+
+static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep,
+ struct page *page, void *objp)
+{
+ struct kmem_cache_node *n;
+ int page_node;
+ LIST_HEAD(list);
+
+ page_node = page_to_nid(page);
+ n = get_node(cachep, page_node);
+
+ spin_lock(&n->list_lock);
+ free_block(cachep, &objp, 1, page_node, &list);
+ spin_unlock(&n->list_lock);
+
+ slabs_destroy(cachep, &list);
+}
+
+/*
+ * Transfer objects in one arraycache to another.
+ * Locking must be handled by the caller.
+ *
+ * Return the number of entries transferred.
+ */
+static int transfer_objects(struct array_cache *to,
+ struct array_cache *from, unsigned int max)
+{
+ /* Figure out how many entries to transfer */
+ int nr = min3(from->avail, max, to->limit - to->avail);
+
+ if (!nr)
+ return 0;
+
+ memcpy(to->entry + to->avail, from->entry + from->avail -nr,
+ sizeof(void *) *nr);
+
+ from->avail -= nr;
+ to->avail += nr;
+ return nr;
+}
+
+/* &alien->lock must be held by alien callers. */
+static __always_inline void __free_one(struct array_cache *ac, void *objp)
+{
+ /* Avoid trivial double-free. */
+ if (IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) &&
+ WARN_ON_ONCE(ac->avail > 0 && ac->entry[ac->avail - 1] == objp))
+ return;
+ ac->entry[ac->avail++] = objp;
+}
+
+#ifndef CONFIG_NUMA
+
+#define drain_alien_cache(cachep, alien) do { } while (0)
+#define reap_alien(cachep, n) do { } while (0)
+
+static inline struct alien_cache **alloc_alien_cache(int node,
+ int limit, gfp_t gfp)
+{
+ return NULL;
+}
+
+static inline void free_alien_cache(struct alien_cache **ac_ptr)
+{
+}
+
+static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
+{
+ return 0;
+}
+
+static inline void *alternate_node_alloc(struct kmem_cache *cachep,
+ gfp_t flags)
+{
+ return NULL;
+}
+
+static inline void *____cache_alloc_node(struct kmem_cache *cachep,
+ gfp_t flags, int nodeid)
+{
+ return NULL;
+}
+
+static inline gfp_t gfp_exact_node(gfp_t flags)
+{
+ return flags & ~__GFP_NOFAIL;
+}
+
+#else /* CONFIG_NUMA */
+
+static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
+static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
+
+static struct alien_cache *__alloc_alien_cache(int node, int entries,
+ int batch, gfp_t gfp)
+{
+ size_t memsize = sizeof(void *) * entries + sizeof(struct alien_cache);
+ struct alien_cache *alc = NULL;
+
+ alc = kmalloc_node(memsize, gfp, node);
+ if (alc) {
+ kmemleak_no_scan(alc);
+ init_arraycache(&alc->ac, entries, batch);
+ spin_lock_init(&alc->lock);
+ }
+ return alc;
+}
+
+static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
+{
+ struct alien_cache **alc_ptr;
+ int i;
+
+ if (limit > 1)
+ limit = 12;
+ alc_ptr = kcalloc_node(nr_node_ids, sizeof(void *), gfp, node);
+ if (!alc_ptr)
+ return NULL;
+
+ for_each_node(i) {
+ if (i == node || !node_online(i))
+ continue;
+ alc_ptr[i] = __alloc_alien_cache(node, limit, 0xbaadf00d, gfp);
+ if (!alc_ptr[i]) {
+ for (i--; i >= 0; i--)
+ kfree(alc_ptr[i]);
+ kfree(alc_ptr);
+ return NULL;
+ }
+ }
+ return alc_ptr;
+}
+
+static void free_alien_cache(struct alien_cache **alc_ptr)
+{
+ int i;
+
+ if (!alc_ptr)
+ return;
+ for_each_node(i)
+ kfree(alc_ptr[i]);
+ kfree(alc_ptr);
+}
+
+static void __drain_alien_cache(struct kmem_cache *cachep,
+ struct array_cache *ac, int node,
+ struct list_head *list)
+{
+ struct kmem_cache_node *n = get_node(cachep, node);
+
+ if (ac->avail) {
+ spin_lock(&n->list_lock);
+ /*
+ * Stuff objects into the remote nodes shared array first.
+ * That way we could avoid the overhead of putting the objects
+ * into the free lists and getting them back later.
+ */
+ if (n->shared)
+ transfer_objects(n->shared, ac, ac->limit);
+
+ free_block(cachep, ac->entry, ac->avail, node, list);
+ ac->avail = 0;
+ spin_unlock(&n->list_lock);
+ }
+}
+
+/*
+ * Called from cache_reap() to regularly drain alien caches round robin.
+ */
+static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *n)
+{
+ int node = __this_cpu_read(slab_reap_node);
+
+ if (n->alien) {
+ struct alien_cache *alc = n->alien[node];
+ struct array_cache *ac;
+
+ if (alc) {
+ ac = &alc->ac;
+ if (ac->avail && spin_trylock_irq(&alc->lock)) {
+ LIST_HEAD(list);
+
+ __drain_alien_cache(cachep, ac, node, &list);
+ spin_unlock_irq(&alc->lock);
+ slabs_destroy(cachep, &list);
+ }
+ }
+ }
+}
+
+static void drain_alien_cache(struct kmem_cache *cachep,
+ struct alien_cache **alien)
+{
+ int i = 0;
+ struct alien_cache *alc;
+ struct array_cache *ac;
+ unsigned long flags;
+
+ for_each_online_node(i) {
+ alc = alien[i];
+ if (alc) {
+ LIST_HEAD(list);
+
+ ac = &alc->ac;
+ spin_lock_irqsave(&alc->lock, flags);
+ __drain_alien_cache(cachep, ac, i, &list);
+ spin_unlock_irqrestore(&alc->lock, flags);
+ slabs_destroy(cachep, &list);
+ }
+ }
+}
+
+static int __cache_free_alien(struct kmem_cache *cachep, void *objp,
+ int node, int page_node)
+{
+ struct kmem_cache_node *n;
+ struct alien_cache *alien = NULL;
+ struct array_cache *ac;
+ LIST_HEAD(list);
+
+ n = get_node(cachep, node);
+ STATS_INC_NODEFREES(cachep);
+ if (n->alien && n->alien[page_node]) {
+ alien = n->alien[page_node];
+ ac = &alien->ac;
+ spin_lock(&alien->lock);
+ if (unlikely(ac->avail == ac->limit)) {
+ STATS_INC_ACOVERFLOW(cachep);
+ __drain_alien_cache(cachep, ac, page_node, &list);
+ }
+ __free_one(ac, objp);
+ spin_unlock(&alien->lock);
+ slabs_destroy(cachep, &list);
+ } else {
+ n = get_node(cachep, page_node);
+ spin_lock(&n->list_lock);
+ free_block(cachep, &objp, 1, page_node, &list);
+ spin_unlock(&n->list_lock);
+ slabs_destroy(cachep, &list);
+ }
+ return 1;
+}
+
+static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
+{
+ int page_node = page_to_nid(virt_to_page(objp));
+ int node = numa_mem_id();
+ /*
+ * Make sure we are not freeing a object from another node to the array
+ * cache on this cpu.
+ */
+ if (likely(node == page_node))
+ return 0;
+
+ return __cache_free_alien(cachep, objp, node, page_node);
+}
+
+/*
+ * Construct gfp mask to allocate from a specific node but do not reclaim or
+ * warn about failures.
+ */
+static inline gfp_t gfp_exact_node(gfp_t flags)
+{
+ return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~(__GFP_RECLAIM|__GFP_NOFAIL);
+}
+#endif
+
+static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp)
+{
+ struct kmem_cache_node *n;
+
+ /*
+ * Set up the kmem_cache_node for cpu before we can
+ * begin anything. Make sure some other cpu on this
+ * node has not already allocated this
+ */
+ n = get_node(cachep, node);
+ if (n) {
+ spin_lock_irq(&n->list_lock);
+ n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount +
+ cachep->num;
+ spin_unlock_irq(&n->list_lock);
+
+ return 0;
+ }
+
+ n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node);
+ if (!n)
+ return -ENOMEM;
+
+ kmem_cache_node_init(n);
+ n->next_reap = jiffies + REAPTIMEOUT_NODE +
+ ((unsigned long)cachep) % REAPTIMEOUT_NODE;
+
+ n->free_limit =
+ (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num;
+
+ /*
+ * The kmem_cache_nodes don't come and go as CPUs
+ * come and go. slab_mutex is sufficient
+ * protection here.
+ */
+ cachep->node[node] = n;
+
+ return 0;
+}
+
+#if (defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)) || defined(CONFIG_SMP)
+/*
+ * Allocates and initializes node for a node on each slab cache, used for
+ * either memory or cpu hotplug. If memory is being hot-added, the kmem_cache_node
+ * will be allocated off-node since memory is not yet online for the new node.
+ * When hotplugging memory or a cpu, existing node are not replaced if
+ * already in use.
+ *
+ * Must hold slab_mutex.
+ */
+static int init_cache_node_node(int node)
+{
+ int ret;
+ struct kmem_cache *cachep;
+
+ list_for_each_entry(cachep, &slab_caches, list) {
+ ret = init_cache_node(cachep, node, GFP_KERNEL);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+#endif
+
+static int setup_kmem_cache_node(struct kmem_cache *cachep,
+ int node, gfp_t gfp, bool force_change)
+{
+ int ret = -ENOMEM;
+ struct kmem_cache_node *n;
+ struct array_cache *old_shared = NULL;
+ struct array_cache *new_shared = NULL;
+ struct alien_cache **new_alien = NULL;
+ LIST_HEAD(list);
+
+ if (use_alien_caches) {
+ new_alien = alloc_alien_cache(node, cachep->limit, gfp);
+ if (!new_alien)
+ goto fail;
+ }
+
+ if (cachep->shared) {
+ new_shared = alloc_arraycache(node,
+ cachep->shared * cachep->batchcount, 0xbaadf00d, gfp);
+ if (!new_shared)
+ goto fail;
+ }
+
+ ret = init_cache_node(cachep, node, gfp);
+ if (ret)
+ goto fail;
+
+ n = get_node(cachep, node);
+ spin_lock_irq(&n->list_lock);
+ if (n->shared && force_change) {
+ free_block(cachep, n->shared->entry,
+ n->shared->avail, node, &list);
+ n->shared->avail = 0;
+ }
+
+ if (!n->shared || force_change) {
+ old_shared = n->shared;
+ n->shared = new_shared;
+ new_shared = NULL;
+ }
+
+ if (!n->alien) {
+ n->alien = new_alien;
+ new_alien = NULL;
+ }
+
+ spin_unlock_irq(&n->list_lock);
+ slabs_destroy(cachep, &list);
+
+ /*
+ * To protect lockless access to n->shared during irq disabled context.
+ * If n->shared isn't NULL in irq disabled context, accessing to it is
+ * guaranteed to be valid until irq is re-enabled, because it will be
+ * freed after synchronize_rcu().
+ */
+ if (old_shared && force_change)
+ synchronize_rcu();
+
+fail:
+ kfree(old_shared);
+ kfree(new_shared);
+ free_alien_cache(new_alien);
+
+ return ret;
+}
+
+#ifdef CONFIG_SMP
+
+static void cpuup_canceled(long cpu)
+{
+ struct kmem_cache *cachep;
+ struct kmem_cache_node *n = NULL;
+ int node = cpu_to_mem(cpu);
+ const struct cpumask *mask = cpumask_of_node(node);
+
+ list_for_each_entry(cachep, &slab_caches, list) {
+ struct array_cache *nc;
+ struct array_cache *shared;
+ struct alien_cache **alien;
+ LIST_HEAD(list);
+
+ n = get_node(cachep, node);
+ if (!n)
+ continue;
+
+ spin_lock_irq(&n->list_lock);
+
+ /* Free limit for this kmem_cache_node */
+ n->free_limit -= cachep->batchcount;
+
+ /* cpu is dead; no one can alloc from it. */
+ nc = per_cpu_ptr(cachep->cpu_cache, cpu);
+ free_block(cachep, nc->entry, nc->avail, node, &list);
+ nc->avail = 0;
+
+ if (!cpumask_empty(mask)) {
+ spin_unlock_irq(&n->list_lock);
+ goto free_slab;
+ }
+
+ shared = n->shared;
+ if (shared) {
+ free_block(cachep, shared->entry,
+ shared->avail, node, &list);
+ n->shared = NULL;
+ }
+
+ alien = n->alien;
+ n->alien = NULL;
+
+ spin_unlock_irq(&n->list_lock);
+
+ kfree(shared);
+ if (alien) {
+ drain_alien_cache(cachep, alien);
+ free_alien_cache(alien);
+ }
+
+free_slab:
+ slabs_destroy(cachep, &list);
+ }
+ /*
+ * In the previous loop, all the objects were freed to
+ * the respective cache's slabs, now we can go ahead and
+ * shrink each nodelist to its limit.
+ */
+ list_for_each_entry(cachep, &slab_caches, list) {
+ n = get_node(cachep, node);
+ if (!n)
+ continue;
+ drain_freelist(cachep, n, INT_MAX);
+ }
+}
+
+static int cpuup_prepare(long cpu)
+{
+ struct kmem_cache *cachep;
+ int node = cpu_to_mem(cpu);
+ int err;
+
+ /*
+ * We need to do this right in the beginning since
+ * alloc_arraycache's are going to use this list.
+ * kmalloc_node allows us to add the slab to the right
+ * kmem_cache_node and not this cpu's kmem_cache_node
+ */
+ err = init_cache_node_node(node);
+ if (err < 0)
+ goto bad;
+
+ /*
+ * Now we can go ahead with allocating the shared arrays and
+ * array caches
+ */
+ list_for_each_entry(cachep, &slab_caches, list) {
+ err = setup_kmem_cache_node(cachep, node, GFP_KERNEL, false);
+ if (err)
+ goto bad;
+ }
+
+ return 0;
+bad:
+ cpuup_canceled(cpu);
+ return -ENOMEM;
+}
+
+int slab_prepare_cpu(unsigned int cpu)
+{
+ int err;
+
+ mutex_lock(&slab_mutex);
+ err = cpuup_prepare(cpu);
+ mutex_unlock(&slab_mutex);
+ return err;
+}
+
+/*
+ * This is called for a failed online attempt and for a successful
+ * offline.
+ *
+ * Even if all the cpus of a node are down, we don't free the
+ * kmem_cache_node of any cache. This to avoid a race between cpu_down, and
+ * a kmalloc allocation from another cpu for memory from the node of
+ * the cpu going down. The kmem_cache_node structure is usually allocated from
+ * kmem_cache_create() and gets destroyed at kmem_cache_destroy().
+ */
+int slab_dead_cpu(unsigned int cpu)
+{
+ mutex_lock(&slab_mutex);
+ cpuup_canceled(cpu);
+ mutex_unlock(&slab_mutex);
+ return 0;
+}
+#endif
+
+static int slab_online_cpu(unsigned int cpu)
+{
+ start_cpu_timer(cpu);
+ return 0;
+}
+
+static int slab_offline_cpu(unsigned int cpu)
+{
+ /*
+ * Shutdown cache reaper. Note that the slab_mutex is held so
+ * that if cache_reap() is invoked it cannot do anything
+ * expensive but will only modify reap_work and reschedule the
+ * timer.
+ */
+ cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu));
+ /* Now the cache_reaper is guaranteed to be not running. */
+ per_cpu(slab_reap_work, cpu).work.func = NULL;
+ return 0;
+}
+
+#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
+/*
+ * Drains freelist for a node on each slab cache, used for memory hot-remove.
+ * Returns -EBUSY if all objects cannot be drained so that the node is not
+ * removed.
+ *
+ * Must hold slab_mutex.
+ */
+static int __meminit drain_cache_node_node(int node)
+{
+ struct kmem_cache *cachep;
+ int ret = 0;
+
+ list_for_each_entry(cachep, &slab_caches, list) {
+ struct kmem_cache_node *n;
+
+ n = get_node(cachep, node);
+ if (!n)
+ continue;
+
+ drain_freelist(cachep, n, INT_MAX);
+
+ if (!list_empty(&n->slabs_full) ||
+ !list_empty(&n->slabs_partial)) {
+ ret = -EBUSY;
+ break;
+ }
+ }
+ return ret;
+}
+
+static int __meminit slab_memory_callback(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ struct memory_notify *mnb = arg;
+ int ret = 0;
+ int nid;
+
+ nid = mnb->status_change_nid;
+ if (nid < 0)
+ goto out;
+
+ switch (action) {
+ case MEM_GOING_ONLINE:
+ mutex_lock(&slab_mutex);
+ ret = init_cache_node_node(nid);
+ mutex_unlock(&slab_mutex);
+ break;
+ case MEM_GOING_OFFLINE:
+ mutex_lock(&slab_mutex);
+ ret = drain_cache_node_node(nid);
+ mutex_unlock(&slab_mutex);
+ break;
+ case MEM_ONLINE:
+ case MEM_OFFLINE:
+ case MEM_CANCEL_ONLINE:
+ case MEM_CANCEL_OFFLINE:
+ break;
+ }
+out:
+ return notifier_from_errno(ret);
+}
+#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
+
+/*
+ * swap the static kmem_cache_node with kmalloced memory
+ */
+static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node *list,
+ int nodeid)
+{
+ struct kmem_cache_node *ptr;
+
+ ptr = kmalloc_node(sizeof(struct kmem_cache_node), GFP_NOWAIT, nodeid);
+ BUG_ON(!ptr);
+
+ memcpy(ptr, list, sizeof(struct kmem_cache_node));
+ /*
+ * Do not assume that spinlocks can be initialized via memcpy:
+ */
+ spin_lock_init(&ptr->list_lock);
+
+ MAKE_ALL_LISTS(cachep, ptr, nodeid);
+ cachep->node[nodeid] = ptr;
+}
+
+/*
+ * For setting up all the kmem_cache_node for cache whose buffer_size is same as
+ * size of kmem_cache_node.
+ */
+static void __init set_up_node(struct kmem_cache *cachep, int index)
+{
+ int node;
+
+ for_each_online_node(node) {
+ cachep->node[node] = &init_kmem_cache_node[index + node];
+ cachep->node[node]->next_reap = jiffies +
+ REAPTIMEOUT_NODE +
+ ((unsigned long)cachep) % REAPTIMEOUT_NODE;
+ }
+}
+
+/*
+ * Initialisation. Called after the page allocator have been initialised and
+ * before smp_init().
+ */
+void __init kmem_cache_init(void)
+{
+ int i;
+
+ kmem_cache = &kmem_cache_boot;
+
+ if (!IS_ENABLED(CONFIG_NUMA) || num_possible_nodes() == 1)
+ use_alien_caches = 0;
+
+ for (i = 0; i < NUM_INIT_LISTS; i++)
+ kmem_cache_node_init(&init_kmem_cache_node[i]);
+
+ /*
+ * Fragmentation resistance on low memory - only use bigger
+ * page orders on machines with more than 32MB of memory if
+ * not overridden on the command line.
+ */
+ if (!slab_max_order_set && totalram_pages() > (32 << 20) >> PAGE_SHIFT)
+ slab_max_order = SLAB_MAX_ORDER_HI;
+
+ /* Bootstrap is tricky, because several objects are allocated
+ * from caches that do not exist yet:
+ * 1) initialize the kmem_cache cache: it contains the struct
+ * kmem_cache structures of all caches, except kmem_cache itself:
+ * kmem_cache is statically allocated.
+ * Initially an __init data area is used for the head array and the
+ * kmem_cache_node structures, it's replaced with a kmalloc allocated
+ * array at the end of the bootstrap.
+ * 2) Create the first kmalloc cache.
+ * The struct kmem_cache for the new cache is allocated normally.
+ * An __init data area is used for the head array.
+ * 3) Create the remaining kmalloc caches, with minimally sized
+ * head arrays.
+ * 4) Replace the __init data head arrays for kmem_cache and the first
+ * kmalloc cache with kmalloc allocated arrays.
+ * 5) Replace the __init data for kmem_cache_node for kmem_cache and
+ * the other cache's with kmalloc allocated memory.
+ * 6) Resize the head arrays of the kmalloc caches to their final sizes.
+ */
+
+ /* 1) create the kmem_cache */
+
+ /*
+ * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
+ */
+ create_boot_cache(kmem_cache, "kmem_cache",
+ offsetof(struct kmem_cache, node) +
+ nr_node_ids * sizeof(struct kmem_cache_node *),
+ SLAB_HWCACHE_ALIGN, 0, 0);
+ list_add(&kmem_cache->list, &slab_caches);
+ slab_state = PARTIAL;
+
+ /*
+ * Initialize the caches that provide memory for the kmem_cache_node
+ * structures first. Without this, further allocations will bug.
+ */
+ kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE] = create_kmalloc_cache(
+ kmalloc_info[INDEX_NODE].name[KMALLOC_NORMAL],
+ kmalloc_info[INDEX_NODE].size,
+ ARCH_KMALLOC_FLAGS, 0,
+ kmalloc_info[INDEX_NODE].size);
+ slab_state = PARTIAL_NODE;
+ setup_kmalloc_cache_index_table();
+
+ slab_early_init = 0;
+
+ /* 5) Replace the bootstrap kmem_cache_node */
+ {
+ int nid;
+
+ for_each_online_node(nid) {
+ init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid);
+
+ init_list(kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE],
+ &init_kmem_cache_node[SIZE_NODE + nid], nid);
+ }
+ }
+
+ create_kmalloc_caches(ARCH_KMALLOC_FLAGS);
+}
+
+void __init kmem_cache_init_late(void)
+{
+ struct kmem_cache *cachep;
+
+ /* 6) resize the head arrays to their final sizes */
+ mutex_lock(&slab_mutex);
+ list_for_each_entry(cachep, &slab_caches, list)
+ if (enable_cpucache(cachep, GFP_NOWAIT))
+ BUG();
+ mutex_unlock(&slab_mutex);
+
+ /* Done! */
+ slab_state = FULL;
+
+#ifdef CONFIG_NUMA
+ /*
+ * Register a memory hotplug callback that initializes and frees
+ * node.
+ */
+ hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
+#endif
+
+ /*
+ * The reap timers are started later, with a module init call: That part
+ * of the kernel is not yet operational.
+ */
+}
+
+static int __init cpucache_init(void)
+{
+ int ret;
+
+ /*
+ * Register the timers that return unneeded pages to the page allocator
+ */
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "SLAB online",
+ slab_online_cpu, slab_offline_cpu);
+ WARN_ON(ret < 0);
+
+ return 0;
+}
+__initcall(cpucache_init);
+
+static noinline void
+slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
+{
+#if DEBUG
+ struct kmem_cache_node *n;
+ unsigned long flags;
+ int node;
+ static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+ if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs))
+ return;
+
+ pr_warn("SLAB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n",
+ nodeid, gfpflags, &gfpflags);
+ pr_warn(" cache: %s, object size: %d, order: %d\n",
+ cachep->name, cachep->size, cachep->gfporder);
+
+ for_each_kmem_cache_node(cachep, node, n) {
+ unsigned long total_slabs, free_slabs, free_objs;
+
+ spin_lock_irqsave(&n->list_lock, flags);
+ total_slabs = n->total_slabs;
+ free_slabs = n->free_slabs;
+ free_objs = n->free_objects;
+ spin_unlock_irqrestore(&n->list_lock, flags);
+
+ pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld\n",
+ node, total_slabs - free_slabs, total_slabs,
+ (total_slabs * cachep->num) - free_objs,
+ total_slabs * cachep->num);
+ }
+#endif
+}
+
+/*
+ * Interface to system's page allocator. No need to hold the
+ * kmem_cache_node ->list_lock.
+ *
+ * If we requested dmaable memory, we will get it. Even if we
+ * did not request dmaable memory, we might get it, but that
+ * would be relatively rare and ignorable.
+ */
+static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
+ int nodeid)
+{
+ struct page *page;
+
+ flags |= cachep->allocflags;
+
+ page = __alloc_pages_node(nodeid, flags, cachep->gfporder);
+ if (!page) {
+ slab_out_of_memory(cachep, flags, nodeid);
+ return NULL;
+ }
+
+ account_slab_page(page, cachep->gfporder, cachep);
+ __SetPageSlab(page);
+ /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
+ if (sk_memalloc_socks() && page_is_pfmemalloc(page))
+ SetPageSlabPfmemalloc(page);
+
+ return page;
+}
+
+/*
+ * Interface to system's page release.
+ */
+static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
+{
+ int order = cachep->gfporder;
+
+ BUG_ON(!PageSlab(page));
+ __ClearPageSlabPfmemalloc(page);
+ __ClearPageSlab(page);
+ page_mapcount_reset(page);
+ page->mapping = NULL;
+
+ if (current->reclaim_state)
+ current->reclaim_state->reclaimed_slab += 1 << order;
+ unaccount_slab_page(page, order, cachep);
+ __free_pages(page, order);
+}
+
+static void kmem_rcu_free(struct rcu_head *head)
+{
+ struct kmem_cache *cachep;
+ struct page *page;
+
+ page = container_of(head, struct page, rcu_head);
+ cachep = page->slab_cache;
+
+ kmem_freepages(cachep, page);
+}
+
+#if DEBUG
+static bool is_debug_pagealloc_cache(struct kmem_cache *cachep)
+{
+ if (debug_pagealloc_enabled_static() && OFF_SLAB(cachep) &&
+ (cachep->size % PAGE_SIZE) == 0)
+ return true;
+
+ return false;
+}
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+static void slab_kernel_map(struct kmem_cache *cachep, void *objp, int map)
+{
+ if (!is_debug_pagealloc_cache(cachep))
+ return;
+
+ kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map);
+}
+
+#else
+static inline void slab_kernel_map(struct kmem_cache *cachep, void *objp,
+ int map) {}
+
+#endif
+
+static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
+{
+ int size = cachep->object_size;
+ addr = &((char *)addr)[obj_offset(cachep)];
+
+ memset(addr, val, size);
+ *(unsigned char *)(addr + size - 1) = POISON_END;
+}
+
+static void dump_line(char *data, int offset, int limit)
+{
+ int i;
+ unsigned char error = 0;
+ int bad_count = 0;
+
+ pr_err("%03x: ", offset);
+ for (i = 0; i < limit; i++) {
+ if (data[offset + i] != POISON_FREE) {
+ error = data[offset + i];
+ bad_count++;
+ }
+ }
+ print_hex_dump(KERN_CONT, "", 0, 16, 1,
+ &data[offset], limit, 1);
+
+ if (bad_count == 1) {
+ error ^= POISON_FREE;
+ if (!(error & (error - 1))) {
+ pr_err("Single bit error detected. Probably bad RAM.\n");
+#ifdef CONFIG_X86
+ pr_err("Run memtest86+ or a similar memory test tool.\n");
+#else
+ pr_err("Run a memory test tool.\n");
+#endif
+ }
+ }
+}
+#endif
+
+#if DEBUG
+
+static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
+{
+ int i, size;
+ char *realobj;
+
+ if (cachep->flags & SLAB_RED_ZONE) {
+ pr_err("Redzone: 0x%llx/0x%llx\n",
+ *dbg_redzone1(cachep, objp),
+ *dbg_redzone2(cachep, objp));
+ }
+
+ if (cachep->flags & SLAB_STORE_USER)
+ pr_err("Last user: (%pSR)\n", *dbg_userword(cachep, objp));
+ realobj = (char *)objp + obj_offset(cachep);
+ size = cachep->object_size;
+ for (i = 0; i < size && lines; i += 16, lines--) {
+ int limit;
+ limit = 16;
+ if (i + limit > size)
+ limit = size - i;
+ dump_line(realobj, i, limit);
+ }
+}
+
+static void check_poison_obj(struct kmem_cache *cachep, void *objp)
+{
+ char *realobj;
+ int size, i;
+ int lines = 0;
+
+ if (is_debug_pagealloc_cache(cachep))
+ return;
+
+ realobj = (char *)objp + obj_offset(cachep);
+ size = cachep->object_size;
+
+ for (i = 0; i < size; i++) {
+ char exp = POISON_FREE;
+ if (i == size - 1)
+ exp = POISON_END;
+ if (realobj[i] != exp) {
+ int limit;
+ /* Mismatch ! */
+ /* Print header */
+ if (lines == 0) {
+ pr_err("Slab corruption (%s): %s start=%px, len=%d\n",
+ print_tainted(), cachep->name,
+ realobj, size);
+ print_objinfo(cachep, objp, 0);
+ }
+ /* Hexdump the affected line */
+ i = (i / 16) * 16;
+ limit = 16;
+ if (i + limit > size)
+ limit = size - i;
+ dump_line(realobj, i, limit);
+ i += 16;
+ lines++;
+ /* Limit to 5 lines */
+ if (lines > 5)
+ break;
+ }
+ }
+ if (lines != 0) {
+ /* Print some data about the neighboring objects, if they
+ * exist:
+ */
+ struct page *page = virt_to_head_page(objp);
+ unsigned int objnr;
+
+ objnr = obj_to_index(cachep, page, objp);
+ if (objnr) {
+ objp = index_to_obj(cachep, page, objnr - 1);
+ realobj = (char *)objp + obj_offset(cachep);
+ pr_err("Prev obj: start=%px, len=%d\n", realobj, size);
+ print_objinfo(cachep, objp, 2);
+ }
+ if (objnr + 1 < cachep->num) {
+ objp = index_to_obj(cachep, page, objnr + 1);
+ realobj = (char *)objp + obj_offset(cachep);
+ pr_err("Next obj: start=%px, len=%d\n", realobj, size);
+ print_objinfo(cachep, objp, 2);
+ }
+ }
+}
+#endif
+
+#if DEBUG
+static void slab_destroy_debugcheck(struct kmem_cache *cachep,
+ struct page *page)
+{
+ int i;
+
+ if (OBJFREELIST_SLAB(cachep) && cachep->flags & SLAB_POISON) {
+ poison_obj(cachep, page->freelist - obj_offset(cachep),
+ POISON_FREE);
+ }
+
+ for (i = 0; i < cachep->num; i++) {
+ void *objp = index_to_obj(cachep, page, i);
+
+ if (cachep->flags & SLAB_POISON) {
+ check_poison_obj(cachep, objp);
+ slab_kernel_map(cachep, objp, 1);
+ }
+ if (cachep->flags & SLAB_RED_ZONE) {
+ if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
+ slab_error(cachep, "start of a freed object was overwritten");
+ if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
+ slab_error(cachep, "end of a freed object was overwritten");
+ }
+ }
+}
+#else
+static void slab_destroy_debugcheck(struct kmem_cache *cachep,
+ struct page *page)
+{
+}
+#endif
+
+/**
+ * slab_destroy - destroy and release all objects in a slab
+ * @cachep: cache pointer being destroyed
+ * @page: page pointer being destroyed
+ *
+ * Destroy all the objs in a slab page, and release the mem back to the system.
+ * Before calling the slab page must have been unlinked from the cache. The
+ * kmem_cache_node ->list_lock is not held/needed.
+ */
+static void slab_destroy(struct kmem_cache *cachep, struct page *page)
+{
+ void *freelist;
+
+ freelist = page->freelist;
+ slab_destroy_debugcheck(cachep, page);
+ if (unlikely(cachep->flags & SLAB_TYPESAFE_BY_RCU))
+ call_rcu(&page->rcu_head, kmem_rcu_free);
+ else
+ kmem_freepages(cachep, page);
+
+ /*
+ * From now on, we don't use freelist
+ * although actual page can be freed in rcu context
+ */
+ if (OFF_SLAB(cachep))
+ kmem_cache_free(cachep->freelist_cache, freelist);
+}
+
+/*
+ * Update the size of the caches before calling slabs_destroy as it may
+ * recursively call kfree.
+ */
+static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
+{
+ struct page *page, *n;
+
+ list_for_each_entry_safe(page, n, list, slab_list) {
+ list_del(&page->slab_list);
+ slab_destroy(cachep, page);
+ }
+}
+
+/**
+ * calculate_slab_order - calculate size (page order) of slabs
+ * @cachep: pointer to the cache that is being created
+ * @size: size of objects to be created in this cache.
+ * @flags: slab allocation flags
+ *
+ * Also calculates the number of objects per slab.
+ *
+ * This could be made much more intelligent. For now, try to avoid using
+ * high order pages for slabs. When the gfp() functions are more friendly
+ * towards high-order requests, this should be changed.
+ *
+ * Return: number of left-over bytes in a slab
+ */
+static size_t calculate_slab_order(struct kmem_cache *cachep,
+ size_t size, slab_flags_t flags)
+{
+ size_t left_over = 0;
+ int gfporder;
+
+ for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
+ unsigned int num;
+ size_t remainder;
+
+ num = cache_estimate(gfporder, size, flags, &remainder);
+ if (!num)
+ continue;
+
+ /* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */
+ if (num > SLAB_OBJ_MAX_NUM)
+ break;
+
+ if (flags & CFLGS_OFF_SLAB) {
+ struct kmem_cache *freelist_cache;
+ size_t freelist_size;
+
+ freelist_size = num * sizeof(freelist_idx_t);
+ freelist_cache = kmalloc_slab(freelist_size, 0u);
+ if (!freelist_cache)
+ continue;
+
+ /*
+ * Needed to avoid possible looping condition
+ * in cache_grow_begin()
+ */
+ if (OFF_SLAB(freelist_cache))
+ continue;
+
+ /* check if off slab has enough benefit */
+ if (freelist_cache->size > cachep->size / 2)
+ continue;
+ }
+
+ /* Found something acceptable - save it away */
+ cachep->num = num;
+ cachep->gfporder = gfporder;
+ left_over = remainder;
+
+ /*
+ * A VFS-reclaimable slab tends to have most allocations
+ * as GFP_NOFS and we really don't want to have to be allocating
+ * higher-order pages when we are unable to shrink dcache.
+ */
+ if (flags & SLAB_RECLAIM_ACCOUNT)
+ break;
+
+ /*
+ * Large number of objects is good, but very large slabs are
+ * currently bad for the gfp()s.
+ */
+ if (gfporder >= slab_max_order)
+ break;
+
+ /*
+ * Acceptable internal fragmentation?
+ */
+ if (left_over * 8 <= (PAGE_SIZE << gfporder))
+ break;
+ }
+ return left_over;
+}
+
+static struct array_cache __percpu *alloc_kmem_cache_cpus(
+ struct kmem_cache *cachep, int entries, int batchcount)
+{
+ int cpu;
+ size_t size;
+ struct array_cache __percpu *cpu_cache;
+
+ size = sizeof(void *) * entries + sizeof(struct array_cache);
+ cpu_cache = __alloc_percpu(size, sizeof(void *));
+
+ if (!cpu_cache)
+ return NULL;
+
+ for_each_possible_cpu(cpu) {
+ init_arraycache(per_cpu_ptr(cpu_cache, cpu),
+ entries, batchcount);
+ }
+
+ return cpu_cache;
+}
+
+static int __ref setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
+{
+ if (slab_state >= FULL)
+ return enable_cpucache(cachep, gfp);
+
+ cachep->cpu_cache = alloc_kmem_cache_cpus(cachep, 1, 1);
+ if (!cachep->cpu_cache)
+ return 1;
+
+ if (slab_state == DOWN) {
+ /* Creation of first cache (kmem_cache). */
+ set_up_node(kmem_cache, CACHE_CACHE);
+ } else if (slab_state == PARTIAL) {
+ /* For kmem_cache_node */
+ set_up_node(cachep, SIZE_NODE);
+ } else {
+ int node;
+
+ for_each_online_node(node) {
+ cachep->node[node] = kmalloc_node(
+ sizeof(struct kmem_cache_node), gfp, node);
+ BUG_ON(!cachep->node[node]);
+ kmem_cache_node_init(cachep->node[node]);
+ }
+ }
+
+ cachep->node[numa_mem_id()]->next_reap =
+ jiffies + REAPTIMEOUT_NODE +
+ ((unsigned long)cachep) % REAPTIMEOUT_NODE;
+
+ cpu_cache_get(cachep)->avail = 0;
+ cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
+ cpu_cache_get(cachep)->batchcount = 1;
+ cpu_cache_get(cachep)->touched = 0;
+ cachep->batchcount = 1;
+ cachep->limit = BOOT_CPUCACHE_ENTRIES;
+ return 0;
+}
+
+slab_flags_t kmem_cache_flags(unsigned int object_size,
+ slab_flags_t flags, const char *name)
+{
+ return flags;
+}
+
+struct kmem_cache *
+__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
+ slab_flags_t flags, void (*ctor)(void *))
+{
+ struct kmem_cache *cachep;
+
+ cachep = find_mergeable(size, align, flags, name, ctor);
+ if (cachep) {
+ cachep->refcount++;
+
+ /*
+ * Adjust the object sizes so that we clear
+ * the complete object on kzalloc.
+ */
+ cachep->object_size = max_t(int, cachep->object_size, size);
+ }
+ return cachep;
+}
+
+static bool set_objfreelist_slab_cache(struct kmem_cache *cachep,
+ size_t size, slab_flags_t flags)
+{
+ size_t left;
+
+ cachep->num = 0;
+
+ /*
+ * If slab auto-initialization on free is enabled, store the freelist
+ * off-slab, so that its contents don't end up in one of the allocated
+ * objects.
+ */
+ if (unlikely(slab_want_init_on_free(cachep)))
+ return false;
+
+ if (cachep->ctor || flags & SLAB_TYPESAFE_BY_RCU)
+ return false;
+
+ left = calculate_slab_order(cachep, size,
+ flags | CFLGS_OBJFREELIST_SLAB);
+ if (!cachep->num)
+ return false;
+
+ if (cachep->num * sizeof(freelist_idx_t) > cachep->object_size)
+ return false;
+
+ cachep->colour = left / cachep->colour_off;
+
+ return true;
+}
+
+static bool set_off_slab_cache(struct kmem_cache *cachep,
+ size_t size, slab_flags_t flags)
+{
+ size_t left;
+
+ cachep->num = 0;
+
+ /*
+ * Always use on-slab management when SLAB_NOLEAKTRACE
+ * to avoid recursive calls into kmemleak.
+ */
+ if (flags & SLAB_NOLEAKTRACE)
+ return false;
+
+ /*
+ * Size is large, assume best to place the slab management obj
+ * off-slab (should allow better packing of objs).
+ */
+ left = calculate_slab_order(cachep, size, flags | CFLGS_OFF_SLAB);
+ if (!cachep->num)
+ return false;
+
+ /*
+ * If the slab has been placed off-slab, and we have enough space then
+ * move it on-slab. This is at the expense of any extra colouring.
+ */
+ if (left >= cachep->num * sizeof(freelist_idx_t))
+ return false;
+
+ cachep->colour = left / cachep->colour_off;
+
+ return true;
+}
+
+static bool set_on_slab_cache(struct kmem_cache *cachep,
+ size_t size, slab_flags_t flags)
+{
+ size_t left;
+
+ cachep->num = 0;
+
+ left = calculate_slab_order(cachep, size, flags);
+ if (!cachep->num)
+ return false;
+
+ cachep->colour = left / cachep->colour_off;
+
+ return true;
+}
+
+/**
+ * __kmem_cache_create - Create a cache.
+ * @cachep: cache management descriptor
+ * @flags: SLAB flags
+ *
+ * Returns a ptr to the cache on success, NULL on failure.
+ * Cannot be called within a int, but can be interrupted.
+ * The @ctor is run when new pages are allocated by the cache.
+ *
+ * The flags are
+ *
+ * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
+ * to catch references to uninitialised memory.
+ *
+ * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
+ * for buffer overruns.
+ *
+ * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
+ * cacheline. This can be beneficial if you're counting cycles as closely
+ * as davem.
+ *
+ * Return: a pointer to the created cache or %NULL in case of error
+ */
+int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags)
+{
+ size_t ralign = BYTES_PER_WORD;
+ gfp_t gfp;
+ int err;
+ unsigned int size = cachep->size;
+
+#if DEBUG
+#if FORCED_DEBUG
+ /*
+ * Enable redzoning and last user accounting, except for caches with
+ * large objects, if the increased size would increase the object size
+ * above the next power of two: caches with object sizes just above a
+ * power of two have a significant amount of internal fragmentation.
+ */
+ if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
+ 2 * sizeof(unsigned long long)))
+ flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
+ if (!(flags & SLAB_TYPESAFE_BY_RCU))
+ flags |= SLAB_POISON;
+#endif
+#endif
+
+ /*
+ * Check that size is in terms of words. This is needed to avoid
+ * unaligned accesses for some archs when redzoning is used, and makes
+ * sure any on-slab bufctl's are also correctly aligned.
+ */
+ size = ALIGN(size, BYTES_PER_WORD);
+
+ if (flags & SLAB_RED_ZONE) {
+ ralign = REDZONE_ALIGN;
+ /* If redzoning, ensure that the second redzone is suitably
+ * aligned, by adjusting the object size accordingly. */
+ size = ALIGN(size, REDZONE_ALIGN);
+ }
+
+ /* 3) caller mandated alignment */
+ if (ralign < cachep->align) {
+ ralign = cachep->align;
+ }
+ /* disable debug if necessary */
+ if (ralign > __alignof__(unsigned long long))
+ flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
+ /*
+ * 4) Store it.
+ */
+ cachep->align = ralign;
+ cachep->colour_off = cache_line_size();
+ /* Offset must be a multiple of the alignment. */
+ if (cachep->colour_off < cachep->align)
+ cachep->colour_off = cachep->align;
+
+ if (slab_is_available())
+ gfp = GFP_KERNEL;
+ else
+ gfp = GFP_NOWAIT;
+
+#if DEBUG
+
+ /*
+ * Both debugging options require word-alignment which is calculated
+ * into align above.
+ */
+ if (flags & SLAB_RED_ZONE) {
+ /* add space for red zone words */
+ cachep->obj_offset += sizeof(unsigned long long);
+ size += 2 * sizeof(unsigned long long);
+ }
+ if (flags & SLAB_STORE_USER) {
+ /* user store requires one word storage behind the end of
+ * the real object. But if the second red zone needs to be
+ * aligned to 64 bits, we must allow that much space.
+ */
+ if (flags & SLAB_RED_ZONE)
+ size += REDZONE_ALIGN;
+ else
+ size += BYTES_PER_WORD;
+ }
+#endif
+
+ kasan_cache_create(cachep, &size, &flags);
+
+ size = ALIGN(size, cachep->align);
+ /*
+ * We should restrict the number of objects in a slab to implement
+ * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition.
+ */
+ if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE)
+ size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align);
+
+#if DEBUG
+ /*
+ * To activate debug pagealloc, off-slab management is necessary
+ * requirement. In early phase of initialization, small sized slab
+ * doesn't get initialized so it would not be possible. So, we need
+ * to check size >= 256. It guarantees that all necessary small
+ * sized slab is initialized in current slab initialization sequence.
+ */
+ if (debug_pagealloc_enabled_static() && (flags & SLAB_POISON) &&
+ size >= 256 && cachep->object_size > cache_line_size()) {
+ if (size < PAGE_SIZE || size % PAGE_SIZE == 0) {
+ size_t tmp_size = ALIGN(size, PAGE_SIZE);
+
+ if (set_off_slab_cache(cachep, tmp_size, flags)) {
+ flags |= CFLGS_OFF_SLAB;
+ cachep->obj_offset += tmp_size - size;
+ size = tmp_size;
+ goto done;
+ }
+ }
+ }
+#endif
+
+ if (set_objfreelist_slab_cache(cachep, size, flags)) {
+ flags |= CFLGS_OBJFREELIST_SLAB;
+ goto done;
+ }
+
+ if (set_off_slab_cache(cachep, size, flags)) {
+ flags |= CFLGS_OFF_SLAB;
+ goto done;
+ }
+
+ if (set_on_slab_cache(cachep, size, flags))
+ goto done;
+
+ return -E2BIG;
+
+done:
+ cachep->freelist_size = cachep->num * sizeof(freelist_idx_t);
+ cachep->flags = flags;
+ cachep->allocflags = __GFP_COMP;
+ if (flags & SLAB_CACHE_DMA)
+ cachep->allocflags |= GFP_DMA;
+ if (flags & SLAB_CACHE_DMA32)
+ cachep->allocflags |= GFP_DMA32;
+ if (flags & SLAB_RECLAIM_ACCOUNT)
+ cachep->allocflags |= __GFP_RECLAIMABLE;
+ cachep->size = size;
+ cachep->reciprocal_buffer_size = reciprocal_value(size);
+
+#if DEBUG
+ /*
+ * If we're going to use the generic kernel_map_pages()
+ * poisoning, then it's going to smash the contents of
+ * the redzone and userword anyhow, so switch them off.
+ */
+ if (IS_ENABLED(CONFIG_PAGE_POISONING) &&
+ (cachep->flags & SLAB_POISON) &&
+ is_debug_pagealloc_cache(cachep))
+ cachep->flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
+#endif
+
+ if (OFF_SLAB(cachep)) {
+ cachep->freelist_cache =
+ kmalloc_slab(cachep->freelist_size, 0u);
+ }
+
+ err = setup_cpu_cache(cachep, gfp);
+ if (err) {
+ __kmem_cache_release(cachep);
+ return err;
+ }
+
+ return 0;
+}
+
+#if DEBUG
+static void check_irq_off(void)
+{
+ BUG_ON(!irqs_disabled());
+}
+
+static void check_irq_on(void)
+{
+ BUG_ON(irqs_disabled());
+}
+
+static void check_mutex_acquired(void)
+{
+ BUG_ON(!mutex_is_locked(&slab_mutex));
+}
+
+static void check_spinlock_acquired(struct kmem_cache *cachep)
+{
+#ifdef CONFIG_SMP
+ check_irq_off();
+ assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock);
+#endif
+}
+
+static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
+{
+#ifdef CONFIG_SMP
+ check_irq_off();
+ assert_spin_locked(&get_node(cachep, node)->list_lock);
+#endif
+}
+
+#else
+#define check_irq_off() do { } while(0)
+#define check_irq_on() do { } while(0)
+#define check_mutex_acquired() do { } while(0)
+#define check_spinlock_acquired(x) do { } while(0)
+#define check_spinlock_acquired_node(x, y) do { } while(0)
+#endif
+
+static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac,
+ int node, bool free_all, struct list_head *list)
+{
+ int tofree;
+
+ if (!ac || !ac->avail)
+ return;
+
+ tofree = free_all ? ac->avail : (ac->limit + 4) / 5;
+ if (tofree > ac->avail)
+ tofree = (ac->avail + 1) / 2;
+
+ free_block(cachep, ac->entry, tofree, node, list);
+ ac->avail -= tofree;
+ memmove(ac->entry, &(ac->entry[tofree]), sizeof(void *) * ac->avail);
+}
+
+static void do_drain(void *arg)
+{
+ struct kmem_cache *cachep = arg;
+ struct array_cache *ac;
+ int node = numa_mem_id();
+ struct kmem_cache_node *n;
+ LIST_HEAD(list);
+
+ check_irq_off();
+ ac = cpu_cache_get(cachep);
+ n = get_node(cachep, node);
+ spin_lock(&n->list_lock);
+ free_block(cachep, ac->entry, ac->avail, node, &list);
+ spin_unlock(&n->list_lock);
+ ac->avail = 0;
+ slabs_destroy(cachep, &list);
+}
+
+static void drain_cpu_caches(struct kmem_cache *cachep)
+{
+ struct kmem_cache_node *n;
+ int node;
+ LIST_HEAD(list);
+
+ on_each_cpu(do_drain, cachep, 1);
+ check_irq_on();
+ for_each_kmem_cache_node(cachep, node, n)
+ if (n->alien)
+ drain_alien_cache(cachep, n->alien);
+
+ for_each_kmem_cache_node(cachep, node, n) {
+ spin_lock_irq(&n->list_lock);
+ drain_array_locked(cachep, n->shared, node, true, &list);
+ spin_unlock_irq(&n->list_lock);
+
+ slabs_destroy(cachep, &list);
+ }
+}
+
+/*
+ * Remove slabs from the list of free slabs.
+ * Specify the number of slabs to drain in tofree.
+ *
+ * Returns the actual number of slabs released.
+ */
+static int drain_freelist(struct kmem_cache *cache,
+ struct kmem_cache_node *n, int tofree)
+{
+ struct list_head *p;
+ int nr_freed;
+ struct page *page;
+
+ nr_freed = 0;
+ while (nr_freed < tofree && !list_empty(&n->slabs_free)) {
+
+ spin_lock_irq(&n->list_lock);
+ p = n->slabs_free.prev;
+ if (p == &n->slabs_free) {
+ spin_unlock_irq(&n->list_lock);
+ goto out;
+ }
+
+ page = list_entry(p, struct page, slab_list);
+ list_del(&page->slab_list);
+ n->free_slabs--;
+ n->total_slabs--;
+ /*
+ * Safe to drop the lock. The slab is no longer linked
+ * to the cache.
+ */
+ n->free_objects -= cache->num;
+ spin_unlock_irq(&n->list_lock);
+ slab_destroy(cache, page);
+ nr_freed++;
+ }
+out:
+ return nr_freed;
+}
+
+bool __kmem_cache_empty(struct kmem_cache *s)
+{
+ int node;
+ struct kmem_cache_node *n;
+
+ for_each_kmem_cache_node(s, node, n)
+ if (!list_empty(&n->slabs_full) ||
+ !list_empty(&n->slabs_partial))
+ return false;
+ return true;
+}
+
+int __kmem_cache_shrink(struct kmem_cache *cachep)
+{
+ int ret = 0;
+ int node;
+ struct kmem_cache_node *n;
+
+ drain_cpu_caches(cachep);
+
+ check_irq_on();
+ for_each_kmem_cache_node(cachep, node, n) {
+ drain_freelist(cachep, n, INT_MAX);
+
+ ret += !list_empty(&n->slabs_full) ||
+ !list_empty(&n->slabs_partial);
+ }
+ return (ret ? 1 : 0);
+}
+
+int __kmem_cache_shutdown(struct kmem_cache *cachep)
+{
+ return __kmem_cache_shrink(cachep);
+}
+
+void __kmem_cache_release(struct kmem_cache *cachep)
+{
+ int i;
+ struct kmem_cache_node *n;
+
+ cache_random_seq_destroy(cachep);
+
+ free_percpu(cachep->cpu_cache);
+
+ /* NUMA: free the node structures */
+ for_each_kmem_cache_node(cachep, i, n) {
+ kfree(n->shared);
+ free_alien_cache(n->alien);
+ kfree(n);
+ cachep->node[i] = NULL;
+ }
+}
+
+/*
+ * Get the memory for a slab management obj.
+ *
+ * For a slab cache when the slab descriptor is off-slab, the
+ * slab descriptor can't come from the same cache which is being created,
+ * Because if it is the case, that means we defer the creation of
+ * the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point.
+ * And we eventually call down to __kmem_cache_create(), which
+ * in turn looks up in the kmalloc_{dma,}_caches for the disired-size one.
+ * This is a "chicken-and-egg" problem.
+ *
+ * So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches,
+ * which are all initialized during kmem_cache_init().
+ */
+static void *alloc_slabmgmt(struct kmem_cache *cachep,
+ struct page *page, int colour_off,
+ gfp_t local_flags, int nodeid)
+{
+ void *freelist;
+ void *addr = page_address(page);
+
+ page->s_mem = addr + colour_off;
+ page->active = 0;
+
+ if (OBJFREELIST_SLAB(cachep))
+ freelist = NULL;
+ else if (OFF_SLAB(cachep)) {
+ /* Slab management obj is off-slab. */
+ freelist = kmem_cache_alloc_node(cachep->freelist_cache,
+ local_flags, nodeid);
+ } else {
+ /* We will use last bytes at the slab for freelist */
+ freelist = addr + (PAGE_SIZE << cachep->gfporder) -
+ cachep->freelist_size;
+ }
+
+ return freelist;
+}
+
+static inline freelist_idx_t get_free_obj(struct page *page, unsigned int idx)
+{
+ return ((freelist_idx_t *)page->freelist)[idx];
+}
+
+static inline void set_free_obj(struct page *page,
+ unsigned int idx, freelist_idx_t val)
+{
+ ((freelist_idx_t *)(page->freelist))[idx] = val;
+}
+
+static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page)
+{
+#if DEBUG
+ int i;
+
+ for (i = 0; i < cachep->num; i++) {
+ void *objp = index_to_obj(cachep, page, i);
+
+ if (cachep->flags & SLAB_STORE_USER)
+ *dbg_userword(cachep, objp) = NULL;
+
+ if (cachep->flags & SLAB_RED_ZONE) {
+ *dbg_redzone1(cachep, objp) = RED_INACTIVE;
+ *dbg_redzone2(cachep, objp) = RED_INACTIVE;
+ }
+ /*
+ * Constructors are not allowed to allocate memory from the same
+ * cache which they are a constructor for. Otherwise, deadlock.
+ * They must also be threaded.
+ */
+ if (cachep->ctor && !(cachep->flags & SLAB_POISON)) {
+ kasan_unpoison_object_data(cachep,
+ objp + obj_offset(cachep));
+ cachep->ctor(objp + obj_offset(cachep));
+ kasan_poison_object_data(
+ cachep, objp + obj_offset(cachep));
+ }
+
+ if (cachep->flags & SLAB_RED_ZONE) {
+ if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
+ slab_error(cachep, "constructor overwrote the end of an object");
+ if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
+ slab_error(cachep, "constructor overwrote the start of an object");
+ }
+ /* need to poison the objs? */
+ if (cachep->flags & SLAB_POISON) {
+ poison_obj(cachep, objp, POISON_FREE);
+ slab_kernel_map(cachep, objp, 0);
+ }
+ }
+#endif
+}
+
+#ifdef CONFIG_SLAB_FREELIST_RANDOM
+/* Hold information during a freelist initialization */
+union freelist_init_state {
+ struct {
+ unsigned int pos;
+ unsigned int *list;
+ unsigned int count;
+ };
+ struct rnd_state rnd_state;
+};
+
+/*
+ * Initialize the state based on the randomization methode available.
+ * return true if the pre-computed list is available, false otherwize.
+ */
+static bool freelist_state_initialize(union freelist_init_state *state,
+ struct kmem_cache *cachep,
+ unsigned int count)
+{
+ bool ret;
+ unsigned int rand;
+
+ /* Use best entropy available to define a random shift */
+ rand = get_random_int();
+
+ /* Use a random state if the pre-computed list is not available */
+ if (!cachep->random_seq) {
+ prandom_seed_state(&state->rnd_state, rand);
+ ret = false;
+ } else {
+ state->list = cachep->random_seq;
+ state->count = count;
+ state->pos = rand % count;
+ ret = true;
+ }
+ return ret;
+}
+
+/* Get the next entry on the list and randomize it using a random shift */
+static freelist_idx_t next_random_slot(union freelist_init_state *state)
+{
+ if (state->pos >= state->count)
+ state->pos = 0;
+ return state->list[state->pos++];
+}
+
+/* Swap two freelist entries */
+static void swap_free_obj(struct page *page, unsigned int a, unsigned int b)
+{
+ swap(((freelist_idx_t *)page->freelist)[a],
+ ((freelist_idx_t *)page->freelist)[b]);
+}
+
+/*
+ * Shuffle the freelist initialization state based on pre-computed lists.
+ * return true if the list was successfully shuffled, false otherwise.
+ */
+static bool shuffle_freelist(struct kmem_cache *cachep, struct page *page)
+{
+ unsigned int objfreelist = 0, i, rand, count = cachep->num;
+ union freelist_init_state state;
+ bool precomputed;
+
+ if (count < 2)
+ return false;
+
+ precomputed = freelist_state_initialize(&state, cachep, count);
+
+ /* Take a random entry as the objfreelist */
+ if (OBJFREELIST_SLAB(cachep)) {
+ if (!precomputed)
+ objfreelist = count - 1;
+ else
+ objfreelist = next_random_slot(&state);
+ page->freelist = index_to_obj(cachep, page, objfreelist) +
+ obj_offset(cachep);
+ count--;
+ }
+
+ /*
+ * On early boot, generate the list dynamically.
+ * Later use a pre-computed list for speed.
+ */
+ if (!precomputed) {
+ for (i = 0; i < count; i++)
+ set_free_obj(page, i, i);
+
+ /* Fisher-Yates shuffle */
+ for (i = count - 1; i > 0; i--) {
+ rand = prandom_u32_state(&state.rnd_state);
+ rand %= (i + 1);
+ swap_free_obj(page, i, rand);
+ }
+ } else {
+ for (i = 0; i < count; i++)
+ set_free_obj(page, i, next_random_slot(&state));
+ }
+
+ if (OBJFREELIST_SLAB(cachep))
+ set_free_obj(page, cachep->num - 1, objfreelist);
+
+ return true;
+}
+#else
+static inline bool shuffle_freelist(struct kmem_cache *cachep,
+ struct page *page)
+{
+ return false;
+}
+#endif /* CONFIG_SLAB_FREELIST_RANDOM */
+
+static void cache_init_objs(struct kmem_cache *cachep,
+ struct page *page)
+{
+ int i;
+ void *objp;
+ bool shuffled;
+
+ cache_init_objs_debug(cachep, page);
+
+ /* Try to randomize the freelist if enabled */
+ shuffled = shuffle_freelist(cachep, page);
+
+ if (!shuffled && OBJFREELIST_SLAB(cachep)) {
+ page->freelist = index_to_obj(cachep, page, cachep->num - 1) +
+ obj_offset(cachep);
+ }
+
+ for (i = 0; i < cachep->num; i++) {
+ objp = index_to_obj(cachep, page, i);
+ objp = kasan_init_slab_obj(cachep, objp);
+
+ /* constructor could break poison info */
+ if (DEBUG == 0 && cachep->ctor) {
+ kasan_unpoison_object_data(cachep, objp);
+ cachep->ctor(objp);
+ kasan_poison_object_data(cachep, objp);
+ }
+
+ if (!shuffled)
+ set_free_obj(page, i, i);
+ }
+}
+
+static void *slab_get_obj(struct kmem_cache *cachep, struct page *page)
+{
+ void *objp;
+
+ objp = index_to_obj(cachep, page, get_free_obj(page, page->active));
+ page->active++;
+
+ return objp;
+}
+
+static void slab_put_obj(struct kmem_cache *cachep,
+ struct page *page, void *objp)
+{
+ unsigned int objnr = obj_to_index(cachep, page, objp);
+#if DEBUG
+ unsigned int i;
+
+ /* Verify double free bug */
+ for (i = page->active; i < cachep->num; i++) {
+ if (get_free_obj(page, i) == objnr) {
+ pr_err("slab: double free detected in cache '%s', objp %px\n",
+ cachep->name, objp);
+ BUG();
+ }
+ }
+#endif
+ page->active--;
+ if (!page->freelist)
+ page->freelist = objp + obj_offset(cachep);
+
+ set_free_obj(page, page->active, objnr);
+}
+
+/*
+ * Map pages beginning at addr to the given cache and slab. This is required
+ * for the slab allocator to be able to lookup the cache and slab of a
+ * virtual address for kfree, ksize, and slab debugging.
+ */
+static void slab_map_pages(struct kmem_cache *cache, struct page *page,
+ void *freelist)
+{
+ page->slab_cache = cache;
+ page->freelist = freelist;
+}
+
+/*
+ * Grow (by 1) the number of slabs within a cache. This is called by
+ * kmem_cache_alloc() when there are no active objs left in a cache.
+ */
+static struct page *cache_grow_begin(struct kmem_cache *cachep,
+ gfp_t flags, int nodeid)
+{
+ void *freelist;
+ size_t offset;
+ gfp_t local_flags;
+ int page_node;
+ struct kmem_cache_node *n;
+ struct page *page;
+
+ /*
+ * Be lazy and only check for valid flags here, keeping it out of the
+ * critical path in kmem_cache_alloc().
+ */
+ if (unlikely(flags & GFP_SLAB_BUG_MASK))
+ flags = kmalloc_fix_flags(flags);
+
+ WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO));
+ local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
+
+ check_irq_off();
+ if (gfpflags_allow_blocking(local_flags))
+ local_irq_enable();
+
+ /*
+ * Get mem for the objs. Attempt to allocate a physical page from
+ * 'nodeid'.
+ */
+ page = kmem_getpages(cachep, local_flags, nodeid);
+ if (!page)
+ goto failed;
+
+ page_node = page_to_nid(page);
+ n = get_node(cachep, page_node);
+
+ /* Get colour for the slab, and cal the next value. */
+ n->colour_next++;
+ if (n->colour_next >= cachep->colour)
+ n->colour_next = 0;
+
+ offset = n->colour_next;
+ if (offset >= cachep->colour)
+ offset = 0;
+
+ offset *= cachep->colour_off;
+
+ /*
+ * Call kasan_poison_slab() before calling alloc_slabmgmt(), so
+ * page_address() in the latter returns a non-tagged pointer,
+ * as it should be for slab pages.
+ */
+ kasan_poison_slab(page);
+
+ /* Get slab management. */
+ freelist = alloc_slabmgmt(cachep, page, offset,
+ local_flags & ~GFP_CONSTRAINT_MASK, page_node);
+ if (OFF_SLAB(cachep) && !freelist)
+ goto opps1;
+
+ slab_map_pages(cachep, page, freelist);
+
+ cache_init_objs(cachep, page);
+
+ if (gfpflags_allow_blocking(local_flags))
+ local_irq_disable();
+
+ return page;
+
+opps1:
+ kmem_freepages(cachep, page);
+failed:
+ if (gfpflags_allow_blocking(local_flags))
+ local_irq_disable();
+ return NULL;
+}
+
+static void cache_grow_end(struct kmem_cache *cachep, struct page *page)
+{
+ struct kmem_cache_node *n;
+ void *list = NULL;
+
+ check_irq_off();
+
+ if (!page)
+ return;
+
+ INIT_LIST_HEAD(&page->slab_list);
+ n = get_node(cachep, page_to_nid(page));
+
+ spin_lock(&n->list_lock);
+ n->total_slabs++;
+ if (!page->active) {
+ list_add_tail(&page->slab_list, &n->slabs_free);
+ n->free_slabs++;
+ } else
+ fixup_slab_list(cachep, n, page, &list);
+
+ STATS_INC_GROWN(cachep);
+ n->free_objects += cachep->num - page->active;
+ spin_unlock(&n->list_lock);
+
+ fixup_objfreelist_debug(cachep, &list);
+}
+
+#if DEBUG
+
+/*
+ * Perform extra freeing checks:
+ * - detect bad pointers.
+ * - POISON/RED_ZONE checking
+ */
+static void kfree_debugcheck(const void *objp)
+{
+ if (!virt_addr_valid(objp)) {
+ pr_err("kfree_debugcheck: out of range ptr %lxh\n",
+ (unsigned long)objp);
+ BUG();
+ }
+}
+
+static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
+{
+ unsigned long long redzone1, redzone2;
+
+ redzone1 = *dbg_redzone1(cache, obj);
+ redzone2 = *dbg_redzone2(cache, obj);
+
+ /*
+ * Redzone is ok.
+ */
+ if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE)
+ return;
+
+ if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE)
+ slab_error(cache, "double free detected");
+ else
+ slab_error(cache, "memory outside object was overwritten");
+
+ pr_err("%px: redzone 1:0x%llx, redzone 2:0x%llx\n",
+ obj, redzone1, redzone2);
+}
+
+static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
+ unsigned long caller)
+{
+ unsigned int objnr;
+ struct page *page;
+
+ BUG_ON(virt_to_cache(objp) != cachep);
+
+ objp -= obj_offset(cachep);
+ kfree_debugcheck(objp);
+ page = virt_to_head_page(objp);
+
+ if (cachep->flags & SLAB_RED_ZONE) {
+ verify_redzone_free(cachep, objp);
+ *dbg_redzone1(cachep, objp) = RED_INACTIVE;
+ *dbg_redzone2(cachep, objp) = RED_INACTIVE;
+ }
+ if (cachep->flags & SLAB_STORE_USER)
+ *dbg_userword(cachep, objp) = (void *)caller;
+
+ objnr = obj_to_index(cachep, page, objp);
+
+ BUG_ON(objnr >= cachep->num);
+ BUG_ON(objp != index_to_obj(cachep, page, objnr));
+
+ if (cachep->flags & SLAB_POISON) {
+ poison_obj(cachep, objp, POISON_FREE);
+ slab_kernel_map(cachep, objp, 0);
+ }
+ return objp;
+}
+
+#else
+#define kfree_debugcheck(x) do { } while(0)
+#define cache_free_debugcheck(x,objp,z) (objp)
+#endif
+
+static inline void fixup_objfreelist_debug(struct kmem_cache *cachep,
+ void **list)
+{
+#if DEBUG
+ void *next = *list;
+ void *objp;
+
+ while (next) {
+ objp = next - obj_offset(cachep);
+ next = *(void **)next;
+ poison_obj(cachep, objp, POISON_FREE);
+ }
+#endif
+}
+
+static inline void fixup_slab_list(struct kmem_cache *cachep,
+ struct kmem_cache_node *n, struct page *page,
+ void **list)
+{
+ /* move slabp to correct slabp list: */
+ list_del(&page->slab_list);
+ if (page->active == cachep->num) {
+ list_add(&page->slab_list, &n->slabs_full);
+ if (OBJFREELIST_SLAB(cachep)) {
+#if DEBUG
+ /* Poisoning will be done without holding the lock */
+ if (cachep->flags & SLAB_POISON) {
+ void **objp = page->freelist;
+
+ *objp = *list;
+ *list = objp;
+ }
+#endif
+ page->freelist = NULL;
+ }
+ } else
+ list_add(&page->slab_list, &n->slabs_partial);
+}
+
+/* Try to find non-pfmemalloc slab if needed */
+static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
+ struct page *page, bool pfmemalloc)
+{
+ if (!page)
+ return NULL;
+
+ if (pfmemalloc)
+ return page;
+
+ if (!PageSlabPfmemalloc(page))
+ return page;
+
+ /* No need to keep pfmemalloc slab if we have enough free objects */
+ if (n->free_objects > n->free_limit) {
+ ClearPageSlabPfmemalloc(page);
+ return page;
+ }
+
+ /* Move pfmemalloc slab to the end of list to speed up next search */
+ list_del(&page->slab_list);
+ if (!page->active) {
+ list_add_tail(&page->slab_list, &n->slabs_free);
+ n->free_slabs++;
+ } else
+ list_add_tail(&page->slab_list, &n->slabs_partial);
+
+ list_for_each_entry(page, &n->slabs_partial, slab_list) {
+ if (!PageSlabPfmemalloc(page))
+ return page;
+ }
+
+ n->free_touched = 1;
+ list_for_each_entry(page, &n->slabs_free, slab_list) {
+ if (!PageSlabPfmemalloc(page)) {
+ n->free_slabs--;
+ return page;
+ }
+ }
+
+ return NULL;
+}
+
+static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
+{
+ struct page *page;
+
+ assert_spin_locked(&n->list_lock);
+ page = list_first_entry_or_null(&n->slabs_partial, struct page,
+ slab_list);
+ if (!page) {
+ n->free_touched = 1;
+ page = list_first_entry_or_null(&n->slabs_free, struct page,
+ slab_list);
+ if (page)
+ n->free_slabs--;
+ }
+
+ if (sk_memalloc_socks())
+ page = get_valid_first_slab(n, page, pfmemalloc);
+
+ return page;
+}
+
+static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep,
+ struct kmem_cache_node *n, gfp_t flags)
+{
+ struct page *page;
+ void *obj;
+ void *list = NULL;
+
+ if (!gfp_pfmemalloc_allowed(flags))
+ return NULL;
+
+ spin_lock(&n->list_lock);
+ page = get_first_slab(n, true);
+ if (!page) {
+ spin_unlock(&n->list_lock);
+ return NULL;
+ }
+
+ obj = slab_get_obj(cachep, page);
+ n->free_objects--;
+
+ fixup_slab_list(cachep, n, page, &list);
+
+ spin_unlock(&n->list_lock);
+ fixup_objfreelist_debug(cachep, &list);
+
+ return obj;
+}
+
+/*
+ * Slab list should be fixed up by fixup_slab_list() for existing slab
+ * or cache_grow_end() for new slab
+ */
+static __always_inline int alloc_block(struct kmem_cache *cachep,
+ struct array_cache *ac, struct page *page, int batchcount)
+{
+ /*
+ * There must be at least one object available for
+ * allocation.
+ */
+ BUG_ON(page->active >= cachep->num);
+
+ while (page->active < cachep->num && batchcount--) {
+ STATS_INC_ALLOCED(cachep);
+ STATS_INC_ACTIVE(cachep);
+ STATS_SET_HIGH(cachep);
+
+ ac->entry[ac->avail++] = slab_get_obj(cachep, page);
+ }
+
+ return batchcount;
+}
+
+static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
+{
+ int batchcount;
+ struct kmem_cache_node *n;
+ struct array_cache *ac, *shared;
+ int node;
+ void *list = NULL;
+ struct page *page;
+
+ check_irq_off();
+ node = numa_mem_id();
+
+ ac = cpu_cache_get(cachep);
+ batchcount = ac->batchcount;
+ if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
+ /*
+ * If there was little recent activity on this cache, then
+ * perform only a partial refill. Otherwise we could generate
+ * refill bouncing.
+ */
+ batchcount = BATCHREFILL_LIMIT;
+ }
+ n = get_node(cachep, node);
+
+ BUG_ON(ac->avail > 0 || !n);
+ shared = READ_ONCE(n->shared);
+ if (!n->free_objects && (!shared || !shared->avail))
+ goto direct_grow;
+
+ spin_lock(&n->list_lock);
+ shared = READ_ONCE(n->shared);
+
+ /* See if we can refill from the shared array */
+ if (shared && transfer_objects(ac, shared, batchcount)) {
+ shared->touched = 1;
+ goto alloc_done;
+ }
+
+ while (batchcount > 0) {
+ /* Get slab alloc is to come from. */
+ page = get_first_slab(n, false);
+ if (!page)
+ goto must_grow;
+
+ check_spinlock_acquired(cachep);
+
+ batchcount = alloc_block(cachep, ac, page, batchcount);
+ fixup_slab_list(cachep, n, page, &list);
+ }
+
+must_grow:
+ n->free_objects -= ac->avail;
+alloc_done:
+ spin_unlock(&n->list_lock);
+ fixup_objfreelist_debug(cachep, &list);
+
+direct_grow:
+ if (unlikely(!ac->avail)) {
+ /* Check if we can use obj in pfmemalloc slab */
+ if (sk_memalloc_socks()) {
+ void *obj = cache_alloc_pfmemalloc(cachep, n, flags);
+
+ if (obj)
+ return obj;
+ }
+
+ page = cache_grow_begin(cachep, gfp_exact_node(flags), node);
+
+ /*
+ * cache_grow_begin() can reenable interrupts,
+ * then ac could change.
+ */
+ ac = cpu_cache_get(cachep);
+ if (!ac->avail && page)
+ alloc_block(cachep, ac, page, batchcount);
+ cache_grow_end(cachep, page);
+
+ if (!ac->avail)
+ return NULL;
+ }
+ ac->touched = 1;
+
+ return ac->entry[--ac->avail];
+}
+
+static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
+ gfp_t flags)
+{
+ might_sleep_if(gfpflags_allow_blocking(flags));
+}
+
+#if DEBUG
+static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
+ gfp_t flags, void *objp, unsigned long caller)
+{
+ WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO));
+ if (!objp)
+ return objp;
+ if (cachep->flags & SLAB_POISON) {
+ check_poison_obj(cachep, objp);
+ slab_kernel_map(cachep, objp, 1);
+ poison_obj(cachep, objp, POISON_INUSE);
+ }
+ if (cachep->flags & SLAB_STORE_USER)
+ *dbg_userword(cachep, objp) = (void *)caller;
+
+ if (cachep->flags & SLAB_RED_ZONE) {
+ if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
+ *dbg_redzone2(cachep, objp) != RED_INACTIVE) {
+ slab_error(cachep, "double free, or memory outside object was overwritten");
+ pr_err("%px: redzone 1:0x%llx, redzone 2:0x%llx\n",
+ objp, *dbg_redzone1(cachep, objp),
+ *dbg_redzone2(cachep, objp));
+ }
+ *dbg_redzone1(cachep, objp) = RED_ACTIVE;
+ *dbg_redzone2(cachep, objp) = RED_ACTIVE;
+ }
+
+ objp += obj_offset(cachep);
+ if (cachep->ctor && cachep->flags & SLAB_POISON)
+ cachep->ctor(objp);
+ if (ARCH_SLAB_MINALIGN &&
+ ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) {
+ pr_err("0x%px: not aligned to ARCH_SLAB_MINALIGN=%d\n",
+ objp, (int)ARCH_SLAB_MINALIGN);
+ }
+ return objp;
+}
+#else
+#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
+#endif
+
+static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
+{
+ void *objp;
+ struct array_cache *ac;
+
+ check_irq_off();
+
+ ac = cpu_cache_get(cachep);
+ if (likely(ac->avail)) {
+ ac->touched = 1;
+ objp = ac->entry[--ac->avail];
+
+ STATS_INC_ALLOCHIT(cachep);
+ goto out;
+ }
+
+ STATS_INC_ALLOCMISS(cachep);
+ objp = cache_alloc_refill(cachep, flags);
+ /*
+ * the 'ac' may be updated by cache_alloc_refill(),
+ * and kmemleak_erase() requires its correct value.
+ */
+ ac = cpu_cache_get(cachep);
+
+out:
+ /*
+ * To avoid a false negative, if an object that is in one of the
+ * per-CPU caches is leaked, we need to make sure kmemleak doesn't
+ * treat the array pointers as a reference to the object.
+ */
+ if (objp)
+ kmemleak_erase(&ac->entry[ac->avail]);
+ return objp;
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * Try allocating on another node if PFA_SPREAD_SLAB is a mempolicy is set.
+ *
+ * If we are in_interrupt, then process context, including cpusets and
+ * mempolicy, may not apply and should not be used for allocation policy.
+ */
+static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
+{
+ int nid_alloc, nid_here;
+
+ if (in_interrupt() || (flags & __GFP_THISNODE))
+ return NULL;
+ nid_alloc = nid_here = numa_mem_id();
+ if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
+ nid_alloc = cpuset_slab_spread_node();
+ else if (current->mempolicy)
+ nid_alloc = mempolicy_slab_node();
+ if (nid_alloc != nid_here)
+ return ____cache_alloc_node(cachep, flags, nid_alloc);
+ return NULL;
+}
+
+/*
+ * Fallback function if there was no memory available and no objects on a
+ * certain node and fall back is permitted. First we scan all the
+ * available node for available objects. If that fails then we
+ * perform an allocation without specifying a node. This allows the page
+ * allocator to do its reclaim / fallback magic. We then insert the
+ * slab into the proper nodelist and then allocate from it.
+ */
+static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
+{
+ struct zonelist *zonelist;
+ struct zoneref *z;
+ struct zone *zone;
+ enum zone_type highest_zoneidx = gfp_zone(flags);
+ void *obj = NULL;
+ struct page *page;
+ int nid;
+ unsigned int cpuset_mems_cookie;
+
+ if (flags & __GFP_THISNODE)
+ return NULL;
+
+retry_cpuset:
+ cpuset_mems_cookie = read_mems_allowed_begin();
+ zonelist = node_zonelist(mempolicy_slab_node(), flags);
+
+retry:
+ /*
+ * Look through allowed nodes for objects available
+ * from existing per node queues.
+ */
+ for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
+ nid = zone_to_nid(zone);
+
+ if (cpuset_zone_allowed(zone, flags) &&
+ get_node(cache, nid) &&
+ get_node(cache, nid)->free_objects) {
+ obj = ____cache_alloc_node(cache,
+ gfp_exact_node(flags), nid);
+ if (obj)
+ break;
+ }
+ }
+
+ if (!obj) {
+ /*
+ * This allocation will be performed within the constraints
+ * of the current cpuset / memory policy requirements.
+ * We may trigger various forms of reclaim on the allowed
+ * set and go into memory reserves if necessary.
+ */
+ page = cache_grow_begin(cache, flags, numa_mem_id());
+ cache_grow_end(cache, page);
+ if (page) {
+ nid = page_to_nid(page);
+ obj = ____cache_alloc_node(cache,
+ gfp_exact_node(flags), nid);
+
+ /*
+ * Another processor may allocate the objects in
+ * the slab since we are not holding any locks.
+ */
+ if (!obj)
+ goto retry;
+ }
+ }
+
+ if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie)))
+ goto retry_cpuset;
+ return obj;
+}
+
+/*
+ * A interface to enable slab creation on nodeid
+ */
+static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
+ int nodeid)
+{
+ struct page *page;
+ struct kmem_cache_node *n;
+ void *obj = NULL;
+ void *list = NULL;
+
+ VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES);
+ n = get_node(cachep, nodeid);
+ BUG_ON(!n);
+
+ check_irq_off();
+ spin_lock(&n->list_lock);
+ page = get_first_slab(n, false);
+ if (!page)
+ goto must_grow;
+
+ check_spinlock_acquired_node(cachep, nodeid);
+
+ STATS_INC_NODEALLOCS(cachep);
+ STATS_INC_ACTIVE(cachep);
+ STATS_SET_HIGH(cachep);
+
+ BUG_ON(page->active == cachep->num);
+
+ obj = slab_get_obj(cachep, page);
+ n->free_objects--;
+
+ fixup_slab_list(cachep, n, page, &list);
+
+ spin_unlock(&n->list_lock);
+ fixup_objfreelist_debug(cachep, &list);
+ return obj;
+
+must_grow:
+ spin_unlock(&n->list_lock);
+ page = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid);
+ if (page) {
+ /* This slab isn't counted yet so don't update free_objects */
+ obj = slab_get_obj(cachep, page);
+ }
+ cache_grow_end(cachep, page);
+
+ return obj ? obj : fallback_alloc(cachep, flags);
+}
+
+static __always_inline void *
+slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
+ unsigned long caller)
+{
+ unsigned long save_flags;
+ void *ptr;
+ int slab_node = numa_mem_id();
+ struct obj_cgroup *objcg = NULL;
+
+ flags &= gfp_allowed_mask;
+ cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags);
+ if (unlikely(!cachep))
+ return NULL;
+
+ cache_alloc_debugcheck_before(cachep, flags);
+ local_irq_save(save_flags);
+
+ if (nodeid == NUMA_NO_NODE)
+ nodeid = slab_node;
+
+ if (unlikely(!get_node(cachep, nodeid))) {
+ /* Node not bootstrapped yet */
+ ptr = fallback_alloc(cachep, flags);
+ goto out;
+ }
+
+ if (nodeid == slab_node) {
+ /*
+ * Use the locally cached objects if possible.
+ * However ____cache_alloc does not allow fallback
+ * to other nodes. It may fail while we still have
+ * objects on other nodes available.
+ */
+ ptr = ____cache_alloc(cachep, flags);
+ if (ptr)
+ goto out;
+ }
+ /* ___cache_alloc_node can fall back to other nodes */
+ ptr = ____cache_alloc_node(cachep, flags, nodeid);
+ out:
+ local_irq_restore(save_flags);
+ ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
+
+ if (unlikely(slab_want_init_on_alloc(flags, cachep)) && ptr)
+ memset(ptr, 0, cachep->object_size);
+
+ slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr);
+ return ptr;
+}
+
+static __always_inline void *
+__do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
+{
+ void *objp;
+
+ if (current->mempolicy || cpuset_do_slab_mem_spread()) {
+ objp = alternate_node_alloc(cache, flags);
+ if (objp)
+ goto out;
+ }
+ objp = ____cache_alloc(cache, flags);
+
+ /*
+ * We may just have run out of memory on the local node.
+ * ____cache_alloc_node() knows how to locate memory on other nodes
+ */
+ if (!objp)
+ objp = ____cache_alloc_node(cache, flags, numa_mem_id());
+
+ out:
+ return objp;
+}
+#else
+
+static __always_inline void *
+__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
+{
+ return ____cache_alloc(cachep, flags);
+}
+
+#endif /* CONFIG_NUMA */
+
+static __always_inline void *
+slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
+{
+ unsigned long save_flags;
+ void *objp;
+ struct obj_cgroup *objcg = NULL;
+
+ flags &= gfp_allowed_mask;
+ cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags);
+ if (unlikely(!cachep))
+ return NULL;
+
+ cache_alloc_debugcheck_before(cachep, flags);
+ local_irq_save(save_flags);
+ objp = __do_cache_alloc(cachep, flags);
+ local_irq_restore(save_flags);
+ objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
+ prefetchw(objp);
+
+ if (unlikely(slab_want_init_on_alloc(flags, cachep)) && objp)
+ memset(objp, 0, cachep->object_size);
+
+ slab_post_alloc_hook(cachep, objcg, flags, 1, &objp);
+ return objp;
+}
+
+/*
+ * Caller needs to acquire correct kmem_cache_node's list_lock
+ * @list: List of detached free slabs should be freed by caller
+ */
+static void free_block(struct kmem_cache *cachep, void **objpp,
+ int nr_objects, int node, struct list_head *list)
+{
+ int i;
+ struct kmem_cache_node *n = get_node(cachep, node);
+ struct page *page;
+
+ n->free_objects += nr_objects;
+
+ for (i = 0; i < nr_objects; i++) {
+ void *objp;
+ struct page *page;
+
+ objp = objpp[i];
+
+ page = virt_to_head_page(objp);
+ list_del(&page->slab_list);
+ check_spinlock_acquired_node(cachep, node);
+ slab_put_obj(cachep, page, objp);
+ STATS_DEC_ACTIVE(cachep);
+
+ /* fixup slab chains */
+ if (page->active == 0) {
+ list_add(&page->slab_list, &n->slabs_free);
+ n->free_slabs++;
+ } else {
+ /* Unconditionally move a slab to the end of the
+ * partial list on free - maximum time for the
+ * other objects to be freed, too.
+ */
+ list_add_tail(&page->slab_list, &n->slabs_partial);
+ }
+ }
+
+ while (n->free_objects > n->free_limit && !list_empty(&n->slabs_free)) {
+ n->free_objects -= cachep->num;
+
+ page = list_last_entry(&n->slabs_free, struct page, slab_list);
+ list_move(&page->slab_list, list);
+ n->free_slabs--;
+ n->total_slabs--;
+ }
+}
+
+static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
+{
+ int batchcount;
+ struct kmem_cache_node *n;
+ int node = numa_mem_id();
+ LIST_HEAD(list);
+
+ batchcount = ac->batchcount;
+
+ check_irq_off();
+ n = get_node(cachep, node);
+ spin_lock(&n->list_lock);
+ if (n->shared) {
+ struct array_cache *shared_array = n->shared;
+ int max = shared_array->limit - shared_array->avail;
+ if (max) {
+ if (batchcount > max)
+ batchcount = max;
+ memcpy(&(shared_array->entry[shared_array->avail]),
+ ac->entry, sizeof(void *) * batchcount);
+ shared_array->avail += batchcount;
+ goto free_done;
+ }
+ }
+
+ free_block(cachep, ac->entry, batchcount, node, &list);
+free_done:
+#if STATS
+ {
+ int i = 0;
+ struct page *page;
+
+ list_for_each_entry(page, &n->slabs_free, slab_list) {
+ BUG_ON(page->active);
+
+ i++;
+ }
+ STATS_SET_FREEABLE(cachep, i);
+ }
+#endif
+ spin_unlock(&n->list_lock);
+ ac->avail -= batchcount;
+ memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
+ slabs_destroy(cachep, &list);
+}
+
+/*
+ * Release an obj back to its cache. If the obj has a constructed state, it must
+ * be in this state _before_ it is released. Called with disabled ints.
+ */
+static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp,
+ unsigned long caller)
+{
+ /* Put the object into the quarantine, don't touch it for now. */
+ if (kasan_slab_free(cachep, objp, _RET_IP_))
+ return;
+
+ /* Use KCSAN to help debug racy use-after-free. */
+ if (!(cachep->flags & SLAB_TYPESAFE_BY_RCU))
+ __kcsan_check_access(objp, cachep->object_size,
+ KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
+
+ ___cache_free(cachep, objp, caller);
+}
+
+void ___cache_free(struct kmem_cache *cachep, void *objp,
+ unsigned long caller)
+{
+ struct array_cache *ac = cpu_cache_get(cachep);
+
+ check_irq_off();
+ if (unlikely(slab_want_init_on_free(cachep)))
+ memset(objp, 0, cachep->object_size);
+ kmemleak_free_recursive(objp, cachep->flags);
+ objp = cache_free_debugcheck(cachep, objp, caller);
+ memcg_slab_free_hook(cachep, &objp, 1);
+
+ /*
+ * Skip calling cache_free_alien() when the platform is not numa.
+ * This will avoid cache misses that happen while accessing slabp (which
+ * is per page memory reference) to get nodeid. Instead use a global
+ * variable to skip the call, which is mostly likely to be present in
+ * the cache.
+ */
+ if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
+ return;
+
+ if (ac->avail < ac->limit) {
+ STATS_INC_FREEHIT(cachep);
+ } else {
+ STATS_INC_FREEMISS(cachep);
+ cache_flusharray(cachep, ac);
+ }
+
+ if (sk_memalloc_socks()) {
+ struct page *page = virt_to_head_page(objp);
+
+ if (unlikely(PageSlabPfmemalloc(page))) {
+ cache_free_pfmemalloc(cachep, page, objp);
+ return;
+ }
+ }
+
+ __free_one(ac, objp);
+}
+
+/**
+ * kmem_cache_alloc - Allocate an object
+ * @cachep: The cache to allocate from.
+ * @flags: See kmalloc().
+ *
+ * Allocate an object from this cache. The flags are only relevant
+ * if the cache has no available objects.
+ *
+ * Return: pointer to the new object or %NULL in case of error
+ */
+void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
+{
+ void *ret = slab_alloc(cachep, flags, _RET_IP_);
+
+ trace_kmem_cache_alloc(_RET_IP_, ret,
+ cachep->object_size, cachep->size, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(kmem_cache_alloc);
+
+static __always_inline void
+cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags,
+ size_t size, void **p, unsigned long caller)
+{
+ size_t i;
+
+ for (i = 0; i < size; i++)
+ p[i] = cache_alloc_debugcheck_after(s, flags, p[i], caller);
+}
+
+int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
+ void **p)
+{
+ size_t i;
+ struct obj_cgroup *objcg = NULL;
+
+ s = slab_pre_alloc_hook(s, &objcg, size, flags);
+ if (!s)
+ return 0;
+
+ cache_alloc_debugcheck_before(s, flags);
+
+ local_irq_disable();
+ for (i = 0; i < size; i++) {
+ void *objp = __do_cache_alloc(s, flags);
+
+ if (unlikely(!objp))
+ goto error;
+ p[i] = objp;
+ }
+ local_irq_enable();
+
+ cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_);
+
+ /* Clear memory outside IRQ disabled section */
+ if (unlikely(slab_want_init_on_alloc(flags, s)))
+ for (i = 0; i < size; i++)
+ memset(p[i], 0, s->object_size);
+
+ slab_post_alloc_hook(s, objcg, flags, size, p);
+ /* FIXME: Trace call missing. Christoph would like a bulk variant */
+ return size;
+error:
+ local_irq_enable();
+ cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_);
+ slab_post_alloc_hook(s, objcg, flags, i, p);
+ __kmem_cache_free_bulk(s, i, p);
+ return 0;
+}
+EXPORT_SYMBOL(kmem_cache_alloc_bulk);
+
+#ifdef CONFIG_TRACING
+void *
+kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
+{
+ void *ret;
+
+ ret = slab_alloc(cachep, flags, _RET_IP_);
+
+ ret = kasan_kmalloc(cachep, ret, size, flags);
+ trace_kmalloc(_RET_IP_, ret,
+ size, cachep->size, flags);
+ return ret;
+}
+EXPORT_SYMBOL(kmem_cache_alloc_trace);
+#endif
+
+#ifdef CONFIG_NUMA
+/**
+ * kmem_cache_alloc_node - Allocate an object on the specified node
+ * @cachep: The cache to allocate from.
+ * @flags: See kmalloc().
+ * @nodeid: node number of the target node.
+ *
+ * Identical to kmem_cache_alloc but it will allocate memory on the given
+ * node, which can improve the performance for cpu bound structures.
+ *
+ * Fallback to other node is possible if __GFP_THISNODE is not set.
+ *
+ * Return: pointer to the new object or %NULL in case of error
+ */
+void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+{
+ void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
+
+ trace_kmem_cache_alloc_node(_RET_IP_, ret,
+ cachep->object_size, cachep->size,
+ flags, nodeid);
+
+ return ret;
+}
+EXPORT_SYMBOL(kmem_cache_alloc_node);
+
+#ifdef CONFIG_TRACING
+void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
+ gfp_t flags,
+ int nodeid,
+ size_t size)
+{
+ void *ret;
+
+ ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
+
+ ret = kasan_kmalloc(cachep, ret, size, flags);
+ trace_kmalloc_node(_RET_IP_, ret,
+ size, cachep->size,
+ flags, nodeid);
+ return ret;
+}
+EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
+#endif
+
+static __always_inline void *
+__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
+{
+ struct kmem_cache *cachep;
+ void *ret;
+
+ if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
+ return NULL;
+ cachep = kmalloc_slab(size, flags);
+ if (unlikely(ZERO_OR_NULL_PTR(cachep)))
+ return cachep;
+ ret = kmem_cache_alloc_node_trace(cachep, flags, node, size);
+ ret = kasan_kmalloc(cachep, ret, size, flags);
+
+ return ret;
+}
+
+void *__kmalloc_node(size_t size, gfp_t flags, int node)
+{
+ return __do_kmalloc_node(size, flags, node, _RET_IP_);
+}
+EXPORT_SYMBOL(__kmalloc_node);
+
+void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
+ int node, unsigned long caller)
+{
+ return __do_kmalloc_node(size, flags, node, caller);
+}
+EXPORT_SYMBOL(__kmalloc_node_track_caller);
+#endif /* CONFIG_NUMA */
+
+/**
+ * __do_kmalloc - allocate memory
+ * @size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate (see kmalloc).
+ * @caller: function caller for debug tracking of the caller
+ *
+ * Return: pointer to the allocated memory or %NULL in case of error
+ */
+static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
+ unsigned long caller)
+{
+ struct kmem_cache *cachep;
+ void *ret;
+
+ if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
+ return NULL;
+ cachep = kmalloc_slab(size, flags);
+ if (unlikely(ZERO_OR_NULL_PTR(cachep)))
+ return cachep;
+ ret = slab_alloc(cachep, flags, caller);
+
+ ret = kasan_kmalloc(cachep, ret, size, flags);
+ trace_kmalloc(caller, ret,
+ size, cachep->size, flags);
+
+ return ret;
+}
+
+void *__kmalloc(size_t size, gfp_t flags)
+{
+ return __do_kmalloc(size, flags, _RET_IP_);
+}
+EXPORT_SYMBOL(__kmalloc);
+
+void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
+{
+ return __do_kmalloc(size, flags, caller);
+}
+EXPORT_SYMBOL(__kmalloc_track_caller);
+
+/**
+ * kmem_cache_free - Deallocate an object
+ * @cachep: The cache the allocation was from.
+ * @objp: The previously allocated object.
+ *
+ * Free an object which was previously allocated from this
+ * cache.
+ */
+void kmem_cache_free(struct kmem_cache *cachep, void *objp)
+{
+ unsigned long flags;
+ cachep = cache_from_obj(cachep, objp);
+ if (!cachep)
+ return;
+
+ local_irq_save(flags);
+ debug_check_no_locks_freed(objp, cachep->object_size);
+ if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
+ debug_check_no_obj_freed(objp, cachep->object_size);
+ __cache_free(cachep, objp, _RET_IP_);
+ local_irq_restore(flags);
+
+ trace_kmem_cache_free(_RET_IP_, objp);
+}
+EXPORT_SYMBOL(kmem_cache_free);
+
+void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
+{
+ struct kmem_cache *s;
+ size_t i;
+
+ local_irq_disable();
+ for (i = 0; i < size; i++) {
+ void *objp = p[i];
+
+ if (!orig_s) /* called via kfree_bulk */
+ s = virt_to_cache(objp);
+ else
+ s = cache_from_obj(orig_s, objp);
+ if (!s)
+ continue;
+
+ debug_check_no_locks_freed(objp, s->object_size);
+ if (!(s->flags & SLAB_DEBUG_OBJECTS))
+ debug_check_no_obj_freed(objp, s->object_size);
+
+ __cache_free(s, objp, _RET_IP_);
+ }
+ local_irq_enable();
+
+ /* FIXME: add tracing */
+}
+EXPORT_SYMBOL(kmem_cache_free_bulk);
+
+/**
+ * kfree - free previously allocated memory
+ * @objp: pointer returned by kmalloc.
+ *
+ * If @objp is NULL, no operation is performed.
+ *
+ * Don't free memory not originally allocated by kmalloc()
+ * or you will run into trouble.
+ */
+void kfree(const void *objp)
+{
+ struct kmem_cache *c;
+ unsigned long flags;
+
+ trace_kfree(_RET_IP_, objp);
+
+ if (unlikely(ZERO_OR_NULL_PTR(objp)))
+ return;
+ local_irq_save(flags);
+ kfree_debugcheck(objp);
+ c = virt_to_cache(objp);
+ if (!c) {
+ local_irq_restore(flags);
+ return;
+ }
+ debug_check_no_locks_freed(objp, c->object_size);
+
+ debug_check_no_obj_freed(objp, c->object_size);
+ __cache_free(c, (void *)objp, _RET_IP_);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(kfree);
+
+/*
+ * This initializes kmem_cache_node or resizes various caches for all nodes.
+ */
+static int setup_kmem_cache_nodes(struct kmem_cache *cachep, gfp_t gfp)
+{
+ int ret;
+ int node;
+ struct kmem_cache_node *n;
+
+ for_each_online_node(node) {
+ ret = setup_kmem_cache_node(cachep, node, gfp, true);
+ if (ret)
+ goto fail;
+
+ }
+
+ return 0;
+
+fail:
+ if (!cachep->list.next) {
+ /* Cache is not active yet. Roll back what we did */
+ node--;
+ while (node >= 0) {
+ n = get_node(cachep, node);
+ if (n) {
+ kfree(n->shared);
+ free_alien_cache(n->alien);
+ kfree(n);
+ cachep->node[node] = NULL;
+ }
+ node--;
+ }
+ }
+ return -ENOMEM;
+}
+
+/* Always called with the slab_mutex held */
+static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
+ int batchcount, int shared, gfp_t gfp)
+{
+ struct array_cache __percpu *cpu_cache, *prev;
+ int cpu;
+
+ cpu_cache = alloc_kmem_cache_cpus(cachep, limit, batchcount);
+ if (!cpu_cache)
+ return -ENOMEM;
+
+ prev = cachep->cpu_cache;
+ cachep->cpu_cache = cpu_cache;
+ /*
+ * Without a previous cpu_cache there's no need to synchronize remote
+ * cpus, so skip the IPIs.
+ */
+ if (prev)
+ kick_all_cpus_sync();
+
+ check_irq_on();
+ cachep->batchcount = batchcount;
+ cachep->limit = limit;
+ cachep->shared = shared;
+
+ if (!prev)
+ goto setup_node;
+
+ for_each_online_cpu(cpu) {
+ LIST_HEAD(list);
+ int node;
+ struct kmem_cache_node *n;
+ struct array_cache *ac = per_cpu_ptr(prev, cpu);
+
+ node = cpu_to_mem(cpu);
+ n = get_node(cachep, node);
+ spin_lock_irq(&n->list_lock);
+ free_block(cachep, ac->entry, ac->avail, node, &list);
+ spin_unlock_irq(&n->list_lock);
+ slabs_destroy(cachep, &list);
+ }
+ free_percpu(prev);
+
+setup_node:
+ return setup_kmem_cache_nodes(cachep, gfp);
+}
+
+/* Called with slab_mutex held always */
+static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
+{
+ int err;
+ int limit = 0;
+ int shared = 0;
+ int batchcount = 0;
+
+ err = cache_random_seq_create(cachep, cachep->num, gfp);
+ if (err)
+ goto end;
+
+ if (limit && shared && batchcount)
+ goto skip_setup;
+ /*
+ * The head array serves three purposes:
+ * - create a LIFO ordering, i.e. return objects that are cache-warm
+ * - reduce the number of spinlock operations.
+ * - reduce the number of linked list operations on the slab and
+ * bufctl chains: array operations are cheaper.
+ * The numbers are guessed, we should auto-tune as described by
+ * Bonwick.
+ */
+ if (cachep->size > 131072)
+ limit = 1;
+ else if (cachep->size > PAGE_SIZE)
+ limit = 8;
+ else if (cachep->size > 1024)
+ limit = 24;
+ else if (cachep->size > 256)
+ limit = 54;
+ else
+ limit = 120;
+
+ /*
+ * CPU bound tasks (e.g. network routing) can exhibit cpu bound
+ * allocation behaviour: Most allocs on one cpu, most free operations
+ * on another cpu. For these cases, an efficient object passing between
+ * cpus is necessary. This is provided by a shared array. The array
+ * replaces Bonwick's magazine layer.
+ * On uniprocessor, it's functionally equivalent (but less efficient)
+ * to a larger limit. Thus disabled by default.
+ */
+ shared = 0;
+ if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1)
+ shared = 8;
+
+#if DEBUG
+ /*
+ * With debugging enabled, large batchcount lead to excessively long
+ * periods with disabled local interrupts. Limit the batchcount
+ */
+ if (limit > 32)
+ limit = 32;
+#endif
+ batchcount = (limit + 1) / 2;
+skip_setup:
+ err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
+end:
+ if (err)
+ pr_err("enable_cpucache failed for %s, error %d\n",
+ cachep->name, -err);
+ return err;
+}
+
+/*
+ * Drain an array if it contains any elements taking the node lock only if
+ * necessary. Note that the node listlock also protects the array_cache
+ * if drain_array() is used on the shared array.
+ */
+static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
+ struct array_cache *ac, int node)
+{
+ LIST_HEAD(list);
+
+ /* ac from n->shared can be freed if we don't hold the slab_mutex. */
+ check_mutex_acquired();
+
+ if (!ac || !ac->avail)
+ return;
+
+ if (ac->touched) {
+ ac->touched = 0;
+ return;
+ }
+
+ spin_lock_irq(&n->list_lock);
+ drain_array_locked(cachep, ac, node, false, &list);
+ spin_unlock_irq(&n->list_lock);
+
+ slabs_destroy(cachep, &list);
+}
+
+/**
+ * cache_reap - Reclaim memory from caches.
+ * @w: work descriptor
+ *
+ * Called from workqueue/eventd every few seconds.
+ * Purpose:
+ * - clear the per-cpu caches for this CPU.
+ * - return freeable pages to the main free memory pool.
+ *
+ * If we cannot acquire the cache chain mutex then just give up - we'll try
+ * again on the next iteration.
+ */
+static void cache_reap(struct work_struct *w)
+{
+ struct kmem_cache *searchp;
+ struct kmem_cache_node *n;
+ int node = numa_mem_id();
+ struct delayed_work *work = to_delayed_work(w);
+
+ if (!mutex_trylock(&slab_mutex))
+ /* Give up. Setup the next iteration. */
+ goto out;
+
+ list_for_each_entry(searchp, &slab_caches, list) {
+ check_irq_on();
+
+ /*
+ * We only take the node lock if absolutely necessary and we
+ * have established with reasonable certainty that
+ * we can do some work if the lock was obtained.
+ */
+ n = get_node(searchp, node);
+
+ reap_alien(searchp, n);
+
+ drain_array(searchp, n, cpu_cache_get(searchp), node);
+
+ /*
+ * These are racy checks but it does not matter
+ * if we skip one check or scan twice.
+ */
+ if (time_after(n->next_reap, jiffies))
+ goto next;
+
+ n->next_reap = jiffies + REAPTIMEOUT_NODE;
+
+ drain_array(searchp, n, n->shared, node);
+
+ if (n->free_touched)
+ n->free_touched = 0;
+ else {
+ int freed;
+
+ freed = drain_freelist(searchp, n, (n->free_limit +
+ 5 * searchp->num - 1) / (5 * searchp->num));
+ STATS_ADD_REAPED(searchp, freed);
+ }
+next:
+ cond_resched();
+ }
+ check_irq_on();
+ mutex_unlock(&slab_mutex);
+ next_reap_node();
+out:
+ /* Set up the next iteration */
+ schedule_delayed_work_on(smp_processor_id(), work,
+ round_jiffies_relative(REAPTIMEOUT_AC));
+}
+
+void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
+{
+ unsigned long active_objs, num_objs, active_slabs;
+ unsigned long total_slabs = 0, free_objs = 0, shared_avail = 0;
+ unsigned long free_slabs = 0;
+ int node;
+ struct kmem_cache_node *n;
+
+ for_each_kmem_cache_node(cachep, node, n) {
+ check_irq_on();
+ spin_lock_irq(&n->list_lock);
+
+ total_slabs += n->total_slabs;
+ free_slabs += n->free_slabs;
+ free_objs += n->free_objects;
+
+ if (n->shared)
+ shared_avail += n->shared->avail;
+
+ spin_unlock_irq(&n->list_lock);
+ }
+ num_objs = total_slabs * cachep->num;
+ active_slabs = total_slabs - free_slabs;
+ active_objs = num_objs - free_objs;
+
+ sinfo->active_objs = active_objs;
+ sinfo->num_objs = num_objs;
+ sinfo->active_slabs = active_slabs;
+ sinfo->num_slabs = total_slabs;
+ sinfo->shared_avail = shared_avail;
+ sinfo->limit = cachep->limit;
+ sinfo->batchcount = cachep->batchcount;
+ sinfo->shared = cachep->shared;
+ sinfo->objects_per_slab = cachep->num;
+ sinfo->cache_order = cachep->gfporder;
+}
+
+void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep)
+{
+#if STATS
+ { /* node stats */
+ unsigned long high = cachep->high_mark;
+ unsigned long allocs = cachep->num_allocations;
+ unsigned long grown = cachep->grown;
+ unsigned long reaped = cachep->reaped;
+ unsigned long errors = cachep->errors;
+ unsigned long max_freeable = cachep->max_freeable;
+ unsigned long node_allocs = cachep->node_allocs;
+ unsigned long node_frees = cachep->node_frees;
+ unsigned long overflows = cachep->node_overflow;
+
+ seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu %4lu %4lu",
+ allocs, high, grown,
+ reaped, errors, max_freeable, node_allocs,
+ node_frees, overflows);
+ }
+ /* cpu stats */
+ {
+ unsigned long allochit = atomic_read(&cachep->allochit);
+ unsigned long allocmiss = atomic_read(&cachep->allocmiss);
+ unsigned long freehit = atomic_read(&cachep->freehit);
+ unsigned long freemiss = atomic_read(&cachep->freemiss);
+
+ seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
+ allochit, allocmiss, freehit, freemiss);
+ }
+#endif
+}
+
+#define MAX_SLABINFO_WRITE 128
+/**
+ * slabinfo_write - Tuning for the slab allocator
+ * @file: unused
+ * @buffer: user buffer
+ * @count: data length
+ * @ppos: unused
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+ssize_t slabinfo_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
+ int limit, batchcount, shared, res;
+ struct kmem_cache *cachep;
+
+ if (count > MAX_SLABINFO_WRITE)
+ return -EINVAL;
+ if (copy_from_user(&kbuf, buffer, count))
+ return -EFAULT;
+ kbuf[MAX_SLABINFO_WRITE] = '\0';
+
+ tmp = strchr(kbuf, ' ');
+ if (!tmp)
+ return -EINVAL;
+ *tmp = '\0';
+ tmp++;
+ if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
+ return -EINVAL;
+
+ /* Find the cache in the chain of caches. */
+ mutex_lock(&slab_mutex);
+ res = -EINVAL;
+ list_for_each_entry(cachep, &slab_caches, list) {
+ if (!strcmp(cachep->name, kbuf)) {
+ if (limit < 1 || batchcount < 1 ||
+ batchcount > limit || shared < 0) {
+ res = 0;
+ } else {
+ res = do_tune_cpucache(cachep, limit,
+ batchcount, shared,
+ GFP_KERNEL);
+ }
+ break;
+ }
+ }
+ mutex_unlock(&slab_mutex);
+ if (res >= 0)
+ res = count;
+ return res;
+}
+
+#ifdef CONFIG_HARDENED_USERCOPY
+/*
+ * Rejects incorrectly sized objects and objects that are to be copied
+ * to/from userspace but do not fall entirely within the containing slab
+ * cache's usercopy region.
+ *
+ * Returns NULL if check passes, otherwise const char * to name of cache
+ * to indicate an error.
+ */
+void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
+ bool to_user)
+{
+ struct kmem_cache *cachep;
+ unsigned int objnr;
+ unsigned long offset;
+
+ ptr = kasan_reset_tag(ptr);
+
+ /* Find and validate object. */
+ cachep = page->slab_cache;
+ objnr = obj_to_index(cachep, page, (void *)ptr);
+ BUG_ON(objnr >= cachep->num);
+
+ /* Find offset within object. */
+ offset = ptr - index_to_obj(cachep, page, objnr) - obj_offset(cachep);
+
+ /* Allow address range falling entirely within usercopy region. */
+ if (offset >= cachep->useroffset &&
+ offset - cachep->useroffset <= cachep->usersize &&
+ n <= cachep->useroffset - offset + cachep->usersize)
+ return;
+
+ /*
+ * If the copy is still within the allocated object, produce
+ * a warning instead of rejecting the copy. This is intended
+ * to be a temporary method to find any missing usercopy
+ * whitelists.
+ */
+ if (usercopy_fallback &&
+ offset <= cachep->object_size &&
+ n <= cachep->object_size - offset) {
+ usercopy_warn("SLAB object", cachep->name, to_user, offset, n);
+ return;
+ }
+
+ usercopy_abort("SLAB object", cachep->name, to_user, offset, n);
+}
+#endif /* CONFIG_HARDENED_USERCOPY */
+
+/**
+ * __ksize -- Uninstrumented ksize.
+ * @objp: pointer to the object
+ *
+ * Unlike ksize(), __ksize() is uninstrumented, and does not provide the same
+ * safety checks as ksize() with KASAN instrumentation enabled.
+ *
+ * Return: size of the actual memory used by @objp in bytes
+ */
+size_t __ksize(const void *objp)
+{
+ struct kmem_cache *c;
+ size_t size;
+
+ BUG_ON(!objp);
+ if (unlikely(objp == ZERO_SIZE_PTR))
+ return 0;
+
+ c = virt_to_cache(objp);
+ size = c ? c->object_size : 0;
+
+ return size;
+}
+EXPORT_SYMBOL(__ksize);
diff --git a/mm/slab.h b/mm/slab.h
new file mode 100644
index 000000000..6952e10cf
--- /dev/null
+++ b/mm/slab.h
@@ -0,0 +1,638 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef MM_SLAB_H
+#define MM_SLAB_H
+/*
+ * Internal slab definitions
+ */
+
+#ifdef CONFIG_SLOB
+/*
+ * Common fields provided in kmem_cache by all slab allocators
+ * This struct is either used directly by the allocator (SLOB)
+ * or the allocator must include definitions for all fields
+ * provided in kmem_cache_common in their definition of kmem_cache.
+ *
+ * Once we can do anonymous structs (C11 standard) we could put a
+ * anonymous struct definition in these allocators so that the
+ * separate allocations in the kmem_cache structure of SLAB and
+ * SLUB is no longer needed.
+ */
+struct kmem_cache {
+ unsigned int object_size;/* The original size of the object */
+ unsigned int size; /* The aligned/padded/added on size */
+ unsigned int align; /* Alignment as calculated */
+ slab_flags_t flags; /* Active flags on the slab */
+ unsigned int useroffset;/* Usercopy region offset */
+ unsigned int usersize; /* Usercopy region size */
+ const char *name; /* Slab name for sysfs */
+ int refcount; /* Use counter */
+ void (*ctor)(void *); /* Called on object slot creation */
+ struct list_head list; /* List of all slab caches on the system */
+};
+
+#endif /* CONFIG_SLOB */
+
+#ifdef CONFIG_SLAB
+#include <linux/slab_def.h>
+#endif
+
+#ifdef CONFIG_SLUB
+#include <linux/slub_def.h>
+#endif
+
+#include <linux/memcontrol.h>
+#include <linux/fault-inject.h>
+#include <linux/kasan.h>
+#include <linux/kmemleak.h>
+#include <linux/random.h>
+#include <linux/sched/mm.h>
+
+/*
+ * State of the slab allocator.
+ *
+ * This is used to describe the states of the allocator during bootup.
+ * Allocators use this to gradually bootstrap themselves. Most allocators
+ * have the problem that the structures used for managing slab caches are
+ * allocated from slab caches themselves.
+ */
+enum slab_state {
+ DOWN, /* No slab functionality yet */
+ PARTIAL, /* SLUB: kmem_cache_node available */
+ PARTIAL_NODE, /* SLAB: kmalloc size for node struct available */
+ UP, /* Slab caches usable but not all extras yet */
+ FULL /* Everything is working */
+};
+
+extern enum slab_state slab_state;
+
+/* The slab cache mutex protects the management structures during changes */
+extern struct mutex slab_mutex;
+
+/* The list of all slab caches on the system */
+extern struct list_head slab_caches;
+
+/* The slab cache that manages slab cache information */
+extern struct kmem_cache *kmem_cache;
+
+/* A table of kmalloc cache names and sizes */
+extern const struct kmalloc_info_struct {
+ const char *name[NR_KMALLOC_TYPES];
+ unsigned int size;
+} kmalloc_info[];
+
+#ifndef CONFIG_SLOB
+/* Kmalloc array related functions */
+void setup_kmalloc_cache_index_table(void);
+void create_kmalloc_caches(slab_flags_t);
+
+/* Find the kmalloc slab corresponding for a certain size */
+struct kmem_cache *kmalloc_slab(size_t, gfp_t);
+#endif
+
+gfp_t kmalloc_fix_flags(gfp_t flags);
+
+/* Functions provided by the slab allocators */
+int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags);
+
+struct kmem_cache *create_kmalloc_cache(const char *name, unsigned int size,
+ slab_flags_t flags, unsigned int useroffset,
+ unsigned int usersize);
+extern void create_boot_cache(struct kmem_cache *, const char *name,
+ unsigned int size, slab_flags_t flags,
+ unsigned int useroffset, unsigned int usersize);
+
+int slab_unmergeable(struct kmem_cache *s);
+struct kmem_cache *find_mergeable(unsigned size, unsigned align,
+ slab_flags_t flags, const char *name, void (*ctor)(void *));
+#ifndef CONFIG_SLOB
+struct kmem_cache *
+__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
+ slab_flags_t flags, void (*ctor)(void *));
+
+slab_flags_t kmem_cache_flags(unsigned int object_size,
+ slab_flags_t flags, const char *name);
+#else
+static inline struct kmem_cache *
+__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
+ slab_flags_t flags, void (*ctor)(void *))
+{ return NULL; }
+
+static inline slab_flags_t kmem_cache_flags(unsigned int object_size,
+ slab_flags_t flags, const char *name)
+{
+ return flags;
+}
+#endif
+
+
+/* Legal flag mask for kmem_cache_create(), for various configurations */
+#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
+ SLAB_CACHE_DMA32 | SLAB_PANIC | \
+ SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS )
+
+#if defined(CONFIG_DEBUG_SLAB)
+#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
+#elif defined(CONFIG_SLUB_DEBUG)
+#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
+ SLAB_TRACE | SLAB_CONSISTENCY_CHECKS)
+#else
+#define SLAB_DEBUG_FLAGS (0)
+#endif
+
+#if defined(CONFIG_SLAB)
+#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \
+ SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \
+ SLAB_ACCOUNT)
+#elif defined(CONFIG_SLUB)
+#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
+ SLAB_TEMPORARY | SLAB_ACCOUNT)
+#else
+#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE)
+#endif
+
+/* Common flags available with current configuration */
+#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
+
+/* Common flags permitted for kmem_cache_create */
+#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | \
+ SLAB_RED_ZONE | \
+ SLAB_POISON | \
+ SLAB_STORE_USER | \
+ SLAB_TRACE | \
+ SLAB_CONSISTENCY_CHECKS | \
+ SLAB_MEM_SPREAD | \
+ SLAB_NOLEAKTRACE | \
+ SLAB_RECLAIM_ACCOUNT | \
+ SLAB_TEMPORARY | \
+ SLAB_ACCOUNT)
+
+bool __kmem_cache_empty(struct kmem_cache *);
+int __kmem_cache_shutdown(struct kmem_cache *);
+void __kmem_cache_release(struct kmem_cache *);
+int __kmem_cache_shrink(struct kmem_cache *);
+void slab_kmem_cache_release(struct kmem_cache *);
+
+struct seq_file;
+struct file;
+
+struct slabinfo {
+ unsigned long active_objs;
+ unsigned long num_objs;
+ unsigned long active_slabs;
+ unsigned long num_slabs;
+ unsigned long shared_avail;
+ unsigned int limit;
+ unsigned int batchcount;
+ unsigned int shared;
+ unsigned int objects_per_slab;
+ unsigned int cache_order;
+};
+
+void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo);
+void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s);
+ssize_t slabinfo_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos);
+
+/*
+ * Generic implementation of bulk operations
+ * These are useful for situations in which the allocator cannot
+ * perform optimizations. In that case segments of the object listed
+ * may be allocated or freed using these operations.
+ */
+void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
+int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
+
+static inline int cache_vmstat_idx(struct kmem_cache *s)
+{
+ return (s->flags & SLAB_RECLAIM_ACCOUNT) ?
+ NR_SLAB_RECLAIMABLE_B : NR_SLAB_UNRECLAIMABLE_B;
+}
+
+#ifdef CONFIG_SLUB_DEBUG
+#ifdef CONFIG_SLUB_DEBUG_ON
+DECLARE_STATIC_KEY_TRUE(slub_debug_enabled);
+#else
+DECLARE_STATIC_KEY_FALSE(slub_debug_enabled);
+#endif
+extern void print_tracking(struct kmem_cache *s, void *object);
+#else
+static inline void print_tracking(struct kmem_cache *s, void *object)
+{
+}
+#endif
+
+/*
+ * Returns true if any of the specified slub_debug flags is enabled for the
+ * cache. Use only for flags parsed by setup_slub_debug() as it also enables
+ * the static key.
+ */
+static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t flags)
+{
+#ifdef CONFIG_SLUB_DEBUG
+ VM_WARN_ON_ONCE(!(flags & SLAB_DEBUG_FLAGS));
+ if (static_branch_unlikely(&slub_debug_enabled))
+ return s->flags & flags;
+#endif
+ return false;
+}
+
+#ifdef CONFIG_MEMCG_KMEM
+static inline struct obj_cgroup **page_obj_cgroups(struct page *page)
+{
+ /*
+ * page->mem_cgroup and page->obj_cgroups are sharing the same
+ * space. To distinguish between them in case we don't know for sure
+ * that the page is a slab page (e.g. page_cgroup_ino()), let's
+ * always set the lowest bit of obj_cgroups.
+ */
+ return (struct obj_cgroup **)
+ ((unsigned long)page->obj_cgroups & ~0x1UL);
+}
+
+static inline bool page_has_obj_cgroups(struct page *page)
+{
+ return ((unsigned long)page->obj_cgroups & 0x1UL);
+}
+
+int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
+ gfp_t gfp);
+
+static inline void memcg_free_page_obj_cgroups(struct page *page)
+{
+ kfree(page_obj_cgroups(page));
+ page->obj_cgroups = NULL;
+}
+
+static inline size_t obj_full_size(struct kmem_cache *s)
+{
+ /*
+ * For each accounted object there is an extra space which is used
+ * to store obj_cgroup membership. Charge it too.
+ */
+ return s->size + sizeof(struct obj_cgroup *);
+}
+
+/*
+ * Returns false if the allocation should fail.
+ */
+static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s,
+ struct obj_cgroup **objcgp,
+ size_t objects, gfp_t flags)
+{
+ struct obj_cgroup *objcg;
+
+ if (!memcg_kmem_enabled())
+ return true;
+
+ if (!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT))
+ return true;
+
+ objcg = get_obj_cgroup_from_current();
+ if (!objcg)
+ return true;
+
+ if (obj_cgroup_charge(objcg, flags, objects * obj_full_size(s))) {
+ obj_cgroup_put(objcg);
+ return false;
+ }
+
+ *objcgp = objcg;
+ return true;
+}
+
+static inline void mod_objcg_state(struct obj_cgroup *objcg,
+ struct pglist_data *pgdat,
+ int idx, int nr)
+{
+ struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
+
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(objcg);
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ mod_memcg_lruvec_state(lruvec, idx, nr);
+ rcu_read_unlock();
+}
+
+static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
+ struct obj_cgroup *objcg,
+ gfp_t flags, size_t size,
+ void **p)
+{
+ struct page *page;
+ unsigned long off;
+ size_t i;
+
+ if (!memcg_kmem_enabled() || !objcg)
+ return;
+
+ for (i = 0; i < size; i++) {
+ if (likely(p[i])) {
+ page = virt_to_head_page(p[i]);
+
+ if (!page_has_obj_cgroups(page) &&
+ memcg_alloc_page_obj_cgroups(page, s, flags)) {
+ obj_cgroup_uncharge(objcg, obj_full_size(s));
+ continue;
+ }
+
+ off = obj_to_index(s, page, p[i]);
+ obj_cgroup_get(objcg);
+ page_obj_cgroups(page)[off] = objcg;
+ mod_objcg_state(objcg, page_pgdat(page),
+ cache_vmstat_idx(s), obj_full_size(s));
+ } else {
+ obj_cgroup_uncharge(objcg, obj_full_size(s));
+ }
+ }
+ obj_cgroup_put(objcg);
+}
+
+static inline void memcg_slab_free_hook(struct kmem_cache *s_orig,
+ void **p, int objects)
+{
+ struct kmem_cache *s;
+ struct obj_cgroup *objcg;
+ struct page *page;
+ unsigned int off;
+ int i;
+
+ if (!memcg_kmem_enabled())
+ return;
+
+ for (i = 0; i < objects; i++) {
+ if (unlikely(!p[i]))
+ continue;
+
+ page = virt_to_head_page(p[i]);
+ if (!page_has_obj_cgroups(page))
+ continue;
+
+ if (!s_orig)
+ s = page->slab_cache;
+ else
+ s = s_orig;
+
+ off = obj_to_index(s, page, p[i]);
+ objcg = page_obj_cgroups(page)[off];
+ if (!objcg)
+ continue;
+
+ page_obj_cgroups(page)[off] = NULL;
+ obj_cgroup_uncharge(objcg, obj_full_size(s));
+ mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s),
+ -obj_full_size(s));
+ obj_cgroup_put(objcg);
+ }
+}
+
+#else /* CONFIG_MEMCG_KMEM */
+static inline bool page_has_obj_cgroups(struct page *page)
+{
+ return false;
+}
+
+static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr)
+{
+ return NULL;
+}
+
+static inline int memcg_alloc_page_obj_cgroups(struct page *page,
+ struct kmem_cache *s, gfp_t gfp)
+{
+ return 0;
+}
+
+static inline void memcg_free_page_obj_cgroups(struct page *page)
+{
+}
+
+static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s,
+ struct obj_cgroup **objcgp,
+ size_t objects, gfp_t flags)
+{
+ return true;
+}
+
+static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
+ struct obj_cgroup *objcg,
+ gfp_t flags, size_t size,
+ void **p)
+{
+}
+
+static inline void memcg_slab_free_hook(struct kmem_cache *s,
+ void **p, int objects)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
+static inline struct kmem_cache *virt_to_cache(const void *obj)
+{
+ struct page *page;
+
+ page = virt_to_head_page(obj);
+ if (WARN_ONCE(!PageSlab(page), "%s: Object is not a Slab page!\n",
+ __func__))
+ return NULL;
+ return page->slab_cache;
+}
+
+static __always_inline void account_slab_page(struct page *page, int order,
+ struct kmem_cache *s)
+{
+ mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
+ PAGE_SIZE << order);
+}
+
+static __always_inline void unaccount_slab_page(struct page *page, int order,
+ struct kmem_cache *s)
+{
+ if (memcg_kmem_enabled())
+ memcg_free_page_obj_cgroups(page);
+
+ mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
+ -(PAGE_SIZE << order));
+}
+
+static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
+{
+ struct kmem_cache *cachep;
+
+ if (!IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) &&
+ !kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS))
+ return s;
+
+ cachep = virt_to_cache(x);
+ if (WARN(cachep && cachep != s,
+ "%s: Wrong slab cache. %s but object is from %s\n",
+ __func__, s->name, cachep->name))
+ print_tracking(cachep, x);
+ return cachep;
+}
+
+static inline size_t slab_ksize(const struct kmem_cache *s)
+{
+#ifndef CONFIG_SLUB
+ return s->object_size;
+
+#else /* CONFIG_SLUB */
+# ifdef CONFIG_SLUB_DEBUG
+ /*
+ * Debugging requires use of the padding between object
+ * and whatever may come after it.
+ */
+ if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
+ return s->object_size;
+# endif
+ if (s->flags & SLAB_KASAN)
+ return s->object_size;
+ /*
+ * If we have the need to store the freelist pointer
+ * back there or track user information then we can
+ * only use the space before that information.
+ */
+ if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER))
+ return s->inuse;
+ /*
+ * Else we can use all the padding etc for the allocation
+ */
+ return s->size;
+#endif
+}
+
+static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
+ struct obj_cgroup **objcgp,
+ size_t size, gfp_t flags)
+{
+ flags &= gfp_allowed_mask;
+
+ fs_reclaim_acquire(flags);
+ fs_reclaim_release(flags);
+
+ might_sleep_if(gfpflags_allow_blocking(flags));
+
+ if (should_failslab(s, flags))
+ return NULL;
+
+ if (!memcg_slab_pre_alloc_hook(s, objcgp, size, flags))
+ return NULL;
+
+ return s;
+}
+
+static inline void slab_post_alloc_hook(struct kmem_cache *s,
+ struct obj_cgroup *objcg,
+ gfp_t flags, size_t size, void **p)
+{
+ size_t i;
+
+ flags &= gfp_allowed_mask;
+ for (i = 0; i < size; i++) {
+ p[i] = kasan_slab_alloc(s, p[i], flags);
+ /* As p[i] might get tagged, call kmemleak hook after KASAN. */
+ kmemleak_alloc_recursive(p[i], s->object_size, 1,
+ s->flags, flags);
+ }
+
+ memcg_slab_post_alloc_hook(s, objcg, flags, size, p);
+}
+
+#ifndef CONFIG_SLOB
+/*
+ * The slab lists for all objects.
+ */
+struct kmem_cache_node {
+ spinlock_t list_lock;
+
+#ifdef CONFIG_SLAB
+ struct list_head slabs_partial; /* partial list first, better asm code */
+ struct list_head slabs_full;
+ struct list_head slabs_free;
+ unsigned long total_slabs; /* length of all slab lists */
+ unsigned long free_slabs; /* length of free slab list only */
+ unsigned long free_objects;
+ unsigned int free_limit;
+ unsigned int colour_next; /* Per-node cache coloring */
+ struct array_cache *shared; /* shared per node */
+ struct alien_cache **alien; /* on other nodes */
+ unsigned long next_reap; /* updated without locking */
+ int free_touched; /* updated without locking */
+#endif
+
+#ifdef CONFIG_SLUB
+ unsigned long nr_partial;
+ struct list_head partial;
+#ifdef CONFIG_SLUB_DEBUG
+ atomic_long_t nr_slabs;
+ atomic_long_t total_objects;
+ struct list_head full;
+#endif
+#endif
+
+};
+
+static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
+{
+ return s->node[node];
+}
+
+/*
+ * Iterator over all nodes. The body will be executed for each node that has
+ * a kmem_cache_node structure allocated (which is true for all online nodes)
+ */
+#define for_each_kmem_cache_node(__s, __node, __n) \
+ for (__node = 0; __node < nr_node_ids; __node++) \
+ if ((__n = get_node(__s, __node)))
+
+#endif
+
+void *slab_start(struct seq_file *m, loff_t *pos);
+void *slab_next(struct seq_file *m, void *p, loff_t *pos);
+void slab_stop(struct seq_file *m, void *p);
+int memcg_slab_show(struct seq_file *m, void *p);
+
+#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
+void dump_unreclaimable_slab(void);
+#else
+static inline void dump_unreclaimable_slab(void)
+{
+}
+#endif
+
+void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr);
+
+#ifdef CONFIG_SLAB_FREELIST_RANDOM
+int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count,
+ gfp_t gfp);
+void cache_random_seq_destroy(struct kmem_cache *cachep);
+#else
+static inline int cache_random_seq_create(struct kmem_cache *cachep,
+ unsigned int count, gfp_t gfp)
+{
+ return 0;
+}
+static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { }
+#endif /* CONFIG_SLAB_FREELIST_RANDOM */
+
+static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c)
+{
+ if (static_branch_unlikely(&init_on_alloc)) {
+ if (c->ctor)
+ return false;
+ if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))
+ return flags & __GFP_ZERO;
+ return true;
+ }
+ return flags & __GFP_ZERO;
+}
+
+static inline bool slab_want_init_on_free(struct kmem_cache *c)
+{
+ if (static_branch_unlikely(&init_on_free))
+ return !(c->ctor ||
+ (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)));
+ return false;
+}
+
+#endif /* MM_SLAB_H */
diff --git a/mm/slab_common.c b/mm/slab_common.c
new file mode 100644
index 000000000..ec832904f
--- /dev/null
+++ b/mm/slab_common.c
@@ -0,0 +1,1197 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Slab allocator functions that are independent of the allocator strategy
+ *
+ * (C) 2012 Christoph Lameter <cl@linux.com>
+ */
+#include <linux/slab.h>
+
+#include <linux/mm.h>
+#include <linux/poison.h>
+#include <linux/interrupt.h>
+#include <linux/memory.h>
+#include <linux/cache.h>
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <linux/cpu.h>
+#include <linux/uaccess.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include <linux/debugfs.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/page.h>
+#include <linux/memcontrol.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/kmem.h>
+
+#include "internal.h"
+
+#include "slab.h"
+
+enum slab_state slab_state;
+LIST_HEAD(slab_caches);
+DEFINE_MUTEX(slab_mutex);
+struct kmem_cache *kmem_cache;
+
+#ifdef CONFIG_HARDENED_USERCOPY
+bool usercopy_fallback __ro_after_init =
+ IS_ENABLED(CONFIG_HARDENED_USERCOPY_FALLBACK);
+module_param(usercopy_fallback, bool, 0400);
+MODULE_PARM_DESC(usercopy_fallback,
+ "WARN instead of reject usercopy whitelist violations");
+#endif
+
+static LIST_HEAD(slab_caches_to_rcu_destroy);
+static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work);
+static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
+ slab_caches_to_rcu_destroy_workfn);
+
+/*
+ * Set of flags that will prevent slab merging
+ */
+#define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
+ SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \
+ SLAB_FAILSLAB | SLAB_KASAN)
+
+#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
+ SLAB_CACHE_DMA32 | SLAB_ACCOUNT)
+
+/*
+ * Merge control. If this is set then no merging of slab caches will occur.
+ */
+static bool slab_nomerge = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT);
+
+static int __init setup_slab_nomerge(char *str)
+{
+ slab_nomerge = true;
+ return 1;
+}
+
+#ifdef CONFIG_SLUB
+__setup_param("slub_nomerge", slub_nomerge, setup_slab_nomerge, 0);
+#endif
+
+__setup("slab_nomerge", setup_slab_nomerge);
+
+/*
+ * Determine the size of a slab object
+ */
+unsigned int kmem_cache_size(struct kmem_cache *s)
+{
+ return s->object_size;
+}
+EXPORT_SYMBOL(kmem_cache_size);
+
+#ifdef CONFIG_DEBUG_VM
+static int kmem_cache_sanity_check(const char *name, unsigned int size)
+{
+ if (!name || in_interrupt() || size > KMALLOC_MAX_SIZE) {
+ pr_err("kmem_cache_create(%s) integrity check failed\n", name);
+ return -EINVAL;
+ }
+
+ WARN_ON(strchr(name, ' ')); /* It confuses parsers */
+ return 0;
+}
+#else
+static inline int kmem_cache_sanity_check(const char *name, unsigned int size)
+{
+ return 0;
+}
+#endif
+
+void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p)
+{
+ size_t i;
+
+ for (i = 0; i < nr; i++) {
+ if (s)
+ kmem_cache_free(s, p[i]);
+ else
+ kfree(p[i]);
+ }
+}
+
+int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
+ void **p)
+{
+ size_t i;
+
+ for (i = 0; i < nr; i++) {
+ void *x = p[i] = kmem_cache_alloc(s, flags);
+ if (!x) {
+ __kmem_cache_free_bulk(s, i, p);
+ return 0;
+ }
+ }
+ return i;
+}
+
+/*
+ * Figure out what the alignment of the objects will be given a set of
+ * flags, a user specified alignment and the size of the objects.
+ */
+static unsigned int calculate_alignment(slab_flags_t flags,
+ unsigned int align, unsigned int size)
+{
+ /*
+ * If the user wants hardware cache aligned objects then follow that
+ * suggestion if the object is sufficiently large.
+ *
+ * The hardware cache alignment cannot override the specified
+ * alignment though. If that is greater then use it.
+ */
+ if (flags & SLAB_HWCACHE_ALIGN) {
+ unsigned int ralign;
+
+ ralign = cache_line_size();
+ while (size <= ralign / 2)
+ ralign /= 2;
+ align = max(align, ralign);
+ }
+
+ if (align < ARCH_SLAB_MINALIGN)
+ align = ARCH_SLAB_MINALIGN;
+
+ return ALIGN(align, sizeof(void *));
+}
+
+/*
+ * Find a mergeable slab cache
+ */
+int slab_unmergeable(struct kmem_cache *s)
+{
+ if (slab_nomerge || (s->flags & SLAB_NEVER_MERGE))
+ return 1;
+
+ if (s->ctor)
+ return 1;
+
+ if (s->usersize)
+ return 1;
+
+ /*
+ * We may have set a slab to be unmergeable during bootstrap.
+ */
+ if (s->refcount < 0)
+ return 1;
+
+ return 0;
+}
+
+struct kmem_cache *find_mergeable(unsigned int size, unsigned int align,
+ slab_flags_t flags, const char *name, void (*ctor)(void *))
+{
+ struct kmem_cache *s;
+
+ if (slab_nomerge)
+ return NULL;
+
+ if (ctor)
+ return NULL;
+
+ size = ALIGN(size, sizeof(void *));
+ align = calculate_alignment(flags, align, size);
+ size = ALIGN(size, align);
+ flags = kmem_cache_flags(size, flags, name);
+
+ if (flags & SLAB_NEVER_MERGE)
+ return NULL;
+
+ list_for_each_entry_reverse(s, &slab_caches, list) {
+ if (slab_unmergeable(s))
+ continue;
+
+ if (size > s->size)
+ continue;
+
+ if ((flags & SLAB_MERGE_SAME) != (s->flags & SLAB_MERGE_SAME))
+ continue;
+ /*
+ * Check if alignment is compatible.
+ * Courtesy of Adrian Drzewiecki
+ */
+ if ((s->size & ~(align - 1)) != s->size)
+ continue;
+
+ if (s->size - size >= sizeof(void *))
+ continue;
+
+ if (IS_ENABLED(CONFIG_SLAB) && align &&
+ (align > s->align || s->align % align))
+ continue;
+
+ return s;
+ }
+ return NULL;
+}
+
+static struct kmem_cache *create_cache(const char *name,
+ unsigned int object_size, unsigned int align,
+ slab_flags_t flags, unsigned int useroffset,
+ unsigned int usersize, void (*ctor)(void *),
+ struct kmem_cache *root_cache)
+{
+ struct kmem_cache *s;
+ int err;
+
+ if (WARN_ON(useroffset + usersize > object_size))
+ useroffset = usersize = 0;
+
+ err = -ENOMEM;
+ s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
+ if (!s)
+ goto out;
+
+ s->name = name;
+ s->size = s->object_size = object_size;
+ s->align = align;
+ s->ctor = ctor;
+ s->useroffset = useroffset;
+ s->usersize = usersize;
+
+ err = __kmem_cache_create(s, flags);
+ if (err)
+ goto out_free_cache;
+
+ s->refcount = 1;
+ list_add(&s->list, &slab_caches);
+out:
+ if (err)
+ return ERR_PTR(err);
+ return s;
+
+out_free_cache:
+ kmem_cache_free(kmem_cache, s);
+ goto out;
+}
+
+/**
+ * kmem_cache_create_usercopy - Create a cache with a region suitable
+ * for copying to userspace
+ * @name: A string which is used in /proc/slabinfo to identify this cache.
+ * @size: The size of objects to be created in this cache.
+ * @align: The required alignment for the objects.
+ * @flags: SLAB flags
+ * @useroffset: Usercopy region offset
+ * @usersize: Usercopy region size
+ * @ctor: A constructor for the objects.
+ *
+ * Cannot be called within a interrupt, but can be interrupted.
+ * The @ctor is run when new pages are allocated by the cache.
+ *
+ * The flags are
+ *
+ * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
+ * to catch references to uninitialised memory.
+ *
+ * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check
+ * for buffer overruns.
+ *
+ * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
+ * cacheline. This can be beneficial if you're counting cycles as closely
+ * as davem.
+ *
+ * Return: a pointer to the cache on success, NULL on failure.
+ */
+struct kmem_cache *
+kmem_cache_create_usercopy(const char *name,
+ unsigned int size, unsigned int align,
+ slab_flags_t flags,
+ unsigned int useroffset, unsigned int usersize,
+ void (*ctor)(void *))
+{
+ struct kmem_cache *s = NULL;
+ const char *cache_name;
+ int err;
+
+ get_online_cpus();
+ get_online_mems();
+
+ mutex_lock(&slab_mutex);
+
+ err = kmem_cache_sanity_check(name, size);
+ if (err) {
+ goto out_unlock;
+ }
+
+ /* Refuse requests with allocator specific flags */
+ if (flags & ~SLAB_FLAGS_PERMITTED) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
+
+ /*
+ * Some allocators will constraint the set of valid flags to a subset
+ * of all flags. We expect them to define CACHE_CREATE_MASK in this
+ * case, and we'll just provide them with a sanitized version of the
+ * passed flags.
+ */
+ flags &= CACHE_CREATE_MASK;
+
+ /* Fail closed on bad usersize of useroffset values. */
+ if (WARN_ON(!usersize && useroffset) ||
+ WARN_ON(size < usersize || size - usersize < useroffset))
+ usersize = useroffset = 0;
+
+ if (!usersize)
+ s = __kmem_cache_alias(name, size, align, flags, ctor);
+ if (s)
+ goto out_unlock;
+
+ cache_name = kstrdup_const(name, GFP_KERNEL);
+ if (!cache_name) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ s = create_cache(cache_name, size,
+ calculate_alignment(flags, align, size),
+ flags, useroffset, usersize, ctor, NULL);
+ if (IS_ERR(s)) {
+ err = PTR_ERR(s);
+ kfree_const(cache_name);
+ }
+
+out_unlock:
+ mutex_unlock(&slab_mutex);
+
+ put_online_mems();
+ put_online_cpus();
+
+ if (err) {
+ if (flags & SLAB_PANIC)
+ panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
+ name, err);
+ else {
+ pr_warn("kmem_cache_create(%s) failed with error %d\n",
+ name, err);
+ dump_stack();
+ }
+ return NULL;
+ }
+ return s;
+}
+EXPORT_SYMBOL(kmem_cache_create_usercopy);
+
+/**
+ * kmem_cache_create - Create a cache.
+ * @name: A string which is used in /proc/slabinfo to identify this cache.
+ * @size: The size of objects to be created in this cache.
+ * @align: The required alignment for the objects.
+ * @flags: SLAB flags
+ * @ctor: A constructor for the objects.
+ *
+ * Cannot be called within a interrupt, but can be interrupted.
+ * The @ctor is run when new pages are allocated by the cache.
+ *
+ * The flags are
+ *
+ * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
+ * to catch references to uninitialised memory.
+ *
+ * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check
+ * for buffer overruns.
+ *
+ * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
+ * cacheline. This can be beneficial if you're counting cycles as closely
+ * as davem.
+ *
+ * Return: a pointer to the cache on success, NULL on failure.
+ */
+struct kmem_cache *
+kmem_cache_create(const char *name, unsigned int size, unsigned int align,
+ slab_flags_t flags, void (*ctor)(void *))
+{
+ return kmem_cache_create_usercopy(name, size, align, flags, 0, 0,
+ ctor);
+}
+EXPORT_SYMBOL(kmem_cache_create);
+
+static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work)
+{
+ LIST_HEAD(to_destroy);
+ struct kmem_cache *s, *s2;
+
+ /*
+ * On destruction, SLAB_TYPESAFE_BY_RCU kmem_caches are put on the
+ * @slab_caches_to_rcu_destroy list. The slab pages are freed
+ * through RCU and the associated kmem_cache are dereferenced
+ * while freeing the pages, so the kmem_caches should be freed only
+ * after the pending RCU operations are finished. As rcu_barrier()
+ * is a pretty slow operation, we batch all pending destructions
+ * asynchronously.
+ */
+ mutex_lock(&slab_mutex);
+ list_splice_init(&slab_caches_to_rcu_destroy, &to_destroy);
+ mutex_unlock(&slab_mutex);
+
+ if (list_empty(&to_destroy))
+ return;
+
+ rcu_barrier();
+
+ list_for_each_entry_safe(s, s2, &to_destroy, list) {
+#ifdef SLAB_SUPPORTS_SYSFS
+ sysfs_slab_release(s);
+#else
+ slab_kmem_cache_release(s);
+#endif
+ }
+}
+
+static int shutdown_cache(struct kmem_cache *s)
+{
+ /* free asan quarantined objects */
+ kasan_cache_shutdown(s);
+
+ if (__kmem_cache_shutdown(s) != 0)
+ return -EBUSY;
+
+ list_del(&s->list);
+
+ if (s->flags & SLAB_TYPESAFE_BY_RCU) {
+#ifdef SLAB_SUPPORTS_SYSFS
+ sysfs_slab_unlink(s);
+#endif
+ list_add_tail(&s->list, &slab_caches_to_rcu_destroy);
+ schedule_work(&slab_caches_to_rcu_destroy_work);
+ } else {
+#ifdef SLAB_SUPPORTS_SYSFS
+ sysfs_slab_unlink(s);
+ sysfs_slab_release(s);
+#else
+ slab_kmem_cache_release(s);
+#endif
+ }
+
+ return 0;
+}
+
+void slab_kmem_cache_release(struct kmem_cache *s)
+{
+ __kmem_cache_release(s);
+ kfree_const(s->name);
+ kmem_cache_free(kmem_cache, s);
+}
+
+void kmem_cache_destroy(struct kmem_cache *s)
+{
+ int err;
+
+ if (unlikely(!s))
+ return;
+
+ get_online_cpus();
+ get_online_mems();
+
+ mutex_lock(&slab_mutex);
+
+ s->refcount--;
+ if (s->refcount)
+ goto out_unlock;
+
+ err = shutdown_cache(s);
+ if (err) {
+ pr_err("kmem_cache_destroy %s: Slab cache still has objects\n",
+ s->name);
+ dump_stack();
+ }
+out_unlock:
+ mutex_unlock(&slab_mutex);
+
+ put_online_mems();
+ put_online_cpus();
+}
+EXPORT_SYMBOL(kmem_cache_destroy);
+
+/**
+ * kmem_cache_shrink - Shrink a cache.
+ * @cachep: The cache to shrink.
+ *
+ * Releases as many slabs as possible for a cache.
+ * To help debugging, a zero exit status indicates all slabs were released.
+ *
+ * Return: %0 if all slabs were released, non-zero otherwise
+ */
+int kmem_cache_shrink(struct kmem_cache *cachep)
+{
+ int ret;
+
+ get_online_cpus();
+ get_online_mems();
+ kasan_cache_shrink(cachep);
+ ret = __kmem_cache_shrink(cachep);
+ put_online_mems();
+ put_online_cpus();
+ return ret;
+}
+EXPORT_SYMBOL(kmem_cache_shrink);
+
+bool slab_is_available(void)
+{
+ return slab_state >= UP;
+}
+
+#ifndef CONFIG_SLOB
+/* Create a cache during boot when no slab services are available yet */
+void __init create_boot_cache(struct kmem_cache *s, const char *name,
+ unsigned int size, slab_flags_t flags,
+ unsigned int useroffset, unsigned int usersize)
+{
+ int err;
+ unsigned int align = ARCH_KMALLOC_MINALIGN;
+
+ s->name = name;
+ s->size = s->object_size = size;
+
+ /*
+ * For power of two sizes, guarantee natural alignment for kmalloc
+ * caches, regardless of SL*B debugging options.
+ */
+ if (is_power_of_2(size))
+ align = max(align, size);
+ s->align = calculate_alignment(flags, align, size);
+
+ s->useroffset = useroffset;
+ s->usersize = usersize;
+
+ err = __kmem_cache_create(s, flags);
+
+ if (err)
+ panic("Creation of kmalloc slab %s size=%u failed. Reason %d\n",
+ name, size, err);
+
+ s->refcount = -1; /* Exempt from merging for now */
+}
+
+struct kmem_cache *__init create_kmalloc_cache(const char *name,
+ unsigned int size, slab_flags_t flags,
+ unsigned int useroffset, unsigned int usersize)
+{
+ struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
+
+ if (!s)
+ panic("Out of memory when creating slab %s\n", name);
+
+ create_boot_cache(s, name, size, flags, useroffset, usersize);
+ list_add(&s->list, &slab_caches);
+ s->refcount = 1;
+ return s;
+}
+
+struct kmem_cache *
+kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init =
+{ /* initialization for https://bugs.llvm.org/show_bug.cgi?id=42570 */ };
+EXPORT_SYMBOL(kmalloc_caches);
+
+/*
+ * Conversion table for small slabs sizes / 8 to the index in the
+ * kmalloc array. This is necessary for slabs < 192 since we have non power
+ * of two cache sizes there. The size of larger slabs can be determined using
+ * fls.
+ */
+static u8 size_index[24] __ro_after_init = {
+ 3, /* 8 */
+ 4, /* 16 */
+ 5, /* 24 */
+ 5, /* 32 */
+ 6, /* 40 */
+ 6, /* 48 */
+ 6, /* 56 */
+ 6, /* 64 */
+ 1, /* 72 */
+ 1, /* 80 */
+ 1, /* 88 */
+ 1, /* 96 */
+ 7, /* 104 */
+ 7, /* 112 */
+ 7, /* 120 */
+ 7, /* 128 */
+ 2, /* 136 */
+ 2, /* 144 */
+ 2, /* 152 */
+ 2, /* 160 */
+ 2, /* 168 */
+ 2, /* 176 */
+ 2, /* 184 */
+ 2 /* 192 */
+};
+
+static inline unsigned int size_index_elem(unsigned int bytes)
+{
+ return (bytes - 1) / 8;
+}
+
+/*
+ * Find the kmem_cache structure that serves a given size of
+ * allocation
+ */
+struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
+{
+ unsigned int index;
+
+ if (size <= 192) {
+ if (!size)
+ return ZERO_SIZE_PTR;
+
+ index = size_index[size_index_elem(size)];
+ } else {
+ if (WARN_ON_ONCE(size > KMALLOC_MAX_CACHE_SIZE))
+ return NULL;
+ index = fls(size - 1);
+ }
+
+ return kmalloc_caches[kmalloc_type(flags)][index];
+}
+
+#ifdef CONFIG_ZONE_DMA
+#define INIT_KMALLOC_INFO(__size, __short_size) \
+{ \
+ .name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \
+ .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size, \
+ .name[KMALLOC_DMA] = "dma-kmalloc-" #__short_size, \
+ .size = __size, \
+}
+#else
+#define INIT_KMALLOC_INFO(__size, __short_size) \
+{ \
+ .name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \
+ .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size, \
+ .size = __size, \
+}
+#endif
+
+/*
+ * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time.
+ * kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is
+ * kmalloc-67108864.
+ */
+const struct kmalloc_info_struct kmalloc_info[] __initconst = {
+ INIT_KMALLOC_INFO(0, 0),
+ INIT_KMALLOC_INFO(96, 96),
+ INIT_KMALLOC_INFO(192, 192),
+ INIT_KMALLOC_INFO(8, 8),
+ INIT_KMALLOC_INFO(16, 16),
+ INIT_KMALLOC_INFO(32, 32),
+ INIT_KMALLOC_INFO(64, 64),
+ INIT_KMALLOC_INFO(128, 128),
+ INIT_KMALLOC_INFO(256, 256),
+ INIT_KMALLOC_INFO(512, 512),
+ INIT_KMALLOC_INFO(1024, 1k),
+ INIT_KMALLOC_INFO(2048, 2k),
+ INIT_KMALLOC_INFO(4096, 4k),
+ INIT_KMALLOC_INFO(8192, 8k),
+ INIT_KMALLOC_INFO(16384, 16k),
+ INIT_KMALLOC_INFO(32768, 32k),
+ INIT_KMALLOC_INFO(65536, 64k),
+ INIT_KMALLOC_INFO(131072, 128k),
+ INIT_KMALLOC_INFO(262144, 256k),
+ INIT_KMALLOC_INFO(524288, 512k),
+ INIT_KMALLOC_INFO(1048576, 1M),
+ INIT_KMALLOC_INFO(2097152, 2M),
+ INIT_KMALLOC_INFO(4194304, 4M),
+ INIT_KMALLOC_INFO(8388608, 8M),
+ INIT_KMALLOC_INFO(16777216, 16M),
+ INIT_KMALLOC_INFO(33554432, 32M),
+ INIT_KMALLOC_INFO(67108864, 64M)
+};
+
+/*
+ * Patch up the size_index table if we have strange large alignment
+ * requirements for the kmalloc array. This is only the case for
+ * MIPS it seems. The standard arches will not generate any code here.
+ *
+ * Largest permitted alignment is 256 bytes due to the way we
+ * handle the index determination for the smaller caches.
+ *
+ * Make sure that nothing crazy happens if someone starts tinkering
+ * around with ARCH_KMALLOC_MINALIGN
+ */
+void __init setup_kmalloc_cache_index_table(void)
+{
+ unsigned int i;
+
+ BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
+ (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
+
+ for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
+ unsigned int elem = size_index_elem(i);
+
+ if (elem >= ARRAY_SIZE(size_index))
+ break;
+ size_index[elem] = KMALLOC_SHIFT_LOW;
+ }
+
+ if (KMALLOC_MIN_SIZE >= 64) {
+ /*
+ * The 96 byte size cache is not used if the alignment
+ * is 64 byte.
+ */
+ for (i = 64 + 8; i <= 96; i += 8)
+ size_index[size_index_elem(i)] = 7;
+
+ }
+
+ if (KMALLOC_MIN_SIZE >= 128) {
+ /*
+ * The 192 byte sized cache is not used if the alignment
+ * is 128 byte. Redirect kmalloc to use the 256 byte cache
+ * instead.
+ */
+ for (i = 128 + 8; i <= 192; i += 8)
+ size_index[size_index_elem(i)] = 8;
+ }
+}
+
+static void __init
+new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags)
+{
+ if (type == KMALLOC_RECLAIM)
+ flags |= SLAB_RECLAIM_ACCOUNT;
+
+ kmalloc_caches[type][idx] = create_kmalloc_cache(
+ kmalloc_info[idx].name[type],
+ kmalloc_info[idx].size, flags, 0,
+ kmalloc_info[idx].size);
+}
+
+/*
+ * Create the kmalloc array. Some of the regular kmalloc arrays
+ * may already have been created because they were needed to
+ * enable allocations for slab creation.
+ */
+void __init create_kmalloc_caches(slab_flags_t flags)
+{
+ int i;
+ enum kmalloc_cache_type type;
+
+ for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) {
+ for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
+ if (!kmalloc_caches[type][i])
+ new_kmalloc_cache(i, type, flags);
+
+ /*
+ * Caches that are not of the two-to-the-power-of size.
+ * These have to be created immediately after the
+ * earlier power of two caches
+ */
+ if (KMALLOC_MIN_SIZE <= 32 && i == 6 &&
+ !kmalloc_caches[type][1])
+ new_kmalloc_cache(1, type, flags);
+ if (KMALLOC_MIN_SIZE <= 64 && i == 7 &&
+ !kmalloc_caches[type][2])
+ new_kmalloc_cache(2, type, flags);
+ }
+ }
+
+ /* Kmalloc array is now usable */
+ slab_state = UP;
+
+#ifdef CONFIG_ZONE_DMA
+ for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
+ struct kmem_cache *s = kmalloc_caches[KMALLOC_NORMAL][i];
+
+ if (s) {
+ kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache(
+ kmalloc_info[i].name[KMALLOC_DMA],
+ kmalloc_info[i].size,
+ SLAB_CACHE_DMA | flags, 0,
+ kmalloc_info[i].size);
+ }
+ }
+#endif
+}
+#endif /* !CONFIG_SLOB */
+
+gfp_t kmalloc_fix_flags(gfp_t flags)
+{
+ gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
+
+ flags &= ~GFP_SLAB_BUG_MASK;
+ pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
+ invalid_mask, &invalid_mask, flags, &flags);
+ dump_stack();
+
+ return flags;
+}
+
+/*
+ * To avoid unnecessary overhead, we pass through large allocation requests
+ * directly to the page allocator. We use __GFP_COMP, because we will need to
+ * know the allocation order to free the pages properly in kfree.
+ */
+void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
+{
+ void *ret = NULL;
+ struct page *page;
+
+ if (unlikely(flags & GFP_SLAB_BUG_MASK))
+ flags = kmalloc_fix_flags(flags);
+
+ flags |= __GFP_COMP;
+ page = alloc_pages(flags, order);
+ if (likely(page)) {
+ ret = page_address(page);
+ mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
+ PAGE_SIZE << order);
+ }
+ ret = kasan_kmalloc_large(ret, size, flags);
+ /* As ret might get tagged, call kmemleak hook after KASAN. */
+ kmemleak_alloc(ret, size, 1, flags);
+ return ret;
+}
+EXPORT_SYMBOL(kmalloc_order);
+
+#ifdef CONFIG_TRACING
+void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
+{
+ void *ret = kmalloc_order(size, flags, order);
+ trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
+ return ret;
+}
+EXPORT_SYMBOL(kmalloc_order_trace);
+#endif
+
+#ifdef CONFIG_SLAB_FREELIST_RANDOM
+/* Randomize a generic freelist */
+static void freelist_randomize(struct rnd_state *state, unsigned int *list,
+ unsigned int count)
+{
+ unsigned int rand;
+ unsigned int i;
+
+ for (i = 0; i < count; i++)
+ list[i] = i;
+
+ /* Fisher-Yates shuffle */
+ for (i = count - 1; i > 0; i--) {
+ rand = prandom_u32_state(state);
+ rand %= (i + 1);
+ swap(list[i], list[rand]);
+ }
+}
+
+/* Create a random sequence per cache */
+int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count,
+ gfp_t gfp)
+{
+ struct rnd_state state;
+
+ if (count < 2 || cachep->random_seq)
+ return 0;
+
+ cachep->random_seq = kcalloc(count, sizeof(unsigned int), gfp);
+ if (!cachep->random_seq)
+ return -ENOMEM;
+
+ /* Get best entropy at this stage of boot */
+ prandom_seed_state(&state, get_random_long());
+
+ freelist_randomize(&state, cachep->random_seq, count);
+ return 0;
+}
+
+/* Destroy the per-cache random freelist sequence */
+void cache_random_seq_destroy(struct kmem_cache *cachep)
+{
+ kfree(cachep->random_seq);
+ cachep->random_seq = NULL;
+}
+#endif /* CONFIG_SLAB_FREELIST_RANDOM */
+
+#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
+#ifdef CONFIG_SLAB
+#define SLABINFO_RIGHTS (0600)
+#else
+#define SLABINFO_RIGHTS (0400)
+#endif
+
+static void print_slabinfo_header(struct seq_file *m)
+{
+ /*
+ * Output format version, so at least we can change it
+ * without _too_ many complaints.
+ */
+#ifdef CONFIG_DEBUG_SLAB
+ seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
+#else
+ seq_puts(m, "slabinfo - version: 2.1\n");
+#endif
+ seq_puts(m, "# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>");
+ seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
+ seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
+#ifdef CONFIG_DEBUG_SLAB
+ seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> <error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
+ seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
+#endif
+ seq_putc(m, '\n');
+}
+
+void *slab_start(struct seq_file *m, loff_t *pos)
+{
+ mutex_lock(&slab_mutex);
+ return seq_list_start(&slab_caches, *pos);
+}
+
+void *slab_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ return seq_list_next(p, &slab_caches, pos);
+}
+
+void slab_stop(struct seq_file *m, void *p)
+{
+ mutex_unlock(&slab_mutex);
+}
+
+static void cache_show(struct kmem_cache *s, struct seq_file *m)
+{
+ struct slabinfo sinfo;
+
+ memset(&sinfo, 0, sizeof(sinfo));
+ get_slabinfo(s, &sinfo);
+
+ seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
+ s->name, sinfo.active_objs, sinfo.num_objs, s->size,
+ sinfo.objects_per_slab, (1 << sinfo.cache_order));
+
+ seq_printf(m, " : tunables %4u %4u %4u",
+ sinfo.limit, sinfo.batchcount, sinfo.shared);
+ seq_printf(m, " : slabdata %6lu %6lu %6lu",
+ sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail);
+ slabinfo_show_stats(m, s);
+ seq_putc(m, '\n');
+}
+
+static int slab_show(struct seq_file *m, void *p)
+{
+ struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
+
+ if (p == slab_caches.next)
+ print_slabinfo_header(m);
+ cache_show(s, m);
+ return 0;
+}
+
+void dump_unreclaimable_slab(void)
+{
+ struct kmem_cache *s, *s2;
+ struct slabinfo sinfo;
+
+ /*
+ * Here acquiring slab_mutex is risky since we don't prefer to get
+ * sleep in oom path. But, without mutex hold, it may introduce a
+ * risk of crash.
+ * Use mutex_trylock to protect the list traverse, dump nothing
+ * without acquiring the mutex.
+ */
+ if (!mutex_trylock(&slab_mutex)) {
+ pr_warn("excessive unreclaimable slab but cannot dump stats\n");
+ return;
+ }
+
+ pr_info("Unreclaimable slab info:\n");
+ pr_info("Name Used Total\n");
+
+ list_for_each_entry_safe(s, s2, &slab_caches, list) {
+ if (s->flags & SLAB_RECLAIM_ACCOUNT)
+ continue;
+
+ get_slabinfo(s, &sinfo);
+
+ if (sinfo.num_objs > 0)
+ pr_info("%-17s %10luKB %10luKB\n", s->name,
+ (sinfo.active_objs * s->size) / 1024,
+ (sinfo.num_objs * s->size) / 1024);
+ }
+ mutex_unlock(&slab_mutex);
+}
+
+#if defined(CONFIG_MEMCG_KMEM)
+int memcg_slab_show(struct seq_file *m, void *p)
+{
+ /*
+ * Deprecated.
+ * Please, take a look at tools/cgroup/slabinfo.py .
+ */
+ return 0;
+}
+#endif
+
+/*
+ * slabinfo_op - iterator that generates /proc/slabinfo
+ *
+ * Output layout:
+ * cache-name
+ * num-active-objs
+ * total-objs
+ * object size
+ * num-active-slabs
+ * total-slabs
+ * num-pages-per-slab
+ * + further values on SMP and with statistics enabled
+ */
+static const struct seq_operations slabinfo_op = {
+ .start = slab_start,
+ .next = slab_next,
+ .stop = slab_stop,
+ .show = slab_show,
+};
+
+static int slabinfo_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &slabinfo_op);
+}
+
+static const struct proc_ops slabinfo_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
+ .proc_open = slabinfo_open,
+ .proc_read = seq_read,
+ .proc_write = slabinfo_write,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release,
+};
+
+static int __init slab_proc_init(void)
+{
+ proc_create("slabinfo", SLABINFO_RIGHTS, NULL, &slabinfo_proc_ops);
+ return 0;
+}
+module_init(slab_proc_init);
+
+#endif /* CONFIG_SLAB || CONFIG_SLUB_DEBUG */
+
+static __always_inline void *__do_krealloc(const void *p, size_t new_size,
+ gfp_t flags)
+{
+ void *ret;
+ size_t ks;
+
+ ks = ksize(p);
+
+ if (ks >= new_size) {
+ p = kasan_krealloc((void *)p, new_size, flags);
+ return (void *)p;
+ }
+
+ ret = kmalloc_track_caller(new_size, flags);
+ if (ret && p)
+ memcpy(ret, p, ks);
+
+ return ret;
+}
+
+/**
+ * krealloc - reallocate memory. The contents will remain unchanged.
+ * @p: object to reallocate memory for.
+ * @new_size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate.
+ *
+ * The contents of the object pointed to are preserved up to the
+ * lesser of the new and old sizes. If @p is %NULL, krealloc()
+ * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a
+ * %NULL pointer, the object pointed to is freed.
+ *
+ * Return: pointer to the allocated memory or %NULL in case of error
+ */
+void *krealloc(const void *p, size_t new_size, gfp_t flags)
+{
+ void *ret;
+
+ if (unlikely(!new_size)) {
+ kfree(p);
+ return ZERO_SIZE_PTR;
+ }
+
+ ret = __do_krealloc(p, new_size, flags);
+ if (ret && kasan_reset_tag(p) != kasan_reset_tag(ret))
+ kfree(p);
+
+ return ret;
+}
+EXPORT_SYMBOL(krealloc);
+
+/**
+ * kfree_sensitive - Clear sensitive information in memory before freeing
+ * @p: object to free memory of
+ *
+ * The memory of the object @p points to is zeroed before freed.
+ * If @p is %NULL, kfree_sensitive() does nothing.
+ *
+ * Note: this function zeroes the whole allocated buffer which can be a good
+ * deal bigger than the requested buffer size passed to kmalloc(). So be
+ * careful when using this function in performance sensitive code.
+ */
+void kfree_sensitive(const void *p)
+{
+ size_t ks;
+ void *mem = (void *)p;
+
+ ks = ksize(mem);
+ if (ks)
+ memzero_explicit(mem, ks);
+ kfree(mem);
+}
+EXPORT_SYMBOL(kfree_sensitive);
+
+/**
+ * ksize - get the actual amount of memory allocated for a given object
+ * @objp: Pointer to the object
+ *
+ * kmalloc may internally round up allocations and return more memory
+ * than requested. ksize() can be used to determine the actual amount of
+ * memory allocated. The caller may use this additional memory, even though
+ * a smaller amount of memory was initially specified with the kmalloc call.
+ * The caller must guarantee that objp points to a valid object previously
+ * allocated with either kmalloc() or kmem_cache_alloc(). The object
+ * must not be freed during the duration of the call.
+ *
+ * Return: size of the actual memory used by @objp in bytes
+ */
+size_t ksize(const void *objp)
+{
+ size_t size;
+
+ /*
+ * We need to check that the pointed to object is valid, and only then
+ * unpoison the shadow memory below. We use __kasan_check_read(), to
+ * generate a more useful report at the time ksize() is called (rather
+ * than later where behaviour is undefined due to potential
+ * use-after-free or double-free).
+ *
+ * If the pointed to memory is invalid we return 0, to avoid users of
+ * ksize() writing to and potentially corrupting the memory region.
+ *
+ * We want to perform the check before __ksize(), to avoid potentially
+ * crashing in __ksize() due to accessing invalid metadata.
+ */
+ if (unlikely(ZERO_OR_NULL_PTR(objp)) || !__kasan_check_read(objp, 1))
+ return 0;
+
+ size = __ksize(objp);
+ /*
+ * We assume that ksize callers could use whole allocated area,
+ * so we need to unpoison this area.
+ */
+ kasan_unpoison_shadow(objp, size);
+ return size;
+}
+EXPORT_SYMBOL(ksize);
+
+/* Tracepoints definitions. */
+EXPORT_TRACEPOINT_SYMBOL(kmalloc);
+EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
+EXPORT_TRACEPOINT_SYMBOL(kmalloc_node);
+EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node);
+EXPORT_TRACEPOINT_SYMBOL(kfree);
+EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
+
+int should_failslab(struct kmem_cache *s, gfp_t gfpflags)
+{
+ if (__should_failslab(s, gfpflags))
+ return -ENOMEM;
+ return 0;
+}
+ALLOW_ERROR_INJECTION(should_failslab, ERRNO);
diff --git a/mm/slob.c b/mm/slob.c
new file mode 100644
index 000000000..7cc9805c8
--- /dev/null
+++ b/mm/slob.c
@@ -0,0 +1,720 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * SLOB Allocator: Simple List Of Blocks
+ *
+ * Matt Mackall <mpm@selenic.com> 12/30/03
+ *
+ * NUMA support by Paul Mundt, 2007.
+ *
+ * How SLOB works:
+ *
+ * The core of SLOB is a traditional K&R style heap allocator, with
+ * support for returning aligned objects. The granularity of this
+ * allocator is as little as 2 bytes, however typically most architectures
+ * will require 4 bytes on 32-bit and 8 bytes on 64-bit.
+ *
+ * The slob heap is a set of linked list of pages from alloc_pages(),
+ * and within each page, there is a singly-linked list of free blocks
+ * (slob_t). The heap is grown on demand. To reduce fragmentation,
+ * heap pages are segregated into three lists, with objects less than
+ * 256 bytes, objects less than 1024 bytes, and all other objects.
+ *
+ * Allocation from heap involves first searching for a page with
+ * sufficient free blocks (using a next-fit-like approach) followed by
+ * a first-fit scan of the page. Deallocation inserts objects back
+ * into the free list in address order, so this is effectively an
+ * address-ordered first fit.
+ *
+ * Above this is an implementation of kmalloc/kfree. Blocks returned
+ * from kmalloc are prepended with a 4-byte header with the kmalloc size.
+ * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls
+ * alloc_pages() directly, allocating compound pages so the page order
+ * does not have to be separately tracked.
+ * These objects are detected in kfree() because PageSlab()
+ * is false for them.
+ *
+ * SLAB is emulated on top of SLOB by simply calling constructors and
+ * destructors for every SLAB allocation. Objects are returned with the
+ * 4-byte alignment unless the SLAB_HWCACHE_ALIGN flag is set, in which
+ * case the low-level allocator will fragment blocks to create the proper
+ * alignment. Again, objects of page-size or greater are allocated by
+ * calling alloc_pages(). As SLAB objects know their size, no separate
+ * size bookkeeping is necessary and there is essentially no allocation
+ * space overhead, and compound pages aren't needed for multi-page
+ * allocations.
+ *
+ * NUMA support in SLOB is fairly simplistic, pushing most of the real
+ * logic down to the page allocator, and simply doing the node accounting
+ * on the upper levels. In the event that a node id is explicitly
+ * provided, __alloc_pages_node() with the specified node id is used
+ * instead. The common case (or when the node id isn't explicitly provided)
+ * will default to the current node, as per numa_node_id().
+ *
+ * Node aware pages are still inserted in to the global freelist, and
+ * these are scanned for by matching against the node id encoded in the
+ * page flags. As a result, block allocations that can be satisfied from
+ * the freelist will only be done so on pages residing on the same node,
+ * in order to prevent random node placement.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+#include <linux/mm.h>
+#include <linux/swap.h> /* struct reclaim_state */
+#include <linux/cache.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/rcupdate.h>
+#include <linux/list.h>
+#include <linux/kmemleak.h>
+
+#include <trace/events/kmem.h>
+
+#include <linux/atomic.h>
+
+#include "slab.h"
+/*
+ * slob_block has a field 'units', which indicates size of block if +ve,
+ * or offset of next block if -ve (in SLOB_UNITs).
+ *
+ * Free blocks of size 1 unit simply contain the offset of the next block.
+ * Those with larger size contain their size in the first SLOB_UNIT of
+ * memory, and the offset of the next free block in the second SLOB_UNIT.
+ */
+#if PAGE_SIZE <= (32767 * 2)
+typedef s16 slobidx_t;
+#else
+typedef s32 slobidx_t;
+#endif
+
+struct slob_block {
+ slobidx_t units;
+};
+typedef struct slob_block slob_t;
+
+/*
+ * All partially free slob pages go on these lists.
+ */
+#define SLOB_BREAK1 256
+#define SLOB_BREAK2 1024
+static LIST_HEAD(free_slob_small);
+static LIST_HEAD(free_slob_medium);
+static LIST_HEAD(free_slob_large);
+
+/*
+ * slob_page_free: true for pages on free_slob_pages list.
+ */
+static inline int slob_page_free(struct page *sp)
+{
+ return PageSlobFree(sp);
+}
+
+static void set_slob_page_free(struct page *sp, struct list_head *list)
+{
+ list_add(&sp->slab_list, list);
+ __SetPageSlobFree(sp);
+}
+
+static inline void clear_slob_page_free(struct page *sp)
+{
+ list_del(&sp->slab_list);
+ __ClearPageSlobFree(sp);
+}
+
+#define SLOB_UNIT sizeof(slob_t)
+#define SLOB_UNITS(size) DIV_ROUND_UP(size, SLOB_UNIT)
+
+/*
+ * struct slob_rcu is inserted at the tail of allocated slob blocks, which
+ * were created with a SLAB_TYPESAFE_BY_RCU slab. slob_rcu is used to free
+ * the block using call_rcu.
+ */
+struct slob_rcu {
+ struct rcu_head head;
+ int size;
+};
+
+/*
+ * slob_lock protects all slob allocator structures.
+ */
+static DEFINE_SPINLOCK(slob_lock);
+
+/*
+ * Encode the given size and next info into a free slob block s.
+ */
+static void set_slob(slob_t *s, slobidx_t size, slob_t *next)
+{
+ slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK);
+ slobidx_t offset = next - base;
+
+ if (size > 1) {
+ s[0].units = size;
+ s[1].units = offset;
+ } else
+ s[0].units = -offset;
+}
+
+/*
+ * Return the size of a slob block.
+ */
+static slobidx_t slob_units(slob_t *s)
+{
+ if (s->units > 0)
+ return s->units;
+ return 1;
+}
+
+/*
+ * Return the next free slob block pointer after this one.
+ */
+static slob_t *slob_next(slob_t *s)
+{
+ slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK);
+ slobidx_t next;
+
+ if (s[0].units < 0)
+ next = -s[0].units;
+ else
+ next = s[1].units;
+ return base+next;
+}
+
+/*
+ * Returns true if s is the last free block in its page.
+ */
+static int slob_last(slob_t *s)
+{
+ return !((unsigned long)slob_next(s) & ~PAGE_MASK);
+}
+
+static void *slob_new_pages(gfp_t gfp, int order, int node)
+{
+ struct page *page;
+
+#ifdef CONFIG_NUMA
+ if (node != NUMA_NO_NODE)
+ page = __alloc_pages_node(node, gfp, order);
+ else
+#endif
+ page = alloc_pages(gfp, order);
+
+ if (!page)
+ return NULL;
+
+ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
+ PAGE_SIZE << order);
+ return page_address(page);
+}
+
+static void slob_free_pages(void *b, int order)
+{
+ struct page *sp = virt_to_page(b);
+
+ if (current->reclaim_state)
+ current->reclaim_state->reclaimed_slab += 1 << order;
+
+ mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B,
+ -(PAGE_SIZE << order));
+ __free_pages(sp, order);
+}
+
+/*
+ * slob_page_alloc() - Allocate a slob block within a given slob_page sp.
+ * @sp: Page to look in.
+ * @size: Size of the allocation.
+ * @align: Allocation alignment.
+ * @align_offset: Offset in the allocated block that will be aligned.
+ * @page_removed_from_list: Return parameter.
+ *
+ * Tries to find a chunk of memory at least @size bytes big within @page.
+ *
+ * Return: Pointer to memory if allocated, %NULL otherwise. If the
+ * allocation fills up @page then the page is removed from the
+ * freelist, in this case @page_removed_from_list will be set to
+ * true (set to false otherwise).
+ */
+static void *slob_page_alloc(struct page *sp, size_t size, int align,
+ int align_offset, bool *page_removed_from_list)
+{
+ slob_t *prev, *cur, *aligned = NULL;
+ int delta = 0, units = SLOB_UNITS(size);
+
+ *page_removed_from_list = false;
+ for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) {
+ slobidx_t avail = slob_units(cur);
+
+ /*
+ * 'aligned' will hold the address of the slob block so that the
+ * address 'aligned'+'align_offset' is aligned according to the
+ * 'align' parameter. This is for kmalloc() which prepends the
+ * allocated block with its size, so that the block itself is
+ * aligned when needed.
+ */
+ if (align) {
+ aligned = (slob_t *)
+ (ALIGN((unsigned long)cur + align_offset, align)
+ - align_offset);
+ delta = aligned - cur;
+ }
+ if (avail >= units + delta) { /* room enough? */
+ slob_t *next;
+
+ if (delta) { /* need to fragment head to align? */
+ next = slob_next(cur);
+ set_slob(aligned, avail - delta, next);
+ set_slob(cur, delta, aligned);
+ prev = cur;
+ cur = aligned;
+ avail = slob_units(cur);
+ }
+
+ next = slob_next(cur);
+ if (avail == units) { /* exact fit? unlink. */
+ if (prev)
+ set_slob(prev, slob_units(prev), next);
+ else
+ sp->freelist = next;
+ } else { /* fragment */
+ if (prev)
+ set_slob(prev, slob_units(prev), cur + units);
+ else
+ sp->freelist = cur + units;
+ set_slob(cur + units, avail - units, next);
+ }
+
+ sp->units -= units;
+ if (!sp->units) {
+ clear_slob_page_free(sp);
+ *page_removed_from_list = true;
+ }
+ return cur;
+ }
+ if (slob_last(cur))
+ return NULL;
+ }
+}
+
+/*
+ * slob_alloc: entry point into the slob allocator.
+ */
+static void *slob_alloc(size_t size, gfp_t gfp, int align, int node,
+ int align_offset)
+{
+ struct page *sp;
+ struct list_head *slob_list;
+ slob_t *b = NULL;
+ unsigned long flags;
+ bool _unused;
+
+ if (size < SLOB_BREAK1)
+ slob_list = &free_slob_small;
+ else if (size < SLOB_BREAK2)
+ slob_list = &free_slob_medium;
+ else
+ slob_list = &free_slob_large;
+
+ spin_lock_irqsave(&slob_lock, flags);
+ /* Iterate through each partially free page, try to find room */
+ list_for_each_entry(sp, slob_list, slab_list) {
+ bool page_removed_from_list = false;
+#ifdef CONFIG_NUMA
+ /*
+ * If there's a node specification, search for a partial
+ * page with a matching node id in the freelist.
+ */
+ if (node != NUMA_NO_NODE && page_to_nid(sp) != node)
+ continue;
+#endif
+ /* Enough room on this page? */
+ if (sp->units < SLOB_UNITS(size))
+ continue;
+
+ b = slob_page_alloc(sp, size, align, align_offset, &page_removed_from_list);
+ if (!b)
+ continue;
+
+ /*
+ * If slob_page_alloc() removed sp from the list then we
+ * cannot call list functions on sp. If so allocation
+ * did not fragment the page anyway so optimisation is
+ * unnecessary.
+ */
+ if (!page_removed_from_list) {
+ /*
+ * Improve fragment distribution and reduce our average
+ * search time by starting our next search here. (see
+ * Knuth vol 1, sec 2.5, pg 449)
+ */
+ if (!list_is_first(&sp->slab_list, slob_list))
+ list_rotate_to_front(&sp->slab_list, slob_list);
+ }
+ break;
+ }
+ spin_unlock_irqrestore(&slob_lock, flags);
+
+ /* Not enough space: must allocate a new page */
+ if (!b) {
+ b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node);
+ if (!b)
+ return NULL;
+ sp = virt_to_page(b);
+ __SetPageSlab(sp);
+
+ spin_lock_irqsave(&slob_lock, flags);
+ sp->units = SLOB_UNITS(PAGE_SIZE);
+ sp->freelist = b;
+ INIT_LIST_HEAD(&sp->slab_list);
+ set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
+ set_slob_page_free(sp, slob_list);
+ b = slob_page_alloc(sp, size, align, align_offset, &_unused);
+ BUG_ON(!b);
+ spin_unlock_irqrestore(&slob_lock, flags);
+ }
+ if (unlikely(gfp & __GFP_ZERO))
+ memset(b, 0, size);
+ return b;
+}
+
+/*
+ * slob_free: entry point into the slob allocator.
+ */
+static void slob_free(void *block, int size)
+{
+ struct page *sp;
+ slob_t *prev, *next, *b = (slob_t *)block;
+ slobidx_t units;
+ unsigned long flags;
+ struct list_head *slob_list;
+
+ if (unlikely(ZERO_OR_NULL_PTR(block)))
+ return;
+ BUG_ON(!size);
+
+ sp = virt_to_page(block);
+ units = SLOB_UNITS(size);
+
+ spin_lock_irqsave(&slob_lock, flags);
+
+ if (sp->units + units == SLOB_UNITS(PAGE_SIZE)) {
+ /* Go directly to page allocator. Do not pass slob allocator */
+ if (slob_page_free(sp))
+ clear_slob_page_free(sp);
+ spin_unlock_irqrestore(&slob_lock, flags);
+ __ClearPageSlab(sp);
+ page_mapcount_reset(sp);
+ slob_free_pages(b, 0);
+ return;
+ }
+
+ if (!slob_page_free(sp)) {
+ /* This slob page is about to become partially free. Easy! */
+ sp->units = units;
+ sp->freelist = b;
+ set_slob(b, units,
+ (void *)((unsigned long)(b +
+ SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK));
+ if (size < SLOB_BREAK1)
+ slob_list = &free_slob_small;
+ else if (size < SLOB_BREAK2)
+ slob_list = &free_slob_medium;
+ else
+ slob_list = &free_slob_large;
+ set_slob_page_free(sp, slob_list);
+ goto out;
+ }
+
+ /*
+ * Otherwise the page is already partially free, so find reinsertion
+ * point.
+ */
+ sp->units += units;
+
+ if (b < (slob_t *)sp->freelist) {
+ if (b + units == sp->freelist) {
+ units += slob_units(sp->freelist);
+ sp->freelist = slob_next(sp->freelist);
+ }
+ set_slob(b, units, sp->freelist);
+ sp->freelist = b;
+ } else {
+ prev = sp->freelist;
+ next = slob_next(prev);
+ while (b > next) {
+ prev = next;
+ next = slob_next(prev);
+ }
+
+ if (!slob_last(prev) && b + units == next) {
+ units += slob_units(next);
+ set_slob(b, units, slob_next(next));
+ } else
+ set_slob(b, units, next);
+
+ if (prev + slob_units(prev) == b) {
+ units = slob_units(b) + slob_units(prev);
+ set_slob(prev, units, slob_next(b));
+ } else
+ set_slob(prev, slob_units(prev), b);
+ }
+out:
+ spin_unlock_irqrestore(&slob_lock, flags);
+}
+
+/*
+ * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend.
+ */
+
+static __always_inline void *
+__do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
+{
+ unsigned int *m;
+ int minalign = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+ void *ret;
+
+ gfp &= gfp_allowed_mask;
+
+ fs_reclaim_acquire(gfp);
+ fs_reclaim_release(gfp);
+
+ if (size < PAGE_SIZE - minalign) {
+ int align = minalign;
+
+ /*
+ * For power of two sizes, guarantee natural alignment for
+ * kmalloc()'d objects.
+ */
+ if (is_power_of_2(size))
+ align = max(minalign, (int) size);
+
+ if (!size)
+ return ZERO_SIZE_PTR;
+
+ m = slob_alloc(size + minalign, gfp, align, node, minalign);
+
+ if (!m)
+ return NULL;
+ *m = size;
+ ret = (void *)m + minalign;
+
+ trace_kmalloc_node(caller, ret,
+ size, size + minalign, gfp, node);
+ } else {
+ unsigned int order = get_order(size);
+
+ if (likely(order))
+ gfp |= __GFP_COMP;
+ ret = slob_new_pages(gfp, order, node);
+
+ trace_kmalloc_node(caller, ret,
+ size, PAGE_SIZE << order, gfp, node);
+ }
+
+ kmemleak_alloc(ret, size, 1, gfp);
+ return ret;
+}
+
+void *__kmalloc(size_t size, gfp_t gfp)
+{
+ return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, _RET_IP_);
+}
+EXPORT_SYMBOL(__kmalloc);
+
+void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller)
+{
+ return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller);
+}
+EXPORT_SYMBOL(__kmalloc_track_caller);
+
+#ifdef CONFIG_NUMA
+void *__kmalloc_node_track_caller(size_t size, gfp_t gfp,
+ int node, unsigned long caller)
+{
+ return __do_kmalloc_node(size, gfp, node, caller);
+}
+EXPORT_SYMBOL(__kmalloc_node_track_caller);
+#endif
+
+void kfree(const void *block)
+{
+ struct page *sp;
+
+ trace_kfree(_RET_IP_, block);
+
+ if (unlikely(ZERO_OR_NULL_PTR(block)))
+ return;
+ kmemleak_free(block);
+
+ sp = virt_to_page(block);
+ if (PageSlab(sp)) {
+ int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+ unsigned int *m = (unsigned int *)(block - align);
+ slob_free(m, *m + align);
+ } else {
+ unsigned int order = compound_order(sp);
+ mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B,
+ -(PAGE_SIZE << order));
+ __free_pages(sp, order);
+
+ }
+}
+EXPORT_SYMBOL(kfree);
+
+/* can't use ksize for kmem_cache_alloc memory, only kmalloc */
+size_t __ksize(const void *block)
+{
+ struct page *sp;
+ int align;
+ unsigned int *m;
+
+ BUG_ON(!block);
+ if (unlikely(block == ZERO_SIZE_PTR))
+ return 0;
+
+ sp = virt_to_page(block);
+ if (unlikely(!PageSlab(sp)))
+ return page_size(sp);
+
+ align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+ m = (unsigned int *)(block - align);
+ return SLOB_UNITS(*m) * SLOB_UNIT;
+}
+EXPORT_SYMBOL(__ksize);
+
+int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags)
+{
+ if (flags & SLAB_TYPESAFE_BY_RCU) {
+ /* leave room for rcu footer at the end of object */
+ c->size += sizeof(struct slob_rcu);
+ }
+ c->flags = flags;
+ return 0;
+}
+
+static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
+{
+ void *b;
+
+ flags &= gfp_allowed_mask;
+
+ fs_reclaim_acquire(flags);
+ fs_reclaim_release(flags);
+
+ if (c->size < PAGE_SIZE) {
+ b = slob_alloc(c->size, flags, c->align, node, 0);
+ trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
+ SLOB_UNITS(c->size) * SLOB_UNIT,
+ flags, node);
+ } else {
+ b = slob_new_pages(flags, get_order(c->size), node);
+ trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
+ PAGE_SIZE << get_order(c->size),
+ flags, node);
+ }
+
+ if (b && c->ctor) {
+ WARN_ON_ONCE(flags & __GFP_ZERO);
+ c->ctor(b);
+ }
+
+ kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags);
+ return b;
+}
+
+void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
+{
+ return slob_alloc_node(cachep, flags, NUMA_NO_NODE);
+}
+EXPORT_SYMBOL(kmem_cache_alloc);
+
+#ifdef CONFIG_NUMA
+void *__kmalloc_node(size_t size, gfp_t gfp, int node)
+{
+ return __do_kmalloc_node(size, gfp, node, _RET_IP_);
+}
+EXPORT_SYMBOL(__kmalloc_node);
+
+void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t gfp, int node)
+{
+ return slob_alloc_node(cachep, gfp, node);
+}
+EXPORT_SYMBOL(kmem_cache_alloc_node);
+#endif
+
+static void __kmem_cache_free(void *b, int size)
+{
+ if (size < PAGE_SIZE)
+ slob_free(b, size);
+ else
+ slob_free_pages(b, get_order(size));
+}
+
+static void kmem_rcu_free(struct rcu_head *head)
+{
+ struct slob_rcu *slob_rcu = (struct slob_rcu *)head;
+ void *b = (void *)slob_rcu - (slob_rcu->size - sizeof(struct slob_rcu));
+
+ __kmem_cache_free(b, slob_rcu->size);
+}
+
+void kmem_cache_free(struct kmem_cache *c, void *b)
+{
+ kmemleak_free_recursive(b, c->flags);
+ if (unlikely(c->flags & SLAB_TYPESAFE_BY_RCU)) {
+ struct slob_rcu *slob_rcu;
+ slob_rcu = b + (c->size - sizeof(struct slob_rcu));
+ slob_rcu->size = c->size;
+ call_rcu(&slob_rcu->head, kmem_rcu_free);
+ } else {
+ __kmem_cache_free(b, c->size);
+ }
+
+ trace_kmem_cache_free(_RET_IP_, b);
+}
+EXPORT_SYMBOL(kmem_cache_free);
+
+void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
+{
+ __kmem_cache_free_bulk(s, size, p);
+}
+EXPORT_SYMBOL(kmem_cache_free_bulk);
+
+int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
+ void **p)
+{
+ return __kmem_cache_alloc_bulk(s, flags, size, p);
+}
+EXPORT_SYMBOL(kmem_cache_alloc_bulk);
+
+int __kmem_cache_shutdown(struct kmem_cache *c)
+{
+ /* No way to check for remaining objects */
+ return 0;
+}
+
+void __kmem_cache_release(struct kmem_cache *c)
+{
+}
+
+int __kmem_cache_shrink(struct kmem_cache *d)
+{
+ return 0;
+}
+
+struct kmem_cache kmem_cache_boot = {
+ .name = "kmem_cache",
+ .size = sizeof(struct kmem_cache),
+ .flags = SLAB_PANIC,
+ .align = ARCH_KMALLOC_MINALIGN,
+};
+
+void __init kmem_cache_init(void)
+{
+ kmem_cache = &kmem_cache_boot;
+ slab_state = UP;
+}
+
+void __init kmem_cache_init_late(void)
+{
+ slab_state = FULL;
+}
diff --git a/mm/slub.c b/mm/slub.c
new file mode 100644
index 000000000..b0f637519
--- /dev/null
+++ b/mm/slub.c
@@ -0,0 +1,5771 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * SLUB: A slab allocator that limits cache line use instead of queuing
+ * objects in per cpu and per node lists.
+ *
+ * The allocator synchronizes using per slab locks or atomic operatios
+ * and only uses a centralized lock to manage a pool of partial slabs.
+ *
+ * (C) 2007 SGI, Christoph Lameter
+ * (C) 2011 Linux Foundation, Christoph Lameter
+ */
+
+#include <linux/mm.h>
+#include <linux/swap.h> /* struct reclaim_state */
+#include <linux/module.h>
+#include <linux/bit_spinlock.h>
+#include <linux/interrupt.h>
+#include <linux/swab.h>
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include "slab.h"
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/kasan.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/mempolicy.h>
+#include <linux/ctype.h>
+#include <linux/debugobjects.h>
+#include <linux/kallsyms.h>
+#include <linux/memory.h>
+#include <linux/math64.h>
+#include <linux/fault-inject.h>
+#include <linux/stacktrace.h>
+#include <linux/prefetch.h>
+#include <linux/memcontrol.h>
+#include <linux/random.h>
+
+#include <trace/events/kmem.h>
+
+#include "internal.h"
+
+/*
+ * Lock order:
+ * 1. slab_mutex (Global Mutex)
+ * 2. node->list_lock
+ * 3. slab_lock(page) (Only on some arches and for debugging)
+ *
+ * slab_mutex
+ *
+ * The role of the slab_mutex is to protect the list of all the slabs
+ * and to synchronize major metadata changes to slab cache structures.
+ *
+ * The slab_lock is only used for debugging and on arches that do not
+ * have the ability to do a cmpxchg_double. It only protects:
+ * A. page->freelist -> List of object free in a page
+ * B. page->inuse -> Number of objects in use
+ * C. page->objects -> Number of objects in page
+ * D. page->frozen -> frozen state
+ *
+ * If a slab is frozen then it is exempt from list management. It is not
+ * on any list except per cpu partial list. The processor that froze the
+ * slab is the one who can perform list operations on the page. Other
+ * processors may put objects onto the freelist but the processor that
+ * froze the slab is the only one that can retrieve the objects from the
+ * page's freelist.
+ *
+ * The list_lock protects the partial and full list on each node and
+ * the partial slab counter. If taken then no new slabs may be added or
+ * removed from the lists nor make the number of partial slabs be modified.
+ * (Note that the total number of slabs is an atomic value that may be
+ * modified without taking the list lock).
+ *
+ * The list_lock is a centralized lock and thus we avoid taking it as
+ * much as possible. As long as SLUB does not have to handle partial
+ * slabs, operations can continue without any centralized lock. F.e.
+ * allocating a long series of objects that fill up slabs does not require
+ * the list lock.
+ * Interrupts are disabled during allocation and deallocation in order to
+ * make the slab allocator safe to use in the context of an irq. In addition
+ * interrupts are disabled to ensure that the processor does not change
+ * while handling per_cpu slabs, due to kernel preemption.
+ *
+ * SLUB assigns one slab for allocation to each processor.
+ * Allocations only occur from these slabs called cpu slabs.
+ *
+ * Slabs with free elements are kept on a partial list and during regular
+ * operations no list for full slabs is used. If an object in a full slab is
+ * freed then the slab will show up again on the partial lists.
+ * We track full slabs for debugging purposes though because otherwise we
+ * cannot scan all objects.
+ *
+ * Slabs are freed when they become empty. Teardown and setup is
+ * minimal so we rely on the page allocators per cpu caches for
+ * fast frees and allocs.
+ *
+ * page->frozen The slab is frozen and exempt from list processing.
+ * This means that the slab is dedicated to a purpose
+ * such as satisfying allocations for a specific
+ * processor. Objects may be freed in the slab while
+ * it is frozen but slab_free will then skip the usual
+ * list operations. It is up to the processor holding
+ * the slab to integrate the slab into the slab lists
+ * when the slab is no longer needed.
+ *
+ * One use of this flag is to mark slabs that are
+ * used for allocations. Then such a slab becomes a cpu
+ * slab. The cpu slab may be equipped with an additional
+ * freelist that allows lockless access to
+ * free objects in addition to the regular freelist
+ * that requires the slab lock.
+ *
+ * SLAB_DEBUG_FLAGS Slab requires special handling due to debug
+ * options set. This moves slab handling out of
+ * the fast path and disables lockless freelists.
+ */
+
+#ifdef CONFIG_SLUB_DEBUG
+#ifdef CONFIG_SLUB_DEBUG_ON
+DEFINE_STATIC_KEY_TRUE(slub_debug_enabled);
+#else
+DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
+#endif
+#endif
+
+static inline bool kmem_cache_debug(struct kmem_cache *s)
+{
+ return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
+}
+
+void *fixup_red_left(struct kmem_cache *s, void *p)
+{
+ if (kmem_cache_debug_flags(s, SLAB_RED_ZONE))
+ p += s->red_left_pad;
+
+ return p;
+}
+
+static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
+{
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+ return !kmem_cache_debug(s);
+#else
+ return false;
+#endif
+}
+
+/*
+ * Issues still to be resolved:
+ *
+ * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
+ *
+ * - Variable sizing of the per node arrays
+ */
+
+/* Enable to test recovery from slab corruption on boot */
+#undef SLUB_RESILIENCY_TEST
+
+/* Enable to log cmpxchg failures */
+#undef SLUB_DEBUG_CMPXCHG
+
+/*
+ * Mininum number of partial slabs. These will be left on the partial
+ * lists even if they are empty. kmem_cache_shrink may reclaim them.
+ */
+#define MIN_PARTIAL 5
+
+/*
+ * Maximum number of desirable partial slabs.
+ * The existence of more partial slabs makes kmem_cache_shrink
+ * sort the partial list by the number of objects in use.
+ */
+#define MAX_PARTIAL 10
+
+#define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \
+ SLAB_POISON | SLAB_STORE_USER)
+
+/*
+ * These debug flags cannot use CMPXCHG because there might be consistency
+ * issues when checking or reading debug information
+ */
+#define SLAB_NO_CMPXCHG (SLAB_CONSISTENCY_CHECKS | SLAB_STORE_USER | \
+ SLAB_TRACE)
+
+
+/*
+ * Debugging flags that require metadata to be stored in the slab. These get
+ * disabled when slub_debug=O is used and a cache's min order increases with
+ * metadata.
+ */
+#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
+
+#define OO_SHIFT 16
+#define OO_MASK ((1 << OO_SHIFT) - 1)
+#define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */
+
+/* Internal SLUB flags */
+/* Poison object */
+#define __OBJECT_POISON ((slab_flags_t __force)0x80000000U)
+/* Use cmpxchg_double */
+#define __CMPXCHG_DOUBLE ((slab_flags_t __force)0x40000000U)
+
+/*
+ * Tracking user of a slab.
+ */
+#define TRACK_ADDRS_COUNT 16
+struct track {
+ unsigned long addr; /* Called from address */
+#ifdef CONFIG_STACKTRACE
+ unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */
+#endif
+ int cpu; /* Was running on cpu */
+ int pid; /* Pid context */
+ unsigned long when; /* When did the operation occur */
+};
+
+enum track_item { TRACK_ALLOC, TRACK_FREE };
+
+#ifdef CONFIG_SYSFS
+static int sysfs_slab_add(struct kmem_cache *);
+static int sysfs_slab_alias(struct kmem_cache *, const char *);
+#else
+static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
+static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
+ { return 0; }
+#endif
+
+static inline void stat(const struct kmem_cache *s, enum stat_item si)
+{
+#ifdef CONFIG_SLUB_STATS
+ /*
+ * The rmw is racy on a preemptible kernel but this is acceptable, so
+ * avoid this_cpu_add()'s irq-disable overhead.
+ */
+ raw_cpu_inc(s->cpu_slab->stat[si]);
+#endif
+}
+
+/********************************************************************
+ * Core slab cache functions
+ *******************************************************************/
+
+/*
+ * Returns freelist pointer (ptr). With hardening, this is obfuscated
+ * with an XOR of the address where the pointer is held and a per-cache
+ * random number.
+ */
+static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr,
+ unsigned long ptr_addr)
+{
+#ifdef CONFIG_SLAB_FREELIST_HARDENED
+ /*
+ * When CONFIG_KASAN_SW_TAGS is enabled, ptr_addr might be tagged.
+ * Normally, this doesn't cause any issues, as both set_freepointer()
+ * and get_freepointer() are called with a pointer with the same tag.
+ * However, there are some issues with CONFIG_SLUB_DEBUG code. For
+ * example, when __free_slub() iterates over objects in a cache, it
+ * passes untagged pointers to check_object(). check_object() in turns
+ * calls get_freepointer() with an untagged pointer, which causes the
+ * freepointer to be restored incorrectly.
+ */
+ return (void *)((unsigned long)ptr ^ s->random ^
+ swab((unsigned long)kasan_reset_tag((void *)ptr_addr)));
+#else
+ return ptr;
+#endif
+}
+
+/* Returns the freelist pointer recorded at location ptr_addr. */
+static inline void *freelist_dereference(const struct kmem_cache *s,
+ void *ptr_addr)
+{
+ return freelist_ptr(s, (void *)*(unsigned long *)(ptr_addr),
+ (unsigned long)ptr_addr);
+}
+
+static inline void *get_freepointer(struct kmem_cache *s, void *object)
+{
+ return freelist_dereference(s, object + s->offset);
+}
+
+static void prefetch_freepointer(const struct kmem_cache *s, void *object)
+{
+ prefetch(object + s->offset);
+}
+
+static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
+{
+ unsigned long freepointer_addr;
+ void *p;
+
+ if (!debug_pagealloc_enabled_static())
+ return get_freepointer(s, object);
+
+ freepointer_addr = (unsigned long)object + s->offset;
+ copy_from_kernel_nofault(&p, (void **)freepointer_addr, sizeof(p));
+ return freelist_ptr(s, p, freepointer_addr);
+}
+
+static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
+{
+ unsigned long freeptr_addr = (unsigned long)object + s->offset;
+
+#ifdef CONFIG_SLAB_FREELIST_HARDENED
+ BUG_ON(object == fp); /* naive detection of double free or corruption */
+#endif
+
+ *(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr);
+}
+
+/* Loop over all objects in a slab */
+#define for_each_object(__p, __s, __addr, __objects) \
+ for (__p = fixup_red_left(__s, __addr); \
+ __p < (__addr) + (__objects) * (__s)->size; \
+ __p += (__s)->size)
+
+static inline unsigned int order_objects(unsigned int order, unsigned int size)
+{
+ return ((unsigned int)PAGE_SIZE << order) / size;
+}
+
+static inline struct kmem_cache_order_objects oo_make(unsigned int order,
+ unsigned int size)
+{
+ struct kmem_cache_order_objects x = {
+ (order << OO_SHIFT) + order_objects(order, size)
+ };
+
+ return x;
+}
+
+static inline unsigned int oo_order(struct kmem_cache_order_objects x)
+{
+ return x.x >> OO_SHIFT;
+}
+
+static inline unsigned int oo_objects(struct kmem_cache_order_objects x)
+{
+ return x.x & OO_MASK;
+}
+
+/*
+ * Per slab locking using the pagelock
+ */
+static __always_inline void slab_lock(struct page *page)
+{
+ VM_BUG_ON_PAGE(PageTail(page), page);
+ bit_spin_lock(PG_locked, &page->flags);
+}
+
+static __always_inline void slab_unlock(struct page *page)
+{
+ VM_BUG_ON_PAGE(PageTail(page), page);
+ __bit_spin_unlock(PG_locked, &page->flags);
+}
+
+/* Interrupts must be disabled (for the fallback code to work right) */
+static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
+ void *freelist_old, unsigned long counters_old,
+ void *freelist_new, unsigned long counters_new,
+ const char *n)
+{
+ VM_BUG_ON(!irqs_disabled());
+#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
+ defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
+ if (s->flags & __CMPXCHG_DOUBLE) {
+ if (cmpxchg_double(&page->freelist, &page->counters,
+ freelist_old, counters_old,
+ freelist_new, counters_new))
+ return true;
+ } else
+#endif
+ {
+ slab_lock(page);
+ if (page->freelist == freelist_old &&
+ page->counters == counters_old) {
+ page->freelist = freelist_new;
+ page->counters = counters_new;
+ slab_unlock(page);
+ return true;
+ }
+ slab_unlock(page);
+ }
+
+ cpu_relax();
+ stat(s, CMPXCHG_DOUBLE_FAIL);
+
+#ifdef SLUB_DEBUG_CMPXCHG
+ pr_info("%s %s: cmpxchg double redo ", n, s->name);
+#endif
+
+ return false;
+}
+
+static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
+ void *freelist_old, unsigned long counters_old,
+ void *freelist_new, unsigned long counters_new,
+ const char *n)
+{
+#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
+ defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
+ if (s->flags & __CMPXCHG_DOUBLE) {
+ if (cmpxchg_double(&page->freelist, &page->counters,
+ freelist_old, counters_old,
+ freelist_new, counters_new))
+ return true;
+ } else
+#endif
+ {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ slab_lock(page);
+ if (page->freelist == freelist_old &&
+ page->counters == counters_old) {
+ page->freelist = freelist_new;
+ page->counters = counters_new;
+ slab_unlock(page);
+ local_irq_restore(flags);
+ return true;
+ }
+ slab_unlock(page);
+ local_irq_restore(flags);
+ }
+
+ cpu_relax();
+ stat(s, CMPXCHG_DOUBLE_FAIL);
+
+#ifdef SLUB_DEBUG_CMPXCHG
+ pr_info("%s %s: cmpxchg double redo ", n, s->name);
+#endif
+
+ return false;
+}
+
+#ifdef CONFIG_SLUB_DEBUG
+static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
+static DEFINE_SPINLOCK(object_map_lock);
+
+/*
+ * Determine a map of object in use on a page.
+ *
+ * Node listlock must be held to guarantee that the page does
+ * not vanish from under us.
+ */
+static unsigned long *get_map(struct kmem_cache *s, struct page *page)
+ __acquires(&object_map_lock)
+{
+ void *p;
+ void *addr = page_address(page);
+
+ VM_BUG_ON(!irqs_disabled());
+
+ spin_lock(&object_map_lock);
+
+ bitmap_zero(object_map, page->objects);
+
+ for (p = page->freelist; p; p = get_freepointer(s, p))
+ set_bit(__obj_to_index(s, addr, p), object_map);
+
+ return object_map;
+}
+
+static void put_map(unsigned long *map) __releases(&object_map_lock)
+{
+ VM_BUG_ON(map != object_map);
+ spin_unlock(&object_map_lock);
+}
+
+static inline unsigned int size_from_object(struct kmem_cache *s)
+{
+ if (s->flags & SLAB_RED_ZONE)
+ return s->size - s->red_left_pad;
+
+ return s->size;
+}
+
+static inline void *restore_red_left(struct kmem_cache *s, void *p)
+{
+ if (s->flags & SLAB_RED_ZONE)
+ p -= s->red_left_pad;
+
+ return p;
+}
+
+/*
+ * Debug settings:
+ */
+#if defined(CONFIG_SLUB_DEBUG_ON)
+static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS;
+#else
+static slab_flags_t slub_debug;
+#endif
+
+static char *slub_debug_string;
+static int disable_higher_order_debug;
+
+/*
+ * slub is about to manipulate internal object metadata. This memory lies
+ * outside the range of the allocated object, so accessing it would normally
+ * be reported by kasan as a bounds error. metadata_access_enable() is used
+ * to tell kasan that these accesses are OK.
+ */
+static inline void metadata_access_enable(void)
+{
+ kasan_disable_current();
+}
+
+static inline void metadata_access_disable(void)
+{
+ kasan_enable_current();
+}
+
+/*
+ * Object debugging
+ */
+
+/* Verify that a pointer has an address that is valid within a slab page */
+static inline int check_valid_pointer(struct kmem_cache *s,
+ struct page *page, void *object)
+{
+ void *base;
+
+ if (!object)
+ return 1;
+
+ base = page_address(page);
+ object = kasan_reset_tag(object);
+ object = restore_red_left(s, object);
+ if (object < base || object >= base + page->objects * s->size ||
+ (object - base) % s->size) {
+ return 0;
+ }
+
+ return 1;
+}
+
+static void print_section(char *level, char *text, u8 *addr,
+ unsigned int length)
+{
+ metadata_access_enable();
+ print_hex_dump(level, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
+ length, 1);
+ metadata_access_disable();
+}
+
+/*
+ * See comment in calculate_sizes().
+ */
+static inline bool freeptr_outside_object(struct kmem_cache *s)
+{
+ return s->offset >= s->inuse;
+}
+
+/*
+ * Return offset of the end of info block which is inuse + free pointer if
+ * not overlapping with object.
+ */
+static inline unsigned int get_info_end(struct kmem_cache *s)
+{
+ if (freeptr_outside_object(s))
+ return s->inuse + sizeof(void *);
+ else
+ return s->inuse;
+}
+
+static struct track *get_track(struct kmem_cache *s, void *object,
+ enum track_item alloc)
+{
+ struct track *p;
+
+ p = object + get_info_end(s);
+
+ return p + alloc;
+}
+
+static void set_track(struct kmem_cache *s, void *object,
+ enum track_item alloc, unsigned long addr)
+{
+ struct track *p = get_track(s, object, alloc);
+
+ if (addr) {
+#ifdef CONFIG_STACKTRACE
+ unsigned int nr_entries;
+
+ metadata_access_enable();
+ nr_entries = stack_trace_save(p->addrs, TRACK_ADDRS_COUNT, 3);
+ metadata_access_disable();
+
+ if (nr_entries < TRACK_ADDRS_COUNT)
+ p->addrs[nr_entries] = 0;
+#endif
+ p->addr = addr;
+ p->cpu = smp_processor_id();
+ p->pid = current->pid;
+ p->when = jiffies;
+ } else {
+ memset(p, 0, sizeof(struct track));
+ }
+}
+
+static void init_tracking(struct kmem_cache *s, void *object)
+{
+ if (!(s->flags & SLAB_STORE_USER))
+ return;
+
+ set_track(s, object, TRACK_FREE, 0UL);
+ set_track(s, object, TRACK_ALLOC, 0UL);
+}
+
+static void print_track(const char *s, struct track *t, unsigned long pr_time)
+{
+ if (!t->addr)
+ return;
+
+ pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
+ s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid);
+#ifdef CONFIG_STACKTRACE
+ {
+ int i;
+ for (i = 0; i < TRACK_ADDRS_COUNT; i++)
+ if (t->addrs[i])
+ pr_err("\t%pS\n", (void *)t->addrs[i]);
+ else
+ break;
+ }
+#endif
+}
+
+void print_tracking(struct kmem_cache *s, void *object)
+{
+ unsigned long pr_time = jiffies;
+ if (!(s->flags & SLAB_STORE_USER))
+ return;
+
+ print_track("Allocated", get_track(s, object, TRACK_ALLOC), pr_time);
+ print_track("Freed", get_track(s, object, TRACK_FREE), pr_time);
+}
+
+static void print_page_info(struct page *page)
+{
+ pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n",
+ page, page->objects, page->inuse, page->freelist, page->flags);
+
+}
+
+static void slab_bug(struct kmem_cache *s, char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ pr_err("=============================================================================\n");
+ pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf);
+ pr_err("-----------------------------------------------------------------------------\n\n");
+
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+ va_end(args);
+}
+
+static void slab_fix(struct kmem_cache *s, char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ pr_err("FIX %s: %pV\n", s->name, &vaf);
+ va_end(args);
+}
+
+static bool freelist_corrupted(struct kmem_cache *s, struct page *page,
+ void **freelist, void *nextfree)
+{
+ if ((s->flags & SLAB_CONSISTENCY_CHECKS) &&
+ !check_valid_pointer(s, page, nextfree) && freelist) {
+ object_err(s, page, *freelist, "Freechain corrupt");
+ *freelist = NULL;
+ slab_fix(s, "Isolate corrupted freechain");
+ return true;
+ }
+
+ return false;
+}
+
+static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
+{
+ unsigned int off; /* Offset of last byte */
+ u8 *addr = page_address(page);
+
+ print_tracking(s, p);
+
+ print_page_info(page);
+
+ pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
+ p, p - addr, get_freepointer(s, p));
+
+ if (s->flags & SLAB_RED_ZONE)
+ print_section(KERN_ERR, "Redzone ", p - s->red_left_pad,
+ s->red_left_pad);
+ else if (p > addr + 16)
+ print_section(KERN_ERR, "Bytes b4 ", p - 16, 16);
+
+ print_section(KERN_ERR, "Object ", p,
+ min_t(unsigned int, s->object_size, PAGE_SIZE));
+ if (s->flags & SLAB_RED_ZONE)
+ print_section(KERN_ERR, "Redzone ", p + s->object_size,
+ s->inuse - s->object_size);
+
+ off = get_info_end(s);
+
+ if (s->flags & SLAB_STORE_USER)
+ off += 2 * sizeof(struct track);
+
+ off += kasan_metadata_size(s);
+
+ if (off != size_from_object(s))
+ /* Beginning of the filler is the free pointer */
+ print_section(KERN_ERR, "Padding ", p + off,
+ size_from_object(s) - off);
+
+ dump_stack();
+}
+
+void object_err(struct kmem_cache *s, struct page *page,
+ u8 *object, char *reason)
+{
+ slab_bug(s, "%s", reason);
+ print_trailer(s, page, object);
+}
+
+static __printf(3, 4) void slab_err(struct kmem_cache *s, struct page *page,
+ const char *fmt, ...)
+{
+ va_list args;
+ char buf[100];
+
+ va_start(args, fmt);
+ vsnprintf(buf, sizeof(buf), fmt, args);
+ va_end(args);
+ slab_bug(s, "%s", buf);
+ print_page_info(page);
+ dump_stack();
+}
+
+static void init_object(struct kmem_cache *s, void *object, u8 val)
+{
+ u8 *p = object;
+
+ if (s->flags & SLAB_RED_ZONE)
+ memset(p - s->red_left_pad, val, s->red_left_pad);
+
+ if (s->flags & __OBJECT_POISON) {
+ memset(p, POISON_FREE, s->object_size - 1);
+ p[s->object_size - 1] = POISON_END;
+ }
+
+ if (s->flags & SLAB_RED_ZONE)
+ memset(p + s->object_size, val, s->inuse - s->object_size);
+}
+
+static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
+ void *from, void *to)
+{
+ slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data);
+ memset(from, data, to - from);
+}
+
+static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
+ u8 *object, char *what,
+ u8 *start, unsigned int value, unsigned int bytes)
+{
+ u8 *fault;
+ u8 *end;
+ u8 *addr = page_address(page);
+
+ metadata_access_enable();
+ fault = memchr_inv(start, value, bytes);
+ metadata_access_disable();
+ if (!fault)
+ return 1;
+
+ end = start + bytes;
+ while (end > fault && end[-1] == value)
+ end--;
+
+ slab_bug(s, "%s overwritten", what);
+ pr_err("INFO: 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
+ fault, end - 1, fault - addr,
+ fault[0], value);
+ print_trailer(s, page, object);
+
+ restore_bytes(s, what, value, fault, end);
+ return 0;
+}
+
+/*
+ * Object layout:
+ *
+ * object address
+ * Bytes of the object to be managed.
+ * If the freepointer may overlay the object then the free
+ * pointer is at the middle of the object.
+ *
+ * Poisoning uses 0x6b (POISON_FREE) and the last byte is
+ * 0xa5 (POISON_END)
+ *
+ * object + s->object_size
+ * Padding to reach word boundary. This is also used for Redzoning.
+ * Padding is extended by another word if Redzoning is enabled and
+ * object_size == inuse.
+ *
+ * We fill with 0xbb (RED_INACTIVE) for inactive objects and with
+ * 0xcc (RED_ACTIVE) for objects in use.
+ *
+ * object + s->inuse
+ * Meta data starts here.
+ *
+ * A. Free pointer (if we cannot overwrite object on free)
+ * B. Tracking data for SLAB_STORE_USER
+ * C. Padding to reach required alignment boundary or at mininum
+ * one word if debugging is on to be able to detect writes
+ * before the word boundary.
+ *
+ * Padding is done using 0x5a (POISON_INUSE)
+ *
+ * object + s->size
+ * Nothing is used beyond s->size.
+ *
+ * If slabcaches are merged then the object_size and inuse boundaries are mostly
+ * ignored. And therefore no slab options that rely on these boundaries
+ * may be used with merged slabcaches.
+ */
+
+static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
+{
+ unsigned long off = get_info_end(s); /* The end of info */
+
+ if (s->flags & SLAB_STORE_USER)
+ /* We also have user information there */
+ off += 2 * sizeof(struct track);
+
+ off += kasan_metadata_size(s);
+
+ if (size_from_object(s) == off)
+ return 1;
+
+ return check_bytes_and_report(s, page, p, "Object padding",
+ p + off, POISON_INUSE, size_from_object(s) - off);
+}
+
+/* Check the pad bytes at the end of a slab page */
+static int slab_pad_check(struct kmem_cache *s, struct page *page)
+{
+ u8 *start;
+ u8 *fault;
+ u8 *end;
+ u8 *pad;
+ int length;
+ int remainder;
+
+ if (!(s->flags & SLAB_POISON))
+ return 1;
+
+ start = page_address(page);
+ length = page_size(page);
+ end = start + length;
+ remainder = length % s->size;
+ if (!remainder)
+ return 1;
+
+ pad = end - remainder;
+ metadata_access_enable();
+ fault = memchr_inv(pad, POISON_INUSE, remainder);
+ metadata_access_disable();
+ if (!fault)
+ return 1;
+ while (end > fault && end[-1] == POISON_INUSE)
+ end--;
+
+ slab_err(s, page, "Padding overwritten. 0x%p-0x%p @offset=%tu",
+ fault, end - 1, fault - start);
+ print_section(KERN_ERR, "Padding ", pad, remainder);
+
+ restore_bytes(s, "slab padding", POISON_INUSE, fault, end);
+ return 0;
+}
+
+static int check_object(struct kmem_cache *s, struct page *page,
+ void *object, u8 val)
+{
+ u8 *p = object;
+ u8 *endobject = object + s->object_size;
+
+ if (s->flags & SLAB_RED_ZONE) {
+ if (!check_bytes_and_report(s, page, object, "Left Redzone",
+ object - s->red_left_pad, val, s->red_left_pad))
+ return 0;
+
+ if (!check_bytes_and_report(s, page, object, "Right Redzone",
+ endobject, val, s->inuse - s->object_size))
+ return 0;
+ } else {
+ if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
+ check_bytes_and_report(s, page, p, "Alignment padding",
+ endobject, POISON_INUSE,
+ s->inuse - s->object_size);
+ }
+ }
+
+ if (s->flags & SLAB_POISON) {
+ if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
+ (!check_bytes_and_report(s, page, p, "Poison", p,
+ POISON_FREE, s->object_size - 1) ||
+ !check_bytes_and_report(s, page, p, "End Poison",
+ p + s->object_size - 1, POISON_END, 1)))
+ return 0;
+ /*
+ * check_pad_bytes cleans up on its own.
+ */
+ check_pad_bytes(s, page, p);
+ }
+
+ if (!freeptr_outside_object(s) && val == SLUB_RED_ACTIVE)
+ /*
+ * Object and freepointer overlap. Cannot check
+ * freepointer while object is allocated.
+ */
+ return 1;
+
+ /* Check free pointer validity */
+ if (!check_valid_pointer(s, page, get_freepointer(s, p))) {
+ object_err(s, page, p, "Freepointer corrupt");
+ /*
+ * No choice but to zap it and thus lose the remainder
+ * of the free objects in this slab. May cause
+ * another error because the object count is now wrong.
+ */
+ set_freepointer(s, p, NULL);
+ return 0;
+ }
+ return 1;
+}
+
+static int check_slab(struct kmem_cache *s, struct page *page)
+{
+ int maxobj;
+
+ VM_BUG_ON(!irqs_disabled());
+
+ if (!PageSlab(page)) {
+ slab_err(s, page, "Not a valid slab page");
+ return 0;
+ }
+
+ maxobj = order_objects(compound_order(page), s->size);
+ if (page->objects > maxobj) {
+ slab_err(s, page, "objects %u > max %u",
+ page->objects, maxobj);
+ return 0;
+ }
+ if (page->inuse > page->objects) {
+ slab_err(s, page, "inuse %u > max %u",
+ page->inuse, page->objects);
+ return 0;
+ }
+ /* Slab_pad_check fixes things up after itself */
+ slab_pad_check(s, page);
+ return 1;
+}
+
+/*
+ * Determine if a certain object on a page is on the freelist. Must hold the
+ * slab lock to guarantee that the chains are in a consistent state.
+ */
+static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
+{
+ int nr = 0;
+ void *fp;
+ void *object = NULL;
+ int max_objects;
+
+ fp = page->freelist;
+ while (fp && nr <= page->objects) {
+ if (fp == search)
+ return 1;
+ if (!check_valid_pointer(s, page, fp)) {
+ if (object) {
+ object_err(s, page, object,
+ "Freechain corrupt");
+ set_freepointer(s, object, NULL);
+ } else {
+ slab_err(s, page, "Freepointer corrupt");
+ page->freelist = NULL;
+ page->inuse = page->objects;
+ slab_fix(s, "Freelist cleared");
+ return 0;
+ }
+ break;
+ }
+ object = fp;
+ fp = get_freepointer(s, object);
+ nr++;
+ }
+
+ max_objects = order_objects(compound_order(page), s->size);
+ if (max_objects > MAX_OBJS_PER_PAGE)
+ max_objects = MAX_OBJS_PER_PAGE;
+
+ if (page->objects != max_objects) {
+ slab_err(s, page, "Wrong number of objects. Found %d but should be %d",
+ page->objects, max_objects);
+ page->objects = max_objects;
+ slab_fix(s, "Number of objects adjusted.");
+ }
+ if (page->inuse != page->objects - nr) {
+ slab_err(s, page, "Wrong object count. Counter is %d but counted were %d",
+ page->inuse, page->objects - nr);
+ page->inuse = page->objects - nr;
+ slab_fix(s, "Object count adjusted.");
+ }
+ return search == NULL;
+}
+
+static void trace(struct kmem_cache *s, struct page *page, void *object,
+ int alloc)
+{
+ if (s->flags & SLAB_TRACE) {
+ pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
+ s->name,
+ alloc ? "alloc" : "free",
+ object, page->inuse,
+ page->freelist);
+
+ if (!alloc)
+ print_section(KERN_INFO, "Object ", (void *)object,
+ s->object_size);
+
+ dump_stack();
+ }
+}
+
+/*
+ * Tracking of fully allocated slabs for debugging purposes.
+ */
+static void add_full(struct kmem_cache *s,
+ struct kmem_cache_node *n, struct page *page)
+{
+ if (!(s->flags & SLAB_STORE_USER))
+ return;
+
+ lockdep_assert_held(&n->list_lock);
+ list_add(&page->slab_list, &n->full);
+}
+
+static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page)
+{
+ if (!(s->flags & SLAB_STORE_USER))
+ return;
+
+ lockdep_assert_held(&n->list_lock);
+ list_del(&page->slab_list);
+}
+
+/* Tracking of the number of slabs for debugging purposes */
+static inline unsigned long slabs_node(struct kmem_cache *s, int node)
+{
+ struct kmem_cache_node *n = get_node(s, node);
+
+ return atomic_long_read(&n->nr_slabs);
+}
+
+static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
+{
+ return atomic_long_read(&n->nr_slabs);
+}
+
+static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
+{
+ struct kmem_cache_node *n = get_node(s, node);
+
+ /*
+ * May be called early in order to allocate a slab for the
+ * kmem_cache_node structure. Solve the chicken-egg
+ * dilemma by deferring the increment of the count during
+ * bootstrap (see early_kmem_cache_node_alloc).
+ */
+ if (likely(n)) {
+ atomic_long_inc(&n->nr_slabs);
+ atomic_long_add(objects, &n->total_objects);
+ }
+}
+static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
+{
+ struct kmem_cache_node *n = get_node(s, node);
+
+ atomic_long_dec(&n->nr_slabs);
+ atomic_long_sub(objects, &n->total_objects);
+}
+
+/* Object debug checks for alloc/free paths */
+static void setup_object_debug(struct kmem_cache *s, struct page *page,
+ void *object)
+{
+ if (!kmem_cache_debug_flags(s, SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))
+ return;
+
+ init_object(s, object, SLUB_RED_INACTIVE);
+ init_tracking(s, object);
+}
+
+static
+void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr)
+{
+ if (!kmem_cache_debug_flags(s, SLAB_POISON))
+ return;
+
+ metadata_access_enable();
+ memset(addr, POISON_INUSE, page_size(page));
+ metadata_access_disable();
+}
+
+static inline int alloc_consistency_checks(struct kmem_cache *s,
+ struct page *page, void *object)
+{
+ if (!check_slab(s, page))
+ return 0;
+
+ if (!check_valid_pointer(s, page, object)) {
+ object_err(s, page, object, "Freelist Pointer check fails");
+ return 0;
+ }
+
+ if (!check_object(s, page, object, SLUB_RED_INACTIVE))
+ return 0;
+
+ return 1;
+}
+
+static noinline int alloc_debug_processing(struct kmem_cache *s,
+ struct page *page,
+ void *object, unsigned long addr)
+{
+ if (s->flags & SLAB_CONSISTENCY_CHECKS) {
+ if (!alloc_consistency_checks(s, page, object))
+ goto bad;
+ }
+
+ /* Success perform special debug activities for allocs */
+ if (s->flags & SLAB_STORE_USER)
+ set_track(s, object, TRACK_ALLOC, addr);
+ trace(s, page, object, 1);
+ init_object(s, object, SLUB_RED_ACTIVE);
+ return 1;
+
+bad:
+ if (PageSlab(page)) {
+ /*
+ * If this is a slab page then lets do the best we can
+ * to avoid issues in the future. Marking all objects
+ * as used avoids touching the remaining objects.
+ */
+ slab_fix(s, "Marking all objects used");
+ page->inuse = page->objects;
+ page->freelist = NULL;
+ }
+ return 0;
+}
+
+static inline int free_consistency_checks(struct kmem_cache *s,
+ struct page *page, void *object, unsigned long addr)
+{
+ if (!check_valid_pointer(s, page, object)) {
+ slab_err(s, page, "Invalid object pointer 0x%p", object);
+ return 0;
+ }
+
+ if (on_freelist(s, page, object)) {
+ object_err(s, page, object, "Object already free");
+ return 0;
+ }
+
+ if (!check_object(s, page, object, SLUB_RED_ACTIVE))
+ return 0;
+
+ if (unlikely(s != page->slab_cache)) {
+ if (!PageSlab(page)) {
+ slab_err(s, page, "Attempt to free object(0x%p) outside of slab",
+ object);
+ } else if (!page->slab_cache) {
+ pr_err("SLUB <none>: no slab for object 0x%p.\n",
+ object);
+ dump_stack();
+ } else
+ object_err(s, page, object,
+ "page slab pointer corrupt.");
+ return 0;
+ }
+ return 1;
+}
+
+/* Supports checking bulk free of a constructed freelist */
+static noinline int free_debug_processing(
+ struct kmem_cache *s, struct page *page,
+ void *head, void *tail, int bulk_cnt,
+ unsigned long addr)
+{
+ struct kmem_cache_node *n = get_node(s, page_to_nid(page));
+ void *object = head;
+ int cnt = 0;
+ unsigned long flags;
+ int ret = 0;
+
+ spin_lock_irqsave(&n->list_lock, flags);
+ slab_lock(page);
+
+ if (s->flags & SLAB_CONSISTENCY_CHECKS) {
+ if (!check_slab(s, page))
+ goto out;
+ }
+
+next_object:
+ cnt++;
+
+ if (s->flags & SLAB_CONSISTENCY_CHECKS) {
+ if (!free_consistency_checks(s, page, object, addr))
+ goto out;
+ }
+
+ if (s->flags & SLAB_STORE_USER)
+ set_track(s, object, TRACK_FREE, addr);
+ trace(s, page, object, 0);
+ /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
+ init_object(s, object, SLUB_RED_INACTIVE);
+
+ /* Reached end of constructed freelist yet? */
+ if (object != tail) {
+ object = get_freepointer(s, object);
+ goto next_object;
+ }
+ ret = 1;
+
+out:
+ if (cnt != bulk_cnt)
+ slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n",
+ bulk_cnt, cnt);
+
+ slab_unlock(page);
+ spin_unlock_irqrestore(&n->list_lock, flags);
+ if (!ret)
+ slab_fix(s, "Object at 0x%p not freed", object);
+ return ret;
+}
+
+/*
+ * Parse a block of slub_debug options. Blocks are delimited by ';'
+ *
+ * @str: start of block
+ * @flags: returns parsed flags, or DEBUG_DEFAULT_FLAGS if none specified
+ * @slabs: return start of list of slabs, or NULL when there's no list
+ * @init: assume this is initial parsing and not per-kmem-create parsing
+ *
+ * returns the start of next block if there's any, or NULL
+ */
+static char *
+parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init)
+{
+ bool higher_order_disable = false;
+
+ /* Skip any completely empty blocks */
+ while (*str && *str == ';')
+ str++;
+
+ if (*str == ',') {
+ /*
+ * No options but restriction on slabs. This means full
+ * debugging for slabs matching a pattern.
+ */
+ *flags = DEBUG_DEFAULT_FLAGS;
+ goto check_slabs;
+ }
+ *flags = 0;
+
+ /* Determine which debug features should be switched on */
+ for (; *str && *str != ',' && *str != ';'; str++) {
+ switch (tolower(*str)) {
+ case '-':
+ *flags = 0;
+ break;
+ case 'f':
+ *flags |= SLAB_CONSISTENCY_CHECKS;
+ break;
+ case 'z':
+ *flags |= SLAB_RED_ZONE;
+ break;
+ case 'p':
+ *flags |= SLAB_POISON;
+ break;
+ case 'u':
+ *flags |= SLAB_STORE_USER;
+ break;
+ case 't':
+ *flags |= SLAB_TRACE;
+ break;
+ case 'a':
+ *flags |= SLAB_FAILSLAB;
+ break;
+ case 'o':
+ /*
+ * Avoid enabling debugging on caches if its minimum
+ * order would increase as a result.
+ */
+ higher_order_disable = true;
+ break;
+ default:
+ if (init)
+ pr_err("slub_debug option '%c' unknown. skipped\n", *str);
+ }
+ }
+check_slabs:
+ if (*str == ',')
+ *slabs = ++str;
+ else
+ *slabs = NULL;
+
+ /* Skip over the slab list */
+ while (*str && *str != ';')
+ str++;
+
+ /* Skip any completely empty blocks */
+ while (*str && *str == ';')
+ str++;
+
+ if (init && higher_order_disable)
+ disable_higher_order_debug = 1;
+
+ if (*str)
+ return str;
+ else
+ return NULL;
+}
+
+static int __init setup_slub_debug(char *str)
+{
+ slab_flags_t flags;
+ char *saved_str;
+ char *slab_list;
+ bool global_slub_debug_changed = false;
+ bool slab_list_specified = false;
+
+ slub_debug = DEBUG_DEFAULT_FLAGS;
+ if (*str++ != '=' || !*str)
+ /*
+ * No options specified. Switch on full debugging.
+ */
+ goto out;
+
+ saved_str = str;
+ while (str) {
+ str = parse_slub_debug_flags(str, &flags, &slab_list, true);
+
+ if (!slab_list) {
+ slub_debug = flags;
+ global_slub_debug_changed = true;
+ } else {
+ slab_list_specified = true;
+ }
+ }
+
+ /*
+ * For backwards compatibility, a single list of flags with list of
+ * slabs means debugging is only enabled for those slabs, so the global
+ * slub_debug should be 0. We can extended that to multiple lists as
+ * long as there is no option specifying flags without a slab list.
+ */
+ if (slab_list_specified) {
+ if (!global_slub_debug_changed)
+ slub_debug = 0;
+ slub_debug_string = saved_str;
+ }
+out:
+ if (slub_debug != 0 || slub_debug_string)
+ static_branch_enable(&slub_debug_enabled);
+ if ((static_branch_unlikely(&init_on_alloc) ||
+ static_branch_unlikely(&init_on_free)) &&
+ (slub_debug & SLAB_POISON))
+ pr_info("mem auto-init: SLAB_POISON will take precedence over init_on_alloc/init_on_free\n");
+ return 1;
+}
+
+__setup("slub_debug", setup_slub_debug);
+
+/*
+ * kmem_cache_flags - apply debugging options to the cache
+ * @object_size: the size of an object without meta data
+ * @flags: flags to set
+ * @name: name of the cache
+ *
+ * Debug option(s) are applied to @flags. In addition to the debug
+ * option(s), if a slab name (or multiple) is specified i.e.
+ * slub_debug=<Debug-Options>,<slab name1>,<slab name2> ...
+ * then only the select slabs will receive the debug option(s).
+ */
+slab_flags_t kmem_cache_flags(unsigned int object_size,
+ slab_flags_t flags, const char *name)
+{
+ char *iter;
+ size_t len;
+ char *next_block;
+ slab_flags_t block_flags;
+
+ len = strlen(name);
+ next_block = slub_debug_string;
+ /* Go through all blocks of debug options, see if any matches our slab's name */
+ while (next_block) {
+ next_block = parse_slub_debug_flags(next_block, &block_flags, &iter, false);
+ if (!iter)
+ continue;
+ /* Found a block that has a slab list, search it */
+ while (*iter) {
+ char *end, *glob;
+ size_t cmplen;
+
+ end = strchrnul(iter, ',');
+ if (next_block && next_block < end)
+ end = next_block - 1;
+
+ glob = strnchr(iter, end - iter, '*');
+ if (glob)
+ cmplen = glob - iter;
+ else
+ cmplen = max_t(size_t, len, (end - iter));
+
+ if (!strncmp(name, iter, cmplen)) {
+ flags |= block_flags;
+ return flags;
+ }
+
+ if (!*end || *end == ';')
+ break;
+ iter = end + 1;
+ }
+ }
+
+ return flags | slub_debug;
+}
+#else /* !CONFIG_SLUB_DEBUG */
+static inline void setup_object_debug(struct kmem_cache *s,
+ struct page *page, void *object) {}
+static inline
+void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr) {}
+
+static inline int alloc_debug_processing(struct kmem_cache *s,
+ struct page *page, void *object, unsigned long addr) { return 0; }
+
+static inline int free_debug_processing(
+ struct kmem_cache *s, struct page *page,
+ void *head, void *tail, int bulk_cnt,
+ unsigned long addr) { return 0; }
+
+static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
+ { return 1; }
+static inline int check_object(struct kmem_cache *s, struct page *page,
+ void *object, u8 val) { return 1; }
+static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
+ struct page *page) {}
+static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
+ struct page *page) {}
+slab_flags_t kmem_cache_flags(unsigned int object_size,
+ slab_flags_t flags, const char *name)
+{
+ return flags;
+}
+#define slub_debug 0
+
+#define disable_higher_order_debug 0
+
+static inline unsigned long slabs_node(struct kmem_cache *s, int node)
+ { return 0; }
+static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
+ { return 0; }
+static inline void inc_slabs_node(struct kmem_cache *s, int node,
+ int objects) {}
+static inline void dec_slabs_node(struct kmem_cache *s, int node,
+ int objects) {}
+
+static bool freelist_corrupted(struct kmem_cache *s, struct page *page,
+ void **freelist, void *nextfree)
+{
+ return false;
+}
+#endif /* CONFIG_SLUB_DEBUG */
+
+/*
+ * Hooks for other subsystems that check memory allocations. In a typical
+ * production configuration these hooks all should produce no code at all.
+ */
+static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
+{
+ ptr = kasan_kmalloc_large(ptr, size, flags);
+ /* As ptr might get tagged, call kmemleak hook after KASAN. */
+ kmemleak_alloc(ptr, size, 1, flags);
+ return ptr;
+}
+
+static __always_inline void kfree_hook(void *x)
+{
+ kmemleak_free(x);
+ kasan_kfree_large(x, _RET_IP_);
+}
+
+static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x)
+{
+ kmemleak_free_recursive(x, s->flags);
+
+ /*
+ * Trouble is that we may no longer disable interrupts in the fast path
+ * So in order to make the debug calls that expect irqs to be
+ * disabled we need to disable interrupts temporarily.
+ */
+#ifdef CONFIG_LOCKDEP
+ {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ debug_check_no_locks_freed(x, s->object_size);
+ local_irq_restore(flags);
+ }
+#endif
+ if (!(s->flags & SLAB_DEBUG_OBJECTS))
+ debug_check_no_obj_freed(x, s->object_size);
+
+ /* Use KCSAN to help debug racy use-after-free. */
+ if (!(s->flags & SLAB_TYPESAFE_BY_RCU))
+ __kcsan_check_access(x, s->object_size,
+ KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
+
+ /* KASAN might put x into memory quarantine, delaying its reuse */
+ return kasan_slab_free(s, x, _RET_IP_);
+}
+
+static inline bool slab_free_freelist_hook(struct kmem_cache *s,
+ void **head, void **tail,
+ int *cnt)
+{
+
+ void *object;
+ void *next = *head;
+ void *old_tail = *tail ? *tail : *head;
+ int rsize;
+
+ /* Head and tail of the reconstructed freelist */
+ *head = NULL;
+ *tail = NULL;
+
+ do {
+ object = next;
+ next = get_freepointer(s, object);
+
+ if (slab_want_init_on_free(s)) {
+ /*
+ * Clear the object and the metadata, but don't touch
+ * the redzone.
+ */
+ memset(object, 0, s->object_size);
+ rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad
+ : 0;
+ memset((char *)object + s->inuse, 0,
+ s->size - s->inuse - rsize);
+
+ }
+ /* If object's reuse doesn't have to be delayed */
+ if (!slab_free_hook(s, object)) {
+ /* Move object to the new freelist */
+ set_freepointer(s, object, *head);
+ *head = object;
+ if (!*tail)
+ *tail = object;
+ } else {
+ /*
+ * Adjust the reconstructed freelist depth
+ * accordingly if object's reuse is delayed.
+ */
+ --(*cnt);
+ }
+ } while (object != old_tail);
+
+ if (*head == *tail)
+ *tail = NULL;
+
+ return *head != NULL;
+}
+
+static void *setup_object(struct kmem_cache *s, struct page *page,
+ void *object)
+{
+ setup_object_debug(s, page, object);
+ object = kasan_init_slab_obj(s, object);
+ if (unlikely(s->ctor)) {
+ kasan_unpoison_object_data(s, object);
+ s->ctor(object);
+ kasan_poison_object_data(s, object);
+ }
+ return object;
+}
+
+/*
+ * Slab allocation and freeing
+ */
+static inline struct page *alloc_slab_page(struct kmem_cache *s,
+ gfp_t flags, int node, struct kmem_cache_order_objects oo)
+{
+ struct page *page;
+ unsigned int order = oo_order(oo);
+
+ if (node == NUMA_NO_NODE)
+ page = alloc_pages(flags, order);
+ else
+ page = __alloc_pages_node(node, flags, order);
+
+ if (page)
+ account_slab_page(page, order, s);
+
+ return page;
+}
+
+#ifdef CONFIG_SLAB_FREELIST_RANDOM
+/* Pre-initialize the random sequence cache */
+static int init_cache_random_seq(struct kmem_cache *s)
+{
+ unsigned int count = oo_objects(s->oo);
+ int err;
+
+ /* Bailout if already initialised */
+ if (s->random_seq)
+ return 0;
+
+ err = cache_random_seq_create(s, count, GFP_KERNEL);
+ if (err) {
+ pr_err("SLUB: Unable to initialize free list for %s\n",
+ s->name);
+ return err;
+ }
+
+ /* Transform to an offset on the set of pages */
+ if (s->random_seq) {
+ unsigned int i;
+
+ for (i = 0; i < count; i++)
+ s->random_seq[i] *= s->size;
+ }
+ return 0;
+}
+
+/* Initialize each random sequence freelist per cache */
+static void __init init_freelist_randomization(void)
+{
+ struct kmem_cache *s;
+
+ mutex_lock(&slab_mutex);
+
+ list_for_each_entry(s, &slab_caches, list)
+ init_cache_random_seq(s);
+
+ mutex_unlock(&slab_mutex);
+}
+
+/* Get the next entry on the pre-computed freelist randomized */
+static void *next_freelist_entry(struct kmem_cache *s, struct page *page,
+ unsigned long *pos, void *start,
+ unsigned long page_limit,
+ unsigned long freelist_count)
+{
+ unsigned int idx;
+
+ /*
+ * If the target page allocation failed, the number of objects on the
+ * page might be smaller than the usual size defined by the cache.
+ */
+ do {
+ idx = s->random_seq[*pos];
+ *pos += 1;
+ if (*pos >= freelist_count)
+ *pos = 0;
+ } while (unlikely(idx >= page_limit));
+
+ return (char *)start + idx;
+}
+
+/* Shuffle the single linked freelist based on a random pre-computed sequence */
+static bool shuffle_freelist(struct kmem_cache *s, struct page *page)
+{
+ void *start;
+ void *cur;
+ void *next;
+ unsigned long idx, pos, page_limit, freelist_count;
+
+ if (page->objects < 2 || !s->random_seq)
+ return false;
+
+ freelist_count = oo_objects(s->oo);
+ pos = get_random_int() % freelist_count;
+
+ page_limit = page->objects * s->size;
+ start = fixup_red_left(s, page_address(page));
+
+ /* First entry is used as the base of the freelist */
+ cur = next_freelist_entry(s, page, &pos, start, page_limit,
+ freelist_count);
+ cur = setup_object(s, page, cur);
+ page->freelist = cur;
+
+ for (idx = 1; idx < page->objects; idx++) {
+ next = next_freelist_entry(s, page, &pos, start, page_limit,
+ freelist_count);
+ next = setup_object(s, page, next);
+ set_freepointer(s, cur, next);
+ cur = next;
+ }
+ set_freepointer(s, cur, NULL);
+
+ return true;
+}
+#else
+static inline int init_cache_random_seq(struct kmem_cache *s)
+{
+ return 0;
+}
+static inline void init_freelist_randomization(void) { }
+static inline bool shuffle_freelist(struct kmem_cache *s, struct page *page)
+{
+ return false;
+}
+#endif /* CONFIG_SLAB_FREELIST_RANDOM */
+
+static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
+{
+ struct page *page;
+ struct kmem_cache_order_objects oo = s->oo;
+ gfp_t alloc_gfp;
+ void *start, *p, *next;
+ int idx;
+ bool shuffle;
+
+ flags &= gfp_allowed_mask;
+
+ if (gfpflags_allow_blocking(flags))
+ local_irq_enable();
+
+ flags |= s->allocflags;
+
+ /*
+ * Let the initial higher-order allocation fail under memory pressure
+ * so we fall-back to the minimum order allocation.
+ */
+ alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
+ if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
+ alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~(__GFP_RECLAIM|__GFP_NOFAIL);
+
+ page = alloc_slab_page(s, alloc_gfp, node, oo);
+ if (unlikely(!page)) {
+ oo = s->min;
+ alloc_gfp = flags;
+ /*
+ * Allocation may have failed due to fragmentation.
+ * Try a lower order alloc if possible
+ */
+ page = alloc_slab_page(s, alloc_gfp, node, oo);
+ if (unlikely(!page))
+ goto out;
+ stat(s, ORDER_FALLBACK);
+ }
+
+ page->objects = oo_objects(oo);
+
+ page->slab_cache = s;
+ __SetPageSlab(page);
+ if (page_is_pfmemalloc(page))
+ SetPageSlabPfmemalloc(page);
+
+ kasan_poison_slab(page);
+
+ start = page_address(page);
+
+ setup_page_debug(s, page, start);
+
+ shuffle = shuffle_freelist(s, page);
+
+ if (!shuffle) {
+ start = fixup_red_left(s, start);
+ start = setup_object(s, page, start);
+ page->freelist = start;
+ for (idx = 0, p = start; idx < page->objects - 1; idx++) {
+ next = p + s->size;
+ next = setup_object(s, page, next);
+ set_freepointer(s, p, next);
+ p = next;
+ }
+ set_freepointer(s, p, NULL);
+ }
+
+ page->inuse = page->objects;
+ page->frozen = 1;
+
+out:
+ if (gfpflags_allow_blocking(flags))
+ local_irq_disable();
+ if (!page)
+ return NULL;
+
+ inc_slabs_node(s, page_to_nid(page), page->objects);
+
+ return page;
+}
+
+static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
+{
+ if (unlikely(flags & GFP_SLAB_BUG_MASK))
+ flags = kmalloc_fix_flags(flags);
+
+ return allocate_slab(s,
+ flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
+}
+
+static void __free_slab(struct kmem_cache *s, struct page *page)
+{
+ int order = compound_order(page);
+ int pages = 1 << order;
+
+ if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) {
+ void *p;
+
+ slab_pad_check(s, page);
+ for_each_object(p, s, page_address(page),
+ page->objects)
+ check_object(s, page, p, SLUB_RED_INACTIVE);
+ }
+
+ __ClearPageSlabPfmemalloc(page);
+ __ClearPageSlab(page);
+
+ page->mapping = NULL;
+ if (current->reclaim_state)
+ current->reclaim_state->reclaimed_slab += pages;
+ unaccount_slab_page(page, order, s);
+ __free_pages(page, order);
+}
+
+static void rcu_free_slab(struct rcu_head *h)
+{
+ struct page *page = container_of(h, struct page, rcu_head);
+
+ __free_slab(page->slab_cache, page);
+}
+
+static void free_slab(struct kmem_cache *s, struct page *page)
+{
+ if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) {
+ call_rcu(&page->rcu_head, rcu_free_slab);
+ } else
+ __free_slab(s, page);
+}
+
+static void discard_slab(struct kmem_cache *s, struct page *page)
+{
+ dec_slabs_node(s, page_to_nid(page), page->objects);
+ free_slab(s, page);
+}
+
+/*
+ * Management of partially allocated slabs.
+ */
+static inline void
+__add_partial(struct kmem_cache_node *n, struct page *page, int tail)
+{
+ n->nr_partial++;
+ if (tail == DEACTIVATE_TO_TAIL)
+ list_add_tail(&page->slab_list, &n->partial);
+ else
+ list_add(&page->slab_list, &n->partial);
+}
+
+static inline void add_partial(struct kmem_cache_node *n,
+ struct page *page, int tail)
+{
+ lockdep_assert_held(&n->list_lock);
+ __add_partial(n, page, tail);
+}
+
+static inline void remove_partial(struct kmem_cache_node *n,
+ struct page *page)
+{
+ lockdep_assert_held(&n->list_lock);
+ list_del(&page->slab_list);
+ n->nr_partial--;
+}
+
+/*
+ * Remove slab from the partial list, freeze it and
+ * return the pointer to the freelist.
+ *
+ * Returns a list of objects or NULL if it fails.
+ */
+static inline void *acquire_slab(struct kmem_cache *s,
+ struct kmem_cache_node *n, struct page *page,
+ int mode, int *objects)
+{
+ void *freelist;
+ unsigned long counters;
+ struct page new;
+
+ lockdep_assert_held(&n->list_lock);
+
+ /*
+ * Zap the freelist and set the frozen bit.
+ * The old freelist is the list of objects for the
+ * per cpu allocation list.
+ */
+ freelist = page->freelist;
+ counters = page->counters;
+ new.counters = counters;
+ *objects = new.objects - new.inuse;
+ if (mode) {
+ new.inuse = page->objects;
+ new.freelist = NULL;
+ } else {
+ new.freelist = freelist;
+ }
+
+ VM_BUG_ON(new.frozen);
+ new.frozen = 1;
+
+ if (!__cmpxchg_double_slab(s, page,
+ freelist, counters,
+ new.freelist, new.counters,
+ "acquire_slab"))
+ return NULL;
+
+ remove_partial(n, page);
+ WARN_ON(!freelist);
+ return freelist;
+}
+
+static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain);
+static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags);
+
+/*
+ * Try to allocate a partial slab from a specific node.
+ */
+static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
+ struct kmem_cache_cpu *c, gfp_t flags)
+{
+ struct page *page, *page2;
+ void *object = NULL;
+ unsigned int available = 0;
+ int objects;
+
+ /*
+ * Racy check. If we mistakenly see no partial slabs then we
+ * just allocate an empty slab. If we mistakenly try to get a
+ * partial slab and there is none available then get_partial()
+ * will return NULL.
+ */
+ if (!n || !n->nr_partial)
+ return NULL;
+
+ spin_lock(&n->list_lock);
+ list_for_each_entry_safe(page, page2, &n->partial, slab_list) {
+ void *t;
+
+ if (!pfmemalloc_match(page, flags))
+ continue;
+
+ t = acquire_slab(s, n, page, object == NULL, &objects);
+ if (!t)
+ break;
+
+ available += objects;
+ if (!object) {
+ c->page = page;
+ stat(s, ALLOC_FROM_PARTIAL);
+ object = t;
+ } else {
+ put_cpu_partial(s, page, 0);
+ stat(s, CPU_PARTIAL_NODE);
+ }
+ if (!kmem_cache_has_cpu_partial(s)
+ || available > slub_cpu_partial(s) / 2)
+ break;
+
+ }
+ spin_unlock(&n->list_lock);
+ return object;
+}
+
+/*
+ * Get a page from somewhere. Search in increasing NUMA distances.
+ */
+static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
+ struct kmem_cache_cpu *c)
+{
+#ifdef CONFIG_NUMA
+ struct zonelist *zonelist;
+ struct zoneref *z;
+ struct zone *zone;
+ enum zone_type highest_zoneidx = gfp_zone(flags);
+ void *object;
+ unsigned int cpuset_mems_cookie;
+
+ /*
+ * The defrag ratio allows a configuration of the tradeoffs between
+ * inter node defragmentation and node local allocations. A lower
+ * defrag_ratio increases the tendency to do local allocations
+ * instead of attempting to obtain partial slabs from other nodes.
+ *
+ * If the defrag_ratio is set to 0 then kmalloc() always
+ * returns node local objects. If the ratio is higher then kmalloc()
+ * may return off node objects because partial slabs are obtained
+ * from other nodes and filled up.
+ *
+ * If /sys/kernel/slab/xx/remote_node_defrag_ratio is set to 100
+ * (which makes defrag_ratio = 1000) then every (well almost)
+ * allocation will first attempt to defrag slab caches on other nodes.
+ * This means scanning over all nodes to look for partial slabs which
+ * may be expensive if we do it every time we are trying to find a slab
+ * with available objects.
+ */
+ if (!s->remote_node_defrag_ratio ||
+ get_cycles() % 1024 > s->remote_node_defrag_ratio)
+ return NULL;
+
+ do {
+ cpuset_mems_cookie = read_mems_allowed_begin();
+ zonelist = node_zonelist(mempolicy_slab_node(), flags);
+ for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
+ struct kmem_cache_node *n;
+
+ n = get_node(s, zone_to_nid(zone));
+
+ if (n && cpuset_zone_allowed(zone, flags) &&
+ n->nr_partial > s->min_partial) {
+ object = get_partial_node(s, n, c, flags);
+ if (object) {
+ /*
+ * Don't check read_mems_allowed_retry()
+ * here - if mems_allowed was updated in
+ * parallel, that was a harmless race
+ * between allocation and the cpuset
+ * update
+ */
+ return object;
+ }
+ }
+ }
+ } while (read_mems_allowed_retry(cpuset_mems_cookie));
+#endif /* CONFIG_NUMA */
+ return NULL;
+}
+
+/*
+ * Get a partial page, lock it and return it.
+ */
+static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
+ struct kmem_cache_cpu *c)
+{
+ void *object;
+ int searchnode = node;
+
+ if (node == NUMA_NO_NODE)
+ searchnode = numa_mem_id();
+
+ object = get_partial_node(s, get_node(s, searchnode), c, flags);
+ if (object || node != NUMA_NO_NODE)
+ return object;
+
+ return get_any_partial(s, flags, c);
+}
+
+#ifdef CONFIG_PREEMPTION
+/*
+ * Calculate the next globally unique transaction for disambiguation
+ * during cmpxchg. The transactions start with the cpu number and are then
+ * incremented by CONFIG_NR_CPUS.
+ */
+#define TID_STEP roundup_pow_of_two(CONFIG_NR_CPUS)
+#else
+/*
+ * No preemption supported therefore also no need to check for
+ * different cpus.
+ */
+#define TID_STEP 1
+#endif
+
+static inline unsigned long next_tid(unsigned long tid)
+{
+ return tid + TID_STEP;
+}
+
+#ifdef SLUB_DEBUG_CMPXCHG
+static inline unsigned int tid_to_cpu(unsigned long tid)
+{
+ return tid % TID_STEP;
+}
+
+static inline unsigned long tid_to_event(unsigned long tid)
+{
+ return tid / TID_STEP;
+}
+#endif
+
+static inline unsigned int init_tid(int cpu)
+{
+ return cpu;
+}
+
+static inline void note_cmpxchg_failure(const char *n,
+ const struct kmem_cache *s, unsigned long tid)
+{
+#ifdef SLUB_DEBUG_CMPXCHG
+ unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
+
+ pr_info("%s %s: cmpxchg redo ", n, s->name);
+
+#ifdef CONFIG_PREEMPTION
+ if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
+ pr_warn("due to cpu change %d -> %d\n",
+ tid_to_cpu(tid), tid_to_cpu(actual_tid));
+ else
+#endif
+ if (tid_to_event(tid) != tid_to_event(actual_tid))
+ pr_warn("due to cpu running other code. Event %ld->%ld\n",
+ tid_to_event(tid), tid_to_event(actual_tid));
+ else
+ pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n",
+ actual_tid, tid, next_tid(tid));
+#endif
+ stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
+}
+
+static void init_kmem_cache_cpus(struct kmem_cache *s)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
+}
+
+/*
+ * Remove the cpu slab
+ */
+static void deactivate_slab(struct kmem_cache *s, struct page *page,
+ void *freelist, struct kmem_cache_cpu *c)
+{
+ enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
+ struct kmem_cache_node *n = get_node(s, page_to_nid(page));
+ int lock = 0;
+ enum slab_modes l = M_NONE, m = M_NONE;
+ void *nextfree;
+ int tail = DEACTIVATE_TO_HEAD;
+ struct page new;
+ struct page old;
+
+ if (page->freelist) {
+ stat(s, DEACTIVATE_REMOTE_FREES);
+ tail = DEACTIVATE_TO_TAIL;
+ }
+
+ /*
+ * Stage one: Free all available per cpu objects back
+ * to the page freelist while it is still frozen. Leave the
+ * last one.
+ *
+ * There is no need to take the list->lock because the page
+ * is still frozen.
+ */
+ while (freelist && (nextfree = get_freepointer(s, freelist))) {
+ void *prior;
+ unsigned long counters;
+
+ /*
+ * If 'nextfree' is invalid, it is possible that the object at
+ * 'freelist' is already corrupted. So isolate all objects
+ * starting at 'freelist'.
+ */
+ if (freelist_corrupted(s, page, &freelist, nextfree))
+ break;
+
+ do {
+ prior = page->freelist;
+ counters = page->counters;
+ set_freepointer(s, freelist, prior);
+ new.counters = counters;
+ new.inuse--;
+ VM_BUG_ON(!new.frozen);
+
+ } while (!__cmpxchg_double_slab(s, page,
+ prior, counters,
+ freelist, new.counters,
+ "drain percpu freelist"));
+
+ freelist = nextfree;
+ }
+
+ /*
+ * Stage two: Ensure that the page is unfrozen while the
+ * list presence reflects the actual number of objects
+ * during unfreeze.
+ *
+ * We setup the list membership and then perform a cmpxchg
+ * with the count. If there is a mismatch then the page
+ * is not unfrozen but the page is on the wrong list.
+ *
+ * Then we restart the process which may have to remove
+ * the page from the list that we just put it on again
+ * because the number of objects in the slab may have
+ * changed.
+ */
+redo:
+
+ old.freelist = page->freelist;
+ old.counters = page->counters;
+ VM_BUG_ON(!old.frozen);
+
+ /* Determine target state of the slab */
+ new.counters = old.counters;
+ if (freelist) {
+ new.inuse--;
+ set_freepointer(s, freelist, old.freelist);
+ new.freelist = freelist;
+ } else
+ new.freelist = old.freelist;
+
+ new.frozen = 0;
+
+ if (!new.inuse && n->nr_partial >= s->min_partial)
+ m = M_FREE;
+ else if (new.freelist) {
+ m = M_PARTIAL;
+ if (!lock) {
+ lock = 1;
+ /*
+ * Taking the spinlock removes the possibility
+ * that acquire_slab() will see a slab page that
+ * is frozen
+ */
+ spin_lock(&n->list_lock);
+ }
+ } else {
+ m = M_FULL;
+#ifdef CONFIG_SLUB_DEBUG
+ if ((s->flags & SLAB_STORE_USER) && !lock) {
+ lock = 1;
+ /*
+ * This also ensures that the scanning of full
+ * slabs from diagnostic functions will not see
+ * any frozen slabs.
+ */
+ spin_lock(&n->list_lock);
+ }
+#endif
+ }
+
+ if (l != m) {
+ if (l == M_PARTIAL)
+ remove_partial(n, page);
+ else if (l == M_FULL)
+ remove_full(s, n, page);
+
+ if (m == M_PARTIAL)
+ add_partial(n, page, tail);
+ else if (m == M_FULL)
+ add_full(s, n, page);
+ }
+
+ l = m;
+ if (!__cmpxchg_double_slab(s, page,
+ old.freelist, old.counters,
+ new.freelist, new.counters,
+ "unfreezing slab"))
+ goto redo;
+
+ if (lock)
+ spin_unlock(&n->list_lock);
+
+ if (m == M_PARTIAL)
+ stat(s, tail);
+ else if (m == M_FULL)
+ stat(s, DEACTIVATE_FULL);
+ else if (m == M_FREE) {
+ stat(s, DEACTIVATE_EMPTY);
+ discard_slab(s, page);
+ stat(s, FREE_SLAB);
+ }
+
+ c->page = NULL;
+ c->freelist = NULL;
+ c->tid = next_tid(c->tid);
+}
+
+/*
+ * Unfreeze all the cpu partial slabs.
+ *
+ * This function must be called with interrupts disabled
+ * for the cpu using c (or some other guarantee must be there
+ * to guarantee no concurrent accesses).
+ */
+static void unfreeze_partials(struct kmem_cache *s,
+ struct kmem_cache_cpu *c)
+{
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+ struct kmem_cache_node *n = NULL, *n2 = NULL;
+ struct page *page, *discard_page = NULL;
+
+ while ((page = slub_percpu_partial(c))) {
+ struct page new;
+ struct page old;
+
+ slub_set_percpu_partial(c, page);
+
+ n2 = get_node(s, page_to_nid(page));
+ if (n != n2) {
+ if (n)
+ spin_unlock(&n->list_lock);
+
+ n = n2;
+ spin_lock(&n->list_lock);
+ }
+
+ do {
+
+ old.freelist = page->freelist;
+ old.counters = page->counters;
+ VM_BUG_ON(!old.frozen);
+
+ new.counters = old.counters;
+ new.freelist = old.freelist;
+
+ new.frozen = 0;
+
+ } while (!__cmpxchg_double_slab(s, page,
+ old.freelist, old.counters,
+ new.freelist, new.counters,
+ "unfreezing slab"));
+
+ if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) {
+ page->next = discard_page;
+ discard_page = page;
+ } else {
+ add_partial(n, page, DEACTIVATE_TO_TAIL);
+ stat(s, FREE_ADD_PARTIAL);
+ }
+ }
+
+ if (n)
+ spin_unlock(&n->list_lock);
+
+ while (discard_page) {
+ page = discard_page;
+ discard_page = discard_page->next;
+
+ stat(s, DEACTIVATE_EMPTY);
+ discard_slab(s, page);
+ stat(s, FREE_SLAB);
+ }
+#endif /* CONFIG_SLUB_CPU_PARTIAL */
+}
+
+/*
+ * Put a page that was just frozen (in __slab_free|get_partial_node) into a
+ * partial page slot if available.
+ *
+ * If we did not find a slot then simply move all the partials to the
+ * per node partial list.
+ */
+static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
+{
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+ struct page *oldpage;
+ int pages;
+ int pobjects;
+
+ preempt_disable();
+ do {
+ pages = 0;
+ pobjects = 0;
+ oldpage = this_cpu_read(s->cpu_slab->partial);
+
+ if (oldpage) {
+ pobjects = oldpage->pobjects;
+ pages = oldpage->pages;
+ if (drain && pobjects > slub_cpu_partial(s)) {
+ unsigned long flags;
+ /*
+ * partial array is full. Move the existing
+ * set to the per node partial list.
+ */
+ local_irq_save(flags);
+ unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
+ local_irq_restore(flags);
+ oldpage = NULL;
+ pobjects = 0;
+ pages = 0;
+ stat(s, CPU_PARTIAL_DRAIN);
+ }
+ }
+
+ pages++;
+ pobjects += page->objects - page->inuse;
+
+ page->pages = pages;
+ page->pobjects = pobjects;
+ page->next = oldpage;
+
+ } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
+ != oldpage);
+ if (unlikely(!slub_cpu_partial(s))) {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
+ local_irq_restore(flags);
+ }
+ preempt_enable();
+#endif /* CONFIG_SLUB_CPU_PARTIAL */
+}
+
+static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
+{
+ stat(s, CPUSLAB_FLUSH);
+ deactivate_slab(s, c->page, c->freelist, c);
+}
+
+/*
+ * Flush cpu slab.
+ *
+ * Called from IPI handler with interrupts disabled.
+ */
+static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
+{
+ struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
+
+ if (c->page)
+ flush_slab(s, c);
+
+ unfreeze_partials(s, c);
+}
+
+static void flush_cpu_slab(void *d)
+{
+ struct kmem_cache *s = d;
+
+ __flush_cpu_slab(s, smp_processor_id());
+}
+
+static bool has_cpu_slab(int cpu, void *info)
+{
+ struct kmem_cache *s = info;
+ struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
+
+ return c->page || slub_percpu_partial(c);
+}
+
+static void flush_all(struct kmem_cache *s)
+{
+ on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1);
+}
+
+/*
+ * Use the cpu notifier to insure that the cpu slabs are flushed when
+ * necessary.
+ */
+static int slub_cpu_dead(unsigned int cpu)
+{
+ struct kmem_cache *s;
+ unsigned long flags;
+
+ mutex_lock(&slab_mutex);
+ list_for_each_entry(s, &slab_caches, list) {
+ local_irq_save(flags);
+ __flush_cpu_slab(s, cpu);
+ local_irq_restore(flags);
+ }
+ mutex_unlock(&slab_mutex);
+ return 0;
+}
+
+/*
+ * Check if the objects in a per cpu structure fit numa
+ * locality expectations.
+ */
+static inline int node_match(struct page *page, int node)
+{
+#ifdef CONFIG_NUMA
+ if (node != NUMA_NO_NODE && page_to_nid(page) != node)
+ return 0;
+#endif
+ return 1;
+}
+
+#ifdef CONFIG_SLUB_DEBUG
+static int count_free(struct page *page)
+{
+ return page->objects - page->inuse;
+}
+
+static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
+{
+ return atomic_long_read(&n->total_objects);
+}
+#endif /* CONFIG_SLUB_DEBUG */
+
+#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS)
+static unsigned long count_partial(struct kmem_cache_node *n,
+ int (*get_count)(struct page *))
+{
+ unsigned long flags;
+ unsigned long x = 0;
+ struct page *page;
+
+ spin_lock_irqsave(&n->list_lock, flags);
+ list_for_each_entry(page, &n->partial, slab_list)
+ x += get_count(page);
+ spin_unlock_irqrestore(&n->list_lock, flags);
+ return x;
+}
+#endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
+
+static noinline void
+slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
+{
+#ifdef CONFIG_SLUB_DEBUG
+ static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+ int node;
+ struct kmem_cache_node *n;
+
+ if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs))
+ return;
+
+ pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n",
+ nid, gfpflags, &gfpflags);
+ pr_warn(" cache: %s, object size: %u, buffer size: %u, default order: %u, min order: %u\n",
+ s->name, s->object_size, s->size, oo_order(s->oo),
+ oo_order(s->min));
+
+ if (oo_order(s->min) > get_order(s->object_size))
+ pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n",
+ s->name);
+
+ for_each_kmem_cache_node(s, node, n) {
+ unsigned long nr_slabs;
+ unsigned long nr_objs;
+ unsigned long nr_free;
+
+ nr_free = count_partial(n, count_free);
+ nr_slabs = node_nr_slabs(n);
+ nr_objs = node_nr_objs(n);
+
+ pr_warn(" node %d: slabs: %ld, objs: %ld, free: %ld\n",
+ node, nr_slabs, nr_objs, nr_free);
+ }
+#endif
+}
+
+static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
+ int node, struct kmem_cache_cpu **pc)
+{
+ void *freelist;
+ struct kmem_cache_cpu *c = *pc;
+ struct page *page;
+
+ WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));
+
+ freelist = get_partial(s, flags, node, c);
+
+ if (freelist)
+ return freelist;
+
+ page = new_slab(s, flags, node);
+ if (page) {
+ c = raw_cpu_ptr(s->cpu_slab);
+ if (c->page)
+ flush_slab(s, c);
+
+ /*
+ * No other reference to the page yet so we can
+ * muck around with it freely without cmpxchg
+ */
+ freelist = page->freelist;
+ page->freelist = NULL;
+
+ stat(s, ALLOC_SLAB);
+ c->page = page;
+ *pc = c;
+ }
+
+ return freelist;
+}
+
+static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags)
+{
+ if (unlikely(PageSlabPfmemalloc(page)))
+ return gfp_pfmemalloc_allowed(gfpflags);
+
+ return true;
+}
+
+/*
+ * Check the page->freelist of a page and either transfer the freelist to the
+ * per cpu freelist or deactivate the page.
+ *
+ * The page is still frozen if the return value is not NULL.
+ *
+ * If this function returns NULL then the page has been unfrozen.
+ *
+ * This function must be called with interrupt disabled.
+ */
+static inline void *get_freelist(struct kmem_cache *s, struct page *page)
+{
+ struct page new;
+ unsigned long counters;
+ void *freelist;
+
+ do {
+ freelist = page->freelist;
+ counters = page->counters;
+
+ new.counters = counters;
+ VM_BUG_ON(!new.frozen);
+
+ new.inuse = page->objects;
+ new.frozen = freelist != NULL;
+
+ } while (!__cmpxchg_double_slab(s, page,
+ freelist, counters,
+ NULL, new.counters,
+ "get_freelist"));
+
+ return freelist;
+}
+
+/*
+ * Slow path. The lockless freelist is empty or we need to perform
+ * debugging duties.
+ *
+ * Processing is still very fast if new objects have been freed to the
+ * regular freelist. In that case we simply take over the regular freelist
+ * as the lockless freelist and zap the regular freelist.
+ *
+ * If that is not working then we fall back to the partial lists. We take the
+ * first element of the freelist as the object to allocate now and move the
+ * rest of the freelist to the lockless freelist.
+ *
+ * And if we were unable to get a new slab from the partial slab lists then
+ * we need to allocate a new slab. This is the slowest path since it involves
+ * a call to the page allocator and the setup of a new slab.
+ *
+ * Version of __slab_alloc to use when we know that interrupts are
+ * already disabled (which is the case for bulk allocation).
+ */
+static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
+ unsigned long addr, struct kmem_cache_cpu *c)
+{
+ void *freelist;
+ struct page *page;
+
+ stat(s, ALLOC_SLOWPATH);
+
+ page = c->page;
+ if (!page) {
+ /*
+ * if the node is not online or has no normal memory, just
+ * ignore the node constraint
+ */
+ if (unlikely(node != NUMA_NO_NODE &&
+ !node_state(node, N_NORMAL_MEMORY)))
+ node = NUMA_NO_NODE;
+ goto new_slab;
+ }
+redo:
+
+ if (unlikely(!node_match(page, node))) {
+ /*
+ * same as above but node_match() being false already
+ * implies node != NUMA_NO_NODE
+ */
+ if (!node_state(node, N_NORMAL_MEMORY)) {
+ node = NUMA_NO_NODE;
+ goto redo;
+ } else {
+ stat(s, ALLOC_NODE_MISMATCH);
+ deactivate_slab(s, page, c->freelist, c);
+ goto new_slab;
+ }
+ }
+
+ /*
+ * By rights, we should be searching for a slab page that was
+ * PFMEMALLOC but right now, we are losing the pfmemalloc
+ * information when the page leaves the per-cpu allocator
+ */
+ if (unlikely(!pfmemalloc_match(page, gfpflags))) {
+ deactivate_slab(s, page, c->freelist, c);
+ goto new_slab;
+ }
+
+ /* must check again c->freelist in case of cpu migration or IRQ */
+ freelist = c->freelist;
+ if (freelist)
+ goto load_freelist;
+
+ freelist = get_freelist(s, page);
+
+ if (!freelist) {
+ c->page = NULL;
+ c->tid = next_tid(c->tid);
+ stat(s, DEACTIVATE_BYPASS);
+ goto new_slab;
+ }
+
+ stat(s, ALLOC_REFILL);
+
+load_freelist:
+ /*
+ * freelist is pointing to the list of objects to be used.
+ * page is pointing to the page from which the objects are obtained.
+ * That page must be frozen for per cpu allocations to work.
+ */
+ VM_BUG_ON(!c->page->frozen);
+ c->freelist = get_freepointer(s, freelist);
+ c->tid = next_tid(c->tid);
+ return freelist;
+
+new_slab:
+
+ if (slub_percpu_partial(c)) {
+ page = c->page = slub_percpu_partial(c);
+ slub_set_percpu_partial(c, page);
+ stat(s, CPU_PARTIAL_ALLOC);
+ goto redo;
+ }
+
+ freelist = new_slab_objects(s, gfpflags, node, &c);
+
+ if (unlikely(!freelist)) {
+ slab_out_of_memory(s, gfpflags, node);
+ return NULL;
+ }
+
+ page = c->page;
+ if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
+ goto load_freelist;
+
+ /* Only entered in the debug case */
+ if (kmem_cache_debug(s) &&
+ !alloc_debug_processing(s, page, freelist, addr))
+ goto new_slab; /* Slab failed checks. Next slab needed */
+
+ deactivate_slab(s, page, get_freepointer(s, freelist), c);
+ return freelist;
+}
+
+/*
+ * Another one that disabled interrupt and compensates for possible
+ * cpu changes by refetching the per cpu area pointer.
+ */
+static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
+ unsigned long addr, struct kmem_cache_cpu *c)
+{
+ void *p;
+ unsigned long flags;
+
+ local_irq_save(flags);
+#ifdef CONFIG_PREEMPTION
+ /*
+ * We may have been preempted and rescheduled on a different
+ * cpu before disabling interrupts. Need to reload cpu area
+ * pointer.
+ */
+ c = this_cpu_ptr(s->cpu_slab);
+#endif
+
+ p = ___slab_alloc(s, gfpflags, node, addr, c);
+ local_irq_restore(flags);
+ return p;
+}
+
+/*
+ * If the object has been wiped upon free, make sure it's fully initialized by
+ * zeroing out freelist pointer.
+ */
+static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
+ void *obj)
+{
+ if (unlikely(slab_want_init_on_free(s)) && obj)
+ memset((void *)((char *)obj + s->offset), 0, sizeof(void *));
+}
+
+/*
+ * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
+ * have the fastpath folded into their functions. So no function call
+ * overhead for requests that can be satisfied on the fastpath.
+ *
+ * The fastpath works by first checking if the lockless freelist can be used.
+ * If not then __slab_alloc is called for slow processing.
+ *
+ * Otherwise we can simply pick the next object from the lockless free list.
+ */
+static __always_inline void *slab_alloc_node(struct kmem_cache *s,
+ gfp_t gfpflags, int node, unsigned long addr)
+{
+ void *object;
+ struct kmem_cache_cpu *c;
+ struct page *page;
+ unsigned long tid;
+ struct obj_cgroup *objcg = NULL;
+
+ s = slab_pre_alloc_hook(s, &objcg, 1, gfpflags);
+ if (!s)
+ return NULL;
+redo:
+ /*
+ * Must read kmem_cache cpu data via this cpu ptr. Preemption is
+ * enabled. We may switch back and forth between cpus while
+ * reading from one cpu area. That does not matter as long
+ * as we end up on the original cpu again when doing the cmpxchg.
+ *
+ * We should guarantee that tid and kmem_cache are retrieved on
+ * the same cpu. It could be different if CONFIG_PREEMPTION so we need
+ * to check if it is matched or not.
+ */
+ do {
+ tid = this_cpu_read(s->cpu_slab->tid);
+ c = raw_cpu_ptr(s->cpu_slab);
+ } while (IS_ENABLED(CONFIG_PREEMPTION) &&
+ unlikely(tid != READ_ONCE(c->tid)));
+
+ /*
+ * Irqless object alloc/free algorithm used here depends on sequence
+ * of fetching cpu_slab's data. tid should be fetched before anything
+ * on c to guarantee that object and page associated with previous tid
+ * won't be used with current tid. If we fetch tid first, object and
+ * page could be one associated with next tid and our alloc/free
+ * request will be failed. In this case, we will retry. So, no problem.
+ */
+ barrier();
+
+ /*
+ * The transaction ids are globally unique per cpu and per operation on
+ * a per cpu queue. Thus they can be guarantee that the cmpxchg_double
+ * occurs on the right processor and that there was no operation on the
+ * linked list in between.
+ */
+
+ object = c->freelist;
+ page = c->page;
+ if (unlikely(!object || !page || !node_match(page, node))) {
+ object = __slab_alloc(s, gfpflags, node, addr, c);
+ } else {
+ void *next_object = get_freepointer_safe(s, object);
+
+ /*
+ * The cmpxchg will only match if there was no additional
+ * operation and if we are on the right processor.
+ *
+ * The cmpxchg does the following atomically (without lock
+ * semantics!)
+ * 1. Relocate first pointer to the current per cpu area.
+ * 2. Verify that tid and freelist have not been changed
+ * 3. If they were not changed replace tid and freelist
+ *
+ * Since this is without lock semantics the protection is only
+ * against code executing on this cpu *not* from access by
+ * other cpus.
+ */
+ if (unlikely(!this_cpu_cmpxchg_double(
+ s->cpu_slab->freelist, s->cpu_slab->tid,
+ object, tid,
+ next_object, next_tid(tid)))) {
+
+ note_cmpxchg_failure("slab_alloc", s, tid);
+ goto redo;
+ }
+ prefetch_freepointer(s, next_object);
+ stat(s, ALLOC_FASTPATH);
+ }
+
+ maybe_wipe_obj_freeptr(s, object);
+
+ if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object)
+ memset(object, 0, s->object_size);
+
+ slab_post_alloc_hook(s, objcg, gfpflags, 1, &object);
+
+ return object;
+}
+
+static __always_inline void *slab_alloc(struct kmem_cache *s,
+ gfp_t gfpflags, unsigned long addr)
+{
+ return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr);
+}
+
+void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
+{
+ void *ret = slab_alloc(s, gfpflags, _RET_IP_);
+
+ trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size,
+ s->size, gfpflags);
+
+ return ret;
+}
+EXPORT_SYMBOL(kmem_cache_alloc);
+
+#ifdef CONFIG_TRACING
+void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
+{
+ void *ret = slab_alloc(s, gfpflags, _RET_IP_);
+ trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
+ ret = kasan_kmalloc(s, ret, size, gfpflags);
+ return ret;
+}
+EXPORT_SYMBOL(kmem_cache_alloc_trace);
+#endif
+
+#ifdef CONFIG_NUMA
+void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
+{
+ void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
+
+ trace_kmem_cache_alloc_node(_RET_IP_, ret,
+ s->object_size, s->size, gfpflags, node);
+
+ return ret;
+}
+EXPORT_SYMBOL(kmem_cache_alloc_node);
+
+#ifdef CONFIG_TRACING
+void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
+ gfp_t gfpflags,
+ int node, size_t size)
+{
+ void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
+
+ trace_kmalloc_node(_RET_IP_, ret,
+ size, s->size, gfpflags, node);
+
+ ret = kasan_kmalloc(s, ret, size, gfpflags);
+ return ret;
+}
+EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
+#endif
+#endif /* CONFIG_NUMA */
+
+/*
+ * Slow path handling. This may still be called frequently since objects
+ * have a longer lifetime than the cpu slabs in most processing loads.
+ *
+ * So we still attempt to reduce cache line usage. Just take the slab
+ * lock and free the item. If there is no additional partial page
+ * handling required then we can return immediately.
+ */
+static void __slab_free(struct kmem_cache *s, struct page *page,
+ void *head, void *tail, int cnt,
+ unsigned long addr)
+
+{
+ void *prior;
+ int was_frozen;
+ struct page new;
+ unsigned long counters;
+ struct kmem_cache_node *n = NULL;
+ unsigned long flags;
+
+ stat(s, FREE_SLOWPATH);
+
+ if (kmem_cache_debug(s) &&
+ !free_debug_processing(s, page, head, tail, cnt, addr))
+ return;
+
+ do {
+ if (unlikely(n)) {
+ spin_unlock_irqrestore(&n->list_lock, flags);
+ n = NULL;
+ }
+ prior = page->freelist;
+ counters = page->counters;
+ set_freepointer(s, tail, prior);
+ new.counters = counters;
+ was_frozen = new.frozen;
+ new.inuse -= cnt;
+ if ((!new.inuse || !prior) && !was_frozen) {
+
+ if (kmem_cache_has_cpu_partial(s) && !prior) {
+
+ /*
+ * Slab was on no list before and will be
+ * partially empty
+ * We can defer the list move and instead
+ * freeze it.
+ */
+ new.frozen = 1;
+
+ } else { /* Needs to be taken off a list */
+
+ n = get_node(s, page_to_nid(page));
+ /*
+ * Speculatively acquire the list_lock.
+ * If the cmpxchg does not succeed then we may
+ * drop the list_lock without any processing.
+ *
+ * Otherwise the list_lock will synchronize with
+ * other processors updating the list of slabs.
+ */
+ spin_lock_irqsave(&n->list_lock, flags);
+
+ }
+ }
+
+ } while (!cmpxchg_double_slab(s, page,
+ prior, counters,
+ head, new.counters,
+ "__slab_free"));
+
+ if (likely(!n)) {
+
+ if (likely(was_frozen)) {
+ /*
+ * The list lock was not taken therefore no list
+ * activity can be necessary.
+ */
+ stat(s, FREE_FROZEN);
+ } else if (new.frozen) {
+ /*
+ * If we just froze the page then put it onto the
+ * per cpu partial list.
+ */
+ put_cpu_partial(s, page, 1);
+ stat(s, CPU_PARTIAL_FREE);
+ }
+
+ return;
+ }
+
+ if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
+ goto slab_empty;
+
+ /*
+ * Objects left in the slab. If it was not on the partial list before
+ * then add it.
+ */
+ if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
+ remove_full(s, n, page);
+ add_partial(n, page, DEACTIVATE_TO_TAIL);
+ stat(s, FREE_ADD_PARTIAL);
+ }
+ spin_unlock_irqrestore(&n->list_lock, flags);
+ return;
+
+slab_empty:
+ if (prior) {
+ /*
+ * Slab on the partial list.
+ */
+ remove_partial(n, page);
+ stat(s, FREE_REMOVE_PARTIAL);
+ } else {
+ /* Slab must be on the full list */
+ remove_full(s, n, page);
+ }
+
+ spin_unlock_irqrestore(&n->list_lock, flags);
+ stat(s, FREE_SLAB);
+ discard_slab(s, page);
+}
+
+/*
+ * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
+ * can perform fastpath freeing without additional function calls.
+ *
+ * The fastpath is only possible if we are freeing to the current cpu slab
+ * of this processor. This typically the case if we have just allocated
+ * the item before.
+ *
+ * If fastpath is not possible then fall back to __slab_free where we deal
+ * with all sorts of special processing.
+ *
+ * Bulk free of a freelist with several objects (all pointing to the
+ * same page) possible by specifying head and tail ptr, plus objects
+ * count (cnt). Bulk free indicated by tail pointer being set.
+ */
+static __always_inline void do_slab_free(struct kmem_cache *s,
+ struct page *page, void *head, void *tail,
+ int cnt, unsigned long addr)
+{
+ void *tail_obj = tail ? : head;
+ struct kmem_cache_cpu *c;
+ unsigned long tid;
+
+ /* memcg_slab_free_hook() is already called for bulk free. */
+ if (!tail)
+ memcg_slab_free_hook(s, &head, 1);
+redo:
+ /*
+ * Determine the currently cpus per cpu slab.
+ * The cpu may change afterward. However that does not matter since
+ * data is retrieved via this pointer. If we are on the same cpu
+ * during the cmpxchg then the free will succeed.
+ */
+ do {
+ tid = this_cpu_read(s->cpu_slab->tid);
+ c = raw_cpu_ptr(s->cpu_slab);
+ } while (IS_ENABLED(CONFIG_PREEMPTION) &&
+ unlikely(tid != READ_ONCE(c->tid)));
+
+ /* Same with comment on barrier() in slab_alloc_node() */
+ barrier();
+
+ if (likely(page == c->page)) {
+ void **freelist = READ_ONCE(c->freelist);
+
+ set_freepointer(s, tail_obj, freelist);
+
+ if (unlikely(!this_cpu_cmpxchg_double(
+ s->cpu_slab->freelist, s->cpu_slab->tid,
+ freelist, tid,
+ head, next_tid(tid)))) {
+
+ note_cmpxchg_failure("slab_free", s, tid);
+ goto redo;
+ }
+ stat(s, FREE_FASTPATH);
+ } else
+ __slab_free(s, page, head, tail_obj, cnt, addr);
+
+}
+
+static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
+ void *head, void *tail, int cnt,
+ unsigned long addr)
+{
+ /*
+ * With KASAN enabled slab_free_freelist_hook modifies the freelist
+ * to remove objects, whose reuse must be delayed.
+ */
+ if (slab_free_freelist_hook(s, &head, &tail, &cnt))
+ do_slab_free(s, page, head, tail, cnt, addr);
+}
+
+#ifdef CONFIG_KASAN_GENERIC
+void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
+{
+ do_slab_free(cache, virt_to_head_page(x), x, NULL, 1, addr);
+}
+#endif
+
+void kmem_cache_free(struct kmem_cache *s, void *x)
+{
+ s = cache_from_obj(s, x);
+ if (!s)
+ return;
+ slab_free(s, virt_to_head_page(x), x, NULL, 1, _RET_IP_);
+ trace_kmem_cache_free(_RET_IP_, x);
+}
+EXPORT_SYMBOL(kmem_cache_free);
+
+struct detached_freelist {
+ struct page *page;
+ void *tail;
+ void *freelist;
+ int cnt;
+ struct kmem_cache *s;
+};
+
+/*
+ * This function progressively scans the array with free objects (with
+ * a limited look ahead) and extract objects belonging to the same
+ * page. It builds a detached freelist directly within the given
+ * page/objects. This can happen without any need for
+ * synchronization, because the objects are owned by running process.
+ * The freelist is build up as a single linked list in the objects.
+ * The idea is, that this detached freelist can then be bulk
+ * transferred to the real freelist(s), but only requiring a single
+ * synchronization primitive. Look ahead in the array is limited due
+ * to performance reasons.
+ */
+static inline
+int build_detached_freelist(struct kmem_cache *s, size_t size,
+ void **p, struct detached_freelist *df)
+{
+ size_t first_skipped_index = 0;
+ int lookahead = 3;
+ void *object;
+ struct page *page;
+
+ /* Always re-init detached_freelist */
+ df->page = NULL;
+
+ do {
+ object = p[--size];
+ /* Do we need !ZERO_OR_NULL_PTR(object) here? (for kfree) */
+ } while (!object && size);
+
+ if (!object)
+ return 0;
+
+ page = virt_to_head_page(object);
+ if (!s) {
+ /* Handle kalloc'ed objects */
+ if (unlikely(!PageSlab(page))) {
+ BUG_ON(!PageCompound(page));
+ kfree_hook(object);
+ __free_pages(page, compound_order(page));
+ p[size] = NULL; /* mark object processed */
+ return size;
+ }
+ /* Derive kmem_cache from object */
+ df->s = page->slab_cache;
+ } else {
+ df->s = cache_from_obj(s, object); /* Support for memcg */
+ }
+
+ /* Start new detached freelist */
+ df->page = page;
+ set_freepointer(df->s, object, NULL);
+ df->tail = object;
+ df->freelist = object;
+ p[size] = NULL; /* mark object processed */
+ df->cnt = 1;
+
+ while (size) {
+ object = p[--size];
+ if (!object)
+ continue; /* Skip processed objects */
+
+ /* df->page is always set at this point */
+ if (df->page == virt_to_head_page(object)) {
+ /* Opportunity build freelist */
+ set_freepointer(df->s, object, df->freelist);
+ df->freelist = object;
+ df->cnt++;
+ p[size] = NULL; /* mark object processed */
+
+ continue;
+ }
+
+ /* Limit look ahead search */
+ if (!--lookahead)
+ break;
+
+ if (!first_skipped_index)
+ first_skipped_index = size + 1;
+ }
+
+ return first_skipped_index;
+}
+
+/* Note that interrupts must be enabled when calling this function. */
+void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
+{
+ if (WARN_ON(!size))
+ return;
+
+ memcg_slab_free_hook(s, p, size);
+ do {
+ struct detached_freelist df;
+
+ size = build_detached_freelist(s, size, p, &df);
+ if (!df.page)
+ continue;
+
+ slab_free(df.s, df.page, df.freelist, df.tail, df.cnt,_RET_IP_);
+ } while (likely(size));
+}
+EXPORT_SYMBOL(kmem_cache_free_bulk);
+
+/* Note that interrupts must be enabled when calling this function. */
+int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
+ void **p)
+{
+ struct kmem_cache_cpu *c;
+ int i;
+ struct obj_cgroup *objcg = NULL;
+
+ /* memcg and kmem_cache debug support */
+ s = slab_pre_alloc_hook(s, &objcg, size, flags);
+ if (unlikely(!s))
+ return false;
+ /*
+ * Drain objects in the per cpu slab, while disabling local
+ * IRQs, which protects against PREEMPT and interrupts
+ * handlers invoking normal fastpath.
+ */
+ local_irq_disable();
+ c = this_cpu_ptr(s->cpu_slab);
+
+ for (i = 0; i < size; i++) {
+ void *object = c->freelist;
+
+ if (unlikely(!object)) {
+ /*
+ * We may have removed an object from c->freelist using
+ * the fastpath in the previous iteration; in that case,
+ * c->tid has not been bumped yet.
+ * Since ___slab_alloc() may reenable interrupts while
+ * allocating memory, we should bump c->tid now.
+ */
+ c->tid = next_tid(c->tid);
+
+ /*
+ * Invoking slow path likely have side-effect
+ * of re-populating per CPU c->freelist
+ */
+ p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
+ _RET_IP_, c);
+ if (unlikely(!p[i]))
+ goto error;
+
+ c = this_cpu_ptr(s->cpu_slab);
+ maybe_wipe_obj_freeptr(s, p[i]);
+
+ continue; /* goto for-loop */
+ }
+ c->freelist = get_freepointer(s, object);
+ p[i] = object;
+ maybe_wipe_obj_freeptr(s, p[i]);
+ }
+ c->tid = next_tid(c->tid);
+ local_irq_enable();
+
+ /* Clear memory outside IRQ disabled fastpath loop */
+ if (unlikely(slab_want_init_on_alloc(flags, s))) {
+ int j;
+
+ for (j = 0; j < i; j++)
+ memset(p[j], 0, s->object_size);
+ }
+
+ /* memcg and kmem_cache debug support */
+ slab_post_alloc_hook(s, objcg, flags, size, p);
+ return i;
+error:
+ local_irq_enable();
+ slab_post_alloc_hook(s, objcg, flags, i, p);
+ __kmem_cache_free_bulk(s, i, p);
+ return 0;
+}
+EXPORT_SYMBOL(kmem_cache_alloc_bulk);
+
+
+/*
+ * Object placement in a slab is made very easy because we always start at
+ * offset 0. If we tune the size of the object to the alignment then we can
+ * get the required alignment by putting one properly sized object after
+ * another.
+ *
+ * Notice that the allocation order determines the sizes of the per cpu
+ * caches. Each processor has always one slab available for allocations.
+ * Increasing the allocation order reduces the number of times that slabs
+ * must be moved on and off the partial lists and is therefore a factor in
+ * locking overhead.
+ */
+
+/*
+ * Mininum / Maximum order of slab pages. This influences locking overhead
+ * and slab fragmentation. A higher order reduces the number of partial slabs
+ * and increases the number of allocations possible without having to
+ * take the list_lock.
+ */
+static unsigned int slub_min_order;
+static unsigned int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
+static unsigned int slub_min_objects;
+
+/*
+ * Calculate the order of allocation given an slab object size.
+ *
+ * The order of allocation has significant impact on performance and other
+ * system components. Generally order 0 allocations should be preferred since
+ * order 0 does not cause fragmentation in the page allocator. Larger objects
+ * be problematic to put into order 0 slabs because there may be too much
+ * unused space left. We go to a higher order if more than 1/16th of the slab
+ * would be wasted.
+ *
+ * In order to reach satisfactory performance we must ensure that a minimum
+ * number of objects is in one slab. Otherwise we may generate too much
+ * activity on the partial lists which requires taking the list_lock. This is
+ * less a concern for large slabs though which are rarely used.
+ *
+ * slub_max_order specifies the order where we begin to stop considering the
+ * number of objects in a slab as critical. If we reach slub_max_order then
+ * we try to keep the page order as low as possible. So we accept more waste
+ * of space in favor of a small page order.
+ *
+ * Higher order allocations also allow the placement of more objects in a
+ * slab and thereby reduce object handling overhead. If the user has
+ * requested a higher mininum order then we start with that one instead of
+ * the smallest order which will fit the object.
+ */
+static inline unsigned int slab_order(unsigned int size,
+ unsigned int min_objects, unsigned int max_order,
+ unsigned int fract_leftover)
+{
+ unsigned int min_order = slub_min_order;
+ unsigned int order;
+
+ if (order_objects(min_order, size) > MAX_OBJS_PER_PAGE)
+ return get_order(size * MAX_OBJS_PER_PAGE) - 1;
+
+ for (order = max(min_order, (unsigned int)get_order(min_objects * size));
+ order <= max_order; order++) {
+
+ unsigned int slab_size = (unsigned int)PAGE_SIZE << order;
+ unsigned int rem;
+
+ rem = slab_size % size;
+
+ if (rem <= slab_size / fract_leftover)
+ break;
+ }
+
+ return order;
+}
+
+static inline int calculate_order(unsigned int size)
+{
+ unsigned int order;
+ unsigned int min_objects;
+ unsigned int max_objects;
+
+ /*
+ * Attempt to find best configuration for a slab. This
+ * works by first attempting to generate a layout with
+ * the best configuration and backing off gradually.
+ *
+ * First we increase the acceptable waste in a slab. Then
+ * we reduce the minimum objects required in a slab.
+ */
+ min_objects = slub_min_objects;
+ if (!min_objects)
+ min_objects = 4 * (fls(nr_cpu_ids) + 1);
+ max_objects = order_objects(slub_max_order, size);
+ min_objects = min(min_objects, max_objects);
+
+ while (min_objects > 1) {
+ unsigned int fraction;
+
+ fraction = 16;
+ while (fraction >= 4) {
+ order = slab_order(size, min_objects,
+ slub_max_order, fraction);
+ if (order <= slub_max_order)
+ return order;
+ fraction /= 2;
+ }
+ min_objects--;
+ }
+
+ /*
+ * We were unable to place multiple objects in a slab. Now
+ * lets see if we can place a single object there.
+ */
+ order = slab_order(size, 1, slub_max_order, 1);
+ if (order <= slub_max_order)
+ return order;
+
+ /*
+ * Doh this slab cannot be placed using slub_max_order.
+ */
+ order = slab_order(size, 1, MAX_ORDER, 1);
+ if (order < MAX_ORDER)
+ return order;
+ return -ENOSYS;
+}
+
+static void
+init_kmem_cache_node(struct kmem_cache_node *n)
+{
+ n->nr_partial = 0;
+ spin_lock_init(&n->list_lock);
+ INIT_LIST_HEAD(&n->partial);
+#ifdef CONFIG_SLUB_DEBUG
+ atomic_long_set(&n->nr_slabs, 0);
+ atomic_long_set(&n->total_objects, 0);
+ INIT_LIST_HEAD(&n->full);
+#endif
+}
+
+static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
+{
+ BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
+ KMALLOC_SHIFT_HIGH * sizeof(struct kmem_cache_cpu));
+
+ /*
+ * Must align to double word boundary for the double cmpxchg
+ * instructions to work; see __pcpu_double_call_return_bool().
+ */
+ s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
+ 2 * sizeof(void *));
+
+ if (!s->cpu_slab)
+ return 0;
+
+ init_kmem_cache_cpus(s);
+
+ return 1;
+}
+
+static struct kmem_cache *kmem_cache_node;
+
+/*
+ * No kmalloc_node yet so do it by hand. We know that this is the first
+ * slab on the node for this slabcache. There are no concurrent accesses
+ * possible.
+ *
+ * Note that this function only works on the kmem_cache_node
+ * when allocating for the kmem_cache_node. This is used for bootstrapping
+ * memory on a fresh node that has no slab structures yet.
+ */
+static void early_kmem_cache_node_alloc(int node)
+{
+ struct page *page;
+ struct kmem_cache_node *n;
+
+ BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
+
+ page = new_slab(kmem_cache_node, GFP_NOWAIT, node);
+
+ BUG_ON(!page);
+ if (page_to_nid(page) != node) {
+ pr_err("SLUB: Unable to allocate memory from node %d\n", node);
+ pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n");
+ }
+
+ n = page->freelist;
+ BUG_ON(!n);
+#ifdef CONFIG_SLUB_DEBUG
+ init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
+ init_tracking(kmem_cache_node, n);
+#endif
+ n = kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node),
+ GFP_KERNEL);
+ page->freelist = get_freepointer(kmem_cache_node, n);
+ page->inuse = 1;
+ page->frozen = 0;
+ kmem_cache_node->node[node] = n;
+ init_kmem_cache_node(n);
+ inc_slabs_node(kmem_cache_node, node, page->objects);
+
+ /*
+ * No locks need to be taken here as it has just been
+ * initialized and there is no concurrent access.
+ */
+ __add_partial(n, page, DEACTIVATE_TO_HEAD);
+}
+
+static void free_kmem_cache_nodes(struct kmem_cache *s)
+{
+ int node;
+ struct kmem_cache_node *n;
+
+ for_each_kmem_cache_node(s, node, n) {
+ s->node[node] = NULL;
+ kmem_cache_free(kmem_cache_node, n);
+ }
+}
+
+void __kmem_cache_release(struct kmem_cache *s)
+{
+ cache_random_seq_destroy(s);
+ free_percpu(s->cpu_slab);
+ free_kmem_cache_nodes(s);
+}
+
+static int init_kmem_cache_nodes(struct kmem_cache *s)
+{
+ int node;
+
+ for_each_node_state(node, N_NORMAL_MEMORY) {
+ struct kmem_cache_node *n;
+
+ if (slab_state == DOWN) {
+ early_kmem_cache_node_alloc(node);
+ continue;
+ }
+ n = kmem_cache_alloc_node(kmem_cache_node,
+ GFP_KERNEL, node);
+
+ if (!n) {
+ free_kmem_cache_nodes(s);
+ return 0;
+ }
+
+ init_kmem_cache_node(n);
+ s->node[node] = n;
+ }
+ return 1;
+}
+
+static void set_min_partial(struct kmem_cache *s, unsigned long min)
+{
+ if (min < MIN_PARTIAL)
+ min = MIN_PARTIAL;
+ else if (min > MAX_PARTIAL)
+ min = MAX_PARTIAL;
+ s->min_partial = min;
+}
+
+static void set_cpu_partial(struct kmem_cache *s)
+{
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+ /*
+ * cpu_partial determined the maximum number of objects kept in the
+ * per cpu partial lists of a processor.
+ *
+ * Per cpu partial lists mainly contain slabs that just have one
+ * object freed. If they are used for allocation then they can be
+ * filled up again with minimal effort. The slab will never hit the
+ * per node partial lists and therefore no locking will be required.
+ *
+ * This setting also determines
+ *
+ * A) The number of objects from per cpu partial slabs dumped to the
+ * per node list when we reach the limit.
+ * B) The number of objects in cpu partial slabs to extract from the
+ * per node list when we run out of per cpu objects. We only fetch
+ * 50% to keep some capacity around for frees.
+ */
+ if (!kmem_cache_has_cpu_partial(s))
+ slub_set_cpu_partial(s, 0);
+ else if (s->size >= PAGE_SIZE)
+ slub_set_cpu_partial(s, 2);
+ else if (s->size >= 1024)
+ slub_set_cpu_partial(s, 6);
+ else if (s->size >= 256)
+ slub_set_cpu_partial(s, 13);
+ else
+ slub_set_cpu_partial(s, 30);
+#endif
+}
+
+/*
+ * calculate_sizes() determines the order and the distribution of data within
+ * a slab object.
+ */
+static int calculate_sizes(struct kmem_cache *s, int forced_order)
+{
+ slab_flags_t flags = s->flags;
+ unsigned int size = s->object_size;
+ unsigned int order;
+
+ /*
+ * Round up object size to the next word boundary. We can only
+ * place the free pointer at word boundaries and this determines
+ * the possible location of the free pointer.
+ */
+ size = ALIGN(size, sizeof(void *));
+
+#ifdef CONFIG_SLUB_DEBUG
+ /*
+ * Determine if we can poison the object itself. If the user of
+ * the slab may touch the object after free or before allocation
+ * then we should never poison the object itself.
+ */
+ if ((flags & SLAB_POISON) && !(flags & SLAB_TYPESAFE_BY_RCU) &&
+ !s->ctor)
+ s->flags |= __OBJECT_POISON;
+ else
+ s->flags &= ~__OBJECT_POISON;
+
+
+ /*
+ * If we are Redzoning then check if there is some space between the
+ * end of the object and the free pointer. If not then add an
+ * additional word to have some bytes to store Redzone information.
+ */
+ if ((flags & SLAB_RED_ZONE) && size == s->object_size)
+ size += sizeof(void *);
+#endif
+
+ /*
+ * With that we have determined the number of bytes in actual use
+ * by the object and redzoning.
+ */
+ s->inuse = size;
+
+ if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
+ ((flags & SLAB_RED_ZONE) && s->object_size < sizeof(void *)) ||
+ s->ctor) {
+ /*
+ * Relocate free pointer after the object if it is not
+ * permitted to overwrite the first word of the object on
+ * kmem_cache_free.
+ *
+ * This is the case if we do RCU, have a constructor or
+ * destructor, are poisoning the objects, or are
+ * redzoning an object smaller than sizeof(void *).
+ *
+ * The assumption that s->offset >= s->inuse means free
+ * pointer is outside of the object is used in the
+ * freeptr_outside_object() function. If that is no
+ * longer true, the function needs to be modified.
+ */
+ s->offset = size;
+ size += sizeof(void *);
+ } else {
+ /*
+ * Store freelist pointer near middle of object to keep
+ * it away from the edges of the object to avoid small
+ * sized over/underflows from neighboring allocations.
+ */
+ s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *));
+ }
+
+#ifdef CONFIG_SLUB_DEBUG
+ if (flags & SLAB_STORE_USER)
+ /*
+ * Need to store information about allocs and frees after
+ * the object.
+ */
+ size += 2 * sizeof(struct track);
+#endif
+
+ kasan_cache_create(s, &size, &s->flags);
+#ifdef CONFIG_SLUB_DEBUG
+ if (flags & SLAB_RED_ZONE) {
+ /*
+ * Add some empty padding so that we can catch
+ * overwrites from earlier objects rather than let
+ * tracking information or the free pointer be
+ * corrupted if a user writes before the start
+ * of the object.
+ */
+ size += sizeof(void *);
+
+ s->red_left_pad = sizeof(void *);
+ s->red_left_pad = ALIGN(s->red_left_pad, s->align);
+ size += s->red_left_pad;
+ }
+#endif
+
+ /*
+ * SLUB stores one object immediately after another beginning from
+ * offset 0. In order to align the objects we have to simply size
+ * each object to conform to the alignment.
+ */
+ size = ALIGN(size, s->align);
+ s->size = size;
+ s->reciprocal_size = reciprocal_value(size);
+ if (forced_order >= 0)
+ order = forced_order;
+ else
+ order = calculate_order(size);
+
+ if ((int)order < 0)
+ return 0;
+
+ s->allocflags = 0;
+ if (order)
+ s->allocflags |= __GFP_COMP;
+
+ if (s->flags & SLAB_CACHE_DMA)
+ s->allocflags |= GFP_DMA;
+
+ if (s->flags & SLAB_CACHE_DMA32)
+ s->allocflags |= GFP_DMA32;
+
+ if (s->flags & SLAB_RECLAIM_ACCOUNT)
+ s->allocflags |= __GFP_RECLAIMABLE;
+
+ /*
+ * Determine the number of objects per slab
+ */
+ s->oo = oo_make(order, size);
+ s->min = oo_make(get_order(size), size);
+ if (oo_objects(s->oo) > oo_objects(s->max))
+ s->max = s->oo;
+
+ return !!oo_objects(s->oo);
+}
+
+static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
+{
+ s->flags = kmem_cache_flags(s->size, flags, s->name);
+#ifdef CONFIG_SLAB_FREELIST_HARDENED
+ s->random = get_random_long();
+#endif
+
+ if (!calculate_sizes(s, -1))
+ goto error;
+ if (disable_higher_order_debug) {
+ /*
+ * Disable debugging flags that store metadata if the min slab
+ * order increased.
+ */
+ if (get_order(s->size) > get_order(s->object_size)) {
+ s->flags &= ~DEBUG_METADATA_FLAGS;
+ s->offset = 0;
+ if (!calculate_sizes(s, -1))
+ goto error;
+ }
+ }
+
+#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
+ defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
+ if (system_has_cmpxchg_double() && (s->flags & SLAB_NO_CMPXCHG) == 0)
+ /* Enable fast mode */
+ s->flags |= __CMPXCHG_DOUBLE;
+#endif
+
+ /*
+ * The larger the object size is, the more pages we want on the partial
+ * list to avoid pounding the page allocator excessively.
+ */
+ set_min_partial(s, ilog2(s->size) / 2);
+
+ set_cpu_partial(s);
+
+#ifdef CONFIG_NUMA
+ s->remote_node_defrag_ratio = 1000;
+#endif
+
+ /* Initialize the pre-computed randomized freelist if slab is up */
+ if (slab_state >= UP) {
+ if (init_cache_random_seq(s))
+ goto error;
+ }
+
+ if (!init_kmem_cache_nodes(s))
+ goto error;
+
+ if (alloc_kmem_cache_cpus(s))
+ return 0;
+
+error:
+ __kmem_cache_release(s);
+ return -EINVAL;
+}
+
+static void list_slab_objects(struct kmem_cache *s, struct page *page,
+ const char *text)
+{
+#ifdef CONFIG_SLUB_DEBUG
+ void *addr = page_address(page);
+ unsigned long *map;
+ void *p;
+
+ slab_err(s, page, text, s->name);
+ slab_lock(page);
+
+ map = get_map(s, page);
+ for_each_object(p, s, addr, page->objects) {
+
+ if (!test_bit(__obj_to_index(s, addr, p), map)) {
+ pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr);
+ print_tracking(s, p);
+ }
+ }
+ put_map(map);
+ slab_unlock(page);
+#endif
+}
+
+/*
+ * Attempt to free all partial slabs on a node.
+ * This is called from __kmem_cache_shutdown(). We must take list_lock
+ * because sysfs file might still access partial list after the shutdowning.
+ */
+static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
+{
+ LIST_HEAD(discard);
+ struct page *page, *h;
+
+ BUG_ON(irqs_disabled());
+ spin_lock_irq(&n->list_lock);
+ list_for_each_entry_safe(page, h, &n->partial, slab_list) {
+ if (!page->inuse) {
+ remove_partial(n, page);
+ list_add(&page->slab_list, &discard);
+ } else {
+ list_slab_objects(s, page,
+ "Objects remaining in %s on __kmem_cache_shutdown()");
+ }
+ }
+ spin_unlock_irq(&n->list_lock);
+
+ list_for_each_entry_safe(page, h, &discard, slab_list)
+ discard_slab(s, page);
+}
+
+bool __kmem_cache_empty(struct kmem_cache *s)
+{
+ int node;
+ struct kmem_cache_node *n;
+
+ for_each_kmem_cache_node(s, node, n)
+ if (n->nr_partial || slabs_node(s, node))
+ return false;
+ return true;
+}
+
+/*
+ * Release all resources used by a slab cache.
+ */
+int __kmem_cache_shutdown(struct kmem_cache *s)
+{
+ int node;
+ struct kmem_cache_node *n;
+
+ flush_all(s);
+ /* Attempt to free all objects */
+ for_each_kmem_cache_node(s, node, n) {
+ free_partial(s, n);
+ if (n->nr_partial || slabs_node(s, node))
+ return 1;
+ }
+ return 0;
+}
+
+/********************************************************************
+ * Kmalloc subsystem
+ *******************************************************************/
+
+static int __init setup_slub_min_order(char *str)
+{
+ get_option(&str, (int *)&slub_min_order);
+
+ return 1;
+}
+
+__setup("slub_min_order=", setup_slub_min_order);
+
+static int __init setup_slub_max_order(char *str)
+{
+ get_option(&str, (int *)&slub_max_order);
+ slub_max_order = min(slub_max_order, (unsigned int)MAX_ORDER - 1);
+
+ return 1;
+}
+
+__setup("slub_max_order=", setup_slub_max_order);
+
+static int __init setup_slub_min_objects(char *str)
+{
+ get_option(&str, (int *)&slub_min_objects);
+
+ return 1;
+}
+
+__setup("slub_min_objects=", setup_slub_min_objects);
+
+void *__kmalloc(size_t size, gfp_t flags)
+{
+ struct kmem_cache *s;
+ void *ret;
+
+ if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
+ return kmalloc_large(size, flags);
+
+ s = kmalloc_slab(size, flags);
+
+ if (unlikely(ZERO_OR_NULL_PTR(s)))
+ return s;
+
+ ret = slab_alloc(s, flags, _RET_IP_);
+
+ trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
+
+ ret = kasan_kmalloc(s, ret, size, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(__kmalloc);
+
+#ifdef CONFIG_NUMA
+static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
+{
+ struct page *page;
+ void *ptr = NULL;
+ unsigned int order = get_order(size);
+
+ flags |= __GFP_COMP;
+ page = alloc_pages_node(node, flags, order);
+ if (page) {
+ ptr = page_address(page);
+ mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
+ PAGE_SIZE << order);
+ }
+
+ return kmalloc_large_node_hook(ptr, size, flags);
+}
+
+void *__kmalloc_node(size_t size, gfp_t flags, int node)
+{
+ struct kmem_cache *s;
+ void *ret;
+
+ if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
+ ret = kmalloc_large_node(size, flags, node);
+
+ trace_kmalloc_node(_RET_IP_, ret,
+ size, PAGE_SIZE << get_order(size),
+ flags, node);
+
+ return ret;
+ }
+
+ s = kmalloc_slab(size, flags);
+
+ if (unlikely(ZERO_OR_NULL_PTR(s)))
+ return s;
+
+ ret = slab_alloc_node(s, flags, node, _RET_IP_);
+
+ trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
+
+ ret = kasan_kmalloc(s, ret, size, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(__kmalloc_node);
+#endif /* CONFIG_NUMA */
+
+#ifdef CONFIG_HARDENED_USERCOPY
+/*
+ * Rejects incorrectly sized objects and objects that are to be copied
+ * to/from userspace but do not fall entirely within the containing slab
+ * cache's usercopy region.
+ *
+ * Returns NULL if check passes, otherwise const char * to name of cache
+ * to indicate an error.
+ */
+void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
+ bool to_user)
+{
+ struct kmem_cache *s;
+ unsigned int offset;
+ size_t object_size;
+
+ ptr = kasan_reset_tag(ptr);
+
+ /* Find object and usable object size. */
+ s = page->slab_cache;
+
+ /* Reject impossible pointers. */
+ if (ptr < page_address(page))
+ usercopy_abort("SLUB object not in SLUB page?!", NULL,
+ to_user, 0, n);
+
+ /* Find offset within object. */
+ offset = (ptr - page_address(page)) % s->size;
+
+ /* Adjust for redzone and reject if within the redzone. */
+ if (kmem_cache_debug_flags(s, SLAB_RED_ZONE)) {
+ if (offset < s->red_left_pad)
+ usercopy_abort("SLUB object in left red zone",
+ s->name, to_user, offset, n);
+ offset -= s->red_left_pad;
+ }
+
+ /* Allow address range falling entirely within usercopy region. */
+ if (offset >= s->useroffset &&
+ offset - s->useroffset <= s->usersize &&
+ n <= s->useroffset - offset + s->usersize)
+ return;
+
+ /*
+ * If the copy is still within the allocated object, produce
+ * a warning instead of rejecting the copy. This is intended
+ * to be a temporary method to find any missing usercopy
+ * whitelists.
+ */
+ object_size = slab_ksize(s);
+ if (usercopy_fallback &&
+ offset <= object_size && n <= object_size - offset) {
+ usercopy_warn("SLUB object", s->name, to_user, offset, n);
+ return;
+ }
+
+ usercopy_abort("SLUB object", s->name, to_user, offset, n);
+}
+#endif /* CONFIG_HARDENED_USERCOPY */
+
+size_t __ksize(const void *object)
+{
+ struct page *page;
+
+ if (unlikely(object == ZERO_SIZE_PTR))
+ return 0;
+
+ page = virt_to_head_page(object);
+
+ if (unlikely(!PageSlab(page))) {
+ WARN_ON(!PageCompound(page));
+ return page_size(page);
+ }
+
+ return slab_ksize(page->slab_cache);
+}
+EXPORT_SYMBOL(__ksize);
+
+void kfree(const void *x)
+{
+ struct page *page;
+ void *object = (void *)x;
+
+ trace_kfree(_RET_IP_, x);
+
+ if (unlikely(ZERO_OR_NULL_PTR(x)))
+ return;
+
+ page = virt_to_head_page(x);
+ if (unlikely(!PageSlab(page))) {
+ unsigned int order = compound_order(page);
+
+ BUG_ON(!PageCompound(page));
+ kfree_hook(object);
+ mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
+ -(PAGE_SIZE << order));
+ __free_pages(page, order);
+ return;
+ }
+ slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_);
+}
+EXPORT_SYMBOL(kfree);
+
+#define SHRINK_PROMOTE_MAX 32
+
+/*
+ * kmem_cache_shrink discards empty slabs and promotes the slabs filled
+ * up most to the head of the partial lists. New allocations will then
+ * fill those up and thus they can be removed from the partial lists.
+ *
+ * The slabs with the least items are placed last. This results in them
+ * being allocated from last increasing the chance that the last objects
+ * are freed in them.
+ */
+int __kmem_cache_shrink(struct kmem_cache *s)
+{
+ int node;
+ int i;
+ struct kmem_cache_node *n;
+ struct page *page;
+ struct page *t;
+ struct list_head discard;
+ struct list_head promote[SHRINK_PROMOTE_MAX];
+ unsigned long flags;
+ int ret = 0;
+
+ flush_all(s);
+ for_each_kmem_cache_node(s, node, n) {
+ INIT_LIST_HEAD(&discard);
+ for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
+ INIT_LIST_HEAD(promote + i);
+
+ spin_lock_irqsave(&n->list_lock, flags);
+
+ /*
+ * Build lists of slabs to discard or promote.
+ *
+ * Note that concurrent frees may occur while we hold the
+ * list_lock. page->inuse here is the upper limit.
+ */
+ list_for_each_entry_safe(page, t, &n->partial, slab_list) {
+ int free = page->objects - page->inuse;
+
+ /* Do not reread page->inuse */
+ barrier();
+
+ /* We do not keep full slabs on the list */
+ BUG_ON(free <= 0);
+
+ if (free == page->objects) {
+ list_move(&page->slab_list, &discard);
+ n->nr_partial--;
+ } else if (free <= SHRINK_PROMOTE_MAX)
+ list_move(&page->slab_list, promote + free - 1);
+ }
+
+ /*
+ * Promote the slabs filled up most to the head of the
+ * partial list.
+ */
+ for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
+ list_splice(promote + i, &n->partial);
+
+ spin_unlock_irqrestore(&n->list_lock, flags);
+
+ /* Release empty slabs */
+ list_for_each_entry_safe(page, t, &discard, slab_list)
+ discard_slab(s, page);
+
+ if (slabs_node(s, node))
+ ret = 1;
+ }
+
+ return ret;
+}
+
+static int slab_mem_going_offline_callback(void *arg)
+{
+ struct kmem_cache *s;
+
+ mutex_lock(&slab_mutex);
+ list_for_each_entry(s, &slab_caches, list)
+ __kmem_cache_shrink(s);
+ mutex_unlock(&slab_mutex);
+
+ return 0;
+}
+
+static void slab_mem_offline_callback(void *arg)
+{
+ struct kmem_cache_node *n;
+ struct kmem_cache *s;
+ struct memory_notify *marg = arg;
+ int offline_node;
+
+ offline_node = marg->status_change_nid_normal;
+
+ /*
+ * If the node still has available memory. we need kmem_cache_node
+ * for it yet.
+ */
+ if (offline_node < 0)
+ return;
+
+ mutex_lock(&slab_mutex);
+ list_for_each_entry(s, &slab_caches, list) {
+ n = get_node(s, offline_node);
+ if (n) {
+ /*
+ * if n->nr_slabs > 0, slabs still exist on the node
+ * that is going down. We were unable to free them,
+ * and offline_pages() function shouldn't call this
+ * callback. So, we must fail.
+ */
+ BUG_ON(slabs_node(s, offline_node));
+
+ s->node[offline_node] = NULL;
+ kmem_cache_free(kmem_cache_node, n);
+ }
+ }
+ mutex_unlock(&slab_mutex);
+}
+
+static int slab_mem_going_online_callback(void *arg)
+{
+ struct kmem_cache_node *n;
+ struct kmem_cache *s;
+ struct memory_notify *marg = arg;
+ int nid = marg->status_change_nid_normal;
+ int ret = 0;
+
+ /*
+ * If the node's memory is already available, then kmem_cache_node is
+ * already created. Nothing to do.
+ */
+ if (nid < 0)
+ return 0;
+
+ /*
+ * We are bringing a node online. No memory is available yet. We must
+ * allocate a kmem_cache_node structure in order to bring the node
+ * online.
+ */
+ mutex_lock(&slab_mutex);
+ list_for_each_entry(s, &slab_caches, list) {
+ /*
+ * XXX: kmem_cache_alloc_node will fallback to other nodes
+ * since memory is not yet available from the node that
+ * is brought up.
+ */
+ n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
+ if (!n) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ init_kmem_cache_node(n);
+ s->node[nid] = n;
+ }
+out:
+ mutex_unlock(&slab_mutex);
+ return ret;
+}
+
+static int slab_memory_callback(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ int ret = 0;
+
+ switch (action) {
+ case MEM_GOING_ONLINE:
+ ret = slab_mem_going_online_callback(arg);
+ break;
+ case MEM_GOING_OFFLINE:
+ ret = slab_mem_going_offline_callback(arg);
+ break;
+ case MEM_OFFLINE:
+ case MEM_CANCEL_ONLINE:
+ slab_mem_offline_callback(arg);
+ break;
+ case MEM_ONLINE:
+ case MEM_CANCEL_OFFLINE:
+ break;
+ }
+ if (ret)
+ ret = notifier_from_errno(ret);
+ else
+ ret = NOTIFY_OK;
+ return ret;
+}
+
+static struct notifier_block slab_memory_callback_nb = {
+ .notifier_call = slab_memory_callback,
+ .priority = SLAB_CALLBACK_PRI,
+};
+
+/********************************************************************
+ * Basic setup of slabs
+ *******************************************************************/
+
+/*
+ * Used for early kmem_cache structures that were allocated using
+ * the page allocator. Allocate them properly then fix up the pointers
+ * that may be pointing to the wrong kmem_cache structure.
+ */
+
+static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
+{
+ int node;
+ struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
+ struct kmem_cache_node *n;
+
+ memcpy(s, static_cache, kmem_cache->object_size);
+
+ /*
+ * This runs very early, and only the boot processor is supposed to be
+ * up. Even if it weren't true, IRQs are not up so we couldn't fire
+ * IPIs around.
+ */
+ __flush_cpu_slab(s, smp_processor_id());
+ for_each_kmem_cache_node(s, node, n) {
+ struct page *p;
+
+ list_for_each_entry(p, &n->partial, slab_list)
+ p->slab_cache = s;
+
+#ifdef CONFIG_SLUB_DEBUG
+ list_for_each_entry(p, &n->full, slab_list)
+ p->slab_cache = s;
+#endif
+ }
+ list_add(&s->list, &slab_caches);
+ return s;
+}
+
+void __init kmem_cache_init(void)
+{
+ static __initdata struct kmem_cache boot_kmem_cache,
+ boot_kmem_cache_node;
+
+ if (debug_guardpage_minorder())
+ slub_max_order = 0;
+
+ kmem_cache_node = &boot_kmem_cache_node;
+ kmem_cache = &boot_kmem_cache;
+
+ create_boot_cache(kmem_cache_node, "kmem_cache_node",
+ sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0);
+
+ register_hotmemory_notifier(&slab_memory_callback_nb);
+
+ /* Able to allocate the per node structures */
+ slab_state = PARTIAL;
+
+ create_boot_cache(kmem_cache, "kmem_cache",
+ offsetof(struct kmem_cache, node) +
+ nr_node_ids * sizeof(struct kmem_cache_node *),
+ SLAB_HWCACHE_ALIGN, 0, 0);
+
+ kmem_cache = bootstrap(&boot_kmem_cache);
+ kmem_cache_node = bootstrap(&boot_kmem_cache_node);
+
+ /* Now we can use the kmem_cache to allocate kmalloc slabs */
+ setup_kmalloc_cache_index_table();
+ create_kmalloc_caches(0);
+
+ /* Setup random freelists for each cache */
+ init_freelist_randomization();
+
+ cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL,
+ slub_cpu_dead);
+
+ pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%u\n",
+ cache_line_size(),
+ slub_min_order, slub_max_order, slub_min_objects,
+ nr_cpu_ids, nr_node_ids);
+}
+
+void __init kmem_cache_init_late(void)
+{
+}
+
+struct kmem_cache *
+__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
+ slab_flags_t flags, void (*ctor)(void *))
+{
+ struct kmem_cache *s;
+
+ s = find_mergeable(size, align, flags, name, ctor);
+ if (s) {
+ s->refcount++;
+
+ /*
+ * Adjust the object sizes so that we clear
+ * the complete object on kzalloc.
+ */
+ s->object_size = max(s->object_size, size);
+ s->inuse = max(s->inuse, ALIGN(size, sizeof(void *)));
+
+ if (sysfs_slab_alias(s, name)) {
+ s->refcount--;
+ s = NULL;
+ }
+ }
+
+ return s;
+}
+
+int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)
+{
+ int err;
+
+ err = kmem_cache_open(s, flags);
+ if (err)
+ return err;
+
+ /* Mutex is not taken during early boot */
+ if (slab_state <= UP)
+ return 0;
+
+ err = sysfs_slab_add(s);
+ if (err)
+ __kmem_cache_release(s);
+
+ return err;
+}
+
+void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
+{
+ struct kmem_cache *s;
+ void *ret;
+
+ if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
+ return kmalloc_large(size, gfpflags);
+
+ s = kmalloc_slab(size, gfpflags);
+
+ if (unlikely(ZERO_OR_NULL_PTR(s)))
+ return s;
+
+ ret = slab_alloc(s, gfpflags, caller);
+
+ /* Honor the call site pointer we received. */
+ trace_kmalloc(caller, ret, size, s->size, gfpflags);
+
+ return ret;
+}
+EXPORT_SYMBOL(__kmalloc_track_caller);
+
+#ifdef CONFIG_NUMA
+void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
+ int node, unsigned long caller)
+{
+ struct kmem_cache *s;
+ void *ret;
+
+ if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
+ ret = kmalloc_large_node(size, gfpflags, node);
+
+ trace_kmalloc_node(caller, ret,
+ size, PAGE_SIZE << get_order(size),
+ gfpflags, node);
+
+ return ret;
+ }
+
+ s = kmalloc_slab(size, gfpflags);
+
+ if (unlikely(ZERO_OR_NULL_PTR(s)))
+ return s;
+
+ ret = slab_alloc_node(s, gfpflags, node, caller);
+
+ /* Honor the call site pointer we received. */
+ trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
+
+ return ret;
+}
+EXPORT_SYMBOL(__kmalloc_node_track_caller);
+#endif
+
+#ifdef CONFIG_SYSFS
+static int count_inuse(struct page *page)
+{
+ return page->inuse;
+}
+
+static int count_total(struct page *page)
+{
+ return page->objects;
+}
+#endif
+
+#ifdef CONFIG_SLUB_DEBUG
+static void validate_slab(struct kmem_cache *s, struct page *page)
+{
+ void *p;
+ void *addr = page_address(page);
+ unsigned long *map;
+
+ slab_lock(page);
+
+ if (!check_slab(s, page) || !on_freelist(s, page, NULL))
+ goto unlock;
+
+ /* Now we know that a valid freelist exists */
+ map = get_map(s, page);
+ for_each_object(p, s, addr, page->objects) {
+ u8 val = test_bit(__obj_to_index(s, addr, p), map) ?
+ SLUB_RED_INACTIVE : SLUB_RED_ACTIVE;
+
+ if (!check_object(s, page, p, val))
+ break;
+ }
+ put_map(map);
+unlock:
+ slab_unlock(page);
+}
+
+static int validate_slab_node(struct kmem_cache *s,
+ struct kmem_cache_node *n)
+{
+ unsigned long count = 0;
+ struct page *page;
+ unsigned long flags;
+
+ spin_lock_irqsave(&n->list_lock, flags);
+
+ list_for_each_entry(page, &n->partial, slab_list) {
+ validate_slab(s, page);
+ count++;
+ }
+ if (count != n->nr_partial)
+ pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n",
+ s->name, count, n->nr_partial);
+
+ if (!(s->flags & SLAB_STORE_USER))
+ goto out;
+
+ list_for_each_entry(page, &n->full, slab_list) {
+ validate_slab(s, page);
+ count++;
+ }
+ if (count != atomic_long_read(&n->nr_slabs))
+ pr_err("SLUB: %s %ld slabs counted but counter=%ld\n",
+ s->name, count, atomic_long_read(&n->nr_slabs));
+
+out:
+ spin_unlock_irqrestore(&n->list_lock, flags);
+ return count;
+}
+
+static long validate_slab_cache(struct kmem_cache *s)
+{
+ int node;
+ unsigned long count = 0;
+ struct kmem_cache_node *n;
+
+ flush_all(s);
+ for_each_kmem_cache_node(s, node, n)
+ count += validate_slab_node(s, n);
+
+ return count;
+}
+/*
+ * Generate lists of code addresses where slabcache objects are allocated
+ * and freed.
+ */
+
+struct location {
+ unsigned long count;
+ unsigned long addr;
+ long long sum_time;
+ long min_time;
+ long max_time;
+ long min_pid;
+ long max_pid;
+ DECLARE_BITMAP(cpus, NR_CPUS);
+ nodemask_t nodes;
+};
+
+struct loc_track {
+ unsigned long max;
+ unsigned long count;
+ struct location *loc;
+};
+
+static void free_loc_track(struct loc_track *t)
+{
+ if (t->max)
+ free_pages((unsigned long)t->loc,
+ get_order(sizeof(struct location) * t->max));
+}
+
+static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
+{
+ struct location *l;
+ int order;
+
+ order = get_order(sizeof(struct location) * max);
+
+ l = (void *)__get_free_pages(flags, order);
+ if (!l)
+ return 0;
+
+ if (t->count) {
+ memcpy(l, t->loc, sizeof(struct location) * t->count);
+ free_loc_track(t);
+ }
+ t->max = max;
+ t->loc = l;
+ return 1;
+}
+
+static int add_location(struct loc_track *t, struct kmem_cache *s,
+ const struct track *track)
+{
+ long start, end, pos;
+ struct location *l;
+ unsigned long caddr;
+ unsigned long age = jiffies - track->when;
+
+ start = -1;
+ end = t->count;
+
+ for ( ; ; ) {
+ pos = start + (end - start + 1) / 2;
+
+ /*
+ * There is nothing at "end". If we end up there
+ * we need to add something to before end.
+ */
+ if (pos == end)
+ break;
+
+ caddr = t->loc[pos].addr;
+ if (track->addr == caddr) {
+
+ l = &t->loc[pos];
+ l->count++;
+ if (track->when) {
+ l->sum_time += age;
+ if (age < l->min_time)
+ l->min_time = age;
+ if (age > l->max_time)
+ l->max_time = age;
+
+ if (track->pid < l->min_pid)
+ l->min_pid = track->pid;
+ if (track->pid > l->max_pid)
+ l->max_pid = track->pid;
+
+ cpumask_set_cpu(track->cpu,
+ to_cpumask(l->cpus));
+ }
+ node_set(page_to_nid(virt_to_page(track)), l->nodes);
+ return 1;
+ }
+
+ if (track->addr < caddr)
+ end = pos;
+ else
+ start = pos;
+ }
+
+ /*
+ * Not found. Insert new tracking element.
+ */
+ if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC))
+ return 0;
+
+ l = t->loc + pos;
+ if (pos < t->count)
+ memmove(l + 1, l,
+ (t->count - pos) * sizeof(struct location));
+ t->count++;
+ l->count = 1;
+ l->addr = track->addr;
+ l->sum_time = age;
+ l->min_time = age;
+ l->max_time = age;
+ l->min_pid = track->pid;
+ l->max_pid = track->pid;
+ cpumask_clear(to_cpumask(l->cpus));
+ cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
+ nodes_clear(l->nodes);
+ node_set(page_to_nid(virt_to_page(track)), l->nodes);
+ return 1;
+}
+
+static void process_slab(struct loc_track *t, struct kmem_cache *s,
+ struct page *page, enum track_item alloc)
+{
+ void *addr = page_address(page);
+ void *p;
+ unsigned long *map;
+
+ map = get_map(s, page);
+ for_each_object(p, s, addr, page->objects)
+ if (!test_bit(__obj_to_index(s, addr, p), map))
+ add_location(t, s, get_track(s, p, alloc));
+ put_map(map);
+}
+
+static int list_locations(struct kmem_cache *s, char *buf,
+ enum track_item alloc)
+{
+ int len = 0;
+ unsigned long i;
+ struct loc_track t = { 0, 0, NULL };
+ int node;
+ struct kmem_cache_node *n;
+
+ if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
+ GFP_KERNEL)) {
+ return sprintf(buf, "Out of memory\n");
+ }
+ /* Push back cpu slabs */
+ flush_all(s);
+
+ for_each_kmem_cache_node(s, node, n) {
+ unsigned long flags;
+ struct page *page;
+
+ if (!atomic_long_read(&n->nr_slabs))
+ continue;
+
+ spin_lock_irqsave(&n->list_lock, flags);
+ list_for_each_entry(page, &n->partial, slab_list)
+ process_slab(&t, s, page, alloc);
+ list_for_each_entry(page, &n->full, slab_list)
+ process_slab(&t, s, page, alloc);
+ spin_unlock_irqrestore(&n->list_lock, flags);
+ }
+
+ for (i = 0; i < t.count; i++) {
+ struct location *l = &t.loc[i];
+
+ if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100)
+ break;
+ len += sprintf(buf + len, "%7ld ", l->count);
+
+ if (l->addr)
+ len += sprintf(buf + len, "%pS", (void *)l->addr);
+ else
+ len += sprintf(buf + len, "<not-available>");
+
+ if (l->sum_time != l->min_time) {
+ len += sprintf(buf + len, " age=%ld/%ld/%ld",
+ l->min_time,
+ (long)div_u64(l->sum_time, l->count),
+ l->max_time);
+ } else
+ len += sprintf(buf + len, " age=%ld",
+ l->min_time);
+
+ if (l->min_pid != l->max_pid)
+ len += sprintf(buf + len, " pid=%ld-%ld",
+ l->min_pid, l->max_pid);
+ else
+ len += sprintf(buf + len, " pid=%ld",
+ l->min_pid);
+
+ if (num_online_cpus() > 1 &&
+ !cpumask_empty(to_cpumask(l->cpus)) &&
+ len < PAGE_SIZE - 60)
+ len += scnprintf(buf + len, PAGE_SIZE - len - 50,
+ " cpus=%*pbl",
+ cpumask_pr_args(to_cpumask(l->cpus)));
+
+ if (nr_online_nodes > 1 && !nodes_empty(l->nodes) &&
+ len < PAGE_SIZE - 60)
+ len += scnprintf(buf + len, PAGE_SIZE - len - 50,
+ " nodes=%*pbl",
+ nodemask_pr_args(&l->nodes));
+
+ len += sprintf(buf + len, "\n");
+ }
+
+ free_loc_track(&t);
+ if (!t.count)
+ len += sprintf(buf, "No data\n");
+ return len;
+}
+#endif /* CONFIG_SLUB_DEBUG */
+
+#ifdef SLUB_RESILIENCY_TEST
+static void __init resiliency_test(void)
+{
+ u8 *p;
+ int type = KMALLOC_NORMAL;
+
+ BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10);
+
+ pr_err("SLUB resiliency testing\n");
+ pr_err("-----------------------\n");
+ pr_err("A. Corruption after allocation\n");
+
+ p = kzalloc(16, GFP_KERNEL);
+ p[16] = 0x12;
+ pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n",
+ p + 16);
+
+ validate_slab_cache(kmalloc_caches[type][4]);
+
+ /* Hmmm... The next two are dangerous */
+ p = kzalloc(32, GFP_KERNEL);
+ p[32 + sizeof(void *)] = 0x34;
+ pr_err("\n2. kmalloc-32: Clobber next pointer/next slab 0x34 -> -0x%p\n",
+ p);
+ pr_err("If allocated object is overwritten then not detectable\n\n");
+
+ validate_slab_cache(kmalloc_caches[type][5]);
+ p = kzalloc(64, GFP_KERNEL);
+ p += 64 + (get_cycles() & 0xff) * sizeof(void *);
+ *p = 0x56;
+ pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
+ p);
+ pr_err("If allocated object is overwritten then not detectable\n\n");
+ validate_slab_cache(kmalloc_caches[type][6]);
+
+ pr_err("\nB. Corruption after free\n");
+ p = kzalloc(128, GFP_KERNEL);
+ kfree(p);
+ *p = 0x78;
+ pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
+ validate_slab_cache(kmalloc_caches[type][7]);
+
+ p = kzalloc(256, GFP_KERNEL);
+ kfree(p);
+ p[50] = 0x9a;
+ pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
+ validate_slab_cache(kmalloc_caches[type][8]);
+
+ p = kzalloc(512, GFP_KERNEL);
+ kfree(p);
+ p[512] = 0xab;
+ pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
+ validate_slab_cache(kmalloc_caches[type][9]);
+}
+#else
+#ifdef CONFIG_SYSFS
+static void resiliency_test(void) {};
+#endif
+#endif /* SLUB_RESILIENCY_TEST */
+
+#ifdef CONFIG_SYSFS
+enum slab_stat_type {
+ SL_ALL, /* All slabs */
+ SL_PARTIAL, /* Only partially allocated slabs */
+ SL_CPU, /* Only slabs used for cpu caches */
+ SL_OBJECTS, /* Determine allocated objects not slabs */
+ SL_TOTAL /* Determine object capacity not slabs */
+};
+
+#define SO_ALL (1 << SL_ALL)
+#define SO_PARTIAL (1 << SL_PARTIAL)
+#define SO_CPU (1 << SL_CPU)
+#define SO_OBJECTS (1 << SL_OBJECTS)
+#define SO_TOTAL (1 << SL_TOTAL)
+
+#ifdef CONFIG_MEMCG
+static bool memcg_sysfs_enabled = IS_ENABLED(CONFIG_SLUB_MEMCG_SYSFS_ON);
+
+static int __init setup_slub_memcg_sysfs(char *str)
+{
+ int v;
+
+ if (get_option(&str, &v) > 0)
+ memcg_sysfs_enabled = v;
+
+ return 1;
+}
+
+__setup("slub_memcg_sysfs=", setup_slub_memcg_sysfs);
+#endif
+
+static ssize_t show_slab_objects(struct kmem_cache *s,
+ char *buf, unsigned long flags)
+{
+ unsigned long total = 0;
+ int node;
+ int x;
+ unsigned long *nodes;
+
+ nodes = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
+ if (!nodes)
+ return -ENOMEM;
+
+ if (flags & SO_CPU) {
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab,
+ cpu);
+ int node;
+ struct page *page;
+
+ page = READ_ONCE(c->page);
+ if (!page)
+ continue;
+
+ node = page_to_nid(page);
+ if (flags & SO_TOTAL)
+ x = page->objects;
+ else if (flags & SO_OBJECTS)
+ x = page->inuse;
+ else
+ x = 1;
+
+ total += x;
+ nodes[node] += x;
+
+ page = slub_percpu_partial_read_once(c);
+ if (page) {
+ node = page_to_nid(page);
+ if (flags & SO_TOTAL)
+ WARN_ON_ONCE(1);
+ else if (flags & SO_OBJECTS)
+ WARN_ON_ONCE(1);
+ else
+ x = page->pages;
+ total += x;
+ nodes[node] += x;
+ }
+ }
+ }
+
+ /*
+ * It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex"
+ * already held which will conflict with an existing lock order:
+ *
+ * mem_hotplug_lock->slab_mutex->kernfs_mutex
+ *
+ * We don't really need mem_hotplug_lock (to hold off
+ * slab_mem_going_offline_callback) here because slab's memory hot
+ * unplug code doesn't destroy the kmem_cache->node[] data.
+ */
+
+#ifdef CONFIG_SLUB_DEBUG
+ if (flags & SO_ALL) {
+ struct kmem_cache_node *n;
+
+ for_each_kmem_cache_node(s, node, n) {
+
+ if (flags & SO_TOTAL)
+ x = atomic_long_read(&n->total_objects);
+ else if (flags & SO_OBJECTS)
+ x = atomic_long_read(&n->total_objects) -
+ count_partial(n, count_free);
+ else
+ x = atomic_long_read(&n->nr_slabs);
+ total += x;
+ nodes[node] += x;
+ }
+
+ } else
+#endif
+ if (flags & SO_PARTIAL) {
+ struct kmem_cache_node *n;
+
+ for_each_kmem_cache_node(s, node, n) {
+ if (flags & SO_TOTAL)
+ x = count_partial(n, count_total);
+ else if (flags & SO_OBJECTS)
+ x = count_partial(n, count_inuse);
+ else
+ x = n->nr_partial;
+ total += x;
+ nodes[node] += x;
+ }
+ }
+ x = sprintf(buf, "%lu", total);
+#ifdef CONFIG_NUMA
+ for (node = 0; node < nr_node_ids; node++)
+ if (nodes[node])
+ x += sprintf(buf + x, " N%d=%lu",
+ node, nodes[node]);
+#endif
+ kfree(nodes);
+ return x + sprintf(buf + x, "\n");
+}
+
+#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
+#define to_slab(n) container_of(n, struct kmem_cache, kobj)
+
+struct slab_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct kmem_cache *s, char *buf);
+ ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
+};
+
+#define SLAB_ATTR_RO(_name) \
+ static struct slab_attribute _name##_attr = \
+ __ATTR(_name, 0400, _name##_show, NULL)
+
+#define SLAB_ATTR(_name) \
+ static struct slab_attribute _name##_attr = \
+ __ATTR(_name, 0600, _name##_show, _name##_store)
+
+static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%u\n", s->size);
+}
+SLAB_ATTR_RO(slab_size);
+
+static ssize_t align_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%u\n", s->align);
+}
+SLAB_ATTR_RO(align);
+
+static ssize_t object_size_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%u\n", s->object_size);
+}
+SLAB_ATTR_RO(object_size);
+
+static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%u\n", oo_objects(s->oo));
+}
+SLAB_ATTR_RO(objs_per_slab);
+
+static ssize_t order_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%u\n", oo_order(s->oo));
+}
+SLAB_ATTR_RO(order);
+
+static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%lu\n", s->min_partial);
+}
+
+static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
+ size_t length)
+{
+ unsigned long min;
+ int err;
+
+ err = kstrtoul(buf, 10, &min);
+ if (err)
+ return err;
+
+ set_min_partial(s, min);
+ return length;
+}
+SLAB_ATTR(min_partial);
+
+static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%u\n", slub_cpu_partial(s));
+}
+
+static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
+ size_t length)
+{
+ unsigned int objects;
+ int err;
+
+ err = kstrtouint(buf, 10, &objects);
+ if (err)
+ return err;
+ if (objects && !kmem_cache_has_cpu_partial(s))
+ return -EINVAL;
+
+ slub_set_cpu_partial(s, objects);
+ flush_all(s);
+ return length;
+}
+SLAB_ATTR(cpu_partial);
+
+static ssize_t ctor_show(struct kmem_cache *s, char *buf)
+{
+ if (!s->ctor)
+ return 0;
+ return sprintf(buf, "%pS\n", s->ctor);
+}
+SLAB_ATTR_RO(ctor);
+
+static ssize_t aliases_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", s->refcount < 0 ? 0 : s->refcount - 1);
+}
+SLAB_ATTR_RO(aliases);
+
+static ssize_t partial_show(struct kmem_cache *s, char *buf)
+{
+ return show_slab_objects(s, buf, SO_PARTIAL);
+}
+SLAB_ATTR_RO(partial);
+
+static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
+{
+ return show_slab_objects(s, buf, SO_CPU);
+}
+SLAB_ATTR_RO(cpu_slabs);
+
+static ssize_t objects_show(struct kmem_cache *s, char *buf)
+{
+ return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS);
+}
+SLAB_ATTR_RO(objects);
+
+static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
+{
+ return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS);
+}
+SLAB_ATTR_RO(objects_partial);
+
+static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
+{
+ int objects = 0;
+ int pages = 0;
+ int cpu;
+ int len;
+
+ for_each_online_cpu(cpu) {
+ struct page *page;
+
+ page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
+
+ if (page) {
+ pages += page->pages;
+ objects += page->pobjects;
+ }
+ }
+
+ len = sprintf(buf, "%d(%d)", objects, pages);
+
+#ifdef CONFIG_SMP
+ for_each_online_cpu(cpu) {
+ struct page *page;
+
+ page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
+
+ if (page && len < PAGE_SIZE - 20)
+ len += sprintf(buf + len, " C%d=%d(%d)", cpu,
+ page->pobjects, page->pages);
+ }
+#endif
+ return len + sprintf(buf + len, "\n");
+}
+SLAB_ATTR_RO(slabs_cpu_partial);
+
+static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
+}
+SLAB_ATTR_RO(reclaim_account);
+
+static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
+}
+SLAB_ATTR_RO(hwcache_align);
+
+#ifdef CONFIG_ZONE_DMA
+static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
+}
+SLAB_ATTR_RO(cache_dma);
+#endif
+
+static ssize_t usersize_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%u\n", s->usersize);
+}
+SLAB_ATTR_RO(usersize);
+
+static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU));
+}
+SLAB_ATTR_RO(destroy_by_rcu);
+
+#ifdef CONFIG_SLUB_DEBUG
+static ssize_t slabs_show(struct kmem_cache *s, char *buf)
+{
+ return show_slab_objects(s, buf, SO_ALL);
+}
+SLAB_ATTR_RO(slabs);
+
+static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
+{
+ return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
+}
+SLAB_ATTR_RO(total_objects);
+
+static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS));
+}
+SLAB_ATTR_RO(sanity_checks);
+
+static ssize_t trace_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
+}
+SLAB_ATTR_RO(trace);
+
+static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
+}
+
+SLAB_ATTR_RO(red_zone);
+
+static ssize_t poison_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
+}
+
+SLAB_ATTR_RO(poison);
+
+static ssize_t store_user_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
+}
+
+SLAB_ATTR_RO(store_user);
+
+static ssize_t validate_show(struct kmem_cache *s, char *buf)
+{
+ return 0;
+}
+
+static ssize_t validate_store(struct kmem_cache *s,
+ const char *buf, size_t length)
+{
+ int ret = -EINVAL;
+
+ if (buf[0] == '1') {
+ ret = validate_slab_cache(s);
+ if (ret >= 0)
+ ret = length;
+ }
+ return ret;
+}
+SLAB_ATTR(validate);
+
+static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
+{
+ if (!(s->flags & SLAB_STORE_USER))
+ return -ENOSYS;
+ return list_locations(s, buf, TRACK_ALLOC);
+}
+SLAB_ATTR_RO(alloc_calls);
+
+static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
+{
+ if (!(s->flags & SLAB_STORE_USER))
+ return -ENOSYS;
+ return list_locations(s, buf, TRACK_FREE);
+}
+SLAB_ATTR_RO(free_calls);
+#endif /* CONFIG_SLUB_DEBUG */
+
+#ifdef CONFIG_FAILSLAB
+static ssize_t failslab_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
+}
+SLAB_ATTR_RO(failslab);
+#endif
+
+static ssize_t shrink_show(struct kmem_cache *s, char *buf)
+{
+ return 0;
+}
+
+static ssize_t shrink_store(struct kmem_cache *s,
+ const char *buf, size_t length)
+{
+ if (buf[0] == '1')
+ kmem_cache_shrink(s);
+ else
+ return -EINVAL;
+ return length;
+}
+SLAB_ATTR(shrink);
+
+#ifdef CONFIG_NUMA
+static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%u\n", s->remote_node_defrag_ratio / 10);
+}
+
+static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
+ const char *buf, size_t length)
+{
+ unsigned int ratio;
+ int err;
+
+ err = kstrtouint(buf, 10, &ratio);
+ if (err)
+ return err;
+ if (ratio > 100)
+ return -ERANGE;
+
+ s->remote_node_defrag_ratio = ratio * 10;
+
+ return length;
+}
+SLAB_ATTR(remote_node_defrag_ratio);
+#endif
+
+#ifdef CONFIG_SLUB_STATS
+static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
+{
+ unsigned long sum = 0;
+ int cpu;
+ int len;
+ int *data = kmalloc_array(nr_cpu_ids, sizeof(int), GFP_KERNEL);
+
+ if (!data)
+ return -ENOMEM;
+
+ for_each_online_cpu(cpu) {
+ unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si];
+
+ data[cpu] = x;
+ sum += x;
+ }
+
+ len = sprintf(buf, "%lu", sum);
+
+#ifdef CONFIG_SMP
+ for_each_online_cpu(cpu) {
+ if (data[cpu] && len < PAGE_SIZE - 20)
+ len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]);
+ }
+#endif
+ kfree(data);
+ return len + sprintf(buf + len, "\n");
+}
+
+static void clear_stat(struct kmem_cache *s, enum stat_item si)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0;
+}
+
+#define STAT_ATTR(si, text) \
+static ssize_t text##_show(struct kmem_cache *s, char *buf) \
+{ \
+ return show_stat(s, buf, si); \
+} \
+static ssize_t text##_store(struct kmem_cache *s, \
+ const char *buf, size_t length) \
+{ \
+ if (buf[0] != '0') \
+ return -EINVAL; \
+ clear_stat(s, si); \
+ return length; \
+} \
+SLAB_ATTR(text); \
+
+STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
+STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
+STAT_ATTR(FREE_FASTPATH, free_fastpath);
+STAT_ATTR(FREE_SLOWPATH, free_slowpath);
+STAT_ATTR(FREE_FROZEN, free_frozen);
+STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial);
+STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
+STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
+STAT_ATTR(ALLOC_SLAB, alloc_slab);
+STAT_ATTR(ALLOC_REFILL, alloc_refill);
+STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch);
+STAT_ATTR(FREE_SLAB, free_slab);
+STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
+STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
+STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
+STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
+STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
+STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
+STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass);
+STAT_ATTR(ORDER_FALLBACK, order_fallback);
+STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
+STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
+STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
+STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
+STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
+STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
+#endif /* CONFIG_SLUB_STATS */
+
+static struct attribute *slab_attrs[] = {
+ &slab_size_attr.attr,
+ &object_size_attr.attr,
+ &objs_per_slab_attr.attr,
+ &order_attr.attr,
+ &min_partial_attr.attr,
+ &cpu_partial_attr.attr,
+ &objects_attr.attr,
+ &objects_partial_attr.attr,
+ &partial_attr.attr,
+ &cpu_slabs_attr.attr,
+ &ctor_attr.attr,
+ &aliases_attr.attr,
+ &align_attr.attr,
+ &hwcache_align_attr.attr,
+ &reclaim_account_attr.attr,
+ &destroy_by_rcu_attr.attr,
+ &shrink_attr.attr,
+ &slabs_cpu_partial_attr.attr,
+#ifdef CONFIG_SLUB_DEBUG
+ &total_objects_attr.attr,
+ &slabs_attr.attr,
+ &sanity_checks_attr.attr,
+ &trace_attr.attr,
+ &red_zone_attr.attr,
+ &poison_attr.attr,
+ &store_user_attr.attr,
+ &validate_attr.attr,
+ &alloc_calls_attr.attr,
+ &free_calls_attr.attr,
+#endif
+#ifdef CONFIG_ZONE_DMA
+ &cache_dma_attr.attr,
+#endif
+#ifdef CONFIG_NUMA
+ &remote_node_defrag_ratio_attr.attr,
+#endif
+#ifdef CONFIG_SLUB_STATS
+ &alloc_fastpath_attr.attr,
+ &alloc_slowpath_attr.attr,
+ &free_fastpath_attr.attr,
+ &free_slowpath_attr.attr,
+ &free_frozen_attr.attr,
+ &free_add_partial_attr.attr,
+ &free_remove_partial_attr.attr,
+ &alloc_from_partial_attr.attr,
+ &alloc_slab_attr.attr,
+ &alloc_refill_attr.attr,
+ &alloc_node_mismatch_attr.attr,
+ &free_slab_attr.attr,
+ &cpuslab_flush_attr.attr,
+ &deactivate_full_attr.attr,
+ &deactivate_empty_attr.attr,
+ &deactivate_to_head_attr.attr,
+ &deactivate_to_tail_attr.attr,
+ &deactivate_remote_frees_attr.attr,
+ &deactivate_bypass_attr.attr,
+ &order_fallback_attr.attr,
+ &cmpxchg_double_fail_attr.attr,
+ &cmpxchg_double_cpu_fail_attr.attr,
+ &cpu_partial_alloc_attr.attr,
+ &cpu_partial_free_attr.attr,
+ &cpu_partial_node_attr.attr,
+ &cpu_partial_drain_attr.attr,
+#endif
+#ifdef CONFIG_FAILSLAB
+ &failslab_attr.attr,
+#endif
+ &usersize_attr.attr,
+
+ NULL
+};
+
+static const struct attribute_group slab_attr_group = {
+ .attrs = slab_attrs,
+};
+
+static ssize_t slab_attr_show(struct kobject *kobj,
+ struct attribute *attr,
+ char *buf)
+{
+ struct slab_attribute *attribute;
+ struct kmem_cache *s;
+ int err;
+
+ attribute = to_slab_attr(attr);
+ s = to_slab(kobj);
+
+ if (!attribute->show)
+ return -EIO;
+
+ err = attribute->show(s, buf);
+
+ return err;
+}
+
+static ssize_t slab_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct slab_attribute *attribute;
+ struct kmem_cache *s;
+ int err;
+
+ attribute = to_slab_attr(attr);
+ s = to_slab(kobj);
+
+ if (!attribute->store)
+ return -EIO;
+
+ err = attribute->store(s, buf, len);
+ return err;
+}
+
+static void kmem_cache_release(struct kobject *k)
+{
+ slab_kmem_cache_release(to_slab(k));
+}
+
+static const struct sysfs_ops slab_sysfs_ops = {
+ .show = slab_attr_show,
+ .store = slab_attr_store,
+};
+
+static struct kobj_type slab_ktype = {
+ .sysfs_ops = &slab_sysfs_ops,
+ .release = kmem_cache_release,
+};
+
+static struct kset *slab_kset;
+
+static inline struct kset *cache_kset(struct kmem_cache *s)
+{
+ return slab_kset;
+}
+
+#define ID_STR_LENGTH 64
+
+/* Create a unique string id for a slab cache:
+ *
+ * Format :[flags-]size
+ */
+static char *create_unique_id(struct kmem_cache *s)
+{
+ char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
+ char *p = name;
+
+ if (!name)
+ return ERR_PTR(-ENOMEM);
+
+ *p++ = ':';
+ /*
+ * First flags affecting slabcache operations. We will only
+ * get here for aliasable slabs so we do not need to support
+ * too many flags. The flags here must cover all flags that
+ * are matched during merging to guarantee that the id is
+ * unique.
+ */
+ if (s->flags & SLAB_CACHE_DMA)
+ *p++ = 'd';
+ if (s->flags & SLAB_CACHE_DMA32)
+ *p++ = 'D';
+ if (s->flags & SLAB_RECLAIM_ACCOUNT)
+ *p++ = 'a';
+ if (s->flags & SLAB_CONSISTENCY_CHECKS)
+ *p++ = 'F';
+ if (s->flags & SLAB_ACCOUNT)
+ *p++ = 'A';
+ if (p != name + 1)
+ *p++ = '-';
+ p += sprintf(p, "%07u", s->size);
+
+ BUG_ON(p > name + ID_STR_LENGTH - 1);
+ return name;
+}
+
+static int sysfs_slab_add(struct kmem_cache *s)
+{
+ int err;
+ const char *name;
+ struct kset *kset = cache_kset(s);
+ int unmergeable = slab_unmergeable(s);
+
+ if (!kset) {
+ kobject_init(&s->kobj, &slab_ktype);
+ return 0;
+ }
+
+ if (!unmergeable && disable_higher_order_debug &&
+ (slub_debug & DEBUG_METADATA_FLAGS))
+ unmergeable = 1;
+
+ if (unmergeable) {
+ /*
+ * Slabcache can never be merged so we can use the name proper.
+ * This is typically the case for debug situations. In that
+ * case we can catch duplicate names easily.
+ */
+ sysfs_remove_link(&slab_kset->kobj, s->name);
+ name = s->name;
+ } else {
+ /*
+ * Create a unique name for the slab as a target
+ * for the symlinks.
+ */
+ name = create_unique_id(s);
+ if (IS_ERR(name))
+ return PTR_ERR(name);
+ }
+
+ s->kobj.kset = kset;
+ err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
+ if (err)
+ goto out;
+
+ err = sysfs_create_group(&s->kobj, &slab_attr_group);
+ if (err)
+ goto out_del_kobj;
+
+ if (!unmergeable) {
+ /* Setup first alias */
+ sysfs_slab_alias(s, s->name);
+ }
+out:
+ if (!unmergeable)
+ kfree(name);
+ return err;
+out_del_kobj:
+ kobject_del(&s->kobj);
+ goto out;
+}
+
+void sysfs_slab_unlink(struct kmem_cache *s)
+{
+ if (slab_state >= FULL)
+ kobject_del(&s->kobj);
+}
+
+void sysfs_slab_release(struct kmem_cache *s)
+{
+ if (slab_state >= FULL)
+ kobject_put(&s->kobj);
+}
+
+/*
+ * Need to buffer aliases during bootup until sysfs becomes
+ * available lest we lose that information.
+ */
+struct saved_alias {
+ struct kmem_cache *s;
+ const char *name;
+ struct saved_alias *next;
+};
+
+static struct saved_alias *alias_list;
+
+static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
+{
+ struct saved_alias *al;
+
+ if (slab_state == FULL) {
+ /*
+ * If we have a leftover link then remove it.
+ */
+ sysfs_remove_link(&slab_kset->kobj, name);
+ return sysfs_create_link(&slab_kset->kobj, &s->kobj, name);
+ }
+
+ al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
+ if (!al)
+ return -ENOMEM;
+
+ al->s = s;
+ al->name = name;
+ al->next = alias_list;
+ alias_list = al;
+ return 0;
+}
+
+static int __init slab_sysfs_init(void)
+{
+ struct kmem_cache *s;
+ int err;
+
+ mutex_lock(&slab_mutex);
+
+ slab_kset = kset_create_and_add("slab", NULL, kernel_kobj);
+ if (!slab_kset) {
+ mutex_unlock(&slab_mutex);
+ pr_err("Cannot register slab subsystem.\n");
+ return -ENOSYS;
+ }
+
+ slab_state = FULL;
+
+ list_for_each_entry(s, &slab_caches, list) {
+ err = sysfs_slab_add(s);
+ if (err)
+ pr_err("SLUB: Unable to add boot slab %s to sysfs\n",
+ s->name);
+ }
+
+ while (alias_list) {
+ struct saved_alias *al = alias_list;
+
+ alias_list = alias_list->next;
+ err = sysfs_slab_alias(al->s, al->name);
+ if (err)
+ pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n",
+ al->name);
+ kfree(al);
+ }
+
+ mutex_unlock(&slab_mutex);
+ resiliency_test();
+ return 0;
+}
+
+__initcall(slab_sysfs_init);
+#endif /* CONFIG_SYSFS */
+
+/*
+ * The /proc/slabinfo ABI
+ */
+#ifdef CONFIG_SLUB_DEBUG
+void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
+{
+ unsigned long nr_slabs = 0;
+ unsigned long nr_objs = 0;
+ unsigned long nr_free = 0;
+ int node;
+ struct kmem_cache_node *n;
+
+ for_each_kmem_cache_node(s, node, n) {
+ nr_slabs += node_nr_slabs(n);
+ nr_objs += node_nr_objs(n);
+ nr_free += count_partial(n, count_free);
+ }
+
+ sinfo->active_objs = nr_objs - nr_free;
+ sinfo->num_objs = nr_objs;
+ sinfo->active_slabs = nr_slabs;
+ sinfo->num_slabs = nr_slabs;
+ sinfo->objects_per_slab = oo_objects(s->oo);
+ sinfo->cache_order = oo_order(s->oo);
+}
+
+void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s)
+{
+}
+
+ssize_t slabinfo_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ return -EIO;
+}
+#endif /* CONFIG_SLUB_DEBUG */
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
new file mode 100644
index 000000000..16183d85a
--- /dev/null
+++ b/mm/sparse-vmemmap.c
@@ -0,0 +1,265 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Virtual Memory Map support
+ *
+ * (C) 2007 sgi. Christoph Lameter.
+ *
+ * Virtual memory maps allow VM primitives pfn_to_page, page_to_pfn,
+ * virt_to_page, page_address() to be implemented as a base offset
+ * calculation without memory access.
+ *
+ * However, virtual mappings need a page table and TLBs. Many Linux
+ * architectures already map their physical space using 1-1 mappings
+ * via TLBs. For those arches the virtual memory map is essentially
+ * for free if we use the same page size as the 1-1 mappings. In that
+ * case the overhead consists of a few additional pages that are
+ * allocated to create a view of memory for vmemmap.
+ *
+ * The architecture is expected to provide a vmemmap_populate() function
+ * to instantiate the mapping.
+ */
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/memblock.h>
+#include <linux/memremap.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <linux/sched.h>
+#include <asm/dma.h>
+#include <asm/pgalloc.h>
+
+/*
+ * Allocate a block of memory to be used to back the virtual memory map
+ * or to back the page tables that are used to create the mapping.
+ * Uses the main allocators if they are available, else bootmem.
+ */
+
+static void * __ref __earlyonly_bootmem_alloc(int node,
+ unsigned long size,
+ unsigned long align,
+ unsigned long goal)
+{
+ return memblock_alloc_try_nid_raw(size, align, goal,
+ MEMBLOCK_ALLOC_ACCESSIBLE, node);
+}
+
+void * __meminit vmemmap_alloc_block(unsigned long size, int node)
+{
+ /* If the main allocator is up use that, fallback to bootmem. */
+ if (slab_is_available()) {
+ gfp_t gfp_mask = GFP_KERNEL|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
+ int order = get_order(size);
+ static bool warned;
+ struct page *page;
+
+ page = alloc_pages_node(node, gfp_mask, order);
+ if (page)
+ return page_address(page);
+
+ if (!warned) {
+ warn_alloc(gfp_mask & ~__GFP_NOWARN, NULL,
+ "vmemmap alloc failure: order:%u", order);
+ warned = true;
+ }
+ return NULL;
+ } else
+ return __earlyonly_bootmem_alloc(node, size, size,
+ __pa(MAX_DMA_ADDRESS));
+}
+
+static void * __meminit altmap_alloc_block_buf(unsigned long size,
+ struct vmem_altmap *altmap);
+
+/* need to make sure size is all the same during early stage */
+void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node,
+ struct vmem_altmap *altmap)
+{
+ void *ptr;
+
+ if (altmap)
+ return altmap_alloc_block_buf(size, altmap);
+
+ ptr = sparse_buffer_alloc(size);
+ if (!ptr)
+ ptr = vmemmap_alloc_block(size, node);
+ return ptr;
+}
+
+static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap)
+{
+ return altmap->base_pfn + altmap->reserve + altmap->alloc
+ + altmap->align;
+}
+
+static unsigned long __meminit vmem_altmap_nr_free(struct vmem_altmap *altmap)
+{
+ unsigned long allocated = altmap->alloc + altmap->align;
+
+ if (altmap->free > allocated)
+ return altmap->free - allocated;
+ return 0;
+}
+
+static void * __meminit altmap_alloc_block_buf(unsigned long size,
+ struct vmem_altmap *altmap)
+{
+ unsigned long pfn, nr_pfns, nr_align;
+
+ if (size & ~PAGE_MASK) {
+ pr_warn_once("%s: allocations must be multiple of PAGE_SIZE (%ld)\n",
+ __func__, size);
+ return NULL;
+ }
+
+ pfn = vmem_altmap_next_pfn(altmap);
+ nr_pfns = size >> PAGE_SHIFT;
+ nr_align = 1UL << find_first_bit(&nr_pfns, BITS_PER_LONG);
+ nr_align = ALIGN(pfn, nr_align) - pfn;
+ if (nr_pfns + nr_align > vmem_altmap_nr_free(altmap))
+ return NULL;
+
+ altmap->alloc += nr_pfns;
+ altmap->align += nr_align;
+ pfn += nr_align;
+
+ pr_debug("%s: pfn: %#lx alloc: %ld align: %ld nr: %#lx\n",
+ __func__, pfn, altmap->alloc, altmap->align, nr_pfns);
+ return __va(__pfn_to_phys(pfn));
+}
+
+void __meminit vmemmap_verify(pte_t *pte, int node,
+ unsigned long start, unsigned long end)
+{
+ unsigned long pfn = pte_pfn(*pte);
+ int actual_node = early_pfn_to_nid(pfn);
+
+ if (node_distance(actual_node, node) > LOCAL_DISTANCE)
+ pr_warn("[%lx-%lx] potential offnode page_structs\n",
+ start, end - 1);
+}
+
+pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
+ struct vmem_altmap *altmap)
+{
+ pte_t *pte = pte_offset_kernel(pmd, addr);
+ if (pte_none(*pte)) {
+ pte_t entry;
+ void *p;
+
+ p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
+ if (!p)
+ return NULL;
+ entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
+ set_pte_at(&init_mm, addr, pte, entry);
+ }
+ return pte;
+}
+
+static void * __meminit vmemmap_alloc_block_zero(unsigned long size, int node)
+{
+ void *p = vmemmap_alloc_block(size, node);
+
+ if (!p)
+ return NULL;
+ memset(p, 0, size);
+
+ return p;
+}
+
+pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
+{
+ pmd_t *pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd)) {
+ void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
+ if (!p)
+ return NULL;
+ pmd_populate_kernel(&init_mm, pmd, p);
+ }
+ return pmd;
+}
+
+pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
+{
+ pud_t *pud = pud_offset(p4d, addr);
+ if (pud_none(*pud)) {
+ void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
+ if (!p)
+ return NULL;
+ pud_populate(&init_mm, pud, p);
+ }
+ return pud;
+}
+
+p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
+{
+ p4d_t *p4d = p4d_offset(pgd, addr);
+ if (p4d_none(*p4d)) {
+ void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
+ if (!p)
+ return NULL;
+ p4d_populate(&init_mm, p4d, p);
+ }
+ return p4d;
+}
+
+pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
+{
+ pgd_t *pgd = pgd_offset_k(addr);
+ if (pgd_none(*pgd)) {
+ void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
+ if (!p)
+ return NULL;
+ pgd_populate(&init_mm, pgd, p);
+ }
+ return pgd;
+}
+
+int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
+ int node, struct vmem_altmap *altmap)
+{
+ unsigned long addr = start;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ for (; addr < end; addr += PAGE_SIZE) {
+ pgd = vmemmap_pgd_populate(addr, node);
+ if (!pgd)
+ return -ENOMEM;
+ p4d = vmemmap_p4d_populate(pgd, addr, node);
+ if (!p4d)
+ return -ENOMEM;
+ pud = vmemmap_pud_populate(p4d, addr, node);
+ if (!pud)
+ return -ENOMEM;
+ pmd = vmemmap_pmd_populate(pud, addr, node);
+ if (!pmd)
+ return -ENOMEM;
+ pte = vmemmap_pte_populate(pmd, addr, node, altmap);
+ if (!pte)
+ return -ENOMEM;
+ vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+ }
+
+ return 0;
+}
+
+struct page * __meminit __populate_section_memmap(unsigned long pfn,
+ unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
+{
+ unsigned long start = (unsigned long) pfn_to_page(pfn);
+ unsigned long end = start + nr_pages * sizeof(struct page);
+
+ if (WARN_ON_ONCE(!IS_ALIGNED(pfn, PAGES_PER_SUBSECTION) ||
+ !IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION)))
+ return NULL;
+
+ if (vmemmap_populate(start, end, nid, altmap))
+ return NULL;
+
+ return pfn_to_page(pfn);
+}
diff --git a/mm/sparse.c b/mm/sparse.c
new file mode 100644
index 000000000..33406ea2e
--- /dev/null
+++ b/mm/sparse.c
@@ -0,0 +1,974 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * sparse memory mappings.
+ */
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/mmzone.h>
+#include <linux/memblock.h>
+#include <linux/compiler.h>
+#include <linux/highmem.h>
+#include <linux/export.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+
+#include "internal.h"
+#include <asm/dma.h>
+
+/*
+ * Permanent SPARSEMEM data:
+ *
+ * 1) mem_section - memory sections, mem_map's for valid memory
+ */
+#ifdef CONFIG_SPARSEMEM_EXTREME
+struct mem_section **mem_section;
+#else
+struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
+ ____cacheline_internodealigned_in_smp;
+#endif
+EXPORT_SYMBOL(mem_section);
+
+#ifdef NODE_NOT_IN_PAGE_FLAGS
+/*
+ * If we did not store the node number in the page then we have to
+ * do a lookup in the section_to_node_table in order to find which
+ * node the page belongs to.
+ */
+#if MAX_NUMNODES <= 256
+static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
+#else
+static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
+#endif
+
+int page_to_nid(const struct page *page)
+{
+ return section_to_node_table[page_to_section(page)];
+}
+EXPORT_SYMBOL(page_to_nid);
+
+static void set_section_nid(unsigned long section_nr, int nid)
+{
+ section_to_node_table[section_nr] = nid;
+}
+#else /* !NODE_NOT_IN_PAGE_FLAGS */
+static inline void set_section_nid(unsigned long section_nr, int nid)
+{
+}
+#endif
+
+#ifdef CONFIG_SPARSEMEM_EXTREME
+static noinline struct mem_section __ref *sparse_index_alloc(int nid)
+{
+ struct mem_section *section = NULL;
+ unsigned long array_size = SECTIONS_PER_ROOT *
+ sizeof(struct mem_section);
+
+ if (slab_is_available()) {
+ section = kzalloc_node(array_size, GFP_KERNEL, nid);
+ } else {
+ section = memblock_alloc_node(array_size, SMP_CACHE_BYTES,
+ nid);
+ if (!section)
+ panic("%s: Failed to allocate %lu bytes nid=%d\n",
+ __func__, array_size, nid);
+ }
+
+ return section;
+}
+
+static int __meminit sparse_index_init(unsigned long section_nr, int nid)
+{
+ unsigned long root = SECTION_NR_TO_ROOT(section_nr);
+ struct mem_section *section;
+
+ /*
+ * An existing section is possible in the sub-section hotplug
+ * case. First hot-add instantiates, follow-on hot-add reuses
+ * the existing section.
+ *
+ * The mem_hotplug_lock resolves the apparent race below.
+ */
+ if (mem_section[root])
+ return 0;
+
+ section = sparse_index_alloc(nid);
+ if (!section)
+ return -ENOMEM;
+
+ mem_section[root] = section;
+
+ return 0;
+}
+#else /* !SPARSEMEM_EXTREME */
+static inline int sparse_index_init(unsigned long section_nr, int nid)
+{
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_SPARSEMEM_EXTREME
+unsigned long __section_nr(struct mem_section *ms)
+{
+ unsigned long root_nr;
+ struct mem_section *root = NULL;
+
+ for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) {
+ root = __nr_to_section(root_nr * SECTIONS_PER_ROOT);
+ if (!root)
+ continue;
+
+ if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT)))
+ break;
+ }
+
+ VM_BUG_ON(!root);
+
+ return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
+}
+#else
+unsigned long __section_nr(struct mem_section *ms)
+{
+ return (unsigned long)(ms - mem_section[0]);
+}
+#endif
+
+/*
+ * During early boot, before section_mem_map is used for an actual
+ * mem_map, we use section_mem_map to store the section's NUMA
+ * node. This keeps us from having to use another data structure. The
+ * node information is cleared just before we store the real mem_map.
+ */
+static inline unsigned long sparse_encode_early_nid(int nid)
+{
+ return (nid << SECTION_NID_SHIFT);
+}
+
+static inline int sparse_early_nid(struct mem_section *section)
+{
+ return (section->section_mem_map >> SECTION_NID_SHIFT);
+}
+
+/* Validate the physical addressing limitations of the model */
+void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
+ unsigned long *end_pfn)
+{
+ unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);
+
+ /*
+ * Sanity checks - do not allow an architecture to pass
+ * in larger pfns than the maximum scope of sparsemem:
+ */
+ if (*start_pfn > max_sparsemem_pfn) {
+ mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
+ "Start of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
+ *start_pfn, *end_pfn, max_sparsemem_pfn);
+ WARN_ON_ONCE(1);
+ *start_pfn = max_sparsemem_pfn;
+ *end_pfn = max_sparsemem_pfn;
+ } else if (*end_pfn > max_sparsemem_pfn) {
+ mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
+ "End of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
+ *start_pfn, *end_pfn, max_sparsemem_pfn);
+ WARN_ON_ONCE(1);
+ *end_pfn = max_sparsemem_pfn;
+ }
+}
+
+/*
+ * There are a number of times that we loop over NR_MEM_SECTIONS,
+ * looking for section_present() on each. But, when we have very
+ * large physical address spaces, NR_MEM_SECTIONS can also be
+ * very large which makes the loops quite long.
+ *
+ * Keeping track of this gives us an easy way to break out of
+ * those loops early.
+ */
+unsigned long __highest_present_section_nr;
+static void section_mark_present(struct mem_section *ms)
+{
+ unsigned long section_nr = __section_nr(ms);
+
+ if (section_nr > __highest_present_section_nr)
+ __highest_present_section_nr = section_nr;
+
+ ms->section_mem_map |= SECTION_MARKED_PRESENT;
+}
+
+#define for_each_present_section_nr(start, section_nr) \
+ for (section_nr = next_present_section_nr(start-1); \
+ ((section_nr != -1) && \
+ (section_nr <= __highest_present_section_nr)); \
+ section_nr = next_present_section_nr(section_nr))
+
+static inline unsigned long first_present_section_nr(void)
+{
+ return next_present_section_nr(-1);
+}
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+static void subsection_mask_set(unsigned long *map, unsigned long pfn,
+ unsigned long nr_pages)
+{
+ int idx = subsection_map_index(pfn);
+ int end = subsection_map_index(pfn + nr_pages - 1);
+
+ bitmap_set(map, idx, end - idx + 1);
+}
+
+void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
+{
+ int end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
+ unsigned long nr, start_sec = pfn_to_section_nr(pfn);
+
+ if (!nr_pages)
+ return;
+
+ for (nr = start_sec; nr <= end_sec; nr++) {
+ struct mem_section *ms;
+ unsigned long pfns;
+
+ pfns = min(nr_pages, PAGES_PER_SECTION
+ - (pfn & ~PAGE_SECTION_MASK));
+ ms = __nr_to_section(nr);
+ subsection_mask_set(ms->usage->subsection_map, pfn, pfns);
+
+ pr_debug("%s: sec: %lu pfns: %lu set(%d, %d)\n", __func__, nr,
+ pfns, subsection_map_index(pfn),
+ subsection_map_index(pfn + pfns - 1));
+
+ pfn += pfns;
+ nr_pages -= pfns;
+ }
+}
+#else
+void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
+{
+}
+#endif
+
+/* Record a memory area against a node. */
+static void __init memory_present(int nid, unsigned long start, unsigned long end)
+{
+ unsigned long pfn;
+
+#ifdef CONFIG_SPARSEMEM_EXTREME
+ if (unlikely(!mem_section)) {
+ unsigned long size, align;
+
+ size = sizeof(struct mem_section*) * NR_SECTION_ROOTS;
+ align = 1 << (INTERNODE_CACHE_SHIFT);
+ mem_section = memblock_alloc(size, align);
+ if (!mem_section)
+ panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
+ __func__, size, align);
+ }
+#endif
+
+ start &= PAGE_SECTION_MASK;
+ mminit_validate_memmodel_limits(&start, &end);
+ for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
+ unsigned long section = pfn_to_section_nr(pfn);
+ struct mem_section *ms;
+
+ sparse_index_init(section, nid);
+ set_section_nid(section, nid);
+
+ ms = __nr_to_section(section);
+ if (!ms->section_mem_map) {
+ ms->section_mem_map = sparse_encode_early_nid(nid) |
+ SECTION_IS_ONLINE;
+ section_mark_present(ms);
+ }
+ }
+}
+
+/*
+ * Mark all memblocks as present using memory_present().
+ * This is a convenience function that is useful to mark all of the systems
+ * memory as present during initialization.
+ */
+static void __init memblocks_present(void)
+{
+ unsigned long start, end;
+ int i, nid;
+
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid)
+ memory_present(nid, start, end);
+}
+
+/*
+ * Subtle, we encode the real pfn into the mem_map such that
+ * the identity pfn - section_mem_map will return the actual
+ * physical page frame number.
+ */
+static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum)
+{
+ unsigned long coded_mem_map =
+ (unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
+ BUILD_BUG_ON(SECTION_MAP_LAST_BIT > (1UL<<PFN_SECTION_SHIFT));
+ BUG_ON(coded_mem_map & ~SECTION_MAP_MASK);
+ return coded_mem_map;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * Decode mem_map from the coded memmap
+ */
+struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum)
+{
+ /* mask off the extra low bits of information */
+ coded_mem_map &= SECTION_MAP_MASK;
+ return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+static void __meminit sparse_init_one_section(struct mem_section *ms,
+ unsigned long pnum, struct page *mem_map,
+ struct mem_section_usage *usage, unsigned long flags)
+{
+ ms->section_mem_map &= ~SECTION_MAP_MASK;
+ ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum)
+ | SECTION_HAS_MEM_MAP | flags;
+ ms->usage = usage;
+}
+
+static unsigned long usemap_size(void)
+{
+ return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long);
+}
+
+size_t mem_section_usage_size(void)
+{
+ return sizeof(struct mem_section_usage) + usemap_size();
+}
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+static struct mem_section_usage * __init
+sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
+ unsigned long size)
+{
+ struct mem_section_usage *usage;
+ unsigned long goal, limit;
+ int nid;
+ /*
+ * A page may contain usemaps for other sections preventing the
+ * page being freed and making a section unremovable while
+ * other sections referencing the usemap remain active. Similarly,
+ * a pgdat can prevent a section being removed. If section A
+ * contains a pgdat and section B contains the usemap, both
+ * sections become inter-dependent. This allocates usemaps
+ * from the same section as the pgdat where possible to avoid
+ * this problem.
+ */
+ goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
+ limit = goal + (1UL << PA_SECTION_SHIFT);
+ nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
+again:
+ usage = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid);
+ if (!usage && limit) {
+ limit = 0;
+ goto again;
+ }
+ return usage;
+}
+
+static void __init check_usemap_section_nr(int nid,
+ struct mem_section_usage *usage)
+{
+ unsigned long usemap_snr, pgdat_snr;
+ static unsigned long old_usemap_snr;
+ static unsigned long old_pgdat_snr;
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ int usemap_nid;
+
+ /* First call */
+ if (!old_usemap_snr) {
+ old_usemap_snr = NR_MEM_SECTIONS;
+ old_pgdat_snr = NR_MEM_SECTIONS;
+ }
+
+ usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
+ pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
+ if (usemap_snr == pgdat_snr)
+ return;
+
+ if (old_usemap_snr == usemap_snr && old_pgdat_snr == pgdat_snr)
+ /* skip redundant message */
+ return;
+
+ old_usemap_snr = usemap_snr;
+ old_pgdat_snr = pgdat_snr;
+
+ usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr));
+ if (usemap_nid != nid) {
+ pr_info("node %d must be removed before remove section %ld\n",
+ nid, usemap_snr);
+ return;
+ }
+ /*
+ * There is a circular dependency.
+ * Some platforms allow un-removable section because they will just
+ * gather other removable sections for dynamic partitioning.
+ * Just notify un-removable section's number here.
+ */
+ pr_info("Section %ld and %ld (node %d) have a circular dependency on usemap and pgdat allocations\n",
+ usemap_snr, pgdat_snr, nid);
+}
+#else
+static struct mem_section_usage * __init
+sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
+ unsigned long size)
+{
+ return memblock_alloc_node(size, SMP_CACHE_BYTES, pgdat->node_id);
+}
+
+static void __init check_usemap_section_nr(int nid,
+ struct mem_section_usage *usage)
+{
+}
+#endif /* CONFIG_MEMORY_HOTREMOVE */
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+static unsigned long __init section_map_size(void)
+{
+ return ALIGN(sizeof(struct page) * PAGES_PER_SECTION, PMD_SIZE);
+}
+
+#else
+static unsigned long __init section_map_size(void)
+{
+ return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
+}
+
+struct page __init *__populate_section_memmap(unsigned long pfn,
+ unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
+{
+ unsigned long size = section_map_size();
+ struct page *map = sparse_buffer_alloc(size);
+ phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
+
+ if (map)
+ return map;
+
+ map = memblock_alloc_try_nid_raw(size, size, addr,
+ MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ if (!map)
+ panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n",
+ __func__, size, PAGE_SIZE, nid, &addr);
+
+ return map;
+}
+#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
+
+static void *sparsemap_buf __meminitdata;
+static void *sparsemap_buf_end __meminitdata;
+
+static inline void __meminit sparse_buffer_free(unsigned long size)
+{
+ WARN_ON(!sparsemap_buf || size == 0);
+ memblock_free_early(__pa(sparsemap_buf), size);
+}
+
+static void __init sparse_buffer_init(unsigned long size, int nid)
+{
+ phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
+ WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? */
+ /*
+ * Pre-allocated buffer is mainly used by __populate_section_memmap
+ * and we want it to be properly aligned to the section size - this is
+ * especially the case for VMEMMAP which maps memmap to PMDs
+ */
+ sparsemap_buf = memblock_alloc_exact_nid_raw(size, section_map_size(),
+ addr, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ sparsemap_buf_end = sparsemap_buf + size;
+}
+
+static void __init sparse_buffer_fini(void)
+{
+ unsigned long size = sparsemap_buf_end - sparsemap_buf;
+
+ if (sparsemap_buf && size > 0)
+ sparse_buffer_free(size);
+ sparsemap_buf = NULL;
+}
+
+void * __meminit sparse_buffer_alloc(unsigned long size)
+{
+ void *ptr = NULL;
+
+ if (sparsemap_buf) {
+ ptr = (void *) roundup((unsigned long)sparsemap_buf, size);
+ if (ptr + size > sparsemap_buf_end)
+ ptr = NULL;
+ else {
+ /* Free redundant aligned space */
+ if ((unsigned long)(ptr - sparsemap_buf) > 0)
+ sparse_buffer_free((unsigned long)(ptr - sparsemap_buf));
+ sparsemap_buf = ptr + size;
+ }
+ }
+ return ptr;
+}
+
+void __weak __meminit vmemmap_populate_print_last(void)
+{
+}
+
+/*
+ * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end)
+ * And number of present sections in this node is map_count.
+ */
+static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
+ unsigned long pnum_end,
+ unsigned long map_count)
+{
+ struct mem_section_usage *usage;
+ unsigned long pnum;
+ struct page *map;
+
+ usage = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
+ mem_section_usage_size() * map_count);
+ if (!usage) {
+ pr_err("%s: node[%d] usemap allocation failed", __func__, nid);
+ goto failed;
+ }
+ sparse_buffer_init(map_count * section_map_size(), nid);
+ for_each_present_section_nr(pnum_begin, pnum) {
+ unsigned long pfn = section_nr_to_pfn(pnum);
+
+ if (pnum >= pnum_end)
+ break;
+
+ map = __populate_section_memmap(pfn, PAGES_PER_SECTION,
+ nid, NULL);
+ if (!map) {
+ pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
+ __func__, nid);
+ pnum_begin = pnum;
+ sparse_buffer_fini();
+ goto failed;
+ }
+ check_usemap_section_nr(nid, usage);
+ sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage,
+ SECTION_IS_EARLY);
+ usage = (void *) usage + mem_section_usage_size();
+ }
+ sparse_buffer_fini();
+ return;
+failed:
+ /* We failed to allocate, mark all the following pnums as not present */
+ for_each_present_section_nr(pnum_begin, pnum) {
+ struct mem_section *ms;
+
+ if (pnum >= pnum_end)
+ break;
+ ms = __nr_to_section(pnum);
+ ms->section_mem_map = 0;
+ }
+}
+
+/*
+ * Allocate the accumulated non-linear sections, allocate a mem_map
+ * for each and record the physical to section mapping.
+ */
+void __init sparse_init(void)
+{
+ unsigned long pnum_end, pnum_begin, map_count = 1;
+ int nid_begin;
+
+ memblocks_present();
+
+ pnum_begin = first_present_section_nr();
+ nid_begin = sparse_early_nid(__nr_to_section(pnum_begin));
+
+ /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
+ set_pageblock_order();
+
+ for_each_present_section_nr(pnum_begin + 1, pnum_end) {
+ int nid = sparse_early_nid(__nr_to_section(pnum_end));
+
+ if (nid == nid_begin) {
+ map_count++;
+ continue;
+ }
+ /* Init node with sections in range [pnum_begin, pnum_end) */
+ sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
+ nid_begin = nid;
+ pnum_begin = pnum_end;
+ map_count = 1;
+ }
+ /* cover the last node */
+ sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
+ vmemmap_populate_print_last();
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+
+/* Mark all memory sections within the pfn range as online */
+void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned long pfn;
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+ unsigned long section_nr = pfn_to_section_nr(pfn);
+ struct mem_section *ms;
+
+ /* onlining code should never touch invalid ranges */
+ if (WARN_ON(!valid_section_nr(section_nr)))
+ continue;
+
+ ms = __nr_to_section(section_nr);
+ ms->section_mem_map |= SECTION_IS_ONLINE;
+ }
+}
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+/* Mark all memory sections within the pfn range as offline */
+void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned long pfn;
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+ unsigned long section_nr = pfn_to_section_nr(pfn);
+ struct mem_section *ms;
+
+ /*
+ * TODO this needs some double checking. Offlining code makes
+ * sure to check pfn_valid but those checks might be just bogus
+ */
+ if (WARN_ON(!valid_section_nr(section_nr)))
+ continue;
+
+ ms = __nr_to_section(section_nr);
+ ms->section_mem_map &= ~SECTION_IS_ONLINE;
+ }
+}
+#endif
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+static struct page * __meminit populate_section_memmap(unsigned long pfn,
+ unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
+{
+ return __populate_section_memmap(pfn, nr_pages, nid, altmap);
+}
+
+static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
+ struct vmem_altmap *altmap)
+{
+ unsigned long start = (unsigned long) pfn_to_page(pfn);
+ unsigned long end = start + nr_pages * sizeof(struct page);
+
+ vmemmap_free(start, end, altmap);
+}
+static void free_map_bootmem(struct page *memmap)
+{
+ unsigned long start = (unsigned long)memmap;
+ unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
+
+ vmemmap_free(start, end, NULL);
+}
+
+static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages)
+{
+ DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
+ DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 };
+ struct mem_section *ms = __pfn_to_section(pfn);
+ unsigned long *subsection_map = ms->usage
+ ? &ms->usage->subsection_map[0] : NULL;
+
+ subsection_mask_set(map, pfn, nr_pages);
+ if (subsection_map)
+ bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION);
+
+ if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION),
+ "section already deactivated (%#lx + %ld)\n",
+ pfn, nr_pages))
+ return -EINVAL;
+
+ bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
+ return 0;
+}
+
+static bool is_subsection_map_empty(struct mem_section *ms)
+{
+ return bitmap_empty(&ms->usage->subsection_map[0],
+ SUBSECTIONS_PER_SECTION);
+}
+
+static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages)
+{
+ struct mem_section *ms = __pfn_to_section(pfn);
+ DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
+ unsigned long *subsection_map;
+ int rc = 0;
+
+ subsection_mask_set(map, pfn, nr_pages);
+
+ subsection_map = &ms->usage->subsection_map[0];
+
+ if (bitmap_empty(map, SUBSECTIONS_PER_SECTION))
+ rc = -EINVAL;
+ else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION))
+ rc = -EEXIST;
+ else
+ bitmap_or(subsection_map, map, subsection_map,
+ SUBSECTIONS_PER_SECTION);
+
+ return rc;
+}
+#else
+struct page * __meminit populate_section_memmap(unsigned long pfn,
+ unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
+{
+ return kvmalloc_node(array_size(sizeof(struct page),
+ PAGES_PER_SECTION), GFP_KERNEL, nid);
+}
+
+static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
+ struct vmem_altmap *altmap)
+{
+ kvfree(pfn_to_page(pfn));
+}
+
+static void free_map_bootmem(struct page *memmap)
+{
+ unsigned long maps_section_nr, removing_section_nr, i;
+ unsigned long magic, nr_pages;
+ struct page *page = virt_to_page(memmap);
+
+ nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
+ >> PAGE_SHIFT;
+
+ for (i = 0; i < nr_pages; i++, page++) {
+ magic = (unsigned long) page->freelist;
+
+ BUG_ON(magic == NODE_INFO);
+
+ maps_section_nr = pfn_to_section_nr(page_to_pfn(page));
+ removing_section_nr = page_private(page);
+
+ /*
+ * When this function is called, the removing section is
+ * logical offlined state. This means all pages are isolated
+ * from page allocator. If removing section's memmap is placed
+ * on the same section, it must not be freed.
+ * If it is freed, page allocator may allocate it which will
+ * be removed physically soon.
+ */
+ if (maps_section_nr != removing_section_nr)
+ put_page_bootmem(page);
+ }
+}
+
+static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages)
+{
+ return 0;
+}
+
+static bool is_subsection_map_empty(struct mem_section *ms)
+{
+ return true;
+}
+
+static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages)
+{
+ return 0;
+}
+#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+
+/*
+ * To deactivate a memory region, there are 3 cases to handle across
+ * two configurations (SPARSEMEM_VMEMMAP={y,n}):
+ *
+ * 1. deactivation of a partial hot-added section (only possible in
+ * the SPARSEMEM_VMEMMAP=y case).
+ * a) section was present at memory init.
+ * b) section was hot-added post memory init.
+ * 2. deactivation of a complete hot-added section.
+ * 3. deactivation of a complete section from memory init.
+ *
+ * For 1, when subsection_map does not empty we will not be freeing the
+ * usage map, but still need to free the vmemmap range.
+ *
+ * For 2 and 3, the SPARSEMEM_VMEMMAP={y,n} cases are unified
+ */
+static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
+ struct vmem_altmap *altmap)
+{
+ struct mem_section *ms = __pfn_to_section(pfn);
+ bool section_is_early = early_section(ms);
+ struct page *memmap = NULL;
+ bool empty;
+
+ if (clear_subsection_map(pfn, nr_pages))
+ return;
+
+ empty = is_subsection_map_empty(ms);
+ if (empty) {
+ unsigned long section_nr = pfn_to_section_nr(pfn);
+
+ /*
+ * When removing an early section, the usage map is kept (as the
+ * usage maps of other sections fall into the same page). It
+ * will be re-used when re-adding the section - which is then no
+ * longer an early section. If the usage map is PageReserved, it
+ * was allocated during boot.
+ */
+ if (!PageReserved(virt_to_page(ms->usage))) {
+ kfree(ms->usage);
+ ms->usage = NULL;
+ }
+ memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
+ /*
+ * Mark the section invalid so that valid_section()
+ * return false. This prevents code from dereferencing
+ * ms->usage array.
+ */
+ ms->section_mem_map &= ~SECTION_HAS_MEM_MAP;
+ }
+
+ /*
+ * The memmap of early sections is always fully populated. See
+ * section_activate() and pfn_valid() .
+ */
+ if (!section_is_early)
+ depopulate_section_memmap(pfn, nr_pages, altmap);
+ else if (memmap)
+ free_map_bootmem(memmap);
+
+ if (empty)
+ ms->section_mem_map = (unsigned long)NULL;
+}
+
+static struct page * __meminit section_activate(int nid, unsigned long pfn,
+ unsigned long nr_pages, struct vmem_altmap *altmap)
+{
+ struct mem_section *ms = __pfn_to_section(pfn);
+ struct mem_section_usage *usage = NULL;
+ struct page *memmap;
+ int rc = 0;
+
+ if (!ms->usage) {
+ usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
+ if (!usage)
+ return ERR_PTR(-ENOMEM);
+ ms->usage = usage;
+ }
+
+ rc = fill_subsection_map(pfn, nr_pages);
+ if (rc) {
+ if (usage)
+ ms->usage = NULL;
+ kfree(usage);
+ return ERR_PTR(rc);
+ }
+
+ /*
+ * The early init code does not consider partially populated
+ * initial sections, it simply assumes that memory will never be
+ * referenced. If we hot-add memory into such a section then we
+ * do not need to populate the memmap and can simply reuse what
+ * is already there.
+ */
+ if (nr_pages < PAGES_PER_SECTION && early_section(ms))
+ return pfn_to_page(pfn);
+
+ memmap = populate_section_memmap(pfn, nr_pages, nid, altmap);
+ if (!memmap) {
+ section_deactivate(pfn, nr_pages, altmap);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return memmap;
+}
+
+/**
+ * sparse_add_section - add a memory section, or populate an existing one
+ * @nid: The node to add section on
+ * @start_pfn: start pfn of the memory range
+ * @nr_pages: number of pfns to add in the section
+ * @altmap: device page map
+ *
+ * This is only intended for hotplug.
+ *
+ * Note that only VMEMMAP supports sub-section aligned hotplug,
+ * the proper alignment and size are gated by check_pfn_span().
+ *
+ *
+ * Return:
+ * * 0 - On success.
+ * * -EEXIST - Section has been present.
+ * * -ENOMEM - Out of memory.
+ */
+int __meminit sparse_add_section(int nid, unsigned long start_pfn,
+ unsigned long nr_pages, struct vmem_altmap *altmap)
+{
+ unsigned long section_nr = pfn_to_section_nr(start_pfn);
+ struct mem_section *ms;
+ struct page *memmap;
+ int ret;
+
+ ret = sparse_index_init(section_nr, nid);
+ if (ret < 0)
+ return ret;
+
+ memmap = section_activate(nid, start_pfn, nr_pages, altmap);
+ if (IS_ERR(memmap))
+ return PTR_ERR(memmap);
+
+ /*
+ * Poison uninitialized struct pages in order to catch invalid flags
+ * combinations.
+ */
+ page_init_poison(memmap, sizeof(struct page) * nr_pages);
+
+ ms = __nr_to_section(section_nr);
+ set_section_nid(section_nr, nid);
+ section_mark_present(ms);
+
+ /* Align memmap to section boundary in the subsection case */
+ if (section_nr_to_pfn(section_nr) != start_pfn)
+ memmap = pfn_to_page(section_nr_to_pfn(section_nr));
+ sparse_init_one_section(ms, section_nr, memmap, ms->usage, 0);
+
+ return 0;
+}
+
+#ifdef CONFIG_MEMORY_FAILURE
+static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
+{
+ int i;
+
+ /*
+ * A further optimization is to have per section refcounted
+ * num_poisoned_pages. But that would need more space per memmap, so
+ * for now just do a quick global check to speed up this routine in the
+ * absence of bad pages.
+ */
+ if (atomic_long_read(&num_poisoned_pages) == 0)
+ return;
+
+ for (i = 0; i < nr_pages; i++) {
+ if (PageHWPoison(&memmap[i])) {
+ num_poisoned_pages_dec();
+ ClearPageHWPoison(&memmap[i]);
+ }
+ }
+}
+#else
+static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
+{
+}
+#endif
+
+void sparse_remove_section(struct mem_section *ms, unsigned long pfn,
+ unsigned long nr_pages, unsigned long map_offset,
+ struct vmem_altmap *altmap)
+{
+ clear_hwpoisoned_pages(pfn_to_page(pfn) + map_offset,
+ nr_pages - map_offset);
+ section_deactivate(pfn, nr_pages, altmap);
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/mm/swap.c b/mm/swap.c
new file mode 100644
index 000000000..47a47681c
--- /dev/null
+++ b/mm/swap.c
@@ -0,0 +1,1215 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/mm/swap.c
+ *
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ */
+
+/*
+ * This file contains the default values for the operation of the
+ * Linux VM subsystem. Fine-tuning documentation can be found in
+ * Documentation/admin-guide/sysctl/vm.rst.
+ * Started 18.12.91
+ * Swap aging added 23.2.95, Stephen Tweedie.
+ * Buffermem limits added 12.3.98, Rik van Riel.
+ */
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/kernel_stat.h>
+#include <linux/swap.h>
+#include <linux/mman.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/mm_inline.h>
+#include <linux/percpu_counter.h>
+#include <linux/memremap.h>
+#include <linux/percpu.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
+#include <linux/backing-dev.h>
+#include <linux/memcontrol.h>
+#include <linux/gfp.h>
+#include <linux/uio.h>
+#include <linux/hugetlb.h>
+#include <linux/page_idle.h>
+#include <linux/local_lock.h>
+
+#include "internal.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/pagemap.h>
+
+/* How many pages do we try to swap or page in/out together? */
+int page_cluster;
+
+/* Protecting only lru_rotate.pvec which requires disabling interrupts */
+struct lru_rotate {
+ local_lock_t lock;
+ struct pagevec pvec;
+};
+static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = {
+ .lock = INIT_LOCAL_LOCK(lock),
+};
+
+/*
+ * The following struct pagevec are grouped together because they are protected
+ * by disabling preemption (and interrupts remain enabled).
+ */
+struct lru_pvecs {
+ local_lock_t lock;
+ struct pagevec lru_add;
+ struct pagevec lru_deactivate_file;
+ struct pagevec lru_deactivate;
+ struct pagevec lru_lazyfree;
+#ifdef CONFIG_SMP
+ struct pagevec activate_page;
+#endif
+};
+static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = {
+ .lock = INIT_LOCAL_LOCK(lock),
+};
+
+/*
+ * This path almost never happens for VM activity - pages are normally
+ * freed via pagevecs. But it gets used by networking.
+ */
+static void __page_cache_release(struct page *page)
+{
+ if (PageLRU(page)) {
+ pg_data_t *pgdat = page_pgdat(page);
+ struct lruvec *lruvec;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pgdat->lru_lock, flags);
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ VM_BUG_ON_PAGE(!PageLRU(page), page);
+ __ClearPageLRU(page);
+ del_page_from_lru_list(page, lruvec, page_off_lru(page));
+ spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+ }
+ __ClearPageWaiters(page);
+}
+
+static void __put_single_page(struct page *page)
+{
+ __page_cache_release(page);
+ mem_cgroup_uncharge(page);
+ free_unref_page(page);
+}
+
+static void __put_compound_page(struct page *page)
+{
+ /*
+ * __page_cache_release() is supposed to be called for thp, not for
+ * hugetlb. This is because hugetlb page does never have PageLRU set
+ * (it's never listed to any LRU lists) and no memcg routines should
+ * be called for hugetlb (it has a separate hugetlb_cgroup.)
+ */
+ if (!PageHuge(page))
+ __page_cache_release(page);
+ destroy_compound_page(page);
+}
+
+void __put_page(struct page *page)
+{
+ if (is_zone_device_page(page)) {
+ put_dev_pagemap(page->pgmap);
+
+ /*
+ * The page belongs to the device that created pgmap. Do
+ * not return it to page allocator.
+ */
+ return;
+ }
+
+ if (unlikely(PageCompound(page)))
+ __put_compound_page(page);
+ else
+ __put_single_page(page);
+}
+EXPORT_SYMBOL(__put_page);
+
+/**
+ * put_pages_list() - release a list of pages
+ * @pages: list of pages threaded on page->lru
+ *
+ * Release a list of pages which are strung together on page.lru. Currently
+ * used by read_cache_pages() and related error recovery code.
+ */
+void put_pages_list(struct list_head *pages)
+{
+ while (!list_empty(pages)) {
+ struct page *victim;
+
+ victim = lru_to_page(pages);
+ list_del(&victim->lru);
+ put_page(victim);
+ }
+}
+EXPORT_SYMBOL(put_pages_list);
+
+/*
+ * get_kernel_pages() - pin kernel pages in memory
+ * @kiov: An array of struct kvec structures
+ * @nr_segs: number of segments to pin
+ * @write: pinning for read/write, currently ignored
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_segs long.
+ *
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If nr_pages is 0 or negative, returns 0. If no pages
+ * were pinned, returns -errno. Each page returned must be released
+ * with a put_page() call when it is finished with.
+ */
+int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
+ struct page **pages)
+{
+ int seg;
+
+ for (seg = 0; seg < nr_segs; seg++) {
+ if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
+ return seg;
+
+ pages[seg] = kmap_to_page(kiov[seg].iov_base);
+ get_page(pages[seg]);
+ }
+
+ return seg;
+}
+EXPORT_SYMBOL_GPL(get_kernel_pages);
+
+/*
+ * get_kernel_page() - pin a kernel page in memory
+ * @start: starting kernel address
+ * @write: pinning for read/write, currently ignored
+ * @pages: array that receives pointer to the page pinned.
+ * Must be at least nr_segs long.
+ *
+ * Returns 1 if page is pinned. If the page was not pinned, returns
+ * -errno. The page returned must be released with a put_page() call
+ * when it is finished with.
+ */
+int get_kernel_page(unsigned long start, int write, struct page **pages)
+{
+ const struct kvec kiov = {
+ .iov_base = (void *)start,
+ .iov_len = PAGE_SIZE
+ };
+
+ return get_kernel_pages(&kiov, 1, write, pages);
+}
+EXPORT_SYMBOL_GPL(get_kernel_page);
+
+static void pagevec_lru_move_fn(struct pagevec *pvec,
+ void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
+ void *arg)
+{
+ int i;
+ struct pglist_data *pgdat = NULL;
+ struct lruvec *lruvec;
+ unsigned long flags = 0;
+
+ for (i = 0; i < pagevec_count(pvec); i++) {
+ struct page *page = pvec->pages[i];
+ struct pglist_data *pagepgdat = page_pgdat(page);
+
+ if (pagepgdat != pgdat) {
+ if (pgdat)
+ spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+ pgdat = pagepgdat;
+ spin_lock_irqsave(&pgdat->lru_lock, flags);
+ }
+
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ (*move_fn)(page, lruvec, arg);
+ }
+ if (pgdat)
+ spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+ release_pages(pvec->pages, pvec->nr);
+ pagevec_reinit(pvec);
+}
+
+static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
+ void *arg)
+{
+ int *pgmoved = arg;
+
+ if (PageLRU(page) && !PageUnevictable(page)) {
+ del_page_from_lru_list(page, lruvec, page_lru(page));
+ ClearPageActive(page);
+ add_page_to_lru_list_tail(page, lruvec, page_lru(page));
+ (*pgmoved) += thp_nr_pages(page);
+ }
+}
+
+/*
+ * pagevec_move_tail() must be called with IRQ disabled.
+ * Otherwise this may cause nasty races.
+ */
+static void pagevec_move_tail(struct pagevec *pvec)
+{
+ int pgmoved = 0;
+
+ pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
+ __count_vm_events(PGROTATED, pgmoved);
+}
+
+/*
+ * Writeback is about to end against a page which has been marked for immediate
+ * reclaim. If it still appears to be reclaimable, move it to the tail of the
+ * inactive list.
+ */
+void rotate_reclaimable_page(struct page *page)
+{
+ if (!PageLocked(page) && !PageDirty(page) &&
+ !PageUnevictable(page) && PageLRU(page)) {
+ struct pagevec *pvec;
+ unsigned long flags;
+
+ get_page(page);
+ local_lock_irqsave(&lru_rotate.lock, flags);
+ pvec = this_cpu_ptr(&lru_rotate.pvec);
+ if (!pagevec_add(pvec, page) || PageCompound(page))
+ pagevec_move_tail(pvec);
+ local_unlock_irqrestore(&lru_rotate.lock, flags);
+ }
+}
+
+void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
+{
+ do {
+ unsigned long lrusize;
+
+ /* Record cost event */
+ if (file)
+ lruvec->file_cost += nr_pages;
+ else
+ lruvec->anon_cost += nr_pages;
+
+ /*
+ * Decay previous events
+ *
+ * Because workloads change over time (and to avoid
+ * overflow) we keep these statistics as a floating
+ * average, which ends up weighing recent refaults
+ * more than old ones.
+ */
+ lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) +
+ lruvec_page_state(lruvec, NR_ACTIVE_ANON) +
+ lruvec_page_state(lruvec, NR_INACTIVE_FILE) +
+ lruvec_page_state(lruvec, NR_ACTIVE_FILE);
+
+ if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) {
+ lruvec->file_cost /= 2;
+ lruvec->anon_cost /= 2;
+ }
+ } while ((lruvec = parent_lruvec(lruvec)));
+}
+
+void lru_note_cost_page(struct page *page)
+{
+ lru_note_cost(mem_cgroup_page_lruvec(page, page_pgdat(page)),
+ page_is_file_lru(page), thp_nr_pages(page));
+}
+
+static void __activate_page(struct page *page, struct lruvec *lruvec,
+ void *arg)
+{
+ if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
+ int lru = page_lru_base_type(page);
+ int nr_pages = thp_nr_pages(page);
+
+ del_page_from_lru_list(page, lruvec, lru);
+ SetPageActive(page);
+ lru += LRU_ACTIVE;
+ add_page_to_lru_list(page, lruvec, lru);
+ trace_mm_lru_activate(page);
+
+ __count_vm_events(PGACTIVATE, nr_pages);
+ __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE,
+ nr_pages);
+ }
+}
+
+#ifdef CONFIG_SMP
+static void activate_page_drain(int cpu)
+{
+ struct pagevec *pvec = &per_cpu(lru_pvecs.activate_page, cpu);
+
+ if (pagevec_count(pvec))
+ pagevec_lru_move_fn(pvec, __activate_page, NULL);
+}
+
+static bool need_activate_page_drain(int cpu)
+{
+ return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0;
+}
+
+static void activate_page(struct page *page)
+{
+ page = compound_head(page);
+ if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
+ struct pagevec *pvec;
+
+ local_lock(&lru_pvecs.lock);
+ pvec = this_cpu_ptr(&lru_pvecs.activate_page);
+ get_page(page);
+ if (!pagevec_add(pvec, page) || PageCompound(page))
+ pagevec_lru_move_fn(pvec, __activate_page, NULL);
+ local_unlock(&lru_pvecs.lock);
+ }
+}
+
+#else
+static inline void activate_page_drain(int cpu)
+{
+}
+
+static void activate_page(struct page *page)
+{
+ pg_data_t *pgdat = page_pgdat(page);
+
+ page = compound_head(page);
+ spin_lock_irq(&pgdat->lru_lock);
+ __activate_page(page, mem_cgroup_page_lruvec(page, pgdat), NULL);
+ spin_unlock_irq(&pgdat->lru_lock);
+}
+#endif
+
+static void __lru_cache_activate_page(struct page *page)
+{
+ struct pagevec *pvec;
+ int i;
+
+ local_lock(&lru_pvecs.lock);
+ pvec = this_cpu_ptr(&lru_pvecs.lru_add);
+
+ /*
+ * Search backwards on the optimistic assumption that the page being
+ * activated has just been added to this pagevec. Note that only
+ * the local pagevec is examined as a !PageLRU page could be in the
+ * process of being released, reclaimed, migrated or on a remote
+ * pagevec that is currently being drained. Furthermore, marking
+ * a remote pagevec's page PageActive potentially hits a race where
+ * a page is marked PageActive just after it is added to the inactive
+ * list causing accounting errors and BUG_ON checks to trigger.
+ */
+ for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
+ struct page *pagevec_page = pvec->pages[i];
+
+ if (pagevec_page == page) {
+ SetPageActive(page);
+ break;
+ }
+ }
+
+ local_unlock(&lru_pvecs.lock);
+}
+
+/*
+ * Mark a page as having seen activity.
+ *
+ * inactive,unreferenced -> inactive,referenced
+ * inactive,referenced -> active,unreferenced
+ * active,unreferenced -> active,referenced
+ *
+ * When a newly allocated page is not yet visible, so safe for non-atomic ops,
+ * __SetPageReferenced(page) may be substituted for mark_page_accessed(page).
+ */
+void mark_page_accessed(struct page *page)
+{
+ page = compound_head(page);
+
+ if (!PageReferenced(page)) {
+ SetPageReferenced(page);
+ } else if (PageUnevictable(page)) {
+ /*
+ * Unevictable pages are on the "LRU_UNEVICTABLE" list. But,
+ * this list is never rotated or maintained, so marking an
+ * evictable page accessed has no effect.
+ */
+ } else if (!PageActive(page)) {
+ /*
+ * If the page is on the LRU, queue it for activation via
+ * lru_pvecs.activate_page. Otherwise, assume the page is on a
+ * pagevec, mark it active and it'll be moved to the active
+ * LRU on the next drain.
+ */
+ if (PageLRU(page))
+ activate_page(page);
+ else
+ __lru_cache_activate_page(page);
+ ClearPageReferenced(page);
+ workingset_activation(page);
+ }
+ if (page_is_idle(page))
+ clear_page_idle(page);
+}
+EXPORT_SYMBOL(mark_page_accessed);
+
+/**
+ * lru_cache_add - add a page to a page list
+ * @page: the page to be added to the LRU.
+ *
+ * Queue the page for addition to the LRU via pagevec. The decision on whether
+ * to add the page to the [in]active [file|anon] list is deferred until the
+ * pagevec is drained. This gives a chance for the caller of lru_cache_add()
+ * have the page added to the active list using mark_page_accessed().
+ */
+void lru_cache_add(struct page *page)
+{
+ struct pagevec *pvec;
+
+ VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+
+ get_page(page);
+ local_lock(&lru_pvecs.lock);
+ pvec = this_cpu_ptr(&lru_pvecs.lru_add);
+ if (!pagevec_add(pvec, page) || PageCompound(page))
+ __pagevec_lru_add(pvec);
+ local_unlock(&lru_pvecs.lock);
+}
+EXPORT_SYMBOL(lru_cache_add);
+
+/**
+ * lru_cache_add_inactive_or_unevictable
+ * @page: the page to be added to LRU
+ * @vma: vma in which page is mapped for determining reclaimability
+ *
+ * Place @page on the inactive or unevictable LRU list, depending on its
+ * evictability.
+ */
+void lru_cache_add_inactive_or_unevictable(struct page *page,
+ struct vm_area_struct *vma)
+{
+ bool unevictable;
+
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+
+ unevictable = (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED;
+ if (unlikely(unevictable) && !TestSetPageMlocked(page)) {
+ int nr_pages = thp_nr_pages(page);
+ /*
+ * We use the irq-unsafe __mod_zone_page_stat because this
+ * counter is not modified from interrupt context, and the pte
+ * lock is held(spinlock), which implies preemption disabled.
+ */
+ __mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
+ count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
+ }
+ lru_cache_add(page);
+}
+
+/*
+ * If the page can not be invalidated, it is moved to the
+ * inactive list to speed up its reclaim. It is moved to the
+ * head of the list, rather than the tail, to give the flusher
+ * threads some time to write it out, as this is much more
+ * effective than the single-page writeout from reclaim.
+ *
+ * If the page isn't page_mapped and dirty/writeback, the page
+ * could reclaim asap using PG_reclaim.
+ *
+ * 1. active, mapped page -> none
+ * 2. active, dirty/writeback page -> inactive, head, PG_reclaim
+ * 3. inactive, mapped page -> none
+ * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim
+ * 5. inactive, clean -> inactive, tail
+ * 6. Others -> none
+ *
+ * In 4, why it moves inactive's head, the VM expects the page would
+ * be write it out by flusher threads as this is much more effective
+ * than the single-page writeout from reclaim.
+ */
+static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
+ void *arg)
+{
+ int lru;
+ bool active;
+ int nr_pages = thp_nr_pages(page);
+
+ if (!PageLRU(page))
+ return;
+
+ if (PageUnevictable(page))
+ return;
+
+ /* Some processes are using the page */
+ if (page_mapped(page))
+ return;
+
+ active = PageActive(page);
+ lru = page_lru_base_type(page);
+
+ del_page_from_lru_list(page, lruvec, lru + active);
+ ClearPageActive(page);
+ ClearPageReferenced(page);
+
+ if (PageWriteback(page) || PageDirty(page)) {
+ /*
+ * PG_reclaim could be raced with end_page_writeback
+ * It can make readahead confusing. But race window
+ * is _really_ small and it's non-critical problem.
+ */
+ add_page_to_lru_list(page, lruvec, lru);
+ SetPageReclaim(page);
+ } else {
+ /*
+ * The page's writeback ends up during pagevec
+ * We moves tha page into tail of inactive.
+ */
+ add_page_to_lru_list_tail(page, lruvec, lru);
+ __count_vm_events(PGROTATED, nr_pages);
+ }
+
+ if (active) {
+ __count_vm_events(PGDEACTIVATE, nr_pages);
+ __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
+ nr_pages);
+ }
+}
+
+static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
+ void *arg)
+{
+ if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+ int lru = page_lru_base_type(page);
+ int nr_pages = thp_nr_pages(page);
+
+ del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
+ ClearPageActive(page);
+ ClearPageReferenced(page);
+ add_page_to_lru_list(page, lruvec, lru);
+
+ __count_vm_events(PGDEACTIVATE, nr_pages);
+ __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
+ nr_pages);
+ }
+}
+
+static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
+ void *arg)
+{
+ if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
+ !PageSwapCache(page) && !PageUnevictable(page)) {
+ bool active = PageActive(page);
+ int nr_pages = thp_nr_pages(page);
+
+ del_page_from_lru_list(page, lruvec,
+ LRU_INACTIVE_ANON + active);
+ ClearPageActive(page);
+ ClearPageReferenced(page);
+ /*
+ * Lazyfree pages are clean anonymous pages. They have
+ * PG_swapbacked flag cleared, to distinguish them from normal
+ * anonymous pages
+ */
+ ClearPageSwapBacked(page);
+ add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE);
+
+ __count_vm_events(PGLAZYFREE, nr_pages);
+ __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE,
+ nr_pages);
+ }
+}
+
+/*
+ * Drain pages out of the cpu's pagevecs.
+ * Either "cpu" is the current CPU, and preemption has already been
+ * disabled; or "cpu" is being hot-unplugged, and is already dead.
+ */
+void lru_add_drain_cpu(int cpu)
+{
+ struct pagevec *pvec = &per_cpu(lru_pvecs.lru_add, cpu);
+
+ if (pagevec_count(pvec))
+ __pagevec_lru_add(pvec);
+
+ pvec = &per_cpu(lru_rotate.pvec, cpu);
+ /* Disabling interrupts below acts as a compiler barrier. */
+ if (data_race(pagevec_count(pvec))) {
+ unsigned long flags;
+
+ /* No harm done if a racing interrupt already did this */
+ local_lock_irqsave(&lru_rotate.lock, flags);
+ pagevec_move_tail(pvec);
+ local_unlock_irqrestore(&lru_rotate.lock, flags);
+ }
+
+ pvec = &per_cpu(lru_pvecs.lru_deactivate_file, cpu);
+ if (pagevec_count(pvec))
+ pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
+
+ pvec = &per_cpu(lru_pvecs.lru_deactivate, cpu);
+ if (pagevec_count(pvec))
+ pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+
+ pvec = &per_cpu(lru_pvecs.lru_lazyfree, cpu);
+ if (pagevec_count(pvec))
+ pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
+
+ activate_page_drain(cpu);
+}
+
+/**
+ * deactivate_file_page - forcefully deactivate a file page
+ * @page: page to deactivate
+ *
+ * This function hints the VM that @page is a good reclaim candidate,
+ * for example if its invalidation fails due to the page being dirty
+ * or under writeback.
+ */
+void deactivate_file_page(struct page *page)
+{
+ /*
+ * In a workload with many unevictable page such as mprotect,
+ * unevictable page deactivation for accelerating reclaim is pointless.
+ */
+ if (PageUnevictable(page))
+ return;
+
+ if (likely(get_page_unless_zero(page))) {
+ struct pagevec *pvec;
+
+ local_lock(&lru_pvecs.lock);
+ pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file);
+
+ if (!pagevec_add(pvec, page) || PageCompound(page))
+ pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
+ local_unlock(&lru_pvecs.lock);
+ }
+}
+
+/*
+ * deactivate_page - deactivate a page
+ * @page: page to deactivate
+ *
+ * deactivate_page() moves @page to the inactive list if @page was on the active
+ * list and was not an unevictable page. This is done to accelerate the reclaim
+ * of @page.
+ */
+void deactivate_page(struct page *page)
+{
+ if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+ struct pagevec *pvec;
+
+ local_lock(&lru_pvecs.lock);
+ pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate);
+ get_page(page);
+ if (!pagevec_add(pvec, page) || PageCompound(page))
+ pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+ local_unlock(&lru_pvecs.lock);
+ }
+}
+
+/**
+ * mark_page_lazyfree - make an anon page lazyfree
+ * @page: page to deactivate
+ *
+ * mark_page_lazyfree() moves @page to the inactive file list.
+ * This is done to accelerate the reclaim of @page.
+ */
+void mark_page_lazyfree(struct page *page)
+{
+ if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
+ !PageSwapCache(page) && !PageUnevictable(page)) {
+ struct pagevec *pvec;
+
+ local_lock(&lru_pvecs.lock);
+ pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree);
+ get_page(page);
+ if (!pagevec_add(pvec, page) || PageCompound(page))
+ pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
+ local_unlock(&lru_pvecs.lock);
+ }
+}
+
+void lru_add_drain(void)
+{
+ local_lock(&lru_pvecs.lock);
+ lru_add_drain_cpu(smp_processor_id());
+ local_unlock(&lru_pvecs.lock);
+}
+
+void lru_add_drain_cpu_zone(struct zone *zone)
+{
+ local_lock(&lru_pvecs.lock);
+ lru_add_drain_cpu(smp_processor_id());
+ drain_local_pages(zone);
+ local_unlock(&lru_pvecs.lock);
+}
+
+#ifdef CONFIG_SMP
+
+static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
+
+static void lru_add_drain_per_cpu(struct work_struct *dummy)
+{
+ lru_add_drain();
+}
+
+/*
+ * Doesn't need any cpu hotplug locking because we do rely on per-cpu
+ * kworkers being shut down before our page_alloc_cpu_dead callback is
+ * executed on the offlined cpu.
+ * Calling this function with cpu hotplug locks held can actually lead
+ * to obscure indirect dependencies via WQ context.
+ */
+void lru_add_drain_all(void)
+{
+ /*
+ * lru_drain_gen - Global pages generation number
+ *
+ * (A) Definition: global lru_drain_gen = x implies that all generations
+ * 0 < n <= x are already *scheduled* for draining.
+ *
+ * This is an optimization for the highly-contended use case where a
+ * user space workload keeps constantly generating a flow of pages for
+ * each CPU.
+ */
+ static unsigned int lru_drain_gen;
+ static struct cpumask has_work;
+ static DEFINE_MUTEX(lock);
+ unsigned cpu, this_gen;
+
+ /*
+ * Make sure nobody triggers this path before mm_percpu_wq is fully
+ * initialized.
+ */
+ if (WARN_ON(!mm_percpu_wq))
+ return;
+
+ /*
+ * Guarantee pagevec counter stores visible by this CPU are visible to
+ * other CPUs before loading the current drain generation.
+ */
+ smp_mb();
+
+ /*
+ * (B) Locally cache global LRU draining generation number
+ *
+ * The read barrier ensures that the counter is loaded before the mutex
+ * is taken. It pairs with smp_mb() inside the mutex critical section
+ * at (D).
+ */
+ this_gen = smp_load_acquire(&lru_drain_gen);
+
+ mutex_lock(&lock);
+
+ /*
+ * (C) Exit the draining operation if a newer generation, from another
+ * lru_add_drain_all(), was already scheduled for draining. Check (A).
+ */
+ if (unlikely(this_gen != lru_drain_gen))
+ goto done;
+
+ /*
+ * (D) Increment global generation number
+ *
+ * Pairs with smp_load_acquire() at (B), outside of the critical
+ * section. Use a full memory barrier to guarantee that the new global
+ * drain generation number is stored before loading pagevec counters.
+ *
+ * This pairing must be done here, before the for_each_online_cpu loop
+ * below which drains the page vectors.
+ *
+ * Let x, y, and z represent some system CPU numbers, where x < y < z.
+ * Assume CPU #z is is in the middle of the for_each_online_cpu loop
+ * below and has already reached CPU #y's per-cpu data. CPU #x comes
+ * along, adds some pages to its per-cpu vectors, then calls
+ * lru_add_drain_all().
+ *
+ * If the paired barrier is done at any later step, e.g. after the
+ * loop, CPU #x will just exit at (C) and miss flushing out all of its
+ * added pages.
+ */
+ WRITE_ONCE(lru_drain_gen, lru_drain_gen + 1);
+ smp_mb();
+
+ cpumask_clear(&has_work);
+ for_each_online_cpu(cpu) {
+ struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
+
+ if (pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) ||
+ data_race(pagevec_count(&per_cpu(lru_rotate.pvec, cpu))) ||
+ pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) ||
+ pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) ||
+ pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) ||
+ need_activate_page_drain(cpu)) {
+ INIT_WORK(work, lru_add_drain_per_cpu);
+ queue_work_on(cpu, mm_percpu_wq, work);
+ __cpumask_set_cpu(cpu, &has_work);
+ }
+ }
+
+ for_each_cpu(cpu, &has_work)
+ flush_work(&per_cpu(lru_add_drain_work, cpu));
+
+done:
+ mutex_unlock(&lock);
+}
+#else
+void lru_add_drain_all(void)
+{
+ lru_add_drain();
+}
+#endif /* CONFIG_SMP */
+
+/**
+ * release_pages - batched put_page()
+ * @pages: array of pages to release
+ * @nr: number of pages
+ *
+ * Decrement the reference count on all the pages in @pages. If it
+ * fell to zero, remove the page from the LRU and free it.
+ */
+void release_pages(struct page **pages, int nr)
+{
+ int i;
+ LIST_HEAD(pages_to_free);
+ struct pglist_data *locked_pgdat = NULL;
+ struct lruvec *lruvec;
+ unsigned long flags;
+ unsigned int lock_batch;
+
+ for (i = 0; i < nr; i++) {
+ struct page *page = pages[i];
+
+ /*
+ * Make sure the IRQ-safe lock-holding time does not get
+ * excessive with a continuous string of pages from the
+ * same pgdat. The lock is held only if pgdat != NULL.
+ */
+ if (locked_pgdat && ++lock_batch == SWAP_CLUSTER_MAX) {
+ spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
+ locked_pgdat = NULL;
+ }
+
+ page = compound_head(page);
+ if (is_huge_zero_page(page))
+ continue;
+
+ if (is_zone_device_page(page)) {
+ if (locked_pgdat) {
+ spin_unlock_irqrestore(&locked_pgdat->lru_lock,
+ flags);
+ locked_pgdat = NULL;
+ }
+ /*
+ * ZONE_DEVICE pages that return 'false' from
+ * page_is_devmap_managed() do not require special
+ * processing, and instead, expect a call to
+ * put_page_testzero().
+ */
+ if (page_is_devmap_managed(page)) {
+ put_devmap_managed_page(page);
+ continue;
+ }
+ }
+
+ if (!put_page_testzero(page))
+ continue;
+
+ if (PageCompound(page)) {
+ if (locked_pgdat) {
+ spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
+ locked_pgdat = NULL;
+ }
+ __put_compound_page(page);
+ continue;
+ }
+
+ if (PageLRU(page)) {
+ struct pglist_data *pgdat = page_pgdat(page);
+
+ if (pgdat != locked_pgdat) {
+ if (locked_pgdat)
+ spin_unlock_irqrestore(&locked_pgdat->lru_lock,
+ flags);
+ lock_batch = 0;
+ locked_pgdat = pgdat;
+ spin_lock_irqsave(&locked_pgdat->lru_lock, flags);
+ }
+
+ lruvec = mem_cgroup_page_lruvec(page, locked_pgdat);
+ VM_BUG_ON_PAGE(!PageLRU(page), page);
+ __ClearPageLRU(page);
+ del_page_from_lru_list(page, lruvec, page_off_lru(page));
+ }
+
+ __ClearPageWaiters(page);
+
+ list_add(&page->lru, &pages_to_free);
+ }
+ if (locked_pgdat)
+ spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
+
+ mem_cgroup_uncharge_list(&pages_to_free);
+ free_unref_page_list(&pages_to_free);
+}
+EXPORT_SYMBOL(release_pages);
+
+/*
+ * The pages which we're about to release may be in the deferred lru-addition
+ * queues. That would prevent them from really being freed right now. That's
+ * OK from a correctness point of view but is inefficient - those pages may be
+ * cache-warm and we want to give them back to the page allocator ASAP.
+ *
+ * So __pagevec_release() will drain those queues here. __pagevec_lru_add()
+ * and __pagevec_lru_add_active() call release_pages() directly to avoid
+ * mutual recursion.
+ */
+void __pagevec_release(struct pagevec *pvec)
+{
+ if (!pvec->percpu_pvec_drained) {
+ lru_add_drain();
+ pvec->percpu_pvec_drained = true;
+ }
+ release_pages(pvec->pages, pagevec_count(pvec));
+ pagevec_reinit(pvec);
+}
+EXPORT_SYMBOL(__pagevec_release);
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/* used by __split_huge_page_refcount() */
+void lru_add_page_tail(struct page *page, struct page *page_tail,
+ struct lruvec *lruvec, struct list_head *list)
+{
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+ VM_BUG_ON_PAGE(PageCompound(page_tail), page);
+ VM_BUG_ON_PAGE(PageLRU(page_tail), page);
+ lockdep_assert_held(&lruvec_pgdat(lruvec)->lru_lock);
+
+ if (!list)
+ SetPageLRU(page_tail);
+
+ if (likely(PageLRU(page)))
+ list_add_tail(&page_tail->lru, &page->lru);
+ else if (list) {
+ /* page reclaim is reclaiming a huge page */
+ get_page(page_tail);
+ list_add_tail(&page_tail->lru, list);
+ } else {
+ /*
+ * Head page has not yet been counted, as an hpage,
+ * so we must account for each subpage individually.
+ *
+ * Put page_tail on the list at the correct position
+ * so they all end up in order.
+ */
+ add_page_to_lru_list_tail(page_tail, lruvec,
+ page_lru(page_tail));
+ }
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
+ void *arg)
+{
+ enum lru_list lru;
+ int was_unevictable = TestClearPageUnevictable(page);
+ int nr_pages = thp_nr_pages(page);
+
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+
+ /*
+ * Page becomes evictable in two ways:
+ * 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()].
+ * 2) Before acquiring LRU lock to put the page to correct LRU and then
+ * a) do PageLRU check with lock [check_move_unevictable_pages]
+ * b) do PageLRU check before lock [clear_page_mlock]
+ *
+ * (1) & (2a) are ok as LRU lock will serialize them. For (2b), we need
+ * following strict ordering:
+ *
+ * #0: __pagevec_lru_add_fn #1: clear_page_mlock
+ *
+ * SetPageLRU() TestClearPageMlocked()
+ * smp_mb() // explicit ordering // above provides strict
+ * // ordering
+ * PageMlocked() PageLRU()
+ *
+ *
+ * if '#1' does not observe setting of PG_lru by '#0' and fails
+ * isolation, the explicit barrier will make sure that page_evictable
+ * check will put the page in correct LRU. Without smp_mb(), SetPageLRU
+ * can be reordered after PageMlocked check and can make '#1' to fail
+ * the isolation of the page whose Mlocked bit is cleared (#0 is also
+ * looking at the same page) and the evictable page will be stranded
+ * in an unevictable LRU.
+ */
+ SetPageLRU(page);
+ smp_mb__after_atomic();
+
+ if (page_evictable(page)) {
+ lru = page_lru(page);
+ if (was_unevictable)
+ __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
+ } else {
+ lru = LRU_UNEVICTABLE;
+ ClearPageActive(page);
+ SetPageUnevictable(page);
+ if (!was_unevictable)
+ __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
+ }
+
+ add_page_to_lru_list(page, lruvec, lru);
+ trace_mm_lru_insertion(page, lru);
+}
+
+/*
+ * Add the passed pages to the LRU, then drop the caller's refcount
+ * on them. Reinitialises the caller's pagevec.
+ */
+void __pagevec_lru_add(struct pagevec *pvec)
+{
+ pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
+}
+
+/**
+ * pagevec_lookup_entries - gang pagecache lookup
+ * @pvec: Where the resulting entries are placed
+ * @mapping: The address_space to search
+ * @start: The starting entry index
+ * @nr_entries: The maximum number of pages
+ * @indices: The cache indices corresponding to the entries in @pvec
+ *
+ * pagevec_lookup_entries() will search for and return a group of up
+ * to @nr_pages pages and shadow entries in the mapping. All
+ * entries are placed in @pvec. pagevec_lookup_entries() takes a
+ * reference against actual pages in @pvec.
+ *
+ * The search returns a group of mapping-contiguous entries with
+ * ascending indexes. There may be holes in the indices due to
+ * not-present entries.
+ *
+ * Only one subpage of a Transparent Huge Page is returned in one call:
+ * allowing truncate_inode_pages_range() to evict the whole THP without
+ * cycling through a pagevec of extra references.
+ *
+ * pagevec_lookup_entries() returns the number of entries which were
+ * found.
+ */
+unsigned pagevec_lookup_entries(struct pagevec *pvec,
+ struct address_space *mapping,
+ pgoff_t start, unsigned nr_entries,
+ pgoff_t *indices)
+{
+ pvec->nr = find_get_entries(mapping, start, nr_entries,
+ pvec->pages, indices);
+ return pagevec_count(pvec);
+}
+
+/**
+ * pagevec_remove_exceptionals - pagevec exceptionals pruning
+ * @pvec: The pagevec to prune
+ *
+ * pagevec_lookup_entries() fills both pages and exceptional radix
+ * tree entries into the pagevec. This function prunes all
+ * exceptionals from @pvec without leaving holes, so that it can be
+ * passed on to page-only pagevec operations.
+ */
+void pagevec_remove_exceptionals(struct pagevec *pvec)
+{
+ int i, j;
+
+ for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
+ struct page *page = pvec->pages[i];
+ if (!xa_is_value(page))
+ pvec->pages[j++] = page;
+ }
+ pvec->nr = j;
+}
+
+/**
+ * pagevec_lookup_range - gang pagecache lookup
+ * @pvec: Where the resulting pages are placed
+ * @mapping: The address_space to search
+ * @start: The starting page index
+ * @end: The final page index
+ *
+ * pagevec_lookup_range() will search for & return a group of up to PAGEVEC_SIZE
+ * pages in the mapping starting from index @start and upto index @end
+ * (inclusive). The pages are placed in @pvec. pagevec_lookup() takes a
+ * reference against the pages in @pvec.
+ *
+ * The search returns a group of mapping-contiguous pages with ascending
+ * indexes. There may be holes in the indices due to not-present pages. We
+ * also update @start to index the next page for the traversal.
+ *
+ * pagevec_lookup_range() returns the number of pages which were found. If this
+ * number is smaller than PAGEVEC_SIZE, the end of specified range has been
+ * reached.
+ */
+unsigned pagevec_lookup_range(struct pagevec *pvec,
+ struct address_space *mapping, pgoff_t *start, pgoff_t end)
+{
+ pvec->nr = find_get_pages_range(mapping, start, end, PAGEVEC_SIZE,
+ pvec->pages);
+ return pagevec_count(pvec);
+}
+EXPORT_SYMBOL(pagevec_lookup_range);
+
+unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
+ struct address_space *mapping, pgoff_t *index, pgoff_t end,
+ xa_mark_t tag)
+{
+ pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
+ PAGEVEC_SIZE, pvec->pages);
+ return pagevec_count(pvec);
+}
+EXPORT_SYMBOL(pagevec_lookup_range_tag);
+
+unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec,
+ struct address_space *mapping, pgoff_t *index, pgoff_t end,
+ xa_mark_t tag, unsigned max_pages)
+{
+ pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
+ min_t(unsigned int, max_pages, PAGEVEC_SIZE), pvec->pages);
+ return pagevec_count(pvec);
+}
+EXPORT_SYMBOL(pagevec_lookup_range_nr_tag);
+/*
+ * Perform any setup for the swap system
+ */
+void __init swap_setup(void)
+{
+ unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT);
+
+ /* Use a smaller cluster for small-memory machines */
+ if (megs < 16)
+ page_cluster = 2;
+ else
+ page_cluster = 3;
+ /*
+ * Right now other parts of the system means that we
+ * _really_ don't want to cluster much more
+ */
+}
+
+#ifdef CONFIG_DEV_PAGEMAP_OPS
+void put_devmap_managed_page(struct page *page)
+{
+ int count;
+
+ if (WARN_ON_ONCE(!page_is_devmap_managed(page)))
+ return;
+
+ count = page_ref_dec_return(page);
+
+ /*
+ * devmap page refcounts are 1-based, rather than 0-based: if
+ * refcount is 1, then the page is free and the refcount is
+ * stable because nobody holds a reference on the page.
+ */
+ if (count == 1)
+ free_devmap_managed_page(page);
+ else if (!count)
+ __put_page(page);
+}
+EXPORT_SYMBOL(put_devmap_managed_page);
+#endif
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
new file mode 100644
index 000000000..7f34343c0
--- /dev/null
+++ b/mm/swap_cgroup.c
@@ -0,0 +1,227 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/swap_cgroup.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+
+#include <linux/swapops.h> /* depends on mm.h include */
+
+static DEFINE_MUTEX(swap_cgroup_mutex);
+struct swap_cgroup_ctrl {
+ struct page **map;
+ unsigned long length;
+ spinlock_t lock;
+};
+
+static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
+
+struct swap_cgroup {
+ unsigned short id;
+};
+#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup))
+
+/*
+ * SwapCgroup implements "lookup" and "exchange" operations.
+ * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
+ * against SwapCache. At swap_free(), this is accessed directly from swap.
+ *
+ * This means,
+ * - we have no race in "exchange" when we're accessed via SwapCache because
+ * SwapCache(and its swp_entry) is under lock.
+ * - When called via swap_free(), there is no user of this entry and no race.
+ * Then, we don't need lock around "exchange".
+ *
+ * TODO: we can push these buffers out to HIGHMEM.
+ */
+
+/*
+ * allocate buffer for swap_cgroup.
+ */
+static int swap_cgroup_prepare(int type)
+{
+ struct page *page;
+ struct swap_cgroup_ctrl *ctrl;
+ unsigned long idx, max;
+
+ ctrl = &swap_cgroup_ctrl[type];
+
+ for (idx = 0; idx < ctrl->length; idx++) {
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!page)
+ goto not_enough_page;
+ ctrl->map[idx] = page;
+
+ if (!(idx % SWAP_CLUSTER_MAX))
+ cond_resched();
+ }
+ return 0;
+not_enough_page:
+ max = idx;
+ for (idx = 0; idx < max; idx++)
+ __free_page(ctrl->map[idx]);
+
+ return -ENOMEM;
+}
+
+static struct swap_cgroup *__lookup_swap_cgroup(struct swap_cgroup_ctrl *ctrl,
+ pgoff_t offset)
+{
+ struct page *mappage;
+ struct swap_cgroup *sc;
+
+ mappage = ctrl->map[offset / SC_PER_PAGE];
+ sc = page_address(mappage);
+ return sc + offset % SC_PER_PAGE;
+}
+
+static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
+ struct swap_cgroup_ctrl **ctrlp)
+{
+ pgoff_t offset = swp_offset(ent);
+ struct swap_cgroup_ctrl *ctrl;
+
+ ctrl = &swap_cgroup_ctrl[swp_type(ent)];
+ if (ctrlp)
+ *ctrlp = ctrl;
+ return __lookup_swap_cgroup(ctrl, offset);
+}
+
+/**
+ * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
+ * @ent: swap entry to be cmpxchged
+ * @old: old id
+ * @new: new id
+ *
+ * Returns old id at success, 0 at failure.
+ * (There is no mem_cgroup using 0 as its id)
+ */
+unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
+ unsigned short old, unsigned short new)
+{
+ struct swap_cgroup_ctrl *ctrl;
+ struct swap_cgroup *sc;
+ unsigned long flags;
+ unsigned short retval;
+
+ sc = lookup_swap_cgroup(ent, &ctrl);
+
+ spin_lock_irqsave(&ctrl->lock, flags);
+ retval = sc->id;
+ if (retval == old)
+ sc->id = new;
+ else
+ retval = 0;
+ spin_unlock_irqrestore(&ctrl->lock, flags);
+ return retval;
+}
+
+/**
+ * swap_cgroup_record - record mem_cgroup for a set of swap entries
+ * @ent: the first swap entry to be recorded into
+ * @id: mem_cgroup to be recorded
+ * @nr_ents: number of swap entries to be recorded
+ *
+ * Returns old value at success, 0 at failure.
+ * (Of course, old value can be 0.)
+ */
+unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id,
+ unsigned int nr_ents)
+{
+ struct swap_cgroup_ctrl *ctrl;
+ struct swap_cgroup *sc;
+ unsigned short old;
+ unsigned long flags;
+ pgoff_t offset = swp_offset(ent);
+ pgoff_t end = offset + nr_ents;
+
+ sc = lookup_swap_cgroup(ent, &ctrl);
+
+ spin_lock_irqsave(&ctrl->lock, flags);
+ old = sc->id;
+ for (;;) {
+ VM_BUG_ON(sc->id != old);
+ sc->id = id;
+ offset++;
+ if (offset == end)
+ break;
+ if (offset % SC_PER_PAGE)
+ sc++;
+ else
+ sc = __lookup_swap_cgroup(ctrl, offset);
+ }
+ spin_unlock_irqrestore(&ctrl->lock, flags);
+
+ return old;
+}
+
+/**
+ * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
+ * @ent: swap entry to be looked up.
+ *
+ * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
+ */
+unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
+{
+ return lookup_swap_cgroup(ent, NULL)->id;
+}
+
+int swap_cgroup_swapon(int type, unsigned long max_pages)
+{
+ void *array;
+ unsigned long array_size;
+ unsigned long length;
+ struct swap_cgroup_ctrl *ctrl;
+
+ length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
+ array_size = length * sizeof(void *);
+
+ array = vzalloc(array_size);
+ if (!array)
+ goto nomem;
+
+ ctrl = &swap_cgroup_ctrl[type];
+ mutex_lock(&swap_cgroup_mutex);
+ ctrl->length = length;
+ ctrl->map = array;
+ spin_lock_init(&ctrl->lock);
+ if (swap_cgroup_prepare(type)) {
+ /* memory shortage */
+ ctrl->map = NULL;
+ ctrl->length = 0;
+ mutex_unlock(&swap_cgroup_mutex);
+ vfree(array);
+ goto nomem;
+ }
+ mutex_unlock(&swap_cgroup_mutex);
+
+ return 0;
+nomem:
+ pr_info("couldn't allocate enough memory for swap_cgroup\n");
+ pr_info("swap_cgroup can be disabled by swapaccount=0 boot option\n");
+ return -ENOMEM;
+}
+
+void swap_cgroup_swapoff(int type)
+{
+ struct page **map;
+ unsigned long i, length;
+ struct swap_cgroup_ctrl *ctrl;
+
+ mutex_lock(&swap_cgroup_mutex);
+ ctrl = &swap_cgroup_ctrl[type];
+ map = ctrl->map;
+ length = ctrl->length;
+ ctrl->map = NULL;
+ ctrl->length = 0;
+ mutex_unlock(&swap_cgroup_mutex);
+
+ if (map) {
+ for (i = 0; i < length; i++) {
+ struct page *page = map[i];
+ if (page)
+ __free_page(page);
+ if (!(i % SWAP_CLUSTER_MAX))
+ cond_resched();
+ }
+ vfree(map);
+ }
+}
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
new file mode 100644
index 000000000..0357fbe70
--- /dev/null
+++ b/mm/swap_slots.c
@@ -0,0 +1,354 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Manage cache of swap slots to be used for and returned from
+ * swap.
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * We allocate the swap slots from the global pool and put
+ * it into local per cpu caches. This has the advantage
+ * of no needing to acquire the swap_info lock every time
+ * we need a new slot.
+ *
+ * There is also opportunity to simply return the slot
+ * to local caches without needing to acquire swap_info
+ * lock. We do not reuse the returned slots directly but
+ * move them back to the global pool in a batch. This
+ * allows the slots to coaellesce and reduce fragmentation.
+ *
+ * The swap entry allocated is marked with SWAP_HAS_CACHE
+ * flag in map_count that prevents it from being allocated
+ * again from the global pool.
+ *
+ * The swap slots cache is protected by a mutex instead of
+ * a spin lock as when we search for slots with scan_swap_map,
+ * we can possibly sleep.
+ */
+
+#include <linux/swap_slots.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/vmalloc.h>
+#include <linux/mutex.h>
+#include <linux/mm.h>
+
+static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots);
+static bool swap_slot_cache_active;
+bool swap_slot_cache_enabled;
+static bool swap_slot_cache_initialized;
+static DEFINE_MUTEX(swap_slots_cache_mutex);
+/* Serialize swap slots cache enable/disable operations */
+static DEFINE_MUTEX(swap_slots_cache_enable_mutex);
+
+static void __drain_swap_slots_cache(unsigned int type);
+static void deactivate_swap_slots_cache(void);
+static void reactivate_swap_slots_cache(void);
+
+#define use_swap_slot_cache (swap_slot_cache_active && swap_slot_cache_enabled)
+#define SLOTS_CACHE 0x1
+#define SLOTS_CACHE_RET 0x2
+
+static void deactivate_swap_slots_cache(void)
+{
+ mutex_lock(&swap_slots_cache_mutex);
+ swap_slot_cache_active = false;
+ __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET);
+ mutex_unlock(&swap_slots_cache_mutex);
+}
+
+static void reactivate_swap_slots_cache(void)
+{
+ mutex_lock(&swap_slots_cache_mutex);
+ swap_slot_cache_active = true;
+ mutex_unlock(&swap_slots_cache_mutex);
+}
+
+/* Must not be called with cpu hot plug lock */
+void disable_swap_slots_cache_lock(void)
+{
+ mutex_lock(&swap_slots_cache_enable_mutex);
+ swap_slot_cache_enabled = false;
+ if (swap_slot_cache_initialized) {
+ /* serialize with cpu hotplug operations */
+ get_online_cpus();
+ __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET);
+ put_online_cpus();
+ }
+}
+
+static void __reenable_swap_slots_cache(void)
+{
+ swap_slot_cache_enabled = has_usable_swap();
+}
+
+void reenable_swap_slots_cache_unlock(void)
+{
+ __reenable_swap_slots_cache();
+ mutex_unlock(&swap_slots_cache_enable_mutex);
+}
+
+static bool check_cache_active(void)
+{
+ long pages;
+
+ if (!swap_slot_cache_enabled)
+ return false;
+
+ pages = get_nr_swap_pages();
+ if (!swap_slot_cache_active) {
+ if (pages > num_online_cpus() *
+ THRESHOLD_ACTIVATE_SWAP_SLOTS_CACHE)
+ reactivate_swap_slots_cache();
+ goto out;
+ }
+
+ /* if global pool of slot caches too low, deactivate cache */
+ if (pages < num_online_cpus() * THRESHOLD_DEACTIVATE_SWAP_SLOTS_CACHE)
+ deactivate_swap_slots_cache();
+out:
+ return swap_slot_cache_active;
+}
+
+static int alloc_swap_slot_cache(unsigned int cpu)
+{
+ struct swap_slots_cache *cache;
+ swp_entry_t *slots, *slots_ret;
+
+ /*
+ * Do allocation outside swap_slots_cache_mutex
+ * as kvzalloc could trigger reclaim and get_swap_page,
+ * which can lock swap_slots_cache_mutex.
+ */
+ slots = kvcalloc(SWAP_SLOTS_CACHE_SIZE, sizeof(swp_entry_t),
+ GFP_KERNEL);
+ if (!slots)
+ return -ENOMEM;
+
+ slots_ret = kvcalloc(SWAP_SLOTS_CACHE_SIZE, sizeof(swp_entry_t),
+ GFP_KERNEL);
+ if (!slots_ret) {
+ kvfree(slots);
+ return -ENOMEM;
+ }
+
+ mutex_lock(&swap_slots_cache_mutex);
+ cache = &per_cpu(swp_slots, cpu);
+ if (cache->slots || cache->slots_ret) {
+ /* cache already allocated */
+ mutex_unlock(&swap_slots_cache_mutex);
+
+ kvfree(slots);
+ kvfree(slots_ret);
+
+ return 0;
+ }
+
+ if (!cache->lock_initialized) {
+ mutex_init(&cache->alloc_lock);
+ spin_lock_init(&cache->free_lock);
+ cache->lock_initialized = true;
+ }
+ cache->nr = 0;
+ cache->cur = 0;
+ cache->n_ret = 0;
+ /*
+ * We initialized alloc_lock and free_lock earlier. We use
+ * !cache->slots or !cache->slots_ret to know if it is safe to acquire
+ * the corresponding lock and use the cache. Memory barrier below
+ * ensures the assumption.
+ */
+ mb();
+ cache->slots = slots;
+ cache->slots_ret = slots_ret;
+ mutex_unlock(&swap_slots_cache_mutex);
+ return 0;
+}
+
+static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type,
+ bool free_slots)
+{
+ struct swap_slots_cache *cache;
+ swp_entry_t *slots = NULL;
+
+ cache = &per_cpu(swp_slots, cpu);
+ if ((type & SLOTS_CACHE) && cache->slots) {
+ mutex_lock(&cache->alloc_lock);
+ swapcache_free_entries(cache->slots + cache->cur, cache->nr);
+ cache->cur = 0;
+ cache->nr = 0;
+ if (free_slots && cache->slots) {
+ kvfree(cache->slots);
+ cache->slots = NULL;
+ }
+ mutex_unlock(&cache->alloc_lock);
+ }
+ if ((type & SLOTS_CACHE_RET) && cache->slots_ret) {
+ spin_lock_irq(&cache->free_lock);
+ swapcache_free_entries(cache->slots_ret, cache->n_ret);
+ cache->n_ret = 0;
+ if (free_slots && cache->slots_ret) {
+ slots = cache->slots_ret;
+ cache->slots_ret = NULL;
+ }
+ spin_unlock_irq(&cache->free_lock);
+ if (slots)
+ kvfree(slots);
+ }
+}
+
+static void __drain_swap_slots_cache(unsigned int type)
+{
+ unsigned int cpu;
+
+ /*
+ * This function is called during
+ * 1) swapoff, when we have to make sure no
+ * left over slots are in cache when we remove
+ * a swap device;
+ * 2) disabling of swap slot cache, when we run low
+ * on swap slots when allocating memory and need
+ * to return swap slots to global pool.
+ *
+ * We cannot acquire cpu hot plug lock here as
+ * this function can be invoked in the cpu
+ * hot plug path:
+ * cpu_up -> lock cpu_hotplug -> cpu hotplug state callback
+ * -> memory allocation -> direct reclaim -> get_swap_page
+ * -> drain_swap_slots_cache
+ *
+ * Hence the loop over current online cpu below could miss cpu that
+ * is being brought online but not yet marked as online.
+ * That is okay as we do not schedule and run anything on a
+ * cpu before it has been marked online. Hence, we will not
+ * fill any swap slots in slots cache of such cpu.
+ * There are no slots on such cpu that need to be drained.
+ */
+ for_each_online_cpu(cpu)
+ drain_slots_cache_cpu(cpu, type, false);
+}
+
+static int free_slot_cache(unsigned int cpu)
+{
+ mutex_lock(&swap_slots_cache_mutex);
+ drain_slots_cache_cpu(cpu, SLOTS_CACHE | SLOTS_CACHE_RET, true);
+ mutex_unlock(&swap_slots_cache_mutex);
+ return 0;
+}
+
+void enable_swap_slots_cache(void)
+{
+ mutex_lock(&swap_slots_cache_enable_mutex);
+ if (!swap_slot_cache_initialized) {
+ int ret;
+
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache",
+ alloc_swap_slot_cache, free_slot_cache);
+ if (WARN_ONCE(ret < 0, "Cache allocation failed (%s), operating "
+ "without swap slots cache.\n", __func__))
+ goto out_unlock;
+
+ swap_slot_cache_initialized = true;
+ }
+
+ __reenable_swap_slots_cache();
+out_unlock:
+ mutex_unlock(&swap_slots_cache_enable_mutex);
+}
+
+/* called with swap slot cache's alloc lock held */
+static int refill_swap_slots_cache(struct swap_slots_cache *cache)
+{
+ if (!use_swap_slot_cache || cache->nr)
+ return 0;
+
+ cache->cur = 0;
+ if (swap_slot_cache_active)
+ cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE,
+ cache->slots, 1);
+
+ return cache->nr;
+}
+
+int free_swap_slot(swp_entry_t entry)
+{
+ struct swap_slots_cache *cache;
+
+ cache = raw_cpu_ptr(&swp_slots);
+ if (likely(use_swap_slot_cache && cache->slots_ret)) {
+ spin_lock_irq(&cache->free_lock);
+ /* Swap slots cache may be deactivated before acquiring lock */
+ if (!use_swap_slot_cache || !cache->slots_ret) {
+ spin_unlock_irq(&cache->free_lock);
+ goto direct_free;
+ }
+ if (cache->n_ret >= SWAP_SLOTS_CACHE_SIZE) {
+ /*
+ * Return slots to global pool.
+ * The current swap_map value is SWAP_HAS_CACHE.
+ * Set it to 0 to indicate it is available for
+ * allocation in global pool
+ */
+ swapcache_free_entries(cache->slots_ret, cache->n_ret);
+ cache->n_ret = 0;
+ }
+ cache->slots_ret[cache->n_ret++] = entry;
+ spin_unlock_irq(&cache->free_lock);
+ } else {
+direct_free:
+ swapcache_free_entries(&entry, 1);
+ }
+
+ return 0;
+}
+
+swp_entry_t get_swap_page(struct page *page)
+{
+ swp_entry_t entry;
+ struct swap_slots_cache *cache;
+
+ entry.val = 0;
+
+ if (PageTransHuge(page)) {
+ if (IS_ENABLED(CONFIG_THP_SWAP))
+ get_swap_pages(1, &entry, HPAGE_PMD_NR);
+ goto out;
+ }
+
+ /*
+ * Preemption is allowed here, because we may sleep
+ * in refill_swap_slots_cache(). But it is safe, because
+ * accesses to the per-CPU data structure are protected by the
+ * mutex cache->alloc_lock.
+ *
+ * The alloc path here does not touch cache->slots_ret
+ * so cache->free_lock is not taken.
+ */
+ cache = raw_cpu_ptr(&swp_slots);
+
+ if (likely(check_cache_active() && cache->slots)) {
+ mutex_lock(&cache->alloc_lock);
+ if (cache->slots) {
+repeat:
+ if (cache->nr) {
+ entry = cache->slots[cache->cur];
+ cache->slots[cache->cur++].val = 0;
+ cache->nr--;
+ } else if (refill_swap_slots_cache(cache)) {
+ goto repeat;
+ }
+ }
+ mutex_unlock(&cache->alloc_lock);
+ if (entry.val)
+ goto out;
+ }
+
+ get_swap_pages(1, &entry, 1);
+out:
+ if (mem_cgroup_try_charge_swap(page, entry)) {
+ put_swap_page(page, entry);
+ entry.val = 0;
+ }
+ return entry;
+}
diff --git a/mm/swap_state.c b/mm/swap_state.c
new file mode 100644
index 000000000..5c5cb2d67
--- /dev/null
+++ b/mm/swap_state.c
@@ -0,0 +1,953 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/mm/swap_state.c
+ *
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ * Swap reorganised 29.12.95, Stephen Tweedie
+ *
+ * Rewritten to use page cache, (C) 1998 Stephen Tweedie
+ */
+#include <linux/mm.h>
+#include <linux/gfp.h>
+#include <linux/kernel_stat.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/init.h>
+#include <linux/pagemap.h>
+#include <linux/backing-dev.h>
+#include <linux/blkdev.h>
+#include <linux/pagevec.h>
+#include <linux/migrate.h>
+#include <linux/vmalloc.h>
+#include <linux/swap_slots.h>
+#include <linux/huge_mm.h>
+#include <linux/shmem_fs.h>
+#include "internal.h"
+
+/*
+ * swapper_space is a fiction, retained to simplify the path through
+ * vmscan's shrink_page_list.
+ */
+static const struct address_space_operations swap_aops = {
+ .writepage = swap_writepage,
+ .set_page_dirty = swap_set_page_dirty,
+#ifdef CONFIG_MIGRATION
+ .migratepage = migrate_page,
+#endif
+};
+
+struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly;
+static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly;
+static bool enable_vma_readahead __read_mostly = true;
+
+#define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2)
+#define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1)
+#define SWAP_RA_HITS_MAX SWAP_RA_HITS_MASK
+#define SWAP_RA_WIN_MASK (~PAGE_MASK & ~SWAP_RA_HITS_MASK)
+
+#define SWAP_RA_HITS(v) ((v) & SWAP_RA_HITS_MASK)
+#define SWAP_RA_WIN(v) (((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
+#define SWAP_RA_ADDR(v) ((v) & PAGE_MASK)
+
+#define SWAP_RA_VAL(addr, win, hits) \
+ (((addr) & PAGE_MASK) | \
+ (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) | \
+ ((hits) & SWAP_RA_HITS_MASK))
+
+/* Initial readahead hits is 4 to start up with a small window */
+#define GET_SWAP_RA_VAL(vma) \
+ (atomic_long_read(&(vma)->swap_readahead_info) ? : 4)
+
+#define INC_CACHE_INFO(x) data_race(swap_cache_info.x++)
+#define ADD_CACHE_INFO(x, nr) data_race(swap_cache_info.x += (nr))
+
+static struct {
+ unsigned long add_total;
+ unsigned long del_total;
+ unsigned long find_success;
+ unsigned long find_total;
+} swap_cache_info;
+
+unsigned long total_swapcache_pages(void)
+{
+ unsigned int i, j, nr;
+ unsigned long ret = 0;
+ struct address_space *spaces;
+ struct swap_info_struct *si;
+
+ for (i = 0; i < MAX_SWAPFILES; i++) {
+ swp_entry_t entry = swp_entry(i, 1);
+
+ /* Avoid get_swap_device() to warn for bad swap entry */
+ if (!swp_swap_info(entry))
+ continue;
+ /* Prevent swapoff to free swapper_spaces */
+ si = get_swap_device(entry);
+ if (!si)
+ continue;
+ nr = nr_swapper_spaces[i];
+ spaces = swapper_spaces[i];
+ for (j = 0; j < nr; j++)
+ ret += spaces[j].nrpages;
+ put_swap_device(si);
+ }
+ return ret;
+}
+
+static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);
+
+void show_swap_cache_info(void)
+{
+ printk("%lu pages in swap cache\n", total_swapcache_pages());
+ printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
+ swap_cache_info.add_total, swap_cache_info.del_total,
+ swap_cache_info.find_success, swap_cache_info.find_total);
+ printk("Free swap = %ldkB\n",
+ get_nr_swap_pages() << (PAGE_SHIFT - 10));
+ printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
+}
+
+void *get_shadow_from_swap_cache(swp_entry_t entry)
+{
+ struct address_space *address_space = swap_address_space(entry);
+ pgoff_t idx = swp_offset(entry);
+ struct page *page;
+
+ page = find_get_entry(address_space, idx);
+ if (xa_is_value(page))
+ return page;
+ if (page)
+ put_page(page);
+ return NULL;
+}
+
+/*
+ * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
+ * but sets SwapCache flag and private instead of mapping and index.
+ */
+int add_to_swap_cache(struct page *page, swp_entry_t entry,
+ gfp_t gfp, void **shadowp)
+{
+ struct address_space *address_space = swap_address_space(entry);
+ pgoff_t idx = swp_offset(entry);
+ XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page));
+ unsigned long i, nr = thp_nr_pages(page);
+ void *old;
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(PageSwapCache(page), page);
+ VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
+
+ page_ref_add(page, nr);
+ SetPageSwapCache(page);
+
+ do {
+ unsigned long nr_shadows = 0;
+
+ xas_lock_irq(&xas);
+ xas_create_range(&xas);
+ if (xas_error(&xas))
+ goto unlock;
+ for (i = 0; i < nr; i++) {
+ VM_BUG_ON_PAGE(xas.xa_index != idx + i, page);
+ old = xas_load(&xas);
+ if (xa_is_value(old)) {
+ nr_shadows++;
+ if (shadowp)
+ *shadowp = old;
+ }
+ set_page_private(page + i, entry.val + i);
+ xas_store(&xas, page);
+ xas_next(&xas);
+ }
+ address_space->nrexceptional -= nr_shadows;
+ address_space->nrpages += nr;
+ __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
+ ADD_CACHE_INFO(add_total, nr);
+unlock:
+ xas_unlock_irq(&xas);
+ } while (xas_nomem(&xas, gfp));
+
+ if (!xas_error(&xas))
+ return 0;
+
+ ClearPageSwapCache(page);
+ page_ref_sub(page, nr);
+ return xas_error(&xas);
+}
+
+/*
+ * This must be called only on pages that have
+ * been verified to be in the swap cache.
+ */
+void __delete_from_swap_cache(struct page *page,
+ swp_entry_t entry, void *shadow)
+{
+ struct address_space *address_space = swap_address_space(entry);
+ int i, nr = thp_nr_pages(page);
+ pgoff_t idx = swp_offset(entry);
+ XA_STATE(xas, &address_space->i_pages, idx);
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageSwapCache(page), page);
+ VM_BUG_ON_PAGE(PageWriteback(page), page);
+
+ for (i = 0; i < nr; i++) {
+ void *entry = xas_store(&xas, shadow);
+ VM_BUG_ON_PAGE(entry != page, entry);
+ set_page_private(page + i, 0);
+ xas_next(&xas);
+ }
+ ClearPageSwapCache(page);
+ if (shadow)
+ address_space->nrexceptional += nr;
+ address_space->nrpages -= nr;
+ __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
+ ADD_CACHE_INFO(del_total, nr);
+}
+
+/**
+ * add_to_swap - allocate swap space for a page
+ * @page: page we want to move to swap
+ *
+ * Allocate swap space for the page and add the page to the
+ * swap cache. Caller needs to hold the page lock.
+ */
+int add_to_swap(struct page *page)
+{
+ swp_entry_t entry;
+ int err;
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageUptodate(page), page);
+
+ entry = get_swap_page(page);
+ if (!entry.val)
+ return 0;
+
+ /*
+ * XArray node allocations from PF_MEMALLOC contexts could
+ * completely exhaust the page allocator. __GFP_NOMEMALLOC
+ * stops emergency reserves from being allocated.
+ *
+ * TODO: this could cause a theoretical memory reclaim
+ * deadlock in the swap out path.
+ */
+ /*
+ * Add it to the swap cache.
+ */
+ err = add_to_swap_cache(page, entry,
+ __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL);
+ if (err)
+ /*
+ * add_to_swap_cache() doesn't return -EEXIST, so we can safely
+ * clear SWAP_HAS_CACHE flag.
+ */
+ goto fail;
+ /*
+ * Normally the page will be dirtied in unmap because its pte should be
+ * dirty. A special case is MADV_FREE page. The page's pte could have
+ * dirty bit cleared but the page's SwapBacked bit is still set because
+ * clearing the dirty bit and SwapBacked bit has no lock protected. For
+ * such page, unmap will not set dirty bit for it, so page reclaim will
+ * not write the page out. This can cause data corruption when the page
+ * is swap in later. Always setting the dirty bit for the page solves
+ * the problem.
+ */
+ set_page_dirty(page);
+
+ return 1;
+
+fail:
+ put_swap_page(page, entry);
+ return 0;
+}
+
+/*
+ * This must be called only on pages that have
+ * been verified to be in the swap cache and locked.
+ * It will never put the page into the free list,
+ * the caller has a reference on the page.
+ */
+void delete_from_swap_cache(struct page *page)
+{
+ swp_entry_t entry = { .val = page_private(page) };
+ struct address_space *address_space = swap_address_space(entry);
+
+ xa_lock_irq(&address_space->i_pages);
+ __delete_from_swap_cache(page, entry, NULL);
+ xa_unlock_irq(&address_space->i_pages);
+
+ put_swap_page(page, entry);
+ page_ref_sub(page, thp_nr_pages(page));
+}
+
+void clear_shadow_from_swap_cache(int type, unsigned long begin,
+ unsigned long end)
+{
+ unsigned long curr = begin;
+ void *old;
+
+ for (;;) {
+ unsigned long nr_shadows = 0;
+ swp_entry_t entry = swp_entry(type, curr);
+ struct address_space *address_space = swap_address_space(entry);
+ XA_STATE(xas, &address_space->i_pages, curr);
+
+ xa_lock_irq(&address_space->i_pages);
+ xas_for_each(&xas, old, end) {
+ if (!xa_is_value(old))
+ continue;
+ xas_store(&xas, NULL);
+ nr_shadows++;
+ }
+ address_space->nrexceptional -= nr_shadows;
+ xa_unlock_irq(&address_space->i_pages);
+
+ /* search the next swapcache until we meet end */
+ curr >>= SWAP_ADDRESS_SPACE_SHIFT;
+ curr++;
+ curr <<= SWAP_ADDRESS_SPACE_SHIFT;
+ if (curr > end)
+ break;
+ }
+}
+
+/*
+ * If we are the only user, then try to free up the swap cache.
+ *
+ * Its ok to check for PageSwapCache without the page lock
+ * here because we are going to recheck again inside
+ * try_to_free_swap() _with_ the lock.
+ * - Marcelo
+ */
+static inline void free_swap_cache(struct page *page)
+{
+ if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) {
+ try_to_free_swap(page);
+ unlock_page(page);
+ }
+}
+
+/*
+ * Perform a free_page(), also freeing any swap cache associated with
+ * this page if it is the last user of the page.
+ */
+void free_page_and_swap_cache(struct page *page)
+{
+ free_swap_cache(page);
+ if (!is_huge_zero_page(page))
+ put_page(page);
+}
+
+/*
+ * Passed an array of pages, drop them all from swapcache and then release
+ * them. They are removed from the LRU and freed if this is their last use.
+ */
+void free_pages_and_swap_cache(struct page **pages, int nr)
+{
+ struct page **pagep = pages;
+ int i;
+
+ lru_add_drain();
+ for (i = 0; i < nr; i++)
+ free_swap_cache(pagep[i]);
+ release_pages(pagep, nr);
+}
+
+static inline bool swap_use_vma_readahead(void)
+{
+ return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
+}
+
+/*
+ * Lookup a swap entry in the swap cache. A found page will be returned
+ * unlocked and with its refcount incremented - we rely on the kernel
+ * lock getting page table operations atomic even if we drop the page
+ * lock before returning.
+ */
+struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ struct page *page;
+ struct swap_info_struct *si;
+
+ si = get_swap_device(entry);
+ if (!si)
+ return NULL;
+ page = find_get_page(swap_address_space(entry), swp_offset(entry));
+ put_swap_device(si);
+
+ INC_CACHE_INFO(find_total);
+ if (page) {
+ bool vma_ra = swap_use_vma_readahead();
+ bool readahead;
+
+ INC_CACHE_INFO(find_success);
+ /*
+ * At the moment, we don't support PG_readahead for anon THP
+ * so let's bail out rather than confusing the readahead stat.
+ */
+ if (unlikely(PageTransCompound(page)))
+ return page;
+
+ readahead = TestClearPageReadahead(page);
+ if (vma && vma_ra) {
+ unsigned long ra_val;
+ int win, hits;
+
+ ra_val = GET_SWAP_RA_VAL(vma);
+ win = SWAP_RA_WIN(ra_val);
+ hits = SWAP_RA_HITS(ra_val);
+ if (readahead)
+ hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
+ atomic_long_set(&vma->swap_readahead_info,
+ SWAP_RA_VAL(addr, win, hits));
+ }
+
+ if (readahead) {
+ count_vm_event(SWAP_RA_HIT);
+ if (!vma || !vma_ra)
+ atomic_inc(&swapin_readahead_hits);
+ }
+ }
+
+ return page;
+}
+
+/**
+ * find_get_incore_page - Find and get a page from the page or swap caches.
+ * @mapping: The address_space to search.
+ * @index: The page cache index.
+ *
+ * This differs from find_get_page() in that it will also look for the
+ * page in the swap cache.
+ *
+ * Return: The found page or %NULL.
+ */
+struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index)
+{
+ swp_entry_t swp;
+ struct swap_info_struct *si;
+ struct page *page = find_get_entry(mapping, index);
+
+ if (!page)
+ return page;
+ if (!xa_is_value(page))
+ return find_subpage(page, index);
+ if (!shmem_mapping(mapping))
+ return NULL;
+
+ swp = radix_to_swp_entry(page);
+ /* Prevent swapoff from happening to us */
+ si = get_swap_device(swp);
+ if (!si)
+ return NULL;
+ page = find_get_page(swap_address_space(swp), swp_offset(swp));
+ put_swap_device(si);
+ return page;
+}
+
+struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
+ struct vm_area_struct *vma, unsigned long addr,
+ bool *new_page_allocated)
+{
+ struct swap_info_struct *si;
+ struct page *page;
+ void *shadow = NULL;
+
+ *new_page_allocated = false;
+
+ for (;;) {
+ int err;
+ /*
+ * First check the swap cache. Since this is normally
+ * called after lookup_swap_cache() failed, re-calling
+ * that would confuse statistics.
+ */
+ si = get_swap_device(entry);
+ if (!si)
+ return NULL;
+ page = find_get_page(swap_address_space(entry),
+ swp_offset(entry));
+ put_swap_device(si);
+ if (page)
+ return page;
+
+ /*
+ * Just skip read ahead for unused swap slot.
+ * During swap_off when swap_slot_cache is disabled,
+ * we have to handle the race between putting
+ * swap entry in swap cache and marking swap slot
+ * as SWAP_HAS_CACHE. That's done in later part of code or
+ * else swap_off will be aborted if we return NULL.
+ */
+ if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
+ return NULL;
+
+ /*
+ * Get a new page to read into from swap. Allocate it now,
+ * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will
+ * cause any racers to loop around until we add it to cache.
+ */
+ page = alloc_page_vma(gfp_mask, vma, addr);
+ if (!page)
+ return NULL;
+
+ /*
+ * Swap entry may have been freed since our caller observed it.
+ */
+ err = swapcache_prepare(entry);
+ if (!err)
+ break;
+
+ put_page(page);
+ if (err != -EEXIST)
+ return NULL;
+
+ /*
+ * We might race against __delete_from_swap_cache(), and
+ * stumble across a swap_map entry whose SWAP_HAS_CACHE
+ * has not yet been cleared. Or race against another
+ * __read_swap_cache_async(), which has set SWAP_HAS_CACHE
+ * in swap_map, but not yet added its page to swap cache.
+ */
+ schedule_timeout_uninterruptible(1);
+ }
+
+ /*
+ * The swap entry is ours to swap in. Prepare the new page.
+ */
+
+ __SetPageLocked(page);
+ __SetPageSwapBacked(page);
+
+ /* May fail (-ENOMEM) if XArray node allocation failed. */
+ if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) {
+ put_swap_page(page, entry);
+ goto fail_unlock;
+ }
+
+ if (mem_cgroup_charge(page, NULL, gfp_mask)) {
+ delete_from_swap_cache(page);
+ goto fail_unlock;
+ }
+
+ if (shadow)
+ workingset_refault(page, shadow);
+
+ /* Caller will initiate read into locked page */
+ SetPageWorkingset(page);
+ lru_cache_add(page);
+ *new_page_allocated = true;
+ return page;
+
+fail_unlock:
+ unlock_page(page);
+ put_page(page);
+ return NULL;
+}
+
+/*
+ * Locate a page of swap in physical memory, reserving swap cache space
+ * and reading the disk if it is not already cached.
+ * A failure return means that either the page allocation failed or that
+ * the swap entry is no longer in use.
+ */
+struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
+ struct vm_area_struct *vma, unsigned long addr, bool do_poll)
+{
+ bool page_was_allocated;
+ struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
+ vma, addr, &page_was_allocated);
+
+ if (page_was_allocated)
+ swap_readpage(retpage, do_poll);
+
+ return retpage;
+}
+
+static unsigned int __swapin_nr_pages(unsigned long prev_offset,
+ unsigned long offset,
+ int hits,
+ int max_pages,
+ int prev_win)
+{
+ unsigned int pages, last_ra;
+
+ /*
+ * This heuristic has been found to work well on both sequential and
+ * random loads, swapping to hard disk or to SSD: please don't ask
+ * what the "+ 2" means, it just happens to work well, that's all.
+ */
+ pages = hits + 2;
+ if (pages == 2) {
+ /*
+ * We can have no readahead hits to judge by: but must not get
+ * stuck here forever, so check for an adjacent offset instead
+ * (and don't even bother to check whether swap type is same).
+ */
+ if (offset != prev_offset + 1 && offset != prev_offset - 1)
+ pages = 1;
+ } else {
+ unsigned int roundup = 4;
+ while (roundup < pages)
+ roundup <<= 1;
+ pages = roundup;
+ }
+
+ if (pages > max_pages)
+ pages = max_pages;
+
+ /* Don't shrink readahead too fast */
+ last_ra = prev_win / 2;
+ if (pages < last_ra)
+ pages = last_ra;
+
+ return pages;
+}
+
+static unsigned long swapin_nr_pages(unsigned long offset)
+{
+ static unsigned long prev_offset;
+ unsigned int hits, pages, max_pages;
+ static atomic_t last_readahead_pages;
+
+ max_pages = 1 << READ_ONCE(page_cluster);
+ if (max_pages <= 1)
+ return 1;
+
+ hits = atomic_xchg(&swapin_readahead_hits, 0);
+ pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits,
+ max_pages,
+ atomic_read(&last_readahead_pages));
+ if (!hits)
+ WRITE_ONCE(prev_offset, offset);
+ atomic_set(&last_readahead_pages, pages);
+
+ return pages;
+}
+
+/**
+ * swap_cluster_readahead - swap in pages in hope we need them soon
+ * @entry: swap entry of this memory
+ * @gfp_mask: memory allocation flags
+ * @vmf: fault information
+ *
+ * Returns the struct page for entry and addr, after queueing swapin.
+ *
+ * Primitive swap readahead code. We simply read an aligned block of
+ * (1 << page_cluster) entries in the swap area. This method is chosen
+ * because it doesn't cost us any seek time. We also make sure to queue
+ * the 'original' request together with the readahead ones...
+ *
+ * This has been extended to use the NUMA policies from the mm triggering
+ * the readahead.
+ *
+ * Caller must hold read mmap_lock if vmf->vma is not NULL.
+ */
+struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
+ struct vm_fault *vmf)
+{
+ struct page *page;
+ unsigned long entry_offset = swp_offset(entry);
+ unsigned long offset = entry_offset;
+ unsigned long start_offset, end_offset;
+ unsigned long mask;
+ struct swap_info_struct *si = swp_swap_info(entry);
+ struct blk_plug plug;
+ bool do_poll = true, page_allocated;
+ struct vm_area_struct *vma = vmf->vma;
+ unsigned long addr = vmf->address;
+
+ mask = swapin_nr_pages(offset) - 1;
+ if (!mask)
+ goto skip;
+
+ /* Test swap type to make sure the dereference is safe */
+ if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) {
+ struct inode *inode = si->swap_file->f_mapping->host;
+ if (inode_read_congested(inode))
+ goto skip;
+ }
+
+ do_poll = false;
+ /* Read a page_cluster sized and aligned cluster around offset. */
+ start_offset = offset & ~mask;
+ end_offset = offset | mask;
+ if (!start_offset) /* First page is swap header. */
+ start_offset++;
+ if (end_offset >= si->max)
+ end_offset = si->max - 1;
+
+ blk_start_plug(&plug);
+ for (offset = start_offset; offset <= end_offset ; offset++) {
+ /* Ok, do the async read-ahead now */
+ page = __read_swap_cache_async(
+ swp_entry(swp_type(entry), offset),
+ gfp_mask, vma, addr, &page_allocated);
+ if (!page)
+ continue;
+ if (page_allocated) {
+ swap_readpage(page, false);
+ if (offset != entry_offset) {
+ SetPageReadahead(page);
+ count_vm_event(SWAP_RA);
+ }
+ }
+ put_page(page);
+ }
+ blk_finish_plug(&plug);
+
+ lru_add_drain(); /* Push any new pages onto the LRU now */
+skip:
+ return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
+}
+
+int init_swap_address_space(unsigned int type, unsigned long nr_pages)
+{
+ struct address_space *spaces, *space;
+ unsigned int i, nr;
+
+ nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
+ spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL);
+ if (!spaces)
+ return -ENOMEM;
+ for (i = 0; i < nr; i++) {
+ space = spaces + i;
+ xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ);
+ atomic_set(&space->i_mmap_writable, 0);
+ space->a_ops = &swap_aops;
+ /* swap cache doesn't use writeback related tags */
+ mapping_set_no_writeback_tags(space);
+ }
+ nr_swapper_spaces[type] = nr;
+ swapper_spaces[type] = spaces;
+
+ return 0;
+}
+
+void exit_swap_address_space(unsigned int type)
+{
+ kvfree(swapper_spaces[type]);
+ nr_swapper_spaces[type] = 0;
+ swapper_spaces[type] = NULL;
+}
+
+static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,
+ unsigned long faddr,
+ unsigned long lpfn,
+ unsigned long rpfn,
+ unsigned long *start,
+ unsigned long *end)
+{
+ *start = max3(lpfn, PFN_DOWN(vma->vm_start),
+ PFN_DOWN(faddr & PMD_MASK));
+ *end = min3(rpfn, PFN_DOWN(vma->vm_end),
+ PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
+}
+
+static void swap_ra_info(struct vm_fault *vmf,
+ struct vma_swap_readahead *ra_info)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ unsigned long ra_val;
+ swp_entry_t entry;
+ unsigned long faddr, pfn, fpfn;
+ unsigned long start, end;
+ pte_t *pte, *orig_pte;
+ unsigned int max_win, hits, prev_win, win, left;
+#ifndef CONFIG_64BIT
+ pte_t *tpte;
+#endif
+
+ max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster),
+ SWAP_RA_ORDER_CEILING);
+ if (max_win == 1) {
+ ra_info->win = 1;
+ return;
+ }
+
+ faddr = vmf->address;
+ orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
+ entry = pte_to_swp_entry(*pte);
+ if ((unlikely(non_swap_entry(entry)))) {
+ pte_unmap(orig_pte);
+ return;
+ }
+
+ fpfn = PFN_DOWN(faddr);
+ ra_val = GET_SWAP_RA_VAL(vma);
+ pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val));
+ prev_win = SWAP_RA_WIN(ra_val);
+ hits = SWAP_RA_HITS(ra_val);
+ ra_info->win = win = __swapin_nr_pages(pfn, fpfn, hits,
+ max_win, prev_win);
+ atomic_long_set(&vma->swap_readahead_info,
+ SWAP_RA_VAL(faddr, win, 0));
+
+ if (win == 1) {
+ pte_unmap(orig_pte);
+ return;
+ }
+
+ /* Copy the PTEs because the page table may be unmapped */
+ if (fpfn == pfn + 1)
+ swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end);
+ else if (pfn == fpfn + 1)
+ swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1,
+ &start, &end);
+ else {
+ left = (win - 1) / 2;
+ swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left,
+ &start, &end);
+ }
+ ra_info->nr_pte = end - start;
+ ra_info->offset = fpfn - start;
+ pte -= ra_info->offset;
+#ifdef CONFIG_64BIT
+ ra_info->ptes = pte;
+#else
+ tpte = ra_info->ptes;
+ for (pfn = start; pfn != end; pfn++)
+ *tpte++ = *pte++;
+#endif
+ pte_unmap(orig_pte);
+}
+
+/**
+ * swap_vma_readahead - swap in pages in hope we need them soon
+ * @fentry: swap entry of this memory
+ * @gfp_mask: memory allocation flags
+ * @vmf: fault information
+ *
+ * Returns the struct page for entry and addr, after queueing swapin.
+ *
+ * Primitive swap readahead code. We simply read in a few pages whoes
+ * virtual addresses are around the fault address in the same vma.
+ *
+ * Caller must hold read mmap_lock if vmf->vma is not NULL.
+ *
+ */
+static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
+ struct vm_fault *vmf)
+{
+ struct blk_plug plug;
+ struct vm_area_struct *vma = vmf->vma;
+ struct page *page;
+ pte_t *pte, pentry;
+ swp_entry_t entry;
+ unsigned int i;
+ bool page_allocated;
+ struct vma_swap_readahead ra_info = {0,};
+
+ swap_ra_info(vmf, &ra_info);
+ if (ra_info.win == 1)
+ goto skip;
+
+ blk_start_plug(&plug);
+ for (i = 0, pte = ra_info.ptes; i < ra_info.nr_pte;
+ i++, pte++) {
+ pentry = *pte;
+ if (pte_none(pentry))
+ continue;
+ if (pte_present(pentry))
+ continue;
+ entry = pte_to_swp_entry(pentry);
+ if (unlikely(non_swap_entry(entry)))
+ continue;
+ page = __read_swap_cache_async(entry, gfp_mask, vma,
+ vmf->address, &page_allocated);
+ if (!page)
+ continue;
+ if (page_allocated) {
+ swap_readpage(page, false);
+ if (i != ra_info.offset) {
+ SetPageReadahead(page);
+ count_vm_event(SWAP_RA);
+ }
+ }
+ put_page(page);
+ }
+ blk_finish_plug(&plug);
+ lru_add_drain();
+skip:
+ return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
+ ra_info.win == 1);
+}
+
+/**
+ * swapin_readahead - swap in pages in hope we need them soon
+ * @entry: swap entry of this memory
+ * @gfp_mask: memory allocation flags
+ * @vmf: fault information
+ *
+ * Returns the struct page for entry and addr, after queueing swapin.
+ *
+ * It's a main entry function for swap readahead. By the configuration,
+ * it will read ahead blocks by cluster-based(ie, physical disk based)
+ * or vma-based(ie, virtual address based on faulty address) readahead.
+ */
+struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
+ struct vm_fault *vmf)
+{
+ return swap_use_vma_readahead() ?
+ swap_vma_readahead(entry, gfp_mask, vmf) :
+ swap_cluster_readahead(entry, gfp_mask, vmf);
+}
+
+#ifdef CONFIG_SYSFS
+static ssize_t vma_ra_enabled_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%s\n", enable_vma_readahead ? "true" : "false");
+}
+static ssize_t vma_ra_enabled_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
+ enable_vma_readahead = true;
+ else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
+ enable_vma_readahead = false;
+ else
+ return -EINVAL;
+
+ return count;
+}
+static struct kobj_attribute vma_ra_enabled_attr =
+ __ATTR(vma_ra_enabled, 0644, vma_ra_enabled_show,
+ vma_ra_enabled_store);
+
+static struct attribute *swap_attrs[] = {
+ &vma_ra_enabled_attr.attr,
+ NULL,
+};
+
+static struct attribute_group swap_attr_group = {
+ .attrs = swap_attrs,
+};
+
+static int __init swap_init_sysfs(void)
+{
+ int err;
+ struct kobject *swap_kobj;
+
+ swap_kobj = kobject_create_and_add("swap", mm_kobj);
+ if (!swap_kobj) {
+ pr_err("failed to create swap kobject\n");
+ return -ENOMEM;
+ }
+ err = sysfs_create_group(swap_kobj, &swap_attr_group);
+ if (err) {
+ pr_err("failed to register swap group\n");
+ goto delete_obj;
+ }
+ return 0;
+
+delete_obj:
+ kobject_put(swap_kobj);
+ return err;
+}
+subsys_initcall(swap_init_sysfs);
+#endif
diff --git a/mm/swapfile.c b/mm/swapfile.c
new file mode 100644
index 000000000..86ade667a
--- /dev/null
+++ b/mm/swapfile.c
@@ -0,0 +1,3862 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/mm/swapfile.c
+ *
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ * Swap reorganised 29.12.95, Stephen Tweedie
+ */
+
+#include <linux/mm.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/task.h>
+#include <linux/hugetlb.h>
+#include <linux/mman.h>
+#include <linux/slab.h>
+#include <linux/kernel_stat.h>
+#include <linux/swap.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/namei.h>
+#include <linux/shmem_fs.h>
+#include <linux/blkdev.h>
+#include <linux/random.h>
+#include <linux/writeback.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/ksm.h>
+#include <linux/rmap.h>
+#include <linux/security.h>
+#include <linux/backing-dev.h>
+#include <linux/mutex.h>
+#include <linux/capability.h>
+#include <linux/syscalls.h>
+#include <linux/memcontrol.h>
+#include <linux/poll.h>
+#include <linux/oom.h>
+#include <linux/frontswap.h>
+#include <linux/swapfile.h>
+#include <linux/export.h>
+#include <linux/swap_slots.h>
+#include <linux/sort.h>
+
+#include <asm/tlbflush.h>
+#include <linux/swapops.h>
+#include <linux/swap_cgroup.h>
+
+static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
+ unsigned char);
+static void free_swap_count_continuations(struct swap_info_struct *);
+static sector_t map_swap_entry(swp_entry_t, struct block_device**);
+
+DEFINE_SPINLOCK(swap_lock);
+static unsigned int nr_swapfiles;
+atomic_long_t nr_swap_pages;
+/*
+ * Some modules use swappable objects and may try to swap them out under
+ * memory pressure (via the shrinker). Before doing so, they may wish to
+ * check to see if any swap space is available.
+ */
+EXPORT_SYMBOL_GPL(nr_swap_pages);
+/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
+long total_swap_pages;
+static int least_priority = -1;
+
+static const char Bad_file[] = "Bad swap file entry ";
+static const char Unused_file[] = "Unused swap file entry ";
+static const char Bad_offset[] = "Bad swap offset entry ";
+static const char Unused_offset[] = "Unused swap offset entry ";
+
+/*
+ * all active swap_info_structs
+ * protected with swap_lock, and ordered by priority.
+ */
+PLIST_HEAD(swap_active_head);
+
+/*
+ * all available (active, not full) swap_info_structs
+ * protected with swap_avail_lock, ordered by priority.
+ * This is used by get_swap_page() instead of swap_active_head
+ * because swap_active_head includes all swap_info_structs,
+ * but get_swap_page() doesn't need to look at full ones.
+ * This uses its own lock instead of swap_lock because when a
+ * swap_info_struct changes between not-full/full, it needs to
+ * add/remove itself to/from this list, but the swap_info_struct->lock
+ * is held and the locking order requires swap_lock to be taken
+ * before any swap_info_struct->lock.
+ */
+static struct plist_head *swap_avail_heads;
+static DEFINE_SPINLOCK(swap_avail_lock);
+
+struct swap_info_struct *swap_info[MAX_SWAPFILES];
+
+static DEFINE_MUTEX(swapon_mutex);
+
+static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
+/* Activity counter to indicate that a swapon or swapoff has occurred */
+static atomic_t proc_poll_event = ATOMIC_INIT(0);
+
+atomic_t nr_rotate_swap = ATOMIC_INIT(0);
+
+static struct swap_info_struct *swap_type_to_swap_info(int type)
+{
+ if (type >= READ_ONCE(nr_swapfiles))
+ return NULL;
+
+ smp_rmb(); /* Pairs with smp_wmb in alloc_swap_info. */
+ return READ_ONCE(swap_info[type]);
+}
+
+static inline unsigned char swap_count(unsigned char ent)
+{
+ return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */
+}
+
+/* Reclaim the swap entry anyway if possible */
+#define TTRS_ANYWAY 0x1
+/*
+ * Reclaim the swap entry if there are no more mappings of the
+ * corresponding page
+ */
+#define TTRS_UNMAPPED 0x2
+/* Reclaim the swap entry if swap is getting full*/
+#define TTRS_FULL 0x4
+
+/* returns 1 if swap entry is freed */
+static int __try_to_reclaim_swap(struct swap_info_struct *si,
+ unsigned long offset, unsigned long flags)
+{
+ swp_entry_t entry = swp_entry(si->type, offset);
+ struct page *page;
+ int ret = 0;
+
+ page = find_get_page(swap_address_space(entry), offset);
+ if (!page)
+ return 0;
+ /*
+ * When this function is called from scan_swap_map_slots() and it's
+ * called by vmscan.c at reclaiming pages. So, we hold a lock on a page,
+ * here. We have to use trylock for avoiding deadlock. This is a special
+ * case and you should use try_to_free_swap() with explicit lock_page()
+ * in usual operations.
+ */
+ if (trylock_page(page)) {
+ if ((flags & TTRS_ANYWAY) ||
+ ((flags & TTRS_UNMAPPED) && !page_mapped(page)) ||
+ ((flags & TTRS_FULL) && mem_cgroup_swap_full(page)))
+ ret = try_to_free_swap(page);
+ unlock_page(page);
+ }
+ put_page(page);
+ return ret;
+}
+
+static inline struct swap_extent *first_se(struct swap_info_struct *sis)
+{
+ struct rb_node *rb = rb_first(&sis->swap_extent_root);
+ return rb_entry(rb, struct swap_extent, rb_node);
+}
+
+static inline struct swap_extent *next_se(struct swap_extent *se)
+{
+ struct rb_node *rb = rb_next(&se->rb_node);
+ return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL;
+}
+
+/*
+ * swapon tell device that all the old swap contents can be discarded,
+ * to allow the swap device to optimize its wear-levelling.
+ */
+static int discard_swap(struct swap_info_struct *si)
+{
+ struct swap_extent *se;
+ sector_t start_block;
+ sector_t nr_blocks;
+ int err = 0;
+
+ /* Do not discard the swap header page! */
+ se = first_se(si);
+ start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
+ nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
+ if (nr_blocks) {
+ err = blkdev_issue_discard(si->bdev, start_block,
+ nr_blocks, GFP_KERNEL, 0);
+ if (err)
+ return err;
+ cond_resched();
+ }
+
+ for (se = next_se(se); se; se = next_se(se)) {
+ start_block = se->start_block << (PAGE_SHIFT - 9);
+ nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
+
+ err = blkdev_issue_discard(si->bdev, start_block,
+ nr_blocks, GFP_KERNEL, 0);
+ if (err)
+ break;
+
+ cond_resched();
+ }
+ return err; /* That will often be -EOPNOTSUPP */
+}
+
+static struct swap_extent *
+offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
+{
+ struct swap_extent *se;
+ struct rb_node *rb;
+
+ rb = sis->swap_extent_root.rb_node;
+ while (rb) {
+ se = rb_entry(rb, struct swap_extent, rb_node);
+ if (offset < se->start_page)
+ rb = rb->rb_left;
+ else if (offset >= se->start_page + se->nr_pages)
+ rb = rb->rb_right;
+ else
+ return se;
+ }
+ /* It *must* be present */
+ BUG();
+}
+
+sector_t swap_page_sector(struct page *page)
+{
+ struct swap_info_struct *sis = page_swap_info(page);
+ struct swap_extent *se;
+ sector_t sector;
+ pgoff_t offset;
+
+ offset = __page_file_index(page);
+ se = offset_to_swap_extent(sis, offset);
+ sector = se->start_block + (offset - se->start_page);
+ return sector << (PAGE_SHIFT - 9);
+}
+
+/*
+ * swap allocation tell device that a cluster of swap can now be discarded,
+ * to allow the swap device to optimize its wear-levelling.
+ */
+static void discard_swap_cluster(struct swap_info_struct *si,
+ pgoff_t start_page, pgoff_t nr_pages)
+{
+ struct swap_extent *se = offset_to_swap_extent(si, start_page);
+
+ while (nr_pages) {
+ pgoff_t offset = start_page - se->start_page;
+ sector_t start_block = se->start_block + offset;
+ sector_t nr_blocks = se->nr_pages - offset;
+
+ if (nr_blocks > nr_pages)
+ nr_blocks = nr_pages;
+ start_page += nr_blocks;
+ nr_pages -= nr_blocks;
+
+ start_block <<= PAGE_SHIFT - 9;
+ nr_blocks <<= PAGE_SHIFT - 9;
+ if (blkdev_issue_discard(si->bdev, start_block,
+ nr_blocks, GFP_NOIO, 0))
+ break;
+
+ se = next_se(se);
+ }
+}
+
+#ifdef CONFIG_THP_SWAP
+#define SWAPFILE_CLUSTER HPAGE_PMD_NR
+
+#define swap_entry_size(size) (size)
+#else
+#define SWAPFILE_CLUSTER 256
+
+/*
+ * Define swap_entry_size() as constant to let compiler to optimize
+ * out some code if !CONFIG_THP_SWAP
+ */
+#define swap_entry_size(size) 1
+#endif
+#define LATENCY_LIMIT 256
+
+static inline void cluster_set_flag(struct swap_cluster_info *info,
+ unsigned int flag)
+{
+ info->flags = flag;
+}
+
+static inline unsigned int cluster_count(struct swap_cluster_info *info)
+{
+ return info->data;
+}
+
+static inline void cluster_set_count(struct swap_cluster_info *info,
+ unsigned int c)
+{
+ info->data = c;
+}
+
+static inline void cluster_set_count_flag(struct swap_cluster_info *info,
+ unsigned int c, unsigned int f)
+{
+ info->flags = f;
+ info->data = c;
+}
+
+static inline unsigned int cluster_next(struct swap_cluster_info *info)
+{
+ return info->data;
+}
+
+static inline void cluster_set_next(struct swap_cluster_info *info,
+ unsigned int n)
+{
+ info->data = n;
+}
+
+static inline void cluster_set_next_flag(struct swap_cluster_info *info,
+ unsigned int n, unsigned int f)
+{
+ info->flags = f;
+ info->data = n;
+}
+
+static inline bool cluster_is_free(struct swap_cluster_info *info)
+{
+ return info->flags & CLUSTER_FLAG_FREE;
+}
+
+static inline bool cluster_is_null(struct swap_cluster_info *info)
+{
+ return info->flags & CLUSTER_FLAG_NEXT_NULL;
+}
+
+static inline void cluster_set_null(struct swap_cluster_info *info)
+{
+ info->flags = CLUSTER_FLAG_NEXT_NULL;
+ info->data = 0;
+}
+
+static inline bool cluster_is_huge(struct swap_cluster_info *info)
+{
+ if (IS_ENABLED(CONFIG_THP_SWAP))
+ return info->flags & CLUSTER_FLAG_HUGE;
+ return false;
+}
+
+static inline void cluster_clear_huge(struct swap_cluster_info *info)
+{
+ info->flags &= ~CLUSTER_FLAG_HUGE;
+}
+
+static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
+ unsigned long offset)
+{
+ struct swap_cluster_info *ci;
+
+ ci = si->cluster_info;
+ if (ci) {
+ ci += offset / SWAPFILE_CLUSTER;
+ spin_lock(&ci->lock);
+ }
+ return ci;
+}
+
+static inline void unlock_cluster(struct swap_cluster_info *ci)
+{
+ if (ci)
+ spin_unlock(&ci->lock);
+}
+
+/*
+ * Determine the locking method in use for this device. Return
+ * swap_cluster_info if SSD-style cluster-based locking is in place.
+ */
+static inline struct swap_cluster_info *lock_cluster_or_swap_info(
+ struct swap_info_struct *si, unsigned long offset)
+{
+ struct swap_cluster_info *ci;
+
+ /* Try to use fine-grained SSD-style locking if available: */
+ ci = lock_cluster(si, offset);
+ /* Otherwise, fall back to traditional, coarse locking: */
+ if (!ci)
+ spin_lock(&si->lock);
+
+ return ci;
+}
+
+static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
+ struct swap_cluster_info *ci)
+{
+ if (ci)
+ unlock_cluster(ci);
+ else
+ spin_unlock(&si->lock);
+}
+
+static inline bool cluster_list_empty(struct swap_cluster_list *list)
+{
+ return cluster_is_null(&list->head);
+}
+
+static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
+{
+ return cluster_next(&list->head);
+}
+
+static void cluster_list_init(struct swap_cluster_list *list)
+{
+ cluster_set_null(&list->head);
+ cluster_set_null(&list->tail);
+}
+
+static void cluster_list_add_tail(struct swap_cluster_list *list,
+ struct swap_cluster_info *ci,
+ unsigned int idx)
+{
+ if (cluster_list_empty(list)) {
+ cluster_set_next_flag(&list->head, idx, 0);
+ cluster_set_next_flag(&list->tail, idx, 0);
+ } else {
+ struct swap_cluster_info *ci_tail;
+ unsigned int tail = cluster_next(&list->tail);
+
+ /*
+ * Nested cluster lock, but both cluster locks are
+ * only acquired when we held swap_info_struct->lock
+ */
+ ci_tail = ci + tail;
+ spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
+ cluster_set_next(ci_tail, idx);
+ spin_unlock(&ci_tail->lock);
+ cluster_set_next_flag(&list->tail, idx, 0);
+ }
+}
+
+static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
+ struct swap_cluster_info *ci)
+{
+ unsigned int idx;
+
+ idx = cluster_next(&list->head);
+ if (cluster_next(&list->tail) == idx) {
+ cluster_set_null(&list->head);
+ cluster_set_null(&list->tail);
+ } else
+ cluster_set_next_flag(&list->head,
+ cluster_next(&ci[idx]), 0);
+
+ return idx;
+}
+
+/* Add a cluster to discard list and schedule it to do discard */
+static void swap_cluster_schedule_discard(struct swap_info_struct *si,
+ unsigned int idx)
+{
+ /*
+ * If scan_swap_map() can't find a free cluster, it will check
+ * si->swap_map directly. To make sure the discarding cluster isn't
+ * taken by scan_swap_map(), mark the swap entries bad (occupied). It
+ * will be cleared after discard
+ */
+ memset(si->swap_map + idx * SWAPFILE_CLUSTER,
+ SWAP_MAP_BAD, SWAPFILE_CLUSTER);
+
+ cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);
+
+ schedule_work(&si->discard_work);
+}
+
+static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
+{
+ struct swap_cluster_info *ci = si->cluster_info;
+
+ cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE);
+ cluster_list_add_tail(&si->free_clusters, ci, idx);
+}
+
+/*
+ * Doing discard actually. After a cluster discard is finished, the cluster
+ * will be added to free cluster list. caller should hold si->lock.
+*/
+static void swap_do_scheduled_discard(struct swap_info_struct *si)
+{
+ struct swap_cluster_info *info, *ci;
+ unsigned int idx;
+
+ info = si->cluster_info;
+
+ while (!cluster_list_empty(&si->discard_clusters)) {
+ idx = cluster_list_del_first(&si->discard_clusters, info);
+ spin_unlock(&si->lock);
+
+ discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
+ SWAPFILE_CLUSTER);
+
+ spin_lock(&si->lock);
+ ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
+ __free_cluster(si, idx);
+ memset(si->swap_map + idx * SWAPFILE_CLUSTER,
+ 0, SWAPFILE_CLUSTER);
+ unlock_cluster(ci);
+ }
+}
+
+static void swap_discard_work(struct work_struct *work)
+{
+ struct swap_info_struct *si;
+
+ si = container_of(work, struct swap_info_struct, discard_work);
+
+ spin_lock(&si->lock);
+ swap_do_scheduled_discard(si);
+ spin_unlock(&si->lock);
+}
+
+static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
+{
+ struct swap_cluster_info *ci = si->cluster_info;
+
+ VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
+ cluster_list_del_first(&si->free_clusters, ci);
+ cluster_set_count_flag(ci + idx, 0, 0);
+}
+
+static void free_cluster(struct swap_info_struct *si, unsigned long idx)
+{
+ struct swap_cluster_info *ci = si->cluster_info + idx;
+
+ VM_BUG_ON(cluster_count(ci) != 0);
+ /*
+ * If the swap is discardable, prepare discard the cluster
+ * instead of free it immediately. The cluster will be freed
+ * after discard.
+ */
+ if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
+ (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
+ swap_cluster_schedule_discard(si, idx);
+ return;
+ }
+
+ __free_cluster(si, idx);
+}
+
+/*
+ * The cluster corresponding to page_nr will be used. The cluster will be
+ * removed from free cluster list and its usage counter will be increased.
+ */
+static void inc_cluster_info_page(struct swap_info_struct *p,
+ struct swap_cluster_info *cluster_info, unsigned long page_nr)
+{
+ unsigned long idx = page_nr / SWAPFILE_CLUSTER;
+
+ if (!cluster_info)
+ return;
+ if (cluster_is_free(&cluster_info[idx]))
+ alloc_cluster(p, idx);
+
+ VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
+ cluster_set_count(&cluster_info[idx],
+ cluster_count(&cluster_info[idx]) + 1);
+}
+
+/*
+ * The cluster corresponding to page_nr decreases one usage. If the usage
+ * counter becomes 0, which means no page in the cluster is in using, we can
+ * optionally discard the cluster and add it to free cluster list.
+ */
+static void dec_cluster_info_page(struct swap_info_struct *p,
+ struct swap_cluster_info *cluster_info, unsigned long page_nr)
+{
+ unsigned long idx = page_nr / SWAPFILE_CLUSTER;
+
+ if (!cluster_info)
+ return;
+
+ VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
+ cluster_set_count(&cluster_info[idx],
+ cluster_count(&cluster_info[idx]) - 1);
+
+ if (cluster_count(&cluster_info[idx]) == 0)
+ free_cluster(p, idx);
+}
+
+/*
+ * It's possible scan_swap_map() uses a free cluster in the middle of free
+ * cluster list. Avoiding such abuse to avoid list corruption.
+ */
+static bool
+scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
+ unsigned long offset)
+{
+ struct percpu_cluster *percpu_cluster;
+ bool conflict;
+
+ offset /= SWAPFILE_CLUSTER;
+ conflict = !cluster_list_empty(&si->free_clusters) &&
+ offset != cluster_list_first(&si->free_clusters) &&
+ cluster_is_free(&si->cluster_info[offset]);
+
+ if (!conflict)
+ return false;
+
+ percpu_cluster = this_cpu_ptr(si->percpu_cluster);
+ cluster_set_null(&percpu_cluster->index);
+ return true;
+}
+
+/*
+ * Try to get a swap entry from current cpu's swap entry pool (a cluster). This
+ * might involve allocating a new cluster for current CPU too.
+ */
+static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
+ unsigned long *offset, unsigned long *scan_base)
+{
+ struct percpu_cluster *cluster;
+ struct swap_cluster_info *ci;
+ unsigned long tmp, max;
+
+new_cluster:
+ cluster = this_cpu_ptr(si->percpu_cluster);
+ if (cluster_is_null(&cluster->index)) {
+ if (!cluster_list_empty(&si->free_clusters)) {
+ cluster->index = si->free_clusters.head;
+ cluster->next = cluster_next(&cluster->index) *
+ SWAPFILE_CLUSTER;
+ } else if (!cluster_list_empty(&si->discard_clusters)) {
+ /*
+ * we don't have free cluster but have some clusters in
+ * discarding, do discard now and reclaim them, then
+ * reread cluster_next_cpu since we dropped si->lock
+ */
+ swap_do_scheduled_discard(si);
+ *scan_base = this_cpu_read(*si->cluster_next_cpu);
+ *offset = *scan_base;
+ goto new_cluster;
+ } else
+ return false;
+ }
+
+ /*
+ * Other CPUs can use our cluster if they can't find a free cluster,
+ * check if there is still free entry in the cluster
+ */
+ tmp = cluster->next;
+ max = min_t(unsigned long, si->max,
+ (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
+ if (tmp < max) {
+ ci = lock_cluster(si, tmp);
+ while (tmp < max) {
+ if (!si->swap_map[tmp])
+ break;
+ tmp++;
+ }
+ unlock_cluster(ci);
+ }
+ if (tmp >= max) {
+ cluster_set_null(&cluster->index);
+ goto new_cluster;
+ }
+ cluster->next = tmp + 1;
+ *offset = tmp;
+ *scan_base = tmp;
+ return true;
+}
+
+static void __del_from_avail_list(struct swap_info_struct *p)
+{
+ int nid;
+
+ assert_spin_locked(&p->lock);
+ for_each_node(nid)
+ plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
+}
+
+static void del_from_avail_list(struct swap_info_struct *p)
+{
+ spin_lock(&swap_avail_lock);
+ __del_from_avail_list(p);
+ spin_unlock(&swap_avail_lock);
+}
+
+static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
+ unsigned int nr_entries)
+{
+ unsigned int end = offset + nr_entries - 1;
+
+ if (offset == si->lowest_bit)
+ si->lowest_bit += nr_entries;
+ if (end == si->highest_bit)
+ WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries);
+ si->inuse_pages += nr_entries;
+ if (si->inuse_pages == si->pages) {
+ si->lowest_bit = si->max;
+ si->highest_bit = 0;
+ del_from_avail_list(si);
+ }
+}
+
+static void add_to_avail_list(struct swap_info_struct *p)
+{
+ int nid;
+
+ spin_lock(&swap_avail_lock);
+ for_each_node(nid) {
+ WARN_ON(!plist_node_empty(&p->avail_lists[nid]));
+ plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]);
+ }
+ spin_unlock(&swap_avail_lock);
+}
+
+static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
+ unsigned int nr_entries)
+{
+ unsigned long begin = offset;
+ unsigned long end = offset + nr_entries - 1;
+ void (*swap_slot_free_notify)(struct block_device *, unsigned long);
+
+ if (offset < si->lowest_bit)
+ si->lowest_bit = offset;
+ if (end > si->highest_bit) {
+ bool was_full = !si->highest_bit;
+
+ WRITE_ONCE(si->highest_bit, end);
+ if (was_full && (si->flags & SWP_WRITEOK))
+ add_to_avail_list(si);
+ }
+ atomic_long_add(nr_entries, &nr_swap_pages);
+ si->inuse_pages -= nr_entries;
+ if (si->flags & SWP_BLKDEV)
+ swap_slot_free_notify =
+ si->bdev->bd_disk->fops->swap_slot_free_notify;
+ else
+ swap_slot_free_notify = NULL;
+ while (offset <= end) {
+ arch_swap_invalidate_page(si->type, offset);
+ frontswap_invalidate_page(si->type, offset);
+ if (swap_slot_free_notify)
+ swap_slot_free_notify(si->bdev, offset);
+ offset++;
+ }
+ clear_shadow_from_swap_cache(si->type, begin, end);
+}
+
+static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
+{
+ unsigned long prev;
+
+ if (!(si->flags & SWP_SOLIDSTATE)) {
+ si->cluster_next = next;
+ return;
+ }
+
+ prev = this_cpu_read(*si->cluster_next_cpu);
+ /*
+ * Cross the swap address space size aligned trunk, choose
+ * another trunk randomly to avoid lock contention on swap
+ * address space if possible.
+ */
+ if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) !=
+ (next >> SWAP_ADDRESS_SPACE_SHIFT)) {
+ /* No free swap slots available */
+ if (si->highest_bit <= si->lowest_bit)
+ return;
+ next = si->lowest_bit +
+ prandom_u32_max(si->highest_bit - si->lowest_bit + 1);
+ next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
+ next = max_t(unsigned int, next, si->lowest_bit);
+ }
+ this_cpu_write(*si->cluster_next_cpu, next);
+}
+
+static int scan_swap_map_slots(struct swap_info_struct *si,
+ unsigned char usage, int nr,
+ swp_entry_t slots[])
+{
+ struct swap_cluster_info *ci;
+ unsigned long offset;
+ unsigned long scan_base;
+ unsigned long last_in_cluster = 0;
+ int latency_ration = LATENCY_LIMIT;
+ int n_ret = 0;
+ bool scanned_many = false;
+
+ /*
+ * We try to cluster swap pages by allocating them sequentially
+ * in swap. Once we've allocated SWAPFILE_CLUSTER pages this
+ * way, however, we resort to first-free allocation, starting
+ * a new cluster. This prevents us from scattering swap pages
+ * all over the entire swap partition, so that we reduce
+ * overall disk seek times between swap pages. -- sct
+ * But we do now try to find an empty cluster. -Andrea
+ * And we let swap pages go all over an SSD partition. Hugh
+ */
+
+ si->flags += SWP_SCANNING;
+ /*
+ * Use percpu scan base for SSD to reduce lock contention on
+ * cluster and swap cache. For HDD, sequential access is more
+ * important.
+ */
+ if (si->flags & SWP_SOLIDSTATE)
+ scan_base = this_cpu_read(*si->cluster_next_cpu);
+ else
+ scan_base = si->cluster_next;
+ offset = scan_base;
+
+ /* SSD algorithm */
+ if (si->cluster_info) {
+ if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
+ goto scan;
+ } else if (unlikely(!si->cluster_nr--)) {
+ if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
+ si->cluster_nr = SWAPFILE_CLUSTER - 1;
+ goto checks;
+ }
+
+ spin_unlock(&si->lock);
+
+ /*
+ * If seek is expensive, start searching for new cluster from
+ * start of partition, to minimize the span of allocated swap.
+ * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info
+ * case, just handled by scan_swap_map_try_ssd_cluster() above.
+ */
+ scan_base = offset = si->lowest_bit;
+ last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
+
+ /* Locate the first empty (unaligned) cluster */
+ for (; last_in_cluster <= si->highest_bit; offset++) {
+ if (si->swap_map[offset])
+ last_in_cluster = offset + SWAPFILE_CLUSTER;
+ else if (offset == last_in_cluster) {
+ spin_lock(&si->lock);
+ offset -= SWAPFILE_CLUSTER - 1;
+ si->cluster_next = offset;
+ si->cluster_nr = SWAPFILE_CLUSTER - 1;
+ goto checks;
+ }
+ if (unlikely(--latency_ration < 0)) {
+ cond_resched();
+ latency_ration = LATENCY_LIMIT;
+ }
+ }
+
+ offset = scan_base;
+ spin_lock(&si->lock);
+ si->cluster_nr = SWAPFILE_CLUSTER - 1;
+ }
+
+checks:
+ if (si->cluster_info) {
+ while (scan_swap_map_ssd_cluster_conflict(si, offset)) {
+ /* take a break if we already got some slots */
+ if (n_ret)
+ goto done;
+ if (!scan_swap_map_try_ssd_cluster(si, &offset,
+ &scan_base))
+ goto scan;
+ }
+ }
+ if (!(si->flags & SWP_WRITEOK))
+ goto no_page;
+ if (!si->highest_bit)
+ goto no_page;
+ if (offset > si->highest_bit)
+ scan_base = offset = si->lowest_bit;
+
+ ci = lock_cluster(si, offset);
+ /* reuse swap entry of cache-only swap if not busy. */
+ if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+ int swap_was_freed;
+ unlock_cluster(ci);
+ spin_unlock(&si->lock);
+ swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
+ spin_lock(&si->lock);
+ /* entry was freed successfully, try to use this again */
+ if (swap_was_freed)
+ goto checks;
+ goto scan; /* check next one */
+ }
+
+ if (si->swap_map[offset]) {
+ unlock_cluster(ci);
+ if (!n_ret)
+ goto scan;
+ else
+ goto done;
+ }
+ WRITE_ONCE(si->swap_map[offset], usage);
+ inc_cluster_info_page(si, si->cluster_info, offset);
+ unlock_cluster(ci);
+
+ swap_range_alloc(si, offset, 1);
+ slots[n_ret++] = swp_entry(si->type, offset);
+
+ /* got enough slots or reach max slots? */
+ if ((n_ret == nr) || (offset >= si->highest_bit))
+ goto done;
+
+ /* search for next available slot */
+
+ /* time to take a break? */
+ if (unlikely(--latency_ration < 0)) {
+ if (n_ret)
+ goto done;
+ spin_unlock(&si->lock);
+ cond_resched();
+ spin_lock(&si->lock);
+ latency_ration = LATENCY_LIMIT;
+ }
+
+ /* try to get more slots in cluster */
+ if (si->cluster_info) {
+ if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
+ goto checks;
+ } else if (si->cluster_nr && !si->swap_map[++offset]) {
+ /* non-ssd case, still more slots in cluster? */
+ --si->cluster_nr;
+ goto checks;
+ }
+
+ /*
+ * Even if there's no free clusters available (fragmented),
+ * try to scan a little more quickly with lock held unless we
+ * have scanned too many slots already.
+ */
+ if (!scanned_many) {
+ unsigned long scan_limit;
+
+ if (offset < scan_base)
+ scan_limit = scan_base;
+ else
+ scan_limit = si->highest_bit;
+ for (; offset <= scan_limit && --latency_ration > 0;
+ offset++) {
+ if (!si->swap_map[offset])
+ goto checks;
+ }
+ }
+
+done:
+ set_cluster_next(si, offset + 1);
+ si->flags -= SWP_SCANNING;
+ return n_ret;
+
+scan:
+ spin_unlock(&si->lock);
+ while (++offset <= READ_ONCE(si->highest_bit)) {
+ if (data_race(!si->swap_map[offset])) {
+ spin_lock(&si->lock);
+ goto checks;
+ }
+ if (vm_swap_full() &&
+ READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
+ spin_lock(&si->lock);
+ goto checks;
+ }
+ if (unlikely(--latency_ration < 0)) {
+ cond_resched();
+ latency_ration = LATENCY_LIMIT;
+ scanned_many = true;
+ }
+ }
+ offset = si->lowest_bit;
+ while (offset < scan_base) {
+ if (data_race(!si->swap_map[offset])) {
+ spin_lock(&si->lock);
+ goto checks;
+ }
+ if (vm_swap_full() &&
+ READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
+ spin_lock(&si->lock);
+ goto checks;
+ }
+ if (unlikely(--latency_ration < 0)) {
+ cond_resched();
+ latency_ration = LATENCY_LIMIT;
+ scanned_many = true;
+ }
+ offset++;
+ }
+ spin_lock(&si->lock);
+
+no_page:
+ si->flags -= SWP_SCANNING;
+ return n_ret;
+}
+
+static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
+{
+ unsigned long idx;
+ struct swap_cluster_info *ci;
+ unsigned long offset, i;
+ unsigned char *map;
+
+ /*
+ * Should not even be attempting cluster allocations when huge
+ * page swap is disabled. Warn and fail the allocation.
+ */
+ if (!IS_ENABLED(CONFIG_THP_SWAP)) {
+ VM_WARN_ON_ONCE(1);
+ return 0;
+ }
+
+ if (cluster_list_empty(&si->free_clusters))
+ return 0;
+
+ idx = cluster_list_first(&si->free_clusters);
+ offset = idx * SWAPFILE_CLUSTER;
+ ci = lock_cluster(si, offset);
+ alloc_cluster(si, idx);
+ cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE);
+
+ map = si->swap_map + offset;
+ for (i = 0; i < SWAPFILE_CLUSTER; i++)
+ map[i] = SWAP_HAS_CACHE;
+ unlock_cluster(ci);
+ swap_range_alloc(si, offset, SWAPFILE_CLUSTER);
+ *slot = swp_entry(si->type, offset);
+
+ return 1;
+}
+
+static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
+{
+ unsigned long offset = idx * SWAPFILE_CLUSTER;
+ struct swap_cluster_info *ci;
+
+ ci = lock_cluster(si, offset);
+ memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
+ cluster_set_count_flag(ci, 0, 0);
+ free_cluster(si, idx);
+ unlock_cluster(ci);
+ swap_range_free(si, offset, SWAPFILE_CLUSTER);
+}
+
+static unsigned long scan_swap_map(struct swap_info_struct *si,
+ unsigned char usage)
+{
+ swp_entry_t entry;
+ int n_ret;
+
+ n_ret = scan_swap_map_slots(si, usage, 1, &entry);
+
+ if (n_ret)
+ return swp_offset(entry);
+ else
+ return 0;
+
+}
+
+int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
+{
+ unsigned long size = swap_entry_size(entry_size);
+ struct swap_info_struct *si, *next;
+ long avail_pgs;
+ int n_ret = 0;
+ int node;
+
+ /* Only single cluster request supported */
+ WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);
+
+ spin_lock(&swap_avail_lock);
+
+ avail_pgs = atomic_long_read(&nr_swap_pages) / size;
+ if (avail_pgs <= 0) {
+ spin_unlock(&swap_avail_lock);
+ goto noswap;
+ }
+
+ n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs);
+
+ atomic_long_sub(n_goal * size, &nr_swap_pages);
+
+start_over:
+ node = numa_node_id();
+ plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
+ /* requeue si to after same-priority siblings */
+ plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
+ spin_unlock(&swap_avail_lock);
+ spin_lock(&si->lock);
+ if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
+ spin_lock(&swap_avail_lock);
+ if (plist_node_empty(&si->avail_lists[node])) {
+ spin_unlock(&si->lock);
+ goto nextsi;
+ }
+ WARN(!si->highest_bit,
+ "swap_info %d in list but !highest_bit\n",
+ si->type);
+ WARN(!(si->flags & SWP_WRITEOK),
+ "swap_info %d in list but !SWP_WRITEOK\n",
+ si->type);
+ __del_from_avail_list(si);
+ spin_unlock(&si->lock);
+ goto nextsi;
+ }
+ if (size == SWAPFILE_CLUSTER) {
+ if (si->flags & SWP_BLKDEV)
+ n_ret = swap_alloc_cluster(si, swp_entries);
+ } else
+ n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
+ n_goal, swp_entries);
+ spin_unlock(&si->lock);
+ if (n_ret || size == SWAPFILE_CLUSTER)
+ goto check_out;
+ pr_debug("scan_swap_map of si %d failed to find offset\n",
+ si->type);
+ cond_resched();
+
+ spin_lock(&swap_avail_lock);
+nextsi:
+ /*
+ * if we got here, it's likely that si was almost full before,
+ * and since scan_swap_map() can drop the si->lock, multiple
+ * callers probably all tried to get a page from the same si
+ * and it filled up before we could get one; or, the si filled
+ * up between us dropping swap_avail_lock and taking si->lock.
+ * Since we dropped the swap_avail_lock, the swap_avail_head
+ * list may have been modified; so if next is still in the
+ * swap_avail_head list then try it, otherwise start over
+ * if we have not gotten any slots.
+ */
+ if (plist_node_empty(&next->avail_lists[node]))
+ goto start_over;
+ }
+
+ spin_unlock(&swap_avail_lock);
+
+check_out:
+ if (n_ret < n_goal)
+ atomic_long_add((long)(n_goal - n_ret) * size,
+ &nr_swap_pages);
+noswap:
+ return n_ret;
+}
+
+/* The only caller of this function is now suspend routine */
+swp_entry_t get_swap_page_of_type(int type)
+{
+ struct swap_info_struct *si = swap_type_to_swap_info(type);
+ pgoff_t offset;
+
+ if (!si)
+ goto fail;
+
+ spin_lock(&si->lock);
+ if (si->flags & SWP_WRITEOK) {
+ /* This is called for allocating swap entry, not cache */
+ offset = scan_swap_map(si, 1);
+ if (offset) {
+ atomic_long_dec(&nr_swap_pages);
+ spin_unlock(&si->lock);
+ return swp_entry(type, offset);
+ }
+ }
+ spin_unlock(&si->lock);
+fail:
+ return (swp_entry_t) {0};
+}
+
+static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
+{
+ struct swap_info_struct *p;
+ unsigned long offset;
+
+ if (!entry.val)
+ goto out;
+ p = swp_swap_info(entry);
+ if (!p)
+ goto bad_nofile;
+ if (data_race(!(p->flags & SWP_USED)))
+ goto bad_device;
+ offset = swp_offset(entry);
+ if (offset >= p->max)
+ goto bad_offset;
+ return p;
+
+bad_offset:
+ pr_err("swap_info_get: %s%08lx\n", Bad_offset, entry.val);
+ goto out;
+bad_device:
+ pr_err("swap_info_get: %s%08lx\n", Unused_file, entry.val);
+ goto out;
+bad_nofile:
+ pr_err("swap_info_get: %s%08lx\n", Bad_file, entry.val);
+out:
+ return NULL;
+}
+
+static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
+{
+ struct swap_info_struct *p;
+
+ p = __swap_info_get(entry);
+ if (!p)
+ goto out;
+ if (data_race(!p->swap_map[swp_offset(entry)]))
+ goto bad_free;
+ return p;
+
+bad_free:
+ pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val);
+out:
+ return NULL;
+}
+
+static struct swap_info_struct *swap_info_get(swp_entry_t entry)
+{
+ struct swap_info_struct *p;
+
+ p = _swap_info_get(entry);
+ if (p)
+ spin_lock(&p->lock);
+ return p;
+}
+
+static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
+ struct swap_info_struct *q)
+{
+ struct swap_info_struct *p;
+
+ p = _swap_info_get(entry);
+
+ if (p != q) {
+ if (q != NULL)
+ spin_unlock(&q->lock);
+ if (p != NULL)
+ spin_lock(&p->lock);
+ }
+ return p;
+}
+
+static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
+ unsigned long offset,
+ unsigned char usage)
+{
+ unsigned char count;
+ unsigned char has_cache;
+
+ count = p->swap_map[offset];
+
+ has_cache = count & SWAP_HAS_CACHE;
+ count &= ~SWAP_HAS_CACHE;
+
+ if (usage == SWAP_HAS_CACHE) {
+ VM_BUG_ON(!has_cache);
+ has_cache = 0;
+ } else if (count == SWAP_MAP_SHMEM) {
+ /*
+ * Or we could insist on shmem.c using a special
+ * swap_shmem_free() and free_shmem_swap_and_cache()...
+ */
+ count = 0;
+ } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
+ if (count == COUNT_CONTINUED) {
+ if (swap_count_continued(p, offset, count))
+ count = SWAP_MAP_MAX | COUNT_CONTINUED;
+ else
+ count = SWAP_MAP_MAX;
+ } else
+ count--;
+ }
+
+ usage = count | has_cache;
+ if (usage)
+ WRITE_ONCE(p->swap_map[offset], usage);
+ else
+ WRITE_ONCE(p->swap_map[offset], SWAP_HAS_CACHE);
+
+ return usage;
+}
+
+/*
+ * Check whether swap entry is valid in the swap device. If so,
+ * return pointer to swap_info_struct, and keep the swap entry valid
+ * via preventing the swap device from being swapoff, until
+ * put_swap_device() is called. Otherwise return NULL.
+ *
+ * The entirety of the RCU read critical section must come before the
+ * return from or after the call to synchronize_rcu() in
+ * enable_swap_info() or swapoff(). So if "si->flags & SWP_VALID" is
+ * true, the si->map, si->cluster_info, etc. must be valid in the
+ * critical section.
+ *
+ * Notice that swapoff or swapoff+swapon can still happen before the
+ * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
+ * in put_swap_device() if there isn't any other way to prevent
+ * swapoff, such as page lock, page table lock, etc. The caller must
+ * be prepared for that. For example, the following situation is
+ * possible.
+ *
+ * CPU1 CPU2
+ * do_swap_page()
+ * ... swapoff+swapon
+ * __read_swap_cache_async()
+ * swapcache_prepare()
+ * __swap_duplicate()
+ * // check swap_map
+ * // verify PTE not changed
+ *
+ * In __swap_duplicate(), the swap_map need to be checked before
+ * changing partly because the specified swap entry may be for another
+ * swap device which has been swapoff. And in do_swap_page(), after
+ * the page is read from the swap device, the PTE is verified not
+ * changed with the page table locked to check whether the swap device
+ * has been swapoff or swapoff+swapon.
+ */
+struct swap_info_struct *get_swap_device(swp_entry_t entry)
+{
+ struct swap_info_struct *si;
+ unsigned long offset;
+
+ if (!entry.val)
+ goto out;
+ si = swp_swap_info(entry);
+ if (!si)
+ goto bad_nofile;
+
+ rcu_read_lock();
+ if (data_race(!(si->flags & SWP_VALID)))
+ goto unlock_out;
+ offset = swp_offset(entry);
+ if (offset >= si->max)
+ goto unlock_out;
+
+ return si;
+bad_nofile:
+ pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
+out:
+ return NULL;
+unlock_out:
+ rcu_read_unlock();
+ return NULL;
+}
+
+static unsigned char __swap_entry_free(struct swap_info_struct *p,
+ swp_entry_t entry)
+{
+ struct swap_cluster_info *ci;
+ unsigned long offset = swp_offset(entry);
+ unsigned char usage;
+
+ ci = lock_cluster_or_swap_info(p, offset);
+ usage = __swap_entry_free_locked(p, offset, 1);
+ unlock_cluster_or_swap_info(p, ci);
+ if (!usage)
+ free_swap_slot(entry);
+
+ return usage;
+}
+
+static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
+{
+ struct swap_cluster_info *ci;
+ unsigned long offset = swp_offset(entry);
+ unsigned char count;
+
+ ci = lock_cluster(p, offset);
+ count = p->swap_map[offset];
+ VM_BUG_ON(count != SWAP_HAS_CACHE);
+ p->swap_map[offset] = 0;
+ dec_cluster_info_page(p, p->cluster_info, offset);
+ unlock_cluster(ci);
+
+ mem_cgroup_uncharge_swap(entry, 1);
+ swap_range_free(p, offset, 1);
+}
+
+/*
+ * Caller has made sure that the swap device corresponding to entry
+ * is still around or has not been recycled.
+ */
+void swap_free(swp_entry_t entry)
+{
+ struct swap_info_struct *p;
+
+ p = _swap_info_get(entry);
+ if (p)
+ __swap_entry_free(p, entry);
+}
+
+/*
+ * Called after dropping swapcache to decrease refcnt to swap entries.
+ */
+void put_swap_page(struct page *page, swp_entry_t entry)
+{
+ unsigned long offset = swp_offset(entry);
+ unsigned long idx = offset / SWAPFILE_CLUSTER;
+ struct swap_cluster_info *ci;
+ struct swap_info_struct *si;
+ unsigned char *map;
+ unsigned int i, free_entries = 0;
+ unsigned char val;
+ int size = swap_entry_size(thp_nr_pages(page));
+
+ si = _swap_info_get(entry);
+ if (!si)
+ return;
+
+ ci = lock_cluster_or_swap_info(si, offset);
+ if (size == SWAPFILE_CLUSTER) {
+ VM_BUG_ON(!cluster_is_huge(ci));
+ map = si->swap_map + offset;
+ for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+ val = map[i];
+ VM_BUG_ON(!(val & SWAP_HAS_CACHE));
+ if (val == SWAP_HAS_CACHE)
+ free_entries++;
+ }
+ cluster_clear_huge(ci);
+ if (free_entries == SWAPFILE_CLUSTER) {
+ unlock_cluster_or_swap_info(si, ci);
+ spin_lock(&si->lock);
+ mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
+ swap_free_cluster(si, idx);
+ spin_unlock(&si->lock);
+ return;
+ }
+ }
+ for (i = 0; i < size; i++, entry.val++) {
+ if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
+ unlock_cluster_or_swap_info(si, ci);
+ free_swap_slot(entry);
+ if (i == size - 1)
+ return;
+ lock_cluster_or_swap_info(si, offset);
+ }
+ }
+ unlock_cluster_or_swap_info(si, ci);
+}
+
+#ifdef CONFIG_THP_SWAP
+int split_swap_cluster(swp_entry_t entry)
+{
+ struct swap_info_struct *si;
+ struct swap_cluster_info *ci;
+ unsigned long offset = swp_offset(entry);
+
+ si = _swap_info_get(entry);
+ if (!si)
+ return -EBUSY;
+ ci = lock_cluster(si, offset);
+ cluster_clear_huge(ci);
+ unlock_cluster(ci);
+ return 0;
+}
+#endif
+
+static int swp_entry_cmp(const void *ent1, const void *ent2)
+{
+ const swp_entry_t *e1 = ent1, *e2 = ent2;
+
+ return (int)swp_type(*e1) - (int)swp_type(*e2);
+}
+
+void swapcache_free_entries(swp_entry_t *entries, int n)
+{
+ struct swap_info_struct *p, *prev;
+ int i;
+
+ if (n <= 0)
+ return;
+
+ prev = NULL;
+ p = NULL;
+
+ /*
+ * Sort swap entries by swap device, so each lock is only taken once.
+ * nr_swapfiles isn't absolutely correct, but the overhead of sort() is
+ * so low that it isn't necessary to optimize further.
+ */
+ if (nr_swapfiles > 1)
+ sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL);
+ for (i = 0; i < n; ++i) {
+ p = swap_info_get_cont(entries[i], prev);
+ if (p)
+ swap_entry_free(p, entries[i]);
+ prev = p;
+ }
+ if (p)
+ spin_unlock(&p->lock);
+}
+
+/*
+ * How many references to page are currently swapped out?
+ * This does not give an exact answer when swap count is continued,
+ * but does include the high COUNT_CONTINUED flag to allow for that.
+ */
+int page_swapcount(struct page *page)
+{
+ int count = 0;
+ struct swap_info_struct *p;
+ struct swap_cluster_info *ci;
+ swp_entry_t entry;
+ unsigned long offset;
+
+ entry.val = page_private(page);
+ p = _swap_info_get(entry);
+ if (p) {
+ offset = swp_offset(entry);
+ ci = lock_cluster_or_swap_info(p, offset);
+ count = swap_count(p->swap_map[offset]);
+ unlock_cluster_or_swap_info(p, ci);
+ }
+ return count;
+}
+
+int __swap_count(swp_entry_t entry)
+{
+ struct swap_info_struct *si;
+ pgoff_t offset = swp_offset(entry);
+ int count = 0;
+
+ si = get_swap_device(entry);
+ if (si) {
+ count = swap_count(si->swap_map[offset]);
+ put_swap_device(si);
+ }
+ return count;
+}
+
+static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
+{
+ int count = 0;
+ pgoff_t offset = swp_offset(entry);
+ struct swap_cluster_info *ci;
+
+ ci = lock_cluster_or_swap_info(si, offset);
+ count = swap_count(si->swap_map[offset]);
+ unlock_cluster_or_swap_info(si, ci);
+ return count;
+}
+
+/*
+ * How many references to @entry are currently swapped out?
+ * This does not give an exact answer when swap count is continued,
+ * but does include the high COUNT_CONTINUED flag to allow for that.
+ */
+int __swp_swapcount(swp_entry_t entry)
+{
+ int count = 0;
+ struct swap_info_struct *si;
+
+ si = get_swap_device(entry);
+ if (si) {
+ count = swap_swapcount(si, entry);
+ put_swap_device(si);
+ }
+ return count;
+}
+
+/*
+ * How many references to @entry are currently swapped out?
+ * This considers COUNT_CONTINUED so it returns exact answer.
+ */
+int swp_swapcount(swp_entry_t entry)
+{
+ int count, tmp_count, n;
+ struct swap_info_struct *p;
+ struct swap_cluster_info *ci;
+ struct page *page;
+ pgoff_t offset;
+ unsigned char *map;
+
+ p = _swap_info_get(entry);
+ if (!p)
+ return 0;
+
+ offset = swp_offset(entry);
+
+ ci = lock_cluster_or_swap_info(p, offset);
+
+ count = swap_count(p->swap_map[offset]);
+ if (!(count & COUNT_CONTINUED))
+ goto out;
+
+ count &= ~COUNT_CONTINUED;
+ n = SWAP_MAP_MAX + 1;
+
+ page = vmalloc_to_page(p->swap_map + offset);
+ offset &= ~PAGE_MASK;
+ VM_BUG_ON(page_private(page) != SWP_CONTINUED);
+
+ do {
+ page = list_next_entry(page, lru);
+ map = kmap_atomic(page);
+ tmp_count = map[offset];
+ kunmap_atomic(map);
+
+ count += (tmp_count & ~COUNT_CONTINUED) * n;
+ n *= (SWAP_CONT_MAX + 1);
+ } while (tmp_count & COUNT_CONTINUED);
+out:
+ unlock_cluster_or_swap_info(p, ci);
+ return count;
+}
+
+static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
+ swp_entry_t entry)
+{
+ struct swap_cluster_info *ci;
+ unsigned char *map = si->swap_map;
+ unsigned long roffset = swp_offset(entry);
+ unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER);
+ int i;
+ bool ret = false;
+
+ ci = lock_cluster_or_swap_info(si, offset);
+ if (!ci || !cluster_is_huge(ci)) {
+ if (swap_count(map[roffset]))
+ ret = true;
+ goto unlock_out;
+ }
+ for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+ if (swap_count(map[offset + i])) {
+ ret = true;
+ break;
+ }
+ }
+unlock_out:
+ unlock_cluster_or_swap_info(si, ci);
+ return ret;
+}
+
+static bool page_swapped(struct page *page)
+{
+ swp_entry_t entry;
+ struct swap_info_struct *si;
+
+ if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page)))
+ return page_swapcount(page) != 0;
+
+ page = compound_head(page);
+ entry.val = page_private(page);
+ si = _swap_info_get(entry);
+ if (si)
+ return swap_page_trans_huge_swapped(si, entry);
+ return false;
+}
+
+static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
+ int *total_swapcount)
+{
+ int i, map_swapcount, _total_mapcount, _total_swapcount;
+ unsigned long offset = 0;
+ struct swap_info_struct *si;
+ struct swap_cluster_info *ci = NULL;
+ unsigned char *map = NULL;
+ int mapcount, swapcount = 0;
+
+ /* hugetlbfs shouldn't call it */
+ VM_BUG_ON_PAGE(PageHuge(page), page);
+
+ if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) {
+ mapcount = page_trans_huge_mapcount(page, total_mapcount);
+ if (PageSwapCache(page))
+ swapcount = page_swapcount(page);
+ if (total_swapcount)
+ *total_swapcount = swapcount;
+ return mapcount + swapcount;
+ }
+
+ page = compound_head(page);
+
+ _total_mapcount = _total_swapcount = map_swapcount = 0;
+ if (PageSwapCache(page)) {
+ swp_entry_t entry;
+
+ entry.val = page_private(page);
+ si = _swap_info_get(entry);
+ if (si) {
+ map = si->swap_map;
+ offset = swp_offset(entry);
+ }
+ }
+ if (map)
+ ci = lock_cluster(si, offset);
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ mapcount = atomic_read(&page[i]._mapcount) + 1;
+ _total_mapcount += mapcount;
+ if (map) {
+ swapcount = swap_count(map[offset + i]);
+ _total_swapcount += swapcount;
+ }
+ map_swapcount = max(map_swapcount, mapcount + swapcount);
+ }
+ unlock_cluster(ci);
+ if (PageDoubleMap(page)) {
+ map_swapcount -= 1;
+ _total_mapcount -= HPAGE_PMD_NR;
+ }
+ mapcount = compound_mapcount(page);
+ map_swapcount += mapcount;
+ _total_mapcount += mapcount;
+ if (total_mapcount)
+ *total_mapcount = _total_mapcount;
+ if (total_swapcount)
+ *total_swapcount = _total_swapcount;
+
+ return map_swapcount;
+}
+
+/*
+ * We can write to an anon page without COW if there are no other references
+ * to it. And as a side-effect, free up its swap: because the old content
+ * on disk will never be read, and seeking back there to write new content
+ * later would only waste time away from clustering.
+ *
+ * NOTE: total_map_swapcount should not be relied upon by the caller if
+ * reuse_swap_page() returns false, but it may be always overwritten
+ * (see the other implementation for CONFIG_SWAP=n).
+ */
+bool reuse_swap_page(struct page *page, int *total_map_swapcount)
+{
+ int count, total_mapcount, total_swapcount;
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ if (unlikely(PageKsm(page)))
+ return false;
+ count = page_trans_huge_map_swapcount(page, &total_mapcount,
+ &total_swapcount);
+ if (total_map_swapcount)
+ *total_map_swapcount = total_mapcount + total_swapcount;
+ if (count == 1 && PageSwapCache(page) &&
+ (likely(!PageTransCompound(page)) ||
+ /* The remaining swap count will be freed soon */
+ total_swapcount == page_swapcount(page))) {
+ if (!PageWriteback(page)) {
+ page = compound_head(page);
+ delete_from_swap_cache(page);
+ SetPageDirty(page);
+ } else {
+ swp_entry_t entry;
+ struct swap_info_struct *p;
+
+ entry.val = page_private(page);
+ p = swap_info_get(entry);
+ if (p->flags & SWP_STABLE_WRITES) {
+ spin_unlock(&p->lock);
+ return false;
+ }
+ spin_unlock(&p->lock);
+ }
+ }
+
+ return count <= 1;
+}
+
+/*
+ * If swap is getting full, or if there are no more mappings of this page,
+ * then try_to_free_swap is called to free its swap space.
+ */
+int try_to_free_swap(struct page *page)
+{
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+
+ if (!PageSwapCache(page))
+ return 0;
+ if (PageWriteback(page))
+ return 0;
+ if (page_swapped(page))
+ return 0;
+
+ /*
+ * Once hibernation has begun to create its image of memory,
+ * there's a danger that one of the calls to try_to_free_swap()
+ * - most probably a call from __try_to_reclaim_swap() while
+ * hibernation is allocating its own swap pages for the image,
+ * but conceivably even a call from memory reclaim - will free
+ * the swap from a page which has already been recorded in the
+ * image as a clean swapcache page, and then reuse its swap for
+ * another page of the image. On waking from hibernation, the
+ * original page might be freed under memory pressure, then
+ * later read back in from swap, now with the wrong data.
+ *
+ * Hibernation suspends storage while it is writing the image
+ * to disk so check that here.
+ */
+ if (pm_suspended_storage())
+ return 0;
+
+ page = compound_head(page);
+ delete_from_swap_cache(page);
+ SetPageDirty(page);
+ return 1;
+}
+
+/*
+ * Free the swap entry like above, but also try to
+ * free the page cache entry if it is the last user.
+ */
+int free_swap_and_cache(swp_entry_t entry)
+{
+ struct swap_info_struct *p;
+ unsigned char count;
+
+ if (non_swap_entry(entry))
+ return 1;
+
+ p = _swap_info_get(entry);
+ if (p) {
+ count = __swap_entry_free(p, entry);
+ if (count == SWAP_HAS_CACHE &&
+ !swap_page_trans_huge_swapped(p, entry))
+ __try_to_reclaim_swap(p, swp_offset(entry),
+ TTRS_UNMAPPED | TTRS_FULL);
+ }
+ return p != NULL;
+}
+
+#ifdef CONFIG_HIBERNATION
+/*
+ * Find the swap type that corresponds to given device (if any).
+ *
+ * @offset - number of the PAGE_SIZE-sized block of the device, starting
+ * from 0, in which the swap header is expected to be located.
+ *
+ * This is needed for the suspend to disk (aka swsusp).
+ */
+int swap_type_of(dev_t device, sector_t offset)
+{
+ int type;
+
+ if (!device)
+ return -1;
+
+ spin_lock(&swap_lock);
+ for (type = 0; type < nr_swapfiles; type++) {
+ struct swap_info_struct *sis = swap_info[type];
+
+ if (!(sis->flags & SWP_WRITEOK))
+ continue;
+
+ if (device == sis->bdev->bd_dev) {
+ struct swap_extent *se = first_se(sis);
+
+ if (se->start_block == offset) {
+ spin_unlock(&swap_lock);
+ return type;
+ }
+ }
+ }
+ spin_unlock(&swap_lock);
+ return -ENODEV;
+}
+
+int find_first_swap(dev_t *device)
+{
+ int type;
+
+ spin_lock(&swap_lock);
+ for (type = 0; type < nr_swapfiles; type++) {
+ struct swap_info_struct *sis = swap_info[type];
+
+ if (!(sis->flags & SWP_WRITEOK))
+ continue;
+ *device = sis->bdev->bd_dev;
+ spin_unlock(&swap_lock);
+ return type;
+ }
+ spin_unlock(&swap_lock);
+ return -ENODEV;
+}
+
+/*
+ * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
+ * corresponding to given index in swap_info (swap type).
+ */
+sector_t swapdev_block(int type, pgoff_t offset)
+{
+ struct block_device *bdev;
+ struct swap_info_struct *si = swap_type_to_swap_info(type);
+
+ if (!si || !(si->flags & SWP_WRITEOK))
+ return 0;
+ return map_swap_entry(swp_entry(type, offset), &bdev);
+}
+
+/*
+ * Return either the total number of swap pages of given type, or the number
+ * of free pages of that type (depending on @free)
+ *
+ * This is needed for software suspend
+ */
+unsigned int count_swap_pages(int type, int free)
+{
+ unsigned int n = 0;
+
+ spin_lock(&swap_lock);
+ if ((unsigned int)type < nr_swapfiles) {
+ struct swap_info_struct *sis = swap_info[type];
+
+ spin_lock(&sis->lock);
+ if (sis->flags & SWP_WRITEOK) {
+ n = sis->pages;
+ if (free)
+ n -= sis->inuse_pages;
+ }
+ spin_unlock(&sis->lock);
+ }
+ spin_unlock(&swap_lock);
+ return n;
+}
+#endif /* CONFIG_HIBERNATION */
+
+static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
+{
+ return pte_same(pte_swp_clear_flags(pte), swp_pte);
+}
+
+/*
+ * No need to decide whether this PTE shares the swap entry with others,
+ * just let do_wp_page work it out if a write is requested later - to
+ * force COW, vm_page_prot omits write permission from any private vma.
+ */
+static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, swp_entry_t entry, struct page *page)
+{
+ struct page *swapcache;
+ spinlock_t *ptl;
+ pte_t *pte;
+ int ret = 1;
+
+ swapcache = page;
+ page = ksm_might_need_to_copy(page, vma, addr);
+ if (unlikely(!page))
+ return -ENOMEM;
+
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
+ ret = 0;
+ goto out;
+ }
+
+ dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
+ inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
+ get_page(page);
+ set_pte_at(vma->vm_mm, addr, pte,
+ pte_mkold(mk_pte(page, vma->vm_page_prot)));
+ if (page == swapcache) {
+ page_add_anon_rmap(page, vma, addr, false);
+ } else { /* ksm created a completely new copy */
+ page_add_new_anon_rmap(page, vma, addr, false);
+ lru_cache_add_inactive_or_unevictable(page, vma);
+ }
+ swap_free(entry);
+out:
+ pte_unmap_unlock(pte, ptl);
+ if (page != swapcache) {
+ unlock_page(page);
+ put_page(page);
+ }
+ return ret;
+}
+
+static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ unsigned int type, bool frontswap,
+ unsigned long *fs_pages_to_unuse)
+{
+ struct page *page;
+ swp_entry_t entry;
+ pte_t *pte;
+ struct swap_info_struct *si;
+ unsigned long offset;
+ int ret = 0;
+ volatile unsigned char *swap_map;
+
+ si = swap_info[type];
+ pte = pte_offset_map(pmd, addr);
+ do {
+ struct vm_fault vmf;
+
+ if (!is_swap_pte(*pte))
+ continue;
+
+ entry = pte_to_swp_entry(*pte);
+ if (swp_type(entry) != type)
+ continue;
+
+ offset = swp_offset(entry);
+ if (frontswap && !frontswap_test(si, offset))
+ continue;
+
+ pte_unmap(pte);
+ swap_map = &si->swap_map[offset];
+ page = lookup_swap_cache(entry, vma, addr);
+ if (!page) {
+ vmf.vma = vma;
+ vmf.address = addr;
+ vmf.pmd = pmd;
+ page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
+ &vmf);
+ }
+ if (!page) {
+ if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
+ goto try_next;
+ return -ENOMEM;
+ }
+
+ lock_page(page);
+ wait_on_page_writeback(page);
+ ret = unuse_pte(vma, pmd, addr, entry, page);
+ if (ret < 0) {
+ unlock_page(page);
+ put_page(page);
+ goto out;
+ }
+
+ try_to_free_swap(page);
+ unlock_page(page);
+ put_page(page);
+
+ if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) {
+ ret = FRONTSWAP_PAGES_UNUSED;
+ goto out;
+ }
+try_next:
+ pte = pte_offset_map(pmd, addr);
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+ pte_unmap(pte - 1);
+
+ ret = 0;
+out:
+ return ret;
+}
+
+static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+ unsigned long addr, unsigned long end,
+ unsigned int type, bool frontswap,
+ unsigned long *fs_pages_to_unuse)
+{
+ pmd_t *pmd;
+ unsigned long next;
+ int ret;
+
+ pmd = pmd_offset(pud, addr);
+ do {
+ cond_resched();
+ next = pmd_addr_end(addr, end);
+ if (pmd_none_or_trans_huge_or_clear_bad(pmd))
+ continue;
+ ret = unuse_pte_range(vma, pmd, addr, next, type,
+ frontswap, fs_pages_to_unuse);
+ if (ret)
+ return ret;
+ } while (pmd++, addr = next, addr != end);
+ return 0;
+}
+
+static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
+ unsigned long addr, unsigned long end,
+ unsigned int type, bool frontswap,
+ unsigned long *fs_pages_to_unuse)
+{
+ pud_t *pud;
+ unsigned long next;
+ int ret;
+
+ pud = pud_offset(p4d, addr);
+ do {
+ next = pud_addr_end(addr, end);
+ if (pud_none_or_clear_bad(pud))
+ continue;
+ ret = unuse_pmd_range(vma, pud, addr, next, type,
+ frontswap, fs_pages_to_unuse);
+ if (ret)
+ return ret;
+ } while (pud++, addr = next, addr != end);
+ return 0;
+}
+
+static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
+ unsigned long addr, unsigned long end,
+ unsigned int type, bool frontswap,
+ unsigned long *fs_pages_to_unuse)
+{
+ p4d_t *p4d;
+ unsigned long next;
+ int ret;
+
+ p4d = p4d_offset(pgd, addr);
+ do {
+ next = p4d_addr_end(addr, end);
+ if (p4d_none_or_clear_bad(p4d))
+ continue;
+ ret = unuse_pud_range(vma, p4d, addr, next, type,
+ frontswap, fs_pages_to_unuse);
+ if (ret)
+ return ret;
+ } while (p4d++, addr = next, addr != end);
+ return 0;
+}
+
+static int unuse_vma(struct vm_area_struct *vma, unsigned int type,
+ bool frontswap, unsigned long *fs_pages_to_unuse)
+{
+ pgd_t *pgd;
+ unsigned long addr, end, next;
+ int ret;
+
+ addr = vma->vm_start;
+ end = vma->vm_end;
+
+ pgd = pgd_offset(vma->vm_mm, addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (pgd_none_or_clear_bad(pgd))
+ continue;
+ ret = unuse_p4d_range(vma, pgd, addr, next, type,
+ frontswap, fs_pages_to_unuse);
+ if (ret)
+ return ret;
+ } while (pgd++, addr = next, addr != end);
+ return 0;
+}
+
+static int unuse_mm(struct mm_struct *mm, unsigned int type,
+ bool frontswap, unsigned long *fs_pages_to_unuse)
+{
+ struct vm_area_struct *vma;
+ int ret = 0;
+
+ mmap_read_lock(mm);
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (vma->anon_vma) {
+ ret = unuse_vma(vma, type, frontswap,
+ fs_pages_to_unuse);
+ if (ret)
+ break;
+ }
+ cond_resched();
+ }
+ mmap_read_unlock(mm);
+ return ret;
+}
+
+/*
+ * Scan swap_map (or frontswap_map if frontswap parameter is true)
+ * from current position to next entry still in use. Return 0
+ * if there are no inuse entries after prev till end of the map.
+ */
+static unsigned int find_next_to_unuse(struct swap_info_struct *si,
+ unsigned int prev, bool frontswap)
+{
+ unsigned int i;
+ unsigned char count;
+
+ /*
+ * No need for swap_lock here: we're just looking
+ * for whether an entry is in use, not modifying it; false
+ * hits are okay, and sys_swapoff() has already prevented new
+ * allocations from this area (while holding swap_lock).
+ */
+ for (i = prev + 1; i < si->max; i++) {
+ count = READ_ONCE(si->swap_map[i]);
+ if (count && swap_count(count) != SWAP_MAP_BAD)
+ if (!frontswap || frontswap_test(si, i))
+ break;
+ if ((i % LATENCY_LIMIT) == 0)
+ cond_resched();
+ }
+
+ if (i == si->max)
+ i = 0;
+
+ return i;
+}
+
+/*
+ * If the boolean frontswap is true, only unuse pages_to_unuse pages;
+ * pages_to_unuse==0 means all pages; ignored if frontswap is false
+ */
+int try_to_unuse(unsigned int type, bool frontswap,
+ unsigned long pages_to_unuse)
+{
+ struct mm_struct *prev_mm;
+ struct mm_struct *mm;
+ struct list_head *p;
+ int retval = 0;
+ struct swap_info_struct *si = swap_info[type];
+ struct page *page;
+ swp_entry_t entry;
+ unsigned int i;
+
+ if (!READ_ONCE(si->inuse_pages))
+ return 0;
+
+ if (!frontswap)
+ pages_to_unuse = 0;
+
+retry:
+ retval = shmem_unuse(type, frontswap, &pages_to_unuse);
+ if (retval)
+ goto out;
+
+ prev_mm = &init_mm;
+ mmget(prev_mm);
+
+ spin_lock(&mmlist_lock);
+ p = &init_mm.mmlist;
+ while (READ_ONCE(si->inuse_pages) &&
+ !signal_pending(current) &&
+ (p = p->next) != &init_mm.mmlist) {
+
+ mm = list_entry(p, struct mm_struct, mmlist);
+ if (!mmget_not_zero(mm))
+ continue;
+ spin_unlock(&mmlist_lock);
+ mmput(prev_mm);
+ prev_mm = mm;
+ retval = unuse_mm(mm, type, frontswap, &pages_to_unuse);
+
+ if (retval) {
+ mmput(prev_mm);
+ goto out;
+ }
+
+ /*
+ * Make sure that we aren't completely killing
+ * interactive performance.
+ */
+ cond_resched();
+ spin_lock(&mmlist_lock);
+ }
+ spin_unlock(&mmlist_lock);
+
+ mmput(prev_mm);
+
+ i = 0;
+ while (READ_ONCE(si->inuse_pages) &&
+ !signal_pending(current) &&
+ (i = find_next_to_unuse(si, i, frontswap)) != 0) {
+
+ entry = swp_entry(type, i);
+ page = find_get_page(swap_address_space(entry), i);
+ if (!page)
+ continue;
+
+ /*
+ * It is conceivable that a racing task removed this page from
+ * swap cache just before we acquired the page lock. The page
+ * might even be back in swap cache on another swap area. But
+ * that is okay, try_to_free_swap() only removes stale pages.
+ */
+ lock_page(page);
+ wait_on_page_writeback(page);
+ try_to_free_swap(page);
+ unlock_page(page);
+ put_page(page);
+
+ /*
+ * For frontswap, we just need to unuse pages_to_unuse, if
+ * it was specified. Need not check frontswap again here as
+ * we already zeroed out pages_to_unuse if not frontswap.
+ */
+ if (pages_to_unuse && --pages_to_unuse == 0)
+ goto out;
+ }
+
+ /*
+ * Lets check again to see if there are still swap entries in the map.
+ * If yes, we would need to do retry the unuse logic again.
+ * Under global memory pressure, swap entries can be reinserted back
+ * into process space after the mmlist loop above passes over them.
+ *
+ * Limit the number of retries? No: when mmget_not_zero() above fails,
+ * that mm is likely to be freeing swap from exit_mmap(), which proceeds
+ * at its own independent pace; and even shmem_writepage() could have
+ * been preempted after get_swap_page(), temporarily hiding that swap.
+ * It's easy and robust (though cpu-intensive) just to keep retrying.
+ */
+ if (READ_ONCE(si->inuse_pages)) {
+ if (!signal_pending(current))
+ goto retry;
+ retval = -EINTR;
+ }
+out:
+ return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval;
+}
+
+/*
+ * After a successful try_to_unuse, if no swap is now in use, we know
+ * we can empty the mmlist. swap_lock must be held on entry and exit.
+ * Note that mmlist_lock nests inside swap_lock, and an mm must be
+ * added to the mmlist just after page_duplicate - before would be racy.
+ */
+static void drain_mmlist(void)
+{
+ struct list_head *p, *next;
+ unsigned int type;
+
+ for (type = 0; type < nr_swapfiles; type++)
+ if (swap_info[type]->inuse_pages)
+ return;
+ spin_lock(&mmlist_lock);
+ list_for_each_safe(p, next, &init_mm.mmlist)
+ list_del_init(p);
+ spin_unlock(&mmlist_lock);
+}
+
+/*
+ * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
+ * corresponds to page offset for the specified swap entry.
+ * Note that the type of this function is sector_t, but it returns page offset
+ * into the bdev, not sector offset.
+ */
+static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
+{
+ struct swap_info_struct *sis;
+ struct swap_extent *se;
+ pgoff_t offset;
+
+ sis = swp_swap_info(entry);
+ *bdev = sis->bdev;
+
+ offset = swp_offset(entry);
+ se = offset_to_swap_extent(sis, offset);
+ return se->start_block + (offset - se->start_page);
+}
+
+/*
+ * Returns the page offset into bdev for the specified page's swap entry.
+ */
+sector_t map_swap_page(struct page *page, struct block_device **bdev)
+{
+ swp_entry_t entry;
+ entry.val = page_private(page);
+ return map_swap_entry(entry, bdev);
+}
+
+/*
+ * Free all of a swapdev's extent information
+ */
+static void destroy_swap_extents(struct swap_info_struct *sis)
+{
+ while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) {
+ struct rb_node *rb = sis->swap_extent_root.rb_node;
+ struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node);
+
+ rb_erase(rb, &sis->swap_extent_root);
+ kfree(se);
+ }
+
+ if (sis->flags & SWP_ACTIVATED) {
+ struct file *swap_file = sis->swap_file;
+ struct address_space *mapping = swap_file->f_mapping;
+
+ sis->flags &= ~SWP_ACTIVATED;
+ if (mapping->a_ops->swap_deactivate)
+ mapping->a_ops->swap_deactivate(swap_file);
+ }
+}
+
+/*
+ * Add a block range (and the corresponding page range) into this swapdev's
+ * extent tree.
+ *
+ * This function rather assumes that it is called in ascending page order.
+ */
+int
+add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
+ unsigned long nr_pages, sector_t start_block)
+{
+ struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL;
+ struct swap_extent *se;
+ struct swap_extent *new_se;
+
+ /*
+ * place the new node at the right most since the
+ * function is called in ascending page order.
+ */
+ while (*link) {
+ parent = *link;
+ link = &parent->rb_right;
+ }
+
+ if (parent) {
+ se = rb_entry(parent, struct swap_extent, rb_node);
+ BUG_ON(se->start_page + se->nr_pages != start_page);
+ if (se->start_block + se->nr_pages == start_block) {
+ /* Merge it */
+ se->nr_pages += nr_pages;
+ return 0;
+ }
+ }
+
+ /* No merge, insert a new extent. */
+ new_se = kmalloc(sizeof(*se), GFP_KERNEL);
+ if (new_se == NULL)
+ return -ENOMEM;
+ new_se->start_page = start_page;
+ new_se->nr_pages = nr_pages;
+ new_se->start_block = start_block;
+
+ rb_link_node(&new_se->rb_node, parent, link);
+ rb_insert_color(&new_se->rb_node, &sis->swap_extent_root);
+ return 1;
+}
+EXPORT_SYMBOL_GPL(add_swap_extent);
+
+/*
+ * A `swap extent' is a simple thing which maps a contiguous range of pages
+ * onto a contiguous range of disk blocks. An ordered list of swap extents
+ * is built at swapon time and is then used at swap_writepage/swap_readpage
+ * time for locating where on disk a page belongs.
+ *
+ * If the swapfile is an S_ISBLK block device, a single extent is installed.
+ * This is done so that the main operating code can treat S_ISBLK and S_ISREG
+ * swap files identically.
+ *
+ * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
+ * extent list operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK
+ * swapfiles are handled *identically* after swapon time.
+ *
+ * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
+ * and will parse them into an ordered extent list, in PAGE_SIZE chunks. If
+ * some stray blocks are found which do not fall within the PAGE_SIZE alignment
+ * requirements, they are simply tossed out - we will never use those blocks
+ * for swapping.
+ *
+ * For all swap devices we set S_SWAPFILE across the life of the swapon. This
+ * prevents users from writing to the swap device, which will corrupt memory.
+ *
+ * The amount of disk space which a single swap extent represents varies.
+ * Typically it is in the 1-4 megabyte range. So we can have hundreds of
+ * extents in the list. To avoid much list walking, we cache the previous
+ * search location in `curr_swap_extent', and start new searches from there.
+ * This is extremely effective. The average number of iterations in
+ * map_swap_page() has been measured at about 0.3 per page. - akpm.
+ */
+static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
+{
+ struct file *swap_file = sis->swap_file;
+ struct address_space *mapping = swap_file->f_mapping;
+ struct inode *inode = mapping->host;
+ int ret;
+
+ if (S_ISBLK(inode->i_mode)) {
+ ret = add_swap_extent(sis, 0, sis->max, 0);
+ *span = sis->pages;
+ return ret;
+ }
+
+ if (mapping->a_ops->swap_activate) {
+ ret = mapping->a_ops->swap_activate(sis, swap_file, span);
+ if (ret >= 0)
+ sis->flags |= SWP_ACTIVATED;
+ if (!ret) {
+ sis->flags |= SWP_FS_OPS;
+ ret = add_swap_extent(sis, 0, sis->max, 0);
+ *span = sis->pages;
+ }
+ return ret;
+ }
+
+ return generic_swapfile_activate(sis, swap_file, span);
+}
+
+static int swap_node(struct swap_info_struct *p)
+{
+ struct block_device *bdev;
+
+ if (p->bdev)
+ bdev = p->bdev;
+ else
+ bdev = p->swap_file->f_inode->i_sb->s_bdev;
+
+ return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
+}
+
+static void setup_swap_info(struct swap_info_struct *p, int prio,
+ unsigned char *swap_map,
+ struct swap_cluster_info *cluster_info)
+{
+ int i;
+
+ if (prio >= 0)
+ p->prio = prio;
+ else
+ p->prio = --least_priority;
+ /*
+ * the plist prio is negated because plist ordering is
+ * low-to-high, while swap ordering is high-to-low
+ */
+ p->list.prio = -p->prio;
+ for_each_node(i) {
+ if (p->prio >= 0)
+ p->avail_lists[i].prio = -p->prio;
+ else {
+ if (swap_node(p) == i)
+ p->avail_lists[i].prio = 1;
+ else
+ p->avail_lists[i].prio = -p->prio;
+ }
+ }
+ p->swap_map = swap_map;
+ p->cluster_info = cluster_info;
+}
+
+static void _enable_swap_info(struct swap_info_struct *p)
+{
+ p->flags |= SWP_WRITEOK | SWP_VALID;
+ atomic_long_add(p->pages, &nr_swap_pages);
+ total_swap_pages += p->pages;
+
+ assert_spin_locked(&swap_lock);
+ /*
+ * both lists are plists, and thus priority ordered.
+ * swap_active_head needs to be priority ordered for swapoff(),
+ * which on removal of any swap_info_struct with an auto-assigned
+ * (i.e. negative) priority increments the auto-assigned priority
+ * of any lower-priority swap_info_structs.
+ * swap_avail_head needs to be priority ordered for get_swap_page(),
+ * which allocates swap pages from the highest available priority
+ * swap_info_struct.
+ */
+ plist_add(&p->list, &swap_active_head);
+ add_to_avail_list(p);
+}
+
+static void enable_swap_info(struct swap_info_struct *p, int prio,
+ unsigned char *swap_map,
+ struct swap_cluster_info *cluster_info,
+ unsigned long *frontswap_map)
+{
+ frontswap_init(p->type, frontswap_map);
+ spin_lock(&swap_lock);
+ spin_lock(&p->lock);
+ setup_swap_info(p, prio, swap_map, cluster_info);
+ spin_unlock(&p->lock);
+ spin_unlock(&swap_lock);
+ /*
+ * Guarantee swap_map, cluster_info, etc. fields are valid
+ * between get/put_swap_device() if SWP_VALID bit is set
+ */
+ synchronize_rcu();
+ spin_lock(&swap_lock);
+ spin_lock(&p->lock);
+ _enable_swap_info(p);
+ spin_unlock(&p->lock);
+ spin_unlock(&swap_lock);
+}
+
+static void reinsert_swap_info(struct swap_info_struct *p)
+{
+ spin_lock(&swap_lock);
+ spin_lock(&p->lock);
+ setup_swap_info(p, p->prio, p->swap_map, p->cluster_info);
+ _enable_swap_info(p);
+ spin_unlock(&p->lock);
+ spin_unlock(&swap_lock);
+}
+
+bool has_usable_swap(void)
+{
+ bool ret = true;
+
+ spin_lock(&swap_lock);
+ if (plist_head_empty(&swap_active_head))
+ ret = false;
+ spin_unlock(&swap_lock);
+ return ret;
+}
+
+SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
+{
+ struct swap_info_struct *p = NULL;
+ unsigned char *swap_map;
+ struct swap_cluster_info *cluster_info;
+ unsigned long *frontswap_map;
+ struct file *swap_file, *victim;
+ struct address_space *mapping;
+ struct inode *inode;
+ struct filename *pathname;
+ int err, found = 0;
+ unsigned int old_block_size;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ BUG_ON(!current->mm);
+
+ pathname = getname(specialfile);
+ if (IS_ERR(pathname))
+ return PTR_ERR(pathname);
+
+ victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
+ err = PTR_ERR(victim);
+ if (IS_ERR(victim))
+ goto out;
+
+ mapping = victim->f_mapping;
+ spin_lock(&swap_lock);
+ plist_for_each_entry(p, &swap_active_head, list) {
+ if (p->flags & SWP_WRITEOK) {
+ if (p->swap_file->f_mapping == mapping) {
+ found = 1;
+ break;
+ }
+ }
+ }
+ if (!found) {
+ err = -EINVAL;
+ spin_unlock(&swap_lock);
+ goto out_dput;
+ }
+ if (!security_vm_enough_memory_mm(current->mm, p->pages))
+ vm_unacct_memory(p->pages);
+ else {
+ err = -ENOMEM;
+ spin_unlock(&swap_lock);
+ goto out_dput;
+ }
+ spin_lock(&p->lock);
+ del_from_avail_list(p);
+ if (p->prio < 0) {
+ struct swap_info_struct *si = p;
+ int nid;
+
+ plist_for_each_entry_continue(si, &swap_active_head, list) {
+ si->prio++;
+ si->list.prio--;
+ for_each_node(nid) {
+ if (si->avail_lists[nid].prio != 1)
+ si->avail_lists[nid].prio--;
+ }
+ }
+ least_priority++;
+ }
+ plist_del(&p->list, &swap_active_head);
+ atomic_long_sub(p->pages, &nr_swap_pages);
+ total_swap_pages -= p->pages;
+ p->flags &= ~SWP_WRITEOK;
+ spin_unlock(&p->lock);
+ spin_unlock(&swap_lock);
+
+ disable_swap_slots_cache_lock();
+
+ set_current_oom_origin();
+ err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
+ clear_current_oom_origin();
+
+ if (err) {
+ /* re-insert swap space back into swap_list */
+ reinsert_swap_info(p);
+ reenable_swap_slots_cache_unlock();
+ goto out_dput;
+ }
+
+ reenable_swap_slots_cache_unlock();
+
+ spin_lock(&swap_lock);
+ spin_lock(&p->lock);
+ p->flags &= ~SWP_VALID; /* mark swap device as invalid */
+ spin_unlock(&p->lock);
+ spin_unlock(&swap_lock);
+ /*
+ * wait for swap operations protected by get/put_swap_device()
+ * to complete
+ */
+ synchronize_rcu();
+
+ flush_work(&p->discard_work);
+
+ destroy_swap_extents(p);
+ if (p->flags & SWP_CONTINUED)
+ free_swap_count_continuations(p);
+
+ if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev)))
+ atomic_dec(&nr_rotate_swap);
+
+ mutex_lock(&swapon_mutex);
+ spin_lock(&swap_lock);
+ spin_lock(&p->lock);
+ drain_mmlist();
+
+ /* wait for anyone still in scan_swap_map */
+ p->highest_bit = 0; /* cuts scans short */
+ while (p->flags >= SWP_SCANNING) {
+ spin_unlock(&p->lock);
+ spin_unlock(&swap_lock);
+ schedule_timeout_uninterruptible(1);
+ spin_lock(&swap_lock);
+ spin_lock(&p->lock);
+ }
+
+ swap_file = p->swap_file;
+ old_block_size = p->old_block_size;
+ p->swap_file = NULL;
+ p->max = 0;
+ swap_map = p->swap_map;
+ p->swap_map = NULL;
+ cluster_info = p->cluster_info;
+ p->cluster_info = NULL;
+ frontswap_map = frontswap_map_get(p);
+ spin_unlock(&p->lock);
+ spin_unlock(&swap_lock);
+ arch_swap_invalidate_area(p->type);
+ frontswap_invalidate_area(p->type);
+ frontswap_map_set(p, NULL);
+ mutex_unlock(&swapon_mutex);
+ free_percpu(p->percpu_cluster);
+ p->percpu_cluster = NULL;
+ free_percpu(p->cluster_next_cpu);
+ p->cluster_next_cpu = NULL;
+ vfree(swap_map);
+ kvfree(cluster_info);
+ kvfree(frontswap_map);
+ /* Destroy swap account information */
+ swap_cgroup_swapoff(p->type);
+ exit_swap_address_space(p->type);
+
+ inode = mapping->host;
+ if (S_ISBLK(inode->i_mode)) {
+ struct block_device *bdev = I_BDEV(inode);
+
+ set_blocksize(bdev, old_block_size);
+ blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+ }
+
+ inode_lock(inode);
+ inode->i_flags &= ~S_SWAPFILE;
+ inode_unlock(inode);
+ filp_close(swap_file, NULL);
+
+ /*
+ * Clear the SWP_USED flag after all resources are freed so that swapon
+ * can reuse this swap_info in alloc_swap_info() safely. It is ok to
+ * not hold p->lock after we cleared its SWP_WRITEOK.
+ */
+ spin_lock(&swap_lock);
+ p->flags = 0;
+ spin_unlock(&swap_lock);
+
+ err = 0;
+ atomic_inc(&proc_poll_event);
+ wake_up_interruptible(&proc_poll_wait);
+
+out_dput:
+ filp_close(victim, NULL);
+out:
+ putname(pathname);
+ return err;
+}
+
+#ifdef CONFIG_PROC_FS
+static __poll_t swaps_poll(struct file *file, poll_table *wait)
+{
+ struct seq_file *seq = file->private_data;
+
+ poll_wait(file, &proc_poll_wait, wait);
+
+ if (seq->poll_event != atomic_read(&proc_poll_event)) {
+ seq->poll_event = atomic_read(&proc_poll_event);
+ return EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI;
+ }
+
+ return EPOLLIN | EPOLLRDNORM;
+}
+
+/* iterator */
+static void *swap_start(struct seq_file *swap, loff_t *pos)
+{
+ struct swap_info_struct *si;
+ int type;
+ loff_t l = *pos;
+
+ mutex_lock(&swapon_mutex);
+
+ if (!l)
+ return SEQ_START_TOKEN;
+
+ for (type = 0; (si = swap_type_to_swap_info(type)); type++) {
+ if (!(si->flags & SWP_USED) || !si->swap_map)
+ continue;
+ if (!--l)
+ return si;
+ }
+
+ return NULL;
+}
+
+static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
+{
+ struct swap_info_struct *si = v;
+ int type;
+
+ if (v == SEQ_START_TOKEN)
+ type = 0;
+ else
+ type = si->type + 1;
+
+ ++(*pos);
+ for (; (si = swap_type_to_swap_info(type)); type++) {
+ if (!(si->flags & SWP_USED) || !si->swap_map)
+ continue;
+ return si;
+ }
+
+ return NULL;
+}
+
+static void swap_stop(struct seq_file *swap, void *v)
+{
+ mutex_unlock(&swapon_mutex);
+}
+
+static int swap_show(struct seq_file *swap, void *v)
+{
+ struct swap_info_struct *si = v;
+ struct file *file;
+ int len;
+ unsigned int bytes, inuse;
+
+ if (si == SEQ_START_TOKEN) {
+ seq_puts(swap,"Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
+ return 0;
+ }
+
+ bytes = si->pages << (PAGE_SHIFT - 10);
+ inuse = si->inuse_pages << (PAGE_SHIFT - 10);
+
+ file = si->swap_file;
+ len = seq_file_path(swap, file, " \t\n\\");
+ seq_printf(swap, "%*s%s\t%u\t%s%u\t%s%d\n",
+ len < 40 ? 40 - len : 1, " ",
+ S_ISBLK(file_inode(file)->i_mode) ?
+ "partition" : "file\t",
+ bytes, bytes < 10000000 ? "\t" : "",
+ inuse, inuse < 10000000 ? "\t" : "",
+ si->prio);
+ return 0;
+}
+
+static const struct seq_operations swaps_op = {
+ .start = swap_start,
+ .next = swap_next,
+ .stop = swap_stop,
+ .show = swap_show
+};
+
+static int swaps_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ int ret;
+
+ ret = seq_open(file, &swaps_op);
+ if (ret)
+ return ret;
+
+ seq = file->private_data;
+ seq->poll_event = atomic_read(&proc_poll_event);
+ return 0;
+}
+
+static const struct proc_ops swaps_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
+ .proc_open = swaps_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release,
+ .proc_poll = swaps_poll,
+};
+
+static int __init procswaps_init(void)
+{
+ proc_create("swaps", 0, NULL, &swaps_proc_ops);
+ return 0;
+}
+__initcall(procswaps_init);
+#endif /* CONFIG_PROC_FS */
+
+#ifdef MAX_SWAPFILES_CHECK
+static int __init max_swapfiles_check(void)
+{
+ MAX_SWAPFILES_CHECK();
+ return 0;
+}
+late_initcall(max_swapfiles_check);
+#endif
+
+static struct swap_info_struct *alloc_swap_info(void)
+{
+ struct swap_info_struct *p;
+ struct swap_info_struct *defer = NULL;
+ unsigned int type;
+ int i;
+
+ p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL);
+ if (!p)
+ return ERR_PTR(-ENOMEM);
+
+ spin_lock(&swap_lock);
+ for (type = 0; type < nr_swapfiles; type++) {
+ if (!(swap_info[type]->flags & SWP_USED))
+ break;
+ }
+ if (type >= MAX_SWAPFILES) {
+ spin_unlock(&swap_lock);
+ kvfree(p);
+ return ERR_PTR(-EPERM);
+ }
+ if (type >= nr_swapfiles) {
+ p->type = type;
+ WRITE_ONCE(swap_info[type], p);
+ /*
+ * Write swap_info[type] before nr_swapfiles, in case a
+ * racing procfs swap_start() or swap_next() is reading them.
+ * (We never shrink nr_swapfiles, we never free this entry.)
+ */
+ smp_wmb();
+ WRITE_ONCE(nr_swapfiles, nr_swapfiles + 1);
+ } else {
+ defer = p;
+ p = swap_info[type];
+ /*
+ * Do not memset this entry: a racing procfs swap_next()
+ * would be relying on p->type to remain valid.
+ */
+ }
+ p->swap_extent_root = RB_ROOT;
+ plist_node_init(&p->list, 0);
+ for_each_node(i)
+ plist_node_init(&p->avail_lists[i], 0);
+ p->flags = SWP_USED;
+ spin_unlock(&swap_lock);
+ kvfree(defer);
+ spin_lock_init(&p->lock);
+ spin_lock_init(&p->cont_lock);
+
+ return p;
+}
+
+static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
+{
+ int error;
+
+ if (S_ISBLK(inode->i_mode)) {
+ p->bdev = blkdev_get_by_dev(inode->i_rdev,
+ FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
+ if (IS_ERR(p->bdev)) {
+ error = PTR_ERR(p->bdev);
+ p->bdev = NULL;
+ return error;
+ }
+ p->old_block_size = block_size(p->bdev);
+ error = set_blocksize(p->bdev, PAGE_SIZE);
+ if (error < 0)
+ return error;
+ /*
+ * Zoned block devices contain zones that have a sequential
+ * write only restriction. Hence zoned block devices are not
+ * suitable for swapping. Disallow them here.
+ */
+ if (blk_queue_is_zoned(p->bdev->bd_disk->queue))
+ return -EINVAL;
+ p->flags |= SWP_BLKDEV;
+ } else if (S_ISREG(inode->i_mode)) {
+ p->bdev = inode->i_sb->s_bdev;
+ }
+
+ return 0;
+}
+
+
+/*
+ * Find out how many pages are allowed for a single swap device. There
+ * are two limiting factors:
+ * 1) the number of bits for the swap offset in the swp_entry_t type, and
+ * 2) the number of bits in the swap pte, as defined by the different
+ * architectures.
+ *
+ * In order to find the largest possible bit mask, a swap entry with
+ * swap type 0 and swap offset ~0UL is created, encoded to a swap pte,
+ * decoded to a swp_entry_t again, and finally the swap offset is
+ * extracted.
+ *
+ * This will mask all the bits from the initial ~0UL mask that can't
+ * be encoded in either the swp_entry_t or the architecture definition
+ * of a swap pte.
+ */
+unsigned long generic_max_swapfile_size(void)
+{
+ return swp_offset(pte_to_swp_entry(
+ swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
+}
+
+/* Can be overridden by an architecture for additional checks. */
+__weak unsigned long max_swapfile_size(void)
+{
+ return generic_max_swapfile_size();
+}
+
+static unsigned long read_swap_header(struct swap_info_struct *p,
+ union swap_header *swap_header,
+ struct inode *inode)
+{
+ int i;
+ unsigned long maxpages;
+ unsigned long swapfilepages;
+ unsigned long last_page;
+
+ if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
+ pr_err("Unable to find swap-space signature\n");
+ return 0;
+ }
+
+ /* swap partition endianess hack... */
+ if (swab32(swap_header->info.version) == 1) {
+ swab32s(&swap_header->info.version);
+ swab32s(&swap_header->info.last_page);
+ swab32s(&swap_header->info.nr_badpages);
+ if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
+ return 0;
+ for (i = 0; i < swap_header->info.nr_badpages; i++)
+ swab32s(&swap_header->info.badpages[i]);
+ }
+ /* Check the swap header's sub-version */
+ if (swap_header->info.version != 1) {
+ pr_warn("Unable to handle swap header version %d\n",
+ swap_header->info.version);
+ return 0;
+ }
+
+ p->lowest_bit = 1;
+ p->cluster_next = 1;
+ p->cluster_nr = 0;
+
+ maxpages = max_swapfile_size();
+ last_page = swap_header->info.last_page;
+ if (!last_page) {
+ pr_warn("Empty swap-file\n");
+ return 0;
+ }
+ if (last_page > maxpages) {
+ pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
+ maxpages << (PAGE_SHIFT - 10),
+ last_page << (PAGE_SHIFT - 10));
+ }
+ if (maxpages > last_page) {
+ maxpages = last_page + 1;
+ /* p->max is an unsigned int: don't overflow it */
+ if ((unsigned int)maxpages == 0)
+ maxpages = UINT_MAX;
+ }
+ p->highest_bit = maxpages - 1;
+
+ if (!maxpages)
+ return 0;
+ swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
+ if (swapfilepages && maxpages > swapfilepages) {
+ pr_warn("Swap area shorter than signature indicates\n");
+ return 0;
+ }
+ if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
+ return 0;
+ if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
+ return 0;
+
+ return maxpages;
+}
+
+#define SWAP_CLUSTER_INFO_COLS \
+ DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
+#define SWAP_CLUSTER_SPACE_COLS \
+ DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
+#define SWAP_CLUSTER_COLS \
+ max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
+
+static int setup_swap_map_and_extents(struct swap_info_struct *p,
+ union swap_header *swap_header,
+ unsigned char *swap_map,
+ struct swap_cluster_info *cluster_info,
+ unsigned long maxpages,
+ sector_t *span)
+{
+ unsigned int j, k;
+ unsigned int nr_good_pages;
+ int nr_extents;
+ unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
+ unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
+ unsigned long i, idx;
+
+ nr_good_pages = maxpages - 1; /* omit header page */
+
+ cluster_list_init(&p->free_clusters);
+ cluster_list_init(&p->discard_clusters);
+
+ for (i = 0; i < swap_header->info.nr_badpages; i++) {
+ unsigned int page_nr = swap_header->info.badpages[i];
+ if (page_nr == 0 || page_nr > swap_header->info.last_page)
+ return -EINVAL;
+ if (page_nr < maxpages) {
+ swap_map[page_nr] = SWAP_MAP_BAD;
+ nr_good_pages--;
+ /*
+ * Haven't marked the cluster free yet, no list
+ * operation involved
+ */
+ inc_cluster_info_page(p, cluster_info, page_nr);
+ }
+ }
+
+ /* Haven't marked the cluster free yet, no list operation involved */
+ for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
+ inc_cluster_info_page(p, cluster_info, i);
+
+ if (nr_good_pages) {
+ swap_map[0] = SWAP_MAP_BAD;
+ /*
+ * Not mark the cluster free yet, no list
+ * operation involved
+ */
+ inc_cluster_info_page(p, cluster_info, 0);
+ p->max = maxpages;
+ p->pages = nr_good_pages;
+ nr_extents = setup_swap_extents(p, span);
+ if (nr_extents < 0)
+ return nr_extents;
+ nr_good_pages = p->pages;
+ }
+ if (!nr_good_pages) {
+ pr_warn("Empty swap-file\n");
+ return -EINVAL;
+ }
+
+ if (!cluster_info)
+ return nr_extents;
+
+
+ /*
+ * Reduce false cache line sharing between cluster_info and
+ * sharing same address space.
+ */
+ for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
+ j = (k + col) % SWAP_CLUSTER_COLS;
+ for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
+ idx = i * SWAP_CLUSTER_COLS + j;
+ if (idx >= nr_clusters)
+ continue;
+ if (cluster_count(&cluster_info[idx]))
+ continue;
+ cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
+ cluster_list_add_tail(&p->free_clusters, cluster_info,
+ idx);
+ }
+ }
+ return nr_extents;
+}
+
+/*
+ * Helper to sys_swapon determining if a given swap
+ * backing device queue supports DISCARD operations.
+ */
+static bool swap_discardable(struct swap_info_struct *si)
+{
+ struct request_queue *q = bdev_get_queue(si->bdev);
+
+ if (!q || !blk_queue_discard(q))
+ return false;
+
+ return true;
+}
+
+SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
+{
+ struct swap_info_struct *p;
+ struct filename *name;
+ struct file *swap_file = NULL;
+ struct address_space *mapping;
+ int prio;
+ int error;
+ union swap_header *swap_header;
+ int nr_extents;
+ sector_t span;
+ unsigned long maxpages;
+ unsigned char *swap_map = NULL;
+ struct swap_cluster_info *cluster_info = NULL;
+ unsigned long *frontswap_map = NULL;
+ struct page *page = NULL;
+ struct inode *inode = NULL;
+ bool inced_nr_rotate_swap = false;
+
+ if (swap_flags & ~SWAP_FLAGS_VALID)
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!swap_avail_heads)
+ return -ENOMEM;
+
+ p = alloc_swap_info();
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ INIT_WORK(&p->discard_work, swap_discard_work);
+
+ name = getname(specialfile);
+ if (IS_ERR(name)) {
+ error = PTR_ERR(name);
+ name = NULL;
+ goto bad_swap;
+ }
+ swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0);
+ if (IS_ERR(swap_file)) {
+ error = PTR_ERR(swap_file);
+ swap_file = NULL;
+ goto bad_swap;
+ }
+
+ p->swap_file = swap_file;
+ mapping = swap_file->f_mapping;
+ inode = mapping->host;
+
+ error = claim_swapfile(p, inode);
+ if (unlikely(error))
+ goto bad_swap;
+
+ inode_lock(inode);
+ if (IS_SWAPFILE(inode)) {
+ error = -EBUSY;
+ goto bad_swap_unlock_inode;
+ }
+
+ /*
+ * Read the swap header.
+ */
+ if (!mapping->a_ops->readpage) {
+ error = -EINVAL;
+ goto bad_swap_unlock_inode;
+ }
+ page = read_mapping_page(mapping, 0, swap_file);
+ if (IS_ERR(page)) {
+ error = PTR_ERR(page);
+ goto bad_swap_unlock_inode;
+ }
+ swap_header = kmap(page);
+
+ maxpages = read_swap_header(p, swap_header, inode);
+ if (unlikely(!maxpages)) {
+ error = -EINVAL;
+ goto bad_swap_unlock_inode;
+ }
+
+ /* OK, set up the swap map and apply the bad block list */
+ swap_map = vzalloc(maxpages);
+ if (!swap_map) {
+ error = -ENOMEM;
+ goto bad_swap_unlock_inode;
+ }
+
+ if (p->bdev && blk_queue_stable_writes(p->bdev->bd_disk->queue))
+ p->flags |= SWP_STABLE_WRITES;
+
+ if (p->bdev && p->bdev->bd_disk->fops->rw_page)
+ p->flags |= SWP_SYNCHRONOUS_IO;
+
+ if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
+ int cpu;
+ unsigned long ci, nr_cluster;
+
+ p->flags |= SWP_SOLIDSTATE;
+ p->cluster_next_cpu = alloc_percpu(unsigned int);
+ if (!p->cluster_next_cpu) {
+ error = -ENOMEM;
+ goto bad_swap_unlock_inode;
+ }
+ /*
+ * select a random position to start with to help wear leveling
+ * SSD
+ */
+ for_each_possible_cpu(cpu) {
+ per_cpu(*p->cluster_next_cpu, cpu) =
+ 1 + prandom_u32_max(p->highest_bit);
+ }
+ nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
+
+ cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info),
+ GFP_KERNEL);
+ if (!cluster_info) {
+ error = -ENOMEM;
+ goto bad_swap_unlock_inode;
+ }
+
+ for (ci = 0; ci < nr_cluster; ci++)
+ spin_lock_init(&((cluster_info + ci)->lock));
+
+ p->percpu_cluster = alloc_percpu(struct percpu_cluster);
+ if (!p->percpu_cluster) {
+ error = -ENOMEM;
+ goto bad_swap_unlock_inode;
+ }
+ for_each_possible_cpu(cpu) {
+ struct percpu_cluster *cluster;
+ cluster = per_cpu_ptr(p->percpu_cluster, cpu);
+ cluster_set_null(&cluster->index);
+ }
+ } else {
+ atomic_inc(&nr_rotate_swap);
+ inced_nr_rotate_swap = true;
+ }
+
+ error = swap_cgroup_swapon(p->type, maxpages);
+ if (error)
+ goto bad_swap_unlock_inode;
+
+ nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
+ cluster_info, maxpages, &span);
+ if (unlikely(nr_extents < 0)) {
+ error = nr_extents;
+ goto bad_swap_unlock_inode;
+ }
+ /* frontswap enabled? set up bit-per-page map for frontswap */
+ if (IS_ENABLED(CONFIG_FRONTSWAP))
+ frontswap_map = kvcalloc(BITS_TO_LONGS(maxpages),
+ sizeof(long),
+ GFP_KERNEL);
+
+ if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
+ /*
+ * When discard is enabled for swap with no particular
+ * policy flagged, we set all swap discard flags here in
+ * order to sustain backward compatibility with older
+ * swapon(8) releases.
+ */
+ p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
+ SWP_PAGE_DISCARD);
+
+ /*
+ * By flagging sys_swapon, a sysadmin can tell us to
+ * either do single-time area discards only, or to just
+ * perform discards for released swap page-clusters.
+ * Now it's time to adjust the p->flags accordingly.
+ */
+ if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
+ p->flags &= ~SWP_PAGE_DISCARD;
+ else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
+ p->flags &= ~SWP_AREA_DISCARD;
+
+ /* issue a swapon-time discard if it's still required */
+ if (p->flags & SWP_AREA_DISCARD) {
+ int err = discard_swap(p);
+ if (unlikely(err))
+ pr_err("swapon: discard_swap(%p): %d\n",
+ p, err);
+ }
+ }
+
+ error = init_swap_address_space(p->type, maxpages);
+ if (error)
+ goto bad_swap_unlock_inode;
+
+ /*
+ * Flush any pending IO and dirty mappings before we start using this
+ * swap device.
+ */
+ inode->i_flags |= S_SWAPFILE;
+ error = inode_drain_writes(inode);
+ if (error) {
+ inode->i_flags &= ~S_SWAPFILE;
+ goto free_swap_address_space;
+ }
+
+ mutex_lock(&swapon_mutex);
+ prio = -1;
+ if (swap_flags & SWAP_FLAG_PREFER)
+ prio =
+ (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
+ enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);
+
+ pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
+ p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
+ nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
+ (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
+ (p->flags & SWP_DISCARDABLE) ? "D" : "",
+ (p->flags & SWP_AREA_DISCARD) ? "s" : "",
+ (p->flags & SWP_PAGE_DISCARD) ? "c" : "",
+ (frontswap_map) ? "FS" : "");
+
+ mutex_unlock(&swapon_mutex);
+ atomic_inc(&proc_poll_event);
+ wake_up_interruptible(&proc_poll_wait);
+
+ error = 0;
+ goto out;
+free_swap_address_space:
+ exit_swap_address_space(p->type);
+bad_swap_unlock_inode:
+ inode_unlock(inode);
+bad_swap:
+ free_percpu(p->percpu_cluster);
+ p->percpu_cluster = NULL;
+ free_percpu(p->cluster_next_cpu);
+ p->cluster_next_cpu = NULL;
+ if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
+ set_blocksize(p->bdev, p->old_block_size);
+ blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+ }
+ inode = NULL;
+ destroy_swap_extents(p);
+ swap_cgroup_swapoff(p->type);
+ spin_lock(&swap_lock);
+ p->swap_file = NULL;
+ p->flags = 0;
+ spin_unlock(&swap_lock);
+ vfree(swap_map);
+ kvfree(cluster_info);
+ kvfree(frontswap_map);
+ if (inced_nr_rotate_swap)
+ atomic_dec(&nr_rotate_swap);
+ if (swap_file)
+ filp_close(swap_file, NULL);
+out:
+ if (page && !IS_ERR(page)) {
+ kunmap(page);
+ put_page(page);
+ }
+ if (name)
+ putname(name);
+ if (inode)
+ inode_unlock(inode);
+ if (!error)
+ enable_swap_slots_cache();
+ return error;
+}
+
+void si_swapinfo(struct sysinfo *val)
+{
+ unsigned int type;
+ unsigned long nr_to_be_unused = 0;
+
+ spin_lock(&swap_lock);
+ for (type = 0; type < nr_swapfiles; type++) {
+ struct swap_info_struct *si = swap_info[type];
+
+ if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
+ nr_to_be_unused += si->inuse_pages;
+ }
+ val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
+ val->totalswap = total_swap_pages + nr_to_be_unused;
+ spin_unlock(&swap_lock);
+}
+
+/*
+ * Verify that a swap entry is valid and increment its swap map count.
+ *
+ * Returns error code in following case.
+ * - success -> 0
+ * - swp_entry is invalid -> EINVAL
+ * - swp_entry is migration entry -> EINVAL
+ * - swap-cache reference is requested but there is already one. -> EEXIST
+ * - swap-cache reference is requested but the entry is not used. -> ENOENT
+ * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
+ */
+static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
+{
+ struct swap_info_struct *p;
+ struct swap_cluster_info *ci;
+ unsigned long offset;
+ unsigned char count;
+ unsigned char has_cache;
+ int err = -EINVAL;
+
+ p = get_swap_device(entry);
+ if (!p)
+ goto out;
+
+ offset = swp_offset(entry);
+ ci = lock_cluster_or_swap_info(p, offset);
+
+ count = p->swap_map[offset];
+
+ /*
+ * swapin_readahead() doesn't check if a swap entry is valid, so the
+ * swap entry could be SWAP_MAP_BAD. Check here with lock held.
+ */
+ if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
+ err = -ENOENT;
+ goto unlock_out;
+ }
+
+ has_cache = count & SWAP_HAS_CACHE;
+ count &= ~SWAP_HAS_CACHE;
+ err = 0;
+
+ if (usage == SWAP_HAS_CACHE) {
+
+ /* set SWAP_HAS_CACHE if there is no cache and entry is used */
+ if (!has_cache && count)
+ has_cache = SWAP_HAS_CACHE;
+ else if (has_cache) /* someone else added cache */
+ err = -EEXIST;
+ else /* no users remaining */
+ err = -ENOENT;
+
+ } else if (count || has_cache) {
+
+ if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
+ count += usage;
+ else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
+ err = -EINVAL;
+ else if (swap_count_continued(p, offset, count))
+ count = COUNT_CONTINUED;
+ else
+ err = -ENOMEM;
+ } else
+ err = -ENOENT; /* unused swap entry */
+
+ WRITE_ONCE(p->swap_map[offset], count | has_cache);
+
+unlock_out:
+ unlock_cluster_or_swap_info(p, ci);
+out:
+ if (p)
+ put_swap_device(p);
+ return err;
+}
+
+/*
+ * Help swapoff by noting that swap entry belongs to shmem/tmpfs
+ * (in which case its reference count is never incremented).
+ */
+void swap_shmem_alloc(swp_entry_t entry)
+{
+ __swap_duplicate(entry, SWAP_MAP_SHMEM);
+}
+
+/*
+ * Increase reference count of swap entry by 1.
+ * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
+ * but could not be atomically allocated. Returns 0, just as if it succeeded,
+ * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
+ * might occur if a page table entry has got corrupted.
+ */
+int swap_duplicate(swp_entry_t entry)
+{
+ int err = 0;
+
+ while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
+ err = add_swap_count_continuation(entry, GFP_ATOMIC);
+ return err;
+}
+
+/*
+ * @entry: swap entry for which we allocate swap cache.
+ *
+ * Called when allocating swap cache for existing swap entry,
+ * This can return error codes. Returns 0 at success.
+ * -EEXIST means there is a swap cache.
+ * Note: return code is different from swap_duplicate().
+ */
+int swapcache_prepare(swp_entry_t entry)
+{
+ return __swap_duplicate(entry, SWAP_HAS_CACHE);
+}
+
+struct swap_info_struct *swp_swap_info(swp_entry_t entry)
+{
+ return swap_type_to_swap_info(swp_type(entry));
+}
+
+struct swap_info_struct *page_swap_info(struct page *page)
+{
+ swp_entry_t entry = { .val = page_private(page) };
+ return swp_swap_info(entry);
+}
+
+/*
+ * out-of-line __page_file_ methods to avoid include hell.
+ */
+struct address_space *__page_file_mapping(struct page *page)
+{
+ return page_swap_info(page)->swap_file->f_mapping;
+}
+EXPORT_SYMBOL_GPL(__page_file_mapping);
+
+pgoff_t __page_file_index(struct page *page)
+{
+ swp_entry_t swap = { .val = page_private(page) };
+ return swp_offset(swap);
+}
+EXPORT_SYMBOL_GPL(__page_file_index);
+
+/*
+ * add_swap_count_continuation - called when a swap count is duplicated
+ * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
+ * page of the original vmalloc'ed swap_map, to hold the continuation count
+ * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called
+ * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
+ *
+ * These continuation pages are seldom referenced: the common paths all work
+ * on the original swap_map, only referring to a continuation page when the
+ * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
+ *
+ * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
+ * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
+ * can be called after dropping locks.
+ */
+int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
+{
+ struct swap_info_struct *si;
+ struct swap_cluster_info *ci;
+ struct page *head;
+ struct page *page;
+ struct page *list_page;
+ pgoff_t offset;
+ unsigned char count;
+ int ret = 0;
+
+ /*
+ * When debugging, it's easier to use __GFP_ZERO here; but it's better
+ * for latency not to zero a page while GFP_ATOMIC and holding locks.
+ */
+ page = alloc_page(gfp_mask | __GFP_HIGHMEM);
+
+ si = get_swap_device(entry);
+ if (!si) {
+ /*
+ * An acceptable race has occurred since the failing
+ * __swap_duplicate(): the swap device may be swapoff
+ */
+ goto outer;
+ }
+ spin_lock(&si->lock);
+
+ offset = swp_offset(entry);
+
+ ci = lock_cluster(si, offset);
+
+ count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
+
+ if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
+ /*
+ * The higher the swap count, the more likely it is that tasks
+ * will race to add swap count continuation: we need to avoid
+ * over-provisioning.
+ */
+ goto out;
+ }
+
+ if (!page) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * We are fortunate that although vmalloc_to_page uses pte_offset_map,
+ * no architecture is using highmem pages for kernel page tables: so it
+ * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps.
+ */
+ head = vmalloc_to_page(si->swap_map + offset);
+ offset &= ~PAGE_MASK;
+
+ spin_lock(&si->cont_lock);
+ /*
+ * Page allocation does not initialize the page's lru field,
+ * but it does always reset its private field.
+ */
+ if (!page_private(head)) {
+ BUG_ON(count & COUNT_CONTINUED);
+ INIT_LIST_HEAD(&head->lru);
+ set_page_private(head, SWP_CONTINUED);
+ si->flags |= SWP_CONTINUED;
+ }
+
+ list_for_each_entry(list_page, &head->lru, lru) {
+ unsigned char *map;
+
+ /*
+ * If the previous map said no continuation, but we've found
+ * a continuation page, free our allocation and use this one.
+ */
+ if (!(count & COUNT_CONTINUED))
+ goto out_unlock_cont;
+
+ map = kmap_atomic(list_page) + offset;
+ count = *map;
+ kunmap_atomic(map);
+
+ /*
+ * If this continuation count now has some space in it,
+ * free our allocation and use this one.
+ */
+ if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
+ goto out_unlock_cont;
+ }
+
+ list_add_tail(&page->lru, &head->lru);
+ page = NULL; /* now it's attached, don't free it */
+out_unlock_cont:
+ spin_unlock(&si->cont_lock);
+out:
+ unlock_cluster(ci);
+ spin_unlock(&si->lock);
+ put_swap_device(si);
+outer:
+ if (page)
+ __free_page(page);
+ return ret;
+}
+
+/*
+ * swap_count_continued - when the original swap_map count is incremented
+ * from SWAP_MAP_MAX, check if there is already a continuation page to carry
+ * into, carry if so, or else fail until a new continuation page is allocated;
+ * when the original swap_map count is decremented from 0 with continuation,
+ * borrow from the continuation and report whether it still holds more.
+ * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster
+ * lock.
+ */
+static bool swap_count_continued(struct swap_info_struct *si,
+ pgoff_t offset, unsigned char count)
+{
+ struct page *head;
+ struct page *page;
+ unsigned char *map;
+ bool ret;
+
+ head = vmalloc_to_page(si->swap_map + offset);
+ if (page_private(head) != SWP_CONTINUED) {
+ BUG_ON(count & COUNT_CONTINUED);
+ return false; /* need to add count continuation */
+ }
+
+ spin_lock(&si->cont_lock);
+ offset &= ~PAGE_MASK;
+ page = list_next_entry(head, lru);
+ map = kmap_atomic(page) + offset;
+
+ if (count == SWAP_MAP_MAX) /* initial increment from swap_map */
+ goto init_map; /* jump over SWAP_CONT_MAX checks */
+
+ if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
+ /*
+ * Think of how you add 1 to 999
+ */
+ while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
+ kunmap_atomic(map);
+ page = list_next_entry(page, lru);
+ BUG_ON(page == head);
+ map = kmap_atomic(page) + offset;
+ }
+ if (*map == SWAP_CONT_MAX) {
+ kunmap_atomic(map);
+ page = list_next_entry(page, lru);
+ if (page == head) {
+ ret = false; /* add count continuation */
+ goto out;
+ }
+ map = kmap_atomic(page) + offset;
+init_map: *map = 0; /* we didn't zero the page */
+ }
+ *map += 1;
+ kunmap_atomic(map);
+ while ((page = list_prev_entry(page, lru)) != head) {
+ map = kmap_atomic(page) + offset;
+ *map = COUNT_CONTINUED;
+ kunmap_atomic(map);
+ }
+ ret = true; /* incremented */
+
+ } else { /* decrementing */
+ /*
+ * Think of how you subtract 1 from 1000
+ */
+ BUG_ON(count != COUNT_CONTINUED);
+ while (*map == COUNT_CONTINUED) {
+ kunmap_atomic(map);
+ page = list_next_entry(page, lru);
+ BUG_ON(page == head);
+ map = kmap_atomic(page) + offset;
+ }
+ BUG_ON(*map == 0);
+ *map -= 1;
+ if (*map == 0)
+ count = 0;
+ kunmap_atomic(map);
+ while ((page = list_prev_entry(page, lru)) != head) {
+ map = kmap_atomic(page) + offset;
+ *map = SWAP_CONT_MAX | count;
+ count = COUNT_CONTINUED;
+ kunmap_atomic(map);
+ }
+ ret = count == COUNT_CONTINUED;
+ }
+out:
+ spin_unlock(&si->cont_lock);
+ return ret;
+}
+
+/*
+ * free_swap_count_continuations - swapoff free all the continuation pages
+ * appended to the swap_map, after swap_map is quiesced, before vfree'ing it.
+ */
+static void free_swap_count_continuations(struct swap_info_struct *si)
+{
+ pgoff_t offset;
+
+ for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
+ struct page *head;
+ head = vmalloc_to_page(si->swap_map + offset);
+ if (page_private(head)) {
+ struct page *page, *next;
+
+ list_for_each_entry_safe(page, next, &head->lru, lru) {
+ list_del(&page->lru);
+ __free_page(page);
+ }
+ }
+ }
+}
+
+#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
+void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
+{
+ struct swap_info_struct *si, *next;
+ int nid = page_to_nid(page);
+
+ if (!(gfp_mask & __GFP_IO))
+ return;
+
+ if (!blk_cgroup_congested())
+ return;
+
+ /*
+ * We've already scheduled a throttle, avoid taking the global swap
+ * lock.
+ */
+ if (current->throttle_queue)
+ return;
+
+ spin_lock(&swap_avail_lock);
+ plist_for_each_entry_safe(si, next, &swap_avail_heads[nid],
+ avail_lists[nid]) {
+ if (si->bdev) {
+ blkcg_schedule_throttle(bdev_get_queue(si->bdev), true);
+ break;
+ }
+ }
+ spin_unlock(&swap_avail_lock);
+}
+#endif
+
+static int __init swapfile_init(void)
+{
+ int nid;
+
+ swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
+ GFP_KERNEL);
+ if (!swap_avail_heads) {
+ pr_emerg("Not enough memory for swap heads, swap is disabled\n");
+ return -ENOMEM;
+ }
+
+ for_each_node(nid)
+ plist_head_init(&swap_avail_heads[nid]);
+
+ return 0;
+}
+subsys_initcall(swapfile_init);
diff --git a/mm/truncate.c b/mm/truncate.c
new file mode 100644
index 000000000..8914ca4ce
--- /dev/null
+++ b/mm/truncate.c
@@ -0,0 +1,950 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * mm/truncate.c - code for taking down pages from address_spaces
+ *
+ * Copyright (C) 2002, Linus Torvalds
+ *
+ * 10Sep2002 Andrew Morton
+ * Initial version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/backing-dev.h>
+#include <linux/dax.h>
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/export.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/pagevec.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/buffer_head.h> /* grr. try_to_release_page,
+ do_invalidatepage */
+#include <linux/shmem_fs.h>
+#include <linux/cleancache.h>
+#include <linux/rmap.h>
+#include "internal.h"
+
+/*
+ * Regular page slots are stabilized by the page lock even without the tree
+ * itself locked. These unlocked entries need verification under the tree
+ * lock.
+ */
+static inline void __clear_shadow_entry(struct address_space *mapping,
+ pgoff_t index, void *entry)
+{
+ XA_STATE(xas, &mapping->i_pages, index);
+
+ xas_set_update(&xas, workingset_update_node);
+ if (xas_load(&xas) != entry)
+ return;
+ xas_store(&xas, NULL);
+ mapping->nrexceptional--;
+}
+
+static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
+ void *entry)
+{
+ xa_lock_irq(&mapping->i_pages);
+ __clear_shadow_entry(mapping, index, entry);
+ xa_unlock_irq(&mapping->i_pages);
+}
+
+/*
+ * Unconditionally remove exceptional entries. Usually called from truncate
+ * path. Note that the pagevec may be altered by this function by removing
+ * exceptional entries similar to what pagevec_remove_exceptionals does.
+ */
+static void truncate_exceptional_pvec_entries(struct address_space *mapping,
+ struct pagevec *pvec, pgoff_t *indices,
+ pgoff_t end)
+{
+ int i, j;
+ bool dax, lock;
+
+ /* Handled by shmem itself */
+ if (shmem_mapping(mapping))
+ return;
+
+ for (j = 0; j < pagevec_count(pvec); j++)
+ if (xa_is_value(pvec->pages[j]))
+ break;
+
+ if (j == pagevec_count(pvec))
+ return;
+
+ dax = dax_mapping(mapping);
+ lock = !dax && indices[j] < end;
+ if (lock)
+ xa_lock_irq(&mapping->i_pages);
+
+ for (i = j; i < pagevec_count(pvec); i++) {
+ struct page *page = pvec->pages[i];
+ pgoff_t index = indices[i];
+
+ if (!xa_is_value(page)) {
+ pvec->pages[j++] = page;
+ continue;
+ }
+
+ if (index >= end)
+ continue;
+
+ if (unlikely(dax)) {
+ dax_delete_mapping_entry(mapping, index);
+ continue;
+ }
+
+ __clear_shadow_entry(mapping, index, page);
+ }
+
+ if (lock)
+ xa_unlock_irq(&mapping->i_pages);
+ pvec->nr = j;
+}
+
+/*
+ * Invalidate exceptional entry if easily possible. This handles exceptional
+ * entries for invalidate_inode_pages().
+ */
+static int invalidate_exceptional_entry(struct address_space *mapping,
+ pgoff_t index, void *entry)
+{
+ /* Handled by shmem itself, or for DAX we do nothing. */
+ if (shmem_mapping(mapping) || dax_mapping(mapping))
+ return 1;
+ clear_shadow_entry(mapping, index, entry);
+ return 1;
+}
+
+/*
+ * Invalidate exceptional entry if clean. This handles exceptional entries for
+ * invalidate_inode_pages2() so for DAX it evicts only clean entries.
+ */
+static int invalidate_exceptional_entry2(struct address_space *mapping,
+ pgoff_t index, void *entry)
+{
+ /* Handled by shmem itself */
+ if (shmem_mapping(mapping))
+ return 1;
+ if (dax_mapping(mapping))
+ return dax_invalidate_mapping_entry_sync(mapping, index);
+ clear_shadow_entry(mapping, index, entry);
+ return 1;
+}
+
+/**
+ * do_invalidatepage - invalidate part or all of a page
+ * @page: the page which is affected
+ * @offset: start of the range to invalidate
+ * @length: length of the range to invalidate
+ *
+ * do_invalidatepage() is called when all or part of the page has become
+ * invalidated by a truncate operation.
+ *
+ * do_invalidatepage() does not have to release all buffers, but it must
+ * ensure that no dirty buffer is left outside @offset and that no I/O
+ * is underway against any of the blocks which are outside the truncation
+ * point. Because the caller is about to free (and possibly reuse) those
+ * blocks on-disk.
+ */
+void do_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
+{
+ void (*invalidatepage)(struct page *, unsigned int, unsigned int);
+
+ invalidatepage = page->mapping->a_ops->invalidatepage;
+#ifdef CONFIG_BLOCK
+ if (!invalidatepage)
+ invalidatepage = block_invalidatepage;
+#endif
+ if (invalidatepage)
+ (*invalidatepage)(page, offset, length);
+}
+
+/*
+ * If truncate cannot remove the fs-private metadata from the page, the page
+ * becomes orphaned. It will be left on the LRU and may even be mapped into
+ * user pagetables if we're racing with filemap_fault().
+ *
+ * We need to bail out if page->mapping is no longer equal to the original
+ * mapping. This happens a) when the VM reclaimed the page while we waited on
+ * its lock, b) when a concurrent invalidate_mapping_pages got there first and
+ * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
+ */
+static void truncate_cleanup_page(struct page *page)
+{
+ if (page_mapped(page))
+ unmap_mapping_page(page);
+
+ if (page_has_private(page))
+ do_invalidatepage(page, 0, thp_size(page));
+
+ /*
+ * Some filesystems seem to re-dirty the page even after
+ * the VM has canceled the dirty bit (eg ext3 journaling).
+ * Hence dirty accounting check is placed after invalidation.
+ */
+ cancel_dirty_page(page);
+ ClearPageMappedToDisk(page);
+}
+
+/*
+ * This is for invalidate_mapping_pages(). That function can be called at
+ * any time, and is not supposed to throw away dirty pages. But pages can
+ * be marked dirty at any time too, so use remove_mapping which safely
+ * discards clean, unused pages.
+ *
+ * Returns non-zero if the page was successfully invalidated.
+ */
+static int
+invalidate_complete_page(struct address_space *mapping, struct page *page)
+{
+ int ret;
+
+ if (page->mapping != mapping)
+ return 0;
+
+ if (page_has_private(page) && !try_to_release_page(page, 0))
+ return 0;
+
+ ret = remove_mapping(mapping, page);
+
+ return ret;
+}
+
+int truncate_inode_page(struct address_space *mapping, struct page *page)
+{
+ VM_BUG_ON_PAGE(PageTail(page), page);
+
+ if (page->mapping != mapping)
+ return -EIO;
+
+ truncate_cleanup_page(page);
+ delete_from_page_cache(page);
+ return 0;
+}
+
+/*
+ * Used to get rid of pages on hardware memory corruption.
+ */
+int generic_error_remove_page(struct address_space *mapping, struct page *page)
+{
+ if (!mapping)
+ return -EINVAL;
+ /*
+ * Only punch for normal data pages for now.
+ * Handling other types like directories would need more auditing.
+ */
+ if (!S_ISREG(mapping->host->i_mode))
+ return -EIO;
+ return truncate_inode_page(mapping, page);
+}
+EXPORT_SYMBOL(generic_error_remove_page);
+
+/*
+ * Safely invalidate one page from its pagecache mapping.
+ * It only drops clean, unused pages. The page must be locked.
+ *
+ * Returns 1 if the page is successfully invalidated, otherwise 0.
+ */
+int invalidate_inode_page(struct page *page)
+{
+ struct address_space *mapping = page_mapping(page);
+ if (!mapping)
+ return 0;
+ if (PageDirty(page) || PageWriteback(page))
+ return 0;
+ if (page_mapped(page))
+ return 0;
+ return invalidate_complete_page(mapping, page);
+}
+
+/**
+ * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets
+ * @mapping: mapping to truncate
+ * @lstart: offset from which to truncate
+ * @lend: offset to which to truncate (inclusive)
+ *
+ * Truncate the page cache, removing the pages that are between
+ * specified offsets (and zeroing out partial pages
+ * if lstart or lend + 1 is not page aligned).
+ *
+ * Truncate takes two passes - the first pass is nonblocking. It will not
+ * block on page locks and it will not block on writeback. The second pass
+ * will wait. This is to prevent as much IO as possible in the affected region.
+ * The first pass will remove most pages, so the search cost of the second pass
+ * is low.
+ *
+ * We pass down the cache-hot hint to the page freeing code. Even if the
+ * mapping is large, it is probably the case that the final pages are the most
+ * recently touched, and freeing happens in ascending file offset order.
+ *
+ * Note that since ->invalidatepage() accepts range to invalidate
+ * truncate_inode_pages_range is able to handle cases where lend + 1 is not
+ * page aligned properly.
+ */
+void truncate_inode_pages_range(struct address_space *mapping,
+ loff_t lstart, loff_t lend)
+{
+ pgoff_t start; /* inclusive */
+ pgoff_t end; /* exclusive */
+ unsigned int partial_start; /* inclusive */
+ unsigned int partial_end; /* exclusive */
+ struct pagevec pvec;
+ pgoff_t indices[PAGEVEC_SIZE];
+ pgoff_t index;
+ int i;
+
+ if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
+ goto out;
+
+ /* Offsets within partial pages */
+ partial_start = lstart & (PAGE_SIZE - 1);
+ partial_end = (lend + 1) & (PAGE_SIZE - 1);
+
+ /*
+ * 'start' and 'end' always covers the range of pages to be fully
+ * truncated. Partial pages are covered with 'partial_start' at the
+ * start of the range and 'partial_end' at the end of the range.
+ * Note that 'end' is exclusive while 'lend' is inclusive.
+ */
+ start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (lend == -1)
+ /*
+ * lend == -1 indicates end-of-file so we have to set 'end'
+ * to the highest possible pgoff_t and since the type is
+ * unsigned we're using -1.
+ */
+ end = -1;
+ else
+ end = (lend + 1) >> PAGE_SHIFT;
+
+ pagevec_init(&pvec);
+ index = start;
+ while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE),
+ indices)) {
+ /*
+ * Pagevec array has exceptional entries and we may also fail
+ * to lock some pages. So we store pages that can be deleted
+ * in a new pagevec.
+ */
+ struct pagevec locked_pvec;
+
+ pagevec_init(&locked_pvec);
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
+
+ /* We rely upon deletion not changing page->index */
+ index = indices[i];
+ if (index >= end)
+ break;
+
+ if (xa_is_value(page))
+ continue;
+
+ if (!trylock_page(page))
+ continue;
+ WARN_ON(page_to_index(page) != index);
+ if (PageWriteback(page)) {
+ unlock_page(page);
+ continue;
+ }
+ if (page->mapping != mapping) {
+ unlock_page(page);
+ continue;
+ }
+ pagevec_add(&locked_pvec, page);
+ }
+ for (i = 0; i < pagevec_count(&locked_pvec); i++)
+ truncate_cleanup_page(locked_pvec.pages[i]);
+ delete_from_page_cache_batch(mapping, &locked_pvec);
+ for (i = 0; i < pagevec_count(&locked_pvec); i++)
+ unlock_page(locked_pvec.pages[i]);
+ truncate_exceptional_pvec_entries(mapping, &pvec, indices, end);
+ pagevec_release(&pvec);
+ cond_resched();
+ index++;
+ }
+ if (partial_start) {
+ struct page *page = find_lock_page(mapping, start - 1);
+ if (page) {
+ unsigned int top = PAGE_SIZE;
+ if (start > end) {
+ /* Truncation within a single page */
+ top = partial_end;
+ partial_end = 0;
+ }
+ wait_on_page_writeback(page);
+ zero_user_segment(page, partial_start, top);
+ cleancache_invalidate_page(mapping, page);
+ if (page_has_private(page))
+ do_invalidatepage(page, partial_start,
+ top - partial_start);
+ unlock_page(page);
+ put_page(page);
+ }
+ }
+ if (partial_end) {
+ struct page *page = find_lock_page(mapping, end);
+ if (page) {
+ wait_on_page_writeback(page);
+ zero_user_segment(page, 0, partial_end);
+ cleancache_invalidate_page(mapping, page);
+ if (page_has_private(page))
+ do_invalidatepage(page, 0,
+ partial_end);
+ unlock_page(page);
+ put_page(page);
+ }
+ }
+ /*
+ * If the truncation happened within a single page no pages
+ * will be released, just zeroed, so we can bail out now.
+ */
+ if (start >= end)
+ goto out;
+
+ index = start;
+ for ( ; ; ) {
+ cond_resched();
+ if (!pagevec_lookup_entries(&pvec, mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) {
+ /* If all gone from start onwards, we're done */
+ if (index == start)
+ break;
+ /* Otherwise restart to make sure all gone */
+ index = start;
+ continue;
+ }
+ if (index == start && indices[0] >= end) {
+ /* All gone out of hole to be punched, we're done */
+ pagevec_remove_exceptionals(&pvec);
+ pagevec_release(&pvec);
+ break;
+ }
+
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
+
+ /* We rely upon deletion not changing page->index */
+ index = indices[i];
+ if (index >= end) {
+ /* Restart punch to make sure all gone */
+ index = start - 1;
+ break;
+ }
+
+ if (xa_is_value(page))
+ continue;
+
+ lock_page(page);
+ WARN_ON(page_to_index(page) != index);
+ wait_on_page_writeback(page);
+ truncate_inode_page(mapping, page);
+ unlock_page(page);
+ }
+ truncate_exceptional_pvec_entries(mapping, &pvec, indices, end);
+ pagevec_release(&pvec);
+ index++;
+ }
+
+out:
+ cleancache_invalidate_inode(mapping);
+}
+EXPORT_SYMBOL(truncate_inode_pages_range);
+
+/**
+ * truncate_inode_pages - truncate *all* the pages from an offset
+ * @mapping: mapping to truncate
+ * @lstart: offset from which to truncate
+ *
+ * Called under (and serialised by) inode->i_mutex.
+ *
+ * Note: When this function returns, there can be a page in the process of
+ * deletion (inside __delete_from_page_cache()) in the specified range. Thus
+ * mapping->nrpages can be non-zero when this function returns even after
+ * truncation of the whole mapping.
+ */
+void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
+{
+ truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
+}
+EXPORT_SYMBOL(truncate_inode_pages);
+
+/**
+ * truncate_inode_pages_final - truncate *all* pages before inode dies
+ * @mapping: mapping to truncate
+ *
+ * Called under (and serialized by) inode->i_mutex.
+ *
+ * Filesystems have to use this in the .evict_inode path to inform the
+ * VM that this is the final truncate and the inode is going away.
+ */
+void truncate_inode_pages_final(struct address_space *mapping)
+{
+ unsigned long nrexceptional;
+ unsigned long nrpages;
+
+ /*
+ * Page reclaim can not participate in regular inode lifetime
+ * management (can't call iput()) and thus can race with the
+ * inode teardown. Tell it when the address space is exiting,
+ * so that it does not install eviction information after the
+ * final truncate has begun.
+ */
+ mapping_set_exiting(mapping);
+
+ /*
+ * When reclaim installs eviction entries, it increases
+ * nrexceptional first, then decreases nrpages. Make sure we see
+ * this in the right order or we might miss an entry.
+ */
+ nrpages = mapping->nrpages;
+ smp_rmb();
+ nrexceptional = mapping->nrexceptional;
+
+ if (nrpages || nrexceptional) {
+ /*
+ * As truncation uses a lockless tree lookup, cycle
+ * the tree lock to make sure any ongoing tree
+ * modification that does not see AS_EXITING is
+ * completed before starting the final truncate.
+ */
+ xa_lock_irq(&mapping->i_pages);
+ xa_unlock_irq(&mapping->i_pages);
+ }
+
+ /*
+ * Cleancache needs notification even if there are no pages or shadow
+ * entries.
+ */
+ truncate_inode_pages(mapping, 0);
+}
+EXPORT_SYMBOL(truncate_inode_pages_final);
+
+static unsigned long __invalidate_mapping_pages(struct address_space *mapping,
+ pgoff_t start, pgoff_t end, unsigned long *nr_pagevec)
+{
+ pgoff_t indices[PAGEVEC_SIZE];
+ struct pagevec pvec;
+ pgoff_t index = start;
+ unsigned long ret;
+ unsigned long count = 0;
+ int i;
+
+ pagevec_init(&pvec);
+ while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
+ indices)) {
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
+
+ /* We rely upon deletion not changing page->index */
+ index = indices[i];
+ if (index > end)
+ break;
+
+ if (xa_is_value(page)) {
+ invalidate_exceptional_entry(mapping, index,
+ page);
+ continue;
+ }
+
+ if (!trylock_page(page))
+ continue;
+
+ WARN_ON(page_to_index(page) != index);
+
+ /* Middle of THP: skip */
+ if (PageTransTail(page)) {
+ unlock_page(page);
+ continue;
+ } else if (PageTransHuge(page)) {
+ index += HPAGE_PMD_NR - 1;
+ i += HPAGE_PMD_NR - 1;
+ /*
+ * 'end' is in the middle of THP. Don't
+ * invalidate the page as the part outside of
+ * 'end' could be still useful.
+ */
+ if (index > end) {
+ unlock_page(page);
+ continue;
+ }
+
+ /* Take a pin outside pagevec */
+ get_page(page);
+
+ /*
+ * Drop extra pins before trying to invalidate
+ * the huge page.
+ */
+ pagevec_remove_exceptionals(&pvec);
+ pagevec_release(&pvec);
+ }
+
+ ret = invalidate_inode_page(page);
+ unlock_page(page);
+ /*
+ * Invalidation is a hint that the page is no longer
+ * of interest and try to speed up its reclaim.
+ */
+ if (!ret) {
+ deactivate_file_page(page);
+ /* It is likely on the pagevec of a remote CPU */
+ if (nr_pagevec)
+ (*nr_pagevec)++;
+ }
+
+ if (PageTransHuge(page))
+ put_page(page);
+ count += ret;
+ }
+ pagevec_remove_exceptionals(&pvec);
+ pagevec_release(&pvec);
+ cond_resched();
+ index++;
+ }
+ return count;
+}
+
+/**
+ * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
+ * @mapping: the address_space which holds the pages to invalidate
+ * @start: the offset 'from' which to invalidate
+ * @end: the offset 'to' which to invalidate (inclusive)
+ *
+ * This function only removes the unlocked pages, if you want to
+ * remove all the pages of one inode, you must call truncate_inode_pages.
+ *
+ * invalidate_mapping_pages() will not block on IO activity. It will not
+ * invalidate pages which are dirty, locked, under writeback or mapped into
+ * pagetables.
+ *
+ * Return: the number of the pages that were invalidated
+ */
+unsigned long invalidate_mapping_pages(struct address_space *mapping,
+ pgoff_t start, pgoff_t end)
+{
+ return __invalidate_mapping_pages(mapping, start, end, NULL);
+}
+EXPORT_SYMBOL(invalidate_mapping_pages);
+
+/**
+ * This helper is similar with the above one, except that it accounts for pages
+ * that are likely on a pagevec and count them in @nr_pagevec, which will used by
+ * the caller.
+ */
+void invalidate_mapping_pagevec(struct address_space *mapping,
+ pgoff_t start, pgoff_t end, unsigned long *nr_pagevec)
+{
+ __invalidate_mapping_pages(mapping, start, end, nr_pagevec);
+}
+
+/*
+ * This is like invalidate_complete_page(), except it ignores the page's
+ * refcount. We do this because invalidate_inode_pages2() needs stronger
+ * invalidation guarantees, and cannot afford to leave pages behind because
+ * shrink_page_list() has a temp ref on them, or because they're transiently
+ * sitting in the lru_cache_add() pagevecs.
+ */
+static int
+invalidate_complete_page2(struct address_space *mapping, struct page *page)
+{
+ unsigned long flags;
+
+ if (page->mapping != mapping)
+ return 0;
+
+ if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
+ return 0;
+
+ xa_lock_irqsave(&mapping->i_pages, flags);
+ if (PageDirty(page))
+ goto failed;
+
+ BUG_ON(page_has_private(page));
+ __delete_from_page_cache(page, NULL);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
+
+ if (mapping->a_ops->freepage)
+ mapping->a_ops->freepage(page);
+
+ put_page(page); /* pagecache ref */
+ return 1;
+failed:
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
+ return 0;
+}
+
+static int do_launder_page(struct address_space *mapping, struct page *page)
+{
+ if (!PageDirty(page))
+ return 0;
+ if (page->mapping != mapping || mapping->a_ops->launder_page == NULL)
+ return 0;
+ return mapping->a_ops->launder_page(page);
+}
+
+/**
+ * invalidate_inode_pages2_range - remove range of pages from an address_space
+ * @mapping: the address_space
+ * @start: the page offset 'from' which to invalidate
+ * @end: the page offset 'to' which to invalidate (inclusive)
+ *
+ * Any pages which are found to be mapped into pagetables are unmapped prior to
+ * invalidation.
+ *
+ * Return: -EBUSY if any pages could not be invalidated.
+ */
+int invalidate_inode_pages2_range(struct address_space *mapping,
+ pgoff_t start, pgoff_t end)
+{
+ pgoff_t indices[PAGEVEC_SIZE];
+ struct pagevec pvec;
+ pgoff_t index;
+ int i;
+ int ret = 0;
+ int ret2 = 0;
+ int did_range_unmap = 0;
+
+ if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
+ goto out;
+
+ pagevec_init(&pvec);
+ index = start;
+ while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
+ indices)) {
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
+
+ /* We rely upon deletion not changing page->index */
+ index = indices[i];
+ if (index > end)
+ break;
+
+ if (xa_is_value(page)) {
+ if (!invalidate_exceptional_entry2(mapping,
+ index, page))
+ ret = -EBUSY;
+ continue;
+ }
+
+ if (!did_range_unmap && page_mapped(page)) {
+ /*
+ * If page is mapped, before taking its lock,
+ * zap the rest of the file in one hit.
+ */
+ unmap_mapping_pages(mapping, index,
+ (1 + end - index), false);
+ did_range_unmap = 1;
+ }
+
+ lock_page(page);
+ WARN_ON(page_to_index(page) != index);
+ if (page->mapping != mapping) {
+ unlock_page(page);
+ continue;
+ }
+ wait_on_page_writeback(page);
+
+ if (page_mapped(page))
+ unmap_mapping_page(page);
+ BUG_ON(page_mapped(page));
+
+ ret2 = do_launder_page(mapping, page);
+ if (ret2 == 0) {
+ if (!invalidate_complete_page2(mapping, page))
+ ret2 = -EBUSY;
+ }
+ if (ret2 < 0)
+ ret = ret2;
+ unlock_page(page);
+ }
+ pagevec_remove_exceptionals(&pvec);
+ pagevec_release(&pvec);
+ cond_resched();
+ index++;
+ }
+ /*
+ * For DAX we invalidate page tables after invalidating page cache. We
+ * could invalidate page tables while invalidating each entry however
+ * that would be expensive. And doing range unmapping before doesn't
+ * work as we have no cheap way to find whether page cache entry didn't
+ * get remapped later.
+ */
+ if (dax_mapping(mapping)) {
+ unmap_mapping_pages(mapping, start, end - start + 1, false);
+ }
+out:
+ cleancache_invalidate_inode(mapping);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
+
+/**
+ * invalidate_inode_pages2 - remove all pages from an address_space
+ * @mapping: the address_space
+ *
+ * Any pages which are found to be mapped into pagetables are unmapped prior to
+ * invalidation.
+ *
+ * Return: -EBUSY if any pages could not be invalidated.
+ */
+int invalidate_inode_pages2(struct address_space *mapping)
+{
+ return invalidate_inode_pages2_range(mapping, 0, -1);
+}
+EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
+
+/**
+ * truncate_pagecache - unmap and remove pagecache that has been truncated
+ * @inode: inode
+ * @newsize: new file size
+ *
+ * inode's new i_size must already be written before truncate_pagecache
+ * is called.
+ *
+ * This function should typically be called before the filesystem
+ * releases resources associated with the freed range (eg. deallocates
+ * blocks). This way, pagecache will always stay logically coherent
+ * with on-disk format, and the filesystem would not have to deal with
+ * situations such as writepage being called for a page that has already
+ * had its underlying blocks deallocated.
+ */
+void truncate_pagecache(struct inode *inode, loff_t newsize)
+{
+ struct address_space *mapping = inode->i_mapping;
+ loff_t holebegin = round_up(newsize, PAGE_SIZE);
+
+ /*
+ * unmap_mapping_range is called twice, first simply for
+ * efficiency so that truncate_inode_pages does fewer
+ * single-page unmaps. However after this first call, and
+ * before truncate_inode_pages finishes, it is possible for
+ * private pages to be COWed, which remain after
+ * truncate_inode_pages finishes, hence the second
+ * unmap_mapping_range call must be made for correctness.
+ */
+ unmap_mapping_range(mapping, holebegin, 0, 1);
+ truncate_inode_pages(mapping, newsize);
+ unmap_mapping_range(mapping, holebegin, 0, 1);
+}
+EXPORT_SYMBOL(truncate_pagecache);
+
+/**
+ * truncate_setsize - update inode and pagecache for a new file size
+ * @inode: inode
+ * @newsize: new file size
+ *
+ * truncate_setsize updates i_size and performs pagecache truncation (if
+ * necessary) to @newsize. It will be typically be called from the filesystem's
+ * setattr function when ATTR_SIZE is passed in.
+ *
+ * Must be called with a lock serializing truncates and writes (generally
+ * i_mutex but e.g. xfs uses a different lock) and before all filesystem
+ * specific block truncation has been performed.
+ */
+void truncate_setsize(struct inode *inode, loff_t newsize)
+{
+ loff_t oldsize = inode->i_size;
+
+ i_size_write(inode, newsize);
+ if (newsize > oldsize)
+ pagecache_isize_extended(inode, oldsize, newsize);
+ truncate_pagecache(inode, newsize);
+}
+EXPORT_SYMBOL(truncate_setsize);
+
+/**
+ * pagecache_isize_extended - update pagecache after extension of i_size
+ * @inode: inode for which i_size was extended
+ * @from: original inode size
+ * @to: new inode size
+ *
+ * Handle extension of inode size either caused by extending truncate or by
+ * write starting after current i_size. We mark the page straddling current
+ * i_size RO so that page_mkwrite() is called on the nearest write access to
+ * the page. This way filesystem can be sure that page_mkwrite() is called on
+ * the page before user writes to the page via mmap after the i_size has been
+ * changed.
+ *
+ * The function must be called after i_size is updated so that page fault
+ * coming after we unlock the page will already see the new i_size.
+ * The function must be called while we still hold i_mutex - this not only
+ * makes sure i_size is stable but also that userspace cannot observe new
+ * i_size value before we are prepared to store mmap writes at new inode size.
+ */
+void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
+{
+ int bsize = i_blocksize(inode);
+ loff_t rounded_from;
+ struct page *page;
+ pgoff_t index;
+
+ WARN_ON(to > inode->i_size);
+
+ if (from >= to || bsize == PAGE_SIZE)
+ return;
+ /* Page straddling @from will not have any hole block created? */
+ rounded_from = round_up(from, bsize);
+ if (to <= rounded_from || !(rounded_from & (PAGE_SIZE - 1)))
+ return;
+
+ index = from >> PAGE_SHIFT;
+ page = find_lock_page(inode->i_mapping, index);
+ /* Page not cached? Nothing to do */
+ if (!page)
+ return;
+ /*
+ * See clear_page_dirty_for_io() for details why set_page_dirty()
+ * is needed.
+ */
+ if (page_mkclean(page))
+ set_page_dirty(page);
+ unlock_page(page);
+ put_page(page);
+}
+EXPORT_SYMBOL(pagecache_isize_extended);
+
+/**
+ * truncate_pagecache_range - unmap and remove pagecache that is hole-punched
+ * @inode: inode
+ * @lstart: offset of beginning of hole
+ * @lend: offset of last byte of hole
+ *
+ * This function should typically be called before the filesystem
+ * releases resources associated with the freed range (eg. deallocates
+ * blocks). This way, pagecache will always stay logically coherent
+ * with on-disk format, and the filesystem would not have to deal with
+ * situations such as writepage being called for a page that has already
+ * had its underlying blocks deallocated.
+ */
+void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend)
+{
+ struct address_space *mapping = inode->i_mapping;
+ loff_t unmap_start = round_up(lstart, PAGE_SIZE);
+ loff_t unmap_end = round_down(1 + lend, PAGE_SIZE) - 1;
+ /*
+ * This rounding is currently just for example: unmap_mapping_range
+ * expands its hole outwards, whereas we want it to contract the hole
+ * inwards. However, existing callers of truncate_pagecache_range are
+ * doing their own page rounding first. Note that unmap_mapping_range
+ * allows holelen 0 for all, and we allow lend -1 for end of file.
+ */
+
+ /*
+ * Unlike in truncate_pagecache, unmap_mapping_range is called only
+ * once (before truncating pagecache), and without "even_cows" flag:
+ * hole-punching should not remove private COWed pages from the hole.
+ */
+ if ((u64)unmap_end > (u64)unmap_start)
+ unmap_mapping_range(mapping, unmap_start,
+ 1 + unmap_end - unmap_start, 0);
+ truncate_inode_pages_range(mapping, lstart, lend);
+}
+EXPORT_SYMBOL(truncate_pagecache_range);
diff --git a/mm/usercopy.c b/mm/usercopy.c
new file mode 100644
index 000000000..540968b48
--- /dev/null
+++ b/mm/usercopy.c
@@ -0,0 +1,312 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * This implements the various checks for CONFIG_HARDENED_USERCOPY*,
+ * which are designed to protect kernel memory from needless exposure
+ * and overwrite under many unintended conditions. This code is based
+ * on PAX_USERCOPY, which is:
+ *
+ * Copyright (C) 2001-2016 PaX Team, Bradley Spengler, Open Source
+ * Security Inc.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
+#include <linux/thread_info.h>
+#include <linux/atomic.h>
+#include <linux/jump_label.h>
+#include <asm/sections.h>
+
+/*
+ * Checks if a given pointer and length is contained by the current
+ * stack frame (if possible).
+ *
+ * Returns:
+ * NOT_STACK: not at all on the stack
+ * GOOD_FRAME: fully within a valid stack frame
+ * GOOD_STACK: fully on the stack (when can't do frame-checking)
+ * BAD_STACK: error condition (invalid stack position or bad stack frame)
+ */
+static noinline int check_stack_object(const void *obj, unsigned long len)
+{
+ const void * const stack = task_stack_page(current);
+ const void * const stackend = stack + THREAD_SIZE;
+ int ret;
+
+ /* Object is not on the stack at all. */
+ if (obj + len <= stack || stackend <= obj)
+ return NOT_STACK;
+
+ /*
+ * Reject: object partially overlaps the stack (passing the
+ * check above means at least one end is within the stack,
+ * so if this check fails, the other end is outside the stack).
+ */
+ if (obj < stack || stackend < obj + len)
+ return BAD_STACK;
+
+ /* Check if object is safely within a valid frame. */
+ ret = arch_within_stack_frames(stack, stackend, obj, len);
+ if (ret)
+ return ret;
+
+ return GOOD_STACK;
+}
+
+/*
+ * If these functions are reached, then CONFIG_HARDENED_USERCOPY has found
+ * an unexpected state during a copy_from_user() or copy_to_user() call.
+ * There are several checks being performed on the buffer by the
+ * __check_object_size() function. Normal stack buffer usage should never
+ * trip the checks, and kernel text addressing will always trip the check.
+ * For cache objects, it is checking that only the whitelisted range of
+ * bytes for a given cache is being accessed (via the cache's usersize and
+ * useroffset fields). To adjust a cache whitelist, use the usercopy-aware
+ * kmem_cache_create_usercopy() function to create the cache (and
+ * carefully audit the whitelist range).
+ */
+void usercopy_warn(const char *name, const char *detail, bool to_user,
+ unsigned long offset, unsigned long len)
+{
+ WARN_ONCE(1, "Bad or missing usercopy whitelist? Kernel memory %s attempt detected %s %s%s%s%s (offset %lu, size %lu)!\n",
+ to_user ? "exposure" : "overwrite",
+ to_user ? "from" : "to",
+ name ? : "unknown?!",
+ detail ? " '" : "", detail ? : "", detail ? "'" : "",
+ offset, len);
+}
+
+void __noreturn usercopy_abort(const char *name, const char *detail,
+ bool to_user, unsigned long offset,
+ unsigned long len)
+{
+ pr_emerg("Kernel memory %s attempt detected %s %s%s%s%s (offset %lu, size %lu)!\n",
+ to_user ? "exposure" : "overwrite",
+ to_user ? "from" : "to",
+ name ? : "unknown?!",
+ detail ? " '" : "", detail ? : "", detail ? "'" : "",
+ offset, len);
+
+ /*
+ * For greater effect, it would be nice to do do_group_exit(),
+ * but BUG() actually hooks all the lock-breaking and per-arch
+ * Oops code, so that is used here instead.
+ */
+ BUG();
+}
+
+/* Returns true if any portion of [ptr,ptr+n) over laps with [low,high). */
+static bool overlaps(const unsigned long ptr, unsigned long n,
+ unsigned long low, unsigned long high)
+{
+ const unsigned long check_low = ptr;
+ unsigned long check_high = check_low + n;
+
+ /* Does not overlap if entirely above or entirely below. */
+ if (check_low >= high || check_high <= low)
+ return false;
+
+ return true;
+}
+
+/* Is this address range in the kernel text area? */
+static inline void check_kernel_text_object(const unsigned long ptr,
+ unsigned long n, bool to_user)
+{
+ unsigned long textlow = (unsigned long)_stext;
+ unsigned long texthigh = (unsigned long)_etext;
+ unsigned long textlow_linear, texthigh_linear;
+
+ if (overlaps(ptr, n, textlow, texthigh))
+ usercopy_abort("kernel text", NULL, to_user, ptr - textlow, n);
+
+ /*
+ * Some architectures have virtual memory mappings with a secondary
+ * mapping of the kernel text, i.e. there is more than one virtual
+ * kernel address that points to the kernel image. It is usually
+ * when there is a separate linear physical memory mapping, in that
+ * __pa() is not just the reverse of __va(). This can be detected
+ * and checked:
+ */
+ textlow_linear = (unsigned long)lm_alias(textlow);
+ /* No different mapping: we're done. */
+ if (textlow_linear == textlow)
+ return;
+
+ /* Check the secondary mapping... */
+ texthigh_linear = (unsigned long)lm_alias(texthigh);
+ if (overlaps(ptr, n, textlow_linear, texthigh_linear))
+ usercopy_abort("linear kernel text", NULL, to_user,
+ ptr - textlow_linear, n);
+}
+
+static inline void check_bogus_address(const unsigned long ptr, unsigned long n,
+ bool to_user)
+{
+ /* Reject if object wraps past end of memory. */
+ if (ptr + (n - 1) < ptr)
+ usercopy_abort("wrapped address", NULL, to_user, 0, ptr + n);
+
+ /* Reject if NULL or ZERO-allocation. */
+ if (ZERO_OR_NULL_PTR(ptr))
+ usercopy_abort("null address", NULL, to_user, ptr, n);
+}
+
+/* Checks for allocs that are marked in some way as spanning multiple pages. */
+static inline void check_page_span(const void *ptr, unsigned long n,
+ struct page *page, bool to_user)
+{
+#ifdef CONFIG_HARDENED_USERCOPY_PAGESPAN
+ const void *end = ptr + n - 1;
+ struct page *endpage;
+ bool is_reserved, is_cma;
+
+ /*
+ * Sometimes the kernel data regions are not marked Reserved (see
+ * check below). And sometimes [_sdata,_edata) does not cover
+ * rodata and/or bss, so check each range explicitly.
+ */
+
+ /* Allow reads of kernel rodata region (if not marked as Reserved). */
+ if (ptr >= (const void *)__start_rodata &&
+ end <= (const void *)__end_rodata) {
+ if (!to_user)
+ usercopy_abort("rodata", NULL, to_user, 0, n);
+ return;
+ }
+
+ /* Allow kernel data region (if not marked as Reserved). */
+ if (ptr >= (const void *)_sdata && end <= (const void *)_edata)
+ return;
+
+ /* Allow kernel bss region (if not marked as Reserved). */
+ if (ptr >= (const void *)__bss_start &&
+ end <= (const void *)__bss_stop)
+ return;
+
+ /* Is the object wholly within one base page? */
+ if (likely(((unsigned long)ptr & (unsigned long)PAGE_MASK) ==
+ ((unsigned long)end & (unsigned long)PAGE_MASK)))
+ return;
+
+ /* Allow if fully inside the same compound (__GFP_COMP) page. */
+ endpage = virt_to_head_page(end);
+ if (likely(endpage == page))
+ return;
+
+ /*
+ * Reject if range is entirely either Reserved (i.e. special or
+ * device memory), or CMA. Otherwise, reject since the object spans
+ * several independently allocated pages.
+ */
+ is_reserved = PageReserved(page);
+ is_cma = is_migrate_cma_page(page);
+ if (!is_reserved && !is_cma)
+ usercopy_abort("spans multiple pages", NULL, to_user, 0, n);
+
+ for (ptr += PAGE_SIZE; ptr <= end; ptr += PAGE_SIZE) {
+ page = virt_to_head_page(ptr);
+ if (is_reserved && !PageReserved(page))
+ usercopy_abort("spans Reserved and non-Reserved pages",
+ NULL, to_user, 0, n);
+ if (is_cma && !is_migrate_cma_page(page))
+ usercopy_abort("spans CMA and non-CMA pages", NULL,
+ to_user, 0, n);
+ }
+#endif
+}
+
+static inline void check_heap_object(const void *ptr, unsigned long n,
+ bool to_user)
+{
+ struct page *page;
+
+ if (!virt_addr_valid(ptr))
+ return;
+
+ /*
+ * When CONFIG_HIGHMEM=y, kmap_to_page() will give either the
+ * highmem page or fallback to virt_to_page(). The following
+ * is effectively a highmem-aware virt_to_head_page().
+ */
+ page = compound_head(kmap_to_page((void *)ptr));
+
+ if (PageSlab(page)) {
+ /* Check slab allocator for flags and size. */
+ __check_heap_object(ptr, n, page, to_user);
+ } else {
+ /* Verify object does not incorrectly span multiple pages. */
+ check_page_span(ptr, n, page, to_user);
+ }
+}
+
+static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks);
+
+/*
+ * Validates that the given object is:
+ * - not bogus address
+ * - fully contained by stack (or stack frame, when available)
+ * - fully within SLAB object (or object whitelist area, when available)
+ * - not in kernel text
+ */
+void __check_object_size(const void *ptr, unsigned long n, bool to_user)
+{
+ if (static_branch_unlikely(&bypass_usercopy_checks))
+ return;
+
+ /* Skip all tests if size is zero. */
+ if (!n)
+ return;
+
+ /* Check for invalid addresses. */
+ check_bogus_address((const unsigned long)ptr, n, to_user);
+
+ /* Check for bad stack object. */
+ switch (check_stack_object(ptr, n)) {
+ case NOT_STACK:
+ /* Object is not touching the current process stack. */
+ break;
+ case GOOD_FRAME:
+ case GOOD_STACK:
+ /*
+ * Object is either in the correct frame (when it
+ * is possible to check) or just generally on the
+ * process stack (when frame checking not available).
+ */
+ return;
+ default:
+ usercopy_abort("process stack", NULL, to_user, 0, n);
+ }
+
+ /* Check for bad heap object. */
+ check_heap_object(ptr, n, to_user);
+
+ /* Check for object in kernel to avoid text exposure. */
+ check_kernel_text_object((const unsigned long)ptr, n, to_user);
+}
+EXPORT_SYMBOL(__check_object_size);
+
+static bool enable_checks __initdata = true;
+
+static int __init parse_hardened_usercopy(char *str)
+{
+ if (strtobool(str, &enable_checks))
+ pr_warn("Invalid option string for hardened_usercopy: '%s'\n",
+ str);
+ return 1;
+}
+
+__setup("hardened_usercopy=", parse_hardened_usercopy);
+
+static int __init set_hardened_usercopy(void)
+{
+ if (enable_checks == false)
+ static_branch_enable(&bypass_usercopy_checks);
+ return 1;
+}
+
+late_initcall(set_hardened_usercopy);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
new file mode 100644
index 000000000..078d95cd3
--- /dev/null
+++ b/mm/userfaultfd.c
@@ -0,0 +1,694 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * mm/userfaultfd.c
+ *
+ * Copyright (C) 2015 Red Hat, Inc.
+ */
+
+#include <linux/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/userfaultfd_k.h>
+#include <linux/mmu_notifier.h>
+#include <linux/hugetlb.h>
+#include <linux/shmem_fs.h>
+#include <asm/tlbflush.h>
+#include "internal.h"
+
+static __always_inline
+struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
+ unsigned long dst_start,
+ unsigned long len)
+{
+ /*
+ * Make sure that the dst range is both valid and fully within a
+ * single existing vma.
+ */
+ struct vm_area_struct *dst_vma;
+
+ dst_vma = find_vma(dst_mm, dst_start);
+ if (!dst_vma)
+ return NULL;
+
+ if (dst_start < dst_vma->vm_start ||
+ dst_start + len > dst_vma->vm_end)
+ return NULL;
+
+ /*
+ * Check the vma is registered in uffd, this is required to
+ * enforce the VM_MAYWRITE check done at uffd registration
+ * time.
+ */
+ if (!dst_vma->vm_userfaultfd_ctx.ctx)
+ return NULL;
+
+ return dst_vma;
+}
+
+static int mcopy_atomic_pte(struct mm_struct *dst_mm,
+ pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ struct page **pagep,
+ bool wp_copy)
+{
+ pte_t _dst_pte, *dst_pte;
+ spinlock_t *ptl;
+ void *page_kaddr;
+ int ret;
+ struct page *page;
+ pgoff_t offset, max_off;
+ struct inode *inode;
+
+ if (!*pagep) {
+ ret = -ENOMEM;
+ page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr);
+ if (!page)
+ goto out;
+
+ page_kaddr = kmap_atomic(page);
+ ret = copy_from_user(page_kaddr,
+ (const void __user *) src_addr,
+ PAGE_SIZE);
+ kunmap_atomic(page_kaddr);
+
+ /* fallback to copy_from_user outside mmap_lock */
+ if (unlikely(ret)) {
+ ret = -ENOENT;
+ *pagep = page;
+ /* don't free the page */
+ goto out;
+ }
+
+ flush_dcache_page(page);
+ } else {
+ page = *pagep;
+ *pagep = NULL;
+ }
+
+ /*
+ * The memory barrier inside __SetPageUptodate makes sure that
+ * preceding stores to the page contents become visible before
+ * the set_pte_at() write.
+ */
+ __SetPageUptodate(page);
+
+ ret = -ENOMEM;
+ if (mem_cgroup_charge(page, dst_mm, GFP_KERNEL))
+ goto out_release;
+
+ _dst_pte = pte_mkdirty(mk_pte(page, dst_vma->vm_page_prot));
+ if (dst_vma->vm_flags & VM_WRITE) {
+ if (wp_copy)
+ _dst_pte = pte_mkuffd_wp(_dst_pte);
+ else
+ _dst_pte = pte_mkwrite(_dst_pte);
+ }
+
+ dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+ if (dst_vma->vm_file) {
+ /* the shmem MAP_PRIVATE case requires checking the i_size */
+ inode = dst_vma->vm_file->f_inode;
+ offset = linear_page_index(dst_vma, dst_addr);
+ max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ ret = -EFAULT;
+ if (unlikely(offset >= max_off))
+ goto out_release_uncharge_unlock;
+ }
+ ret = -EEXIST;
+ if (!pte_none(*dst_pte))
+ goto out_release_uncharge_unlock;
+
+ inc_mm_counter(dst_mm, MM_ANONPAGES);
+ page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
+ lru_cache_add_inactive_or_unevictable(page, dst_vma);
+
+ set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(dst_vma, dst_addr, dst_pte);
+
+ pte_unmap_unlock(dst_pte, ptl);
+ ret = 0;
+out:
+ return ret;
+out_release_uncharge_unlock:
+ pte_unmap_unlock(dst_pte, ptl);
+out_release:
+ put_page(page);
+ goto out;
+}
+
+static int mfill_zeropage_pte(struct mm_struct *dst_mm,
+ pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr)
+{
+ pte_t _dst_pte, *dst_pte;
+ spinlock_t *ptl;
+ int ret;
+ pgoff_t offset, max_off;
+ struct inode *inode;
+
+ _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
+ dst_vma->vm_page_prot));
+ dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+ if (dst_vma->vm_file) {
+ /* the shmem MAP_PRIVATE case requires checking the i_size */
+ inode = dst_vma->vm_file->f_inode;
+ offset = linear_page_index(dst_vma, dst_addr);
+ max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ ret = -EFAULT;
+ if (unlikely(offset >= max_off))
+ goto out_unlock;
+ }
+ ret = -EEXIST;
+ if (!pte_none(*dst_pte))
+ goto out_unlock;
+ set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(dst_vma, dst_addr, dst_pte);
+ ret = 0;
+out_unlock:
+ pte_unmap_unlock(dst_pte, ptl);
+ return ret;
+}
+
+static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+
+ pgd = pgd_offset(mm, address);
+ p4d = p4d_alloc(mm, pgd, address);
+ if (!p4d)
+ return NULL;
+ pud = pud_alloc(mm, p4d, address);
+ if (!pud)
+ return NULL;
+ /*
+ * Note that we didn't run this because the pmd was
+ * missing, the *pmd may be already established and in
+ * turn it may also be a trans_huge_pmd.
+ */
+ return pmd_alloc(mm, pud, address);
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * __mcopy_atomic processing for HUGETLB vmas. Note that this routine is
+ * called with mmap_lock held, it will release mmap_lock before returning.
+ */
+static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_start,
+ unsigned long src_start,
+ unsigned long len,
+ bool zeropage)
+{
+ int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
+ int vm_shared = dst_vma->vm_flags & VM_SHARED;
+ ssize_t err;
+ pte_t *dst_pte;
+ unsigned long src_addr, dst_addr;
+ long copied;
+ struct page *page;
+ unsigned long vma_hpagesize;
+ pgoff_t idx;
+ u32 hash;
+ struct address_space *mapping;
+
+ /*
+ * There is no default zero huge page for all huge page sizes as
+ * supported by hugetlb. A PMD_SIZE huge pages may exist as used
+ * by THP. Since we can not reliably insert a zero page, this
+ * feature is not supported.
+ */
+ if (zeropage) {
+ mmap_read_unlock(dst_mm);
+ return -EINVAL;
+ }
+
+ src_addr = src_start;
+ dst_addr = dst_start;
+ copied = 0;
+ page = NULL;
+ vma_hpagesize = vma_kernel_pagesize(dst_vma);
+
+ /*
+ * Validate alignment based on huge page size
+ */
+ err = -EINVAL;
+ if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
+ goto out_unlock;
+
+retry:
+ /*
+ * On routine entry dst_vma is set. If we had to drop mmap_lock and
+ * retry, dst_vma will be set to NULL and we must lookup again.
+ */
+ if (!dst_vma) {
+ err = -ENOENT;
+ dst_vma = find_dst_vma(dst_mm, dst_start, len);
+ if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
+ goto out_unlock;
+
+ err = -EINVAL;
+ if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
+ goto out_unlock;
+
+ vm_shared = dst_vma->vm_flags & VM_SHARED;
+ }
+
+ /*
+ * If not shared, ensure the dst_vma has a anon_vma.
+ */
+ err = -ENOMEM;
+ if (!vm_shared) {
+ if (unlikely(anon_vma_prepare(dst_vma)))
+ goto out_unlock;
+ }
+
+ while (src_addr < src_start + len) {
+ pte_t dst_pteval;
+
+ BUG_ON(dst_addr >= dst_start + len);
+
+ /*
+ * Serialize via i_mmap_rwsem and hugetlb_fault_mutex.
+ * i_mmap_rwsem ensures the dst_pte remains valid even
+ * in the case of shared pmds. fault mutex prevents
+ * races with other faulting threads.
+ */
+ mapping = dst_vma->vm_file->f_mapping;
+ i_mmap_lock_read(mapping);
+ idx = linear_page_index(dst_vma, dst_addr);
+ hash = hugetlb_fault_mutex_hash(mapping, idx);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+ err = -ENOMEM;
+ dst_pte = huge_pte_alloc(dst_mm, dst_addr, vma_hpagesize);
+ if (!dst_pte) {
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ i_mmap_unlock_read(mapping);
+ goto out_unlock;
+ }
+
+ err = -EEXIST;
+ dst_pteval = huge_ptep_get(dst_pte);
+ if (!huge_pte_none(dst_pteval)) {
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ i_mmap_unlock_read(mapping);
+ goto out_unlock;
+ }
+
+ err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
+ dst_addr, src_addr, &page);
+
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ i_mmap_unlock_read(mapping);
+ vm_alloc_shared = vm_shared;
+
+ cond_resched();
+
+ if (unlikely(err == -ENOENT)) {
+ mmap_read_unlock(dst_mm);
+ BUG_ON(!page);
+
+ err = copy_huge_page_from_user(page,
+ (const void __user *)src_addr,
+ vma_hpagesize / PAGE_SIZE,
+ true);
+ if (unlikely(err)) {
+ err = -EFAULT;
+ goto out;
+ }
+ mmap_read_lock(dst_mm);
+
+ dst_vma = NULL;
+ goto retry;
+ } else
+ BUG_ON(page);
+
+ if (!err) {
+ dst_addr += vma_hpagesize;
+ src_addr += vma_hpagesize;
+ copied += vma_hpagesize;
+
+ if (fatal_signal_pending(current))
+ err = -EINTR;
+ }
+ if (err)
+ break;
+ }
+
+out_unlock:
+ mmap_read_unlock(dst_mm);
+out:
+ if (page) {
+ /*
+ * We encountered an error and are about to free a newly
+ * allocated huge page.
+ *
+ * Reservation handling is very subtle, and is different for
+ * private and shared mappings. See the routine
+ * restore_reserve_on_error for details. Unfortunately, we
+ * can not call restore_reserve_on_error now as it would
+ * require holding mmap_lock.
+ *
+ * If a reservation for the page existed in the reservation
+ * map of a private mapping, the map was modified to indicate
+ * the reservation was consumed when the page was allocated.
+ * We clear the PagePrivate flag now so that the global
+ * reserve count will not be incremented in free_huge_page.
+ * The reservation map will still indicate the reservation
+ * was consumed and possibly prevent later page allocation.
+ * This is better than leaking a global reservation. If no
+ * reservation existed, it is still safe to clear PagePrivate
+ * as no adjustments to reservation counts were made during
+ * allocation.
+ *
+ * The reservation map for shared mappings indicates which
+ * pages have reservations. When a huge page is allocated
+ * for an address with a reservation, no change is made to
+ * the reserve map. In this case PagePrivate will be set
+ * to indicate that the global reservation count should be
+ * incremented when the page is freed. This is the desired
+ * behavior. However, when a huge page is allocated for an
+ * address without a reservation a reservation entry is added
+ * to the reservation map, and PagePrivate will not be set.
+ * When the page is freed, the global reserve count will NOT
+ * be incremented and it will appear as though we have leaked
+ * reserved page. In this case, set PagePrivate so that the
+ * global reserve count will be incremented to match the
+ * reservation map entry which was created.
+ *
+ * Note that vm_alloc_shared is based on the flags of the vma
+ * for which the page was originally allocated. dst_vma could
+ * be different or NULL on error.
+ */
+ if (vm_alloc_shared)
+ SetPagePrivate(page);
+ else
+ ClearPagePrivate(page);
+ put_page(page);
+ }
+ BUG_ON(copied < 0);
+ BUG_ON(err > 0);
+ BUG_ON(!copied && !err);
+ return copied ? copied : err;
+}
+#else /* !CONFIG_HUGETLB_PAGE */
+/* fail at build time if gcc attempts to use this */
+extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_start,
+ unsigned long src_start,
+ unsigned long len,
+ bool zeropage);
+#endif /* CONFIG_HUGETLB_PAGE */
+
+static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
+ pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ struct page **page,
+ bool zeropage,
+ bool wp_copy)
+{
+ ssize_t err;
+
+ /*
+ * The normal page fault path for a shmem will invoke the
+ * fault, fill the hole in the file and COW it right away. The
+ * result generates plain anonymous memory. So when we are
+ * asked to fill an hole in a MAP_PRIVATE shmem mapping, we'll
+ * generate anonymous memory directly without actually filling
+ * the hole. For the MAP_PRIVATE case the robustness check
+ * only happens in the pagetable (to verify it's still none)
+ * and not in the radix tree.
+ */
+ if (!(dst_vma->vm_flags & VM_SHARED)) {
+ if (!zeropage)
+ err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
+ dst_addr, src_addr, page,
+ wp_copy);
+ else
+ err = mfill_zeropage_pte(dst_mm, dst_pmd,
+ dst_vma, dst_addr);
+ } else {
+ VM_WARN_ON_ONCE(wp_copy);
+ if (!zeropage)
+ err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
+ dst_vma, dst_addr,
+ src_addr, page);
+ else
+ err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd,
+ dst_vma, dst_addr);
+ }
+
+ return err;
+}
+
+static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
+ unsigned long dst_start,
+ unsigned long src_start,
+ unsigned long len,
+ bool zeropage,
+ bool *mmap_changing,
+ __u64 mode)
+{
+ struct vm_area_struct *dst_vma;
+ ssize_t err;
+ pmd_t *dst_pmd;
+ unsigned long src_addr, dst_addr;
+ long copied;
+ struct page *page;
+ bool wp_copy;
+
+ /*
+ * Sanitize the command parameters:
+ */
+ BUG_ON(dst_start & ~PAGE_MASK);
+ BUG_ON(len & ~PAGE_MASK);
+
+ /* Does the address range wrap, or is the span zero-sized? */
+ BUG_ON(src_start + len <= src_start);
+ BUG_ON(dst_start + len <= dst_start);
+
+ src_addr = src_start;
+ dst_addr = dst_start;
+ copied = 0;
+ page = NULL;
+retry:
+ mmap_read_lock(dst_mm);
+
+ /*
+ * If memory mappings are changing because of non-cooperative
+ * operation (e.g. mremap) running in parallel, bail out and
+ * request the user to retry later
+ */
+ err = -EAGAIN;
+ if (mmap_changing && READ_ONCE(*mmap_changing))
+ goto out_unlock;
+
+ /*
+ * Make sure the vma is not shared, that the dst range is
+ * both valid and fully within a single existing vma.
+ */
+ err = -ENOENT;
+ dst_vma = find_dst_vma(dst_mm, dst_start, len);
+ if (!dst_vma)
+ goto out_unlock;
+
+ err = -EINVAL;
+ /*
+ * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
+ * it will overwrite vm_ops, so vma_is_anonymous must return false.
+ */
+ if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
+ dst_vma->vm_flags & VM_SHARED))
+ goto out_unlock;
+
+ /*
+ * validate 'mode' now that we know the dst_vma: don't allow
+ * a wrprotect copy if the userfaultfd didn't register as WP.
+ */
+ wp_copy = mode & UFFDIO_COPY_MODE_WP;
+ if (wp_copy && !(dst_vma->vm_flags & VM_UFFD_WP))
+ goto out_unlock;
+
+ /*
+ * If this is a HUGETLB vma, pass off to appropriate routine
+ */
+ if (is_vm_hugetlb_page(dst_vma))
+ return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
+ src_start, len, zeropage);
+
+ if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
+ goto out_unlock;
+
+ /*
+ * Ensure the dst_vma has a anon_vma or this page
+ * would get a NULL anon_vma when moved in the
+ * dst_vma.
+ */
+ err = -ENOMEM;
+ if (!(dst_vma->vm_flags & VM_SHARED) &&
+ unlikely(anon_vma_prepare(dst_vma)))
+ goto out_unlock;
+
+ while (src_addr < src_start + len) {
+ pmd_t dst_pmdval;
+
+ BUG_ON(dst_addr >= dst_start + len);
+
+ dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
+ if (unlikely(!dst_pmd)) {
+ err = -ENOMEM;
+ break;
+ }
+
+ dst_pmdval = pmd_read_atomic(dst_pmd);
+ /*
+ * If the dst_pmd is mapped as THP don't
+ * override it and just be strict.
+ */
+ if (unlikely(pmd_trans_huge(dst_pmdval))) {
+ err = -EEXIST;
+ break;
+ }
+ if (unlikely(pmd_none(dst_pmdval)) &&
+ unlikely(__pte_alloc(dst_mm, dst_pmd))) {
+ err = -ENOMEM;
+ break;
+ }
+ /* If an huge pmd materialized from under us fail */
+ if (unlikely(pmd_trans_huge(*dst_pmd))) {
+ err = -EFAULT;
+ break;
+ }
+
+ BUG_ON(pmd_none(*dst_pmd));
+ BUG_ON(pmd_trans_huge(*dst_pmd));
+
+ err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+ src_addr, &page, zeropage, wp_copy);
+ cond_resched();
+
+ if (unlikely(err == -ENOENT)) {
+ void *page_kaddr;
+
+ mmap_read_unlock(dst_mm);
+ BUG_ON(!page);
+
+ page_kaddr = kmap(page);
+ err = copy_from_user(page_kaddr,
+ (const void __user *) src_addr,
+ PAGE_SIZE);
+ kunmap(page);
+ if (unlikely(err)) {
+ err = -EFAULT;
+ goto out;
+ }
+ flush_dcache_page(page);
+ goto retry;
+ } else
+ BUG_ON(page);
+
+ if (!err) {
+ dst_addr += PAGE_SIZE;
+ src_addr += PAGE_SIZE;
+ copied += PAGE_SIZE;
+
+ if (fatal_signal_pending(current))
+ err = -EINTR;
+ }
+ if (err)
+ break;
+ }
+
+out_unlock:
+ mmap_read_unlock(dst_mm);
+out:
+ if (page)
+ put_page(page);
+ BUG_ON(copied < 0);
+ BUG_ON(err > 0);
+ BUG_ON(!copied && !err);
+ return copied ? copied : err;
+}
+
+ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
+ unsigned long src_start, unsigned long len,
+ bool *mmap_changing, __u64 mode)
+{
+ return __mcopy_atomic(dst_mm, dst_start, src_start, len, false,
+ mmap_changing, mode);
+}
+
+ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
+ unsigned long len, bool *mmap_changing)
+{
+ return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing, 0);
+}
+
+int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
+ unsigned long len, bool enable_wp, bool *mmap_changing)
+{
+ struct vm_area_struct *dst_vma;
+ pgprot_t newprot;
+ int err;
+
+ /*
+ * Sanitize the command parameters:
+ */
+ BUG_ON(start & ~PAGE_MASK);
+ BUG_ON(len & ~PAGE_MASK);
+
+ /* Does the address range wrap, or is the span zero-sized? */
+ BUG_ON(start + len <= start);
+
+ mmap_read_lock(dst_mm);
+
+ /*
+ * If memory mappings are changing because of non-cooperative
+ * operation (e.g. mremap) running in parallel, bail out and
+ * request the user to retry later
+ */
+ err = -EAGAIN;
+ if (mmap_changing && READ_ONCE(*mmap_changing))
+ goto out_unlock;
+
+ err = -ENOENT;
+ dst_vma = find_dst_vma(dst_mm, start, len);
+ /*
+ * Make sure the vma is not shared, that the dst range is
+ * both valid and fully within a single existing vma.
+ */
+ if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
+ goto out_unlock;
+ if (!userfaultfd_wp(dst_vma))
+ goto out_unlock;
+ if (!vma_is_anonymous(dst_vma))
+ goto out_unlock;
+
+ if (enable_wp)
+ newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE));
+ else
+ newprot = vm_get_page_prot(dst_vma->vm_flags);
+
+ change_protection(dst_vma, start, start + len, newprot,
+ enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE);
+
+ err = 0;
+out_unlock:
+ mmap_read_unlock(dst_mm);
+ return err;
+}
diff --git a/mm/util.c b/mm/util.c
new file mode 100644
index 000000000..25bfda774
--- /dev/null
+++ b/mm/util.c
@@ -0,0 +1,1025 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/compiler.h>
+#include <linux/export.h>
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task_stack.h>
+#include <linux/security.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/mman.h>
+#include <linux/hugetlb.h>
+#include <linux/vmalloc.h>
+#include <linux/userfaultfd_k.h>
+#include <linux/elf.h>
+#include <linux/elf-randomize.h>
+#include <linux/personality.h>
+#include <linux/random.h>
+#include <linux/processor.h>
+#include <linux/sizes.h>
+#include <linux/compat.h>
+
+#include <linux/uaccess.h>
+
+#include "internal.h"
+
+/**
+ * kfree_const - conditionally free memory
+ * @x: pointer to the memory
+ *
+ * Function calls kfree only if @x is not in .rodata section.
+ */
+void kfree_const(const void *x)
+{
+ if (!is_kernel_rodata((unsigned long)x))
+ kfree(x);
+}
+EXPORT_SYMBOL(kfree_const);
+
+/**
+ * kstrdup - allocate space for and copy an existing string
+ * @s: the string to duplicate
+ * @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ *
+ * Return: newly allocated copy of @s or %NULL in case of error
+ */
+char *kstrdup(const char *s, gfp_t gfp)
+{
+ size_t len;
+ char *buf;
+
+ if (!s)
+ return NULL;
+
+ len = strlen(s) + 1;
+ buf = kmalloc_track_caller(len, gfp);
+ if (buf)
+ memcpy(buf, s, len);
+ return buf;
+}
+EXPORT_SYMBOL(kstrdup);
+
+/**
+ * kstrdup_const - conditionally duplicate an existing const string
+ * @s: the string to duplicate
+ * @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ *
+ * Note: Strings allocated by kstrdup_const should be freed by kfree_const and
+ * must not be passed to krealloc().
+ *
+ * Return: source string if it is in .rodata section otherwise
+ * fallback to kstrdup.
+ */
+const char *kstrdup_const(const char *s, gfp_t gfp)
+{
+ if (is_kernel_rodata((unsigned long)s))
+ return s;
+
+ return kstrdup(s, gfp);
+}
+EXPORT_SYMBOL(kstrdup_const);
+
+/**
+ * kstrndup - allocate space for and copy an existing string
+ * @s: the string to duplicate
+ * @max: read at most @max chars from @s
+ * @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ *
+ * Note: Use kmemdup_nul() instead if the size is known exactly.
+ *
+ * Return: newly allocated copy of @s or %NULL in case of error
+ */
+char *kstrndup(const char *s, size_t max, gfp_t gfp)
+{
+ size_t len;
+ char *buf;
+
+ if (!s)
+ return NULL;
+
+ len = strnlen(s, max);
+ buf = kmalloc_track_caller(len+1, gfp);
+ if (buf) {
+ memcpy(buf, s, len);
+ buf[len] = '\0';
+ }
+ return buf;
+}
+EXPORT_SYMBOL(kstrndup);
+
+/**
+ * kmemdup - duplicate region of memory
+ *
+ * @src: memory region to duplicate
+ * @len: memory region length
+ * @gfp: GFP mask to use
+ *
+ * Return: newly allocated copy of @src or %NULL in case of error
+ */
+void *kmemdup(const void *src, size_t len, gfp_t gfp)
+{
+ void *p;
+
+ p = kmalloc_track_caller(len, gfp);
+ if (p)
+ memcpy(p, src, len);
+ return p;
+}
+EXPORT_SYMBOL(kmemdup);
+
+/**
+ * kmemdup_nul - Create a NUL-terminated string from unterminated data
+ * @s: The data to stringify
+ * @len: The size of the data
+ * @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ *
+ * Return: newly allocated copy of @s with NUL-termination or %NULL in
+ * case of error
+ */
+char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
+{
+ char *buf;
+
+ if (!s)
+ return NULL;
+
+ buf = kmalloc_track_caller(len + 1, gfp);
+ if (buf) {
+ memcpy(buf, s, len);
+ buf[len] = '\0';
+ }
+ return buf;
+}
+EXPORT_SYMBOL(kmemdup_nul);
+
+/**
+ * memdup_user - duplicate memory region from user space
+ *
+ * @src: source address in user space
+ * @len: number of bytes to copy
+ *
+ * Return: an ERR_PTR() on failure. Result is physically
+ * contiguous, to be freed by kfree().
+ */
+void *memdup_user(const void __user *src, size_t len)
+{
+ void *p;
+
+ p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN);
+ if (!p)
+ return ERR_PTR(-ENOMEM);
+
+ if (copy_from_user(p, src, len)) {
+ kfree(p);
+ return ERR_PTR(-EFAULT);
+ }
+
+ return p;
+}
+EXPORT_SYMBOL(memdup_user);
+
+/**
+ * vmemdup_user - duplicate memory region from user space
+ *
+ * @src: source address in user space
+ * @len: number of bytes to copy
+ *
+ * Return: an ERR_PTR() on failure. Result may be not
+ * physically contiguous. Use kvfree() to free.
+ */
+void *vmemdup_user(const void __user *src, size_t len)
+{
+ void *p;
+
+ p = kvmalloc(len, GFP_USER);
+ if (!p)
+ return ERR_PTR(-ENOMEM);
+
+ if (copy_from_user(p, src, len)) {
+ kvfree(p);
+ return ERR_PTR(-EFAULT);
+ }
+
+ return p;
+}
+EXPORT_SYMBOL(vmemdup_user);
+
+/**
+ * strndup_user - duplicate an existing string from user space
+ * @s: The string to duplicate
+ * @n: Maximum number of bytes to copy, including the trailing NUL.
+ *
+ * Return: newly allocated copy of @s or an ERR_PTR() in case of error
+ */
+char *strndup_user(const char __user *s, long n)
+{
+ char *p;
+ long length;
+
+ length = strnlen_user(s, n);
+
+ if (!length)
+ return ERR_PTR(-EFAULT);
+
+ if (length > n)
+ return ERR_PTR(-EINVAL);
+
+ p = memdup_user(s, length);
+
+ if (IS_ERR(p))
+ return p;
+
+ p[length - 1] = '\0';
+
+ return p;
+}
+EXPORT_SYMBOL(strndup_user);
+
+/**
+ * memdup_user_nul - duplicate memory region from user space and NUL-terminate
+ *
+ * @src: source address in user space
+ * @len: number of bytes to copy
+ *
+ * Return: an ERR_PTR() on failure.
+ */
+void *memdup_user_nul(const void __user *src, size_t len)
+{
+ char *p;
+
+ /*
+ * Always use GFP_KERNEL, since copy_from_user() can sleep and
+ * cause pagefault, which makes it pointless to use GFP_NOFS
+ * or GFP_ATOMIC.
+ */
+ p = kmalloc_track_caller(len + 1, GFP_KERNEL);
+ if (!p)
+ return ERR_PTR(-ENOMEM);
+
+ if (copy_from_user(p, src, len)) {
+ kfree(p);
+ return ERR_PTR(-EFAULT);
+ }
+ p[len] = '\0';
+
+ return p;
+}
+EXPORT_SYMBOL(memdup_user_nul);
+
+void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct vm_area_struct *prev)
+{
+ struct vm_area_struct *next;
+
+ vma->vm_prev = prev;
+ if (prev) {
+ next = prev->vm_next;
+ prev->vm_next = vma;
+ } else {
+ next = mm->mmap;
+ mm->mmap = vma;
+ }
+ vma->vm_next = next;
+ if (next)
+ next->vm_prev = vma;
+}
+
+void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+ struct vm_area_struct *prev, *next;
+
+ next = vma->vm_next;
+ prev = vma->vm_prev;
+ if (prev)
+ prev->vm_next = next;
+ else
+ mm->mmap = next;
+ if (next)
+ next->vm_prev = prev;
+}
+
+/* Check if the vma is being used as a stack by this task */
+int vma_is_stack_for_current(struct vm_area_struct *vma)
+{
+ struct task_struct * __maybe_unused t = current;
+
+ return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
+}
+
+#ifndef STACK_RND_MASK
+#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */
+#endif
+
+unsigned long randomize_stack_top(unsigned long stack_top)
+{
+ unsigned long random_variable = 0;
+
+ if (current->flags & PF_RANDOMIZE) {
+ random_variable = get_random_long();
+ random_variable &= STACK_RND_MASK;
+ random_variable <<= PAGE_SHIFT;
+ }
+#ifdef CONFIG_STACK_GROWSUP
+ return PAGE_ALIGN(stack_top) + random_variable;
+#else
+ return PAGE_ALIGN(stack_top) - random_variable;
+#endif
+}
+
+/**
+ * randomize_page - Generate a random, page aligned address
+ * @start: The smallest acceptable address the caller will take.
+ * @range: The size of the area, starting at @start, within which the
+ * random address must fall.
+ *
+ * If @start + @range would overflow, @range is capped.
+ *
+ * NOTE: Historical use of randomize_range, which this replaces, presumed that
+ * @start was already page aligned. We now align it regardless.
+ *
+ * Return: A page aligned address within [start, start + range). On error,
+ * @start is returned.
+ */
+unsigned long randomize_page(unsigned long start, unsigned long range)
+{
+ if (!PAGE_ALIGNED(start)) {
+ range -= PAGE_ALIGN(start) - start;
+ start = PAGE_ALIGN(start);
+ }
+
+ if (start > ULONG_MAX - range)
+ range = ULONG_MAX - start;
+
+ range >>= PAGE_SHIFT;
+
+ if (range == 0)
+ return start;
+
+ return start + (get_random_long() % range << PAGE_SHIFT);
+}
+
+#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
+unsigned long arch_randomize_brk(struct mm_struct *mm)
+{
+ /* Is the current task 32bit ? */
+ if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
+ return randomize_page(mm->brk, SZ_32M);
+
+ return randomize_page(mm->brk, SZ_1G);
+}
+
+unsigned long arch_mmap_rnd(void)
+{
+ unsigned long rnd;
+
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
+ if (is_compat_task())
+ rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
+ else
+#endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */
+ rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
+
+ return rnd << PAGE_SHIFT;
+}
+
+static int mmap_is_legacy(struct rlimit *rlim_stack)
+{
+ if (current->personality & ADDR_COMPAT_LAYOUT)
+ return 1;
+
+ if (rlim_stack->rlim_cur == RLIM_INFINITY)
+ return 1;
+
+ return sysctl_legacy_va_layout;
+}
+
+/*
+ * Leave enough space between the mmap area and the stack to honour ulimit in
+ * the face of randomisation.
+ */
+#define MIN_GAP (SZ_128M)
+#define MAX_GAP (STACK_TOP / 6 * 5)
+
+static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
+{
+ unsigned long gap = rlim_stack->rlim_cur;
+ unsigned long pad = stack_guard_gap;
+
+ /* Account for stack randomization if necessary */
+ if (current->flags & PF_RANDOMIZE)
+ pad += (STACK_RND_MASK << PAGE_SHIFT);
+
+ /* Values close to RLIM_INFINITY can overflow. */
+ if (gap + pad > gap)
+ gap += pad;
+
+ if (gap < MIN_GAP)
+ gap = MIN_GAP;
+ else if (gap > MAX_GAP)
+ gap = MAX_GAP;
+
+ return PAGE_ALIGN(STACK_TOP - gap - rnd);
+}
+
+void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
+{
+ unsigned long random_factor = 0UL;
+
+ if (current->flags & PF_RANDOMIZE)
+ random_factor = arch_mmap_rnd();
+
+ if (mmap_is_legacy(rlim_stack)) {
+ mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
+ mm->get_unmapped_area = arch_get_unmapped_area;
+ } else {
+ mm->mmap_base = mmap_base(random_factor, rlim_stack);
+ mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+ }
+}
+#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
+void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
+{
+ mm->mmap_base = TASK_UNMAPPED_BASE;
+ mm->get_unmapped_area = arch_get_unmapped_area;
+}
+#endif
+
+/**
+ * __account_locked_vm - account locked pages to an mm's locked_vm
+ * @mm: mm to account against
+ * @pages: number of pages to account
+ * @inc: %true if @pages should be considered positive, %false if not
+ * @task: task used to check RLIMIT_MEMLOCK
+ * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
+ *
+ * Assumes @task and @mm are valid (i.e. at least one reference on each), and
+ * that mmap_lock is held as writer.
+ *
+ * Return:
+ * * 0 on success
+ * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
+ */
+int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
+ struct task_struct *task, bool bypass_rlim)
+{
+ unsigned long locked_vm, limit;
+ int ret = 0;
+
+ mmap_assert_write_locked(mm);
+
+ locked_vm = mm->locked_vm;
+ if (inc) {
+ if (!bypass_rlim) {
+ limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ if (locked_vm + pages > limit)
+ ret = -ENOMEM;
+ }
+ if (!ret)
+ mm->locked_vm = locked_vm + pages;
+ } else {
+ WARN_ON_ONCE(pages > locked_vm);
+ mm->locked_vm = locked_vm - pages;
+ }
+
+ pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
+ (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
+ locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
+ ret ? " - exceeded" : "");
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__account_locked_vm);
+
+/**
+ * account_locked_vm - account locked pages to an mm's locked_vm
+ * @mm: mm to account against, may be NULL
+ * @pages: number of pages to account
+ * @inc: %true if @pages should be considered positive, %false if not
+ *
+ * Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
+ *
+ * Return:
+ * * 0 on success, or if mm is NULL
+ * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
+ */
+int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
+{
+ int ret;
+
+ if (pages == 0 || !mm)
+ return 0;
+
+ mmap_write_lock(mm);
+ ret = __account_locked_vm(mm, pages, inc, current,
+ capable(CAP_IPC_LOCK));
+ mmap_write_unlock(mm);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(account_locked_vm);
+
+unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long prot,
+ unsigned long flag, unsigned long pgoff)
+{
+ unsigned long ret;
+ struct mm_struct *mm = current->mm;
+ unsigned long populate;
+ LIST_HEAD(uf);
+
+ ret = security_mmap_file(file, prot, flag);
+ if (!ret) {
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+ ret = do_mmap(file, addr, len, prot, flag, pgoff, &populate,
+ &uf);
+ mmap_write_unlock(mm);
+ userfaultfd_unmap_complete(mm, &uf);
+ if (populate)
+ mm_populate(ret, populate);
+ }
+ return ret;
+}
+
+unsigned long vm_mmap(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long prot,
+ unsigned long flag, unsigned long offset)
+{
+ if (unlikely(offset + PAGE_ALIGN(len) < offset))
+ return -EINVAL;
+ if (unlikely(offset_in_page(offset)))
+ return -EINVAL;
+
+ return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
+}
+EXPORT_SYMBOL(vm_mmap);
+
+/**
+ * kvmalloc_node - attempt to allocate physically contiguous memory, but upon
+ * failure, fall back to non-contiguous (vmalloc) allocation.
+ * @size: size of the request.
+ * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
+ * @node: numa node to allocate from
+ *
+ * Uses kmalloc to get the memory but if the allocation fails then falls back
+ * to the vmalloc allocator. Use kvfree for freeing the memory.
+ *
+ * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported.
+ * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
+ * preferable to the vmalloc fallback, due to visible performance drawbacks.
+ *
+ * Please note that any use of gfp flags outside of GFP_KERNEL is careful to not
+ * fall back to vmalloc.
+ *
+ * Return: pointer to the allocated memory of %NULL in case of failure
+ */
+void *kvmalloc_node(size_t size, gfp_t flags, int node)
+{
+ gfp_t kmalloc_flags = flags;
+ void *ret;
+
+ /*
+ * vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables)
+ * so the given set of flags has to be compatible.
+ */
+ if ((flags & GFP_KERNEL) != GFP_KERNEL)
+ return kmalloc_node(size, flags, node);
+
+ /*
+ * We want to attempt a large physically contiguous block first because
+ * it is less likely to fragment multiple larger blocks and therefore
+ * contribute to a long term fragmentation less than vmalloc fallback.
+ * However make sure that larger requests are not too disruptive - no
+ * OOM killer and no allocation failure warnings as we have a fallback.
+ */
+ if (size > PAGE_SIZE) {
+ kmalloc_flags |= __GFP_NOWARN;
+
+ if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL))
+ kmalloc_flags |= __GFP_NORETRY;
+ }
+
+ ret = kmalloc_node(size, kmalloc_flags, node);
+
+ /*
+ * It doesn't really make sense to fallback to vmalloc for sub page
+ * requests
+ */
+ if (ret || size <= PAGE_SIZE)
+ return ret;
+
+ /* Don't even allow crazy sizes */
+ if (unlikely(size > INT_MAX)) {
+ WARN_ON_ONCE(!(flags & __GFP_NOWARN));
+ return NULL;
+ }
+
+ return __vmalloc_node(size, 1, flags, node,
+ __builtin_return_address(0));
+}
+EXPORT_SYMBOL(kvmalloc_node);
+
+/**
+ * kvfree() - Free memory.
+ * @addr: Pointer to allocated memory.
+ *
+ * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc().
+ * It is slightly more efficient to use kfree() or vfree() if you are certain
+ * that you know which one to use.
+ *
+ * Context: Either preemptible task context or not-NMI interrupt.
+ */
+void kvfree(const void *addr)
+{
+ if (is_vmalloc_addr(addr))
+ vfree(addr);
+ else
+ kfree(addr);
+}
+EXPORT_SYMBOL(kvfree);
+
+/**
+ * kvfree_sensitive - Free a data object containing sensitive information.
+ * @addr: address of the data object to be freed.
+ * @len: length of the data object.
+ *
+ * Use the special memzero_explicit() function to clear the content of a
+ * kvmalloc'ed object containing sensitive data to make sure that the
+ * compiler won't optimize out the data clearing.
+ */
+void kvfree_sensitive(const void *addr, size_t len)
+{
+ if (likely(!ZERO_OR_NULL_PTR(addr))) {
+ memzero_explicit((void *)addr, len);
+ kvfree(addr);
+ }
+}
+EXPORT_SYMBOL(kvfree_sensitive);
+
+void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
+{
+ void *newp;
+
+ if (oldsize >= newsize)
+ return (void *)p;
+ newp = kvmalloc(newsize, flags);
+ if (!newp)
+ return NULL;
+ memcpy(newp, p, oldsize);
+ kvfree(p);
+ return newp;
+}
+EXPORT_SYMBOL(kvrealloc);
+
+static inline void *__page_rmapping(struct page *page)
+{
+ unsigned long mapping;
+
+ mapping = (unsigned long)page->mapping;
+ mapping &= ~PAGE_MAPPING_FLAGS;
+
+ return (void *)mapping;
+}
+
+/* Neutral page->mapping pointer to address_space or anon_vma or other */
+void *page_rmapping(struct page *page)
+{
+ page = compound_head(page);
+ return __page_rmapping(page);
+}
+
+/*
+ * Return true if this page is mapped into pagetables.
+ * For compound page it returns true if any subpage of compound page is mapped.
+ */
+bool page_mapped(struct page *page)
+{
+ int i;
+
+ if (likely(!PageCompound(page)))
+ return atomic_read(&page->_mapcount) >= 0;
+ page = compound_head(page);
+ if (atomic_read(compound_mapcount_ptr(page)) >= 0)
+ return true;
+ if (PageHuge(page))
+ return false;
+ for (i = 0; i < compound_nr(page); i++) {
+ if (atomic_read(&page[i]._mapcount) >= 0)
+ return true;
+ }
+ return false;
+}
+EXPORT_SYMBOL(page_mapped);
+
+struct anon_vma *page_anon_vma(struct page *page)
+{
+ unsigned long mapping;
+
+ page = compound_head(page);
+ mapping = (unsigned long)page->mapping;
+ if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
+ return NULL;
+ return __page_rmapping(page);
+}
+
+struct address_space *page_mapping(struct page *page)
+{
+ struct address_space *mapping;
+
+ page = compound_head(page);
+
+ /* This happens if someone calls flush_dcache_page on slab page */
+ if (unlikely(PageSlab(page)))
+ return NULL;
+
+ if (unlikely(PageSwapCache(page))) {
+ swp_entry_t entry;
+
+ entry.val = page_private(page);
+ return swap_address_space(entry);
+ }
+
+ mapping = page->mapping;
+ if ((unsigned long)mapping & PAGE_MAPPING_ANON)
+ return NULL;
+
+ return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS);
+}
+EXPORT_SYMBOL(page_mapping);
+
+/*
+ * For file cache pages, return the address_space, otherwise return NULL
+ */
+struct address_space *page_mapping_file(struct page *page)
+{
+ if (unlikely(PageSwapCache(page)))
+ return NULL;
+ return page_mapping(page);
+}
+
+/* Slow path of page_mapcount() for compound pages */
+int __page_mapcount(struct page *page)
+{
+ int ret;
+
+ ret = atomic_read(&page->_mapcount) + 1;
+ /*
+ * For file THP page->_mapcount contains total number of mapping
+ * of the page: no need to look into compound_mapcount.
+ */
+ if (!PageAnon(page) && !PageHuge(page))
+ return ret;
+ page = compound_head(page);
+ ret += atomic_read(compound_mapcount_ptr(page)) + 1;
+ if (PageDoubleMap(page))
+ ret--;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__page_mapcount);
+
+int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
+int sysctl_overcommit_ratio __read_mostly = 50;
+unsigned long sysctl_overcommit_kbytes __read_mostly;
+int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
+unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
+unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
+
+int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret;
+
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (ret == 0 && write)
+ sysctl_overcommit_kbytes = 0;
+ return ret;
+}
+
+static void sync_overcommit_as(struct work_struct *dummy)
+{
+ percpu_counter_sync(&vm_committed_as);
+}
+
+int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ int new_policy = -1;
+ int ret;
+
+ /*
+ * The deviation of sync_overcommit_as could be big with loose policy
+ * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to
+ * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply
+ * with the strict "NEVER", and to avoid possible race condtion (even
+ * though user usually won't too frequently do the switching to policy
+ * OVERCOMMIT_NEVER), the switch is done in the following order:
+ * 1. changing the batch
+ * 2. sync percpu count on each CPU
+ * 3. switch the policy
+ */
+ if (write) {
+ t = *table;
+ t.data = &new_policy;
+ ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+ if (ret || new_policy == -1)
+ return ret;
+
+ mm_compute_batch(new_policy);
+ if (new_policy == OVERCOMMIT_NEVER)
+ schedule_on_each_cpu(sync_overcommit_as);
+ sysctl_overcommit_memory = new_policy;
+ } else {
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ }
+
+ return ret;
+}
+
+int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret;
+
+ ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
+ if (ret == 0 && write)
+ sysctl_overcommit_ratio = 0;
+ return ret;
+}
+
+/*
+ * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
+ */
+unsigned long vm_commit_limit(void)
+{
+ unsigned long allowed;
+
+ if (sysctl_overcommit_kbytes)
+ allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
+ else
+ allowed = ((totalram_pages() - hugetlb_total_pages())
+ * sysctl_overcommit_ratio / 100);
+ allowed += total_swap_pages;
+
+ return allowed;
+}
+
+/*
+ * Make sure vm_committed_as in one cacheline and not cacheline shared with
+ * other variables. It can be updated by several CPUs frequently.
+ */
+struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
+
+/*
+ * The global memory commitment made in the system can be a metric
+ * that can be used to drive ballooning decisions when Linux is hosted
+ * as a guest. On Hyper-V, the host implements a policy engine for dynamically
+ * balancing memory across competing virtual machines that are hosted.
+ * Several metrics drive this policy engine including the guest reported
+ * memory commitment.
+ *
+ * The time cost of this is very low for small platforms, and for big
+ * platform like a 2S/36C/72T Skylake server, in worst case where
+ * vm_committed_as's spinlock is under severe contention, the time cost
+ * could be about 30~40 microseconds.
+ */
+unsigned long vm_memory_committed(void)
+{
+ return percpu_counter_sum_positive(&vm_committed_as);
+}
+EXPORT_SYMBOL_GPL(vm_memory_committed);
+
+/*
+ * Check that a process has enough memory to allocate a new virtual
+ * mapping. 0 means there is enough memory for the allocation to
+ * succeed and -ENOMEM implies there is not.
+ *
+ * We currently support three overcommit policies, which are set via the
+ * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting.rst
+ *
+ * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
+ * Additional code 2002 Jul 20 by Robert Love.
+ *
+ * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
+ *
+ * Note this is a helper function intended to be used by LSMs which
+ * wish to use this logic.
+ */
+int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
+{
+ long allowed;
+
+ vm_acct_memory(pages);
+
+ /*
+ * Sometimes we want to use more memory than we have
+ */
+ if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
+ return 0;
+
+ if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
+ if (pages > totalram_pages() + total_swap_pages)
+ goto error;
+ return 0;
+ }
+
+ allowed = vm_commit_limit();
+ /*
+ * Reserve some for root
+ */
+ if (!cap_sys_admin)
+ allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
+
+ /*
+ * Don't let a single process grow so big a user can't recover
+ */
+ if (mm) {
+ long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
+
+ allowed -= min_t(long, mm->total_vm / 32, reserve);
+ }
+
+ if (percpu_counter_read_positive(&vm_committed_as) < allowed)
+ return 0;
+error:
+ vm_unacct_memory(pages);
+
+ return -ENOMEM;
+}
+
+/**
+ * get_cmdline() - copy the cmdline value to a buffer.
+ * @task: the task whose cmdline value to copy.
+ * @buffer: the buffer to copy to.
+ * @buflen: the length of the buffer. Larger cmdline values are truncated
+ * to this length.
+ *
+ * Return: the size of the cmdline field copied. Note that the copy does
+ * not guarantee an ending NULL byte.
+ */
+int get_cmdline(struct task_struct *task, char *buffer, int buflen)
+{
+ int res = 0;
+ unsigned int len;
+ struct mm_struct *mm = get_task_mm(task);
+ unsigned long arg_start, arg_end, env_start, env_end;
+ if (!mm)
+ goto out;
+ if (!mm->arg_end)
+ goto out_mm; /* Shh! No looking before we're done */
+
+ spin_lock(&mm->arg_lock);
+ arg_start = mm->arg_start;
+ arg_end = mm->arg_end;
+ env_start = mm->env_start;
+ env_end = mm->env_end;
+ spin_unlock(&mm->arg_lock);
+
+ len = arg_end - arg_start;
+
+ if (len > buflen)
+ len = buflen;
+
+ res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE);
+
+ /*
+ * If the nul at the end of args has been overwritten, then
+ * assume application is using setproctitle(3).
+ */
+ if (res > 0 && buffer[res-1] != '\0' && len < buflen) {
+ len = strnlen(buffer, res);
+ if (len < res) {
+ res = len;
+ } else {
+ len = env_end - env_start;
+ if (len > buflen - res)
+ len = buflen - res;
+ res += access_process_vm(task, env_start,
+ buffer+res, len,
+ FOLL_FORCE);
+ res = strnlen(buffer, res);
+ }
+ }
+out_mm:
+ mmput(mm);
+out:
+ return res;
+}
+
+int __weak memcmp_pages(struct page *page1, struct page *page2)
+{
+ char *addr1, *addr2;
+ int ret;
+
+ addr1 = kmap_atomic(page1);
+ addr2 = kmap_atomic(page2);
+ ret = memcmp(addr1, addr2, PAGE_SIZE);
+ kunmap_atomic(addr2);
+ kunmap_atomic(addr1);
+ return ret;
+}
diff --git a/mm/vmacache.c b/mm/vmacache.c
new file mode 100644
index 000000000..01a6e6688
--- /dev/null
+++ b/mm/vmacache.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2014 Davidlohr Bueso.
+ */
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/mm.h>
+#include <linux/vmacache.h>
+
+/*
+ * Hash based on the pmd of addr if configured with MMU, which provides a good
+ * hit rate for workloads with spatial locality. Otherwise, use pages.
+ */
+#ifdef CONFIG_MMU
+#define VMACACHE_SHIFT PMD_SHIFT
+#else
+#define VMACACHE_SHIFT PAGE_SHIFT
+#endif
+#define VMACACHE_HASH(addr) ((addr >> VMACACHE_SHIFT) & VMACACHE_MASK)
+
+/*
+ * This task may be accessing a foreign mm via (for example)
+ * get_user_pages()->find_vma(). The vmacache is task-local and this
+ * task's vmacache pertains to a different mm (ie, its own). There is
+ * nothing we can do here.
+ *
+ * Also handle the case where a kernel thread has adopted this mm via
+ * kthread_use_mm(). That kernel thread's vmacache is not applicable to this mm.
+ */
+static inline bool vmacache_valid_mm(struct mm_struct *mm)
+{
+ return current->mm == mm && !(current->flags & PF_KTHREAD);
+}
+
+void vmacache_update(unsigned long addr, struct vm_area_struct *newvma)
+{
+ if (vmacache_valid_mm(newvma->vm_mm))
+ current->vmacache.vmas[VMACACHE_HASH(addr)] = newvma;
+}
+
+static bool vmacache_valid(struct mm_struct *mm)
+{
+ struct task_struct *curr;
+
+ if (!vmacache_valid_mm(mm))
+ return false;
+
+ curr = current;
+ if (mm->vmacache_seqnum != curr->vmacache.seqnum) {
+ /*
+ * First attempt will always be invalid, initialize
+ * the new cache for this task here.
+ */
+ curr->vmacache.seqnum = mm->vmacache_seqnum;
+ vmacache_flush(curr);
+ return false;
+ }
+ return true;
+}
+
+struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
+{
+ int idx = VMACACHE_HASH(addr);
+ int i;
+
+ count_vm_vmacache_event(VMACACHE_FIND_CALLS);
+
+ if (!vmacache_valid(mm))
+ return NULL;
+
+ for (i = 0; i < VMACACHE_SIZE; i++) {
+ struct vm_area_struct *vma = current->vmacache.vmas[idx];
+
+ if (vma) {
+#ifdef CONFIG_DEBUG_VM_VMACACHE
+ if (WARN_ON_ONCE(vma->vm_mm != mm))
+ break;
+#endif
+ if (vma->vm_start <= addr && vma->vm_end > addr) {
+ count_vm_vmacache_event(VMACACHE_FIND_HITS);
+ return vma;
+ }
+ }
+ if (++idx == VMACACHE_SIZE)
+ idx = 0;
+ }
+
+ return NULL;
+}
+
+#ifndef CONFIG_MMU
+struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ int idx = VMACACHE_HASH(start);
+ int i;
+
+ count_vm_vmacache_event(VMACACHE_FIND_CALLS);
+
+ if (!vmacache_valid(mm))
+ return NULL;
+
+ for (i = 0; i < VMACACHE_SIZE; i++) {
+ struct vm_area_struct *vma = current->vmacache.vmas[idx];
+
+ if (vma && vma->vm_start == start && vma->vm_end == end) {
+ count_vm_vmacache_event(VMACACHE_FIND_HITS);
+ return vma;
+ }
+ if (++idx == VMACACHE_SIZE)
+ idx = 0;
+ }
+
+ return NULL;
+}
+#endif
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
new file mode 100644
index 000000000..d6a4794fa
--- /dev/null
+++ b/mm/vmalloc.c
@@ -0,0 +1,3589 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 1993 Linus Torvalds
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
+ * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
+ * Numa awareness, Christoph Lameter, SGI, June 2005
+ * Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
+ */
+
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/highmem.h>
+#include <linux/sched/signal.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/set_memory.h>
+#include <linux/debugobjects.h>
+#include <linux/kallsyms.h>
+#include <linux/list.h>
+#include <linux/notifier.h>
+#include <linux/rbtree.h>
+#include <linux/xarray.h>
+#include <linux/rcupdate.h>
+#include <linux/pfn.h>
+#include <linux/kmemleak.h>
+#include <linux/atomic.h>
+#include <linux/compiler.h>
+#include <linux/llist.h>
+#include <linux/bitops.h>
+#include <linux/rbtree_augmented.h>
+#include <linux/overflow.h>
+
+#include <linux/uaccess.h>
+#include <asm/tlbflush.h>
+#include <asm/shmparam.h>
+
+#include "internal.h"
+#include "pgalloc-track.h"
+
+bool is_vmalloc_addr(const void *x)
+{
+ unsigned long addr = (unsigned long)x;
+
+ return addr >= VMALLOC_START && addr < VMALLOC_END;
+}
+EXPORT_SYMBOL(is_vmalloc_addr);
+
+struct vfree_deferred {
+ struct llist_head list;
+ struct work_struct wq;
+};
+static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);
+
+static void __vunmap(const void *, int);
+
+static void free_work(struct work_struct *w)
+{
+ struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
+ struct llist_node *t, *llnode;
+
+ llist_for_each_safe(llnode, t, llist_del_all(&p->list))
+ __vunmap((void *)llnode, 1);
+}
+
+/*** Page table manipulation functions ***/
+
+static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ pgtbl_mod_mask *mask)
+{
+ pte_t *pte;
+
+ pte = pte_offset_kernel(pmd, addr);
+ do {
+ pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
+ WARN_ON(!pte_none(ptent) && !pte_present(ptent));
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+ *mask |= PGTBL_PTE_MODIFIED;
+}
+
+static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+ pgtbl_mod_mask *mask)
+{
+ pmd_t *pmd;
+ unsigned long next;
+ int cleared;
+
+ pmd = pmd_offset(pud, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+
+ cleared = pmd_clear_huge(pmd);
+ if (cleared || pmd_bad(*pmd))
+ *mask |= PGTBL_PMD_MODIFIED;
+
+ if (cleared)
+ continue;
+ if (pmd_none_or_clear_bad(pmd))
+ continue;
+ vunmap_pte_range(pmd, addr, next, mask);
+
+ cond_resched();
+ } while (pmd++, addr = next, addr != end);
+}
+
+static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
+ pgtbl_mod_mask *mask)
+{
+ pud_t *pud;
+ unsigned long next;
+ int cleared;
+
+ pud = pud_offset(p4d, addr);
+ do {
+ next = pud_addr_end(addr, end);
+
+ cleared = pud_clear_huge(pud);
+ if (cleared || pud_bad(*pud))
+ *mask |= PGTBL_PUD_MODIFIED;
+
+ if (cleared)
+ continue;
+ if (pud_none_or_clear_bad(pud))
+ continue;
+ vunmap_pmd_range(pud, addr, next, mask);
+ } while (pud++, addr = next, addr != end);
+}
+
+static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+ pgtbl_mod_mask *mask)
+{
+ p4d_t *p4d;
+ unsigned long next;
+ int cleared;
+
+ p4d = p4d_offset(pgd, addr);
+ do {
+ next = p4d_addr_end(addr, end);
+
+ cleared = p4d_clear_huge(p4d);
+ if (cleared || p4d_bad(*p4d))
+ *mask |= PGTBL_P4D_MODIFIED;
+
+ if (cleared)
+ continue;
+ if (p4d_none_or_clear_bad(p4d))
+ continue;
+ vunmap_pud_range(p4d, addr, next, mask);
+ } while (p4d++, addr = next, addr != end);
+}
+
+/**
+ * unmap_kernel_range_noflush - unmap kernel VM area
+ * @start: start of the VM area to unmap
+ * @size: size of the VM area to unmap
+ *
+ * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size specify
+ * should have been allocated using get_vm_area() and its friends.
+ *
+ * NOTE:
+ * This function does NOT do any cache flushing. The caller is responsible
+ * for calling flush_cache_vunmap() on to-be-mapped areas before calling this
+ * function and flush_tlb_kernel_range() after.
+ */
+void unmap_kernel_range_noflush(unsigned long start, unsigned long size)
+{
+ unsigned long end = start + size;
+ unsigned long next;
+ pgd_t *pgd;
+ unsigned long addr = start;
+ pgtbl_mod_mask mask = 0;
+
+ BUG_ON(addr >= end);
+ pgd = pgd_offset_k(addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (pgd_bad(*pgd))
+ mask |= PGTBL_PGD_MODIFIED;
+ if (pgd_none_or_clear_bad(pgd))
+ continue;
+ vunmap_p4d_range(pgd, addr, next, &mask);
+ } while (pgd++, addr = next, addr != end);
+
+ if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
+ arch_sync_kernel_mappings(start, end);
+}
+
+static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end, pgprot_t prot, struct page **pages, int *nr,
+ pgtbl_mod_mask *mask)
+{
+ pte_t *pte;
+
+ /*
+ * nr is a running index into the array which helps higher level
+ * callers keep track of where we're up to.
+ */
+
+ pte = pte_alloc_kernel_track(pmd, addr, mask);
+ if (!pte)
+ return -ENOMEM;
+ do {
+ struct page *page = pages[*nr];
+
+ if (WARN_ON(!pte_none(*pte)))
+ return -EBUSY;
+ if (WARN_ON(!page))
+ return -ENOMEM;
+ set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
+ (*nr)++;
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+ *mask |= PGTBL_PTE_MODIFIED;
+ return 0;
+}
+
+static int vmap_pmd_range(pud_t *pud, unsigned long addr,
+ unsigned long end, pgprot_t prot, struct page **pages, int *nr,
+ pgtbl_mod_mask *mask)
+{
+ pmd_t *pmd;
+ unsigned long next;
+
+ pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
+ if (!pmd)
+ return -ENOMEM;
+ do {
+ next = pmd_addr_end(addr, end);
+ if (vmap_pte_range(pmd, addr, next, prot, pages, nr, mask))
+ return -ENOMEM;
+ } while (pmd++, addr = next, addr != end);
+ return 0;
+}
+
+static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
+ unsigned long end, pgprot_t prot, struct page **pages, int *nr,
+ pgtbl_mod_mask *mask)
+{
+ pud_t *pud;
+ unsigned long next;
+
+ pud = pud_alloc_track(&init_mm, p4d, addr, mask);
+ if (!pud)
+ return -ENOMEM;
+ do {
+ next = pud_addr_end(addr, end);
+ if (vmap_pmd_range(pud, addr, next, prot, pages, nr, mask))
+ return -ENOMEM;
+ } while (pud++, addr = next, addr != end);
+ return 0;
+}
+
+static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
+ unsigned long end, pgprot_t prot, struct page **pages, int *nr,
+ pgtbl_mod_mask *mask)
+{
+ p4d_t *p4d;
+ unsigned long next;
+
+ p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
+ if (!p4d)
+ return -ENOMEM;
+ do {
+ next = p4d_addr_end(addr, end);
+ if (vmap_pud_range(p4d, addr, next, prot, pages, nr, mask))
+ return -ENOMEM;
+ } while (p4d++, addr = next, addr != end);
+ return 0;
+}
+
+/**
+ * map_kernel_range_noflush - map kernel VM area with the specified pages
+ * @addr: start of the VM area to map
+ * @size: size of the VM area to map
+ * @prot: page protection flags to use
+ * @pages: pages to map
+ *
+ * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size specify should
+ * have been allocated using get_vm_area() and its friends.
+ *
+ * NOTE:
+ * This function does NOT do any cache flushing. The caller is responsible for
+ * calling flush_cache_vmap() on to-be-mapped areas before calling this
+ * function.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int map_kernel_range_noflush(unsigned long addr, unsigned long size,
+ pgprot_t prot, struct page **pages)
+{
+ unsigned long start = addr;
+ unsigned long end = addr + size;
+ unsigned long next;
+ pgd_t *pgd;
+ int err = 0;
+ int nr = 0;
+ pgtbl_mod_mask mask = 0;
+
+ BUG_ON(addr >= end);
+ pgd = pgd_offset_k(addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (pgd_bad(*pgd))
+ mask |= PGTBL_PGD_MODIFIED;
+ err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
+ if (err)
+ return err;
+ } while (pgd++, addr = next, addr != end);
+
+ if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
+ arch_sync_kernel_mappings(start, end);
+
+ return 0;
+}
+
+int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
+ struct page **pages)
+{
+ int ret;
+
+ ret = map_kernel_range_noflush(start, size, prot, pages);
+ flush_cache_vmap(start, start + size);
+ return ret;
+}
+
+int is_vmalloc_or_module_addr(const void *x)
+{
+ /*
+ * ARM, x86-64 and sparc64 put modules in a special place,
+ * and fall back on vmalloc() if that fails. Others
+ * just put it in the vmalloc space.
+ */
+#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
+ unsigned long addr = (unsigned long)x;
+ if (addr >= MODULES_VADDR && addr < MODULES_END)
+ return 1;
+#endif
+ return is_vmalloc_addr(x);
+}
+
+/*
+ * Walk a vmap address to the struct page it maps.
+ */
+struct page *vmalloc_to_page(const void *vmalloc_addr)
+{
+ unsigned long addr = (unsigned long) vmalloc_addr;
+ struct page *page = NULL;
+ pgd_t *pgd = pgd_offset_k(addr);
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *ptep, pte;
+
+ /*
+ * XXX we might need to change this if we add VIRTUAL_BUG_ON for
+ * architectures that do not vmalloc module space
+ */
+ VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
+
+ if (pgd_none(*pgd))
+ return NULL;
+ p4d = p4d_offset(pgd, addr);
+ if (p4d_none(*p4d))
+ return NULL;
+ pud = pud_offset(p4d, addr);
+
+ /*
+ * Don't dereference bad PUD or PMD (below) entries. This will also
+ * identify huge mappings, which we may encounter on architectures
+ * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
+ * identified as vmalloc addresses by is_vmalloc_addr(), but are
+ * not [unambiguously] associated with a struct page, so there is
+ * no correct value to return for them.
+ */
+ WARN_ON_ONCE(pud_bad(*pud));
+ if (pud_none(*pud) || pud_bad(*pud))
+ return NULL;
+ pmd = pmd_offset(pud, addr);
+ WARN_ON_ONCE(pmd_bad(*pmd));
+ if (pmd_none(*pmd) || pmd_bad(*pmd))
+ return NULL;
+
+ ptep = pte_offset_map(pmd, addr);
+ pte = *ptep;
+ if (pte_present(pte))
+ page = pte_page(pte);
+ pte_unmap(ptep);
+ return page;
+}
+EXPORT_SYMBOL(vmalloc_to_page);
+
+/*
+ * Map a vmalloc()-space virtual address to the physical page frame number.
+ */
+unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
+{
+ return page_to_pfn(vmalloc_to_page(vmalloc_addr));
+}
+EXPORT_SYMBOL(vmalloc_to_pfn);
+
+
+/*** Global kva allocator ***/
+
+#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
+#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
+
+
+static DEFINE_SPINLOCK(vmap_area_lock);
+static DEFINE_SPINLOCK(free_vmap_area_lock);
+/* Export for kexec only */
+LIST_HEAD(vmap_area_list);
+static LLIST_HEAD(vmap_purge_list);
+static struct rb_root vmap_area_root = RB_ROOT;
+static bool vmap_initialized __read_mostly;
+
+/*
+ * This kmem_cache is used for vmap_area objects. Instead of
+ * allocating from slab we reuse an object from this cache to
+ * make things faster. Especially in "no edge" splitting of
+ * free block.
+ */
+static struct kmem_cache *vmap_area_cachep;
+
+/*
+ * This linked list is used in pair with free_vmap_area_root.
+ * It gives O(1) access to prev/next to perform fast coalescing.
+ */
+static LIST_HEAD(free_vmap_area_list);
+
+/*
+ * This augment red-black tree represents the free vmap space.
+ * All vmap_area objects in this tree are sorted by va->va_start
+ * address. It is used for allocation and merging when a vmap
+ * object is released.
+ *
+ * Each vmap_area node contains a maximum available free block
+ * of its sub-tree, right or left. Therefore it is possible to
+ * find a lowest match of free area.
+ */
+static struct rb_root free_vmap_area_root = RB_ROOT;
+
+/*
+ * Preload a CPU with one object for "no edge" split case. The
+ * aim is to get rid of allocations from the atomic context, thus
+ * to use more permissive allocation masks.
+ */
+static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
+
+static __always_inline unsigned long
+va_size(struct vmap_area *va)
+{
+ return (va->va_end - va->va_start);
+}
+
+static __always_inline unsigned long
+get_subtree_max_size(struct rb_node *node)
+{
+ struct vmap_area *va;
+
+ va = rb_entry_safe(node, struct vmap_area, rb_node);
+ return va ? va->subtree_max_size : 0;
+}
+
+/*
+ * Gets called when remove the node and rotate.
+ */
+static __always_inline unsigned long
+compute_subtree_max_size(struct vmap_area *va)
+{
+ return max3(va_size(va),
+ get_subtree_max_size(va->rb_node.rb_left),
+ get_subtree_max_size(va->rb_node.rb_right));
+}
+
+RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
+ struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)
+
+static void purge_vmap_area_lazy(void);
+static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
+static unsigned long lazy_max_pages(void);
+
+static atomic_long_t nr_vmalloc_pages;
+
+unsigned long vmalloc_nr_pages(void)
+{
+ return atomic_long_read(&nr_vmalloc_pages);
+}
+
+static struct vmap_area *__find_vmap_area(unsigned long addr)
+{
+ struct rb_node *n = vmap_area_root.rb_node;
+
+ while (n) {
+ struct vmap_area *va;
+
+ va = rb_entry(n, struct vmap_area, rb_node);
+ if (addr < va->va_start)
+ n = n->rb_left;
+ else if (addr >= va->va_end)
+ n = n->rb_right;
+ else
+ return va;
+ }
+
+ return NULL;
+}
+
+/*
+ * This function returns back addresses of parent node
+ * and its left or right link for further processing.
+ *
+ * Otherwise NULL is returned. In that case all further
+ * steps regarding inserting of conflicting overlap range
+ * have to be declined and actually considered as a bug.
+ */
+static __always_inline struct rb_node **
+find_va_links(struct vmap_area *va,
+ struct rb_root *root, struct rb_node *from,
+ struct rb_node **parent)
+{
+ struct vmap_area *tmp_va;
+ struct rb_node **link;
+
+ if (root) {
+ link = &root->rb_node;
+ if (unlikely(!*link)) {
+ *parent = NULL;
+ return link;
+ }
+ } else {
+ link = &from;
+ }
+
+ /*
+ * Go to the bottom of the tree. When we hit the last point
+ * we end up with parent rb_node and correct direction, i name
+ * it link, where the new va->rb_node will be attached to.
+ */
+ do {
+ tmp_va = rb_entry(*link, struct vmap_area, rb_node);
+
+ /*
+ * During the traversal we also do some sanity check.
+ * Trigger the BUG() if there are sides(left/right)
+ * or full overlaps.
+ */
+ if (va->va_start < tmp_va->va_end &&
+ va->va_end <= tmp_va->va_start)
+ link = &(*link)->rb_left;
+ else if (va->va_end > tmp_va->va_start &&
+ va->va_start >= tmp_va->va_end)
+ link = &(*link)->rb_right;
+ else {
+ WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
+ va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);
+
+ return NULL;
+ }
+ } while (*link);
+
+ *parent = &tmp_va->rb_node;
+ return link;
+}
+
+static __always_inline struct list_head *
+get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
+{
+ struct list_head *list;
+
+ if (unlikely(!parent))
+ /*
+ * The red-black tree where we try to find VA neighbors
+ * before merging or inserting is empty, i.e. it means
+ * there is no free vmap space. Normally it does not
+ * happen but we handle this case anyway.
+ */
+ return NULL;
+
+ list = &rb_entry(parent, struct vmap_area, rb_node)->list;
+ return (&parent->rb_right == link ? list->next : list);
+}
+
+static __always_inline void
+link_va(struct vmap_area *va, struct rb_root *root,
+ struct rb_node *parent, struct rb_node **link, struct list_head *head)
+{
+ /*
+ * VA is still not in the list, but we can
+ * identify its future previous list_head node.
+ */
+ if (likely(parent)) {
+ head = &rb_entry(parent, struct vmap_area, rb_node)->list;
+ if (&parent->rb_right != link)
+ head = head->prev;
+ }
+
+ /* Insert to the rb-tree */
+ rb_link_node(&va->rb_node, parent, link);
+ if (root == &free_vmap_area_root) {
+ /*
+ * Some explanation here. Just perform simple insertion
+ * to the tree. We do not set va->subtree_max_size to
+ * its current size before calling rb_insert_augmented().
+ * It is because of we populate the tree from the bottom
+ * to parent levels when the node _is_ in the tree.
+ *
+ * Therefore we set subtree_max_size to zero after insertion,
+ * to let __augment_tree_propagate_from() puts everything to
+ * the correct order later on.
+ */
+ rb_insert_augmented(&va->rb_node,
+ root, &free_vmap_area_rb_augment_cb);
+ va->subtree_max_size = 0;
+ } else {
+ rb_insert_color(&va->rb_node, root);
+ }
+
+ /* Address-sort this list */
+ list_add(&va->list, head);
+}
+
+static __always_inline void
+unlink_va(struct vmap_area *va, struct rb_root *root)
+{
+ if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
+ return;
+
+ if (root == &free_vmap_area_root)
+ rb_erase_augmented(&va->rb_node,
+ root, &free_vmap_area_rb_augment_cb);
+ else
+ rb_erase(&va->rb_node, root);
+
+ list_del(&va->list);
+ RB_CLEAR_NODE(&va->rb_node);
+}
+
+#if DEBUG_AUGMENT_PROPAGATE_CHECK
+static void
+augment_tree_propagate_check(void)
+{
+ struct vmap_area *va;
+ unsigned long computed_size;
+
+ list_for_each_entry(va, &free_vmap_area_list, list) {
+ computed_size = compute_subtree_max_size(va);
+ if (computed_size != va->subtree_max_size)
+ pr_emerg("tree is corrupted: %lu, %lu\n",
+ va_size(va), va->subtree_max_size);
+ }
+}
+#endif
+
+/*
+ * This function populates subtree_max_size from bottom to upper
+ * levels starting from VA point. The propagation must be done
+ * when VA size is modified by changing its va_start/va_end. Or
+ * in case of newly inserting of VA to the tree.
+ *
+ * It means that __augment_tree_propagate_from() must be called:
+ * - After VA has been inserted to the tree(free path);
+ * - After VA has been shrunk(allocation path);
+ * - After VA has been increased(merging path).
+ *
+ * Please note that, it does not mean that upper parent nodes
+ * and their subtree_max_size are recalculated all the time up
+ * to the root node.
+ *
+ * 4--8
+ * /\
+ * / \
+ * / \
+ * 2--2 8--8
+ *
+ * For example if we modify the node 4, shrinking it to 2, then
+ * no any modification is required. If we shrink the node 2 to 1
+ * its subtree_max_size is updated only, and set to 1. If we shrink
+ * the node 8 to 6, then its subtree_max_size is set to 6 and parent
+ * node becomes 4--6.
+ */
+static __always_inline void
+augment_tree_propagate_from(struct vmap_area *va)
+{
+ /*
+ * Populate the tree from bottom towards the root until
+ * the calculated maximum available size of checked node
+ * is equal to its current one.
+ */
+ free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL);
+
+#if DEBUG_AUGMENT_PROPAGATE_CHECK
+ augment_tree_propagate_check();
+#endif
+}
+
+static void
+insert_vmap_area(struct vmap_area *va,
+ struct rb_root *root, struct list_head *head)
+{
+ struct rb_node **link;
+ struct rb_node *parent;
+
+ link = find_va_links(va, root, NULL, &parent);
+ if (link)
+ link_va(va, root, parent, link, head);
+}
+
+static void
+insert_vmap_area_augment(struct vmap_area *va,
+ struct rb_node *from, struct rb_root *root,
+ struct list_head *head)
+{
+ struct rb_node **link;
+ struct rb_node *parent;
+
+ if (from)
+ link = find_va_links(va, NULL, from, &parent);
+ else
+ link = find_va_links(va, root, NULL, &parent);
+
+ if (link) {
+ link_va(va, root, parent, link, head);
+ augment_tree_propagate_from(va);
+ }
+}
+
+/*
+ * Merge de-allocated chunk of VA memory with previous
+ * and next free blocks. If coalesce is not done a new
+ * free area is inserted. If VA has been merged, it is
+ * freed.
+ *
+ * Please note, it can return NULL in case of overlap
+ * ranges, followed by WARN() report. Despite it is a
+ * buggy behaviour, a system can be alive and keep
+ * ongoing.
+ */
+static __always_inline struct vmap_area *
+merge_or_add_vmap_area(struct vmap_area *va,
+ struct rb_root *root, struct list_head *head)
+{
+ struct vmap_area *sibling;
+ struct list_head *next;
+ struct rb_node **link;
+ struct rb_node *parent;
+ bool merged = false;
+
+ /*
+ * Find a place in the tree where VA potentially will be
+ * inserted, unless it is merged with its sibling/siblings.
+ */
+ link = find_va_links(va, root, NULL, &parent);
+ if (!link)
+ return NULL;
+
+ /*
+ * Get next node of VA to check if merging can be done.
+ */
+ next = get_va_next_sibling(parent, link);
+ if (unlikely(next == NULL))
+ goto insert;
+
+ /*
+ * start end
+ * | |
+ * |<------VA------>|<-----Next----->|
+ * | |
+ * start end
+ */
+ if (next != head) {
+ sibling = list_entry(next, struct vmap_area, list);
+ if (sibling->va_start == va->va_end) {
+ sibling->va_start = va->va_start;
+
+ /* Free vmap_area object. */
+ kmem_cache_free(vmap_area_cachep, va);
+
+ /* Point to the new merged area. */
+ va = sibling;
+ merged = true;
+ }
+ }
+
+ /*
+ * start end
+ * | |
+ * |<-----Prev----->|<------VA------>|
+ * | |
+ * start end
+ */
+ if (next->prev != head) {
+ sibling = list_entry(next->prev, struct vmap_area, list);
+ if (sibling->va_end == va->va_start) {
+ /*
+ * If both neighbors are coalesced, it is important
+ * to unlink the "next" node first, followed by merging
+ * with "previous" one. Otherwise the tree might not be
+ * fully populated if a sibling's augmented value is
+ * "normalized" because of rotation operations.
+ */
+ if (merged)
+ unlink_va(va, root);
+
+ sibling->va_end = va->va_end;
+
+ /* Free vmap_area object. */
+ kmem_cache_free(vmap_area_cachep, va);
+
+ /* Point to the new merged area. */
+ va = sibling;
+ merged = true;
+ }
+ }
+
+insert:
+ if (!merged)
+ link_va(va, root, parent, link, head);
+
+ /*
+ * Last step is to check and update the tree.
+ */
+ augment_tree_propagate_from(va);
+ return va;
+}
+
+static __always_inline bool
+is_within_this_va(struct vmap_area *va, unsigned long size,
+ unsigned long align, unsigned long vstart)
+{
+ unsigned long nva_start_addr;
+
+ if (va->va_start > vstart)
+ nva_start_addr = ALIGN(va->va_start, align);
+ else
+ nva_start_addr = ALIGN(vstart, align);
+
+ /* Can be overflowed due to big size or alignment. */
+ if (nva_start_addr + size < nva_start_addr ||
+ nva_start_addr < vstart)
+ return false;
+
+ return (nva_start_addr + size <= va->va_end);
+}
+
+/*
+ * Find the first free block(lowest start address) in the tree,
+ * that will accomplish the request corresponding to passing
+ * parameters.
+ */
+static __always_inline struct vmap_area *
+find_vmap_lowest_match(unsigned long size,
+ unsigned long align, unsigned long vstart)
+{
+ struct vmap_area *va;
+ struct rb_node *node;
+ unsigned long length;
+
+ /* Start from the root. */
+ node = free_vmap_area_root.rb_node;
+
+ /* Adjust the search size for alignment overhead. */
+ length = size + align - 1;
+
+ while (node) {
+ va = rb_entry(node, struct vmap_area, rb_node);
+
+ if (get_subtree_max_size(node->rb_left) >= length &&
+ vstart < va->va_start) {
+ node = node->rb_left;
+ } else {
+ if (is_within_this_va(va, size, align, vstart))
+ return va;
+
+ /*
+ * Does not make sense to go deeper towards the right
+ * sub-tree if it does not have a free block that is
+ * equal or bigger to the requested search length.
+ */
+ if (get_subtree_max_size(node->rb_right) >= length) {
+ node = node->rb_right;
+ continue;
+ }
+
+ /*
+ * OK. We roll back and find the first right sub-tree,
+ * that will satisfy the search criteria. It can happen
+ * only once due to "vstart" restriction.
+ */
+ while ((node = rb_parent(node))) {
+ va = rb_entry(node, struct vmap_area, rb_node);
+ if (is_within_this_va(va, size, align, vstart))
+ return va;
+
+ if (get_subtree_max_size(node->rb_right) >= length &&
+ vstart <= va->va_start) {
+ node = node->rb_right;
+ break;
+ }
+ }
+ }
+ }
+
+ return NULL;
+}
+
+#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
+#include <linux/random.h>
+
+static struct vmap_area *
+find_vmap_lowest_linear_match(unsigned long size,
+ unsigned long align, unsigned long vstart)
+{
+ struct vmap_area *va;
+
+ list_for_each_entry(va, &free_vmap_area_list, list) {
+ if (!is_within_this_va(va, size, align, vstart))
+ continue;
+
+ return va;
+ }
+
+ return NULL;
+}
+
+static void
+find_vmap_lowest_match_check(unsigned long size)
+{
+ struct vmap_area *va_1, *va_2;
+ unsigned long vstart;
+ unsigned int rnd;
+
+ get_random_bytes(&rnd, sizeof(rnd));
+ vstart = VMALLOC_START + rnd;
+
+ va_1 = find_vmap_lowest_match(size, 1, vstart);
+ va_2 = find_vmap_lowest_linear_match(size, 1, vstart);
+
+ if (va_1 != va_2)
+ pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
+ va_1, va_2, vstart);
+}
+#endif
+
+enum fit_type {
+ NOTHING_FIT = 0,
+ FL_FIT_TYPE = 1, /* full fit */
+ LE_FIT_TYPE = 2, /* left edge fit */
+ RE_FIT_TYPE = 3, /* right edge fit */
+ NE_FIT_TYPE = 4 /* no edge fit */
+};
+
+static __always_inline enum fit_type
+classify_va_fit_type(struct vmap_area *va,
+ unsigned long nva_start_addr, unsigned long size)
+{
+ enum fit_type type;
+
+ /* Check if it is within VA. */
+ if (nva_start_addr < va->va_start ||
+ nva_start_addr + size > va->va_end)
+ return NOTHING_FIT;
+
+ /* Now classify. */
+ if (va->va_start == nva_start_addr) {
+ if (va->va_end == nva_start_addr + size)
+ type = FL_FIT_TYPE;
+ else
+ type = LE_FIT_TYPE;
+ } else if (va->va_end == nva_start_addr + size) {
+ type = RE_FIT_TYPE;
+ } else {
+ type = NE_FIT_TYPE;
+ }
+
+ return type;
+}
+
+static __always_inline int
+adjust_va_to_fit_type(struct vmap_area *va,
+ unsigned long nva_start_addr, unsigned long size,
+ enum fit_type type)
+{
+ struct vmap_area *lva = NULL;
+
+ if (type == FL_FIT_TYPE) {
+ /*
+ * No need to split VA, it fully fits.
+ *
+ * | |
+ * V NVA V
+ * |---------------|
+ */
+ unlink_va(va, &free_vmap_area_root);
+ kmem_cache_free(vmap_area_cachep, va);
+ } else if (type == LE_FIT_TYPE) {
+ /*
+ * Split left edge of fit VA.
+ *
+ * | |
+ * V NVA V R
+ * |-------|-------|
+ */
+ va->va_start += size;
+ } else if (type == RE_FIT_TYPE) {
+ /*
+ * Split right edge of fit VA.
+ *
+ * | |
+ * L V NVA V
+ * |-------|-------|
+ */
+ va->va_end = nva_start_addr;
+ } else if (type == NE_FIT_TYPE) {
+ /*
+ * Split no edge of fit VA.
+ *
+ * | |
+ * L V NVA V R
+ * |---|-------|---|
+ */
+ lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
+ if (unlikely(!lva)) {
+ /*
+ * For percpu allocator we do not do any pre-allocation
+ * and leave it as it is. The reason is it most likely
+ * never ends up with NE_FIT_TYPE splitting. In case of
+ * percpu allocations offsets and sizes are aligned to
+ * fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE
+ * are its main fitting cases.
+ *
+ * There are a few exceptions though, as an example it is
+ * a first allocation (early boot up) when we have "one"
+ * big free space that has to be split.
+ *
+ * Also we can hit this path in case of regular "vmap"
+ * allocations, if "this" current CPU was not preloaded.
+ * See the comment in alloc_vmap_area() why. If so, then
+ * GFP_NOWAIT is used instead to get an extra object for
+ * split purpose. That is rare and most time does not
+ * occur.
+ *
+ * What happens if an allocation gets failed. Basically,
+ * an "overflow" path is triggered to purge lazily freed
+ * areas to free some memory, then, the "retry" path is
+ * triggered to repeat one more time. See more details
+ * in alloc_vmap_area() function.
+ */
+ lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
+ if (!lva)
+ return -1;
+ }
+
+ /*
+ * Build the remainder.
+ */
+ lva->va_start = va->va_start;
+ lva->va_end = nva_start_addr;
+
+ /*
+ * Shrink this VA to remaining size.
+ */
+ va->va_start = nva_start_addr + size;
+ } else {
+ return -1;
+ }
+
+ if (type != FL_FIT_TYPE) {
+ augment_tree_propagate_from(va);
+
+ if (lva) /* type == NE_FIT_TYPE */
+ insert_vmap_area_augment(lva, &va->rb_node,
+ &free_vmap_area_root, &free_vmap_area_list);
+ }
+
+ return 0;
+}
+
+/*
+ * Returns a start address of the newly allocated area, if success.
+ * Otherwise a vend is returned that indicates failure.
+ */
+static __always_inline unsigned long
+__alloc_vmap_area(unsigned long size, unsigned long align,
+ unsigned long vstart, unsigned long vend)
+{
+ unsigned long nva_start_addr;
+ struct vmap_area *va;
+ enum fit_type type;
+ int ret;
+
+ va = find_vmap_lowest_match(size, align, vstart);
+ if (unlikely(!va))
+ return vend;
+
+ if (va->va_start > vstart)
+ nva_start_addr = ALIGN(va->va_start, align);
+ else
+ nva_start_addr = ALIGN(vstart, align);
+
+ /* Check the "vend" restriction. */
+ if (nva_start_addr + size > vend)
+ return vend;
+
+ /* Classify what we have found. */
+ type = classify_va_fit_type(va, nva_start_addr, size);
+ if (WARN_ON_ONCE(type == NOTHING_FIT))
+ return vend;
+
+ /* Update the free vmap_area. */
+ ret = adjust_va_to_fit_type(va, nva_start_addr, size, type);
+ if (ret)
+ return vend;
+
+#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
+ find_vmap_lowest_match_check(size);
+#endif
+
+ return nva_start_addr;
+}
+
+/*
+ * Free a region of KVA allocated by alloc_vmap_area
+ */
+static void free_vmap_area(struct vmap_area *va)
+{
+ /*
+ * Remove from the busy tree/list.
+ */
+ spin_lock(&vmap_area_lock);
+ unlink_va(va, &vmap_area_root);
+ spin_unlock(&vmap_area_lock);
+
+ /*
+ * Insert/Merge it back to the free tree/list.
+ */
+ spin_lock(&free_vmap_area_lock);
+ merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list);
+ spin_unlock(&free_vmap_area_lock);
+}
+
+/*
+ * Allocate a region of KVA of the specified size and alignment, within the
+ * vstart and vend.
+ */
+static struct vmap_area *alloc_vmap_area(unsigned long size,
+ unsigned long align,
+ unsigned long vstart, unsigned long vend,
+ int node, gfp_t gfp_mask)
+{
+ struct vmap_area *va, *pva;
+ unsigned long addr;
+ int purged = 0;
+ int ret;
+
+ BUG_ON(!size);
+ BUG_ON(offset_in_page(size));
+ BUG_ON(!is_power_of_2(align));
+
+ if (unlikely(!vmap_initialized))
+ return ERR_PTR(-EBUSY);
+
+ might_sleep();
+ gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
+
+ va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
+ if (unlikely(!va))
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Only scan the relevant parts containing pointers to other objects
+ * to avoid false negatives.
+ */
+ kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
+
+retry:
+ /*
+ * Preload this CPU with one extra vmap_area object. It is used
+ * when fit type of free area is NE_FIT_TYPE. Please note, it
+ * does not guarantee that an allocation occurs on a CPU that
+ * is preloaded, instead we minimize the case when it is not.
+ * It can happen because of cpu migration, because there is a
+ * race until the below spinlock is taken.
+ *
+ * The preload is done in non-atomic context, thus it allows us
+ * to use more permissive allocation masks to be more stable under
+ * low memory condition and high memory pressure. In rare case,
+ * if not preloaded, GFP_NOWAIT is used.
+ *
+ * Set "pva" to NULL here, because of "retry" path.
+ */
+ pva = NULL;
+
+ if (!this_cpu_read(ne_fit_preload_node))
+ /*
+ * Even if it fails we do not really care about that.
+ * Just proceed as it is. If needed "overflow" path
+ * will refill the cache we allocate from.
+ */
+ pva = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
+
+ spin_lock(&free_vmap_area_lock);
+
+ if (pva && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva))
+ kmem_cache_free(vmap_area_cachep, pva);
+
+ /*
+ * If an allocation fails, the "vend" address is
+ * returned. Therefore trigger the overflow path.
+ */
+ addr = __alloc_vmap_area(size, align, vstart, vend);
+ spin_unlock(&free_vmap_area_lock);
+
+ if (unlikely(addr == vend))
+ goto overflow;
+
+ va->va_start = addr;
+ va->va_end = addr + size;
+ va->vm = NULL;
+
+
+ spin_lock(&vmap_area_lock);
+ insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+ spin_unlock(&vmap_area_lock);
+
+ BUG_ON(!IS_ALIGNED(va->va_start, align));
+ BUG_ON(va->va_start < vstart);
+ BUG_ON(va->va_end > vend);
+
+ ret = kasan_populate_vmalloc(addr, size);
+ if (ret) {
+ free_vmap_area(va);
+ return ERR_PTR(ret);
+ }
+
+ return va;
+
+overflow:
+ if (!purged) {
+ purge_vmap_area_lazy();
+ purged = 1;
+ goto retry;
+ }
+
+ if (gfpflags_allow_blocking(gfp_mask)) {
+ unsigned long freed = 0;
+ blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
+ if (freed > 0) {
+ purged = 0;
+ goto retry;
+ }
+ }
+
+ if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
+ pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
+ size);
+
+ kmem_cache_free(vmap_area_cachep, va);
+ return ERR_PTR(-EBUSY);
+}
+
+int register_vmap_purge_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&vmap_notify_list, nb);
+}
+EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);
+
+int unregister_vmap_purge_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&vmap_notify_list, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
+
+/*
+ * lazy_max_pages is the maximum amount of virtual address space we gather up
+ * before attempting to purge with a TLB flush.
+ *
+ * There is a tradeoff here: a larger number will cover more kernel page tables
+ * and take slightly longer to purge, but it will linearly reduce the number of
+ * global TLB flushes that must be performed. It would seem natural to scale
+ * this number up linearly with the number of CPUs (because vmapping activity
+ * could also scale linearly with the number of CPUs), however it is likely
+ * that in practice, workloads might be constrained in other ways that mean
+ * vmap activity will not scale linearly with CPUs. Also, I want to be
+ * conservative and not introduce a big latency on huge systems, so go with
+ * a less aggressive log scale. It will still be an improvement over the old
+ * code, and it will be simple to change the scale factor if we find that it
+ * becomes a problem on bigger systems.
+ */
+static unsigned long lazy_max_pages(void)
+{
+ unsigned int log;
+
+ log = fls(num_online_cpus());
+
+ return log * (32UL * 1024 * 1024 / PAGE_SIZE);
+}
+
+static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
+
+/*
+ * Serialize vmap purging. There is no actual criticial section protected
+ * by this look, but we want to avoid concurrent calls for performance
+ * reasons and to make the pcpu_get_vm_areas more deterministic.
+ */
+static DEFINE_MUTEX(vmap_purge_lock);
+
+/* for per-CPU blocks */
+static void purge_fragmented_blocks_allcpus(void);
+
+/*
+ * called before a call to iounmap() if the caller wants vm_area_struct's
+ * immediately freed.
+ */
+void set_iounmap_nonlazy(void)
+{
+ atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1);
+}
+
+/*
+ * Purges all lazily-freed vmap areas.
+ */
+static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
+{
+ unsigned long resched_threshold;
+ struct llist_node *valist;
+ struct vmap_area *va;
+ struct vmap_area *n_va;
+
+ lockdep_assert_held(&vmap_purge_lock);
+
+ valist = llist_del_all(&vmap_purge_list);
+ if (unlikely(valist == NULL))
+ return false;
+
+ /*
+ * TODO: to calculate a flush range without looping.
+ * The list can be up to lazy_max_pages() elements.
+ */
+ llist_for_each_entry(va, valist, purge_list) {
+ if (va->va_start < start)
+ start = va->va_start;
+ if (va->va_end > end)
+ end = va->va_end;
+ }
+
+ flush_tlb_kernel_range(start, end);
+ resched_threshold = lazy_max_pages() << 1;
+
+ spin_lock(&free_vmap_area_lock);
+ llist_for_each_entry_safe(va, n_va, valist, purge_list) {
+ unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
+ unsigned long orig_start = va->va_start;
+ unsigned long orig_end = va->va_end;
+
+ /*
+ * Finally insert or merge lazily-freed area. It is
+ * detached and there is no need to "unlink" it from
+ * anything.
+ */
+ va = merge_or_add_vmap_area(va, &free_vmap_area_root,
+ &free_vmap_area_list);
+
+ if (!va)
+ continue;
+
+ if (is_vmalloc_or_module_addr((void *)orig_start))
+ kasan_release_vmalloc(orig_start, orig_end,
+ va->va_start, va->va_end);
+
+ atomic_long_sub(nr, &vmap_lazy_nr);
+
+ if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
+ cond_resched_lock(&free_vmap_area_lock);
+ }
+ spin_unlock(&free_vmap_area_lock);
+ return true;
+}
+
+/*
+ * Kick off a purge of the outstanding lazy areas. Don't bother if somebody
+ * is already purging.
+ */
+static void try_purge_vmap_area_lazy(void)
+{
+ if (mutex_trylock(&vmap_purge_lock)) {
+ __purge_vmap_area_lazy(ULONG_MAX, 0);
+ mutex_unlock(&vmap_purge_lock);
+ }
+}
+
+/*
+ * Kick off a purge of the outstanding lazy areas.
+ */
+static void purge_vmap_area_lazy(void)
+{
+ mutex_lock(&vmap_purge_lock);
+ purge_fragmented_blocks_allcpus();
+ __purge_vmap_area_lazy(ULONG_MAX, 0);
+ mutex_unlock(&vmap_purge_lock);
+}
+
+/*
+ * Free a vmap area, caller ensuring that the area has been unmapped
+ * and flush_cache_vunmap had been called for the correct range
+ * previously.
+ */
+static void free_vmap_area_noflush(struct vmap_area *va)
+{
+ unsigned long nr_lazy;
+
+ spin_lock(&vmap_area_lock);
+ unlink_va(va, &vmap_area_root);
+ spin_unlock(&vmap_area_lock);
+
+ nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
+ PAGE_SHIFT, &vmap_lazy_nr);
+
+ /* After this point, we may free va at any time */
+ llist_add(&va->purge_list, &vmap_purge_list);
+
+ if (unlikely(nr_lazy > lazy_max_pages()))
+ try_purge_vmap_area_lazy();
+}
+
+/*
+ * Free and unmap a vmap area
+ */
+static void free_unmap_vmap_area(struct vmap_area *va)
+{
+ flush_cache_vunmap(va->va_start, va->va_end);
+ unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start);
+ if (debug_pagealloc_enabled_static())
+ flush_tlb_kernel_range(va->va_start, va->va_end);
+
+ free_vmap_area_noflush(va);
+}
+
+static struct vmap_area *find_vmap_area(unsigned long addr)
+{
+ struct vmap_area *va;
+
+ spin_lock(&vmap_area_lock);
+ va = __find_vmap_area(addr);
+ spin_unlock(&vmap_area_lock);
+
+ return va;
+}
+
+/*** Per cpu kva allocator ***/
+
+/*
+ * vmap space is limited especially on 32 bit architectures. Ensure there is
+ * room for at least 16 percpu vmap blocks per CPU.
+ */
+/*
+ * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
+ * to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess
+ * instead (we just need a rough idea)
+ */
+#if BITS_PER_LONG == 32
+#define VMALLOC_SPACE (128UL*1024*1024)
+#else
+#define VMALLOC_SPACE (128UL*1024*1024*1024)
+#endif
+
+#define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE)
+#define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */
+#define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */
+#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2)
+#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */
+#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */
+#define VMAP_BBMAP_BITS \
+ VMAP_MIN(VMAP_BBMAP_BITS_MAX, \
+ VMAP_MAX(VMAP_BBMAP_BITS_MIN, \
+ VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))
+
+#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
+
+struct vmap_block_queue {
+ spinlock_t lock;
+ struct list_head free;
+};
+
+struct vmap_block {
+ spinlock_t lock;
+ struct vmap_area *va;
+ unsigned long free, dirty;
+ unsigned long dirty_min, dirty_max; /*< dirty range */
+ struct list_head free_list;
+ struct rcu_head rcu_head;
+ struct list_head purge;
+};
+
+/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
+static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
+
+/*
+ * XArray of vmap blocks, indexed by address, to quickly find a vmap block
+ * in the free path. Could get rid of this if we change the API to return a
+ * "cookie" from alloc, to be passed to free. But no big deal yet.
+ */
+static DEFINE_XARRAY(vmap_blocks);
+
+/*
+ * We should probably have a fallback mechanism to allocate virtual memory
+ * out of partially filled vmap blocks. However vmap block sizing should be
+ * fairly reasonable according to the vmalloc size, so it shouldn't be a
+ * big problem.
+ */
+
+static unsigned long addr_to_vb_idx(unsigned long addr)
+{
+ addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
+ addr /= VMAP_BLOCK_SIZE;
+ return addr;
+}
+
+static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
+{
+ unsigned long addr;
+
+ addr = va_start + (pages_off << PAGE_SHIFT);
+ BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
+ return (void *)addr;
+}
+
+/**
+ * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this
+ * block. Of course pages number can't exceed VMAP_BBMAP_BITS
+ * @order: how many 2^order pages should be occupied in newly allocated block
+ * @gfp_mask: flags for the page level allocator
+ *
+ * Return: virtual address in a newly allocated block or ERR_PTR(-errno)
+ */
+static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
+{
+ struct vmap_block_queue *vbq;
+ struct vmap_block *vb;
+ struct vmap_area *va;
+ unsigned long vb_idx;
+ int node, err;
+ void *vaddr;
+
+ node = numa_node_id();
+
+ vb = kmalloc_node(sizeof(struct vmap_block),
+ gfp_mask & GFP_RECLAIM_MASK, node);
+ if (unlikely(!vb))
+ return ERR_PTR(-ENOMEM);
+
+ va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
+ VMALLOC_START, VMALLOC_END,
+ node, gfp_mask);
+ if (IS_ERR(va)) {
+ kfree(vb);
+ return ERR_CAST(va);
+ }
+
+ vaddr = vmap_block_vaddr(va->va_start, 0);
+ spin_lock_init(&vb->lock);
+ vb->va = va;
+ /* At least something should be left free */
+ BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
+ vb->free = VMAP_BBMAP_BITS - (1UL << order);
+ vb->dirty = 0;
+ vb->dirty_min = VMAP_BBMAP_BITS;
+ vb->dirty_max = 0;
+ INIT_LIST_HEAD(&vb->free_list);
+
+ vb_idx = addr_to_vb_idx(va->va_start);
+ err = xa_insert(&vmap_blocks, vb_idx, vb, gfp_mask);
+ if (err) {
+ kfree(vb);
+ free_vmap_area(va);
+ return ERR_PTR(err);
+ }
+
+ vbq = &get_cpu_var(vmap_block_queue);
+ spin_lock(&vbq->lock);
+ list_add_tail_rcu(&vb->free_list, &vbq->free);
+ spin_unlock(&vbq->lock);
+ put_cpu_var(vmap_block_queue);
+
+ return vaddr;
+}
+
+static void free_vmap_block(struct vmap_block *vb)
+{
+ struct vmap_block *tmp;
+
+ tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start));
+ BUG_ON(tmp != vb);
+
+ free_vmap_area_noflush(vb->va);
+ kfree_rcu(vb, rcu_head);
+}
+
+static void purge_fragmented_blocks(int cpu)
+{
+ LIST_HEAD(purge);
+ struct vmap_block *vb;
+ struct vmap_block *n_vb;
+ struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(vb, &vbq->free, free_list) {
+
+ if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
+ continue;
+
+ spin_lock(&vb->lock);
+ if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
+ vb->free = 0; /* prevent further allocs after releasing lock */
+ vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
+ vb->dirty_min = 0;
+ vb->dirty_max = VMAP_BBMAP_BITS;
+ spin_lock(&vbq->lock);
+ list_del_rcu(&vb->free_list);
+ spin_unlock(&vbq->lock);
+ spin_unlock(&vb->lock);
+ list_add_tail(&vb->purge, &purge);
+ } else
+ spin_unlock(&vb->lock);
+ }
+ rcu_read_unlock();
+
+ list_for_each_entry_safe(vb, n_vb, &purge, purge) {
+ list_del(&vb->purge);
+ free_vmap_block(vb);
+ }
+}
+
+static void purge_fragmented_blocks_allcpus(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ purge_fragmented_blocks(cpu);
+}
+
+static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
+{
+ struct vmap_block_queue *vbq;
+ struct vmap_block *vb;
+ void *vaddr = NULL;
+ unsigned int order;
+
+ BUG_ON(offset_in_page(size));
+ BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
+ if (WARN_ON(size == 0)) {
+ /*
+ * Allocating 0 bytes isn't what caller wants since
+ * get_order(0) returns funny result. Just warn and terminate
+ * early.
+ */
+ return NULL;
+ }
+ order = get_order(size);
+
+ rcu_read_lock();
+ vbq = &get_cpu_var(vmap_block_queue);
+ list_for_each_entry_rcu(vb, &vbq->free, free_list) {
+ unsigned long pages_off;
+
+ spin_lock(&vb->lock);
+ if (vb->free < (1UL << order)) {
+ spin_unlock(&vb->lock);
+ continue;
+ }
+
+ pages_off = VMAP_BBMAP_BITS - vb->free;
+ vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
+ vb->free -= 1UL << order;
+ if (vb->free == 0) {
+ spin_lock(&vbq->lock);
+ list_del_rcu(&vb->free_list);
+ spin_unlock(&vbq->lock);
+ }
+
+ spin_unlock(&vb->lock);
+ break;
+ }
+
+ put_cpu_var(vmap_block_queue);
+ rcu_read_unlock();
+
+ /* Allocate new block if nothing was found */
+ if (!vaddr)
+ vaddr = new_vmap_block(order, gfp_mask);
+
+ return vaddr;
+}
+
+static void vb_free(unsigned long addr, unsigned long size)
+{
+ unsigned long offset;
+ unsigned int order;
+ struct vmap_block *vb;
+
+ BUG_ON(offset_in_page(size));
+ BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
+
+ flush_cache_vunmap(addr, addr + size);
+
+ order = get_order(size);
+ offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
+ vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr));
+
+ unmap_kernel_range_noflush(addr, size);
+
+ if (debug_pagealloc_enabled_static())
+ flush_tlb_kernel_range(addr, addr + size);
+
+ spin_lock(&vb->lock);
+
+ /* Expand dirty range */
+ vb->dirty_min = min(vb->dirty_min, offset);
+ vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));
+
+ vb->dirty += 1UL << order;
+ if (vb->dirty == VMAP_BBMAP_BITS) {
+ BUG_ON(vb->free);
+ spin_unlock(&vb->lock);
+ free_vmap_block(vb);
+ } else
+ spin_unlock(&vb->lock);
+}
+
+static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
+{
+ int cpu;
+
+ if (unlikely(!vmap_initialized))
+ return;
+
+ might_sleep();
+
+ for_each_possible_cpu(cpu) {
+ struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
+ struct vmap_block *vb;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(vb, &vbq->free, free_list) {
+ spin_lock(&vb->lock);
+ if (vb->dirty) {
+ unsigned long va_start = vb->va->va_start;
+ unsigned long s, e;
+
+ s = va_start + (vb->dirty_min << PAGE_SHIFT);
+ e = va_start + (vb->dirty_max << PAGE_SHIFT);
+
+ start = min(s, start);
+ end = max(e, end);
+
+ flush = 1;
+ }
+ spin_unlock(&vb->lock);
+ }
+ rcu_read_unlock();
+ }
+
+ mutex_lock(&vmap_purge_lock);
+ purge_fragmented_blocks_allcpus();
+ if (!__purge_vmap_area_lazy(start, end) && flush)
+ flush_tlb_kernel_range(start, end);
+ mutex_unlock(&vmap_purge_lock);
+}
+
+/**
+ * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
+ *
+ * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
+ * to amortize TLB flushing overheads. What this means is that any page you
+ * have now, may, in a former life, have been mapped into kernel virtual
+ * address by the vmap layer and so there might be some CPUs with TLB entries
+ * still referencing that page (additional to the regular 1:1 kernel mapping).
+ *
+ * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
+ * be sure that none of the pages we have control over will have any aliases
+ * from the vmap layer.
+ */
+void vm_unmap_aliases(void)
+{
+ unsigned long start = ULONG_MAX, end = 0;
+ int flush = 0;
+
+ _vm_unmap_aliases(start, end, flush);
+}
+EXPORT_SYMBOL_GPL(vm_unmap_aliases);
+
+/**
+ * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
+ * @mem: the pointer returned by vm_map_ram
+ * @count: the count passed to that vm_map_ram call (cannot unmap partial)
+ */
+void vm_unmap_ram(const void *mem, unsigned int count)
+{
+ unsigned long size = (unsigned long)count << PAGE_SHIFT;
+ unsigned long addr = (unsigned long)mem;
+ struct vmap_area *va;
+
+ might_sleep();
+ BUG_ON(!addr);
+ BUG_ON(addr < VMALLOC_START);
+ BUG_ON(addr > VMALLOC_END);
+ BUG_ON(!PAGE_ALIGNED(addr));
+
+ kasan_poison_vmalloc(mem, size);
+
+ if (likely(count <= VMAP_MAX_ALLOC)) {
+ debug_check_no_locks_freed(mem, size);
+ vb_free(addr, size);
+ return;
+ }
+
+ va = find_vmap_area(addr);
+ BUG_ON(!va);
+ debug_check_no_locks_freed((void *)va->va_start,
+ (va->va_end - va->va_start));
+ free_unmap_vmap_area(va);
+}
+EXPORT_SYMBOL(vm_unmap_ram);
+
+/**
+ * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
+ * @pages: an array of pointers to the pages to be mapped
+ * @count: number of pages
+ * @node: prefer to allocate data structures on this node
+ *
+ * If you use this function for less than VMAP_MAX_ALLOC pages, it could be
+ * faster than vmap so it's good. But if you mix long-life and short-life
+ * objects with vm_map_ram(), it could consume lots of address space through
+ * fragmentation (especially on a 32bit machine). You could see failures in
+ * the end. Please use this function for short-lived objects.
+ *
+ * Returns: a pointer to the address that has been mapped, or %NULL on failure
+ */
+void *vm_map_ram(struct page **pages, unsigned int count, int node)
+{
+ unsigned long size = (unsigned long)count << PAGE_SHIFT;
+ unsigned long addr;
+ void *mem;
+
+ if (likely(count <= VMAP_MAX_ALLOC)) {
+ mem = vb_alloc(size, GFP_KERNEL);
+ if (IS_ERR(mem))
+ return NULL;
+ addr = (unsigned long)mem;
+ } else {
+ struct vmap_area *va;
+ va = alloc_vmap_area(size, PAGE_SIZE,
+ VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
+ if (IS_ERR(va))
+ return NULL;
+
+ addr = va->va_start;
+ mem = (void *)addr;
+ }
+
+ kasan_unpoison_vmalloc(mem, size);
+
+ if (map_kernel_range(addr, size, PAGE_KERNEL, pages) < 0) {
+ vm_unmap_ram(mem, count);
+ return NULL;
+ }
+ return mem;
+}
+EXPORT_SYMBOL(vm_map_ram);
+
+static struct vm_struct *vmlist __initdata;
+
+/**
+ * vm_area_add_early - add vmap area early during boot
+ * @vm: vm_struct to add
+ *
+ * This function is used to add fixed kernel vm area to vmlist before
+ * vmalloc_init() is called. @vm->addr, @vm->size, and @vm->flags
+ * should contain proper values and the other fields should be zero.
+ *
+ * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
+ */
+void __init vm_area_add_early(struct vm_struct *vm)
+{
+ struct vm_struct *tmp, **p;
+
+ BUG_ON(vmap_initialized);
+ for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
+ if (tmp->addr >= vm->addr) {
+ BUG_ON(tmp->addr < vm->addr + vm->size);
+ break;
+ } else
+ BUG_ON(tmp->addr + tmp->size > vm->addr);
+ }
+ vm->next = *p;
+ *p = vm;
+}
+
+/**
+ * vm_area_register_early - register vmap area early during boot
+ * @vm: vm_struct to register
+ * @align: requested alignment
+ *
+ * This function is used to register kernel vm area before
+ * vmalloc_init() is called. @vm->size and @vm->flags should contain
+ * proper values on entry and other fields should be zero. On return,
+ * vm->addr contains the allocated address.
+ *
+ * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
+ */
+void __init vm_area_register_early(struct vm_struct *vm, size_t align)
+{
+ static size_t vm_init_off __initdata;
+ unsigned long addr;
+
+ addr = ALIGN(VMALLOC_START + vm_init_off, align);
+ vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START;
+
+ vm->addr = (void *)addr;
+
+ vm_area_add_early(vm);
+}
+
+static void vmap_init_free_space(void)
+{
+ unsigned long vmap_start = 1;
+ const unsigned long vmap_end = ULONG_MAX;
+ struct vmap_area *busy, *free;
+
+ /*
+ * B F B B B F
+ * -|-----|.....|-----|-----|-----|.....|-
+ * | The KVA space |
+ * |<--------------------------------->|
+ */
+ list_for_each_entry(busy, &vmap_area_list, list) {
+ if (busy->va_start - vmap_start > 0) {
+ free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
+ if (!WARN_ON_ONCE(!free)) {
+ free->va_start = vmap_start;
+ free->va_end = busy->va_start;
+
+ insert_vmap_area_augment(free, NULL,
+ &free_vmap_area_root,
+ &free_vmap_area_list);
+ }
+ }
+
+ vmap_start = busy->va_end;
+ }
+
+ if (vmap_end - vmap_start > 0) {
+ free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
+ if (!WARN_ON_ONCE(!free)) {
+ free->va_start = vmap_start;
+ free->va_end = vmap_end;
+
+ insert_vmap_area_augment(free, NULL,
+ &free_vmap_area_root,
+ &free_vmap_area_list);
+ }
+ }
+}
+
+void __init vmalloc_init(void)
+{
+ struct vmap_area *va;
+ struct vm_struct *tmp;
+ int i;
+
+ /*
+ * Create the cache for vmap_area objects.
+ */
+ vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
+
+ for_each_possible_cpu(i) {
+ struct vmap_block_queue *vbq;
+ struct vfree_deferred *p;
+
+ vbq = &per_cpu(vmap_block_queue, i);
+ spin_lock_init(&vbq->lock);
+ INIT_LIST_HEAD(&vbq->free);
+ p = &per_cpu(vfree_deferred, i);
+ init_llist_head(&p->list);
+ INIT_WORK(&p->wq, free_work);
+ }
+
+ /* Import existing vmlist entries. */
+ for (tmp = vmlist; tmp; tmp = tmp->next) {
+ va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
+ if (WARN_ON_ONCE(!va))
+ continue;
+
+ va->va_start = (unsigned long)tmp->addr;
+ va->va_end = va->va_start + tmp->size;
+ va->vm = tmp;
+ insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+ }
+
+ /*
+ * Now we can initialize a free vmap space.
+ */
+ vmap_init_free_space();
+ vmap_initialized = true;
+}
+
+/**
+ * unmap_kernel_range - unmap kernel VM area and flush cache and TLB
+ * @addr: start of the VM area to unmap
+ * @size: size of the VM area to unmap
+ *
+ * Similar to unmap_kernel_range_noflush() but flushes vcache before
+ * the unmapping and tlb after.
+ */
+void unmap_kernel_range(unsigned long addr, unsigned long size)
+{
+ unsigned long end = addr + size;
+
+ flush_cache_vunmap(addr, end);
+ unmap_kernel_range_noflush(addr, size);
+ flush_tlb_kernel_range(addr, end);
+}
+
+static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
+ struct vmap_area *va, unsigned long flags, const void *caller)
+{
+ vm->flags = flags;
+ vm->addr = (void *)va->va_start;
+ vm->size = va->va_end - va->va_start;
+ vm->caller = caller;
+ va->vm = vm;
+}
+
+static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
+ unsigned long flags, const void *caller)
+{
+ spin_lock(&vmap_area_lock);
+ setup_vmalloc_vm_locked(vm, va, flags, caller);
+ spin_unlock(&vmap_area_lock);
+}
+
+static void clear_vm_uninitialized_flag(struct vm_struct *vm)
+{
+ /*
+ * Before removing VM_UNINITIALIZED,
+ * we should make sure that vm has proper values.
+ * Pair with smp_rmb() in show_numa_info().
+ */
+ smp_wmb();
+ vm->flags &= ~VM_UNINITIALIZED;
+}
+
+static struct vm_struct *__get_vm_area_node(unsigned long size,
+ unsigned long align, unsigned long flags, unsigned long start,
+ unsigned long end, int node, gfp_t gfp_mask, const void *caller)
+{
+ struct vmap_area *va;
+ struct vm_struct *area;
+ unsigned long requested_size = size;
+
+ BUG_ON(in_interrupt());
+ size = PAGE_ALIGN(size);
+ if (unlikely(!size))
+ return NULL;
+
+ if (flags & VM_IOREMAP)
+ align = 1ul << clamp_t(int, get_count_order_long(size),
+ PAGE_SHIFT, IOREMAP_MAX_ORDER);
+
+ area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
+ if (unlikely(!area))
+ return NULL;
+
+ if (!(flags & VM_NO_GUARD))
+ size += PAGE_SIZE;
+
+ va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
+ if (IS_ERR(va)) {
+ kfree(area);
+ return NULL;
+ }
+
+ kasan_unpoison_vmalloc((void *)va->va_start, requested_size);
+
+ setup_vmalloc_vm(area, va, flags, caller);
+
+ return area;
+}
+
+struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
+ unsigned long start, unsigned long end,
+ const void *caller)
+{
+ return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
+ GFP_KERNEL, caller);
+}
+
+/**
+ * get_vm_area - reserve a contiguous kernel virtual area
+ * @size: size of the area
+ * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
+ *
+ * Search an area of @size in the kernel virtual mapping area,
+ * and reserved it for out purposes. Returns the area descriptor
+ * on success or %NULL on failure.
+ *
+ * Return: the area descriptor on success or %NULL on failure.
+ */
+struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
+{
+ return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
+ NUMA_NO_NODE, GFP_KERNEL,
+ __builtin_return_address(0));
+}
+
+struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
+ const void *caller)
+{
+ return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
+ NUMA_NO_NODE, GFP_KERNEL, caller);
+}
+
+/**
+ * find_vm_area - find a continuous kernel virtual area
+ * @addr: base address
+ *
+ * Search for the kernel VM area starting at @addr, and return it.
+ * It is up to the caller to do all required locking to keep the returned
+ * pointer valid.
+ *
+ * Return: the area descriptor on success or %NULL on failure.
+ */
+struct vm_struct *find_vm_area(const void *addr)
+{
+ struct vmap_area *va;
+
+ va = find_vmap_area((unsigned long)addr);
+ if (!va)
+ return NULL;
+
+ return va->vm;
+}
+
+/**
+ * remove_vm_area - find and remove a continuous kernel virtual area
+ * @addr: base address
+ *
+ * Search for the kernel VM area starting at @addr, and remove it.
+ * This function returns the found VM area, but using it is NOT safe
+ * on SMP machines, except for its size or flags.
+ *
+ * Return: the area descriptor on success or %NULL on failure.
+ */
+struct vm_struct *remove_vm_area(const void *addr)
+{
+ struct vmap_area *va;
+
+ might_sleep();
+
+ spin_lock(&vmap_area_lock);
+ va = __find_vmap_area((unsigned long)addr);
+ if (va && va->vm) {
+ struct vm_struct *vm = va->vm;
+
+ va->vm = NULL;
+ spin_unlock(&vmap_area_lock);
+
+ kasan_free_shadow(vm);
+ free_unmap_vmap_area(va);
+
+ return vm;
+ }
+
+ spin_unlock(&vmap_area_lock);
+ return NULL;
+}
+
+static inline void set_area_direct_map(const struct vm_struct *area,
+ int (*set_direct_map)(struct page *page))
+{
+ int i;
+
+ for (i = 0; i < area->nr_pages; i++)
+ if (page_address(area->pages[i]))
+ set_direct_map(area->pages[i]);
+}
+
+/* Handle removing and resetting vm mappings related to the vm_struct. */
+static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
+{
+ unsigned long start = ULONG_MAX, end = 0;
+ int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
+ int flush_dmap = 0;
+ int i;
+
+ remove_vm_area(area->addr);
+
+ /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */
+ if (!flush_reset)
+ return;
+
+ /*
+ * If not deallocating pages, just do the flush of the VM area and
+ * return.
+ */
+ if (!deallocate_pages) {
+ vm_unmap_aliases();
+ return;
+ }
+
+ /*
+ * If execution gets here, flush the vm mapping and reset the direct
+ * map. Find the start and end range of the direct mappings to make sure
+ * the vm_unmap_aliases() flush includes the direct map.
+ */
+ for (i = 0; i < area->nr_pages; i++) {
+ unsigned long addr = (unsigned long)page_address(area->pages[i]);
+ if (addr) {
+ start = min(addr, start);
+ end = max(addr + PAGE_SIZE, end);
+ flush_dmap = 1;
+ }
+ }
+
+ /*
+ * Set direct map to something invalid so that it won't be cached if
+ * there are any accesses after the TLB flush, then flush the TLB and
+ * reset the direct map permissions to the default.
+ */
+ set_area_direct_map(area, set_direct_map_invalid_noflush);
+ _vm_unmap_aliases(start, end, flush_dmap);
+ set_area_direct_map(area, set_direct_map_default_noflush);
+}
+
+static void __vunmap(const void *addr, int deallocate_pages)
+{
+ struct vm_struct *area;
+
+ if (!addr)
+ return;
+
+ if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
+ addr))
+ return;
+
+ area = find_vm_area(addr);
+ if (unlikely(!area)) {
+ WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
+ addr);
+ return;
+ }
+
+ debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
+ debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
+
+ kasan_poison_vmalloc(area->addr, get_vm_area_size(area));
+
+ vm_remove_mappings(area, deallocate_pages);
+
+ if (deallocate_pages) {
+ int i;
+
+ for (i = 0; i < area->nr_pages; i++) {
+ struct page *page = area->pages[i];
+
+ BUG_ON(!page);
+ __free_pages(page, 0);
+ }
+ atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
+
+ kvfree(area->pages);
+ }
+
+ kfree(area);
+ return;
+}
+
+static inline void __vfree_deferred(const void *addr)
+{
+ /*
+ * Use raw_cpu_ptr() because this can be called from preemptible
+ * context. Preemption is absolutely fine here, because the llist_add()
+ * implementation is lockless, so it works even if we are adding to
+ * another cpu's list. schedule_work() should be fine with this too.
+ */
+ struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
+
+ if (llist_add((struct llist_node *)addr, &p->list))
+ schedule_work(&p->wq);
+}
+
+/**
+ * vfree_atomic - release memory allocated by vmalloc()
+ * @addr: memory base address
+ *
+ * This one is just like vfree() but can be called in any atomic context
+ * except NMIs.
+ */
+void vfree_atomic(const void *addr)
+{
+ BUG_ON(in_nmi());
+
+ kmemleak_free(addr);
+
+ if (!addr)
+ return;
+ __vfree_deferred(addr);
+}
+
+static void __vfree(const void *addr)
+{
+ if (unlikely(in_interrupt()))
+ __vfree_deferred(addr);
+ else
+ __vunmap(addr, 1);
+}
+
+/**
+ * vfree - Release memory allocated by vmalloc()
+ * @addr: Memory base address
+ *
+ * Free the virtually continuous memory area starting at @addr, as obtained
+ * from one of the vmalloc() family of APIs. This will usually also free the
+ * physical memory underlying the virtual allocation, but that memory is
+ * reference counted, so it will not be freed until the last user goes away.
+ *
+ * If @addr is NULL, no operation is performed.
+ *
+ * Context:
+ * May sleep if called *not* from interrupt context.
+ * Must not be called in NMI context (strictly speaking, it could be
+ * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
+ * conventions for vfree() arch-depenedent would be a really bad idea).
+ */
+void vfree(const void *addr)
+{
+ BUG_ON(in_nmi());
+
+ kmemleak_free(addr);
+
+ might_sleep_if(!in_interrupt());
+
+ if (!addr)
+ return;
+
+ __vfree(addr);
+}
+EXPORT_SYMBOL(vfree);
+
+/**
+ * vunmap - release virtual mapping obtained by vmap()
+ * @addr: memory base address
+ *
+ * Free the virtually contiguous memory area starting at @addr,
+ * which was created from the page array passed to vmap().
+ *
+ * Must not be called in interrupt context.
+ */
+void vunmap(const void *addr)
+{
+ BUG_ON(in_interrupt());
+ might_sleep();
+ if (addr)
+ __vunmap(addr, 0);
+}
+EXPORT_SYMBOL(vunmap);
+
+/**
+ * vmap - map an array of pages into virtually contiguous space
+ * @pages: array of page pointers
+ * @count: number of pages to map
+ * @flags: vm_area->flags
+ * @prot: page protection for the mapping
+ *
+ * Maps @count pages from @pages into contiguous kernel virtual space.
+ * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself
+ * (which must be kmalloc or vmalloc memory) and one reference per pages in it
+ * are transferred from the caller to vmap(), and will be freed / dropped when
+ * vfree() is called on the return value.
+ *
+ * Return: the address of the area or %NULL on failure
+ */
+void *vmap(struct page **pages, unsigned int count,
+ unsigned long flags, pgprot_t prot)
+{
+ struct vm_struct *area;
+ unsigned long size; /* In bytes */
+
+ might_sleep();
+
+ if (count > totalram_pages())
+ return NULL;
+
+ size = (unsigned long)count << PAGE_SHIFT;
+ area = get_vm_area_caller(size, flags, __builtin_return_address(0));
+ if (!area)
+ return NULL;
+
+ if (map_kernel_range((unsigned long)area->addr, size, pgprot_nx(prot),
+ pages) < 0) {
+ vunmap(area->addr);
+ return NULL;
+ }
+
+ if (flags & VM_MAP_PUT_PAGES) {
+ area->pages = pages;
+ area->nr_pages = count;
+ }
+ return area->addr;
+}
+EXPORT_SYMBOL(vmap);
+
+#ifdef CONFIG_VMAP_PFN
+struct vmap_pfn_data {
+ unsigned long *pfns;
+ pgprot_t prot;
+ unsigned int idx;
+};
+
+static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private)
+{
+ struct vmap_pfn_data *data = private;
+
+ if (WARN_ON_ONCE(pfn_valid(data->pfns[data->idx])))
+ return -EINVAL;
+ *pte = pte_mkspecial(pfn_pte(data->pfns[data->idx++], data->prot));
+ return 0;
+}
+
+/**
+ * vmap_pfn - map an array of PFNs into virtually contiguous space
+ * @pfns: array of PFNs
+ * @count: number of pages to map
+ * @prot: page protection for the mapping
+ *
+ * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns
+ * the start address of the mapping.
+ */
+void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot)
+{
+ struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) };
+ struct vm_struct *area;
+
+ area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP,
+ __builtin_return_address(0));
+ if (!area)
+ return NULL;
+ if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
+ count * PAGE_SIZE, vmap_pfn_apply, &data)) {
+ free_vm_area(area);
+ return NULL;
+ }
+
+ flush_cache_vmap((unsigned long)area->addr,
+ (unsigned long)area->addr + count * PAGE_SIZE);
+
+ return area->addr;
+}
+EXPORT_SYMBOL_GPL(vmap_pfn);
+#endif /* CONFIG_VMAP_PFN */
+
+static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
+ pgprot_t prot, int node)
+{
+ const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
+ unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
+ unsigned int array_size = nr_pages * sizeof(struct page *), i;
+ struct page **pages;
+
+ gfp_mask |= __GFP_NOWARN;
+ if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
+ gfp_mask |= __GFP_HIGHMEM;
+
+ /* Please note that the recursion is strictly bounded. */
+ if (array_size > PAGE_SIZE) {
+ pages = __vmalloc_node(array_size, 1, nested_gfp, node,
+ area->caller);
+ } else {
+ pages = kmalloc_node(array_size, nested_gfp, node);
+ }
+
+ if (!pages) {
+ remove_vm_area(area->addr);
+ kfree(area);
+ return NULL;
+ }
+
+ area->pages = pages;
+ area->nr_pages = nr_pages;
+
+ for (i = 0; i < area->nr_pages; i++) {
+ struct page *page;
+
+ if (node == NUMA_NO_NODE)
+ page = alloc_page(gfp_mask);
+ else
+ page = alloc_pages_node(node, gfp_mask, 0);
+
+ if (unlikely(!page)) {
+ /* Successfully allocated i pages, free them in __vfree() */
+ area->nr_pages = i;
+ atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
+ goto fail;
+ }
+ area->pages[i] = page;
+ if (gfpflags_allow_blocking(gfp_mask))
+ cond_resched();
+ }
+ atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
+
+ if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area),
+ prot, pages) < 0)
+ goto fail;
+
+ return area->addr;
+
+fail:
+ warn_alloc(gfp_mask, NULL,
+ "vmalloc: allocation failure, allocated %ld of %ld bytes",
+ (area->nr_pages*PAGE_SIZE), area->size);
+ __vfree(area->addr);
+ return NULL;
+}
+
+/**
+ * __vmalloc_node_range - allocate virtually contiguous memory
+ * @size: allocation size
+ * @align: desired alignment
+ * @start: vm area range start
+ * @end: vm area range end
+ * @gfp_mask: flags for the page level allocator
+ * @prot: protection mask for the allocated pages
+ * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD)
+ * @node: node to use for allocation or NUMA_NO_NODE
+ * @caller: caller's return address
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator with @gfp_mask flags. Map them into contiguous
+ * kernel virtual space, using a pagetable protection of @prot.
+ *
+ * Return: the address of the area or %NULL on failure
+ */
+void *__vmalloc_node_range(unsigned long size, unsigned long align,
+ unsigned long start, unsigned long end, gfp_t gfp_mask,
+ pgprot_t prot, unsigned long vm_flags, int node,
+ const void *caller)
+{
+ struct vm_struct *area;
+ void *addr;
+ unsigned long real_size = size;
+
+ size = PAGE_ALIGN(size);
+ if (!size || (size >> PAGE_SHIFT) > totalram_pages())
+ goto fail;
+
+ area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED |
+ vm_flags, start, end, node, gfp_mask, caller);
+ if (!area)
+ goto fail;
+
+ addr = __vmalloc_area_node(area, gfp_mask, prot, node);
+ if (!addr)
+ return NULL;
+
+ /*
+ * In this function, newly allocated vm_struct has VM_UNINITIALIZED
+ * flag. It means that vm_struct is not fully initialized.
+ * Now, it is fully initialized, so remove this flag here.
+ */
+ clear_vm_uninitialized_flag(area);
+
+ kmemleak_vmalloc(area, size, gfp_mask);
+
+ return addr;
+
+fail:
+ warn_alloc(gfp_mask, NULL,
+ "vmalloc: allocation failure: %lu bytes", real_size);
+ return NULL;
+}
+
+/**
+ * __vmalloc_node - allocate virtually contiguous memory
+ * @size: allocation size
+ * @align: desired alignment
+ * @gfp_mask: flags for the page level allocator
+ * @node: node to use for allocation or NUMA_NO_NODE
+ * @caller: caller's return address
+ *
+ * Allocate enough pages to cover @size from the page level allocator with
+ * @gfp_mask flags. Map them into contiguous kernel virtual space.
+ *
+ * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
+ * and __GFP_NOFAIL are not supported
+ *
+ * Any use of gfp flags outside of GFP_KERNEL should be consulted
+ * with mm people.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+void *__vmalloc_node(unsigned long size, unsigned long align,
+ gfp_t gfp_mask, int node, const void *caller)
+{
+ return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
+ gfp_mask, PAGE_KERNEL, 0, node, caller);
+}
+/*
+ * This is only for performance analysis of vmalloc and stress purpose.
+ * It is required by vmalloc test module, therefore do not use it other
+ * than that.
+ */
+#ifdef CONFIG_TEST_VMALLOC_MODULE
+EXPORT_SYMBOL_GPL(__vmalloc_node);
+#endif
+
+void *__vmalloc(unsigned long size, gfp_t gfp_mask)
+{
+ return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE,
+ __builtin_return_address(0));
+}
+EXPORT_SYMBOL(__vmalloc);
+
+/**
+ * vmalloc - allocate virtually contiguous memory
+ * @size: allocation size
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+void *vmalloc(unsigned long size)
+{
+ return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE,
+ __builtin_return_address(0));
+}
+EXPORT_SYMBOL(vmalloc);
+
+/**
+ * vzalloc - allocate virtually contiguous memory with zero fill
+ * @size: allocation size
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ * The memory allocated is set to zero.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+void *vzalloc(unsigned long size)
+{
+ return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE,
+ __builtin_return_address(0));
+}
+EXPORT_SYMBOL(vzalloc);
+
+/**
+ * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
+ * @size: allocation size
+ *
+ * The resulting memory area is zeroed so it can be mapped to userspace
+ * without leaking data.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+void *vmalloc_user(unsigned long size)
+{
+ return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
+ GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL,
+ VM_USERMAP, NUMA_NO_NODE,
+ __builtin_return_address(0));
+}
+EXPORT_SYMBOL(vmalloc_user);
+
+/**
+ * vmalloc_node - allocate memory on a specific node
+ * @size: allocation size
+ * @node: numa node
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+void *vmalloc_node(unsigned long size, int node)
+{
+ return __vmalloc_node(size, 1, GFP_KERNEL, node,
+ __builtin_return_address(0));
+}
+EXPORT_SYMBOL(vmalloc_node);
+
+/**
+ * vzalloc_node - allocate memory on a specific node with zero fill
+ * @size: allocation size
+ * @node: numa node
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ * The memory allocated is set to zero.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+void *vzalloc_node(unsigned long size, int node)
+{
+ return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node,
+ __builtin_return_address(0));
+}
+EXPORT_SYMBOL(vzalloc_node);
+
+#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
+#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
+#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
+#define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL)
+#else
+/*
+ * 64b systems should always have either DMA or DMA32 zones. For others
+ * GFP_DMA32 should do the right thing and use the normal zone.
+ */
+#define GFP_VMALLOC32 GFP_DMA32 | GFP_KERNEL
+#endif
+
+/**
+ * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
+ * @size: allocation size
+ *
+ * Allocate enough 32bit PA addressable pages to cover @size from the
+ * page level allocator and map them into contiguous kernel virtual space.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+void *vmalloc_32(unsigned long size)
+{
+ return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE,
+ __builtin_return_address(0));
+}
+EXPORT_SYMBOL(vmalloc_32);
+
+/**
+ * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
+ * @size: allocation size
+ *
+ * The resulting memory area is 32bit addressable and zeroed so it can be
+ * mapped to userspace without leaking data.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+void *vmalloc_32_user(unsigned long size)
+{
+ return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
+ GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
+ VM_USERMAP, NUMA_NO_NODE,
+ __builtin_return_address(0));
+}
+EXPORT_SYMBOL(vmalloc_32_user);
+
+/*
+ * small helper routine , copy contents to buf from addr.
+ * If the page is not present, fill zero.
+ */
+
+static int aligned_vread(char *buf, char *addr, unsigned long count)
+{
+ struct page *p;
+ int copied = 0;
+
+ while (count) {
+ unsigned long offset, length;
+
+ offset = offset_in_page(addr);
+ length = PAGE_SIZE - offset;
+ if (length > count)
+ length = count;
+ p = vmalloc_to_page(addr);
+ /*
+ * To do safe access to this _mapped_ area, we need
+ * lock. But adding lock here means that we need to add
+ * overhead of vmalloc()/vfree() calles for this _debug_
+ * interface, rarely used. Instead of that, we'll use
+ * kmap() and get small overhead in this access function.
+ */
+ if (p) {
+ /*
+ * we can expect USER0 is not used (see vread/vwrite's
+ * function description)
+ */
+ void *map = kmap_atomic(p);
+ memcpy(buf, map + offset, length);
+ kunmap_atomic(map);
+ } else
+ memset(buf, 0, length);
+
+ addr += length;
+ buf += length;
+ copied += length;
+ count -= length;
+ }
+ return copied;
+}
+
+static int aligned_vwrite(char *buf, char *addr, unsigned long count)
+{
+ struct page *p;
+ int copied = 0;
+
+ while (count) {
+ unsigned long offset, length;
+
+ offset = offset_in_page(addr);
+ length = PAGE_SIZE - offset;
+ if (length > count)
+ length = count;
+ p = vmalloc_to_page(addr);
+ /*
+ * To do safe access to this _mapped_ area, we need
+ * lock. But adding lock here means that we need to add
+ * overhead of vmalloc()/vfree() calles for this _debug_
+ * interface, rarely used. Instead of that, we'll use
+ * kmap() and get small overhead in this access function.
+ */
+ if (p) {
+ /*
+ * we can expect USER0 is not used (see vread/vwrite's
+ * function description)
+ */
+ void *map = kmap_atomic(p);
+ memcpy(map + offset, buf, length);
+ kunmap_atomic(map);
+ }
+ addr += length;
+ buf += length;
+ copied += length;
+ count -= length;
+ }
+ return copied;
+}
+
+/**
+ * vread() - read vmalloc area in a safe way.
+ * @buf: buffer for reading data
+ * @addr: vm address.
+ * @count: number of bytes to be read.
+ *
+ * This function checks that addr is a valid vmalloc'ed area, and
+ * copy data from that area to a given buffer. If the given memory range
+ * of [addr...addr+count) includes some valid address, data is copied to
+ * proper area of @buf. If there are memory holes, they'll be zero-filled.
+ * IOREMAP area is treated as memory hole and no copy is done.
+ *
+ * If [addr...addr+count) doesn't includes any intersects with alive
+ * vm_struct area, returns 0. @buf should be kernel's buffer.
+ *
+ * Note: In usual ops, vread() is never necessary because the caller
+ * should know vmalloc() area is valid and can use memcpy().
+ * This is for routines which have to access vmalloc area without
+ * any information, as /dev/kmem.
+ *
+ * Return: number of bytes for which addr and buf should be increased
+ * (same number as @count) or %0 if [addr...addr+count) doesn't
+ * include any intersection with valid vmalloc area
+ */
+long vread(char *buf, char *addr, unsigned long count)
+{
+ struct vmap_area *va;
+ struct vm_struct *vm;
+ char *vaddr, *buf_start = buf;
+ unsigned long buflen = count;
+ unsigned long n;
+
+ /* Don't allow overflow */
+ if ((unsigned long) addr + count < count)
+ count = -(unsigned long) addr;
+
+ spin_lock(&vmap_area_lock);
+ list_for_each_entry(va, &vmap_area_list, list) {
+ if (!count)
+ break;
+
+ if (!va->vm)
+ continue;
+
+ vm = va->vm;
+ vaddr = (char *) vm->addr;
+ if (addr >= vaddr + get_vm_area_size(vm))
+ continue;
+ while (addr < vaddr) {
+ if (count == 0)
+ goto finished;
+ *buf = '\0';
+ buf++;
+ addr++;
+ count--;
+ }
+ n = vaddr + get_vm_area_size(vm) - addr;
+ if (n > count)
+ n = count;
+ if (!(vm->flags & VM_IOREMAP))
+ aligned_vread(buf, addr, n);
+ else /* IOREMAP area is treated as memory hole */
+ memset(buf, 0, n);
+ buf += n;
+ addr += n;
+ count -= n;
+ }
+finished:
+ spin_unlock(&vmap_area_lock);
+
+ if (buf == buf_start)
+ return 0;
+ /* zero-fill memory holes */
+ if (buf != buf_start + buflen)
+ memset(buf, 0, buflen - (buf - buf_start));
+
+ return buflen;
+}
+
+/**
+ * vwrite() - write vmalloc area in a safe way.
+ * @buf: buffer for source data
+ * @addr: vm address.
+ * @count: number of bytes to be read.
+ *
+ * This function checks that addr is a valid vmalloc'ed area, and
+ * copy data from a buffer to the given addr. If specified range of
+ * [addr...addr+count) includes some valid address, data is copied from
+ * proper area of @buf. If there are memory holes, no copy to hole.
+ * IOREMAP area is treated as memory hole and no copy is done.
+ *
+ * If [addr...addr+count) doesn't includes any intersects with alive
+ * vm_struct area, returns 0. @buf should be kernel's buffer.
+ *
+ * Note: In usual ops, vwrite() is never necessary because the caller
+ * should know vmalloc() area is valid and can use memcpy().
+ * This is for routines which have to access vmalloc area without
+ * any information, as /dev/kmem.
+ *
+ * Return: number of bytes for which addr and buf should be
+ * increased (same number as @count) or %0 if [addr...addr+count)
+ * doesn't include any intersection with valid vmalloc area
+ */
+long vwrite(char *buf, char *addr, unsigned long count)
+{
+ struct vmap_area *va;
+ struct vm_struct *vm;
+ char *vaddr;
+ unsigned long n, buflen;
+ int copied = 0;
+
+ /* Don't allow overflow */
+ if ((unsigned long) addr + count < count)
+ count = -(unsigned long) addr;
+ buflen = count;
+
+ spin_lock(&vmap_area_lock);
+ list_for_each_entry(va, &vmap_area_list, list) {
+ if (!count)
+ break;
+
+ if (!va->vm)
+ continue;
+
+ vm = va->vm;
+ vaddr = (char *) vm->addr;
+ if (addr >= vaddr + get_vm_area_size(vm))
+ continue;
+ while (addr < vaddr) {
+ if (count == 0)
+ goto finished;
+ buf++;
+ addr++;
+ count--;
+ }
+ n = vaddr + get_vm_area_size(vm) - addr;
+ if (n > count)
+ n = count;
+ if (!(vm->flags & VM_IOREMAP)) {
+ aligned_vwrite(buf, addr, n);
+ copied++;
+ }
+ buf += n;
+ addr += n;
+ count -= n;
+ }
+finished:
+ spin_unlock(&vmap_area_lock);
+ if (!copied)
+ return 0;
+ return buflen;
+}
+
+/**
+ * remap_vmalloc_range_partial - map vmalloc pages to userspace
+ * @vma: vma to cover
+ * @uaddr: target user address to start at
+ * @kaddr: virtual address of vmalloc kernel memory
+ * @pgoff: offset from @kaddr to start at
+ * @size: size of map area
+ *
+ * Returns: 0 for success, -Exxx on failure
+ *
+ * This function checks that @kaddr is a valid vmalloc'ed area,
+ * and that it is big enough to cover the range starting at
+ * @uaddr in @vma. Will return failure if that criteria isn't
+ * met.
+ *
+ * Similar to remap_pfn_range() (see mm/memory.c)
+ */
+int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
+ void *kaddr, unsigned long pgoff,
+ unsigned long size)
+{
+ struct vm_struct *area;
+ unsigned long off;
+ unsigned long end_index;
+
+ if (check_shl_overflow(pgoff, PAGE_SHIFT, &off))
+ return -EINVAL;
+
+ size = PAGE_ALIGN(size);
+
+ if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr))
+ return -EINVAL;
+
+ area = find_vm_area(kaddr);
+ if (!area)
+ return -EINVAL;
+
+ if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT)))
+ return -EINVAL;
+
+ if (check_add_overflow(size, off, &end_index) ||
+ end_index > get_vm_area_size(area))
+ return -EINVAL;
+ kaddr += off;
+
+ do {
+ struct page *page = vmalloc_to_page(kaddr);
+ int ret;
+
+ ret = vm_insert_page(vma, uaddr, page);
+ if (ret)
+ return ret;
+
+ uaddr += PAGE_SIZE;
+ kaddr += PAGE_SIZE;
+ size -= PAGE_SIZE;
+ } while (size > 0);
+
+ vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
+
+ return 0;
+}
+EXPORT_SYMBOL(remap_vmalloc_range_partial);
+
+/**
+ * remap_vmalloc_range - map vmalloc pages to userspace
+ * @vma: vma to cover (map full range of vma)
+ * @addr: vmalloc memory
+ * @pgoff: number of pages into addr before first page to map
+ *
+ * Returns: 0 for success, -Exxx on failure
+ *
+ * This function checks that addr is a valid vmalloc'ed area, and
+ * that it is big enough to cover the vma. Will return failure if
+ * that criteria isn't met.
+ *
+ * Similar to remap_pfn_range() (see mm/memory.c)
+ */
+int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
+ unsigned long pgoff)
+{
+ return remap_vmalloc_range_partial(vma, vma->vm_start,
+ addr, pgoff,
+ vma->vm_end - vma->vm_start);
+}
+EXPORT_SYMBOL(remap_vmalloc_range);
+
+void free_vm_area(struct vm_struct *area)
+{
+ struct vm_struct *ret;
+ ret = remove_vm_area(area->addr);
+ BUG_ON(ret != area);
+ kfree(area);
+}
+EXPORT_SYMBOL_GPL(free_vm_area);
+
+#ifdef CONFIG_SMP
+static struct vmap_area *node_to_va(struct rb_node *n)
+{
+ return rb_entry_safe(n, struct vmap_area, rb_node);
+}
+
+/**
+ * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to
+ * @addr: target address
+ *
+ * Returns: vmap_area if it is found. If there is no such area
+ * the first highest(reverse order) vmap_area is returned
+ * i.e. va->va_start < addr && va->va_end < addr or NULL
+ * if there are no any areas before @addr.
+ */
+static struct vmap_area *
+pvm_find_va_enclose_addr(unsigned long addr)
+{
+ struct vmap_area *va, *tmp;
+ struct rb_node *n;
+
+ n = free_vmap_area_root.rb_node;
+ va = NULL;
+
+ while (n) {
+ tmp = rb_entry(n, struct vmap_area, rb_node);
+ if (tmp->va_start <= addr) {
+ va = tmp;
+ if (tmp->va_end >= addr)
+ break;
+
+ n = n->rb_right;
+ } else {
+ n = n->rb_left;
+ }
+ }
+
+ return va;
+}
+
+/**
+ * pvm_determine_end_from_reverse - find the highest aligned address
+ * of free block below VMALLOC_END
+ * @va:
+ * in - the VA we start the search(reverse order);
+ * out - the VA with the highest aligned end address.
+ *
+ * Returns: determined end address within vmap_area
+ */
+static unsigned long
+pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align)
+{
+ unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
+ unsigned long addr;
+
+ if (likely(*va)) {
+ list_for_each_entry_from_reverse((*va),
+ &free_vmap_area_list, list) {
+ addr = min((*va)->va_end & ~(align - 1), vmalloc_end);
+ if ((*va)->va_start < addr)
+ return addr;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
+ * @offsets: array containing offset of each area
+ * @sizes: array containing size of each area
+ * @nr_vms: the number of areas to allocate
+ * @align: alignment, all entries in @offsets and @sizes must be aligned to this
+ *
+ * Returns: kmalloc'd vm_struct pointer array pointing to allocated
+ * vm_structs on success, %NULL on failure
+ *
+ * Percpu allocator wants to use congruent vm areas so that it can
+ * maintain the offsets among percpu areas. This function allocates
+ * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to
+ * be scattered pretty far, distance between two areas easily going up
+ * to gigabytes. To avoid interacting with regular vmallocs, these
+ * areas are allocated from top.
+ *
+ * Despite its complicated look, this allocator is rather simple. It
+ * does everything top-down and scans free blocks from the end looking
+ * for matching base. While scanning, if any of the areas do not fit the
+ * base address is pulled down to fit the area. Scanning is repeated till
+ * all the areas fit and then all necessary data structures are inserted
+ * and the result is returned.
+ */
+struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
+ const size_t *sizes, int nr_vms,
+ size_t align)
+{
+ const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
+ const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
+ struct vmap_area **vas, *va;
+ struct vm_struct **vms;
+ int area, area2, last_area, term_area;
+ unsigned long base, start, size, end, last_end, orig_start, orig_end;
+ bool purged = false;
+ enum fit_type type;
+
+ /* verify parameters and allocate data structures */
+ BUG_ON(offset_in_page(align) || !is_power_of_2(align));
+ for (last_area = 0, area = 0; area < nr_vms; area++) {
+ start = offsets[area];
+ end = start + sizes[area];
+
+ /* is everything aligned properly? */
+ BUG_ON(!IS_ALIGNED(offsets[area], align));
+ BUG_ON(!IS_ALIGNED(sizes[area], align));
+
+ /* detect the area with the highest address */
+ if (start > offsets[last_area])
+ last_area = area;
+
+ for (area2 = area + 1; area2 < nr_vms; area2++) {
+ unsigned long start2 = offsets[area2];
+ unsigned long end2 = start2 + sizes[area2];
+
+ BUG_ON(start2 < end && start < end2);
+ }
+ }
+ last_end = offsets[last_area] + sizes[last_area];
+
+ if (vmalloc_end - vmalloc_start < last_end) {
+ WARN_ON(true);
+ return NULL;
+ }
+
+ vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
+ vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
+ if (!vas || !vms)
+ goto err_free2;
+
+ for (area = 0; area < nr_vms; area++) {
+ vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL);
+ vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
+ if (!vas[area] || !vms[area])
+ goto err_free;
+ }
+retry:
+ spin_lock(&free_vmap_area_lock);
+
+ /* start scanning - we scan from the top, begin with the last area */
+ area = term_area = last_area;
+ start = offsets[area];
+ end = start + sizes[area];
+
+ va = pvm_find_va_enclose_addr(vmalloc_end);
+ base = pvm_determine_end_from_reverse(&va, align) - end;
+
+ while (true) {
+ /*
+ * base might have underflowed, add last_end before
+ * comparing.
+ */
+ if (base + last_end < vmalloc_start + last_end)
+ goto overflow;
+
+ /*
+ * Fitting base has not been found.
+ */
+ if (va == NULL)
+ goto overflow;
+
+ /*
+ * If required width exceeds current VA block, move
+ * base downwards and then recheck.
+ */
+ if (base + end > va->va_end) {
+ base = pvm_determine_end_from_reverse(&va, align) - end;
+ term_area = area;
+ continue;
+ }
+
+ /*
+ * If this VA does not fit, move base downwards and recheck.
+ */
+ if (base + start < va->va_start) {
+ va = node_to_va(rb_prev(&va->rb_node));
+ base = pvm_determine_end_from_reverse(&va, align) - end;
+ term_area = area;
+ continue;
+ }
+
+ /*
+ * This area fits, move on to the previous one. If
+ * the previous one is the terminal one, we're done.
+ */
+ area = (area + nr_vms - 1) % nr_vms;
+ if (area == term_area)
+ break;
+
+ start = offsets[area];
+ end = start + sizes[area];
+ va = pvm_find_va_enclose_addr(base + end);
+ }
+
+ /* we've found a fitting base, insert all va's */
+ for (area = 0; area < nr_vms; area++) {
+ int ret;
+
+ start = base + offsets[area];
+ size = sizes[area];
+
+ va = pvm_find_va_enclose_addr(start);
+ if (WARN_ON_ONCE(va == NULL))
+ /* It is a BUG(), but trigger recovery instead. */
+ goto recovery;
+
+ type = classify_va_fit_type(va, start, size);
+ if (WARN_ON_ONCE(type == NOTHING_FIT))
+ /* It is a BUG(), but trigger recovery instead. */
+ goto recovery;
+
+ ret = adjust_va_to_fit_type(va, start, size, type);
+ if (unlikely(ret))
+ goto recovery;
+
+ /* Allocated area. */
+ va = vas[area];
+ va->va_start = start;
+ va->va_end = start + size;
+ }
+
+ spin_unlock(&free_vmap_area_lock);
+
+ /* populate the kasan shadow space */
+ for (area = 0; area < nr_vms; area++) {
+ if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
+ goto err_free_shadow;
+
+ kasan_unpoison_vmalloc((void *)vas[area]->va_start,
+ sizes[area]);
+ }
+
+ /* insert all vm's */
+ spin_lock(&vmap_area_lock);
+ for (area = 0; area < nr_vms; area++) {
+ insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);
+
+ setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
+ pcpu_get_vm_areas);
+ }
+ spin_unlock(&vmap_area_lock);
+
+ kfree(vas);
+ return vms;
+
+recovery:
+ /*
+ * Remove previously allocated areas. There is no
+ * need in removing these areas from the busy tree,
+ * because they are inserted only on the final step
+ * and when pcpu_get_vm_areas() is success.
+ */
+ while (area--) {
+ orig_start = vas[area]->va_start;
+ orig_end = vas[area]->va_end;
+ va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
+ &free_vmap_area_list);
+ if (va)
+ kasan_release_vmalloc(orig_start, orig_end,
+ va->va_start, va->va_end);
+ vas[area] = NULL;
+ }
+
+overflow:
+ spin_unlock(&free_vmap_area_lock);
+ if (!purged) {
+ purge_vmap_area_lazy();
+ purged = true;
+
+ /* Before "retry", check if we recover. */
+ for (area = 0; area < nr_vms; area++) {
+ if (vas[area])
+ continue;
+
+ vas[area] = kmem_cache_zalloc(
+ vmap_area_cachep, GFP_KERNEL);
+ if (!vas[area])
+ goto err_free;
+ }
+
+ goto retry;
+ }
+
+err_free:
+ for (area = 0; area < nr_vms; area++) {
+ if (vas[area])
+ kmem_cache_free(vmap_area_cachep, vas[area]);
+
+ kfree(vms[area]);
+ }
+err_free2:
+ kfree(vas);
+ kfree(vms);
+ return NULL;
+
+err_free_shadow:
+ spin_lock(&free_vmap_area_lock);
+ /*
+ * We release all the vmalloc shadows, even the ones for regions that
+ * hadn't been successfully added. This relies on kasan_release_vmalloc
+ * being able to tolerate this case.
+ */
+ for (area = 0; area < nr_vms; area++) {
+ orig_start = vas[area]->va_start;
+ orig_end = vas[area]->va_end;
+ va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
+ &free_vmap_area_list);
+ if (va)
+ kasan_release_vmalloc(orig_start, orig_end,
+ va->va_start, va->va_end);
+ vas[area] = NULL;
+ kfree(vms[area]);
+ }
+ spin_unlock(&free_vmap_area_lock);
+ kfree(vas);
+ kfree(vms);
+ return NULL;
+}
+
+/**
+ * pcpu_free_vm_areas - free vmalloc areas for percpu allocator
+ * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
+ * @nr_vms: the number of allocated areas
+ *
+ * Free vm_structs and the array allocated by pcpu_get_vm_areas().
+ */
+void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
+{
+ int i;
+
+ for (i = 0; i < nr_vms; i++)
+ free_vm_area(vms[i]);
+ kfree(vms);
+}
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_PROC_FS
+static void *s_start(struct seq_file *m, loff_t *pos)
+ __acquires(&vmap_purge_lock)
+ __acquires(&vmap_area_lock)
+{
+ mutex_lock(&vmap_purge_lock);
+ spin_lock(&vmap_area_lock);
+
+ return seq_list_start(&vmap_area_list, *pos);
+}
+
+static void *s_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ return seq_list_next(p, &vmap_area_list, pos);
+}
+
+static void s_stop(struct seq_file *m, void *p)
+ __releases(&vmap_area_lock)
+ __releases(&vmap_purge_lock)
+{
+ spin_unlock(&vmap_area_lock);
+ mutex_unlock(&vmap_purge_lock);
+}
+
+static void show_numa_info(struct seq_file *m, struct vm_struct *v)
+{
+ if (IS_ENABLED(CONFIG_NUMA)) {
+ unsigned int nr, *counters = m->private;
+
+ if (!counters)
+ return;
+
+ if (v->flags & VM_UNINITIALIZED)
+ return;
+ /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
+ smp_rmb();
+
+ memset(counters, 0, nr_node_ids * sizeof(unsigned int));
+
+ for (nr = 0; nr < v->nr_pages; nr++)
+ counters[page_to_nid(v->pages[nr])]++;
+
+ for_each_node_state(nr, N_HIGH_MEMORY)
+ if (counters[nr])
+ seq_printf(m, " N%u=%u", nr, counters[nr]);
+ }
+}
+
+static void show_purge_info(struct seq_file *m)
+{
+ struct llist_node *head;
+ struct vmap_area *va;
+
+ head = READ_ONCE(vmap_purge_list.first);
+ if (head == NULL)
+ return;
+
+ llist_for_each_entry(va, head, purge_list) {
+ seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
+ (void *)va->va_start, (void *)va->va_end,
+ va->va_end - va->va_start);
+ }
+}
+
+static int s_show(struct seq_file *m, void *p)
+{
+ struct vmap_area *va;
+ struct vm_struct *v;
+
+ va = list_entry(p, struct vmap_area, list);
+
+ /*
+ * s_show can encounter race with remove_vm_area, !vm on behalf
+ * of vmap area is being tear down or vm_map_ram allocation.
+ */
+ if (!va->vm) {
+ seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
+ (void *)va->va_start, (void *)va->va_end,
+ va->va_end - va->va_start);
+
+ return 0;
+ }
+
+ v = va->vm;
+
+ seq_printf(m, "0x%pK-0x%pK %7ld",
+ v->addr, v->addr + v->size, v->size);
+
+ if (v->caller)
+ seq_printf(m, " %pS", v->caller);
+
+ if (v->nr_pages)
+ seq_printf(m, " pages=%d", v->nr_pages);
+
+ if (v->phys_addr)
+ seq_printf(m, " phys=%pa", &v->phys_addr);
+
+ if (v->flags & VM_IOREMAP)
+ seq_puts(m, " ioremap");
+
+ if (v->flags & VM_ALLOC)
+ seq_puts(m, " vmalloc");
+
+ if (v->flags & VM_MAP)
+ seq_puts(m, " vmap");
+
+ if (v->flags & VM_USERMAP)
+ seq_puts(m, " user");
+
+ if (v->flags & VM_DMA_COHERENT)
+ seq_puts(m, " dma-coherent");
+
+ if (is_vmalloc_addr(v->pages))
+ seq_puts(m, " vpages");
+
+ show_numa_info(m, v);
+ seq_putc(m, '\n');
+
+ /*
+ * As a final step, dump "unpurged" areas. Note,
+ * that entire "/proc/vmallocinfo" output will not
+ * be address sorted, because the purge list is not
+ * sorted.
+ */
+ if (list_is_last(&va->list, &vmap_area_list))
+ show_purge_info(m);
+
+ return 0;
+}
+
+static const struct seq_operations vmalloc_op = {
+ .start = s_start,
+ .next = s_next,
+ .stop = s_stop,
+ .show = s_show,
+};
+
+static int __init proc_vmalloc_init(void)
+{
+ if (IS_ENABLED(CONFIG_NUMA))
+ proc_create_seq_private("vmallocinfo", 0400, NULL,
+ &vmalloc_op,
+ nr_node_ids * sizeof(unsigned int), NULL);
+ else
+ proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op);
+ return 0;
+}
+module_init(proc_vmalloc_init);
+
+#endif
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
new file mode 100644
index 000000000..d69019fc3
--- /dev/null
+++ b/mm/vmpressure.c
@@ -0,0 +1,469 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Linux VM pressure
+ *
+ * Copyright 2012 Linaro Ltd.
+ * Anton Vorontsov <anton.vorontsov@linaro.org>
+ *
+ * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro,
+ * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg.
+ */
+
+#include <linux/cgroup.h>
+#include <linux/fs.h>
+#include <linux/log2.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
+#include <linux/eventfd.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/printk.h>
+#include <linux/vmpressure.h>
+
+/*
+ * The window size (vmpressure_win) is the number of scanned pages before
+ * we try to analyze scanned/reclaimed ratio. So the window is used as a
+ * rate-limit tunable for the "low" level notification, and also for
+ * averaging the ratio for medium/critical levels. Using small window
+ * sizes can cause lot of false positives, but too big window size will
+ * delay the notifications.
+ *
+ * As the vmscan reclaimer logic works with chunks which are multiple of
+ * SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well.
+ *
+ * TODO: Make the window size depend on machine size, as we do for vmstat
+ * thresholds. Currently we set it to 512 pages (2MB for 4KB pages).
+ */
+static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
+
+/*
+ * These thresholds are used when we account memory pressure through
+ * scanned/reclaimed ratio. The current values were chosen empirically. In
+ * essence, they are percents: the higher the value, the more number
+ * unsuccessful reclaims there were.
+ */
+static const unsigned int vmpressure_level_med = 60;
+static const unsigned int vmpressure_level_critical = 95;
+
+/*
+ * When there are too little pages left to scan, vmpressure() may miss the
+ * critical pressure as number of pages will be less than "window size".
+ * However, in that case the vmscan priority will raise fast as the
+ * reclaimer will try to scan LRUs more deeply.
+ *
+ * The vmscan logic considers these special priorities:
+ *
+ * prio == DEF_PRIORITY (12): reclaimer starts with that value
+ * prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed
+ * prio == 0 : close to OOM, kernel scans every page in an lru
+ *
+ * Any value in this range is acceptable for this tunable (i.e. from 12 to
+ * 0). Current value for the vmpressure_level_critical_prio is chosen
+ * empirically, but the number, in essence, means that we consider
+ * critical level when scanning depth is ~10% of the lru size (vmscan
+ * scans 'lru_size >> prio' pages, so it is actually 12.5%, or one
+ * eights).
+ */
+static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10);
+
+static struct vmpressure *work_to_vmpressure(struct work_struct *work)
+{
+ return container_of(work, struct vmpressure, work);
+}
+
+static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
+{
+ struct cgroup_subsys_state *css = vmpressure_to_css(vmpr);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ memcg = parent_mem_cgroup(memcg);
+ if (!memcg)
+ return NULL;
+ return memcg_to_vmpressure(memcg);
+}
+
+enum vmpressure_levels {
+ VMPRESSURE_LOW = 0,
+ VMPRESSURE_MEDIUM,
+ VMPRESSURE_CRITICAL,
+ VMPRESSURE_NUM_LEVELS,
+};
+
+enum vmpressure_modes {
+ VMPRESSURE_NO_PASSTHROUGH = 0,
+ VMPRESSURE_HIERARCHY,
+ VMPRESSURE_LOCAL,
+ VMPRESSURE_NUM_MODES,
+};
+
+static const char * const vmpressure_str_levels[] = {
+ [VMPRESSURE_LOW] = "low",
+ [VMPRESSURE_MEDIUM] = "medium",
+ [VMPRESSURE_CRITICAL] = "critical",
+};
+
+static const char * const vmpressure_str_modes[] = {
+ [VMPRESSURE_NO_PASSTHROUGH] = "default",
+ [VMPRESSURE_HIERARCHY] = "hierarchy",
+ [VMPRESSURE_LOCAL] = "local",
+};
+
+static enum vmpressure_levels vmpressure_level(unsigned long pressure)
+{
+ if (pressure >= vmpressure_level_critical)
+ return VMPRESSURE_CRITICAL;
+ else if (pressure >= vmpressure_level_med)
+ return VMPRESSURE_MEDIUM;
+ return VMPRESSURE_LOW;
+}
+
+static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,
+ unsigned long reclaimed)
+{
+ unsigned long scale = scanned + reclaimed;
+ unsigned long pressure = 0;
+
+ /*
+ * reclaimed can be greater than scanned for things such as reclaimed
+ * slab pages. shrink_node() just adds reclaimed pages without a
+ * related increment to scanned pages.
+ */
+ if (reclaimed >= scanned)
+ goto out;
+ /*
+ * We calculate the ratio (in percents) of how many pages were
+ * scanned vs. reclaimed in a given time frame (window). Note that
+ * time is in VM reclaimer's "ticks", i.e. number of pages
+ * scanned. This makes it possible to set desired reaction time
+ * and serves as a ratelimit.
+ */
+ pressure = scale - (reclaimed * scale / scanned);
+ pressure = pressure * 100 / scale;
+
+out:
+ pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure,
+ scanned, reclaimed);
+
+ return vmpressure_level(pressure);
+}
+
+struct vmpressure_event {
+ struct eventfd_ctx *efd;
+ enum vmpressure_levels level;
+ enum vmpressure_modes mode;
+ struct list_head node;
+};
+
+static bool vmpressure_event(struct vmpressure *vmpr,
+ const enum vmpressure_levels level,
+ bool ancestor, bool signalled)
+{
+ struct vmpressure_event *ev;
+ bool ret = false;
+
+ mutex_lock(&vmpr->events_lock);
+ list_for_each_entry(ev, &vmpr->events, node) {
+ if (ancestor && ev->mode == VMPRESSURE_LOCAL)
+ continue;
+ if (signalled && ev->mode == VMPRESSURE_NO_PASSTHROUGH)
+ continue;
+ if (level < ev->level)
+ continue;
+ eventfd_signal(ev->efd, 1);
+ ret = true;
+ }
+ mutex_unlock(&vmpr->events_lock);
+
+ return ret;
+}
+
+static void vmpressure_work_fn(struct work_struct *work)
+{
+ struct vmpressure *vmpr = work_to_vmpressure(work);
+ unsigned long scanned;
+ unsigned long reclaimed;
+ enum vmpressure_levels level;
+ bool ancestor = false;
+ bool signalled = false;
+
+ spin_lock(&vmpr->sr_lock);
+ /*
+ * Several contexts might be calling vmpressure(), so it is
+ * possible that the work was rescheduled again before the old
+ * work context cleared the counters. In that case we will run
+ * just after the old work returns, but then scanned might be zero
+ * here. No need for any locks here since we don't care if
+ * vmpr->reclaimed is in sync.
+ */
+ scanned = vmpr->tree_scanned;
+ if (!scanned) {
+ spin_unlock(&vmpr->sr_lock);
+ return;
+ }
+
+ reclaimed = vmpr->tree_reclaimed;
+ vmpr->tree_scanned = 0;
+ vmpr->tree_reclaimed = 0;
+ spin_unlock(&vmpr->sr_lock);
+
+ level = vmpressure_calc_level(scanned, reclaimed);
+
+ do {
+ if (vmpressure_event(vmpr, level, ancestor, signalled))
+ signalled = true;
+ ancestor = true;
+ } while ((vmpr = vmpressure_parent(vmpr)));
+}
+
+/**
+ * vmpressure() - Account memory pressure through scanned/reclaimed ratio
+ * @gfp: reclaimer's gfp mask
+ * @memcg: cgroup memory controller handle
+ * @tree: legacy subtree mode
+ * @scanned: number of pages scanned
+ * @reclaimed: number of pages reclaimed
+ *
+ * This function should be called from the vmscan reclaim path to account
+ * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
+ * pressure index is then further refined and averaged over time.
+ *
+ * If @tree is set, vmpressure is in traditional userspace reporting
+ * mode: @memcg is considered the pressure root and userspace is
+ * notified of the entire subtree's reclaim efficiency.
+ *
+ * If @tree is not set, reclaim efficiency is recorded for @memcg, and
+ * only in-kernel users are notified.
+ *
+ * This function does not return any value.
+ */
+void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
+ unsigned long scanned, unsigned long reclaimed)
+{
+ struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
+
+ /*
+ * Here we only want to account pressure that userland is able to
+ * help us with. For example, suppose that DMA zone is under
+ * pressure; if we notify userland about that kind of pressure,
+ * then it will be mostly a waste as it will trigger unnecessary
+ * freeing of memory by userland (since userland is more likely to
+ * have HIGHMEM/MOVABLE pages instead of the DMA fallback). That
+ * is why we include only movable, highmem and FS/IO pages.
+ * Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so
+ * we account it too.
+ */
+ if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)))
+ return;
+
+ /*
+ * If we got here with no pages scanned, then that is an indicator
+ * that reclaimer was unable to find any shrinkable LRUs at the
+ * current scanning depth. But it does not mean that we should
+ * report the critical pressure, yet. If the scanning priority
+ * (scanning depth) goes too high (deep), we will be notified
+ * through vmpressure_prio(). But so far, keep calm.
+ */
+ if (!scanned)
+ return;
+
+ if (tree) {
+ spin_lock(&vmpr->sr_lock);
+ scanned = vmpr->tree_scanned += scanned;
+ vmpr->tree_reclaimed += reclaimed;
+ spin_unlock(&vmpr->sr_lock);
+
+ if (scanned < vmpressure_win)
+ return;
+ schedule_work(&vmpr->work);
+ } else {
+ enum vmpressure_levels level;
+
+ /* For now, no users for root-level efficiency */
+ if (!memcg || mem_cgroup_is_root(memcg))
+ return;
+
+ spin_lock(&vmpr->sr_lock);
+ scanned = vmpr->scanned += scanned;
+ reclaimed = vmpr->reclaimed += reclaimed;
+ if (scanned < vmpressure_win) {
+ spin_unlock(&vmpr->sr_lock);
+ return;
+ }
+ vmpr->scanned = vmpr->reclaimed = 0;
+ spin_unlock(&vmpr->sr_lock);
+
+ level = vmpressure_calc_level(scanned, reclaimed);
+
+ if (level > VMPRESSURE_LOW) {
+ /*
+ * Let the socket buffer allocator know that
+ * we are having trouble reclaiming LRU pages.
+ *
+ * For hysteresis keep the pressure state
+ * asserted for a second in which subsequent
+ * pressure events can occur.
+ */
+ memcg->socket_pressure = jiffies + HZ;
+ }
+ }
+}
+
+/**
+ * vmpressure_prio() - Account memory pressure through reclaimer priority level
+ * @gfp: reclaimer's gfp mask
+ * @memcg: cgroup memory controller handle
+ * @prio: reclaimer's priority
+ *
+ * This function should be called from the reclaim path every time when
+ * the vmscan's reclaiming priority (scanning depth) changes.
+ *
+ * This function does not return any value.
+ */
+void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
+{
+ /*
+ * We only use prio for accounting critical level. For more info
+ * see comment for vmpressure_level_critical_prio variable above.
+ */
+ if (prio > vmpressure_level_critical_prio)
+ return;
+
+ /*
+ * OK, the prio is below the threshold, updating vmpressure
+ * information before shrinker dives into long shrinking of long
+ * range vmscan. Passing scanned = vmpressure_win, reclaimed = 0
+ * to the vmpressure() basically means that we signal 'critical'
+ * level.
+ */
+ vmpressure(gfp, memcg, true, vmpressure_win, 0);
+}
+
+#define MAX_VMPRESSURE_ARGS_LEN (strlen("critical") + strlen("hierarchy") + 2)
+
+/**
+ * vmpressure_register_event() - Bind vmpressure notifications to an eventfd
+ * @memcg: memcg that is interested in vmpressure notifications
+ * @eventfd: eventfd context to link notifications with
+ * @args: event arguments (pressure level threshold, optional mode)
+ *
+ * This function associates eventfd context with the vmpressure
+ * infrastructure, so that the notifications will be delivered to the
+ * @eventfd. The @args parameter is a comma-delimited string that denotes a
+ * pressure level threshold (one of vmpressure_str_levels, i.e. "low", "medium",
+ * or "critical") and an optional mode (one of vmpressure_str_modes, i.e.
+ * "hierarchy" or "local").
+ *
+ * To be used as memcg event method.
+ *
+ * Return: 0 on success, -ENOMEM on memory failure or -EINVAL if @args could
+ * not be parsed.
+ */
+int vmpressure_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args)
+{
+ struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
+ struct vmpressure_event *ev;
+ enum vmpressure_modes mode = VMPRESSURE_NO_PASSTHROUGH;
+ enum vmpressure_levels level;
+ char *spec, *spec_orig;
+ char *token;
+ int ret = 0;
+
+ spec_orig = spec = kstrndup(args, MAX_VMPRESSURE_ARGS_LEN, GFP_KERNEL);
+ if (!spec)
+ return -ENOMEM;
+
+ /* Find required level */
+ token = strsep(&spec, ",");
+ ret = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token);
+ if (ret < 0)
+ goto out;
+ level = ret;
+
+ /* Find optional mode */
+ token = strsep(&spec, ",");
+ if (token) {
+ ret = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token);
+ if (ret < 0)
+ goto out;
+ mode = ret;
+ }
+
+ ev = kzalloc(sizeof(*ev), GFP_KERNEL);
+ if (!ev) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ev->efd = eventfd;
+ ev->level = level;
+ ev->mode = mode;
+
+ mutex_lock(&vmpr->events_lock);
+ list_add(&ev->node, &vmpr->events);
+ mutex_unlock(&vmpr->events_lock);
+ ret = 0;
+out:
+ kfree(spec_orig);
+ return ret;
+}
+
+/**
+ * vmpressure_unregister_event() - Unbind eventfd from vmpressure
+ * @memcg: memcg handle
+ * @eventfd: eventfd context that was used to link vmpressure with the @cg
+ *
+ * This function does internal manipulations to detach the @eventfd from
+ * the vmpressure notifications, and then frees internal resources
+ * associated with the @eventfd (but the @eventfd itself is not freed).
+ *
+ * To be used as memcg event method.
+ */
+void vmpressure_unregister_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd)
+{
+ struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
+ struct vmpressure_event *ev;
+
+ mutex_lock(&vmpr->events_lock);
+ list_for_each_entry(ev, &vmpr->events, node) {
+ if (ev->efd != eventfd)
+ continue;
+ list_del(&ev->node);
+ kfree(ev);
+ break;
+ }
+ mutex_unlock(&vmpr->events_lock);
+}
+
+/**
+ * vmpressure_init() - Initialize vmpressure control structure
+ * @vmpr: Structure to be initialized
+ *
+ * This function should be called on every allocated vmpressure structure
+ * before any usage.
+ */
+void vmpressure_init(struct vmpressure *vmpr)
+{
+ spin_lock_init(&vmpr->sr_lock);
+ mutex_init(&vmpr->events_lock);
+ INIT_LIST_HEAD(&vmpr->events);
+ INIT_WORK(&vmpr->work, vmpressure_work_fn);
+}
+
+/**
+ * vmpressure_cleanup() - shuts down vmpressure control structure
+ * @vmpr: Structure to be cleaned up
+ *
+ * This function should be called before the structure in which it is
+ * embedded is cleaned up.
+ */
+void vmpressure_cleanup(struct vmpressure *vmpr)
+{
+ /*
+ * Make sure there is no pending work before eventfd infrastructure
+ * goes away.
+ */
+ flush_work(&vmpr->work);
+}
diff --git a/mm/vmscan.c b/mm/vmscan.c
new file mode 100644
index 000000000..51ccd80e7
--- /dev/null
+++ b/mm/vmscan.c
@@ -0,0 +1,4322 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/mm/vmscan.c
+ *
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ *
+ * Swap reorganised 29.12.95, Stephen Tweedie.
+ * kswapd added: 7.1.96 sct
+ * Removed kswapd_ctl limits, and swap out as many pages as needed
+ * to bring the system back to freepages.high: 2.4.97, Rik van Riel.
+ * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
+ * Multiqueue VM started 5.8.00, Rik van Riel.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/mm.h>
+#include <linux/sched/mm.h>
+#include <linux/module.h>
+#include <linux/gfp.h>
+#include <linux/kernel_stat.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/vmpressure.h>
+#include <linux/vmstat.h>
+#include <linux/file.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h> /* for try_to_release_page(),
+ buffer_heads_over_limit */
+#include <linux/mm_inline.h>
+#include <linux/backing-dev.h>
+#include <linux/rmap.h>
+#include <linux/topology.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/compaction.h>
+#include <linux/notifier.h>
+#include <linux/rwsem.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/memcontrol.h>
+#include <linux/delayacct.h>
+#include <linux/sysctl.h>
+#include <linux/oom.h>
+#include <linux/pagevec.h>
+#include <linux/prefetch.h>
+#include <linux/printk.h>
+#include <linux/dax.h>
+#include <linux/psi.h>
+
+#include <asm/tlbflush.h>
+#include <asm/div64.h>
+
+#include <linux/swapops.h>
+#include <linux/balloon_compaction.h>
+
+#include "internal.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/vmscan.h>
+
+struct scan_control {
+ /* How many pages shrink_list() should reclaim */
+ unsigned long nr_to_reclaim;
+
+ /*
+ * Nodemask of nodes allowed by the caller. If NULL, all nodes
+ * are scanned.
+ */
+ nodemask_t *nodemask;
+
+ /*
+ * The memory cgroup that hit its limit and as a result is the
+ * primary target of this reclaim invocation.
+ */
+ struct mem_cgroup *target_mem_cgroup;
+
+ /*
+ * Scan pressure balancing between anon and file LRUs
+ */
+ unsigned long anon_cost;
+ unsigned long file_cost;
+
+ /* Can active pages be deactivated as part of reclaim? */
+#define DEACTIVATE_ANON 1
+#define DEACTIVATE_FILE 2
+ unsigned int may_deactivate:2;
+ unsigned int force_deactivate:1;
+ unsigned int skipped_deactivate:1;
+
+ /* Writepage batching in laptop mode; RECLAIM_WRITE */
+ unsigned int may_writepage:1;
+
+ /* Can mapped pages be reclaimed? */
+ unsigned int may_unmap:1;
+
+ /* Can pages be swapped as part of reclaim? */
+ unsigned int may_swap:1;
+
+ /*
+ * Cgroup memory below memory.low is protected as long as we
+ * don't threaten to OOM. If any cgroup is reclaimed at
+ * reduced force or passed over entirely due to its memory.low
+ * setting (memcg_low_skipped), and nothing is reclaimed as a
+ * result, then go back for one more cycle that reclaims the protected
+ * memory (memcg_low_reclaim) to avert OOM.
+ */
+ unsigned int memcg_low_reclaim:1;
+ unsigned int memcg_low_skipped:1;
+
+ unsigned int hibernation_mode:1;
+
+ /* One of the zones is ready for compaction */
+ unsigned int compaction_ready:1;
+
+ /* There is easily reclaimable cold cache in the current node */
+ unsigned int cache_trim_mode:1;
+
+ /* The file pages on the current node are dangerously low */
+ unsigned int file_is_tiny:1;
+
+ /* Allocation order */
+ s8 order;
+
+ /* Scan (total_size >> priority) pages at once */
+ s8 priority;
+
+ /* The highest zone to isolate pages for reclaim from */
+ s8 reclaim_idx;
+
+ /* This context's GFP mask */
+ gfp_t gfp_mask;
+
+ /* Incremented by the number of inactive pages that were scanned */
+ unsigned long nr_scanned;
+
+ /* Number of pages freed so far during a call to shrink_zones() */
+ unsigned long nr_reclaimed;
+
+ struct {
+ unsigned int dirty;
+ unsigned int unqueued_dirty;
+ unsigned int congested;
+ unsigned int writeback;
+ unsigned int immediate;
+ unsigned int file_taken;
+ unsigned int taken;
+ } nr;
+
+ /* for recording the reclaimed slab by now */
+ struct reclaim_state reclaim_state;
+};
+
+#ifdef ARCH_HAS_PREFETCHW
+#define prefetchw_prev_lru_page(_page, _base, _field) \
+ do { \
+ if ((_page)->lru.prev != _base) { \
+ struct page *prev; \
+ \
+ prev = lru_to_page(&(_page->lru)); \
+ prefetchw(&prev->_field); \
+ } \
+ } while (0)
+#else
+#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
+#endif
+
+/*
+ * From 0 .. 200. Higher means more swappy.
+ */
+int vm_swappiness = 60;
+
+static void set_task_reclaim_state(struct task_struct *task,
+ struct reclaim_state *rs)
+{
+ /* Check for an overwrite */
+ WARN_ON_ONCE(rs && task->reclaim_state);
+
+ /* Check for the nulling of an already-nulled member */
+ WARN_ON_ONCE(!rs && !task->reclaim_state);
+
+ task->reclaim_state = rs;
+}
+
+static LIST_HEAD(shrinker_list);
+static DECLARE_RWSEM(shrinker_rwsem);
+
+#ifdef CONFIG_MEMCG
+/*
+ * We allow subsystems to populate their shrinker-related
+ * LRU lists before register_shrinker_prepared() is called
+ * for the shrinker, since we don't want to impose
+ * restrictions on their internal registration order.
+ * In this case shrink_slab_memcg() may find corresponding
+ * bit is set in the shrinkers map.
+ *
+ * This value is used by the function to detect registering
+ * shrinkers and to skip do_shrink_slab() calls for them.
+ */
+#define SHRINKER_REGISTERING ((struct shrinker *)~0UL)
+
+static DEFINE_IDR(shrinker_idr);
+static int shrinker_nr_max;
+
+static int prealloc_memcg_shrinker(struct shrinker *shrinker)
+{
+ int id, ret = -ENOMEM;
+
+ down_write(&shrinker_rwsem);
+ /* This may call shrinker, so it must use down_read_trylock() */
+ id = idr_alloc(&shrinker_idr, SHRINKER_REGISTERING, 0, 0, GFP_KERNEL);
+ if (id < 0)
+ goto unlock;
+
+ if (id >= shrinker_nr_max) {
+ if (memcg_expand_shrinker_maps(id)) {
+ idr_remove(&shrinker_idr, id);
+ goto unlock;
+ }
+
+ shrinker_nr_max = id + 1;
+ }
+ shrinker->id = id;
+ ret = 0;
+unlock:
+ up_write(&shrinker_rwsem);
+ return ret;
+}
+
+static void unregister_memcg_shrinker(struct shrinker *shrinker)
+{
+ int id = shrinker->id;
+
+ BUG_ON(id < 0);
+
+ down_write(&shrinker_rwsem);
+ idr_remove(&shrinker_idr, id);
+ up_write(&shrinker_rwsem);
+}
+
+static bool cgroup_reclaim(struct scan_control *sc)
+{
+ return sc->target_mem_cgroup;
+}
+
+/**
+ * writeback_throttling_sane - is the usual dirty throttling mechanism available?
+ * @sc: scan_control in question
+ *
+ * The normal page dirty throttling mechanism in balance_dirty_pages() is
+ * completely broken with the legacy memcg and direct stalling in
+ * shrink_page_list() is used for throttling instead, which lacks all the
+ * niceties such as fairness, adaptive pausing, bandwidth proportional
+ * allocation and configurability.
+ *
+ * This function tests whether the vmscan currently in progress can assume
+ * that the normal dirty throttling mechanism is operational.
+ */
+static bool writeback_throttling_sane(struct scan_control *sc)
+{
+ if (!cgroup_reclaim(sc))
+ return true;
+#ifdef CONFIG_CGROUP_WRITEBACK
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return true;
+#endif
+ return false;
+}
+#else
+static int prealloc_memcg_shrinker(struct shrinker *shrinker)
+{
+ return 0;
+}
+
+static void unregister_memcg_shrinker(struct shrinker *shrinker)
+{
+}
+
+static bool cgroup_reclaim(struct scan_control *sc)
+{
+ return false;
+}
+
+static bool writeback_throttling_sane(struct scan_control *sc)
+{
+ return true;
+}
+#endif
+
+/*
+ * This misses isolated pages which are not accounted for to save counters.
+ * As the data only determines if reclaim or compaction continues, it is
+ * not expected that isolated pages will be a dominating factor.
+ */
+unsigned long zone_reclaimable_pages(struct zone *zone)
+{
+ unsigned long nr;
+
+ nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
+ zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
+ if (get_nr_swap_pages() > 0)
+ nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
+ zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
+
+ return nr;
+}
+
+/**
+ * lruvec_lru_size - Returns the number of pages on the given LRU list.
+ * @lruvec: lru vector
+ * @lru: lru to use
+ * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list)
+ */
+unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
+{
+ unsigned long size = 0;
+ int zid;
+
+ for (zid = 0; zid <= zone_idx && zid < MAX_NR_ZONES; zid++) {
+ struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
+
+ if (!managed_zone(zone))
+ continue;
+
+ if (!mem_cgroup_disabled())
+ size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
+ else
+ size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru);
+ }
+ return size;
+}
+
+/*
+ * Add a shrinker callback to be called from the vm.
+ */
+int prealloc_shrinker(struct shrinker *shrinker)
+{
+ unsigned int size = sizeof(*shrinker->nr_deferred);
+
+ if (shrinker->flags & SHRINKER_NUMA_AWARE)
+ size *= nr_node_ids;
+
+ shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
+ if (!shrinker->nr_deferred)
+ return -ENOMEM;
+
+ if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
+ if (prealloc_memcg_shrinker(shrinker))
+ goto free_deferred;
+ }
+
+ return 0;
+
+free_deferred:
+ kfree(shrinker->nr_deferred);
+ shrinker->nr_deferred = NULL;
+ return -ENOMEM;
+}
+
+void free_prealloced_shrinker(struct shrinker *shrinker)
+{
+ if (!shrinker->nr_deferred)
+ return;
+
+ if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+ unregister_memcg_shrinker(shrinker);
+
+ kfree(shrinker->nr_deferred);
+ shrinker->nr_deferred = NULL;
+}
+
+void register_shrinker_prepared(struct shrinker *shrinker)
+{
+ down_write(&shrinker_rwsem);
+ list_add_tail(&shrinker->list, &shrinker_list);
+#ifdef CONFIG_MEMCG
+ if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+ idr_replace(&shrinker_idr, shrinker, shrinker->id);
+#endif
+ up_write(&shrinker_rwsem);
+}
+
+int register_shrinker(struct shrinker *shrinker)
+{
+ int err = prealloc_shrinker(shrinker);
+
+ if (err)
+ return err;
+ register_shrinker_prepared(shrinker);
+ return 0;
+}
+EXPORT_SYMBOL(register_shrinker);
+
+/*
+ * Remove one
+ */
+void unregister_shrinker(struct shrinker *shrinker)
+{
+ if (!shrinker->nr_deferred)
+ return;
+ if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+ unregister_memcg_shrinker(shrinker);
+ down_write(&shrinker_rwsem);
+ list_del(&shrinker->list);
+ up_write(&shrinker_rwsem);
+ kfree(shrinker->nr_deferred);
+ shrinker->nr_deferred = NULL;
+}
+EXPORT_SYMBOL(unregister_shrinker);
+
+#define SHRINK_BATCH 128
+
+static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
+ struct shrinker *shrinker, int priority)
+{
+ unsigned long freed = 0;
+ unsigned long long delta;
+ long total_scan;
+ long freeable;
+ long nr;
+ long new_nr;
+ int nid = shrinkctl->nid;
+ long batch_size = shrinker->batch ? shrinker->batch
+ : SHRINK_BATCH;
+ long scanned = 0, next_deferred;
+
+ if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
+ nid = 0;
+
+ freeable = shrinker->count_objects(shrinker, shrinkctl);
+ if (freeable == 0 || freeable == SHRINK_EMPTY)
+ return freeable;
+
+ /*
+ * copy the current shrinker scan count into a local variable
+ * and zero it so that other concurrent shrinker invocations
+ * don't also do this scanning work.
+ */
+ nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+
+ total_scan = nr;
+ if (shrinker->seeks) {
+ delta = freeable >> priority;
+ delta *= 4;
+ do_div(delta, shrinker->seeks);
+ } else {
+ /*
+ * These objects don't require any IO to create. Trim
+ * them aggressively under memory pressure to keep
+ * them from causing refetches in the IO caches.
+ */
+ delta = freeable / 2;
+ }
+
+ total_scan += delta;
+ if (total_scan < 0) {
+ pr_err("shrink_slab: %pS negative objects to delete nr=%ld\n",
+ shrinker->scan_objects, total_scan);
+ total_scan = freeable;
+ next_deferred = nr;
+ } else
+ next_deferred = total_scan;
+
+ /*
+ * We need to avoid excessive windup on filesystem shrinkers
+ * due to large numbers of GFP_NOFS allocations causing the
+ * shrinkers to return -1 all the time. This results in a large
+ * nr being built up so when a shrink that can do some work
+ * comes along it empties the entire cache due to nr >>>
+ * freeable. This is bad for sustaining a working set in
+ * memory.
+ *
+ * Hence only allow the shrinker to scan the entire cache when
+ * a large delta change is calculated directly.
+ */
+ if (delta < freeable / 4)
+ total_scan = min(total_scan, freeable / 2);
+
+ /*
+ * Avoid risking looping forever due to too large nr value:
+ * never try to free more than twice the estimate number of
+ * freeable entries.
+ */
+ if (total_scan > freeable * 2)
+ total_scan = freeable * 2;
+
+ trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
+ freeable, delta, total_scan, priority);
+
+ /*
+ * Normally, we should not scan less than batch_size objects in one
+ * pass to avoid too frequent shrinker calls, but if the slab has less
+ * than batch_size objects in total and we are really tight on memory,
+ * we will try to reclaim all available objects, otherwise we can end
+ * up failing allocations although there are plenty of reclaimable
+ * objects spread over several slabs with usage less than the
+ * batch_size.
+ *
+ * We detect the "tight on memory" situations by looking at the total
+ * number of objects we want to scan (total_scan). If it is greater
+ * than the total number of objects on slab (freeable), we must be
+ * scanning at high prio and therefore should try to reclaim as much as
+ * possible.
+ */
+ while (total_scan >= batch_size ||
+ total_scan >= freeable) {
+ unsigned long ret;
+ unsigned long nr_to_scan = min(batch_size, total_scan);
+
+ shrinkctl->nr_to_scan = nr_to_scan;
+ shrinkctl->nr_scanned = nr_to_scan;
+ ret = shrinker->scan_objects(shrinker, shrinkctl);
+ if (ret == SHRINK_STOP)
+ break;
+ freed += ret;
+
+ count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
+ total_scan -= shrinkctl->nr_scanned;
+ scanned += shrinkctl->nr_scanned;
+
+ cond_resched();
+ }
+
+ if (next_deferred >= scanned)
+ next_deferred -= scanned;
+ else
+ next_deferred = 0;
+ /*
+ * move the unused scan count back into the shrinker in a
+ * manner that handles concurrent updates. If we exhausted the
+ * scan, there is no need to do an update.
+ */
+ if (next_deferred > 0)
+ new_nr = atomic_long_add_return(next_deferred,
+ &shrinker->nr_deferred[nid]);
+ else
+ new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
+
+ trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan);
+ return freed;
+}
+
+#ifdef CONFIG_MEMCG
+static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
+ struct mem_cgroup *memcg, int priority)
+{
+ struct memcg_shrinker_map *map;
+ unsigned long ret, freed = 0;
+ int i;
+
+ if (!mem_cgroup_online(memcg))
+ return 0;
+
+ if (!down_read_trylock(&shrinker_rwsem))
+ return 0;
+
+ map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map,
+ true);
+ if (unlikely(!map))
+ goto unlock;
+
+ for_each_set_bit(i, map->map, shrinker_nr_max) {
+ struct shrink_control sc = {
+ .gfp_mask = gfp_mask,
+ .nid = nid,
+ .memcg = memcg,
+ };
+ struct shrinker *shrinker;
+
+ shrinker = idr_find(&shrinker_idr, i);
+ if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) {
+ if (!shrinker)
+ clear_bit(i, map->map);
+ continue;
+ }
+
+ /* Call non-slab shrinkers even though kmem is disabled */
+ if (!memcg_kmem_enabled() &&
+ !(shrinker->flags & SHRINKER_NONSLAB))
+ continue;
+
+ ret = do_shrink_slab(&sc, shrinker, priority);
+ if (ret == SHRINK_EMPTY) {
+ clear_bit(i, map->map);
+ /*
+ * After the shrinker reported that it had no objects to
+ * free, but before we cleared the corresponding bit in
+ * the memcg shrinker map, a new object might have been
+ * added. To make sure, we have the bit set in this
+ * case, we invoke the shrinker one more time and reset
+ * the bit if it reports that it is not empty anymore.
+ * The memory barrier here pairs with the barrier in
+ * memcg_set_shrinker_bit():
+ *
+ * list_lru_add() shrink_slab_memcg()
+ * list_add_tail() clear_bit()
+ * <MB> <MB>
+ * set_bit() do_shrink_slab()
+ */
+ smp_mb__after_atomic();
+ ret = do_shrink_slab(&sc, shrinker, priority);
+ if (ret == SHRINK_EMPTY)
+ ret = 0;
+ else
+ memcg_set_shrinker_bit(memcg, nid, i);
+ }
+ freed += ret;
+
+ if (rwsem_is_contended(&shrinker_rwsem)) {
+ freed = freed ? : 1;
+ break;
+ }
+ }
+unlock:
+ up_read(&shrinker_rwsem);
+ return freed;
+}
+#else /* CONFIG_MEMCG */
+static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
+ struct mem_cgroup *memcg, int priority)
+{
+ return 0;
+}
+#endif /* CONFIG_MEMCG */
+
+/**
+ * shrink_slab - shrink slab caches
+ * @gfp_mask: allocation context
+ * @nid: node whose slab caches to target
+ * @memcg: memory cgroup whose slab caches to target
+ * @priority: the reclaim priority
+ *
+ * Call the shrink functions to age shrinkable caches.
+ *
+ * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
+ * unaware shrinkers will receive a node id of 0 instead.
+ *
+ * @memcg specifies the memory cgroup to target. Unaware shrinkers
+ * are called only if it is the root cgroup.
+ *
+ * @priority is sc->priority, we take the number of objects and >> by priority
+ * in order to get the scan target.
+ *
+ * Returns the number of reclaimed slab objects.
+ */
+static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
+ struct mem_cgroup *memcg,
+ int priority)
+{
+ unsigned long ret, freed = 0;
+ struct shrinker *shrinker;
+
+ /*
+ * The root memcg might be allocated even though memcg is disabled
+ * via "cgroup_disable=memory" boot parameter. This could make
+ * mem_cgroup_is_root() return false, then just run memcg slab
+ * shrink, but skip global shrink. This may result in premature
+ * oom.
+ */
+ if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
+ return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
+
+ if (!down_read_trylock(&shrinker_rwsem))
+ goto out;
+
+ list_for_each_entry(shrinker, &shrinker_list, list) {
+ struct shrink_control sc = {
+ .gfp_mask = gfp_mask,
+ .nid = nid,
+ .memcg = memcg,
+ };
+
+ ret = do_shrink_slab(&sc, shrinker, priority);
+ if (ret == SHRINK_EMPTY)
+ ret = 0;
+ freed += ret;
+ /*
+ * Bail out if someone want to register a new shrinker to
+ * prevent the registration from being stalled for long periods
+ * by parallel ongoing shrinking.
+ */
+ if (rwsem_is_contended(&shrinker_rwsem)) {
+ freed = freed ? : 1;
+ break;
+ }
+ }
+
+ up_read(&shrinker_rwsem);
+out:
+ cond_resched();
+ return freed;
+}
+
+void drop_slab_node(int nid)
+{
+ unsigned long freed;
+
+ do {
+ struct mem_cgroup *memcg = NULL;
+
+ if (fatal_signal_pending(current))
+ return;
+
+ freed = 0;
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
+ } while (freed > 10);
+}
+
+void drop_slab(void)
+{
+ int nid;
+
+ for_each_online_node(nid)
+ drop_slab_node(nid);
+}
+
+static inline int is_page_cache_freeable(struct page *page)
+{
+ /*
+ * A freeable page cache page is referenced only by the caller
+ * that isolated the page, the page cache and optional buffer
+ * heads at page->private.
+ */
+ int page_cache_pins = thp_nr_pages(page);
+ return page_count(page) - page_has_private(page) == 1 + page_cache_pins;
+}
+
+static int may_write_to_inode(struct inode *inode)
+{
+ if (current->flags & PF_SWAPWRITE)
+ return 1;
+ if (!inode_write_congested(inode))
+ return 1;
+ if (inode_to_bdi(inode) == current->backing_dev_info)
+ return 1;
+ return 0;
+}
+
+/*
+ * We detected a synchronous write error writing a page out. Probably
+ * -ENOSPC. We need to propagate that into the address_space for a subsequent
+ * fsync(), msync() or close().
+ *
+ * The tricky part is that after writepage we cannot touch the mapping: nothing
+ * prevents it from being freed up. But we have a ref on the page and once
+ * that page is locked, the mapping is pinned.
+ *
+ * We're allowed to run sleeping lock_page() here because we know the caller has
+ * __GFP_FS.
+ */
+static void handle_write_error(struct address_space *mapping,
+ struct page *page, int error)
+{
+ lock_page(page);
+ if (page_mapping(page) == mapping)
+ mapping_set_error(mapping, error);
+ unlock_page(page);
+}
+
+/* possible outcome of pageout() */
+typedef enum {
+ /* failed to write page out, page is locked */
+ PAGE_KEEP,
+ /* move page to the active list, page is locked */
+ PAGE_ACTIVATE,
+ /* page has been sent to the disk successfully, page is unlocked */
+ PAGE_SUCCESS,
+ /* page is clean and locked */
+ PAGE_CLEAN,
+} pageout_t;
+
+/*
+ * pageout is called by shrink_page_list() for each dirty page.
+ * Calls ->writepage().
+ */
+static pageout_t pageout(struct page *page, struct address_space *mapping)
+{
+ /*
+ * If the page is dirty, only perform writeback if that write
+ * will be non-blocking. To prevent this allocation from being
+ * stalled by pagecache activity. But note that there may be
+ * stalls if we need to run get_block(). We could test
+ * PagePrivate for that.
+ *
+ * If this process is currently in __generic_file_write_iter() against
+ * this page's queue, we can perform writeback even if that
+ * will block.
+ *
+ * If the page is swapcache, write it back even if that would
+ * block, for some throttling. This happens by accident, because
+ * swap_backing_dev_info is bust: it doesn't reflect the
+ * congestion state of the swapdevs. Easy to fix, if needed.
+ */
+ if (!is_page_cache_freeable(page))
+ return PAGE_KEEP;
+ if (!mapping) {
+ /*
+ * Some data journaling orphaned pages can have
+ * page->mapping == NULL while being dirty with clean buffers.
+ */
+ if (page_has_private(page)) {
+ if (try_to_free_buffers(page)) {
+ ClearPageDirty(page);
+ pr_info("%s: orphaned page\n", __func__);
+ return PAGE_CLEAN;
+ }
+ }
+ return PAGE_KEEP;
+ }
+ if (mapping->a_ops->writepage == NULL)
+ return PAGE_ACTIVATE;
+ if (!may_write_to_inode(mapping->host))
+ return PAGE_KEEP;
+
+ if (clear_page_dirty_for_io(page)) {
+ int res;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_NONE,
+ .nr_to_write = SWAP_CLUSTER_MAX,
+ .range_start = 0,
+ .range_end = LLONG_MAX,
+ .for_reclaim = 1,
+ };
+
+ SetPageReclaim(page);
+ res = mapping->a_ops->writepage(page, &wbc);
+ if (res < 0)
+ handle_write_error(mapping, page, res);
+ if (res == AOP_WRITEPAGE_ACTIVATE) {
+ ClearPageReclaim(page);
+ return PAGE_ACTIVATE;
+ }
+
+ if (!PageWriteback(page)) {
+ /* synchronous write or broken a_ops? */
+ ClearPageReclaim(page);
+ }
+ trace_mm_vmscan_writepage(page);
+ inc_node_page_state(page, NR_VMSCAN_WRITE);
+ return PAGE_SUCCESS;
+ }
+
+ return PAGE_CLEAN;
+}
+
+/*
+ * Same as remove_mapping, but if the page is removed from the mapping, it
+ * gets returned with a refcount of 0.
+ */
+static int __remove_mapping(struct address_space *mapping, struct page *page,
+ bool reclaimed, struct mem_cgroup *target_memcg)
+{
+ unsigned long flags;
+ int refcount;
+ void *shadow = NULL;
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(mapping != page_mapping(page));
+
+ xa_lock_irqsave(&mapping->i_pages, flags);
+ /*
+ * The non racy check for a busy page.
+ *
+ * Must be careful with the order of the tests. When someone has
+ * a ref to the page, it may be possible that they dirty it then
+ * drop the reference. So if PageDirty is tested before page_count
+ * here, then the following race may occur:
+ *
+ * get_user_pages(&page);
+ * [user mapping goes away]
+ * write_to(page);
+ * !PageDirty(page) [good]
+ * SetPageDirty(page);
+ * put_page(page);
+ * !page_count(page) [good, discard it]
+ *
+ * [oops, our write_to data is lost]
+ *
+ * Reversing the order of the tests ensures such a situation cannot
+ * escape unnoticed. The smp_rmb is needed to ensure the page->flags
+ * load is not satisfied before that of page->_refcount.
+ *
+ * Note that if SetPageDirty is always performed via set_page_dirty,
+ * and thus under the i_pages lock, then this ordering is not required.
+ */
+ refcount = 1 + compound_nr(page);
+ if (!page_ref_freeze(page, refcount))
+ goto cannot_free;
+ /* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */
+ if (unlikely(PageDirty(page))) {
+ page_ref_unfreeze(page, refcount);
+ goto cannot_free;
+ }
+
+ if (PageSwapCache(page)) {
+ swp_entry_t swap = { .val = page_private(page) };
+ mem_cgroup_swapout(page, swap);
+ if (reclaimed && !mapping_exiting(mapping))
+ shadow = workingset_eviction(page, target_memcg);
+ __delete_from_swap_cache(page, swap, shadow);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
+ put_swap_page(page, swap);
+ } else {
+ void (*freepage)(struct page *);
+
+ freepage = mapping->a_ops->freepage;
+ /*
+ * Remember a shadow entry for reclaimed file cache in
+ * order to detect refaults, thus thrashing, later on.
+ *
+ * But don't store shadows in an address space that is
+ * already exiting. This is not just an optimization,
+ * inode reclaim needs to empty out the radix tree or
+ * the nodes are lost. Don't plant shadows behind its
+ * back.
+ *
+ * We also don't store shadows for DAX mappings because the
+ * only page cache pages found in these are zero pages
+ * covering holes, and because we don't want to mix DAX
+ * exceptional entries and shadow exceptional entries in the
+ * same address_space.
+ */
+ if (reclaimed && page_is_file_lru(page) &&
+ !mapping_exiting(mapping) && !dax_mapping(mapping))
+ shadow = workingset_eviction(page, target_memcg);
+ __delete_from_page_cache(page, shadow);
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
+
+ if (freepage != NULL)
+ freepage(page);
+ }
+
+ return 1;
+
+cannot_free:
+ xa_unlock_irqrestore(&mapping->i_pages, flags);
+ return 0;
+}
+
+/*
+ * Attempt to detach a locked page from its ->mapping. If it is dirty or if
+ * someone else has a ref on the page, abort and return 0. If it was
+ * successfully detached, return 1. Assumes the caller has a single ref on
+ * this page.
+ */
+int remove_mapping(struct address_space *mapping, struct page *page)
+{
+ if (__remove_mapping(mapping, page, false, NULL)) {
+ /*
+ * Unfreezing the refcount with 1 rather than 2 effectively
+ * drops the pagecache ref for us without requiring another
+ * atomic operation.
+ */
+ page_ref_unfreeze(page, 1);
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * putback_lru_page - put previously isolated page onto appropriate LRU list
+ * @page: page to be put back to appropriate lru list
+ *
+ * Add previously isolated @page to appropriate LRU list.
+ * Page may still be unevictable for other reasons.
+ *
+ * lru_lock must not be held, interrupts must be enabled.
+ */
+void putback_lru_page(struct page *page)
+{
+ lru_cache_add(page);
+ put_page(page); /* drop ref from isolate */
+}
+
+enum page_references {
+ PAGEREF_RECLAIM,
+ PAGEREF_RECLAIM_CLEAN,
+ PAGEREF_KEEP,
+ PAGEREF_ACTIVATE,
+};
+
+static enum page_references page_check_references(struct page *page,
+ struct scan_control *sc)
+{
+ int referenced_ptes, referenced_page;
+ unsigned long vm_flags;
+
+ referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
+ &vm_flags);
+ referenced_page = TestClearPageReferenced(page);
+
+ /*
+ * Mlock lost the isolation race with us. Let try_to_unmap()
+ * move the page to the unevictable list.
+ */
+ if (vm_flags & VM_LOCKED)
+ return PAGEREF_RECLAIM;
+
+ if (referenced_ptes) {
+ /*
+ * All mapped pages start out with page table
+ * references from the instantiating fault, so we need
+ * to look twice if a mapped file page is used more
+ * than once.
+ *
+ * Mark it and spare it for another trip around the
+ * inactive list. Another page table reference will
+ * lead to its activation.
+ *
+ * Note: the mark is set for activated pages as well
+ * so that recently deactivated but used pages are
+ * quickly recovered.
+ */
+ SetPageReferenced(page);
+
+ if (referenced_page || referenced_ptes > 1)
+ return PAGEREF_ACTIVATE;
+
+ /*
+ * Activate file-backed executable pages after first usage.
+ */
+ if ((vm_flags & VM_EXEC) && !PageSwapBacked(page))
+ return PAGEREF_ACTIVATE;
+
+ return PAGEREF_KEEP;
+ }
+
+ /* Reclaim if clean, defer dirty pages to writeback */
+ if (referenced_page && !PageSwapBacked(page))
+ return PAGEREF_RECLAIM_CLEAN;
+
+ return PAGEREF_RECLAIM;
+}
+
+/* Check if a page is dirty or under writeback */
+static void page_check_dirty_writeback(struct page *page,
+ bool *dirty, bool *writeback)
+{
+ struct address_space *mapping;
+
+ /*
+ * Anonymous pages are not handled by flushers and must be written
+ * from reclaim context. Do not stall reclaim based on them
+ */
+ if (!page_is_file_lru(page) ||
+ (PageAnon(page) && !PageSwapBacked(page))) {
+ *dirty = false;
+ *writeback = false;
+ return;
+ }
+
+ /* By default assume that the page flags are accurate */
+ *dirty = PageDirty(page);
+ *writeback = PageWriteback(page);
+
+ /* Verify dirty/writeback state if the filesystem supports it */
+ if (!page_has_private(page))
+ return;
+
+ mapping = page_mapping(page);
+ if (mapping && mapping->a_ops->is_dirty_writeback)
+ mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
+}
+
+/*
+ * shrink_page_list() returns the number of reclaimed pages
+ */
+static unsigned int shrink_page_list(struct list_head *page_list,
+ struct pglist_data *pgdat,
+ struct scan_control *sc,
+ struct reclaim_stat *stat,
+ bool ignore_references)
+{
+ LIST_HEAD(ret_pages);
+ LIST_HEAD(free_pages);
+ unsigned int nr_reclaimed = 0;
+ unsigned int pgactivate = 0;
+
+ memset(stat, 0, sizeof(*stat));
+ cond_resched();
+
+ while (!list_empty(page_list)) {
+ struct address_space *mapping;
+ struct page *page;
+ enum page_references references = PAGEREF_RECLAIM;
+ bool dirty, writeback, may_enter_fs;
+ unsigned int nr_pages;
+
+ cond_resched();
+
+ page = lru_to_page(page_list);
+ list_del(&page->lru);
+
+ if (!trylock_page(page))
+ goto keep;
+
+ VM_BUG_ON_PAGE(PageActive(page), page);
+
+ nr_pages = compound_nr(page);
+
+ /* Account the number of base pages even though THP */
+ sc->nr_scanned += nr_pages;
+
+ if (unlikely(!page_evictable(page)))
+ goto activate_locked;
+
+ if (!sc->may_unmap && page_mapped(page))
+ goto keep_locked;
+
+ may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
+ (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
+
+ /*
+ * The number of dirty pages determines if a node is marked
+ * reclaim_congested which affects wait_iff_congested. kswapd
+ * will stall and start writing pages if the tail of the LRU
+ * is all dirty unqueued pages.
+ */
+ page_check_dirty_writeback(page, &dirty, &writeback);
+ if (dirty || writeback)
+ stat->nr_dirty++;
+
+ if (dirty && !writeback)
+ stat->nr_unqueued_dirty++;
+
+ /*
+ * Treat this page as congested if the underlying BDI is or if
+ * pages are cycling through the LRU so quickly that the
+ * pages marked for immediate reclaim are making it to the
+ * end of the LRU a second time.
+ */
+ mapping = page_mapping(page);
+ if (((dirty || writeback) && mapping &&
+ inode_write_congested(mapping->host)) ||
+ (writeback && PageReclaim(page)))
+ stat->nr_congested++;
+
+ /*
+ * If a page at the tail of the LRU is under writeback, there
+ * are three cases to consider.
+ *
+ * 1) If reclaim is encountering an excessive number of pages
+ * under writeback and this page is both under writeback and
+ * PageReclaim then it indicates that pages are being queued
+ * for IO but are being recycled through the LRU before the
+ * IO can complete. Waiting on the page itself risks an
+ * indefinite stall if it is impossible to writeback the
+ * page due to IO error or disconnected storage so instead
+ * note that the LRU is being scanned too quickly and the
+ * caller can stall after page list has been processed.
+ *
+ * 2) Global or new memcg reclaim encounters a page that is
+ * not marked for immediate reclaim, or the caller does not
+ * have __GFP_FS (or __GFP_IO if it's simply going to swap,
+ * not to fs). In this case mark the page for immediate
+ * reclaim and continue scanning.
+ *
+ * Require may_enter_fs because we would wait on fs, which
+ * may not have submitted IO yet. And the loop driver might
+ * enter reclaim, and deadlock if it waits on a page for
+ * which it is needed to do the write (loop masks off
+ * __GFP_IO|__GFP_FS for this reason); but more thought
+ * would probably show more reasons.
+ *
+ * 3) Legacy memcg encounters a page that is already marked
+ * PageReclaim. memcg does not have any dirty pages
+ * throttling so we could easily OOM just because too many
+ * pages are in writeback and there is nothing else to
+ * reclaim. Wait for the writeback to complete.
+ *
+ * In cases 1) and 2) we activate the pages to get them out of
+ * the way while we continue scanning for clean pages on the
+ * inactive list and refilling from the active list. The
+ * observation here is that waiting for disk writes is more
+ * expensive than potentially causing reloads down the line.
+ * Since they're marked for immediate reclaim, they won't put
+ * memory pressure on the cache working set any longer than it
+ * takes to write them to disk.
+ */
+ if (PageWriteback(page)) {
+ /* Case 1 above */
+ if (current_is_kswapd() &&
+ PageReclaim(page) &&
+ test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
+ stat->nr_immediate++;
+ goto activate_locked;
+
+ /* Case 2 above */
+ } else if (writeback_throttling_sane(sc) ||
+ !PageReclaim(page) || !may_enter_fs) {
+ /*
+ * This is slightly racy - end_page_writeback()
+ * might have just cleared PageReclaim, then
+ * setting PageReclaim here end up interpreted
+ * as PageReadahead - but that does not matter
+ * enough to care. What we do want is for this
+ * page to have PageReclaim set next time memcg
+ * reclaim reaches the tests above, so it will
+ * then wait_on_page_writeback() to avoid OOM;
+ * and it's also appropriate in global reclaim.
+ */
+ SetPageReclaim(page);
+ stat->nr_writeback++;
+ goto activate_locked;
+
+ /* Case 3 above */
+ } else {
+ unlock_page(page);
+ wait_on_page_writeback(page);
+ /* then go back and try same page again */
+ list_add_tail(&page->lru, page_list);
+ continue;
+ }
+ }
+
+ if (!ignore_references)
+ references = page_check_references(page, sc);
+
+ switch (references) {
+ case PAGEREF_ACTIVATE:
+ goto activate_locked;
+ case PAGEREF_KEEP:
+ stat->nr_ref_keep += nr_pages;
+ goto keep_locked;
+ case PAGEREF_RECLAIM:
+ case PAGEREF_RECLAIM_CLEAN:
+ ; /* try to reclaim the page below */
+ }
+
+ /*
+ * Anonymous process memory has backing store?
+ * Try to allocate it some swap space here.
+ * Lazyfree page could be freed directly
+ */
+ if (PageAnon(page) && PageSwapBacked(page)) {
+ if (!PageSwapCache(page)) {
+ if (!(sc->gfp_mask & __GFP_IO))
+ goto keep_locked;
+ if (page_maybe_dma_pinned(page))
+ goto keep_locked;
+ if (PageTransHuge(page)) {
+ /* cannot split THP, skip it */
+ if (!can_split_huge_page(page, NULL))
+ goto activate_locked;
+ /*
+ * Split pages without a PMD map right
+ * away. Chances are some or all of the
+ * tail pages can be freed without IO.
+ */
+ if (!compound_mapcount(page) &&
+ split_huge_page_to_list(page,
+ page_list))
+ goto activate_locked;
+ }
+ if (!add_to_swap(page)) {
+ if (!PageTransHuge(page))
+ goto activate_locked_split;
+ /* Fallback to swap normal pages */
+ if (split_huge_page_to_list(page,
+ page_list))
+ goto activate_locked;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ count_vm_event(THP_SWPOUT_FALLBACK);
+#endif
+ if (!add_to_swap(page))
+ goto activate_locked_split;
+ }
+
+ may_enter_fs = true;
+
+ /* Adding to swap updated mapping */
+ mapping = page_mapping(page);
+ }
+ } else if (unlikely(PageTransHuge(page))) {
+ /* Split file THP */
+ if (split_huge_page_to_list(page, page_list))
+ goto keep_locked;
+ }
+
+ /*
+ * THP may get split above, need minus tail pages and update
+ * nr_pages to avoid accounting tail pages twice.
+ *
+ * The tail pages that are added into swap cache successfully
+ * reach here.
+ */
+ if ((nr_pages > 1) && !PageTransHuge(page)) {
+ sc->nr_scanned -= (nr_pages - 1);
+ nr_pages = 1;
+ }
+
+ /*
+ * The page is mapped into the page tables of one or more
+ * processes. Try to unmap it here.
+ */
+ if (page_mapped(page)) {
+ enum ttu_flags flags = TTU_BATCH_FLUSH;
+ bool was_swapbacked = PageSwapBacked(page);
+
+ if (unlikely(PageTransHuge(page)))
+ flags |= TTU_SPLIT_HUGE_PMD;
+
+ if (!try_to_unmap(page, flags)) {
+ stat->nr_unmap_fail += nr_pages;
+ if (!was_swapbacked && PageSwapBacked(page))
+ stat->nr_lazyfree_fail += nr_pages;
+ goto activate_locked;
+ }
+ }
+
+ if (PageDirty(page)) {
+ /*
+ * Only kswapd can writeback filesystem pages
+ * to avoid risk of stack overflow. But avoid
+ * injecting inefficient single-page IO into
+ * flusher writeback as much as possible: only
+ * write pages when we've encountered many
+ * dirty pages, and when we've already scanned
+ * the rest of the LRU for clean pages and see
+ * the same dirty pages again (PageReclaim).
+ */
+ if (page_is_file_lru(page) &&
+ (!current_is_kswapd() || !PageReclaim(page) ||
+ !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
+ /*
+ * Immediately reclaim when written back.
+ * Similar in principal to deactivate_page()
+ * except we already have the page isolated
+ * and know it's dirty
+ */
+ inc_node_page_state(page, NR_VMSCAN_IMMEDIATE);
+ SetPageReclaim(page);
+
+ goto activate_locked;
+ }
+
+ if (references == PAGEREF_RECLAIM_CLEAN)
+ goto keep_locked;
+ if (!may_enter_fs)
+ goto keep_locked;
+ if (!sc->may_writepage)
+ goto keep_locked;
+
+ /*
+ * Page is dirty. Flush the TLB if a writable entry
+ * potentially exists to avoid CPU writes after IO
+ * starts and then write it out here.
+ */
+ try_to_unmap_flush_dirty();
+ switch (pageout(page, mapping)) {
+ case PAGE_KEEP:
+ goto keep_locked;
+ case PAGE_ACTIVATE:
+ goto activate_locked;
+ case PAGE_SUCCESS:
+ stat->nr_pageout += thp_nr_pages(page);
+
+ if (PageWriteback(page))
+ goto keep;
+ if (PageDirty(page))
+ goto keep;
+
+ /*
+ * A synchronous write - probably a ramdisk. Go
+ * ahead and try to reclaim the page.
+ */
+ if (!trylock_page(page))
+ goto keep;
+ if (PageDirty(page) || PageWriteback(page))
+ goto keep_locked;
+ mapping = page_mapping(page);
+ case PAGE_CLEAN:
+ ; /* try to free the page below */
+ }
+ }
+
+ /*
+ * If the page has buffers, try to free the buffer mappings
+ * associated with this page. If we succeed we try to free
+ * the page as well.
+ *
+ * We do this even if the page is PageDirty().
+ * try_to_release_page() does not perform I/O, but it is
+ * possible for a page to have PageDirty set, but it is actually
+ * clean (all its buffers are clean). This happens if the
+ * buffers were written out directly, with submit_bh(). ext3
+ * will do this, as well as the blockdev mapping.
+ * try_to_release_page() will discover that cleanness and will
+ * drop the buffers and mark the page clean - it can be freed.
+ *
+ * Rarely, pages can have buffers and no ->mapping. These are
+ * the pages which were not successfully invalidated in
+ * truncate_complete_page(). We try to drop those buffers here
+ * and if that worked, and the page is no longer mapped into
+ * process address space (page_count == 1) it can be freed.
+ * Otherwise, leave the page on the LRU so it is swappable.
+ */
+ if (page_has_private(page)) {
+ if (!try_to_release_page(page, sc->gfp_mask))
+ goto activate_locked;
+ if (!mapping && page_count(page) == 1) {
+ unlock_page(page);
+ if (put_page_testzero(page))
+ goto free_it;
+ else {
+ /*
+ * rare race with speculative reference.
+ * the speculative reference will free
+ * this page shortly, so we may
+ * increment nr_reclaimed here (and
+ * leave it off the LRU).
+ */
+ nr_reclaimed++;
+ continue;
+ }
+ }
+ }
+
+ if (PageAnon(page) && !PageSwapBacked(page)) {
+ /* follow __remove_mapping for reference */
+ if (!page_ref_freeze(page, 1))
+ goto keep_locked;
+ if (PageDirty(page)) {
+ page_ref_unfreeze(page, 1);
+ goto keep_locked;
+ }
+
+ count_vm_event(PGLAZYFREED);
+ count_memcg_page_event(page, PGLAZYFREED);
+ } else if (!mapping || !__remove_mapping(mapping, page, true,
+ sc->target_mem_cgroup))
+ goto keep_locked;
+
+ unlock_page(page);
+free_it:
+ /*
+ * THP may get swapped out in a whole, need account
+ * all base pages.
+ */
+ nr_reclaimed += nr_pages;
+
+ /*
+ * Is there need to periodically free_page_list? It would
+ * appear not as the counts should be low
+ */
+ if (unlikely(PageTransHuge(page)))
+ destroy_compound_page(page);
+ else
+ list_add(&page->lru, &free_pages);
+ continue;
+
+activate_locked_split:
+ /*
+ * The tail pages that are failed to add into swap cache
+ * reach here. Fixup nr_scanned and nr_pages.
+ */
+ if (nr_pages > 1) {
+ sc->nr_scanned -= (nr_pages - 1);
+ nr_pages = 1;
+ }
+activate_locked:
+ /* Not a candidate for swapping, so reclaim swap space. */
+ if (PageSwapCache(page) && (mem_cgroup_swap_full(page) ||
+ PageMlocked(page)))
+ try_to_free_swap(page);
+ VM_BUG_ON_PAGE(PageActive(page), page);
+ if (!PageMlocked(page)) {
+ int type = page_is_file_lru(page);
+ SetPageActive(page);
+ stat->nr_activate[type] += nr_pages;
+ count_memcg_page_event(page, PGACTIVATE);
+ }
+keep_locked:
+ unlock_page(page);
+keep:
+ list_add(&page->lru, &ret_pages);
+ VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
+ }
+
+ pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
+
+ mem_cgroup_uncharge_list(&free_pages);
+ try_to_unmap_flush();
+ free_unref_page_list(&free_pages);
+
+ list_splice(&ret_pages, page_list);
+ count_vm_events(PGACTIVATE, pgactivate);
+
+ return nr_reclaimed;
+}
+
+unsigned int reclaim_clean_pages_from_list(struct zone *zone,
+ struct list_head *page_list)
+{
+ struct scan_control sc = {
+ .gfp_mask = GFP_KERNEL,
+ .priority = DEF_PRIORITY,
+ .may_unmap = 1,
+ };
+ struct reclaim_stat stat;
+ unsigned int nr_reclaimed;
+ struct page *page, *next;
+ LIST_HEAD(clean_pages);
+
+ list_for_each_entry_safe(page, next, page_list, lru) {
+ if (page_is_file_lru(page) && !PageDirty(page) &&
+ !__PageMovable(page) && !PageUnevictable(page)) {
+ ClearPageActive(page);
+ list_move(&page->lru, &clean_pages);
+ }
+ }
+
+ nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
+ &stat, true);
+ list_splice(&clean_pages, page_list);
+ mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
+ -(long)nr_reclaimed);
+ /*
+ * Since lazyfree pages are isolated from file LRU from the beginning,
+ * they will rotate back to anonymous LRU in the end if it failed to
+ * discard so isolated count will be mismatched.
+ * Compensate the isolated count for both LRU lists.
+ */
+ mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON,
+ stat.nr_lazyfree_fail);
+ mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
+ -(long)stat.nr_lazyfree_fail);
+ return nr_reclaimed;
+}
+
+/*
+ * Attempt to remove the specified page from its LRU. Only take this page
+ * if it is of the appropriate PageActive status. Pages which are being
+ * freed elsewhere are also ignored.
+ *
+ * page: page to consider
+ * mode: one of the LRU isolation modes defined above
+ *
+ * returns 0 on success, -ve errno on failure.
+ */
+int __isolate_lru_page(struct page *page, isolate_mode_t mode)
+{
+ int ret = -EINVAL;
+
+ /* Only take pages on the LRU. */
+ if (!PageLRU(page))
+ return ret;
+
+ /* Compaction should not handle unevictable pages but CMA can do so */
+ if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
+ return ret;
+
+ ret = -EBUSY;
+
+ /*
+ * To minimise LRU disruption, the caller can indicate that it only
+ * wants to isolate pages it will be able to operate on without
+ * blocking - clean pages for the most part.
+ *
+ * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants to pages
+ * that it is possible to migrate without blocking
+ */
+ if (mode & ISOLATE_ASYNC_MIGRATE) {
+ /* All the caller can do on PageWriteback is block */
+ if (PageWriteback(page))
+ return ret;
+
+ if (PageDirty(page)) {
+ struct address_space *mapping;
+ bool migrate_dirty;
+
+ /*
+ * Only pages without mappings or that have a
+ * ->migratepage callback are possible to migrate
+ * without blocking. However, we can be racing with
+ * truncation so it's necessary to lock the page
+ * to stabilise the mapping as truncation holds
+ * the page lock until after the page is removed
+ * from the page cache.
+ */
+ if (!trylock_page(page))
+ return ret;
+
+ mapping = page_mapping(page);
+ migrate_dirty = !mapping || mapping->a_ops->migratepage;
+ unlock_page(page);
+ if (!migrate_dirty)
+ return ret;
+ }
+ }
+
+ if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
+ return ret;
+
+ if (likely(get_page_unless_zero(page))) {
+ /*
+ * Be careful not to clear PageLRU until after we're
+ * sure the page is not being freed elsewhere -- the
+ * page release code relies on it.
+ */
+ ClearPageLRU(page);
+ ret = 0;
+ }
+
+ return ret;
+}
+
+
+/*
+ * Update LRU sizes after isolating pages. The LRU size updates must
+ * be complete before mem_cgroup_update_lru_size due to a sanity check.
+ */
+static __always_inline void update_lru_sizes(struct lruvec *lruvec,
+ enum lru_list lru, unsigned long *nr_zone_taken)
+{
+ int zid;
+
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ if (!nr_zone_taken[zid])
+ continue;
+
+ update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
+ }
+
+}
+
+/**
+ * pgdat->lru_lock is heavily contended. Some of the functions that
+ * shrink the lists perform better by taking out a batch of pages
+ * and working on them outside the LRU lock.
+ *
+ * For pagecache intensive workloads, this function is the hottest
+ * spot in the kernel (apart from copy_*_user functions).
+ *
+ * Appropriate locks must be held before calling this function.
+ *
+ * @nr_to_scan: The number of eligible pages to look through on the list.
+ * @lruvec: The LRU vector to pull pages from.
+ * @dst: The temp list to put pages on to.
+ * @nr_scanned: The number of pages that were scanned.
+ * @sc: The scan_control struct for this reclaim session
+ * @lru: LRU list id for isolating
+ *
+ * returns how many pages were moved onto *@dst.
+ */
+static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
+ struct lruvec *lruvec, struct list_head *dst,
+ unsigned long *nr_scanned, struct scan_control *sc,
+ enum lru_list lru)
+{
+ struct list_head *src = &lruvec->lists[lru];
+ unsigned long nr_taken = 0;
+ unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
+ unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
+ unsigned long skipped = 0;
+ unsigned long scan, total_scan, nr_pages;
+ LIST_HEAD(pages_skipped);
+ isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
+
+ total_scan = 0;
+ scan = 0;
+ while (scan < nr_to_scan && !list_empty(src)) {
+ struct page *page;
+
+ page = lru_to_page(src);
+ prefetchw_prev_lru_page(page, src, flags);
+
+ VM_BUG_ON_PAGE(!PageLRU(page), page);
+
+ nr_pages = compound_nr(page);
+ total_scan += nr_pages;
+
+ if (page_zonenum(page) > sc->reclaim_idx) {
+ list_move(&page->lru, &pages_skipped);
+ nr_skipped[page_zonenum(page)] += nr_pages;
+ continue;
+ }
+
+ /*
+ * Do not count skipped pages because that makes the function
+ * return with no isolated pages if the LRU mostly contains
+ * ineligible pages. This causes the VM to not reclaim any
+ * pages, triggering a premature OOM.
+ *
+ * Account all tail pages of THP. This would not cause
+ * premature OOM since __isolate_lru_page() returns -EBUSY
+ * only when the page is being freed somewhere else.
+ */
+ scan += nr_pages;
+ switch (__isolate_lru_page(page, mode)) {
+ case 0:
+ nr_taken += nr_pages;
+ nr_zone_taken[page_zonenum(page)] += nr_pages;
+ list_move(&page->lru, dst);
+ break;
+
+ case -EBUSY:
+ /* else it is being freed elsewhere */
+ list_move(&page->lru, src);
+ continue;
+
+ default:
+ BUG();
+ }
+ }
+
+ /*
+ * Splice any skipped pages to the start of the LRU list. Note that
+ * this disrupts the LRU order when reclaiming for lower zones but
+ * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX
+ * scanning would soon rescan the same pages to skip and put the
+ * system at risk of premature OOM.
+ */
+ if (!list_empty(&pages_skipped)) {
+ int zid;
+
+ list_splice(&pages_skipped, src);
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ if (!nr_skipped[zid])
+ continue;
+
+ __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
+ skipped += nr_skipped[zid];
+ }
+ }
+ *nr_scanned = total_scan;
+ trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
+ total_scan, skipped, nr_taken, mode, lru);
+ update_lru_sizes(lruvec, lru, nr_zone_taken);
+ return nr_taken;
+}
+
+/**
+ * isolate_lru_page - tries to isolate a page from its LRU list
+ * @page: page to isolate from its LRU list
+ *
+ * Isolates a @page from an LRU list, clears PageLRU and adjusts the
+ * vmstat statistic corresponding to whatever LRU list the page was on.
+ *
+ * Returns 0 if the page was removed from an LRU list.
+ * Returns -EBUSY if the page was not on an LRU list.
+ *
+ * The returned page will have PageLRU() cleared. If it was found on
+ * the active list, it will have PageActive set. If it was found on
+ * the unevictable list, it will have the PageUnevictable bit set. That flag
+ * may need to be cleared by the caller before letting the page go.
+ *
+ * The vmstat statistic corresponding to the list on which the page was
+ * found will be decremented.
+ *
+ * Restrictions:
+ *
+ * (1) Must be called with an elevated refcount on the page. This is a
+ * fundamental difference from isolate_lru_pages (which is called
+ * without a stable reference).
+ * (2) the lru_lock must not be held.
+ * (3) interrupts must be enabled.
+ */
+int isolate_lru_page(struct page *page)
+{
+ int ret = -EBUSY;
+
+ VM_BUG_ON_PAGE(!page_count(page), page);
+ WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
+
+ if (PageLRU(page)) {
+ pg_data_t *pgdat = page_pgdat(page);
+ struct lruvec *lruvec;
+
+ spin_lock_irq(&pgdat->lru_lock);
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ if (PageLRU(page)) {
+ int lru = page_lru(page);
+ get_page(page);
+ ClearPageLRU(page);
+ del_page_from_lru_list(page, lruvec, lru);
+ ret = 0;
+ }
+ spin_unlock_irq(&pgdat->lru_lock);
+ }
+ return ret;
+}
+
+/*
+ * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
+ * then get rescheduled. When there are massive number of tasks doing page
+ * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
+ * the LRU list will go small and be scanned faster than necessary, leading to
+ * unnecessary swapping, thrashing and OOM.
+ */
+static int too_many_isolated(struct pglist_data *pgdat, int file,
+ struct scan_control *sc)
+{
+ unsigned long inactive, isolated;
+
+ if (current_is_kswapd())
+ return 0;
+
+ if (!writeback_throttling_sane(sc))
+ return 0;
+
+ if (file) {
+ inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
+ isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
+ } else {
+ inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
+ isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
+ }
+
+ /*
+ * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
+ * won't get blocked by normal direct-reclaimers, forming a circular
+ * deadlock.
+ */
+ if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
+ inactive >>= 3;
+
+ return isolated > inactive;
+}
+
+/*
+ * This moves pages from @list to corresponding LRU list.
+ *
+ * We move them the other way if the page is referenced by one or more
+ * processes, from rmap.
+ *
+ * If the pages are mostly unmapped, the processing is fast and it is
+ * appropriate to hold zone_lru_lock across the whole operation. But if
+ * the pages are mapped, the processing is slow (page_referenced()) so we
+ * should drop zone_lru_lock around each page. It's impossible to balance
+ * this, so instead we remove the pages from the LRU while processing them.
+ * It is safe to rely on PG_active against the non-LRU pages in here because
+ * nobody will play with that bit on a non-LRU page.
+ *
+ * The downside is that we have to touch page->_refcount against each page.
+ * But we had to alter page->flags anyway.
+ *
+ * Returns the number of pages moved to the given lruvec.
+ */
+
+static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
+ struct list_head *list)
+{
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+ int nr_pages, nr_moved = 0;
+ LIST_HEAD(pages_to_free);
+ struct page *page;
+ enum lru_list lru;
+
+ while (!list_empty(list)) {
+ page = lru_to_page(list);
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ if (unlikely(!page_evictable(page))) {
+ list_del(&page->lru);
+ spin_unlock_irq(&pgdat->lru_lock);
+ putback_lru_page(page);
+ spin_lock_irq(&pgdat->lru_lock);
+ continue;
+ }
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
+
+ SetPageLRU(page);
+ lru = page_lru(page);
+
+ nr_pages = thp_nr_pages(page);
+ update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
+ list_move(&page->lru, &lruvec->lists[lru]);
+
+ if (put_page_testzero(page)) {
+ __ClearPageLRU(page);
+ __ClearPageActive(page);
+ del_page_from_lru_list(page, lruvec, lru);
+
+ if (unlikely(PageCompound(page))) {
+ spin_unlock_irq(&pgdat->lru_lock);
+ destroy_compound_page(page);
+ spin_lock_irq(&pgdat->lru_lock);
+ } else
+ list_add(&page->lru, &pages_to_free);
+ } else {
+ nr_moved += nr_pages;
+ if (PageActive(page))
+ workingset_age_nonresident(lruvec, nr_pages);
+ }
+ }
+
+ /*
+ * To save our caller's stack, now use input list for pages to free.
+ */
+ list_splice(&pages_to_free, list);
+
+ return nr_moved;
+}
+
+/*
+ * If a kernel thread (such as nfsd for loop-back mounts) services
+ * a backing device by writing to the page cache it sets PF_LOCAL_THROTTLE.
+ * In that case we should only throttle if the backing device it is
+ * writing to is congested. In other cases it is safe to throttle.
+ */
+static int current_may_throttle(void)
+{
+ return !(current->flags & PF_LOCAL_THROTTLE) ||
+ current->backing_dev_info == NULL ||
+ bdi_write_congested(current->backing_dev_info);
+}
+
+/*
+ * shrink_inactive_list() is a helper for shrink_node(). It returns the number
+ * of reclaimed pages
+ */
+static noinline_for_stack unsigned long
+shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
+ struct scan_control *sc, enum lru_list lru)
+{
+ LIST_HEAD(page_list);
+ unsigned long nr_scanned;
+ unsigned int nr_reclaimed = 0;
+ unsigned long nr_taken;
+ struct reclaim_stat stat;
+ bool file = is_file_lru(lru);
+ enum vm_event_item item;
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+ bool stalled = false;
+
+ while (unlikely(too_many_isolated(pgdat, file, sc))) {
+ if (stalled)
+ return 0;
+
+ /* wait a bit for the reclaimer. */
+ msleep(100);
+ stalled = true;
+
+ /* We are about to die and free our memory. Return now. */
+ if (fatal_signal_pending(current))
+ return SWAP_CLUSTER_MAX;
+ }
+
+ lru_add_drain();
+
+ spin_lock_irq(&pgdat->lru_lock);
+
+ nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
+ &nr_scanned, sc, lru);
+
+ __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
+ item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
+ if (!cgroup_reclaim(sc))
+ __count_vm_events(item, nr_scanned);
+ __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
+ __count_vm_events(PGSCAN_ANON + file, nr_scanned);
+
+ spin_unlock_irq(&pgdat->lru_lock);
+
+ if (nr_taken == 0)
+ return 0;
+
+ nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, &stat, false);
+
+ spin_lock_irq(&pgdat->lru_lock);
+
+ move_pages_to_lru(lruvec, &page_list);
+
+ __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
+ lru_note_cost(lruvec, file, stat.nr_pageout);
+ item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
+ if (!cgroup_reclaim(sc))
+ __count_vm_events(item, nr_reclaimed);
+ __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
+ __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
+
+ spin_unlock_irq(&pgdat->lru_lock);
+
+ mem_cgroup_uncharge_list(&page_list);
+ free_unref_page_list(&page_list);
+
+ /*
+ * If dirty pages are scanned that are not queued for IO, it
+ * implies that flushers are not doing their job. This can
+ * happen when memory pressure pushes dirty pages to the end of
+ * the LRU before the dirty limits are breached and the dirty
+ * data has expired. It can also happen when the proportion of
+ * dirty pages grows not through writes but through memory
+ * pressure reclaiming all the clean cache. And in some cases,
+ * the flushers simply cannot keep up with the allocation
+ * rate. Nudge the flusher threads in case they are asleep.
+ */
+ if (stat.nr_unqueued_dirty == nr_taken)
+ wakeup_flusher_threads(WB_REASON_VMSCAN);
+
+ sc->nr.dirty += stat.nr_dirty;
+ sc->nr.congested += stat.nr_congested;
+ sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
+ sc->nr.writeback += stat.nr_writeback;
+ sc->nr.immediate += stat.nr_immediate;
+ sc->nr.taken += nr_taken;
+ if (file)
+ sc->nr.file_taken += nr_taken;
+
+ trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
+ nr_scanned, nr_reclaimed, &stat, sc->priority, file);
+ return nr_reclaimed;
+}
+
+static void shrink_active_list(unsigned long nr_to_scan,
+ struct lruvec *lruvec,
+ struct scan_control *sc,
+ enum lru_list lru)
+{
+ unsigned long nr_taken;
+ unsigned long nr_scanned;
+ unsigned long vm_flags;
+ LIST_HEAD(l_hold); /* The pages which were snipped off */
+ LIST_HEAD(l_active);
+ LIST_HEAD(l_inactive);
+ struct page *page;
+ unsigned nr_deactivate, nr_activate;
+ unsigned nr_rotated = 0;
+ int file = is_file_lru(lru);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+ lru_add_drain();
+
+ spin_lock_irq(&pgdat->lru_lock);
+
+ nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
+ &nr_scanned, sc, lru);
+
+ __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
+
+ if (!cgroup_reclaim(sc))
+ __count_vm_events(PGREFILL, nr_scanned);
+ __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
+
+ spin_unlock_irq(&pgdat->lru_lock);
+
+ while (!list_empty(&l_hold)) {
+ cond_resched();
+ page = lru_to_page(&l_hold);
+ list_del(&page->lru);
+
+ if (unlikely(!page_evictable(page))) {
+ putback_lru_page(page);
+ continue;
+ }
+
+ if (unlikely(buffer_heads_over_limit)) {
+ if (page_has_private(page) && trylock_page(page)) {
+ if (page_has_private(page))
+ try_to_release_page(page, 0);
+ unlock_page(page);
+ }
+ }
+
+ if (page_referenced(page, 0, sc->target_mem_cgroup,
+ &vm_flags)) {
+ /*
+ * Identify referenced, file-backed active pages and
+ * give them one more trip around the active list. So
+ * that executable code get better chances to stay in
+ * memory under moderate memory pressure. Anon pages
+ * are not likely to be evicted by use-once streaming
+ * IO, plus JVM can create lots of anon VM_EXEC pages,
+ * so we ignore them here.
+ */
+ if ((vm_flags & VM_EXEC) && page_is_file_lru(page)) {
+ nr_rotated += thp_nr_pages(page);
+ list_add(&page->lru, &l_active);
+ continue;
+ }
+ }
+
+ ClearPageActive(page); /* we are de-activating */
+ SetPageWorkingset(page);
+ list_add(&page->lru, &l_inactive);
+ }
+
+ /*
+ * Move pages back to the lru list.
+ */
+ spin_lock_irq(&pgdat->lru_lock);
+
+ nr_activate = move_pages_to_lru(lruvec, &l_active);
+ nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
+ /* Keep all free pages in l_active list */
+ list_splice(&l_inactive, &l_active);
+
+ __count_vm_events(PGDEACTIVATE, nr_deactivate);
+ __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
+
+ __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
+ spin_unlock_irq(&pgdat->lru_lock);
+
+ mem_cgroup_uncharge_list(&l_active);
+ free_unref_page_list(&l_active);
+ trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
+ nr_deactivate, nr_rotated, sc->priority, file);
+}
+
+unsigned long reclaim_pages(struct list_head *page_list)
+{
+ int nid = NUMA_NO_NODE;
+ unsigned int nr_reclaimed = 0;
+ LIST_HEAD(node_page_list);
+ struct reclaim_stat dummy_stat;
+ struct page *page;
+ struct scan_control sc = {
+ .gfp_mask = GFP_KERNEL,
+ .priority = DEF_PRIORITY,
+ .may_writepage = 1,
+ .may_unmap = 1,
+ .may_swap = 1,
+ };
+
+ while (!list_empty(page_list)) {
+ page = lru_to_page(page_list);
+ if (nid == NUMA_NO_NODE) {
+ nid = page_to_nid(page);
+ INIT_LIST_HEAD(&node_page_list);
+ }
+
+ if (nid == page_to_nid(page)) {
+ ClearPageActive(page);
+ list_move(&page->lru, &node_page_list);
+ continue;
+ }
+
+ nr_reclaimed += shrink_page_list(&node_page_list,
+ NODE_DATA(nid),
+ &sc, &dummy_stat, false);
+ while (!list_empty(&node_page_list)) {
+ page = lru_to_page(&node_page_list);
+ list_del(&page->lru);
+ putback_lru_page(page);
+ }
+
+ nid = NUMA_NO_NODE;
+ }
+
+ if (!list_empty(&node_page_list)) {
+ nr_reclaimed += shrink_page_list(&node_page_list,
+ NODE_DATA(nid),
+ &sc, &dummy_stat, false);
+ while (!list_empty(&node_page_list)) {
+ page = lru_to_page(&node_page_list);
+ list_del(&page->lru);
+ putback_lru_page(page);
+ }
+ }
+
+ return nr_reclaimed;
+}
+
+static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
+ struct lruvec *lruvec, struct scan_control *sc)
+{
+ if (is_active_lru(lru)) {
+ if (sc->may_deactivate & (1 << is_file_lru(lru)))
+ shrink_active_list(nr_to_scan, lruvec, sc, lru);
+ else
+ sc->skipped_deactivate = 1;
+ return 0;
+ }
+
+ return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
+}
+
+/*
+ * The inactive anon list should be small enough that the VM never has
+ * to do too much work.
+ *
+ * The inactive file list should be small enough to leave most memory
+ * to the established workingset on the scan-resistant active list,
+ * but large enough to avoid thrashing the aggregate readahead window.
+ *
+ * Both inactive lists should also be large enough that each inactive
+ * page has a chance to be referenced again before it is reclaimed.
+ *
+ * If that fails and refaulting is observed, the inactive list grows.
+ *
+ * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
+ * on this LRU, maintained by the pageout code. An inactive_ratio
+ * of 3 means 3:1 or 25% of the pages are kept on the inactive list.
+ *
+ * total target max
+ * memory ratio inactive
+ * -------------------------------------
+ * 10MB 1 5MB
+ * 100MB 1 50MB
+ * 1GB 3 250MB
+ * 10GB 10 0.9GB
+ * 100GB 31 3GB
+ * 1TB 101 10GB
+ * 10TB 320 32GB
+ */
+static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru)
+{
+ enum lru_list active_lru = inactive_lru + LRU_ACTIVE;
+ unsigned long inactive, active;
+ unsigned long inactive_ratio;
+ unsigned long gb;
+
+ inactive = lruvec_page_state(lruvec, NR_LRU_BASE + inactive_lru);
+ active = lruvec_page_state(lruvec, NR_LRU_BASE + active_lru);
+
+ gb = (inactive + active) >> (30 - PAGE_SHIFT);
+ if (gb)
+ inactive_ratio = int_sqrt(10 * gb);
+ else
+ inactive_ratio = 1;
+
+ return inactive * inactive_ratio < active;
+}
+
+enum scan_balance {
+ SCAN_EQUAL,
+ SCAN_FRACT,
+ SCAN_ANON,
+ SCAN_FILE,
+};
+
+/*
+ * Determine how aggressively the anon and file LRU lists should be
+ * scanned. The relative value of each set of LRU lists is determined
+ * by looking at the fraction of the pages scanned we did rotate back
+ * onto the active list instead of evict.
+ *
+ * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
+ * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
+ */
+static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
+ unsigned long *nr)
+{
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ unsigned long anon_cost, file_cost, total_cost;
+ int swappiness = mem_cgroup_swappiness(memcg);
+ u64 fraction[ANON_AND_FILE];
+ u64 denominator = 0; /* gcc */
+ enum scan_balance scan_balance;
+ unsigned long ap, fp;
+ enum lru_list lru;
+
+ /* If we have no swap space, do not bother scanning anon pages. */
+ if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
+ scan_balance = SCAN_FILE;
+ goto out;
+ }
+
+ /*
+ * Global reclaim will swap to prevent OOM even with no
+ * swappiness, but memcg users want to use this knob to
+ * disable swapping for individual groups completely when
+ * using the memory controller's swap limit feature would be
+ * too expensive.
+ */
+ if (cgroup_reclaim(sc) && !swappiness) {
+ scan_balance = SCAN_FILE;
+ goto out;
+ }
+
+ /*
+ * Do not apply any pressure balancing cleverness when the
+ * system is close to OOM, scan both anon and file equally
+ * (unless the swappiness setting disagrees with swapping).
+ */
+ if (!sc->priority && swappiness) {
+ scan_balance = SCAN_EQUAL;
+ goto out;
+ }
+
+ /*
+ * If the system is almost out of file pages, force-scan anon.
+ */
+ if (sc->file_is_tiny) {
+ scan_balance = SCAN_ANON;
+ goto out;
+ }
+
+ /*
+ * If there is enough inactive page cache, we do not reclaim
+ * anything from the anonymous working right now.
+ */
+ if (sc->cache_trim_mode) {
+ scan_balance = SCAN_FILE;
+ goto out;
+ }
+
+ scan_balance = SCAN_FRACT;
+ /*
+ * Calculate the pressure balance between anon and file pages.
+ *
+ * The amount of pressure we put on each LRU is inversely
+ * proportional to the cost of reclaiming each list, as
+ * determined by the share of pages that are refaulting, times
+ * the relative IO cost of bringing back a swapped out
+ * anonymous page vs reloading a filesystem page (swappiness).
+ *
+ * Although we limit that influence to ensure no list gets
+ * left behind completely: at least a third of the pressure is
+ * applied, before swappiness.
+ *
+ * With swappiness at 100, anon and file have equal IO cost.
+ */
+ total_cost = sc->anon_cost + sc->file_cost;
+ anon_cost = total_cost + sc->anon_cost;
+ file_cost = total_cost + sc->file_cost;
+ total_cost = anon_cost + file_cost;
+
+ ap = swappiness * (total_cost + 1);
+ ap /= anon_cost + 1;
+
+ fp = (200 - swappiness) * (total_cost + 1);
+ fp /= file_cost + 1;
+
+ fraction[0] = ap;
+ fraction[1] = fp;
+ denominator = ap + fp;
+out:
+ for_each_evictable_lru(lru) {
+ int file = is_file_lru(lru);
+ unsigned long lruvec_size;
+ unsigned long low, min;
+ unsigned long scan;
+
+ lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
+ mem_cgroup_protection(sc->target_mem_cgroup, memcg,
+ &min, &low);
+
+ if (min || low) {
+ /*
+ * Scale a cgroup's reclaim pressure by proportioning
+ * its current usage to its memory.low or memory.min
+ * setting.
+ *
+ * This is important, as otherwise scanning aggression
+ * becomes extremely binary -- from nothing as we
+ * approach the memory protection threshold, to totally
+ * nominal as we exceed it. This results in requiring
+ * setting extremely liberal protection thresholds. It
+ * also means we simply get no protection at all if we
+ * set it too low, which is not ideal.
+ *
+ * If there is any protection in place, we reduce scan
+ * pressure by how much of the total memory used is
+ * within protection thresholds.
+ *
+ * There is one special case: in the first reclaim pass,
+ * we skip over all groups that are within their low
+ * protection. If that fails to reclaim enough pages to
+ * satisfy the reclaim goal, we come back and override
+ * the best-effort low protection. However, we still
+ * ideally want to honor how well-behaved groups are in
+ * that case instead of simply punishing them all
+ * equally. As such, we reclaim them based on how much
+ * memory they are using, reducing the scan pressure
+ * again by how much of the total memory used is under
+ * hard protection.
+ */
+ unsigned long cgroup_size = mem_cgroup_size(memcg);
+ unsigned long protection;
+
+ /* memory.low scaling, make sure we retry before OOM */
+ if (!sc->memcg_low_reclaim && low > min) {
+ protection = low;
+ sc->memcg_low_skipped = 1;
+ } else {
+ protection = min;
+ }
+
+ /* Avoid TOCTOU with earlier protection check */
+ cgroup_size = max(cgroup_size, protection);
+
+ scan = lruvec_size - lruvec_size * protection /
+ (cgroup_size + 1);
+
+ /*
+ * Minimally target SWAP_CLUSTER_MAX pages to keep
+ * reclaim moving forwards, avoiding decrementing
+ * sc->priority further than desirable.
+ */
+ scan = max(scan, SWAP_CLUSTER_MAX);
+ } else {
+ scan = lruvec_size;
+ }
+
+ scan >>= sc->priority;
+
+ /*
+ * If the cgroup's already been deleted, make sure to
+ * scrape out the remaining cache.
+ */
+ if (!scan && !mem_cgroup_online(memcg))
+ scan = min(lruvec_size, SWAP_CLUSTER_MAX);
+
+ switch (scan_balance) {
+ case SCAN_EQUAL:
+ /* Scan lists relative to size */
+ break;
+ case SCAN_FRACT:
+ /*
+ * Scan types proportional to swappiness and
+ * their relative recent reclaim efficiency.
+ * Make sure we don't miss the last page on
+ * the offlined memory cgroups because of a
+ * round-off error.
+ */
+ scan = mem_cgroup_online(memcg) ?
+ div64_u64(scan * fraction[file], denominator) :
+ DIV64_U64_ROUND_UP(scan * fraction[file],
+ denominator);
+ break;
+ case SCAN_FILE:
+ case SCAN_ANON:
+ /* Scan one type exclusively */
+ if ((scan_balance == SCAN_FILE) != file)
+ scan = 0;
+ break;
+ default:
+ /* Look ma, no brain */
+ BUG();
+ }
+
+ nr[lru] = scan;
+ }
+}
+
+static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+ unsigned long nr[NR_LRU_LISTS];
+ unsigned long targets[NR_LRU_LISTS];
+ unsigned long nr_to_scan;
+ enum lru_list lru;
+ unsigned long nr_reclaimed = 0;
+ unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+ bool proportional_reclaim;
+ struct blk_plug plug;
+
+ get_scan_count(lruvec, sc, nr);
+
+ /* Record the original scan target for proportional adjustments later */
+ memcpy(targets, nr, sizeof(nr));
+
+ /*
+ * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
+ * event that can occur when there is little memory pressure e.g.
+ * multiple streaming readers/writers. Hence, we do not abort scanning
+ * when the requested number of pages are reclaimed when scanning at
+ * DEF_PRIORITY on the assumption that the fact we are direct
+ * reclaiming implies that kswapd is not keeping up and it is best to
+ * do a batch of work at once. For memcg reclaim one check is made to
+ * abort proportional reclaim if either the file or anon lru has already
+ * dropped to zero at the first pass.
+ */
+ proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
+ sc->priority == DEF_PRIORITY);
+
+ blk_start_plug(&plug);
+ while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
+ nr[LRU_INACTIVE_FILE]) {
+ unsigned long nr_anon, nr_file, percentage;
+ unsigned long nr_scanned;
+
+ for_each_evictable_lru(lru) {
+ if (nr[lru]) {
+ nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
+ nr[lru] -= nr_to_scan;
+
+ nr_reclaimed += shrink_list(lru, nr_to_scan,
+ lruvec, sc);
+ }
+ }
+
+ cond_resched();
+
+ if (nr_reclaimed < nr_to_reclaim || proportional_reclaim)
+ continue;
+
+ /*
+ * For kswapd and memcg, reclaim at least the number of pages
+ * requested. Ensure that the anon and file LRUs are scanned
+ * proportionally what was requested by get_scan_count(). We
+ * stop reclaiming one LRU and reduce the amount scanning
+ * proportional to the original scan target.
+ */
+ nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
+ nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
+
+ /*
+ * It's just vindictive to attack the larger once the smaller
+ * has gone to zero. And given the way we stop scanning the
+ * smaller below, this makes sure that we only make one nudge
+ * towards proportionality once we've got nr_to_reclaim.
+ */
+ if (!nr_file || !nr_anon)
+ break;
+
+ if (nr_file > nr_anon) {
+ unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
+ targets[LRU_ACTIVE_ANON] + 1;
+ lru = LRU_BASE;
+ percentage = nr_anon * 100 / scan_target;
+ } else {
+ unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
+ targets[LRU_ACTIVE_FILE] + 1;
+ lru = LRU_FILE;
+ percentage = nr_file * 100 / scan_target;
+ }
+
+ /* Stop scanning the smaller of the LRU */
+ nr[lru] = 0;
+ nr[lru + LRU_ACTIVE] = 0;
+
+ /*
+ * Recalculate the other LRU scan count based on its original
+ * scan target and the percentage scanning already complete
+ */
+ lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
+ nr_scanned = targets[lru] - nr[lru];
+ nr[lru] = targets[lru] * (100 - percentage) / 100;
+ nr[lru] -= min(nr[lru], nr_scanned);
+
+ lru += LRU_ACTIVE;
+ nr_scanned = targets[lru] - nr[lru];
+ nr[lru] = targets[lru] * (100 - percentage) / 100;
+ nr[lru] -= min(nr[lru], nr_scanned);
+ }
+ blk_finish_plug(&plug);
+ sc->nr_reclaimed += nr_reclaimed;
+
+ /*
+ * Even if we did not try to evict anon pages at all, we want to
+ * rebalance the anon lru active/inactive ratio.
+ */
+ if (total_swap_pages && inactive_is_low(lruvec, LRU_INACTIVE_ANON))
+ shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
+ sc, LRU_ACTIVE_ANON);
+}
+
+/* Use reclaim/compaction for costly allocs or under memory pressure */
+static bool in_reclaim_compaction(struct scan_control *sc)
+{
+ if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
+ (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
+ sc->priority < DEF_PRIORITY - 2))
+ return true;
+
+ return false;
+}
+
+/*
+ * Reclaim/compaction is used for high-order allocation requests. It reclaims
+ * order-0 pages before compacting the zone. should_continue_reclaim() returns
+ * true if more pages should be reclaimed such that when the page allocator
+ * calls try_to_compact_pages() that it will have enough free pages to succeed.
+ * It will give up earlier than that if there is difficulty reclaiming pages.
+ */
+static inline bool should_continue_reclaim(struct pglist_data *pgdat,
+ unsigned long nr_reclaimed,
+ struct scan_control *sc)
+{
+ unsigned long pages_for_compaction;
+ unsigned long inactive_lru_pages;
+ int z;
+
+ /* If not in reclaim/compaction mode, stop */
+ if (!in_reclaim_compaction(sc))
+ return false;
+
+ /*
+ * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX
+ * number of pages that were scanned. This will return to the caller
+ * with the risk reclaim/compaction and the resulting allocation attempt
+ * fails. In the past we have tried harder for __GFP_RETRY_MAYFAIL
+ * allocations through requiring that the full LRU list has been scanned
+ * first, by assuming that zero delta of sc->nr_scanned means full LRU
+ * scan, but that approximation was wrong, and there were corner cases
+ * where always a non-zero amount of pages were scanned.
+ */
+ if (!nr_reclaimed)
+ return false;
+
+ /* If compaction would go ahead or the allocation would succeed, stop */
+ for (z = 0; z <= sc->reclaim_idx; z++) {
+ struct zone *zone = &pgdat->node_zones[z];
+ if (!managed_zone(zone))
+ continue;
+
+ switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
+ case COMPACT_SUCCESS:
+ case COMPACT_CONTINUE:
+ return false;
+ default:
+ /* check next zone */
+ ;
+ }
+ }
+
+ /*
+ * If we have not reclaimed enough pages for compaction and the
+ * inactive lists are large enough, continue reclaiming
+ */
+ pages_for_compaction = compact_gap(sc->order);
+ inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
+ if (get_nr_swap_pages() > 0)
+ inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
+
+ return inactive_lru_pages > pages_for_compaction;
+}
+
+static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
+{
+ struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
+ struct mem_cgroup *memcg;
+
+ memcg = mem_cgroup_iter(target_memcg, NULL, NULL);
+ do {
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ unsigned long reclaimed;
+ unsigned long scanned;
+
+ /*
+ * This loop can become CPU-bound when target memcgs
+ * aren't eligible for reclaim - either because they
+ * don't have any reclaimable pages, or because their
+ * memory is explicitly protected. Avoid soft lockups.
+ */
+ cond_resched();
+
+ mem_cgroup_calculate_protection(target_memcg, memcg);
+
+ if (mem_cgroup_below_min(memcg)) {
+ /*
+ * Hard protection.
+ * If there is no reclaimable memory, OOM.
+ */
+ continue;
+ } else if (mem_cgroup_below_low(memcg)) {
+ /*
+ * Soft protection.
+ * Respect the protection only as long as
+ * there is an unprotected supply
+ * of reclaimable memory from other cgroups.
+ */
+ if (!sc->memcg_low_reclaim) {
+ sc->memcg_low_skipped = 1;
+ continue;
+ }
+ memcg_memory_event(memcg, MEMCG_LOW);
+ }
+
+ reclaimed = sc->nr_reclaimed;
+ scanned = sc->nr_scanned;
+
+ shrink_lruvec(lruvec, sc);
+
+ shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
+ sc->priority);
+
+ /* Record the group's reclaim efficiency */
+ vmpressure(sc->gfp_mask, memcg, false,
+ sc->nr_scanned - scanned,
+ sc->nr_reclaimed - reclaimed);
+
+ } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
+}
+
+static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
+{
+ struct reclaim_state *reclaim_state = current->reclaim_state;
+ unsigned long nr_reclaimed, nr_scanned;
+ struct lruvec *target_lruvec;
+ bool reclaimable = false;
+ unsigned long file;
+
+ target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
+
+again:
+ memset(&sc->nr, 0, sizeof(sc->nr));
+
+ nr_reclaimed = sc->nr_reclaimed;
+ nr_scanned = sc->nr_scanned;
+
+ /*
+ * Determine the scan balance between anon and file LRUs.
+ */
+ spin_lock_irq(&pgdat->lru_lock);
+ sc->anon_cost = target_lruvec->anon_cost;
+ sc->file_cost = target_lruvec->file_cost;
+ spin_unlock_irq(&pgdat->lru_lock);
+
+ /*
+ * Target desirable inactive:active list ratios for the anon
+ * and file LRU lists.
+ */
+ if (!sc->force_deactivate) {
+ unsigned long refaults;
+
+ refaults = lruvec_page_state(target_lruvec,
+ WORKINGSET_ACTIVATE_ANON);
+ if (refaults != target_lruvec->refaults[0] ||
+ inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
+ sc->may_deactivate |= DEACTIVATE_ANON;
+ else
+ sc->may_deactivate &= ~DEACTIVATE_ANON;
+
+ /*
+ * When refaults are being observed, it means a new
+ * workingset is being established. Deactivate to get
+ * rid of any stale active pages quickly.
+ */
+ refaults = lruvec_page_state(target_lruvec,
+ WORKINGSET_ACTIVATE_FILE);
+ if (refaults != target_lruvec->refaults[1] ||
+ inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
+ sc->may_deactivate |= DEACTIVATE_FILE;
+ else
+ sc->may_deactivate &= ~DEACTIVATE_FILE;
+ } else
+ sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
+
+ /*
+ * If we have plenty of inactive file pages that aren't
+ * thrashing, try to reclaim those first before touching
+ * anonymous pages.
+ */
+ file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
+ if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
+ sc->cache_trim_mode = 1;
+ else
+ sc->cache_trim_mode = 0;
+
+ /*
+ * Prevent the reclaimer from falling into the cache trap: as
+ * cache pages start out inactive, every cache fault will tip
+ * the scan balance towards the file LRU. And as the file LRU
+ * shrinks, so does the window for rotation from references.
+ * This means we have a runaway feedback loop where a tiny
+ * thrashing file LRU becomes infinitely more attractive than
+ * anon pages. Try to detect this based on file LRU size.
+ */
+ if (!cgroup_reclaim(sc)) {
+ unsigned long total_high_wmark = 0;
+ unsigned long free, anon;
+ int z;
+
+ free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
+ file = node_page_state(pgdat, NR_ACTIVE_FILE) +
+ node_page_state(pgdat, NR_INACTIVE_FILE);
+
+ for (z = 0; z < MAX_NR_ZONES; z++) {
+ struct zone *zone = &pgdat->node_zones[z];
+ if (!managed_zone(zone))
+ continue;
+
+ total_high_wmark += high_wmark_pages(zone);
+ }
+
+ /*
+ * Consider anon: if that's low too, this isn't a
+ * runaway file reclaim problem, but rather just
+ * extreme pressure. Reclaim as per usual then.
+ */
+ anon = node_page_state(pgdat, NR_INACTIVE_ANON);
+
+ sc->file_is_tiny =
+ file + free <= total_high_wmark &&
+ !(sc->may_deactivate & DEACTIVATE_ANON) &&
+ anon >> sc->priority;
+ }
+
+ shrink_node_memcgs(pgdat, sc);
+
+ if (reclaim_state) {
+ sc->nr_reclaimed += reclaim_state->reclaimed_slab;
+ reclaim_state->reclaimed_slab = 0;
+ }
+
+ /* Record the subtree's reclaim efficiency */
+ vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
+ sc->nr_scanned - nr_scanned,
+ sc->nr_reclaimed - nr_reclaimed);
+
+ if (sc->nr_reclaimed - nr_reclaimed)
+ reclaimable = true;
+
+ if (current_is_kswapd()) {
+ /*
+ * If reclaim is isolating dirty pages under writeback,
+ * it implies that the long-lived page allocation rate
+ * is exceeding the page laundering rate. Either the
+ * global limits are not being effective at throttling
+ * processes due to the page distribution throughout
+ * zones or there is heavy usage of a slow backing
+ * device. The only option is to throttle from reclaim
+ * context which is not ideal as there is no guarantee
+ * the dirtying process is throttled in the same way
+ * balance_dirty_pages() manages.
+ *
+ * Once a node is flagged PGDAT_WRITEBACK, kswapd will
+ * count the number of pages under pages flagged for
+ * immediate reclaim and stall if any are encountered
+ * in the nr_immediate check below.
+ */
+ if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
+ set_bit(PGDAT_WRITEBACK, &pgdat->flags);
+
+ /* Allow kswapd to start writing pages during reclaim.*/
+ if (sc->nr.unqueued_dirty == sc->nr.file_taken)
+ set_bit(PGDAT_DIRTY, &pgdat->flags);
+
+ /*
+ * If kswapd scans pages marked for immediate
+ * reclaim and under writeback (nr_immediate), it
+ * implies that pages are cycling through the LRU
+ * faster than they are written so also forcibly stall.
+ */
+ if (sc->nr.immediate)
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
+ }
+
+ /*
+ * Tag a node/memcg as congested if all the dirty pages
+ * scanned were backed by a congested BDI and
+ * wait_iff_congested will stall.
+ *
+ * Legacy memcg will stall in page writeback so avoid forcibly
+ * stalling in wait_iff_congested().
+ */
+ if ((current_is_kswapd() ||
+ (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
+ sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
+ set_bit(LRUVEC_CONGESTED, &target_lruvec->flags);
+
+ /*
+ * Stall direct reclaim for IO completions if underlying BDIs
+ * and node is congested. Allow kswapd to continue until it
+ * starts encountering unqueued dirty pages or cycling through
+ * the LRU too quickly.
+ */
+ if (!current_is_kswapd() && current_may_throttle() &&
+ !sc->hibernation_mode &&
+ test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
+ wait_iff_congested(BLK_RW_ASYNC, HZ/10);
+
+ if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
+ sc))
+ goto again;
+
+ /*
+ * Kswapd gives up on balancing particular nodes after too
+ * many failures to reclaim anything from them and goes to
+ * sleep. On reclaim progress, reset the failure counter. A
+ * successful direct reclaim run will revive a dormant kswapd.
+ */
+ if (reclaimable)
+ pgdat->kswapd_failures = 0;
+}
+
+/*
+ * Returns true if compaction should go ahead for a costly-order request, or
+ * the allocation would already succeed without compaction. Return false if we
+ * should reclaim first.
+ */
+static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
+{
+ unsigned long watermark;
+ enum compact_result suitable;
+
+ suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx);
+ if (suitable == COMPACT_SUCCESS)
+ /* Allocation should succeed already. Don't reclaim. */
+ return true;
+ if (suitable == COMPACT_SKIPPED)
+ /* Compaction cannot yet proceed. Do reclaim. */
+ return false;
+
+ /*
+ * Compaction is already possible, but it takes time to run and there
+ * are potentially other callers using the pages just freed. So proceed
+ * with reclaim to make a buffer of free pages available to give
+ * compaction a reasonable chance of completing and allocating the page.
+ * Note that we won't actually reclaim the whole buffer in one attempt
+ * as the target watermark in should_continue_reclaim() is lower. But if
+ * we are already above the high+gap watermark, don't reclaim at all.
+ */
+ watermark = high_wmark_pages(zone) + compact_gap(sc->order);
+
+ return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
+}
+
+/*
+ * This is the direct reclaim path, for page-allocating processes. We only
+ * try to reclaim pages from zones which will satisfy the caller's allocation
+ * request.
+ *
+ * If a zone is deemed to be full of pinned pages then just give it a light
+ * scan then give up on it.
+ */
+static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
+{
+ struct zoneref *z;
+ struct zone *zone;
+ unsigned long nr_soft_reclaimed;
+ unsigned long nr_soft_scanned;
+ gfp_t orig_mask;
+ pg_data_t *last_pgdat = NULL;
+
+ /*
+ * If the number of buffer_heads in the machine exceeds the maximum
+ * allowed level, force direct reclaim to scan the highmem zone as
+ * highmem pages could be pinning lowmem pages storing buffer_heads
+ */
+ orig_mask = sc->gfp_mask;
+ if (buffer_heads_over_limit) {
+ sc->gfp_mask |= __GFP_HIGHMEM;
+ sc->reclaim_idx = gfp_zone(sc->gfp_mask);
+ }
+
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ sc->reclaim_idx, sc->nodemask) {
+ /*
+ * Take care memory controller reclaiming has small influence
+ * to global LRU.
+ */
+ if (!cgroup_reclaim(sc)) {
+ if (!cpuset_zone_allowed(zone,
+ GFP_KERNEL | __GFP_HARDWALL))
+ continue;
+
+ /*
+ * If we already have plenty of memory free for
+ * compaction in this zone, don't free any more.
+ * Even though compaction is invoked for any
+ * non-zero order, only frequent costly order
+ * reclamation is disruptive enough to become a
+ * noticeable problem, like transparent huge
+ * page allocations.
+ */
+ if (IS_ENABLED(CONFIG_COMPACTION) &&
+ sc->order > PAGE_ALLOC_COSTLY_ORDER &&
+ compaction_ready(zone, sc)) {
+ sc->compaction_ready = true;
+ continue;
+ }
+
+ /*
+ * Shrink each node in the zonelist once. If the
+ * zonelist is ordered by zone (not the default) then a
+ * node may be shrunk multiple times but in that case
+ * the user prefers lower zones being preserved.
+ */
+ if (zone->zone_pgdat == last_pgdat)
+ continue;
+
+ /*
+ * This steals pages from memory cgroups over softlimit
+ * and returns the number of reclaimed pages and
+ * scanned pages. This works for global memory pressure
+ * and balancing, not for a memcg's limit.
+ */
+ nr_soft_scanned = 0;
+ nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat,
+ sc->order, sc->gfp_mask,
+ &nr_soft_scanned);
+ sc->nr_reclaimed += nr_soft_reclaimed;
+ sc->nr_scanned += nr_soft_scanned;
+ /* need some check for avoid more shrink_zone() */
+ }
+
+ /* See comment about same check for global reclaim above */
+ if (zone->zone_pgdat == last_pgdat)
+ continue;
+ last_pgdat = zone->zone_pgdat;
+ shrink_node(zone->zone_pgdat, sc);
+ }
+
+ /*
+ * Restore to original mask to avoid the impact on the caller if we
+ * promoted it to __GFP_HIGHMEM.
+ */
+ sc->gfp_mask = orig_mask;
+}
+
+static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
+{
+ struct lruvec *target_lruvec;
+ unsigned long refaults;
+
+ target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
+ refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
+ target_lruvec->refaults[0] = refaults;
+ refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE);
+ target_lruvec->refaults[1] = refaults;
+}
+
+/*
+ * This is the main entry point to direct page reclaim.
+ *
+ * If a full scan of the inactive list fails to free enough memory then we
+ * are "out of memory" and something needs to be killed.
+ *
+ * If the caller is !__GFP_FS then the probability of a failure is reasonably
+ * high - the zone may be full of dirty or under-writeback pages, which this
+ * caller can't do much about. We kick the writeback threads and take explicit
+ * naps in the hope that some of these pages can be written. But if the
+ * allocating task holds filesystem locks which prevent writeout this might not
+ * work, and the allocation attempt will fail.
+ *
+ * returns: 0, if no pages reclaimed
+ * else, the number of pages reclaimed
+ */
+static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
+ struct scan_control *sc)
+{
+ int initial_priority = sc->priority;
+ pg_data_t *last_pgdat;
+ struct zoneref *z;
+ struct zone *zone;
+retry:
+ delayacct_freepages_start();
+
+ if (!cgroup_reclaim(sc))
+ __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
+
+ do {
+ vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
+ sc->priority);
+ sc->nr_scanned = 0;
+ shrink_zones(zonelist, sc);
+
+ if (sc->nr_reclaimed >= sc->nr_to_reclaim)
+ break;
+
+ if (sc->compaction_ready)
+ break;
+
+ /*
+ * If we're getting trouble reclaiming, start doing
+ * writepage even in laptop mode.
+ */
+ if (sc->priority < DEF_PRIORITY - 2)
+ sc->may_writepage = 1;
+ } while (--sc->priority >= 0);
+
+ last_pgdat = NULL;
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
+ sc->nodemask) {
+ if (zone->zone_pgdat == last_pgdat)
+ continue;
+ last_pgdat = zone->zone_pgdat;
+
+ snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
+
+ if (cgroup_reclaim(sc)) {
+ struct lruvec *lruvec;
+
+ lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup,
+ zone->zone_pgdat);
+ clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
+ }
+ }
+
+ delayacct_freepages_end();
+
+ if (sc->nr_reclaimed)
+ return sc->nr_reclaimed;
+
+ /* Aborted reclaim to try compaction? don't OOM, then */
+ if (sc->compaction_ready)
+ return 1;
+
+ /*
+ * We make inactive:active ratio decisions based on the node's
+ * composition of memory, but a restrictive reclaim_idx or a
+ * memory.low cgroup setting can exempt large amounts of
+ * memory from reclaim. Neither of which are very common, so
+ * instead of doing costly eligibility calculations of the
+ * entire cgroup subtree up front, we assume the estimates are
+ * good, and retry with forcible deactivation if that fails.
+ */
+ if (sc->skipped_deactivate) {
+ sc->priority = initial_priority;
+ sc->force_deactivate = 1;
+ sc->skipped_deactivate = 0;
+ goto retry;
+ }
+
+ /* Untapped cgroup reserves? Don't OOM, retry. */
+ if (sc->memcg_low_skipped) {
+ sc->priority = initial_priority;
+ sc->force_deactivate = 0;
+ sc->memcg_low_reclaim = 1;
+ sc->memcg_low_skipped = 0;
+ goto retry;
+ }
+
+ return 0;
+}
+
+static bool allow_direct_reclaim(pg_data_t *pgdat)
+{
+ struct zone *zone;
+ unsigned long pfmemalloc_reserve = 0;
+ unsigned long free_pages = 0;
+ int i;
+ bool wmark_ok;
+
+ if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+ return true;
+
+ for (i = 0; i <= ZONE_NORMAL; i++) {
+ zone = &pgdat->node_zones[i];
+ if (!managed_zone(zone))
+ continue;
+
+ if (!zone_reclaimable_pages(zone))
+ continue;
+
+ pfmemalloc_reserve += min_wmark_pages(zone);
+ free_pages += zone_page_state(zone, NR_FREE_PAGES);
+ }
+
+ /* If there are no reserves (unexpected config) then do not throttle */
+ if (!pfmemalloc_reserve)
+ return true;
+
+ wmark_ok = free_pages > pfmemalloc_reserve / 2;
+
+ /* kswapd must be awake if processes are being throttled */
+ if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
+ if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL)
+ WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL);
+
+ wake_up_interruptible(&pgdat->kswapd_wait);
+ }
+
+ return wmark_ok;
+}
+
+/*
+ * Throttle direct reclaimers if backing storage is backed by the network
+ * and the PFMEMALLOC reserve for the preferred node is getting dangerously
+ * depleted. kswapd will continue to make progress and wake the processes
+ * when the low watermark is reached.
+ *
+ * Returns true if a fatal signal was delivered during throttling. If this
+ * happens, the page allocator should not consider triggering the OOM killer.
+ */
+static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
+ nodemask_t *nodemask)
+{
+ struct zoneref *z;
+ struct zone *zone;
+ pg_data_t *pgdat = NULL;
+
+ /*
+ * Kernel threads should not be throttled as they may be indirectly
+ * responsible for cleaning pages necessary for reclaim to make forward
+ * progress. kjournald for example may enter direct reclaim while
+ * committing a transaction where throttling it could forcing other
+ * processes to block on log_wait_commit().
+ */
+ if (current->flags & PF_KTHREAD)
+ goto out;
+
+ /*
+ * If a fatal signal is pending, this process should not throttle.
+ * It should return quickly so it can exit and free its memory
+ */
+ if (fatal_signal_pending(current))
+ goto out;
+
+ /*
+ * Check if the pfmemalloc reserves are ok by finding the first node
+ * with a usable ZONE_NORMAL or lower zone. The expectation is that
+ * GFP_KERNEL will be required for allocating network buffers when
+ * swapping over the network so ZONE_HIGHMEM is unusable.
+ *
+ * Throttling is based on the first usable node and throttled processes
+ * wait on a queue until kswapd makes progress and wakes them. There
+ * is an affinity then between processes waking up and where reclaim
+ * progress has been made assuming the process wakes on the same node.
+ * More importantly, processes running on remote nodes will not compete
+ * for remote pfmemalloc reserves and processes on different nodes
+ * should make reasonable progress.
+ */
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ gfp_zone(gfp_mask), nodemask) {
+ if (zone_idx(zone) > ZONE_NORMAL)
+ continue;
+
+ /* Throttle based on the first usable node */
+ pgdat = zone->zone_pgdat;
+ if (allow_direct_reclaim(pgdat))
+ goto out;
+ break;
+ }
+
+ /* If no zone was usable by the allocation flags then do not throttle */
+ if (!pgdat)
+ goto out;
+
+ /* Account for the throttling */
+ count_vm_event(PGSCAN_DIRECT_THROTTLE);
+
+ /*
+ * If the caller cannot enter the filesystem, it's possible that it
+ * is due to the caller holding an FS lock or performing a journal
+ * transaction in the case of a filesystem like ext[3|4]. In this case,
+ * it is not safe to block on pfmemalloc_wait as kswapd could be
+ * blocked waiting on the same lock. Instead, throttle for up to a
+ * second before continuing.
+ */
+ if (!(gfp_mask & __GFP_FS)) {
+ wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
+ allow_direct_reclaim(pgdat), HZ);
+
+ goto check_pending;
+ }
+
+ /* Throttle until kswapd wakes the process */
+ wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
+ allow_direct_reclaim(pgdat));
+
+check_pending:
+ if (fatal_signal_pending(current))
+ return true;
+
+out:
+ return false;
+}
+
+unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
+ gfp_t gfp_mask, nodemask_t *nodemask)
+{
+ unsigned long nr_reclaimed;
+ struct scan_control sc = {
+ .nr_to_reclaim = SWAP_CLUSTER_MAX,
+ .gfp_mask = current_gfp_context(gfp_mask),
+ .reclaim_idx = gfp_zone(gfp_mask),
+ .order = order,
+ .nodemask = nodemask,
+ .priority = DEF_PRIORITY,
+ .may_writepage = !laptop_mode,
+ .may_unmap = 1,
+ .may_swap = 1,
+ };
+
+ /*
+ * scan_control uses s8 fields for order, priority, and reclaim_idx.
+ * Confirm they are large enough for max values.
+ */
+ BUILD_BUG_ON(MAX_ORDER > S8_MAX);
+ BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
+ BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
+
+ /*
+ * Do not enter reclaim if fatal signal was delivered while throttled.
+ * 1 is returned so that the page allocator does not OOM kill at this
+ * point.
+ */
+ if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
+ return 1;
+
+ set_task_reclaim_state(current, &sc.reclaim_state);
+ trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
+
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+
+ trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
+ set_task_reclaim_state(current, NULL);
+
+ return nr_reclaimed;
+}
+
+#ifdef CONFIG_MEMCG
+
+/* Only used by soft limit reclaim. Do not reuse for anything else. */
+unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
+ gfp_t gfp_mask, bool noswap,
+ pg_data_t *pgdat,
+ unsigned long *nr_scanned)
+{
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ struct scan_control sc = {
+ .nr_to_reclaim = SWAP_CLUSTER_MAX,
+ .target_mem_cgroup = memcg,
+ .may_writepage = !laptop_mode,
+ .may_unmap = 1,
+ .reclaim_idx = MAX_NR_ZONES - 1,
+ .may_swap = !noswap,
+ };
+
+ WARN_ON_ONCE(!current->reclaim_state);
+
+ sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
+ (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
+
+ trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
+ sc.gfp_mask);
+
+ /*
+ * NOTE: Although we can get the priority field, using it
+ * here is not a good idea, since it limits the pages we can scan.
+ * if we don't reclaim here, the shrink_node from balance_pgdat
+ * will pick up pages from other mem cgroup's as well. We hack
+ * the priority and make it zero.
+ */
+ shrink_lruvec(lruvec, &sc);
+
+ trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
+
+ *nr_scanned = sc.nr_scanned;
+
+ return sc.nr_reclaimed;
+}
+
+unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
+ unsigned long nr_pages,
+ gfp_t gfp_mask,
+ bool may_swap)
+{
+ unsigned long nr_reclaimed;
+ unsigned int noreclaim_flag;
+ struct scan_control sc = {
+ .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
+ .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
+ (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
+ .reclaim_idx = MAX_NR_ZONES - 1,
+ .target_mem_cgroup = memcg,
+ .priority = DEF_PRIORITY,
+ .may_writepage = !laptop_mode,
+ .may_unmap = 1,
+ .may_swap = may_swap,
+ };
+ /*
+ * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
+ * equal pressure on all the nodes. This is based on the assumption that
+ * the reclaim does not bail out early.
+ */
+ struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
+
+ set_task_reclaim_state(current, &sc.reclaim_state);
+ trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
+ noreclaim_flag = memalloc_noreclaim_save();
+
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+
+ memalloc_noreclaim_restore(noreclaim_flag);
+ trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
+ set_task_reclaim_state(current, NULL);
+
+ return nr_reclaimed;
+}
+#endif
+
+static void age_active_anon(struct pglist_data *pgdat,
+ struct scan_control *sc)
+{
+ struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
+
+ if (!total_swap_pages)
+ return;
+
+ lruvec = mem_cgroup_lruvec(NULL, pgdat);
+ if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON))
+ return;
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
+ sc, LRU_ACTIVE_ANON);
+ memcg = mem_cgroup_iter(NULL, memcg, NULL);
+ } while (memcg);
+}
+
+static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx)
+{
+ int i;
+ struct zone *zone;
+
+ /*
+ * Check for watermark boosts top-down as the higher zones
+ * are more likely to be boosted. Both watermarks and boosts
+ * should not be checked at the same time as reclaim would
+ * start prematurely when there is no boosting and a lower
+ * zone is balanced.
+ */
+ for (i = highest_zoneidx; i >= 0; i--) {
+ zone = pgdat->node_zones + i;
+ if (!managed_zone(zone))
+ continue;
+
+ if (zone->watermark_boost)
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Returns true if there is an eligible zone balanced for the request order
+ * and highest_zoneidx
+ */
+static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
+{
+ int i;
+ unsigned long mark = -1;
+ struct zone *zone;
+
+ /*
+ * Check watermarks bottom-up as lower zones are more likely to
+ * meet watermarks.
+ */
+ for (i = 0; i <= highest_zoneidx; i++) {
+ zone = pgdat->node_zones + i;
+
+ if (!managed_zone(zone))
+ continue;
+
+ mark = high_wmark_pages(zone);
+ if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
+ return true;
+ }
+
+ /*
+ * If a node has no populated zone within highest_zoneidx, it does not
+ * need balancing by definition. This can happen if a zone-restricted
+ * allocation tries to wake a remote kswapd.
+ */
+ if (mark == -1)
+ return true;
+
+ return false;
+}
+
+/* Clear pgdat state for congested, dirty or under writeback. */
+static void clear_pgdat_congested(pg_data_t *pgdat)
+{
+ struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
+
+ clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
+ clear_bit(PGDAT_DIRTY, &pgdat->flags);
+ clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
+}
+
+/*
+ * Prepare kswapd for sleeping. This verifies that there are no processes
+ * waiting in throttle_direct_reclaim() and that watermarks have been met.
+ *
+ * Returns true if kswapd is ready to sleep
+ */
+static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order,
+ int highest_zoneidx)
+{
+ /*
+ * The throttled processes are normally woken up in balance_pgdat() as
+ * soon as allow_direct_reclaim() is true. But there is a potential
+ * race between when kswapd checks the watermarks and a process gets
+ * throttled. There is also a potential race if processes get
+ * throttled, kswapd wakes, a large process exits thereby balancing the
+ * zones, which causes kswapd to exit balance_pgdat() before reaching
+ * the wake up checks. If kswapd is going to sleep, no process should
+ * be sleeping on pfmemalloc_wait, so wake them now if necessary. If
+ * the wake up is premature, processes will wake kswapd and get
+ * throttled again. The difference from wake ups in balance_pgdat() is
+ * that here we are under prepare_to_wait().
+ */
+ if (waitqueue_active(&pgdat->pfmemalloc_wait))
+ wake_up_all(&pgdat->pfmemalloc_wait);
+
+ /* Hopeless node, leave it to direct reclaim */
+ if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+ return true;
+
+ if (pgdat_balanced(pgdat, order, highest_zoneidx)) {
+ clear_pgdat_congested(pgdat);
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * kswapd shrinks a node of pages that are at or below the highest usable
+ * zone that is currently unbalanced.
+ *
+ * Returns true if kswapd scanned at least the requested number of pages to
+ * reclaim or if the lack of progress was due to pages under writeback.
+ * This is used to determine if the scanning priority needs to be raised.
+ */
+static bool kswapd_shrink_node(pg_data_t *pgdat,
+ struct scan_control *sc)
+{
+ struct zone *zone;
+ int z;
+
+ /* Reclaim a number of pages proportional to the number of zones */
+ sc->nr_to_reclaim = 0;
+ for (z = 0; z <= sc->reclaim_idx; z++) {
+ zone = pgdat->node_zones + z;
+ if (!managed_zone(zone))
+ continue;
+
+ sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
+ }
+
+ /*
+ * Historically care was taken to put equal pressure on all zones but
+ * now pressure is applied based on node LRU order.
+ */
+ shrink_node(pgdat, sc);
+
+ /*
+ * Fragmentation may mean that the system cannot be rebalanced for
+ * high-order allocations. If twice the allocation size has been
+ * reclaimed then recheck watermarks only at order-0 to prevent
+ * excessive reclaim. Assume that a process requested a high-order
+ * can direct reclaim/compact.
+ */
+ if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
+ sc->order = 0;
+
+ return sc->nr_scanned >= sc->nr_to_reclaim;
+}
+
+/*
+ * For kswapd, balance_pgdat() will reclaim pages across a node from zones
+ * that are eligible for use by the caller until at least one zone is
+ * balanced.
+ *
+ * Returns the order kswapd finished reclaiming at.
+ *
+ * kswapd scans the zones in the highmem->normal->dma direction. It skips
+ * zones which have free_pages > high_wmark_pages(zone), but once a zone is
+ * found to have free_pages <= high_wmark_pages(zone), any page in that zone
+ * or lower is eligible for reclaim until at least one usable zone is
+ * balanced.
+ */
+static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
+{
+ int i;
+ unsigned long nr_soft_reclaimed;
+ unsigned long nr_soft_scanned;
+ unsigned long pflags;
+ unsigned long nr_boost_reclaim;
+ unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
+ bool boosted;
+ struct zone *zone;
+ struct scan_control sc = {
+ .gfp_mask = GFP_KERNEL,
+ .order = order,
+ .may_unmap = 1,
+ };
+
+ set_task_reclaim_state(current, &sc.reclaim_state);
+ psi_memstall_enter(&pflags);
+ __fs_reclaim_acquire();
+
+ count_vm_event(PAGEOUTRUN);
+
+ /*
+ * Account for the reclaim boost. Note that the zone boost is left in
+ * place so that parallel allocations that are near the watermark will
+ * stall or direct reclaim until kswapd is finished.
+ */
+ nr_boost_reclaim = 0;
+ for (i = 0; i <= highest_zoneidx; i++) {
+ zone = pgdat->node_zones + i;
+ if (!managed_zone(zone))
+ continue;
+
+ nr_boost_reclaim += zone->watermark_boost;
+ zone_boosts[i] = zone->watermark_boost;
+ }
+ boosted = nr_boost_reclaim;
+
+restart:
+ sc.priority = DEF_PRIORITY;
+ do {
+ unsigned long nr_reclaimed = sc.nr_reclaimed;
+ bool raise_priority = true;
+ bool balanced;
+ bool ret;
+
+ sc.reclaim_idx = highest_zoneidx;
+
+ /*
+ * If the number of buffer_heads exceeds the maximum allowed
+ * then consider reclaiming from all zones. This has a dual
+ * purpose -- on 64-bit systems it is expected that
+ * buffer_heads are stripped during active rotation. On 32-bit
+ * systems, highmem pages can pin lowmem memory and shrinking
+ * buffers can relieve lowmem pressure. Reclaim may still not
+ * go ahead if all eligible zones for the original allocation
+ * request are balanced to avoid excessive reclaim from kswapd.
+ */
+ if (buffer_heads_over_limit) {
+ for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
+ zone = pgdat->node_zones + i;
+ if (!managed_zone(zone))
+ continue;
+
+ sc.reclaim_idx = i;
+ break;
+ }
+ }
+
+ /*
+ * If the pgdat is imbalanced then ignore boosting and preserve
+ * the watermarks for a later time and restart. Note that the
+ * zone watermarks will be still reset at the end of balancing
+ * on the grounds that the normal reclaim should be enough to
+ * re-evaluate if boosting is required when kswapd next wakes.
+ */
+ balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx);
+ if (!balanced && nr_boost_reclaim) {
+ nr_boost_reclaim = 0;
+ goto restart;
+ }
+
+ /*
+ * If boosting is not active then only reclaim if there are no
+ * eligible zones. Note that sc.reclaim_idx is not used as
+ * buffer_heads_over_limit may have adjusted it.
+ */
+ if (!nr_boost_reclaim && balanced)
+ goto out;
+
+ /* Limit the priority of boosting to avoid reclaim writeback */
+ if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
+ raise_priority = false;
+
+ /*
+ * Do not writeback or swap pages for boosted reclaim. The
+ * intent is to relieve pressure not issue sub-optimal IO
+ * from reclaim context. If no pages are reclaimed, the
+ * reclaim will be aborted.
+ */
+ sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
+ sc.may_swap = !nr_boost_reclaim;
+
+ /*
+ * Do some background aging of the anon list, to give
+ * pages a chance to be referenced before reclaiming. All
+ * pages are rotated regardless of classzone as this is
+ * about consistent aging.
+ */
+ age_active_anon(pgdat, &sc);
+
+ /*
+ * If we're getting trouble reclaiming, start doing writepage
+ * even in laptop mode.
+ */
+ if (sc.priority < DEF_PRIORITY - 2)
+ sc.may_writepage = 1;
+
+ /* Call soft limit reclaim before calling shrink_node. */
+ sc.nr_scanned = 0;
+ nr_soft_scanned = 0;
+ nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
+ sc.gfp_mask, &nr_soft_scanned);
+ sc.nr_reclaimed += nr_soft_reclaimed;
+
+ /*
+ * There should be no need to raise the scanning priority if
+ * enough pages are already being scanned that that high
+ * watermark would be met at 100% efficiency.
+ */
+ if (kswapd_shrink_node(pgdat, &sc))
+ raise_priority = false;
+
+ /*
+ * If the low watermark is met there is no need for processes
+ * to be throttled on pfmemalloc_wait as they should not be
+ * able to safely make forward progress. Wake them
+ */
+ if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
+ allow_direct_reclaim(pgdat))
+ wake_up_all(&pgdat->pfmemalloc_wait);
+
+ /* Check if kswapd should be suspending */
+ __fs_reclaim_release();
+ ret = try_to_freeze();
+ __fs_reclaim_acquire();
+ if (ret || kthread_should_stop())
+ break;
+
+ /*
+ * Raise priority if scanning rate is too low or there was no
+ * progress in reclaiming pages
+ */
+ nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
+ nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);
+
+ /*
+ * If reclaim made no progress for a boost, stop reclaim as
+ * IO cannot be queued and it could be an infinite loop in
+ * extreme circumstances.
+ */
+ if (nr_boost_reclaim && !nr_reclaimed)
+ break;
+
+ if (raise_priority || !nr_reclaimed)
+ sc.priority--;
+ } while (sc.priority >= 1);
+
+ if (!sc.nr_reclaimed)
+ pgdat->kswapd_failures++;
+
+out:
+ /* If reclaim was boosted, account for the reclaim done in this pass */
+ if (boosted) {
+ unsigned long flags;
+
+ for (i = 0; i <= highest_zoneidx; i++) {
+ if (!zone_boosts[i])
+ continue;
+
+ /* Increments are under the zone lock */
+ zone = pgdat->node_zones + i;
+ spin_lock_irqsave(&zone->lock, flags);
+ zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
+ spin_unlock_irqrestore(&zone->lock, flags);
+ }
+
+ /*
+ * As there is now likely space, wakeup kcompact to defragment
+ * pageblocks.
+ */
+ wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx);
+ }
+
+ snapshot_refaults(NULL, pgdat);
+ __fs_reclaim_release();
+ psi_memstall_leave(&pflags);
+ set_task_reclaim_state(current, NULL);
+
+ /*
+ * Return the order kswapd stopped reclaiming at as
+ * prepare_kswapd_sleep() takes it into account. If another caller
+ * entered the allocator slow path while kswapd was awake, order will
+ * remain at the higher level.
+ */
+ return sc.order;
+}
+
+/*
+ * The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to
+ * be reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is
+ * not a valid index then either kswapd runs for first time or kswapd couldn't
+ * sleep after previous reclaim attempt (node is still unbalanced). In that
+ * case return the zone index of the previous kswapd reclaim cycle.
+ */
+static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat,
+ enum zone_type prev_highest_zoneidx)
+{
+ enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
+
+ return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx;
+}
+
+static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
+ unsigned int highest_zoneidx)
+{
+ long remaining = 0;
+ DEFINE_WAIT(wait);
+
+ if (freezing(current) || kthread_should_stop())
+ return;
+
+ prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
+
+ /*
+ * Try to sleep for a short interval. Note that kcompactd will only be
+ * woken if it is possible to sleep for a short interval. This is
+ * deliberate on the assumption that if reclaim cannot keep an
+ * eligible zone balanced that it's also unlikely that compaction will
+ * succeed.
+ */
+ if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
+ /*
+ * Compaction records what page blocks it recently failed to
+ * isolate pages from and skips them in the future scanning.
+ * When kswapd is going to sleep, it is reasonable to assume
+ * that pages and compaction may succeed so reset the cache.
+ */
+ reset_isolation_suitable(pgdat);
+
+ /*
+ * We have freed the memory, now we should compact it to make
+ * allocation of the requested order possible.
+ */
+ wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx);
+
+ remaining = schedule_timeout(HZ/10);
+
+ /*
+ * If woken prematurely then reset kswapd_highest_zoneidx and
+ * order. The values will either be from a wakeup request or
+ * the previous request that slept prematurely.
+ */
+ if (remaining) {
+ WRITE_ONCE(pgdat->kswapd_highest_zoneidx,
+ kswapd_highest_zoneidx(pgdat,
+ highest_zoneidx));
+
+ if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
+ WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
+ }
+
+ finish_wait(&pgdat->kswapd_wait, &wait);
+ prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
+ }
+
+ /*
+ * After a short sleep, check if it was a premature sleep. If not, then
+ * go fully to sleep until explicitly woken up.
+ */
+ if (!remaining &&
+ prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
+ trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
+
+ /*
+ * vmstat counters are not perfectly accurate and the estimated
+ * value for counters such as NR_FREE_PAGES can deviate from the
+ * true value by nr_online_cpus * threshold. To avoid the zone
+ * watermarks being breached while under pressure, we reduce the
+ * per-cpu vmstat threshold while kswapd is awake and restore
+ * them before going back to sleep.
+ */
+ set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
+
+ if (!kthread_should_stop())
+ schedule();
+
+ set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
+ } else {
+ if (remaining)
+ count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
+ else
+ count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
+ }
+ finish_wait(&pgdat->kswapd_wait, &wait);
+}
+
+/*
+ * The background pageout daemon, started as a kernel thread
+ * from the init process.
+ *
+ * This basically trickles out pages so that we have _some_
+ * free memory available even if there is no other activity
+ * that frees anything up. This is needed for things like routing
+ * etc, where we otherwise might have all activity going on in
+ * asynchronous contexts that cannot page things out.
+ *
+ * If there are applications that are active memory-allocators
+ * (most normal use), this basically shouldn't matter.
+ */
+static int kswapd(void *p)
+{
+ unsigned int alloc_order, reclaim_order;
+ unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
+ pg_data_t *pgdat = (pg_data_t*)p;
+ struct task_struct *tsk = current;
+ const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+
+ if (!cpumask_empty(cpumask))
+ set_cpus_allowed_ptr(tsk, cpumask);
+
+ /*
+ * Tell the memory management that we're a "memory allocator",
+ * and that if we need more memory we should get access to it
+ * regardless (see "__alloc_pages()"). "kswapd" should
+ * never get caught in the normal page freeing logic.
+ *
+ * (Kswapd normally doesn't need memory anyway, but sometimes
+ * you need a small amount of memory in order to be able to
+ * page out something else, and this flag essentially protects
+ * us from recursively trying to free more memory as we're
+ * trying to free the first piece of memory in the first place).
+ */
+ tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
+ set_freezable();
+
+ WRITE_ONCE(pgdat->kswapd_order, 0);
+ WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
+ for ( ; ; ) {
+ bool ret;
+
+ alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
+ highest_zoneidx = kswapd_highest_zoneidx(pgdat,
+ highest_zoneidx);
+
+kswapd_try_sleep:
+ kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
+ highest_zoneidx);
+
+ /* Read the new order and highest_zoneidx */
+ alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
+ highest_zoneidx = kswapd_highest_zoneidx(pgdat,
+ highest_zoneidx);
+ WRITE_ONCE(pgdat->kswapd_order, 0);
+ WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
+
+ ret = try_to_freeze();
+ if (kthread_should_stop())
+ break;
+
+ /*
+ * We can speed up thawing tasks if we don't call balance_pgdat
+ * after returning from the refrigerator
+ */
+ if (ret)
+ continue;
+
+ /*
+ * Reclaim begins at the requested order but if a high-order
+ * reclaim fails then kswapd falls back to reclaiming for
+ * order-0. If that happens, kswapd will consider sleeping
+ * for the order it finished reclaiming at (reclaim_order)
+ * but kcompactd is woken to compact for the original
+ * request (alloc_order).
+ */
+ trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
+ alloc_order);
+ reclaim_order = balance_pgdat(pgdat, alloc_order,
+ highest_zoneidx);
+ if (reclaim_order < alloc_order)
+ goto kswapd_try_sleep;
+ }
+
+ tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
+
+ return 0;
+}
+
+/*
+ * A zone is low on free memory or too fragmented for high-order memory. If
+ * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's
+ * pgdat. It will wake up kcompactd after reclaiming memory. If kswapd reclaim
+ * has failed or is not needed, still wake up kcompactd if only compaction is
+ * needed.
+ */
+void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
+ enum zone_type highest_zoneidx)
+{
+ pg_data_t *pgdat;
+ enum zone_type curr_idx;
+
+ if (!managed_zone(zone))
+ return;
+
+ if (!cpuset_zone_allowed(zone, gfp_flags))
+ return;
+
+ pgdat = zone->zone_pgdat;
+ curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
+
+ if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx)
+ WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx);
+
+ if (READ_ONCE(pgdat->kswapd_order) < order)
+ WRITE_ONCE(pgdat->kswapd_order, order);
+
+ if (!waitqueue_active(&pgdat->kswapd_wait))
+ return;
+
+ /* Hopeless node, leave it to direct reclaim if possible */
+ if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
+ (pgdat_balanced(pgdat, order, highest_zoneidx) &&
+ !pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
+ /*
+ * There may be plenty of free memory available, but it's too
+ * fragmented for high-order allocations. Wake up kcompactd
+ * and rely on compaction_suitable() to determine if it's
+ * needed. If it fails, it will defer subsequent attempts to
+ * ratelimit its work.
+ */
+ if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
+ wakeup_kcompactd(pgdat, order, highest_zoneidx);
+ return;
+ }
+
+ trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order,
+ gfp_flags);
+ wake_up_interruptible(&pgdat->kswapd_wait);
+}
+
+#ifdef CONFIG_HIBERNATION
+/*
+ * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
+ * freed pages.
+ *
+ * Rather than trying to age LRUs the aim is to preserve the overall
+ * LRU order by reclaiming preferentially
+ * inactive > active > active referenced > active mapped
+ */
+unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
+{
+ struct scan_control sc = {
+ .nr_to_reclaim = nr_to_reclaim,
+ .gfp_mask = GFP_HIGHUSER_MOVABLE,
+ .reclaim_idx = MAX_NR_ZONES - 1,
+ .priority = DEF_PRIORITY,
+ .may_writepage = 1,
+ .may_unmap = 1,
+ .may_swap = 1,
+ .hibernation_mode = 1,
+ };
+ struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
+ unsigned long nr_reclaimed;
+ unsigned int noreclaim_flag;
+
+ fs_reclaim_acquire(sc.gfp_mask);
+ noreclaim_flag = memalloc_noreclaim_save();
+ set_task_reclaim_state(current, &sc.reclaim_state);
+
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+
+ set_task_reclaim_state(current, NULL);
+ memalloc_noreclaim_restore(noreclaim_flag);
+ fs_reclaim_release(sc.gfp_mask);
+
+ return nr_reclaimed;
+}
+#endif /* CONFIG_HIBERNATION */
+
+/*
+ * This kswapd start function will be called by init and node-hot-add.
+ * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
+ */
+int kswapd_run(int nid)
+{
+ pg_data_t *pgdat = NODE_DATA(nid);
+ int ret = 0;
+
+ if (pgdat->kswapd)
+ return 0;
+
+ pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
+ if (IS_ERR(pgdat->kswapd)) {
+ /* failure at boot is fatal */
+ BUG_ON(system_state < SYSTEM_RUNNING);
+ pr_err("Failed to start kswapd on node %d\n", nid);
+ ret = PTR_ERR(pgdat->kswapd);
+ pgdat->kswapd = NULL;
+ }
+ return ret;
+}
+
+/*
+ * Called by memory hotplug when all memory in a node is offlined. Caller must
+ * hold mem_hotplug_begin/end().
+ */
+void kswapd_stop(int nid)
+{
+ struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
+
+ if (kswapd) {
+ kthread_stop(kswapd);
+ NODE_DATA(nid)->kswapd = NULL;
+ }
+}
+
+static int __init kswapd_init(void)
+{
+ int nid;
+
+ swap_setup();
+ for_each_node_state(nid, N_MEMORY)
+ kswapd_run(nid);
+ return 0;
+}
+
+module_init(kswapd_init)
+
+#ifdef CONFIG_NUMA
+/*
+ * Node reclaim mode
+ *
+ * If non-zero call node_reclaim when the number of free pages falls below
+ * the watermarks.
+ */
+int node_reclaim_mode __read_mostly;
+
+/*
+ * These bit locations are exposed in the vm.zone_reclaim_mode sysctl
+ * ABI. New bits are OK, but existing bits can never change.
+ */
+#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
+#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
+#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */
+
+/*
+ * Priority for NODE_RECLAIM. This determines the fraction of pages
+ * of a node considered for each zone_reclaim. 4 scans 1/16th of
+ * a zone.
+ */
+#define NODE_RECLAIM_PRIORITY 4
+
+/*
+ * Percentage of pages in a zone that must be unmapped for node_reclaim to
+ * occur.
+ */
+int sysctl_min_unmapped_ratio = 1;
+
+/*
+ * If the number of slab pages in a zone grows beyond this percentage then
+ * slab reclaim needs to occur.
+ */
+int sysctl_min_slab_ratio = 5;
+
+static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
+{
+ unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED);
+ unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) +
+ node_page_state(pgdat, NR_ACTIVE_FILE);
+
+ /*
+ * It's possible for there to be more file mapped pages than
+ * accounted for by the pages on the file LRU lists because
+ * tmpfs pages accounted for as ANON can also be FILE_MAPPED
+ */
+ return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
+}
+
+/* Work out how many page cache pages we can reclaim in this reclaim_mode */
+static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
+{
+ unsigned long nr_pagecache_reclaimable;
+ unsigned long delta = 0;
+
+ /*
+ * If RECLAIM_UNMAP is set, then all file pages are considered
+ * potentially reclaimable. Otherwise, we have to worry about
+ * pages like swapcache and node_unmapped_file_pages() provides
+ * a better estimate
+ */
+ if (node_reclaim_mode & RECLAIM_UNMAP)
+ nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
+ else
+ nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);
+
+ /* If we can't clean pages, remove dirty pages from consideration */
+ if (!(node_reclaim_mode & RECLAIM_WRITE))
+ delta += node_page_state(pgdat, NR_FILE_DIRTY);
+
+ /* Watch for any possible underflows due to delta */
+ if (unlikely(delta > nr_pagecache_reclaimable))
+ delta = nr_pagecache_reclaimable;
+
+ return nr_pagecache_reclaimable - delta;
+}
+
+/*
+ * Try to free up some pages from this node through reclaim.
+ */
+static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
+{
+ /* Minimum pages needed in order to stay on node */
+ const unsigned long nr_pages = 1 << order;
+ struct task_struct *p = current;
+ unsigned int noreclaim_flag;
+ struct scan_control sc = {
+ .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
+ .gfp_mask = current_gfp_context(gfp_mask),
+ .order = order,
+ .priority = NODE_RECLAIM_PRIORITY,
+ .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
+ .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
+ .may_swap = 1,
+ .reclaim_idx = gfp_zone(gfp_mask),
+ };
+
+ trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
+ sc.gfp_mask);
+
+ cond_resched();
+ fs_reclaim_acquire(sc.gfp_mask);
+ /*
+ * We need to be able to allocate from the reserves for RECLAIM_UNMAP
+ * and we also need to be able to write out pages for RECLAIM_WRITE
+ * and RECLAIM_UNMAP.
+ */
+ noreclaim_flag = memalloc_noreclaim_save();
+ p->flags |= PF_SWAPWRITE;
+ set_task_reclaim_state(p, &sc.reclaim_state);
+
+ if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
+ /*
+ * Free memory by calling shrink node with increasing
+ * priorities until we have enough memory freed.
+ */
+ do {
+ shrink_node(pgdat, &sc);
+ } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
+ }
+
+ set_task_reclaim_state(p, NULL);
+ current->flags &= ~PF_SWAPWRITE;
+ memalloc_noreclaim_restore(noreclaim_flag);
+ fs_reclaim_release(sc.gfp_mask);
+
+ trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);
+
+ return sc.nr_reclaimed >= nr_pages;
+}
+
+int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
+{
+ int ret;
+
+ /*
+ * Node reclaim reclaims unmapped file backed pages and
+ * slab pages if we are over the defined limits.
+ *
+ * A small portion of unmapped file backed pages is needed for
+ * file I/O otherwise pages read by file I/O will be immediately
+ * thrown out if the node is overallocated. So we do not reclaim
+ * if less than a specified percentage of the node is used by
+ * unmapped file backed pages.
+ */
+ if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
+ node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) <=
+ pgdat->min_slab_pages)
+ return NODE_RECLAIM_FULL;
+
+ /*
+ * Do not scan if the allocation should not be delayed.
+ */
+ if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
+ return NODE_RECLAIM_NOSCAN;
+
+ /*
+ * Only run node reclaim on the local node or on nodes that do not
+ * have associated processors. This will favor the local processor
+ * over remote processors and spread off node memory allocations
+ * as wide as possible.
+ */
+ if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
+ return NODE_RECLAIM_NOSCAN;
+
+ if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
+ return NODE_RECLAIM_NOSCAN;
+
+ ret = __node_reclaim(pgdat, gfp_mask, order);
+ clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
+
+ if (!ret)
+ count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
+
+ return ret;
+}
+#endif
+
+/**
+ * check_move_unevictable_pages - check pages for evictability and move to
+ * appropriate zone lru list
+ * @pvec: pagevec with lru pages to check
+ *
+ * Checks pages for evictability, if an evictable page is in the unevictable
+ * lru list, moves it to the appropriate evictable lru list. This function
+ * should be only used for lru pages.
+ */
+void check_move_unevictable_pages(struct pagevec *pvec)
+{
+ struct lruvec *lruvec;
+ struct pglist_data *pgdat = NULL;
+ int pgscanned = 0;
+ int pgrescued = 0;
+ int i;
+
+ for (i = 0; i < pvec->nr; i++) {
+ struct page *page = pvec->pages[i];
+ struct pglist_data *pagepgdat = page_pgdat(page);
+ int nr_pages;
+
+ if (PageTransTail(page))
+ continue;
+
+ nr_pages = thp_nr_pages(page);
+ pgscanned += nr_pages;
+
+ if (pagepgdat != pgdat) {
+ if (pgdat)
+ spin_unlock_irq(&pgdat->lru_lock);
+ pgdat = pagepgdat;
+ spin_lock_irq(&pgdat->lru_lock);
+ }
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
+
+ if (!PageLRU(page) || !PageUnevictable(page))
+ continue;
+
+ if (page_evictable(page)) {
+ enum lru_list lru = page_lru_base_type(page);
+
+ VM_BUG_ON_PAGE(PageActive(page), page);
+ ClearPageUnevictable(page);
+ del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
+ add_page_to_lru_list(page, lruvec, lru);
+ pgrescued += nr_pages;
+ }
+ }
+
+ if (pgdat) {
+ __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
+ __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
+ spin_unlock_irq(&pgdat->lru_lock);
+ }
+}
+EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
diff --git a/mm/vmstat.c b/mm/vmstat.c
new file mode 100644
index 000000000..e292e63af
--- /dev/null
+++ b/mm/vmstat.c
@@ -0,0 +1,2182 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/mm/vmstat.c
+ *
+ * Manages VM statistics
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ *
+ * zoned VM statistics
+ * Copyright (C) 2006 Silicon Graphics, Inc.,
+ * Christoph Lameter <christoph@lameter.com>
+ * Copyright (C) 2008-2014 Christoph Lameter
+ */
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/vmstat.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/sched.h>
+#include <linux/math64.h>
+#include <linux/writeback.h>
+#include <linux/compaction.h>
+#include <linux/mm_inline.h>
+#include <linux/page_ext.h>
+#include <linux/page_owner.h>
+
+#include "internal.h"
+
+#define NUMA_STATS_THRESHOLD (U16_MAX - 2)
+
+#ifdef CONFIG_NUMA
+int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;
+
+/* zero numa counters within a zone */
+static void zero_zone_numa_counters(struct zone *zone)
+{
+ int item, cpu;
+
+ for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) {
+ atomic_long_set(&zone->vm_numa_stat[item], 0);
+ for_each_online_cpu(cpu)
+ per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item]
+ = 0;
+ }
+}
+
+/* zero numa counters of all the populated zones */
+static void zero_zones_numa_counters(void)
+{
+ struct zone *zone;
+
+ for_each_populated_zone(zone)
+ zero_zone_numa_counters(zone);
+}
+
+/* zero global numa counters */
+static void zero_global_numa_counters(void)
+{
+ int item;
+
+ for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++)
+ atomic_long_set(&vm_numa_stat[item], 0);
+}
+
+static void invalid_numa_statistics(void)
+{
+ zero_zones_numa_counters();
+ zero_global_numa_counters();
+}
+
+static DEFINE_MUTEX(vm_numa_stat_lock);
+
+int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *length, loff_t *ppos)
+{
+ int ret, oldval;
+
+ mutex_lock(&vm_numa_stat_lock);
+ if (write)
+ oldval = sysctl_vm_numa_stat;
+ ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
+ if (ret || !write)
+ goto out;
+
+ if (oldval == sysctl_vm_numa_stat)
+ goto out;
+ else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) {
+ static_branch_enable(&vm_numa_stat_key);
+ pr_info("enable numa statistics\n");
+ } else {
+ static_branch_disable(&vm_numa_stat_key);
+ invalid_numa_statistics();
+ pr_info("disable numa statistics, and clear numa counters\n");
+ }
+
+out:
+ mutex_unlock(&vm_numa_stat_lock);
+ return ret;
+}
+#endif
+
+#ifdef CONFIG_VM_EVENT_COUNTERS
+DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
+EXPORT_PER_CPU_SYMBOL(vm_event_states);
+
+static void sum_vm_events(unsigned long *ret)
+{
+ int cpu;
+ int i;
+
+ memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
+
+ for_each_online_cpu(cpu) {
+ struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
+
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+ ret[i] += this->event[i];
+ }
+}
+
+/*
+ * Accumulate the vm event counters across all CPUs.
+ * The result is unavoidably approximate - it can change
+ * during and after execution of this function.
+*/
+void all_vm_events(unsigned long *ret)
+{
+ get_online_cpus();
+ sum_vm_events(ret);
+ put_online_cpus();
+}
+EXPORT_SYMBOL_GPL(all_vm_events);
+
+/*
+ * Fold the foreign cpu events into our own.
+ *
+ * This is adding to the events on one processor
+ * but keeps the global counts constant.
+ */
+void vm_events_fold_cpu(int cpu)
+{
+ struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
+ int i;
+
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
+ count_vm_events(i, fold_state->event[i]);
+ fold_state->event[i] = 0;
+ }
+}
+
+#endif /* CONFIG_VM_EVENT_COUNTERS */
+
+/*
+ * Manage combined zone based / global counters
+ *
+ * vm_stat contains the global counters
+ */
+atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
+atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS] __cacheline_aligned_in_smp;
+atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
+EXPORT_SYMBOL(vm_zone_stat);
+EXPORT_SYMBOL(vm_numa_stat);
+EXPORT_SYMBOL(vm_node_stat);
+
+#ifdef CONFIG_SMP
+
+int calculate_pressure_threshold(struct zone *zone)
+{
+ int threshold;
+ int watermark_distance;
+
+ /*
+ * As vmstats are not up to date, there is drift between the estimated
+ * and real values. For high thresholds and a high number of CPUs, it
+ * is possible for the min watermark to be breached while the estimated
+ * value looks fine. The pressure threshold is a reduced value such
+ * that even the maximum amount of drift will not accidentally breach
+ * the min watermark
+ */
+ watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
+ threshold = max(1, (int)(watermark_distance / num_online_cpus()));
+
+ /*
+ * Maximum threshold is 125
+ */
+ threshold = min(125, threshold);
+
+ return threshold;
+}
+
+int calculate_normal_threshold(struct zone *zone)
+{
+ int threshold;
+ int mem; /* memory in 128 MB units */
+
+ /*
+ * The threshold scales with the number of processors and the amount
+ * of memory per zone. More memory means that we can defer updates for
+ * longer, more processors could lead to more contention.
+ * fls() is used to have a cheap way of logarithmic scaling.
+ *
+ * Some sample thresholds:
+ *
+ * Threshold Processors (fls) Zonesize fls(mem+1)
+ * ------------------------------------------------------------------
+ * 8 1 1 0.9-1 GB 4
+ * 16 2 2 0.9-1 GB 4
+ * 20 2 2 1-2 GB 5
+ * 24 2 2 2-4 GB 6
+ * 28 2 2 4-8 GB 7
+ * 32 2 2 8-16 GB 8
+ * 4 2 2 <128M 1
+ * 30 4 3 2-4 GB 5
+ * 48 4 3 8-16 GB 8
+ * 32 8 4 1-2 GB 4
+ * 32 8 4 0.9-1GB 4
+ * 10 16 5 <128M 1
+ * 40 16 5 900M 4
+ * 70 64 7 2-4 GB 5
+ * 84 64 7 4-8 GB 6
+ * 108 512 9 4-8 GB 6
+ * 125 1024 10 8-16 GB 8
+ * 125 1024 10 16-32 GB 9
+ */
+
+ mem = zone_managed_pages(zone) >> (27 - PAGE_SHIFT);
+
+ threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
+
+ /*
+ * Maximum threshold is 125
+ */
+ threshold = min(125, threshold);
+
+ return threshold;
+}
+
+/*
+ * Refresh the thresholds for each zone.
+ */
+void refresh_zone_stat_thresholds(void)
+{
+ struct pglist_data *pgdat;
+ struct zone *zone;
+ int cpu;
+ int threshold;
+
+ /* Zero current pgdat thresholds */
+ for_each_online_pgdat(pgdat) {
+ for_each_online_cpu(cpu) {
+ per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = 0;
+ }
+ }
+
+ for_each_populated_zone(zone) {
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ unsigned long max_drift, tolerate_drift;
+
+ threshold = calculate_normal_threshold(zone);
+
+ for_each_online_cpu(cpu) {
+ int pgdat_threshold;
+
+ per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+ = threshold;
+
+ /* Base nodestat threshold on the largest populated zone. */
+ pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
+ per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
+ = max(threshold, pgdat_threshold);
+ }
+
+ /*
+ * Only set percpu_drift_mark if there is a danger that
+ * NR_FREE_PAGES reports the low watermark is ok when in fact
+ * the min watermark could be breached by an allocation
+ */
+ tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
+ max_drift = num_online_cpus() * threshold;
+ if (max_drift > tolerate_drift)
+ zone->percpu_drift_mark = high_wmark_pages(zone) +
+ max_drift;
+ }
+}
+
+void set_pgdat_percpu_threshold(pg_data_t *pgdat,
+ int (*calculate_pressure)(struct zone *))
+{
+ struct zone *zone;
+ int cpu;
+ int threshold;
+ int i;
+
+ for (i = 0; i < pgdat->nr_zones; i++) {
+ zone = &pgdat->node_zones[i];
+ if (!zone->percpu_drift_mark)
+ continue;
+
+ threshold = (*calculate_pressure)(zone);
+ for_each_online_cpu(cpu)
+ per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+ = threshold;
+ }
+}
+
+/*
+ * For use when we know that interrupts are disabled,
+ * or when we know that preemption is disabled and that
+ * particular counter cannot be updated from interrupt context.
+ */
+void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+ long delta)
+{
+ struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ s8 __percpu *p = pcp->vm_stat_diff + item;
+ long x;
+ long t;
+
+ x = delta + __this_cpu_read(*p);
+
+ t = __this_cpu_read(pcp->stat_threshold);
+
+ if (unlikely(abs(x) > t)) {
+ zone_page_state_add(x, zone, item);
+ x = 0;
+ }
+ __this_cpu_write(*p, x);
+}
+EXPORT_SYMBOL(__mod_zone_page_state);
+
+void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
+ long delta)
+{
+ struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
+ s8 __percpu *p = pcp->vm_node_stat_diff + item;
+ long x;
+ long t;
+
+ if (vmstat_item_in_bytes(item)) {
+ VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
+ delta >>= PAGE_SHIFT;
+ }
+
+ x = delta + __this_cpu_read(*p);
+
+ t = __this_cpu_read(pcp->stat_threshold);
+
+ if (unlikely(abs(x) > t)) {
+ node_page_state_add(x, pgdat, item);
+ x = 0;
+ }
+ __this_cpu_write(*p, x);
+}
+EXPORT_SYMBOL(__mod_node_page_state);
+
+/*
+ * Optimized increment and decrement functions.
+ *
+ * These are only for a single page and therefore can take a struct page *
+ * argument instead of struct zone *. This allows the inclusion of the code
+ * generated for page_zone(page) into the optimized functions.
+ *
+ * No overflow check is necessary and therefore the differential can be
+ * incremented or decremented in place which may allow the compilers to
+ * generate better code.
+ * The increment or decrement is known and therefore one boundary check can
+ * be omitted.
+ *
+ * NOTE: These functions are very performance sensitive. Change only
+ * with care.
+ *
+ * Some processors have inc/dec instructions that are atomic vs an interrupt.
+ * However, the code must first determine the differential location in a zone
+ * based on the processor number and then inc/dec the counter. There is no
+ * guarantee without disabling preemption that the processor will not change
+ * in between and therefore the atomicity vs. interrupt cannot be exploited
+ * in a useful way here.
+ */
+void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
+{
+ struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ s8 __percpu *p = pcp->vm_stat_diff + item;
+ s8 v, t;
+
+ v = __this_cpu_inc_return(*p);
+ t = __this_cpu_read(pcp->stat_threshold);
+ if (unlikely(v > t)) {
+ s8 overstep = t >> 1;
+
+ zone_page_state_add(v + overstep, zone, item);
+ __this_cpu_write(*p, -overstep);
+ }
+}
+
+void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
+{
+ struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
+ s8 __percpu *p = pcp->vm_node_stat_diff + item;
+ s8 v, t;
+
+ VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
+
+ v = __this_cpu_inc_return(*p);
+ t = __this_cpu_read(pcp->stat_threshold);
+ if (unlikely(v > t)) {
+ s8 overstep = t >> 1;
+
+ node_page_state_add(v + overstep, pgdat, item);
+ __this_cpu_write(*p, -overstep);
+ }
+}
+
+void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+ __inc_zone_state(page_zone(page), item);
+}
+EXPORT_SYMBOL(__inc_zone_page_state);
+
+void __inc_node_page_state(struct page *page, enum node_stat_item item)
+{
+ __inc_node_state(page_pgdat(page), item);
+}
+EXPORT_SYMBOL(__inc_node_page_state);
+
+void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
+{
+ struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ s8 __percpu *p = pcp->vm_stat_diff + item;
+ s8 v, t;
+
+ v = __this_cpu_dec_return(*p);
+ t = __this_cpu_read(pcp->stat_threshold);
+ if (unlikely(v < - t)) {
+ s8 overstep = t >> 1;
+
+ zone_page_state_add(v - overstep, zone, item);
+ __this_cpu_write(*p, overstep);
+ }
+}
+
+void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
+{
+ struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
+ s8 __percpu *p = pcp->vm_node_stat_diff + item;
+ s8 v, t;
+
+ VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
+
+ v = __this_cpu_dec_return(*p);
+ t = __this_cpu_read(pcp->stat_threshold);
+ if (unlikely(v < - t)) {
+ s8 overstep = t >> 1;
+
+ node_page_state_add(v - overstep, pgdat, item);
+ __this_cpu_write(*p, overstep);
+ }
+}
+
+void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+ __dec_zone_state(page_zone(page), item);
+}
+EXPORT_SYMBOL(__dec_zone_page_state);
+
+void __dec_node_page_state(struct page *page, enum node_stat_item item)
+{
+ __dec_node_state(page_pgdat(page), item);
+}
+EXPORT_SYMBOL(__dec_node_page_state);
+
+#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
+/*
+ * If we have cmpxchg_local support then we do not need to incur the overhead
+ * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
+ *
+ * mod_state() modifies the zone counter state through atomic per cpu
+ * operations.
+ *
+ * Overstep mode specifies how overstep should handled:
+ * 0 No overstepping
+ * 1 Overstepping half of threshold
+ * -1 Overstepping minus half of threshold
+*/
+static inline void mod_zone_state(struct zone *zone,
+ enum zone_stat_item item, long delta, int overstep_mode)
+{
+ struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ s8 __percpu *p = pcp->vm_stat_diff + item;
+ long o, n, t, z;
+
+ do {
+ z = 0; /* overflow to zone counters */
+
+ /*
+ * The fetching of the stat_threshold is racy. We may apply
+ * a counter threshold to the wrong the cpu if we get
+ * rescheduled while executing here. However, the next
+ * counter update will apply the threshold again and
+ * therefore bring the counter under the threshold again.
+ *
+ * Most of the time the thresholds are the same anyways
+ * for all cpus in a zone.
+ */
+ t = this_cpu_read(pcp->stat_threshold);
+
+ o = this_cpu_read(*p);
+ n = delta + o;
+
+ if (abs(n) > t) {
+ int os = overstep_mode * (t >> 1) ;
+
+ /* Overflow must be added to zone counters */
+ z = n + os;
+ n = -os;
+ }
+ } while (this_cpu_cmpxchg(*p, o, n) != o);
+
+ if (z)
+ zone_page_state_add(z, zone, item);
+}
+
+void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+ long delta)
+{
+ mod_zone_state(zone, item, delta, 0);
+}
+EXPORT_SYMBOL(mod_zone_page_state);
+
+void inc_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+ mod_zone_state(page_zone(page), item, 1, 1);
+}
+EXPORT_SYMBOL(inc_zone_page_state);
+
+void dec_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+ mod_zone_state(page_zone(page), item, -1, -1);
+}
+EXPORT_SYMBOL(dec_zone_page_state);
+
+static inline void mod_node_state(struct pglist_data *pgdat,
+ enum node_stat_item item, int delta, int overstep_mode)
+{
+ struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
+ s8 __percpu *p = pcp->vm_node_stat_diff + item;
+ long o, n, t, z;
+
+ if (vmstat_item_in_bytes(item)) {
+ VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
+ delta >>= PAGE_SHIFT;
+ }
+
+ do {
+ z = 0; /* overflow to node counters */
+
+ /*
+ * The fetching of the stat_threshold is racy. We may apply
+ * a counter threshold to the wrong the cpu if we get
+ * rescheduled while executing here. However, the next
+ * counter update will apply the threshold again and
+ * therefore bring the counter under the threshold again.
+ *
+ * Most of the time the thresholds are the same anyways
+ * for all cpus in a node.
+ */
+ t = this_cpu_read(pcp->stat_threshold);
+
+ o = this_cpu_read(*p);
+ n = delta + o;
+
+ if (abs(n) > t) {
+ int os = overstep_mode * (t >> 1) ;
+
+ /* Overflow must be added to node counters */
+ z = n + os;
+ n = -os;
+ }
+ } while (this_cpu_cmpxchg(*p, o, n) != o);
+
+ if (z)
+ node_page_state_add(z, pgdat, item);
+}
+
+void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
+ long delta)
+{
+ mod_node_state(pgdat, item, delta, 0);
+}
+EXPORT_SYMBOL(mod_node_page_state);
+
+void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
+{
+ mod_node_state(pgdat, item, 1, 1);
+}
+
+void inc_node_page_state(struct page *page, enum node_stat_item item)
+{
+ mod_node_state(page_pgdat(page), item, 1, 1);
+}
+EXPORT_SYMBOL(inc_node_page_state);
+
+void dec_node_page_state(struct page *page, enum node_stat_item item)
+{
+ mod_node_state(page_pgdat(page), item, -1, -1);
+}
+EXPORT_SYMBOL(dec_node_page_state);
+#else
+/*
+ * Use interrupt disable to serialize counter updates
+ */
+void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+ long delta)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __mod_zone_page_state(zone, item, delta);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(mod_zone_page_state);
+
+void inc_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+ unsigned long flags;
+ struct zone *zone;
+
+ zone = page_zone(page);
+ local_irq_save(flags);
+ __inc_zone_state(zone, item);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(inc_zone_page_state);
+
+void dec_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __dec_zone_page_state(page, item);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(dec_zone_page_state);
+
+void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __inc_node_state(pgdat, item);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(inc_node_state);
+
+void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
+ long delta)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __mod_node_page_state(pgdat, item, delta);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(mod_node_page_state);
+
+void inc_node_page_state(struct page *page, enum node_stat_item item)
+{
+ unsigned long flags;
+ struct pglist_data *pgdat;
+
+ pgdat = page_pgdat(page);
+ local_irq_save(flags);
+ __inc_node_state(pgdat, item);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(inc_node_page_state);
+
+void dec_node_page_state(struct page *page, enum node_stat_item item)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __dec_node_page_state(page, item);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(dec_node_page_state);
+#endif
+
+/*
+ * Fold a differential into the global counters.
+ * Returns the number of counters updated.
+ */
+#ifdef CONFIG_NUMA
+static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff)
+{
+ int i;
+ int changes = 0;
+
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+ if (zone_diff[i]) {
+ atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
+ changes++;
+ }
+
+ for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+ if (numa_diff[i]) {
+ atomic_long_add(numa_diff[i], &vm_numa_stat[i]);
+ changes++;
+ }
+
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ if (node_diff[i]) {
+ atomic_long_add(node_diff[i], &vm_node_stat[i]);
+ changes++;
+ }
+ return changes;
+}
+#else
+static int fold_diff(int *zone_diff, int *node_diff)
+{
+ int i;
+ int changes = 0;
+
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+ if (zone_diff[i]) {
+ atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
+ changes++;
+ }
+
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ if (node_diff[i]) {
+ atomic_long_add(node_diff[i], &vm_node_stat[i]);
+ changes++;
+ }
+ return changes;
+}
+#endif /* CONFIG_NUMA */
+
+/*
+ * Update the zone counters for the current cpu.
+ *
+ * Note that refresh_cpu_vm_stats strives to only access
+ * node local memory. The per cpu pagesets on remote zones are placed
+ * in the memory local to the processor using that pageset. So the
+ * loop over all zones will access a series of cachelines local to
+ * the processor.
+ *
+ * The call to zone_page_state_add updates the cachelines with the
+ * statistics in the remote zone struct as well as the global cachelines
+ * with the global counters. These could cause remote node cache line
+ * bouncing and will have to be only done when necessary.
+ *
+ * The function returns the number of global counters updated.
+ */
+static int refresh_cpu_vm_stats(bool do_pagesets)
+{
+ struct pglist_data *pgdat;
+ struct zone *zone;
+ int i;
+ int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+#ifdef CONFIG_NUMA
+ int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, };
+#endif
+ int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
+ int changes = 0;
+
+ for_each_populated_zone(zone) {
+ struct per_cpu_pageset __percpu *p = zone->pageset;
+
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
+ int v;
+
+ v = this_cpu_xchg(p->vm_stat_diff[i], 0);
+ if (v) {
+
+ atomic_long_add(v, &zone->vm_stat[i]);
+ global_zone_diff[i] += v;
+#ifdef CONFIG_NUMA
+ /* 3 seconds idle till flush */
+ __this_cpu_write(p->expire, 3);
+#endif
+ }
+ }
+#ifdef CONFIG_NUMA
+ for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) {
+ int v;
+
+ v = this_cpu_xchg(p->vm_numa_stat_diff[i], 0);
+ if (v) {
+
+ atomic_long_add(v, &zone->vm_numa_stat[i]);
+ global_numa_diff[i] += v;
+ __this_cpu_write(p->expire, 3);
+ }
+ }
+
+ if (do_pagesets) {
+ cond_resched();
+ /*
+ * Deal with draining the remote pageset of this
+ * processor
+ *
+ * Check if there are pages remaining in this pageset
+ * if not then there is nothing to expire.
+ */
+ if (!__this_cpu_read(p->expire) ||
+ !__this_cpu_read(p->pcp.count))
+ continue;
+
+ /*
+ * We never drain zones local to this processor.
+ */
+ if (zone_to_nid(zone) == numa_node_id()) {
+ __this_cpu_write(p->expire, 0);
+ continue;
+ }
+
+ if (__this_cpu_dec_return(p->expire))
+ continue;
+
+ if (__this_cpu_read(p->pcp.count)) {
+ drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
+ changes++;
+ }
+ }
+#endif
+ }
+
+ for_each_online_pgdat(pgdat) {
+ struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;
+
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+ int v;
+
+ v = this_cpu_xchg(p->vm_node_stat_diff[i], 0);
+ if (v) {
+ atomic_long_add(v, &pgdat->vm_stat[i]);
+ global_node_diff[i] += v;
+ }
+ }
+ }
+
+#ifdef CONFIG_NUMA
+ changes += fold_diff(global_zone_diff, global_numa_diff,
+ global_node_diff);
+#else
+ changes += fold_diff(global_zone_diff, global_node_diff);
+#endif
+ return changes;
+}
+
+/*
+ * Fold the data for an offline cpu into the global array.
+ * There cannot be any access by the offline cpu and therefore
+ * synchronization is simplified.
+ */
+void cpu_vm_stats_fold(int cpu)
+{
+ struct pglist_data *pgdat;
+ struct zone *zone;
+ int i;
+ int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+#ifdef CONFIG_NUMA
+ int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, };
+#endif
+ int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
+
+ for_each_populated_zone(zone) {
+ struct per_cpu_pageset *p;
+
+ p = per_cpu_ptr(zone->pageset, cpu);
+
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+ if (p->vm_stat_diff[i]) {
+ int v;
+
+ v = p->vm_stat_diff[i];
+ p->vm_stat_diff[i] = 0;
+ atomic_long_add(v, &zone->vm_stat[i]);
+ global_zone_diff[i] += v;
+ }
+
+#ifdef CONFIG_NUMA
+ for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+ if (p->vm_numa_stat_diff[i]) {
+ int v;
+
+ v = p->vm_numa_stat_diff[i];
+ p->vm_numa_stat_diff[i] = 0;
+ atomic_long_add(v, &zone->vm_numa_stat[i]);
+ global_numa_diff[i] += v;
+ }
+#endif
+ }
+
+ for_each_online_pgdat(pgdat) {
+ struct per_cpu_nodestat *p;
+
+ p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
+
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ if (p->vm_node_stat_diff[i]) {
+ int v;
+
+ v = p->vm_node_stat_diff[i];
+ p->vm_node_stat_diff[i] = 0;
+ atomic_long_add(v, &pgdat->vm_stat[i]);
+ global_node_diff[i] += v;
+ }
+ }
+
+#ifdef CONFIG_NUMA
+ fold_diff(global_zone_diff, global_numa_diff, global_node_diff);
+#else
+ fold_diff(global_zone_diff, global_node_diff);
+#endif
+}
+
+/*
+ * this is only called if !populated_zone(zone), which implies no other users of
+ * pset->vm_stat_diff[] exsist.
+ */
+void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
+{
+ int i;
+
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+ if (pset->vm_stat_diff[i]) {
+ int v = pset->vm_stat_diff[i];
+ pset->vm_stat_diff[i] = 0;
+ atomic_long_add(v, &zone->vm_stat[i]);
+ atomic_long_add(v, &vm_zone_stat[i]);
+ }
+
+#ifdef CONFIG_NUMA
+ for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+ if (pset->vm_numa_stat_diff[i]) {
+ int v = pset->vm_numa_stat_diff[i];
+
+ pset->vm_numa_stat_diff[i] = 0;
+ atomic_long_add(v, &zone->vm_numa_stat[i]);
+ atomic_long_add(v, &vm_numa_stat[i]);
+ }
+#endif
+}
+#endif
+
+#ifdef CONFIG_NUMA
+void __inc_numa_state(struct zone *zone,
+ enum numa_stat_item item)
+{
+ struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ u16 __percpu *p = pcp->vm_numa_stat_diff + item;
+ u16 v;
+
+ v = __this_cpu_inc_return(*p);
+
+ if (unlikely(v > NUMA_STATS_THRESHOLD)) {
+ zone_numa_state_add(v, zone, item);
+ __this_cpu_write(*p, 0);
+ }
+}
+
+/*
+ * Determine the per node value of a stat item. This function
+ * is called frequently in a NUMA machine, so try to be as
+ * frugal as possible.
+ */
+unsigned long sum_zone_node_page_state(int node,
+ enum zone_stat_item item)
+{
+ struct zone *zones = NODE_DATA(node)->node_zones;
+ int i;
+ unsigned long count = 0;
+
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ count += zone_page_state(zones + i, item);
+
+ return count;
+}
+
+/*
+ * Determine the per node value of a numa stat item. To avoid deviation,
+ * the per cpu stat number in vm_numa_stat_diff[] is also included.
+ */
+unsigned long sum_zone_numa_state(int node,
+ enum numa_stat_item item)
+{
+ struct zone *zones = NODE_DATA(node)->node_zones;
+ int i;
+ unsigned long count = 0;
+
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ count += zone_numa_state_snapshot(zones + i, item);
+
+ return count;
+}
+
+/*
+ * Determine the per node value of a stat item.
+ */
+unsigned long node_page_state_pages(struct pglist_data *pgdat,
+ enum node_stat_item item)
+{
+ long x = atomic_long_read(&pgdat->vm_stat[item]);
+#ifdef CONFIG_SMP
+ if (x < 0)
+ x = 0;
+#endif
+ return x;
+}
+
+unsigned long node_page_state(struct pglist_data *pgdat,
+ enum node_stat_item item)
+{
+ VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
+
+ return node_page_state_pages(pgdat, item);
+}
+#endif
+
+#ifdef CONFIG_COMPACTION
+
+struct contig_page_info {
+ unsigned long free_pages;
+ unsigned long free_blocks_total;
+ unsigned long free_blocks_suitable;
+};
+
+/*
+ * Calculate the number of free pages in a zone, how many contiguous
+ * pages are free and how many are large enough to satisfy an allocation of
+ * the target size. Note that this function makes no attempt to estimate
+ * how many suitable free blocks there *might* be if MOVABLE pages were
+ * migrated. Calculating that is possible, but expensive and can be
+ * figured out from userspace
+ */
+static void fill_contig_page_info(struct zone *zone,
+ unsigned int suitable_order,
+ struct contig_page_info *info)
+{
+ unsigned int order;
+
+ info->free_pages = 0;
+ info->free_blocks_total = 0;
+ info->free_blocks_suitable = 0;
+
+ for (order = 0; order < MAX_ORDER; order++) {
+ unsigned long blocks;
+
+ /* Count number of free blocks */
+ blocks = zone->free_area[order].nr_free;
+ info->free_blocks_total += blocks;
+
+ /* Count free base pages */
+ info->free_pages += blocks << order;
+
+ /* Count the suitable free blocks */
+ if (order >= suitable_order)
+ info->free_blocks_suitable += blocks <<
+ (order - suitable_order);
+ }
+}
+
+/*
+ * A fragmentation index only makes sense if an allocation of a requested
+ * size would fail. If that is true, the fragmentation index indicates
+ * whether external fragmentation or a lack of memory was the problem.
+ * The value can be used to determine if page reclaim or compaction
+ * should be used
+ */
+static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
+{
+ unsigned long requested = 1UL << order;
+
+ if (WARN_ON_ONCE(order >= MAX_ORDER))
+ return 0;
+
+ if (!info->free_blocks_total)
+ return 0;
+
+ /* Fragmentation index only makes sense when a request would fail */
+ if (info->free_blocks_suitable)
+ return -1000;
+
+ /*
+ * Index is between 0 and 1 so return within 3 decimal places
+ *
+ * 0 => allocation would fail due to lack of memory
+ * 1 => allocation would fail due to fragmentation
+ */
+ return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total);
+}
+
+/*
+ * Calculates external fragmentation within a zone wrt the given order.
+ * It is defined as the percentage of pages found in blocks of size
+ * less than 1 << order. It returns values in range [0, 100].
+ */
+unsigned int extfrag_for_order(struct zone *zone, unsigned int order)
+{
+ struct contig_page_info info;
+
+ fill_contig_page_info(zone, order, &info);
+ if (info.free_pages == 0)
+ return 0;
+
+ return div_u64((info.free_pages -
+ (info.free_blocks_suitable << order)) * 100,
+ info.free_pages);
+}
+
+/* Same as __fragmentation index but allocs contig_page_info on stack */
+int fragmentation_index(struct zone *zone, unsigned int order)
+{
+ struct contig_page_info info;
+
+ fill_contig_page_info(zone, order, &info);
+ return __fragmentation_index(order, &info);
+}
+#endif
+
+#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || \
+ defined(CONFIG_NUMA) || defined(CONFIG_MEMCG)
+#ifdef CONFIG_ZONE_DMA
+#define TEXT_FOR_DMA(xx) xx "_dma",
+#else
+#define TEXT_FOR_DMA(xx)
+#endif
+
+#ifdef CONFIG_ZONE_DMA32
+#define TEXT_FOR_DMA32(xx) xx "_dma32",
+#else
+#define TEXT_FOR_DMA32(xx)
+#endif
+
+#ifdef CONFIG_HIGHMEM
+#define TEXT_FOR_HIGHMEM(xx) xx "_high",
+#else
+#define TEXT_FOR_HIGHMEM(xx)
+#endif
+
+#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
+ TEXT_FOR_HIGHMEM(xx) xx "_movable",
+
+const char * const vmstat_text[] = {
+ /* enum zone_stat_item counters */
+ "nr_free_pages",
+ "nr_zone_inactive_anon",
+ "nr_zone_active_anon",
+ "nr_zone_inactive_file",
+ "nr_zone_active_file",
+ "nr_zone_unevictable",
+ "nr_zone_write_pending",
+ "nr_mlock",
+ "nr_page_table_pages",
+ "nr_bounce",
+#if IS_ENABLED(CONFIG_ZSMALLOC)
+ "nr_zspages",
+#endif
+ "nr_free_cma",
+
+ /* enum numa_stat_item counters */
+#ifdef CONFIG_NUMA
+ "numa_hit",
+ "numa_miss",
+ "numa_foreign",
+ "numa_interleave",
+ "numa_local",
+ "numa_other",
+#endif
+
+ /* enum node_stat_item counters */
+ "nr_inactive_anon",
+ "nr_active_anon",
+ "nr_inactive_file",
+ "nr_active_file",
+ "nr_unevictable",
+ "nr_slab_reclaimable",
+ "nr_slab_unreclaimable",
+ "nr_isolated_anon",
+ "nr_isolated_file",
+ "workingset_nodes",
+ "workingset_refault_anon",
+ "workingset_refault_file",
+ "workingset_activate_anon",
+ "workingset_activate_file",
+ "workingset_restore_anon",
+ "workingset_restore_file",
+ "workingset_nodereclaim",
+ "nr_anon_pages",
+ "nr_mapped",
+ "nr_file_pages",
+ "nr_dirty",
+ "nr_writeback",
+ "nr_writeback_temp",
+ "nr_shmem",
+ "nr_shmem_hugepages",
+ "nr_shmem_pmdmapped",
+ "nr_file_hugepages",
+ "nr_file_pmdmapped",
+ "nr_anon_transparent_hugepages",
+ "nr_vmscan_write",
+ "nr_vmscan_immediate_reclaim",
+ "nr_dirtied",
+ "nr_written",
+ "nr_kernel_misc_reclaimable",
+ "nr_foll_pin_acquired",
+ "nr_foll_pin_released",
+ "nr_kernel_stack",
+#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
+ "nr_shadow_call_stack",
+#endif
+
+ /* enum writeback_stat_item counters */
+ "nr_dirty_threshold",
+ "nr_dirty_background_threshold",
+
+#if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
+ /* enum vm_event_item counters */
+ "pgpgin",
+ "pgpgout",
+ "pswpin",
+ "pswpout",
+
+ TEXTS_FOR_ZONES("pgalloc")
+ TEXTS_FOR_ZONES("allocstall")
+ TEXTS_FOR_ZONES("pgskip")
+
+ "pgfree",
+ "pgactivate",
+ "pgdeactivate",
+ "pglazyfree",
+
+ "pgfault",
+ "pgmajfault",
+ "pglazyfreed",
+
+ "pgrefill",
+ "pgreuse",
+ "pgsteal_kswapd",
+ "pgsteal_direct",
+ "pgscan_kswapd",
+ "pgscan_direct",
+ "pgscan_direct_throttle",
+ "pgscan_anon",
+ "pgscan_file",
+ "pgsteal_anon",
+ "pgsteal_file",
+
+#ifdef CONFIG_NUMA
+ "zone_reclaim_failed",
+#endif
+ "pginodesteal",
+ "slabs_scanned",
+ "kswapd_inodesteal",
+ "kswapd_low_wmark_hit_quickly",
+ "kswapd_high_wmark_hit_quickly",
+ "pageoutrun",
+
+ "pgrotated",
+
+ "drop_pagecache",
+ "drop_slab",
+ "oom_kill",
+
+#ifdef CONFIG_NUMA_BALANCING
+ "numa_pte_updates",
+ "numa_huge_pte_updates",
+ "numa_hint_faults",
+ "numa_hint_faults_local",
+ "numa_pages_migrated",
+#endif
+#ifdef CONFIG_MIGRATION
+ "pgmigrate_success",
+ "pgmigrate_fail",
+ "thp_migration_success",
+ "thp_migration_fail",
+ "thp_migration_split",
+#endif
+#ifdef CONFIG_COMPACTION
+ "compact_migrate_scanned",
+ "compact_free_scanned",
+ "compact_isolated",
+ "compact_stall",
+ "compact_fail",
+ "compact_success",
+ "compact_daemon_wake",
+ "compact_daemon_migrate_scanned",
+ "compact_daemon_free_scanned",
+#endif
+
+#ifdef CONFIG_HUGETLB_PAGE
+ "htlb_buddy_alloc_success",
+ "htlb_buddy_alloc_fail",
+#endif
+ "unevictable_pgs_culled",
+ "unevictable_pgs_scanned",
+ "unevictable_pgs_rescued",
+ "unevictable_pgs_mlocked",
+ "unevictable_pgs_munlocked",
+ "unevictable_pgs_cleared",
+ "unevictable_pgs_stranded",
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ "thp_fault_alloc",
+ "thp_fault_fallback",
+ "thp_fault_fallback_charge",
+ "thp_collapse_alloc",
+ "thp_collapse_alloc_failed",
+ "thp_file_alloc",
+ "thp_file_fallback",
+ "thp_file_fallback_charge",
+ "thp_file_mapped",
+ "thp_split_page",
+ "thp_split_page_failed",
+ "thp_deferred_split_page",
+ "thp_split_pmd",
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+ "thp_split_pud",
+#endif
+ "thp_zero_page_alloc",
+ "thp_zero_page_alloc_failed",
+ "thp_swpout",
+ "thp_swpout_fallback",
+#endif
+#ifdef CONFIG_MEMORY_BALLOON
+ "balloon_inflate",
+ "balloon_deflate",
+#ifdef CONFIG_BALLOON_COMPACTION
+ "balloon_migrate",
+#endif
+#endif /* CONFIG_MEMORY_BALLOON */
+#ifdef CONFIG_DEBUG_TLBFLUSH
+ "nr_tlb_remote_flush",
+ "nr_tlb_remote_flush_received",
+ "nr_tlb_local_flush_all",
+ "nr_tlb_local_flush_one",
+#endif /* CONFIG_DEBUG_TLBFLUSH */
+
+#ifdef CONFIG_DEBUG_VM_VMACACHE
+ "vmacache_find_calls",
+ "vmacache_find_hits",
+#endif
+#ifdef CONFIG_SWAP
+ "swap_ra",
+ "swap_ra_hit",
+#endif
+#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
+};
+#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
+
+#if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \
+ defined(CONFIG_PROC_FS)
+static void *frag_start(struct seq_file *m, loff_t *pos)
+{
+ pg_data_t *pgdat;
+ loff_t node = *pos;
+
+ for (pgdat = first_online_pgdat();
+ pgdat && node;
+ pgdat = next_online_pgdat(pgdat))
+ --node;
+
+ return pgdat;
+}
+
+static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+ pg_data_t *pgdat = (pg_data_t *)arg;
+
+ (*pos)++;
+ return next_online_pgdat(pgdat);
+}
+
+static void frag_stop(struct seq_file *m, void *arg)
+{
+}
+
+/*
+ * Walk zones in a node and print using a callback.
+ * If @assert_populated is true, only use callback for zones that are populated.
+ */
+static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
+ bool assert_populated, bool nolock,
+ void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
+{
+ struct zone *zone;
+ struct zone *node_zones = pgdat->node_zones;
+ unsigned long flags;
+
+ for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
+ if (assert_populated && !populated_zone(zone))
+ continue;
+
+ if (!nolock)
+ spin_lock_irqsave(&zone->lock, flags);
+ print(m, pgdat, zone);
+ if (!nolock)
+ spin_unlock_irqrestore(&zone->lock, flags);
+ }
+}
+#endif
+
+#ifdef CONFIG_PROC_FS
+static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
+ struct zone *zone)
+{
+ int order;
+
+ seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
+ for (order = 0; order < MAX_ORDER; ++order)
+ seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
+ seq_putc(m, '\n');
+}
+
+/*
+ * This walks the free areas for each zone.
+ */
+static int frag_show(struct seq_file *m, void *arg)
+{
+ pg_data_t *pgdat = (pg_data_t *)arg;
+ walk_zones_in_node(m, pgdat, true, false, frag_show_print);
+ return 0;
+}
+
+static void pagetypeinfo_showfree_print(struct seq_file *m,
+ pg_data_t *pgdat, struct zone *zone)
+{
+ int order, mtype;
+
+ for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
+ seq_printf(m, "Node %4d, zone %8s, type %12s ",
+ pgdat->node_id,
+ zone->name,
+ migratetype_names[mtype]);
+ for (order = 0; order < MAX_ORDER; ++order) {
+ unsigned long freecount = 0;
+ struct free_area *area;
+ struct list_head *curr;
+ bool overflow = false;
+
+ area = &(zone->free_area[order]);
+
+ list_for_each(curr, &area->free_list[mtype]) {
+ /*
+ * Cap the free_list iteration because it might
+ * be really large and we are under a spinlock
+ * so a long time spent here could trigger a
+ * hard lockup detector. Anyway this is a
+ * debugging tool so knowing there is a handful
+ * of pages of this order should be more than
+ * sufficient.
+ */
+ if (++freecount >= 100000) {
+ overflow = true;
+ break;
+ }
+ }
+ seq_printf(m, "%s%6lu ", overflow ? ">" : "", freecount);
+ spin_unlock_irq(&zone->lock);
+ cond_resched();
+ spin_lock_irq(&zone->lock);
+ }
+ seq_putc(m, '\n');
+ }
+}
+
+/* Print out the free pages at each order for each migatetype */
+static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
+{
+ int order;
+ pg_data_t *pgdat = (pg_data_t *)arg;
+
+ /* Print header */
+ seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
+ for (order = 0; order < MAX_ORDER; ++order)
+ seq_printf(m, "%6d ", order);
+ seq_putc(m, '\n');
+
+ walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print);
+
+ return 0;
+}
+
+static void pagetypeinfo_showblockcount_print(struct seq_file *m,
+ pg_data_t *pgdat, struct zone *zone)
+{
+ int mtype;
+ unsigned long pfn;
+ unsigned long start_pfn = zone->zone_start_pfn;
+ unsigned long end_pfn = zone_end_pfn(zone);
+ unsigned long count[MIGRATE_TYPES] = { 0, };
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
+ struct page *page;
+
+ page = pfn_to_online_page(pfn);
+ if (!page)
+ continue;
+
+ if (page_zone(page) != zone)
+ continue;
+
+ mtype = get_pageblock_migratetype(page);
+
+ if (mtype < MIGRATE_TYPES)
+ count[mtype]++;
+ }
+
+ /* Print counts */
+ seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
+ for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
+ seq_printf(m, "%12lu ", count[mtype]);
+ seq_putc(m, '\n');
+}
+
+/* Print out the number of pageblocks for each migratetype */
+static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
+{
+ int mtype;
+ pg_data_t *pgdat = (pg_data_t *)arg;
+
+ seq_printf(m, "\n%-23s", "Number of blocks type ");
+ for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
+ seq_printf(m, "%12s ", migratetype_names[mtype]);
+ seq_putc(m, '\n');
+ walk_zones_in_node(m, pgdat, true, false,
+ pagetypeinfo_showblockcount_print);
+
+ return 0;
+}
+
+/*
+ * Print out the number of pageblocks for each migratetype that contain pages
+ * of other types. This gives an indication of how well fallbacks are being
+ * contained by rmqueue_fallback(). It requires information from PAGE_OWNER
+ * to determine what is going on
+ */
+static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
+{
+#ifdef CONFIG_PAGE_OWNER
+ int mtype;
+
+ if (!static_branch_unlikely(&page_owner_inited))
+ return;
+
+ drain_all_pages(NULL);
+
+ seq_printf(m, "\n%-23s", "Number of mixed blocks ");
+ for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
+ seq_printf(m, "%12s ", migratetype_names[mtype]);
+ seq_putc(m, '\n');
+
+ walk_zones_in_node(m, pgdat, true, true,
+ pagetypeinfo_showmixedcount_print);
+#endif /* CONFIG_PAGE_OWNER */
+}
+
+/*
+ * This prints out statistics in relation to grouping pages by mobility.
+ * It is expensive to collect so do not constantly read the file.
+ */
+static int pagetypeinfo_show(struct seq_file *m, void *arg)
+{
+ pg_data_t *pgdat = (pg_data_t *)arg;
+
+ /* check memoryless node */
+ if (!node_state(pgdat->node_id, N_MEMORY))
+ return 0;
+
+ seq_printf(m, "Page block order: %d\n", pageblock_order);
+ seq_printf(m, "Pages per block: %lu\n", pageblock_nr_pages);
+ seq_putc(m, '\n');
+ pagetypeinfo_showfree(m, pgdat);
+ pagetypeinfo_showblockcount(m, pgdat);
+ pagetypeinfo_showmixedcount(m, pgdat);
+
+ return 0;
+}
+
+static const struct seq_operations fragmentation_op = {
+ .start = frag_start,
+ .next = frag_next,
+ .stop = frag_stop,
+ .show = frag_show,
+};
+
+static const struct seq_operations pagetypeinfo_op = {
+ .start = frag_start,
+ .next = frag_next,
+ .stop = frag_stop,
+ .show = pagetypeinfo_show,
+};
+
+static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone)
+{
+ int zid;
+
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ struct zone *compare = &pgdat->node_zones[zid];
+
+ if (populated_zone(compare))
+ return zone == compare;
+ }
+
+ return false;
+}
+
+static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
+ struct zone *zone)
+{
+ int i;
+ seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
+ if (is_zone_first_populated(pgdat, zone)) {
+ seq_printf(m, "\n per-node stats");
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+ seq_printf(m, "\n %-12s %lu", node_stat_name(i),
+ node_page_state_pages(pgdat, i));
+ }
+ }
+ seq_printf(m,
+ "\n pages free %lu"
+ "\n min %lu"
+ "\n low %lu"
+ "\n high %lu"
+ "\n spanned %lu"
+ "\n present %lu"
+ "\n managed %lu",
+ zone_page_state(zone, NR_FREE_PAGES),
+ min_wmark_pages(zone),
+ low_wmark_pages(zone),
+ high_wmark_pages(zone),
+ zone->spanned_pages,
+ zone->present_pages,
+ zone_managed_pages(zone));
+
+ seq_printf(m,
+ "\n protection: (%ld",
+ zone->lowmem_reserve[0]);
+ for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
+ seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
+ seq_putc(m, ')');
+
+ /* If unpopulated, no other information is useful */
+ if (!populated_zone(zone)) {
+ seq_putc(m, '\n');
+ return;
+ }
+
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+ seq_printf(m, "\n %-12s %lu", zone_stat_name(i),
+ zone_page_state(zone, i));
+
+#ifdef CONFIG_NUMA
+ for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+ seq_printf(m, "\n %-12s %lu", numa_stat_name(i),
+ zone_numa_state_snapshot(zone, i));
+#endif
+
+ seq_printf(m, "\n pagesets");
+ for_each_online_cpu(i) {
+ struct per_cpu_pageset *pageset;
+
+ pageset = per_cpu_ptr(zone->pageset, i);
+ seq_printf(m,
+ "\n cpu: %i"
+ "\n count: %i"
+ "\n high: %i"
+ "\n batch: %i",
+ i,
+ pageset->pcp.count,
+ pageset->pcp.high,
+ pageset->pcp.batch);
+#ifdef CONFIG_SMP
+ seq_printf(m, "\n vm stats threshold: %d",
+ pageset->stat_threshold);
+#endif
+ }
+ seq_printf(m,
+ "\n node_unreclaimable: %u"
+ "\n start_pfn: %lu",
+ pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
+ zone->zone_start_pfn);
+ seq_putc(m, '\n');
+}
+
+/*
+ * Output information about zones in @pgdat. All zones are printed regardless
+ * of whether they are populated or not: lowmem_reserve_ratio operates on the
+ * set of all zones and userspace would not be aware of such zones if they are
+ * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio).
+ */
+static int zoneinfo_show(struct seq_file *m, void *arg)
+{
+ pg_data_t *pgdat = (pg_data_t *)arg;
+ walk_zones_in_node(m, pgdat, false, false, zoneinfo_show_print);
+ return 0;
+}
+
+static const struct seq_operations zoneinfo_op = {
+ .start = frag_start, /* iterate over all zones. The same as in
+ * fragmentation. */
+ .next = frag_next,
+ .stop = frag_stop,
+ .show = zoneinfo_show,
+};
+
+#define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEMS + \
+ NR_VM_NUMA_STAT_ITEMS + \
+ NR_VM_NODE_STAT_ITEMS + \
+ NR_VM_WRITEBACK_STAT_ITEMS + \
+ (IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? \
+ NR_VM_EVENT_ITEMS : 0))
+
+static void *vmstat_start(struct seq_file *m, loff_t *pos)
+{
+ unsigned long *v;
+ int i;
+
+ if (*pos >= NR_VMSTAT_ITEMS)
+ return NULL;
+
+ BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS);
+ v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL);
+ m->private = v;
+ if (!v)
+ return ERR_PTR(-ENOMEM);
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+ v[i] = global_zone_page_state(i);
+ v += NR_VM_ZONE_STAT_ITEMS;
+
+#ifdef CONFIG_NUMA
+ for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+ v[i] = global_numa_state(i);
+ v += NR_VM_NUMA_STAT_ITEMS;
+#endif
+
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ v[i] = global_node_page_state_pages(i);
+ v += NR_VM_NODE_STAT_ITEMS;
+
+ global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
+ v + NR_DIRTY_THRESHOLD);
+ v += NR_VM_WRITEBACK_STAT_ITEMS;
+
+#ifdef CONFIG_VM_EVENT_COUNTERS
+ all_vm_events(v);
+ v[PGPGIN] /= 2; /* sectors -> kbytes */
+ v[PGPGOUT] /= 2;
+#endif
+ return (unsigned long *)m->private + *pos;
+}
+
+static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+ (*pos)++;
+ if (*pos >= NR_VMSTAT_ITEMS)
+ return NULL;
+ return (unsigned long *)m->private + *pos;
+}
+
+static int vmstat_show(struct seq_file *m, void *arg)
+{
+ unsigned long *l = arg;
+ unsigned long off = l - (unsigned long *)m->private;
+
+ seq_puts(m, vmstat_text[off]);
+ seq_put_decimal_ull(m, " ", *l);
+ seq_putc(m, '\n');
+
+ if (off == NR_VMSTAT_ITEMS - 1) {
+ /*
+ * We've come to the end - add any deprecated counters to avoid
+ * breaking userspace which might depend on them being present.
+ */
+ seq_puts(m, "nr_unstable 0\n");
+ }
+ return 0;
+}
+
+static void vmstat_stop(struct seq_file *m, void *arg)
+{
+ kfree(m->private);
+ m->private = NULL;
+}
+
+static const struct seq_operations vmstat_op = {
+ .start = vmstat_start,
+ .next = vmstat_next,
+ .stop = vmstat_stop,
+ .show = vmstat_show,
+};
+#endif /* CONFIG_PROC_FS */
+
+#ifdef CONFIG_SMP
+static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
+int sysctl_stat_interval __read_mostly = HZ;
+
+#ifdef CONFIG_PROC_FS
+static void refresh_vm_stats(struct work_struct *work)
+{
+ refresh_cpu_vm_stats(true);
+}
+
+int vmstat_refresh(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ long val;
+ int err;
+ int i;
+
+ /*
+ * The regular update, every sysctl_stat_interval, may come later
+ * than expected: leaving a significant amount in per_cpu buckets.
+ * This is particularly misleading when checking a quantity of HUGE
+ * pages, immediately after running a test. /proc/sys/vm/stat_refresh,
+ * which can equally be echo'ed to or cat'ted from (by root),
+ * can be used to update the stats just before reading them.
+ *
+ * Oh, and since global_zone_page_state() etc. are so careful to hide
+ * transiently negative values, report an error here if any of
+ * the stats is negative, so we know to go looking for imbalance.
+ */
+ err = schedule_on_each_cpu(refresh_vm_stats);
+ if (err)
+ return err;
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
+ val = atomic_long_read(&vm_zone_stat[i]);
+ if (val < 0) {
+ pr_warn("%s: %s %ld\n",
+ __func__, zone_stat_name(i), val);
+ err = -EINVAL;
+ }
+ }
+#ifdef CONFIG_NUMA
+ for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) {
+ val = atomic_long_read(&vm_numa_stat[i]);
+ if (val < 0) {
+ pr_warn("%s: %s %ld\n",
+ __func__, numa_stat_name(i), val);
+ err = -EINVAL;
+ }
+ }
+#endif
+ if (err)
+ return err;
+ if (write)
+ *ppos += *lenp;
+ else
+ *lenp = 0;
+ return 0;
+}
+#endif /* CONFIG_PROC_FS */
+
+static void vmstat_update(struct work_struct *w)
+{
+ if (refresh_cpu_vm_stats(true)) {
+ /*
+ * Counters were updated so we expect more updates
+ * to occur in the future. Keep on running the
+ * update worker thread.
+ */
+ queue_delayed_work_on(smp_processor_id(), mm_percpu_wq,
+ this_cpu_ptr(&vmstat_work),
+ round_jiffies_relative(sysctl_stat_interval));
+ }
+}
+
+/*
+ * Switch off vmstat processing and then fold all the remaining differentials
+ * until the diffs stay at zero. The function is used by NOHZ and can only be
+ * invoked when tick processing is not active.
+ */
+/*
+ * Check if the diffs for a certain cpu indicate that
+ * an update is needed.
+ */
+static bool need_update(int cpu)
+{
+ struct zone *zone;
+
+ for_each_populated_zone(zone) {
+ struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu);
+
+ BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1);
+#ifdef CONFIG_NUMA
+ BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 2);
+#endif
+
+ /*
+ * The fast way of checking if there are any vmstat diffs.
+ */
+ if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS *
+ sizeof(p->vm_stat_diff[0])))
+ return true;
+#ifdef CONFIG_NUMA
+ if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS *
+ sizeof(p->vm_numa_stat_diff[0])))
+ return true;
+#endif
+ }
+ return false;
+}
+
+/*
+ * Switch off vmstat processing and then fold all the remaining differentials
+ * until the diffs stay at zero. The function is used by NOHZ and can only be
+ * invoked when tick processing is not active.
+ */
+void quiet_vmstat(void)
+{
+ if (system_state != SYSTEM_RUNNING)
+ return;
+
+ if (!delayed_work_pending(this_cpu_ptr(&vmstat_work)))
+ return;
+
+ if (!need_update(smp_processor_id()))
+ return;
+
+ /*
+ * Just refresh counters and do not care about the pending delayed
+ * vmstat_update. It doesn't fire that often to matter and canceling
+ * it would be too expensive from this path.
+ * vmstat_shepherd will take care about that for us.
+ */
+ refresh_cpu_vm_stats(false);
+}
+
+/*
+ * Shepherd worker thread that checks the
+ * differentials of processors that have their worker
+ * threads for vm statistics updates disabled because of
+ * inactivity.
+ */
+static void vmstat_shepherd(struct work_struct *w);
+
+static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd);
+
+static void vmstat_shepherd(struct work_struct *w)
+{
+ int cpu;
+
+ get_online_cpus();
+ /* Check processors whose vmstat worker threads have been disabled */
+ for_each_online_cpu(cpu) {
+ struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
+
+ if (!delayed_work_pending(dw) && need_update(cpu))
+ queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
+ }
+ put_online_cpus();
+
+ schedule_delayed_work(&shepherd,
+ round_jiffies_relative(sysctl_stat_interval));
+}
+
+static void __init start_shepherd_timer(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
+ vmstat_update);
+
+ schedule_delayed_work(&shepherd,
+ round_jiffies_relative(sysctl_stat_interval));
+}
+
+static void __init init_cpu_node_state(void)
+{
+ int node;
+
+ for_each_online_node(node) {
+ if (cpumask_weight(cpumask_of_node(node)) > 0)
+ node_set_state(node, N_CPU);
+ }
+}
+
+static int vmstat_cpu_online(unsigned int cpu)
+{
+ refresh_zone_stat_thresholds();
+ node_set_state(cpu_to_node(cpu), N_CPU);
+ return 0;
+}
+
+static int vmstat_cpu_down_prep(unsigned int cpu)
+{
+ cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
+ return 0;
+}
+
+static int vmstat_cpu_dead(unsigned int cpu)
+{
+ const struct cpumask *node_cpus;
+ int node;
+
+ node = cpu_to_node(cpu);
+
+ refresh_zone_stat_thresholds();
+ node_cpus = cpumask_of_node(node);
+ if (cpumask_weight(node_cpus) > 0)
+ return 0;
+
+ node_clear_state(node, N_CPU);
+ return 0;
+}
+
+#endif
+
+struct workqueue_struct *mm_percpu_wq;
+
+void __init init_mm_internals(void)
+{
+ int ret __maybe_unused;
+
+ mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_MEM_RECLAIM, 0);
+
+#ifdef CONFIG_SMP
+ ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead",
+ NULL, vmstat_cpu_dead);
+ if (ret < 0)
+ pr_err("vmstat: failed to register 'dead' hotplug state\n");
+
+ ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/vmstat:online",
+ vmstat_cpu_online,
+ vmstat_cpu_down_prep);
+ if (ret < 0)
+ pr_err("vmstat: failed to register 'online' hotplug state\n");
+
+ get_online_cpus();
+ init_cpu_node_state();
+ put_online_cpus();
+
+ start_shepherd_timer();
+#endif
+#ifdef CONFIG_PROC_FS
+ proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
+ proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
+ proc_create_seq("vmstat", 0444, NULL, &vmstat_op);
+ proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op);
+#endif
+}
+
+#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
+
+/*
+ * Return an index indicating how much of the available free memory is
+ * unusable for an allocation of the requested size.
+ */
+static int unusable_free_index(unsigned int order,
+ struct contig_page_info *info)
+{
+ /* No free memory is interpreted as all free memory is unusable */
+ if (info->free_pages == 0)
+ return 1000;
+
+ /*
+ * Index should be a value between 0 and 1. Return a value to 3
+ * decimal places.
+ *
+ * 0 => no fragmentation
+ * 1 => high fragmentation
+ */
+ return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);
+
+}
+
+static void unusable_show_print(struct seq_file *m,
+ pg_data_t *pgdat, struct zone *zone)
+{
+ unsigned int order;
+ int index;
+ struct contig_page_info info;
+
+ seq_printf(m, "Node %d, zone %8s ",
+ pgdat->node_id,
+ zone->name);
+ for (order = 0; order < MAX_ORDER; ++order) {
+ fill_contig_page_info(zone, order, &info);
+ index = unusable_free_index(order, &info);
+ seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
+ }
+
+ seq_putc(m, '\n');
+}
+
+/*
+ * Display unusable free space index
+ *
+ * The unusable free space index measures how much of the available free
+ * memory cannot be used to satisfy an allocation of a given size and is a
+ * value between 0 and 1. The higher the value, the more of free memory is
+ * unusable and by implication, the worse the external fragmentation is. This
+ * can be expressed as a percentage by multiplying by 100.
+ */
+static int unusable_show(struct seq_file *m, void *arg)
+{
+ pg_data_t *pgdat = (pg_data_t *)arg;
+
+ /* check memoryless node */
+ if (!node_state(pgdat->node_id, N_MEMORY))
+ return 0;
+
+ walk_zones_in_node(m, pgdat, true, false, unusable_show_print);
+
+ return 0;
+}
+
+static const struct seq_operations unusable_sops = {
+ .start = frag_start,
+ .next = frag_next,
+ .stop = frag_stop,
+ .show = unusable_show,
+};
+
+DEFINE_SEQ_ATTRIBUTE(unusable);
+
+static void extfrag_show_print(struct seq_file *m,
+ pg_data_t *pgdat, struct zone *zone)
+{
+ unsigned int order;
+ int index;
+
+ /* Alloc on stack as interrupts are disabled for zone walk */
+ struct contig_page_info info;
+
+ seq_printf(m, "Node %d, zone %8s ",
+ pgdat->node_id,
+ zone->name);
+ for (order = 0; order < MAX_ORDER; ++order) {
+ fill_contig_page_info(zone, order, &info);
+ index = __fragmentation_index(order, &info);
+ seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
+ }
+
+ seq_putc(m, '\n');
+}
+
+/*
+ * Display fragmentation index for orders that allocations would fail for
+ */
+static int extfrag_show(struct seq_file *m, void *arg)
+{
+ pg_data_t *pgdat = (pg_data_t *)arg;
+
+ walk_zones_in_node(m, pgdat, true, false, extfrag_show_print);
+
+ return 0;
+}
+
+static const struct seq_operations extfrag_sops = {
+ .start = frag_start,
+ .next = frag_next,
+ .stop = frag_stop,
+ .show = extfrag_show,
+};
+
+DEFINE_SEQ_ATTRIBUTE(extfrag);
+
+static int __init extfrag_debug_init(void)
+{
+ struct dentry *extfrag_debug_root;
+
+ extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
+
+ debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL,
+ &unusable_fops);
+
+ debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL,
+ &extfrag_fops);
+
+ return 0;
+}
+
+module_init(extfrag_debug_init);
+#endif
diff --git a/mm/workingset.c b/mm/workingset.c
new file mode 100644
index 000000000..975a4d2dd
--- /dev/null
+++ b/mm/workingset.c
@@ -0,0 +1,629 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Workingset detection
+ *
+ * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner
+ */
+
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
+#include <linux/writeback.h>
+#include <linux/shmem_fs.h>
+#include <linux/pagemap.h>
+#include <linux/atomic.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+#include <linux/dax.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+
+/*
+ * Double CLOCK lists
+ *
+ * Per node, two clock lists are maintained for file pages: the
+ * inactive and the active list. Freshly faulted pages start out at
+ * the head of the inactive list and page reclaim scans pages from the
+ * tail. Pages that are accessed multiple times on the inactive list
+ * are promoted to the active list, to protect them from reclaim,
+ * whereas active pages are demoted to the inactive list when the
+ * active list grows too big.
+ *
+ * fault ------------------------+
+ * |
+ * +--------------+ | +-------------+
+ * reclaim <- | inactive | <-+-- demotion | active | <--+
+ * +--------------+ +-------------+ |
+ * | |
+ * +-------------- promotion ------------------+
+ *
+ *
+ * Access frequency and refault distance
+ *
+ * A workload is thrashing when its pages are frequently used but they
+ * are evicted from the inactive list every time before another access
+ * would have promoted them to the active list.
+ *
+ * In cases where the average access distance between thrashing pages
+ * is bigger than the size of memory there is nothing that can be
+ * done - the thrashing set could never fit into memory under any
+ * circumstance.
+ *
+ * However, the average access distance could be bigger than the
+ * inactive list, yet smaller than the size of memory. In this case,
+ * the set could fit into memory if it weren't for the currently
+ * active pages - which may be used more, hopefully less frequently:
+ *
+ * +-memory available to cache-+
+ * | |
+ * +-inactive------+-active----+
+ * a b | c d e f g h i | J K L M N |
+ * +---------------+-----------+
+ *
+ * It is prohibitively expensive to accurately track access frequency
+ * of pages. But a reasonable approximation can be made to measure
+ * thrashing on the inactive list, after which refaulting pages can be
+ * activated optimistically to compete with the existing active pages.
+ *
+ * Approximating inactive page access frequency - Observations:
+ *
+ * 1. When a page is accessed for the first time, it is added to the
+ * head of the inactive list, slides every existing inactive page
+ * towards the tail by one slot, and pushes the current tail page
+ * out of memory.
+ *
+ * 2. When a page is accessed for the second time, it is promoted to
+ * the active list, shrinking the inactive list by one slot. This
+ * also slides all inactive pages that were faulted into the cache
+ * more recently than the activated page towards the tail of the
+ * inactive list.
+ *
+ * Thus:
+ *
+ * 1. The sum of evictions and activations between any two points in
+ * time indicate the minimum number of inactive pages accessed in
+ * between.
+ *
+ * 2. Moving one inactive page N page slots towards the tail of the
+ * list requires at least N inactive page accesses.
+ *
+ * Combining these:
+ *
+ * 1. When a page is finally evicted from memory, the number of
+ * inactive pages accessed while the page was in cache is at least
+ * the number of page slots on the inactive list.
+ *
+ * 2. In addition, measuring the sum of evictions and activations (E)
+ * at the time of a page's eviction, and comparing it to another
+ * reading (R) at the time the page faults back into memory tells
+ * the minimum number of accesses while the page was not cached.
+ * This is called the refault distance.
+ *
+ * Because the first access of the page was the fault and the second
+ * access the refault, we combine the in-cache distance with the
+ * out-of-cache distance to get the complete minimum access distance
+ * of this page:
+ *
+ * NR_inactive + (R - E)
+ *
+ * And knowing the minimum access distance of a page, we can easily
+ * tell if the page would be able to stay in cache assuming all page
+ * slots in the cache were available:
+ *
+ * NR_inactive + (R - E) <= NR_inactive + NR_active
+ *
+ * which can be further simplified to
+ *
+ * (R - E) <= NR_active
+ *
+ * Put into words, the refault distance (out-of-cache) can be seen as
+ * a deficit in inactive list space (in-cache). If the inactive list
+ * had (R - E) more page slots, the page would not have been evicted
+ * in between accesses, but activated instead. And on a full system,
+ * the only thing eating into inactive list space is active pages.
+ *
+ *
+ * Refaulting inactive pages
+ *
+ * All that is known about the active list is that the pages have been
+ * accessed more than once in the past. This means that at any given
+ * time there is actually a good chance that pages on the active list
+ * are no longer in active use.
+ *
+ * So when a refault distance of (R - E) is observed and there are at
+ * least (R - E) active pages, the refaulting page is activated
+ * optimistically in the hope that (R - E) active pages are actually
+ * used less frequently than the refaulting page - or even not used at
+ * all anymore.
+ *
+ * That means if inactive cache is refaulting with a suitable refault
+ * distance, we assume the cache workingset is transitioning and put
+ * pressure on the current active list.
+ *
+ * If this is wrong and demotion kicks in, the pages which are truly
+ * used more frequently will be reactivated while the less frequently
+ * used once will be evicted from memory.
+ *
+ * But if this is right, the stale pages will be pushed out of memory
+ * and the used pages get to stay in cache.
+ *
+ * Refaulting active pages
+ *
+ * If on the other hand the refaulting pages have recently been
+ * deactivated, it means that the active list is no longer protecting
+ * actively used cache from reclaim. The cache is NOT transitioning to
+ * a different workingset; the existing workingset is thrashing in the
+ * space allocated to the page cache.
+ *
+ *
+ * Implementation
+ *
+ * For each node's LRU lists, a counter for inactive evictions and
+ * activations is maintained (node->nonresident_age).
+ *
+ * On eviction, a snapshot of this counter (along with some bits to
+ * identify the node) is stored in the now empty page cache
+ * slot of the evicted page. This is called a shadow entry.
+ *
+ * On cache misses for which there are shadow entries, an eligible
+ * refault distance will immediately activate the refaulting page.
+ */
+
+#define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \
+ 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT)
+#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
+
+/*
+ * Eviction timestamps need to be able to cover the full range of
+ * actionable refaults. However, bits are tight in the xarray
+ * entry, and after storing the identifier for the lruvec there might
+ * not be enough left to represent every single actionable refault. In
+ * that case, we have to sacrifice granularity for distance, and group
+ * evictions into coarser buckets by shaving off lower timestamp bits.
+ */
+static unsigned int bucket_order __read_mostly;
+
+static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
+ bool workingset)
+{
+ eviction >>= bucket_order;
+ eviction &= EVICTION_MASK;
+ eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
+ eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
+ eviction = (eviction << 1) | workingset;
+
+ return xa_mk_value(eviction);
+}
+
+static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
+ unsigned long *evictionp, bool *workingsetp)
+{
+ unsigned long entry = xa_to_value(shadow);
+ int memcgid, nid;
+ bool workingset;
+
+ workingset = entry & 1;
+ entry >>= 1;
+ nid = entry & ((1UL << NODES_SHIFT) - 1);
+ entry >>= NODES_SHIFT;
+ memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
+ entry >>= MEM_CGROUP_ID_SHIFT;
+
+ *memcgidp = memcgid;
+ *pgdat = NODE_DATA(nid);
+ *evictionp = entry << bucket_order;
+ *workingsetp = workingset;
+}
+
+/**
+ * workingset_age_nonresident - age non-resident entries as LRU ages
+ * @lruvec: the lruvec that was aged
+ * @nr_pages: the number of pages to count
+ *
+ * As in-memory pages are aged, non-resident pages need to be aged as
+ * well, in order for the refault distances later on to be comparable
+ * to the in-memory dimensions. This function allows reclaim and LRU
+ * operations to drive the non-resident aging along in parallel.
+ */
+void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages)
+{
+ /*
+ * Reclaiming a cgroup means reclaiming all its children in a
+ * round-robin fashion. That means that each cgroup has an LRU
+ * order that is composed of the LRU orders of its child
+ * cgroups; and every page has an LRU position not just in the
+ * cgroup that owns it, but in all of that group's ancestors.
+ *
+ * So when the physical inactive list of a leaf cgroup ages,
+ * the virtual inactive lists of all its parents, including
+ * the root cgroup's, age as well.
+ */
+ do {
+ atomic_long_add(nr_pages, &lruvec->nonresident_age);
+ } while ((lruvec = parent_lruvec(lruvec)));
+}
+
+/**
+ * workingset_eviction - note the eviction of a page from memory
+ * @target_memcg: the cgroup that is causing the reclaim
+ * @page: the page being evicted
+ *
+ * Returns a shadow entry to be stored in @page->mapping->i_pages in place
+ * of the evicted @page so that a later refault can be detected.
+ */
+void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
+{
+ struct pglist_data *pgdat = page_pgdat(page);
+ unsigned long eviction;
+ struct lruvec *lruvec;
+ int memcgid;
+
+ /* Page is fully exclusive and pins page->mem_cgroup */
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON_PAGE(page_count(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+
+ lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
+ workingset_age_nonresident(lruvec, thp_nr_pages(page));
+ /* XXX: target_memcg can be NULL, go through lruvec */
+ memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
+ eviction = atomic_long_read(&lruvec->nonresident_age);
+ return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
+}
+
+/**
+ * workingset_refault - evaluate the refault of a previously evicted page
+ * @page: the freshly allocated replacement page
+ * @shadow: shadow entry of the evicted page
+ *
+ * Calculates and evaluates the refault distance of the previously
+ * evicted page in the context of the node and the memcg whose memory
+ * pressure caused the eviction.
+ */
+void workingset_refault(struct page *page, void *shadow)
+{
+ bool file = page_is_file_lru(page);
+ struct mem_cgroup *eviction_memcg;
+ struct lruvec *eviction_lruvec;
+ unsigned long refault_distance;
+ unsigned long workingset_size;
+ struct pglist_data *pgdat;
+ struct mem_cgroup *memcg;
+ unsigned long eviction;
+ struct lruvec *lruvec;
+ unsigned long refault;
+ bool workingset;
+ int memcgid;
+
+ unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
+
+ rcu_read_lock();
+ /*
+ * Look up the memcg associated with the stored ID. It might
+ * have been deleted since the page's eviction.
+ *
+ * Note that in rare events the ID could have been recycled
+ * for a new cgroup that refaults a shared page. This is
+ * impossible to tell from the available data. However, this
+ * should be a rare and limited disturbance, and activations
+ * are always speculative anyway. Ultimately, it's the aging
+ * algorithm's job to shake out the minimum access frequency
+ * for the active cache.
+ *
+ * XXX: On !CONFIG_MEMCG, this will always return NULL; it
+ * would be better if the root_mem_cgroup existed in all
+ * configurations instead.
+ */
+ eviction_memcg = mem_cgroup_from_id(memcgid);
+ if (!mem_cgroup_disabled() && !eviction_memcg)
+ goto out;
+ eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
+ refault = atomic_long_read(&eviction_lruvec->nonresident_age);
+
+ /*
+ * Calculate the refault distance
+ *
+ * The unsigned subtraction here gives an accurate distance
+ * across nonresident_age overflows in most cases. There is a
+ * special case: usually, shadow entries have a short lifetime
+ * and are either refaulted or reclaimed along with the inode
+ * before they get too old. But it is not impossible for the
+ * nonresident_age to lap a shadow entry in the field, which
+ * can then result in a false small refault distance, leading
+ * to a false activation should this old entry actually
+ * refault again. However, earlier kernels used to deactivate
+ * unconditionally with *every* reclaim invocation for the
+ * longest time, so the occasional inappropriate activation
+ * leading to pressure on the active list is not a problem.
+ */
+ refault_distance = (refault - eviction) & EVICTION_MASK;
+
+ /*
+ * The activation decision for this page is made at the level
+ * where the eviction occurred, as that is where the LRU order
+ * during page reclaim is being determined.
+ *
+ * However, the cgroup that will own the page is the one that
+ * is actually experiencing the refault event.
+ */
+ memcg = page_memcg(page);
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+
+ inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file);
+
+ /*
+ * Compare the distance to the existing workingset size. We
+ * don't activate pages that couldn't stay resident even if
+ * all the memory was available to the workingset. Whether
+ * workingset competition needs to consider anon or not depends
+ * on having swap.
+ */
+ workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
+ if (!file) {
+ workingset_size += lruvec_page_state(eviction_lruvec,
+ NR_INACTIVE_FILE);
+ }
+ if (mem_cgroup_get_nr_swap_pages(memcg) > 0) {
+ workingset_size += lruvec_page_state(eviction_lruvec,
+ NR_ACTIVE_ANON);
+ if (file) {
+ workingset_size += lruvec_page_state(eviction_lruvec,
+ NR_INACTIVE_ANON);
+ }
+ }
+ if (refault_distance > workingset_size)
+ goto out;
+
+ SetPageActive(page);
+ workingset_age_nonresident(lruvec, thp_nr_pages(page));
+ inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file);
+
+ /* Page was active prior to eviction */
+ if (workingset) {
+ SetPageWorkingset(page);
+ /* XXX: Move to lru_cache_add() when it supports new vs putback */
+ spin_lock_irq(&page_pgdat(page)->lru_lock);
+ lru_note_cost_page(page);
+ spin_unlock_irq(&page_pgdat(page)->lru_lock);
+ inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file);
+ }
+out:
+ rcu_read_unlock();
+}
+
+/**
+ * workingset_activation - note a page activation
+ * @page: page that is being activated
+ */
+void workingset_activation(struct page *page)
+{
+ struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
+
+ rcu_read_lock();
+ /*
+ * Filter non-memcg pages here, e.g. unmap can call
+ * mark_page_accessed() on VDSO pages.
+ *
+ * XXX: See workingset_refault() - this should return
+ * root_mem_cgroup even for !CONFIG_MEMCG.
+ */
+ memcg = page_memcg_rcu(page);
+ if (!mem_cgroup_disabled() && !memcg)
+ goto out;
+ lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+ workingset_age_nonresident(lruvec, thp_nr_pages(page));
+out:
+ rcu_read_unlock();
+}
+
+/*
+ * Shadow entries reflect the share of the working set that does not
+ * fit into memory, so their number depends on the access pattern of
+ * the workload. In most cases, they will refault or get reclaimed
+ * along with the inode, but a (malicious) workload that streams
+ * through files with a total size several times that of available
+ * memory, while preventing the inodes from being reclaimed, can
+ * create excessive amounts of shadow nodes. To keep a lid on this,
+ * track shadow nodes and reclaim them when they grow way past the
+ * point where they would still be useful.
+ */
+
+static struct list_lru shadow_nodes;
+
+void workingset_update_node(struct xa_node *node)
+{
+ /*
+ * Track non-empty nodes that contain only shadow entries;
+ * unlink those that contain pages or are being freed.
+ *
+ * Avoid acquiring the list_lru lock when the nodes are
+ * already where they should be. The list_empty() test is safe
+ * as node->private_list is protected by the i_pages lock.
+ */
+ VM_WARN_ON_ONCE(!irqs_disabled()); /* For __inc_lruvec_page_state */
+
+ if (node->count && node->count == node->nr_values) {
+ if (list_empty(&node->private_list)) {
+ list_lru_add(&shadow_nodes, &node->private_list);
+ __inc_lruvec_slab_state(node, WORKINGSET_NODES);
+ }
+ } else {
+ if (!list_empty(&node->private_list)) {
+ list_lru_del(&shadow_nodes, &node->private_list);
+ __dec_lruvec_slab_state(node, WORKINGSET_NODES);
+ }
+ }
+}
+
+static unsigned long count_shadow_nodes(struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ unsigned long max_nodes;
+ unsigned long nodes;
+ unsigned long pages;
+
+ nodes = list_lru_shrink_count(&shadow_nodes, sc);
+
+ /*
+ * Approximate a reasonable limit for the nodes
+ * containing shadow entries. We don't need to keep more
+ * shadow entries than possible pages on the active list,
+ * since refault distances bigger than that are dismissed.
+ *
+ * The size of the active list converges toward 100% of
+ * overall page cache as memory grows, with only a tiny
+ * inactive list. Assume the total cache size for that.
+ *
+ * Nodes might be sparsely populated, with only one shadow
+ * entry in the extreme case. Obviously, we cannot keep one
+ * node for every eligible shadow entry, so compromise on a
+ * worst-case density of 1/8th. Below that, not all eligible
+ * refaults can be detected anymore.
+ *
+ * On 64-bit with 7 xa_nodes per page and 64 slots
+ * each, this will reclaim shadow entries when they consume
+ * ~1.8% of available memory:
+ *
+ * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE
+ */
+#ifdef CONFIG_MEMCG
+ if (sc->memcg) {
+ struct lruvec *lruvec;
+ int i;
+
+ lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
+ for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
+ pages += lruvec_page_state_local(lruvec,
+ NR_LRU_BASE + i);
+ pages += lruvec_page_state_local(
+ lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT;
+ pages += lruvec_page_state_local(
+ lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT;
+ } else
+#endif
+ pages = node_present_pages(sc->nid);
+
+ max_nodes = pages >> (XA_CHUNK_SHIFT - 3);
+
+ if (!nodes)
+ return SHRINK_EMPTY;
+
+ if (nodes <= max_nodes)
+ return 0;
+ return nodes - max_nodes;
+}
+
+static enum lru_status shadow_lru_isolate(struct list_head *item,
+ struct list_lru_one *lru,
+ spinlock_t *lru_lock,
+ void *arg) __must_hold(lru_lock)
+{
+ struct xa_node *node = container_of(item, struct xa_node, private_list);
+ struct address_space *mapping;
+ int ret;
+
+ /*
+ * Page cache insertions and deletions synchronously maintain
+ * the shadow node LRU under the i_pages lock and the
+ * lru_lock. Because the page cache tree is emptied before
+ * the inode can be destroyed, holding the lru_lock pins any
+ * address_space that has nodes on the LRU.
+ *
+ * We can then safely transition to the i_pages lock to
+ * pin only the address_space of the particular node we want
+ * to reclaim, take the node off-LRU, and drop the lru_lock.
+ */
+
+ mapping = container_of(node->array, struct address_space, i_pages);
+
+ /* Coming from the list, invert the lock order */
+ if (!xa_trylock(&mapping->i_pages)) {
+ spin_unlock_irq(lru_lock);
+ ret = LRU_RETRY;
+ goto out;
+ }
+
+ list_lru_isolate(lru, item);
+ __dec_lruvec_slab_state(node, WORKINGSET_NODES);
+
+ spin_unlock(lru_lock);
+
+ /*
+ * The nodes should only contain one or more shadow entries,
+ * no pages, so we expect to be able to remove them all and
+ * delete and free the empty node afterwards.
+ */
+ if (WARN_ON_ONCE(!node->nr_values))
+ goto out_invalid;
+ if (WARN_ON_ONCE(node->count != node->nr_values))
+ goto out_invalid;
+ mapping->nrexceptional -= node->nr_values;
+ xa_delete_node(node, workingset_update_node);
+ __inc_lruvec_slab_state(node, WORKINGSET_NODERECLAIM);
+
+out_invalid:
+ xa_unlock_irq(&mapping->i_pages);
+ ret = LRU_REMOVED_RETRY;
+out:
+ cond_resched();
+ spin_lock_irq(lru_lock);
+ return ret;
+}
+
+static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ /* list_lru lock nests inside the IRQ-safe i_pages lock */
+ return list_lru_shrink_walk_irq(&shadow_nodes, sc, shadow_lru_isolate,
+ NULL);
+}
+
+static struct shrinker workingset_shadow_shrinker = {
+ .count_objects = count_shadow_nodes,
+ .scan_objects = scan_shadow_nodes,
+ .seeks = 0, /* ->count reports only fully expendable nodes */
+ .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
+};
+
+/*
+ * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
+ * i_pages lock.
+ */
+static struct lock_class_key shadow_nodes_key;
+
+static int __init workingset_init(void)
+{
+ unsigned int timestamp_bits;
+ unsigned int max_order;
+ int ret;
+
+ BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
+ /*
+ * Calculate the eviction bucket size to cover the longest
+ * actionable refault distance, which is currently half of
+ * memory (totalram_pages/2). However, memory hotplug may add
+ * some more pages at runtime, so keep working with up to
+ * double the initial memory by using totalram_pages as-is.
+ */
+ timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
+ max_order = fls_long(totalram_pages() - 1);
+ if (max_order > timestamp_bits)
+ bucket_order = max_order - timestamp_bits;
+ pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
+ timestamp_bits, max_order, bucket_order);
+
+ ret = prealloc_shrinker(&workingset_shadow_shrinker);
+ if (ret)
+ goto err;
+ ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key,
+ &workingset_shadow_shrinker);
+ if (ret)
+ goto err_list_lru;
+ register_shrinker_prepared(&workingset_shadow_shrinker);
+ return 0;
+err_list_lru:
+ free_prealloced_shrinker(&workingset_shadow_shrinker);
+err:
+ return ret;
+}
+module_init(workingset_init);
diff --git a/mm/z3fold.c b/mm/z3fold.c
new file mode 100644
index 000000000..912ac9a64
--- /dev/null
+++ b/mm/z3fold.c
@@ -0,0 +1,1832 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * z3fold.c
+ *
+ * Author: Vitaly Wool <vitaly.wool@konsulko.com>
+ * Copyright (C) 2016, Sony Mobile Communications Inc.
+ *
+ * This implementation is based on zbud written by Seth Jennings.
+ *
+ * z3fold is an special purpose allocator for storing compressed pages. It
+ * can store up to three compressed pages per page which improves the
+ * compression ratio of zbud while retaining its main concepts (e. g. always
+ * storing an integral number of objects per page) and simplicity.
+ * It still has simple and deterministic reclaim properties that make it
+ * preferable to a higher density approach (with no requirement on integral
+ * number of object per page) when reclaim is used.
+ *
+ * As in zbud, pages are divided into "chunks". The size of the chunks is
+ * fixed at compile time and is determined by NCHUNKS_ORDER below.
+ *
+ * z3fold doesn't export any API and is meant to be used via zpool API.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/atomic.h>
+#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/page-flags.h>
+#include <linux/migrate.h>
+#include <linux/node.h>
+#include <linux/compaction.h>
+#include <linux/percpu.h>
+#include <linux/mount.h>
+#include <linux/pseudo_fs.h>
+#include <linux/fs.h>
+#include <linux/preempt.h>
+#include <linux/workqueue.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/zpool.h>
+#include <linux/magic.h>
+#include <linux/kmemleak.h>
+
+/*
+ * NCHUNKS_ORDER determines the internal allocation granularity, effectively
+ * adjusting internal fragmentation. It also determines the number of
+ * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
+ * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks
+ * in the beginning of an allocated page are occupied by z3fold header, so
+ * NCHUNKS will be calculated to 63 (or 62 in case CONFIG_DEBUG_SPINLOCK=y),
+ * which shows the max number of free chunks in z3fold page, also there will
+ * be 63, or 62, respectively, freelists per pool.
+ */
+#define NCHUNKS_ORDER 6
+
+#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER)
+#define CHUNK_SIZE (1 << CHUNK_SHIFT)
+#define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE)
+#define ZHDR_CHUNKS (ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT)
+#define TOTAL_CHUNKS (PAGE_SIZE >> CHUNK_SHIFT)
+#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
+
+#define BUDDY_MASK (0x3)
+#define BUDDY_SHIFT 2
+#define SLOTS_ALIGN (0x40)
+
+/*****************
+ * Structures
+*****************/
+struct z3fold_pool;
+struct z3fold_ops {
+ int (*evict)(struct z3fold_pool *pool, unsigned long handle);
+};
+
+enum buddy {
+ HEADLESS = 0,
+ FIRST,
+ MIDDLE,
+ LAST,
+ BUDDIES_MAX = LAST
+};
+
+struct z3fold_buddy_slots {
+ /*
+ * we are using BUDDY_MASK in handle_to_buddy etc. so there should
+ * be enough slots to hold all possible variants
+ */
+ unsigned long slot[BUDDY_MASK + 1];
+ unsigned long pool; /* back link */
+ rwlock_t lock;
+};
+#define HANDLE_FLAG_MASK (0x03)
+
+/*
+ * struct z3fold_header - z3fold page metadata occupying first chunks of each
+ * z3fold page, except for HEADLESS pages
+ * @buddy: links the z3fold page into the relevant list in the
+ * pool
+ * @page_lock: per-page lock
+ * @refcount: reference count for the z3fold page
+ * @work: work_struct for page layout optimization
+ * @slots: pointer to the structure holding buddy slots
+ * @pool: pointer to the containing pool
+ * @cpu: CPU which this page "belongs" to
+ * @first_chunks: the size of the first buddy in chunks, 0 if free
+ * @middle_chunks: the size of the middle buddy in chunks, 0 if free
+ * @last_chunks: the size of the last buddy in chunks, 0 if free
+ * @first_num: the starting number (for the first handle)
+ * @mapped_count: the number of objects currently mapped
+ */
+struct z3fold_header {
+ struct list_head buddy;
+ spinlock_t page_lock;
+ struct kref refcount;
+ struct work_struct work;
+ struct z3fold_buddy_slots *slots;
+ struct z3fold_pool *pool;
+ short cpu;
+ unsigned short first_chunks;
+ unsigned short middle_chunks;
+ unsigned short last_chunks;
+ unsigned short start_middle;
+ unsigned short first_num:2;
+ unsigned short mapped_count:2;
+ unsigned short foreign_handles:2;
+};
+
+/**
+ * struct z3fold_pool - stores metadata for each z3fold pool
+ * @name: pool name
+ * @lock: protects pool unbuddied/lru lists
+ * @stale_lock: protects pool stale page list
+ * @unbuddied: per-cpu array of lists tracking z3fold pages that contain 2-
+ * buddies; the list each z3fold page is added to depends on
+ * the size of its free region.
+ * @lru: list tracking the z3fold pages in LRU order by most recently
+ * added buddy.
+ * @stale: list of pages marked for freeing
+ * @pages_nr: number of z3fold pages in the pool.
+ * @c_handle: cache for z3fold_buddy_slots allocation
+ * @ops: pointer to a structure of user defined operations specified at
+ * pool creation time.
+ * @compact_wq: workqueue for page layout background optimization
+ * @release_wq: workqueue for safe page release
+ * @work: work_struct for safe page release
+ * @inode: inode for z3fold pseudo filesystem
+ *
+ * This structure is allocated at pool creation time and maintains metadata
+ * pertaining to a particular z3fold pool.
+ */
+struct z3fold_pool {
+ const char *name;
+ spinlock_t lock;
+ spinlock_t stale_lock;
+ struct list_head *unbuddied;
+ struct list_head lru;
+ struct list_head stale;
+ atomic64_t pages_nr;
+ struct kmem_cache *c_handle;
+ const struct z3fold_ops *ops;
+ struct zpool *zpool;
+ const struct zpool_ops *zpool_ops;
+ struct workqueue_struct *compact_wq;
+ struct workqueue_struct *release_wq;
+ struct work_struct work;
+ struct inode *inode;
+};
+
+/*
+ * Internal z3fold page flags
+ */
+enum z3fold_page_flags {
+ PAGE_HEADLESS = 0,
+ MIDDLE_CHUNK_MAPPED,
+ NEEDS_COMPACTING,
+ PAGE_STALE,
+ PAGE_CLAIMED, /* by either reclaim or free */
+};
+
+/*
+ * handle flags, go under HANDLE_FLAG_MASK
+ */
+enum z3fold_handle_flags {
+ HANDLES_NOFREE = 0,
+};
+
+/*
+ * Forward declarations
+ */
+static struct z3fold_header *__z3fold_alloc(struct z3fold_pool *, size_t, bool);
+static void compact_page_work(struct work_struct *w);
+
+/*****************
+ * Helpers
+*****************/
+
+/* Converts an allocation size in bytes to size in z3fold chunks */
+static int size_to_chunks(size_t size)
+{
+ return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
+}
+
+#define for_each_unbuddied_list(_iter, _begin) \
+ for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
+
+static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool,
+ gfp_t gfp)
+{
+ struct z3fold_buddy_slots *slots;
+
+ slots = kmem_cache_zalloc(pool->c_handle,
+ (gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE)));
+
+ if (slots) {
+ /* It will be freed separately in free_handle(). */
+ kmemleak_not_leak(slots);
+ slots->pool = (unsigned long)pool;
+ rwlock_init(&slots->lock);
+ }
+
+ return slots;
+}
+
+static inline struct z3fold_pool *slots_to_pool(struct z3fold_buddy_slots *s)
+{
+ return (struct z3fold_pool *)(s->pool & ~HANDLE_FLAG_MASK);
+}
+
+static inline struct z3fold_buddy_slots *handle_to_slots(unsigned long handle)
+{
+ return (struct z3fold_buddy_slots *)(handle & ~(SLOTS_ALIGN - 1));
+}
+
+/* Lock a z3fold page */
+static inline void z3fold_page_lock(struct z3fold_header *zhdr)
+{
+ spin_lock(&zhdr->page_lock);
+}
+
+/* Try to lock a z3fold page */
+static inline int z3fold_page_trylock(struct z3fold_header *zhdr)
+{
+ return spin_trylock(&zhdr->page_lock);
+}
+
+/* Unlock a z3fold page */
+static inline void z3fold_page_unlock(struct z3fold_header *zhdr)
+{
+ spin_unlock(&zhdr->page_lock);
+}
+
+
+static inline struct z3fold_header *__get_z3fold_header(unsigned long handle,
+ bool lock)
+{
+ struct z3fold_buddy_slots *slots;
+ struct z3fold_header *zhdr;
+ int locked = 0;
+
+ if (!(handle & (1 << PAGE_HEADLESS))) {
+ slots = handle_to_slots(handle);
+ do {
+ unsigned long addr;
+
+ read_lock(&slots->lock);
+ addr = *(unsigned long *)handle;
+ zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
+ if (lock)
+ locked = z3fold_page_trylock(zhdr);
+ read_unlock(&slots->lock);
+ if (locked)
+ break;
+ cpu_relax();
+ } while (lock);
+ } else {
+ zhdr = (struct z3fold_header *)(handle & PAGE_MASK);
+ }
+
+ return zhdr;
+}
+
+/* Returns the z3fold page where a given handle is stored */
+static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h)
+{
+ return __get_z3fold_header(h, false);
+}
+
+/* return locked z3fold page if it's not headless */
+static inline struct z3fold_header *get_z3fold_header(unsigned long h)
+{
+ return __get_z3fold_header(h, true);
+}
+
+static inline void put_z3fold_header(struct z3fold_header *zhdr)
+{
+ struct page *page = virt_to_page(zhdr);
+
+ if (!test_bit(PAGE_HEADLESS, &page->private))
+ z3fold_page_unlock(zhdr);
+}
+
+static inline void free_handle(unsigned long handle, struct z3fold_header *zhdr)
+{
+ struct z3fold_buddy_slots *slots;
+ int i;
+ bool is_free;
+
+ if (handle & (1 << PAGE_HEADLESS))
+ return;
+
+ if (WARN_ON(*(unsigned long *)handle == 0))
+ return;
+
+ slots = handle_to_slots(handle);
+ write_lock(&slots->lock);
+ *(unsigned long *)handle = 0;
+
+ if (test_bit(HANDLES_NOFREE, &slots->pool)) {
+ write_unlock(&slots->lock);
+ return; /* simple case, nothing else to do */
+ }
+
+ if (zhdr->slots != slots)
+ zhdr->foreign_handles--;
+
+ is_free = true;
+ for (i = 0; i <= BUDDY_MASK; i++) {
+ if (slots->slot[i]) {
+ is_free = false;
+ break;
+ }
+ }
+ write_unlock(&slots->lock);
+
+ if (is_free) {
+ struct z3fold_pool *pool = slots_to_pool(slots);
+
+ if (zhdr->slots == slots)
+ zhdr->slots = NULL;
+ kmem_cache_free(pool->c_handle, slots);
+ }
+}
+
+static int z3fold_init_fs_context(struct fs_context *fc)
+{
+ return init_pseudo(fc, Z3FOLD_MAGIC) ? 0 : -ENOMEM;
+}
+
+static struct file_system_type z3fold_fs = {
+ .name = "z3fold",
+ .init_fs_context = z3fold_init_fs_context,
+ .kill_sb = kill_anon_super,
+};
+
+static struct vfsmount *z3fold_mnt;
+static int z3fold_mount(void)
+{
+ int ret = 0;
+
+ z3fold_mnt = kern_mount(&z3fold_fs);
+ if (IS_ERR(z3fold_mnt))
+ ret = PTR_ERR(z3fold_mnt);
+
+ return ret;
+}
+
+static void z3fold_unmount(void)
+{
+ kern_unmount(z3fold_mnt);
+}
+
+static const struct address_space_operations z3fold_aops;
+static int z3fold_register_migration(struct z3fold_pool *pool)
+{
+ pool->inode = alloc_anon_inode(z3fold_mnt->mnt_sb);
+ if (IS_ERR(pool->inode)) {
+ pool->inode = NULL;
+ return 1;
+ }
+
+ pool->inode->i_mapping->private_data = pool;
+ pool->inode->i_mapping->a_ops = &z3fold_aops;
+ return 0;
+}
+
+static void z3fold_unregister_migration(struct z3fold_pool *pool)
+{
+ if (pool->inode)
+ iput(pool->inode);
+ }
+
+/* Initializes the z3fold header of a newly allocated z3fold page */
+static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
+ struct z3fold_pool *pool, gfp_t gfp)
+{
+ struct z3fold_header *zhdr = page_address(page);
+ struct z3fold_buddy_slots *slots;
+
+ INIT_LIST_HEAD(&page->lru);
+ clear_bit(PAGE_HEADLESS, &page->private);
+ clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
+ clear_bit(NEEDS_COMPACTING, &page->private);
+ clear_bit(PAGE_STALE, &page->private);
+ clear_bit(PAGE_CLAIMED, &page->private);
+ if (headless)
+ return zhdr;
+
+ slots = alloc_slots(pool, gfp);
+ if (!slots)
+ return NULL;
+
+ spin_lock_init(&zhdr->page_lock);
+ kref_init(&zhdr->refcount);
+ zhdr->first_chunks = 0;
+ zhdr->middle_chunks = 0;
+ zhdr->last_chunks = 0;
+ zhdr->first_num = 0;
+ zhdr->start_middle = 0;
+ zhdr->cpu = -1;
+ zhdr->foreign_handles = 0;
+ zhdr->mapped_count = 0;
+ zhdr->slots = slots;
+ zhdr->pool = pool;
+ INIT_LIST_HEAD(&zhdr->buddy);
+ INIT_WORK(&zhdr->work, compact_page_work);
+ return zhdr;
+}
+
+/* Resets the struct page fields and frees the page */
+static void free_z3fold_page(struct page *page, bool headless)
+{
+ if (!headless) {
+ lock_page(page);
+ __ClearPageMovable(page);
+ unlock_page(page);
+ }
+ ClearPagePrivate(page);
+ __free_page(page);
+}
+
+/* Helper function to build the index */
+static inline int __idx(struct z3fold_header *zhdr, enum buddy bud)
+{
+ return (bud + zhdr->first_num) & BUDDY_MASK;
+}
+
+/*
+ * Encodes the handle of a particular buddy within a z3fold page
+ * Pool lock should be held as this function accesses first_num
+ */
+static unsigned long __encode_handle(struct z3fold_header *zhdr,
+ struct z3fold_buddy_slots *slots,
+ enum buddy bud)
+{
+ unsigned long h = (unsigned long)zhdr;
+ int idx = 0;
+
+ /*
+ * For a headless page, its handle is its pointer with the extra
+ * PAGE_HEADLESS bit set
+ */
+ if (bud == HEADLESS)
+ return h | (1 << PAGE_HEADLESS);
+
+ /* otherwise, return pointer to encoded handle */
+ idx = __idx(zhdr, bud);
+ h += idx;
+ if (bud == LAST)
+ h |= (zhdr->last_chunks << BUDDY_SHIFT);
+
+ write_lock(&slots->lock);
+ slots->slot[idx] = h;
+ write_unlock(&slots->lock);
+ return (unsigned long)&slots->slot[idx];
+}
+
+static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
+{
+ return __encode_handle(zhdr, zhdr->slots, bud);
+}
+
+/* only for LAST bud, returns zero otherwise */
+static unsigned short handle_to_chunks(unsigned long handle)
+{
+ struct z3fold_buddy_slots *slots = handle_to_slots(handle);
+ unsigned long addr;
+
+ read_lock(&slots->lock);
+ addr = *(unsigned long *)handle;
+ read_unlock(&slots->lock);
+ return (addr & ~PAGE_MASK) >> BUDDY_SHIFT;
+}
+
+/*
+ * (handle & BUDDY_MASK) < zhdr->first_num is possible in encode_handle
+ * but that doesn't matter. because the masking will result in the
+ * correct buddy number.
+ */
+static enum buddy handle_to_buddy(unsigned long handle)
+{
+ struct z3fold_header *zhdr;
+ struct z3fold_buddy_slots *slots = handle_to_slots(handle);
+ unsigned long addr;
+
+ read_lock(&slots->lock);
+ WARN_ON(handle & (1 << PAGE_HEADLESS));
+ addr = *(unsigned long *)handle;
+ read_unlock(&slots->lock);
+ zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
+ return (addr - zhdr->first_num) & BUDDY_MASK;
+}
+
+static inline struct z3fold_pool *zhdr_to_pool(struct z3fold_header *zhdr)
+{
+ return zhdr->pool;
+}
+
+static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
+{
+ struct page *page = virt_to_page(zhdr);
+ struct z3fold_pool *pool = zhdr_to_pool(zhdr);
+
+ WARN_ON(!list_empty(&zhdr->buddy));
+ set_bit(PAGE_STALE, &page->private);
+ clear_bit(NEEDS_COMPACTING, &page->private);
+ spin_lock(&pool->lock);
+ if (!list_empty(&page->lru))
+ list_del_init(&page->lru);
+ spin_unlock(&pool->lock);
+
+ if (locked)
+ z3fold_page_unlock(zhdr);
+
+ spin_lock(&pool->stale_lock);
+ list_add(&zhdr->buddy, &pool->stale);
+ queue_work(pool->release_wq, &pool->work);
+ spin_unlock(&pool->stale_lock);
+}
+
+static void __attribute__((__unused__))
+ release_z3fold_page(struct kref *ref)
+{
+ struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
+ refcount);
+ __release_z3fold_page(zhdr, false);
+}
+
+static void release_z3fold_page_locked(struct kref *ref)
+{
+ struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
+ refcount);
+ WARN_ON(z3fold_page_trylock(zhdr));
+ __release_z3fold_page(zhdr, true);
+}
+
+static void release_z3fold_page_locked_list(struct kref *ref)
+{
+ struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
+ refcount);
+ struct z3fold_pool *pool = zhdr_to_pool(zhdr);
+
+ spin_lock(&pool->lock);
+ list_del_init(&zhdr->buddy);
+ spin_unlock(&pool->lock);
+
+ WARN_ON(z3fold_page_trylock(zhdr));
+ __release_z3fold_page(zhdr, true);
+}
+
+static void free_pages_work(struct work_struct *w)
+{
+ struct z3fold_pool *pool = container_of(w, struct z3fold_pool, work);
+
+ spin_lock(&pool->stale_lock);
+ while (!list_empty(&pool->stale)) {
+ struct z3fold_header *zhdr = list_first_entry(&pool->stale,
+ struct z3fold_header, buddy);
+ struct page *page = virt_to_page(zhdr);
+
+ list_del(&zhdr->buddy);
+ if (WARN_ON(!test_bit(PAGE_STALE, &page->private)))
+ continue;
+ spin_unlock(&pool->stale_lock);
+ cancel_work_sync(&zhdr->work);
+ free_z3fold_page(page, false);
+ cond_resched();
+ spin_lock(&pool->stale_lock);
+ }
+ spin_unlock(&pool->stale_lock);
+}
+
+/*
+ * Returns the number of free chunks in a z3fold page.
+ * NB: can't be used with HEADLESS pages.
+ */
+static int num_free_chunks(struct z3fold_header *zhdr)
+{
+ int nfree;
+ /*
+ * If there is a middle object, pick up the bigger free space
+ * either before or after it. Otherwise just subtract the number
+ * of chunks occupied by the first and the last objects.
+ */
+ if (zhdr->middle_chunks != 0) {
+ int nfree_before = zhdr->first_chunks ?
+ 0 : zhdr->start_middle - ZHDR_CHUNKS;
+ int nfree_after = zhdr->last_chunks ?
+ 0 : TOTAL_CHUNKS -
+ (zhdr->start_middle + zhdr->middle_chunks);
+ nfree = max(nfree_before, nfree_after);
+ } else
+ nfree = NCHUNKS - zhdr->first_chunks - zhdr->last_chunks;
+ return nfree;
+}
+
+/* Add to the appropriate unbuddied list */
+static inline void add_to_unbuddied(struct z3fold_pool *pool,
+ struct z3fold_header *zhdr)
+{
+ if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
+ zhdr->middle_chunks == 0) {
+ struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied);
+
+ int freechunks = num_free_chunks(zhdr);
+ spin_lock(&pool->lock);
+ list_add(&zhdr->buddy, &unbuddied[freechunks]);
+ spin_unlock(&pool->lock);
+ zhdr->cpu = smp_processor_id();
+ put_cpu_ptr(pool->unbuddied);
+ }
+}
+
+static inline enum buddy get_free_buddy(struct z3fold_header *zhdr, int chunks)
+{
+ enum buddy bud = HEADLESS;
+
+ if (zhdr->middle_chunks) {
+ if (!zhdr->first_chunks &&
+ chunks <= zhdr->start_middle - ZHDR_CHUNKS)
+ bud = FIRST;
+ else if (!zhdr->last_chunks)
+ bud = LAST;
+ } else {
+ if (!zhdr->first_chunks)
+ bud = FIRST;
+ else if (!zhdr->last_chunks)
+ bud = LAST;
+ else
+ bud = MIDDLE;
+ }
+
+ return bud;
+}
+
+static inline void *mchunk_memmove(struct z3fold_header *zhdr,
+ unsigned short dst_chunk)
+{
+ void *beg = zhdr;
+ return memmove(beg + (dst_chunk << CHUNK_SHIFT),
+ beg + (zhdr->start_middle << CHUNK_SHIFT),
+ zhdr->middle_chunks << CHUNK_SHIFT);
+}
+
+static inline bool buddy_single(struct z3fold_header *zhdr)
+{
+ return !((zhdr->first_chunks && zhdr->middle_chunks) ||
+ (zhdr->first_chunks && zhdr->last_chunks) ||
+ (zhdr->middle_chunks && zhdr->last_chunks));
+}
+
+static struct z3fold_header *compact_single_buddy(struct z3fold_header *zhdr)
+{
+ struct z3fold_pool *pool = zhdr_to_pool(zhdr);
+ void *p = zhdr;
+ unsigned long old_handle = 0;
+ size_t sz = 0;
+ struct z3fold_header *new_zhdr = NULL;
+ int first_idx = __idx(zhdr, FIRST);
+ int middle_idx = __idx(zhdr, MIDDLE);
+ int last_idx = __idx(zhdr, LAST);
+ unsigned short *moved_chunks = NULL;
+
+ /*
+ * No need to protect slots here -- all the slots are "local" and
+ * the page lock is already taken
+ */
+ if (zhdr->first_chunks && zhdr->slots->slot[first_idx]) {
+ p += ZHDR_SIZE_ALIGNED;
+ sz = zhdr->first_chunks << CHUNK_SHIFT;
+ old_handle = (unsigned long)&zhdr->slots->slot[first_idx];
+ moved_chunks = &zhdr->first_chunks;
+ } else if (zhdr->middle_chunks && zhdr->slots->slot[middle_idx]) {
+ p += zhdr->start_middle << CHUNK_SHIFT;
+ sz = zhdr->middle_chunks << CHUNK_SHIFT;
+ old_handle = (unsigned long)&zhdr->slots->slot[middle_idx];
+ moved_chunks = &zhdr->middle_chunks;
+ } else if (zhdr->last_chunks && zhdr->slots->slot[last_idx]) {
+ p += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT);
+ sz = zhdr->last_chunks << CHUNK_SHIFT;
+ old_handle = (unsigned long)&zhdr->slots->slot[last_idx];
+ moved_chunks = &zhdr->last_chunks;
+ }
+
+ if (sz > 0) {
+ enum buddy new_bud = HEADLESS;
+ short chunks = size_to_chunks(sz);
+ void *q;
+
+ new_zhdr = __z3fold_alloc(pool, sz, false);
+ if (!new_zhdr)
+ return NULL;
+
+ if (WARN_ON(new_zhdr == zhdr))
+ goto out_fail;
+
+ new_bud = get_free_buddy(new_zhdr, chunks);
+ q = new_zhdr;
+ switch (new_bud) {
+ case FIRST:
+ new_zhdr->first_chunks = chunks;
+ q += ZHDR_SIZE_ALIGNED;
+ break;
+ case MIDDLE:
+ new_zhdr->middle_chunks = chunks;
+ new_zhdr->start_middle =
+ new_zhdr->first_chunks + ZHDR_CHUNKS;
+ q += new_zhdr->start_middle << CHUNK_SHIFT;
+ break;
+ case LAST:
+ new_zhdr->last_chunks = chunks;
+ q += PAGE_SIZE - (new_zhdr->last_chunks << CHUNK_SHIFT);
+ break;
+ default:
+ goto out_fail;
+ }
+ new_zhdr->foreign_handles++;
+ memcpy(q, p, sz);
+ write_lock(&zhdr->slots->lock);
+ *(unsigned long *)old_handle = (unsigned long)new_zhdr +
+ __idx(new_zhdr, new_bud);
+ if (new_bud == LAST)
+ *(unsigned long *)old_handle |=
+ (new_zhdr->last_chunks << BUDDY_SHIFT);
+ write_unlock(&zhdr->slots->lock);
+ add_to_unbuddied(pool, new_zhdr);
+ z3fold_page_unlock(new_zhdr);
+
+ *moved_chunks = 0;
+ }
+
+ return new_zhdr;
+
+out_fail:
+ if (new_zhdr) {
+ if (kref_put(&new_zhdr->refcount, release_z3fold_page_locked))
+ atomic64_dec(&pool->pages_nr);
+ else {
+ add_to_unbuddied(pool, new_zhdr);
+ z3fold_page_unlock(new_zhdr);
+ }
+ }
+ return NULL;
+
+}
+
+#define BIG_CHUNK_GAP 3
+/* Has to be called with lock held */
+static int z3fold_compact_page(struct z3fold_header *zhdr)
+{
+ struct page *page = virt_to_page(zhdr);
+
+ if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private))
+ return 0; /* can't move middle chunk, it's used */
+
+ if (unlikely(PageIsolated(page)))
+ return 0;
+
+ if (zhdr->middle_chunks == 0)
+ return 0; /* nothing to compact */
+
+ if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
+ /* move to the beginning */
+ mchunk_memmove(zhdr, ZHDR_CHUNKS);
+ zhdr->first_chunks = zhdr->middle_chunks;
+ zhdr->middle_chunks = 0;
+ zhdr->start_middle = 0;
+ zhdr->first_num++;
+ return 1;
+ }
+
+ /*
+ * moving data is expensive, so let's only do that if
+ * there's substantial gain (at least BIG_CHUNK_GAP chunks)
+ */
+ if (zhdr->first_chunks != 0 && zhdr->last_chunks == 0 &&
+ zhdr->start_middle - (zhdr->first_chunks + ZHDR_CHUNKS) >=
+ BIG_CHUNK_GAP) {
+ mchunk_memmove(zhdr, zhdr->first_chunks + ZHDR_CHUNKS);
+ zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
+ return 1;
+ } else if (zhdr->last_chunks != 0 && zhdr->first_chunks == 0 &&
+ TOTAL_CHUNKS - (zhdr->last_chunks + zhdr->start_middle
+ + zhdr->middle_chunks) >=
+ BIG_CHUNK_GAP) {
+ unsigned short new_start = TOTAL_CHUNKS - zhdr->last_chunks -
+ zhdr->middle_chunks;
+ mchunk_memmove(zhdr, new_start);
+ zhdr->start_middle = new_start;
+ return 1;
+ }
+
+ return 0;
+}
+
+static void do_compact_page(struct z3fold_header *zhdr, bool locked)
+{
+ struct z3fold_pool *pool = zhdr_to_pool(zhdr);
+ struct page *page;
+
+ page = virt_to_page(zhdr);
+ if (locked)
+ WARN_ON(z3fold_page_trylock(zhdr));
+ else
+ z3fold_page_lock(zhdr);
+ if (WARN_ON(!test_and_clear_bit(NEEDS_COMPACTING, &page->private))) {
+ z3fold_page_unlock(zhdr);
+ return;
+ }
+ spin_lock(&pool->lock);
+ list_del_init(&zhdr->buddy);
+ spin_unlock(&pool->lock);
+
+ if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
+ atomic64_dec(&pool->pages_nr);
+ return;
+ }
+
+ if (test_bit(PAGE_STALE, &page->private) ||
+ test_and_set_bit(PAGE_CLAIMED, &page->private)) {
+ z3fold_page_unlock(zhdr);
+ return;
+ }
+
+ if (!zhdr->foreign_handles && buddy_single(zhdr) &&
+ zhdr->mapped_count == 0 && compact_single_buddy(zhdr)) {
+ if (kref_put(&zhdr->refcount, release_z3fold_page_locked))
+ atomic64_dec(&pool->pages_nr);
+ else {
+ clear_bit(PAGE_CLAIMED, &page->private);
+ z3fold_page_unlock(zhdr);
+ }
+ return;
+ }
+
+ z3fold_compact_page(zhdr);
+ add_to_unbuddied(pool, zhdr);
+ clear_bit(PAGE_CLAIMED, &page->private);
+ z3fold_page_unlock(zhdr);
+}
+
+static void compact_page_work(struct work_struct *w)
+{
+ struct z3fold_header *zhdr = container_of(w, struct z3fold_header,
+ work);
+
+ do_compact_page(zhdr, false);
+}
+
+/* returns _locked_ z3fold page header or NULL */
+static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
+ size_t size, bool can_sleep)
+{
+ struct z3fold_header *zhdr = NULL;
+ struct page *page;
+ struct list_head *unbuddied;
+ int chunks = size_to_chunks(size), i;
+
+lookup:
+ /* First, try to find an unbuddied z3fold page. */
+ unbuddied = get_cpu_ptr(pool->unbuddied);
+ for_each_unbuddied_list(i, chunks) {
+ struct list_head *l = &unbuddied[i];
+
+ zhdr = list_first_entry_or_null(READ_ONCE(l),
+ struct z3fold_header, buddy);
+
+ if (!zhdr)
+ continue;
+
+ /* Re-check under lock. */
+ spin_lock(&pool->lock);
+ l = &unbuddied[i];
+ if (unlikely(zhdr != list_first_entry(READ_ONCE(l),
+ struct z3fold_header, buddy)) ||
+ !z3fold_page_trylock(zhdr)) {
+ spin_unlock(&pool->lock);
+ zhdr = NULL;
+ put_cpu_ptr(pool->unbuddied);
+ if (can_sleep)
+ cond_resched();
+ goto lookup;
+ }
+ list_del_init(&zhdr->buddy);
+ zhdr->cpu = -1;
+ spin_unlock(&pool->lock);
+
+ page = virt_to_page(zhdr);
+ if (test_bit(NEEDS_COMPACTING, &page->private) ||
+ test_bit(PAGE_CLAIMED, &page->private)) {
+ z3fold_page_unlock(zhdr);
+ zhdr = NULL;
+ put_cpu_ptr(pool->unbuddied);
+ if (can_sleep)
+ cond_resched();
+ goto lookup;
+ }
+
+ /*
+ * this page could not be removed from its unbuddied
+ * list while pool lock was held, and then we've taken
+ * page lock so kref_put could not be called before
+ * we got here, so it's safe to just call kref_get()
+ */
+ kref_get(&zhdr->refcount);
+ break;
+ }
+ put_cpu_ptr(pool->unbuddied);
+
+ if (!zhdr) {
+ int cpu;
+
+ /* look for _exact_ match on other cpus' lists */
+ for_each_online_cpu(cpu) {
+ struct list_head *l;
+
+ unbuddied = per_cpu_ptr(pool->unbuddied, cpu);
+ spin_lock(&pool->lock);
+ l = &unbuddied[chunks];
+
+ zhdr = list_first_entry_or_null(READ_ONCE(l),
+ struct z3fold_header, buddy);
+
+ if (!zhdr || !z3fold_page_trylock(zhdr)) {
+ spin_unlock(&pool->lock);
+ zhdr = NULL;
+ continue;
+ }
+ list_del_init(&zhdr->buddy);
+ zhdr->cpu = -1;
+ spin_unlock(&pool->lock);
+
+ page = virt_to_page(zhdr);
+ if (test_bit(NEEDS_COMPACTING, &page->private) ||
+ test_bit(PAGE_CLAIMED, &page->private)) {
+ z3fold_page_unlock(zhdr);
+ zhdr = NULL;
+ if (can_sleep)
+ cond_resched();
+ continue;
+ }
+ kref_get(&zhdr->refcount);
+ break;
+ }
+ }
+
+ if (zhdr && !zhdr->slots)
+ zhdr->slots = alloc_slots(pool,
+ can_sleep ? GFP_NOIO : GFP_ATOMIC);
+ return zhdr;
+}
+
+/*
+ * API Functions
+ */
+
+/**
+ * z3fold_create_pool() - create a new z3fold pool
+ * @name: pool name
+ * @gfp: gfp flags when allocating the z3fold pool structure
+ * @ops: user-defined operations for the z3fold pool
+ *
+ * Return: pointer to the new z3fold pool or NULL if the metadata allocation
+ * failed.
+ */
+static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
+ const struct z3fold_ops *ops)
+{
+ struct z3fold_pool *pool = NULL;
+ int i, cpu;
+
+ pool = kzalloc(sizeof(struct z3fold_pool), gfp);
+ if (!pool)
+ goto out;
+ pool->c_handle = kmem_cache_create("z3fold_handle",
+ sizeof(struct z3fold_buddy_slots),
+ SLOTS_ALIGN, 0, NULL);
+ if (!pool->c_handle)
+ goto out_c;
+ spin_lock_init(&pool->lock);
+ spin_lock_init(&pool->stale_lock);
+ pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2);
+ if (!pool->unbuddied)
+ goto out_pool;
+ for_each_possible_cpu(cpu) {
+ struct list_head *unbuddied =
+ per_cpu_ptr(pool->unbuddied, cpu);
+ for_each_unbuddied_list(i, 0)
+ INIT_LIST_HEAD(&unbuddied[i]);
+ }
+ INIT_LIST_HEAD(&pool->lru);
+ INIT_LIST_HEAD(&pool->stale);
+ atomic64_set(&pool->pages_nr, 0);
+ pool->name = name;
+ pool->compact_wq = create_singlethread_workqueue(pool->name);
+ if (!pool->compact_wq)
+ goto out_unbuddied;
+ pool->release_wq = create_singlethread_workqueue(pool->name);
+ if (!pool->release_wq)
+ goto out_wq;
+ if (z3fold_register_migration(pool))
+ goto out_rwq;
+ INIT_WORK(&pool->work, free_pages_work);
+ pool->ops = ops;
+ return pool;
+
+out_rwq:
+ destroy_workqueue(pool->release_wq);
+out_wq:
+ destroy_workqueue(pool->compact_wq);
+out_unbuddied:
+ free_percpu(pool->unbuddied);
+out_pool:
+ kmem_cache_destroy(pool->c_handle);
+out_c:
+ kfree(pool);
+out:
+ return NULL;
+}
+
+/**
+ * z3fold_destroy_pool() - destroys an existing z3fold pool
+ * @pool: the z3fold pool to be destroyed
+ *
+ * The pool should be emptied before this function is called.
+ */
+static void z3fold_destroy_pool(struct z3fold_pool *pool)
+{
+ kmem_cache_destroy(pool->c_handle);
+
+ /*
+ * We need to destroy pool->compact_wq before pool->release_wq,
+ * as any pending work on pool->compact_wq will call
+ * queue_work(pool->release_wq, &pool->work).
+ *
+ * There are still outstanding pages until both workqueues are drained,
+ * so we cannot unregister migration until then.
+ */
+
+ destroy_workqueue(pool->compact_wq);
+ destroy_workqueue(pool->release_wq);
+ z3fold_unregister_migration(pool);
+ free_percpu(pool->unbuddied);
+ kfree(pool);
+}
+
+/**
+ * z3fold_alloc() - allocates a region of a given size
+ * @pool: z3fold pool from which to allocate
+ * @size: size in bytes of the desired allocation
+ * @gfp: gfp flags used if the pool needs to grow
+ * @handle: handle of the new allocation
+ *
+ * This function will attempt to find a free region in the pool large enough to
+ * satisfy the allocation request. A search of the unbuddied lists is
+ * performed first. If no suitable free region is found, then a new page is
+ * allocated and added to the pool to satisfy the request.
+ *
+ * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
+ * as z3fold pool pages.
+ *
+ * Return: 0 if success and handle is set, otherwise -EINVAL if the size or
+ * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
+ * a new page.
+ */
+static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
+ unsigned long *handle)
+{
+ int chunks = size_to_chunks(size);
+ struct z3fold_header *zhdr = NULL;
+ struct page *page = NULL;
+ enum buddy bud;
+ bool can_sleep = gfpflags_allow_blocking(gfp);
+
+ if (!size)
+ return -EINVAL;
+
+ if (size > PAGE_SIZE)
+ return -ENOSPC;
+
+ if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
+ bud = HEADLESS;
+ else {
+retry:
+ zhdr = __z3fold_alloc(pool, size, can_sleep);
+ if (zhdr) {
+ bud = get_free_buddy(zhdr, chunks);
+ if (bud == HEADLESS) {
+ if (kref_put(&zhdr->refcount,
+ release_z3fold_page_locked))
+ atomic64_dec(&pool->pages_nr);
+ else
+ z3fold_page_unlock(zhdr);
+ pr_err("No free chunks in unbuddied\n");
+ WARN_ON(1);
+ goto retry;
+ }
+ page = virt_to_page(zhdr);
+ goto found;
+ }
+ bud = FIRST;
+ }
+
+ page = NULL;
+ if (can_sleep) {
+ spin_lock(&pool->stale_lock);
+ zhdr = list_first_entry_or_null(&pool->stale,
+ struct z3fold_header, buddy);
+ /*
+ * Before allocating a page, let's see if we can take one from
+ * the stale pages list. cancel_work_sync() can sleep so we
+ * limit this case to the contexts where we can sleep
+ */
+ if (zhdr) {
+ list_del(&zhdr->buddy);
+ spin_unlock(&pool->stale_lock);
+ cancel_work_sync(&zhdr->work);
+ page = virt_to_page(zhdr);
+ } else {
+ spin_unlock(&pool->stale_lock);
+ }
+ }
+ if (!page)
+ page = alloc_page(gfp);
+
+ if (!page)
+ return -ENOMEM;
+
+ zhdr = init_z3fold_page(page, bud == HEADLESS, pool, gfp);
+ if (!zhdr) {
+ __free_page(page);
+ return -ENOMEM;
+ }
+ atomic64_inc(&pool->pages_nr);
+
+ if (bud == HEADLESS) {
+ set_bit(PAGE_HEADLESS, &page->private);
+ goto headless;
+ }
+ if (can_sleep) {
+ lock_page(page);
+ __SetPageMovable(page, pool->inode->i_mapping);
+ unlock_page(page);
+ } else {
+ if (trylock_page(page)) {
+ __SetPageMovable(page, pool->inode->i_mapping);
+ unlock_page(page);
+ }
+ }
+ z3fold_page_lock(zhdr);
+
+found:
+ if (bud == FIRST)
+ zhdr->first_chunks = chunks;
+ else if (bud == LAST)
+ zhdr->last_chunks = chunks;
+ else {
+ zhdr->middle_chunks = chunks;
+ zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
+ }
+ add_to_unbuddied(pool, zhdr);
+
+headless:
+ spin_lock(&pool->lock);
+ /* Add/move z3fold page to beginning of LRU */
+ if (!list_empty(&page->lru))
+ list_del(&page->lru);
+
+ list_add(&page->lru, &pool->lru);
+
+ *handle = encode_handle(zhdr, bud);
+ spin_unlock(&pool->lock);
+ if (bud != HEADLESS)
+ z3fold_page_unlock(zhdr);
+
+ return 0;
+}
+
+/**
+ * z3fold_free() - frees the allocation associated with the given handle
+ * @pool: pool in which the allocation resided
+ * @handle: handle associated with the allocation returned by z3fold_alloc()
+ *
+ * In the case that the z3fold page in which the allocation resides is under
+ * reclaim, as indicated by the PG_reclaim flag being set, this function
+ * only sets the first|last_chunks to 0. The page is actually freed
+ * once both buddies are evicted (see z3fold_reclaim_page() below).
+ */
+static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
+{
+ struct z3fold_header *zhdr;
+ struct page *page;
+ enum buddy bud;
+ bool page_claimed;
+
+ zhdr = get_z3fold_header(handle);
+ page = virt_to_page(zhdr);
+ page_claimed = test_and_set_bit(PAGE_CLAIMED, &page->private);
+
+ if (test_bit(PAGE_HEADLESS, &page->private)) {
+ /* if a headless page is under reclaim, just leave.
+ * NB: we use test_and_set_bit for a reason: if the bit
+ * has not been set before, we release this page
+ * immediately so we don't care about its value any more.
+ */
+ if (!page_claimed) {
+ spin_lock(&pool->lock);
+ list_del(&page->lru);
+ spin_unlock(&pool->lock);
+ put_z3fold_header(zhdr);
+ free_z3fold_page(page, true);
+ atomic64_dec(&pool->pages_nr);
+ }
+ return;
+ }
+
+ /* Non-headless case */
+ bud = handle_to_buddy(handle);
+
+ switch (bud) {
+ case FIRST:
+ zhdr->first_chunks = 0;
+ break;
+ case MIDDLE:
+ zhdr->middle_chunks = 0;
+ break;
+ case LAST:
+ zhdr->last_chunks = 0;
+ break;
+ default:
+ pr_err("%s: unknown bud %d\n", __func__, bud);
+ WARN_ON(1);
+ put_z3fold_header(zhdr);
+ return;
+ }
+
+ if (!page_claimed)
+ free_handle(handle, zhdr);
+ if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) {
+ atomic64_dec(&pool->pages_nr);
+ return;
+ }
+ if (page_claimed) {
+ /* the page has not been claimed by us */
+ z3fold_page_unlock(zhdr);
+ return;
+ }
+ if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
+ put_z3fold_header(zhdr);
+ clear_bit(PAGE_CLAIMED, &page->private);
+ return;
+ }
+ if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) {
+ spin_lock(&pool->lock);
+ list_del_init(&zhdr->buddy);
+ spin_unlock(&pool->lock);
+ zhdr->cpu = -1;
+ kref_get(&zhdr->refcount);
+ clear_bit(PAGE_CLAIMED, &page->private);
+ do_compact_page(zhdr, true);
+ return;
+ }
+ kref_get(&zhdr->refcount);
+ clear_bit(PAGE_CLAIMED, &page->private);
+ queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);
+ put_z3fold_header(zhdr);
+}
+
+/**
+ * z3fold_reclaim_page() - evicts allocations from a pool page and frees it
+ * @pool: pool from which a page will attempt to be evicted
+ * @retries: number of pages on the LRU list for which eviction will
+ * be attempted before failing
+ *
+ * z3fold reclaim is different from normal system reclaim in that it is done
+ * from the bottom, up. This is because only the bottom layer, z3fold, has
+ * information on how the allocations are organized within each z3fold page.
+ * This has the potential to create interesting locking situations between
+ * z3fold and the user, however.
+ *
+ * To avoid these, this is how z3fold_reclaim_page() should be called:
+ *
+ * The user detects a page should be reclaimed and calls z3fold_reclaim_page().
+ * z3fold_reclaim_page() will remove a z3fold page from the pool LRU list and
+ * call the user-defined eviction handler with the pool and handle as
+ * arguments.
+ *
+ * If the handle can not be evicted, the eviction handler should return
+ * non-zero. z3fold_reclaim_page() will add the z3fold page back to the
+ * appropriate list and try the next z3fold page on the LRU up to
+ * a user defined number of retries.
+ *
+ * If the handle is successfully evicted, the eviction handler should
+ * return 0 _and_ should have called z3fold_free() on the handle. z3fold_free()
+ * contains logic to delay freeing the page if the page is under reclaim,
+ * as indicated by the setting of the PG_reclaim flag on the underlying page.
+ *
+ * If all buddies in the z3fold page are successfully evicted, then the
+ * z3fold page can be freed.
+ *
+ * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are
+ * no pages to evict or an eviction handler is not registered, -EAGAIN if
+ * the retry limit was hit.
+ */
+static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
+{
+ int i, ret = -1;
+ struct z3fold_header *zhdr = NULL;
+ struct page *page = NULL;
+ struct list_head *pos;
+ unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;
+ struct z3fold_buddy_slots slots __attribute__((aligned(SLOTS_ALIGN)));
+
+ rwlock_init(&slots.lock);
+ slots.pool = (unsigned long)pool | (1 << HANDLES_NOFREE);
+
+ spin_lock(&pool->lock);
+ if (!pool->ops || !pool->ops->evict || retries == 0) {
+ spin_unlock(&pool->lock);
+ return -EINVAL;
+ }
+ for (i = 0; i < retries; i++) {
+ if (list_empty(&pool->lru)) {
+ spin_unlock(&pool->lock);
+ return -EINVAL;
+ }
+ list_for_each_prev(pos, &pool->lru) {
+ page = list_entry(pos, struct page, lru);
+
+ zhdr = page_address(page);
+ if (test_bit(PAGE_HEADLESS, &page->private)) {
+ /*
+ * For non-headless pages, we wait to do this
+ * until we have the page lock to avoid racing
+ * with __z3fold_alloc(). Headless pages don't
+ * have a lock (and __z3fold_alloc() will never
+ * see them), but we still need to test and set
+ * PAGE_CLAIMED to avoid racing with
+ * z3fold_free(), so just do it now before
+ * leaving the loop.
+ */
+ if (test_and_set_bit(PAGE_CLAIMED, &page->private))
+ continue;
+
+ break;
+ }
+
+ if (kref_get_unless_zero(&zhdr->refcount) == 0) {
+ zhdr = NULL;
+ break;
+ }
+ if (!z3fold_page_trylock(zhdr)) {
+ if (kref_put(&zhdr->refcount,
+ release_z3fold_page))
+ atomic64_dec(&pool->pages_nr);
+ zhdr = NULL;
+ continue; /* can't evict at this point */
+ }
+
+ /* test_and_set_bit is of course atomic, but we still
+ * need to do it under page lock, otherwise checking
+ * that bit in __z3fold_alloc wouldn't make sense
+ */
+ if (zhdr->foreign_handles ||
+ test_and_set_bit(PAGE_CLAIMED, &page->private)) {
+ if (kref_put(&zhdr->refcount,
+ release_z3fold_page_locked))
+ atomic64_dec(&pool->pages_nr);
+ else
+ z3fold_page_unlock(zhdr);
+ zhdr = NULL;
+ continue; /* can't evict such page */
+ }
+ list_del_init(&zhdr->buddy);
+ zhdr->cpu = -1;
+ break;
+ }
+
+ if (!zhdr)
+ break;
+
+ list_del_init(&page->lru);
+ spin_unlock(&pool->lock);
+
+ if (!test_bit(PAGE_HEADLESS, &page->private)) {
+ /*
+ * We need encode the handles before unlocking, and
+ * use our local slots structure because z3fold_free
+ * can zero out zhdr->slots and we can't do much
+ * about that
+ */
+ first_handle = 0;
+ last_handle = 0;
+ middle_handle = 0;
+ memset(slots.slot, 0, sizeof(slots.slot));
+ if (zhdr->first_chunks)
+ first_handle = __encode_handle(zhdr, &slots,
+ FIRST);
+ if (zhdr->middle_chunks)
+ middle_handle = __encode_handle(zhdr, &slots,
+ MIDDLE);
+ if (zhdr->last_chunks)
+ last_handle = __encode_handle(zhdr, &slots,
+ LAST);
+ /*
+ * it's safe to unlock here because we hold a
+ * reference to this page
+ */
+ z3fold_page_unlock(zhdr);
+ } else {
+ first_handle = encode_handle(zhdr, HEADLESS);
+ last_handle = middle_handle = 0;
+ }
+ /* Issue the eviction callback(s) */
+ if (middle_handle) {
+ ret = pool->ops->evict(pool, middle_handle);
+ if (ret)
+ goto next;
+ }
+ if (first_handle) {
+ ret = pool->ops->evict(pool, first_handle);
+ if (ret)
+ goto next;
+ }
+ if (last_handle) {
+ ret = pool->ops->evict(pool, last_handle);
+ if (ret)
+ goto next;
+ }
+next:
+ if (test_bit(PAGE_HEADLESS, &page->private)) {
+ if (ret == 0) {
+ free_z3fold_page(page, true);
+ atomic64_dec(&pool->pages_nr);
+ return 0;
+ }
+ spin_lock(&pool->lock);
+ list_add(&page->lru, &pool->lru);
+ spin_unlock(&pool->lock);
+ clear_bit(PAGE_CLAIMED, &page->private);
+ } else {
+ struct z3fold_buddy_slots *slots = zhdr->slots;
+ z3fold_page_lock(zhdr);
+ if (kref_put(&zhdr->refcount,
+ release_z3fold_page_locked)) {
+ kmem_cache_free(pool->c_handle, slots);
+ atomic64_dec(&pool->pages_nr);
+ return 0;
+ }
+ /*
+ * if we are here, the page is still not completely
+ * free. Take the global pool lock then to be able
+ * to add it back to the lru list
+ */
+ spin_lock(&pool->lock);
+ list_add(&page->lru, &pool->lru);
+ spin_unlock(&pool->lock);
+ z3fold_page_unlock(zhdr);
+ clear_bit(PAGE_CLAIMED, &page->private);
+ }
+
+ /* We started off locked to we need to lock the pool back */
+ spin_lock(&pool->lock);
+ }
+ spin_unlock(&pool->lock);
+ return -EAGAIN;
+}
+
+/**
+ * z3fold_map() - maps the allocation associated with the given handle
+ * @pool: pool in which the allocation resides
+ * @handle: handle associated with the allocation to be mapped
+ *
+ * Extracts the buddy number from handle and constructs the pointer to the
+ * correct starting chunk within the page.
+ *
+ * Returns: a pointer to the mapped allocation
+ */
+static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
+{
+ struct z3fold_header *zhdr;
+ struct page *page;
+ void *addr;
+ enum buddy buddy;
+
+ zhdr = get_z3fold_header(handle);
+ addr = zhdr;
+ page = virt_to_page(zhdr);
+
+ if (test_bit(PAGE_HEADLESS, &page->private))
+ goto out;
+
+ buddy = handle_to_buddy(handle);
+ switch (buddy) {
+ case FIRST:
+ addr += ZHDR_SIZE_ALIGNED;
+ break;
+ case MIDDLE:
+ addr += zhdr->start_middle << CHUNK_SHIFT;
+ set_bit(MIDDLE_CHUNK_MAPPED, &page->private);
+ break;
+ case LAST:
+ addr += PAGE_SIZE - (handle_to_chunks(handle) << CHUNK_SHIFT);
+ break;
+ default:
+ pr_err("unknown buddy id %d\n", buddy);
+ WARN_ON(1);
+ addr = NULL;
+ break;
+ }
+
+ if (addr)
+ zhdr->mapped_count++;
+out:
+ put_z3fold_header(zhdr);
+ return addr;
+}
+
+/**
+ * z3fold_unmap() - unmaps the allocation associated with the given handle
+ * @pool: pool in which the allocation resides
+ * @handle: handle associated with the allocation to be unmapped
+ */
+static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle)
+{
+ struct z3fold_header *zhdr;
+ struct page *page;
+ enum buddy buddy;
+
+ zhdr = get_z3fold_header(handle);
+ page = virt_to_page(zhdr);
+
+ if (test_bit(PAGE_HEADLESS, &page->private))
+ return;
+
+ buddy = handle_to_buddy(handle);
+ if (buddy == MIDDLE)
+ clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
+ zhdr->mapped_count--;
+ put_z3fold_header(zhdr);
+}
+
+/**
+ * z3fold_get_pool_size() - gets the z3fold pool size in pages
+ * @pool: pool whose size is being queried
+ *
+ * Returns: size in pages of the given pool.
+ */
+static u64 z3fold_get_pool_size(struct z3fold_pool *pool)
+{
+ return atomic64_read(&pool->pages_nr);
+}
+
+static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
+{
+ struct z3fold_header *zhdr;
+ struct z3fold_pool *pool;
+
+ VM_BUG_ON_PAGE(!PageMovable(page), page);
+ VM_BUG_ON_PAGE(PageIsolated(page), page);
+
+ if (test_bit(PAGE_HEADLESS, &page->private))
+ return false;
+
+ zhdr = page_address(page);
+ z3fold_page_lock(zhdr);
+ if (test_bit(NEEDS_COMPACTING, &page->private) ||
+ test_bit(PAGE_STALE, &page->private))
+ goto out;
+
+ if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0)
+ goto out;
+
+ if (test_and_set_bit(PAGE_CLAIMED, &page->private))
+ goto out;
+ pool = zhdr_to_pool(zhdr);
+ spin_lock(&pool->lock);
+ if (!list_empty(&zhdr->buddy))
+ list_del_init(&zhdr->buddy);
+ if (!list_empty(&page->lru))
+ list_del_init(&page->lru);
+ spin_unlock(&pool->lock);
+
+ kref_get(&zhdr->refcount);
+ z3fold_page_unlock(zhdr);
+ return true;
+
+out:
+ z3fold_page_unlock(zhdr);
+ return false;
+}
+
+static int z3fold_page_migrate(struct address_space *mapping, struct page *newpage,
+ struct page *page, enum migrate_mode mode)
+{
+ struct z3fold_header *zhdr, *new_zhdr;
+ struct z3fold_pool *pool;
+ struct address_space *new_mapping;
+
+ VM_BUG_ON_PAGE(!PageMovable(page), page);
+ VM_BUG_ON_PAGE(!PageIsolated(page), page);
+ VM_BUG_ON_PAGE(!test_bit(PAGE_CLAIMED, &page->private), page);
+ VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
+
+ zhdr = page_address(page);
+ pool = zhdr_to_pool(zhdr);
+
+ if (!z3fold_page_trylock(zhdr))
+ return -EAGAIN;
+ if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) {
+ z3fold_page_unlock(zhdr);
+ clear_bit(PAGE_CLAIMED, &page->private);
+ return -EBUSY;
+ }
+ if (work_pending(&zhdr->work)) {
+ z3fold_page_unlock(zhdr);
+ return -EAGAIN;
+ }
+ new_zhdr = page_address(newpage);
+ memcpy(new_zhdr, zhdr, PAGE_SIZE);
+ newpage->private = page->private;
+ page->private = 0;
+ z3fold_page_unlock(zhdr);
+ spin_lock_init(&new_zhdr->page_lock);
+ INIT_WORK(&new_zhdr->work, compact_page_work);
+ /*
+ * z3fold_page_isolate() ensures that new_zhdr->buddy is empty,
+ * so we only have to reinitialize it.
+ */
+ INIT_LIST_HEAD(&new_zhdr->buddy);
+ new_mapping = page_mapping(page);
+ __ClearPageMovable(page);
+ ClearPagePrivate(page);
+
+ get_page(newpage);
+ z3fold_page_lock(new_zhdr);
+ if (new_zhdr->first_chunks)
+ encode_handle(new_zhdr, FIRST);
+ if (new_zhdr->last_chunks)
+ encode_handle(new_zhdr, LAST);
+ if (new_zhdr->middle_chunks)
+ encode_handle(new_zhdr, MIDDLE);
+ set_bit(NEEDS_COMPACTING, &newpage->private);
+ new_zhdr->cpu = smp_processor_id();
+ spin_lock(&pool->lock);
+ list_add(&newpage->lru, &pool->lru);
+ spin_unlock(&pool->lock);
+ __SetPageMovable(newpage, new_mapping);
+ z3fold_page_unlock(new_zhdr);
+
+ queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);
+
+ page_mapcount_reset(page);
+ clear_bit(PAGE_CLAIMED, &page->private);
+ put_page(page);
+ return 0;
+}
+
+static void z3fold_page_putback(struct page *page)
+{
+ struct z3fold_header *zhdr;
+ struct z3fold_pool *pool;
+
+ zhdr = page_address(page);
+ pool = zhdr_to_pool(zhdr);
+
+ z3fold_page_lock(zhdr);
+ if (!list_empty(&zhdr->buddy))
+ list_del_init(&zhdr->buddy);
+ INIT_LIST_HEAD(&page->lru);
+ if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
+ atomic64_dec(&pool->pages_nr);
+ return;
+ }
+ spin_lock(&pool->lock);
+ list_add(&page->lru, &pool->lru);
+ spin_unlock(&pool->lock);
+ clear_bit(PAGE_CLAIMED, &page->private);
+ z3fold_page_unlock(zhdr);
+}
+
+static const struct address_space_operations z3fold_aops = {
+ .isolate_page = z3fold_page_isolate,
+ .migratepage = z3fold_page_migrate,
+ .putback_page = z3fold_page_putback,
+};
+
+/*****************
+ * zpool
+ ****************/
+
+static int z3fold_zpool_evict(struct z3fold_pool *pool, unsigned long handle)
+{
+ if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict)
+ return pool->zpool_ops->evict(pool->zpool, handle);
+ else
+ return -ENOENT;
+}
+
+static const struct z3fold_ops z3fold_zpool_ops = {
+ .evict = z3fold_zpool_evict
+};
+
+static void *z3fold_zpool_create(const char *name, gfp_t gfp,
+ const struct zpool_ops *zpool_ops,
+ struct zpool *zpool)
+{
+ struct z3fold_pool *pool;
+
+ pool = z3fold_create_pool(name, gfp,
+ zpool_ops ? &z3fold_zpool_ops : NULL);
+ if (pool) {
+ pool->zpool = zpool;
+ pool->zpool_ops = zpool_ops;
+ }
+ return pool;
+}
+
+static void z3fold_zpool_destroy(void *pool)
+{
+ z3fold_destroy_pool(pool);
+}
+
+static int z3fold_zpool_malloc(void *pool, size_t size, gfp_t gfp,
+ unsigned long *handle)
+{
+ return z3fold_alloc(pool, size, gfp, handle);
+}
+static void z3fold_zpool_free(void *pool, unsigned long handle)
+{
+ z3fold_free(pool, handle);
+}
+
+static int z3fold_zpool_shrink(void *pool, unsigned int pages,
+ unsigned int *reclaimed)
+{
+ unsigned int total = 0;
+ int ret = -EINVAL;
+
+ while (total < pages) {
+ ret = z3fold_reclaim_page(pool, 8);
+ if (ret < 0)
+ break;
+ total++;
+ }
+
+ if (reclaimed)
+ *reclaimed = total;
+
+ return ret;
+}
+
+static void *z3fold_zpool_map(void *pool, unsigned long handle,
+ enum zpool_mapmode mm)
+{
+ return z3fold_map(pool, handle);
+}
+static void z3fold_zpool_unmap(void *pool, unsigned long handle)
+{
+ z3fold_unmap(pool, handle);
+}
+
+static u64 z3fold_zpool_total_size(void *pool)
+{
+ return z3fold_get_pool_size(pool) * PAGE_SIZE;
+}
+
+static struct zpool_driver z3fold_zpool_driver = {
+ .type = "z3fold",
+ .owner = THIS_MODULE,
+ .create = z3fold_zpool_create,
+ .destroy = z3fold_zpool_destroy,
+ .malloc = z3fold_zpool_malloc,
+ .free = z3fold_zpool_free,
+ .shrink = z3fold_zpool_shrink,
+ .map = z3fold_zpool_map,
+ .unmap = z3fold_zpool_unmap,
+ .total_size = z3fold_zpool_total_size,
+};
+
+MODULE_ALIAS("zpool-z3fold");
+
+static int __init init_z3fold(void)
+{
+ int ret;
+
+ /* Make sure the z3fold header is not larger than the page size */
+ BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE);
+ ret = z3fold_mount();
+ if (ret)
+ return ret;
+
+ zpool_register_driver(&z3fold_zpool_driver);
+
+ return 0;
+}
+
+static void __exit exit_z3fold(void)
+{
+ z3fold_unmount();
+ zpool_unregister_driver(&z3fold_zpool_driver);
+}
+
+module_init(init_z3fold);
+module_exit(exit_z3fold);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Vitaly Wool <vitalywool@gmail.com>");
+MODULE_DESCRIPTION("3-Fold Allocator for Compressed Pages");
diff --git a/mm/zbud.c b/mm/zbud.c
new file mode 100644
index 000000000..c49966ece
--- /dev/null
+++ b/mm/zbud.c
@@ -0,0 +1,636 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * zbud.c
+ *
+ * Copyright (C) 2013, Seth Jennings, IBM
+ *
+ * Concepts based on zcache internal zbud allocator by Dan Magenheimer.
+ *
+ * zbud is an special purpose allocator for storing compressed pages. Contrary
+ * to what its name may suggest, zbud is not a buddy allocator, but rather an
+ * allocator that "buddies" two compressed pages together in a single memory
+ * page.
+ *
+ * While this design limits storage density, it has simple and deterministic
+ * reclaim properties that make it preferable to a higher density approach when
+ * reclaim will be used.
+ *
+ * zbud works by storing compressed pages, or "zpages", together in pairs in a
+ * single memory page called a "zbud page". The first buddy is "left
+ * justified" at the beginning of the zbud page, and the last buddy is "right
+ * justified" at the end of the zbud page. The benefit is that if either
+ * buddy is freed, the freed buddy space, coalesced with whatever slack space
+ * that existed between the buddies, results in the largest possible free region
+ * within the zbud page.
+ *
+ * zbud also provides an attractive lower bound on density. The ratio of zpages
+ * to zbud pages can not be less than 1. This ensures that zbud can never "do
+ * harm" by using more pages to store zpages than the uncompressed zpages would
+ * have used on their own.
+ *
+ * zbud pages are divided into "chunks". The size of the chunks is fixed at
+ * compile time and determined by NCHUNKS_ORDER below. Dividing zbud pages
+ * into chunks allows organizing unbuddied zbud pages into a manageable number
+ * of unbuddied lists according to the number of free chunks available in the
+ * zbud page.
+ *
+ * The zbud API differs from that of conventional allocators in that the
+ * allocation function, zbud_alloc(), returns an opaque handle to the user,
+ * not a dereferenceable pointer. The user must map the handle using
+ * zbud_map() in order to get a usable pointer by which to access the
+ * allocation data and unmap the handle with zbud_unmap() when operations
+ * on the allocation data are complete.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/atomic.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/preempt.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/zbud.h>
+#include <linux/zpool.h>
+
+/*****************
+ * Structures
+*****************/
+/*
+ * NCHUNKS_ORDER determines the internal allocation granularity, effectively
+ * adjusting internal fragmentation. It also determines the number of
+ * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
+ * allocation granularity will be in chunks of size PAGE_SIZE/64. As one chunk
+ * in allocated page is occupied by zbud header, NCHUNKS will be calculated to
+ * 63 which shows the max number of free chunks in zbud page, also there will be
+ * 63 freelists per pool.
+ */
+#define NCHUNKS_ORDER 6
+
+#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER)
+#define CHUNK_SIZE (1 << CHUNK_SHIFT)
+#define ZHDR_SIZE_ALIGNED CHUNK_SIZE
+#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
+
+/**
+ * struct zbud_pool - stores metadata for each zbud pool
+ * @lock: protects all pool fields and first|last_chunk fields of any
+ * zbud page in the pool
+ * @unbuddied: array of lists tracking zbud pages that only contain one buddy;
+ * the lists each zbud page is added to depends on the size of
+ * its free region.
+ * @buddied: list tracking the zbud pages that contain two buddies;
+ * these zbud pages are full
+ * @lru: list tracking the zbud pages in LRU order by most recently
+ * added buddy.
+ * @pages_nr: number of zbud pages in the pool.
+ * @ops: pointer to a structure of user defined operations specified at
+ * pool creation time.
+ *
+ * This structure is allocated at pool creation time and maintains metadata
+ * pertaining to a particular zbud pool.
+ */
+struct zbud_pool {
+ spinlock_t lock;
+ struct list_head unbuddied[NCHUNKS];
+ struct list_head buddied;
+ struct list_head lru;
+ u64 pages_nr;
+ const struct zbud_ops *ops;
+#ifdef CONFIG_ZPOOL
+ struct zpool *zpool;
+ const struct zpool_ops *zpool_ops;
+#endif
+};
+
+/*
+ * struct zbud_header - zbud page metadata occupying the first chunk of each
+ * zbud page.
+ * @buddy: links the zbud page into the unbuddied/buddied lists in the pool
+ * @lru: links the zbud page into the lru list in the pool
+ * @first_chunks: the size of the first buddy in chunks, 0 if free
+ * @last_chunks: the size of the last buddy in chunks, 0 if free
+ */
+struct zbud_header {
+ struct list_head buddy;
+ struct list_head lru;
+ unsigned int first_chunks;
+ unsigned int last_chunks;
+ bool under_reclaim;
+};
+
+/*****************
+ * zpool
+ ****************/
+
+#ifdef CONFIG_ZPOOL
+
+static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle)
+{
+ if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict)
+ return pool->zpool_ops->evict(pool->zpool, handle);
+ else
+ return -ENOENT;
+}
+
+static const struct zbud_ops zbud_zpool_ops = {
+ .evict = zbud_zpool_evict
+};
+
+static void *zbud_zpool_create(const char *name, gfp_t gfp,
+ const struct zpool_ops *zpool_ops,
+ struct zpool *zpool)
+{
+ struct zbud_pool *pool;
+
+ pool = zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL);
+ if (pool) {
+ pool->zpool = zpool;
+ pool->zpool_ops = zpool_ops;
+ }
+ return pool;
+}
+
+static void zbud_zpool_destroy(void *pool)
+{
+ zbud_destroy_pool(pool);
+}
+
+static int zbud_zpool_malloc(void *pool, size_t size, gfp_t gfp,
+ unsigned long *handle)
+{
+ return zbud_alloc(pool, size, gfp, handle);
+}
+static void zbud_zpool_free(void *pool, unsigned long handle)
+{
+ zbud_free(pool, handle);
+}
+
+static int zbud_zpool_shrink(void *pool, unsigned int pages,
+ unsigned int *reclaimed)
+{
+ unsigned int total = 0;
+ int ret = -EINVAL;
+
+ while (total < pages) {
+ ret = zbud_reclaim_page(pool, 8);
+ if (ret < 0)
+ break;
+ total++;
+ }
+
+ if (reclaimed)
+ *reclaimed = total;
+
+ return ret;
+}
+
+static void *zbud_zpool_map(void *pool, unsigned long handle,
+ enum zpool_mapmode mm)
+{
+ return zbud_map(pool, handle);
+}
+static void zbud_zpool_unmap(void *pool, unsigned long handle)
+{
+ zbud_unmap(pool, handle);
+}
+
+static u64 zbud_zpool_total_size(void *pool)
+{
+ return zbud_get_pool_size(pool) * PAGE_SIZE;
+}
+
+static struct zpool_driver zbud_zpool_driver = {
+ .type = "zbud",
+ .owner = THIS_MODULE,
+ .create = zbud_zpool_create,
+ .destroy = zbud_zpool_destroy,
+ .malloc = zbud_zpool_malloc,
+ .free = zbud_zpool_free,
+ .shrink = zbud_zpool_shrink,
+ .map = zbud_zpool_map,
+ .unmap = zbud_zpool_unmap,
+ .total_size = zbud_zpool_total_size,
+};
+
+MODULE_ALIAS("zpool-zbud");
+#endif /* CONFIG_ZPOOL */
+
+/*****************
+ * Helpers
+*****************/
+/* Just to make the code easier to read */
+enum buddy {
+ FIRST,
+ LAST
+};
+
+/* Converts an allocation size in bytes to size in zbud chunks */
+static int size_to_chunks(size_t size)
+{
+ return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
+}
+
+#define for_each_unbuddied_list(_iter, _begin) \
+ for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
+
+/* Initializes the zbud header of a newly allocated zbud page */
+static struct zbud_header *init_zbud_page(struct page *page)
+{
+ struct zbud_header *zhdr = page_address(page);
+ zhdr->first_chunks = 0;
+ zhdr->last_chunks = 0;
+ INIT_LIST_HEAD(&zhdr->buddy);
+ INIT_LIST_HEAD(&zhdr->lru);
+ zhdr->under_reclaim = false;
+ return zhdr;
+}
+
+/* Resets the struct page fields and frees the page */
+static void free_zbud_page(struct zbud_header *zhdr)
+{
+ __free_page(virt_to_page(zhdr));
+}
+
+/*
+ * Encodes the handle of a particular buddy within a zbud page
+ * Pool lock should be held as this function accesses first|last_chunks
+ */
+static unsigned long encode_handle(struct zbud_header *zhdr, enum buddy bud)
+{
+ unsigned long handle;
+
+ /*
+ * For now, the encoded handle is actually just the pointer to the data
+ * but this might not always be the case. A little information hiding.
+ * Add CHUNK_SIZE to the handle if it is the first allocation to jump
+ * over the zbud header in the first chunk.
+ */
+ handle = (unsigned long)zhdr;
+ if (bud == FIRST)
+ /* skip over zbud header */
+ handle += ZHDR_SIZE_ALIGNED;
+ else /* bud == LAST */
+ handle += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT);
+ return handle;
+}
+
+/* Returns the zbud page where a given handle is stored */
+static struct zbud_header *handle_to_zbud_header(unsigned long handle)
+{
+ return (struct zbud_header *)(handle & PAGE_MASK);
+}
+
+/* Returns the number of free chunks in a zbud page */
+static int num_free_chunks(struct zbud_header *zhdr)
+{
+ /*
+ * Rather than branch for different situations, just use the fact that
+ * free buddies have a length of zero to simplify everything.
+ */
+ return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks;
+}
+
+/*****************
+ * API Functions
+*****************/
+/**
+ * zbud_create_pool() - create a new zbud pool
+ * @gfp: gfp flags when allocating the zbud pool structure
+ * @ops: user-defined operations for the zbud pool
+ *
+ * Return: pointer to the new zbud pool or NULL if the metadata allocation
+ * failed.
+ */
+struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops)
+{
+ struct zbud_pool *pool;
+ int i;
+
+ pool = kzalloc(sizeof(struct zbud_pool), gfp);
+ if (!pool)
+ return NULL;
+ spin_lock_init(&pool->lock);
+ for_each_unbuddied_list(i, 0)
+ INIT_LIST_HEAD(&pool->unbuddied[i]);
+ INIT_LIST_HEAD(&pool->buddied);
+ INIT_LIST_HEAD(&pool->lru);
+ pool->pages_nr = 0;
+ pool->ops = ops;
+ return pool;
+}
+
+/**
+ * zbud_destroy_pool() - destroys an existing zbud pool
+ * @pool: the zbud pool to be destroyed
+ *
+ * The pool should be emptied before this function is called.
+ */
+void zbud_destroy_pool(struct zbud_pool *pool)
+{
+ kfree(pool);
+}
+
+/**
+ * zbud_alloc() - allocates a region of a given size
+ * @pool: zbud pool from which to allocate
+ * @size: size in bytes of the desired allocation
+ * @gfp: gfp flags used if the pool needs to grow
+ * @handle: handle of the new allocation
+ *
+ * This function will attempt to find a free region in the pool large enough to
+ * satisfy the allocation request. A search of the unbuddied lists is
+ * performed first. If no suitable free region is found, then a new page is
+ * allocated and added to the pool to satisfy the request.
+ *
+ * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
+ * as zbud pool pages.
+ *
+ * Return: 0 if success and handle is set, otherwise -EINVAL if the size or
+ * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
+ * a new page.
+ */
+int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp,
+ unsigned long *handle)
+{
+ int chunks, i, freechunks;
+ struct zbud_header *zhdr = NULL;
+ enum buddy bud;
+ struct page *page;
+
+ if (!size || (gfp & __GFP_HIGHMEM))
+ return -EINVAL;
+ if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
+ return -ENOSPC;
+ chunks = size_to_chunks(size);
+ spin_lock(&pool->lock);
+
+ /* First, try to find an unbuddied zbud page. */
+ for_each_unbuddied_list(i, chunks) {
+ if (!list_empty(&pool->unbuddied[i])) {
+ zhdr = list_first_entry(&pool->unbuddied[i],
+ struct zbud_header, buddy);
+ list_del(&zhdr->buddy);
+ if (zhdr->first_chunks == 0)
+ bud = FIRST;
+ else
+ bud = LAST;
+ goto found;
+ }
+ }
+
+ /* Couldn't find unbuddied zbud page, create new one */
+ spin_unlock(&pool->lock);
+ page = alloc_page(gfp);
+ if (!page)
+ return -ENOMEM;
+ spin_lock(&pool->lock);
+ pool->pages_nr++;
+ zhdr = init_zbud_page(page);
+ bud = FIRST;
+
+found:
+ if (bud == FIRST)
+ zhdr->first_chunks = chunks;
+ else
+ zhdr->last_chunks = chunks;
+
+ if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0) {
+ /* Add to unbuddied list */
+ freechunks = num_free_chunks(zhdr);
+ list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
+ } else {
+ /* Add to buddied list */
+ list_add(&zhdr->buddy, &pool->buddied);
+ }
+
+ /* Add/move zbud page to beginning of LRU */
+ if (!list_empty(&zhdr->lru))
+ list_del(&zhdr->lru);
+ list_add(&zhdr->lru, &pool->lru);
+
+ *handle = encode_handle(zhdr, bud);
+ spin_unlock(&pool->lock);
+
+ return 0;
+}
+
+/**
+ * zbud_free() - frees the allocation associated with the given handle
+ * @pool: pool in which the allocation resided
+ * @handle: handle associated with the allocation returned by zbud_alloc()
+ *
+ * In the case that the zbud page in which the allocation resides is under
+ * reclaim, as indicated by the PG_reclaim flag being set, this function
+ * only sets the first|last_chunks to 0. The page is actually freed
+ * once both buddies are evicted (see zbud_reclaim_page() below).
+ */
+void zbud_free(struct zbud_pool *pool, unsigned long handle)
+{
+ struct zbud_header *zhdr;
+ int freechunks;
+
+ spin_lock(&pool->lock);
+ zhdr = handle_to_zbud_header(handle);
+
+ /* If first buddy, handle will be page aligned */
+ if ((handle - ZHDR_SIZE_ALIGNED) & ~PAGE_MASK)
+ zhdr->last_chunks = 0;
+ else
+ zhdr->first_chunks = 0;
+
+ if (zhdr->under_reclaim) {
+ /* zbud page is under reclaim, reclaim will free */
+ spin_unlock(&pool->lock);
+ return;
+ }
+
+ /* Remove from existing buddy list */
+ list_del(&zhdr->buddy);
+
+ if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
+ /* zbud page is empty, free */
+ list_del(&zhdr->lru);
+ free_zbud_page(zhdr);
+ pool->pages_nr--;
+ } else {
+ /* Add to unbuddied list */
+ freechunks = num_free_chunks(zhdr);
+ list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
+ }
+
+ spin_unlock(&pool->lock);
+}
+
+/**
+ * zbud_reclaim_page() - evicts allocations from a pool page and frees it
+ * @pool: pool from which a page will attempt to be evicted
+ * @retries: number of pages on the LRU list for which eviction will
+ * be attempted before failing
+ *
+ * zbud reclaim is different from normal system reclaim in that the reclaim is
+ * done from the bottom, up. This is because only the bottom layer, zbud, has
+ * information on how the allocations are organized within each zbud page. This
+ * has the potential to create interesting locking situations between zbud and
+ * the user, however.
+ *
+ * To avoid these, this is how zbud_reclaim_page() should be called:
+ *
+ * The user detects a page should be reclaimed and calls zbud_reclaim_page().
+ * zbud_reclaim_page() will remove a zbud page from the pool LRU list and call
+ * the user-defined eviction handler with the pool and handle as arguments.
+ *
+ * If the handle can not be evicted, the eviction handler should return
+ * non-zero. zbud_reclaim_page() will add the zbud page back to the
+ * appropriate list and try the next zbud page on the LRU up to
+ * a user defined number of retries.
+ *
+ * If the handle is successfully evicted, the eviction handler should
+ * return 0 _and_ should have called zbud_free() on the handle. zbud_free()
+ * contains logic to delay freeing the page if the page is under reclaim,
+ * as indicated by the setting of the PG_reclaim flag on the underlying page.
+ *
+ * If all buddies in the zbud page are successfully evicted, then the
+ * zbud page can be freed.
+ *
+ * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are
+ * no pages to evict or an eviction handler is not registered, -EAGAIN if
+ * the retry limit was hit.
+ */
+int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
+{
+ int i, ret, freechunks;
+ struct zbud_header *zhdr;
+ unsigned long first_handle = 0, last_handle = 0;
+
+ spin_lock(&pool->lock);
+ if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
+ retries == 0) {
+ spin_unlock(&pool->lock);
+ return -EINVAL;
+ }
+ for (i = 0; i < retries; i++) {
+ zhdr = list_last_entry(&pool->lru, struct zbud_header, lru);
+ list_del(&zhdr->lru);
+ list_del(&zhdr->buddy);
+ /* Protect zbud page against free */
+ zhdr->under_reclaim = true;
+ /*
+ * We need encode the handles before unlocking, since we can
+ * race with free that will set (first|last)_chunks to 0
+ */
+ first_handle = 0;
+ last_handle = 0;
+ if (zhdr->first_chunks)
+ first_handle = encode_handle(zhdr, FIRST);
+ if (zhdr->last_chunks)
+ last_handle = encode_handle(zhdr, LAST);
+ spin_unlock(&pool->lock);
+
+ /* Issue the eviction callback(s) */
+ if (first_handle) {
+ ret = pool->ops->evict(pool, first_handle);
+ if (ret)
+ goto next;
+ }
+ if (last_handle) {
+ ret = pool->ops->evict(pool, last_handle);
+ if (ret)
+ goto next;
+ }
+next:
+ spin_lock(&pool->lock);
+ zhdr->under_reclaim = false;
+ if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
+ /*
+ * Both buddies are now free, free the zbud page and
+ * return success.
+ */
+ free_zbud_page(zhdr);
+ pool->pages_nr--;
+ spin_unlock(&pool->lock);
+ return 0;
+ } else if (zhdr->first_chunks == 0 ||
+ zhdr->last_chunks == 0) {
+ /* add to unbuddied list */
+ freechunks = num_free_chunks(zhdr);
+ list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
+ } else {
+ /* add to buddied list */
+ list_add(&zhdr->buddy, &pool->buddied);
+ }
+
+ /* add to beginning of LRU */
+ list_add(&zhdr->lru, &pool->lru);
+ }
+ spin_unlock(&pool->lock);
+ return -EAGAIN;
+}
+
+/**
+ * zbud_map() - maps the allocation associated with the given handle
+ * @pool: pool in which the allocation resides
+ * @handle: handle associated with the allocation to be mapped
+ *
+ * While trivial for zbud, the mapping functions for others allocators
+ * implementing this allocation API could have more complex information encoded
+ * in the handle and could create temporary mappings to make the data
+ * accessible to the user.
+ *
+ * Returns: a pointer to the mapped allocation
+ */
+void *zbud_map(struct zbud_pool *pool, unsigned long handle)
+{
+ return (void *)(handle);
+}
+
+/**
+ * zbud_unmap() - maps the allocation associated with the given handle
+ * @pool: pool in which the allocation resides
+ * @handle: handle associated with the allocation to be unmapped
+ */
+void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
+{
+}
+
+/**
+ * zbud_get_pool_size() - gets the zbud pool size in pages
+ * @pool: pool whose size is being queried
+ *
+ * Returns: size in pages of the given pool. The pool lock need not be
+ * taken to access pages_nr.
+ */
+u64 zbud_get_pool_size(struct zbud_pool *pool)
+{
+ return pool->pages_nr;
+}
+
+static int __init init_zbud(void)
+{
+ /* Make sure the zbud header will fit in one chunk */
+ BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED);
+ pr_info("loaded\n");
+
+#ifdef CONFIG_ZPOOL
+ zpool_register_driver(&zbud_zpool_driver);
+#endif
+
+ return 0;
+}
+
+static void __exit exit_zbud(void)
+{
+#ifdef CONFIG_ZPOOL
+ zpool_unregister_driver(&zbud_zpool_driver);
+#endif
+
+ pr_info("unloaded\n");
+}
+
+module_init(init_zbud);
+module_exit(exit_zbud);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>");
+MODULE_DESCRIPTION("Buddy Allocator for Compressed Pages");
diff --git a/mm/zpool.c b/mm/zpool.c
new file mode 100644
index 000000000..3744a2d1a
--- /dev/null
+++ b/mm/zpool.c
@@ -0,0 +1,398 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * zpool memory storage api
+ *
+ * Copyright (C) 2014 Dan Streetman
+ *
+ * This is a common frontend for memory storage pool implementations.
+ * Typically, this is used to store compressed memory.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/list.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/zpool.h>
+
+struct zpool {
+ struct zpool_driver *driver;
+ void *pool;
+ const struct zpool_ops *ops;
+ bool evictable;
+
+ struct list_head list;
+};
+
+static LIST_HEAD(drivers_head);
+static DEFINE_SPINLOCK(drivers_lock);
+
+static LIST_HEAD(pools_head);
+static DEFINE_SPINLOCK(pools_lock);
+
+/**
+ * zpool_register_driver() - register a zpool implementation.
+ * @driver: driver to register
+ */
+void zpool_register_driver(struct zpool_driver *driver)
+{
+ spin_lock(&drivers_lock);
+ atomic_set(&driver->refcount, 0);
+ list_add(&driver->list, &drivers_head);
+ spin_unlock(&drivers_lock);
+}
+EXPORT_SYMBOL(zpool_register_driver);
+
+/**
+ * zpool_unregister_driver() - unregister a zpool implementation.
+ * @driver: driver to unregister.
+ *
+ * Module usage counting is used to prevent using a driver
+ * while/after unloading, so if this is called from module
+ * exit function, this should never fail; if called from
+ * other than the module exit function, and this returns
+ * failure, the driver is in use and must remain available.
+ */
+int zpool_unregister_driver(struct zpool_driver *driver)
+{
+ int ret = 0, refcount;
+
+ spin_lock(&drivers_lock);
+ refcount = atomic_read(&driver->refcount);
+ WARN_ON(refcount < 0);
+ if (refcount > 0)
+ ret = -EBUSY;
+ else
+ list_del(&driver->list);
+ spin_unlock(&drivers_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL(zpool_unregister_driver);
+
+/* this assumes @type is null-terminated. */
+static struct zpool_driver *zpool_get_driver(const char *type)
+{
+ struct zpool_driver *driver;
+
+ spin_lock(&drivers_lock);
+ list_for_each_entry(driver, &drivers_head, list) {
+ if (!strcmp(driver->type, type)) {
+ bool got = try_module_get(driver->owner);
+
+ if (got)
+ atomic_inc(&driver->refcount);
+ spin_unlock(&drivers_lock);
+ return got ? driver : NULL;
+ }
+ }
+
+ spin_unlock(&drivers_lock);
+ return NULL;
+}
+
+static void zpool_put_driver(struct zpool_driver *driver)
+{
+ atomic_dec(&driver->refcount);
+ module_put(driver->owner);
+}
+
+/**
+ * zpool_has_pool() - Check if the pool driver is available
+ * @type: The type of the zpool to check (e.g. zbud, zsmalloc)
+ *
+ * This checks if the @type pool driver is available. This will try to load
+ * the requested module, if needed, but there is no guarantee the module will
+ * still be loaded and available immediately after calling. If this returns
+ * true, the caller should assume the pool is available, but must be prepared
+ * to handle the @zpool_create_pool() returning failure. However if this
+ * returns false, the caller should assume the requested pool type is not
+ * available; either the requested pool type module does not exist, or could
+ * not be loaded, and calling @zpool_create_pool() with the pool type will
+ * fail.
+ *
+ * The @type string must be null-terminated.
+ *
+ * Returns: true if @type pool is available, false if not
+ */
+bool zpool_has_pool(char *type)
+{
+ struct zpool_driver *driver = zpool_get_driver(type);
+
+ if (!driver) {
+ request_module("zpool-%s", type);
+ driver = zpool_get_driver(type);
+ }
+
+ if (!driver)
+ return false;
+
+ zpool_put_driver(driver);
+ return true;
+}
+EXPORT_SYMBOL(zpool_has_pool);
+
+/**
+ * zpool_create_pool() - Create a new zpool
+ * @type: The type of the zpool to create (e.g. zbud, zsmalloc)
+ * @name: The name of the zpool (e.g. zram0, zswap)
+ * @gfp: The GFP flags to use when allocating the pool.
+ * @ops: The optional ops callback.
+ *
+ * This creates a new zpool of the specified type. The gfp flags will be
+ * used when allocating memory, if the implementation supports it. If the
+ * ops param is NULL, then the created zpool will not be evictable.
+ *
+ * Implementations must guarantee this to be thread-safe.
+ *
+ * The @type and @name strings must be null-terminated.
+ *
+ * Returns: New zpool on success, NULL on failure.
+ */
+struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp,
+ const struct zpool_ops *ops)
+{
+ struct zpool_driver *driver;
+ struct zpool *zpool;
+
+ pr_debug("creating pool type %s\n", type);
+
+ driver = zpool_get_driver(type);
+
+ if (!driver) {
+ request_module("zpool-%s", type);
+ driver = zpool_get_driver(type);
+ }
+
+ if (!driver) {
+ pr_err("no driver for type %s\n", type);
+ return NULL;
+ }
+
+ zpool = kmalloc(sizeof(*zpool), gfp);
+ if (!zpool) {
+ pr_err("couldn't create zpool - out of memory\n");
+ zpool_put_driver(driver);
+ return NULL;
+ }
+
+ zpool->driver = driver;
+ zpool->pool = driver->create(name, gfp, ops, zpool);
+ zpool->ops = ops;
+ zpool->evictable = driver->shrink && ops && ops->evict;
+
+ if (!zpool->pool) {
+ pr_err("couldn't create %s pool\n", type);
+ zpool_put_driver(driver);
+ kfree(zpool);
+ return NULL;
+ }
+
+ pr_debug("created pool type %s\n", type);
+
+ spin_lock(&pools_lock);
+ list_add(&zpool->list, &pools_head);
+ spin_unlock(&pools_lock);
+
+ return zpool;
+}
+
+/**
+ * zpool_destroy_pool() - Destroy a zpool
+ * @zpool: The zpool to destroy.
+ *
+ * Implementations must guarantee this to be thread-safe,
+ * however only when destroying different pools. The same
+ * pool should only be destroyed once, and should not be used
+ * after it is destroyed.
+ *
+ * This destroys an existing zpool. The zpool should not be in use.
+ */
+void zpool_destroy_pool(struct zpool *zpool)
+{
+ pr_debug("destroying pool type %s\n", zpool->driver->type);
+
+ spin_lock(&pools_lock);
+ list_del(&zpool->list);
+ spin_unlock(&pools_lock);
+ zpool->driver->destroy(zpool->pool);
+ zpool_put_driver(zpool->driver);
+ kfree(zpool);
+}
+
+/**
+ * zpool_get_type() - Get the type of the zpool
+ * @zpool: The zpool to check
+ *
+ * This returns the type of the pool.
+ *
+ * Implementations must guarantee this to be thread-safe.
+ *
+ * Returns: The type of zpool.
+ */
+const char *zpool_get_type(struct zpool *zpool)
+{
+ return zpool->driver->type;
+}
+
+/**
+ * zpool_malloc_support_movable() - Check if the zpool supports
+ * allocating movable memory
+ * @zpool: The zpool to check
+ *
+ * This returns if the zpool supports allocating movable memory.
+ *
+ * Implementations must guarantee this to be thread-safe.
+ *
+ * Returns: true if the zpool supports allocating movable memory, false if not
+ */
+bool zpool_malloc_support_movable(struct zpool *zpool)
+{
+ return zpool->driver->malloc_support_movable;
+}
+
+/**
+ * zpool_malloc() - Allocate memory
+ * @zpool: The zpool to allocate from.
+ * @size: The amount of memory to allocate.
+ * @gfp: The GFP flags to use when allocating memory.
+ * @handle: Pointer to the handle to set
+ *
+ * This allocates the requested amount of memory from the pool.
+ * The gfp flags will be used when allocating memory, if the
+ * implementation supports it. The provided @handle will be
+ * set to the allocated object handle.
+ *
+ * Implementations must guarantee this to be thread-safe.
+ *
+ * Returns: 0 on success, negative value on error.
+ */
+int zpool_malloc(struct zpool *zpool, size_t size, gfp_t gfp,
+ unsigned long *handle)
+{
+ return zpool->driver->malloc(zpool->pool, size, gfp, handle);
+}
+
+/**
+ * zpool_free() - Free previously allocated memory
+ * @zpool: The zpool that allocated the memory.
+ * @handle: The handle to the memory to free.
+ *
+ * This frees previously allocated memory. This does not guarantee
+ * that the pool will actually free memory, only that the memory
+ * in the pool will become available for use by the pool.
+ *
+ * Implementations must guarantee this to be thread-safe,
+ * however only when freeing different handles. The same
+ * handle should only be freed once, and should not be used
+ * after freeing.
+ */
+void zpool_free(struct zpool *zpool, unsigned long handle)
+{
+ zpool->driver->free(zpool->pool, handle);
+}
+
+/**
+ * zpool_shrink() - Shrink the pool size
+ * @zpool: The zpool to shrink.
+ * @pages: The number of pages to shrink the pool.
+ * @reclaimed: The number of pages successfully evicted.
+ *
+ * This attempts to shrink the actual memory size of the pool
+ * by evicting currently used handle(s). If the pool was
+ * created with no zpool_ops, or the evict call fails for any
+ * of the handles, this will fail. If non-NULL, the @reclaimed
+ * parameter will be set to the number of pages reclaimed,
+ * which may be more than the number of pages requested.
+ *
+ * Implementations must guarantee this to be thread-safe.
+ *
+ * Returns: 0 on success, negative value on error/failure.
+ */
+int zpool_shrink(struct zpool *zpool, unsigned int pages,
+ unsigned int *reclaimed)
+{
+ return zpool->driver->shrink ?
+ zpool->driver->shrink(zpool->pool, pages, reclaimed) : -EINVAL;
+}
+
+/**
+ * zpool_map_handle() - Map a previously allocated handle into memory
+ * @zpool: The zpool that the handle was allocated from
+ * @handle: The handle to map
+ * @mapmode: How the memory should be mapped
+ *
+ * This maps a previously allocated handle into memory. The @mapmode
+ * param indicates to the implementation how the memory will be
+ * used, i.e. read-only, write-only, read-write. If the
+ * implementation does not support it, the memory will be treated
+ * as read-write.
+ *
+ * This may hold locks, disable interrupts, and/or preemption,
+ * and the zpool_unmap_handle() must be called to undo those
+ * actions. The code that uses the mapped handle should complete
+ * its operatons on the mapped handle memory quickly and unmap
+ * as soon as possible. As the implementation may use per-cpu
+ * data, multiple handles should not be mapped concurrently on
+ * any cpu.
+ *
+ * Returns: A pointer to the handle's mapped memory area.
+ */
+void *zpool_map_handle(struct zpool *zpool, unsigned long handle,
+ enum zpool_mapmode mapmode)
+{
+ return zpool->driver->map(zpool->pool, handle, mapmode);
+}
+
+/**
+ * zpool_unmap_handle() - Unmap a previously mapped handle
+ * @zpool: The zpool that the handle was allocated from
+ * @handle: The handle to unmap
+ *
+ * This unmaps a previously mapped handle. Any locks or other
+ * actions that the implementation took in zpool_map_handle()
+ * will be undone here. The memory area returned from
+ * zpool_map_handle() should no longer be used after this.
+ */
+void zpool_unmap_handle(struct zpool *zpool, unsigned long handle)
+{
+ zpool->driver->unmap(zpool->pool, handle);
+}
+
+/**
+ * zpool_get_total_size() - The total size of the pool
+ * @zpool: The zpool to check
+ *
+ * This returns the total size in bytes of the pool.
+ *
+ * Returns: Total size of the zpool in bytes.
+ */
+u64 zpool_get_total_size(struct zpool *zpool)
+{
+ return zpool->driver->total_size(zpool->pool);
+}
+
+/**
+ * zpool_evictable() - Test if zpool is potentially evictable
+ * @zpool: The zpool to test
+ *
+ * Zpool is only potentially evictable when it's created with struct
+ * zpool_ops.evict and its driver implements struct zpool_driver.shrink.
+ *
+ * However, it doesn't necessarily mean driver will use zpool_ops.evict
+ * in its implementation of zpool_driver.shrink. It could do internal
+ * defragmentation instead.
+ *
+ * Returns: true if potentially evictable; false otherwise.
+ */
+bool zpool_evictable(struct zpool *zpool)
+{
+ return zpool->evictable;
+}
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
+MODULE_DESCRIPTION("Common API for compressed memory storage");
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
new file mode 100644
index 000000000..c18dc8e61
--- /dev/null
+++ b/mm/zsmalloc.c
@@ -0,0 +1,2581 @@
+/*
+ * zsmalloc memory allocator
+ *
+ * Copyright (C) 2011 Nitin Gupta
+ * Copyright (C) 2012, 2013 Minchan Kim
+ *
+ * This code is released using a dual license strategy: BSD/GPL
+ * You can choose the license that better fits your requirements.
+ *
+ * Released under the terms of 3-clause BSD License
+ * Released under the terms of GNU General Public License Version 2.0
+ */
+
+/*
+ * Following is how we use various fields and flags of underlying
+ * struct page(s) to form a zspage.
+ *
+ * Usage of struct page fields:
+ * page->private: points to zspage
+ * page->freelist(index): links together all component pages of a zspage
+ * For the huge page, this is always 0, so we use this field
+ * to store handle.
+ * page->units: first object offset in a subpage of zspage
+ *
+ * Usage of struct page flags:
+ * PG_private: identifies the first component page
+ * PG_owner_priv_1: identifies the huge component page
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/magic.h>
+#include <linux/bitops.h>
+#include <linux/errno.h>
+#include <linux/highmem.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/pgtable.h>
+#include <asm/tlbflush.h>
+#include <linux/cpumask.h>
+#include <linux/cpu.h>
+#include <linux/vmalloc.h>
+#include <linux/preempt.h>
+#include <linux/spinlock.h>
+#include <linux/shrinker.h>
+#include <linux/types.h>
+#include <linux/debugfs.h>
+#include <linux/zsmalloc.h>
+#include <linux/zpool.h>
+#include <linux/mount.h>
+#include <linux/pseudo_fs.h>
+#include <linux/migrate.h>
+#include <linux/wait.h>
+#include <linux/pagemap.h>
+#include <linux/fs.h>
+
+#define ZSPAGE_MAGIC 0x58
+
+/*
+ * This must be power of 2 and greater than of equal to sizeof(link_free).
+ * These two conditions ensure that any 'struct link_free' itself doesn't
+ * span more than 1 page which avoids complex case of mapping 2 pages simply
+ * to restore link_free pointer values.
+ */
+#define ZS_ALIGN 8
+
+/*
+ * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single)
+ * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N.
+ */
+#define ZS_MAX_ZSPAGE_ORDER 2
+#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
+
+#define ZS_HANDLE_SIZE (sizeof(unsigned long))
+
+/*
+ * Object location (<PFN>, <obj_idx>) is encoded as
+ * a single (unsigned long) handle value.
+ *
+ * Note that object index <obj_idx> starts from 0.
+ *
+ * This is made more complicated by various memory models and PAE.
+ */
+
+#ifndef MAX_POSSIBLE_PHYSMEM_BITS
+#ifdef MAX_PHYSMEM_BITS
+#define MAX_POSSIBLE_PHYSMEM_BITS MAX_PHYSMEM_BITS
+#else
+/*
+ * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just
+ * be PAGE_SHIFT
+ */
+#define MAX_POSSIBLE_PHYSMEM_BITS BITS_PER_LONG
+#endif
+#endif
+
+#define _PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT)
+
+/*
+ * Memory for allocating for handle keeps object position by
+ * encoding <page, obj_idx> and the encoded value has a room
+ * in least bit(ie, look at obj_to_location).
+ * We use the bit to synchronize between object access by
+ * user and migration.
+ */
+#define HANDLE_PIN_BIT 0
+
+/*
+ * Head in allocated object should have OBJ_ALLOCATED_TAG
+ * to identify the object was allocated or not.
+ * It's okay to add the status bit in the least bit because
+ * header keeps handle which is 4byte-aligned address so we
+ * have room for two bit at least.
+ */
+#define OBJ_ALLOCATED_TAG 1
+#define OBJ_TAG_BITS 1
+#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
+#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
+
+#define FULLNESS_BITS 2
+#define CLASS_BITS 8
+#define ISOLATED_BITS 3
+#define MAGIC_VAL_BITS 8
+
+#define MAX(a, b) ((a) >= (b) ? (a) : (b))
+/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
+#define ZS_MIN_ALLOC_SIZE \
+ MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
+/* each chunk includes extra space to keep handle */
+#define ZS_MAX_ALLOC_SIZE PAGE_SIZE
+
+/*
+ * On systems with 4K page size, this gives 255 size classes! There is a
+ * trader-off here:
+ * - Large number of size classes is potentially wasteful as free page are
+ * spread across these classes
+ * - Small number of size classes causes large internal fragmentation
+ * - Probably its better to use specific size classes (empirically
+ * determined). NOTE: all those class sizes must be set as multiple of
+ * ZS_ALIGN to make sure link_free itself never has to span 2 pages.
+ *
+ * ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN
+ * (reason above)
+ */
+#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> CLASS_BITS)
+#define ZS_SIZE_CLASSES (DIV_ROUND_UP(ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE, \
+ ZS_SIZE_CLASS_DELTA) + 1)
+
+enum fullness_group {
+ ZS_EMPTY,
+ ZS_ALMOST_EMPTY,
+ ZS_ALMOST_FULL,
+ ZS_FULL,
+ NR_ZS_FULLNESS,
+};
+
+enum zs_stat_type {
+ CLASS_EMPTY,
+ CLASS_ALMOST_EMPTY,
+ CLASS_ALMOST_FULL,
+ CLASS_FULL,
+ OBJ_ALLOCATED,
+ OBJ_USED,
+ NR_ZS_STAT_TYPE,
+};
+
+struct zs_size_stat {
+ unsigned long objs[NR_ZS_STAT_TYPE];
+};
+
+#ifdef CONFIG_ZSMALLOC_STAT
+static struct dentry *zs_stat_root;
+#endif
+
+#ifdef CONFIG_COMPACTION
+static struct vfsmount *zsmalloc_mnt;
+#endif
+
+/*
+ * We assign a page to ZS_ALMOST_EMPTY fullness group when:
+ * n <= N / f, where
+ * n = number of allocated objects
+ * N = total number of objects zspage can store
+ * f = fullness_threshold_frac
+ *
+ * Similarly, we assign zspage to:
+ * ZS_ALMOST_FULL when n > N / f
+ * ZS_EMPTY when n == 0
+ * ZS_FULL when n == N
+ *
+ * (see: fix_fullness_group())
+ */
+static const int fullness_threshold_frac = 4;
+static size_t huge_class_size;
+
+struct size_class {
+ spinlock_t lock;
+ struct list_head fullness_list[NR_ZS_FULLNESS];
+ /*
+ * Size of objects stored in this class. Must be multiple
+ * of ZS_ALIGN.
+ */
+ int size;
+ int objs_per_zspage;
+ /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
+ int pages_per_zspage;
+
+ unsigned int index;
+ struct zs_size_stat stats;
+};
+
+/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
+static void SetPageHugeObject(struct page *page)
+{
+ SetPageOwnerPriv1(page);
+}
+
+static void ClearPageHugeObject(struct page *page)
+{
+ ClearPageOwnerPriv1(page);
+}
+
+static int PageHugeObject(struct page *page)
+{
+ return PageOwnerPriv1(page);
+}
+
+/*
+ * Placed within free objects to form a singly linked list.
+ * For every zspage, zspage->freeobj gives head of this list.
+ *
+ * This must be power of 2 and less than or equal to ZS_ALIGN
+ */
+struct link_free {
+ union {
+ /*
+ * Free object index;
+ * It's valid for non-allocated object
+ */
+ unsigned long next;
+ /*
+ * Handle of allocated object.
+ */
+ unsigned long handle;
+ };
+};
+
+struct zs_pool {
+ const char *name;
+
+ struct size_class *size_class[ZS_SIZE_CLASSES];
+ struct kmem_cache *handle_cachep;
+ struct kmem_cache *zspage_cachep;
+
+ atomic_long_t pages_allocated;
+
+ struct zs_pool_stats stats;
+
+ /* Compact classes */
+ struct shrinker shrinker;
+
+#ifdef CONFIG_ZSMALLOC_STAT
+ struct dentry *stat_dentry;
+#endif
+#ifdef CONFIG_COMPACTION
+ struct inode *inode;
+ struct work_struct free_work;
+ /* A wait queue for when migration races with async_free_zspage() */
+ struct wait_queue_head migration_wait;
+ atomic_long_t isolated_pages;
+ bool destroying;
+#endif
+};
+
+struct zspage {
+ struct {
+ unsigned int fullness:FULLNESS_BITS;
+ unsigned int class:CLASS_BITS + 1;
+ unsigned int isolated:ISOLATED_BITS;
+ unsigned int magic:MAGIC_VAL_BITS;
+ };
+ unsigned int inuse;
+ unsigned int freeobj;
+ struct page *first_page;
+ struct list_head list; /* fullness list */
+#ifdef CONFIG_COMPACTION
+ rwlock_t lock;
+#endif
+};
+
+struct mapping_area {
+ char *vm_buf; /* copy buffer for objects that span pages */
+ char *vm_addr; /* address of kmap_atomic()'ed pages */
+ enum zs_mapmode vm_mm; /* mapping mode */
+};
+
+#ifdef CONFIG_COMPACTION
+static int zs_register_migration(struct zs_pool *pool);
+static void zs_unregister_migration(struct zs_pool *pool);
+static void migrate_lock_init(struct zspage *zspage);
+static void migrate_read_lock(struct zspage *zspage);
+static void migrate_read_unlock(struct zspage *zspage);
+static void kick_deferred_free(struct zs_pool *pool);
+static void init_deferred_free(struct zs_pool *pool);
+static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage);
+#else
+static int zsmalloc_mount(void) { return 0; }
+static void zsmalloc_unmount(void) {}
+static int zs_register_migration(struct zs_pool *pool) { return 0; }
+static void zs_unregister_migration(struct zs_pool *pool) {}
+static void migrate_lock_init(struct zspage *zspage) {}
+static void migrate_read_lock(struct zspage *zspage) {}
+static void migrate_read_unlock(struct zspage *zspage) {}
+static void kick_deferred_free(struct zs_pool *pool) {}
+static void init_deferred_free(struct zs_pool *pool) {}
+static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {}
+#endif
+
+static int create_cache(struct zs_pool *pool)
+{
+ pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
+ 0, 0, NULL);
+ if (!pool->handle_cachep)
+ return 1;
+
+ pool->zspage_cachep = kmem_cache_create("zspage", sizeof(struct zspage),
+ 0, 0, NULL);
+ if (!pool->zspage_cachep) {
+ kmem_cache_destroy(pool->handle_cachep);
+ pool->handle_cachep = NULL;
+ return 1;
+ }
+
+ return 0;
+}
+
+static void destroy_cache(struct zs_pool *pool)
+{
+ kmem_cache_destroy(pool->handle_cachep);
+ kmem_cache_destroy(pool->zspage_cachep);
+}
+
+static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
+{
+ return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
+ gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
+}
+
+static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
+{
+ kmem_cache_free(pool->handle_cachep, (void *)handle);
+}
+
+static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags)
+{
+ return kmem_cache_alloc(pool->zspage_cachep,
+ flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
+}
+
+static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
+{
+ kmem_cache_free(pool->zspage_cachep, zspage);
+}
+
+static void record_obj(unsigned long handle, unsigned long obj)
+{
+ /*
+ * lsb of @obj represents handle lock while other bits
+ * represent object value the handle is pointing so
+ * updating shouldn't do store tearing.
+ */
+ WRITE_ONCE(*(unsigned long *)handle, obj);
+}
+
+/* zpool driver */
+
+#ifdef CONFIG_ZPOOL
+
+static void *zs_zpool_create(const char *name, gfp_t gfp,
+ const struct zpool_ops *zpool_ops,
+ struct zpool *zpool)
+{
+ /*
+ * Ignore global gfp flags: zs_malloc() may be invoked from
+ * different contexts and its caller must provide a valid
+ * gfp mask.
+ */
+ return zs_create_pool(name);
+}
+
+static void zs_zpool_destroy(void *pool)
+{
+ zs_destroy_pool(pool);
+}
+
+static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp,
+ unsigned long *handle)
+{
+ *handle = zs_malloc(pool, size, gfp);
+ return *handle ? 0 : -1;
+}
+static void zs_zpool_free(void *pool, unsigned long handle)
+{
+ zs_free(pool, handle);
+}
+
+static void *zs_zpool_map(void *pool, unsigned long handle,
+ enum zpool_mapmode mm)
+{
+ enum zs_mapmode zs_mm;
+
+ switch (mm) {
+ case ZPOOL_MM_RO:
+ zs_mm = ZS_MM_RO;
+ break;
+ case ZPOOL_MM_WO:
+ zs_mm = ZS_MM_WO;
+ break;
+ case ZPOOL_MM_RW:
+ default:
+ zs_mm = ZS_MM_RW;
+ break;
+ }
+
+ return zs_map_object(pool, handle, zs_mm);
+}
+static void zs_zpool_unmap(void *pool, unsigned long handle)
+{
+ zs_unmap_object(pool, handle);
+}
+
+static u64 zs_zpool_total_size(void *pool)
+{
+ return zs_get_total_pages(pool) << PAGE_SHIFT;
+}
+
+static struct zpool_driver zs_zpool_driver = {
+ .type = "zsmalloc",
+ .owner = THIS_MODULE,
+ .create = zs_zpool_create,
+ .destroy = zs_zpool_destroy,
+ .malloc_support_movable = true,
+ .malloc = zs_zpool_malloc,
+ .free = zs_zpool_free,
+ .map = zs_zpool_map,
+ .unmap = zs_zpool_unmap,
+ .total_size = zs_zpool_total_size,
+};
+
+MODULE_ALIAS("zpool-zsmalloc");
+#endif /* CONFIG_ZPOOL */
+
+/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
+static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
+
+static bool is_zspage_isolated(struct zspage *zspage)
+{
+ return zspage->isolated;
+}
+
+static __maybe_unused int is_first_page(struct page *page)
+{
+ return PagePrivate(page);
+}
+
+/* Protected by class->lock */
+static inline int get_zspage_inuse(struct zspage *zspage)
+{
+ return zspage->inuse;
+}
+
+
+static inline void mod_zspage_inuse(struct zspage *zspage, int val)
+{
+ zspage->inuse += val;
+}
+
+static inline struct page *get_first_page(struct zspage *zspage)
+{
+ struct page *first_page = zspage->first_page;
+
+ VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
+ return first_page;
+}
+
+static inline int get_first_obj_offset(struct page *page)
+{
+ return page->units;
+}
+
+static inline void set_first_obj_offset(struct page *page, int offset)
+{
+ page->units = offset;
+}
+
+static inline unsigned int get_freeobj(struct zspage *zspage)
+{
+ return zspage->freeobj;
+}
+
+static inline void set_freeobj(struct zspage *zspage, unsigned int obj)
+{
+ zspage->freeobj = obj;
+}
+
+static void get_zspage_mapping(struct zspage *zspage,
+ unsigned int *class_idx,
+ enum fullness_group *fullness)
+{
+ BUG_ON(zspage->magic != ZSPAGE_MAGIC);
+
+ *fullness = zspage->fullness;
+ *class_idx = zspage->class;
+}
+
+static void set_zspage_mapping(struct zspage *zspage,
+ unsigned int class_idx,
+ enum fullness_group fullness)
+{
+ zspage->class = class_idx;
+ zspage->fullness = fullness;
+}
+
+/*
+ * zsmalloc divides the pool into various size classes where each
+ * class maintains a list of zspages where each zspage is divided
+ * into equal sized chunks. Each allocation falls into one of these
+ * classes depending on its size. This function returns index of the
+ * size class which has chunk size big enough to hold the give size.
+ */
+static int get_size_class_index(int size)
+{
+ int idx = 0;
+
+ if (likely(size > ZS_MIN_ALLOC_SIZE))
+ idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
+ ZS_SIZE_CLASS_DELTA);
+
+ return min_t(int, ZS_SIZE_CLASSES - 1, idx);
+}
+
+/* type can be of enum type zs_stat_type or fullness_group */
+static inline void zs_stat_inc(struct size_class *class,
+ int type, unsigned long cnt)
+{
+ class->stats.objs[type] += cnt;
+}
+
+/* type can be of enum type zs_stat_type or fullness_group */
+static inline void zs_stat_dec(struct size_class *class,
+ int type, unsigned long cnt)
+{
+ class->stats.objs[type] -= cnt;
+}
+
+/* type can be of enum type zs_stat_type or fullness_group */
+static inline unsigned long zs_stat_get(struct size_class *class,
+ int type)
+{
+ return class->stats.objs[type];
+}
+
+#ifdef CONFIG_ZSMALLOC_STAT
+
+static void __init zs_stat_init(void)
+{
+ if (!debugfs_initialized()) {
+ pr_warn("debugfs not available, stat dir not created\n");
+ return;
+ }
+
+ zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
+}
+
+static void __exit zs_stat_exit(void)
+{
+ debugfs_remove_recursive(zs_stat_root);
+}
+
+static unsigned long zs_can_compact(struct size_class *class);
+
+static int zs_stats_size_show(struct seq_file *s, void *v)
+{
+ int i;
+ struct zs_pool *pool = s->private;
+ struct size_class *class;
+ int objs_per_zspage;
+ unsigned long class_almost_full, class_almost_empty;
+ unsigned long obj_allocated, obj_used, pages_used, freeable;
+ unsigned long total_class_almost_full = 0, total_class_almost_empty = 0;
+ unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
+ unsigned long total_freeable = 0;
+
+ seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s %8s\n",
+ "class", "size", "almost_full", "almost_empty",
+ "obj_allocated", "obj_used", "pages_used",
+ "pages_per_zspage", "freeable");
+
+ for (i = 0; i < ZS_SIZE_CLASSES; i++) {
+ class = pool->size_class[i];
+
+ if (class->index != i)
+ continue;
+
+ spin_lock(&class->lock);
+ class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL);
+ class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY);
+ obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
+ obj_used = zs_stat_get(class, OBJ_USED);
+ freeable = zs_can_compact(class);
+ spin_unlock(&class->lock);
+
+ objs_per_zspage = class->objs_per_zspage;
+ pages_used = obj_allocated / objs_per_zspage *
+ class->pages_per_zspage;
+
+ seq_printf(s, " %5u %5u %11lu %12lu %13lu"
+ " %10lu %10lu %16d %8lu\n",
+ i, class->size, class_almost_full, class_almost_empty,
+ obj_allocated, obj_used, pages_used,
+ class->pages_per_zspage, freeable);
+
+ total_class_almost_full += class_almost_full;
+ total_class_almost_empty += class_almost_empty;
+ total_objs += obj_allocated;
+ total_used_objs += obj_used;
+ total_pages += pages_used;
+ total_freeable += freeable;
+ }
+
+ seq_puts(s, "\n");
+ seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu %16s %8lu\n",
+ "Total", "", total_class_almost_full,
+ total_class_almost_empty, total_objs,
+ total_used_objs, total_pages, "", total_freeable);
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(zs_stats_size);
+
+static void zs_pool_stat_create(struct zs_pool *pool, const char *name)
+{
+ if (!zs_stat_root) {
+ pr_warn("no root stat dir, not creating <%s> stat dir\n", name);
+ return;
+ }
+
+ pool->stat_dentry = debugfs_create_dir(name, zs_stat_root);
+
+ debugfs_create_file("classes", S_IFREG | 0444, pool->stat_dentry, pool,
+ &zs_stats_size_fops);
+}
+
+static void zs_pool_stat_destroy(struct zs_pool *pool)
+{
+ debugfs_remove_recursive(pool->stat_dentry);
+}
+
+#else /* CONFIG_ZSMALLOC_STAT */
+static void __init zs_stat_init(void)
+{
+}
+
+static void __exit zs_stat_exit(void)
+{
+}
+
+static inline void zs_pool_stat_create(struct zs_pool *pool, const char *name)
+{
+}
+
+static inline void zs_pool_stat_destroy(struct zs_pool *pool)
+{
+}
+#endif
+
+
+/*
+ * For each size class, zspages are divided into different groups
+ * depending on how "full" they are. This was done so that we could
+ * easily find empty or nearly empty zspages when we try to shrink
+ * the pool (not yet implemented). This function returns fullness
+ * status of the given page.
+ */
+static enum fullness_group get_fullness_group(struct size_class *class,
+ struct zspage *zspage)
+{
+ int inuse, objs_per_zspage;
+ enum fullness_group fg;
+
+ inuse = get_zspage_inuse(zspage);
+ objs_per_zspage = class->objs_per_zspage;
+
+ if (inuse == 0)
+ fg = ZS_EMPTY;
+ else if (inuse == objs_per_zspage)
+ fg = ZS_FULL;
+ else if (inuse <= 3 * objs_per_zspage / fullness_threshold_frac)
+ fg = ZS_ALMOST_EMPTY;
+ else
+ fg = ZS_ALMOST_FULL;
+
+ return fg;
+}
+
+/*
+ * Each size class maintains various freelists and zspages are assigned
+ * to one of these freelists based on the number of live objects they
+ * have. This functions inserts the given zspage into the freelist
+ * identified by <class, fullness_group>.
+ */
+static void insert_zspage(struct size_class *class,
+ struct zspage *zspage,
+ enum fullness_group fullness)
+{
+ struct zspage *head;
+
+ zs_stat_inc(class, fullness, 1);
+ head = list_first_entry_or_null(&class->fullness_list[fullness],
+ struct zspage, list);
+ /*
+ * We want to see more ZS_FULL pages and less almost empty/full.
+ * Put pages with higher ->inuse first.
+ */
+ if (head) {
+ if (get_zspage_inuse(zspage) < get_zspage_inuse(head)) {
+ list_add(&zspage->list, &head->list);
+ return;
+ }
+ }
+ list_add(&zspage->list, &class->fullness_list[fullness]);
+}
+
+/*
+ * This function removes the given zspage from the freelist identified
+ * by <class, fullness_group>.
+ */
+static void remove_zspage(struct size_class *class,
+ struct zspage *zspage,
+ enum fullness_group fullness)
+{
+ VM_BUG_ON(list_empty(&class->fullness_list[fullness]));
+ VM_BUG_ON(is_zspage_isolated(zspage));
+
+ list_del_init(&zspage->list);
+ zs_stat_dec(class, fullness, 1);
+}
+
+/*
+ * Each size class maintains zspages in different fullness groups depending
+ * on the number of live objects they contain. When allocating or freeing
+ * objects, the fullness status of the page can change, say, from ALMOST_FULL
+ * to ALMOST_EMPTY when freeing an object. This function checks if such
+ * a status change has occurred for the given page and accordingly moves the
+ * page from the freelist of the old fullness group to that of the new
+ * fullness group.
+ */
+static enum fullness_group fix_fullness_group(struct size_class *class,
+ struct zspage *zspage)
+{
+ int class_idx;
+ enum fullness_group currfg, newfg;
+
+ get_zspage_mapping(zspage, &class_idx, &currfg);
+ newfg = get_fullness_group(class, zspage);
+ if (newfg == currfg)
+ goto out;
+
+ if (!is_zspage_isolated(zspage)) {
+ remove_zspage(class, zspage, currfg);
+ insert_zspage(class, zspage, newfg);
+ }
+
+ set_zspage_mapping(zspage, class_idx, newfg);
+
+out:
+ return newfg;
+}
+
+/*
+ * We have to decide on how many pages to link together
+ * to form a zspage for each size class. This is important
+ * to reduce wastage due to unusable space left at end of
+ * each zspage which is given as:
+ * wastage = Zp % class_size
+ * usage = Zp - wastage
+ * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ...
+ *
+ * For example, for size class of 3/8 * PAGE_SIZE, we should
+ * link together 3 PAGE_SIZE sized pages to form a zspage
+ * since then we can perfectly fit in 8 such objects.
+ */
+static int get_pages_per_zspage(int class_size)
+{
+ int i, max_usedpc = 0;
+ /* zspage order which gives maximum used size per KB */
+ int max_usedpc_order = 1;
+
+ for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) {
+ int zspage_size;
+ int waste, usedpc;
+
+ zspage_size = i * PAGE_SIZE;
+ waste = zspage_size % class_size;
+ usedpc = (zspage_size - waste) * 100 / zspage_size;
+
+ if (usedpc > max_usedpc) {
+ max_usedpc = usedpc;
+ max_usedpc_order = i;
+ }
+ }
+
+ return max_usedpc_order;
+}
+
+static struct zspage *get_zspage(struct page *page)
+{
+ struct zspage *zspage = (struct zspage *)page->private;
+
+ BUG_ON(zspage->magic != ZSPAGE_MAGIC);
+ return zspage;
+}
+
+static struct page *get_next_page(struct page *page)
+{
+ if (unlikely(PageHugeObject(page)))
+ return NULL;
+
+ return page->freelist;
+}
+
+/**
+ * obj_to_location - get (<page>, <obj_idx>) from encoded object value
+ * @obj: the encoded object value
+ * @page: page object resides in zspage
+ * @obj_idx: object index
+ */
+static void obj_to_location(unsigned long obj, struct page **page,
+ unsigned int *obj_idx)
+{
+ obj >>= OBJ_TAG_BITS;
+ *page = pfn_to_page(obj >> OBJ_INDEX_BITS);
+ *obj_idx = (obj & OBJ_INDEX_MASK);
+}
+
+/**
+ * location_to_obj - get obj value encoded from (<page>, <obj_idx>)
+ * @page: page object resides in zspage
+ * @obj_idx: object index
+ */
+static unsigned long location_to_obj(struct page *page, unsigned int obj_idx)
+{
+ unsigned long obj;
+
+ obj = page_to_pfn(page) << OBJ_INDEX_BITS;
+ obj |= obj_idx & OBJ_INDEX_MASK;
+ obj <<= OBJ_TAG_BITS;
+
+ return obj;
+}
+
+static unsigned long handle_to_obj(unsigned long handle)
+{
+ return *(unsigned long *)handle;
+}
+
+static unsigned long obj_to_head(struct page *page, void *obj)
+{
+ if (unlikely(PageHugeObject(page))) {
+ VM_BUG_ON_PAGE(!is_first_page(page), page);
+ return page->index;
+ } else
+ return *(unsigned long *)obj;
+}
+
+static inline int testpin_tag(unsigned long handle)
+{
+ return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle);
+}
+
+static inline int trypin_tag(unsigned long handle)
+{
+ return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
+}
+
+static void pin_tag(unsigned long handle) __acquires(bitlock)
+{
+ bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
+}
+
+static void unpin_tag(unsigned long handle) __releases(bitlock)
+{
+ bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle);
+}
+
+static void reset_page(struct page *page)
+{
+ __ClearPageMovable(page);
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ page_mapcount_reset(page);
+ ClearPageHugeObject(page);
+ page->freelist = NULL;
+}
+
+static int trylock_zspage(struct zspage *zspage)
+{
+ struct page *cursor, *fail;
+
+ for (cursor = get_first_page(zspage); cursor != NULL; cursor =
+ get_next_page(cursor)) {
+ if (!trylock_page(cursor)) {
+ fail = cursor;
+ goto unlock;
+ }
+ }
+
+ return 1;
+unlock:
+ for (cursor = get_first_page(zspage); cursor != fail; cursor =
+ get_next_page(cursor))
+ unlock_page(cursor);
+
+ return 0;
+}
+
+static void __free_zspage(struct zs_pool *pool, struct size_class *class,
+ struct zspage *zspage)
+{
+ struct page *page, *next;
+ enum fullness_group fg;
+ unsigned int class_idx;
+
+ get_zspage_mapping(zspage, &class_idx, &fg);
+
+ assert_spin_locked(&class->lock);
+
+ VM_BUG_ON(get_zspage_inuse(zspage));
+ VM_BUG_ON(fg != ZS_EMPTY);
+
+ next = page = get_first_page(zspage);
+ do {
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ next = get_next_page(page);
+ reset_page(page);
+ unlock_page(page);
+ dec_zone_page_state(page, NR_ZSPAGES);
+ put_page(page);
+ page = next;
+ } while (page != NULL);
+
+ cache_free_zspage(pool, zspage);
+
+ zs_stat_dec(class, OBJ_ALLOCATED, class->objs_per_zspage);
+ atomic_long_sub(class->pages_per_zspage,
+ &pool->pages_allocated);
+}
+
+static void free_zspage(struct zs_pool *pool, struct size_class *class,
+ struct zspage *zspage)
+{
+ VM_BUG_ON(get_zspage_inuse(zspage));
+ VM_BUG_ON(list_empty(&zspage->list));
+
+ if (!trylock_zspage(zspage)) {
+ kick_deferred_free(pool);
+ return;
+ }
+
+ remove_zspage(class, zspage, ZS_EMPTY);
+ __free_zspage(pool, class, zspage);
+}
+
+/* Initialize a newly allocated zspage */
+static void init_zspage(struct size_class *class, struct zspage *zspage)
+{
+ unsigned int freeobj = 1;
+ unsigned long off = 0;
+ struct page *page = get_first_page(zspage);
+
+ while (page) {
+ struct page *next_page;
+ struct link_free *link;
+ void *vaddr;
+
+ set_first_obj_offset(page, off);
+
+ vaddr = kmap_atomic(page);
+ link = (struct link_free *)vaddr + off / sizeof(*link);
+
+ while ((off += class->size) < PAGE_SIZE) {
+ link->next = freeobj++ << OBJ_TAG_BITS;
+ link += class->size / sizeof(*link);
+ }
+
+ /*
+ * We now come to the last (full or partial) object on this
+ * page, which must point to the first object on the next
+ * page (if present)
+ */
+ next_page = get_next_page(page);
+ if (next_page) {
+ link->next = freeobj++ << OBJ_TAG_BITS;
+ } else {
+ /*
+ * Reset OBJ_TAG_BITS bit to last link to tell
+ * whether it's allocated object or not.
+ */
+ link->next = -1UL << OBJ_TAG_BITS;
+ }
+ kunmap_atomic(vaddr);
+ page = next_page;
+ off %= PAGE_SIZE;
+ }
+
+ set_freeobj(zspage, 0);
+}
+
+static void create_page_chain(struct size_class *class, struct zspage *zspage,
+ struct page *pages[])
+{
+ int i;
+ struct page *page;
+ struct page *prev_page = NULL;
+ int nr_pages = class->pages_per_zspage;
+
+ /*
+ * Allocate individual pages and link them together as:
+ * 1. all pages are linked together using page->freelist
+ * 2. each sub-page point to zspage using page->private
+ *
+ * we set PG_private to identify the first page (i.e. no other sub-page
+ * has this flag set).
+ */
+ for (i = 0; i < nr_pages; i++) {
+ page = pages[i];
+ set_page_private(page, (unsigned long)zspage);
+ page->freelist = NULL;
+ if (i == 0) {
+ zspage->first_page = page;
+ SetPagePrivate(page);
+ if (unlikely(class->objs_per_zspage == 1 &&
+ class->pages_per_zspage == 1))
+ SetPageHugeObject(page);
+ } else {
+ prev_page->freelist = page;
+ }
+ prev_page = page;
+ }
+}
+
+/*
+ * Allocate a zspage for the given size class
+ */
+static struct zspage *alloc_zspage(struct zs_pool *pool,
+ struct size_class *class,
+ gfp_t gfp)
+{
+ int i;
+ struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE];
+ struct zspage *zspage = cache_alloc_zspage(pool, gfp);
+
+ if (!zspage)
+ return NULL;
+
+ memset(zspage, 0, sizeof(struct zspage));
+ zspage->magic = ZSPAGE_MAGIC;
+ migrate_lock_init(zspage);
+
+ for (i = 0; i < class->pages_per_zspage; i++) {
+ struct page *page;
+
+ page = alloc_page(gfp);
+ if (!page) {
+ while (--i >= 0) {
+ dec_zone_page_state(pages[i], NR_ZSPAGES);
+ __free_page(pages[i]);
+ }
+ cache_free_zspage(pool, zspage);
+ return NULL;
+ }
+
+ inc_zone_page_state(page, NR_ZSPAGES);
+ pages[i] = page;
+ }
+
+ create_page_chain(class, zspage, pages);
+ init_zspage(class, zspage);
+
+ return zspage;
+}
+
+static struct zspage *find_get_zspage(struct size_class *class)
+{
+ int i;
+ struct zspage *zspage;
+
+ for (i = ZS_ALMOST_FULL; i >= ZS_EMPTY; i--) {
+ zspage = list_first_entry_or_null(&class->fullness_list[i],
+ struct zspage, list);
+ if (zspage)
+ break;
+ }
+
+ return zspage;
+}
+
+static inline int __zs_cpu_up(struct mapping_area *area)
+{
+ /*
+ * Make sure we don't leak memory if a cpu UP notification
+ * and zs_init() race and both call zs_cpu_up() on the same cpu
+ */
+ if (area->vm_buf)
+ return 0;
+ area->vm_buf = kmalloc(ZS_MAX_ALLOC_SIZE, GFP_KERNEL);
+ if (!area->vm_buf)
+ return -ENOMEM;
+ return 0;
+}
+
+static inline void __zs_cpu_down(struct mapping_area *area)
+{
+ kfree(area->vm_buf);
+ area->vm_buf = NULL;
+}
+
+static void *__zs_map_object(struct mapping_area *area,
+ struct page *pages[2], int off, int size)
+{
+ int sizes[2];
+ void *addr;
+ char *buf = area->vm_buf;
+
+ /* disable page faults to match kmap_atomic() return conditions */
+ pagefault_disable();
+
+ /* no read fastpath */
+ if (area->vm_mm == ZS_MM_WO)
+ goto out;
+
+ sizes[0] = PAGE_SIZE - off;
+ sizes[1] = size - sizes[0];
+
+ /* copy object to per-cpu buffer */
+ addr = kmap_atomic(pages[0]);
+ memcpy(buf, addr + off, sizes[0]);
+ kunmap_atomic(addr);
+ addr = kmap_atomic(pages[1]);
+ memcpy(buf + sizes[0], addr, sizes[1]);
+ kunmap_atomic(addr);
+out:
+ return area->vm_buf;
+}
+
+static void __zs_unmap_object(struct mapping_area *area,
+ struct page *pages[2], int off, int size)
+{
+ int sizes[2];
+ void *addr;
+ char *buf;
+
+ /* no write fastpath */
+ if (area->vm_mm == ZS_MM_RO)
+ goto out;
+
+ buf = area->vm_buf;
+ buf = buf + ZS_HANDLE_SIZE;
+ size -= ZS_HANDLE_SIZE;
+ off += ZS_HANDLE_SIZE;
+
+ sizes[0] = PAGE_SIZE - off;
+ sizes[1] = size - sizes[0];
+
+ /* copy per-cpu buffer to object */
+ addr = kmap_atomic(pages[0]);
+ memcpy(addr + off, buf, sizes[0]);
+ kunmap_atomic(addr);
+ addr = kmap_atomic(pages[1]);
+ memcpy(addr, buf + sizes[0], sizes[1]);
+ kunmap_atomic(addr);
+
+out:
+ /* enable page faults to match kunmap_atomic() return conditions */
+ pagefault_enable();
+}
+
+static int zs_cpu_prepare(unsigned int cpu)
+{
+ struct mapping_area *area;
+
+ area = &per_cpu(zs_map_area, cpu);
+ return __zs_cpu_up(area);
+}
+
+static int zs_cpu_dead(unsigned int cpu)
+{
+ struct mapping_area *area;
+
+ area = &per_cpu(zs_map_area, cpu);
+ __zs_cpu_down(area);
+ return 0;
+}
+
+static bool can_merge(struct size_class *prev, int pages_per_zspage,
+ int objs_per_zspage)
+{
+ if (prev->pages_per_zspage == pages_per_zspage &&
+ prev->objs_per_zspage == objs_per_zspage)
+ return true;
+
+ return false;
+}
+
+static bool zspage_full(struct size_class *class, struct zspage *zspage)
+{
+ return get_zspage_inuse(zspage) == class->objs_per_zspage;
+}
+
+unsigned long zs_get_total_pages(struct zs_pool *pool)
+{
+ return atomic_long_read(&pool->pages_allocated);
+}
+EXPORT_SYMBOL_GPL(zs_get_total_pages);
+
+/**
+ * zs_map_object - get address of allocated object from handle.
+ * @pool: pool from which the object was allocated
+ * @handle: handle returned from zs_malloc
+ * @mm: maping mode to use
+ *
+ * Before using an object allocated from zs_malloc, it must be mapped using
+ * this function. When done with the object, it must be unmapped using
+ * zs_unmap_object.
+ *
+ * Only one object can be mapped per cpu at a time. There is no protection
+ * against nested mappings.
+ *
+ * This function returns with preemption and page faults disabled.
+ */
+void *zs_map_object(struct zs_pool *pool, unsigned long handle,
+ enum zs_mapmode mm)
+{
+ struct zspage *zspage;
+ struct page *page;
+ unsigned long obj, off;
+ unsigned int obj_idx;
+
+ unsigned int class_idx;
+ enum fullness_group fg;
+ struct size_class *class;
+ struct mapping_area *area;
+ struct page *pages[2];
+ void *ret;
+
+ /*
+ * Because we use per-cpu mapping areas shared among the
+ * pools/users, we can't allow mapping in interrupt context
+ * because it can corrupt another users mappings.
+ */
+ BUG_ON(in_interrupt());
+
+ /* From now on, migration cannot move the object */
+ pin_tag(handle);
+
+ obj = handle_to_obj(handle);
+ obj_to_location(obj, &page, &obj_idx);
+ zspage = get_zspage(page);
+
+ /* migration cannot move any subpage in this zspage */
+ migrate_read_lock(zspage);
+
+ get_zspage_mapping(zspage, &class_idx, &fg);
+ class = pool->size_class[class_idx];
+ off = (class->size * obj_idx) & ~PAGE_MASK;
+
+ area = &get_cpu_var(zs_map_area);
+ area->vm_mm = mm;
+ if (off + class->size <= PAGE_SIZE) {
+ /* this object is contained entirely within a page */
+ area->vm_addr = kmap_atomic(page);
+ ret = area->vm_addr + off;
+ goto out;
+ }
+
+ /* this object spans two pages */
+ pages[0] = page;
+ pages[1] = get_next_page(page);
+ BUG_ON(!pages[1]);
+
+ ret = __zs_map_object(area, pages, off, class->size);
+out:
+ if (likely(!PageHugeObject(page)))
+ ret += ZS_HANDLE_SIZE;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(zs_map_object);
+
+void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
+{
+ struct zspage *zspage;
+ struct page *page;
+ unsigned long obj, off;
+ unsigned int obj_idx;
+
+ unsigned int class_idx;
+ enum fullness_group fg;
+ struct size_class *class;
+ struct mapping_area *area;
+
+ obj = handle_to_obj(handle);
+ obj_to_location(obj, &page, &obj_idx);
+ zspage = get_zspage(page);
+ get_zspage_mapping(zspage, &class_idx, &fg);
+ class = pool->size_class[class_idx];
+ off = (class->size * obj_idx) & ~PAGE_MASK;
+
+ area = this_cpu_ptr(&zs_map_area);
+ if (off + class->size <= PAGE_SIZE)
+ kunmap_atomic(area->vm_addr);
+ else {
+ struct page *pages[2];
+
+ pages[0] = page;
+ pages[1] = get_next_page(page);
+ BUG_ON(!pages[1]);
+
+ __zs_unmap_object(area, pages, off, class->size);
+ }
+ put_cpu_var(zs_map_area);
+
+ migrate_read_unlock(zspage);
+ unpin_tag(handle);
+}
+EXPORT_SYMBOL_GPL(zs_unmap_object);
+
+/**
+ * zs_huge_class_size() - Returns the size (in bytes) of the first huge
+ * zsmalloc &size_class.
+ * @pool: zsmalloc pool to use
+ *
+ * The function returns the size of the first huge class - any object of equal
+ * or bigger size will be stored in zspage consisting of a single physical
+ * page.
+ *
+ * Context: Any context.
+ *
+ * Return: the size (in bytes) of the first huge zsmalloc &size_class.
+ */
+size_t zs_huge_class_size(struct zs_pool *pool)
+{
+ return huge_class_size;
+}
+EXPORT_SYMBOL_GPL(zs_huge_class_size);
+
+static unsigned long obj_malloc(struct size_class *class,
+ struct zspage *zspage, unsigned long handle)
+{
+ int i, nr_page, offset;
+ unsigned long obj;
+ struct link_free *link;
+
+ struct page *m_page;
+ unsigned long m_offset;
+ void *vaddr;
+
+ handle |= OBJ_ALLOCATED_TAG;
+ obj = get_freeobj(zspage);
+
+ offset = obj * class->size;
+ nr_page = offset >> PAGE_SHIFT;
+ m_offset = offset & ~PAGE_MASK;
+ m_page = get_first_page(zspage);
+
+ for (i = 0; i < nr_page; i++)
+ m_page = get_next_page(m_page);
+
+ vaddr = kmap_atomic(m_page);
+ link = (struct link_free *)vaddr + m_offset / sizeof(*link);
+ set_freeobj(zspage, link->next >> OBJ_TAG_BITS);
+ if (likely(!PageHugeObject(m_page)))
+ /* record handle in the header of allocated chunk */
+ link->handle = handle;
+ else
+ /* record handle to page->index */
+ zspage->first_page->index = handle;
+
+ kunmap_atomic(vaddr);
+ mod_zspage_inuse(zspage, 1);
+ zs_stat_inc(class, OBJ_USED, 1);
+
+ obj = location_to_obj(m_page, obj);
+
+ return obj;
+}
+
+
+/**
+ * zs_malloc - Allocate block of given size from pool.
+ * @pool: pool to allocate from
+ * @size: size of block to allocate
+ * @gfp: gfp flags when allocating object
+ *
+ * On success, handle to the allocated object is returned,
+ * otherwise 0.
+ * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
+ */
+unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
+{
+ unsigned long handle, obj;
+ struct size_class *class;
+ enum fullness_group newfg;
+ struct zspage *zspage;
+
+ if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
+ return 0;
+
+ handle = cache_alloc_handle(pool, gfp);
+ if (!handle)
+ return 0;
+
+ /* extra space in chunk to keep the handle */
+ size += ZS_HANDLE_SIZE;
+ class = pool->size_class[get_size_class_index(size)];
+
+ spin_lock(&class->lock);
+ zspage = find_get_zspage(class);
+ if (likely(zspage)) {
+ obj = obj_malloc(class, zspage, handle);
+ /* Now move the zspage to another fullness group, if required */
+ fix_fullness_group(class, zspage);
+ record_obj(handle, obj);
+ spin_unlock(&class->lock);
+
+ return handle;
+ }
+
+ spin_unlock(&class->lock);
+
+ zspage = alloc_zspage(pool, class, gfp);
+ if (!zspage) {
+ cache_free_handle(pool, handle);
+ return 0;
+ }
+
+ spin_lock(&class->lock);
+ obj = obj_malloc(class, zspage, handle);
+ newfg = get_fullness_group(class, zspage);
+ insert_zspage(class, zspage, newfg);
+ set_zspage_mapping(zspage, class->index, newfg);
+ record_obj(handle, obj);
+ atomic_long_add(class->pages_per_zspage,
+ &pool->pages_allocated);
+ zs_stat_inc(class, OBJ_ALLOCATED, class->objs_per_zspage);
+
+ /* We completely set up zspage so mark them as movable */
+ SetZsPageMovable(pool, zspage);
+ spin_unlock(&class->lock);
+
+ return handle;
+}
+EXPORT_SYMBOL_GPL(zs_malloc);
+
+static void obj_free(struct size_class *class, unsigned long obj)
+{
+ struct link_free *link;
+ struct zspage *zspage;
+ struct page *f_page;
+ unsigned long f_offset;
+ unsigned int f_objidx;
+ void *vaddr;
+
+ obj &= ~OBJ_ALLOCATED_TAG;
+ obj_to_location(obj, &f_page, &f_objidx);
+ f_offset = (class->size * f_objidx) & ~PAGE_MASK;
+ zspage = get_zspage(f_page);
+
+ vaddr = kmap_atomic(f_page);
+
+ /* Insert this object in containing zspage's freelist */
+ link = (struct link_free *)(vaddr + f_offset);
+ link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
+ kunmap_atomic(vaddr);
+ set_freeobj(zspage, f_objidx);
+ mod_zspage_inuse(zspage, -1);
+ zs_stat_dec(class, OBJ_USED, 1);
+}
+
+void zs_free(struct zs_pool *pool, unsigned long handle)
+{
+ struct zspage *zspage;
+ struct page *f_page;
+ unsigned long obj;
+ unsigned int f_objidx;
+ int class_idx;
+ struct size_class *class;
+ enum fullness_group fullness;
+ bool isolated;
+
+ if (unlikely(!handle))
+ return;
+
+ pin_tag(handle);
+ obj = handle_to_obj(handle);
+ obj_to_location(obj, &f_page, &f_objidx);
+ zspage = get_zspage(f_page);
+
+ migrate_read_lock(zspage);
+
+ get_zspage_mapping(zspage, &class_idx, &fullness);
+ class = pool->size_class[class_idx];
+
+ spin_lock(&class->lock);
+ obj_free(class, obj);
+ fullness = fix_fullness_group(class, zspage);
+ if (fullness != ZS_EMPTY) {
+ migrate_read_unlock(zspage);
+ goto out;
+ }
+
+ isolated = is_zspage_isolated(zspage);
+ migrate_read_unlock(zspage);
+ /* If zspage is isolated, zs_page_putback will free the zspage */
+ if (likely(!isolated))
+ free_zspage(pool, class, zspage);
+out:
+
+ spin_unlock(&class->lock);
+ unpin_tag(handle);
+ cache_free_handle(pool, handle);
+}
+EXPORT_SYMBOL_GPL(zs_free);
+
+static void zs_object_copy(struct size_class *class, unsigned long dst,
+ unsigned long src)
+{
+ struct page *s_page, *d_page;
+ unsigned int s_objidx, d_objidx;
+ unsigned long s_off, d_off;
+ void *s_addr, *d_addr;
+ int s_size, d_size, size;
+ int written = 0;
+
+ s_size = d_size = class->size;
+
+ obj_to_location(src, &s_page, &s_objidx);
+ obj_to_location(dst, &d_page, &d_objidx);
+
+ s_off = (class->size * s_objidx) & ~PAGE_MASK;
+ d_off = (class->size * d_objidx) & ~PAGE_MASK;
+
+ if (s_off + class->size > PAGE_SIZE)
+ s_size = PAGE_SIZE - s_off;
+
+ if (d_off + class->size > PAGE_SIZE)
+ d_size = PAGE_SIZE - d_off;
+
+ s_addr = kmap_atomic(s_page);
+ d_addr = kmap_atomic(d_page);
+
+ while (1) {
+ size = min(s_size, d_size);
+ memcpy(d_addr + d_off, s_addr + s_off, size);
+ written += size;
+
+ if (written == class->size)
+ break;
+
+ s_off += size;
+ s_size -= size;
+ d_off += size;
+ d_size -= size;
+
+ if (s_off >= PAGE_SIZE) {
+ kunmap_atomic(d_addr);
+ kunmap_atomic(s_addr);
+ s_page = get_next_page(s_page);
+ s_addr = kmap_atomic(s_page);
+ d_addr = kmap_atomic(d_page);
+ s_size = class->size - written;
+ s_off = 0;
+ }
+
+ if (d_off >= PAGE_SIZE) {
+ kunmap_atomic(d_addr);
+ d_page = get_next_page(d_page);
+ d_addr = kmap_atomic(d_page);
+ d_size = class->size - written;
+ d_off = 0;
+ }
+ }
+
+ kunmap_atomic(d_addr);
+ kunmap_atomic(s_addr);
+}
+
+/*
+ * Find alloced object in zspage from index object and
+ * return handle.
+ */
+static unsigned long find_alloced_obj(struct size_class *class,
+ struct page *page, int *obj_idx)
+{
+ unsigned long head;
+ int offset = 0;
+ int index = *obj_idx;
+ unsigned long handle = 0;
+ void *addr = kmap_atomic(page);
+
+ offset = get_first_obj_offset(page);
+ offset += class->size * index;
+
+ while (offset < PAGE_SIZE) {
+ head = obj_to_head(page, addr + offset);
+ if (head & OBJ_ALLOCATED_TAG) {
+ handle = head & ~OBJ_ALLOCATED_TAG;
+ if (trypin_tag(handle))
+ break;
+ handle = 0;
+ }
+
+ offset += class->size;
+ index++;
+ }
+
+ kunmap_atomic(addr);
+
+ *obj_idx = index;
+
+ return handle;
+}
+
+struct zs_compact_control {
+ /* Source spage for migration which could be a subpage of zspage */
+ struct page *s_page;
+ /* Destination page for migration which should be a first page
+ * of zspage. */
+ struct page *d_page;
+ /* Starting object index within @s_page which used for live object
+ * in the subpage. */
+ int obj_idx;
+};
+
+static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
+ struct zs_compact_control *cc)
+{
+ unsigned long used_obj, free_obj;
+ unsigned long handle;
+ struct page *s_page = cc->s_page;
+ struct page *d_page = cc->d_page;
+ int obj_idx = cc->obj_idx;
+ int ret = 0;
+
+ while (1) {
+ handle = find_alloced_obj(class, s_page, &obj_idx);
+ if (!handle) {
+ s_page = get_next_page(s_page);
+ if (!s_page)
+ break;
+ obj_idx = 0;
+ continue;
+ }
+
+ /* Stop if there is no more space */
+ if (zspage_full(class, get_zspage(d_page))) {
+ unpin_tag(handle);
+ ret = -ENOMEM;
+ break;
+ }
+
+ used_obj = handle_to_obj(handle);
+ free_obj = obj_malloc(class, get_zspage(d_page), handle);
+ zs_object_copy(class, free_obj, used_obj);
+ obj_idx++;
+ /*
+ * record_obj updates handle's value to free_obj and it will
+ * invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which
+ * breaks synchronization using pin_tag(e,g, zs_free) so
+ * let's keep the lock bit.
+ */
+ free_obj |= BIT(HANDLE_PIN_BIT);
+ record_obj(handle, free_obj);
+ unpin_tag(handle);
+ obj_free(class, used_obj);
+ }
+
+ /* Remember last position in this iteration */
+ cc->s_page = s_page;
+ cc->obj_idx = obj_idx;
+
+ return ret;
+}
+
+static struct zspage *isolate_zspage(struct size_class *class, bool source)
+{
+ int i;
+ struct zspage *zspage;
+ enum fullness_group fg[2] = {ZS_ALMOST_EMPTY, ZS_ALMOST_FULL};
+
+ if (!source) {
+ fg[0] = ZS_ALMOST_FULL;
+ fg[1] = ZS_ALMOST_EMPTY;
+ }
+
+ for (i = 0; i < 2; i++) {
+ zspage = list_first_entry_or_null(&class->fullness_list[fg[i]],
+ struct zspage, list);
+ if (zspage) {
+ VM_BUG_ON(is_zspage_isolated(zspage));
+ remove_zspage(class, zspage, fg[i]);
+ return zspage;
+ }
+ }
+
+ return zspage;
+}
+
+/*
+ * putback_zspage - add @zspage into right class's fullness list
+ * @class: destination class
+ * @zspage: target page
+ *
+ * Return @zspage's fullness_group
+ */
+static enum fullness_group putback_zspage(struct size_class *class,
+ struct zspage *zspage)
+{
+ enum fullness_group fullness;
+
+ VM_BUG_ON(is_zspage_isolated(zspage));
+
+ fullness = get_fullness_group(class, zspage);
+ insert_zspage(class, zspage, fullness);
+ set_zspage_mapping(zspage, class->index, fullness);
+
+ return fullness;
+}
+
+#ifdef CONFIG_COMPACTION
+/*
+ * To prevent zspage destroy during migration, zspage freeing should
+ * hold locks of all pages in the zspage.
+ */
+static void lock_zspage(struct zspage *zspage)
+{
+ struct page *curr_page, *page;
+
+ /*
+ * Pages we haven't locked yet can be migrated off the list while we're
+ * trying to lock them, so we need to be careful and only attempt to
+ * lock each page under migrate_read_lock(). Otherwise, the page we lock
+ * may no longer belong to the zspage. This means that we may wait for
+ * the wrong page to unlock, so we must take a reference to the page
+ * prior to waiting for it to unlock outside migrate_read_lock().
+ */
+ while (1) {
+ migrate_read_lock(zspage);
+ page = get_first_page(zspage);
+ if (trylock_page(page))
+ break;
+ get_page(page);
+ migrate_read_unlock(zspage);
+ wait_on_page_locked(page);
+ put_page(page);
+ }
+
+ curr_page = page;
+ while ((page = get_next_page(curr_page))) {
+ if (trylock_page(page)) {
+ curr_page = page;
+ } else {
+ get_page(page);
+ migrate_read_unlock(zspage);
+ wait_on_page_locked(page);
+ put_page(page);
+ migrate_read_lock(zspage);
+ }
+ }
+ migrate_read_unlock(zspage);
+}
+
+static int zs_init_fs_context(struct fs_context *fc)
+{
+ return init_pseudo(fc, ZSMALLOC_MAGIC) ? 0 : -ENOMEM;
+}
+
+static struct file_system_type zsmalloc_fs = {
+ .name = "zsmalloc",
+ .init_fs_context = zs_init_fs_context,
+ .kill_sb = kill_anon_super,
+};
+
+static int zsmalloc_mount(void)
+{
+ int ret = 0;
+
+ zsmalloc_mnt = kern_mount(&zsmalloc_fs);
+ if (IS_ERR(zsmalloc_mnt))
+ ret = PTR_ERR(zsmalloc_mnt);
+
+ return ret;
+}
+
+static void zsmalloc_unmount(void)
+{
+ kern_unmount(zsmalloc_mnt);
+}
+
+static void migrate_lock_init(struct zspage *zspage)
+{
+ rwlock_init(&zspage->lock);
+}
+
+static void migrate_read_lock(struct zspage *zspage) __acquires(&zspage->lock)
+{
+ read_lock(&zspage->lock);
+}
+
+static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock)
+{
+ read_unlock(&zspage->lock);
+}
+
+static void migrate_write_lock(struct zspage *zspage)
+{
+ write_lock(&zspage->lock);
+}
+
+static void migrate_write_unlock(struct zspage *zspage)
+{
+ write_unlock(&zspage->lock);
+}
+
+/* Number of isolated subpage for *page migration* in this zspage */
+static void inc_zspage_isolation(struct zspage *zspage)
+{
+ zspage->isolated++;
+}
+
+static void dec_zspage_isolation(struct zspage *zspage)
+{
+ zspage->isolated--;
+}
+
+static void putback_zspage_deferred(struct zs_pool *pool,
+ struct size_class *class,
+ struct zspage *zspage)
+{
+ enum fullness_group fg;
+
+ fg = putback_zspage(class, zspage);
+ if (fg == ZS_EMPTY)
+ schedule_work(&pool->free_work);
+
+}
+
+static inline void zs_pool_dec_isolated(struct zs_pool *pool)
+{
+ VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0);
+ atomic_long_dec(&pool->isolated_pages);
+ /*
+ * Checking pool->destroying must happen after atomic_long_dec()
+ * for pool->isolated_pages above. Paired with the smp_mb() in
+ * zs_unregister_migration().
+ */
+ smp_mb__after_atomic();
+ if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying)
+ wake_up_all(&pool->migration_wait);
+}
+
+static void replace_sub_page(struct size_class *class, struct zspage *zspage,
+ struct page *newpage, struct page *oldpage)
+{
+ struct page *page;
+ struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE] = {NULL, };
+ int idx = 0;
+
+ page = get_first_page(zspage);
+ do {
+ if (page == oldpage)
+ pages[idx] = newpage;
+ else
+ pages[idx] = page;
+ idx++;
+ } while ((page = get_next_page(page)) != NULL);
+
+ create_page_chain(class, zspage, pages);
+ set_first_obj_offset(newpage, get_first_obj_offset(oldpage));
+ if (unlikely(PageHugeObject(oldpage)))
+ newpage->index = oldpage->index;
+ __SetPageMovable(newpage, page_mapping(oldpage));
+}
+
+static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
+{
+ struct zs_pool *pool;
+ struct size_class *class;
+ int class_idx;
+ enum fullness_group fullness;
+ struct zspage *zspage;
+ struct address_space *mapping;
+
+ /*
+ * Page is locked so zspage couldn't be destroyed. For detail, look at
+ * lock_zspage in free_zspage.
+ */
+ VM_BUG_ON_PAGE(!PageMovable(page), page);
+ VM_BUG_ON_PAGE(PageIsolated(page), page);
+
+ zspage = get_zspage(page);
+
+ /*
+ * Without class lock, fullness could be stale while class_idx is okay
+ * because class_idx is constant unless page is freed so we should get
+ * fullness again under class lock.
+ */
+ get_zspage_mapping(zspage, &class_idx, &fullness);
+ mapping = page_mapping(page);
+ pool = mapping->private_data;
+ class = pool->size_class[class_idx];
+
+ spin_lock(&class->lock);
+ if (get_zspage_inuse(zspage) == 0) {
+ spin_unlock(&class->lock);
+ return false;
+ }
+
+ /* zspage is isolated for object migration */
+ if (list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
+ spin_unlock(&class->lock);
+ return false;
+ }
+
+ /*
+ * If this is first time isolation for the zspage, isolate zspage from
+ * size_class to prevent further object allocation from the zspage.
+ */
+ if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
+ get_zspage_mapping(zspage, &class_idx, &fullness);
+ atomic_long_inc(&pool->isolated_pages);
+ remove_zspage(class, zspage, fullness);
+ }
+
+ inc_zspage_isolation(zspage);
+ spin_unlock(&class->lock);
+
+ return true;
+}
+
+static int zs_page_migrate(struct address_space *mapping, struct page *newpage,
+ struct page *page, enum migrate_mode mode)
+{
+ struct zs_pool *pool;
+ struct size_class *class;
+ int class_idx;
+ enum fullness_group fullness;
+ struct zspage *zspage;
+ struct page *dummy;
+ void *s_addr, *d_addr, *addr;
+ int offset, pos;
+ unsigned long handle, head;
+ unsigned long old_obj, new_obj;
+ unsigned int obj_idx;
+ int ret = -EAGAIN;
+
+ /*
+ * We cannot support the _NO_COPY case here, because copy needs to
+ * happen under the zs lock, which does not work with
+ * MIGRATE_SYNC_NO_COPY workflow.
+ */
+ if (mode == MIGRATE_SYNC_NO_COPY)
+ return -EINVAL;
+
+ VM_BUG_ON_PAGE(!PageMovable(page), page);
+ VM_BUG_ON_PAGE(!PageIsolated(page), page);
+
+ zspage = get_zspage(page);
+
+ /* Concurrent compactor cannot migrate any subpage in zspage */
+ migrate_write_lock(zspage);
+ get_zspage_mapping(zspage, &class_idx, &fullness);
+ pool = mapping->private_data;
+ class = pool->size_class[class_idx];
+ offset = get_first_obj_offset(page);
+
+ spin_lock(&class->lock);
+ if (!get_zspage_inuse(zspage)) {
+ /*
+ * Set "offset" to end of the page so that every loops
+ * skips unnecessary object scanning.
+ */
+ offset = PAGE_SIZE;
+ }
+
+ pos = offset;
+ s_addr = kmap_atomic(page);
+ while (pos < PAGE_SIZE) {
+ head = obj_to_head(page, s_addr + pos);
+ if (head & OBJ_ALLOCATED_TAG) {
+ handle = head & ~OBJ_ALLOCATED_TAG;
+ if (!trypin_tag(handle))
+ goto unpin_objects;
+ }
+ pos += class->size;
+ }
+
+ /*
+ * Here, any user cannot access all objects in the zspage so let's move.
+ */
+ d_addr = kmap_atomic(newpage);
+ memcpy(d_addr, s_addr, PAGE_SIZE);
+ kunmap_atomic(d_addr);
+
+ for (addr = s_addr + offset; addr < s_addr + pos;
+ addr += class->size) {
+ head = obj_to_head(page, addr);
+ if (head & OBJ_ALLOCATED_TAG) {
+ handle = head & ~OBJ_ALLOCATED_TAG;
+ if (!testpin_tag(handle))
+ BUG();
+
+ old_obj = handle_to_obj(handle);
+ obj_to_location(old_obj, &dummy, &obj_idx);
+ new_obj = (unsigned long)location_to_obj(newpage,
+ obj_idx);
+ new_obj |= BIT(HANDLE_PIN_BIT);
+ record_obj(handle, new_obj);
+ }
+ }
+
+ replace_sub_page(class, zspage, newpage, page);
+ get_page(newpage);
+
+ dec_zspage_isolation(zspage);
+
+ /*
+ * Page migration is done so let's putback isolated zspage to
+ * the list if @page is final isolated subpage in the zspage.
+ */
+ if (!is_zspage_isolated(zspage)) {
+ /*
+ * We cannot race with zs_destroy_pool() here because we wait
+ * for isolation to hit zero before we start destroying.
+ * Also, we ensure that everyone can see pool->destroying before
+ * we start waiting.
+ */
+ putback_zspage_deferred(pool, class, zspage);
+ zs_pool_dec_isolated(pool);
+ }
+
+ if (page_zone(newpage) != page_zone(page)) {
+ dec_zone_page_state(page, NR_ZSPAGES);
+ inc_zone_page_state(newpage, NR_ZSPAGES);
+ }
+
+ reset_page(page);
+ put_page(page);
+ page = newpage;
+
+ ret = MIGRATEPAGE_SUCCESS;
+unpin_objects:
+ for (addr = s_addr + offset; addr < s_addr + pos;
+ addr += class->size) {
+ head = obj_to_head(page, addr);
+ if (head & OBJ_ALLOCATED_TAG) {
+ handle = head & ~OBJ_ALLOCATED_TAG;
+ if (!testpin_tag(handle))
+ BUG();
+ unpin_tag(handle);
+ }
+ }
+ kunmap_atomic(s_addr);
+ spin_unlock(&class->lock);
+ migrate_write_unlock(zspage);
+
+ return ret;
+}
+
+static void zs_page_putback(struct page *page)
+{
+ struct zs_pool *pool;
+ struct size_class *class;
+ int class_idx;
+ enum fullness_group fg;
+ struct address_space *mapping;
+ struct zspage *zspage;
+
+ VM_BUG_ON_PAGE(!PageMovable(page), page);
+ VM_BUG_ON_PAGE(!PageIsolated(page), page);
+
+ zspage = get_zspage(page);
+ get_zspage_mapping(zspage, &class_idx, &fg);
+ mapping = page_mapping(page);
+ pool = mapping->private_data;
+ class = pool->size_class[class_idx];
+
+ spin_lock(&class->lock);
+ dec_zspage_isolation(zspage);
+ if (!is_zspage_isolated(zspage)) {
+ /*
+ * Due to page_lock, we cannot free zspage immediately
+ * so let's defer.
+ */
+ putback_zspage_deferred(pool, class, zspage);
+ zs_pool_dec_isolated(pool);
+ }
+ spin_unlock(&class->lock);
+}
+
+static const struct address_space_operations zsmalloc_aops = {
+ .isolate_page = zs_page_isolate,
+ .migratepage = zs_page_migrate,
+ .putback_page = zs_page_putback,
+};
+
+static int zs_register_migration(struct zs_pool *pool)
+{
+ pool->inode = alloc_anon_inode(zsmalloc_mnt->mnt_sb);
+ if (IS_ERR(pool->inode)) {
+ pool->inode = NULL;
+ return 1;
+ }
+
+ pool->inode->i_mapping->private_data = pool;
+ pool->inode->i_mapping->a_ops = &zsmalloc_aops;
+ return 0;
+}
+
+static bool pool_isolated_are_drained(struct zs_pool *pool)
+{
+ return atomic_long_read(&pool->isolated_pages) == 0;
+}
+
+/* Function for resolving migration */
+static void wait_for_isolated_drain(struct zs_pool *pool)
+{
+
+ /*
+ * We're in the process of destroying the pool, so there are no
+ * active allocations. zs_page_isolate() fails for completely free
+ * zspages, so we need only wait for the zs_pool's isolated
+ * count to hit zero.
+ */
+ wait_event(pool->migration_wait,
+ pool_isolated_are_drained(pool));
+}
+
+static void zs_unregister_migration(struct zs_pool *pool)
+{
+ pool->destroying = true;
+ /*
+ * We need a memory barrier here to ensure global visibility of
+ * pool->destroying. Thus pool->isolated pages will either be 0 in which
+ * case we don't care, or it will be > 0 and pool->destroying will
+ * ensure that we wake up once isolation hits 0.
+ */
+ smp_mb();
+ wait_for_isolated_drain(pool); /* This can block */
+ flush_work(&pool->free_work);
+ iput(pool->inode);
+}
+
+/*
+ * Caller should hold page_lock of all pages in the zspage
+ * In here, we cannot use zspage meta data.
+ */
+static void async_free_zspage(struct work_struct *work)
+{
+ int i;
+ struct size_class *class;
+ unsigned int class_idx;
+ enum fullness_group fullness;
+ struct zspage *zspage, *tmp;
+ LIST_HEAD(free_pages);
+ struct zs_pool *pool = container_of(work, struct zs_pool,
+ free_work);
+
+ for (i = 0; i < ZS_SIZE_CLASSES; i++) {
+ class = pool->size_class[i];
+ if (class->index != i)
+ continue;
+
+ spin_lock(&class->lock);
+ list_splice_init(&class->fullness_list[ZS_EMPTY], &free_pages);
+ spin_unlock(&class->lock);
+ }
+
+
+ list_for_each_entry_safe(zspage, tmp, &free_pages, list) {
+ list_del(&zspage->list);
+ lock_zspage(zspage);
+
+ get_zspage_mapping(zspage, &class_idx, &fullness);
+ VM_BUG_ON(fullness != ZS_EMPTY);
+ class = pool->size_class[class_idx];
+ spin_lock(&class->lock);
+ __free_zspage(pool, pool->size_class[class_idx], zspage);
+ spin_unlock(&class->lock);
+ }
+};
+
+static void kick_deferred_free(struct zs_pool *pool)
+{
+ schedule_work(&pool->free_work);
+}
+
+static void init_deferred_free(struct zs_pool *pool)
+{
+ INIT_WORK(&pool->free_work, async_free_zspage);
+}
+
+static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage)
+{
+ struct page *page = get_first_page(zspage);
+
+ do {
+ WARN_ON(!trylock_page(page));
+ __SetPageMovable(page, pool->inode->i_mapping);
+ unlock_page(page);
+ } while ((page = get_next_page(page)) != NULL);
+}
+#endif
+
+/*
+ *
+ * Based on the number of unused allocated objects calculate
+ * and return the number of pages that we can free.
+ */
+static unsigned long zs_can_compact(struct size_class *class)
+{
+ unsigned long obj_wasted;
+ unsigned long obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
+ unsigned long obj_used = zs_stat_get(class, OBJ_USED);
+
+ if (obj_allocated <= obj_used)
+ return 0;
+
+ obj_wasted = obj_allocated - obj_used;
+ obj_wasted /= class->objs_per_zspage;
+
+ return obj_wasted * class->pages_per_zspage;
+}
+
+static unsigned long __zs_compact(struct zs_pool *pool,
+ struct size_class *class)
+{
+ struct zs_compact_control cc;
+ struct zspage *src_zspage;
+ struct zspage *dst_zspage = NULL;
+ unsigned long pages_freed = 0;
+
+ spin_lock(&class->lock);
+ while ((src_zspage = isolate_zspage(class, true))) {
+
+ if (!zs_can_compact(class))
+ break;
+
+ cc.obj_idx = 0;
+ cc.s_page = get_first_page(src_zspage);
+
+ while ((dst_zspage = isolate_zspage(class, false))) {
+ cc.d_page = get_first_page(dst_zspage);
+ /*
+ * If there is no more space in dst_page, resched
+ * and see if anyone had allocated another zspage.
+ */
+ if (!migrate_zspage(pool, class, &cc))
+ break;
+
+ putback_zspage(class, dst_zspage);
+ }
+
+ /* Stop if we couldn't find slot */
+ if (dst_zspage == NULL)
+ break;
+
+ putback_zspage(class, dst_zspage);
+ if (putback_zspage(class, src_zspage) == ZS_EMPTY) {
+ free_zspage(pool, class, src_zspage);
+ pages_freed += class->pages_per_zspage;
+ }
+ spin_unlock(&class->lock);
+ cond_resched();
+ spin_lock(&class->lock);
+ }
+
+ if (src_zspage)
+ putback_zspage(class, src_zspage);
+
+ spin_unlock(&class->lock);
+
+ return pages_freed;
+}
+
+unsigned long zs_compact(struct zs_pool *pool)
+{
+ int i;
+ struct size_class *class;
+ unsigned long pages_freed = 0;
+
+ for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) {
+ class = pool->size_class[i];
+ if (!class)
+ continue;
+ if (class->index != i)
+ continue;
+ pages_freed += __zs_compact(pool, class);
+ }
+ atomic_long_add(pages_freed, &pool->stats.pages_compacted);
+
+ return pages_freed;
+}
+EXPORT_SYMBOL_GPL(zs_compact);
+
+void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats)
+{
+ memcpy(stats, &pool->stats, sizeof(struct zs_pool_stats));
+}
+EXPORT_SYMBOL_GPL(zs_pool_stats);
+
+static unsigned long zs_shrinker_scan(struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ unsigned long pages_freed;
+ struct zs_pool *pool = container_of(shrinker, struct zs_pool,
+ shrinker);
+
+ /*
+ * Compact classes and calculate compaction delta.
+ * Can run concurrently with a manually triggered
+ * (by user) compaction.
+ */
+ pages_freed = zs_compact(pool);
+
+ return pages_freed ? pages_freed : SHRINK_STOP;
+}
+
+static unsigned long zs_shrinker_count(struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ int i;
+ struct size_class *class;
+ unsigned long pages_to_free = 0;
+ struct zs_pool *pool = container_of(shrinker, struct zs_pool,
+ shrinker);
+
+ for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) {
+ class = pool->size_class[i];
+ if (!class)
+ continue;
+ if (class->index != i)
+ continue;
+
+ pages_to_free += zs_can_compact(class);
+ }
+
+ return pages_to_free;
+}
+
+static void zs_unregister_shrinker(struct zs_pool *pool)
+{
+ unregister_shrinker(&pool->shrinker);
+}
+
+static int zs_register_shrinker(struct zs_pool *pool)
+{
+ pool->shrinker.scan_objects = zs_shrinker_scan;
+ pool->shrinker.count_objects = zs_shrinker_count;
+ pool->shrinker.batch = 0;
+ pool->shrinker.seeks = DEFAULT_SEEKS;
+
+ return register_shrinker(&pool->shrinker);
+}
+
+/**
+ * zs_create_pool - Creates an allocation pool to work from.
+ * @name: pool name to be created
+ *
+ * This function must be called before anything when using
+ * the zsmalloc allocator.
+ *
+ * On success, a pointer to the newly created pool is returned,
+ * otherwise NULL.
+ */
+struct zs_pool *zs_create_pool(const char *name)
+{
+ int i;
+ struct zs_pool *pool;
+ struct size_class *prev_class = NULL;
+
+ pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+ if (!pool)
+ return NULL;
+
+ init_deferred_free(pool);
+
+ pool->name = kstrdup(name, GFP_KERNEL);
+ if (!pool->name)
+ goto err;
+
+#ifdef CONFIG_COMPACTION
+ init_waitqueue_head(&pool->migration_wait);
+#endif
+
+ if (create_cache(pool))
+ goto err;
+
+ /*
+ * Iterate reversely, because, size of size_class that we want to use
+ * for merging should be larger or equal to current size.
+ */
+ for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) {
+ int size;
+ int pages_per_zspage;
+ int objs_per_zspage;
+ struct size_class *class;
+ int fullness = 0;
+
+ size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
+ if (size > ZS_MAX_ALLOC_SIZE)
+ size = ZS_MAX_ALLOC_SIZE;
+ pages_per_zspage = get_pages_per_zspage(size);
+ objs_per_zspage = pages_per_zspage * PAGE_SIZE / size;
+
+ /*
+ * We iterate from biggest down to smallest classes,
+ * so huge_class_size holds the size of the first huge
+ * class. Any object bigger than or equal to that will
+ * endup in the huge class.
+ */
+ if (pages_per_zspage != 1 && objs_per_zspage != 1 &&
+ !huge_class_size) {
+ huge_class_size = size;
+ /*
+ * The object uses ZS_HANDLE_SIZE bytes to store the
+ * handle. We need to subtract it, because zs_malloc()
+ * unconditionally adds handle size before it performs
+ * size class search - so object may be smaller than
+ * huge class size, yet it still can end up in the huge
+ * class because it grows by ZS_HANDLE_SIZE extra bytes
+ * right before class lookup.
+ */
+ huge_class_size -= (ZS_HANDLE_SIZE - 1);
+ }
+
+ /*
+ * size_class is used for normal zsmalloc operation such
+ * as alloc/free for that size. Although it is natural that we
+ * have one size_class for each size, there is a chance that we
+ * can get more memory utilization if we use one size_class for
+ * many different sizes whose size_class have same
+ * characteristics. So, we makes size_class point to
+ * previous size_class if possible.
+ */
+ if (prev_class) {
+ if (can_merge(prev_class, pages_per_zspage, objs_per_zspage)) {
+ pool->size_class[i] = prev_class;
+ continue;
+ }
+ }
+
+ class = kzalloc(sizeof(struct size_class), GFP_KERNEL);
+ if (!class)
+ goto err;
+
+ class->size = size;
+ class->index = i;
+ class->pages_per_zspage = pages_per_zspage;
+ class->objs_per_zspage = objs_per_zspage;
+ spin_lock_init(&class->lock);
+ pool->size_class[i] = class;
+ for (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS;
+ fullness++)
+ INIT_LIST_HEAD(&class->fullness_list[fullness]);
+
+ prev_class = class;
+ }
+
+ /* debug only, don't abort if it fails */
+ zs_pool_stat_create(pool, name);
+
+ if (zs_register_migration(pool))
+ goto err;
+
+ /*
+ * Not critical since shrinker is only used to trigger internal
+ * defragmentation of the pool which is pretty optional thing. If
+ * registration fails we still can use the pool normally and user can
+ * trigger compaction manually. Thus, ignore return code.
+ */
+ zs_register_shrinker(pool);
+
+ return pool;
+
+err:
+ zs_destroy_pool(pool);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(zs_create_pool);
+
+void zs_destroy_pool(struct zs_pool *pool)
+{
+ int i;
+
+ zs_unregister_shrinker(pool);
+ zs_unregister_migration(pool);
+ zs_pool_stat_destroy(pool);
+
+ for (i = 0; i < ZS_SIZE_CLASSES; i++) {
+ int fg;
+ struct size_class *class = pool->size_class[i];
+
+ if (!class)
+ continue;
+
+ if (class->index != i)
+ continue;
+
+ for (fg = ZS_EMPTY; fg < NR_ZS_FULLNESS; fg++) {
+ if (!list_empty(&class->fullness_list[fg])) {
+ pr_info("Freeing non-empty class with size %db, fullness group %d\n",
+ class->size, fg);
+ }
+ }
+ kfree(class);
+ }
+
+ destroy_cache(pool);
+ kfree(pool->name);
+ kfree(pool);
+}
+EXPORT_SYMBOL_GPL(zs_destroy_pool);
+
+static int __init zs_init(void)
+{
+ int ret;
+
+ ret = zsmalloc_mount();
+ if (ret)
+ goto out;
+
+ ret = cpuhp_setup_state(CPUHP_MM_ZS_PREPARE, "mm/zsmalloc:prepare",
+ zs_cpu_prepare, zs_cpu_dead);
+ if (ret)
+ goto hp_setup_fail;
+
+#ifdef CONFIG_ZPOOL
+ zpool_register_driver(&zs_zpool_driver);
+#endif
+
+ zs_stat_init();
+
+ return 0;
+
+hp_setup_fail:
+ zsmalloc_unmount();
+out:
+ return ret;
+}
+
+static void __exit zs_exit(void)
+{
+#ifdef CONFIG_ZPOOL
+ zpool_unregister_driver(&zs_zpool_driver);
+#endif
+ zsmalloc_unmount();
+ cpuhp_remove_state(CPUHP_MM_ZS_PREPARE);
+
+ zs_stat_exit();
+}
+
+module_init(zs_init);
+module_exit(zs_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
diff --git a/mm/zswap.c b/mm/zswap.c
new file mode 100644
index 000000000..fbb782924
--- /dev/null
+++ b/mm/zswap.c
@@ -0,0 +1,1379 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * zswap.c - zswap driver file
+ *
+ * zswap is a backend for frontswap that takes pages that are in the process
+ * of being swapped out and attempts to compress and store them in a
+ * RAM-based memory pool. This can result in a significant I/O reduction on
+ * the swap device and, in the case where decompressing from RAM is faster
+ * than reading from the swap device, can also improve workload performance.
+ *
+ * Copyright (C) 2012 Seth Jennings <sjenning@linux.vnet.ibm.com>
+*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/cpu.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <linux/atomic.h>
+#include <linux/frontswap.h>
+#include <linux/rbtree.h>
+#include <linux/swap.h>
+#include <linux/crypto.h>
+#include <linux/mempool.h>
+#include <linux/zpool.h>
+
+#include <linux/mm_types.h>
+#include <linux/page-flags.h>
+#include <linux/swapops.h>
+#include <linux/writeback.h>
+#include <linux/pagemap.h>
+#include <linux/workqueue.h>
+
+/*********************************
+* statistics
+**********************************/
+/* Total bytes used by the compressed storage */
+static u64 zswap_pool_total_size;
+/* The number of compressed pages currently stored in zswap */
+static atomic_t zswap_stored_pages = ATOMIC_INIT(0);
+/* The number of same-value filled pages currently stored in zswap */
+static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0);
+
+/*
+ * The statistics below are not protected from concurrent access for
+ * performance reasons so they may not be a 100% accurate. However,
+ * they do provide useful information on roughly how many times a
+ * certain event is occurring.
+*/
+
+/* Pool limit was hit (see zswap_max_pool_percent) */
+static u64 zswap_pool_limit_hit;
+/* Pages written back when pool limit was reached */
+static u64 zswap_written_back_pages;
+/* Store failed due to a reclaim failure after pool limit was reached */
+static u64 zswap_reject_reclaim_fail;
+/* Compressed page was too big for the allocator to (optimally) store */
+static u64 zswap_reject_compress_poor;
+/* Store failed because underlying allocator could not get memory */
+static u64 zswap_reject_alloc_fail;
+/* Store failed because the entry metadata could not be allocated (rare) */
+static u64 zswap_reject_kmemcache_fail;
+/* Duplicate store was encountered (rare) */
+static u64 zswap_duplicate_entry;
+
+/* Shrinker work queue */
+static struct workqueue_struct *shrink_wq;
+/* Pool limit was hit, we need to calm down */
+static bool zswap_pool_reached_full;
+
+/*********************************
+* tunables
+**********************************/
+
+#define ZSWAP_PARAM_UNSET ""
+
+/* Enable/disable zswap */
+static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON);
+static int zswap_enabled_param_set(const char *,
+ const struct kernel_param *);
+static struct kernel_param_ops zswap_enabled_param_ops = {
+ .set = zswap_enabled_param_set,
+ .get = param_get_bool,
+};
+module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644);
+
+/* Crypto compressor to use */
+static char *zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
+static int zswap_compressor_param_set(const char *,
+ const struct kernel_param *);
+static struct kernel_param_ops zswap_compressor_param_ops = {
+ .set = zswap_compressor_param_set,
+ .get = param_get_charp,
+ .free = param_free_charp,
+};
+module_param_cb(compressor, &zswap_compressor_param_ops,
+ &zswap_compressor, 0644);
+
+/* Compressed storage zpool to use */
+static char *zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
+static int zswap_zpool_param_set(const char *, const struct kernel_param *);
+static struct kernel_param_ops zswap_zpool_param_ops = {
+ .set = zswap_zpool_param_set,
+ .get = param_get_charp,
+ .free = param_free_charp,
+};
+module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644);
+
+/* The maximum percentage of memory that the compressed pool can occupy */
+static unsigned int zswap_max_pool_percent = 20;
+module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);
+
+/* The threshold for accepting new pages after the max_pool_percent was hit */
+static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */
+module_param_named(accept_threshold_percent, zswap_accept_thr_percent,
+ uint, 0644);
+
+/* Enable/disable handling same-value filled pages (enabled by default) */
+static bool zswap_same_filled_pages_enabled = true;
+module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled,
+ bool, 0644);
+
+/*********************************
+* data structures
+**********************************/
+
+struct zswap_pool {
+ struct zpool *zpool;
+ struct crypto_comp * __percpu *tfm;
+ struct kref kref;
+ struct list_head list;
+ struct work_struct release_work;
+ struct work_struct shrink_work;
+ struct hlist_node node;
+ char tfm_name[CRYPTO_MAX_ALG_NAME];
+};
+
+/*
+ * struct zswap_entry
+ *
+ * This structure contains the metadata for tracking a single compressed
+ * page within zswap.
+ *
+ * rbnode - links the entry into red-black tree for the appropriate swap type
+ * offset - the swap offset for the entry. Index into the red-black tree.
+ * refcount - the number of outstanding reference to the entry. This is needed
+ * to protect against premature freeing of the entry by code
+ * concurrent calls to load, invalidate, and writeback. The lock
+ * for the zswap_tree structure that contains the entry must
+ * be held while changing the refcount. Since the lock must
+ * be held, there is no reason to also make refcount atomic.
+ * length - the length in bytes of the compressed page data. Needed during
+ * decompression. For a same value filled page length is 0.
+ * pool - the zswap_pool the entry's data is in
+ * handle - zpool allocation handle that stores the compressed page data
+ * value - value of the same-value filled pages which have same content
+ */
+struct zswap_entry {
+ struct rb_node rbnode;
+ pgoff_t offset;
+ int refcount;
+ unsigned int length;
+ struct zswap_pool *pool;
+ union {
+ unsigned long handle;
+ unsigned long value;
+ };
+};
+
+struct zswap_header {
+ swp_entry_t swpentry;
+};
+
+/*
+ * The tree lock in the zswap_tree struct protects a few things:
+ * - the rbtree
+ * - the refcount field of each entry in the tree
+ */
+struct zswap_tree {
+ struct rb_root rbroot;
+ spinlock_t lock;
+};
+
+static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
+
+/* RCU-protected iteration */
+static LIST_HEAD(zswap_pools);
+/* protects zswap_pools list modification */
+static DEFINE_SPINLOCK(zswap_pools_lock);
+/* pool counter to provide unique names to zpool */
+static atomic_t zswap_pools_count = ATOMIC_INIT(0);
+
+/* used by param callback function */
+static bool zswap_init_started;
+
+/* fatal error during init */
+static bool zswap_init_failed;
+
+/* init completed, but couldn't create the initial pool */
+static bool zswap_has_pool;
+
+/*********************************
+* helpers and fwd declarations
+**********************************/
+
+#define zswap_pool_debug(msg, p) \
+ pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \
+ zpool_get_type((p)->zpool))
+
+static int zswap_writeback_entry(struct zpool *pool, unsigned long handle);
+static int zswap_pool_get(struct zswap_pool *pool);
+static void zswap_pool_put(struct zswap_pool *pool);
+
+static const struct zpool_ops zswap_zpool_ops = {
+ .evict = zswap_writeback_entry
+};
+
+static bool zswap_is_full(void)
+{
+ return totalram_pages() * zswap_max_pool_percent / 100 <
+ DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
+}
+
+static bool zswap_can_accept(void)
+{
+ return totalram_pages() * zswap_accept_thr_percent / 100 *
+ zswap_max_pool_percent / 100 >
+ DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
+}
+
+static void zswap_update_total_size(void)
+{
+ struct zswap_pool *pool;
+ u64 total = 0;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(pool, &zswap_pools, list)
+ total += zpool_get_total_size(pool->zpool);
+
+ rcu_read_unlock();
+
+ zswap_pool_total_size = total;
+}
+
+/*********************************
+* zswap entry functions
+**********************************/
+static struct kmem_cache *zswap_entry_cache;
+
+static int __init zswap_entry_cache_create(void)
+{
+ zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
+ return zswap_entry_cache == NULL;
+}
+
+static void __init zswap_entry_cache_destroy(void)
+{
+ kmem_cache_destroy(zswap_entry_cache);
+}
+
+static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
+{
+ struct zswap_entry *entry;
+ entry = kmem_cache_alloc(zswap_entry_cache, gfp);
+ if (!entry)
+ return NULL;
+ entry->refcount = 1;
+ RB_CLEAR_NODE(&entry->rbnode);
+ return entry;
+}
+
+static void zswap_entry_cache_free(struct zswap_entry *entry)
+{
+ kmem_cache_free(zswap_entry_cache, entry);
+}
+
+/*********************************
+* rbtree functions
+**********************************/
+static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
+{
+ struct rb_node *node = root->rb_node;
+ struct zswap_entry *entry;
+
+ while (node) {
+ entry = rb_entry(node, struct zswap_entry, rbnode);
+ if (entry->offset > offset)
+ node = node->rb_left;
+ else if (entry->offset < offset)
+ node = node->rb_right;
+ else
+ return entry;
+ }
+ return NULL;
+}
+
+/*
+ * In the case that a entry with the same offset is found, a pointer to
+ * the existing entry is stored in dupentry and the function returns -EEXIST
+ */
+static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
+ struct zswap_entry **dupentry)
+{
+ struct rb_node **link = &root->rb_node, *parent = NULL;
+ struct zswap_entry *myentry;
+
+ while (*link) {
+ parent = *link;
+ myentry = rb_entry(parent, struct zswap_entry, rbnode);
+ if (myentry->offset > entry->offset)
+ link = &(*link)->rb_left;
+ else if (myentry->offset < entry->offset)
+ link = &(*link)->rb_right;
+ else {
+ *dupentry = myentry;
+ return -EEXIST;
+ }
+ }
+ rb_link_node(&entry->rbnode, parent, link);
+ rb_insert_color(&entry->rbnode, root);
+ return 0;
+}
+
+static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
+{
+ if (!RB_EMPTY_NODE(&entry->rbnode)) {
+ rb_erase(&entry->rbnode, root);
+ RB_CLEAR_NODE(&entry->rbnode);
+ }
+}
+
+/*
+ * Carries out the common pattern of freeing and entry's zpool allocation,
+ * freeing the entry itself, and decrementing the number of stored pages.
+ */
+static void zswap_free_entry(struct zswap_entry *entry)
+{
+ if (!entry->length)
+ atomic_dec(&zswap_same_filled_pages);
+ else {
+ zpool_free(entry->pool->zpool, entry->handle);
+ zswap_pool_put(entry->pool);
+ }
+ zswap_entry_cache_free(entry);
+ atomic_dec(&zswap_stored_pages);
+ zswap_update_total_size();
+}
+
+/* caller must hold the tree lock */
+static void zswap_entry_get(struct zswap_entry *entry)
+{
+ entry->refcount++;
+}
+
+/* caller must hold the tree lock
+* remove from the tree and free it, if nobody reference the entry
+*/
+static void zswap_entry_put(struct zswap_tree *tree,
+ struct zswap_entry *entry)
+{
+ int refcount = --entry->refcount;
+
+ BUG_ON(refcount < 0);
+ if (refcount == 0) {
+ zswap_rb_erase(&tree->rbroot, entry);
+ zswap_free_entry(entry);
+ }
+}
+
+/* caller must hold the tree lock */
+static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
+ pgoff_t offset)
+{
+ struct zswap_entry *entry;
+
+ entry = zswap_rb_search(root, offset);
+ if (entry)
+ zswap_entry_get(entry);
+
+ return entry;
+}
+
+/*********************************
+* per-cpu code
+**********************************/
+static DEFINE_PER_CPU(u8 *, zswap_dstmem);
+
+static int zswap_dstmem_prepare(unsigned int cpu)
+{
+ u8 *dst;
+
+ dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
+ if (!dst)
+ return -ENOMEM;
+
+ per_cpu(zswap_dstmem, cpu) = dst;
+ return 0;
+}
+
+static int zswap_dstmem_dead(unsigned int cpu)
+{
+ u8 *dst;
+
+ dst = per_cpu(zswap_dstmem, cpu);
+ kfree(dst);
+ per_cpu(zswap_dstmem, cpu) = NULL;
+
+ return 0;
+}
+
+static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
+{
+ struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
+ struct crypto_comp *tfm;
+
+ if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu)))
+ return 0;
+
+ tfm = crypto_alloc_comp(pool->tfm_name, 0, 0);
+ if (IS_ERR_OR_NULL(tfm)) {
+ pr_err("could not alloc crypto comp %s : %ld\n",
+ pool->tfm_name, PTR_ERR(tfm));
+ return -ENOMEM;
+ }
+ *per_cpu_ptr(pool->tfm, cpu) = tfm;
+ return 0;
+}
+
+static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
+{
+ struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
+ struct crypto_comp *tfm;
+
+ tfm = *per_cpu_ptr(pool->tfm, cpu);
+ if (!IS_ERR_OR_NULL(tfm))
+ crypto_free_comp(tfm);
+ *per_cpu_ptr(pool->tfm, cpu) = NULL;
+ return 0;
+}
+
+/*********************************
+* pool functions
+**********************************/
+
+static struct zswap_pool *__zswap_pool_current(void)
+{
+ struct zswap_pool *pool;
+
+ pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
+ WARN_ONCE(!pool && zswap_has_pool,
+ "%s: no page storage pool!\n", __func__);
+
+ return pool;
+}
+
+static struct zswap_pool *zswap_pool_current(void)
+{
+ assert_spin_locked(&zswap_pools_lock);
+
+ return __zswap_pool_current();
+}
+
+static struct zswap_pool *zswap_pool_current_get(void)
+{
+ struct zswap_pool *pool;
+
+ rcu_read_lock();
+
+ pool = __zswap_pool_current();
+ if (!zswap_pool_get(pool))
+ pool = NULL;
+
+ rcu_read_unlock();
+
+ return pool;
+}
+
+static struct zswap_pool *zswap_pool_last_get(void)
+{
+ struct zswap_pool *pool, *last = NULL;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(pool, &zswap_pools, list)
+ last = pool;
+ WARN_ONCE(!last && zswap_has_pool,
+ "%s: no page storage pool!\n", __func__);
+ if (!zswap_pool_get(last))
+ last = NULL;
+
+ rcu_read_unlock();
+
+ return last;
+}
+
+/* type and compressor must be null-terminated */
+static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
+{
+ struct zswap_pool *pool;
+
+ assert_spin_locked(&zswap_pools_lock);
+
+ list_for_each_entry_rcu(pool, &zswap_pools, list) {
+ if (strcmp(pool->tfm_name, compressor))
+ continue;
+ if (strcmp(zpool_get_type(pool->zpool), type))
+ continue;
+ /* if we can't get it, it's about to be destroyed */
+ if (!zswap_pool_get(pool))
+ continue;
+ return pool;
+ }
+
+ return NULL;
+}
+
+static void shrink_worker(struct work_struct *w)
+{
+ struct zswap_pool *pool = container_of(w, typeof(*pool),
+ shrink_work);
+
+ if (zpool_shrink(pool->zpool, 1, NULL))
+ zswap_reject_reclaim_fail++;
+ zswap_pool_put(pool);
+}
+
+static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
+{
+ struct zswap_pool *pool;
+ char name[38]; /* 'zswap' + 32 char (max) num + \0 */
+ gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
+ int ret;
+
+ if (!zswap_has_pool) {
+ /* if either are unset, pool initialization failed, and we
+ * need both params to be set correctly before trying to
+ * create a pool.
+ */
+ if (!strcmp(type, ZSWAP_PARAM_UNSET))
+ return NULL;
+ if (!strcmp(compressor, ZSWAP_PARAM_UNSET))
+ return NULL;
+ }
+
+ pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+ if (!pool)
+ return NULL;
+
+ /* unique name for each pool specifically required by zsmalloc */
+ snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count));
+
+ pool->zpool = zpool_create_pool(type, name, gfp, &zswap_zpool_ops);
+ if (!pool->zpool) {
+ pr_err("%s zpool not available\n", type);
+ goto error;
+ }
+ pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
+
+ strlcpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
+ pool->tfm = alloc_percpu(struct crypto_comp *);
+ if (!pool->tfm) {
+ pr_err("percpu alloc failed\n");
+ goto error;
+ }
+
+ ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
+ &pool->node);
+ if (ret)
+ goto error;
+ pr_debug("using %s compressor\n", pool->tfm_name);
+
+ /* being the current pool takes 1 ref; this func expects the
+ * caller to always add the new pool as the current pool
+ */
+ kref_init(&pool->kref);
+ INIT_LIST_HEAD(&pool->list);
+ INIT_WORK(&pool->shrink_work, shrink_worker);
+
+ zswap_pool_debug("created", pool);
+
+ return pool;
+
+error:
+ free_percpu(pool->tfm);
+ if (pool->zpool)
+ zpool_destroy_pool(pool->zpool);
+ kfree(pool);
+ return NULL;
+}
+
+static __init struct zswap_pool *__zswap_pool_create_fallback(void)
+{
+ bool has_comp, has_zpool;
+
+ has_comp = crypto_has_comp(zswap_compressor, 0, 0);
+ if (!has_comp && strcmp(zswap_compressor,
+ CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) {
+ pr_err("compressor %s not available, using default %s\n",
+ zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT);
+ param_free_charp(&zswap_compressor);
+ zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
+ has_comp = crypto_has_comp(zswap_compressor, 0, 0);
+ }
+ if (!has_comp) {
+ pr_err("default compressor %s not available\n",
+ zswap_compressor);
+ param_free_charp(&zswap_compressor);
+ zswap_compressor = ZSWAP_PARAM_UNSET;
+ }
+
+ has_zpool = zpool_has_pool(zswap_zpool_type);
+ if (!has_zpool && strcmp(zswap_zpool_type,
+ CONFIG_ZSWAP_ZPOOL_DEFAULT)) {
+ pr_err("zpool %s not available, using default %s\n",
+ zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT);
+ param_free_charp(&zswap_zpool_type);
+ zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
+ has_zpool = zpool_has_pool(zswap_zpool_type);
+ }
+ if (!has_zpool) {
+ pr_err("default zpool %s not available\n",
+ zswap_zpool_type);
+ param_free_charp(&zswap_zpool_type);
+ zswap_zpool_type = ZSWAP_PARAM_UNSET;
+ }
+
+ if (!has_comp || !has_zpool)
+ return NULL;
+
+ return zswap_pool_create(zswap_zpool_type, zswap_compressor);
+}
+
+static void zswap_pool_destroy(struct zswap_pool *pool)
+{
+ zswap_pool_debug("destroying", pool);
+
+ cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
+ free_percpu(pool->tfm);
+ zpool_destroy_pool(pool->zpool);
+ kfree(pool);
+}
+
+static int __must_check zswap_pool_get(struct zswap_pool *pool)
+{
+ if (!pool)
+ return 0;
+
+ return kref_get_unless_zero(&pool->kref);
+}
+
+static void __zswap_pool_release(struct work_struct *work)
+{
+ struct zswap_pool *pool = container_of(work, typeof(*pool),
+ release_work);
+
+ synchronize_rcu();
+
+ /* nobody should have been able to get a kref... */
+ WARN_ON(kref_get_unless_zero(&pool->kref));
+
+ /* pool is now off zswap_pools list and has no references. */
+ zswap_pool_destroy(pool);
+}
+
+static void __zswap_pool_empty(struct kref *kref)
+{
+ struct zswap_pool *pool;
+
+ pool = container_of(kref, typeof(*pool), kref);
+
+ spin_lock(&zswap_pools_lock);
+
+ WARN_ON(pool == zswap_pool_current());
+
+ list_del_rcu(&pool->list);
+
+ INIT_WORK(&pool->release_work, __zswap_pool_release);
+ schedule_work(&pool->release_work);
+
+ spin_unlock(&zswap_pools_lock);
+}
+
+static void zswap_pool_put(struct zswap_pool *pool)
+{
+ kref_put(&pool->kref, __zswap_pool_empty);
+}
+
+/*********************************
+* param callbacks
+**********************************/
+
+/* val must be a null-terminated string */
+static int __zswap_param_set(const char *val, const struct kernel_param *kp,
+ char *type, char *compressor)
+{
+ struct zswap_pool *pool, *put_pool = NULL;
+ char *s = strstrip((char *)val);
+ int ret;
+
+ if (zswap_init_failed) {
+ pr_err("can't set param, initialization failed\n");
+ return -ENODEV;
+ }
+
+ /* no change required */
+ if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool)
+ return 0;
+
+ /* if this is load-time (pre-init) param setting,
+ * don't create a pool; that's done during init.
+ */
+ if (!zswap_init_started)
+ return param_set_charp(s, kp);
+
+ if (!type) {
+ if (!zpool_has_pool(s)) {
+ pr_err("zpool %s not available\n", s);
+ return -ENOENT;
+ }
+ type = s;
+ } else if (!compressor) {
+ if (!crypto_has_comp(s, 0, 0)) {
+ pr_err("compressor %s not available\n", s);
+ return -ENOENT;
+ }
+ compressor = s;
+ } else {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ spin_lock(&zswap_pools_lock);
+
+ pool = zswap_pool_find_get(type, compressor);
+ if (pool) {
+ zswap_pool_debug("using existing", pool);
+ WARN_ON(pool == zswap_pool_current());
+ list_del_rcu(&pool->list);
+ }
+
+ spin_unlock(&zswap_pools_lock);
+
+ if (!pool)
+ pool = zswap_pool_create(type, compressor);
+
+ if (pool)
+ ret = param_set_charp(s, kp);
+ else
+ ret = -EINVAL;
+
+ spin_lock(&zswap_pools_lock);
+
+ if (!ret) {
+ put_pool = zswap_pool_current();
+ list_add_rcu(&pool->list, &zswap_pools);
+ zswap_has_pool = true;
+ } else if (pool) {
+ /* add the possibly pre-existing pool to the end of the pools
+ * list; if it's new (and empty) then it'll be removed and
+ * destroyed by the put after we drop the lock
+ */
+ list_add_tail_rcu(&pool->list, &zswap_pools);
+ put_pool = pool;
+ }
+
+ spin_unlock(&zswap_pools_lock);
+
+ if (!zswap_has_pool && !pool) {
+ /* if initial pool creation failed, and this pool creation also
+ * failed, maybe both compressor and zpool params were bad.
+ * Allow changing this param, so pool creation will succeed
+ * when the other param is changed. We already verified this
+ * param is ok in the zpool_has_pool() or crypto_has_comp()
+ * checks above.
+ */
+ ret = param_set_charp(s, kp);
+ }
+
+ /* drop the ref from either the old current pool,
+ * or the new pool we failed to add
+ */
+ if (put_pool)
+ zswap_pool_put(put_pool);
+
+ return ret;
+}
+
+static int zswap_compressor_param_set(const char *val,
+ const struct kernel_param *kp)
+{
+ return __zswap_param_set(val, kp, zswap_zpool_type, NULL);
+}
+
+static int zswap_zpool_param_set(const char *val,
+ const struct kernel_param *kp)
+{
+ return __zswap_param_set(val, kp, NULL, zswap_compressor);
+}
+
+static int zswap_enabled_param_set(const char *val,
+ const struct kernel_param *kp)
+{
+ if (zswap_init_failed) {
+ pr_err("can't enable, initialization failed\n");
+ return -ENODEV;
+ }
+ if (!zswap_has_pool && zswap_init_started) {
+ pr_err("can't enable, no pool configured\n");
+ return -ENODEV;
+ }
+
+ return param_set_bool(val, kp);
+}
+
+/*********************************
+* writeback code
+**********************************/
+/* return enum for zswap_get_swap_cache_page */
+enum zswap_get_swap_ret {
+ ZSWAP_SWAPCACHE_NEW,
+ ZSWAP_SWAPCACHE_EXIST,
+ ZSWAP_SWAPCACHE_FAIL,
+};
+
+/*
+ * zswap_get_swap_cache_page
+ *
+ * This is an adaption of read_swap_cache_async()
+ *
+ * This function tries to find a page with the given swap entry
+ * in the swapper_space address space (the swap cache). If the page
+ * is found, it is returned in retpage. Otherwise, a page is allocated,
+ * added to the swap cache, and returned in retpage.
+ *
+ * If success, the swap cache page is returned in retpage
+ * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache
+ * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated,
+ * the new page is added to swapcache and locked
+ * Returns ZSWAP_SWAPCACHE_FAIL on error
+ */
+static int zswap_get_swap_cache_page(swp_entry_t entry,
+ struct page **retpage)
+{
+ bool page_was_allocated;
+
+ *retpage = __read_swap_cache_async(entry, GFP_KERNEL,
+ NULL, 0, &page_was_allocated);
+ if (page_was_allocated)
+ return ZSWAP_SWAPCACHE_NEW;
+ if (!*retpage)
+ return ZSWAP_SWAPCACHE_FAIL;
+ return ZSWAP_SWAPCACHE_EXIST;
+}
+
+/*
+ * Attempts to free an entry by adding a page to the swap cache,
+ * decompressing the entry data into the page, and issuing a
+ * bio write to write the page back to the swap device.
+ *
+ * This can be thought of as a "resumed writeback" of the page
+ * to the swap device. We are basically resuming the same swap
+ * writeback path that was intercepted with the frontswap_store()
+ * in the first place. After the page has been decompressed into
+ * the swap cache, the compressed version stored by zswap can be
+ * freed.
+ */
+static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
+{
+ struct zswap_header *zhdr;
+ swp_entry_t swpentry;
+ struct zswap_tree *tree;
+ pgoff_t offset;
+ struct zswap_entry *entry;
+ struct page *page;
+ struct crypto_comp *tfm;
+ u8 *src, *dst;
+ unsigned int dlen;
+ int ret;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_NONE,
+ };
+
+ /* extract swpentry from data */
+ zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
+ swpentry = zhdr->swpentry; /* here */
+ tree = zswap_trees[swp_type(swpentry)];
+ offset = swp_offset(swpentry);
+
+ /* find and ref zswap entry */
+ spin_lock(&tree->lock);
+ entry = zswap_entry_find_get(&tree->rbroot, offset);
+ if (!entry) {
+ /* entry was invalidated */
+ spin_unlock(&tree->lock);
+ zpool_unmap_handle(pool, handle);
+ return 0;
+ }
+ spin_unlock(&tree->lock);
+ BUG_ON(offset != entry->offset);
+
+ /* try to allocate swap cache page */
+ switch (zswap_get_swap_cache_page(swpentry, &page)) {
+ case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
+ ret = -ENOMEM;
+ goto fail;
+
+ case ZSWAP_SWAPCACHE_EXIST:
+ /* page is already in the swap cache, ignore for now */
+ put_page(page);
+ ret = -EEXIST;
+ goto fail;
+
+ case ZSWAP_SWAPCACHE_NEW: /* page is locked */
+ /* decompress */
+ dlen = PAGE_SIZE;
+ src = (u8 *)zhdr + sizeof(struct zswap_header);
+ dst = kmap_atomic(page);
+ tfm = *get_cpu_ptr(entry->pool->tfm);
+ ret = crypto_comp_decompress(tfm, src, entry->length,
+ dst, &dlen);
+ put_cpu_ptr(entry->pool->tfm);
+ kunmap_atomic(dst);
+ BUG_ON(ret);
+ BUG_ON(dlen != PAGE_SIZE);
+
+ /* page is up to date */
+ SetPageUptodate(page);
+ }
+
+ /* move it to the tail of the inactive list after end_writeback */
+ SetPageReclaim(page);
+
+ /* start writeback */
+ __swap_writepage(page, &wbc, end_swap_bio_write);
+ put_page(page);
+ zswap_written_back_pages++;
+
+ spin_lock(&tree->lock);
+ /* drop local reference */
+ zswap_entry_put(tree, entry);
+
+ /*
+ * There are two possible situations for entry here:
+ * (1) refcount is 1(normal case), entry is valid and on the tree
+ * (2) refcount is 0, entry is freed and not on the tree
+ * because invalidate happened during writeback
+ * search the tree and free the entry if find entry
+ */
+ if (entry == zswap_rb_search(&tree->rbroot, offset))
+ zswap_entry_put(tree, entry);
+ spin_unlock(&tree->lock);
+
+ goto end;
+
+ /*
+ * if we get here due to ZSWAP_SWAPCACHE_EXIST
+ * a load may happening concurrently
+ * it is safe and okay to not free the entry
+ * if we free the entry in the following put
+ * it it either okay to return !0
+ */
+fail:
+ spin_lock(&tree->lock);
+ zswap_entry_put(tree, entry);
+ spin_unlock(&tree->lock);
+
+end:
+ zpool_unmap_handle(pool, handle);
+ return ret;
+}
+
+static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
+{
+ unsigned int pos;
+ unsigned long *page;
+
+ page = (unsigned long *)ptr;
+ for (pos = 1; pos < PAGE_SIZE / sizeof(*page); pos++) {
+ if (page[pos] != page[0])
+ return 0;
+ }
+ *value = page[0];
+ return 1;
+}
+
+static void zswap_fill_page(void *ptr, unsigned long value)
+{
+ unsigned long *page;
+
+ page = (unsigned long *)ptr;
+ memset_l(page, value, PAGE_SIZE / sizeof(unsigned long));
+}
+
+/*********************************
+* frontswap hooks
+**********************************/
+/* attempts to compress and store an single page */
+static int zswap_frontswap_store(unsigned type, pgoff_t offset,
+ struct page *page)
+{
+ struct zswap_tree *tree = zswap_trees[type];
+ struct zswap_entry *entry, *dupentry;
+ struct crypto_comp *tfm;
+ int ret;
+ unsigned int hlen, dlen = PAGE_SIZE;
+ unsigned long handle, value;
+ char *buf;
+ u8 *src, *dst;
+ struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) };
+ gfp_t gfp;
+
+ /* THP isn't supported */
+ if (PageTransHuge(page)) {
+ ret = -EINVAL;
+ goto reject;
+ }
+
+ if (!zswap_enabled || !tree) {
+ ret = -ENODEV;
+ goto reject;
+ }
+
+ /* reclaim space if needed */
+ if (zswap_is_full()) {
+ struct zswap_pool *pool;
+
+ zswap_pool_limit_hit++;
+ zswap_pool_reached_full = true;
+ pool = zswap_pool_last_get();
+ if (pool)
+ queue_work(shrink_wq, &pool->shrink_work);
+ ret = -ENOMEM;
+ goto reject;
+ }
+
+ if (zswap_pool_reached_full) {
+ if (!zswap_can_accept()) {
+ ret = -ENOMEM;
+ goto reject;
+ } else
+ zswap_pool_reached_full = false;
+ }
+
+ /* allocate entry */
+ entry = zswap_entry_cache_alloc(GFP_KERNEL);
+ if (!entry) {
+ zswap_reject_kmemcache_fail++;
+ ret = -ENOMEM;
+ goto reject;
+ }
+
+ if (zswap_same_filled_pages_enabled) {
+ src = kmap_atomic(page);
+ if (zswap_is_page_same_filled(src, &value)) {
+ kunmap_atomic(src);
+ entry->offset = offset;
+ entry->length = 0;
+ entry->value = value;
+ atomic_inc(&zswap_same_filled_pages);
+ goto insert_entry;
+ }
+ kunmap_atomic(src);
+ }
+
+ /* if entry is successfully added, it keeps the reference */
+ entry->pool = zswap_pool_current_get();
+ if (!entry->pool) {
+ ret = -EINVAL;
+ goto freepage;
+ }
+
+ /* compress */
+ dst = get_cpu_var(zswap_dstmem);
+ tfm = *get_cpu_ptr(entry->pool->tfm);
+ src = kmap_atomic(page);
+ ret = crypto_comp_compress(tfm, src, PAGE_SIZE, dst, &dlen);
+ kunmap_atomic(src);
+ put_cpu_ptr(entry->pool->tfm);
+ if (ret) {
+ ret = -EINVAL;
+ goto put_dstmem;
+ }
+
+ /* store */
+ hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0;
+ gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
+ if (zpool_malloc_support_movable(entry->pool->zpool))
+ gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
+ ret = zpool_malloc(entry->pool->zpool, hlen + dlen, gfp, &handle);
+ if (ret == -ENOSPC) {
+ zswap_reject_compress_poor++;
+ goto put_dstmem;
+ }
+ if (ret) {
+ zswap_reject_alloc_fail++;
+ goto put_dstmem;
+ }
+ buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW);
+ memcpy(buf, &zhdr, hlen);
+ memcpy(buf + hlen, dst, dlen);
+ zpool_unmap_handle(entry->pool->zpool, handle);
+ put_cpu_var(zswap_dstmem);
+
+ /* populate entry */
+ entry->offset = offset;
+ entry->handle = handle;
+ entry->length = dlen;
+
+insert_entry:
+ /* map */
+ spin_lock(&tree->lock);
+ do {
+ ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry);
+ if (ret == -EEXIST) {
+ zswap_duplicate_entry++;
+ /* remove from rbtree */
+ zswap_rb_erase(&tree->rbroot, dupentry);
+ zswap_entry_put(tree, dupentry);
+ }
+ } while (ret == -EEXIST);
+ spin_unlock(&tree->lock);
+
+ /* update stats */
+ atomic_inc(&zswap_stored_pages);
+ zswap_update_total_size();
+
+ return 0;
+
+put_dstmem:
+ put_cpu_var(zswap_dstmem);
+ zswap_pool_put(entry->pool);
+freepage:
+ zswap_entry_cache_free(entry);
+reject:
+ return ret;
+}
+
+/*
+ * returns 0 if the page was successfully decompressed
+ * return -1 on entry not found or error
+*/
+static int zswap_frontswap_load(unsigned type, pgoff_t offset,
+ struct page *page)
+{
+ struct zswap_tree *tree = zswap_trees[type];
+ struct zswap_entry *entry;
+ struct crypto_comp *tfm;
+ u8 *src, *dst;
+ unsigned int dlen;
+ int ret;
+
+ /* find */
+ spin_lock(&tree->lock);
+ entry = zswap_entry_find_get(&tree->rbroot, offset);
+ if (!entry) {
+ /* entry was written back */
+ spin_unlock(&tree->lock);
+ return -1;
+ }
+ spin_unlock(&tree->lock);
+
+ if (!entry->length) {
+ dst = kmap_atomic(page);
+ zswap_fill_page(dst, entry->value);
+ kunmap_atomic(dst);
+ goto freeentry;
+ }
+
+ /* decompress */
+ dlen = PAGE_SIZE;
+ src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
+ if (zpool_evictable(entry->pool->zpool))
+ src += sizeof(struct zswap_header);
+ dst = kmap_atomic(page);
+ tfm = *get_cpu_ptr(entry->pool->tfm);
+ ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen);
+ put_cpu_ptr(entry->pool->tfm);
+ kunmap_atomic(dst);
+ zpool_unmap_handle(entry->pool->zpool, entry->handle);
+ BUG_ON(ret);
+
+freeentry:
+ spin_lock(&tree->lock);
+ zswap_entry_put(tree, entry);
+ spin_unlock(&tree->lock);
+
+ return 0;
+}
+
+/* frees an entry in zswap */
+static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
+{
+ struct zswap_tree *tree = zswap_trees[type];
+ struct zswap_entry *entry;
+
+ /* find */
+ spin_lock(&tree->lock);
+ entry = zswap_rb_search(&tree->rbroot, offset);
+ if (!entry) {
+ /* entry was written back */
+ spin_unlock(&tree->lock);
+ return;
+ }
+
+ /* remove from rbtree */
+ zswap_rb_erase(&tree->rbroot, entry);
+
+ /* drop the initial reference from entry creation */
+ zswap_entry_put(tree, entry);
+
+ spin_unlock(&tree->lock);
+}
+
+/* frees all zswap entries for the given swap type */
+static void zswap_frontswap_invalidate_area(unsigned type)
+{
+ struct zswap_tree *tree = zswap_trees[type];
+ struct zswap_entry *entry, *n;
+
+ if (!tree)
+ return;
+
+ /* walk the tree and free everything */
+ spin_lock(&tree->lock);
+ rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
+ zswap_free_entry(entry);
+ tree->rbroot = RB_ROOT;
+ spin_unlock(&tree->lock);
+ kfree(tree);
+ zswap_trees[type] = NULL;
+}
+
+static void zswap_frontswap_init(unsigned type)
+{
+ struct zswap_tree *tree;
+
+ tree = kzalloc(sizeof(*tree), GFP_KERNEL);
+ if (!tree) {
+ pr_err("alloc failed, zswap disabled for swap type %d\n", type);
+ return;
+ }
+
+ tree->rbroot = RB_ROOT;
+ spin_lock_init(&tree->lock);
+ zswap_trees[type] = tree;
+}
+
+static struct frontswap_ops zswap_frontswap_ops = {
+ .store = zswap_frontswap_store,
+ .load = zswap_frontswap_load,
+ .invalidate_page = zswap_frontswap_invalidate_page,
+ .invalidate_area = zswap_frontswap_invalidate_area,
+ .init = zswap_frontswap_init
+};
+
+/*********************************
+* debugfs functions
+**********************************/
+#ifdef CONFIG_DEBUG_FS
+#include <linux/debugfs.h>
+
+static struct dentry *zswap_debugfs_root;
+
+static int __init zswap_debugfs_init(void)
+{
+ if (!debugfs_initialized())
+ return -ENODEV;
+
+ zswap_debugfs_root = debugfs_create_dir("zswap", NULL);
+
+ debugfs_create_u64("pool_limit_hit", 0444,
+ zswap_debugfs_root, &zswap_pool_limit_hit);
+ debugfs_create_u64("reject_reclaim_fail", 0444,
+ zswap_debugfs_root, &zswap_reject_reclaim_fail);
+ debugfs_create_u64("reject_alloc_fail", 0444,
+ zswap_debugfs_root, &zswap_reject_alloc_fail);
+ debugfs_create_u64("reject_kmemcache_fail", 0444,
+ zswap_debugfs_root, &zswap_reject_kmemcache_fail);
+ debugfs_create_u64("reject_compress_poor", 0444,
+ zswap_debugfs_root, &zswap_reject_compress_poor);
+ debugfs_create_u64("written_back_pages", 0444,
+ zswap_debugfs_root, &zswap_written_back_pages);
+ debugfs_create_u64("duplicate_entry", 0444,
+ zswap_debugfs_root, &zswap_duplicate_entry);
+ debugfs_create_u64("pool_total_size", 0444,
+ zswap_debugfs_root, &zswap_pool_total_size);
+ debugfs_create_atomic_t("stored_pages", 0444,
+ zswap_debugfs_root, &zswap_stored_pages);
+ debugfs_create_atomic_t("same_filled_pages", 0444,
+ zswap_debugfs_root, &zswap_same_filled_pages);
+
+ return 0;
+}
+
+static void __exit zswap_debugfs_exit(void)
+{
+ debugfs_remove_recursive(zswap_debugfs_root);
+}
+#else
+static int __init zswap_debugfs_init(void)
+{
+ return 0;
+}
+
+static void __exit zswap_debugfs_exit(void) { }
+#endif
+
+/*********************************
+* module init and exit
+**********************************/
+static int __init init_zswap(void)
+{
+ struct zswap_pool *pool;
+ int ret;
+
+ zswap_init_started = true;
+
+ if (zswap_entry_cache_create()) {
+ pr_err("entry cache creation failed\n");
+ goto cache_fail;
+ }
+
+ ret = cpuhp_setup_state(CPUHP_MM_ZSWP_MEM_PREPARE, "mm/zswap:prepare",
+ zswap_dstmem_prepare, zswap_dstmem_dead);
+ if (ret) {
+ pr_err("dstmem alloc failed\n");
+ goto dstmem_fail;
+ }
+
+ ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE,
+ "mm/zswap_pool:prepare",
+ zswap_cpu_comp_prepare,
+ zswap_cpu_comp_dead);
+ if (ret)
+ goto hp_fail;
+
+ pool = __zswap_pool_create_fallback();
+ if (pool) {
+ pr_info("loaded using pool %s/%s\n", pool->tfm_name,
+ zpool_get_type(pool->zpool));
+ list_add(&pool->list, &zswap_pools);
+ zswap_has_pool = true;
+ } else {
+ pr_err("pool creation failed\n");
+ zswap_enabled = false;
+ }
+
+ shrink_wq = create_workqueue("zswap-shrink");
+ if (!shrink_wq)
+ goto fallback_fail;
+
+ frontswap_register_ops(&zswap_frontswap_ops);
+ if (zswap_debugfs_init())
+ pr_warn("debugfs initialization failed\n");
+ return 0;
+
+fallback_fail:
+ if (pool)
+ zswap_pool_destroy(pool);
+hp_fail:
+ cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE);
+dstmem_fail:
+ zswap_entry_cache_destroy();
+cache_fail:
+ /* if built-in, we aren't unloaded on failure; don't allow use */
+ zswap_init_failed = true;
+ zswap_enabled = false;
+ return -ENOMEM;
+}
+/* must be late so crypto has time to come up */
+late_initcall(init_zswap);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>");
+MODULE_DESCRIPTION("Compressed cache for swap pages");