Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 44
-rw-r--r--  mm/Kconfig.debug | 6
-rw-r--r--  mm/backing-dev.c | 25
-rw-r--r--  mm/cma.c | 32
-rw-r--r--  mm/cma.h | 5
-rw-r--r--  mm/cma_sysfs.c | 15
-rw-r--r--  mm/compaction.c | 355
-rw-r--r--  mm/damon/Kconfig | 7
-rw-r--r--  mm/damon/core.c | 120
-rw-r--r--  mm/damon/dbgfs.c | 26
-rw-r--r--  mm/damon/paddr.c | 2
-rw-r--r--  mm/damon/reclaim.c | 53
-rw-r--r--  mm/damon/sysfs-common.h | 8
-rw-r--r--  mm/damon/sysfs-schemes.c | 146
-rw-r--r--  mm/damon/sysfs.c | 54
-rw-r--r--  mm/debug.c | 130
-rw-r--r--  mm/filemap.c | 70
-rw-r--r--  mm/gup.c | 14
-rw-r--r--  mm/huge_memory.c | 442
-rw-r--r--  mm/hugetlb.c | 433
-rw-r--r--  mm/internal.h | 123
-rw-r--r--  mm/kasan/common.c | 2
-rw-r--r--  mm/kasan/generic.c | 22
-rw-r--r--  mm/kasan/kasan_test.c | 79
-rw-r--r--  mm/kasan/kasan_test_module.c | 4
-rw-r--r--  mm/kasan/report.c | 2
-rw-r--r--  mm/kasan/shadow.c | 11
-rw-r--r--  mm/khugepaged.c | 50
-rw-r--r--  mm/kmsan/core.c | 15
-rw-r--r--  mm/kmsan/hooks.c | 36
-rw-r--r--  mm/ksm.c | 17
-rw-r--r--  mm/list_lru.c | 20
-rw-r--r--  mm/madvise.c | 10
-rw-r--r--  mm/memblock.c | 4
-rw-r--r--  mm/memcontrol.c | 126
-rw-r--r--  mm/memfd.c | 47
-rw-r--r--  mm/memory-failure.c | 11
-rw-r--r--  mm/memory-tiers.c | 26
-rw-r--r--  mm/memory.c | 401
-rw-r--r--  mm/memory_hotplug.c | 34
-rw-r--r--  mm/mempolicy.c | 507
-rw-r--r--  mm/mempool.c | 13
-rw-r--r--  mm/migrate.c | 7
-rw-r--r--  mm/mlock.c | 3
-rw-r--r--  mm/mm_init.c | 1
-rw-r--r--  mm/mmap.c | 112
-rw-r--r--  mm/mmu_gather.c | 111
-rw-r--r--  mm/mprotect.c | 4
-rw-r--r--  mm/nommu.c | 2
-rw-r--r--  mm/oom_kill.c | 6
-rw-r--r--  mm/page-writeback.c | 390
-rw-r--r--  mm/page_alloc.c | 209
-rw-r--r--  mm/page_isolation.c | 2
-rw-r--r--  mm/page_owner.c | 377
-rw-r--r--  mm/page_table_check.c | 11
-rw-r--r--  mm/pgtable-generic.c | 2
-rw-r--r--  mm/ptdump.c | 22
-rw-r--r--  mm/readahead.c | 9
-rw-r--r--  mm/rmap.c | 14
-rw-r--r--  mm/shmem.c | 66
-rw-r--r--  mm/slab.h | 11
-rw-r--r--  mm/slab_common.c | 31
-rw-r--r--  mm/slub.c | 116
-rw-r--r--  mm/sparse.c | 3
-rw-r--r--  mm/swap.c | 197
-rw-r--r--  mm/swap_slots.c | 3
-rw-r--r--  mm/swap_state.c | 32
-rw-r--r--  mm/swapfile.c | 54
-rw-r--r--  mm/userfaultfd.c | 494
-rw-r--r--  mm/util.c | 23
-rw-r--r--  mm/vmalloc.c | 1206
-rw-r--r--  mm/vmscan.c | 256
-rw-r--r--  mm/workingset.c | 1
-rw-r--r--  mm/z3fold.c | 5
-rw-r--r--  mm/zsmalloc.c | 123
-rw-r--r--  mm/zswap.c | 1831
76 files changed, 6024 insertions, 3257 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index ffc3a2ba3a..b1448aa81e 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -45,22 +45,6 @@ config ZSWAP_DEFAULT_ON
The selection made here can be overridden by using the kernel
command line 'zswap.enabled=' option.
-config ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON
- bool "Invalidate zswap entries when pages are loaded"
- depends on ZSWAP
- help
- If selected, exclusive loads for zswap will be enabled at boot,
- otherwise it will be disabled.
-
- If exclusive loads are enabled, when a page is loaded from zswap,
- the zswap entry is invalidated at once, as opposed to leaving it
- in zswap until the swap entry is freed.
-
- This avoids having two copies of the same page in memory
- (compressed and uncompressed) after faulting in a page from zswap.
- The cost is that if the page was never dirtied and needs to be
- swapped out again, it will be re-compressed.
-
config ZSWAP_SHRINKER_DEFAULT_ON
bool "Shrink the zswap pool on memory pressure"
depends on ZSWAP
@@ -599,7 +583,7 @@ config MEMORY_BALLOON
# support for memory balloon compaction
config BALLOON_COMPACTION
bool "Allow for balloon memory compaction/migration"
- def_bool y
+ default y
depends on COMPACTION && MEMORY_BALLOON
help
Memory fragmentation introduced by ballooning might reduce
@@ -614,7 +598,7 @@ config BALLOON_COMPACTION
# support for memory compaction
config COMPACTION
bool "Allow for memory compaction"
- def_bool y
+ default y
select MIGRATION
depends on MMU
help
@@ -637,7 +621,6 @@ config COMPACT_UNEVICTABLE_DEFAULT
# support for free page reporting
config PAGE_REPORTING
bool "Free page reporting"
- def_bool n
help
Free page reporting allows for the incremental acquisition of
free pages from the buddy allocator for the purpose of reporting
@@ -649,7 +632,7 @@ config PAGE_REPORTING
#
config MIGRATION
bool "Page migration"
- def_bool y
+ default y
depends on (NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA) && MMU
help
Allows the migration of the physical location of pages of processes
@@ -901,15 +884,6 @@ config CMA
If unsure, say "n".
-config CMA_DEBUG
- bool "CMA debug messages (DEVELOPMENT)"
- depends on DEBUG_KERNEL && CMA
- help
- Turns on debug messages in CMA. This produces KERN_DEBUG
- messages for every CMA call as well as various messages while
- processing calls such as dma_alloc_from_contiguous().
- This option does not affect warning and error messages.
-
config CMA_DEBUGFS
bool "CMA debugfs interface"
depends on CMA && DEBUG_FS
@@ -926,14 +900,14 @@ config CMA_SYSFS
config CMA_AREAS
int "Maximum count of the CMA areas"
depends on CMA
- default 19 if NUMA
- default 7
+ default 20 if NUMA
+ default 8
help
CMA allows to create CMA areas for particular purpose, mainly,
used as device private area. This parameter sets the maximum
number of CMA area in the system.
- If unsure, leave the default value "7" in UMA and "19" in NUMA.
+ If unsure, leave the default value "8" in UMA and "20" in NUMA.
config MEM_SOFT_DIRTY
bool "Track memory changes"
@@ -998,6 +972,12 @@ config IDLE_PAGE_TRACKING
See Documentation/admin-guide/mm/idle_page_tracking.rst for
more details.
+# Architectures which implement cpu_dcache_is_aliasing() to query
+# whether the data caches are aliased (VIVT or VIPT with dcache
+# aliasing) need to select this.
+config ARCH_HAS_CPU_CACHE_ALIASING
+ bool
+
config ARCH_HAS_CACHE_LINE_SIZE
bool
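
The comment above ties ARCH_HAS_CPU_CACHE_ALIASING to a runtime query, cpu_dcache_is_aliasing(). As a hedged illustration only (not part of this patch), a caller that needs extra cache maintenance solely on aliasing data caches might look like the sketch below; the <linux/cacheinfo.h> location of the helper and the maybe_flush() wrapper are assumptions for the example.

/*
 * Illustrative sketch: flush conditionally when the D-cache may alias.
 * cpu_dcache_is_aliasing() is assumed to be declared in <linux/cacheinfo.h>
 * and to return false on architectures that do not select
 * ARCH_HAS_CPU_CACHE_ALIASING.
 */
#include <linux/cacheinfo.h>
#include <linux/highmem.h>

static void maybe_flush(struct page *page)
{
	if (cpu_dcache_is_aliasing())
		flush_dcache_page(page);
}
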
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 321ab37999..afc72fde0f 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -64,11 +64,11 @@ config SLUB_DEBUG_ON
help
Boot with debugging on by default. SLUB boots by default with
the runtime debug capabilities switched off. Enabling this is
- equivalent to specifying the "slub_debug" parameter on boot.
+ equivalent to specifying the "slab_debug" parameter on boot.
There is no support for more fine grained debug control like
- possible with slub_debug=xxx. SLUB debugging may be switched
+ possible with slab_debug=xxx. SLUB debugging may be switched
off in a kernel built with CONFIG_SLUB_DEBUG_ON by specifying
- "slub_debug=-".
+ "slab_debug=-".
config PAGE_OWNER
bool "Track page owner"
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index e039d05304..5f2be8c8df 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -372,31 +372,6 @@ static int __init default_bdi_init(void)
}
subsys_initcall(default_bdi_init);
-/*
- * This function is used when the first inode for this wb is marked dirty. It
- * wakes-up the corresponding bdi thread which should then take care of the
- * periodic background write-out of dirty inodes. Since the write-out would
- * starts only 'dirty_writeback_interval' centisecs from now anyway, we just
- * set up a timer which wakes the bdi thread up later.
- *
- * Note, we wouldn't bother setting up the timer, but this function is on the
- * fast-path (used by '__mark_inode_dirty()'), so we save few context switches
- * by delaying the wake-up.
- *
- * We have to be careful not to postpone flush work if it is scheduled for
- * earlier. Thus we use queue_delayed_work().
- */
-void wb_wakeup_delayed(struct bdi_writeback *wb)
-{
- unsigned long timeout;
-
- timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
- spin_lock_irq(&wb->work_lock);
- if (test_bit(WB_registered, &wb->state))
- queue_delayed_work(bdi_wq, &wb->dwork, timeout);
- spin_unlock_irq(&wb->work_lock);
-}
-
static void wb_update_bandwidth_workfn(struct work_struct *work)
{
struct bdi_writeback *wb = container_of(to_delayed_work(work),
diff --git a/mm/cma.c b/mm/cma.c
index 7c09c47e53..3e9724716b 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -14,11 +14,6 @@
#define pr_fmt(fmt) "cma: " fmt
-#ifdef CONFIG_CMA_DEBUG
-#ifndef DEBUG
-# define DEBUG
-#endif
-#endif
#define CREATE_TRACE_POINTS
#include <linux/memblock.h>
@@ -187,10 +182,6 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
if (!size || !memblock_is_region_reserved(base, size))
return -EINVAL;
- /* alignment should be aligned with order_per_bit */
- if (!IS_ALIGNED(CMA_MIN_ALIGNMENT_PAGES, 1 << order_per_bit))
- return -EINVAL;
-
/* ensure minimal alignment required by mm core */
if (!IS_ALIGNED(base | size, CMA_MIN_ALIGNMENT_BYTES))
return -EINVAL;
@@ -387,7 +378,6 @@ err:
return ret;
}
-#ifdef CONFIG_CMA_DEBUG
static void cma_debug_show_areas(struct cma *cma)
{
unsigned long next_zero_bit, next_set_bit, nr_zero;
@@ -412,9 +402,6 @@ static void cma_debug_show_areas(struct cma *cma)
pr_cont("=> %lu free of %lu total pages\n", nr_total, cma->count);
spin_unlock_irq(&cma->lock);
}
-#else
-static inline void cma_debug_show_areas(struct cma *cma) { }
-#endif
/**
* cma_alloc() - allocate pages from contiguous area
@@ -436,17 +423,18 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
unsigned long i;
struct page *page = NULL;
int ret = -ENOMEM;
+ const char *name = cma ? cma->name : NULL;
+
+ trace_cma_alloc_start(name, count, align);
if (!cma || !cma->count || !cma->bitmap)
- goto out;
+ return page;
pr_debug("%s(cma %p, name: %s, count %lu, align %d)\n", __func__,
(void *)cma, cma->name, count, align);
if (!count)
- goto out;
-
- trace_cma_alloc_start(cma->name, count, align);
+ return page;
mask = cma_bitmap_aligned_mask(cma, align);
offset = cma_bitmap_aligned_offset(cma, align);
@@ -454,7 +442,7 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
bitmap_count = cma_bitmap_pages_to_bits(cma, count);
if (bitmap_count > bitmap_maxno)
- goto out;
+ return page;
for (;;) {
spin_lock_irq(&cma->lock);
@@ -496,8 +484,6 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
start = bitmap_no + mask + 1;
}
- trace_cma_alloc_finish(cma->name, pfn, page, count, align, ret);
-
/*
* CMA can allocate multiple page blocks, which results in different
* blocks being marked with different tags. Reset the tags to ignore
@@ -515,14 +501,13 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
}
pr_debug("%s(): returned %p\n", __func__, page);
-out:
+ trace_cma_alloc_finish(name, pfn, page, count, align, ret);
if (page) {
count_vm_event(CMA_ALLOC_SUCCESS);
cma_sysfs_account_success_pages(cma, count);
} else {
count_vm_event(CMA_ALLOC_FAIL);
- if (cma)
- cma_sysfs_account_fail_pages(cma, count);
+ cma_sysfs_account_fail_pages(cma, count);
}
return page;
@@ -573,6 +558,7 @@ bool cma_release(struct cma *cma, const struct page *pages,
free_contig_range(pfn, count);
cma_clear_bitmap(cma, pfn, count);
+ cma_sysfs_account_release_pages(cma, count);
trace_cma_release(cma->name, pfn, pages, count);
return true;
diff --git a/mm/cma.h b/mm/cma.h
index 88a0595670..ad61cc6dd4 100644
--- a/mm/cma.h
+++ b/mm/cma.h
@@ -27,6 +27,8 @@ struct cma {
atomic64_t nr_pages_succeeded;
/* the number of CMA page allocation failures */
atomic64_t nr_pages_failed;
+ /* the number of CMA page released */
+ atomic64_t nr_pages_released;
/* kobject requires dynamic object */
struct cma_kobject *cma_kobj;
#endif
@@ -44,10 +46,13 @@ static inline unsigned long cma_bitmap_maxno(struct cma *cma)
#ifdef CONFIG_CMA_SYSFS
void cma_sysfs_account_success_pages(struct cma *cma, unsigned long nr_pages);
void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages);
+void cma_sysfs_account_release_pages(struct cma *cma, unsigned long nr_pages);
#else
static inline void cma_sysfs_account_success_pages(struct cma *cma,
unsigned long nr_pages) {};
static inline void cma_sysfs_account_fail_pages(struct cma *cma,
unsigned long nr_pages) {};
+static inline void cma_sysfs_account_release_pages(struct cma *cma,
+ unsigned long nr_pages) {};
#endif
#endif
diff --git a/mm/cma_sysfs.c b/mm/cma_sysfs.c
index 56347d15b7..f50db39731 100644
--- a/mm/cma_sysfs.c
+++ b/mm/cma_sysfs.c
@@ -24,6 +24,11 @@ void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages)
atomic64_add(nr_pages, &cma->nr_pages_failed);
}
+void cma_sysfs_account_release_pages(struct cma *cma, unsigned long nr_pages)
+{
+ atomic64_add(nr_pages, &cma->nr_pages_released);
+}
+
static inline struct cma *cma_from_kobj(struct kobject *kobj)
{
return container_of(kobj, struct cma_kobject, kobj)->cma;
@@ -48,6 +53,15 @@ static ssize_t alloc_pages_fail_show(struct kobject *kobj,
}
CMA_ATTR_RO(alloc_pages_fail);
+static ssize_t release_pages_success_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct cma *cma = cma_from_kobj(kobj);
+
+ return sysfs_emit(buf, "%llu\n", atomic64_read(&cma->nr_pages_released));
+}
+CMA_ATTR_RO(release_pages_success);
+
static void cma_kobj_release(struct kobject *kobj)
{
struct cma *cma = cma_from_kobj(kobj);
@@ -60,6 +74,7 @@ static void cma_kobj_release(struct kobject *kobj)
static struct attribute *cma_attrs[] = {
&alloc_pages_success_attr.attr,
&alloc_pages_fail_attr.attr,
+ &release_pages_success_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(cma);
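
For context on where the new release_pages_success counter gets its data, here is a minimal, hypothetical driver-side sketch (not part of this patch). The grab_buffer()/drop_buffer() helpers and the externally reserved my_cma area are assumptions; cma_alloc() and cma_release() are the interfaces whose sysfs accounting the hunks above extend.

/*
 * Minimal sketch, assuming a CMA area "my_cma" was reserved elsewhere
 * (e.g. at boot via cma_declare_contiguous()).  Successful allocations bump
 * alloc_pages_success, failures bump alloc_pages_fail, and with this patch
 * cma_release() now also bumps release_pages_success.
 */
#include <linux/cma.h>
#include <linux/mm.h>

static struct page *grab_buffer(struct cma *my_cma, unsigned long nr_pages)
{
	/* order-0 alignment; trailing false leaves failure warnings enabled */
	return cma_alloc(my_cma, nr_pages, 0, false);
}

static void drop_buffer(struct cma *my_cma, struct page *pages,
			unsigned long nr_pages)
{
	if (!cma_release(my_cma, pages, nr_pages))
		pr_warn("pages were not part of the CMA area\n");
}
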
diff --git a/mm/compaction.c b/mm/compaction.c
index b961db601d..807b58e6eb 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -40,9 +40,22 @@ static inline void count_compact_events(enum vm_event_item item, long delta)
{
count_vm_events(item, delta);
}
+
+/*
+ * order == -1 is expected when compacting proactively via
+ * 1. /proc/sys/vm/compact_memory
+ * 2. /sys/devices/system/node/nodex/compact
+ * 3. /proc/sys/vm/compaction_proactiveness
+ */
+static inline bool is_via_compact_memory(int order)
+{
+ return order == -1;
+}
+
#else
#define count_compact_event(item) do { } while (0)
#define count_compact_events(item, delta) do { } while (0)
+static inline bool is_via_compact_memory(int order) { return false; }
#endif
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
@@ -66,45 +79,56 @@ static inline void count_compact_events(enum vm_event_item item, long delta)
#define COMPACTION_HPAGE_ORDER (PMD_SHIFT - PAGE_SHIFT)
#endif
-static unsigned long release_freepages(struct list_head *freelist)
+static void split_map_pages(struct list_head *freepages)
{
+ unsigned int i, order;
struct page *page, *next;
- unsigned long high_pfn = 0;
+ LIST_HEAD(tmp_list);
- list_for_each_entry_safe(page, next, freelist, lru) {
- unsigned long pfn = page_to_pfn(page);
- list_del(&page->lru);
- __free_page(page);
- if (pfn > high_pfn)
- high_pfn = pfn;
- }
+ for (order = 0; order < NR_PAGE_ORDERS; order++) {
+ list_for_each_entry_safe(page, next, &freepages[order], lru) {
+ unsigned int nr_pages;
- return high_pfn;
+ list_del(&page->lru);
+
+ nr_pages = 1 << order;
+
+ post_alloc_hook(page, order, __GFP_MOVABLE);
+ if (order)
+ split_page(page, order);
+
+ for (i = 0; i < nr_pages; i++) {
+ list_add(&page->lru, &tmp_list);
+ page++;
+ }
+ }
+ list_splice_init(&tmp_list, &freepages[0]);
+ }
}
-static void split_map_pages(struct list_head *list)
+static unsigned long release_free_list(struct list_head *freepages)
{
- unsigned int i, order, nr_pages;
- struct page *page, *next;
- LIST_HEAD(tmp_list);
-
- list_for_each_entry_safe(page, next, list, lru) {
- list_del(&page->lru);
+ int order;
+ unsigned long high_pfn = 0;
- order = page_private(page);
- nr_pages = 1 << order;
+ for (order = 0; order < NR_PAGE_ORDERS; order++) {
+ struct page *page, *next;
- post_alloc_hook(page, order, __GFP_MOVABLE);
- if (order)
- split_page(page, order);
+ list_for_each_entry_safe(page, next, &freepages[order], lru) {
+ unsigned long pfn = page_to_pfn(page);
- for (i = 0; i < nr_pages; i++) {
- list_add(&page->lru, &tmp_list);
- page++;
+ list_del(&page->lru);
+ /*
+ * Convert free pages into post allocation pages, so
+ * that we can free them via __free_pages().
+ */
+ post_alloc_hook(page, order, __GFP_MOVABLE);
+ __free_pages(page, order);
+ if (pfn > high_pfn)
+ high_pfn = pfn;
}
}
-
- list_splice(&tmp_list, list);
+ return high_pfn;
}
#ifdef CONFIG_COMPACTION
@@ -657,7 +681,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
nr_scanned += isolated - 1;
total_isolated += isolated;
cc->nr_freepages += isolated;
- list_add_tail(&page->lru, freelist);
+ list_add_tail(&page->lru, &freelist[order]);
if (!strict && cc->nr_migratepages <= cc->nr_freepages) {
blockpfn += isolated;
@@ -722,7 +746,11 @@ isolate_freepages_range(struct compact_control *cc,
unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long isolated, pfn, block_start_pfn, block_end_pfn;
- LIST_HEAD(freelist);
+ int order;
+ struct list_head tmp_freepages[NR_PAGE_ORDERS];
+
+ for (order = 0; order < NR_PAGE_ORDERS; order++)
+ INIT_LIST_HEAD(&tmp_freepages[order]);
pfn = start_pfn;
block_start_pfn = pageblock_start_pfn(pfn);
@@ -753,7 +781,7 @@ isolate_freepages_range(struct compact_control *cc,
break;
isolated = isolate_freepages_block(cc, &isolate_start_pfn,
- block_end_pfn, &freelist, 0, true);
+ block_end_pfn, tmp_freepages, 0, true);
/*
* In strict mode, isolate_freepages_block() returns 0 if
@@ -770,15 +798,15 @@ isolate_freepages_range(struct compact_control *cc,
*/
}
- /* __isolate_free_page() does not map the pages */
- split_map_pages(&freelist);
-
if (pfn < end_pfn) {
/* Loop terminated early, cleanup. */
- release_freepages(&freelist);
+ release_free_list(tmp_freepages);
return 0;
}
+ /* __isolate_free_page() does not map the pages */
+ split_map_pages(tmp_freepages);
+
/* We don't use freelists for anything. */
return pfn;
}
@@ -817,6 +845,32 @@ static bool too_many_isolated(struct compact_control *cc)
}
/**
+ * skip_isolation_on_order() - determine when to skip folio isolation based on
+ * folio order and compaction target order
+ * @order: to-be-isolated folio order
+ * @target_order: compaction target order
+ *
+ * This avoids unnecessary folio isolations during compaction.
+ */
+static bool skip_isolation_on_order(int order, int target_order)
+{
+ /*
+ * Unless we are performing global compaction (i.e.,
+ * is_via_compact_memory), skip any folios that are larger than the
+ * target order: we wouldn't be here if we'd have a free folio with
+ * the desired target_order, so migrating this folio would likely fail
+ * later.
+ */
+ if (!is_via_compact_memory(target_order) && order >= target_order)
+ return true;
+ /*
+ * We limit memory compaction to pageblocks and won't try
+ * creating free blocks of memory that are larger than that.
+ */
+ return order >= pageblock_order;
+}
+
+/**
* isolate_migratepages_block() - isolate all migrate-able pages within
* a single pageblock
* @cc: Compaction control structure.
@@ -947,7 +1001,22 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
valid_page = page;
}
- if (PageHuge(page) && cc->alloc_contig) {
+ if (PageHuge(page)) {
+ /*
+ * skip hugetlbfs if we are not compacting for pages
+ * bigger than its order. THPs and other compound pages
+ * are handled below.
+ */
+ if (!cc->alloc_contig) {
+ const unsigned int order = compound_order(page);
+
+ if (order <= MAX_PAGE_ORDER) {
+ low_pfn += (1UL << order) - 1;
+ nr_scanned += (1UL << order) - 1;
+ }
+ goto isolate_fail;
+ }
+ /* for alloc_contig case */
if (locked) {
unlock_page_lruvec_irqrestore(locked, flags);
locked = NULL;
@@ -1008,21 +1077,24 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
}
/*
- * Regardless of being on LRU, compound pages such as THP and
- * hugetlbfs are not to be compacted unless we are attempting
- * an allocation much larger than the huge page size (eg CMA).
- * We can potentially save a lot of iterations if we skip them
- * at once. The check is racy, but we can consider only valid
- * values and the only danger is skipping too much.
+ * Regardless of being on LRU, compound pages such as THP
+ * (hugetlbfs is handled above) are not to be compacted unless
+ * we are attempting an allocation larger than the compound
+ * page size. We can potentially save a lot of iterations if we
+ * skip them at once. The check is racy, but we can consider
+ * only valid values and the only danger is skipping too much.
*/
if (PageCompound(page) && !cc->alloc_contig) {
const unsigned int order = compound_order(page);
- if (likely(order <= MAX_PAGE_ORDER)) {
- low_pfn += (1UL << order) - 1;
- nr_scanned += (1UL << order) - 1;
+ /* Skip based on page order and compaction target order. */
+ if (skip_isolation_on_order(order, cc->order)) {
+ if (order <= MAX_PAGE_ORDER) {
+ low_pfn += (1UL << order) - 1;
+ nr_scanned += (1UL << order) - 1;
+ }
+ goto isolate_fail;
}
- goto isolate_fail;
}
/*
@@ -1165,10 +1237,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
}
/*
- * folio become large since the non-locked check,
- * and it's on LRU.
+ * Check LRU folio order under the lock
*/
- if (unlikely(folio_test_large(folio) && !cc->alloc_contig)) {
+ if (unlikely(skip_isolation_on_order(folio_order(folio),
+ cc->order) &&
+ !cc->alloc_contig)) {
low_pfn += folio_nr_pages(folio) - 1;
nr_scanned += folio_nr_pages(folio) - 1;
folio_set_lru(folio);
@@ -1365,12 +1438,14 @@ static bool suitable_migration_target(struct compact_control *cc,
{
/* If the page is a large free page, then disallow migration */
if (PageBuddy(page)) {
+ int order = cc->order > 0 ? cc->order : pageblock_order;
+
/*
* We are checking page_order without zone->lock taken. But
* the only small danger is that we skip a potentially suitable
* pageblock, so it's not worth to check order for valid range.
*/
- if (buddy_order_unsafe(page) >= pageblock_order)
+ if (buddy_order_unsafe(page) >= order)
return false;
}
@@ -1458,7 +1533,7 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn)
if (!page)
return;
- isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false);
+ isolate_freepages_block(cc, &start_pfn, end_pfn, cc->freepages, 1, false);
/* Skip this pageblock in the future as it's full or nearly full */
if (start_pfn == end_pfn && !cc->no_set_skip_hint)
@@ -1587,7 +1662,7 @@ static void fast_isolate_freepages(struct compact_control *cc)
nr_scanned += nr_isolated - 1;
total_isolated += nr_isolated;
cc->nr_freepages += nr_isolated;
- list_add_tail(&page->lru, &cc->freepages);
+ list_add_tail(&page->lru, &cc->freepages[order]);
count_compact_events(COMPACTISOLATED, nr_isolated);
} else {
/* If isolation fails, abort the search */
@@ -1664,13 +1739,12 @@ static void isolate_freepages(struct compact_control *cc)
unsigned long isolate_start_pfn; /* exact pfn we start at */
unsigned long block_end_pfn; /* end of current pageblock */
unsigned long low_pfn; /* lowest pfn scanner is able to scan */
- struct list_head *freelist = &cc->freepages;
unsigned int stride;
/* Try a small search of the free lists for a candidate */
fast_isolate_freepages(cc);
if (cc->nr_freepages)
- goto splitmap;
+ return;
/*
* Initialise the free scanner. The starting point is where we last
@@ -1730,7 +1804,7 @@ static void isolate_freepages(struct compact_control *cc)
/* Found a block suitable for isolating free pages from. */
nr_isolated = isolate_freepages_block(cc, &isolate_start_pfn,
- block_end_pfn, freelist, stride, false);
+ block_end_pfn, cc->freepages, stride, false);
/* Update the skip hint if the full pageblock was scanned */
if (isolate_start_pfn == block_end_pfn)
@@ -1771,10 +1845,6 @@ static void isolate_freepages(struct compact_control *cc)
* and the loop terminated due to isolate_start_pfn < low_pfn
*/
cc->free_pfn = isolate_start_pfn;
-
-splitmap:
- /* __isolate_free_page() does not map the pages */
- split_map_pages(freelist);
}
/*
@@ -1785,19 +1855,47 @@ static struct folio *compaction_alloc(struct folio *src, unsigned long data)
{
struct compact_control *cc = (struct compact_control *)data;
struct folio *dst;
+ int order = folio_order(src);
+ bool has_isolated_pages = false;
+ int start_order;
+ struct page *freepage;
+ unsigned long size;
+
+again:
+ for (start_order = order; start_order < NR_PAGE_ORDERS; start_order++)
+ if (!list_empty(&cc->freepages[start_order]))
+ break;
- if (list_empty(&cc->freepages)) {
- isolate_freepages(cc);
-
- if (list_empty(&cc->freepages))
+ /* no free pages in the list */
+ if (start_order == NR_PAGE_ORDERS) {
+ if (has_isolated_pages)
return NULL;
+ isolate_freepages(cc);
+ has_isolated_pages = true;
+ goto again;
}
- dst = list_entry(cc->freepages.next, struct folio, lru);
- list_del(&dst->lru);
- cc->nr_freepages--;
+ freepage = list_first_entry(&cc->freepages[start_order], struct page,
+ lru);
+ size = 1 << start_order;
+
+ list_del(&freepage->lru);
+
+ while (start_order > order) {
+ start_order--;
+ size >>= 1;
+
+ list_add(&freepage[size].lru, &cc->freepages[start_order]);
+ set_page_private(&freepage[size], start_order);
+ }
+ dst = (struct folio *)freepage;
- return dst;
+ post_alloc_hook(&dst->page, order, __GFP_MOVABLE);
+ if (order)
+ prep_compound_page(&dst->page, order);
+ cc->nr_freepages -= 1 << order;
+ cc->nr_migratepages -= 1 << order;
+ return page_rmappable_folio(&dst->page);
}
/*
@@ -1808,9 +1906,19 @@ static struct folio *compaction_alloc(struct folio *src, unsigned long data)
static void compaction_free(struct folio *dst, unsigned long data)
{
struct compact_control *cc = (struct compact_control *)data;
+ int order = folio_order(dst);
+ struct page *page = &dst->page;
- list_add(&dst->lru, &cc->freepages);
- cc->nr_freepages++;
+ if (folio_put_testzero(dst)) {
+ free_pages_prepare(page, order);
+ list_add(&dst->lru, &cc->freepages[order]);
+ cc->nr_freepages += 1 << order;
+ }
+ cc->nr_migratepages += 1 << order;
+ /*
+ * someone else has referenced the page, we cannot take it back to our
+ * free list.
+ */
}
/* possible outcome of isolate_migratepages */
@@ -2087,17 +2195,6 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
}
/*
- * order == -1 is expected when compacting proactively via
- * 1. /proc/sys/vm/compact_memory
- * 2. /sys/devices/system/node/nodex/compact
- * 3. /proc/sys/vm/compaction_proactiveness
- */
-static inline bool is_via_compact_memory(int order)
-{
- return order == -1;
-}
-
-/*
* Determine whether kswapd is (or recently was!) running on this node.
*
* pgdat_kswapd_lock() pins pgdat->kswapd, so a concurrent kswapd_stop() can't
@@ -2433,7 +2530,8 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
unsigned long last_migrated_pfn;
const bool sync = cc->mode != MIGRATE_ASYNC;
bool update_cached;
- unsigned int nr_succeeded = 0;
+ unsigned int nr_succeeded = 0, nr_migratepages;
+ int order;
/*
* These counters track activities during zone compaction. Initialize
@@ -2443,7 +2541,8 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
cc->total_free_scanned = 0;
cc->nr_migratepages = 0;
cc->nr_freepages = 0;
- INIT_LIST_HEAD(&cc->freepages);
+ for (order = 0; order < NR_PAGE_ORDERS; order++)
+ INIT_LIST_HEAD(&cc->freepages[order]);
INIT_LIST_HEAD(&cc->migratepages);
cc->migratetype = gfp_migratetype(cc->gfp_mask);
@@ -2551,11 +2650,17 @@ rescan:
pageblock_start_pfn(cc->migrate_pfn - 1));
}
+ /*
+ * Record the number of pages to migrate since the
+ * compaction_alloc/free() will update cc->nr_migratepages
+ * properly.
+ */
+ nr_migratepages = cc->nr_migratepages;
err = migrate_pages(&cc->migratepages, compaction_alloc,
compaction_free, (unsigned long)cc, cc->mode,
MR_COMPACTION, &nr_succeeded);
- trace_mm_compaction_migratepages(cc, nr_succeeded);
+ trace_mm_compaction_migratepages(nr_migratepages, nr_succeeded);
/* All pages were either migrated or will be released */
cc->nr_migratepages = 0;
@@ -2629,7 +2734,7 @@ out:
* so we don't leave any returned pages behind in the next attempt.
*/
if (cc->nr_freepages > 0) {
- unsigned long free_pfn = release_freepages(&cc->freepages);
+ unsigned long free_pfn = release_free_list(cc->freepages);
cc->nr_freepages = 0;
VM_BUG_ON(free_pfn == 0);
@@ -2648,7 +2753,6 @@ out:
trace_mm_compaction_end(cc, start_pfn, end_pfn, sync, ret);
- VM_BUG_ON(!list_empty(&cc->freepages));
VM_BUG_ON(!list_empty(&cc->migratepages));
return ret;
@@ -2783,25 +2887,27 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
}
/*
- * Compact all zones within a node till each zone's fragmentation score
- * reaches within proactive compaction thresholds (as determined by the
- * proactiveness tunable).
+ * compact_node() - compact all zones within a node
+ * @pgdat: The node page data
+ * @proactive: Whether the compaction is proactive
*
- * It is possible that the function returns before reaching score targets
- * due to various back-off conditions, such as, contention on per-node or
- * per-zone locks.
+ * For proactive compaction, compact until each zone's fragmentation score
+ * reaches within the proactive compaction thresholds (as determined by the
+ * proactiveness tunable). It is possible that the function returns before
+ * reaching the score targets due to various back-off conditions, such as
+ * contention on per-node or per-zone locks.
*/
-static void proactive_compact_node(pg_data_t *pgdat)
+static int compact_node(pg_data_t *pgdat, bool proactive)
{
int zoneid;
struct zone *zone;
struct compact_control cc = {
.order = -1,
- .mode = MIGRATE_SYNC_LIGHT,
+ .mode = proactive ? MIGRATE_SYNC_LIGHT : MIGRATE_SYNC,
.ignore_skip_hint = true,
.whole_zone = true,
.gfp_mask = GFP_KERNEL,
- .proactive_compaction = true,
+ .proactive_compaction = proactive,
};
for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
@@ -2809,54 +2915,39 @@ static void proactive_compact_node(pg_data_t *pgdat)
if (!populated_zone(zone))
continue;
+ if (fatal_signal_pending(current))
+ return -EINTR;
+
cc.zone = zone;
compact_zone(&cc, NULL);
- count_compact_events(KCOMPACTD_MIGRATE_SCANNED,
- cc.total_migrate_scanned);
- count_compact_events(KCOMPACTD_FREE_SCANNED,
- cc.total_free_scanned);
+ if (proactive) {
+ count_compact_events(KCOMPACTD_MIGRATE_SCANNED,
+ cc.total_migrate_scanned);
+ count_compact_events(KCOMPACTD_FREE_SCANNED,
+ cc.total_free_scanned);
+ }
}
-}
-
-/* Compact all zones within a node */
-static void compact_node(int nid)
-{
- pg_data_t *pgdat = NODE_DATA(nid);
- int zoneid;
- struct zone *zone;
- struct compact_control cc = {
- .order = -1,
- .mode = MIGRATE_SYNC,
- .ignore_skip_hint = true,
- .whole_zone = true,
- .gfp_mask = GFP_KERNEL,
- };
-
-
- for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
-
- zone = &pgdat->node_zones[zoneid];
- if (!populated_zone(zone))
- continue;
-
- cc.zone = zone;
- compact_zone(&cc, NULL);
- }
+ return 0;
}
-/* Compact all nodes in the system */
-static void compact_nodes(void)
+/* Compact all zones of all nodes in the system */
+static int compact_nodes(void)
{
- int nid;
+ int ret, nid;
/* Flush pending updates to the LRU lists */
lru_add_drain_all();
- for_each_online_node(nid)
- compact_node(nid);
+ for_each_online_node(nid) {
+ ret = compact_node(NODE_DATA(nid), false);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
}
static int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write,
@@ -2902,9 +2993,9 @@ static int sysctl_compaction_handler(struct ctl_table *table, int write,
return -EINVAL;
if (write)
- compact_nodes();
+ ret = compact_nodes();
- return 0;
+ return ret;
}
#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
@@ -2918,7 +3009,7 @@ static ssize_t compact_store(struct device *dev,
/* Flush pending updates to the LRU lists */
lru_add_drain_all();
- compact_node(nid);
+ compact_node(NODE_DATA(nid), false);
}
return count;
@@ -3127,7 +3218,7 @@ static int kcompactd(void *p)
unsigned int prev_score, score;
prev_score = fragmentation_score_node(pgdat);
- proactive_compact_node(pgdat);
+ compact_node(pgdat, true);
score = fragmentation_score_node(pgdat);
/*
* Defer proactive compaction if the fragmentation
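
The reworked compaction_alloc() above uses a buddy-style pattern for its per-order free lists: take the smallest sufficiently large free block, then split it down to the wanted order, returning the upper half of each split to the lower-order list. A stripped-down, illustrative sketch of just that pattern follows; take_and_split() and the bare freelists[] array are stand-ins for cc->freepages and the surrounding refcount and statistics bookkeeping in the real code.

/* Illustrative only: mirrors the pick-and-split loop in compaction_alloc(). */
#include <linux/mm.h>
#include <linux/mmzone.h>

static struct page *take_and_split(struct list_head *freelists,
				   unsigned int order)
{
	unsigned int cur;
	struct page *page;

	/* Find the smallest order with a free block that is large enough. */
	for (cur = order; cur < NR_PAGE_ORDERS; cur++)
		if (!list_empty(&freelists[cur]))
			break;
	if (cur == NR_PAGE_ORDERS)
		return NULL;

	page = list_first_entry(&freelists[cur], struct page, lru);
	list_del(&page->lru);

	/* Split down, giving the upper half of each split back to its list. */
	while (cur > order) {
		cur--;
		list_add(&page[1U << cur].lru, &freelists[cur]);
		set_page_private(&page[1U << cur], cur);
	}
	return page;
}
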
diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig
index 29f43fbc2e..fecb817241 100644
--- a/mm/damon/Kconfig
+++ b/mm/damon/Kconfig
@@ -71,7 +71,7 @@ config DAMON_SYSFS_KUNIT_TEST
If unsure, say N.
-config DAMON_DBGFS
+config DAMON_DBGFS_DEPRECATED
bool "DAMON debugfs interface (DEPRECATED!)"
depends on DAMON_VADDR && DAMON_PADDR && DEBUG_FS
help
@@ -84,6 +84,11 @@ config DAMON_DBGFS
(DAMON_SYSFS). If you depend on this and cannot move, please report
your usecase to damon@lists.linux.dev and linux-mm@kvack.org.
+config DAMON_DBGFS
+ bool
+ default y
+ depends on DAMON_DBGFS_DEPRECATED
+
config DAMON_DBGFS_KUNIT_TEST
bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS
depends on DAMON_DBGFS && KUNIT=y
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 5b325749fc..6d503c1c12 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -11,6 +11,7 @@
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/mm.h>
+#include <linux/psi.h>
#include <linux/slab.h>
#include <linux/string.h>
@@ -299,12 +300,48 @@ void damos_destroy_filter(struct damos_filter *f)
damos_free_filter(f);
}
-/* initialize private fields of damos_quota and return the pointer */
-static struct damos_quota *damos_quota_init_priv(struct damos_quota *quota)
+struct damos_quota_goal *damos_new_quota_goal(
+ enum damos_quota_goal_metric metric,
+ unsigned long target_value)
{
+ struct damos_quota_goal *goal;
+
+ goal = kmalloc(sizeof(*goal), GFP_KERNEL);
+ if (!goal)
+ return NULL;
+ goal->metric = metric;
+ goal->target_value = target_value;
+ INIT_LIST_HEAD(&goal->list);
+ return goal;
+}
+
+void damos_add_quota_goal(struct damos_quota *q, struct damos_quota_goal *g)
+{
+ list_add_tail(&g->list, &q->goals);
+}
+
+static void damos_del_quota_goal(struct damos_quota_goal *g)
+{
+ list_del(&g->list);
+}
+
+static void damos_free_quota_goal(struct damos_quota_goal *g)
+{
+ kfree(g);
+}
+
+void damos_destroy_quota_goal(struct damos_quota_goal *g)
+{
+ damos_del_quota_goal(g);
+ damos_free_quota_goal(g);
+}
+
+/* initialize fields of @quota that normally API users wouldn't set */
+static struct damos_quota *damos_quota_init(struct damos_quota *quota)
+{
+ quota->esz = 0;
quota->total_charged_sz = 0;
quota->total_charged_ns = 0;
- quota->esz = 0;
quota->charged_sz = 0;
quota->charged_from = 0;
quota->charge_target_from = NULL;
@@ -336,7 +373,9 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
scheme->stat = (struct damos_stat){};
INIT_LIST_HEAD(&scheme->list);
- scheme->quota = *(damos_quota_init_priv(quota));
+ scheme->quota = *(damos_quota_init(quota));
+ /* quota.goals should be separately set by caller */
+ INIT_LIST_HEAD(&scheme->quota.goals);
scheme->wmarks = *wmarks;
scheme->wmarks.activated = true;
@@ -373,8 +412,12 @@ static void damon_free_scheme(struct damos *s)
void damon_destroy_scheme(struct damos *s)
{
+ struct damos_quota_goal *g, *g_next;
struct damos_filter *f, *next;
+ damos_for_each_quota_goal_safe(g, g_next, &s->quota)
+ damos_destroy_quota_goal(g);
+
damos_for_each_filter_safe(f, next, s)
damos_destroy_filter(f);
damon_del_scheme(s);
@@ -1083,21 +1126,78 @@ static unsigned long damon_feed_loop_next_input(unsigned long last_input,
return min_input;
}
-/* Shouldn't be called if quota->ms, quota->sz, and quota->get_score unset */
+#ifdef CONFIG_PSI
+
+static u64 damos_get_some_mem_psi_total(void)
+{
+ if (static_branch_likely(&psi_disabled))
+ return 0;
+ return div_u64(psi_system.total[PSI_AVGS][PSI_MEM * 2],
+ NSEC_PER_USEC);
+}
+
+#else /* CONFIG_PSI */
+
+static inline u64 damos_get_some_mem_psi_total(void)
+{
+ return 0;
+};
+
+#endif /* CONFIG_PSI */
+
+static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal)
+{
+ u64 now_psi_total;
+
+ switch (goal->metric) {
+ case DAMOS_QUOTA_USER_INPUT:
+ /* User should already set goal->current_value */
+ break;
+ case DAMOS_QUOTA_SOME_MEM_PSI_US:
+ now_psi_total = damos_get_some_mem_psi_total();
+ goal->current_value = now_psi_total - goal->last_psi_total;
+ goal->last_psi_total = now_psi_total;
+ break;
+ default:
+ break;
+ }
+}
+
+/* Return the highest score since it makes schemes least aggressive */
+static unsigned long damos_quota_score(struct damos_quota *quota)
+{
+ struct damos_quota_goal *goal;
+ unsigned long highest_score = 0;
+
+ damos_for_each_quota_goal(goal, quota) {
+ damos_set_quota_goal_current_value(goal);
+ highest_score = max(highest_score,
+ goal->current_value * 10000 /
+ goal->target_value);
+ }
+
+ return highest_score;
+}
+
+/*
+ * Called only if quota->ms or quota->sz is set, or quota->goals is not empty.
+ */
static void damos_set_effective_quota(struct damos_quota *quota)
{
unsigned long throughput;
unsigned long esz;
- if (!quota->ms && !quota->get_score) {
+ if (!quota->ms && list_empty(&quota->goals)) {
quota->esz = quota->sz;
return;
}
- if (quota->get_score) {
+ if (!list_empty(&quota->goals)) {
+ unsigned long score = damos_quota_score(quota);
+
quota->esz_bp = damon_feed_loop_next_input(
max(quota->esz_bp, 10000UL),
- quota->get_score(quota->get_score_arg));
+ score);
esz = quota->esz_bp / 10000;
}
@@ -1107,7 +1207,7 @@ static void damos_set_effective_quota(struct damos_quota *quota)
quota->total_charged_ns;
else
throughput = PAGE_SIZE * 1024;
- if (quota->get_score)
+ if (!list_empty(&quota->goals))
esz = min(throughput * quota->ms, esz);
else
esz = throughput * quota->ms;
@@ -1127,7 +1227,7 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s)
unsigned long cumulated_sz;
unsigned int score, max_score = 0;
- if (!quota->ms && !quota->sz && !quota->get_score)
+ if (!quota->ms && !quota->sz && list_empty(&quota->goals))
return;
/* New charge window starts */
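
To make the new quota-goal API concrete, here is a small sketch of how a DAMOS user could attach a PSI-based goal to a scheme's quota, mirroring what mm/damon/reclaim.c does later in this series; the attach_psi_goal() helper, the scheme pointer, and the 1000-microsecond target are assumptions for the example.

/*
 * Illustrative only: aim the quota auto-tuning at 1000us of "some" memory
 * PSI per quota reset interval, using the helpers introduced above.
 */
#include <linux/damon.h>

static int attach_psi_goal(struct damos *scheme)
{
	struct damos_quota_goal *goal;

	goal = damos_new_quota_goal(DAMOS_QUOTA_SOME_MEM_PSI_US, 1000);
	if (!goal)
		return -ENOMEM;
	damos_add_quota_goal(&scheme->quota, goal);
	return 0;
}
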
diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
index 7dac24e69e..2461cfe2e9 100644
--- a/mm/damon/dbgfs.c
+++ b/mm/damon/dbgfs.c
@@ -15,6 +15,11 @@
#include <linux/page_idle.h>
#include <linux/slab.h>
+#define DAMON_DBGFS_DEPRECATION_NOTICE \
+ "DAMON debugfs interface is deprecated, so users should move " \
+ "to DAMON_SYSFS. If you cannot, please report your usecase to " \
+ "damon@lists.linux.dev and linux-mm@kvack.org.\n"
+
static struct damon_ctx **dbgfs_ctxs;
static int dbgfs_nr_ctxs;
static struct dentry **dbgfs_dirs;
@@ -22,10 +27,7 @@ static DEFINE_MUTEX(damon_dbgfs_lock);
static void damon_dbgfs_warn_deprecation(void)
{
- pr_warn_once("DAMON debugfs interface is deprecated, "
- "so users should move to DAMON_SYSFS. If you cannot, "
- "please report your usecase to damon@lists.linux.dev and "
- "linux-mm@kvack.org.\n");
+ pr_warn_once(DAMON_DBGFS_DEPRECATION_NOTICE);
}
/*
@@ -805,6 +807,14 @@ static void dbgfs_destroy_ctx(struct damon_ctx *ctx)
damon_destroy_ctx(ctx);
}
+static ssize_t damon_dbgfs_deprecated_read(struct file *file,
+ char __user *buf, size_t count, loff_t *ppos)
+{
+ static const char kbuf[512] = DAMON_DBGFS_DEPRECATION_NOTICE;
+
+ return simple_read_from_buffer(buf, count, ppos, kbuf, strlen(kbuf));
+}
+
/*
* Make a context of @name and create a debugfs directory for it.
*
@@ -1056,6 +1066,10 @@ static int damon_dbgfs_static_file_open(struct inode *inode, struct file *file)
return nonseekable_open(inode, file);
}
+static const struct file_operations deprecated_fops = {
+ .read = damon_dbgfs_deprecated_read,
+};
+
static const struct file_operations mk_contexts_fops = {
.open = damon_dbgfs_static_file_open,
.write = dbgfs_mk_context_write,
@@ -1076,9 +1090,9 @@ static int __init __damon_dbgfs_init(void)
{
struct dentry *dbgfs_root;
const char * const file_names[] = {"mk_contexts", "rm_contexts",
- "monitor_on"};
+ "monitor_on_DEPRECATED", "DEPRECATED"};
const struct file_operations *fops[] = {&mk_contexts_fops,
- &rm_contexts_fops, &monitor_on_fops};
+ &rm_contexts_fops, &monitor_on_fops, &deprecated_fops};
int i;
dbgfs_root = debugfs_create_dir("damon", NULL);
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index 081e2a3257..5e6dc31207 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -249,7 +249,7 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s)
put_folio:
folio_put(folio);
}
- applied = reclaim_pages(&folio_list);
+ applied = reclaim_pages(&folio_list, false);
cond_resched();
return applied * PAGE_SIZE;
}
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
index 66e190f037..9bd341d62b 100644
--- a/mm/damon/reclaim.c
+++ b/mm/damon/reclaim.c
@@ -62,6 +62,36 @@ static struct damos_quota damon_reclaim_quota = {
};
DEFINE_DAMON_MODULES_DAMOS_QUOTAS(damon_reclaim_quota);
+/*
+ * Desired level of memory pressure-stall time in microseconds.
+ *
+ * While keeping the caps that are set by other quotas, DAMON_RECLAIM
+ * automatically increases and decreases the effective level of the quota,
+ * aiming to incur this level of memory pressure. System-wide ``some`` memory
+ * PSI in microseconds per quota reset interval (``quota_reset_interval_ms``)
+ * is collected and compared to this value to see if the aim is satisfied.
+ * Value zero means disabling this auto-tuning feature.
+ *
+ * Disabled by default.
+ */
+static unsigned long quota_mem_pressure_us __read_mostly;
+module_param(quota_mem_pressure_us, ulong, 0600);
+
+/*
+ * User-specifiable feedback for auto-tuning of the effective quota.
+ *
+ * While keeping the caps that are set by other quotas, DAMON_RECLAIM
+ * automatically increases and decreases the effective level of the quota,
+ * aiming to receive this feedback of value ``10,000`` from the user.
+ * DAMON_RECLAIM assumes the feedback
+ * value and the quota are positively proportional. Value zero means disabling
+ * this auto-tuning feature.
+ *
+ * Disabled by default.
+ *
+ */
+static unsigned long quota_autotune_feedback __read_mostly;
+module_param(quota_autotune_feedback, ulong, 0600);
+
static struct damos_watermarks damon_reclaim_wmarks = {
.metric = DAMOS_WMARK_FREE_MEM_RATE,
.interval = 5000000, /* 5 seconds */
@@ -159,11 +189,13 @@ static void damon_reclaim_copy_quota_status(struct damos_quota *dst,
dst->charged_from = src->charged_from;
dst->charge_target_from = src->charge_target_from;
dst->charge_addr_from = src->charge_addr_from;
+ dst->esz_bp = src->esz_bp;
}
static int damon_reclaim_apply_parameters(void)
{
struct damos *scheme, *old_scheme;
+ struct damos_quota_goal *goal;
struct damos_filter *filter;
int err = 0;
@@ -180,6 +212,27 @@ static int damon_reclaim_apply_parameters(void)
damon_reclaim_copy_quota_status(&scheme->quota,
&old_scheme->quota);
}
+
+ if (quota_mem_pressure_us) {
+ goal = damos_new_quota_goal(DAMOS_QUOTA_SOME_MEM_PSI_US,
+ quota_mem_pressure_us);
+ if (!goal) {
+ damon_destroy_scheme(scheme);
+ return -ENOMEM;
+ }
+ damos_add_quota_goal(&scheme->quota, goal);
+ }
+
+ if (quota_autotune_feedback) {
+ goal = damos_new_quota_goal(DAMOS_QUOTA_USER_INPUT, 10000);
+ if (!goal) {
+ damon_destroy_scheme(scheme);
+ return -ENOMEM;
+ }
+ goal->current_value = quota_autotune_feedback;
+ damos_add_quota_goal(&scheme->quota, goal);
+ }
+
if (skip_anon) {
filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true);
if (!filter) {
diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h
index 4c37a166eb..a63f51577c 100644
--- a/mm/damon/sysfs-common.h
+++ b/mm/damon/sysfs-common.h
@@ -49,6 +49,8 @@ int damon_sysfs_schemes_update_regions_start(
struct damon_sysfs_schemes *sysfs_schemes,
struct damon_ctx *ctx, bool total_bytes_only);
+void damos_sysfs_mark_finished_regions_updates(struct damon_ctx *ctx);
+
bool damos_sysfs_regions_upd_done(void);
int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx);
@@ -57,5 +59,9 @@ int damon_sysfs_schemes_clear_regions(
struct damon_sysfs_schemes *sysfs_schemes,
struct damon_ctx *ctx);
-void damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes,
+int damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes,
+ struct damon_ctx *ctx);
+
+void damos_sysfs_update_effective_quotas(
+ struct damon_sysfs_schemes *sysfs_schemes,
struct damon_ctx *ctx);
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index ae0f0b314f..53a90ac678 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -127,17 +127,17 @@ static const struct kobj_type damon_sysfs_scheme_region_ktype = {
*
* Once the tried regions update request is received, the request handling
* start function (damon_sysfs_scheme_update_regions_start()) sets the status
- * of all schemes as 'idle' again, and register ->before_damos_apply() and
- * ->after_sampling() callbacks.
+ * of all schemes as 'idle' again, and registers the ->before_damos_apply()
+ * callback.
*
* Then, the first followup ->before_damos_apply() callback
* (damon_sysfs_before_damos_apply()) sets the status 'started'. The first
- * ->after_sampling() callback (damon_sysfs_after_sampling()) after the call
- * is called only after the scheme is completely applied
- * to the given snapshot. Hence the callback knows the situation by showing
- * 'started' status, and sets the status as 'finished'. Then,
- * damon_sysfs_before_damos_apply() understands the situation by showing the
- * 'finished' status and do nothing.
+ * ->after_sampling() or ->after_aggregation() callback
+ * (damon_sysfs_cmd_request_callback()) that follows is invoked only after
+ * the scheme has been completely applied to the given snapshot. Hence the
+ * callback recognizes the situation from the 'started' status and sets the
+ * status to 'finished'. Then, damon_sysfs_before_damos_apply() recognizes
+ * the situation from the 'finished' status and does nothing.
*
* If DAMOS is not applied to any region due to any reasons including the
* access pattern, the watermarks, the quotas, and the filters,
@@ -826,15 +826,48 @@ static const struct kobj_type damon_sysfs_watermarks_ktype = {
struct damos_sysfs_quota_goal {
struct kobject kobj;
+ enum damos_quota_goal_metric metric;
unsigned long target_value;
unsigned long current_value;
};
+/* This should match with enum damos_quota_goal_metric */
+static const char * const damos_sysfs_quota_goal_metric_strs[] = {
+ "user_input",
+ "some_mem_psi_us",
+};
+
static struct damos_sysfs_quota_goal *damos_sysfs_quota_goal_alloc(void)
{
return kzalloc(sizeof(struct damos_sysfs_quota_goal), GFP_KERNEL);
}
+static ssize_t target_metric_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damos_sysfs_quota_goal *goal = container_of(kobj,
+ struct damos_sysfs_quota_goal, kobj);
+
+ return sysfs_emit(buf, "%s\n",
+ damos_sysfs_quota_goal_metric_strs[goal->metric]);
+}
+
+static ssize_t target_metric_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damos_sysfs_quota_goal *goal = container_of(kobj,
+ struct damos_sysfs_quota_goal, kobj);
+ enum damos_quota_goal_metric m;
+
+ for (m = 0; m < NR_DAMOS_QUOTA_GOAL_METRICS; m++) {
+ if (sysfs_streq(buf, damos_sysfs_quota_goal_metric_strs[m])) {
+ goal->metric = m;
+ return count;
+ }
+ }
+ return -EINVAL;
+}
+
static ssize_t target_value_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -880,6 +913,9 @@ static void damos_sysfs_quota_goal_release(struct kobject *kobj)
kfree(container_of(kobj, struct damos_sysfs_quota_goal, kobj));
}
+static struct kobj_attribute damos_sysfs_quota_goal_target_metric_attr =
+ __ATTR_RW_MODE(target_metric, 0600);
+
static struct kobj_attribute damos_sysfs_quota_goal_target_value_attr =
__ATTR_RW_MODE(target_value, 0600);
@@ -887,6 +923,7 @@ static struct kobj_attribute damos_sysfs_quota_goal_current_value_attr =
__ATTR_RW_MODE(current_value, 0600);
static struct attribute *damos_sysfs_quota_goal_attrs[] = {
+ &damos_sysfs_quota_goal_target_metric_attr.attr,
&damos_sysfs_quota_goal_target_value_attr.attr,
&damos_sysfs_quota_goal_current_value_attr.attr,
NULL,
@@ -1139,6 +1176,7 @@ struct damon_sysfs_quotas {
unsigned long ms;
unsigned long sz;
unsigned long reset_interval_ms;
+ unsigned long effective_sz; /* Effective size quota in bytes */
};
static struct damon_sysfs_quotas *damon_sysfs_quotas_alloc(void)
@@ -1252,6 +1290,15 @@ static ssize_t reset_interval_ms_store(struct kobject *kobj,
return count;
}
+static ssize_t effective_bytes_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_quotas *quotas = container_of(kobj,
+ struct damon_sysfs_quotas, kobj);
+
+ return sysfs_emit(buf, "%lu\n", quotas->effective_sz);
+}
+
static void damon_sysfs_quotas_release(struct kobject *kobj)
{
kfree(container_of(kobj, struct damon_sysfs_quotas, kobj));
@@ -1266,10 +1313,14 @@ static struct kobj_attribute damon_sysfs_quotas_sz_attr =
static struct kobj_attribute damon_sysfs_quotas_reset_interval_ms_attr =
__ATTR_RW_MODE(reset_interval_ms, 0600);
+static struct kobj_attribute damon_sysfs_quotas_effective_bytes_attr =
+ __ATTR_RO_MODE(effective_bytes, 0400);
+
static struct attribute *damon_sysfs_quotas_attrs[] = {
&damon_sysfs_quotas_ms_attr.attr,
&damon_sysfs_quotas_sz_attr.attr,
&damon_sysfs_quotas_reset_interval_ms_attr.attr,
+ &damon_sysfs_quotas_effective_bytes_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(damon_sysfs_quotas);
@@ -1868,35 +1919,35 @@ static int damon_sysfs_set_scheme_filters(struct damos *scheme,
return 0;
}
-static unsigned long damos_sysfs_get_quota_score(void *arg)
-{
- return (unsigned long)arg;
-}
-
-static void damos_sysfs_set_quota_score(
+static int damos_sysfs_set_quota_score(
struct damos_sysfs_quota_goals *sysfs_goals,
struct damos_quota *quota)
{
- struct damos_sysfs_quota_goal *sysfs_goal;
+ struct damos_quota_goal *goal, *next;
int i;
- quota->get_score = NULL;
- quota->get_score_arg = (void *)0;
+ damos_for_each_quota_goal_safe(goal, next, quota)
+ damos_destroy_quota_goal(goal);
+
for (i = 0; i < sysfs_goals->nr; i++) {
- sysfs_goal = sysfs_goals->goals_arr[i];
+ struct damos_sysfs_quota_goal *sysfs_goal =
+ sysfs_goals->goals_arr[i];
+
if (!sysfs_goal->target_value)
continue;
- /* Higher score makes scheme less aggressive */
- quota->get_score_arg = (void *)max(
- (unsigned long)quota->get_score_arg,
- sysfs_goal->current_value * 10000 /
+ goal = damos_new_quota_goal(sysfs_goal->metric,
sysfs_goal->target_value);
- quota->get_score = damos_sysfs_get_quota_score;
+ if (!goal)
+ return -ENOMEM;
+ if (sysfs_goal->metric == DAMOS_QUOTA_USER_INPUT)
+ goal->current_value = sysfs_goal->current_value;
+ damos_add_quota_goal(quota, goal);
}
+ return 0;
}
-void damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes,
+int damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes,
struct damon_ctx *ctx)
{
struct damos *scheme;
@@ -1904,16 +1955,41 @@ void damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes,
damon_for_each_scheme(scheme, ctx) {
struct damon_sysfs_scheme *sysfs_scheme;
+ int err;
/* user could have removed the scheme sysfs dir */
if (i >= sysfs_schemes->nr)
break;
sysfs_scheme = sysfs_schemes->schemes_arr[i];
- damos_sysfs_set_quota_score(sysfs_scheme->quotas->goals,
+ err = damos_sysfs_set_quota_score(sysfs_scheme->quotas->goals,
&scheme->quota);
+ if (err)
+ /* kdamond will clean up schemes and terminate */
+ return err;
i++;
}
+ return 0;
+}
+
+void damos_sysfs_update_effective_quotas(
+ struct damon_sysfs_schemes *sysfs_schemes,
+ struct damon_ctx *ctx)
+{
+ struct damos *scheme;
+ int schemes_idx = 0;
+
+ damon_for_each_scheme(scheme, ctx) {
+ struct damon_sysfs_quotas *sysfs_quotas;
+
+ /* user could have removed the scheme sysfs dir */
+ if (schemes_idx >= sysfs_schemes->nr)
+ break;
+
+ sysfs_quotas =
+ sysfs_schemes->schemes_arr[schemes_idx++]->quotas;
+ sysfs_quotas->effective_sz = scheme->quota.esz;
+ }
}
static struct damos *damon_sysfs_mk_scheme(
@@ -1953,13 +2029,17 @@ static struct damos *damon_sysfs_mk_scheme(
.low = sysfs_wmarks->low,
};
- damos_sysfs_set_quota_score(sysfs_quotas->goals, &quota);
-
scheme = damon_new_scheme(&pattern, sysfs_scheme->action,
sysfs_scheme->apply_interval_us, &quota, &wmarks);
if (!scheme)
return NULL;
+ err = damos_sysfs_set_quota_score(sysfs_quotas->goals, &scheme->quota);
+ if (err) {
+ damon_destroy_scheme(scheme);
+ return NULL;
+ }
+
err = damon_sysfs_set_scheme_filters(scheme, sysfs_filters);
if (err) {
damon_destroy_scheme(scheme);
@@ -1995,7 +2075,11 @@ static void damon_sysfs_update_scheme(struct damos *scheme,
scheme->quota.weight_nr_accesses = sysfs_weights->nr_accesses;
scheme->quota.weight_age = sysfs_weights->age;
- damos_sysfs_set_quota_score(sysfs_quotas->goals, &scheme->quota);
+ err = damos_sysfs_set_quota_score(sysfs_quotas->goals, &scheme->quota);
+ if (err) {
+ damon_destroy_scheme(scheme);
+ return;
+ }
scheme->wmarks.metric = sysfs_wmarks->metric;
scheme->wmarks.interval = sysfs_wmarks->interval_us;
@@ -2122,7 +2206,7 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx,
* callback is registered, damon_sysfs_lock should be held to ensure the
* regions directories exist.
*/
-static int damon_sysfs_after_sampling(struct damon_ctx *ctx)
+void damos_sysfs_mark_finished_regions_updates(struct damon_ctx *ctx)
{
struct damon_sysfs_schemes *sysfs_schemes =
damon_sysfs_schemes_for_damos_callback;
@@ -2138,8 +2222,6 @@ static int damon_sysfs_after_sampling(struct damon_ctx *ctx)
sysfs_regions->upd_status =
DAMOS_TRIED_REGIONS_UPD_FINISHED;
}
-
- return 0;
}
/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */
@@ -2212,7 +2294,6 @@ int damon_sysfs_schemes_update_regions_start(
damos_tried_regions_init_upd_status(sysfs_schemes, ctx);
damos_regions_upd_total_bytes_only = total_bytes_only;
ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply;
- ctx->callback.after_sampling = damon_sysfs_after_sampling;
return 0;
}
@@ -2241,7 +2322,6 @@ int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx)
{
damon_sysfs_schemes_for_damos_callback = NULL;
ctx->callback.before_damos_apply = NULL;
- ctx->callback.after_sampling = NULL;
damon_sysfs_schemes_region_idx = 0;
return 0;
}
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index 1f891e18b4..6fee383bc0 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -1020,6 +1020,11 @@ enum damon_sysfs_cmd {
*/
DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS,
/*
+ * @DAMON_SYSFS_CMD_UPDATE_SCHEMES_EFFECTIVE_QUOTAS: Update the
+ * effective size quota of the scheme in bytes.
+ */
+ DAMON_SYSFS_CMD_UPDATE_SCHEMES_EFFECTIVE_QUOTAS,
+ /*
* @NR_DAMON_SYSFS_CMDS: Total number of DAMON sysfs commands.
*/
NR_DAMON_SYSFS_CMDS,
@@ -1035,6 +1040,7 @@ static const char * const damon_sysfs_cmd_strs[] = {
"update_schemes_tried_bytes",
"update_schemes_tried_regions",
"clear_schemes_tried_regions",
+ "update_schemes_effective_quotas",
};
/*
@@ -1371,19 +1377,43 @@ static int damon_sysfs_commit_schemes_quota_goals(
ctx = sysfs_kdamond->damon_ctx;
sysfs_ctx = sysfs_kdamond->contexts->contexts_arr[0];
- damos_sysfs_set_quota_scores(sysfs_ctx->schemes, ctx);
+ return damos_sysfs_set_quota_scores(sysfs_ctx->schemes, ctx);
+}
+
+/*
+ * damon_sysfs_upd_schemes_effective_quotas() - Update schemes effective quotas
+ * sysfs files.
+ * @kdamond: The kobject wrapper that is associated with the kdamond thread.
+ *
+ * This function reads the schemes' effective quotas of a specific kdamond and
+ * updates the related values for the sysfs files. This function should be
+ * called from DAMON callbacks while holding ``damon_sysfs_lock``, to safely access the
+ * DAMON contexts-internal data and DAMON sysfs variables.
+ */
+static int damon_sysfs_upd_schemes_effective_quotas(
+ struct damon_sysfs_kdamond *kdamond)
+{
+ struct damon_ctx *ctx = kdamond->damon_ctx;
+
+ if (!ctx)
+ return -EINVAL;
+ damos_sysfs_update_effective_quotas(
+ kdamond->contexts->contexts_arr[0]->schemes, ctx);
return 0;
}
+
/*
* damon_sysfs_cmd_request_callback() - DAMON callback for handling requests.
* @c: The DAMON context of the callback.
* @active: Whether @c is not deactivated due to watermarks.
+ * @after_aggregation: Whether this is called from the after_aggregation() callback.
*
* This function is periodically called back from the kdamond thread for @c.
* Then, it checks if there is a waiting DAMON sysfs request and handles it.
*/
-static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active)
+static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active,
+ bool after_aggregation)
{
struct damon_sysfs_kdamond *kdamond;
bool total_bytes_only = false;
@@ -1401,6 +1431,8 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active)
err = damon_sysfs_upd_schemes_stats(kdamond);
break;
case DAMON_SYSFS_CMD_COMMIT:
+ if (!after_aggregation)
+ goto out;
err = damon_sysfs_commit_input(kdamond);
break;
case DAMON_SYSFS_CMD_COMMIT_SCHEMES_QUOTA_GOALS:
@@ -1418,6 +1450,7 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active)
goto keep_lock_out;
}
} else {
+ damos_sysfs_mark_finished_regions_updates(c);
/*
* Continue regions updating if DAMON is still
* active and the update for all schemes is not
@@ -1432,6 +1465,9 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active)
case DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS:
err = damon_sysfs_clear_schemes_regions(kdamond);
break;
+ case DAMON_SYSFS_CMD_UPDATE_SCHEMES_EFFECTIVE_QUOTAS:
+ err = damon_sysfs_upd_schemes_effective_quotas(kdamond);
+ break;
default:
break;
}
@@ -1450,7 +1486,16 @@ static int damon_sysfs_after_wmarks_check(struct damon_ctx *c)
* after_wmarks_check() is called back while the context is deactivated
* by watermarks.
*/
- return damon_sysfs_cmd_request_callback(c, false);
+ return damon_sysfs_cmd_request_callback(c, false, false);
+}
+
+static int damon_sysfs_after_sampling(struct damon_ctx *c)
+{
+ /*
+ * after_sampling() is called back only while the context is not
+ * deactivated by watermarks.
+ */
+ return damon_sysfs_cmd_request_callback(c, true, false);
}
static int damon_sysfs_after_aggregation(struct damon_ctx *c)
@@ -1459,7 +1504,7 @@ static int damon_sysfs_after_aggregation(struct damon_ctx *c)
* after_aggregation() is called back only while the context is not
* deactivated by watermarks.
*/
- return damon_sysfs_cmd_request_callback(c, true);
+ return damon_sysfs_cmd_request_callback(c, true, true);
}
static struct damon_ctx *damon_sysfs_build_ctx(
@@ -1478,6 +1523,7 @@ static struct damon_ctx *damon_sysfs_build_ctx(
}
ctx->callback.after_wmarks_check = damon_sysfs_after_wmarks_check;
+ ctx->callback.after_sampling = damon_sysfs_after_sampling;
ctx->callback.after_aggregation = damon_sysfs_after_aggregation;
ctx->callback.before_terminate = damon_sysfs_before_terminate;
return ctx;
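
As an illustration outside the patch itself: the sysfs.c hunks above thread a new after_aggregation flag through damon_sysfs_cmd_request_callback() and wire up an after_sampling callback, so most commands are serviced every sampling interval while the commit command is deferred until an aggregation boundary. A minimal standalone C sketch of that dispatch shape (the names and the userspace framing are invented, not DAMON's API):

#include <stdbool.h>
#include <stdio.h>

enum cmd { CMD_NONE, CMD_COMMIT, CMD_UPDATE_EFFECTIVE_QUOTAS };

static enum cmd pending = CMD_COMMIT;	/* request posted via sysfs */

/* Handle the pending request; defer CMD_COMMIT unless we were called
 * from the aggregation-interval callback, mirroring the "goto out" above. */
static int request_callback(bool active, bool after_aggregation)
{
	switch (pending) {
	case CMD_COMMIT:
		if (!after_aggregation)
			return 0;		/* leave it pending, retry later */
		printf("commit handled (active=%d)\n", active);
		break;
	case CMD_UPDATE_EFFECTIVE_QUOTAS:
		printf("effective quotas updated\n");
		break;
	default:
		break;
	}
	pending = CMD_NONE;
	return 0;
}

int main(void)
{
	request_callback(true, false);	/* after_sampling: commit stays pending */
	request_callback(true, true);	/* after_aggregation: commit handled */
	return 0;
}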
diff --git a/mm/debug.c b/mm/debug.c
index ee533a5ceb..c1c1a6a484 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -51,87 +51,105 @@ const struct trace_print_flags vmaflag_names[] = {
{0, NULL}
};
-static void __dump_page(struct page *page)
+static void __dump_folio(struct folio *folio, struct page *page,
+ unsigned long pfn, unsigned long idx)
{
- struct folio *folio = page_folio(page);
- struct page *head = &folio->page;
- struct address_space *mapping;
- bool compound = PageCompound(page);
- /*
- * Accessing the pageblock without the zone lock. It could change to
- * "isolate" again in the meantime, but since we are just dumping the
- * state for debugging, it should be fine to accept a bit of
- * inaccuracy here due to racing.
- */
- bool page_cma = is_migrate_cma_page(page);
- int mapcount;
+ struct address_space *mapping = folio_mapping(folio);
+ int mapcount = 0;
char *type = "";
- if (page < head || (page >= head + MAX_ORDER_NR_PAGES)) {
- /*
- * Corrupt page, so we cannot call page_mapping. Instead, do a
- * safe subset of the steps that page_mapping() does. Caution:
- * this will be misleading for tail pages, PageSwapCache pages,
- * and potentially other situations. (See the page_mapping()
- * implementation for what's missing here.)
- */
- unsigned long tmp = (unsigned long)page->mapping;
-
- if (tmp & PAGE_MAPPING_ANON)
- mapping = NULL;
- else
- mapping = (void *)(tmp & ~PAGE_MAPPING_FLAGS);
- head = page;
- folio = (struct folio *)page;
- compound = false;
- } else {
- mapping = page_mapping(page);
- }
-
/*
- * Avoid VM_BUG_ON() in page_mapcount().
- * page->_mapcount space in struct page is used by sl[aou]b pages to
- * encode own info.
+ * page->_mapcount space in struct page is used by slab pages to
+ * encode own info, and we must avoid calling page_folio() again.
*/
- mapcount = PageSlab(head) ? 0 : page_mapcount(page);
-
- pr_warn("page:%p refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n",
- page, page_ref_count(head), mapcount, mapping,
- page_to_pgoff(page), page_to_pfn(page));
- if (compound) {
- pr_warn("head:%p order:%u entire_mapcount:%d nr_pages_mapped:%d pincount:%d\n",
- head, compound_order(head),
+ if (!folio_test_slab(folio)) {
+ mapcount = atomic_read(&page->_mapcount) + 1;
+ if (folio_test_large(folio))
+ mapcount += folio_entire_mapcount(folio);
+ }
+
+ pr_warn("page: refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n",
+ folio_ref_count(folio), mapcount, mapping,
+ folio->index + idx, pfn);
+ if (folio_test_large(folio)) {
+ pr_warn("head: order:%u entire_mapcount:%d nr_pages_mapped:%d pincount:%d\n",
+ folio_order(folio),
folio_entire_mapcount(folio),
folio_nr_pages_mapped(folio),
atomic_read(&folio->_pincount));
}
#ifdef CONFIG_MEMCG
- if (head->memcg_data)
- pr_warn("memcg:%lx\n", head->memcg_data);
+ if (folio->memcg_data)
+ pr_warn("memcg:%lx\n", folio->memcg_data);
#endif
- if (PageKsm(page))
+ if (folio_test_ksm(folio))
type = "ksm ";
- else if (PageAnon(page))
+ else if (folio_test_anon(folio))
type = "anon ";
else if (mapping)
dump_mapping(mapping);
BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);
- pr_warn("%sflags: %pGp%s\n", type, &head->flags,
- page_cma ? " CMA" : "");
- pr_warn("page_type: %pGt\n", &head->page_type);
+ /*
+ * Accessing the pageblock without the zone lock. It could change to
+ * "isolate" again in the meantime, but since we are just dumping the
+ * state for debugging, it should be fine to accept a bit of
+ * inaccuracy here due to racing.
+ */
+ pr_warn("%sflags: %pGp%s\n", type, &folio->flags,
+ is_migrate_cma_folio(folio, pfn) ? " CMA" : "");
+ pr_warn("page_type: %pGt\n", &folio->page.page_type);
print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32,
sizeof(unsigned long), page,
sizeof(struct page), false);
- if (head != page)
+ if (folio_test_large(folio))
print_hex_dump(KERN_WARNING, "head: ", DUMP_PREFIX_NONE, 32,
- sizeof(unsigned long), head,
- sizeof(struct page), false);
+ sizeof(unsigned long), folio,
+ 2 * sizeof(struct page), false);
+}
+
+static void __dump_page(const struct page *page)
+{
+ struct folio *foliop, folio;
+ struct page precise;
+ unsigned long pfn = page_to_pfn(page);
+ unsigned long idx, nr_pages = 1;
+ int loops = 5;
+
+again:
+ memcpy(&precise, page, sizeof(*page));
+ foliop = page_folio(&precise);
+ if (foliop == (struct folio *)&precise) {
+ idx = 0;
+ if (!folio_test_large(foliop))
+ goto dump;
+ foliop = (struct folio *)page;
+ } else {
+ idx = folio_page_idx(foliop, page);
+ }
+
+ if (idx < MAX_FOLIO_NR_PAGES) {
+ memcpy(&folio, foliop, 2 * sizeof(struct page));
+ nr_pages = folio_nr_pages(&folio);
+ foliop = &folio;
+ }
+
+ if (idx > nr_pages) {
+ if (loops-- > 0)
+ goto again;
+ pr_warn("page does not match folio\n");
+ precise.compound_head &= ~1UL;
+ foliop = (struct folio *)&precise;
+ idx = 0;
+ }
+
+dump:
+ __dump_folio(foliop, &precise, pfn, idx);
}
-void dump_page(struct page *page, const char *reason)
+void dump_page(const struct page *page, const char *reason)
{
if (PagePoisoned(page))
pr_warn("page:%p is uninitialized and poisoned", page);
diff --git a/mm/filemap.c b/mm/filemap.c
index b5e8dd536a..30de18c4fd 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -124,6 +124,15 @@
* ->private_lock (zap_pte_range->block_dirty_folio)
*/
+static void mapping_set_update(struct xa_state *xas,
+ struct address_space *mapping)
+{
+ if (dax_mapping(mapping) || shmem_mapping(mapping))
+ return;
+ xas_set_update(xas, workingset_update_node);
+ xas_set_lru(xas, &shadow_nodes);
+}
+
static void page_cache_delete(struct address_space *mapping,
struct folio *folio, void *shadow)
{
@@ -843,7 +852,7 @@ noinline int __filemap_add_folio(struct address_space *mapping,
struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
{
XA_STATE(xas, &mapping->i_pages, index);
- int huge = folio_test_hugetlb(folio);
+ bool huge = folio_test_hugetlb(folio);
bool charged = false;
long nr = 1;
@@ -1354,7 +1363,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl)
unsigned long pflags;
bool in_thrashing;
wait_queue_head_t *q;
- struct folio *folio = page_folio(pfn_swap_entry_to_page(entry));
+ struct folio *folio = pfn_swap_entry_folio(entry);
q = folio_waitqueue(folio);
if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
@@ -1912,8 +1921,6 @@ no_page:
gfp_t alloc_gfp = gfp;
err = -ENOMEM;
- if (order == 1)
- order = 0;
if (order > 0)
alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
folio = filemap_alloc_folio(alloc_gfp, order);
@@ -2609,15 +2616,6 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
/*
- * Pairs with a barrier in
- * block_write_end()->mark_buffer_dirty() or other page
- * dirtying routines like iomap_write_end() to ensure
- * changes to page contents are visible before we see
- * increased inode size.
- */
- smp_rmb();
-
- /*
* Once we start copying data, we don't want to be touching any
* cachelines that might be contended:
*/
@@ -3183,6 +3181,48 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
return fpin;
}
+static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ vm_fault_t ret = 0;
+ pte_t *ptep;
+
+ /*
+ * We might have COW'ed a pagecache folio and might now have an mlocked
+ * anon folio mapped. The original pagecache folio is not mlocked and
+ * might have been evicted. During a read+clear/modify/write update of
+ * the PTE, such as done in do_numa_page()/change_pte_range(), we
+ * temporarily clear the PTE under PT lock and might detect it here as
+ * "none" when not holding the PT lock.
+ *
+ * Not rechecking the PTE under PT lock could result in an unexpected
+ * major fault in an mlock'ed region. Recheck only for this special
+ * scenario while holding the PT lock, to not degrade non-mlocked
+ * scenarios. Recheck the PTE without PT lock firstly, thereby reducing
+ * the number of times we hold PT lock.
+ */
+ if (!(vma->vm_flags & VM_LOCKED))
+ return 0;
+
+ if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
+ return 0;
+
+ ptep = pte_offset_map(vmf->pmd, vmf->address);
+ if (unlikely(!ptep))
+ return VM_FAULT_NOPAGE;
+
+ if (unlikely(!pte_none(ptep_get_lockless(ptep)))) {
+ ret = VM_FAULT_NOPAGE;
+ } else {
+ spin_lock(vmf->ptl);
+ if (unlikely(!pte_none(ptep_get(ptep))))
+ ret = VM_FAULT_NOPAGE;
+ spin_unlock(vmf->ptl);
+ }
+ pte_unmap(ptep);
+ return ret;
+}
+
/**
* filemap_fault - read in file data for page fault handling
* @vmf: struct vm_fault containing details of the fault
@@ -3238,6 +3278,10 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
mapping_locked = true;
}
} else {
+ ret = filemap_fault_recheck_pte_none(vmf);
+ if (unlikely(ret))
+ return ret;
+
/* No page in the page cache at all */
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
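
The comment block above spells out the technique: read the PTE locklessly first, and take the PT lock only to confirm a still-none result, so the lock is untouched in the common case. The same check-without-lock / confirm-under-lock shape, reduced to a standalone pthreads sketch with made-up names (not kernel API):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int slot;	/* stands in for the PTE value */

/* Return true only if @slot is really still zero: a cheap unlocked read
 * filters out the common case, then the lock makes the check authoritative. */
static bool slot_is_none(void)
{
	bool none;

	if (__atomic_load_n(&slot, __ATOMIC_RELAXED) != 0)
		return false;		/* already changed, no lock taken */

	pthread_mutex_lock(&lock);
	none = (slot == 0);		/* recheck under the lock */
	pthread_mutex_unlock(&lock);
	return none;
}

int main(void)
{
	printf("none? %d\n", slot_is_none());
	slot = 42;
	printf("none? %d\n", slot_is_none());
	return 0;
}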
diff --git a/mm/gup.c b/mm/gup.c
index f6d5563574..1611e73b11 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1669,20 +1669,22 @@ long populate_vma_page_range(struct vm_area_struct *vma,
if (vma->vm_flags & VM_LOCKONFAULT)
return nr_pages;
+ /* ... similarly, we've never faulted in PROT_NONE pages */
+ if (!vma_is_accessible(vma))
+ return -EFAULT;
+
gup_flags = FOLL_TOUCH;
/*
* We want to touch writable mappings with a write fault in order
* to break COW, except for shared mappings because these don't COW
* and we would not want to dirty them for nothing.
+ *
+ * Otherwise, do a read fault, and use FOLL_FORCE in case it's not
+ * readable (ie write-only or executable).
*/
if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
gup_flags |= FOLL_WRITE;
-
- /*
- * We want mlock to succeed for regions that have any permissions
- * other than PROT_NONE.
- */
- if (vma_is_accessible(vma))
+ else
gup_flags |= FOLL_FORCE;
if (locked)
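
To make the new control flow above concrete: populate_vma_page_range() now bails out early for inaccessible (PROT_NONE-style) VMAs, requests a write fault only for exclusively writable private mappings (to break COW without dirtying shared pages), and otherwise falls back to a read fault with FOLL_FORCE so write-only or exec-only mappings can still be populated. A small self-contained sketch of that flag selection, using illustrative bit values rather than the kernel's definitions:

#include <stdio.h>

/* Illustrative stand-ins for the kernel's vm_flags and FOLL_* bits. */
#define VM_READ		0x1
#define VM_WRITE	0x2
#define VM_EXEC		0x4
#define VM_SHARED	0x8
#define FOLL_TOUCH	0x10
#define FOLL_WRITE	0x20
#define FOLL_FORCE	0x40

static int pick_gup_flags(unsigned long vm_flags, int *err)
{
	int gup_flags = FOLL_TOUCH;

	*err = 0;
	if (!(vm_flags & (VM_READ | VM_WRITE | VM_EXEC))) {
		*err = -1;			/* -EFAULT in the kernel */
		return 0;
	}
	if ((vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
		gup_flags |= FOLL_WRITE;	/* private writable: break COW */
	else
		gup_flags |= FOLL_FORCE;	/* read fault even if not readable */
	return gup_flags;
}

int main(void)
{
	int err;

	printf("private rw: %#x err=%d\n", pick_gup_flags(VM_READ | VM_WRITE, &err), err);
	printf("shared rw:  %#x err=%d\n", pick_gup_flags(VM_READ | VM_WRITE | VM_SHARED, &err), err);
	printf("prot_none:  %#x err=%d\n", pick_gup_flags(0, &err), err);
	return 0;
}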
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6790f93fda..769e8a125f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -790,8 +790,10 @@ struct deferred_split *get_deferred_split_queue(struct folio *folio)
void folio_prep_large_rmappable(struct folio *folio)
{
- VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio);
- INIT_LIST_HEAD(&folio->_deferred_list);
+ if (!folio || !folio_test_large(folio))
+ return;
+ if (folio_order(folio) > 1)
+ INIT_LIST_HEAD(&folio->_deferred_list);
folio_set_large_rmappable(folio);
}
@@ -1905,12 +1907,14 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
zap_deposited_table(tlb->mm, pmd);
spin_unlock(ptl);
} else {
- struct page *page = NULL;
+ struct folio *folio = NULL;
int flush_needed = 1;
if (pmd_present(orig_pmd)) {
- page = pmd_page(orig_pmd);
- folio_remove_rmap_pmd(page_folio(page), page, vma);
+ struct page *page = pmd_page(orig_pmd);
+
+ folio = page_folio(page);
+ folio_remove_rmap_pmd(folio, page, vma);
VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
VM_BUG_ON_PAGE(!PageHead(page), page);
} else if (thp_migration_supported()) {
@@ -1918,23 +1922,24 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
entry = pmd_to_swp_entry(orig_pmd);
- page = pfn_swap_entry_to_page(entry);
+ folio = pfn_swap_entry_folio(entry);
flush_needed = 0;
} else
WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
- if (PageAnon(page)) {
+ if (folio_test_anon(folio)) {
zap_deposited_table(tlb->mm, pmd);
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
} else {
if (arch_needs_pgtable_deposit())
zap_deposited_table(tlb->mm, pmd);
- add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR);
+ add_mm_counter(tlb->mm, mm_counter_file(folio),
+ -HPAGE_PMD_NR);
}
spin_unlock(ptl);
if (flush_needed)
- tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
+ tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
}
return 1;
}
@@ -2045,7 +2050,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
if (is_swap_pmd(*pmd)) {
swp_entry_t entry = pmd_to_swp_entry(*pmd);
- struct folio *folio = page_folio(pfn_swap_entry_to_page(entry));
+ struct folio *folio = pfn_swap_entry_folio(entry);
pmd_t newpmd;
VM_BUG_ON(!is_pmd_migration_entry(*pmd));
@@ -2155,7 +2160,7 @@ unlock:
#ifdef CONFIG_USERFAULTFD
/*
- * The PT lock for src_pmd and the mmap_lock for reading are held by
+ * The PT lock for src_pmd and dst_vma/src_vma (for reading) are locked by
* the caller, but it must return after releasing the page_table_lock.
* Just move the page from src_pmd to dst_pmd if possible.
* Return zero if succeeded in moving the page, -EAGAIN if it needs to be
@@ -2178,7 +2183,8 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
src_ptl = pmd_lockptr(mm, src_pmd);
lockdep_assert_held(src_ptl);
- mmap_assert_locked(mm);
+ vma_assert_locked(src_vma);
+ vma_assert_locked(dst_vma);
/* Sanity checks before the operation */
if (WARN_ON_ONCE(!pmd_none(dst_pmdval)) || WARN_ON_ONCE(src_addr & ~HPAGE_PMD_MASK) ||
@@ -2197,13 +2203,18 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
}
src_page = pmd_page(src_pmdval);
- if (unlikely(!PageAnonExclusive(src_page))) {
- spin_unlock(src_ptl);
- return -EBUSY;
- }
- src_folio = page_folio(src_page);
- folio_get(src_folio);
+ if (!is_huge_zero_pmd(src_pmdval)) {
+ if (unlikely(!PageAnonExclusive(src_page))) {
+ spin_unlock(src_ptl);
+ return -EBUSY;
+ }
+
+ src_folio = page_folio(src_page);
+ folio_get(src_folio);
+ } else
+ src_folio = NULL;
+
spin_unlock(src_ptl);
flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE);
@@ -2211,19 +2222,22 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
src_addr + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
- folio_lock(src_folio);
+ if (src_folio) {
+ folio_lock(src_folio);
- /*
- * split_huge_page walks the anon_vma chain without the page
- * lock. Serialize against it with the anon_vma lock, the page
- * lock is not enough.
- */
- src_anon_vma = folio_get_anon_vma(src_folio);
- if (!src_anon_vma) {
- err = -EAGAIN;
- goto unlock_folio;
- }
- anon_vma_lock_write(src_anon_vma);
+ /*
+ * split_huge_page walks the anon_vma chain without the page
+ * lock. Serialize against it with the anon_vma lock, the page
+ * lock is not enough.
+ */
+ src_anon_vma = folio_get_anon_vma(src_folio);
+ if (!src_anon_vma) {
+ err = -EAGAIN;
+ goto unlock_folio;
+ }
+ anon_vma_lock_write(src_anon_vma);
+ } else
+ src_anon_vma = NULL;
dst_ptl = pmd_lockptr(mm, dst_pmd);
double_pt_lock(src_ptl, dst_ptl);
@@ -2232,45 +2246,54 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
err = -EAGAIN;
goto unlock_ptls;
}
- if (folio_maybe_dma_pinned(src_folio) ||
- !PageAnonExclusive(&src_folio->page)) {
- err = -EBUSY;
- goto unlock_ptls;
- }
+ if (src_folio) {
+ if (folio_maybe_dma_pinned(src_folio) ||
+ !PageAnonExclusive(&src_folio->page)) {
+ err = -EBUSY;
+ goto unlock_ptls;
+ }
- if (WARN_ON_ONCE(!folio_test_head(src_folio)) ||
- WARN_ON_ONCE(!folio_test_anon(src_folio))) {
- err = -EBUSY;
- goto unlock_ptls;
- }
+ if (WARN_ON_ONCE(!folio_test_head(src_folio)) ||
+ WARN_ON_ONCE(!folio_test_anon(src_folio))) {
+ err = -EBUSY;
+ goto unlock_ptls;
+ }
- src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
- /* Folio got pinned from under us. Put it back and fail the move. */
- if (folio_maybe_dma_pinned(src_folio)) {
- set_pmd_at(mm, src_addr, src_pmd, src_pmdval);
- err = -EBUSY;
- goto unlock_ptls;
- }
+ src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
+ /* Folio got pinned from under us. Put it back and fail the move. */
+ if (folio_maybe_dma_pinned(src_folio)) {
+ set_pmd_at(mm, src_addr, src_pmd, src_pmdval);
+ err = -EBUSY;
+ goto unlock_ptls;
+ }
- folio_move_anon_rmap(src_folio, dst_vma);
- WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr));
+ folio_move_anon_rmap(src_folio, dst_vma);
+ WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr));
- _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
- /* Follow mremap() behavior and treat the entry dirty after the move */
- _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
+ _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
+ /* Follow mremap() behavior and treat the entry dirty after the move */
+ _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
+ } else {
+ src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
+ _dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot);
+ }
set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd);
src_pgtable = pgtable_trans_huge_withdraw(mm, src_pmd);
pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable);
unlock_ptls:
double_pt_unlock(src_ptl, dst_ptl);
- anon_vma_unlock_write(src_anon_vma);
- put_anon_vma(src_anon_vma);
+ if (src_anon_vma) {
+ anon_vma_unlock_write(src_anon_vma);
+ put_anon_vma(src_anon_vma);
+ }
unlock_folio:
/* unblock rmap walks */
- folio_unlock(src_folio);
+ if (src_folio)
+ folio_unlock(src_folio);
mmu_notifier_invalidate_range_end(&range);
- folio_put(src_folio);
+ if (src_folio)
+ folio_put(src_folio);
return err;
}
#endif /* CONFIG_USERFAULTFD */
@@ -2442,7 +2465,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
swp_entry_t entry;
entry = pmd_to_swp_entry(old_pmd);
- page = pfn_swap_entry_to_page(entry);
+ folio = pfn_swap_entry_folio(entry);
} else {
page = pmd_page(old_pmd);
folio = page_folio(page);
@@ -2453,7 +2476,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
folio_remove_rmap_pmd(folio, page, vma);
folio_put(folio);
}
- add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
+ add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR);
return;
}
@@ -2470,32 +2493,11 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
return __split_huge_zero_page_pmd(vma, haddr, pmd);
}
- /*
- * Up to this point the pmd is present and huge and userland has the
- * whole access to the hugepage during the split (which happens in
- * place). If we overwrite the pmd with the not-huge version pointing
- * to the pte here (which of course we could if all CPUs were bug
- * free), userland could trigger a small page size TLB miss on the
- * small sized TLB while the hugepage TLB entry is still established in
- * the huge TLB. Some CPU doesn't like that.
- * See http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
- * 383 on page 105. Intel should be safe but is also warns that it's
- * only safe if the permission and cache attributes of the two entries
- * loaded in the two TLB is identical (which should be the case here).
- * But it is generally safer to never allow small and huge TLB entries
- * for the same virtual address to be loaded simultaneously. So instead
- * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
- * current pmd notpresent (atomically because here the pmd_trans_huge
- * must remain set at all times on the pmd until the split is complete
- * for this pmd), then we flush the SMP TLB and finally we write the
- * non-huge version of the pmd entry with pmd_populate.
- */
- old_pmd = pmdp_invalidate(vma, haddr, pmd);
-
- pmd_migration = is_pmd_migration_entry(old_pmd);
+ pmd_migration = is_pmd_migration_entry(*pmd);
if (unlikely(pmd_migration)) {
swp_entry_t entry;
+ old_pmd = *pmd;
entry = pmd_to_swp_entry(old_pmd);
page = pfn_swap_entry_to_page(entry);
write = is_writable_migration_entry(entry);
@@ -2506,6 +2508,30 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
soft_dirty = pmd_swp_soft_dirty(old_pmd);
uffd_wp = pmd_swp_uffd_wp(old_pmd);
} else {
+ /*
+ * Up to this point the pmd is present and huge and userland has
+ * the whole access to the hugepage during the split (which
+ * happens in place). If we overwrite the pmd with the not-huge
+ * version pointing to the pte here (which of course we could if
+ * all CPUs were bug free), userland could trigger a small page
+ * size TLB miss on the small sized TLB while the hugepage TLB
+ * entry is still established in the huge TLB. Some CPU doesn't
+ * like that. See
+ * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
+ * 383 on page 105. Intel should be safe but is also warns that
+ * it's only safe if the permission and cache attributes of the
+ * two entries loaded in the two TLB is identical (which should
+ * be the case here). But it is generally safer to never allow
+ * small and huge TLB entries for the same virtual address to be
+ * loaded simultaneously. So instead of doing "pmd_populate();
+ * flush_pmd_tlb_range();" we first mark the current pmd
+ * notpresent (atomically because here the pmd_trans_huge must
+ * remain set at all times on the pmd until the split is
+ * complete for this pmd), then we flush the SMP TLB and finally
+ * we write the non-huge version of the pmd entry with
+ * pmd_populate.
+ */
+ old_pmd = pmdp_invalidate(vma, haddr, pmd);
page = pmd_page(old_pmd);
folio = page_folio(page);
if (pmd_dirty(old_pmd)) {
@@ -2559,15 +2585,16 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
pte = pte_offset_map(&_pmd, haddr);
VM_BUG_ON(!pte);
- for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
- pte_t entry;
- /*
- * Note that NUMA hinting access restrictions are not
- * transferred to avoid any possibility of altering
- * permissions across VMAs.
- */
- if (freeze || pmd_migration) {
+
+ /*
+ * Note that NUMA hinting access restrictions are not transferred to
+ * avoid any possibility of altering permissions across VMAs.
+ */
+ if (freeze || pmd_migration) {
+ for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
+ pte_t entry;
swp_entry_t swp_entry;
+
if (write)
swp_entry = make_writable_migration_entry(
page_to_pfn(page + i));
@@ -2586,25 +2613,32 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
entry = pte_swp_mksoft_dirty(entry);
if (uffd_wp)
entry = pte_swp_mkuffd_wp(entry);
- } else {
- entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
- if (write)
- entry = pte_mkwrite(entry, vma);
- if (!young)
- entry = pte_mkold(entry);
- /* NOTE: this may set soft-dirty too on some archs */
- if (dirty)
- entry = pte_mkdirty(entry);
- if (soft_dirty)
- entry = pte_mksoft_dirty(entry);
- if (uffd_wp)
- entry = pte_mkuffd_wp(entry);
+
+ VM_WARN_ON(!pte_none(ptep_get(pte + i)));
+ set_pte_at(mm, addr, pte + i, entry);
}
- VM_BUG_ON(!pte_none(ptep_get(pte)));
- set_pte_at(mm, addr, pte, entry);
- pte++;
+ } else {
+ pte_t entry;
+
+ entry = mk_pte(page, READ_ONCE(vma->vm_page_prot));
+ if (write)
+ entry = pte_mkwrite(entry, vma);
+ if (!young)
+ entry = pte_mkold(entry);
+ /* NOTE: this may set soft-dirty too on some archs */
+ if (dirty)
+ entry = pte_mkdirty(entry);
+ if (soft_dirty)
+ entry = pte_mksoft_dirty(entry);
+ if (uffd_wp)
+ entry = pte_mkuffd_wp(entry);
+
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ VM_WARN_ON(!pte_none(ptep_get(pte + i)));
+
+ set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR);
}
- pte_unmap(pte - 1);
+ pte_unmap(pte);
if (!pmd_migration)
folio_remove_rmap_pmd(folio, page, vma);
@@ -2698,11 +2732,14 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
static void unmap_folio(struct folio *folio)
{
- enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
- TTU_SYNC | TTU_BATCH_FLUSH;
+ enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SYNC |
+ TTU_BATCH_FLUSH;
VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
+ if (folio_test_pmd_mappable(folio))
+ ttu_flags |= TTU_SPLIT_HUGE_PMD;
+
/*
* Anon pages need migration entries to preserve them, but file
* pages can simply be left unmapped, then faulted back on demand.
@@ -2736,7 +2773,6 @@ static void lru_add_page_tail(struct page *head, struct page *tail,
struct lruvec *lruvec, struct list_head *list)
{
VM_BUG_ON_PAGE(!PageHead(head), head);
- VM_BUG_ON_PAGE(PageCompound(tail), head);
VM_BUG_ON_PAGE(PageLRU(tail), head);
lockdep_assert_held(&lruvec->lru_lock);
@@ -2757,7 +2793,8 @@ static void lru_add_page_tail(struct page *head, struct page *tail,
}
static void __split_huge_page_tail(struct folio *folio, int tail,
- struct lruvec *lruvec, struct list_head *list)
+ struct lruvec *lruvec, struct list_head *list,
+ unsigned int new_order)
{
struct page *head = &folio->page;
struct page *page_tail = head + tail;
@@ -2827,10 +2864,15 @@ static void __split_huge_page_tail(struct folio *folio, int tail,
* which needs correct compound_head().
*/
clear_compound_head(page_tail);
+ if (new_order) {
+ prep_compound_page(page_tail, new_order);
+ folio_prep_large_rmappable(new_folio);
+ }
/* Finally unfreeze refcount. Additional reference from page cache. */
- page_ref_unfreeze(page_tail, 1 + (!folio_test_anon(folio) ||
- folio_test_swapcache(folio)));
+ page_ref_unfreeze(page_tail,
+ 1 + ((!folio_test_anon(folio) || folio_test_swapcache(folio)) ?
+ folio_nr_pages(new_folio) : 0));
if (folio_test_young(folio))
folio_set_young(new_folio);
@@ -2848,18 +2890,20 @@ static void __split_huge_page_tail(struct folio *folio, int tail,
}
static void __split_huge_page(struct page *page, struct list_head *list,
- pgoff_t end)
+ pgoff_t end, unsigned int new_order)
{
struct folio *folio = page_folio(page);
struct page *head = &folio->page;
struct lruvec *lruvec;
struct address_space *swap_cache = NULL;
unsigned long offset = 0;
- unsigned int nr = thp_nr_pages(head);
int i, nr_dropped = 0;
+ unsigned int new_nr = 1 << new_order;
+ int order = folio_order(folio);
+ unsigned int nr = 1 << order;
/* complete memcg works before add pages to LRU */
- split_page_memcg(head, nr);
+ split_page_memcg(head, order, new_order);
if (folio_test_anon(folio) && folio_test_swapcache(folio)) {
offset = swp_offset(folio->swap);
@@ -2872,13 +2916,13 @@ static void __split_huge_page(struct page *page, struct list_head *list,
ClearPageHasHWPoisoned(head);
- for (i = nr - 1; i >= 1; i--) {
- __split_huge_page_tail(folio, i, lruvec, list);
+ for (i = nr - new_nr; i >= new_nr; i -= new_nr) {
+ __split_huge_page_tail(folio, i, lruvec, list, new_order);
/* Some pages can be beyond EOF: drop them from page cache */
if (head[i].index >= end) {
struct folio *tail = page_folio(head + i);
- if (shmem_mapping(head->mapping))
+ if (shmem_mapping(folio->mapping))
nr_dropped++;
else if (folio_test_clear_dirty(tail))
folio_account_cleaned(tail,
@@ -2886,7 +2930,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
__filemap_remove_folio(tail, NULL);
folio_put(tail);
} else if (!PageAnon(page)) {
- __xa_store(&head->mapping->i_pages, head[i].index,
+ __xa_store(&folio->mapping->i_pages, head[i].index,
head + i, 0);
} else if (swap_cache) {
__xa_store(&swap_cache->i_pages, offset + i,
@@ -2894,40 +2938,55 @@ static void __split_huge_page(struct page *page, struct list_head *list,
}
}
- ClearPageCompound(head);
+ if (!new_order)
+ ClearPageCompound(head);
+ else {
+ struct folio *new_folio = (struct folio *)head;
+
+ folio_set_order(new_folio, new_order);
+ }
unlock_page_lruvec(lruvec);
/* Caller disabled irqs, so they are still disabled here */
- split_page_owner(head, nr);
+ split_page_owner(head, order, new_order);
/* See comment in __split_huge_page_tail() */
- if (PageAnon(head)) {
+ if (folio_test_anon(folio)) {
/* Additional pin to swap cache */
- if (PageSwapCache(head)) {
- page_ref_add(head, 2);
+ if (folio_test_swapcache(folio)) {
+ folio_ref_add(folio, 1 + new_nr);
xa_unlock(&swap_cache->i_pages);
} else {
- page_ref_inc(head);
+ folio_ref_inc(folio);
}
} else {
/* Additional pin to page cache */
- page_ref_add(head, 2);
- xa_unlock(&head->mapping->i_pages);
+ folio_ref_add(folio, 1 + new_nr);
+ xa_unlock(&folio->mapping->i_pages);
}
local_irq_enable();
if (nr_dropped)
- shmem_uncharge(head->mapping->host, nr_dropped);
+ shmem_uncharge(folio->mapping->host, nr_dropped);
remap_page(folio, nr);
if (folio_test_swapcache(folio))
split_swap_cluster(folio->swap);
- for (i = 0; i < nr; i++) {
+ /*
+ * set page to its compound_head when split to non order-0 pages, so
+ * we can skip unlocking it below, since PG_locked is transferred to
+ * the compound_head of the page and the caller will unlock it.
+ */
+ if (new_order)
+ page = compound_head(page);
+
+ for (i = 0; i < nr; i += new_nr) {
struct page *subpage = head + i;
+ struct folio *new_folio = page_folio(subpage);
if (subpage == page)
continue;
- unlock_page(subpage);
+ folio_unlock(new_folio);
/*
* Subpages may be freed if there wasn't any mapping
@@ -2957,29 +3016,36 @@ bool can_split_folio(struct folio *folio, int *pextra_pins)
}
/*
- * This function splits huge page into normal pages. @page can point to any
- * subpage of huge page to split. Split doesn't change the position of @page.
+ * This function splits huge page into pages in @new_order. @page can point to
+ * any subpage of huge page to split. Split doesn't change the position of
+ * @page.
+ *
+ * NOTE: order-1 anonymous folio is not supported because _deferred_list,
+ * which is used by partially mapped folios, is stored in subpage 2 and an
+ * order-1 folio only has subpage 0 and 1. File-backed order-1 folios are OK,
+ * since they do not use _deferred_list.
*
* Only caller must hold pin on the @page, otherwise split fails with -EBUSY.
* The huge page must be locked.
*
* If @list is null, tail pages will be added to LRU list, otherwise, to @list.
*
- * Both head page and tail pages will inherit mapping, flags, and so on from
- * the hugepage.
+ * Pages in new_order will inherit mapping, flags, and so on from the hugepage.
*
- * GUP pin and PG_locked transferred to @page. Rest subpages can be freed if
- * they are not mapped.
+ * GUP pin and PG_locked transferred to @page or the compound page @page belongs
+ * to. Rest subpages can be freed if they are not mapped.
*
* Returns 0 if the hugepage is split successfully.
* Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
* us.
*/
-int split_huge_page_to_list(struct page *page, struct list_head *list)
+int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
+ unsigned int new_order)
{
struct folio *folio = page_folio(page);
struct deferred_split *ds_queue = get_deferred_split_queue(folio);
- XA_STATE(xas, &folio->mapping->i_pages, folio->index);
+ /* reset xarray order to new order after split */
+ XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
int extra_pins, ret;
@@ -2989,6 +3055,40 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
+ if (new_order >= folio_order(folio))
+ return -EINVAL;
+
+ if (folio_test_anon(folio)) {
+ /* order-1 is not supported for anonymous THP. */
+ if (new_order == 1) {
+ VM_WARN_ONCE(1, "Cannot split to order-1 folio");
+ return -EINVAL;
+ }
+ } else if (new_order) {
+ /* Split shmem folio to non-zero order not supported */
+ if (shmem_mapping(folio->mapping)) {
+ VM_WARN_ONCE(1,
+ "Cannot split shmem folio to non-0 order");
+ return -EINVAL;
+ }
+ /*
+ * No split if the file system does not support large folio.
+ * Note that we might still have THPs in such mappings due to
+ * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping
+ * does not actually support large folios properly.
+ */
+ if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
+ !mapping_large_folio_support(folio->mapping)) {
+ VM_WARN_ONCE(1,
+ "Cannot split file folio to non-0 order");
+ return -EINVAL;
+ }
+ }
+
+ /* Only swapping a whole PMD-mapped folio is supported */
+ if (folio_test_swapcache(folio) && new_order)
+ return -EINVAL;
+
is_hzp = is_huge_zero_page(&folio->page);
if (is_hzp) {
pr_warn_ratelimited("Called split_huge_page for huge zero page\n");
@@ -3082,16 +3182,24 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
/* Prevent deferred_split_scan() touching ->_refcount */
spin_lock(&ds_queue->split_queue_lock);
if (folio_ref_freeze(folio, 1 + extra_pins)) {
- if (!list_empty(&folio->_deferred_list)) {
+ if (folio_order(folio) > 1 &&
+ !list_empty(&folio->_deferred_list)) {
ds_queue->split_queue_len--;
- list_del(&folio->_deferred_list);
+ /*
+ * Reinitialize page_deferred_list after removing the
+ * page from the split_queue, otherwise a subsequent
+ * split will see list corruption when checking the
+ * page_deferred_list.
+ */
+ list_del_init(&folio->_deferred_list);
}
spin_unlock(&ds_queue->split_queue_lock);
if (mapping) {
int nr = folio_nr_pages(folio);
xas_split(&xas, folio, folio_order(folio));
- if (folio_test_pmd_mappable(folio)) {
+ if (folio_test_pmd_mappable(folio) &&
+ new_order < HPAGE_PMD_ORDER) {
if (folio_test_swapbacked(folio)) {
__lruvec_stat_mod_folio(folio,
NR_SHMEM_THPS, -nr);
@@ -3103,7 +3211,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
}
}
- __split_huge_page(page, list, end);
+ __split_huge_page(page, list, end, new_order);
ret = 0;
} else {
spin_unlock(&ds_queue->split_queue_lock);
@@ -3133,6 +3241,9 @@ void folio_undo_large_rmappable(struct folio *folio)
struct deferred_split *ds_queue;
unsigned long flags;
+ if (folio_order(folio) <= 1)
+ return;
+
/*
* At this point, there is no one trying to add the folio to
* deferred_list. If folio is not in deferred_list, it's safe
@@ -3158,7 +3269,12 @@ void deferred_split_folio(struct folio *folio)
#endif
unsigned long flags;
- VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio);
+ /*
+ * Order 1 folios have no space for a deferred list, but we also
+ * won't waste much memory by not adding them to the deferred list.
+ */
+ if (folio_order(folio) <= 1)
+ return;
/*
* The try_to_unmap() in page reclaim path might reach here too,
@@ -3316,7 +3432,7 @@ static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
}
static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
- unsigned long vaddr_end)
+ unsigned long vaddr_end, unsigned int new_order)
{
int ret = 0;
struct task_struct *task;
@@ -3379,14 +3495,23 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
if (!is_transparent_hugepage(folio))
goto next;
+ if (new_order >= folio_order(folio))
+ goto next;
+
total++;
- if (!can_split_folio(folio, NULL))
+ /*
+ * For folios with private, split_huge_page_to_list_to_order()
+ * will try to drop it before split and then check if the folio
+ * can be split or not. So skip the check here.
+ */
+ if (!folio_test_private(folio) &&
+ !can_split_folio(folio, NULL))
goto next;
if (!folio_trylock(folio))
goto next;
- if (!split_folio(folio))
+ if (!split_folio_to_order(folio, new_order))
split++;
folio_unlock(folio);
@@ -3404,7 +3529,7 @@ out:
}
static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
- pgoff_t off_end)
+ pgoff_t off_end, unsigned int new_order)
{
struct filename *file;
struct file *candidate;
@@ -3440,10 +3565,13 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
total++;
nr_pages = folio_nr_pages(folio);
+ if (new_order >= folio_order(folio))
+ goto next;
+
if (!folio_trylock(folio))
goto next;
- if (!split_folio(folio))
+ if (!split_folio_to_order(folio, new_order))
split++;
folio_unlock(folio);
@@ -3468,10 +3596,14 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
{
static DEFINE_MUTEX(split_debug_mutex);
ssize_t ret;
- /* hold pid, start_vaddr, end_vaddr or file_path, off_start, off_end */
+ /*
+ * hold pid, start_vaddr, end_vaddr, new_order or
+ * file_path, off_start, off_end, new_order
+ */
char input_buf[MAX_INPUT_BUF_SZ];
int pid;
unsigned long vaddr_start, vaddr_end;
+ unsigned int new_order = 0;
ret = mutex_lock_interruptible(&split_debug_mutex);
if (ret)
@@ -3500,29 +3632,29 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
goto out;
}
- ret = sscanf(buf, "0x%lx,0x%lx", &off_start, &off_end);
- if (ret != 2) {
+ ret = sscanf(buf, "0x%lx,0x%lx,%d", &off_start, &off_end, &new_order);
+ if (ret != 2 && ret != 3) {
ret = -EINVAL;
goto out;
}
- ret = split_huge_pages_in_file(file_path, off_start, off_end);
+ ret = split_huge_pages_in_file(file_path, off_start, off_end, new_order);
if (!ret)
ret = input_len;
goto out;
}
- ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end);
+ ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d", &pid, &vaddr_start, &vaddr_end, &new_order);
if (ret == 1 && pid == 1) {
split_huge_pages_all();
ret = strlen(input_buf);
goto out;
- } else if (ret != 3) {
+ } else if (ret != 3 && ret != 4) {
ret = -EINVAL;
goto out;
}
- ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end);
+ ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order);
if (!ret)
ret = strlen(input_buf);
out:
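
With the extended parsing above, the split_huge_pages debugfs file takes an optional trailing new_order in both input forms: "<pid>,<vaddr_start>,<vaddr_end>[,<new_order>]" and "<file_path>,<off_start>,<off_end>[,<new_order>]". A hedged userspace example of driving it, assuming debugfs is mounted at /sys/kernel/debug and treating the PID and address range as placeholders to adapt:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Ask the kernel to split THPs mapped by PID 1234 in the given
	 * virtual address range down to order-2 folios (0 = base pages). */
	int fd = open("/sys/kernel/debug/split_huge_pages", O_WRONLY);

	if (fd < 0) {
		perror("open split_huge_pages");
		return 1;
	}
	dprintf(fd, "1234,0x7f0000000000,0x7f0000200000,2");
	close(fd);
	return 0;
}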
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c2b9ba3a54..c445e6fd85 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -35,6 +35,7 @@
#include <linux/delayacct.h>
#include <linux/memory.h>
#include <linux/mm_inline.h>
+#include <linux/padata.h>
#include <asm/page.h>
#include <asm/pgalloc.h>
@@ -68,7 +69,7 @@ static bool hugetlb_cma_folio(struct folio *folio, unsigned int order)
#endif
static unsigned long hugetlb_cma_size __initdata;
-__initdata LIST_HEAD(huge_boot_pages);
+__initdata struct list_head huge_boot_pages[MAX_NUMNODES];
/* for command line parsing */
static struct hstate * __initdata parsed_hstate;
@@ -1464,15 +1465,15 @@ static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
* next node from which to allocate, handling wrap at end of node
* mask.
*/
-static int hstate_next_node_to_alloc(struct hstate *h,
+static int hstate_next_node_to_alloc(int *next_node,
nodemask_t *nodes_allowed)
{
int nid;
VM_BUG_ON(!nodes_allowed);
- nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
- h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
+ nid = get_valid_node_allowed(*next_node, nodes_allowed);
+ *next_node = next_node_allowed(nid, nodes_allowed);
return nid;
}
@@ -1495,10 +1496,10 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
return nid;
}
-#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \
+#define for_each_node_mask_to_alloc(next_node, nr_nodes, node, mask) \
for (nr_nodes = nodes_weight(*mask); \
nr_nodes > 0 && \
- ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \
+ ((node = hstate_next_node_to_alloc(next_node, mask)) || 1); \
nr_nodes--)
#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
@@ -2334,12 +2335,13 @@ static void prep_and_add_allocated_folios(struct hstate *h,
*/
static struct folio *alloc_pool_huge_folio(struct hstate *h,
nodemask_t *nodes_allowed,
- nodemask_t *node_alloc_noretry)
+ nodemask_t *node_alloc_noretry,
+ int *next_node)
{
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
int nr_nodes, node;
- for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
+ for_each_node_mask_to_alloc(next_node, nr_nodes, node, nodes_allowed) {
struct folio *folio;
folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, node,
@@ -3013,21 +3015,9 @@ static int alloc_and_dissolve_hugetlb_folio(struct hstate *h,
{
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
int nid = folio_nid(old_folio);
- struct folio *new_folio;
+ struct folio *new_folio = NULL;
int ret = 0;
- /*
- * Before dissolving the folio, we need to allocate a new one for the
- * pool to remain stable. Here, we allocate the folio and 'prep' it
- * by doing everything but actually updating counters and adding to
- * the pool. This simplifies and let us do most of the processing
- * under the lock.
- */
- new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, NULL, NULL);
- if (!new_folio)
- return -ENOMEM;
- __prep_new_hugetlb_folio(h, new_folio);
-
retry:
spin_lock_irq(&hugetlb_lock);
if (!folio_test_hugetlb(old_folio)) {
@@ -3057,6 +3047,16 @@ retry:
cond_resched();
goto retry;
} else {
+ if (!new_folio) {
+ spin_unlock_irq(&hugetlb_lock);
+ new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid,
+ NULL, NULL);
+ if (!new_folio)
+ return -ENOMEM;
+ __prep_new_hugetlb_folio(h, new_folio);
+ goto retry;
+ }
+
/*
* Ok, old_folio is still a genuine free hugepage. Remove it from
* the freelist and decrease the counters. These will be
@@ -3084,9 +3084,11 @@ retry:
free_new:
spin_unlock_irq(&hugetlb_lock);
- /* Folio has a zero ref count, but needs a ref to be freed */
- folio_ref_unfreeze(new_folio, 1);
- update_and_free_hugetlb_folio(h, new_folio, false);
+ if (new_folio) {
+ /* Folio has a zero ref count, but needs a ref to be freed */
+ folio_ref_unfreeze(new_folio, 1);
+ update_and_free_hugetlb_folio(h, new_folio, false);
+ }
return ret;
}
@@ -3286,7 +3288,7 @@ int alloc_bootmem_huge_page(struct hstate *h, int nid)
int __alloc_bootmem_huge_page(struct hstate *h, int nid)
{
struct huge_bootmem_page *m = NULL; /* initialize for clang */
- int nr_nodes, node;
+ int nr_nodes, node = nid;
/* do node specific alloc */
if (nid != NUMA_NO_NODE) {
@@ -3297,7 +3299,7 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid)
goto found;
}
/* allocate from next node when distributing huge pages */
- for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
+ for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, &node_states[N_MEMORY]) {
m = memblock_alloc_try_nid_raw(
huge_page_size(h), huge_page_size(h),
0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
@@ -3324,7 +3326,7 @@ found:
huge_page_size(h) - PAGE_SIZE);
/* Put them into a private list first because mem_map is not up yet */
INIT_LIST_HEAD(&m->list);
- list_add(&m->list, &huge_boot_pages);
+ list_add(&m->list, &huge_boot_pages[node]);
m->hstate = h;
return 1;
}
@@ -3375,8 +3377,6 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h,
/* Send list for bulk vmemmap optimization processing */
hugetlb_vmemmap_optimize_folios(h, folio_list);
- /* Add all new pool pages to free lists in one lock cycle */
- spin_lock_irqsave(&hugetlb_lock, flags);
list_for_each_entry_safe(folio, tmp_f, folio_list, lru) {
if (!folio_test_hugetlb_vmemmap_optimized(folio)) {
/*
@@ -3389,23 +3389,25 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h,
HUGETLB_VMEMMAP_RESERVE_PAGES,
pages_per_huge_page(h));
}
+ /* Subdivide locks to achieve better parallel performance */
+ spin_lock_irqsave(&hugetlb_lock, flags);
__prep_account_new_huge_page(h, folio_nid(folio));
enqueue_hugetlb_folio(h, folio);
+ spin_unlock_irqrestore(&hugetlb_lock, flags);
}
- spin_unlock_irqrestore(&hugetlb_lock, flags);
}
/*
* Put bootmem huge pages into the standard lists after mem_map is up.
* Note: This only applies to gigantic (order > MAX_PAGE_ORDER) pages.
*/
-static void __init gather_bootmem_prealloc(void)
+static void __init gather_bootmem_prealloc_node(unsigned long nid)
{
LIST_HEAD(folio_list);
struct huge_bootmem_page *m;
struct hstate *h = NULL, *prev_h = NULL;
- list_for_each_entry(m, &huge_boot_pages, list) {
+ list_for_each_entry(m, &huge_boot_pages[nid], list) {
struct page *page = virt_to_page(m);
struct folio *folio = (void *)page;
@@ -3438,6 +3440,31 @@ static void __init gather_bootmem_prealloc(void)
prep_and_add_bootmem_folios(h, &folio_list);
}
+static void __init gather_bootmem_prealloc_parallel(unsigned long start,
+ unsigned long end, void *arg)
+{
+ int nid;
+
+ for (nid = start; nid < end; nid++)
+ gather_bootmem_prealloc_node(nid);
+}
+
+static void __init gather_bootmem_prealloc(void)
+{
+ struct padata_mt_job job = {
+ .thread_fn = gather_bootmem_prealloc_parallel,
+ .fn_arg = NULL,
+ .start = 0,
+ .size = num_node_state(N_MEMORY),
+ .align = 1,
+ .min_chunk = 1,
+ .max_threads = num_node_state(N_MEMORY),
+ .numa_aware = true,
+ };
+
+ padata_do_multithreaded(&job);
+}
+
static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
{
unsigned long i;
@@ -3469,6 +3496,108 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
h->max_huge_pages_node[nid] = i;
}
+static bool __init hugetlb_hstate_alloc_pages_specific_nodes(struct hstate *h)
+{
+ int i;
+ bool node_specific_alloc = false;
+
+ for_each_online_node(i) {
+ if (h->max_huge_pages_node[i] > 0) {
+ hugetlb_hstate_alloc_pages_onenode(h, i);
+ node_specific_alloc = true;
+ }
+ }
+
+ return node_specific_alloc;
+}
+
+static void __init hugetlb_hstate_alloc_pages_errcheck(unsigned long allocated, struct hstate *h)
+{
+ if (allocated < h->max_huge_pages) {
+ char buf[32];
+
+ string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
+ pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n",
+ h->max_huge_pages, buf, allocated);
+ h->max_huge_pages = allocated;
+ }
+}
+
+static void __init hugetlb_pages_alloc_boot_node(unsigned long start, unsigned long end, void *arg)
+{
+ struct hstate *h = (struct hstate *)arg;
+ int i, num = end - start;
+ nodemask_t node_alloc_noretry;
+ LIST_HEAD(folio_list);
+ int next_node = first_online_node;
+
+ /* Bit mask controlling how hard we retry per-node allocations.*/
+ nodes_clear(node_alloc_noretry);
+
+ for (i = 0; i < num; ++i) {
+ struct folio *folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY],
+ &node_alloc_noretry, &next_node);
+ if (!folio)
+ break;
+
+ list_move(&folio->lru, &folio_list);
+ cond_resched();
+ }
+
+ prep_and_add_allocated_folios(h, &folio_list);
+}
+
+static unsigned long __init hugetlb_gigantic_pages_alloc_boot(struct hstate *h)
+{
+ unsigned long i;
+
+ for (i = 0; i < h->max_huge_pages; ++i) {
+ if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE))
+ break;
+ cond_resched();
+ }
+
+ return i;
+}
+
+static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h)
+{
+ struct padata_mt_job job = {
+ .fn_arg = h,
+ .align = 1,
+ .numa_aware = true
+ };
+
+ job.thread_fn = hugetlb_pages_alloc_boot_node;
+ job.start = 0;
+ job.size = h->max_huge_pages;
+
+ /*
+ * job.max_threads is twice the num_node_state(N_MEMORY),
+ *
+ * Tests below indicate that a multiplier of 2 significantly improves
+ * performance, and although larger values also provide improvements,
+ * the gains are marginal.
+ *
+ * Therefore, choosing 2 as the multiplier strikes a good balance between
+ * enhancing parallel processing capabilities and maintaining efficient
+ * resource management.
+ *
+ * +------------+-------+-------+-------+-------+-------+
+ * | multiplier | 1 | 2 | 3 | 4 | 5 |
+ * +------------+-------+-------+-------+-------+-------+
+ * | 256G 2node | 358ms | 215ms | 157ms | 134ms | 126ms |
+ * | 2T 4node | 979ms | 679ms | 543ms | 489ms | 481ms |
+ * | 50G 2node | 71ms | 44ms | 37ms | 30ms | 31ms |
+ * +------------+-------+-------+-------+-------+-------+
+ */
+ job.max_threads = num_node_state(N_MEMORY) * 2;
+ job.min_chunk = h->max_huge_pages / num_node_state(N_MEMORY) / 2;
+ padata_do_multithreaded(&job);
+
+ return h->nr_huge_pages;
+}
+
/*
* NOTE: this routine is called in different contexts for gigantic and
* non-gigantic pages.
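
The boot-time allocation hunks above hand the per-node work to padata worker threads, with max_threads set to twice the number of memory nodes (per the measurements in the table) and min_chunk set to half of each node's share. A rough userspace analogue of that work division, using plain pthreads and invented numbers in place of the hstate and padata machinery:

#include <pthread.h>
#include <stdio.h>

#define NR_NODES	2		/* stand-in for num_node_state(N_MEMORY) */
#define NR_THREADS	(NR_NODES * 2)	/* the 2x multiplier from the table */

struct chunk { unsigned long start, size; };

static void *alloc_worker(void *arg)
{
	struct chunk *c = arg;

	/* The kernel worker calls alloc_pool_huge_folio() in a loop here. */
	printf("thread allocates pages [%lu, %lu)\n", c->start, c->start + c->size);
	return NULL;
}

int main(void)
{
	unsigned long total = 1024;	/* stands in for h->max_huge_pages */
	unsigned long chunk = total / NR_THREADS;
	pthread_t tid[NR_THREADS];
	struct chunk work[NR_THREADS];

	for (int i = 0; i < NR_THREADS; i++) {
		work[i].start = i * chunk;
		work[i].size = (i == NR_THREADS - 1) ? total - i * chunk : chunk;
		pthread_create(&tid[i], NULL, alloc_worker, &work[i]);
	}
	for (int i = 0; i < NR_THREADS; i++)
		pthread_join(tid[i], NULL);
	return 0;
}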
@@ -3482,11 +3611,8 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
*/
static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
{
- unsigned long i;
- struct folio *folio;
- LIST_HEAD(folio_list);
- nodemask_t *node_alloc_noretry;
- bool node_specific_alloc = false;
+ unsigned long allocated;
+ static bool initialized __initdata;
/* skip gigantic hugepages allocation if hugetlb_cma enabled */
if (hstate_is_gigantic(h) && hugetlb_cma_size) {
@@ -3494,66 +3620,26 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
return;
}
- /* do node specific alloc */
- for_each_online_node(i) {
- if (h->max_huge_pages_node[i] > 0) {
- hugetlb_hstate_alloc_pages_onenode(h, i);
- node_specific_alloc = true;
- }
+ /* hugetlb_hstate_alloc_pages will be called many times, initialize huge_boot_pages once */
+ if (!initialized) {
+ int i = 0;
+
+ for (i = 0; i < MAX_NUMNODES; i++)
+ INIT_LIST_HEAD(&huge_boot_pages[i]);
+ initialized = true;
}
- if (node_specific_alloc)
+ /* do node specific alloc */
+ if (hugetlb_hstate_alloc_pages_specific_nodes(h))
return;
/* below will do all node balanced alloc */
- if (!hstate_is_gigantic(h)) {
- /*
- * Bit mask controlling how hard we retry per-node allocations.
- * Ignore errors as lower level routines can deal with
- * node_alloc_noretry == NULL. If this kmalloc fails at boot
- * time, we are likely in bigger trouble.
- */
- node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry),
- GFP_KERNEL);
- } else {
- /* allocations done at boot time */
- node_alloc_noretry = NULL;
- }
-
- /* bit mask controlling how hard we retry per-node allocations */
- if (node_alloc_noretry)
- nodes_clear(*node_alloc_noretry);
-
- for (i = 0; i < h->max_huge_pages; ++i) {
- if (hstate_is_gigantic(h)) {
- /*
- * gigantic pages not added to list as they are not
- * added to pools now.
- */
- if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE))
- break;
- } else {
- folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY],
- node_alloc_noretry);
- if (!folio)
- break;
- list_add(&folio->lru, &folio_list);
- }
- cond_resched();
- }
-
- /* list will be empty if hstate_is_gigantic */
- prep_and_add_allocated_folios(h, &folio_list);
-
- if (i < h->max_huge_pages) {
- char buf[32];
+ if (hstate_is_gigantic(h))
+ allocated = hugetlb_gigantic_pages_alloc_boot(h);
+ else
+ allocated = hugetlb_pages_alloc_boot(h);
- string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
- pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n",
- h->max_huge_pages, buf, i);
- h->max_huge_pages = i;
- }
- kfree(node_alloc_noretry);
+ hugetlb_hstate_alloc_pages_errcheck(allocated, h);
}
static void __init hugetlb_init_hstates(void)
@@ -3655,7 +3741,7 @@ static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
VM_BUG_ON(delta != -1 && delta != 1);
if (delta < 0) {
- for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
+ for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, nodes_allowed) {
if (h->surplus_huge_pages_node[node])
goto found;
}
@@ -3770,7 +3856,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
cond_resched();
folio = alloc_pool_huge_folio(h, nodes_allowed,
- node_alloc_noretry);
+ node_alloc_noretry,
+ &h->next_nid_to_alloc);
if (!folio) {
prep_and_add_allocated_folios(h, &page_list);
spin_lock_irq(&hugetlb_lock);
@@ -5572,6 +5659,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
struct page *page;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
+ bool adjust_reservation = false;
unsigned long last_addr_mask;
bool force_flush = false;
@@ -5664,7 +5752,43 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
hugetlb_count_sub(pages_per_huge_page(h), mm);
hugetlb_remove_rmap(page_folio(page));
+ /*
+ * Restore the reservation for an anonymous page; otherwise the
+ * backing page could be stolen by someone else.
+ * If we are freeing a surplus page, do not set the restore
+ * reservation bit.
+ */
+ if (!h->surplus_huge_pages && __vma_private_lock(vma) &&
+ folio_test_anon(page_folio(page))) {
+ folio_set_hugetlb_restore_reserve(page_folio(page));
+ /* Reservation to be adjusted after the spin lock */
+ adjust_reservation = true;
+ }
+
spin_unlock(ptl);
+
+ /*
+ * Adjust the reservation for the region that will have the
+ * reserve restored. Keep in mind that vma_needs_reservation() changes
+ * resv->adds_in_progress if it succeeds. If this is not done,
+ * do_exit() will not see it, and will keep the reservation
+ * forever.
+ */
+ if (adjust_reservation) {
+ int rc = vma_needs_reservation(h, vma, address);
+
+ if (rc < 0)
+ /* Presumably allocate_file_region_entries failed
+ * to allocate a file_region struct. Clear
+ * hugetlb_restore_reserve so that global reserve
+ * count will not be incremented by free_huge_folio.
+ * Act as if we consumed the reservation.
+ */
+ folio_clear_hugetlb_restore_reserve(page_folio(page));
+ else if (rc)
+ vma_add_reservation(h, vma, address);
+ }
+
tlb_remove_page_size(tlb, page, huge_page_size(h));
/*
* Bail out after unmapping reference page if supplied
@@ -5813,7 +5937,8 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
*/
static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *ptep, unsigned int flags,
- struct folio *pagecache_folio, spinlock_t *ptl)
+ struct folio *pagecache_folio, spinlock_t *ptl,
+ struct vm_fault *vmf)
{
const bool unshare = flags & FAULT_FLAG_UNSHARE;
pte_t pte = huge_ptep_get(ptep);
@@ -5947,10 +6072,9 @@ retry_avoidcopy:
* When the original hugepage is shared one, it does not have
* anon_vma prepared.
*/
- if (unlikely(anon_vma_prepare(vma))) {
- ret = VM_FAULT_OOM;
+ ret = vmf_anon_prepare(vmf);
+ if (unlikely(ret))
goto out_release_all;
- }
if (copy_user_large_folio(new_folio, old_folio, address, vma)) {
ret = VM_FAULT_HWPOISON_LARGE;
@@ -6047,39 +6171,21 @@ int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping
return 0;
}
-static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
+static inline vm_fault_t hugetlb_handle_userfault(struct vm_fault *vmf,
struct address_space *mapping,
- pgoff_t idx,
- unsigned int flags,
- unsigned long haddr,
- unsigned long addr,
unsigned long reason)
{
u32 hash;
- struct vm_fault vmf = {
- .vma = vma,
- .address = haddr,
- .real_address = addr,
- .flags = flags,
-
- /*
- * Hard to debug if it ends up being
- * used by a callee that assumes
- * something about the other
- * uninitialized fields... same as in
- * memory.c
- */
- };
/*
* vma_lock and hugetlb_fault_mutex must be dropped before handling
* userfault. Also mmap_lock could be dropped due to handling
* userfault, any vma operation should be careful from here.
*/
- hugetlb_vma_unlock_read(vma);
- hash = hugetlb_fault_mutex_hash(mapping, idx);
+ hugetlb_vma_unlock_read(vmf->vma);
+ hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- return handle_userfault(&vmf, reason);
+ return handle_userfault(vmf, reason);
}
/*
@@ -6103,7 +6209,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
struct vm_area_struct *vma,
struct address_space *mapping, pgoff_t idx,
unsigned long address, pte_t *ptep,
- pte_t old_pte, unsigned int flags)
+ pte_t old_pte, unsigned int flags,
+ struct vm_fault *vmf)
{
struct hstate *h = hstate_vma(vma);
vm_fault_t ret = VM_FAULT_SIGBUS;
@@ -6162,11 +6269,16 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
goto out;
}
- return hugetlb_handle_userfault(vma, mapping, idx, flags,
- haddr, address,
+ return hugetlb_handle_userfault(vmf, mapping,
VM_UFFD_MISSING);
}
+ if (!(vma->vm_flags & VM_MAYSHARE)) {
+ ret = vmf_anon_prepare(vmf);
+ if (unlikely(ret))
+ goto out;
+ }
+
folio = alloc_hugetlb_folio(vma, haddr, 0);
if (IS_ERR(folio)) {
/*
@@ -6203,15 +6315,12 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
*/
restore_reserve_on_error(h, vma, haddr, folio);
folio_put(folio);
+ ret = VM_FAULT_SIGBUS;
goto out;
}
new_pagecache_folio = true;
} else {
folio_lock(folio);
- if (unlikely(anon_vma_prepare(vma))) {
- ret = VM_FAULT_OOM;
- goto backout_unlocked;
- }
anon_rmap = 1;
}
} else {
@@ -6235,8 +6344,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
ret = 0;
goto out;
}
- return hugetlb_handle_userfault(vma, mapping, idx, flags,
- haddr, address,
+ return hugetlb_handle_userfault(vmf, mapping,
VM_UFFD_MINOR);
}
}
@@ -6279,7 +6387,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
hugetlb_count_add(pages_per_huge_page(h), mm);
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
/* Optimization, do the COW without a second fault */
- ret = hugetlb_wp(mm, vma, address, ptep, flags, folio, ptl);
+ ret = hugetlb_wp(mm, vma, address, ptep, flags, folio, ptl, vmf);
}
spin_unlock(ptl);
@@ -6340,19 +6448,25 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
spinlock_t *ptl;
vm_fault_t ret;
u32 hash;
- pgoff_t idx;
struct folio *folio = NULL;
struct folio *pagecache_folio = NULL;
struct hstate *h = hstate_vma(vma);
struct address_space *mapping;
int need_wait_lock = 0;
unsigned long haddr = address & huge_page_mask(h);
+ struct vm_fault vmf = {
+ .vma = vma,
+ .address = haddr,
+ .real_address = address,
+ .flags = flags,
+ .pgoff = vma_hugecache_offset(h, vma, haddr),
+ /* TODO: Track hugetlb faults using vm_fault */
- /* TODO: Handle faults under the VMA lock */
- if (flags & FAULT_FLAG_VMA_LOCK) {
- vma_end_read(vma);
- return VM_FAULT_RETRY;
- }
+ /*
+ * Some fields may not be initialized, be careful as it may
+ * be hard to debug if called functions make assumptions
+ */
+ };
/*
* Serialize hugepage allocation and instantiation, so that we don't
@@ -6360,8 +6474,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* the same page in the page cache.
*/
mapping = vma->vm_file->f_mapping;
- idx = vma_hugecache_offset(h, vma, haddr);
- hash = hugetlb_fault_mutex_hash(mapping, idx);
+ hash = hugetlb_fault_mutex_hash(mapping, vmf.pgoff);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
/*
@@ -6395,8 +6508,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* hugetlb_no_page will drop vma lock and hugetlb fault
 * mutex internally, which makes us return immediately.
*/
- return hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
- entry, flags);
+ return hugetlb_no_page(mm, vma, mapping, vmf.pgoff, address,
+ ptep, entry, flags, &vmf);
}
ret = 0;
@@ -6442,7 +6555,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
/* Just decrements count, does not deallocate */
vma_end_reservation(h, vma, haddr);
- pagecache_folio = filemap_lock_hugetlb_folio(h, mapping, idx);
+ pagecache_folio = filemap_lock_hugetlb_folio(h, mapping,
+ vmf.pgoff);
if (IS_ERR(pagecache_folio))
pagecache_folio = NULL;
}
@@ -6457,13 +6571,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) &&
(flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
if (!userfaultfd_wp_async(vma)) {
- struct vm_fault vmf = {
- .vma = vma,
- .address = haddr,
- .real_address = address,
- .flags = flags,
- };
-
spin_unlock(ptl);
if (pagecache_folio) {
folio_unlock(pagecache_folio);
@@ -6497,7 +6604,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
if (!huge_pte_write(entry)) {
ret = hugetlb_wp(mm, vma, address, ptep, flags,
- pagecache_folio, ptl);
+ pagecache_folio, ptl, &vmf);
goto out_put_page;
} else if (likely(flags & FAULT_FLAG_WRITE)) {
entry = huge_pte_mkdirty(entry);
@@ -6675,11 +6782,20 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
}
/*
- * The memory barrier inside __folio_mark_uptodate makes sure that
- * preceding stores to the page contents become visible before
- * the set_pte_at() write.
+ * If we just allocated a new page, we need a memory barrier to ensure
+ * that preceding stores to the page become visible before the
+ * set_pte_at() write. The memory barrier inside __folio_mark_uptodate
+ * is what we need.
+ *
+ * In the case where we have not allocated a new page (is_continue),
+ * the page must already be uptodate. UFFDIO_CONTINUE already includes
+ * an earlier smp_wmb() to ensure that prior stores will be visible
+ * before the set_pte_at() write.
*/
- __folio_mark_uptodate(folio);
+ if (!is_continue)
+ __folio_mark_uptodate(folio);
+ else
+ WARN_ON_ONCE(!folio_test_uptodate(folio));
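/*
 * Illustrative ordering sketch for the comment above (editorial, not part of
 * the patch):
 *
 *   write page contents -> smp_wmb() / __folio_mark_uptodate() -> set_pte_at()
 *
 * Without that barrier, a CPU that observes the new PTE could still read
 * stale page contents. In the is_continue case the barrier was already
 * issued by UFFDIO_CONTINUE, so only the uptodate flag is sanity-checked.
 */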
/* Add shared, newly allocated pages to the page cache. */
if (vm_shared && !is_continue) {
@@ -7686,6 +7802,13 @@ void __init hugetlb_cma_reserve(int order)
bool node_specific_cma_alloc = false;
int nid;
+ /*
+ * HugeTLB CMA reservation is required for gigantic
+ * huge pages which could not be allocated via the
+ * page allocator. Just warn if there is any change
+ * breaking this assumption.
+ */
+ VM_WARN_ON(order <= MAX_PAGE_ORDER);
cma_reserve_called = true;
if (!hugetlb_cma_size)
@@ -7756,9 +7879,9 @@ void __init hugetlb_cma_reserve(int order)
* huge page demotion.
*/
res = cma_declare_contiguous_nid(0, size, 0,
- PAGE_SIZE << HUGETLB_PAGE_ORDER,
- 0, false, name,
- &hugetlb_cma[nid], nid);
+ PAGE_SIZE << HUGETLB_PAGE_ORDER,
+ HUGETLB_PAGE_ORDER, false, name,
+ &hugetlb_cma[nid], nid);
if (res) {
pr_warn("hugetlb_cma: reservation failed: err %d, node %d",
res, nid);
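/*
 * Sketch of the error-handling shape the hugetlb paths above switch to
 * (illustrative only, not taken from this patch). vmf_anon_prepare() is
 * assumed to return 0 on success or a VM_FAULT_* code -- e.g. VM_FAULT_OOM,
 * or VM_FAULT_RETRY when the anon_vma cannot be set up under a per-VMA
 * lock -- which callers now simply propagate instead of hard-coding
 * VM_FAULT_OOM after anon_vma_prepare().
 */
static vm_fault_t example_anon_fault(struct vm_fault *vmf)
{
	vm_fault_t ret = vmf_anon_prepare(vmf);

	if (unlikely(ret))
		return ret;		/* no hand-rolled VM_FAULT_OOM */
	/* ... allocate the folio and install the PTE ... */
	return 0;
}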
diff --git a/mm/internal.h b/mm/internal.h
index c3f3e0f191..07ad2675a8 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -83,6 +83,99 @@ static inline void *folio_raw_mapping(struct folio *folio)
return (void *)(mapping & ~PAGE_MAPPING_FLAGS);
}
+#ifdef CONFIG_MMU
+
+/* Flags for folio_pte_batch(). */
+typedef int __bitwise fpb_t;
+
+/* Compare PTEs after pte_mkclean(), ignoring the dirty bit. */
+#define FPB_IGNORE_DIRTY ((__force fpb_t)BIT(0))
+
+/* Compare PTEs after pte_clear_soft_dirty(), ignoring the soft-dirty bit. */
+#define FPB_IGNORE_SOFT_DIRTY ((__force fpb_t)BIT(1))
+
+static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
+{
+ if (flags & FPB_IGNORE_DIRTY)
+ pte = pte_mkclean(pte);
+ if (likely(flags & FPB_IGNORE_SOFT_DIRTY))
+ pte = pte_clear_soft_dirty(pte);
+ return pte_wrprotect(pte_mkold(pte));
+}
+
+/**
+ * folio_pte_batch - detect a PTE batch for a large folio
+ * @folio: The large folio to detect a PTE batch for.
+ * @addr: The user virtual address the first page is mapped at.
+ * @start_ptep: Page table pointer for the first entry.
+ * @pte: Page table entry for the first page.
+ * @max_nr: The maximum number of table entries to consider.
+ * @flags: Flags to modify the PTE batch semantics.
+ * @any_writable: Optional pointer to indicate whether any entry except the
+ * first one is writable.
+ *
+ * Detect a PTE batch: consecutive (present) PTEs that map consecutive
+ * pages of the same large folio.
+ *
+ * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN,
+ * the accessed bit, writable bit, dirty bit (with FPB_IGNORE_DIRTY) and
+ * soft-dirty bit (with FPB_IGNORE_SOFT_DIRTY).
+ *
+ * start_ptep must map any page of the folio. max_nr must be at least one and
+ * must be limited by the caller so scanning cannot exceed a single page table.
+ *
+ * Return: the number of table entries in the batch.
+ */
+static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
+ pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags,
+ bool *any_writable)
+{
+ unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio);
+ const pte_t *end_ptep = start_ptep + max_nr;
+ pte_t expected_pte, *ptep;
+ bool writable;
+ int nr;
+
+ if (any_writable)
+ *any_writable = false;
+
+ VM_WARN_ON_FOLIO(!pte_present(pte), folio);
+ VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
+ VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio);
+
+ nr = pte_batch_hint(start_ptep, pte);
+ expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags);
+ ptep = start_ptep + nr;
+
+ while (ptep < end_ptep) {
+ pte = ptep_get(ptep);
+ if (any_writable)
+ writable = !!pte_write(pte);
+ pte = __pte_batch_clear_ignored(pte, flags);
+
+ if (!pte_same(pte, expected_pte))
+ break;
+
+ /*
+ * Stop immediately once we reached the end of the folio. In
+ * corner cases the next PFN might fall into a different
+ * folio.
+ */
+ if (pte_pfn(pte) >= folio_end_pfn)
+ break;
+
+ if (any_writable)
+ *any_writable |= writable;
+
+ nr = pte_batch_hint(ptep, pte);
+ expected_pte = pte_advance_pfn(expected_pte, nr);
+ ptep += nr;
+ }
+
+ return min(ptep - start_ptep, max_nr);
+}
+#endif /* CONFIG_MMU */
+
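/*
 * Hypothetical caller sketch (not from this patch) showing how a PTE walker
 * is expected to consume folio_pte_batch(): present PTEs only, with max_nr
 * clamped to the current page table so a batch never crosses it.
 */
static void example_batched_walk(struct vm_area_struct *vma, pte_t *pte,
				 unsigned long addr, unsigned long end)
{
	int nr;

	do {
		pte_t ptent = ptep_get(pte);
		struct page *page = vm_normal_page(vma, addr, ptent);
		int max_nr = (end - addr) >> PAGE_SHIFT;

		nr = 1;
		if (page && folio_test_large(page_folio(page)))
			nr = folio_pte_batch(page_folio(page), addr, pte, ptent,
					     max_nr, FPB_IGNORE_DIRTY |
					     FPB_IGNORE_SOFT_DIRTY, NULL);
		/* ... operate on 'nr' consecutive PTEs of the same folio ... */
	} while (pte += nr, addr += nr * PAGE_SIZE, addr != end);
}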
void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
int nr_throttled);
static inline void acct_reclaim_writeback(struct folio *folio)
@@ -103,6 +196,7 @@ static inline void wake_throttle_isolated(pg_data_t *pgdat)
wake_up(wqh);
}
+vm_fault_t vmf_anon_prepare(struct vm_fault *vmf);
vm_fault_t do_swap_page(struct vm_fault *vmf);
void folio_rotate_reclaimable(struct folio *folio);
bool __folio_end_writeback(struct folio *folio);
@@ -419,8 +513,7 @@ static inline struct folio *page_rmappable_folio(struct page *page)
{
struct folio *folio = (struct folio *)page;
- if (folio && folio_order(folio) > 1)
- folio_prep_large_rmappable(folio);
+ folio_prep_large_rmappable(folio);
return folio;
}
@@ -447,10 +540,12 @@ extern void prep_compound_page(struct page *page, unsigned int order);
extern void post_alloc_hook(struct page *page, unsigned int order,
gfp_t gfp_flags);
+extern bool free_pages_prepare(struct page *page, unsigned int order);
+
extern int user_min_free_kbytes;
-extern void free_unref_page(struct page *page, unsigned int order);
-extern void free_unref_page_list(struct list_head *list);
+void free_unref_page(struct page *page, unsigned int order);
+void free_unref_folios(struct folio_batch *fbatch);
extern void zone_pcp_reset(struct zone *zone);
extern void zone_pcp_disable(struct zone *zone);
@@ -481,7 +576,7 @@ int split_free_page(struct page *free_page,
* completes when free_pfn <= migrate_pfn
*/
struct compact_control {
- struct list_head freepages; /* List of free pages to migrate to */
+ struct list_head freepages[NR_PAGE_ORDERS]; /* List of free pages to migrate to */
struct list_head migratepages; /* List of pages being migrated */
unsigned int nr_freepages; /* Number of isolated free pages */
unsigned int nr_migratepages; /* Number of pages to migrate */
@@ -537,7 +632,8 @@ isolate_migratepages_range(struct compact_control *cc,
unsigned long low_pfn, unsigned long end_pfn);
int __alloc_contig_migrate_range(struct compact_control *cc,
- unsigned long start, unsigned long end);
+ unsigned long start, unsigned long end,
+ int migratetype);
/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
void init_cma_reserved_pageblock(struct page *page);
@@ -865,7 +961,7 @@ extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long,
unsigned long, unsigned long);
extern void set_pageblock_order(void);
-unsigned long reclaim_pages(struct list_head *folio_list);
+unsigned long reclaim_pages(struct list_head *folio_list, bool ignore_references);
unsigned int reclaim_clean_pages_from_list(struct zone *zone,
struct list_head *folio_list);
/* The ALLOC_WMARK bits are used as an index to zone->watermark */
@@ -1116,6 +1212,15 @@ static inline bool gup_must_unshare(struct vm_area_struct *vma,
extern bool mirrored_kernelcore;
extern bool memblock_has_mirror(void);
+static __always_inline void vma_set_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ pgoff_t pgoff)
+{
+ vma->vm_start = start;
+ vma->vm_end = end;
+ vma->vm_pgoff = pgoff;
+}
+
static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma)
{
/*
@@ -1268,4 +1373,8 @@ static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry,
}
#endif /* CONFIG_SHRINKER_DEBUG */
+/* Only track the nodes of mappings with shadow entries */
+void workingset_update_node(struct xa_node *node);
+extern struct list_lru shadow_nodes;
+
#endif /* __MM_INTERNAL_H */
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 6ca63e8dda..e7c9a4dc89 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -55,7 +55,7 @@ void kasan_set_track(struct kasan_track *track, depot_stack_handle_t stack)
u64 ts_nsec = local_clock();
track->cpu = cpu;
- track->timestamp = ts_nsec >> 3;
+ track->timestamp = ts_nsec >> 9;
#endif /* CONFIG_KASAN_EXTRA_INFO */
track->pid = current->pid;
track->stack = stack;
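/*
 * Why ">> 9" (editorial illustration, rationale assumed): with
 * CONFIG_KASAN_EXTRA_INFO the timestamp shares a packed track record, so it
 * is stored with reduced resolution. Dropping 9 low bits keeps ~0.5 us
 * granularity while covering a 64x longer time span than the old ">> 3";
 * mm/kasan/report.c below restores the scale with "<< 9".
 */
static inline u64 kasan_pack_timestamp(u64 ts_nsec)  { return ts_nsec >> 9; }
static inline u64 kasan_unpack_timestamp(u64 packed) { return packed << 9; }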
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index 1900f85760..6310a18027 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -334,14 +334,6 @@ DEFINE_ASAN_SET_SHADOW(f3);
DEFINE_ASAN_SET_SHADOW(f5);
DEFINE_ASAN_SET_SHADOW(f8);
-/* Only allow cache merging when no per-object metadata is present. */
-slab_flags_t kasan_never_merge(void)
-{
- if (!kasan_requires_meta())
- return 0;
- return SLAB_KASAN;
-}
-
/*
* Adaptive redzone policy taken from the userspace AddressSanitizer runtime.
* For larger allocations larger redzones are used.
@@ -370,15 +362,13 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
return;
/*
- * SLAB_KASAN is used to mark caches that are sanitized by KASAN
- * and that thus have per-object metadata.
- * Currently this flag is used in two places:
- * 1. In slab_ksize() to account for per-object metadata when
- * calculating the size of the accessible memory within the object.
- * 2. In slab_common.c via kasan_never_merge() to prevent merging of
- * caches with per-object metadata.
+ * SLAB_KASAN is used to mark caches that are sanitized by KASAN and
+ * that thus have per-object metadata. Currently, this flag is used in
+ * slab_ksize() to account for per-object metadata when calculating the
+ * size of the accessible memory within the object. Additionally, we use
+ * SLAB_NO_MERGE to prevent merging of caches with per-object metadata.
*/
- *flags |= SLAB_KASAN;
+ *flags |= SLAB_KASAN | SLAB_NO_MERGE;
ok_size = *size;
diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c
index e26a2583a6..7b32be2a3c 100644
--- a/mm/kasan/kasan_test.c
+++ b/mm/kasan/kasan_test.c
@@ -698,6 +698,84 @@ static void kmalloc_uaf3(struct kunit *test)
KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr1)[8]);
}
+static void kasan_atomics_helper(struct kunit *test, void *unsafe, void *safe)
+{
+ int *i_unsafe = unsafe;
+
+ KUNIT_EXPECT_KASAN_FAIL(test, READ_ONCE(*i_unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, WRITE_ONCE(*i_unsafe, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, smp_load_acquire(i_unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, smp_store_release(i_unsafe, 42));
+
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_read(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_set(unsafe, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_add(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_sub(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_and(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_andnot(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_or(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_xor(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_xchg(unsafe, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_cmpxchg(unsafe, 21, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_try_cmpxchg(unsafe, safe, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_try_cmpxchg(safe, unsafe, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_sub_and_test(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_and_test(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_and_test(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_add_negative(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_add_unless(unsafe, 21, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_not_zero(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_unless_negative(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_unless_positive(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_if_positive(unsafe));
+
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_read(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_set(unsafe, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_add(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_sub(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_and(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_andnot(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_or(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_xor(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_xchg(unsafe, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_cmpxchg(unsafe, 21, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_try_cmpxchg(unsafe, safe, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_try_cmpxchg(safe, unsafe, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_sub_and_test(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_and_test(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_and_test(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_add_negative(42, unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_add_unless(unsafe, 21, 42));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_not_zero(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_unless_negative(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_unless_positive(unsafe));
+ KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_if_positive(unsafe));
+}
+
+static void kasan_atomics(struct kunit *test)
+{
+ void *a1, *a2;
+
+ /*
+ * Just as with kasan_bitops_tags(), we allocate 48 bytes of memory such
+ * that the following 16 bytes will make up the redzone.
+ */
+ a1 = kzalloc(48, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, a1);
+ a2 = kzalloc(sizeof(atomic_long_t), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, a2);
+
+ /* Use atomics to access the redzone. */
+ kasan_atomics_helper(test, a1 + 48, a2);
+
+ kfree(a1);
+ kfree(a2);
+}
+
static void kmalloc_double_kzfree(struct kunit *test)
{
char *ptr;
@@ -1884,6 +1962,7 @@ static struct kunit_case kasan_kunit_test_cases[] = {
KUNIT_CASE(kasan_strings),
KUNIT_CASE(kasan_bitops_generic),
KUNIT_CASE(kasan_bitops_tags),
+ KUNIT_CASE(kasan_atomics),
KUNIT_CASE(vmalloc_helpers_tags),
KUNIT_CASE(vmalloc_oob),
KUNIT_CASE(vmap_tags),
diff --git a/mm/kasan/kasan_test_module.c b/mm/kasan/kasan_test_module.c
index 8b7b3ea2c7..27ec22767e 100644
--- a/mm/kasan/kasan_test_module.c
+++ b/mm/kasan/kasan_test_module.c
@@ -62,7 +62,7 @@ static noinline void __init copy_user_test(void)
kfree(kmem);
}
-static int __init test_kasan_module_init(void)
+static int __init kasan_test_module_init(void)
{
/*
* Temporarily enable multi-shot mode. Otherwise, KASAN would only
@@ -77,5 +77,5 @@ static int __init test_kasan_module_init(void)
return -EAGAIN;
}
-module_init(test_kasan_module_init);
+module_init(kasan_test_module_init);
MODULE_LICENSE("GPL");
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 7afa4feb03..b48c768acc 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -267,7 +267,7 @@ static void print_track(struct kasan_track *track, const char *prefix)
u64 ts_nsec = track->timestamp;
unsigned long rem_usec;
- ts_nsec <<= 3;
+ ts_nsec <<= 9;
rem_usec = do_div(ts_nsec, NSEC_PER_SEC) / 1000;
pr_err("%s by task %u on cpu %d at %lu.%06lus:\n",
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 9ef84f3183..d6210ca48d 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -199,19 +199,12 @@ static bool shadow_mapped(unsigned long addr)
pud = pud_offset(p4d, addr);
if (pud_none(*pud))
return false;
-
- /*
- * We can't use pud_large() or pud_huge(), the first one is
- * arch-specific, the last one depends on HUGETLB_PAGE. So let's abuse
- * pud_bad(), if pud is bad then it's bad because it's huge.
- */
- if (pud_bad(*pud))
+ if (pud_leaf(*pud))
return true;
pmd = pmd_offset(pud, addr);
if (pmd_none(*pmd))
return false;
-
- if (pmd_bad(*pmd))
+ if (pmd_leaf(*pmd))
return true;
pte = pte_offset_kernel(pmd, addr);
return !pte_none(ptep_get(pte));
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 2b219acb52..3883017460 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -410,6 +410,12 @@ static inline int hpage_collapse_test_exit(struct mm_struct *mm)
return atomic_read(&mm->mm_users) == 0;
}
+static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm)
+{
+ return hpage_collapse_test_exit(mm) ||
+ test_bit(MMF_DISABLE_THP, &mm->flags);
+}
+
void __khugepaged_enter(struct mm_struct *mm)
{
struct khugepaged_mm_slot *mm_slot;
@@ -683,9 +689,7 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
spinlock_t *ptl,
struct list_head *compound_pagelist)
{
- struct folio *src_folio;
- struct page *src_page;
- struct page *tmp;
+ struct folio *src, *tmp;
pte_t *_pte;
pte_t pteval;
@@ -704,10 +708,11 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
ksm_might_unmap_zero_page(vma->vm_mm, pteval);
}
} else {
- src_page = pte_page(pteval);
- src_folio = page_folio(src_page);
- if (!folio_test_large(src_folio))
- release_pte_folio(src_folio);
+ struct page *src_page = pte_page(pteval);
+
+ src = page_folio(src_page);
+ if (!folio_test_large(src))
+ release_pte_folio(src);
/*
* ptl mostly unnecessary, but preempt has to
* be disabled to update the per-cpu stats
@@ -715,20 +720,19 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
*/
spin_lock(ptl);
ptep_clear(vma->vm_mm, address, _pte);
- folio_remove_rmap_pte(src_folio, src_page, vma);
+ folio_remove_rmap_pte(src, src_page, vma);
spin_unlock(ptl);
free_page_and_swap_cache(src_page);
}
}
- list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) {
- list_del(&src_page->lru);
- mod_node_page_state(page_pgdat(src_page),
- NR_ISOLATED_ANON + page_is_file_lru(src_page),
- -compound_nr(src_page));
- unlock_page(src_page);
- free_swap_cache(src_page);
- putback_lru_page(src_page);
+ list_for_each_entry_safe(src, tmp, compound_pagelist, lru) {
+ list_del(&src->lru);
+ node_stat_sub_folio(src, NR_ISOLATED_ANON +
+ folio_is_file_lru(src));
+ folio_unlock(src);
+ free_swap_cache(src);
+ folio_putback_lru(src);
}
}
@@ -914,7 +918,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
{
struct vm_area_struct *vma;
- if (unlikely(hpage_collapse_test_exit(mm)))
+ if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
return SCAN_ANY_PROCESS;
*vmap = vma = find_vma(mm, address);
@@ -1634,7 +1638,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
/* step 3: set proper refcount and mm_counters. */
if (nr_ptes) {
folio_ref_sub(folio, nr_ptes);
- add_mm_counter(mm, mm_counter_file(&folio->page), -nr_ptes);
+ add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
}
/* step 4: remove empty page table */
@@ -1665,7 +1669,7 @@ abort:
if (nr_ptes) {
flush_tlb_mm(mm);
folio_ref_sub(folio, nr_ptes);
- add_mm_counter(mm, mm_counter_file(&folio->page), -nr_ptes);
+ add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
}
if (start_pte)
pte_unmap_unlock(start_pte, ptl);
@@ -2360,7 +2364,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
goto breakouterloop_mmap_lock;
progress++;
- if (unlikely(hpage_collapse_test_exit(mm)))
+ if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
goto breakouterloop;
vma_iter_init(&vmi, mm, khugepaged_scan.address);
@@ -2368,7 +2372,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
unsigned long hstart, hend;
cond_resched();
- if (unlikely(hpage_collapse_test_exit(mm))) {
+ if (unlikely(hpage_collapse_test_exit_or_disable(mm))) {
progress++;
break;
}
@@ -2390,7 +2394,7 @@ skip:
bool mmap_locked = true;
cond_resched();
- if (unlikely(hpage_collapse_test_exit(mm)))
+ if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
goto breakouterloop;
VM_BUG_ON(khugepaged_scan.address < hstart ||
@@ -2408,7 +2412,7 @@ skip:
fput(file);
if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
mmap_read_lock(mm);
- if (hpage_collapse_test_exit(mm))
+ if (hpage_collapse_test_exit_or_disable(mm))
goto breakouterloop;
*result = collapse_pte_mapped_thp(mm,
khugepaged_scan.address, false);
diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c
index cf2d70e9c9..95f859e38c 100644
--- a/mm/kmsan/core.c
+++ b/mm/kmsan/core.c
@@ -196,8 +196,7 @@ void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b,
u32 origin, bool checked)
{
u64 address = (u64)addr;
- void *shadow_start;
- u32 *origin_start;
+ u32 *shadow_start, *origin_start;
size_t pad = 0;
KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(addr, size));
@@ -225,8 +224,16 @@ void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b,
origin_start =
(u32 *)kmsan_get_metadata((void *)address, KMSAN_META_ORIGIN);
- for (int i = 0; i < size / KMSAN_ORIGIN_SIZE; i++)
- origin_start[i] = origin;
+ /*
+ * If the new origin is non-zero, assume that the shadow byte is also non-zero,
+ * and unconditionally overwrite the old origin slot.
+ * If the new origin is zero, overwrite the old origin slot iff the
+ * corresponding shadow slot is zero.
+ */
+ for (int i = 0; i < size / KMSAN_ORIGIN_SIZE; i++) {
+ if (origin || !shadow_start[i])
+ origin_start[i] = origin;
+ }
}
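/*
 * Worked example of the rule above (values invented): unpoisoning only two
 * bytes of a 4-byte word leaves that word's shadow slot non-zero when the
 * other bytes are still poisoned, so its origin slot is deliberately kept --
 * the remaining poisoned bytes must retain their original allocation origin.
 * A slot whose shadow is fully zeroed gets its origin cleared as well.
 */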
struct page *kmsan_vmalloc_to_page_or_null(void *vaddr)
diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c
index 5d6e2dee56..0b09daa188 100644
--- a/mm/kmsan/hooks.c
+++ b/mm/kmsan/hooks.c
@@ -359,6 +359,12 @@ void kmsan_handle_dma_sg(struct scatterlist *sg, int nents,
}
/* Functions from kmsan-checks.h follow. */
+
+/*
+ * To create an origin, kmsan_poison_memory() unwinds the stack and stores it
+ * into the stack depot. This may cause deadlocks if done from within the KMSAN
+ * runtime, so we bail out if kmsan_in_runtime().
+ */
void kmsan_poison_memory(const void *address, size_t size, gfp_t flags)
{
if (!kmsan_enabled || kmsan_in_runtime())
@@ -371,47 +377,31 @@ void kmsan_poison_memory(const void *address, size_t size, gfp_t flags)
}
EXPORT_SYMBOL(kmsan_poison_memory);
+/*
+ * Unlike kmsan_poison_memory(), this function can be used from within KMSAN
+ * runtime, because it does not trigger allocations or call instrumented code.
+ */
void kmsan_unpoison_memory(const void *address, size_t size)
{
unsigned long ua_flags;
- if (!kmsan_enabled || kmsan_in_runtime())
+ if (!kmsan_enabled)
return;
ua_flags = user_access_save();
- kmsan_enter_runtime();
/* The users may want to poison/unpoison random memory. */
kmsan_internal_unpoison_memory((void *)address, size,
KMSAN_POISON_NOCHECK);
- kmsan_leave_runtime();
user_access_restore(ua_flags);
}
EXPORT_SYMBOL(kmsan_unpoison_memory);
/*
- * Version of kmsan_unpoison_memory() that can be called from within the KMSAN
- * runtime.
- *
- * Non-instrumented IRQ entry functions receive struct pt_regs from assembly
- * code. Those regs need to be unpoisoned, otherwise using them will result in
- * false positives.
- * Using kmsan_unpoison_memory() is not an option in entry code, because the
- * return value of in_task() is inconsistent - as a result, certain calls to
- * kmsan_unpoison_memory() are ignored. kmsan_unpoison_entry_regs() ensures that
- * the registers are unpoisoned even if kmsan_in_runtime() is true in the early
- * entry code.
+ * Version of kmsan_unpoison_memory() called from IRQ entry functions.
*/
void kmsan_unpoison_entry_regs(const struct pt_regs *regs)
{
- unsigned long ua_flags;
-
- if (!kmsan_enabled)
- return;
-
- ua_flags = user_access_save();
- kmsan_internal_unpoison_memory((void *)regs, sizeof(*regs),
- KMSAN_POISON_NOCHECK);
- user_access_restore(ua_flags);
+ kmsan_unpoison_memory((void *)regs, sizeof(*regs));
}
void kmsan_check_memory(const void *addr, size_t size)
diff --git a/mm/ksm.c b/mm/ksm.c
index 8c001819cf..ddd482c718 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -296,7 +296,7 @@ static bool ksm_use_zero_pages __read_mostly;
static bool ksm_smart_scan = true;
/* The number of zero pages which is placed by KSM */
-unsigned long ksm_zero_pages;
+atomic_long_t ksm_zero_pages = ATOMIC_LONG_INIT(0);
/* The number of pages that have been skipped due to "smart scanning" */
static unsigned long ksm_pages_skipped;
@@ -1428,8 +1428,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
* the dirty bit in zero page's PTE is set.
*/
newpte = pte_mkdirty(pte_mkspecial(pfn_pte(page_to_pfn(kpage), vma->vm_page_prot)));
- ksm_zero_pages++;
- mm->ksm_zero_pages++;
+ ksm_map_zero_page(mm);
/*
* We're replacing an anonymous page with a zero page, which is
* not anonymous. We need to do proper accounting otherwise we
@@ -2747,18 +2746,16 @@ static void ksm_do_scan(unsigned int scan_npages)
{
struct ksm_rmap_item *rmap_item;
struct page *page;
- unsigned int npages = scan_npages;
- while (npages-- && likely(!freezing(current))) {
+ while (scan_npages-- && likely(!freezing(current))) {
cond_resched();
rmap_item = scan_get_next_rmap_item(&page);
if (!rmap_item)
return;
cmp_and_merge_page(page, rmap_item);
put_page(page);
+ ksm_pages_scanned++;
}
-
- ksm_pages_scanned += scan_npages - npages;
}
static int ksmd_should_run(void)
@@ -3370,7 +3367,7 @@ static void wait_while_offlining(void)
#ifdef CONFIG_PROC_FS
long ksm_process_profit(struct mm_struct *mm)
{
- return (long)(mm->ksm_merging_pages + mm->ksm_zero_pages) * PAGE_SIZE -
+ return (long)(mm->ksm_merging_pages + mm_ksm_zero_pages(mm)) * PAGE_SIZE -
mm->ksm_rmap_items * sizeof(struct ksm_rmap_item);
}
#endif /* CONFIG_PROC_FS */
@@ -3659,7 +3656,7 @@ KSM_ATTR_RO(pages_skipped);
static ssize_t ksm_zero_pages_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sysfs_emit(buf, "%ld\n", ksm_zero_pages);
+ return sysfs_emit(buf, "%ld\n", atomic_long_read(&ksm_zero_pages));
}
KSM_ATTR_RO(ksm_zero_pages);
@@ -3668,7 +3665,7 @@ static ssize_t general_profit_show(struct kobject *kobj,
{
long general_profit;
- general_profit = (ksm_pages_sharing + ksm_zero_pages) * PAGE_SIZE -
+ general_profit = (ksm_pages_sharing + atomic_long_read(&ksm_zero_pages)) * PAGE_SIZE -
ksm_rmap_items * sizeof(struct ksm_rmap_item);
return sysfs_emit(buf, "%ld\n", general_profit);
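/*
 * The two helpers used above are defined outside this hunk; a plausible
 * shape (an assumption -- presumably include/linux/ksm.h in the same
 * series) is:
 */
static inline void ksm_map_zero_page(struct mm_struct *mm)
{
	atomic_long_inc(&ksm_zero_pages);
	atomic_long_inc(&mm->ksm_zero_pages);
}

static inline long mm_ksm_zero_pages(struct mm_struct *mm)
{
	return atomic_long_read(&mm->ksm_zero_pages);
}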
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 35b0147542..3fd64736bc 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -162,20 +162,6 @@ void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
}
EXPORT_SYMBOL_GPL(list_lru_isolate_move);
-void list_lru_putback(struct list_lru *lru, struct list_head *item, int nid,
- struct mem_cgroup *memcg)
-{
- struct list_lru_one *list =
- list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
-
- if (list_empty(item)) {
- list_add_tail(item, &list->list);
- if (!list->nr_items++)
- set_shrinker_bit(memcg, nid, lru_shrinker_id(lru));
- }
-}
-EXPORT_SYMBOL_GPL(list_lru_putback);
-
unsigned long list_lru_count_one(struct list_lru *lru,
int nid, struct mem_cgroup *memcg)
{
@@ -257,6 +243,9 @@ restart:
*/
assert_spin_locked(&nlru->lock);
goto restart;
+ case LRU_STOP:
+ assert_spin_locked(&nlru->lock);
+ goto out;
default:
BUG();
}
@@ -567,6 +556,9 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware,
lru->shrinker_id = shrinker->id;
else
lru->shrinker_id = -1;
+
+ if (mem_cgroup_kmem_disabled())
+ memcg_aware = false;
#endif
lru->node = kcalloc(nr_node_ids, sizeof(*lru->node), GFP_KERNEL);
diff --git a/mm/madvise.c b/mm/madvise.c
index f2c818af0b..1a073fcc4c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -386,7 +386,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
return 0;
}
- if (pmd_young(orig_pmd)) {
+ if (!pageout && pmd_young(orig_pmd)) {
pmdp_invalidate(vma, addr, pmd);
orig_pmd = pmd_mkold(orig_pmd);
@@ -410,7 +410,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
huge_unlock:
spin_unlock(ptl);
if (pageout)
- reclaim_pages(&folio_list);
+ reclaim_pages(&folio_list, true);
return 0;
}
@@ -453,7 +453,7 @@ restart:
if (folio_test_large(folio)) {
int err;
- if (folio_estimated_sharers(folio) != 1)
+ if (folio_estimated_sharers(folio) > 1)
break;
if (pageout_anon_only_filter && !folio_test_anon(folio))
break;
@@ -490,7 +490,7 @@ restart:
VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
- if (pte_young(ptent)) {
+ if (!pageout && pte_young(ptent)) {
ptent = ptep_get_and_clear_full(mm, addr, pte,
tlb->fullmm);
ptent = pte_mkold(ptent);
@@ -524,7 +524,7 @@ restart:
pte_unmap_unlock(start_pte, ptl);
}
if (pageout)
- reclaim_pages(&folio_list);
+ reclaim_pages(&folio_list, true);
cond_resched();
return 0;
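/*
 * Apparent rationale for the two changes above (hedged, not stated in the
 * hunk itself): with reclaim_pages(..., ignore_references == true),
 * MADV_PAGEOUT no longer consults the accessed bit, so clearing the young
 * bit -- and the TLB flush that clearing implies -- is only worth doing for
 * MADV_COLD (!pageout).
 */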
diff --git a/mm/memblock.c b/mm/memblock.c
index d09136e040..08e9806b1c 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1339,6 +1339,10 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
int start_rgn, end_rgn;
int i, ret;
+ if (WARN_ONCE(nid == MAX_NUMNODES,
+ "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
+ nid = NUMA_NO_NODE;
+
ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
if (ret)
return ret;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 61932c9215..612558f306 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -33,6 +33,7 @@
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
+#include <linux/pagevec.h>
#include <linux/vm_event_item.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
@@ -3606,22 +3607,24 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
/*
* Because page_memcg(head) is not set on tails, set it now.
*/
-void split_page_memcg(struct page *head, unsigned int nr)
+void split_page_memcg(struct page *head, int old_order, int new_order)
{
struct folio *folio = page_folio(head);
struct mem_cgroup *memcg = folio_memcg(folio);
int i;
+ unsigned int old_nr = 1 << old_order;
+ unsigned int new_nr = 1 << new_order;
if (mem_cgroup_disabled() || !memcg)
return;
- for (i = 1; i < nr; i++)
+ for (i = new_nr; i < old_nr; i += new_nr)
folio_page(folio, i)->memcg_data = folio->memcg_data;
if (folio_memcg_kmem(folio))
- obj_cgroup_get_many(__folio_objcg(folio), nr - 1);
+ obj_cgroup_get_many(__folio_objcg(folio), old_nr / new_nr - 1);
else
- css_get_many(&memcg->css, nr - 1);
+ css_get_many(&memcg->css, old_nr / new_nr - 1);
}
#ifdef CONFIG_SWAP
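/*
 * Worked example of the accounting above (numbers illustrative): splitting
 * an order-9 folio (512 pages) to order-2 pieces (4 pages each) yields
 * 512 / 4 = 128 folios, so memcg_data is copied to every 4th page and
 * old_nr / new_nr - 1 = 127 extra objcg/css references are taken. The old
 * "nr - 1" form was the special case new_order == 0.
 */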
@@ -4800,7 +4803,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
struct mem_cgroup *parent;
- mem_cgroup_flush_stats(memcg);
+ mem_cgroup_flush_stats_ratelimited(memcg);
*pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
*pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
@@ -5621,7 +5624,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
if (alloc_shrinker_info(memcg))
goto offline_kmem;
- if (unlikely(mem_cgroup_is_root(memcg)))
+ if (unlikely(mem_cgroup_is_root(memcg)) && !mem_cgroup_disabled())
queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
FLUSH_TIME);
lru_gen_online_memcg(memcg);
@@ -5873,7 +5876,7 @@ static int mem_cgroup_do_precharge(unsigned long count)
}
union mc_target {
- struct page *page;
+ struct folio *folio;
swp_entry_t ent;
};
@@ -5965,23 +5968,22 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
}
/**
- * mem_cgroup_move_account - move account of the page
- * @page: the page
+ * mem_cgroup_move_account - move account of the folio
+ * @folio: The folio.
* @compound: charge the page as compound or small page
- * @from: mem_cgroup which the page is moved from.
- * @to: mem_cgroup which the page is moved to. @from != @to.
+ * @from: mem_cgroup which the folio is moved from.
+ * @to: mem_cgroup which the folio is moved to. @from != @to.
*
- * The page must be locked and not on the LRU.
+ * The folio must be locked and not on the LRU.
*
* This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
* from old cgroup.
*/
-static int mem_cgroup_move_account(struct page *page,
+static int mem_cgroup_move_account(struct folio *folio,
bool compound,
struct mem_cgroup *from,
struct mem_cgroup *to)
{
- struct folio *folio = page_folio(page);
struct lruvec *from_vec, *to_vec;
struct pglist_data *pgdat;
unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1;
@@ -6096,7 +6098,7 @@ out:
* Return:
* * MC_TARGET_NONE - If the pte is not a target for move charge.
* * MC_TARGET_PAGE - If the page corresponding to this pte is a target for
- * move charge. If @target is not NULL, the page is stored in target->page
+ * move charge. If @target is not NULL, the folio is stored in target->folio
* with extra refcnt taken (Caller should release it).
* * MC_TARGET_SWAP - If the swap entry corresponding to this pte is a
* target for charge migration. If @target is not NULL, the entry is
@@ -6110,6 +6112,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, union mc_target *target)
{
struct page *page = NULL;
+ struct folio *folio;
enum mc_target_type ret = MC_TARGET_NONE;
swp_entry_t ent = { .val = 0 };
@@ -6124,9 +6127,11 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
else if (is_swap_pte(ptent))
page = mc_handle_swap_pte(vma, ptent, &ent);
+ if (page)
+ folio = page_folio(page);
if (target && page) {
- if (!trylock_page(page)) {
- put_page(page);
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
return ret;
}
/*
@@ -6141,8 +6146,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
* Alas, skip moving the page in this case.
*/
if (!pte_present(ptent) && page_mapped(page)) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return ret;
}
}
@@ -6155,18 +6160,18 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
* mem_cgroup_move_account() checks the page is valid or
* not under LRU exclusion.
*/
- if (page_memcg(page) == mc.from) {
+ if (folio_memcg(folio) == mc.from) {
ret = MC_TARGET_PAGE;
- if (is_device_private_page(page) ||
- is_device_coherent_page(page))
+ if (folio_is_device_private(folio) ||
+ folio_is_device_coherent(folio))
ret = MC_TARGET_DEVICE;
if (target)
- target->page = page;
+ target->folio = folio;
}
if (!ret || !target) {
if (target)
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
}
}
/*
@@ -6192,6 +6197,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd, union mc_target *target)
{
struct page *page = NULL;
+ struct folio *folio;
enum mc_target_type ret = MC_TARGET_NONE;
if (unlikely(is_swap_pmd(pmd))) {
@@ -6201,17 +6207,18 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
}
page = pmd_page(pmd);
VM_BUG_ON_PAGE(!page || !PageHead(page), page);
+ folio = page_folio(page);
if (!(mc.flags & MOVE_ANON))
return ret;
- if (page_memcg(page) == mc.from) {
+ if (folio_memcg(folio) == mc.from) {
ret = MC_TARGET_PAGE;
if (target) {
- get_page(page);
- if (!trylock_page(page)) {
- put_page(page);
+ folio_get(folio);
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
return MC_TARGET_NONE;
}
- target->page = page;
+ target->folio = folio;
}
}
return ret;
@@ -6431,7 +6438,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
spinlock_t *ptl;
enum mc_target_type target_type;
union mc_target target;
- struct page *page;
+ struct folio *folio;
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
@@ -6441,26 +6448,26 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
}
target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
if (target_type == MC_TARGET_PAGE) {
- page = target.page;
- if (isolate_lru_page(page)) {
- if (!mem_cgroup_move_account(page, true,
+ folio = target.folio;
+ if (folio_isolate_lru(folio)) {
+ if (!mem_cgroup_move_account(folio, true,
mc.from, mc.to)) {
mc.precharge -= HPAGE_PMD_NR;
mc.moved_charge += HPAGE_PMD_NR;
}
- putback_lru_page(page);
+ folio_putback_lru(folio);
}
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
} else if (target_type == MC_TARGET_DEVICE) {
- page = target.page;
- if (!mem_cgroup_move_account(page, true,
+ folio = target.folio;
+ if (!mem_cgroup_move_account(folio, true,
mc.from, mc.to)) {
mc.precharge -= HPAGE_PMD_NR;
mc.moved_charge += HPAGE_PMD_NR;
}
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
}
spin_unlock(ptl);
return 0;
@@ -6483,28 +6490,28 @@ retry:
device = true;
fallthrough;
case MC_TARGET_PAGE:
- page = target.page;
+ folio = target.folio;
/*
* We can have a part of the split pmd here. Moving it
* can be done but it would be too convoluted so simply
* ignore such a partial THP and keep it in original
* memcg. There should be somebody mapping the head.
*/
- if (PageTransCompound(page))
+ if (folio_test_large(folio))
goto put;
- if (!device && !isolate_lru_page(page))
+ if (!device && !folio_isolate_lru(folio))
goto put;
- if (!mem_cgroup_move_account(page, false,
+ if (!mem_cgroup_move_account(folio, false,
mc.from, mc.to)) {
mc.precharge--;
/* we uncharge from mc.from later. */
mc.moved_charge++;
}
if (!device)
- putback_lru_page(page);
+ folio_putback_lru(folio);
put: /* get_mctgt_type() gets & locks the page */
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
break;
case MC_TARGET_SWAP:
ent = target.ent;
@@ -6977,6 +6984,8 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
while (nr_reclaimed < nr_to_reclaim) {
+ /* Will converge on zero, but reclaim enforces a minimum */
+ unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4;
unsigned long reclaimed;
if (signal_pending(current))
@@ -6991,8 +7000,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
lru_add_drain_all();
reclaimed = try_to_free_mem_cgroup_pages(memcg,
- min(nr_to_reclaim - nr_reclaimed, SWAP_CLUSTER_MAX),
- GFP_KERNEL, reclaim_options);
+ batch_size, GFP_KERNEL, reclaim_options);
if (!reclaimed && !nr_retries--)
return -EAGAIN;
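/*
 * Illustration of the batch sizing above: a request to reclaim 400 pages
 * now calls try_to_free_mem_cgroup_pages() with batches of 100, 75, 56, ...
 * (a quarter of what is still outstanding each round), falling back to the
 * minimum that reclaim itself enforces, instead of the old fixed
 * SWAP_CLUSTER_MAX-sized batches.
 */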
@@ -7505,21 +7513,14 @@ void __mem_cgroup_uncharge(struct folio *folio)
uncharge_batch(&ug);
}
-/**
- * __mem_cgroup_uncharge_list - uncharge a list of page
- * @page_list: list of pages to uncharge
- *
- * Uncharge a list of pages previously charged with
- * __mem_cgroup_charge().
- */
-void __mem_cgroup_uncharge_list(struct list_head *page_list)
+void __mem_cgroup_uncharge_folios(struct folio_batch *folios)
{
struct uncharge_gather ug;
- struct folio *folio;
+ unsigned int i;
uncharge_gather_clear(&ug);
- list_for_each_entry(folio, page_list, lru)
- uncharge_folio(folio, &ug);
+ for (i = 0; i < folios->nr; i++)
+ uncharge_folio(folios->folios[i], &ug);
if (ug.memcg)
uncharge_batch(&ug);
}
@@ -7530,8 +7531,7 @@ void __mem_cgroup_uncharge_list(struct list_head *page_list)
* @new: Replacement folio.
*
* Charge @new as a replacement folio for @old. @old will
- * be uncharged upon free. This is only used by the page cache
- * (in replace_page_cache_folio()).
+ * be uncharged upon free.
*
* Both folios must be locked, @new->mapping must be set up.
*/
diff --git a/mm/memfd.c b/mm/memfd.c
index d3a1ba4208..7d8d3ab3fa 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -29,29 +29,25 @@
#define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE
#define LAST_SCAN 4 /* about 150ms max */
+static bool memfd_folio_has_extra_refs(struct folio *folio)
+{
+ return folio_ref_count(folio) - folio_mapcount(folio) !=
+ folio_nr_pages(folio);
+}
+
static void memfd_tag_pins(struct xa_state *xas)
{
- struct page *page;
+ struct folio *folio;
int latency = 0;
- int cache_count;
lru_add_drain();
xas_lock_irq(xas);
- xas_for_each(xas, page, ULONG_MAX) {
- cache_count = 1;
- if (!xa_is_value(page) &&
- PageTransHuge(page) && !PageHuge(page))
- cache_count = HPAGE_PMD_NR;
-
- if (!xa_is_value(page) &&
- page_count(page) - total_mapcount(page) != cache_count)
+ xas_for_each(xas, folio, ULONG_MAX) {
+ if (!xa_is_value(folio) && memfd_folio_has_extra_refs(folio))
xas_set_mark(xas, MEMFD_TAG_PINNED);
- if (cache_count != 1)
- xas_set(xas, page->index + cache_count);
- latency += cache_count;
- if (latency < XA_CHECK_SCHED)
+ if (++latency < XA_CHECK_SCHED)
continue;
latency = 0;
@@ -66,16 +62,16 @@ static void memfd_tag_pins(struct xa_state *xas)
/*
* Setting SEAL_WRITE requires us to verify there's no pending writer. However,
* via get_user_pages(), drivers might have some pending I/O without any active
- * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages
+ * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all folios
 * and see whether they have an elevated ref-count. If so, we tag them and wait for
* them to be dropped.
* The caller must guarantee that no new user will acquire writable references
- * to those pages to avoid races.
+ * to those folios to avoid races.
*/
static int memfd_wait_for_pins(struct address_space *mapping)
{
XA_STATE(xas, &mapping->i_pages, 0);
- struct page *page;
+ struct folio *folio;
int error, scan;
memfd_tag_pins(&xas);
@@ -83,7 +79,6 @@ static int memfd_wait_for_pins(struct address_space *mapping)
error = 0;
for (scan = 0; scan <= LAST_SCAN; scan++) {
int latency = 0;
- int cache_count;
if (!xas_marked(&xas, MEMFD_TAG_PINNED))
break;
@@ -95,20 +90,15 @@ static int memfd_wait_for_pins(struct address_space *mapping)
xas_set(&xas, 0);
xas_lock_irq(&xas);
- xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) {
+ xas_for_each_marked(&xas, folio, ULONG_MAX, MEMFD_TAG_PINNED) {
bool clear = true;
- cache_count = 1;
- if (!xa_is_value(page) &&
- PageTransHuge(page) && !PageHuge(page))
- cache_count = HPAGE_PMD_NR;
-
- if (!xa_is_value(page) && cache_count !=
- page_count(page) - total_mapcount(page)) {
+ if (!xa_is_value(folio) &&
+ memfd_folio_has_extra_refs(folio)) {
/*
* On the last scan, we clean up all those tags
* we inserted; but make a note that we still
- * found pages pinned.
+ * found folios pinned.
*/
if (scan == LAST_SCAN)
error = -EBUSY;
@@ -118,8 +108,7 @@ static int memfd_wait_for_pins(struct address_space *mapping)
if (clear)
xas_clear_mark(&xas, MEMFD_TAG_PINNED);
- latency += cache_count;
- if (latency < XA_CHECK_SCHED)
+ if (++latency < XA_CHECK_SCHED)
continue;
latency = 0;
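/*
 * The ref-count test used above, by example (illustrative numbers): a folio
 * held only by the page cache (folio_nr_pages() references) plus its user
 * mappings (folio_mapcount()) satisfies
 *   folio_ref_count() - folio_mapcount() == folio_nr_pages()
 * and is left alone. Any additional pin -- e.g. GUP for direct-IO/AIO --
 * pushes the difference above nr_pages, so the folio is tagged
 * MEMFD_TAG_PINNED and waited on before SEAL_WRITE succeeds.
 */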
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 9e62a00b46..7751bd78fb 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1218,7 +1218,7 @@ static int me_huge_page(struct page_state *ps, struct page *p)
* subpages.
*/
folio_put(folio);
- if (__page_handle_poison(p) >= 0) {
+ if (__page_handle_poison(p) > 0) {
page_ref_inc(p);
res = MF_RECOVERED;
} else {
@@ -2097,7 +2097,7 @@ retry:
*/
if (res == 0) {
folio_unlock(folio);
- if (__page_handle_poison(p) >= 0) {
+ if (__page_handle_poison(p) > 0) {
page_ref_inc(p);
res = MF_RECOVERED;
} else {
@@ -2550,6 +2550,13 @@ int unpoison_memory(unsigned long pfn)
goto unlock_mutex;
}
+ if (is_huge_zero_page(&folio->page)) {
+ unpoison_pr_info("Unpoison: huge zero page is not supported %#lx\n",
+ pfn, &unpoison_rs);
+ ret = -EOPNOTSUPP;
+ goto unlock_mutex;
+ }
+
if (!PageHWPoison(p)) {
unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
pfn, &unpoison_rs);
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 5462d9e3c8..0537664620 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -39,7 +39,7 @@ static LIST_HEAD(memory_tiers);
static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
struct memory_dev_type *default_dram_type;
-static struct bus_type memory_tier_subsys = {
+static const struct bus_type memory_tier_subsys = {
.name = "memory_tiering",
.dev_name = "memory_tier",
};
@@ -359,6 +359,26 @@ static void disable_all_demotion_targets(void)
synchronize_rcu();
}
+static void dump_demotion_targets(void)
+{
+ int node;
+
+ for_each_node_state(node, N_MEMORY) {
+ struct memory_tier *memtier = __node_get_memory_tier(node);
+ nodemask_t preferred = node_demotion[node].preferred;
+
+ if (!memtier)
+ continue;
+
+ if (nodes_empty(preferred))
+ pr_info("Demotion targets for Node %d: null\n", node);
+ else
+ pr_info("Demotion targets for Node %d: preferred: %*pbl, fallback: %*pbl\n",
+ node, nodemask_pr_args(&preferred),
+ nodemask_pr_args(&memtier->lower_tier_mask));
+ }
+}
+
/*
* Find an automatic demotion target for all memory
* nodes. Failing here is OK. It might just indicate
@@ -443,7 +463,7 @@ static void establish_demotion_targets(void)
* Now build the lower_tier mask for each node collecting node mask from
* all memory tier below it. This allows us to fallback demotion page
* allocation to a set of nodes that is closer the above selected
- * perferred node.
+ * preferred node.
*/
lower_tier = node_states[N_MEMORY];
list_for_each_entry(memtier, &memory_tiers, list) {
@@ -456,6 +476,8 @@ static void establish_demotion_targets(void)
nodes_andnot(lower_tier, lower_tier, tier_nodes);
memtier->lower_tier_mask = lower_tier;
}
+
+ dump_demotion_targets();
}
#else
diff --git a/mm/memory.c b/mm/memory.c
index 296e4f05e8..d2155ced45 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -806,9 +806,9 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
}
rss[MM_SWAPENTS]++;
} else if (is_migration_entry(entry)) {
- page = pfn_swap_entry_to_page(entry);
+ folio = pfn_swap_entry_folio(entry);
- rss[mm_counter(page)]++;
+ rss[mm_counter(folio)]++;
if (!is_readable_migration_entry(entry) &&
is_cow_mapping(vm_flags)) {
@@ -840,7 +840,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* keep things as they are.
*/
folio_get(folio);
- rss[mm_counter(page)]++;
+ rss[mm_counter(folio)]++;
/* Cannot fail as these pages cannot get pinned. */
folio_try_dup_anon_rmap_pte(folio, page, src_vma);
@@ -930,68 +930,111 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
return 0;
}
+static __always_inline void __copy_present_ptes(struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte,
+ pte_t pte, unsigned long addr, int nr)
+{
+ struct mm_struct *src_mm = src_vma->vm_mm;
+
+ /* If it's a COW mapping, write protect it both processes. */
+ if (is_cow_mapping(src_vma->vm_flags) && pte_write(pte)) {
+ wrprotect_ptes(src_mm, addr, src_pte, nr);
+ pte = pte_wrprotect(pte);
+ }
+
+ /* If it's a shared mapping, mark it clean in the child. */
+ if (src_vma->vm_flags & VM_SHARED)
+ pte = pte_mkclean(pte);
+ pte = pte_mkold(pte);
+
+ if (!userfaultfd_wp(dst_vma))
+ pte = pte_clear_uffd_wp(pte);
+
+ set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr);
+}
+
/*
- * Copy one pte. Returns 0 if succeeded, or -EAGAIN if one preallocated page
- * is required to copy this pte.
+ * Copy one present PTE, trying to batch-process subsequent PTEs that map
+ * consecutive pages of the same folio by copying them as well.
+ *
+ * Returns -EAGAIN if one preallocated page is required to copy the next PTE.
+ * Otherwise, returns the number of copied PTEs (at least 1).
*/
static inline int
-copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
- pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
- struct folio **prealloc)
+copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+ pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr,
+ int max_nr, int *rss, struct folio **prealloc)
{
- struct mm_struct *src_mm = src_vma->vm_mm;
- unsigned long vm_flags = src_vma->vm_flags;
- pte_t pte = ptep_get(src_pte);
struct page *page;
struct folio *folio;
+ bool any_writable;
+ fpb_t flags = 0;
+ int err, nr;
page = vm_normal_page(src_vma, addr, pte);
- if (page)
- folio = page_folio(page);
- if (page && folio_test_anon(folio)) {
+ if (unlikely(!page))
+ goto copy_pte;
+
+ folio = page_folio(page);
+
+ /*
+ * If we likely have to copy, just don't bother with batching. Make
+ * sure that the common "small folio" case is as fast as possible
+ * by keeping the batching logic separate.
+ */
+ if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) {
+ if (src_vma->vm_flags & VM_SHARED)
+ flags |= FPB_IGNORE_DIRTY;
+ if (!vma_soft_dirty_enabled(src_vma))
+ flags |= FPB_IGNORE_SOFT_DIRTY;
+
+ nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags,
+ &any_writable);
+ folio_ref_add(folio, nr);
+ if (folio_test_anon(folio)) {
+ if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page,
+ nr, src_vma))) {
+ folio_ref_sub(folio, nr);
+ return -EAGAIN;
+ }
+ rss[MM_ANONPAGES] += nr;
+ VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
+ } else {
+ folio_dup_file_rmap_ptes(folio, page, nr);
+ rss[mm_counter_file(folio)] += nr;
+ }
+ if (any_writable)
+ pte = pte_mkwrite(pte, src_vma);
+ __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte,
+ addr, nr);
+ return nr;
+ }
+
+ folio_get(folio);
+ if (folio_test_anon(folio)) {
/*
* If this page may have been pinned by the parent process,
* copy the page immediately for the child so that we'll always
* guarantee the pinned page won't be randomly replaced in the
* future.
*/
- folio_get(folio);
if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, src_vma))) {
/* Page may be pinned, we have to copy. */
folio_put(folio);
- return copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
- addr, rss, prealloc, page);
+ err = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
+ addr, rss, prealloc, page);
+ return err ? err : 1;
}
rss[MM_ANONPAGES]++;
- } else if (page) {
- folio_get(folio);
+ VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
+ } else {
folio_dup_file_rmap_pte(folio, page);
- rss[mm_counter_file(page)]++;
- }
-
- /*
- * If it's a COW mapping, write protect it both
- * in the parent and the child
- */
- if (is_cow_mapping(vm_flags) && pte_write(pte)) {
- ptep_set_wrprotect(src_mm, addr, src_pte);
- pte = pte_wrprotect(pte);
+ rss[mm_counter_file(folio)]++;
}
- VM_BUG_ON(page && folio_test_anon(folio) && PageAnonExclusive(page));
- /*
- * If it's a shared mapping, mark it clean in
- * the child
- */
- if (vm_flags & VM_SHARED)
- pte = pte_mkclean(pte);
- pte = pte_mkold(pte);
-
- if (!userfaultfd_wp(dst_vma))
- pte = pte_clear_uffd_wp(pte);
-
- set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
- return 0;
+copy_pte:
+ __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, 1);
+ return 1;
}
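/*
 * Return convention used above and consumed by copy_pte_range() below:
 * -EAGAIN still means "drop the locks and retry with a preallocated page",
 * while a positive value is the number of PTEs handled, letting the caller
 * advance dst_pte/src_pte/addr by 'nr' entries per iteration.
 */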
static inline struct folio *folio_prealloc(struct mm_struct *src_mm,
@@ -1028,10 +1071,11 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
pte_t *src_pte, *dst_pte;
pte_t ptent;
spinlock_t *src_ptl, *dst_ptl;
- int progress, ret = 0;
+ int progress, max_nr, ret = 0;
int rss[NR_MM_COUNTERS];
swp_entry_t entry = (swp_entry_t){0};
struct folio *prealloc = NULL;
+ int nr;
again:
progress = 0;
@@ -1062,6 +1106,8 @@ again:
arch_enter_lazy_mmu_mode();
do {
+ nr = 1;
+
/*
* We are holding two locks at this point - either of them
* could generate latencies in another task on another CPU.
@@ -1091,6 +1137,8 @@ again:
progress += 8;
continue;
}
+ ptent = ptep_get(src_pte);
+ VM_WARN_ON_ONCE(!pte_present(ptent));
/*
* Device exclusive entry restored, continue by copying
@@ -1098,9 +1146,10 @@ again:
*/
WARN_ON_ONCE(ret != -ENOENT);
}
- /* copy_present_pte() will clear `*prealloc' if consumed */
- ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
- addr, rss, &prealloc);
+ /* copy_present_ptes() will clear `*prealloc' if consumed */
+ max_nr = (end - addr) / PAGE_SIZE;
+ ret = copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte,
+ ptent, addr, max_nr, rss, &prealloc);
/*
* If we need a pre-allocated page for this pte, drop the
* locks, allocate, and try again.
@@ -1117,8 +1166,10 @@ again:
folio_put(prealloc);
prealloc = NULL;
}
- progress += 8;
- } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
+ nr = ret;
+ progress += 8 * nr;
+ } while (dst_pte += nr, src_pte += nr, addr += PAGE_SIZE * nr,
+ addr != end);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(orig_src_pte, src_ptl);
@@ -1139,7 +1190,7 @@ again:
prealloc = folio_prealloc(src_mm, src_vma, addr, false);
if (!prealloc)
return -ENOMEM;
- } else if (ret) {
+ } else if (ret < 0) {
VM_WARN_ON_ONCE(1);
}
@@ -1369,19 +1420,16 @@ static inline bool should_zap_cows(struct zap_details *details)
return details->even_cows;
}
-/* Decides whether we should zap this page with the page pointer specified */
-static inline bool should_zap_page(struct zap_details *details, struct page *page)
+/* Decides whether we should zap this folio with the folio pointer specified */
+static inline bool should_zap_folio(struct zap_details *details,
+ struct folio *folio)
{
- /* If we can make a decision without *page.. */
+ /* If we can make a decision without *folio.. */
if (should_zap_cows(details))
return true;
- /* E.g. the caller passes NULL for the case of a zero page */
- if (!page)
- return true;
-
- /* Otherwise we should only zap non-anon pages */
- return !PageAnon(page);
+ /* Otherwise we should only zap non-anon folios */
+ return !folio_test_anon(folio);
}
static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
@@ -1398,7 +1446,7 @@ static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
*/
static inline void
zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
- unsigned long addr, pte_t *pte,
+ unsigned long addr, pte_t *pte, int nr,
struct zap_details *details, pte_t pteval)
{
/* Zap on anonymous always means dropping everything */
@@ -1408,7 +1456,113 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
if (zap_drop_file_uffd_wp(details))
return;
- pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
+ for (;;) {
+ /* the PFN in the PTE is irrelevant. */
+ pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
+ if (--nr == 0)
+ break;
+ pte++;
+ addr += PAGE_SIZE;
+ }
+}
+
+static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, struct folio *folio,
+ struct page *page, pte_t *pte, pte_t ptent, unsigned int nr,
+ unsigned long addr, struct zap_details *details, int *rss,
+ bool *force_flush, bool *force_break)
+{
+ struct mm_struct *mm = tlb->mm;
+ bool delay_rmap = false;
+
+ if (!folio_test_anon(folio)) {
+ ptent = get_and_clear_full_ptes(mm, addr, pte, nr, tlb->fullmm);
+ if (pte_dirty(ptent)) {
+ folio_mark_dirty(folio);
+ if (tlb_delay_rmap(tlb)) {
+ delay_rmap = true;
+ *force_flush = true;
+ }
+ }
+ if (pte_young(ptent) && likely(vma_has_recency(vma)))
+ folio_mark_accessed(folio);
+ rss[mm_counter(folio)] -= nr;
+ } else {
+ /* We don't need up-to-date accessed/dirty bits. */
+ clear_full_ptes(mm, addr, pte, nr, tlb->fullmm);
+ rss[MM_ANONPAGES] -= nr;
+ }
+ /* Checking a single PTE in a batch is sufficient. */
+ arch_check_zapped_pte(vma, ptent);
+ tlb_remove_tlb_entries(tlb, pte, nr, addr);
+ if (unlikely(userfaultfd_pte_wp(vma, ptent)))
+ zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details,
+ ptent);
+
+ if (!delay_rmap) {
+ folio_remove_rmap_ptes(folio, page, nr, vma);
+
+ /* Only sanity-check the first page in a batch. */
+ if (unlikely(page_mapcount(page) < 0))
+ print_bad_pte(vma, addr, ptent, page);
+ }
+ if (unlikely(__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))) {
+ *force_flush = true;
+ *force_break = true;
+ }
+}
+
+/*
+ * Zap or skip at least one present PTE, trying to batch-process subsequent
+ * PTEs that map consecutive pages of the same folio.
+ *
+ * Returns the number of processed (skipped or zapped) PTEs (at least 1).
+ */
+static inline int zap_present_ptes(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pte_t *pte, pte_t ptent,
+ unsigned int max_nr, unsigned long addr,
+ struct zap_details *details, int *rss, bool *force_flush,
+ bool *force_break)
+{
+ const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
+ struct mm_struct *mm = tlb->mm;
+ struct folio *folio;
+ struct page *page;
+ int nr;
+
+ page = vm_normal_page(vma, addr, ptent);
+ if (!page) {
+ /* We don't need up-to-date accessed/dirty bits. */
+ ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
+ arch_check_zapped_pte(vma, ptent);
+ tlb_remove_tlb_entry(tlb, pte, addr);
+ if (userfaultfd_pte_wp(vma, ptent))
+ zap_install_uffd_wp_if_needed(vma, addr, pte, 1,
+ details, ptent);
+ ksm_might_unmap_zero_page(mm, ptent);
+ return 1;
+ }
+
+ folio = page_folio(page);
+ if (unlikely(!should_zap_folio(details, folio)))
+ return 1;
+
+ /*
+ * Make sure that the common "small folio" case is as fast as possible
+ * by keeping the batching logic separate.
+ */
+ if (unlikely(folio_test_large(folio) && max_nr != 1)) {
+ nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags,
+ NULL);
+
+ zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr,
+ addr, details, rss, force_flush,
+ force_break);
+ return nr;
+ }
+ zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, 1, addr,
+ details, rss, force_flush, force_break);
+ return 1;
}
static unsigned long zap_pte_range(struct mmu_gather *tlb,
@@ -1416,13 +1570,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,
struct zap_details *details)
{
+ bool force_flush = false, force_break = false;
struct mm_struct *mm = tlb->mm;
- int force_flush = 0;
int rss[NR_MM_COUNTERS];
spinlock_t *ptl;
pte_t *start_pte;
pte_t *pte;
swp_entry_t entry;
+ int nr;
tlb_change_page_size(tlb, PAGE_SIZE);
init_rss_vec(rss);
@@ -1436,7 +1591,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
pte_t ptent = ptep_get(pte);
struct folio *folio;
struct page *page;
+ int max_nr;
+ nr = 1;
if (pte_none(ptent))
continue;
@@ -1444,44 +1601,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
break;
if (pte_present(ptent)) {
- unsigned int delay_rmap;
-
- page = vm_normal_page(vma, addr, ptent);
- if (unlikely(!should_zap_page(details, page)))
- continue;
- ptent = ptep_get_and_clear_full(mm, addr, pte,
- tlb->fullmm);
- arch_check_zapped_pte(vma, ptent);
- tlb_remove_tlb_entry(tlb, pte, addr);
- zap_install_uffd_wp_if_needed(vma, addr, pte, details,
- ptent);
- if (unlikely(!page)) {
- ksm_might_unmap_zero_page(mm, ptent);
- continue;
- }
-
- folio = page_folio(page);
- delay_rmap = 0;
- if (!folio_test_anon(folio)) {
- if (pte_dirty(ptent)) {
- folio_mark_dirty(folio);
- if (tlb_delay_rmap(tlb)) {
- delay_rmap = 1;
- force_flush = 1;
- }
- }
- if (pte_young(ptent) && likely(vma_has_recency(vma)))
- folio_mark_accessed(folio);
- }
- rss[mm_counter(page)]--;
- if (!delay_rmap) {
- folio_remove_rmap_pte(folio, page, vma);
- if (unlikely(page_mapcount(page) < 0))
- print_bad_pte(vma, addr, ptent, page);
- }
- if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) {
- force_flush = 1;
- addr += PAGE_SIZE;
+ max_nr = (end - addr) / PAGE_SIZE;
+ nr = zap_present_ptes(tlb, vma, pte, ptent, max_nr,
+ addr, details, rss, &force_flush,
+ &force_break);
+ if (unlikely(force_break)) {
+ addr += nr * PAGE_SIZE;
break;
}
continue;
@@ -1492,7 +1617,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
is_device_exclusive_entry(entry)) {
page = pfn_swap_entry_to_page(entry);
folio = page_folio(page);
- if (unlikely(!should_zap_page(details, page)))
+ if (unlikely(!should_zap_folio(details, folio)))
continue;
/*
* Both device private/exclusive mappings should only
@@ -1501,7 +1626,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
* see zap_install_uffd_wp_if_needed().
*/
WARN_ON_ONCE(!vma_is_anonymous(vma));
- rss[mm_counter(page)]--;
+ rss[mm_counter(folio)]--;
if (is_device_private_entry(entry))
folio_remove_rmap_pte(folio, page, vma);
folio_put(folio);
@@ -1513,10 +1638,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
if (unlikely(!free_swap_and_cache(entry)))
print_bad_pte(vma, addr, ptent, NULL);
} else if (is_migration_entry(entry)) {
- page = pfn_swap_entry_to_page(entry);
- if (!should_zap_page(details, page))
+ folio = pfn_swap_entry_folio(entry);
+ if (!should_zap_folio(details, folio))
continue;
- rss[mm_counter(page)]--;
+ rss[mm_counter(folio)]--;
} else if (pte_marker_entry_uffd_wp(entry)) {
/*
* For anon: always drop the marker; for file: only
@@ -1535,8 +1660,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
WARN_ON_ONCE(1);
}
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
- zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent);
- } while (pte++, addr += PAGE_SIZE, addr != end);
+ zap_install_uffd_wp_if_needed(vma, addr, pte, 1, details, ptent);
+ } while (pte += nr, addr += PAGE_SIZE * nr, addr != end);
add_mm_rss_vec(mm, rss);
arch_leave_lazy_mmu_mode();
@@ -1870,7 +1995,7 @@ static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
return -EBUSY;
/* Ok, finally just insert the thing.. */
folio_get(folio);
- inc_mm_counter(vma->vm_mm, mm_counter_file(page));
+ inc_mm_counter(vma->vm_mm, mm_counter_file(folio));
folio_add_file_rmap_pte(folio, page, vma);
set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot));
return 0;
@@ -3081,7 +3206,7 @@ static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf)
return VM_FAULT_RETRY;
}
-static vm_fault_t vmf_anon_prepare(struct vm_fault *vmf)
+vm_fault_t vmf_anon_prepare(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
@@ -3175,7 +3300,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
if (old_folio) {
if (!folio_test_anon(old_folio)) {
- dec_mm_counter(mm, mm_counter_file(&old_folio->page));
+ dec_mm_counter(mm, mm_counter_file(old_folio));
inc_mm_counter(mm, MM_ANONPAGES);
}
} else {
@@ -3253,7 +3378,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
folio_put(new_folio);
if (old_folio) {
if (page_copied)
- free_swap_cache(&old_folio->page);
+ free_swap_cache(old_folio);
folio_put(old_folio);
}
@@ -3376,6 +3501,16 @@ static bool wp_can_reuse_anon_folio(struct folio *folio,
struct vm_area_struct *vma)
{
/*
+ * We could currently only reuse a subpage of a large folio if no
+	 * other subpages of the large folio are still mapped. However,
+ * let's just consistently not reuse subpages even if we could
+ * reuse in that scenario, and give back a large folio a bit
+ * sooner.
+ */
+ if (folio_test_large(folio))
+ return false;
+
+ /*
* We have to verify under folio lock: these early checks are
* just an optimization to avoid locking the folio and freeing
* the swapcache if there is little hope that we can reuse.
@@ -4170,8 +4305,8 @@ static bool pte_range_none(pte_t *pte, int nr_pages)
static struct folio *alloc_anon_folio(struct vm_fault *vmf)
{
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct vm_area_struct *vma = vmf->vma;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
unsigned long orders;
struct folio *folio;
unsigned long addr;
@@ -4223,15 +4358,21 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
folio = vma_alloc_folio(gfp, order, vma, addr, true);
if (folio) {
+ if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
+ folio_put(folio);
+ goto next;
+ }
+ folio_throttle_swaprate(folio, gfp);
clear_huge_page(&folio->page, vmf->address, 1 << order);
return folio;
}
+next:
order = next_order(&orders, order);
}
fallback:
#endif
- return vma_alloc_zeroed_movable_folio(vmf->vma, vmf->address);
+ return folio_prealloc(vma->vm_mm, vma, vmf->address, true);
}
/*
@@ -4298,10 +4439,6 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
nr_pages = folio_nr_pages(folio);
addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
- if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL))
- goto oom_free_page;
- folio_throttle_swaprate(folio, GFP_KERNEL);
-
/*
* The memory barrier inside __folio_mark_uptodate makes sure that
* preceding stores to the page contents become visible before
@@ -4355,8 +4492,6 @@ unlock:
release:
folio_put(folio);
goto unlock;
-oom_free_page:
- folio_put(folio);
oom:
return VM_FAULT_OOM;
}
@@ -4480,7 +4615,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
if (write)
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
+ add_mm_counter(vma->vm_mm, mm_counter_file(folio), HPAGE_PMD_NR);
folio_add_file_rmap_pmd(folio, page, vma);
/*
@@ -4543,7 +4678,7 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio,
folio_add_new_anon_rmap(folio, vma, addr);
folio_add_lru_vma(folio, vma);
} else {
- add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
+ add_mm_counter(vma->vm_mm, mm_counter_file(folio), nr);
folio_add_file_rmap_ptes(folio, page, nr, vma);
}
set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr);
@@ -4653,7 +4788,8 @@ static int fault_around_bytes_set(void *data, u64 val)
* The minimum value is 1 page, however this results in no fault-around
* at all. See should_fault_around().
*/
- fault_around_pages = max(rounddown_pow_of_two(val) >> PAGE_SHIFT, 1UL);
+ val = max(val, PAGE_SIZE);
+ fault_around_pages = rounddown_pow_of_two(val) >> PAGE_SHIFT;
return 0;
}
@@ -4928,18 +5064,18 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
int flags = 0;
/*
- * The "pte" at this point cannot be used safely without
- * validation through pte_unmap_same(). It's of NUMA type but
- * the pfn may be screwed if the read is non atomic.
+ * The pte cannot be used safely until we verify, while holding the page
+ * table lock, that its contents have not changed during fault handling.
*/
spin_lock(vmf->ptl);
- if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
+ /* Read the live PTE from the page tables: */
+ old_pte = ptep_get(vmf->pte);
+
+ if (unlikely(!pte_same(old_pte, vmf->orig_pte))) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
goto out;
}
- /* Get the normal PTE */
- old_pte = ptep_get(vmf->pte);
pte = pte_modify(old_pte, vma->vm_page_prot);
/*
@@ -6167,7 +6303,7 @@ static int clear_subpage(unsigned long addr, int idx, void *arg)
{
struct page *page = arg;
- clear_user_highpage(page + idx, addr);
+ clear_user_highpage(nth_page(page, idx), addr);
return 0;
}
@@ -6217,10 +6353,11 @@ struct copy_subpage_arg {
static int copy_subpage(unsigned long addr, int idx, void *arg)
{
struct copy_subpage_arg *copy_arg = arg;
+ struct page *dst = nth_page(copy_arg->dst, idx);
+ struct page *src = nth_page(copy_arg->src, idx);
- if (copy_mc_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
- addr, copy_arg->vma)) {
- memory_failure_queue(page_to_pfn(copy_arg->src + idx), 0);
+ if (copy_mc_user_highpage(dst, src, addr, copy_arg->vma)) {
+ memory_failure_queue(page_to_pfn(src), 0);
return -EHWPOISON;
}
return 0;
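
The mm/memory.c changes above replace the per-PTE copy and zap loops with batch variants that process runs of PTEs mapping consecutive pages of the same large folio, then advance the loop by the returned count. As a rough illustration only (hypothetical types and helper names, not the kernel's folio_pte_batch()), the batching idea reduces to "scan forward while the next entry maps the next page of the same folio":

/*
 * Simplified user-space model of the PTE batching used above. "fake_pte"
 * stands in for a PTE; folio_id + idx stand in for the folio and the
 * page's index within it. Hypothetical structures, not kernel APIs.
 */
struct fake_pte {
	int folio_id;	/* which folio this entry maps */
	int idx;	/* index of the mapped page within that folio */
};

/* Count entries, starting at pte[0], mapping consecutive pages of one folio. */
static int fake_pte_batch(const struct fake_pte *pte, int max_nr)
{
	int nr = 1;

	while (nr < max_nr &&
	       pte[nr].folio_id == pte[0].folio_id &&
	       pte[nr].idx == pte[0].idx + nr)
		nr++;
	return nr;
}

The caller then advances the PTE pointer and address by nr, which is exactly how the do/while loops in copy_pte_range() and zap_pte_range() were changed above.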
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 21890994c1..a444e2d7dd 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1087,7 +1087,7 @@ void adjust_present_page_count(struct page *page, struct memory_group *group,
}
int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
- struct zone *zone)
+ struct zone *zone, bool mhp_off_inaccessible)
{
unsigned long end_pfn = pfn + nr_pages;
int ret, i;
@@ -1096,6 +1096,15 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
if (ret)
return ret;
+ /*
+ * Memory block is accessible at this stage and hence poison the struct
+ * pages now. If the memory block is accessible during memory hotplug
+	 * addition phase, then page poisoning is already performed in
+ * sparse_add_section().
+ */
+ if (mhp_off_inaccessible)
+ page_init_poison(pfn_to_page(pfn), sizeof(struct page) * nr_pages);
+
move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
for (i = 0; i < nr_pages; i++)
@@ -1328,7 +1337,7 @@ static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size)
}
#endif
-static bool mhp_supports_memmap_on_memory(unsigned long size)
+bool mhp_supports_memmap_on_memory(void)
{
unsigned long vmemmap_size = memory_block_memmap_size();
unsigned long memmap_pages = memory_block_memmap_on_memory_pages();
@@ -1337,17 +1346,11 @@ static bool mhp_supports_memmap_on_memory(unsigned long size)
* Besides having arch support and the feature enabled at runtime, we
* need a few more assumptions to hold true:
*
- * a) We span a single memory block: memory onlining/offlining happens
- * in memory block granularity. We don't want the vmemmap of online
- * memory blocks to reside on offline memory blocks. In the future,
- * we might want to support variable-sized memory blocks to make the
- * feature more versatile.
- *
- * b) The vmemmap pages span complete PMDs: We don't want vmemmap code
+ * a) The vmemmap pages span complete PMDs: We don't want vmemmap code
* to populate memory from the altmap for unrelated parts (i.e.,
* other memory blocks)
*
- * c) The vmemmap pages (and thereby the pages that will be exposed to
+ * b) The vmemmap pages (and thereby the pages that will be exposed to
* the buddy) have to cover full pageblocks: memory onlining/offlining
* code requires applicable ranges to be page-aligned, for example, to
* set the migratetypes properly.
@@ -1359,7 +1362,7 @@ static bool mhp_supports_memmap_on_memory(unsigned long size)
* altmap as an alternative source of memory, and we do not exactly
* populate a single PMD.
*/
- if (!mhp_memmap_on_memory() || size != memory_block_size_bytes())
+ if (!mhp_memmap_on_memory())
return false;
/*
@@ -1382,6 +1385,7 @@ static bool mhp_supports_memmap_on_memory(unsigned long size)
return arch_supports_memmap_on_memory(vmemmap_size);
}
+EXPORT_SYMBOL_GPL(mhp_supports_memmap_on_memory);
static void __ref remove_memory_blocks_and_altmaps(u64 start, u64 size)
{
@@ -1415,7 +1419,7 @@ static void __ref remove_memory_blocks_and_altmaps(u64 start, u64 size)
}
static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group,
- u64 start, u64 size)
+ u64 start, u64 size, mhp_t mhp_flags)
{
unsigned long memblock_size = memory_block_size_bytes();
u64 cur_start;
@@ -1431,6 +1435,8 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group,
};
mhp_altmap.free = memory_block_memmap_on_memory_pages();
+ if (mhp_flags & MHP_OFFLINE_INACCESSIBLE)
+ mhp_altmap.inaccessible = true;
params.altmap = kmemdup(&mhp_altmap, sizeof(struct vmem_altmap),
GFP_KERNEL);
if (!params.altmap) {
@@ -1515,8 +1521,8 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
* Self hosted memmap array
*/
if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) &&
- mhp_supports_memmap_on_memory(memory_block_size_bytes())) {
- ret = create_altmaps_and_memory_blocks(nid, group, start, size);
+ mhp_supports_memmap_on_memory()) {
+ ret = create_altmaps_and_memory_blocks(nid, group, start, size, mhp_flags);
if (ret)
goto error;
} else {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 10a590ee1c..0fe77738d9 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -19,6 +19,13 @@
 * for anonymous memory. For process policy a process counter
* is used.
*
+ * weighted interleave
+ * Allocate memory interleaved over a set of nodes based on
+ * a set of weights (per-node), with normal fallback if it
+ * fails. Otherwise operates the same as interleave.
+ * Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
+ * on node 0 for every 1 page allocated on node 1.
+ *
* bind Only allocate memory on a specific set of nodes,
* no fallback.
* FIXME: memory is allocated starting with the first node
@@ -131,6 +138,32 @@ static struct mempolicy default_policy = {
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
+/*
+ * iw_table is the sysfs-set interleave weight table; a value of 0 denotes
+ * that the system-default value should be used. A NULL iw_table also denotes
+ * that system-default values should be used. Until the system-default table
+ * is implemented, the system-default is always 1.
+ *
+ * iw_table is RCU protected
+ */
+static u8 __rcu *iw_table;
+static DEFINE_MUTEX(iw_table_lock);
+
+static u8 get_il_weight(int node)
+{
+ u8 *table;
+ u8 weight;
+
+ rcu_read_lock();
+ table = rcu_dereference(iw_table);
+ /* if no iw_table, use system default */
+ weight = table ? table[node] : 1;
+ /* if value in iw_table is 0, use system default */
+ weight = weight ? weight : 1;
+ rcu_read_unlock();
+ return weight;
+}
+
/**
* numa_nearest_node - Find nearest node by state
* @node: Node id to start the search
@@ -415,6 +448,10 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
.create = mpol_new_nodemask,
.rebind = mpol_rebind_preferred,
},
+ [MPOL_WEIGHTED_INTERLEAVE] = {
+ .create = mpol_new_nodemask,
+ .rebind = mpol_rebind_nodemask,
+ },
};
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
@@ -654,7 +691,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
{
struct vm_area_struct *next, *vma = walk->vma;
struct queue_pages *qp = walk->private;
- unsigned long endvma = vma->vm_end;
unsigned long flags = qp->flags;
/* range check first */
@@ -682,9 +718,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
!(flags & MPOL_MF_STRICT))
return 1;
- if (endvma > end)
- endvma = end;
-
/*
* Check page nodes, and queue pages to move, in the current vma.
* But if no moving, and no strict checking, the scan can be skipped.
@@ -836,8 +869,11 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
old = current->mempolicy;
current->mempolicy = new;
- if (new && new->mode == MPOL_INTERLEAVE)
+ if (new && (new->mode == MPOL_INTERLEAVE ||
+ new->mode == MPOL_WEIGHTED_INTERLEAVE)) {
current->il_prev = MAX_NUMNODES-1;
+ current->il_weight = 0;
+ }
task_unlock(current);
mpol_put(old);
ret = 0;
@@ -862,6 +898,7 @@ static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
case MPOL_INTERLEAVE:
case MPOL_PREFERRED:
case MPOL_PREFERRED_MANY:
+ case MPOL_WEIGHTED_INTERLEAVE:
*nodes = pol->nodes;
break;
case MPOL_LOCAL:
@@ -946,6 +983,13 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
} else if (pol == current->mempolicy &&
pol->mode == MPOL_INTERLEAVE) {
*policy = next_node_in(current->il_prev, pol->nodes);
+ } else if (pol == current->mempolicy &&
+ pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
+ if (current->il_weight)
+ *policy = current->il_prev;
+ else
+ *policy = next_node_in(current->il_prev,
+ pol->nodes);
} else {
err = -EINVAL;
goto out;
@@ -1310,30 +1354,32 @@ static long do_mbind(unsigned long start, unsigned long len,
* VMAs, the nodes will still be interleaved from the targeted
* nodemask, but one by one may be selected differently.
*/
- if (new->mode == MPOL_INTERLEAVE) {
- struct page *page;
+ if (new->mode == MPOL_INTERLEAVE ||
+ new->mode == MPOL_WEIGHTED_INTERLEAVE) {
+ struct folio *folio;
unsigned int order;
unsigned long addr = -EFAULT;
- list_for_each_entry(page, &pagelist, lru) {
- if (!PageKsm(page))
+ list_for_each_entry(folio, &pagelist, lru) {
+ if (!folio_test_ksm(folio))
break;
}
- if (!list_entry_is_head(page, &pagelist, lru)) {
+ if (!list_entry_is_head(folio, &pagelist, lru)) {
vma_iter_init(&vmi, mm, start);
for_each_vma_range(vmi, vma, end) {
- addr = page_address_in_vma(page, vma);
+ addr = page_address_in_vma(
+ folio_page(folio, 0), vma);
if (addr != -EFAULT)
break;
}
}
if (addr != -EFAULT) {
- order = compound_order(page);
+ order = folio_order(folio);
/* We already know the pol, but not the ilx */
mpol_cond_put(get_vma_policy(vma, addr, order,
&mmpol.ilx));
/* Set base from which to increment by index */
- mmpol.ilx -= page->index >> order;
+ mmpol.ilx -= folio->index >> order;
}
}
}
@@ -1758,7 +1804,8 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
* @vma: virtual memory area whose policy is sought
* @addr: address in @vma for shared policy lookup
* @order: 0, or appropriate huge_page_order for interleaving
- * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE
+ * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or
+ * MPOL_WEIGHTED_INTERLEAVE
*
* Returns effective policy for a VMA at specified address.
* Falls back to current->mempolicy or system default policy, as necessary.
@@ -1775,7 +1822,8 @@ struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
pol = __get_vma_policy(vma, addr, ilx);
if (!pol)
pol = get_task_policy(current);
- if (pol->mode == MPOL_INTERLEAVE) {
+ if (pol->mode == MPOL_INTERLEAVE ||
+ pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
*ilx += vma->vm_pgoff >> order;
*ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
}
@@ -1825,12 +1873,40 @@ bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
return zone >= dynamic_policy_zone;
}
+static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
+{
+ unsigned int node;
+ unsigned int cpuset_mems_cookie;
+
+retry:
+	/* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
+ cpuset_mems_cookie = read_mems_allowed_begin();
+ node = current->il_prev;
+ if (!current->il_weight || !node_isset(node, policy->nodes)) {
+ node = next_node_in(node, policy->nodes);
+ if (read_mems_allowed_retry(cpuset_mems_cookie))
+ goto retry;
+ if (node == MAX_NUMNODES)
+ return node;
+ current->il_prev = node;
+ current->il_weight = get_il_weight(node);
+ }
+ current->il_weight--;
+ return node;
+}
+
/* Do dynamic interleaving for a process */
static unsigned int interleave_nodes(struct mempolicy *policy)
{
unsigned int nid;
+ unsigned int cpuset_mems_cookie;
+
+ /* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
+ do {
+ cpuset_mems_cookie = read_mems_allowed_begin();
+ nid = next_node_in(current->il_prev, policy->nodes);
+ } while (read_mems_allowed_retry(cpuset_mems_cookie));
- nid = next_node_in(current->il_prev, policy->nodes);
if (nid < MAX_NUMNODES)
current->il_prev = nid;
return nid;
@@ -1859,6 +1935,9 @@ unsigned int mempolicy_slab_node(void)
case MPOL_INTERLEAVE:
return interleave_nodes(policy);
+ case MPOL_WEIGHTED_INTERLEAVE:
+ return weighted_interleave_nodes(policy);
+
case MPOL_BIND:
case MPOL_PREFERRED_MANY:
{
@@ -1883,6 +1962,59 @@ unsigned int mempolicy_slab_node(void)
}
}
+static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
+ nodemask_t *mask)
+{
+ /*
+ * barrier stabilizes the nodemask locally so that it can be iterated
+ * over safely without concern for changes. Allocators validate node
+ * selection does not violate mems_allowed, so this is safe.
+ */
+ barrier();
+ memcpy(mask, &pol->nodes, sizeof(nodemask_t));
+ barrier();
+ return nodes_weight(*mask);
+}
+
+static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
+{
+ nodemask_t nodemask;
+ unsigned int target, nr_nodes;
+ u8 *table;
+ unsigned int weight_total = 0;
+ u8 weight;
+ int nid;
+
+ nr_nodes = read_once_policy_nodemask(pol, &nodemask);
+ if (!nr_nodes)
+ return numa_node_id();
+
+ rcu_read_lock();
+ table = rcu_dereference(iw_table);
+ /* calculate the total weight */
+ for_each_node_mask(nid, nodemask) {
+ /* detect system default usage */
+ weight = table ? table[nid] : 1;
+ weight = weight ? weight : 1;
+ weight_total += weight;
+ }
+
+ /* Calculate the node offset based on totals */
+ target = ilx % weight_total;
+ nid = first_node(nodemask);
+ while (target) {
+ /* detect system default usage */
+ weight = table ? table[nid] : 1;
+ weight = weight ? weight : 1;
+ if (target < weight)
+ break;
+ target -= weight;
+ nid = next_node_in(nid, nodemask);
+ }
+ rcu_read_unlock();
+ return nid;
+}
+
/*
* Do static interleaving for interleave index @ilx. Returns the ilx'th
* node in pol->nodes (starting from ilx=0), wrapping around if ilx
@@ -1890,20 +2022,12 @@ unsigned int mempolicy_slab_node(void)
*/
static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
{
- nodemask_t nodemask = pol->nodes;
+ nodemask_t nodemask;
unsigned int target, nnodes;
int i;
int nid;
- /*
- * The barrier will stabilize the nodemask in a register or on
- * the stack so that it will stop changing under the code.
- *
- * Between first_node() and next_node(), pol->nodes could be changed
- * by other threads. So we put pol->nodes in a local stack.
- */
- barrier();
- nnodes = nodes_weight(nodemask);
+ nnodes = read_once_policy_nodemask(pol, &nodemask);
if (!nnodes)
return numa_node_id();
target = ilx % nnodes;
@@ -1951,6 +2075,11 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
*nid = (ilx == NO_INTERLEAVE_INDEX) ?
interleave_nodes(pol) : interleave_nid(pol, ilx);
break;
+ case MPOL_WEIGHTED_INTERLEAVE:
+ *nid = (ilx == NO_INTERLEAVE_INDEX) ?
+ weighted_interleave_nodes(pol) :
+ weighted_interleave_nid(pol, ilx);
+ break;
}
return nodemask;
@@ -2012,6 +2141,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
case MPOL_PREFERRED_MANY:
case MPOL_BIND:
case MPOL_INTERLEAVE:
+ case MPOL_WEIGHTED_INTERLEAVE:
*mask = mempolicy->nodes;
break;
@@ -2112,6 +2242,7 @@ struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
* node in its nodemask, we allocate the standard way.
*/
if (pol->mode != MPOL_INTERLEAVE &&
+ pol->mode != MPOL_WEIGHTED_INTERLEAVE &&
(!nodemask || node_isset(nid, *nodemask))) {
/*
* First, try to allocate THP only on local node, but
@@ -2247,6 +2378,121 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
return total_allocated;
}
+static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
+ struct mempolicy *pol, unsigned long nr_pages,
+ struct page **page_array)
+{
+ struct task_struct *me = current;
+ unsigned int cpuset_mems_cookie;
+ unsigned long total_allocated = 0;
+ unsigned long nr_allocated = 0;
+ unsigned long rounds;
+ unsigned long node_pages, delta;
+ u8 *table, *weights, weight;
+ unsigned int weight_total = 0;
+ unsigned long rem_pages = nr_pages;
+ nodemask_t nodes;
+ int nnodes, node;
+ int resume_node = MAX_NUMNODES - 1;
+ u8 resume_weight = 0;
+ int prev_node;
+ int i;
+
+ if (!nr_pages)
+ return 0;
+
+ /* read the nodes onto the stack, retry if done during rebind */
+ do {
+ cpuset_mems_cookie = read_mems_allowed_begin();
+ nnodes = read_once_policy_nodemask(pol, &nodes);
+ } while (read_mems_allowed_retry(cpuset_mems_cookie));
+
+ /* if the nodemask has become invalid, we cannot do anything */
+ if (!nnodes)
+ return 0;
+
+ /* Continue allocating from most recent node and adjust the nr_pages */
+ node = me->il_prev;
+ weight = me->il_weight;
+ if (weight && node_isset(node, nodes)) {
+ node_pages = min(rem_pages, weight);
+ nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
+ NULL, page_array);
+ page_array += nr_allocated;
+ total_allocated += nr_allocated;
+ /* if that's all the pages, no need to interleave */
+ if (rem_pages <= weight) {
+ me->il_weight -= rem_pages;
+ return total_allocated;
+ }
+ /* Otherwise we adjust remaining pages, continue from there */
+ rem_pages -= weight;
+ }
+ /* clear active weight in case of an allocation failure */
+ me->il_weight = 0;
+ prev_node = node;
+
+ /* create a local copy of node weights to operate on outside rcu */
+ weights = kzalloc(nr_node_ids, GFP_KERNEL);
+ if (!weights)
+ return total_allocated;
+
+ rcu_read_lock();
+ table = rcu_dereference(iw_table);
+ if (table)
+ memcpy(weights, table, nr_node_ids);
+ rcu_read_unlock();
+
+ /* calculate total, detect system default usage */
+ for_each_node_mask(node, nodes) {
+ if (!weights[node])
+ weights[node] = 1;
+ weight_total += weights[node];
+ }
+
+ /*
+ * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
+ * Track which node weighted interleave should resume from.
+ *
+ * if (rounds > 0) and (delta == 0), resume_node will always be
+ * the node following prev_node and its weight.
+ */
+ rounds = rem_pages / weight_total;
+ delta = rem_pages % weight_total;
+ resume_node = next_node_in(prev_node, nodes);
+ resume_weight = weights[resume_node];
+ for (i = 0; i < nnodes; i++) {
+ node = next_node_in(prev_node, nodes);
+ weight = weights[node];
+ node_pages = weight * rounds;
+ /* If a delta exists, add this node's portion of the delta */
+ if (delta > weight) {
+ node_pages += weight;
+ delta -= weight;
+ } else if (delta) {
+ /* when delta is depleted, resume from that node */
+ node_pages += delta;
+ resume_node = node;
+ resume_weight = weight - delta;
+ delta = 0;
+ }
+ /* node_pages can be 0 if an allocation fails and rounds == 0 */
+ if (!node_pages)
+ break;
+ nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
+ NULL, page_array);
+ page_array += nr_allocated;
+ total_allocated += nr_allocated;
+ if (total_allocated == nr_pages)
+ break;
+ prev_node = node;
+ }
+ me->il_prev = resume_node;
+ me->il_weight = resume_weight;
+ kfree(weights);
+ return total_allocated;
+}
+
static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
struct mempolicy *pol, unsigned long nr_pages,
struct page **page_array)
@@ -2287,6 +2533,10 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
return alloc_pages_bulk_array_interleave(gfp, pol,
nr_pages, page_array);
+ if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
+ return alloc_pages_bulk_array_weighted_interleave(
+ gfp, pol, nr_pages, page_array);
+
if (pol->mode == MPOL_PREFERRED_MANY)
return alloc_pages_bulk_array_preferred_many(gfp,
numa_node_id(), pol, nr_pages, page_array);
@@ -2362,6 +2612,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
case MPOL_INTERLEAVE:
case MPOL_PREFERRED:
case MPOL_PREFERRED_MANY:
+ case MPOL_WEIGHTED_INTERLEAVE:
return !!nodes_equal(a->nodes, b->nodes);
case MPOL_LOCAL:
return true;
@@ -2498,6 +2749,10 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
polnid = interleave_nid(pol, ilx);
break;
+ case MPOL_WEIGHTED_INTERLEAVE:
+ polnid = weighted_interleave_nid(pol, ilx);
+ break;
+
case MPOL_PREFERRED:
if (node_isset(curnid, pol->nodes))
goto out;
@@ -2872,6 +3127,7 @@ static const char * const policy_modes[] =
[MPOL_PREFERRED] = "prefer",
[MPOL_BIND] = "bind",
[MPOL_INTERLEAVE] = "interleave",
+ [MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave",
[MPOL_LOCAL] = "local",
[MPOL_PREFERRED_MANY] = "prefer (many)",
};
@@ -2931,6 +3187,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
}
break;
case MPOL_INTERLEAVE:
+ case MPOL_WEIGHTED_INTERLEAVE:
/*
* Default to online nodes with memory if no nodelist
*/
@@ -3041,6 +3298,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
case MPOL_PREFERRED_MANY:
case MPOL_BIND:
case MPOL_INTERLEAVE:
+ case MPOL_WEIGHTED_INTERLEAVE:
nodes = pol->nodes;
break;
default:
@@ -3067,3 +3325,200 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
nodemask_pr_args(&nodes));
}
+
+#ifdef CONFIG_SYSFS
+struct iw_node_attr {
+ struct kobj_attribute kobj_attr;
+ int nid;
+};
+
+static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct iw_node_attr *node_attr;
+ u8 weight;
+
+ node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
+ weight = get_il_weight(node_attr->nid);
+ return sysfs_emit(buf, "%d\n", weight);
+}
+
+static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct iw_node_attr *node_attr;
+ u8 *new;
+ u8 *old;
+ u8 weight = 0;
+
+ node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
+ if (count == 0 || sysfs_streq(buf, ""))
+ weight = 0;
+ else if (kstrtou8(buf, 0, &weight))
+ return -EINVAL;
+
+ new = kzalloc(nr_node_ids, GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+
+ mutex_lock(&iw_table_lock);
+ old = rcu_dereference_protected(iw_table,
+ lockdep_is_held(&iw_table_lock));
+ if (old)
+ memcpy(new, old, nr_node_ids);
+ new[node_attr->nid] = weight;
+ rcu_assign_pointer(iw_table, new);
+ mutex_unlock(&iw_table_lock);
+ synchronize_rcu();
+ kfree(old);
+ return count;
+}
+
+static struct iw_node_attr **node_attrs;
+
+static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
+ struct kobject *parent)
+{
+ if (!node_attr)
+ return;
+ sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
+ kfree(node_attr->kobj_attr.attr.name);
+ kfree(node_attr);
+}
+
+static void sysfs_wi_release(struct kobject *wi_kobj)
+{
+ int i;
+
+ for (i = 0; i < nr_node_ids; i++)
+ sysfs_wi_node_release(node_attrs[i], wi_kobj);
+ kobject_put(wi_kobj);
+}
+
+static const struct kobj_type wi_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .release = sysfs_wi_release,
+};
+
+static int add_weight_node(int nid, struct kobject *wi_kobj)
+{
+ struct iw_node_attr *node_attr;
+ char *name;
+
+ node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL);
+ if (!node_attr)
+ return -ENOMEM;
+
+ name = kasprintf(GFP_KERNEL, "node%d", nid);
+ if (!name) {
+ kfree(node_attr);
+ return -ENOMEM;
+ }
+
+ sysfs_attr_init(&node_attr->kobj_attr.attr);
+ node_attr->kobj_attr.attr.name = name;
+ node_attr->kobj_attr.attr.mode = 0644;
+ node_attr->kobj_attr.show = node_show;
+ node_attr->kobj_attr.store = node_store;
+ node_attr->nid = nid;
+
+ if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) {
+ kfree(node_attr->kobj_attr.attr.name);
+ kfree(node_attr);
+ pr_err("failed to add attribute to weighted_interleave\n");
+ return -ENOMEM;
+ }
+
+ node_attrs[nid] = node_attr;
+ return 0;
+}
+
+static int add_weighted_interleave_group(struct kobject *root_kobj)
+{
+ struct kobject *wi_kobj;
+ int nid, err;
+
+ wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
+ if (!wi_kobj)
+ return -ENOMEM;
+
+ err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj,
+ "weighted_interleave");
+ if (err) {
+ kfree(wi_kobj);
+ return err;
+ }
+
+ for_each_node_state(nid, N_POSSIBLE) {
+ err = add_weight_node(nid, wi_kobj);
+ if (err) {
+ pr_err("failed to add sysfs [node%d]\n", nid);
+ break;
+ }
+ }
+ if (err)
+ kobject_put(wi_kobj);
+ return 0;
+}
+
+static void mempolicy_kobj_release(struct kobject *kobj)
+{
+ u8 *old;
+
+ mutex_lock(&iw_table_lock);
+ old = rcu_dereference_protected(iw_table,
+ lockdep_is_held(&iw_table_lock));
+ rcu_assign_pointer(iw_table, NULL);
+ mutex_unlock(&iw_table_lock);
+ synchronize_rcu();
+ kfree(old);
+ kfree(node_attrs);
+ kfree(kobj);
+}
+
+static const struct kobj_type mempolicy_ktype = {
+ .release = mempolicy_kobj_release
+};
+
+static int __init mempolicy_sysfs_init(void)
+{
+ int err;
+ static struct kobject *mempolicy_kobj;
+
+ mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL);
+ if (!mempolicy_kobj) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+
+ node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *),
+ GFP_KERNEL);
+ if (!node_attrs) {
+ err = -ENOMEM;
+ goto mempol_out;
+ }
+
+ err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj,
+ "mempolicy");
+ if (err)
+ goto node_out;
+
+ err = add_weighted_interleave_group(mempolicy_kobj);
+ if (err) {
+ pr_err("mempolicy sysfs structure failed to initialize\n");
+ kobject_put(mempolicy_kobj);
+ return err;
+ }
+
+ return err;
+node_out:
+ kfree(node_attrs);
+mempol_out:
+ kfree(mempolicy_kobj);
+err_out:
+ pr_err("failed to add mempolicy kobject to the system\n");
+ return err;
+}
+
+late_initcall(mempolicy_sysfs_init);
+#endif /* CONFIG_SYSFS */
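
Weighted interleave picks the node for interleave index ilx by reducing ilx modulo the total weight and then walking the nodemask, subtracting per-node weights, as weighted_interleave_nid() does above. A stand-alone sketch of that selection (simplified, hypothetical arrays instead of a nodemask and the RCU-protected iw_table):

/*
 * Simplified model of weighted interleave node selection. nodes[] lists
 * the allowed node IDs, weights[] their weights (0 treated as 1, like the
 * system default). Returns the node chosen for interleave index ilx.
 */
static int weighted_nid(const int *nodes, const unsigned char *weights,
			int nr_nodes, unsigned long ilx)
{
	unsigned long weight_total = 0, target;
	int i;

	for (i = 0; i < nr_nodes; i++)
		weight_total += weights[i] ? weights[i] : 1;

	/* Offset into one full round of weighted allocations. */
	target = ilx % weight_total;
	for (i = 0; i < nr_nodes; i++) {
		unsigned char w = weights[i] ? weights[i] : 1;

		if (target < w)
			break;
		target -= w;
	}
	return nodes[i];
}

With the example from the header comment (nodes 0,1 and weights 2,1), indexes 0 and 1 resolve to node 0 and index 2 to node 1, repeating every three allocations.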
diff --git a/mm/mempool.c b/mm/mempool.c
index dbbf0e9fb4..076c736f5f 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -590,6 +590,19 @@ void mempool_kfree(void *element, void *pool_data)
}
EXPORT_SYMBOL(mempool_kfree);
+void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data)
+{
+ size_t size = (size_t)pool_data;
+ return kvmalloc(size, gfp_mask);
+}
+EXPORT_SYMBOL(mempool_kvmalloc);
+
+void mempool_kvfree(void *element, void *pool_data)
+{
+ kvfree(element);
+}
+EXPORT_SYMBOL(mempool_kvfree);
+
/*
* A simple mempool-backed page allocator that allocates pages
* of the order specified by pool_data.
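
The new mempool_kvmalloc()/mempool_kvfree() helpers mirror the existing mempool_kmalloc()/mempool_kfree() pair, with the element size passed through pool_data and the allocation falling back to vmalloc for large sizes. A hedged usage sketch (the pool name, element size, and min_nr of 16 are illustrative, not part of the patch):

/*
 * Sketch: create a mempool whose elements are kvmalloc'ed buffers of
 * elem_size bytes, using the helpers added above.
 */
static mempool_t *example_pool;

static int example_pool_init(size_t elem_size)
{
	example_pool = mempool_create(16, mempool_kvmalloc, mempool_kvfree,
				      (void *)elem_size);
	return example_pool ? 0 : -ENOMEM;
}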
diff --git a/mm/migrate.c b/mm/migrate.c
index c27b1f8097..73a052a382 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -211,14 +211,17 @@ static bool remove_migration_pte(struct folio *folio,
folio_get(folio);
pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
old_pte = ptep_get(pvmw.pte);
- if (pte_swp_soft_dirty(old_pte))
- pte = pte_mksoft_dirty(pte);
entry = pte_to_swp_entry(old_pte);
if (!is_migration_entry_young(entry))
pte = pte_mkold(pte);
if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
pte = pte_mkdirty(pte);
+ if (pte_swp_soft_dirty(old_pte))
+ pte = pte_mksoft_dirty(pte);
+ else
+ pte = pte_clear_soft_dirty(pte);
+
if (is_writable_migration_entry(entry))
pte = pte_mkwrite(pte, vma);
else if (pte_swp_uffd_wp(old_pte))
diff --git a/mm/mlock.c b/mm/mlock.c
index 086546ac57..1ed2f2ab37 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -206,8 +206,7 @@ static void mlock_folio_batch(struct folio_batch *fbatch)
if (lruvec)
unlock_page_lruvec_irq(lruvec);
- folios_put(fbatch->folios, folio_batch_count(fbatch));
- folio_batch_reinit(fbatch);
+ folios_put(fbatch);
}
void mlock_drain_local(void)
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 2c19f5515e..549e76af8f 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -2231,6 +2231,7 @@ static int __init deferred_init_memmap(void *data)
.align = PAGES_PER_SECTION,
.min_chunk = PAGES_PER_SECTION,
.max_threads = max_threads,
+ .numa_aware = false,
};
padata_do_multithreaded(&job);
diff --git a/mm/mmap.c b/mm/mmap.c
index 3281287771..6dbda99a47 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -64,7 +64,7 @@
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN;
-const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX;
+int mmap_rnd_bits_max __ro_after_init = CONFIG_ARCH_MMAP_RND_BITS_MAX;
int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS;
#endif
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
@@ -105,7 +105,7 @@ void vma_set_page_prot(struct vm_area_struct *vma)
* Requires inode->i_mapping->i_mmap_rwsem
*/
static void __remove_shared_vm_struct(struct vm_area_struct *vma,
- struct file *file, struct address_space *mapping)
+ struct address_space *mapping)
{
if (vma_is_shared_maywrite(vma))
mapping_unmap_writable(mapping);
@@ -126,7 +126,7 @@ void unlink_file_vma(struct vm_area_struct *vma)
if (file) {
struct address_space *mapping = file->f_mapping;
i_mmap_lock_write(mapping);
- __remove_shared_vm_struct(vma, file, mapping);
+ __remove_shared_vm_struct(vma, mapping);
i_mmap_unlock_write(mapping);
}
}
@@ -392,26 +392,30 @@ static void __vma_link_file(struct vm_area_struct *vma,
flush_dcache_mmap_unlock(mapping);
}
+static void vma_link_file(struct vm_area_struct *vma)
+{
+ struct file *file = vma->vm_file;
+ struct address_space *mapping;
+
+ if (file) {
+ mapping = file->f_mapping;
+ i_mmap_lock_write(mapping);
+ __vma_link_file(vma, mapping);
+ i_mmap_unlock_write(mapping);
+ }
+}
+
static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
{
VMA_ITERATOR(vmi, mm, 0);
- struct address_space *mapping = NULL;
vma_iter_config(&vmi, vma->vm_start, vma->vm_end);
if (vma_iter_prealloc(&vmi, vma))
return -ENOMEM;
vma_start_write(vma);
-
vma_iter_store(&vmi, vma);
-
- if (vma->vm_file) {
- mapping = vma->vm_file->f_mapping;
- i_mmap_lock_write(mapping);
- __vma_link_file(vma, mapping);
- i_mmap_unlock_write(mapping);
- }
-
+ vma_link_file(vma);
mm->map_count++;
validate_mm(mm);
return 0;
@@ -519,10 +523,9 @@ static inline void vma_complete(struct vma_prepare *vp,
}
if (vp->remove && vp->file) {
- __remove_shared_vm_struct(vp->remove, vp->file, vp->mapping);
+ __remove_shared_vm_struct(vp->remove, vp->mapping);
if (vp->remove2)
- __remove_shared_vm_struct(vp->remove2, vp->file,
- vp->mapping);
+ __remove_shared_vm_struct(vp->remove2, vp->mapping);
} else if (vp->insert) {
/*
* split_vma has split insert from vma, and needs
@@ -660,9 +663,7 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
vma_prepare(&vp);
vma_adjust_trans_huge(vma, start, end, 0);
- vma->vm_start = start;
- vma->vm_end = end;
- vma->vm_pgoff = pgoff;
+ vma_set_range(vma, start, end, pgoff);
vma_iter_store(vmi, vma);
vma_complete(&vp, vmi, vma->vm_mm);
@@ -705,9 +706,7 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
vma_adjust_trans_huge(vma, start, end, 0);
vma_iter_clear(vmi);
- vma->vm_start = start;
- vma->vm_end = end;
- vma->vm_pgoff = pgoff;
+ vma_set_range(vma, start, end, pgoff);
vma_complete(&vp, vmi, vma->vm_mm);
return 0;
}
@@ -861,13 +860,15 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
* area is returned, or the function will return NULL
*/
static struct vm_area_struct
-*vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
- struct vm_area_struct *prev, unsigned long addr, unsigned long end,
- unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file,
- pgoff_t pgoff, struct mempolicy *policy,
+*vma_merge(struct vma_iterator *vmi, struct vm_area_struct *prev,
+ struct vm_area_struct *src, unsigned long addr, unsigned long end,
+ unsigned long vm_flags, pgoff_t pgoff, struct mempolicy *policy,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
struct anon_vma_name *anon_name)
{
+ struct mm_struct *mm = src->vm_mm;
+ struct anon_vma *anon_vma = src->anon_vma;
+ struct file *file = src->vm_file;
struct vm_area_struct *curr, *next, *res;
struct vm_area_struct *vma, *adjust, *remove, *remove2;
struct vm_area_struct *anon_dup = NULL;
@@ -1020,10 +1021,7 @@ static struct vm_area_struct
vma_prepare(&vp);
vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start);
-
- vma->vm_start = vma_start;
- vma->vm_end = vma_end;
- vma->vm_pgoff = vma_pgoff;
+ vma_set_range(vma, vma_start, vma_end, vma_pgoff);
if (vma_expanded)
vma_iter_store(vmi, vma);
@@ -2056,7 +2054,6 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
}
}
anon_vma_unlock_write(vma->anon_vma);
- khugepaged_enter_vma(vma, vma->vm_flags);
mas_destroy(&mas);
validate_mm(mm);
return error;
@@ -2150,7 +2147,6 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
}
}
anon_vma_unlock_write(vma->anon_vma);
- khugepaged_enter_vma(vma, vma->vm_flags);
mas_destroy(&mas);
validate_mm(mm);
return error;
@@ -2440,9 +2436,8 @@ struct vm_area_struct *vma_modify(struct vma_iterator *vmi,
pgoff_t pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
struct vm_area_struct *merged;
- merged = vma_merge(vmi, vma->vm_mm, prev, start, end, vm_flags,
- vma->anon_vma, vma->vm_file, pgoff, policy,
- uffd_ctx, anon_name);
+ merged = vma_merge(vmi, prev, vma, start, end, vm_flags,
+ pgoff, policy, uffd_ctx, anon_name);
if (merged)
return merged;
@@ -2472,9 +2467,8 @@ static struct vm_area_struct
struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgoff_t pgoff)
{
- return vma_merge(vmi, vma->vm_mm, prev, start, end, vma->vm_flags,
- vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx, anon_vma_name(vma));
+ return vma_merge(vmi, prev, vma, start, end, vma->vm_flags, pgoff,
+ vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma));
}
/*
@@ -2488,10 +2482,9 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
pgoff_t pgoff = vma->vm_pgoff + vma_pages(vma);
/* vma is specified as prev, so case 1 or 2 will apply. */
- return vma_merge(vmi, vma->vm_mm, vma, vma->vm_end, vma->vm_end + delta,
- vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff,
- vma_policy(vma), vma->vm_userfaultfd_ctx,
- anon_vma_name(vma));
+ return vma_merge(vmi, vma, vma, vma->vm_end, vma->vm_end + delta,
+ vma->vm_flags, pgoff, vma_policy(vma),
+ vma->vm_userfaultfd_ctx, anon_vma_name(vma));
}
/*
@@ -2818,11 +2811,9 @@ cannot_expand:
}
vma_iter_config(&vmi, addr, end);
- vma->vm_start = addr;
- vma->vm_end = end;
+ vma_set_range(vma, addr, end, pgoff);
vm_flags_init(vma, vm_flags);
vma->vm_page_prot = vm_get_page_prot(vm_flags);
- vma->vm_pgoff = pgoff;
if (file) {
vma->vm_file = get_file(file);
@@ -2899,16 +2890,7 @@ cannot_expand:
vma_start_write(vma);
vma_iter_store(&vmi, vma);
mm->map_count++;
- if (vma->vm_file) {
- i_mmap_lock_write(vma->vm_file->f_mapping);
- if (vma_is_shared_maywrite(vma))
- mapping_allow_writable(vma->vm_file->f_mapping);
-
- flush_dcache_mmap_lock(vma->vm_file->f_mapping);
- vma_interval_tree_insert(vma, &vma->vm_file->f_mapping->i_mmap);
- flush_dcache_mmap_unlock(vma->vm_file->f_mapping);
- i_mmap_unlock_write(vma->vm_file->f_mapping);
- }
+ vma_link_file(vma);
/*
* vma_merge() calls khugepaged_enter_vma() either, the below
@@ -3181,9 +3163,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
goto unacct_fail;
vma_set_anonymous(vma);
- vma->vm_start = addr;
- vma->vm_end = addr + len;
- vma->vm_pgoff = addr >> PAGE_SHIFT;
+ vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT);
vm_flags_init(vma, flags);
vma->vm_page_prot = vm_get_page_prot(flags);
vma_start_write(vma);
@@ -3420,9 +3400,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
new_vma = vm_area_dup(vma);
if (!new_vma)
goto out;
- new_vma->vm_start = addr;
- new_vma->vm_end = addr + len;
- new_vma->vm_pgoff = pgoff;
+ vma_set_range(new_vma, addr, addr + len, pgoff);
if (vma_dup_policy(vma, new_vma))
goto out_free_vma;
if (anon_vma_clone(new_vma, vma))
@@ -3590,9 +3568,7 @@ static struct vm_area_struct *__install_special_mapping(
if (unlikely(vma == NULL))
return ERR_PTR(-ENOMEM);
- vma->vm_start = addr;
- vma->vm_end = addr + len;
-
+ vma_set_range(vma, addr, addr + len, 0);
vm_flags_init(vma, (vm_flags | mm->def_flags |
VM_DONTEXPAND | VM_SOFTDIRTY) & ~VM_LOCKED_MASK);
vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
@@ -3876,7 +3852,7 @@ static int init_user_reserve(void)
free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));
- sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
+ sysctl_user_reserve_kbytes = min(free_kbytes / 32, SZ_128K);
return 0;
}
subsys_initcall(init_user_reserve);
@@ -3897,7 +3873,7 @@ static int init_admin_reserve(void)
free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));
- sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
+ sysctl_admin_reserve_kbytes = min(free_kbytes / 32, SZ_8K);
return 0;
}
subsys_initcall(init_admin_reserve);
@@ -3929,12 +3905,12 @@ static int reserve_mem_notifier(struct notifier_block *nb,
case MEM_ONLINE:
/* Default max is 128MB. Leave alone if modified by operator. */
tmp = sysctl_user_reserve_kbytes;
- if (0 < tmp && tmp < (1UL << 17))
+ if (tmp > 0 && tmp < SZ_128K)
init_user_reserve();
/* Default max is 8MB. Leave alone if modified by operator. */
tmp = sysctl_admin_reserve_kbytes;
- if (0 < tmp && tmp < (1UL << 13))
+ if (tmp > 0 && tmp < SZ_8K)
init_admin_reserve();
break;
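
Several open-coded assignments of vm_start, vm_end and vm_pgoff in mm/mmap.c are replaced by vma_set_range(). The helper itself is added in mm/internal.h and is not shown in this diff; presumably it is just the obvious three assignments, along the lines of:

/* Presumed shape of the helper (declared in mm/internal.h, not in this hunk). */
static inline void vma_set_range(struct vm_area_struct *vma,
				 unsigned long start, unsigned long end,
				 pgoff_t pgoff)
{
	vma->vm_start = start;
	vma->vm_end = end;
	vma->vm_pgoff = pgoff;
}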
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 604ddf08af..99b3e9408a 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -50,12 +50,21 @@ static bool tlb_next_batch(struct mmu_gather *tlb)
#ifdef CONFIG_SMP
static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_struct *vma)
{
+ struct encoded_page **pages = batch->encoded_pages;
+
for (int i = 0; i < batch->nr; i++) {
- struct encoded_page *enc = batch->encoded_pages[i];
+ struct encoded_page *enc = pages[i];
- if (encoded_page_flags(enc)) {
+ if (encoded_page_flags(enc) & ENCODED_PAGE_BIT_DELAY_RMAP) {
struct page *page = encoded_page_ptr(enc);
- folio_remove_rmap_pte(page_folio(page), page, vma);
+ unsigned int nr_pages = 1;
+
+ if (unlikely(encoded_page_flags(enc) &
+ ENCODED_PAGE_BIT_NR_PAGES_NEXT))
+ nr_pages = encoded_nr_pages(pages[++i]);
+
+ folio_remove_rmap_ptes(page_folio(page), page, nr_pages,
+ vma);
}
}
}
@@ -82,26 +91,62 @@ void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma)
}
#endif
-static void tlb_batch_pages_flush(struct mmu_gather *tlb)
+/*
+ * We might end up freeing a lot of pages. Reschedule on a regular
+ * basis to avoid soft lockups in configurations without full
+ * preemption enabled. The magic number of 512 folios seems to work.
+ */
+#define MAX_NR_FOLIOS_PER_FREE 512
+
+static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch)
{
- struct mmu_gather_batch *batch;
+ struct encoded_page **pages = batch->encoded_pages;
+ unsigned int nr, nr_pages;
- for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
- struct encoded_page **pages = batch->encoded_pages;
+ while (batch->nr) {
+ if (!page_poisoning_enabled_static() && !want_init_on_free()) {
+ nr = min(MAX_NR_FOLIOS_PER_FREE, batch->nr);
- do {
/*
- * limit free batch count when PAGE_SIZE > 4K
+ * Make sure we cover page + nr_pages, and don't leave
+ * nr_pages behind when capping the number of entries.
+ */
+ if (unlikely(encoded_page_flags(pages[nr - 1]) &
+ ENCODED_PAGE_BIT_NR_PAGES_NEXT))
+ nr++;
+ } else {
+ /*
+ * With page poisoning and init_on_free, the time it
+ * takes to free memory grows proportionally with the
+ * actual memory size. Therefore, limit based on the
+ * actual memory size and not the number of involved
+ * folios.
*/
- unsigned int nr = min(512U, batch->nr);
+ for (nr = 0, nr_pages = 0;
+ nr < batch->nr && nr_pages < MAX_NR_FOLIOS_PER_FREE;
+ nr++) {
+ if (unlikely(encoded_page_flags(pages[nr]) &
+ ENCODED_PAGE_BIT_NR_PAGES_NEXT))
+ nr_pages += encoded_nr_pages(pages[++nr]);
+ else
+ nr_pages++;
+ }
+ }
- free_pages_and_swap_cache(pages, nr);
- pages += nr;
- batch->nr -= nr;
+ free_pages_and_swap_cache(pages, nr);
+ pages += nr;
+ batch->nr -= nr;
- cond_resched();
- } while (batch->nr);
+ cond_resched();
}
+}
+
+static void tlb_batch_pages_flush(struct mmu_gather *tlb)
+{
+ struct mmu_gather_batch *batch;
+
+ for (batch = &tlb->local; batch && batch->nr; batch = batch->next)
+ __tlb_batch_free_encoded_pages(batch);
tlb->active = &tlb->local;
}
@@ -116,14 +161,19 @@ static void tlb_batch_list_free(struct mmu_gather *tlb)
tlb->local.next = NULL;
}
-bool __tlb_remove_page_size(struct mmu_gather *tlb, struct encoded_page *page, int page_size)
+static bool __tlb_remove_folio_pages_size(struct mmu_gather *tlb,
+ struct page *page, unsigned int nr_pages, bool delay_rmap,
+ int page_size)
{
+ int flags = delay_rmap ? ENCODED_PAGE_BIT_DELAY_RMAP : 0;
struct mmu_gather_batch *batch;
VM_BUG_ON(!tlb->end);
#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
VM_WARN_ON(tlb->page_size != page_size);
+ VM_WARN_ON_ONCE(nr_pages != 1 && page_size != PAGE_SIZE);
+ VM_WARN_ON_ONCE(page_folio(page) != page_folio(page + nr_pages - 1));
#endif
batch = tlb->active;
@@ -131,17 +181,40 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct encoded_page *page, i
* Add the page and check if we are full. If so
* force a flush.
*/
- batch->encoded_pages[batch->nr++] = page;
- if (batch->nr == batch->max) {
+ if (likely(nr_pages == 1)) {
+ batch->encoded_pages[batch->nr++] = encode_page(page, flags);
+ } else {
+ flags |= ENCODED_PAGE_BIT_NR_PAGES_NEXT;
+ batch->encoded_pages[batch->nr++] = encode_page(page, flags);
+ batch->encoded_pages[batch->nr++] = encode_nr_pages(nr_pages);
+ }
+ /*
+ * Make sure that we can always add another "page" + "nr_pages",
+ * requiring two entries instead of only a single one.
+ */
+ if (batch->nr >= batch->max - 1) {
if (!tlb_next_batch(tlb))
return true;
batch = tlb->active;
}
- VM_BUG_ON_PAGE(batch->nr > batch->max, encoded_page_ptr(page));
+ VM_BUG_ON_PAGE(batch->nr > batch->max - 1, page);
return false;
}
+bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page,
+ unsigned int nr_pages, bool delay_rmap)
+{
+ return __tlb_remove_folio_pages_size(tlb, page, nr_pages, delay_rmap,
+ PAGE_SIZE);
+}
+
+bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page,
+ bool delay_rmap, int page_size)
+{
+ return __tlb_remove_folio_pages_size(tlb, page, 1, delay_rmap, page_size);
+}
+
#endif /* MMU_GATHER_NO_GATHER */
#ifdef CONFIG_MMU_GATHER_TABLE_FREE
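
The mmu_gather batching above encodes a folio's page pointer plus flags in one encoded_page slot and, when more than one page is covered, a second slot holding the page count, flagged with ENCODED_PAGE_BIT_NR_PAGES_NEXT; this is also why the flush check becomes batch->nr >= batch->max - 1, since a multi-page folio may need two slots. A simplified decode loop, mirroring tlb_flush_rmap_batch() with hypothetical user-space types rather than the kernel's encoded_page:

/*
 * Simplified model of walking an encoded-page batch: an entry either
 * stands for a single page, or is followed by an extra entry that holds
 * the number of pages.
 */
#define FAKE_NR_PAGES_NEXT	0x1	/* next slot holds nr_pages */

struct fake_entry {
	void *page;		/* page pointer, or */
	unsigned long nr;	/* page count when used as a count slot */
	unsigned int flags;
};

static void walk_batch(const struct fake_entry *e, unsigned int nr_entries)
{
	for (unsigned int i = 0; i < nr_entries; i++) {
		void *page = e[i].page;
		unsigned long nr_pages = 1;

		if (e[i].flags & FAKE_NR_PAGES_NEXT)
			nr_pages = e[++i].nr;	/* consume the count slot */

		/* process 'page' .. 'page + nr_pages - 1' in one go */
		(void)page;
		(void)nr_pages;
	}
}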
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 81991102f7..f8a4544b46 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -198,13 +198,13 @@ static long change_pte_range(struct mmu_gather *tlb,
pte_t newpte;
if (is_writable_migration_entry(entry)) {
- struct page *page = pfn_swap_entry_to_page(entry);
+ struct folio *folio = pfn_swap_entry_folio(entry);
/*
* A protection check is difficult so
* just be safe and disable write
*/
- if (PageAnon(page))
+ if (folio_test_anon(folio))
entry = make_readable_exclusive_migration_entry(
swp_offset(entry));
else
diff --git a/mm/nommu.c b/mm/nommu.c
index b6dc558d31..5ec8f44e7c 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -131,8 +131,6 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address,
}
EXPORT_SYMBOL(follow_pfn);
-LIST_HEAD(vmap_area_list);
-
void vfree(const void *addr)
{
kfree(addr);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 91ccd82097..8d6a207c3c 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -44,6 +44,7 @@
#include <linux/kthread.h>
#include <linux/init.h>
#include <linux/mmu_notifier.h>
+#include <linux/cred.h>
#include <asm/tlb.h>
#include "internal.h"
@@ -754,6 +755,7 @@ static inline void queue_oom_reaper(struct task_struct *tsk)
*/
static void mark_oom_victim(struct task_struct *tsk)
{
+ const struct cred *cred;
struct mm_struct *mm = tsk->mm;
WARN_ON(oom_killer_disabled);
@@ -773,7 +775,9 @@ static void mark_oom_victim(struct task_struct *tsk)
*/
__thaw_task(tsk);
atomic_inc(&oom_victims);
- trace_mark_victim(tsk->pid);
+ cred = get_task_cred(tsk);
+ trace_mark_victim(tsk, cred->uid.val);
+ put_cred(cred);
}
/**
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 3f25553498..3e19b87049 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2325,18 +2325,18 @@ void __init page_writeback_init(void)
}
/**
- * tag_pages_for_writeback - tag pages to be written by write_cache_pages
+ * tag_pages_for_writeback - tag pages to be written by writeback
* @mapping: address space structure to write
* @start: starting page index
* @end: ending page index (inclusive)
*
* This function scans the page range from @start to @end (inclusive) and tags
- * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is
- * that write_cache_pages (or whoever calls this function) will then use
- * TOWRITE tag to identify pages eligible for writeback. This mechanism is
- * used to avoid livelocking of writeback by a process steadily creating new
- * dirty pages in the file (thus it is important for this function to be quick
- * so that it can tag pages faster than a dirtying process can create them).
+ * all pages that have DIRTY tag set with a special TOWRITE tag. The caller
+ * can then use the TOWRITE tag to identify pages eligible for writeback.
+ * This mechanism is used to avoid livelocking of writeback by a process
+ * steadily creating new dirty pages in the file (thus it is important for this
+ * function to be quick so that it can tag pages faster than a dirtying process
+ * can create them).
*/
void tag_pages_for_writeback(struct address_space *mapping,
pgoff_t start, pgoff_t end)
@@ -2360,183 +2360,242 @@ void tag_pages_for_writeback(struct address_space *mapping,
}
EXPORT_SYMBOL(tag_pages_for_writeback);
+static bool folio_prepare_writeback(struct address_space *mapping,
+ struct writeback_control *wbc, struct folio *folio)
+{
+ /*
+ * Folio truncated or invalidated. We can freely skip it then,
+ * even for data integrity operations: the folio has disappeared
+ * concurrently, so there could be no real expectation of this
+ * data integrity operation even if there is now a new, dirty
+ * folio at the same pagecache index.
+ */
+ if (unlikely(folio->mapping != mapping))
+ return false;
+
+ /*
+ * Did somebody else write it for us?
+ */
+ if (!folio_test_dirty(folio))
+ return false;
+
+ if (folio_test_writeback(folio)) {
+ if (wbc->sync_mode == WB_SYNC_NONE)
+ return false;
+ folio_wait_writeback(folio);
+ }
+ BUG_ON(folio_test_writeback(folio));
+
+ if (!folio_clear_dirty_for_io(folio))
+ return false;
+
+ return true;
+}
+
+static xa_mark_t wbc_to_tag(struct writeback_control *wbc)
+{
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+ return PAGECACHE_TAG_TOWRITE;
+ return PAGECACHE_TAG_DIRTY;
+}
+
+static pgoff_t wbc_end(struct writeback_control *wbc)
+{
+ if (wbc->range_cyclic)
+ return -1;
+ return wbc->range_end >> PAGE_SHIFT;
+}
+
+static struct folio *writeback_get_folio(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct folio *folio;
+
+retry:
+ folio = folio_batch_next(&wbc->fbatch);
+ if (!folio) {
+ folio_batch_release(&wbc->fbatch);
+ cond_resched();
+ filemap_get_folios_tag(mapping, &wbc->index, wbc_end(wbc),
+ wbc_to_tag(wbc), &wbc->fbatch);
+ folio = folio_batch_next(&wbc->fbatch);
+ if (!folio)
+ return NULL;
+ }
+
+ folio_lock(folio);
+ if (unlikely(!folio_prepare_writeback(mapping, wbc, folio))) {
+ folio_unlock(folio);
+ goto retry;
+ }
+
+ trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
+ return folio;
+}
+
/**
- * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
+ * writeback_iter - iterate folios of a mapping for writeback
* @mapping: address space structure to write
- * @wbc: subtract the number of written pages from *@wbc->nr_to_write
- * @writepage: function called for each page
- * @data: data passed to writepage function
+ * @wbc: writeback context
+ * @folio: previously iterated folio (%NULL to start)
+ * @error: in-out pointer for writeback errors (see below)
*
- * If a page is already under I/O, write_cache_pages() skips it, even
- * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
- * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
- * and msync() need to guarantee that all the data which was dirty at the time
- * the call was made get new I/O started against them. If wbc->sync_mode is
- * WB_SYNC_ALL then we were called for data integrity and we must wait for
- * existing IO to complete.
- *
- * To avoid livelocks (when other process dirties new pages), we first tag
- * pages which should be written back with TOWRITE tag and only then start
- * writing them. For data-integrity sync we have to be careful so that we do
- * not miss some pages (e.g., because some other process has cleared TOWRITE
- * tag we set). The rule we follow is that TOWRITE tag can be cleared only
- * by the process clearing the DIRTY tag (and submitting the page for IO).
- *
- * To avoid deadlocks between range_cyclic writeback and callers that hold
- * pages in PageWriteback to aggregate IO until write_cache_pages() returns,
- * we do not loop back to the start of the file. Doing so causes a page
- * lock/page writeback access order inversion - we should only ever lock
- * multiple pages in ascending page->index order, and looping back to the start
- * of the file violates that rule and causes deadlocks.
+ * This function returns the next folio for the writeback operation described by
+ * @wbc on @mapping and should be called in a while loop in the ->writepages
+ * implementation.
*
- * Return: %0 on success, negative error code otherwise
+ * To start the writeback operation, %NULL is passed in the @folio argument, and
+ * for every subsequent iteration the folio returned previously should be passed
+ * back in.
+ *
+ * If there was an error in the per-folio writeback inside the writeback_iter()
+ * loop, @error should be set to the error value.
+ *
+ * Once the writeback described in @wbc has finished, this function will return
+ * %NULL and, if any iteration produced an error, report the first such error
+ * through @error.
+ *
+ * Note: callers should not manually break out of the loop using break or goto
+ * but must keep calling writeback_iter() until it returns %NULL.
+ *
+ * Return: the folio to write or %NULL if the loop is done.
*/
-int write_cache_pages(struct address_space *mapping,
- struct writeback_control *wbc, writepage_t writepage,
- void *data)
+struct folio *writeback_iter(struct address_space *mapping,
+ struct writeback_control *wbc, struct folio *folio, int *error)
{
- int ret = 0;
- int done = 0;
- int error;
- struct folio_batch fbatch;
- int nr_folios;
- pgoff_t index;
- pgoff_t end; /* Inclusive */
- pgoff_t done_index;
- int range_whole = 0;
- xa_mark_t tag;
-
- folio_batch_init(&fbatch);
- if (wbc->range_cyclic) {
- index = mapping->writeback_index; /* prev offset */
- end = -1;
- } else {
- index = wbc->range_start >> PAGE_SHIFT;
- end = wbc->range_end >> PAGE_SHIFT;
- if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
- range_whole = 1;
- }
- if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) {
- tag_pages_for_writeback(mapping, index, end);
- tag = PAGECACHE_TAG_TOWRITE;
- } else {
- tag = PAGECACHE_TAG_DIRTY;
- }
- done_index = index;
- while (!done && (index <= end)) {
- int i;
-
- nr_folios = filemap_get_folios_tag(mapping, &index, end,
- tag, &fbatch);
-
- if (nr_folios == 0)
- break;
+ if (!folio) {
+ folio_batch_init(&wbc->fbatch);
+ wbc->saved_err = *error = 0;
- for (i = 0; i < nr_folios; i++) {
- struct folio *folio = fbatch.folios[i];
- unsigned long nr;
+ /*
+ * For range cyclic writeback we remember where we stopped so
+ * that we can continue where we stopped.
+ *
+ * For non-cyclic writeback we always start at the beginning of
+ * the passed in range.
+ */
+ if (wbc->range_cyclic)
+ wbc->index = mapping->writeback_index;
+ else
+ wbc->index = wbc->range_start >> PAGE_SHIFT;
- done_index = folio->index;
+ /*
+ * To avoid livelocks when other processes dirty new pages, we
+ * first tag pages which should be written back and only then
+ * start writing them.
+ *
+ * For data-integrity writeback we have to be careful so that we
+ * do not miss some pages (e.g., because some other process has
+ * cleared the TOWRITE tag we set). The rule we follow is that
+ * TOWRITE tag can be cleared only by the process clearing the
+ * DIRTY tag (and submitting the page for I/O).
+ */
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+ tag_pages_for_writeback(mapping, wbc->index,
+ wbc_end(wbc));
+ } else {
+ wbc->nr_to_write -= folio_nr_pages(folio);
- folio_lock(folio);
+ WARN_ON_ONCE(*error > 0);
- /*
- * Page truncated or invalidated. We can freely skip it
- * then, even for data integrity operations: the page
- * has disappeared concurrently, so there could be no
- * real expectation of this data integrity operation
- * even if there is now a new, dirty page at the same
- * pagecache address.
- */
- if (unlikely(folio->mapping != mapping)) {
-continue_unlock:
- folio_unlock(folio);
- continue;
- }
+ /*
+ * For integrity writeback we have to keep going until we have
+ * written all the folios we tagged for writeback above, even if
+ * we run past wbc->nr_to_write or encounter errors.
+ * We stash away the first error we encounter in wbc->saved_err
+ * so that it can be retrieved when we're done. This is because
+ * the file system may still have state to clear for each folio.
+ *
+ * For background writeback we exit as soon as we run past
+ * wbc->nr_to_write or encounter the first error.
+ */
+ if (wbc->sync_mode == WB_SYNC_ALL) {
+ if (*error && !wbc->saved_err)
+ wbc->saved_err = *error;
+ } else {
+ if (*error || wbc->nr_to_write <= 0)
+ goto done;
+ }
+ }
- if (!folio_test_dirty(folio)) {
- /* someone wrote it for us */
- goto continue_unlock;
- }
+ folio = writeback_get_folio(mapping, wbc);
+ if (!folio) {
+ /*
+ * To avoid deadlocks between range_cyclic writeback and callers
+ * that hold pages in PageWriteback to aggregate I/O until
+ * the writeback iteration finishes, we do not loop back to the
+ * start of the file. Doing so causes a page lock/page
+ * writeback access order inversion - we should only ever lock
+ * multiple pages in ascending page->index order, and looping
+ * back to the start of the file violates that rule and causes
+ * deadlocks.
+ */
+ if (wbc->range_cyclic)
+ mapping->writeback_index = 0;
- if (folio_test_writeback(folio)) {
- if (wbc->sync_mode != WB_SYNC_NONE)
- folio_wait_writeback(folio);
- else
- goto continue_unlock;
- }
+ /*
+ * Return the first error we encountered (if there was any) to
+ * the caller.
+ */
+ *error = wbc->saved_err;
+ }
+ return folio;
- BUG_ON(folio_test_writeback(folio));
- if (!folio_clear_dirty_for_io(folio))
- goto continue_unlock;
+done:
+ if (wbc->range_cyclic)
+ mapping->writeback_index = folio->index + folio_nr_pages(folio);
+ folio_batch_release(&wbc->fbatch);
+ return NULL;
+}
- trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
- error = writepage(folio, wbc, data);
- nr = folio_nr_pages(folio);
- if (unlikely(error)) {
- /*
- * Handle errors according to the type of
- * writeback. There's no need to continue for
- * background writeback. Just push done_index
- * past this page so media errors won't choke
- * writeout for the entire file. For integrity
- * writeback, we must process the entire dirty
- * set regardless of errors because the fs may
- * still have state to clear for each page. In
- * that case we continue processing and return
- * the first error.
- */
- if (error == AOP_WRITEPAGE_ACTIVATE) {
- folio_unlock(folio);
- error = 0;
- } else if (wbc->sync_mode != WB_SYNC_ALL) {
- ret = error;
- done_index = folio->index + nr;
- done = 1;
- break;
- }
- if (!ret)
- ret = error;
- }
+/**
+ * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @writepage: function called for each page
+ * @data: data passed to writepage function
+ *
+ * Return: %0 on success, negative error code otherwise
+ *
+ * Note: please use writeback_iter() instead.
+ */
+int write_cache_pages(struct address_space *mapping,
+ struct writeback_control *wbc, writepage_t writepage,
+ void *data)
+{
+ struct folio *folio = NULL;
+ int error;
- /*
- * We stop writing back only if we are not doing
- * integrity sync. In case of integrity sync we have to
- * keep going until we have written all the pages
- * we tagged for writeback prior to entering this loop.
- */
- wbc->nr_to_write -= nr;
- if (wbc->nr_to_write <= 0 &&
- wbc->sync_mode == WB_SYNC_NONE) {
- done = 1;
- break;
- }
+ while ((folio = writeback_iter(mapping, wbc, folio, &error))) {
+ error = writepage(folio, wbc, data);
+ if (error == AOP_WRITEPAGE_ACTIVATE) {
+ folio_unlock(folio);
+ error = 0;
}
- folio_batch_release(&fbatch);
- cond_resched();
}
- /*
- * If we hit the last page and there is more work to be done: wrap
- * back the index back to the start of the file for the next
- * time we are called.
- */
- if (wbc->range_cyclic && !done)
- done_index = 0;
- if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
- mapping->writeback_index = done_index;
-
- return ret;
+ return error;
}
EXPORT_SYMBOL(write_cache_pages);
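
write_cache_pages() above doubles as a reference user of the new iterator. A filesystem's own ->writepages method would follow the same shape; the sketch below is illustrative only, with myfs_write_folio() standing in for whatever per-folio submission (and unlock) the filesystem performs:

/* Sketch of a ->writepages implementation built on writeback_iter(). */
static int myfs_writepages(struct address_space *mapping,
			   struct writeback_control *wbc)
{
	struct folio *folio = NULL;
	int error = 0;

	while ((folio = writeback_iter(mapping, wbc, folio, &error))) {
		/* myfs_write_folio() is expected to unlock the folio. */
		error = myfs_write_folio(folio, wbc);
		/*
		 * No manual break on error: writeback_iter() consumes the
		 * error and decides whether to keep going (WB_SYNC_ALL must
		 * process every tagged folio) or to stop the iteration.
		 */
	}
	return error;
}
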
-static int writepage_cb(struct folio *folio, struct writeback_control *wbc,
- void *data)
+static int writeback_use_writepage(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- struct address_space *mapping = data;
- int ret = mapping->a_ops->writepage(&folio->page, wbc);
- mapping_set_error(mapping, ret);
- return ret;
+ struct folio *folio = NULL;
+ struct blk_plug plug;
+ int err;
+
+ blk_start_plug(&plug);
+ while ((folio = writeback_iter(mapping, wbc, folio, &err))) {
+ err = mapping->a_ops->writepage(&folio->page, wbc);
+ if (err == AOP_WRITEPAGE_ACTIVATE) {
+ folio_unlock(folio);
+ err = 0;
+ }
+ mapping_set_error(mapping, err);
+ }
+ blk_finish_plug(&plug);
+
+ return err;
}
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
@@ -2552,12 +2611,7 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
if (mapping->a_ops->writepages) {
ret = mapping->a_ops->writepages(mapping, wbc);
} else if (mapping->a_ops->writepage) {
- struct blk_plug plug;
-
- blk_start_plug(&plug);
- ret = write_cache_pages(mapping, wbc, writepage_cb,
- mapping);
- blk_finish_plug(&plug);
+ ret = writeback_use_writepage(mapping, wbc);
} else {
/* deal with chardevs and other special files */
ret = 0;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a663202045..14d39f34d3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -32,6 +32,7 @@
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
+#include <linux/pagevec.h>
#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
#include <linux/vmstat.h>
@@ -464,19 +465,19 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
/*
* Temporary debugging check for pages not lying within a given zone.
*/
-static int __maybe_unused bad_range(struct zone *zone, struct page *page)
+static bool __maybe_unused bad_range(struct zone *zone, struct page *page)
{
if (page_outside_zone_boundaries(zone, page))
- return 1;
+ return true;
if (zone != page_zone(page))
- return 1;
+ return true;
- return 0;
+ return false;
}
#else
-static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
+static inline bool __maybe_unused bad_range(struct zone *zone, struct page *page)
{
- return 0;
+ return false;
}
#endif
@@ -1061,7 +1062,7 @@ out:
* on-demand allocation and then freed again before the deferred pages
* initialization is done, but this is not likely to happen.
*/
-static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
+static inline bool should_skip_kasan_poison(struct page *page)
{
if (IS_ENABLED(CONFIG_KASAN_GENERIC))
return deferred_pages_enabled();
@@ -1080,11 +1081,11 @@ static void kernel_init_pages(struct page *page, int numpages)
kasan_enable_current();
}
-static __always_inline bool free_pages_prepare(struct page *page,
- unsigned int order, fpi_t fpi_flags)
+__always_inline bool free_pages_prepare(struct page *page,
+ unsigned int order)
{
int bad = 0;
- bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);
+ bool skip_kasan_poison = should_skip_kasan_poison(page);
bool init = want_init_on_free();
bool compound = PageCompound(page);
@@ -1266,7 +1267,7 @@ static void __free_pages_ok(struct page *page, unsigned int order,
unsigned long pfn = page_to_pfn(page);
struct zone *zone = page_zone(page);
- if (!free_pages_prepare(page, order, fpi_flags))
+ if (!free_pages_prepare(page, order))
return;
/*
@@ -1422,14 +1423,14 @@ static void check_new_page_bad(struct page *page)
/*
* This page is about to be returned from the page allocator
*/
-static int check_new_page(struct page *page)
+static bool check_new_page(struct page *page)
{
if (likely(page_expected_state(page,
PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
- return 0;
+ return false;
check_new_page_bad(page);
- return 1;
+ return true;
}
static inline bool check_new_pages(struct page *page, unsigned int order)
@@ -2343,7 +2344,7 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn,
{
int migratetype;
- if (!free_pages_prepare(page, order, FPI_NONE))
+ if (!free_pages_prepare(page, order))
return false;
migratetype = get_pfnblock_migratetype(page, pfn);
@@ -2515,66 +2516,70 @@ void free_unref_page(struct page *page, unsigned int order)
}
/*
- * Free a list of 0-order pages
+ * Free a batch of folios
*/
-void free_unref_page_list(struct list_head *list)
+void free_unref_folios(struct folio_batch *folios)
{
unsigned long __maybe_unused UP_flags;
- struct page *page, *next;
struct per_cpu_pages *pcp = NULL;
struct zone *locked_zone = NULL;
- int batch_count = 0;
- int migratetype;
+ int i, j, migratetype;
- /* Prepare pages for freeing */
- list_for_each_entry_safe(page, next, list, lru) {
- unsigned long pfn = page_to_pfn(page);
- if (!free_unref_page_prepare(page, pfn, 0)) {
- list_del(&page->lru);
+ /* Prepare folios for freeing */
+ for (i = 0, j = 0; i < folios->nr; i++) {
+ struct folio *folio = folios->folios[i];
+ unsigned long pfn = folio_pfn(folio);
+ unsigned int order = folio_order(folio);
+
+ if (order > 0 && folio_test_large_rmappable(folio))
+ folio_undo_large_rmappable(folio);
+ if (!free_unref_page_prepare(&folio->page, pfn, order))
continue;
- }
/*
- * Free isolated pages directly to the allocator, see
- * comment in free_unref_page.
+ * Free isolated folios and orders not handled on the PCP
+ * directly to the allocator, see comment in free_unref_page.
*/
- migratetype = get_pcppage_migratetype(page);
- if (unlikely(is_migrate_isolate(migratetype))) {
- list_del(&page->lru);
- free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE);
+ migratetype = get_pcppage_migratetype(&folio->page);
+ if (!pcp_allowed_order(order) ||
+ is_migrate_isolate(migratetype)) {
+ free_one_page(folio_zone(folio), &folio->page, pfn,
+ order, migratetype, FPI_NONE);
continue;
}
+ folio->private = (void *)(unsigned long)order;
+ if (j != i)
+ folios->folios[j] = folio;
+ j++;
}
+ folios->nr = j;
- list_for_each_entry_safe(page, next, list, lru) {
- struct zone *zone = page_zone(page);
+ for (i = 0; i < folios->nr; i++) {
+ struct folio *folio = folios->folios[i];
+ struct zone *zone = folio_zone(folio);
+ unsigned int order = (unsigned long)folio->private;
- list_del(&page->lru);
- migratetype = get_pcppage_migratetype(page);
+ folio->private = NULL;
+ migratetype = get_pcppage_migratetype(&folio->page);
- /*
- * Either different zone requiring a different pcp lock or
- * excessive lock hold times when freeing a large list of
- * pages.
- */
- if (zone != locked_zone || batch_count == SWAP_CLUSTER_MAX) {
+ /* Different zone requires a different pcp lock */
+ if (zone != locked_zone) {
if (pcp) {
pcp_spin_unlock(pcp);
pcp_trylock_finish(UP_flags);
}
- batch_count = 0;
-
/*
- * trylock is necessary as pages may be getting freed
+ * trylock is necessary as folios may be getting freed
* from IRQ or SoftIRQ context after an IO completion.
*/
pcp_trylock_prepare(UP_flags);
pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (unlikely(!pcp)) {
pcp_trylock_finish(UP_flags);
- free_one_page(zone, page, page_to_pfn(page),
- 0, migratetype, FPI_NONE);
+ free_one_page(zone, &folio->page,
+ folio_pfn(folio), order,
+ migratetype, FPI_NONE);
locked_zone = NULL;
continue;
}
@@ -2588,15 +2593,16 @@ void free_unref_page_list(struct list_head *list)
if (unlikely(migratetype >= MIGRATE_PCPTYPES))
migratetype = MIGRATE_MOVABLE;
- trace_mm_page_free_batched(page);
- free_unref_page_commit(zone, pcp, page, migratetype, 0);
- batch_count++;
+ trace_mm_page_free_batched(&folio->page);
+ free_unref_page_commit(zone, pcp, &folio->page, migratetype,
+ order);
}
if (pcp) {
pcp_spin_unlock(pcp);
pcp_trylock_finish(UP_flags);
}
+ folio_batch_reinit(folios);
}
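
free_unref_folios() expects its callers to hand over a filled folio_batch whose folios it may consume; reference counting and locking concerns are elided in the rough sketch below, where release_folios_example() is an invented name and only folio_batch_init()/folio_batch_add()/folio_batch_count() plus the function above are assumed:

/* Illustrative only: accumulate folios and free them in batches. */
static void release_folios_example(struct folio **folios, unsigned int nr)
{
	struct folio_batch fbatch;
	unsigned int i;

	folio_batch_init(&fbatch);
	for (i = 0; i < nr; i++) {
		/* folio_batch_add() returns the number of slots still free. */
		if (folio_batch_add(&fbatch, folios[i]) == 0)
			free_unref_folios(&fbatch);	/* batch full: flush */
	}
	/* free_unref_folios() reinitialises the batch, so reuse is safe. */
	if (folio_batch_count(&fbatch))
		free_unref_folios(&fbatch);		/* flush the remainder */
}
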
/*
@@ -2616,8 +2622,8 @@ void split_page(struct page *page, unsigned int order)
for (i = 1; i < (1 << order); i++)
set_page_refcounted(page + i);
- split_page_owner(page, 1 << order);
- split_page_memcg(page, 1 << order);
+ split_page_owner(page, order, 0);
+ split_page_memcg(page, order, 0);
}
EXPORT_SYMBOL_GPL(split_page);
@@ -4687,8 +4693,8 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
gfp_t gfp = gfp_mask;
#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
- gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
- __GFP_NOMEMALLOC;
+ gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP |
+ __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
PAGE_FRAG_CACHE_MAX_ORDER);
nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
@@ -4701,6 +4707,16 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
return page;
}
+void page_frag_cache_drain(struct page_frag_cache *nc)
+{
+ if (!nc->va)
+ return;
+
+ __page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias);
+ nc->va = NULL;
+}
+EXPORT_SYMBOL(page_frag_cache_drain);
+
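
page_frag_cache_drain() pairs with the existing page fragment allocator to give drivers a one-call teardown of a per-device cache. A minimal, hypothetical example (the mydev structure and function names are assumptions; page_frag_alloc() is the existing allocation entry point):

struct mydev {
	struct page_frag_cache frag_cache;	/* zero-initialised at probe */
};

/* Allocate a small buffer from the per-device fragment cache. */
static void *mydev_alloc_buf(struct mydev *dev, unsigned int len)
{
	return page_frag_alloc(&dev->frag_cache, len, GFP_ATOMIC);
}

/* Release whatever page still backs the cache when the device goes away. */
static void mydev_teardown(struct mydev *dev)
{
	page_frag_cache_drain(&dev->frag_cache);
}
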
void __page_frag_cache_drain(struct page *page, unsigned int count)
{
VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
@@ -4710,9 +4726,9 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
}
EXPORT_SYMBOL(__page_frag_cache_drain);
-void *page_frag_alloc_align(struct page_frag_cache *nc,
- unsigned int fragsz, gfp_t gfp_mask,
- unsigned int align_mask)
+void *__page_frag_alloc_align(struct page_frag_cache *nc,
+ unsigned int fragsz, gfp_t gfp_mask,
+ unsigned int align_mask)
{
unsigned int size = PAGE_SIZE;
struct page *page;
@@ -4781,7 +4797,7 @@ refill:
return nc->va + offset;
}
-EXPORT_SYMBOL(page_frag_alloc_align);
+EXPORT_SYMBOL(__page_frag_alloc_align);
/*
* Frees a page fragment allocated out of either a compound or order 0 page.
@@ -4803,8 +4819,8 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order,
struct page *page = virt_to_page((void *)addr);
struct page *last = page + nr;
- split_page_owner(page, 1 << order);
- split_page_memcg(page, 1 << order);
+ split_page_owner(page, order, 0);
+ split_page_memcg(page, order, 0);
while (page < --last)
set_page_refcounted(last);
@@ -5574,37 +5590,34 @@ static void zone_pcp_update(struct zone *zone, int cpu_online)
mutex_unlock(&pcp_batch_high_lock);
}
-static void zone_pcp_update_cacheinfo(struct zone *zone)
+static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu)
{
- int cpu;
struct per_cpu_pages *pcp;
struct cpu_cacheinfo *cci;
- for_each_online_cpu(cpu) {
- pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
- cci = get_cpu_cacheinfo(cpu);
- /*
- * If data cache slice of CPU is large enough, "pcp->batch"
- * pages can be preserved in PCP before draining PCP for
- * consecutive high-order pages freeing without allocation.
- * This can reduce zone lock contention without hurting
- * cache-hot pages sharing.
- */
- spin_lock(&pcp->lock);
- if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch)
- pcp->flags |= PCPF_FREE_HIGH_BATCH;
- else
- pcp->flags &= ~PCPF_FREE_HIGH_BATCH;
- spin_unlock(&pcp->lock);
- }
+ pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+ cci = get_cpu_cacheinfo(cpu);
+ /*
+	 * If the CPU's data cache slice is large enough, "pcp->batch" pages
+	 * can be kept in the PCP before it is drained by a run of consecutive
+	 * high-order page frees with no intervening allocation. This can
+	 * reduce zone lock contention without hurting the sharing of
+	 * cache-hot pages.
+ */
+ spin_lock(&pcp->lock);
+ if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch)
+ pcp->flags |= PCPF_FREE_HIGH_BATCH;
+ else
+ pcp->flags &= ~PCPF_FREE_HIGH_BATCH;
+ spin_unlock(&pcp->lock);
}
-void setup_pcp_cacheinfo(void)
+void setup_pcp_cacheinfo(unsigned int cpu)
{
struct zone *zone;
for_each_populated_zone(zone)
- zone_pcp_update_cacheinfo(zone);
+ zone_pcp_update_cacheinfo(zone, cpu);
}
/*
@@ -5847,7 +5860,7 @@ static void __setup_per_zone_wmarks(void)
spin_lock_irqsave(&zone->lock, flags);
tmp = (u64)pages_min * zone_managed_pages(zone);
- do_div(tmp, lowmem_pages);
+ tmp = div64_ul(tmp, lowmem_pages);
if (is_highmem(zone) || zone_idx(zone) == ZONE_MOVABLE) {
/*
* __GFP_HIGH and PF_MEMALLOC allocations usually don't
@@ -6221,9 +6234,14 @@ static void alloc_contig_dump_pages(struct list_head *page_list)
}
}
-/* [start, end) must belong to a single zone. */
+/*
+ * [start, end) must belong to a single zone.
+ * @migratetype: the migratetype of the range, reported via
+ *		trace_mm_alloc_contig_migrate_range_info.
+ */
int __alloc_contig_migrate_range(struct compact_control *cc,
- unsigned long start, unsigned long end)
+ unsigned long start, unsigned long end,
+ int migratetype)
{
/* This function is based on compact_zone() from compaction.c. */
unsigned int nr_reclaimed;
@@ -6234,6 +6252,10 @@ int __alloc_contig_migrate_range(struct compact_control *cc,
.nid = zone_to_nid(cc->zone),
.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
};
+ struct page *page;
+ unsigned long total_mapped = 0;
+ unsigned long total_migrated = 0;
+ unsigned long total_reclaimed = 0;
lru_cache_disable();
@@ -6259,9 +6281,18 @@ int __alloc_contig_migrate_range(struct compact_control *cc,
&cc->migratepages);
cc->nr_migratepages -= nr_reclaimed;
+ if (trace_mm_alloc_contig_migrate_range_info_enabled()) {
+ total_reclaimed += nr_reclaimed;
+ list_for_each_entry(page, &cc->migratepages, lru)
+ total_mapped += page_mapcount(page);
+ }
+
ret = migrate_pages(&cc->migratepages, alloc_migration_target,
NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL);
+ if (trace_mm_alloc_contig_migrate_range_info_enabled() && !ret)
+ total_migrated += cc->nr_migratepages;
+
/*
* On -ENOMEM, migrate_pages() bails out right away. It is pointless
* to retry again over this error, so do the same here.
@@ -6275,9 +6306,13 @@ int __alloc_contig_migrate_range(struct compact_control *cc,
if (!(cc->gfp_mask & __GFP_NOWARN) && ret == -EBUSY)
alloc_contig_dump_pages(&cc->migratepages);
putback_movable_pages(&cc->migratepages);
- return ret;
}
- return 0;
+
+ trace_mm_alloc_contig_migrate_range_info(start, end, migratetype,
+ total_migrated,
+ total_reclaimed,
+ total_mapped);
+ return (ret < 0) ? ret : 0;
}
/**
@@ -6357,7 +6392,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
* allocated. So, if we fall through be sure to clear ret so that
* -EBUSY is not accidentally used or returned to caller.
*/
- ret = __alloc_contig_migrate_range(&cc, start, end);
+ ret = __alloc_contig_migrate_range(&cc, start, end, migratetype);
if (ret && ret != -EBUSY)
goto done;
ret = 0;
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index cd0ea36682..a5c8fa4c2a 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -434,7 +434,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
}
ret = __alloc_contig_migrate_range(&cc, head_pfn,
- head_pfn + nr_pages);
+ head_pfn + nr_pages, page_mt);
/*
* restore the page's migratetype so that it can
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 5634e5d890..8eed0f3dc0 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -36,6 +36,15 @@ struct page_owner {
pid_t free_tgid;
};
+struct stack {
+ struct stack_record *stack_record;
+ struct stack *next;
+};
+static struct stack dummy_stack;
+static struct stack failure_stack;
+static struct stack *stack_list;
+static DEFINE_SPINLOCK(stack_list_lock);
+
static bool page_owner_enabled __initdata;
DEFINE_STATIC_KEY_FALSE(page_owner_inited);
@@ -45,6 +54,22 @@ static depot_stack_handle_t early_handle;
static void init_early_allocated_pages(void);
+static inline void set_current_in_page_owner(void)
+{
+ /*
+ * Avoid recursion.
+ *
+	 * page_owner code may itself need to allocate memory (e.g. when a new
+	 * stack trace is saved to the stack depot), so flag that we are
+	 * already inside page_owner to prevent re-entering it.
+ */
+ current->in_page_owner = 1;
+}
+
+static inline void unset_current_in_page_owner(void)
+{
+ current->in_page_owner = 0;
+}
+
static int __init early_page_owner_param(char *buf)
{
int ret = kstrtobool(buf, &page_owner_enabled);
@@ -93,8 +118,17 @@ static __init void init_page_owner(void)
register_dummy_stack();
register_failure_stack();
register_early_stack();
- static_branch_enable(&page_owner_inited);
init_early_allocated_pages();
+ /* Initialize dummy and failure stacks and link them to stack_list */
+ dummy_stack.stack_record = __stack_depot_get_stack_record(dummy_handle);
+ failure_stack.stack_record = __stack_depot_get_stack_record(failure_handle);
+ if (dummy_stack.stack_record)
+ refcount_set(&dummy_stack.stack_record->count, 1);
+ if (failure_stack.stack_record)
+ refcount_set(&failure_stack.stack_record->count, 1);
+ dummy_stack.next = &failure_stack;
+ stack_list = &dummy_stack;
+ static_branch_enable(&page_owner_inited);
}
struct page_ext_operations page_owner_ops = {
@@ -115,81 +149,177 @@ static noinline depot_stack_handle_t save_stack(gfp_t flags)
depot_stack_handle_t handle;
unsigned int nr_entries;
- /*
- * Avoid recursion.
- *
- * Sometimes page metadata allocation tracking requires more
- * memory to be allocated:
- * - when new stack trace is saved to stack depot
- */
if (current->in_page_owner)
return dummy_handle;
- current->in_page_owner = 1;
+ set_current_in_page_owner();
nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
handle = stack_depot_save(entries, nr_entries, flags);
if (!handle)
handle = failure_handle;
+ unset_current_in_page_owner();
- current->in_page_owner = 0;
return handle;
}
-void __reset_page_owner(struct page *page, unsigned short order)
+static void add_stack_record_to_list(struct stack_record *stack_record,
+ gfp_t gfp_mask)
{
- int i;
- struct page_ext *page_ext;
- depot_stack_handle_t handle;
- struct page_owner *page_owner;
- u64 free_ts_nsec = local_clock();
+ unsigned long flags;
+ struct stack *stack;
+
+ /* Filter gfp_mask the same way stackdepot does, for consistency */
+ gfp_mask &= ~GFP_ZONEMASK;
+ gfp_mask &= (GFP_ATOMIC | GFP_KERNEL | __GFP_NOLOCKDEP);
+ gfp_mask |= __GFP_NOWARN;
+
+ set_current_in_page_owner();
+ stack = kmalloc(sizeof(*stack), gfp_mask);
+ if (!stack) {
+ unset_current_in_page_owner();
+ return;
+ }
+ unset_current_in_page_owner();
- page_ext = page_ext_get(page);
- if (unlikely(!page_ext))
+ stack->stack_record = stack_record;
+ stack->next = NULL;
+
+ spin_lock_irqsave(&stack_list_lock, flags);
+ stack->next = stack_list;
+ /*
+ * This pairs with smp_load_acquire() from function
+ * stack_start(). This guarantees that stack_start()
+ * will see an updated stack_list before starting to
+ * traverse the list.
+ */
+ smp_store_release(&stack_list, stack);
+ spin_unlock_irqrestore(&stack_list_lock, flags);
+}
+
+static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask,
+ int nr_base_pages)
+{
+ struct stack_record *stack_record = __stack_depot_get_stack_record(handle);
+
+ if (!stack_record)
return;
- handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
- for (i = 0; i < (1 << order); i++) {
- __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
- page_owner = get_page_owner(page_ext);
- page_owner->free_handle = handle;
- page_owner->free_ts_nsec = free_ts_nsec;
- page_owner->free_pid = current->pid;
- page_owner->free_tgid = current->tgid;
- page_ext = page_ext_next(page_ext);
+ /*
+	 * New stack_records that do not use STACK_DEPOT_FLAG_GET start
+ * with REFCOUNT_SATURATED to catch spurious increments of their
+ * refcount.
+ * Since we do not use STACK_DEPOT_FLAG_GET API, let us
+ * set a refcount of 1 ourselves.
+ */
+ if (refcount_read(&stack_record->count) == REFCOUNT_SATURATED) {
+ int old = REFCOUNT_SATURATED;
+
+ if (atomic_try_cmpxchg_relaxed(&stack_record->count.refs, &old, 1))
+ /* Add the new stack_record to our list */
+ add_stack_record_to_list(stack_record, gfp_mask);
}
- page_ext_put(page_ext);
+ refcount_add(nr_base_pages, &stack_record->count);
}
-static inline void __set_page_owner_handle(struct page_ext *page_ext,
- depot_stack_handle_t handle,
- unsigned short order, gfp_t gfp_mask)
+static void dec_stack_record_count(depot_stack_handle_t handle,
+ int nr_base_pages)
+{
+ struct stack_record *stack_record = __stack_depot_get_stack_record(handle);
+
+ if (!stack_record)
+ return;
+
+ if (refcount_sub_and_test(nr_base_pages, &stack_record->count))
+ pr_warn("%s: refcount went to 0 for %u handle\n", __func__,
+ handle);
+}
+
+static inline void __update_page_owner_handle(struct page_ext *page_ext,
+ depot_stack_handle_t handle,
+ unsigned short order,
+ gfp_t gfp_mask,
+ short last_migrate_reason, u64 ts_nsec,
+ pid_t pid, pid_t tgid, char *comm)
{
- struct page_owner *page_owner;
int i;
- u64 ts_nsec = local_clock();
+ struct page_owner *page_owner;
for (i = 0; i < (1 << order); i++) {
page_owner = get_page_owner(page_ext);
page_owner->handle = handle;
page_owner->order = order;
page_owner->gfp_mask = gfp_mask;
- page_owner->last_migrate_reason = -1;
- page_owner->pid = current->pid;
- page_owner->tgid = current->tgid;
+ page_owner->last_migrate_reason = last_migrate_reason;
+ page_owner->pid = pid;
+ page_owner->tgid = tgid;
page_owner->ts_nsec = ts_nsec;
- strscpy(page_owner->comm, current->comm,
+ strscpy(page_owner->comm, comm,
sizeof(page_owner->comm));
__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
__set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
+ page_ext = page_ext_next(page_ext);
+ }
+}
+
+static inline void __update_page_owner_free_handle(struct page_ext *page_ext,
+ depot_stack_handle_t handle,
+ unsigned short order,
+ pid_t pid, pid_t tgid,
+ u64 free_ts_nsec)
+{
+ int i;
+ struct page_owner *page_owner;
+ for (i = 0; i < (1 << order); i++) {
+ page_owner = get_page_owner(page_ext);
+ /* Only __reset_page_owner() wants to clear the bit */
+ if (handle) {
+ __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
+ page_owner->free_handle = handle;
+ }
+ page_owner->free_ts_nsec = free_ts_nsec;
+ page_owner->free_pid = current->pid;
+ page_owner->free_tgid = current->tgid;
page_ext = page_ext_next(page_ext);
}
}
+void __reset_page_owner(struct page *page, unsigned short order)
+{
+ struct page_ext *page_ext;
+ depot_stack_handle_t handle;
+ depot_stack_handle_t alloc_handle;
+ struct page_owner *page_owner;
+ u64 free_ts_nsec = local_clock();
+
+ page_ext = page_ext_get(page);
+ if (unlikely(!page_ext))
+ return;
+
+ page_owner = get_page_owner(page_ext);
+ alloc_handle = page_owner->handle;
+
+ handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
+ __update_page_owner_free_handle(page_ext, handle, order, current->pid,
+ current->tgid, free_ts_nsec);
+ page_ext_put(page_ext);
+
+ if (alloc_handle != early_handle)
+ /*
+		 * early_handle is used for all pages allocated before the
+		 * page_owner machinery was ready (see init_pages_in_zone()).
+		 * Their refcount was never incremented, so it must not be
+		 * decremented here either.
+ */
+ dec_stack_record_count(alloc_handle, 1 << order);
+}
+
noinline void __set_page_owner(struct page *page, unsigned short order,
gfp_t gfp_mask)
{
struct page_ext *page_ext;
+ u64 ts_nsec = local_clock();
depot_stack_handle_t handle;
handle = save_stack(gfp_mask);
@@ -197,8 +327,11 @@ noinline void __set_page_owner(struct page *page, unsigned short order,
page_ext = page_ext_get(page);
if (unlikely(!page_ext))
return;
- __set_page_owner_handle(page_ext, handle, order, gfp_mask);
+ __update_page_owner_handle(page_ext, handle, order, gfp_mask, -1,
+ ts_nsec, current->pid, current->tgid,
+ current->comm);
page_ext_put(page_ext);
+ inc_stack_record_count(handle, gfp_mask, 1 << order);
}
void __set_page_owner_migrate_reason(struct page *page, int reason)
@@ -214,7 +347,7 @@ void __set_page_owner_migrate_reason(struct page *page, int reason)
page_ext_put(page_ext);
}
-void __split_page_owner(struct page *page, unsigned int nr)
+void __split_page_owner(struct page *page, int old_order, int new_order)
{
int i;
struct page_ext *page_ext = page_ext_get(page);
@@ -223,9 +356,9 @@ void __split_page_owner(struct page *page, unsigned int nr)
if (unlikely(!page_ext))
return;
- for (i = 0; i < nr; i++) {
+ for (i = 0; i < (1 << old_order); i++) {
page_owner = get_page_owner(page_ext);
- page_owner->order = 0;
+ page_owner->order = new_order;
page_ext = page_ext_next(page_ext);
}
page_ext_put(page_ext);
@@ -233,9 +366,12 @@ void __split_page_owner(struct page *page, unsigned int nr)
void __folio_copy_owner(struct folio *newfolio, struct folio *old)
{
+ int i;
struct page_ext *old_ext;
struct page_ext *new_ext;
- struct page_owner *old_page_owner, *new_page_owner;
+ struct page_owner *old_page_owner;
+ struct page_owner *new_page_owner;
+ depot_stack_handle_t migrate_handle;
old_ext = page_ext_get(&old->page);
if (unlikely(!old_ext))
@@ -249,30 +385,32 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old)
old_page_owner = get_page_owner(old_ext);
new_page_owner = get_page_owner(new_ext);
- new_page_owner->order = old_page_owner->order;
- new_page_owner->gfp_mask = old_page_owner->gfp_mask;
- new_page_owner->last_migrate_reason =
- old_page_owner->last_migrate_reason;
- new_page_owner->handle = old_page_owner->handle;
- new_page_owner->pid = old_page_owner->pid;
- new_page_owner->tgid = old_page_owner->tgid;
- new_page_owner->free_pid = old_page_owner->free_pid;
- new_page_owner->free_tgid = old_page_owner->free_tgid;
- new_page_owner->ts_nsec = old_page_owner->ts_nsec;
- new_page_owner->free_ts_nsec = old_page_owner->ts_nsec;
- strcpy(new_page_owner->comm, old_page_owner->comm);
-
+ migrate_handle = new_page_owner->handle;
+ __update_page_owner_handle(new_ext, old_page_owner->handle,
+ old_page_owner->order, old_page_owner->gfp_mask,
+ old_page_owner->last_migrate_reason,
+ old_page_owner->ts_nsec, old_page_owner->pid,
+ old_page_owner->tgid, old_page_owner->comm);
/*
- * We don't clear the bit on the old folio as it's going to be freed
- * after migration. Until then, the info can be useful in case of
- * a bug, and the overall stats will be off a bit only temporarily.
- * Also, migrate_misplaced_transhuge_page() can still fail the
- * migration and then we want the old folio to retain the info. But
- * in that case we also don't need to explicitly clear the info from
- * the new page, which will be freed.
+ * Do not proactively clear PAGE_EXT_OWNER{_ALLOCATED} bits as the folio
+ * will be freed after migration. Keep them until then as they may be
+ * useful.
*/
- __set_bit(PAGE_EXT_OWNER, &new_ext->flags);
- __set_bit(PAGE_EXT_OWNER_ALLOCATED, &new_ext->flags);
+ __update_page_owner_free_handle(new_ext, 0, old_page_owner->order,
+ old_page_owner->free_pid,
+ old_page_owner->free_tgid,
+ old_page_owner->free_ts_nsec);
+ /*
+	 * We linked the original stack to the new folio; link the new folio's
+	 * original stack to the old folio as well, otherwise the per-stack page
+	 * counts become unbalanced when those pages are subtracted at free time.
+ */
+ for (i = 0; i < (1 << new_page_owner->order); i++) {
+ old_page_owner->handle = migrate_handle;
+ old_ext = page_ext_next(old_ext);
+ old_page_owner = get_page_owner(old_ext);
+ }
+
page_ext_put(new_ext);
page_ext_put(old_ext);
}
@@ -680,8 +818,9 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
goto ext_put_continue;
/* Found early allocated page */
- __set_page_owner_handle(page_ext, early_handle,
- 0, 0);
+ __update_page_owner_handle(page_ext, early_handle, 0, 0,
+ -1, local_clock(), current->pid,
+ current->tgid, current->comm);
count++;
ext_put_continue:
page_ext_put(page_ext);
@@ -719,8 +858,109 @@ static const struct file_operations proc_page_owner_operations = {
.llseek = lseek_page_owner,
};
+static void *stack_start(struct seq_file *m, loff_t *ppos)
+{
+ struct stack *stack;
+
+ if (*ppos == -1UL)
+ return NULL;
+
+ if (!*ppos) {
+ /*
+ * This pairs with smp_store_release() from function
+ * add_stack_record_to_list(), so we get a consistent
+ * value of stack_list.
+ */
+ stack = smp_load_acquire(&stack_list);
+ m->private = stack;
+ } else {
+ stack = m->private;
+ }
+
+ return stack;
+}
+
+static void *stack_next(struct seq_file *m, void *v, loff_t *ppos)
+{
+ struct stack *stack = v;
+
+ stack = stack->next;
+ *ppos = stack ? *ppos + 1 : -1UL;
+ m->private = stack;
+
+ return stack;
+}
+
+static unsigned long page_owner_pages_threshold;
+
+static int stack_print(struct seq_file *m, void *v)
+{
+ int i, nr_base_pages;
+ struct stack *stack = v;
+ unsigned long *entries;
+ unsigned long nr_entries;
+ struct stack_record *stack_record = stack->stack_record;
+
+ if (!stack->stack_record)
+ return 0;
+
+ nr_entries = stack_record->size;
+ entries = stack_record->entries;
+ nr_base_pages = refcount_read(&stack_record->count) - 1;
+
+ if (nr_base_pages < 1 || nr_base_pages < page_owner_pages_threshold)
+ return 0;
+
+ for (i = 0; i < nr_entries; i++)
+ seq_printf(m, " %pS\n", (void *)entries[i]);
+ seq_printf(m, "nr_base_pages: %d\n\n", nr_base_pages);
+
+ return 0;
+}
+
+static void stack_stop(struct seq_file *m, void *v)
+{
+}
+
+static const struct seq_operations page_owner_stack_op = {
+ .start = stack_start,
+ .next = stack_next,
+ .stop = stack_stop,
+ .show = stack_print
+};
+
+static int page_owner_stack_open(struct inode *inode, struct file *file)
+{
+ return seq_open_private(file, &page_owner_stack_op, 0);
+}
+
+static const struct file_operations page_owner_stack_operations = {
+ .open = page_owner_stack_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int page_owner_threshold_get(void *data, u64 *val)
+{
+ *val = READ_ONCE(page_owner_pages_threshold);
+ return 0;
+}
+
+static int page_owner_threshold_set(void *data, u64 val)
+{
+ WRITE_ONCE(page_owner_pages_threshold, val);
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(proc_page_owner_threshold, &page_owner_threshold_get,
+ &page_owner_threshold_set, "%llu");
+
+
static int __init pageowner_init(void)
{
+ struct dentry *dir;
+
if (!static_branch_unlikely(&page_owner_inited)) {
pr_info("page_owner is disabled\n");
return 0;
@@ -728,6 +968,11 @@ static int __init pageowner_init(void)
debugfs_create_file("page_owner", 0400, NULL, NULL,
&proc_page_owner_operations);
+ dir = debugfs_create_dir("page_owner_stacks", NULL);
+ debugfs_create_file("show_stacks", 0400, dir, NULL,
+ &page_owner_stack_operations);
+ debugfs_create_file("count_threshold", 0600, dir, NULL,
+ &proc_page_owner_threshold);
return 0;
}
diff --git a/mm/page_table_check.c b/mm/page_table_check.c
index af69c3c8f7..6363f93a47 100644
--- a/mm/page_table_check.c
+++ b/mm/page_table_check.c
@@ -71,6 +71,9 @@ static void page_table_check_clear(unsigned long pfn, unsigned long pgcnt)
page = pfn_to_page(pfn);
page_ext = page_ext_get(page);
+ if (!page_ext)
+ return;
+
BUG_ON(PageSlab(page));
anon = PageAnon(page);
@@ -108,6 +111,9 @@ static void page_table_check_set(unsigned long pfn, unsigned long pgcnt,
page = pfn_to_page(pfn);
page_ext = page_ext_get(page);
+ if (!page_ext)
+ return;
+
BUG_ON(PageSlab(page));
anon = PageAnon(page);
@@ -138,7 +144,10 @@ void __page_table_check_zero(struct page *page, unsigned int order)
BUG_ON(PageSlab(page));
page_ext = page_ext_get(page);
- BUG_ON(!page_ext);
+
+ if (!page_ext)
+ return;
+
for (i = 0; i < (1ul << order); i++) {
struct page_table_check *ptc = get_page_table_check(page_ext);
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 4fcd959dcc..a78a4adf71 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -198,6 +198,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return old;
@@ -208,6 +209,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
return pmdp_invalidate(vma, address, pmdp);
}
#endif
diff --git a/mm/ptdump.c b/mm/ptdump.c
index 03c1bdae4a..106e1d66e9 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
+#include <linux/debugfs.h>
#include <linux/ptdump.h>
#include <linux/kasan.h>
@@ -163,3 +164,24 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd)
/* Flush out the last page */
st->note_page(st, 0, -1, 0);
}
+
+static int check_wx_show(struct seq_file *m, void *v)
+{
+ if (ptdump_check_wx())
+ seq_puts(m, "SUCCESS\n");
+ else
+ seq_puts(m, "FAILED\n");
+
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(check_wx);
+
+static int ptdump_debugfs_init(void)
+{
+ debugfs_create_file("check_wx_pages", 0400, NULL, NULL, &check_wx_fops);
+
+ return 0;
+}
+
+device_initcall(ptdump_debugfs_init);
diff --git a/mm/readahead.c b/mm/readahead.c
index 89139a8721..d55138e956 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -501,10 +501,8 @@ void page_cache_ra_order(struct readahead_control *ractl,
if (new_order < MAX_PAGECACHE_ORDER) {
new_order += 2;
- if (new_order > MAX_PAGECACHE_ORDER)
- new_order = MAX_PAGECACHE_ORDER;
- while ((1 << new_order) > ra->size)
- new_order--;
+ new_order = min_t(unsigned int, MAX_PAGECACHE_ORDER, new_order);
+ new_order = min_t(unsigned int, new_order, ilog2(ra->size));
}
/* See comment in page_cache_ra_unbounded() */
@@ -519,9 +517,6 @@ void page_cache_ra_order(struct readahead_control *ractl,
/* Don't allocate pages past EOF */
while (index + (1UL << order) - 1 > limit)
order--;
- /* THP machinery does not support order-1 */
- if (order == 1)
- order = 0;
err = ra_alloc_folio(ractl, index, mark, order, gfp);
if (err)
break;
diff --git a/mm/rmap.c b/mm/rmap.c
index f5d43edad5..3746a55310 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1780,7 +1780,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
set_huge_pte_at(mm, address, pvmw.pte, pteval,
hsz);
} else {
- dec_mm_counter(mm, mm_counter(&folio->page));
+ dec_mm_counter(mm, mm_counter(folio));
set_pte_at(mm, address, pvmw.pte, pteval);
}
@@ -1795,7 +1795,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
* migration) will not expect userfaults on already
* copied pages.
*/
- dec_mm_counter(mm, mm_counter(&folio->page));
+ dec_mm_counter(mm, mm_counter(folio));
} else if (folio_test_anon(folio)) {
swp_entry_t entry = page_swap_entry(subpage);
pte_t swp_pte;
@@ -1903,7 +1903,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
*
* See Documentation/mm/mmu_notifier.rst
*/
- dec_mm_counter(mm, mm_counter_file(&folio->page));
+ dec_mm_counter(mm, mm_counter_file(folio));
}
discard:
if (unlikely(folio_test_hugetlb(folio)))
@@ -2169,7 +2169,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
swp_pte = pte_swp_mkuffd_wp(swp_pte);
set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
trace_set_migration_pte(pvmw.address, pte_val(swp_pte),
- compound_order(&folio->page));
+ folio_order(folio));
/*
* No need to invalidate here it will synchronize on
* against the special swap migration pte.
@@ -2181,7 +2181,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
set_huge_pte_at(mm, address, pvmw.pte, pteval,
hsz);
} else {
- dec_mm_counter(mm, mm_counter(&folio->page));
+ dec_mm_counter(mm, mm_counter(folio));
set_pte_at(mm, address, pvmw.pte, pteval);
}
@@ -2196,7 +2196,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
* migration) will not expect userfaults on already
* copied pages.
*/
- dec_mm_counter(mm, mm_counter(&folio->page));
+ dec_mm_counter(mm, mm_counter(folio));
} else {
swp_entry_t entry;
pte_t swp_pte;
@@ -2261,7 +2261,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
else
set_pte_at(mm, address, pvmw.pte, swp_pte);
trace_set_migration_pte(address, pte_val(swp_pte),
- compound_order(&folio->page));
+ folio_order(folio));
/*
* No need to invalidate here it will synchronize on
* against the special swap migration pte.
diff --git a/mm/shmem.c b/mm/shmem.c
index 935f3647bb..b66e99e2c7 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -254,7 +254,7 @@ static void shmem_inode_unacct_blocks(struct inode *inode, long pages)
}
static const struct super_operations shmem_ops;
-const struct address_space_operations shmem_aops;
+static const struct address_space_operations shmem_aops;
static const struct file_operations shmem_file_operations;
static const struct inode_operations shmem_inode_operations;
static const struct inode_operations shmem_dir_inode_operations;
@@ -263,6 +263,12 @@ static const struct vm_operations_struct shmem_vm_ops;
static const struct vm_operations_struct shmem_anon_vm_ops;
static struct file_system_type shmem_fs_type;
+bool shmem_mapping(struct address_space *mapping)
+{
+ return mapping->a_ops == &shmem_aops;
+}
+EXPORT_SYMBOL_GPL(shmem_mapping);
+
bool vma_is_anon_shmem(struct vm_area_struct *vma)
{
return vma->vm_ops == &shmem_anon_vm_ops;
@@ -1780,7 +1786,7 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
xa_lock_irq(&swap_mapping->i_pages);
error = shmem_replace_entry(swap_mapping, swap_index, old, new);
if (!error) {
- mem_cgroup_migrate(old, new);
+ mem_cgroup_replace_folio(old, new);
__lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1);
__lruvec_stat_mod_folio(new, NR_SHMEM, 1);
__lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1);
@@ -1960,6 +1966,9 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
int error;
bool alloced;
+ if (WARN_ON_ONCE(!shmem_mapping(inode->i_mapping)))
+ return -EINVAL;
+
if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
return -EFBIG;
repeat:
@@ -2122,12 +2131,36 @@ unlock:
return error;
}
+/**
+ * shmem_get_folio - find and lock a shmem folio.
+ * @inode: inode to search
+ * @index: the page index.
+ * @foliop: pointer to the folio if found
+ * @sgp: SGP_* flags to control behavior
+ *
+ * Looks up the page cache entry at @inode & @index. If a folio is
+ * present, it is returned locked with an increased refcount.
+ *
+ * If the caller modifies data in the folio, it must call folio_mark_dirty()
+ * before unlocking the folio to ensure that the folio is not reclaimed.
+ * There is no need to reserve space before calling folio_mark_dirty().
+ *
+ * When no folio is found, the behavior depends on @sgp:
+ * - for SGP_READ, *@foliop is %NULL and 0 is returned
+ * - for SGP_NOALLOC, *@foliop is %NULL and -ENOENT is returned
+ * - for all other flags a new folio is allocated, inserted into the
+ * page cache and returned locked in @foliop.
+ *
+ * Context: May sleep.
+ * Return: 0 if successful, else a negative error code.
+ */
int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop,
enum sgp_type sgp)
{
return shmem_get_folio_gfp(inode, index, foliop, sgp,
mapping_gfp_mask(inode->i_mapping), NULL, NULL);
}
+EXPORT_SYMBOL_GPL(shmem_get_folio);
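
As a rough usage sketch matching the kernel-doc above (write_to_shmem_folio() and its arguments are invented for the example; SGP_CACHE allocates the folio if it is not already present):

/* Sketch: copy up to one page of data into index 'index' of a tmpfs inode. */
static int write_to_shmem_folio(struct inode *inode, pgoff_t index,
				const void *buf, size_t len)
{
	struct folio *folio;
	void *kaddr;
	int err;

	err = shmem_get_folio(inode, index, &folio, SGP_CACHE);
	if (err)
		return err;

	kaddr = kmap_local_folio(folio, 0);
	memcpy(kaddr, buf, min_t(size_t, len, PAGE_SIZE));
	kunmap_local(kaddr);

	folio_mark_dirty(folio);	/* no space reservation needed */
	folio_unlock(folio);		/* returned locked ... */
	folio_put(folio);		/* ... and with an extra reference */
	return 0;
}
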
/*
* This is like autoremove_wake_function, but it removes the wait queue
@@ -3493,10 +3526,10 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
inode->i_op = &shmem_short_symlink_operations;
} else {
inode_nohighmem(inode);
+ inode->i_mapping->a_ops = &shmem_aops;
error = shmem_get_folio(inode, 0, &folio, SGP_WRITE);
if (error)
goto out_remove_offset;
- inode->i_mapping->a_ops = &shmem_aops;
inode->i_op = &shmem_symlink_inode_operations;
memcpy(folio_address(folio), symname, len);
folio_mark_uptodate(folio);
@@ -4258,6 +4291,24 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
mpol_put(mpol);
if (sbinfo->noswap)
seq_printf(seq, ",noswap");
+#ifdef CONFIG_TMPFS_QUOTA
+ if (sb_has_quota_active(root->d_sb, USRQUOTA))
+ seq_printf(seq, ",usrquota");
+ if (sb_has_quota_active(root->d_sb, GRPQUOTA))
+ seq_printf(seq, ",grpquota");
+ if (sbinfo->qlimits.usrquota_bhardlimit)
+ seq_printf(seq, ",usrquota_block_hardlimit=%lld",
+ sbinfo->qlimits.usrquota_bhardlimit);
+ if (sbinfo->qlimits.grpquota_bhardlimit)
+ seq_printf(seq, ",grpquota_block_hardlimit=%lld",
+ sbinfo->qlimits.grpquota_bhardlimit);
+ if (sbinfo->qlimits.usrquota_ihardlimit)
+ seq_printf(seq, ",usrquota_inode_hardlimit=%lld",
+ sbinfo->qlimits.usrquota_ihardlimit);
+ if (sbinfo->qlimits.grpquota_ihardlimit)
+ seq_printf(seq, ",grpquota_inode_hardlimit=%lld",
+ sbinfo->qlimits.grpquota_ihardlimit);
+#endif
return 0;
}
@@ -4348,7 +4399,9 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
#ifdef CONFIG_TMPFS_POSIX_ACL
sb->s_flags |= SB_POSIXACL;
#endif
- uuid_gen(&sb->s_uuid);
+ uuid_t uuid;
+ uuid_gen(&uuid);
+ super_set_uuid(sb, uuid.b, sizeof(uuid));
#ifdef CONFIG_TMPFS_QUOTA
if (ctx->seen & SHMEM_SEEN_QUOTA) {
@@ -4459,7 +4512,7 @@ static int shmem_error_remove_folio(struct address_space *mapping,
return 0;
}
-const struct address_space_operations shmem_aops = {
+static const struct address_space_operations shmem_aops = {
.writepage = shmem_writepage,
.dirty_folio = noop_dirty_folio,
#ifdef CONFIG_TMPFS
@@ -4471,7 +4524,6 @@ const struct address_space_operations shmem_aops = {
#endif
.error_remove_folio = shmem_error_remove_folio,
};
-EXPORT_SYMBOL(shmem_aops);
static const struct file_operations shmem_file_operations = {
.mmap = shmem_mmap,
@@ -4826,6 +4878,7 @@ struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned lon
{
return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
}
+EXPORT_SYMBOL_GPL(shmem_kernel_file_setup);
/**
* shmem_file_setup - get an unlinked file living in tmpfs
@@ -4903,7 +4956,6 @@ struct folio *shmem_read_folio_gfp(struct address_space *mapping,
struct folio *folio;
int error;
- BUG_ON(!shmem_mapping(mapping));
error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE,
gfp, NULL, NULL);
if (error)
diff --git a/mm/slab.h b/mm/slab.h
index 54deeb0428..d2bc9b1912 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -363,7 +363,6 @@ static inline int objs_per_slab(const struct kmem_cache *cache,
enum slab_state {
DOWN, /* No slab functionality yet */
PARTIAL, /* SLUB: kmem_cache_node available */
- PARTIAL_NODE, /* SLAB: kmalloc size for node struct available */
UP, /* Slab caches usable but not all extras yet */
FULL /* Everything is working */
};
@@ -387,7 +386,7 @@ extern const struct kmalloc_info_struct {
/* Kmalloc array related functions */
void setup_kmalloc_cache_index_table(void);
-void create_kmalloc_caches(slab_flags_t);
+void create_kmalloc_caches(void);
extern u8 kmalloc_size_index[24];
@@ -422,8 +421,6 @@ gfp_t kmalloc_fix_flags(gfp_t flags);
int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags);
void __init kmem_cache_init(void);
-void __init new_kmalloc_cache(int idx, enum kmalloc_cache_type type,
- slab_flags_t flags);
extern void create_boot_cache(struct kmem_cache *, const char *name,
unsigned int size, slab_flags_t flags,
unsigned int useroffset, unsigned int usersize);
@@ -435,8 +432,7 @@ struct kmem_cache *
__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
slab_flags_t flags, void (*ctor)(void *));
-slab_flags_t kmem_cache_flags(unsigned int object_size,
- slab_flags_t flags, const char *name);
+slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name);
static inline bool is_kmalloc_cache(struct kmem_cache *s)
{
@@ -469,7 +465,6 @@ static inline bool is_kmalloc_cache(struct kmem_cache *s)
SLAB_STORE_USER | \
SLAB_TRACE | \
SLAB_CONSISTENCY_CHECKS | \
- SLAB_MEM_SPREAD | \
SLAB_NOLEAKTRACE | \
SLAB_RECLAIM_ACCOUNT | \
SLAB_TEMPORARY | \
@@ -528,7 +523,7 @@ static inline bool __slub_debug_enabled(void)
#endif
/*
- * Returns true if any of the specified slub_debug flags is enabled for the
+ * Returns true if any of the specified slab_debug flags is enabled for the
* cache. Use only for flags parsed by setup_slub_debug() as it also enables
* the static key.
*/
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 238293b1db..f5234672f0 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -50,7 +50,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
*/
#define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \
- SLAB_FAILSLAB | SLAB_NO_MERGE | kasan_never_merge())
+ SLAB_FAILSLAB | SLAB_NO_MERGE)
#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
SLAB_CACHE_DMA32 | SLAB_ACCOUNT)
@@ -172,7 +172,7 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align,
size = ALIGN(size, sizeof(void *));
align = calculate_alignment(flags, align, size);
size = ALIGN(size, align);
- flags = kmem_cache_flags(size, flags, name);
+ flags = kmem_cache_flags(flags, name);
if (flags & SLAB_NEVER_MERGE)
return NULL;
@@ -282,7 +282,7 @@ kmem_cache_create_usercopy(const char *name,
#ifdef CONFIG_SLUB_DEBUG
/*
- * If no slub_debug was enabled globally, the static key is not yet
+ * If no slab_debug was enabled globally, the static key is not yet
* enabled by setup_slub_debug(). Enable it if the cache is being
* created with any of the debugging flags passed explicitly.
* It's also possible that this is the first cache created with
@@ -404,8 +404,12 @@ EXPORT_SYMBOL(kmem_cache_create);
*/
static void kmem_cache_release(struct kmem_cache *s)
{
- sysfs_slab_unlink(s);
- sysfs_slab_release(s);
+ if (slab_state >= FULL) {
+ sysfs_slab_unlink(s);
+ sysfs_slab_release(s);
+ } else {
+ slab_kmem_cache_release(s);
+ }
}
#else
static void kmem_cache_release(struct kmem_cache *s)
@@ -651,7 +655,7 @@ static struct kmem_cache *__init create_kmalloc_cache(const char *name,
struct kmem_cache *
kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init =
-{ /* initialization for https://bugs.llvm.org/show_bug.cgi?id=42570 */ };
+{ /* initialization for https://llvm.org/pr42570 */ };
EXPORT_SYMBOL(kmalloc_caches);
#ifdef CONFIG_RANDOM_KMALLOC_CACHES
@@ -766,7 +770,7 @@ EXPORT_SYMBOL(kmalloc_size_roundup);
}
/*
- * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time.
+ * kmalloc_info[] is to make slab_debug=,kmalloc-xx option work at boot time.
* kmalloc_index() supports up to 2^21=2MB, so the final entry of the table is
* kmalloc-2M.
*/
@@ -853,9 +857,10 @@ static unsigned int __kmalloc_minalign(void)
return max(minalign, arch_slab_minalign());
}
-void __init
-new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags)
+static void __init
+new_kmalloc_cache(int idx, enum kmalloc_cache_type type)
{
+ slab_flags_t flags = 0;
unsigned int minalign = __kmalloc_minalign();
unsigned int aligned_size = kmalloc_info[idx].size;
int aligned_idx = idx;
@@ -902,7 +907,7 @@ new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags)
* may already have been created because they were needed to
* enable allocations for slab creation.
*/
-void __init create_kmalloc_caches(slab_flags_t flags)
+void __init create_kmalloc_caches(void)
{
int i;
enum kmalloc_cache_type type;
@@ -913,7 +918,7 @@ void __init create_kmalloc_caches(slab_flags_t flags)
for (type = KMALLOC_NORMAL; type < NR_KMALLOC_TYPES; type++) {
for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
if (!kmalloc_caches[type][i])
- new_kmalloc_cache(i, type, flags);
+ new_kmalloc_cache(i, type);
/*
* Caches that are not of the two-to-the-power-of size.
@@ -922,10 +927,10 @@ void __init create_kmalloc_caches(slab_flags_t flags)
*/
if (KMALLOC_MIN_SIZE <= 32 && i == 6 &&
!kmalloc_caches[type][1])
- new_kmalloc_cache(1, type, flags);
+ new_kmalloc_cache(1, type);
if (KMALLOC_MIN_SIZE <= 64 && i == 7 &&
!kmalloc_caches[type][2])
- new_kmalloc_cache(2, type, flags);
+ new_kmalloc_cache(2, type);
}
}
#ifdef CONFIG_RANDOM_KMALLOC_CACHES
diff --git a/mm/slub.c b/mm/slub.c
index 914c35a7d4..24f702afd4 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -295,7 +295,7 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
/*
* Debugging flags that require metadata to be stored in the slab. These get
- * disabled when slub_debug=O is used and a cache's min order increases with
+ * disabled when slab_debug=O is used and a cache's min order increases with
* metadata.
*/
#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
@@ -306,13 +306,13 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
/* Internal SLUB flags */
/* Poison object */
-#define __OBJECT_POISON ((slab_flags_t __force)0x80000000U)
+#define __OBJECT_POISON __SLAB_FLAG_BIT(_SLAB_OBJECT_POISON)
/* Use cmpxchg_double */
#ifdef system_has_freelist_aba
-#define __CMPXCHG_DOUBLE ((slab_flags_t __force)0x40000000U)
+#define __CMPXCHG_DOUBLE __SLAB_FLAG_BIT(_SLAB_CMPXCHG_DOUBLE)
#else
-#define __CMPXCHG_DOUBLE ((slab_flags_t __force)0U)
+#define __CMPXCHG_DOUBLE __SLAB_FLAG_UNUSED
#endif
/*
@@ -391,7 +391,7 @@ struct kmem_cache_cpu {
};
struct slab *slab; /* The slab from which we are allocating */
#ifdef CONFIG_SLUB_CPU_PARTIAL
- struct slab *partial; /* Partially allocated frozen slabs */
+ struct slab *partial; /* Partially allocated slabs */
#endif
local_lock_t lock; /* Protects the fields above */
#ifdef CONFIG_SLUB_STATS
@@ -1498,16 +1498,8 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
{
struct kmem_cache_node *n = get_node(s, node);
- /*
- * May be called early in order to allocate a slab for the
- * kmem_cache_node structure. Solve the chicken-egg
- * dilemma by deferring the increment of the count during
- * bootstrap (see early_kmem_cache_node_alloc).
- */
- if (likely(n)) {
- atomic_long_inc(&n->nr_slabs);
- atomic_long_add(objects, &n->total_objects);
- }
+ atomic_long_inc(&n->nr_slabs);
+ atomic_long_add(objects, &n->total_objects);
}
static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
{
@@ -1616,7 +1608,7 @@ static inline int free_consistency_checks(struct kmem_cache *s,
}
/*
- * Parse a block of slub_debug options. Blocks are delimited by ';'
+ * Parse a block of slab_debug options. Blocks are delimited by ';'
*
* @str: start of block
* @flags: returns parsed flags, or DEBUG_DEFAULT_FLAGS if none specified
@@ -1677,7 +1669,7 @@ parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init)
break;
default:
if (init)
- pr_err("slub_debug option '%c' unknown. skipped\n", *str);
+ pr_err("slab_debug option '%c' unknown. skipped\n", *str);
}
}
check_slabs:
@@ -1736,7 +1728,7 @@ static int __init setup_slub_debug(char *str)
/*
* For backwards compatibility, a single list of flags with list of
* slabs means debugging is only changed for those slabs, so the global
- * slub_debug should be unchanged (0 or DEBUG_DEFAULT_FLAGS, depending
+ * slab_debug should be unchanged (0 or DEBUG_DEFAULT_FLAGS, depending
* on CONFIG_SLUB_DEBUG_ON). We can extend that to multiple lists as
* long as there is no option specifying flags without a slab list.
*/
@@ -1760,21 +1752,20 @@ out:
return 1;
}
-__setup("slub_debug", setup_slub_debug);
+__setup("slab_debug", setup_slub_debug);
+__setup_param("slub_debug", slub_debug, setup_slub_debug, 0);
/*
* kmem_cache_flags - apply debugging options to the cache
- * @object_size: the size of an object without meta data
* @flags: flags to set
* @name: name of the cache
*
* Debug option(s) are applied to @flags. In addition to the debug
* option(s), if a slab name (or multiple) is specified i.e.
- * slub_debug=<Debug-Options>,<slab name1>,<slab name2> ...
+ * slab_debug=<Debug-Options>,<slab name1>,<slab name2> ...
* then only the select slabs will receive the debug option(s).
*/
-slab_flags_t kmem_cache_flags(unsigned int object_size,
- slab_flags_t flags, const char *name)
+slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name)
{
char *iter;
size_t len;
@@ -1850,8 +1841,7 @@ static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
struct slab *slab) {}
static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
struct slab *slab) {}
-slab_flags_t kmem_cache_flags(unsigned int object_size,
- slab_flags_t flags, const char *name)
+slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name)
{
return flags;
}
@@ -2038,11 +2028,6 @@ void memcg_slab_alloc_error_hook(struct kmem_cache *s, int objects,
obj_cgroup_uncharge(objcg, objects * obj_full_size(s));
}
#else /* CONFIG_MEMCG_KMEM */
-static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr)
-{
- return NULL;
-}
-
static inline void memcg_free_slab_cgroups(struct slab *slab)
{
}
@@ -2248,7 +2233,7 @@ static void __init init_freelist_randomization(void)
}
/* Get the next entry on the pre-computed freelist randomized */
-static void *next_freelist_entry(struct kmem_cache *s, struct slab *slab,
+static void *next_freelist_entry(struct kmem_cache *s,
unsigned long *pos, void *start,
unsigned long page_limit,
unsigned long freelist_count)
@@ -2287,13 +2272,12 @@ static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab)
start = fixup_red_left(s, slab_address(slab));
/* First entry is used as the base of the freelist */
- cur = next_freelist_entry(s, slab, &pos, start, page_limit,
- freelist_count);
+ cur = next_freelist_entry(s, &pos, start, page_limit, freelist_count);
cur = setup_object(s, cur);
slab->freelist = cur;
for (idx = 1; idx < slab->objects; idx++) {
- next = next_freelist_entry(s, slab, &pos, start, page_limit,
+ next = next_freelist_entry(s, &pos, start, page_limit,
freelist_count);
next = setup_object(s, next);
set_freepointer(s, cur, next);
@@ -3268,7 +3252,7 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
oo_order(s->min));
if (oo_order(s->min) > get_order(s->object_size))
- pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n",
+ pr_warn(" %s debugging increased min order, use slab_debug=O to disable.\n",
s->name);
for_each_kmem_cache_node(s, node, n) {
@@ -3331,7 +3315,6 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
counters = slab->counters;
new.counters = counters;
- VM_BUG_ON(!new.frozen);
new.inuse = slab->objects;
new.frozen = freelist != NULL;
@@ -3503,18 +3486,20 @@ new_slab:
slab = slub_percpu_partial(c);
slub_set_percpu_partial(c, slab);
- local_unlock_irqrestore(&s->cpu_slab->lock, flags);
- stat(s, CPU_PARTIAL_ALLOC);
- if (unlikely(!node_match(slab, node) ||
- !pfmemalloc_match(slab, gfpflags))) {
- slab->next = NULL;
- __put_partials(s, slab);
- continue;
+ if (likely(node_match(slab, node) &&
+ pfmemalloc_match(slab, gfpflags))) {
+ c->slab = slab;
+ freelist = get_freelist(s, slab);
+ VM_BUG_ON(!freelist);
+ stat(s, CPU_PARTIAL_ALLOC);
+ goto load_freelist;
}
- freelist = freeze_slab(s, slab);
- goto retry_load_slab;
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+
+ slab->next = NULL;
+ __put_partials(s, slab);
}
#endif
@@ -3798,11 +3783,11 @@ void slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg,
zero_size = orig_size;
/*
- * When slub_debug is enabled, avoid memory initialization integrated
+ * When slab_debug is enabled, avoid memory initialization integrated
* into KASAN and instead zero out the memory via the memset below with
* the proper size. Otherwise, KASAN might overwrite SLUB redzones and
* cause false-positive reports. This does not lead to a performance
- * penalty on production builds, as slub_debug is not intended to be
+ * penalty on production builds, as slab_debug is not intended to be
* enabled there.
*/
if (__slub_debug_enabled())
@@ -4193,7 +4178,6 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
* then add it.
*/
if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
- remove_full(s, n, slab);
add_partial(n, slab, DEACTIVATE_TO_TAIL);
stat(s, FREE_ADD_PARTIAL);
}
@@ -4207,9 +4191,6 @@ slab_empty:
*/
remove_partial(n, slab);
stat(s, FREE_REMOVE_PARTIAL);
- } else {
- /* Slab must be on the full list */
- remove_full(s, n, slab);
}
spin_unlock_irqrestore(&n->list_lock, flags);
@@ -4708,8 +4689,8 @@ static unsigned int slub_min_objects;
* activity on the partial lists which requires taking the list_lock. This is
* less a concern for large slabs though which are rarely used.
*
- * slub_max_order specifies the order where we begin to stop considering the
- * number of objects in a slab as critical. If we reach slub_max_order then
+ * slab_max_order specifies the order where we begin to stop considering the
+ * number of objects in a slab as critical. If we reach slab_max_order then
* we try to keep the page order as low as possible. So we accept more waste
* of space in favor of a small page order.
*
@@ -4776,14 +4757,14 @@ static inline int calculate_order(unsigned int size)
* and backing off gradually.
*
* We start with accepting at most 1/16 waste and try to find the
- * smallest order from min_objects-derived/slub_min_order up to
- * slub_max_order that will satisfy the constraint. Note that increasing
+ * smallest order from min_objects-derived/slab_min_order up to
+ * slab_max_order that will satisfy the constraint. Note that increasing
* the order can only result in same or less fractional waste, not more.
*
* If that fails, we increase the acceptable fraction of waste and try
* again. The last iteration with fraction of 1/2 would effectively
* accept any waste and give us the order determined by min_objects, as
- * long as at least single object fits within slub_max_order.
+ * long as at least single object fits within slab_max_order.
*/
for (unsigned int fraction = 16; fraction > 1; fraction /= 2) {
order = calc_slab_order(size, min_order, slub_max_order,
@@ -4793,7 +4774,7 @@ static inline int calculate_order(unsigned int size)
}
/*
- * Doh this slab cannot be placed using slub_max_order.
+ * Doh this slab cannot be placed using slab_max_order.
*/
order = get_order(size);
if (order <= MAX_PAGE_ORDER)
@@ -4863,7 +4844,6 @@ static void early_kmem_cache_node_alloc(int node)
slab = new_slab(kmem_cache_node, GFP_NOWAIT, node);
BUG_ON(!slab);
- inc_slabs_node(kmem_cache_node, slab_nid(slab), slab->objects);
if (slab_nid(slab) != node) {
pr_err("SLUB: Unable to allocate memory from node %d\n", node);
pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n");
@@ -5110,7 +5090,7 @@ static int calculate_sizes(struct kmem_cache *s)
static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
{
- s->flags = kmem_cache_flags(s->size, flags, s->name);
+ s->flags = kmem_cache_flags(flags, s->name);
#ifdef CONFIG_SLAB_FREELIST_HARDENED
s->random = get_random_long();
#endif
@@ -5319,7 +5299,9 @@ static int __init setup_slub_min_order(char *str)
return 1;
}
-__setup("slub_min_order=", setup_slub_min_order);
+__setup("slab_min_order=", setup_slub_min_order);
+__setup_param("slub_min_order=", slub_min_order, setup_slub_min_order, 0);
+
static int __init setup_slub_max_order(char *str)
{
@@ -5332,7 +5314,8 @@ static int __init setup_slub_max_order(char *str)
return 1;
}
-__setup("slub_max_order=", setup_slub_max_order);
+__setup("slab_max_order=", setup_slub_max_order);
+__setup_param("slub_max_order=", slub_max_order, setup_slub_max_order, 0);
static int __init setup_slub_min_objects(char *str)
{
@@ -5341,7 +5324,8 @@ static int __init setup_slub_min_objects(char *str)
return 1;
}
-__setup("slub_min_objects=", setup_slub_min_objects);
+__setup("slab_min_objects=", setup_slub_min_objects);
+__setup_param("slub_min_objects=", slub_min_objects, setup_slub_min_objects, 0);
#ifdef CONFIG_HARDENED_USERCOPY
/*
@@ -5669,7 +5653,7 @@ void __init kmem_cache_init(void)
/* Now we can use the kmem_cache to allocate kmalloc slabs */
setup_kmalloc_cache_index_table();
- create_kmalloc_caches(0);
+ create_kmalloc_caches();
/* Setup random freelists for each cache */
init_freelist_randomization();
@@ -6798,14 +6782,12 @@ out_del_kobj:
void sysfs_slab_unlink(struct kmem_cache *s)
{
- if (slab_state >= FULL)
- kobject_del(&s->kobj);
+ kobject_del(&s->kobj);
}
void sysfs_slab_release(struct kmem_cache *s)
{
- if (slab_state >= FULL)
- kobject_put(&s->kobj);
+ kobject_put(&s->kobj);
}
/*
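Note on the parameter renames in this file: the generic slab_debug, slab_min_order, slab_max_order and slab_min_objects spellings become the primary boot parameters, while the old slub_* names stay usable through the __setup_param() aliases above, both resolving to the same setup handlers. An illustrative kernel command line follows (the cache names are only examples; F, Z and P are the documented sanity-check, red-zoning and poisoning flags):

	slab_debug=FZP,kmalloc-64,dentry	enable checks for just those two caches
	slub_debug=FZP,kmalloc-64,dentry	legacy spelling, parsed by the same setup_slub_debug()
	slab_max_order=3 slab_min_objects=16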
diff --git a/mm/sparse.c b/mm/sparse.c
index 338cf946de..aed0951b87 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -908,7 +908,8 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
* Poison uninitialized struct pages in order to catch invalid flags
* combinations.
*/
- page_init_poison(memmap, sizeof(struct page) * nr_pages);
+ if (!altmap || !altmap->inaccessible)
+ page_init_poison(memmap, sizeof(struct page) * nr_pages);
ms = __nr_to_section(section_nr);
set_section_nid(section_nr, nid);
diff --git a/mm/swap.c b/mm/swap.c
index cd8f0150ba..500a09a48d 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -74,22 +74,21 @@ static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = {
.lock = INIT_LOCAL_LOCK(lock),
};
-/*
- * This path almost never happens for VM activity - pages are normally freed
- * in batches. But it gets used by networking - and for compound pages.
- */
-static void __page_cache_release(struct folio *folio)
+static void __page_cache_release(struct folio *folio, struct lruvec **lruvecp,
+ unsigned long *flagsp)
{
if (folio_test_lru(folio)) {
- struct lruvec *lruvec;
- unsigned long flags;
-
- lruvec = folio_lruvec_lock_irqsave(folio, &flags);
- lruvec_del_folio(lruvec, folio);
+ folio_lruvec_relock_irqsave(folio, lruvecp, flagsp);
+ lruvec_del_folio(*lruvecp, folio);
__folio_clear_lru_flags(folio);
- unlock_page_lruvec_irqrestore(lruvec, flags);
}
- /* See comment on folio_test_mlocked in release_pages() */
+
+ /*
+ * In rare cases, when truncation or holepunching raced with
+ * munlock after VM_LOCKED was cleared, Mlocked may still be
+ * found set here. This does not indicate a problem, unless
+ * "unevictable_pgs_cleared" appears worryingly large.
+ */
if (unlikely(folio_test_mlocked(folio))) {
long nr_pages = folio_nr_pages(folio);
@@ -99,9 +98,23 @@ static void __page_cache_release(struct folio *folio)
}
}
+/*
+ * This path almost never happens for VM activity - pages are normally freed
+ * in batches. But it gets used by networking - and for compound pages.
+ */
+static void page_cache_release(struct folio *folio)
+{
+ struct lruvec *lruvec = NULL;
+ unsigned long flags;
+
+ __page_cache_release(folio, &lruvec, &flags);
+ if (lruvec)
+ unlock_page_lruvec_irqrestore(lruvec, flags);
+}
+
static void __folio_put_small(struct folio *folio)
{
- __page_cache_release(folio);
+ page_cache_release(folio);
mem_cgroup_uncharge(folio);
free_unref_page(&folio->page, 0);
}
@@ -115,7 +128,7 @@ static void __folio_put_large(struct folio *folio)
* be called for hugetlb (it has a separate hugetlb_cgroup.)
*/
if (!folio_test_hugetlb(folio))
- __page_cache_release(folio);
+ page_cache_release(folio);
destroy_large_folio(folio);
}
@@ -138,22 +151,25 @@ EXPORT_SYMBOL(__folio_put);
*/
void put_pages_list(struct list_head *pages)
{
+ struct folio_batch fbatch;
struct folio *folio, *next;
+ folio_batch_init(&fbatch);
list_for_each_entry_safe(folio, next, pages, lru) {
- if (!folio_put_testzero(folio)) {
- list_del(&folio->lru);
+ if (!folio_put_testzero(folio))
continue;
- }
if (folio_test_large(folio)) {
- list_del(&folio->lru);
__folio_put_large(folio);
continue;
}
/* LRU flag must be clear because it's passed using the lru */
+ if (folio_batch_add(&fbatch, folio) > 0)
+ continue;
+ free_unref_folios(&fbatch);
}
- free_unref_page_list(pages);
+ if (fbatch.nr)
+ free_unref_folios(&fbatch);
INIT_LIST_HEAD(pages);
}
EXPORT_SYMBOL(put_pages_list);
@@ -175,7 +191,7 @@ static void lru_add_fn(struct lruvec *lruvec, struct folio *folio)
* while the LRU lock is held.
*
* (That is not true of __page_cache_release(), and not necessarily
- * true of release_pages(): but those only clear the mlocked flag after
+ * true of folios_put(): but those only clear the mlocked flag after
* folio_put_testzero() has excluded any other users of the folio.)
*/
if (folio_evictable(folio)) {
@@ -213,7 +229,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
if (move_fn != lru_add_fn && !folio_test_clear_lru(folio))
continue;
- lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags);
+ folio_lruvec_relock_irqsave(folio, &lruvec, &flags);
move_fn(lruvec, folio);
folio_set_lru(folio);
@@ -221,8 +237,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
if (lruvec)
unlock_page_lruvec_irqrestore(lruvec, flags);
- folios_put(fbatch->folios, folio_batch_count(fbatch));
- folio_batch_reinit(fbatch);
+ folios_put(fbatch);
}
static void folio_batch_add_and_move(struct folio_batch *fbatch,
@@ -946,41 +961,29 @@ void lru_cache_disable(void)
}
/**
- * release_pages - batched put_page()
- * @arg: array of pages to release
- * @nr: number of pages
+ * folios_put_refs - Reduce the reference count on a batch of folios.
+ * @folios: The folios.
+ * @refs: The number of refs to subtract from each folio.
*
- * Decrement the reference count on all the pages in @arg. If it
- * fell to zero, remove the page from the LRU and free it.
+ * Like folio_put(), but for a batch of folios. This is more efficient
+ * than writing the loop yourself as it will optimise the locks which need
+ * to be taken if the folios are freed. The folios batch is returned
+ * empty and ready to be reused for another batch; there is no need
+ * to reinitialise it. If @refs is NULL, we subtract one from each
+ * folio refcount.
*
- * Note that the argument can be an array of pages, encoded pages,
- * or folio pointers. We ignore any encoded bits, and turn any of
- * them into just a folio that gets free'd.
+ * Context: May be called in process or interrupt context, but not in NMI
+ * context. May be called while holding a spinlock.
*/
-void release_pages(release_pages_arg arg, int nr)
+void folios_put_refs(struct folio_batch *folios, unsigned int *refs)
{
- int i;
- struct encoded_page **encoded = arg.encoded_pages;
- LIST_HEAD(pages_to_free);
+ int i, j;
struct lruvec *lruvec = NULL;
unsigned long flags = 0;
- unsigned int lock_batch;
- for (i = 0; i < nr; i++) {
- struct folio *folio;
-
- /* Turn any of the argument types into a folio */
- folio = page_folio(encoded_page_ptr(encoded[i]));
-
- /*
- * Make sure the IRQ-safe lock-holding time does not get
- * excessive with a continuous string of pages from the
- * same lruvec. The lock is held only if lruvec != NULL.
- */
- if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) {
- unlock_page_lruvec_irqrestore(lruvec, flags);
- lruvec = NULL;
- }
+ for (i = 0, j = 0; i < folios->nr; i++) {
+ struct folio *folio = folios->folios[i];
+ unsigned int nr_refs = refs ? refs[i] : 1;
if (is_huge_zero_page(&folio->page))
continue;
@@ -990,56 +993,85 @@ void release_pages(release_pages_arg arg, int nr)
unlock_page_lruvec_irqrestore(lruvec, flags);
lruvec = NULL;
}
- if (put_devmap_managed_page(&folio->page))
+ if (put_devmap_managed_page_refs(&folio->page, nr_refs))
continue;
- if (folio_put_testzero(folio))
+ if (folio_ref_sub_and_test(folio, nr_refs))
free_zone_device_page(&folio->page);
continue;
}
- if (!folio_put_testzero(folio))
+ if (!folio_ref_sub_and_test(folio, nr_refs))
continue;
- if (folio_test_large(folio)) {
+ /* hugetlb has its own memcg */
+ if (folio_test_hugetlb(folio)) {
if (lruvec) {
unlock_page_lruvec_irqrestore(lruvec, flags);
lruvec = NULL;
}
- __folio_put_large(folio);
+ free_huge_folio(folio);
continue;
}
+ if (folio_test_large(folio) &&
+ folio_test_large_rmappable(folio))
+ folio_undo_large_rmappable(folio);
- if (folio_test_lru(folio)) {
- struct lruvec *prev_lruvec = lruvec;
+ __page_cache_release(folio, &lruvec, &flags);
- lruvec = folio_lruvec_relock_irqsave(folio, lruvec,
- &flags);
- if (prev_lruvec != lruvec)
- lock_batch = 0;
+ if (j != i)
+ folios->folios[j] = folio;
+ j++;
+ }
+ if (lruvec)
+ unlock_page_lruvec_irqrestore(lruvec, flags);
+ if (!j) {
+ folio_batch_reinit(folios);
+ return;
+ }
- lruvec_del_folio(lruvec, folio);
- __folio_clear_lru_flags(folio);
- }
+ folios->nr = j;
+ mem_cgroup_uncharge_folios(folios);
+ free_unref_folios(folios);
+}
+EXPORT_SYMBOL(folios_put_refs);
- /*
- * In rare cases, when truncation or holepunching raced with
- * munlock after VM_LOCKED was cleared, Mlocked may still be
- * found set here. This does not indicate a problem, unless
- * "unevictable_pgs_cleared" appears worryingly large.
- */
- if (unlikely(folio_test_mlocked(folio))) {
- __folio_clear_mlocked(folio);
- zone_stat_sub_folio(folio, NR_MLOCK);
- count_vm_event(UNEVICTABLE_PGCLEARED);
- }
+/**
+ * release_pages - batched put_page()
+ * @arg: array of pages to release
+ * @nr: number of pages
+ *
+ * Decrement the reference count on all the pages in @arg. If it
+ * fell to zero, remove the page from the LRU and free it.
+ *
+ * Note that the argument can be an array of pages, encoded pages,
+ * or folio pointers. We ignore any encoded bits, and turn any of
+ * them into just a folio that gets free'd.
+ */
+void release_pages(release_pages_arg arg, int nr)
+{
+ struct folio_batch fbatch;
+ int refs[PAGEVEC_SIZE];
+ struct encoded_page **encoded = arg.encoded_pages;
+ int i;
+
+ folio_batch_init(&fbatch);
+ for (i = 0; i < nr; i++) {
+ /* Turn any of the argument types into a folio */
+ struct folio *folio = page_folio(encoded_page_ptr(encoded[i]));
+
+ /* Is our next entry actually "nr_pages" -> "nr_refs" ? */
+ refs[fbatch.nr] = 1;
+ if (unlikely(encoded_page_flags(encoded[i]) &
+ ENCODED_PAGE_BIT_NR_PAGES_NEXT))
+ refs[fbatch.nr] = encoded_nr_pages(encoded[++i]);
- list_add(&folio->lru, &pages_to_free);
+ if (folio_batch_add(&fbatch, folio) > 0)
+ continue;
+ folios_put_refs(&fbatch, refs);
}
- if (lruvec)
- unlock_page_lruvec_irqrestore(lruvec, flags);
- mem_cgroup_uncharge_list(&pages_to_free);
- free_unref_page_list(&pages_to_free);
+ if (fbatch.nr)
+ folios_put_refs(&fbatch, refs);
}
EXPORT_SYMBOL(release_pages);
@@ -1059,8 +1091,7 @@ void __folio_batch_release(struct folio_batch *fbatch)
lru_add_drain();
fbatch->percpu_pvec_drained = true;
}
- release_pages(fbatch->folios, folio_batch_count(fbatch));
- folio_batch_reinit(fbatch);
+ folios_put(fbatch);
}
EXPORT_SYMBOL(__folio_batch_release);
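For orientation, the following is a minimal user-space analogue of the folios_put_refs() calling convention introduced above. It is only a sketch (plain integers stand in for folio reference counts and BATCH_SIZE for PAGEVEC_SIZE), but it shows the two properties callers rely on: each batch entry may carry its own reference delta, and the batch comes back empty and ready for reuse.

	#include <stdio.h>
	#include <stdlib.h>

	#define BATCH_SIZE 15			/* stand-in for PAGEVEC_SIZE */

	struct object {
		int refcount;
	};

	struct batch {
		unsigned int nr;
		struct object *items[BATCH_SIZE];
	};

	/* Drop refs[i] references from each entry; free objects that reach zero. */
	static void batch_put_refs(struct batch *b, const unsigned int *refs)
	{
		for (unsigned int i = 0; i < b->nr; i++) {
			struct object *obj = b->items[i];
			unsigned int nr_refs = refs ? refs[i] : 1;

			obj->refcount -= nr_refs;
			if (obj->refcount == 0)
				free(obj);
		}
		b->nr = 0;			/* returned empty, ready for reuse */
	}

	int main(void)
	{
		struct batch b = { .nr = 0 };
		unsigned int refs[BATCH_SIZE];
		struct object *o = malloc(sizeof(*o));

		o->refcount = 3;
		refs[b.nr] = 3;			/* e.g. an entry standing for three references */
		b.items[b.nr++] = o;

		batch_put_refs(&b, refs);
		printf("batch drained, nr=%u\n", b.nr);
		return 0;
	}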
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 0bec1f705f..90973ce788 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -273,6 +273,9 @@ void free_swap_slot(swp_entry_t entry)
{
struct swap_slots_cache *cache;
+ /* Large folio swap slot is not covered. */
+ zswap_invalidate(entry);
+
cache = raw_cpu_ptr(&swp_slots);
if (likely(use_swap_slot_cache && cache->slots_ret)) {
spin_lock_irq(&cache->free_lock);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 7255c01a1e..bfc7e8c58a 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -15,6 +15,7 @@
#include <linux/swapops.h>
#include <linux/init.h>
#include <linux/pagemap.h>
+#include <linux/pagevec.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/migrate.h>
@@ -282,10 +283,8 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin,
* folio_free_swap() _with_ the lock.
* - Marcelo
*/
-void free_swap_cache(struct page *page)
+void free_swap_cache(struct folio *folio)
{
- struct folio *folio = page_folio(page);
-
if (folio_test_swapcache(folio) && !folio_mapped(folio) &&
folio_trylock(folio)) {
folio_free_swap(folio);
@@ -299,9 +298,11 @@ void free_swap_cache(struct page *page)
*/
void free_page_and_swap_cache(struct page *page)
{
- free_swap_cache(page);
+ struct folio *folio = page_folio(page);
+
+ free_swap_cache(folio);
if (!is_huge_zero_page(page))
- put_page(page);
+ folio_put(folio);
}
/*
@@ -310,10 +311,25 @@ void free_page_and_swap_cache(struct page *page)
*/
void free_pages_and_swap_cache(struct encoded_page **pages, int nr)
{
+ struct folio_batch folios;
+ unsigned int refs[PAGEVEC_SIZE];
+
lru_add_drain();
- for (int i = 0; i < nr; i++)
- free_swap_cache(encoded_page_ptr(pages[i]));
- release_pages(pages, nr);
+ folio_batch_init(&folios);
+ for (int i = 0; i < nr; i++) {
+ struct folio *folio = page_folio(encoded_page_ptr(pages[i]));
+
+ free_swap_cache(folio);
+ refs[folios.nr] = 1;
+ if (unlikely(encoded_page_flags(pages[i]) &
+ ENCODED_PAGE_BIT_NR_PAGES_NEXT))
+ refs[folios.nr] = encoded_nr_pages(pages[++i]);
+
+ if (folio_batch_add(&folios, folio) == 0)
+ folios_put_refs(&folios, refs);
+ }
+ if (folios.nr)
+ folios_put_refs(&folios, refs);
}
static inline bool swap_use_vma_readahead(void)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6fe0cc2553..4919423cce 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -737,8 +737,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
if (was_full && (si->flags & SWP_WRITEOK))
add_to_avail_list(si);
}
- atomic_long_add(nr_entries, &nr_swap_pages);
- WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries);
if (si->flags & SWP_BLKDEV)
swap_slot_free_notify =
si->bdev->bd_disk->fops->swap_slot_free_notify;
@@ -746,12 +744,19 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
swap_slot_free_notify = NULL;
while (offset <= end) {
arch_swap_invalidate_page(si->type, offset);
- zswap_invalidate(si->type, offset);
if (swap_slot_free_notify)
swap_slot_free_notify(si->bdev, offset);
offset++;
}
clear_shadow_from_swap_cache(si->type, begin, end);
+
+ /*
+ * Make sure that try_to_unuse() observes si->inuse_pages reaching 0
+ * only after the above cleanups are done.
+ */
+ smp_wmb();
+ atomic_long_add(nr_entries, &nr_swap_pages);
+ WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries);
}
static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
@@ -2060,7 +2065,7 @@ static int try_to_unuse(unsigned int type)
unsigned int i;
if (!READ_ONCE(si->inuse_pages))
- return 0;
+ goto success;
retry:
retval = shmem_unuse(type);
@@ -2141,6 +2146,12 @@ retry:
return -EINTR;
}
+success:
+ /*
+ * Make sure that further cleanups after try_to_unuse() returns happen
+ * after swap_range_free() reduces si->inuse_pages to 0.
+ */
+ smp_mb();
return 0;
}
@@ -2359,8 +2370,6 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
unsigned char *swap_map,
struct swap_cluster_info *cluster_info)
{
- zswap_swapon(p->type);
-
spin_lock(&swap_lock);
spin_lock(&p->lock);
setup_swap_info(p, prio, swap_map, cluster_info);
@@ -2543,10 +2552,10 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
exit_swap_address_space(p->type);
inode = mapping->host;
- if (p->bdev_handle) {
+ if (p->bdev_file) {
set_blocksize(p->bdev, old_block_size);
- bdev_release(p->bdev_handle);
- p->bdev_handle = NULL;
+ fput(p->bdev_file);
+ p->bdev_file = NULL;
}
inode_lock(inode);
@@ -2776,14 +2785,14 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
int error;
if (S_ISBLK(inode->i_mode)) {
- p->bdev_handle = bdev_open_by_dev(inode->i_rdev,
+ p->bdev_file = bdev_file_open_by_dev(inode->i_rdev,
BLK_OPEN_READ | BLK_OPEN_WRITE, p, NULL);
- if (IS_ERR(p->bdev_handle)) {
- error = PTR_ERR(p->bdev_handle);
- p->bdev_handle = NULL;
+ if (IS_ERR(p->bdev_file)) {
+ error = PTR_ERR(p->bdev_file);
+ p->bdev_file = NULL;
return error;
}
- p->bdev = p->bdev_handle->bdev;
+ p->bdev = file_bdev(p->bdev_file);
p->old_block_size = block_size(p->bdev);
error = set_blocksize(p->bdev, PAGE_SIZE);
if (error < 0)
@@ -3178,6 +3187,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
if (error)
goto bad_swap_unlock_inode;
+ error = zswap_swapon(p->type, maxpages);
+ if (error)
+ goto free_swap_address_space;
+
/*
* Flush any pending IO and dirty mappings before we start using this
* swap device.
@@ -3186,7 +3199,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
error = inode_drain_writes(inode);
if (error) {
inode->i_flags &= ~S_SWAPFILE;
- goto free_swap_address_space;
+ goto free_swap_zswap;
}
mutex_lock(&swapon_mutex);
@@ -3210,6 +3223,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
error = 0;
goto out;
+free_swap_zswap:
+ zswap_swapoff(p->type);
free_swap_address_space:
exit_swap_address_space(p->type);
bad_swap_unlock_inode:
@@ -3219,10 +3234,10 @@ bad_swap:
p->percpu_cluster = NULL;
free_percpu(p->cluster_next_cpu);
p->cluster_next_cpu = NULL;
- if (p->bdev_handle) {
+ if (p->bdev_file) {
set_blocksize(p->bdev, p->old_block_size);
- bdev_release(p->bdev_handle);
- p->bdev_handle = NULL;
+ fput(p->bdev_file);
+ p->bdev_file = NULL;
}
inode = NULL;
destroy_swap_extents(p);
@@ -3331,7 +3346,8 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
} else
err = -ENOENT; /* unused swap entry */
- WRITE_ONCE(p->swap_map[offset], count | has_cache);
+ if (!err)
+ WRITE_ONCE(p->swap_map[offset], count | has_cache);
unlock_out:
unlock_cluster_or_swap_info(p, ci);
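The new smp_wmb()/smp_mb() pair above is a publish/observe pattern: every cleanup done by swap_range_free() must be visible before si->inuse_pages can be seen reaching zero by try_to_unuse(). A compact C11 sketch of the same idea, using release/acquire ordering rather than the kernel's explicit barriers with WRITE_ONCE()/READ_ONCE() (illustrative only, not kernel code):

	#include <stdatomic.h>
	#include <stdbool.h>

	static int cleanups_done;		/* plain data published by the writer */
	static atomic_long inuse_pages = 1;

	/* Writer side, analogous to swap_range_free(): finish the cleanup work,
	 * then let the counter drop with release semantics. */
	static void range_free(void)
	{
		cleanups_done = 1;		/* invalidate caches, clear shadows, ... */
		atomic_fetch_sub_explicit(&inuse_pages, 1, memory_order_release);
	}

	/* Reader side, analogous to try_to_unuse(): once the counter is observed
	 * as zero with acquire semantics, the cleanups are guaranteed visible. */
	static bool all_unused(void)
	{
		if (atomic_load_explicit(&inuse_pages, memory_order_acquire) != 0)
			return false;
		return cleanups_done == 1;	/* must hold after the acquire */
	}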
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index aa8f5a6c32..829f7b1089 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -20,19 +20,11 @@
#include "internal.h"
static __always_inline
-struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
- unsigned long dst_start,
- unsigned long len)
+bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
{
- /*
- * Make sure that the dst range is both valid and fully within a
- * single existing vma.
- */
- struct vm_area_struct *dst_vma;
-
- dst_vma = find_vma(dst_mm, dst_start);
- if (!range_in_vma(dst_vma, dst_start, dst_start + len))
- return NULL;
+ /* Make sure that the dst range is fully within dst_vma. */
+ if (dst_end > dst_vma->vm_end)
+ return false;
/*
* Check the vma is registered in uffd, this is required to
@@ -40,11 +32,122 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
* time.
*/
if (!dst_vma->vm_userfaultfd_ctx.ctx)
- return NULL;
+ return false;
+
+ return true;
+}
+
+static __always_inline
+struct vm_area_struct *find_vma_and_prepare_anon(struct mm_struct *mm,
+ unsigned long addr)
+{
+ struct vm_area_struct *vma;
+
+ mmap_assert_locked(mm);
+ vma = vma_lookup(mm, addr);
+ if (!vma)
+ vma = ERR_PTR(-ENOENT);
+ else if (!(vma->vm_flags & VM_SHARED) &&
+ unlikely(anon_vma_prepare(vma)))
+ vma = ERR_PTR(-ENOMEM);
+
+ return vma;
+}
+
+#ifdef CONFIG_PER_VMA_LOCK
+/*
+ * lock_vma() - Lookup and lock vma corresponding to @address.
+ * @mm: mm to search vma in.
+ * @address: address that the vma should contain.
+ *
+ * Should be called without holding mmap_lock. vma should be unlocked after use
+ * with unlock_vma().
+ *
+ * Return: A locked vma containing @address, -ENOENT if no vma is found, or
+ * -ENOMEM if anon_vma couldn't be allocated.
+ */
+static struct vm_area_struct *lock_vma(struct mm_struct *mm,
+ unsigned long address)
+{
+ struct vm_area_struct *vma;
+ vma = lock_vma_under_rcu(mm, address);
+ if (vma) {
+ /*
+ * lock_vma_under_rcu() only checks anon_vma for private
+ * anonymous mappings. But we need to ensure it is assigned in
+ * private file-backed vmas as well.
+ */
+ if (!(vma->vm_flags & VM_SHARED) && unlikely(!vma->anon_vma))
+ vma_end_read(vma);
+ else
+ return vma;
+ }
+
+ mmap_read_lock(mm);
+ vma = find_vma_and_prepare_anon(mm, address);
+ if (!IS_ERR(vma)) {
+ /*
+ * We cannot use vma_start_read() as it may fail due to
+ * false locked (see comment in vma_start_read()). We
+ * can avoid that by directly locking vm_lock under
+ * mmap_lock, which guarantees that nobody can lock the
+ * vma for write (vma_start_write()) under us.
+ */
+ down_read(&vma->vm_lock->lock);
+ }
+
+ mmap_read_unlock(mm);
+ return vma;
+}
+
+static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
+ unsigned long dst_start,
+ unsigned long len)
+{
+ struct vm_area_struct *dst_vma;
+
+ dst_vma = lock_vma(dst_mm, dst_start);
+ if (IS_ERR(dst_vma) || validate_dst_vma(dst_vma, dst_start + len))
+ return dst_vma;
+
+ vma_end_read(dst_vma);
+ return ERR_PTR(-ENOENT);
+}
+
+static void uffd_mfill_unlock(struct vm_area_struct *vma)
+{
+ vma_end_read(vma);
+}
+
+#else
+
+static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
+ unsigned long dst_start,
+ unsigned long len)
+{
+ struct vm_area_struct *dst_vma;
+
+ mmap_read_lock(dst_mm);
+ dst_vma = find_vma_and_prepare_anon(dst_mm, dst_start);
+ if (IS_ERR(dst_vma))
+ goto out_unlock;
+
+ if (validate_dst_vma(dst_vma, dst_start + len))
+ return dst_vma;
+
+ dst_vma = ERR_PTR(-ENOENT);
+out_unlock:
+ mmap_read_unlock(dst_mm);
return dst_vma;
}
+static void uffd_mfill_unlock(struct vm_area_struct *vma)
+{
+ mmap_read_unlock(vma->vm_mm);
+}
+#endif
+
/* Check if dst_addr is outside of file's size. Must be called with ptl held. */
static bool mfill_file_over_size(struct vm_area_struct *dst_vma,
unsigned long dst_addr)
@@ -124,7 +227,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
* Must happen after rmap, as mm_counter() checks mapping (via
* PageAnon()), which is set by __page_set_anon_rmap().
*/
- inc_mm_counter(dst_mm, mm_counter(page));
+ inc_mm_counter(dst_mm, mm_counter(folio));
set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
@@ -385,18 +488,18 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
#ifdef CONFIG_HUGETLB_PAGE
/*
* mfill_atomic processing for HUGETLB vmas. Note that this routine is
- * called with mmap_lock held, it will release mmap_lock before returning.
+ * called with either vma-lock or mmap_lock held, it will release the lock
+ * before returning.
*/
static __always_inline ssize_t mfill_atomic_hugetlb(
+ struct userfaultfd_ctx *ctx,
struct vm_area_struct *dst_vma,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
- atomic_t *mmap_changing,
uffd_flags_t flags)
{
struct mm_struct *dst_mm = dst_vma->vm_mm;
- int vm_shared = dst_vma->vm_flags & VM_SHARED;
ssize_t err;
pte_t *dst_pte;
unsigned long src_addr, dst_addr;
@@ -414,7 +517,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
* feature is not supported.
*/
if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
- mmap_read_unlock(dst_mm);
+ up_read(&ctx->map_changing_lock);
+ uffd_mfill_unlock(dst_vma);
return -EINVAL;
}
@@ -437,24 +541,28 @@ retry:
* retry, dst_vma will be set to NULL and we must lookup again.
*/
if (!dst_vma) {
+ dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
+ if (IS_ERR(dst_vma)) {
+ err = PTR_ERR(dst_vma);
+ goto out;
+ }
+
err = -ENOENT;
- dst_vma = find_dst_vma(dst_mm, dst_start, len);
- if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
- goto out_unlock;
+ if (!is_vm_hugetlb_page(dst_vma))
+ goto out_unlock_vma;
err = -EINVAL;
if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
- goto out_unlock;
-
- vm_shared = dst_vma->vm_flags & VM_SHARED;
- }
+ goto out_unlock_vma;
- /*
- * If not shared, ensure the dst_vma has a anon_vma.
- */
- err = -ENOMEM;
- if (!vm_shared) {
- if (unlikely(anon_vma_prepare(dst_vma)))
+ /*
+ * If memory mappings are changing because of non-cooperative
+ * operation (e.g. mremap) running in parallel, bail out and
+ * request the user to retry later
+ */
+ down_read(&ctx->map_changing_lock);
+ err = -EAGAIN;
+ if (atomic_read(&ctx->mmap_changing))
goto out_unlock;
}
@@ -498,7 +606,8 @@ retry:
cond_resched();
if (unlikely(err == -ENOENT)) {
- mmap_read_unlock(dst_mm);
+ up_read(&ctx->map_changing_lock);
+ uffd_mfill_unlock(dst_vma);
BUG_ON(!folio);
err = copy_folio_from_user(folio,
@@ -507,16 +616,6 @@ retry:
err = -EFAULT;
goto out;
}
- mmap_read_lock(dst_mm);
- /*
- * If memory mappings are changing because of non-cooperative
- * operation (e.g. mremap) running in parallel, bail out and
- * request the user to retry later
- */
- if (mmap_changing && atomic_read(mmap_changing)) {
- err = -EAGAIN;
- break;
- }
dst_vma = NULL;
goto retry;
@@ -536,7 +635,9 @@ retry:
}
out_unlock:
- mmap_read_unlock(dst_mm);
+ up_read(&ctx->map_changing_lock);
+out_unlock_vma:
+ uffd_mfill_unlock(dst_vma);
out:
if (folio)
folio_put(folio);
@@ -547,11 +648,11 @@ out:
}
#else /* !CONFIG_HUGETLB_PAGE */
/* fail at build time if gcc attempts to use this */
-extern ssize_t mfill_atomic_hugetlb(struct vm_area_struct *dst_vma,
+extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
+ struct vm_area_struct *dst_vma,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
- atomic_t *mmap_changing,
uffd_flags_t flags);
#endif /* CONFIG_HUGETLB_PAGE */
@@ -599,13 +700,13 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
return err;
}
-static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
+static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
- atomic_t *mmap_changing,
uffd_flags_t flags)
{
+ struct mm_struct *dst_mm = ctx->mm;
struct vm_area_struct *dst_vma;
ssize_t err;
pmd_t *dst_pmd;
@@ -628,24 +729,24 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
copied = 0;
folio = NULL;
retry:
- mmap_read_lock(dst_mm);
+ /*
+ * Make sure the vma is not shared, that the dst range is
+ * both valid and fully within a single existing vma.
+ */
+ dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
+ if (IS_ERR(dst_vma)) {
+ err = PTR_ERR(dst_vma);
+ goto out;
+ }
/*
* If memory mappings are changing because of non-cooperative
* operation (e.g. mremap) running in parallel, bail out and
* request the user to retry later
*/
+ down_read(&ctx->map_changing_lock);
err = -EAGAIN;
- if (mmap_changing && atomic_read(mmap_changing))
- goto out_unlock;
-
- /*
- * Make sure the vma is not shared, that the dst range is
- * both valid and fully within a single existing vma.
- */
- err = -ENOENT;
- dst_vma = find_dst_vma(dst_mm, dst_start, len);
- if (!dst_vma)
+ if (atomic_read(&ctx->mmap_changing))
goto out_unlock;
err = -EINVAL;
@@ -668,8 +769,8 @@ retry:
* If this is a HUGETLB vma, pass off to appropriate routine
*/
if (is_vm_hugetlb_page(dst_vma))
- return mfill_atomic_hugetlb(dst_vma, dst_start, src_start,
- len, mmap_changing, flags);
+ return mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
+ src_start, len, flags);
if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
goto out_unlock;
@@ -677,16 +778,6 @@ retry:
uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
goto out_unlock;
- /*
- * Ensure the dst_vma has a anon_vma or this page
- * would get a NULL anon_vma when moved in the
- * dst_vma.
- */
- err = -ENOMEM;
- if (!(dst_vma->vm_flags & VM_SHARED) &&
- unlikely(anon_vma_prepare(dst_vma)))
- goto out_unlock;
-
while (src_addr < src_start + len) {
pmd_t dst_pmdval;
@@ -728,7 +819,8 @@ retry:
if (unlikely(err == -ENOENT)) {
void *kaddr;
- mmap_read_unlock(dst_mm);
+ up_read(&ctx->map_changing_lock);
+ uffd_mfill_unlock(dst_vma);
BUG_ON(!folio);
kaddr = kmap_local_folio(folio, 0);
@@ -758,7 +850,8 @@ retry:
}
out_unlock:
- mmap_read_unlock(dst_mm);
+ up_read(&ctx->map_changing_lock);
+ uffd_mfill_unlock(dst_vma);
out:
if (folio)
folio_put(folio);
@@ -768,34 +861,42 @@ out:
return copied ? copied : err;
}
-ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
+ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
unsigned long src_start, unsigned long len,
- atomic_t *mmap_changing, uffd_flags_t flags)
+ uffd_flags_t flags)
{
- return mfill_atomic(dst_mm, dst_start, src_start, len, mmap_changing,
+ return mfill_atomic(ctx, dst_start, src_start, len,
uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY));
}
-ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len, atomic_t *mmap_changing)
+ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
+ unsigned long start,
+ unsigned long len)
{
- return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
+ return mfill_atomic(ctx, start, 0, len,
uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE));
}
-ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len, atomic_t *mmap_changing,
- uffd_flags_t flags)
+ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start,
+ unsigned long len, uffd_flags_t flags)
{
- return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
+
+ /*
+ * A caller might reasonably assume that UFFDIO_CONTINUE contains an
+ * smp_wmb() to ensure that any writes to the about-to-be-mapped page by
+ * the thread doing the UFFDIO_CONTINUE are guaranteed to be visible to
+ * subsequent loads from the page through the newly mapped address range.
+ */
+ smp_wmb();
+
+ return mfill_atomic(ctx, start, 0, len,
uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
}
-ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len, atomic_t *mmap_changing,
- uffd_flags_t flags)
+ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
+ unsigned long len, uffd_flags_t flags)
{
- return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
+ return mfill_atomic(ctx, start, 0, len,
uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON));
}
@@ -828,10 +929,10 @@ long uffd_wp_range(struct vm_area_struct *dst_vma,
return ret;
}
-int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len, bool enable_wp,
- atomic_t *mmap_changing)
+int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
+ unsigned long len, bool enable_wp)
{
+ struct mm_struct *dst_mm = ctx->mm;
unsigned long end = start + len;
unsigned long _start, _end;
struct vm_area_struct *dst_vma;
@@ -855,8 +956,9 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
* operation (e.g. mremap) running in parallel, bail out and
* request the user to retry later
*/
+ down_read(&ctx->map_changing_lock);
err = -EAGAIN;
- if (mmap_changing && atomic_read(mmap_changing))
+ if (atomic_read(&ctx->mmap_changing))
goto out_unlock;
err = -ENOENT;
@@ -885,6 +987,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
err = 0;
}
out_unlock:
+ up_read(&ctx->map_changing_lock);
mmap_read_unlock(dst_mm);
return err;
}
@@ -994,6 +1097,33 @@ static int move_swap_pte(struct mm_struct *mm,
return 0;
}
+static int move_zeropage_pte(struct mm_struct *mm,
+ struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma,
+ unsigned long dst_addr, unsigned long src_addr,
+ pte_t *dst_pte, pte_t *src_pte,
+ pte_t orig_dst_pte, pte_t orig_src_pte,
+ spinlock_t *dst_ptl, spinlock_t *src_ptl)
+{
+ pte_t zero_pte;
+
+ double_pt_lock(dst_ptl, src_ptl);
+ if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
+ !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
+ double_pt_unlock(dst_ptl, src_ptl);
+ return -EAGAIN;
+ }
+
+ zero_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
+ dst_vma->vm_page_prot));
+ ptep_clear_flush(src_vma, src_addr, src_pte);
+ set_pte_at(mm, dst_addr, dst_pte, zero_pte);
+ double_pt_unlock(dst_ptl, src_ptl);
+
+ return 0;
+}
+
+
/*
* The mmap_lock for reading is held by the caller. Just move the page
* from src_pmd to dst_pmd if possible, and return true if succeeded
@@ -1076,6 +1206,14 @@ retry:
}
if (pte_present(orig_src_pte)) {
+ if (is_zero_pfn(pte_pfn(orig_src_pte))) {
+ err = move_zeropage_pte(mm, dst_vma, src_vma,
+ dst_addr, src_addr, dst_pte, src_pte,
+ orig_dst_pte, orig_src_pte,
+ dst_ptl, src_ptl);
+ goto out;
+ }
+
/*
* Pin and lock both source folio and anon_vma. Since we are in
* RCU read section, we can't block, so on contention have to
@@ -1259,27 +1397,137 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma))
return -EINVAL;
+ return 0;
+}
+
+static __always_inline
+int find_vmas_mm_locked(struct mm_struct *mm,
+ unsigned long dst_start,
+ unsigned long src_start,
+ struct vm_area_struct **dst_vmap,
+ struct vm_area_struct **src_vmap)
+{
+ struct vm_area_struct *vma;
+
+ mmap_assert_locked(mm);
+ vma = find_vma_and_prepare_anon(mm, dst_start);
+ if (IS_ERR(vma))
+ return PTR_ERR(vma);
+
+ *dst_vmap = vma;
+ /* Skip finding src_vma if src_start is in dst_vma */
+ if (src_start >= vma->vm_start && src_start < vma->vm_end)
+ goto out_success;
+
+ vma = vma_lookup(mm, src_start);
+ if (!vma)
+ return -ENOENT;
+out_success:
+ *src_vmap = vma;
+ return 0;
+}
+
+#ifdef CONFIG_PER_VMA_LOCK
+static int uffd_move_lock(struct mm_struct *mm,
+ unsigned long dst_start,
+ unsigned long src_start,
+ struct vm_area_struct **dst_vmap,
+ struct vm_area_struct **src_vmap)
+{
+ struct vm_area_struct *vma;
+ int err;
+
+ vma = lock_vma(mm, dst_start);
+ if (IS_ERR(vma))
+ return PTR_ERR(vma);
+
+ *dst_vmap = vma;
/*
- * Ensure the dst_vma has a anon_vma or this page
- * would get a NULL anon_vma when moved in the
- * dst_vma.
+ * Skip finding src_vma if src_start is in dst_vma. This also ensures
+ * that we don't lock the same vma twice.
*/
- if (unlikely(anon_vma_prepare(dst_vma)))
- return -ENOMEM;
+ if (src_start >= vma->vm_start && src_start < vma->vm_end) {
+ *src_vmap = vma;
+ return 0;
+ }
- return 0;
+ /*
+ * Using lock_vma() to get src_vma can lead to following deadlock:
+ *
+ * Thread1 Thread2
+ * ------- -------
+ * vma_start_read(dst_vma)
+ * mmap_write_lock(mm)
+ * vma_start_write(src_vma)
+ * vma_start_read(src_vma)
+ * mmap_read_lock(mm)
+ * vma_start_write(dst_vma)
+ */
+ *src_vmap = lock_vma_under_rcu(mm, src_start);
+ if (likely(*src_vmap))
+ return 0;
+
+ /* Undo any locking and retry in mmap_lock critical section */
+ vma_end_read(*dst_vmap);
+
+ mmap_read_lock(mm);
+ err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
+ if (!err) {
+ /*
+ * See comment in lock_vma() as to why not using
+ * vma_start_read() here.
+ */
+ down_read(&(*dst_vmap)->vm_lock->lock);
+ if (*dst_vmap != *src_vmap)
+ down_read_nested(&(*src_vmap)->vm_lock->lock,
+ SINGLE_DEPTH_NESTING);
+ }
+ mmap_read_unlock(mm);
+ return err;
+}
+
+static void uffd_move_unlock(struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma)
+{
+ vma_end_read(src_vma);
+ if (src_vma != dst_vma)
+ vma_end_read(dst_vma);
+}
+
+#else
+
+static int uffd_move_lock(struct mm_struct *mm,
+ unsigned long dst_start,
+ unsigned long src_start,
+ struct vm_area_struct **dst_vmap,
+ struct vm_area_struct **src_vmap)
+{
+ int err;
+
+ mmap_read_lock(mm);
+ err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
+ if (err)
+ mmap_read_unlock(mm);
+ return err;
+}
+
+static void uffd_move_unlock(struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma)
+{
+ mmap_assert_locked(src_vma->vm_mm);
+ mmap_read_unlock(dst_vma->vm_mm);
}
+#endif
/**
* move_pages - move arbitrary anonymous pages of an existing vma
* @ctx: pointer to the userfaultfd context
- * @mm: the address space to move pages
* @dst_start: start of the destination virtual memory range
* @src_start: start of the source virtual memory range
* @len: length of the virtual memory range
* @mode: flags from uffdio_move.mode
*
- * Must be called with mmap_lock held for read.
+ * It will either use the mmap_lock in read mode or per-vma locks
*
* move_pages() remaps arbitrary anonymous pages atomically in zero
* copy. It only works on non shared anonymous pages because those can
@@ -1347,10 +1595,10 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
* could be obtained. This is the only additional complexity added to
* the rmap code to provide this anonymous page remapping functionality.
*/
-ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
- unsigned long dst_start, unsigned long src_start,
- unsigned long len, __u64 mode)
+ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
+ unsigned long src_start, unsigned long len, __u64 mode)
{
+ struct mm_struct *mm = ctx->mm;
struct vm_area_struct *src_vma, *dst_vma;
unsigned long src_addr, dst_addr;
pmd_t *src_pmd, *dst_pmd;
@@ -1368,28 +1616,34 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
WARN_ON_ONCE(dst_start + len <= dst_start))
goto out;
+ err = uffd_move_lock(mm, dst_start, src_start, &dst_vma, &src_vma);
+ if (err)
+ goto out;
+
+ /* Re-check after taking map_changing_lock */
+ err = -EAGAIN;
+ down_read(&ctx->map_changing_lock);
+ if (likely(atomic_read(&ctx->mmap_changing)))
+ goto out_unlock;
/*
* Make sure the vma is not shared, that the src and dst remap
* ranges are both valid and fully within a single existing
* vma.
*/
- src_vma = find_vma(mm, src_start);
- if (!src_vma || (src_vma->vm_flags & VM_SHARED))
- goto out;
- if (src_start < src_vma->vm_start ||
- src_start + len > src_vma->vm_end)
- goto out;
+ err = -EINVAL;
+ if (src_vma->vm_flags & VM_SHARED)
+ goto out_unlock;
+ if (src_start + len > src_vma->vm_end)
+ goto out_unlock;
- dst_vma = find_vma(mm, dst_start);
- if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
- goto out;
- if (dst_start < dst_vma->vm_start ||
- dst_start + len > dst_vma->vm_end)
- goto out;
+ if (dst_vma->vm_flags & VM_SHARED)
+ goto out_unlock;
+ if (dst_start + len > dst_vma->vm_end)
+ goto out_unlock;
err = validate_move_areas(ctx, src_vma, dst_vma);
if (err)
- goto out;
+ goto out_unlock;
for (src_addr = src_start, dst_addr = dst_start;
src_addr < src_start + len;) {
@@ -1439,19 +1693,14 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
err = -ENOENT;
break;
}
- /* Avoid moving zeropages for now */
- if (is_huge_zero_pmd(*src_pmd)) {
- spin_unlock(ptl);
- err = -EBUSY;
- break;
- }
/* Check if we can move the pmd without splitting it. */
if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) ||
!pmd_none(dst_pmdval)) {
struct folio *folio = pfn_folio(pmd_pfn(*src_pmd));
- if (!folio || !PageAnonExclusive(&folio->page)) {
+ if (!folio || (!is_huge_zero_page(&folio->page) &&
+ !PageAnonExclusive(&folio->page))) {
spin_unlock(ptl);
err = -EBUSY;
break;
@@ -1511,6 +1760,9 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
moved += step_size;
}
+out_unlock:
+ up_read(&ctx->map_changing_lock);
+ uffd_move_unlock(dst_vma, src_vma);
out:
VM_WARN_ON(moved < 0);
VM_WARN_ON(err > 0);
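The locking strategy added here, an opportunistic per-VMA lock with a fall-back that retakes everything under mmap_lock, can be sketched outside the kernel as a generic trylock-then-fallback pattern. The analogue below is hypothetical: pthread_rwlock_t stands in for the per-VMA lock, addr_space for mmap_lock, regions are assumed to be initialised elsewhere, and writers are assumed to take addr_space for write before write-locking any region, mirroring vma_start_write() being called under mmap_lock.

	#include <pthread.h>

	static pthread_rwlock_t addr_space = PTHREAD_RWLOCK_INITIALIZER;	/* mmap_lock analogue */

	struct region {								/* vma analogue */
		pthread_rwlock_t lock;
	};

	/* Lock dst then src for read without risking a lock-order inversion. */
	static void lock_pair(struct region *dst, struct region *src)
	{
		pthread_rwlock_rdlock(&dst->lock);
		if (src == dst)
			return;

		/* Fast path: opportunistic, like lock_vma_under_rcu() on src. */
		if (pthread_rwlock_tryrdlock(&src->lock) == 0)
			return;

		/* Slow path: drop everything, then retake both under the global lock,
		 * which keeps writers out while the two read locks are re-acquired. */
		pthread_rwlock_unlock(&dst->lock);
		pthread_rwlock_rdlock(&addr_space);
		pthread_rwlock_rdlock(&dst->lock);
		pthread_rwlock_rdlock(&src->lock);
		pthread_rwlock_unlock(&addr_space);
	}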
diff --git a/mm/util.c b/mm/util.c
index 5a6a980258..6693972357 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -136,6 +136,23 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp)
EXPORT_SYMBOL(kmemdup);
/**
+ * kmemdup_array - duplicate a given array.
+ *
+ * @src: array to duplicate.
+ * @element_size: size of each element of array.
+ * @count: number of elements to duplicate from array.
+ * @gfp: GFP mask to use.
+ *
+ * Return: duplicated array of @src or %NULL in case of error,
+ * result is physically contiguous. Use kfree() to free.
+ */
+void *kmemdup_array(const void *src, size_t element_size, size_t count, gfp_t gfp)
+{
+ return kmemdup(src, size_mul(element_size, count), gfp);
+}
+EXPORT_SYMBOL(kmemdup_array);
+
+/**
* kvmemdup - duplicate region of memory
*
* @src: memory region to duplicate
@@ -942,6 +959,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed);
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
long allowed;
+ unsigned long bytes_failed;
vm_acct_memory(pages);
@@ -976,8 +994,9 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
if (percpu_counter_read_positive(&vm_committed_as) < allowed)
return 0;
error:
- pr_warn_ratelimited("%s: pid: %d, comm: %s, not enough memory for the allocation\n",
- __func__, current->pid, current->comm);
+ bytes_failed = pages << PAGE_SHIFT;
+ pr_warn_ratelimited("%s: pid: %d, comm: %s, bytes: %lu not enough memory for the allocation\n",
+ __func__, current->pid, current->comm, bytes_failed);
vm_unacct_memory(pages);
return -ENOMEM;
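A possible caller of the new kmemdup_array() helper might look like the fragment below; the struct and function names are invented for illustration. The benefit over open-coding kmemdup(src, n * sizeof(*src), gfp) is that size_mul() saturates on overflow instead of silently wrapping.

	#include <linux/slab.h>
	#include <linux/string.h>

	struct sample {			/* hypothetical element type */
		u32 id;
		u32 flags;
	};

	/* Duplicate a caller-supplied table; returns NULL on allocation failure. */
	static struct sample *copy_samples(const struct sample *src, size_t count)
	{
		return kmemdup_array(src, sizeof(*src), count, GFP_KERNEL);
	}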
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d12a17fc0c..109272b8ee 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -304,8 +304,8 @@ static int vmap_range_noflush(unsigned long addr, unsigned long end,
return err;
}
-int ioremap_page_range(unsigned long addr, unsigned long end,
- phys_addr_t phys_addr, pgprot_t prot)
+int vmap_page_range(unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot)
{
int err;
@@ -318,6 +318,26 @@ int ioremap_page_range(unsigned long addr, unsigned long end,
return err;
}
+int ioremap_page_range(unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot)
+{
+ struct vm_struct *area;
+
+ area = find_vm_area((void *)addr);
+ if (!area || !(area->flags & VM_IOREMAP)) {
+ WARN_ONCE(1, "vm_area at addr %lx is not marked as VM_IOREMAP\n", addr);
+ return -EINVAL;
+ }
+ if (addr != (unsigned long)area->addr ||
+ (void *)end != area->addr + get_vm_area_size(area)) {
+ WARN_ONCE(1, "ioremap request [%lx,%lx) doesn't match vm_area [%lx, %lx)\n",
+ addr, end, (long)area->addr,
+ (long)area->addr + get_vm_area_size(area));
+ return -ERANGE;
+ }
+ return vmap_page_range(addr, end, phys_addr, prot);
+}
+
static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pgtbl_mod_mask *mask)
{
@@ -635,6 +655,58 @@ static int vmap_pages_range(unsigned long addr, unsigned long end,
return err;
}
+static int check_sparse_vm_area(struct vm_struct *area, unsigned long start,
+ unsigned long end)
+{
+ might_sleep();
+ if (WARN_ON_ONCE(area->flags & VM_FLUSH_RESET_PERMS))
+ return -EINVAL;
+ if (WARN_ON_ONCE(area->flags & VM_NO_GUARD))
+ return -EINVAL;
+ if (WARN_ON_ONCE(!(area->flags & VM_SPARSE)))
+ return -EINVAL;
+ if ((end - start) >> PAGE_SHIFT > totalram_pages())
+ return -E2BIG;
+ if (start < (unsigned long)area->addr ||
+ (void *)end > area->addr + get_vm_area_size(area))
+ return -ERANGE;
+ return 0;
+}
+
+/**
+ * vm_area_map_pages - map pages inside given sparse vm_area
+ * @area: vm_area
+ * @start: start address inside vm_area
+ * @end: end address inside vm_area
+ * @pages: pages to map (always PAGE_SIZE pages)
+ */
+int vm_area_map_pages(struct vm_struct *area, unsigned long start,
+ unsigned long end, struct page **pages)
+{
+ int err;
+
+ err = check_sparse_vm_area(area, start, end);
+ if (err)
+ return err;
+
+ return vmap_pages_range(start, end, PAGE_KERNEL, pages, PAGE_SHIFT);
+}
+
+/**
+ * vm_area_unmap_pages - unmap pages inside given sparse vm_area
+ * @area: vm_area
+ * @start: start address inside vm_area
+ * @end: end address inside vm_area
+ */
+void vm_area_unmap_pages(struct vm_struct *area, unsigned long start,
+ unsigned long end)
+{
+ if (check_sparse_vm_area(area, start, end))
+ return;
+
+ vunmap_range(start, end);
+}
+
int is_vmalloc_or_module_addr(const void *x)
{
/*
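The intended flow of the new sparse-area API is roughly the sketch below (a hypothetical caller with error handling trimmed; it assumes the area was reserved with get_vm_area() and the VM_SPARSE flag, which is what check_sparse_vm_area() requires):

	#include <linux/vmalloc.h>
	#include <linux/mm.h>
	#include <linux/sizes.h>

	static int sparse_area_demo(void)
	{
		struct vm_struct *area;
		struct page *page;
		unsigned long va;

		/* Reserve address space only; nothing is mapped yet. */
		area = get_vm_area(SZ_4M, VM_SPARSE);
		if (!area)
			return -ENOMEM;

		page = alloc_page(GFP_KERNEL);
		if (!page) {
			free_vm_area(area);
			return -ENOMEM;
		}

		/* Back one page somewhere inside the area on demand. */
		va = (unsigned long)area->addr;
		if (!vm_area_map_pages(area, va, va + PAGE_SIZE, &page)) {
			/* ... use the mapping at va ... */
			vm_area_unmap_pages(area, va, va + PAGE_SIZE);
		}

		__free_page(page);
		free_vm_area(area);
		return 0;
	}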
@@ -728,17 +800,9 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
-static DEFINE_SPINLOCK(vmap_area_lock);
static DEFINE_SPINLOCK(free_vmap_area_lock);
-/* Export for kexec only */
-LIST_HEAD(vmap_area_list);
-static struct rb_root vmap_area_root = RB_ROOT;
static bool vmap_initialized __read_mostly;
-static struct rb_root purge_vmap_area_root = RB_ROOT;
-static LIST_HEAD(purge_vmap_area_list);
-static DEFINE_SPINLOCK(purge_vmap_area_lock);
-
/*
* This kmem_cache is used for vmap_area objects. Instead of
* allocating from slab we reuse an object from this cache to
@@ -772,6 +836,129 @@ static struct rb_root free_vmap_area_root = RB_ROOT;
*/
static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
+/*
+ * This structure defines a single, unified model in which a list and
+ * an rb-tree form one entity protected by the same lock. Nodes are
+ * sorted in ascending order, so the list provides O(1) access to the
+ * left/right neighbors as well as sequential traversal.
+ */
+struct rb_list {
+ struct rb_root root;
+ struct list_head head;
+ spinlock_t lock;
+};
+
+/*
+ * A fast size storage keeps VAs of up to 1M in size. A pool consists
+ * of ready-to-go VAs of a certain size linked together. Pool index i
+ * holds VAs of (i + 1) pages, e.g. a 3-page VA sits in pool[2].
+ */
+#define MAX_VA_SIZE_PAGES 256
+
+struct vmap_pool {
+ struct list_head head;
+ unsigned long len;
+};
+
+/*
+ * A per-node allocation layer. Users allocate from nodes instead
+ * of a single global heap, which balances access and mitigates
+ * lock contention.
+ */
+static struct vmap_node {
+ /* Simple size segregated storage. */
+ struct vmap_pool pool[MAX_VA_SIZE_PAGES];
+ spinlock_t pool_lock;
+ bool skip_populate;
+
+ /* Bookkeeping data of this node. */
+ struct rb_list busy;
+ struct rb_list lazy;
+
+ /*
+ * Ready-to-free areas.
+ */
+ struct list_head purge_list;
+ struct work_struct purge_work;
+ unsigned long nr_purged;
+} single;
+
+/*
+ * The initial setup consists of one single node, i.e. balancing
+ * is fully disabled. Later on, once vmap is initialized, these
+ * parameters are updated based on the system's capacity.
+ */
+static struct vmap_node *vmap_nodes = &single;
+static __read_mostly unsigned int nr_vmap_nodes = 1;
+static __read_mostly unsigned int vmap_zone_size = 1;
+
+static inline unsigned int
+addr_to_node_id(unsigned long addr)
+{
+ return (addr / vmap_zone_size) % nr_vmap_nodes;
+}
+
+static inline struct vmap_node *
+addr_to_node(unsigned long addr)
+{
+ return &vmap_nodes[addr_to_node_id(addr)];
+}
+
+static inline struct vmap_node *
+id_to_node(unsigned int id)
+{
+ return &vmap_nodes[id % nr_vmap_nodes];
+}
+
+/*
+ * The value 0 represents "no node", therefore an encoded value
+ * is the node-id incremented by 1, i.e. it is always greater
+ * than 0. A node_id in [0:nr_vmap_nodes - 1] can be encoded;
+ * if the passed node_id is not valid, 0 is returned.
+ */
+static unsigned int
+encode_vn_id(unsigned int node_id)
+{
+ /* Can store U8_MAX [0:254] nodes. */
+ if (node_id < nr_vmap_nodes)
+ return (node_id + 1) << BITS_PER_BYTE;
+
+ /* Warn and encode "no node". */
+ WARN_ONCE(1, "Encode wrong node id (%u)\n", node_id);
+ return 0;
+}
+
+/*
+ * Returns the decoded node-id if it lies within the valid range
+ * [0:nr_vmap_nodes - 1]. Otherwise nr_vmap_nodes is returned,
+ * indicating that the extracted value is invalid.
+ */
+static unsigned int
+decode_vn_id(unsigned int val)
+{
+ unsigned int node_id = (val >> BITS_PER_BYTE) - 1;
+
+ /* Can store U8_MAX [0:254] nodes. */
+ if (node_id < nr_vmap_nodes)
+ return node_id;
+
+ /* If it was _not_ zero, warn. */
+ WARN_ONCE(node_id != UINT_MAX,
+ "Decode wrong node id (%d)\n", node_id);
+
+ return nr_vmap_nodes;
+}
+
+static bool
+is_vn_id_valid(unsigned int node_id)
+{
+ if (node_id < nr_vmap_nodes)
+ return true;
+
+ return false;
+}
+
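
To get a feel for the arithmetic above, the node hashing and the vn_id encoding can be mirrored in a small standalone program (userspace C, illustration only; the node count and the 16-page zone size are assumptions matching the later setup in vmap_init_nodes()):

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define BITS_PER_BYTE	8

static unsigned int nr_vmap_nodes = 8;			/* assumed CPU count */
static unsigned long vmap_zone_size = 16 * PAGE_SIZE;	/* 16-page zones */

static unsigned int addr_to_node_id(unsigned long addr)
{
	return (addr / vmap_zone_size) % nr_vmap_nodes;
}

static unsigned int encode_vn_id(unsigned int node_id)
{
	return (node_id + 1) << BITS_PER_BYTE;	/* 0 is reserved for "no node" */
}

static unsigned int decode_vn_id(unsigned int val)
{
	return (val >> BITS_PER_BYTE) - 1;	/* 0 wraps to UINT_MAX, i.e. invalid */
}

int main(void)
{
	unsigned long addr = 0xffffc90000123000UL;	/* arbitrary example address */
	unsigned int id = addr_to_node_id(addr);

	printf("addr %#lx -> node %u\n", addr, id);
	printf("node %u encodes to %#x and decodes back to %u\n",
	       id, encode_vn_id(id), decode_vn_id(encode_vn_id(id)));
	return 0;
}
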
static __always_inline unsigned long
va_size(struct vmap_area *va)
{
@@ -802,11 +989,33 @@ unsigned long vmalloc_nr_pages(void)
return atomic_long_read(&nr_vmalloc_pages);
}
+static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root)
+{
+ struct rb_node *n = root->rb_node;
+
+ addr = (unsigned long)kasan_reset_tag((void *)addr);
+
+ while (n) {
+ struct vmap_area *va;
+
+ va = rb_entry(n, struct vmap_area, rb_node);
+ if (addr < va->va_start)
+ n = n->rb_left;
+ else if (addr >= va->va_end)
+ n = n->rb_right;
+ else
+ return va;
+ }
+
+ return NULL;
+}
+
/* Look up the first VA which satisfies addr < va_end, NULL if none. */
-static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
+static struct vmap_area *
+__find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root)
{
struct vmap_area *va = NULL;
- struct rb_node *n = vmap_area_root.rb_node;
+ struct rb_node *n = root->rb_node;
addr = (unsigned long)kasan_reset_tag((void *)addr);
@@ -827,22 +1036,49 @@ static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
return va;
}
-static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root)
+/*
+ * Returns the node where the first VA satisfying addr < va_end resides.
+ * On success the node is locked; the caller is responsible for unlocking
+ * it once the VA no longer needs to be accessed.
+ *
+ * Returns NULL if nothing is found.
+ */
+static struct vmap_node *
+find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va)
{
- struct rb_node *n = root->rb_node;
+ unsigned long va_start_lowest;
+ struct vmap_node *vn;
+ int i;
- addr = (unsigned long)kasan_reset_tag((void *)addr);
+repeat:
+ for (i = 0, va_start_lowest = 0; i < nr_vmap_nodes; i++) {
+ vn = &vmap_nodes[i];
- while (n) {
- struct vmap_area *va;
+ spin_lock(&vn->busy.lock);
+ *va = __find_vmap_area_exceed_addr(addr, &vn->busy.root);
- va = rb_entry(n, struct vmap_area, rb_node);
- if (addr < va->va_start)
- n = n->rb_left;
- else if (addr >= va->va_end)
- n = n->rb_right;
- else
- return va;
+ if (*va)
+ if (!va_start_lowest || (*va)->va_start < va_start_lowest)
+ va_start_lowest = (*va)->va_start;
+ spin_unlock(&vn->busy.lock);
+ }
+
+ /*
+ * Check that the found VA still exists; it might have gone away.
+ * If it was removed concurrently, repeat the search and proceed
+ * to the next one, which is a rare case.
+ */
+ if (va_start_lowest) {
+ vn = addr_to_node(va_start_lowest);
+
+ spin_lock(&vn->busy.lock);
+ *va = __find_vmap_area(va_start_lowest, &vn->busy.root);
+
+ if (*va)
+ return vn;
+
+ spin_unlock(&vn->busy.lock);
+ goto repeat;
}
return NULL;
@@ -1382,9 +1618,9 @@ classify_va_fit_type(struct vmap_area *va,
}
static __always_inline int
-adjust_va_to_fit_type(struct rb_root *root, struct list_head *head,
- struct vmap_area *va, unsigned long nva_start_addr,
- unsigned long size)
+va_clip(struct rb_root *root, struct list_head *head,
+ struct vmap_area *va, unsigned long nva_start_addr,
+ unsigned long size)
{
struct vmap_area *lva = NULL;
enum fit_type type = classify_va_fit_type(va, nva_start_addr, size);
@@ -1481,6 +1717,32 @@ adjust_va_to_fit_type(struct rb_root *root, struct list_head *head,
return 0;
}
+static unsigned long
+va_alloc(struct vmap_area *va,
+ struct rb_root *root, struct list_head *head,
+ unsigned long size, unsigned long align,
+ unsigned long vstart, unsigned long vend)
+{
+ unsigned long nva_start_addr;
+ int ret;
+
+ if (va->va_start > vstart)
+ nva_start_addr = ALIGN(va->va_start, align);
+ else
+ nva_start_addr = ALIGN(vstart, align);
+
+ /* Check the "vend" restriction. */
+ if (nva_start_addr + size > vend)
+ return vend;
+
+ /* Update the free vmap_area. */
+ ret = va_clip(root, head, va, nva_start_addr, size);
+ if (WARN_ON_ONCE(ret))
+ return vend;
+
+ return nva_start_addr;
+}
+
/*
* Returns a start address of the newly allocated area, if success.
* Otherwise a vend is returned that indicates failure.
@@ -1493,7 +1755,6 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head,
bool adjust_search_size = true;
unsigned long nva_start_addr;
struct vmap_area *va;
- int ret;
/*
* Do not adjust when:
@@ -1511,18 +1772,8 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head,
if (unlikely(!va))
return vend;
- if (va->va_start > vstart)
- nva_start_addr = ALIGN(va->va_start, align);
- else
- nva_start_addr = ALIGN(vstart, align);
-
- /* Check the "vend" restriction. */
- if (nva_start_addr + size > vend)
- return vend;
-
- /* Update the free vmap_area. */
- ret = adjust_va_to_fit_type(root, head, va, nva_start_addr, size);
- if (WARN_ON_ONCE(ret))
+ nva_start_addr = va_alloc(va, root, head, size, align, vstart, vend);
+ if (nva_start_addr == vend)
return vend;
#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
@@ -1537,12 +1788,14 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head,
*/
static void free_vmap_area(struct vmap_area *va)
{
+ struct vmap_node *vn = addr_to_node(va->va_start);
+
/*
* Remove from the busy tree/list.
*/
- spin_lock(&vmap_area_lock);
- unlink_va(va, &vmap_area_root);
- spin_unlock(&vmap_area_lock);
+ spin_lock(&vn->busy.lock);
+ unlink_va(va, &vn->busy.root);
+ spin_unlock(&vn->busy.lock);
/*
* Insert/Merge it back to the free tree/list.
@@ -1575,6 +1828,104 @@ preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
kmem_cache_free(vmap_area_cachep, va);
}
+static struct vmap_pool *
+size_to_va_pool(struct vmap_node *vn, unsigned long size)
+{
+ unsigned int idx = (size - 1) / PAGE_SIZE;
+
+ if (idx < MAX_VA_SIZE_PAGES)
+ return &vn->pool[idx];
+
+ return NULL;
+}
+
+static bool
+node_pool_add_va(struct vmap_node *n, struct vmap_area *va)
+{
+ struct vmap_pool *vp;
+
+ vp = size_to_va_pool(n, va_size(va));
+ if (!vp)
+ return false;
+
+ spin_lock(&n->pool_lock);
+ list_add(&va->list, &vp->head);
+ WRITE_ONCE(vp->len, vp->len + 1);
+ spin_unlock(&n->pool_lock);
+
+ return true;
+}
+
+static struct vmap_area *
+node_pool_del_va(struct vmap_node *vn, unsigned long size,
+ unsigned long align, unsigned long vstart,
+ unsigned long vend)
+{
+ struct vmap_area *va = NULL;
+ struct vmap_pool *vp;
+ int err = 0;
+
+ vp = size_to_va_pool(vn, size);
+ if (!vp || list_empty(&vp->head))
+ return NULL;
+
+ spin_lock(&vn->pool_lock);
+ if (!list_empty(&vp->head)) {
+ va = list_first_entry(&vp->head, struct vmap_area, list);
+
+ if (IS_ALIGNED(va->va_start, align)) {
+ /*
+ * Do some sanity checks and emit a warning
+ * if any of the checks below detects an error.
+ */
+ err |= (va_size(va) != size);
+ err |= (va->va_start < vstart);
+ err |= (va->va_end > vend);
+
+ if (!WARN_ON_ONCE(err)) {
+ list_del_init(&va->list);
+ WRITE_ONCE(vp->len, vp->len - 1);
+ } else {
+ va = NULL;
+ }
+ } else {
+ list_move_tail(&va->list, &vp->head);
+ va = NULL;
+ }
+ }
+ spin_unlock(&vn->pool_lock);
+
+ return va;
+}
+
+static struct vmap_area *
+node_alloc(unsigned long size, unsigned long align,
+ unsigned long vstart, unsigned long vend,
+ unsigned long *addr, unsigned int *vn_id)
+{
+ struct vmap_area *va;
+
+ *vn_id = 0;
+ *addr = vend;
+
+ /*
+ * Fall back to the global heap if this is not a vmalloc
+ * allocation or there is only one node.
+ */
+ if (vstart != VMALLOC_START || vend != VMALLOC_END ||
+ nr_vmap_nodes == 1)
+ return NULL;
+
+ *vn_id = raw_smp_processor_id() % nr_vmap_nodes;
+ va = node_pool_del_va(id_to_node(*vn_id), size, align, vstart, vend);
+ *vn_id = encode_vn_id(*vn_id);
+
+ if (va)
+ *addr = va->va_start;
+
+ return va;
+}
+
/*
* Allocate a region of KVA of the specified size and alignment, within the
* vstart and vend.
@@ -1585,9 +1936,11 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
int node, gfp_t gfp_mask,
unsigned long va_flags)
{
+ struct vmap_node *vn;
struct vmap_area *va;
unsigned long freed;
unsigned long addr;
+ unsigned int vn_id;
int purged = 0;
int ret;
@@ -1598,23 +1951,37 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
return ERR_PTR(-EBUSY);
might_sleep();
- gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
-
- va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
- if (unlikely(!va))
- return ERR_PTR(-ENOMEM);
/*
- * Only scan the relevant parts containing pointers to other objects
- * to avoid false negatives.
+ * If the node-pool lookup fails here and the VA is obtained from
+ * the global heap instead, it is still marked with this "vn_id",
+ * so it is returned to this node's pool later. This allows the
+ * pools to be populated based on user demand.
+ *
+ * On success a ready-to-go VA is returned.
*/
- kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
+ va = node_alloc(size, align, vstart, vend, &addr, &vn_id);
+ if (!va) {
+ gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
+
+ va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
+ if (unlikely(!va))
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Only scan the relevant parts containing pointers to other objects
+ * to avoid false negatives.
+ */
+ kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
+ }
retry:
- preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
- addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list,
- size, align, vstart, vend);
- spin_unlock(&free_vmap_area_lock);
+ if (addr == vend) {
+ preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
+ addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list,
+ size, align, vstart, vend);
+ spin_unlock(&free_vmap_area_lock);
+ }
trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend);
@@ -1628,11 +1995,13 @@ retry:
va->va_start = addr;
va->va_end = addr + size;
va->vm = NULL;
- va->flags = va_flags;
+ va->flags = (va_flags | vn_id);
- spin_lock(&vmap_area_lock);
- insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
- spin_unlock(&vmap_area_lock);
+ vn = addr_to_node(va->va_start);
+
+ spin_lock(&vn->busy.lock);
+ insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
+ spin_unlock(&vn->busy.lock);
BUG_ON(!IS_ALIGNED(va->va_start, align));
BUG_ON(va->va_start < vstart);
@@ -1717,70 +2086,199 @@ static DEFINE_MUTEX(vmap_purge_lock);
/* for per-CPU blocks */
static void purge_fragmented_blocks_allcpus(void);
+static cpumask_t purge_nodes;
-/*
- * Purges all lazily-freed vmap areas.
- */
-static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
+static void
+reclaim_list_global(struct list_head *head)
{
- unsigned long resched_threshold;
- unsigned int num_purged_areas = 0;
- struct list_head local_purge_list;
- struct vmap_area *va, *n_va;
+ struct vmap_area *va, *n;
- lockdep_assert_held(&vmap_purge_lock);
+ if (list_empty(head))
+ return;
- spin_lock(&purge_vmap_area_lock);
- purge_vmap_area_root = RB_ROOT;
- list_replace_init(&purge_vmap_area_list, &local_purge_list);
- spin_unlock(&purge_vmap_area_lock);
+ spin_lock(&free_vmap_area_lock);
+ list_for_each_entry_safe(va, n, head, list)
+ merge_or_add_vmap_area_augment(va,
+ &free_vmap_area_root, &free_vmap_area_list);
+ spin_unlock(&free_vmap_area_lock);
+}
- if (unlikely(list_empty(&local_purge_list)))
- goto out;
+static void
+decay_va_pool_node(struct vmap_node *vn, bool full_decay)
+{
+ struct vmap_area *va, *nva;
+ struct list_head decay_list;
+ struct rb_root decay_root;
+ unsigned long n_decay;
+ int i;
- start = min(start,
- list_first_entry(&local_purge_list,
- struct vmap_area, list)->va_start);
+ decay_root = RB_ROOT;
+ INIT_LIST_HEAD(&decay_list);
- end = max(end,
- list_last_entry(&local_purge_list,
- struct vmap_area, list)->va_end);
+ for (i = 0; i < MAX_VA_SIZE_PAGES; i++) {
+ struct list_head tmp_list;
- flush_tlb_kernel_range(start, end);
- resched_threshold = lazy_max_pages() << 1;
+ if (list_empty(&vn->pool[i].head))
+ continue;
- spin_lock(&free_vmap_area_lock);
- list_for_each_entry_safe(va, n_va, &local_purge_list, list) {
- unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
- unsigned long orig_start = va->va_start;
- unsigned long orig_end = va->va_end;
+ INIT_LIST_HEAD(&tmp_list);
+
+ /* Detach the pool, so no-one can access it. */
+ spin_lock(&vn->pool_lock);
+ list_replace_init(&vn->pool[i].head, &tmp_list);
+ spin_unlock(&vn->pool_lock);
+
+ if (full_decay)
+ WRITE_ONCE(vn->pool[i].len, 0);
+
+ /* Decay the pool by ~25% of the remaining objects. */
+ n_decay = vn->pool[i].len >> 2;
+
+ list_for_each_entry_safe(va, nva, &tmp_list, list) {
+ list_del_init(&va->list);
+ merge_or_add_vmap_area(va, &decay_root, &decay_list);
+
+ if (!full_decay) {
+ WRITE_ONCE(vn->pool[i].len, vn->pool[i].len - 1);
+
+ if (!--n_decay)
+ break;
+ }
+ }
/*
- * Finally insert or merge lazily-freed area. It is
- * detached and there is no need to "unlink" it from
- * anything.
+ * Attach the pool back if it has been partly decayed.
+ * Note that no other context is supposed to populate the
+ * pool concurrently, therefore a simple list replace
+ * operation is sufficient here.
*/
- va = merge_or_add_vmap_area_augment(va, &free_vmap_area_root,
- &free_vmap_area_list);
+ if (!full_decay && !list_empty(&tmp_list)) {
+ spin_lock(&vn->pool_lock);
+ list_replace_init(&tmp_list, &vn->pool[i].head);
+ spin_unlock(&vn->pool_lock);
+ }
+ }
- if (!va)
- continue;
+ reclaim_list_global(&decay_list);
+}
+
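
A worked example with assumed numbers: for a pool holding 9 VAs and full_decay == false, n_decay = 9 >> 2 = 2, so the first two VAs on the pool list are merged back into the global free tree and the remaining seven are reattached to the pool; with full_decay == true the whole pool is drained and its length is reset to zero.
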
+static void purge_vmap_node(struct work_struct *work)
+{
+ struct vmap_node *vn = container_of(work,
+ struct vmap_node, purge_work);
+ struct vmap_area *va, *n_va;
+ LIST_HEAD(local_list);
+
+ vn->nr_purged = 0;
+
+ list_for_each_entry_safe(va, n_va, &vn->purge_list, list) {
+ unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
+ unsigned long orig_start = va->va_start;
+ unsigned long orig_end = va->va_end;
+ unsigned int vn_id = decode_vn_id(va->flags);
+
+ list_del_init(&va->list);
if (is_vmalloc_or_module_addr((void *)orig_start))
kasan_release_vmalloc(orig_start, orig_end,
va->va_start, va->va_end);
atomic_long_sub(nr, &vmap_lazy_nr);
- num_purged_areas++;
+ vn->nr_purged++;
+
+ if (is_vn_id_valid(vn_id) && !vn->skip_populate)
+ if (node_pool_add_va(vn, va))
+ continue;
- if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
- cond_resched_lock(&free_vmap_area_lock);
+ /* Go back to global. */
+ list_add(&va->list, &local_list);
}
- spin_unlock(&free_vmap_area_lock);
-out:
- trace_purge_vmap_area_lazy(start, end, num_purged_areas);
- return num_purged_areas > 0;
+ reclaim_list_global(&local_list);
+}
+
+/*
+ * Purges all lazily-freed vmap areas.
+ */
+static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
+ bool full_pool_decay)
+{
+ unsigned long nr_purged_areas = 0;
+ unsigned int nr_purge_helpers;
+ unsigned int nr_purge_nodes;
+ struct vmap_node *vn;
+ int i;
+
+ lockdep_assert_held(&vmap_purge_lock);
+
+ /*
+ * Use a cpumask to mark which nodes have to be processed.
+ */
+ purge_nodes = CPU_MASK_NONE;
+
+ for (i = 0; i < nr_vmap_nodes; i++) {
+ vn = &vmap_nodes[i];
+
+ INIT_LIST_HEAD(&vn->purge_list);
+ vn->skip_populate = full_pool_decay;
+ decay_va_pool_node(vn, full_pool_decay);
+
+ if (RB_EMPTY_ROOT(&vn->lazy.root))
+ continue;
+
+ spin_lock(&vn->lazy.lock);
+ WRITE_ONCE(vn->lazy.root.rb_node, NULL);
+ list_replace_init(&vn->lazy.head, &vn->purge_list);
+ spin_unlock(&vn->lazy.lock);
+
+ start = min(start, list_first_entry(&vn->purge_list,
+ struct vmap_area, list)->va_start);
+
+ end = max(end, list_last_entry(&vn->purge_list,
+ struct vmap_area, list)->va_end);
+
+ cpumask_set_cpu(i, &purge_nodes);
+ }
+
+ nr_purge_nodes = cpumask_weight(&purge_nodes);
+ if (nr_purge_nodes > 0) {
+ flush_tlb_kernel_range(start, end);
+
+ /* One extra helper per full lazy_max_pages() set, minus one. */
+ nr_purge_helpers = atomic_long_read(&vmap_lazy_nr) / lazy_max_pages();
+ nr_purge_helpers = clamp(nr_purge_helpers, 1U, nr_purge_nodes) - 1;
+
+ for_each_cpu(i, &purge_nodes) {
+ vn = &vmap_nodes[i];
+
+ if (nr_purge_helpers > 0) {
+ INIT_WORK(&vn->purge_work, purge_vmap_node);
+
+ if (cpumask_test_cpu(i, cpu_online_mask))
+ schedule_work_on(i, &vn->purge_work);
+ else
+ schedule_work(&vn->purge_work);
+
+ nr_purge_helpers--;
+ } else {
+ vn->purge_work.func = NULL;
+ purge_vmap_node(&vn->purge_work);
+ nr_purged_areas += vn->nr_purged;
+ }
+ }
+
+ for_each_cpu(i, &purge_nodes) {
+ vn = &vmap_nodes[i];
+
+ if (vn->purge_work.func) {
+ flush_work(&vn->purge_work);
+ nr_purged_areas += vn->nr_purged;
+ }
+ }
+ }
+
+ trace_purge_vmap_area_lazy(start, end, nr_purged_areas);
+ return nr_purged_areas > 0;
}
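
To make the helper arithmetic concrete (assumed numbers): if vmap_lazy_nr amounts to three full lazy_max_pages() sets and eight nodes are marked in purge_nodes, then nr_purge_helpers = clamp(3, 1, 8) - 1 = 2, so the first two marked nodes are handed to workqueue workers while the caller purges the remaining six inline and afterwards waits for the two workers via flush_work().
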
/*
@@ -1791,22 +2289,15 @@ static void reclaim_and_purge_vmap_areas(void)
{
mutex_lock(&vmap_purge_lock);
purge_fragmented_blocks_allcpus();
- __purge_vmap_area_lazy(ULONG_MAX, 0);
+ __purge_vmap_area_lazy(ULONG_MAX, 0, true);
mutex_unlock(&vmap_purge_lock);
}
static void drain_vmap_area_work(struct work_struct *work)
{
- unsigned long nr_lazy;
-
- do {
- mutex_lock(&vmap_purge_lock);
- __purge_vmap_area_lazy(ULONG_MAX, 0);
- mutex_unlock(&vmap_purge_lock);
-
- /* Recheck if further work is required. */
- nr_lazy = atomic_long_read(&vmap_lazy_nr);
- } while (nr_lazy > lazy_max_pages());
+ mutex_lock(&vmap_purge_lock);
+ __purge_vmap_area_lazy(ULONG_MAX, 0, false);
+ mutex_unlock(&vmap_purge_lock);
}
/*
@@ -1818,6 +2309,8 @@ static void free_vmap_area_noflush(struct vmap_area *va)
{
unsigned long nr_lazy_max = lazy_max_pages();
unsigned long va_start = va->va_start;
+ unsigned int vn_id = decode_vn_id(va->flags);
+ struct vmap_node *vn;
unsigned long nr_lazy;
if (WARN_ON_ONCE(!list_empty(&va->list)))
@@ -1827,12 +2320,15 @@ static void free_vmap_area_noflush(struct vmap_area *va)
PAGE_SHIFT, &vmap_lazy_nr);
/*
- * Merge or place it to the purge tree/list.
+ * If it was requested by a certain node, return it to that
+ * node, i.e. its pool, for later reuse.
*/
- spin_lock(&purge_vmap_area_lock);
- merge_or_add_vmap_area(va,
- &purge_vmap_area_root, &purge_vmap_area_list);
- spin_unlock(&purge_vmap_area_lock);
+ vn = is_vn_id_valid(vn_id) ?
+ id_to_node(vn_id) : addr_to_node(va->va_start);
+
+ spin_lock(&vn->lazy.lock);
+ insert_vmap_area(va, &vn->lazy.root, &vn->lazy.head);
+ spin_unlock(&vn->lazy.lock);
trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max);
@@ -1856,26 +2352,65 @@ static void free_unmap_vmap_area(struct vmap_area *va)
struct vmap_area *find_vmap_area(unsigned long addr)
{
+ struct vmap_node *vn;
struct vmap_area *va;
+ int i, j;
- spin_lock(&vmap_area_lock);
- va = __find_vmap_area(addr, &vmap_area_root);
- spin_unlock(&vmap_area_lock);
+ if (unlikely(!vmap_initialized))
+ return NULL;
- return va;
+ /*
+ * addr_to_node_id(addr) converts an address into the index of the
+ * node where a VA is located. If a VA spans several zones and the
+ * passed addr is not equal to va->va_start, which is uncommon,
+ * extra nodes may need to be scanned. See an example:
+ *
+ *      <----va---->
+ * -|-----|-----|-----|-----|-
+ *     1     2     0     1
+ *
+ * The VA resides in node 1 but spans nodes 1, 2 and 0. If the
+ * passed addr falls within node 2 or 0, extra work is needed.
+ */
+ i = j = addr_to_node_id(addr);
+ do {
+ vn = &vmap_nodes[i];
+
+ spin_lock(&vn->busy.lock);
+ va = __find_vmap_area(addr, &vn->busy.root);
+ spin_unlock(&vn->busy.lock);
+
+ if (va)
+ return va;
+ } while ((i = (i + 1) % nr_vmap_nodes) != j);
+
+ return NULL;
}
static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
{
+ struct vmap_node *vn;
struct vmap_area *va;
+ int i, j;
- spin_lock(&vmap_area_lock);
- va = __find_vmap_area(addr, &vmap_area_root);
- if (va)
- unlink_va(va, &vmap_area_root);
- spin_unlock(&vmap_area_lock);
+ /*
+ * See the comment in find_vmap_area() about the node scan loop.
+ */
+ i = j = addr_to_node_id(addr);
+ do {
+ vn = &vmap_nodes[i];
- return va;
+ spin_lock(&vn->busy.lock);
+ va = __find_vmap_area(addr, &vn->busy.root);
+ if (va)
+ unlink_va(va, &vn->busy.root);
+ spin_unlock(&vn->busy.lock);
+
+ if (va)
+ return va;
+ } while ((i = (i + 1) % nr_vmap_nodes) != j);
+
+ return NULL;
}
/*** Per cpu kva allocator ***/
@@ -2077,6 +2612,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
static void free_vmap_block(struct vmap_block *vb)
{
+ struct vmap_node *vn;
struct vmap_block *tmp;
struct xarray *xa;
@@ -2084,9 +2620,10 @@ static void free_vmap_block(struct vmap_block *vb)
tmp = xa_erase(xa, addr_to_vb_idx(vb->va->va_start));
BUG_ON(tmp != vb);
- spin_lock(&vmap_area_lock);
- unlink_va(vb->va, &vmap_area_root);
- spin_unlock(&vmap_area_lock);
+ vn = addr_to_node(vb->va->va_start);
+ spin_lock(&vn->busy.lock);
+ unlink_va(vb->va, &vn->busy.root);
+ spin_unlock(&vn->busy.lock);
free_vmap_area_noflush(vb->va);
kfree_rcu(vb, rcu_head);
@@ -2173,7 +2710,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
* get_order(0) returns funny result. Just warn and terminate
* early.
*/
- return NULL;
+ return ERR_PTR(-EINVAL);
}
order = get_order(size);
@@ -2303,7 +2840,7 @@ static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
}
free_purged_blocks(&purge_list);
- if (!__purge_vmap_area_lazy(start, end) && flush)
+ if (!__purge_vmap_area_lazy(start, end, false) && flush)
flush_tlb_kernel_range(start, end);
mutex_unlock(&vmap_purge_lock);
}
@@ -2497,47 +3034,6 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t align)
kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
}
-static void vmap_init_free_space(void)
-{
- unsigned long vmap_start = 1;
- const unsigned long vmap_end = ULONG_MAX;
- struct vmap_area *busy, *free;
-
- /*
- * B F B B B F
- * -|-----|.....|-----|-----|-----|.....|-
- * | The KVA space |
- * |<--------------------------------->|
- */
- list_for_each_entry(busy, &vmap_area_list, list) {
- if (busy->va_start - vmap_start > 0) {
- free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
- if (!WARN_ON_ONCE(!free)) {
- free->va_start = vmap_start;
- free->va_end = busy->va_start;
-
- insert_vmap_area_augment(free, NULL,
- &free_vmap_area_root,
- &free_vmap_area_list);
- }
- }
-
- vmap_start = busy->va_end;
- }
-
- if (vmap_end - vmap_start > 0) {
- free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
- if (!WARN_ON_ONCE(!free)) {
- free->va_start = vmap_start;
- free->va_end = vmap_end;
-
- insert_vmap_area_augment(free, NULL,
- &free_vmap_area_root,
- &free_vmap_area_list);
- }
- }
-}
-
static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
struct vmap_area *va, unsigned long flags, const void *caller)
{
@@ -2551,9 +3047,11 @@ static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
unsigned long flags, const void *caller)
{
- spin_lock(&vmap_area_lock);
+ struct vmap_node *vn = addr_to_node(va->va_start);
+
+ spin_lock(&vn->busy.lock);
setup_vmalloc_vm_locked(vm, va, flags, caller);
- spin_unlock(&vmap_area_lock);
+ spin_unlock(&vn->busy.lock);
}
static void clear_vm_uninitialized_flag(struct vm_struct *vm)
@@ -2994,7 +3492,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
{
unsigned int nr_allocated = 0;
gfp_t alloc_gfp = gfp;
- bool nofail = false;
+ bool nofail = gfp & __GFP_NOFAIL;
struct page *page;
int i;
@@ -3051,12 +3549,11 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
* and compaction etc.
*/
alloc_gfp &= ~__GFP_NOFAIL;
- nofail = true;
}
/* High-order pages or fallback path if "bulk" fails. */
while (nr_allocated < nr_pages) {
- if (fatal_signal_pending(current))
+ if (!nofail && fatal_signal_pending(current))
break;
if (nid == NUMA_NO_NODE)
@@ -3741,10 +4238,12 @@ finished:
*/
long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
{
+ struct vmap_node *vn;
struct vmap_area *va;
struct vm_struct *vm;
char *vaddr;
size_t n, size, flags, remains;
+ unsigned long next;
addr = kasan_reset_tag(addr);
@@ -3754,16 +4253,15 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
remains = count;
- spin_lock(&vmap_area_lock);
- va = find_vmap_area_exceed_addr((unsigned long)addr);
- if (!va)
+ vn = find_vmap_area_exceed_addr_lock((unsigned long) addr, &va);
+ if (!vn)
goto finished_zero;
/* no intersects with alive vmap_area */
if ((unsigned long)addr + remains <= va->va_start)
goto finished_zero;
- list_for_each_entry_from(va, &vmap_area_list, list) {
+ do {
size_t copied;
if (remains == 0)
@@ -3778,10 +4276,10 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
WARN_ON(flags == VMAP_BLOCK);
if (!vm && !flags)
- continue;
+ goto next_va;
if (vm && (vm->flags & VM_UNINITIALIZED))
- continue;
+ goto next_va;
/* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
smp_rmb();
@@ -3790,7 +4288,7 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
size = vm ? get_vm_area_size(vm) : va_size(va);
if (addr >= vaddr + size)
- continue;
+ goto next_va;
if (addr < vaddr) {
size_t to_zero = min_t(size_t, vaddr - addr, remains);
@@ -3809,9 +4307,9 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
if (flags & VMAP_RAM)
copied = vmap_ram_vread_iter(iter, addr, n, flags);
- else if (!(vm && (vm->flags & VM_IOREMAP)))
+ else if (!(vm && (vm->flags & (VM_IOREMAP | VM_SPARSE))))
copied = aligned_vread_iter(iter, addr, n);
- else /* IOREMAP area is treated as memory hole */
+ else /* IOREMAP | SPARSE area is treated as memory hole */
copied = zero_iter(iter, n);
addr += copied;
@@ -3819,15 +4317,22 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
if (copied != n)
goto finished;
- }
+
+ next_va:
+ next = va->va_end;
+ spin_unlock(&vn->busy.lock);
+ } while ((vn = find_vmap_area_exceed_addr_lock(next, &va)));
finished_zero:
- spin_unlock(&vmap_area_lock);
+ if (vn)
+ spin_unlock(&vn->busy.lock);
+
/* zero-fill memory holes */
return count - remains + zero_iter(iter, remains);
finished:
/* Nothing remains, or We couldn't copy/zero everything. */
- spin_unlock(&vmap_area_lock);
+ if (vn)
+ spin_unlock(&vn->busy.lock);
return count - remains;
}
@@ -4140,9 +4645,8 @@ retry:
/* It is a BUG(), but trigger recovery instead. */
goto recovery;
- ret = adjust_va_to_fit_type(&free_vmap_area_root,
- &free_vmap_area_list,
- va, start, size);
+ ret = va_clip(&free_vmap_area_root,
+ &free_vmap_area_list, va, start, size);
if (WARN_ON_ONCE(unlikely(ret)))
/* It is a BUG(), but trigger recovery instead. */
goto recovery;
@@ -4162,14 +4666,15 @@ retry:
}
/* insert all vm's */
- spin_lock(&vmap_area_lock);
for (area = 0; area < nr_vms; area++) {
- insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);
+ struct vmap_node *vn = addr_to_node(vas[area]->va_start);
+ spin_lock(&vn->busy.lock);
+ insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head);
setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
pcpu_get_vm_areas);
+ spin_unlock(&vn->busy.lock);
}
- spin_unlock(&vmap_area_lock);
/*
* Mark allocated areas as accessible. Do it now as a best-effort
@@ -4278,60 +4783,39 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
#ifdef CONFIG_PRINTK
bool vmalloc_dump_obj(void *object)
{
- void *objp = (void *)PAGE_ALIGN((unsigned long)object);
const void *caller;
struct vm_struct *vm;
struct vmap_area *va;
+ struct vmap_node *vn;
unsigned long addr;
unsigned int nr_pages;
- if (!spin_trylock(&vmap_area_lock))
+ addr = PAGE_ALIGN((unsigned long) object);
+ vn = addr_to_node(addr);
+
+ if (!spin_trylock(&vn->busy.lock))
return false;
- va = __find_vmap_area((unsigned long)objp, &vmap_area_root);
- if (!va) {
- spin_unlock(&vmap_area_lock);
+
+ va = __find_vmap_area(addr, &vn->busy.root);
+ if (!va || !va->vm) {
+ spin_unlock(&vn->busy.lock);
return false;
}
vm = va->vm;
- if (!vm) {
- spin_unlock(&vmap_area_lock);
- return false;
- }
- addr = (unsigned long)vm->addr;
+ addr = (unsigned long) vm->addr;
caller = vm->caller;
nr_pages = vm->nr_pages;
- spin_unlock(&vmap_area_lock);
+ spin_unlock(&vn->busy.lock);
+
pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
nr_pages, addr, caller);
+
return true;
}
#endif
#ifdef CONFIG_PROC_FS
-static void *s_start(struct seq_file *m, loff_t *pos)
- __acquires(&vmap_purge_lock)
- __acquires(&vmap_area_lock)
-{
- mutex_lock(&vmap_purge_lock);
- spin_lock(&vmap_area_lock);
-
- return seq_list_start(&vmap_area_list, *pos);
-}
-
-static void *s_next(struct seq_file *m, void *p, loff_t *pos)
-{
- return seq_list_next(p, &vmap_area_list, pos);
-}
-
-static void s_stop(struct seq_file *m, void *p)
- __releases(&vmap_area_lock)
- __releases(&vmap_purge_lock)
-{
- spin_unlock(&vmap_area_lock);
- mutex_unlock(&vmap_purge_lock);
-}
-
static void show_numa_info(struct seq_file *m, struct vm_struct *v)
{
if (IS_ENABLED(CONFIG_NUMA)) {
@@ -4358,102 +4842,237 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
static void show_purge_info(struct seq_file *m)
{
+ struct vmap_node *vn;
struct vmap_area *va;
+ int i;
- spin_lock(&purge_vmap_area_lock);
- list_for_each_entry(va, &purge_vmap_area_list, list) {
- seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
- (void *)va->va_start, (void *)va->va_end,
- va->va_end - va->va_start);
+ for (i = 0; i < nr_vmap_nodes; i++) {
+ vn = &vmap_nodes[i];
+
+ spin_lock(&vn->lazy.lock);
+ list_for_each_entry(va, &vn->lazy.head, list) {
+ seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
+ (void *)va->va_start, (void *)va->va_end,
+ va->va_end - va->va_start);
+ }
+ spin_unlock(&vn->lazy.lock);
}
- spin_unlock(&purge_vmap_area_lock);
}
-static int s_show(struct seq_file *m, void *p)
+static int vmalloc_info_show(struct seq_file *m, void *p)
{
+ struct vmap_node *vn;
struct vmap_area *va;
struct vm_struct *v;
+ int i;
- va = list_entry(p, struct vmap_area, list);
+ for (i = 0; i < nr_vmap_nodes; i++) {
+ vn = &vmap_nodes[i];
- if (!va->vm) {
- if (va->flags & VMAP_RAM)
- seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
- (void *)va->va_start, (void *)va->va_end,
- va->va_end - va->va_start);
+ spin_lock(&vn->busy.lock);
+ list_for_each_entry(va, &vn->busy.head, list) {
+ if (!va->vm) {
+ if (va->flags & VMAP_RAM)
+ seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
+ (void *)va->va_start, (void *)va->va_end,
+ va->va_end - va->va_start);
- goto final;
- }
+ continue;
+ }
- v = va->vm;
+ v = va->vm;
- seq_printf(m, "0x%pK-0x%pK %7ld",
- v->addr, v->addr + v->size, v->size);
+ seq_printf(m, "0x%pK-0x%pK %7ld",
+ v->addr, v->addr + v->size, v->size);
- if (v->caller)
- seq_printf(m, " %pS", v->caller);
+ if (v->caller)
+ seq_printf(m, " %pS", v->caller);
- if (v->nr_pages)
- seq_printf(m, " pages=%d", v->nr_pages);
+ if (v->nr_pages)
+ seq_printf(m, " pages=%d", v->nr_pages);
- if (v->phys_addr)
- seq_printf(m, " phys=%pa", &v->phys_addr);
+ if (v->phys_addr)
+ seq_printf(m, " phys=%pa", &v->phys_addr);
- if (v->flags & VM_IOREMAP)
- seq_puts(m, " ioremap");
+ if (v->flags & VM_IOREMAP)
+ seq_puts(m, " ioremap");
- if (v->flags & VM_ALLOC)
- seq_puts(m, " vmalloc");
+ if (v->flags & VM_SPARSE)
+ seq_puts(m, " sparse");
- if (v->flags & VM_MAP)
- seq_puts(m, " vmap");
+ if (v->flags & VM_ALLOC)
+ seq_puts(m, " vmalloc");
- if (v->flags & VM_USERMAP)
- seq_puts(m, " user");
+ if (v->flags & VM_MAP)
+ seq_puts(m, " vmap");
- if (v->flags & VM_DMA_COHERENT)
- seq_puts(m, " dma-coherent");
+ if (v->flags & VM_USERMAP)
+ seq_puts(m, " user");
- if (is_vmalloc_addr(v->pages))
- seq_puts(m, " vpages");
+ if (v->flags & VM_DMA_COHERENT)
+ seq_puts(m, " dma-coherent");
- show_numa_info(m, v);
- seq_putc(m, '\n');
+ if (is_vmalloc_addr(v->pages))
+ seq_puts(m, " vpages");
+
+ show_numa_info(m, v);
+ seq_putc(m, '\n');
+ }
+ spin_unlock(&vn->busy.lock);
+ }
/*
* As a final step, dump "unpurged" areas.
*/
-final:
- if (list_is_last(&va->list, &vmap_area_list))
- show_purge_info(m);
-
+ show_purge_info(m);
return 0;
}
-static const struct seq_operations vmalloc_op = {
- .start = s_start,
- .next = s_next,
- .stop = s_stop,
- .show = s_show,
-};
-
static int __init proc_vmalloc_init(void)
{
+ void *priv_data = NULL;
+
if (IS_ENABLED(CONFIG_NUMA))
- proc_create_seq_private("vmallocinfo", 0400, NULL,
- &vmalloc_op,
- nr_node_ids * sizeof(unsigned int), NULL);
- else
- proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op);
+ priv_data = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
+
+ proc_create_single_data("vmallocinfo",
+ 0400, NULL, vmalloc_info_show, priv_data);
+
return 0;
}
module_init(proc_vmalloc_init);
#endif
+static void __init vmap_init_free_space(void)
+{
+ unsigned long vmap_start = 1;
+ const unsigned long vmap_end = ULONG_MAX;
+ struct vmap_area *free;
+ struct vm_struct *busy;
+
+ /*
+ * B F B B B F
+ * -|-----|.....|-----|-----|-----|.....|-
+ * | The KVA space |
+ * |<--------------------------------->|
+ */
+ for (busy = vmlist; busy; busy = busy->next) {
+ if ((unsigned long) busy->addr - vmap_start > 0) {
+ free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
+ if (!WARN_ON_ONCE(!free)) {
+ free->va_start = vmap_start;
+ free->va_end = (unsigned long) busy->addr;
+
+ insert_vmap_area_augment(free, NULL,
+ &free_vmap_area_root,
+ &free_vmap_area_list);
+ }
+ }
+
+ vmap_start = (unsigned long) busy->addr + busy->size;
+ }
+
+ if (vmap_end - vmap_start > 0) {
+ free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
+ if (!WARN_ON_ONCE(!free)) {
+ free->va_start = vmap_start;
+ free->va_end = vmap_end;
+
+ insert_vmap_area_augment(free, NULL,
+ &free_vmap_area_root,
+ &free_vmap_area_list);
+ }
+ }
+}
+
+static void vmap_init_nodes(void)
+{
+ struct vmap_node *vn;
+ int i, n;
+
+#if BITS_PER_LONG == 64
+ /*
+ * The maximum number of nodes is capped at 128, so there is one
+ * node per core on systems with up to that many cores.
+ *
+ * As for NUMA-aware nodes: for bigger systems, for example
+ * multi-socket NUMA machines that can end up with thousands of
+ * cores in total, a "sub-numa-clustering" should be added.
+ *
+ * In that case a NUMA domain is treated as a single entity with
+ * dedicated sub-nodes in it, each describing one group or set of
+ * cores. Per-domain purging and per-domain balancing would then
+ * have to be added as well.
+ */
+ n = clamp_t(unsigned int, num_possible_cpus(), 1, 128);
+
+ if (n > 1) {
+ vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT | __GFP_NOWARN);
+ if (vn) {
+ /* Node partition is 16 pages. */
+ vmap_zone_size = (1 << 4) * PAGE_SIZE;
+ nr_vmap_nodes = n;
+ vmap_nodes = vn;
+ } else {
+ pr_err("Failed to allocate an array. Disable a node layer\n");
+ }
+ }
+#endif
+
+ for (n = 0; n < nr_vmap_nodes; n++) {
+ vn = &vmap_nodes[n];
+ vn->busy.root = RB_ROOT;
+ INIT_LIST_HEAD(&vn->busy.head);
+ spin_lock_init(&vn->busy.lock);
+
+ vn->lazy.root = RB_ROOT;
+ INIT_LIST_HEAD(&vn->lazy.head);
+ spin_lock_init(&vn->lazy.lock);
+
+ for (i = 0; i < MAX_VA_SIZE_PAGES; i++) {
+ INIT_LIST_HEAD(&vn->pool[i].head);
+ WRITE_ONCE(vn->pool[i].len, 0);
+ }
+
+ spin_lock_init(&vn->pool_lock);
+ }
+}
+
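
As a concrete illustration (assumed machine): on a 64-bit system with 256 possible CPUs, n = clamp(256, 1, 128) = 128, so 128 nodes are allocated and the address space is striped across them in 16-page zones (64 KiB with 4 KiB pages). On a 32-bit system, or if the array allocation fails, everything stays in the single statically allocated node.
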
+static unsigned long
+vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+ unsigned long count;
+ struct vmap_node *vn;
+ int i, j;
+
+ for (count = 0, i = 0; i < nr_vmap_nodes; i++) {
+ vn = &vmap_nodes[i];
+
+ for (j = 0; j < MAX_VA_SIZE_PAGES; j++)
+ count += READ_ONCE(vn->pool[j].len);
+ }
+
+ return count ? count : SHRINK_EMPTY;
+}
+
+static unsigned long
+vmap_node_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+ int i;
+
+ for (i = 0; i < nr_vmap_nodes; i++)
+ decay_va_pool_node(&vmap_nodes[i], true);
+
+ return SHRINK_STOP;
+}
+
void __init vmalloc_init(void)
{
+ struct shrinker *vmap_node_shrinker;
struct vmap_area *va;
+ struct vmap_node *vn;
struct vm_struct *tmp;
int i;
@@ -4475,6 +5094,11 @@ void __init vmalloc_init(void)
xa_init(&vbq->vmap_blocks);
}
+ /*
+ * Set up the nodes before importing the vmlist.
+ */
+ vmap_init_nodes();
+
/* Import existing vmlist entries. */
for (tmp = vmlist; tmp; tmp = tmp->next) {
va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
@@ -4484,7 +5108,9 @@ void __init vmalloc_init(void)
va->va_start = (unsigned long)tmp->addr;
va->va_end = va->va_start + tmp->size;
va->vm = tmp;
- insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+
+ vn = addr_to_node(va->va_start);
+ insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
}
/*
@@ -4492,4 +5118,14 @@ void __init vmalloc_init(void)
*/
vmap_init_free_space();
vmap_initialized = true;
+
+ vmap_node_shrinker = shrinker_alloc(0, "vmap-node");
+ if (!vmap_node_shrinker) {
+ pr_err("Failed to allocate vmap-node shrinker!\n");
+ return;
+ }
+
+ vmap_node_shrinker->count_objects = vmap_node_shrink_count;
+ vmap_node_shrinker->scan_objects = vmap_node_shrink_scan;
+ shrinker_register(vmap_node_shrinker);
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4255619a1a..3ef654addd 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -108,6 +108,12 @@ struct scan_control {
/* Can folios be swapped as part of reclaim? */
unsigned int may_swap:1;
+ /* Do not allow cache_trim_mode to be turned on as part of reclaim? */
+ unsigned int no_cache_trim_mode:1;
+
+ /* Has cache_trim_mode failed at least once? */
+ unsigned int cache_trim_mode_failed:1;
+
/* Proactive reclaim invoked by userspace through memory.reclaim */
unsigned int proactive:1;
@@ -1006,14 +1012,15 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
struct pglist_data *pgdat, struct scan_control *sc,
struct reclaim_stat *stat, bool ignore_references)
{
+ struct folio_batch free_folios;
LIST_HEAD(ret_folios);
- LIST_HEAD(free_folios);
LIST_HEAD(demote_folios);
unsigned int nr_reclaimed = 0;
unsigned int pgactivate = 0;
bool do_demote_pass;
struct swap_iocb *plug = NULL;
+ folio_batch_init(&free_folios);
memset(stat, 0, sizeof(*stat));
cond_resched();
do_demote_pass = can_demote(pgdat->node_id, sc);
@@ -1412,14 +1419,14 @@ free_it:
*/
nr_reclaimed += nr_pages;
- /*
- * Is there need to periodically free_folio_list? It would
- * appear not as the counts should be low
- */
- if (unlikely(folio_test_large(folio)))
- destroy_large_folio(folio);
- else
- list_add(&folio->lru, &free_folios);
+ if (folio_test_large(folio) &&
+ folio_test_large_rmappable(folio))
+ folio_undo_large_rmappable(folio);
+ if (folio_batch_add(&free_folios, folio) == 0) {
+ mem_cgroup_uncharge_folios(&free_folios);
+ try_to_unmap_flush();
+ free_unref_folios(&free_folios);
+ }
continue;
activate_locked_split:
@@ -1483,9 +1490,9 @@ keep:
pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
- mem_cgroup_uncharge_list(&free_folios);
+ mem_cgroup_uncharge_folios(&free_folios);
try_to_unmap_flush();
- free_unref_page_list(&free_folios);
+ free_unref_folios(&free_folios);
list_splice(&ret_folios, folio_list);
count_vm_events(PGACTIVATE, pgactivate);
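
The hunks above replace the open-coded free lists with the folio_batch pattern: add freed folios to a batch and drain it whenever folio_batch_add() reports it full, then drain the remainder at the end. A minimal helper sketch of that pattern follows (illustration only, not part of the patch; the helper name is assumed):

static void free_folio_list_batched(struct list_head *list)
{
	struct folio_batch fbatch;
	struct folio *folio, *next;

	folio_batch_init(&fbatch);
	list_for_each_entry_safe(folio, next, list, lru) {
		list_del(&folio->lru);
		/* A zero return means the batch has just become full. */
		if (folio_batch_add(&fbatch, folio) == 0) {
			mem_cgroup_uncharge_folios(&fbatch);
			free_unref_folios(&fbatch);
		}
	}
	/* Drain whatever is left in the partially filled batch. */
	mem_cgroup_uncharge_folios(&fbatch);
	free_unref_folios(&fbatch);
}

As the in-loop flush in shrink_folio_list() relies on, free_unref_folios() leaves the batch ready for reuse after draining it.
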
@@ -1744,17 +1751,17 @@ bool folio_isolate_lru(struct folio *folio)
* the LRU list will go small and be scanned faster than necessary, leading to
* unnecessary swapping, thrashing and OOM.
*/
-static int too_many_isolated(struct pglist_data *pgdat, int file,
+static bool too_many_isolated(struct pglist_data *pgdat, int file,
struct scan_control *sc)
{
unsigned long inactive, isolated;
bool too_many;
if (current_is_kswapd())
- return 0;
+ return false;
if (!writeback_throttling_sane(sc))
- return 0;
+ return false;
if (file) {
inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
@@ -1783,7 +1790,6 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
/*
* move_folios_to_lru() moves folios from private @list to appropriate LRU list.
- * On return, @list is reused as a list of folios to be freed by the caller.
*
* Returns the number of pages moved to the given lruvec.
*/
@@ -1791,8 +1797,9 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec,
struct list_head *list)
{
int nr_pages, nr_moved = 0;
- LIST_HEAD(folios_to_free);
+ struct folio_batch free_folios;
+ folio_batch_init(&free_folios);
while (!list_empty(list)) {
struct folio *folio = lru_to_folio(list);
@@ -1821,12 +1828,15 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec,
if (unlikely(folio_put_testzero(folio))) {
__folio_clear_lru_flags(folio);
- if (unlikely(folio_test_large(folio))) {
+ if (folio_test_large(folio) &&
+ folio_test_large_rmappable(folio))
+ folio_undo_large_rmappable(folio);
+ if (folio_batch_add(&free_folios, folio) == 0) {
spin_unlock_irq(&lruvec->lru_lock);
- destroy_large_folio(folio);
+ mem_cgroup_uncharge_folios(&free_folios);
+ free_unref_folios(&free_folios);
spin_lock_irq(&lruvec->lru_lock);
- } else
- list_add(&folio->lru, &folios_to_free);
+ }
continue;
}
@@ -1843,10 +1853,12 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec,
workingset_age_nonresident(lruvec, nr_pages);
}
- /*
- * To save our caller's stack, now use input list for pages to free.
- */
- list_splice(&folios_to_free, list);
+ if (free_folios.nr) {
+ spin_unlock_irq(&lruvec->lru_lock);
+ mem_cgroup_uncharge_folios(&free_folios);
+ free_unref_folios(&free_folios);
+ spin_lock_irq(&lruvec->lru_lock);
+ }
return nr_moved;
}
@@ -1925,8 +1937,6 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
spin_unlock_irq(&lruvec->lru_lock);
lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed);
- mem_cgroup_uncharge_list(&folio_list);
- free_unref_page_list(&folio_list);
/*
* If dirty folios are scanned that are not queued for IO, it
@@ -1998,7 +2008,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
LIST_HEAD(l_inactive);
unsigned nr_deactivate, nr_activate;
unsigned nr_rotated = 0;
- int file = is_file_lru(lru);
+ bool file = is_file_lru(lru);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
lru_add_drain();
@@ -2067,8 +2077,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
nr_activate = move_folios_to_lru(lruvec, &l_active);
nr_deactivate = move_folios_to_lru(lruvec, &l_inactive);
- /* Keep all free folios in l_active list */
- list_splice(&l_inactive, &l_active);
__count_vm_events(PGDEACTIVATE, nr_deactivate);
__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
@@ -2078,14 +2086,13 @@ static void shrink_active_list(unsigned long nr_to_scan,
if (nr_rotated)
lru_note_cost(lruvec, file, 0, nr_rotated);
- mem_cgroup_uncharge_list(&l_active);
- free_unref_page_list(&l_active);
trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
nr_deactivate, nr_rotated, sc->priority, file);
}
static unsigned int reclaim_folio_list(struct list_head *folio_list,
- struct pglist_data *pgdat)
+ struct pglist_data *pgdat,
+ bool ignore_references)
{
struct reclaim_stat dummy_stat;
unsigned int nr_reclaimed;
@@ -2098,7 +2105,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list,
.no_demotion = 1,
};
- nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, false);
+ nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, ignore_references);
while (!list_empty(folio_list)) {
folio = lru_to_folio(folio_list);
list_del(&folio->lru);
@@ -2108,7 +2115,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list,
return nr_reclaimed;
}
-unsigned long reclaim_pages(struct list_head *folio_list)
+unsigned long reclaim_pages(struct list_head *folio_list, bool ignore_references)
{
int nid;
unsigned int nr_reclaimed = 0;
@@ -2130,11 +2137,12 @@ unsigned long reclaim_pages(struct list_head *folio_list)
continue;
}
- nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid));
+ nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid),
+ ignore_references);
nid = folio_nid(lru_to_folio(folio_list));
} while (!list_empty(folio_list));
- nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid));
+ nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid), ignore_references);
memalloc_noreclaim_restore(noreclaim_flag);
@@ -2269,7 +2277,8 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
* anonymous pages.
*/
file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
- if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
+ if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE) &&
+ !sc->no_cache_trim_mode)
sc->cache_trim_mode = 1;
else
sc->cache_trim_mode = 0;
@@ -2412,7 +2421,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
denominator = ap + fp;
out:
for_each_evictable_lru(lru) {
- int file = is_file_lru(lru);
+ bool file = is_file_lru(lru);
unsigned long lruvec_size;
unsigned long low, min;
unsigned long scan;
@@ -2879,38 +2888,37 @@ static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk)
#endif
-static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last)
+static void reset_mm_stats(struct lru_gen_mm_walk *walk, bool last)
{
int i;
int hist;
+ struct lruvec *lruvec = walk->lruvec;
struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
- if (walk) {
- hist = lru_hist_from_seq(walk->max_seq);
+ hist = lru_hist_from_seq(walk->seq);
- for (i = 0; i < NR_MM_STATS; i++) {
- WRITE_ONCE(mm_state->stats[hist][i],
- mm_state->stats[hist][i] + walk->mm_stats[i]);
- walk->mm_stats[i] = 0;
- }
+ for (i = 0; i < NR_MM_STATS; i++) {
+ WRITE_ONCE(mm_state->stats[hist][i],
+ mm_state->stats[hist][i] + walk->mm_stats[i]);
+ walk->mm_stats[i] = 0;
}
if (NR_HIST_GENS > 1 && last) {
- hist = lru_hist_from_seq(mm_state->seq + 1);
+ hist = lru_hist_from_seq(walk->seq + 1);
for (i = 0; i < NR_MM_STATS; i++)
WRITE_ONCE(mm_state->stats[hist][i], 0);
}
}
-static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
- struct mm_struct **iter)
+static bool iterate_mm_list(struct lru_gen_mm_walk *walk, struct mm_struct **iter)
{
bool first = false;
bool last = false;
struct mm_struct *mm = NULL;
+ struct lruvec *lruvec = walk->lruvec;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
@@ -2927,9 +2935,9 @@ static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
*/
spin_lock(&mm_list->lock);
- VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq);
+ VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->seq);
- if (walk->max_seq <= mm_state->seq)
+ if (walk->seq <= mm_state->seq)
goto done;
if (!mm_state->head)
@@ -2954,12 +2962,12 @@ static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
} while (!(mm = get_next_mm(walk)));
done:
if (*iter || last)
- reset_mm_stats(lruvec, walk, last);
+ reset_mm_stats(walk, last);
spin_unlock(&mm_list->lock);
if (mm && first)
- reset_bloom_filter(mm_state, walk->max_seq + 1);
+ reset_bloom_filter(mm_state, walk->seq + 1);
if (*iter)
mmput_async(*iter);
@@ -2969,7 +2977,7 @@ done:
return last;
}
-static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq)
+static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long seq)
{
bool success = false;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
@@ -2978,13 +2986,12 @@ static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq)
spin_lock(&mm_list->lock);
- VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq);
+ VM_WARN_ON_ONCE(mm_state->seq + 1 < seq);
- if (max_seq > mm_state->seq) {
+ if (seq > mm_state->seq) {
mm_state->head = NULL;
mm_state->tail = NULL;
WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
- reset_mm_stats(lruvec, NULL, true);
success = true;
}
@@ -3159,9 +3166,10 @@ static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio,
walk->nr_pages[new_gen][type][zone] += delta;
}
-static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
+static void reset_batch_size(struct lru_gen_mm_walk *walk)
{
int gen, type, zone;
+ struct lruvec *lruvec = walk->lruvec;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
walk->batched = 0;
@@ -3331,7 +3339,8 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
struct lru_gen_mm_walk *walk = args->private;
struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
- int old_gen, new_gen = lru_gen_from_seq(walk->max_seq);
+ DEFINE_MAX_SEQ(walk->lruvec);
+ int old_gen, new_gen = lru_gen_from_seq(max_seq);
pte = pte_offset_map_nolock(args->mm, pmd, start & PMD_MASK, &ptl);
if (!pte)
@@ -3398,7 +3407,8 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area
struct lru_gen_mm_walk *walk = args->private;
struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
- int old_gen, new_gen = lru_gen_from_seq(walk->max_seq);
+ DEFINE_MAX_SEQ(walk->lruvec);
+ int old_gen, new_gen = lru_gen_from_seq(max_seq);
VM_WARN_ON_ONCE(pud_leaf(*pud));
@@ -3529,7 +3539,7 @@ restart:
walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
}
- if (!walk->force_scan && !test_bloom_filter(mm_state, walk->max_seq, pmd + i))
+ if (!walk->force_scan && !test_bloom_filter(mm_state, walk->seq, pmd + i))
continue;
walk->mm_stats[MM_NONLEAF_FOUND]++;
@@ -3540,7 +3550,7 @@ restart:
walk->mm_stats[MM_NONLEAF_ADDED]++;
/* carry over to the next generation */
- update_bloom_filter(mm_state, walk->max_seq + 1, pmd + i);
+ update_bloom_filter(mm_state, walk->seq + 1, pmd + i);
}
walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first);
@@ -3591,7 +3601,7 @@ done:
return -EAGAIN;
}
-static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_mm_walk *walk)
+static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
{
static const struct mm_walk_ops mm_walk_ops = {
.test_walk = should_skip_vma,
@@ -3600,6 +3610,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_
};
int err;
+ struct lruvec *lruvec = walk->lruvec;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
walk->next_addr = FIRST_USER_ADDRESS;
@@ -3610,7 +3621,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_
err = -EBUSY;
/* another thread might have called inc_max_seq() */
- if (walk->max_seq != max_seq)
+ if (walk->seq != max_seq)
break;
/* folio_update_gen() requires stable folio_memcg() */
@@ -3628,7 +3639,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_
if (walk->batched) {
spin_lock_irq(&lruvec->lru_lock);
- reset_batch_size(lruvec, walk);
+ reset_batch_size(walk);
spin_unlock_irq(&lruvec->lru_lock);
}
@@ -3747,7 +3758,7 @@ next:
return success;
}
-static bool inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
+static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq,
bool can_swap, bool force_scan)
{
bool success;
@@ -3755,14 +3766,14 @@ static bool inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
int type, zone;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
restart:
- if (max_seq < READ_ONCE(lrugen->max_seq))
+ if (seq < READ_ONCE(lrugen->max_seq))
return false;
spin_lock_irq(&lruvec->lru_lock);
VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
- success = max_seq == lrugen->max_seq;
+ success = seq == lrugen->max_seq;
if (!success)
goto unlock;
@@ -3815,8 +3826,8 @@ unlock:
return success;
}
-static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
- struct scan_control *sc, bool can_swap, bool force_scan)
+static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq,
+ bool can_swap, bool force_scan)
{
bool success;
struct lru_gen_mm_walk *walk;
@@ -3824,13 +3835,13 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
struct lru_gen_folio *lrugen = &lruvec->lrugen;
struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
- VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
+ VM_WARN_ON_ONCE(seq > READ_ONCE(lrugen->max_seq));
if (!mm_state)
- return inc_max_seq(lruvec, max_seq, can_swap, force_scan);
+ return inc_max_seq(lruvec, seq, can_swap, force_scan);
/* see the comment in iterate_mm_list() */
- if (max_seq <= READ_ONCE(mm_state->seq))
+ if (seq <= READ_ONCE(mm_state->seq))
return false;
/*
@@ -3840,29 +3851,29 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
* is less efficient, but it avoids bursty page faults.
*/
if (!should_walk_mmu()) {
- success = iterate_mm_list_nowalk(lruvec, max_seq);
+ success = iterate_mm_list_nowalk(lruvec, seq);
goto done;
}
walk = set_mm_walk(NULL, true);
if (!walk) {
- success = iterate_mm_list_nowalk(lruvec, max_seq);
+ success = iterate_mm_list_nowalk(lruvec, seq);
goto done;
}
walk->lruvec = lruvec;
- walk->max_seq = max_seq;
+ walk->seq = seq;
walk->can_swap = can_swap;
walk->force_scan = force_scan;
do {
- success = iterate_mm_list(lruvec, walk, &mm);
+ success = iterate_mm_list(walk, &mm);
if (mm)
- walk_mm(lruvec, mm, walk);
+ walk_mm(mm, walk);
} while (mm);
done:
if (success) {
- success = inc_max_seq(lruvec, max_seq, can_swap, force_scan);
+ success = inc_max_seq(lruvec, seq, can_swap, force_scan);
WARN_ON_ONCE(!success);
}
@@ -4287,7 +4298,7 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca
{
bool success;
- /* swapping inhibited */
+ /* swap constrained */
if (!(sc->gfp_mask & __GFP_IO) &&
(folio_test_dirty(folio) ||
(folio_test_anon(folio) && !folio_test_swapcache(folio))))
@@ -4456,9 +4467,12 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw
DEFINE_MIN_SEQ(lruvec);
/*
- * Try to make the obvious choice first. When anon and file are both
- * available from the same generation, interpret swappiness 1 as file
- * first and 200 as anon first.
+ * Try to make the obvious choice first, and if anon and file are both
+ * available from the same generation,
+ * 1. Interpret swappiness 1 as file first and MAX_SWAPPINESS as anon
+ * first.
+ * 2. If !__GFP_IO, file first since clean pagecache is more likely to
+ * exist than clean swapcache.
*/
if (!swappiness)
type = LRU_GEN_FILE;
@@ -4468,6 +4482,8 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw
type = LRU_GEN_FILE;
else if (swappiness == 200)
type = LRU_GEN_ANON;
+ else if (!(sc->gfp_mask & __GFP_IO))
+ type = LRU_GEN_FILE;
else
type = get_type_to_scan(lruvec, swappiness, &tier);
@@ -4558,8 +4574,10 @@ retry:
move_folios_to_lru(lruvec, &list);
walk = current->reclaim_state->mm_walk;
- if (walk && walk->batched)
- reset_batch_size(lruvec, walk);
+ if (walk && walk->batched) {
+ walk->lruvec = lruvec;
+ reset_batch_size(walk);
+ }
item = PGSTEAL_KSWAPD + reclaimer_offset();
if (!cgroup_reclaim(sc))
@@ -4569,10 +4587,6 @@ retry:
spin_unlock_irq(&lruvec->lru_lock);
- mem_cgroup_uncharge_list(&list);
- free_unref_page_list(&list);
-
- INIT_LIST_HEAD(&list);
list_splice_init(&clean, &list);
if (!list_empty(&list)) {
@@ -4584,14 +4598,13 @@ retry:
}
static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
- struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
+ bool can_swap, unsigned long *nr_to_scan)
{
int gen, type, zone;
unsigned long old = 0;
unsigned long young = 0;
unsigned long total = 0;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
- struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MIN_SEQ(lruvec);
/* whether this lruvec is completely out of cold folios */
@@ -4619,13 +4632,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
}
}
- /* try to scrape all its memory if this memcg was deleted */
- if (!mem_cgroup_online(memcg)) {
- *nr_to_scan = total;
- return false;
- }
-
- *nr_to_scan = total >> sc->priority;
+ *nr_to_scan = total;
/*
* The aging tries to be lazy to reduce the overhead, while the eviction
@@ -4657,6 +4664,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
*/
static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap)
{
+ bool success;
unsigned long nr_to_scan;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MAX_SEQ(lruvec);
@@ -4664,15 +4672,18 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool
if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg))
return -1;
- if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan))
- return nr_to_scan;
+ success = should_run_aging(lruvec, max_seq, can_swap, &nr_to_scan);
- /* skip the aging path at the default priority */
- if (sc->priority == DEF_PRIORITY)
+ /* try to scrape all its memory if this memcg was deleted */
+ if (nr_to_scan && !mem_cgroup_online(memcg))
return nr_to_scan;
- /* skip this lruvec as it's low on cold folios */
- return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0;
+ /* try to get away with not aging at the default priority */
+ if (!success || sc->priority == DEF_PRIORITY)
+ return nr_to_scan >> sc->priority;
+
+ /* stop scanning this lruvec as it's low on cold folios */
+ return try_to_inc_max_seq(lruvec, max_seq, can_swap, false) ? -1 : 0;
}
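/*
 * A minimal worked example of the reworked get_nr_to_scan() above, assuming
 * DEF_PRIORITY == 12 as in mainline: if should_run_aging() sets nr_to_scan
 * to 8192, a reclaimer at the default priority scans 8192 >> 12 = 2 folios,
 * while an offlined memcg skips the shift and gets the full 8192 so its
 * memory can be scraped. Only when aging is still needed and the priority
 * has already dropped below DEF_PRIORITY does the function fall through to
 * try_to_inc_max_seq(), returning -1 (stop this lruvec) or 0.
 */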
static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc)
@@ -4712,10 +4723,6 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
unsigned long scanned = 0;
int swappiness = get_swappiness(lruvec, sc);
- /* clean file folios are more likely to exist */
- if (swappiness && !(sc->gfp_mask & __GFP_IO))
- swappiness = 1;
-
while (true) {
int delta;
@@ -4878,7 +4885,6 @@ static void set_initial_priority(struct pglist_data *pgdat, struct scan_control
{
int priority;
unsigned long reclaimable;
- struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
return;
@@ -4888,7 +4894,7 @@ static void set_initial_priority(struct pglist_data *pgdat, struct scan_control
* where reclaimed_to_scanned_ratio = inactive / total.
*/
reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
- if (get_swappiness(lruvec, sc))
+ if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
/* round down reclaimable and round up sc->nr_to_reclaim */
@@ -5332,7 +5338,7 @@ static const struct seq_operations lru_gen_seq_ops = {
.show = lru_gen_seq_show,
};
-static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
+static int run_aging(struct lruvec *lruvec, unsigned long seq,
bool can_swap, bool force_scan)
{
DEFINE_MAX_SEQ(lruvec);
@@ -5347,7 +5353,7 @@ static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_contr
if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq)
return -ERANGE;
- try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan);
+ try_to_inc_max_seq(lruvec, max_seq, can_swap, force_scan);
return 0;
}
@@ -5415,7 +5421,7 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
switch (cmd) {
case '+':
- err = run_aging(lruvec, seq, sc, swappiness, opt);
+ err = run_aging(lruvec, seq, swappiness, opt);
break;
case '-':
err = run_eviction(lruvec, seq, sc, swappiness, opt);
@@ -5987,6 +5993,8 @@ again:
*/
if (reclaimable)
pgdat->kswapd_failures = 0;
+ else if (sc->cache_trim_mode)
+ sc->cache_trim_mode_failed = 1;
}
/*
@@ -6799,6 +6807,7 @@ restart:
bool raise_priority = true;
bool balanced;
bool ret;
+ bool was_frozen;
sc.reclaim_idx = highest_zoneidx;
@@ -6897,9 +6906,9 @@ restart:
/* Check if kswapd should be suspending */
__fs_reclaim_release(_THIS_IP_);
- ret = try_to_freeze();
+ ret = kthread_freezable_should_stop(&was_frozen);
__fs_reclaim_acquire(_THIS_IP_);
- if (ret || kthread_should_stop())
+ if (was_frozen || ret)
break;
/*
@@ -6921,6 +6930,16 @@ restart:
sc.priority--;
} while (sc.priority >= 1);
+ /*
+ * Restart only if it went through the priority loop all the way,
+ * but cache_trim_mode didn't work.
+ */
+ if (!sc.nr_reclaimed && sc.priority < 1 &&
+ !sc.no_cache_trim_mode && sc.cache_trim_mode_failed) {
+ sc.no_cache_trim_mode = 1;
+ goto restart;
+ }
+
if (!sc.nr_reclaimed)
pgdat->kswapd_failures++;
@@ -7105,7 +7124,7 @@ static int kswapd(void *p)
WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
atomic_set(&pgdat->nr_writeback_throttled, 0);
for ( ; ; ) {
- bool ret;
+ bool was_frozen;
alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
highest_zoneidx = kswapd_highest_zoneidx(pgdat,
@@ -7122,15 +7141,14 @@ kswapd_try_sleep:
WRITE_ONCE(pgdat->kswapd_order, 0);
WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
- ret = try_to_freeze();
- if (kthread_should_stop())
+ if (kthread_freezable_should_stop(&was_frozen))
break;
/*
* We can speed up thawing tasks if we don't call balance_pgdat
* after returning from the refrigerator
*/
- if (ret)
+ if (was_frozen)
continue;
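/*
 * A minimal sketch of the freezer change above, assuming only the
 * kthread_freezable_should_stop() helper used in these hunks: it performs
 * the freeze itself and reports both conditions that used to be checked
 * separately.
 *
 *	bool was_frozen;
 *
 *	if (kthread_freezable_should_stop(&was_frozen))
 *		break;		/* kthread_should_stop() is true */
 *	if (was_frozen)
 *		continue;	/* just thawed; skip balance_pgdat() this pass */
 */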
/*
diff --git a/mm/workingset.c b/mm/workingset.c
index 2260129743..f2a0ecaf70 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -16,6 +16,7 @@
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm.h>
+#include "internal.h"
/*
* Double CLOCK lists
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 7c76b396b7..7ab0562105 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -364,8 +364,9 @@ static inline int __idx(struct z3fold_header *zhdr, enum buddy bud)
}
/*
- * Encodes the handle of a particular buddy within a z3fold page
- * Pool lock should be held as this function accesses first_num
+ * Encodes the handle of a particular buddy within a z3fold page.
+ * zhdr->page_lock should be held as this function accesses first_num
+ * if bud != HEADLESS.
*/
static unsigned long __encode_handle(struct z3fold_header *zhdr,
struct z3fold_buddy_slots *slots,
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index c937635e0a..7d7cb3eaab 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -110,13 +110,12 @@
#define OBJ_TAG_BITS 1
#define OBJ_TAG_MASK OBJ_ALLOCATED_TAG
-#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
+#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS)
#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
#define HUGE_BITS 1
#define FULLNESS_BITS 4
#define CLASS_BITS 8
-#define ISOLATED_BITS 5
#define MAGIC_VAL_BITS 8
#define MAX(a, b) ((a) >= (b) ? (a) : (b))
@@ -246,7 +245,6 @@ struct zspage {
unsigned int huge:HUGE_BITS;
unsigned int fullness:FULLNESS_BITS;
unsigned int class:CLASS_BITS + 1;
- unsigned int isolated:ISOLATED_BITS;
unsigned int magic:MAGIC_VAL_BITS;
};
unsigned int inuse;
@@ -278,18 +276,14 @@ static bool ZsHugePage(struct zspage *zspage)
static void migrate_lock_init(struct zspage *zspage);
static void migrate_read_lock(struct zspage *zspage);
static void migrate_read_unlock(struct zspage *zspage);
-
-#ifdef CONFIG_COMPACTION
static void migrate_write_lock(struct zspage *zspage);
-static void migrate_write_lock_nested(struct zspage *zspage);
static void migrate_write_unlock(struct zspage *zspage);
+
+#ifdef CONFIG_COMPACTION
static void kick_deferred_free(struct zs_pool *pool);
static void init_deferred_free(struct zs_pool *pool);
static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage);
#else
-static void migrate_write_lock(struct zspage *zspage) {}
-static void migrate_write_lock_nested(struct zspage *zspage) {}
-static void migrate_write_unlock(struct zspage *zspage) {}
static void kick_deferred_free(struct zs_pool *pool) {}
static void init_deferred_free(struct zs_pool *pool) {}
static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {}
@@ -476,30 +470,12 @@ static inline void set_freeobj(struct zspage *zspage, unsigned int obj)
zspage->freeobj = obj;
}
-static void get_zspage_mapping(struct zspage *zspage,
- unsigned int *class_idx,
- int *fullness)
-{
- BUG_ON(zspage->magic != ZSPAGE_MAGIC);
-
- *fullness = zspage->fullness;
- *class_idx = zspage->class;
-}
-
static struct size_class *zspage_class(struct zs_pool *pool,
struct zspage *zspage)
{
return pool->size_class[zspage->class];
}
-static void set_zspage_mapping(struct zspage *zspage,
- unsigned int class_idx,
- int fullness)
-{
- zspage->class = class_idx;
- zspage->fullness = fullness;
-}
-
/*
* zsmalloc divides the pool into various size classes where each
* class maintains a list of zspages where each zspage is divided
@@ -694,16 +670,17 @@ static void insert_zspage(struct size_class *class,
{
class_stat_inc(class, fullness, 1);
list_add(&zspage->list, &class->fullness_list[fullness]);
+ zspage->fullness = fullness;
}
/*
* This function removes the given zspage from the freelist identified
* by <class, fullness_group>.
*/
-static void remove_zspage(struct size_class *class,
- struct zspage *zspage,
- int fullness)
+static void remove_zspage(struct size_class *class, struct zspage *zspage)
{
+ int fullness = zspage->fullness;
+
VM_BUG_ON(list_empty(&class->fullness_list[fullness]));
list_del_init(&zspage->list);
@@ -721,17 +698,14 @@ static void remove_zspage(struct size_class *class,
*/
static int fix_fullness_group(struct size_class *class, struct zspage *zspage)
{
- int class_idx;
- int currfg, newfg;
+ int newfg;
- get_zspage_mapping(zspage, &class_idx, &currfg);
newfg = get_fullness_group(class, zspage);
- if (newfg == currfg)
+ if (newfg == zspage->fullness)
goto out;
- remove_zspage(class, zspage, currfg);
+ remove_zspage(class, zspage);
insert_zspage(class, zspage, newfg);
- set_zspage_mapping(zspage, class_idx, newfg);
out:
return newfg;
}
@@ -763,14 +737,12 @@ static struct page *get_next_page(struct page *page)
static void obj_to_location(unsigned long obj, struct page **page,
unsigned int *obj_idx)
{
- obj >>= OBJ_TAG_BITS;
*page = pfn_to_page(obj >> OBJ_INDEX_BITS);
*obj_idx = (obj & OBJ_INDEX_MASK);
}
static void obj_to_page(unsigned long obj, struct page **page)
{
- obj >>= OBJ_TAG_BITS;
*page = pfn_to_page(obj >> OBJ_INDEX_BITS);
}
@@ -785,7 +757,6 @@ static unsigned long location_to_obj(struct page *page, unsigned int obj_idx)
obj = page_to_pfn(page) << OBJ_INDEX_BITS;
obj |= obj_idx & OBJ_INDEX_MASK;
- obj <<= OBJ_TAG_BITS;
return obj;
}
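/*
 * With the OBJ_TAG_BITS shift dropped above, the packed obj value is a plain
 * (pfn, index) pair; a minimal round-trip sketch using only the macros
 * visible in this hunk:
 *
 *	unsigned long obj = (pfn << OBJ_INDEX_BITS) | (obj_idx & OBJ_INDEX_MASK);
 *	unsigned long pfn_back = obj >> OBJ_INDEX_BITS;	/* == pfn */
 *	unsigned int  idx_back = obj & OBJ_INDEX_MASK;	/* == obj_idx */
 */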
@@ -849,15 +820,11 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
struct zspage *zspage)
{
struct page *page, *next;
- int fg;
- unsigned int class_idx;
-
- get_zspage_mapping(zspage, &class_idx, &fg);
assert_spin_locked(&pool->lock);
VM_BUG_ON(get_zspage_inuse(zspage));
- VM_BUG_ON(fg != ZS_INUSE_RATIO_0);
+ VM_BUG_ON(zspage->fullness != ZS_INUSE_RATIO_0);
next = page = get_first_page(zspage);
do {
@@ -892,7 +859,7 @@ static void free_zspage(struct zs_pool *pool, struct size_class *class,
return;
}
- remove_zspage(class, zspage, ZS_INUSE_RATIO_0);
+ remove_zspage(class, zspage);
__free_zspage(pool, class, zspage);
}
@@ -1011,6 +978,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
create_page_chain(class, zspage, pages);
init_zspage(class, zspage);
zspage->pool = pool;
+ zspage->class = class->index;
return zspage;
}
@@ -1403,7 +1371,6 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
obj = obj_malloc(pool, zspage, handle);
newfg = get_fullness_group(class, zspage);
insert_zspage(class, zspage, newfg);
- set_zspage_mapping(zspage, class->index, newfg);
record_obj(handle, obj);
atomic_long_add(class->pages_per_zspage, &pool->pages_allocated);
class_stat_inc(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage);
@@ -1623,7 +1590,7 @@ static struct zspage *isolate_src_zspage(struct size_class *class)
zspage = list_first_entry_or_null(&class->fullness_list[fg],
struct zspage, list);
if (zspage) {
- remove_zspage(class, zspage, fg);
+ remove_zspage(class, zspage);
return zspage;
}
}
@@ -1640,7 +1607,7 @@ static struct zspage *isolate_dst_zspage(struct size_class *class)
zspage = list_first_entry_or_null(&class->fullness_list[fg],
struct zspage, list);
if (zspage) {
- remove_zspage(class, zspage, fg);
+ remove_zspage(class, zspage);
return zspage;
}
}
@@ -1661,7 +1628,6 @@ static int putback_zspage(struct size_class *class, struct zspage *zspage)
fullness = get_fullness_group(class, zspage);
insert_zspage(class, zspage, fullness);
- set_zspage_mapping(zspage, class->index, fullness);
return fullness;
}
@@ -1725,33 +1691,17 @@ static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock)
read_unlock(&zspage->lock);
}
-#ifdef CONFIG_COMPACTION
static void migrate_write_lock(struct zspage *zspage)
{
write_lock(&zspage->lock);
}
-static void migrate_write_lock_nested(struct zspage *zspage)
-{
- write_lock_nested(&zspage->lock, SINGLE_DEPTH_NESTING);
-}
-
static void migrate_write_unlock(struct zspage *zspage)
{
write_unlock(&zspage->lock);
}
-/* Number of isolated subpage for *page migration* in this zspage */
-static void inc_zspage_isolation(struct zspage *zspage)
-{
- zspage->isolated++;
-}
-
-static void dec_zspage_isolation(struct zspage *zspage)
-{
- VM_BUG_ON(zspage->isolated == 0);
- zspage->isolated--;
-}
+#ifdef CONFIG_COMPACTION
static const struct movable_operations zsmalloc_mops;
@@ -1780,21 +1730,12 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage,
static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
{
- struct zs_pool *pool;
- struct zspage *zspage;
-
/*
* Page is locked so zspage couldn't be destroyed. For detail, look at
* lock_zspage in free_zspage.
*/
VM_BUG_ON_PAGE(PageIsolated(page), page);
- zspage = get_zspage(page);
- pool = zspage->pool;
- spin_lock(&pool->lock);
- inc_zspage_isolation(zspage);
- spin_unlock(&pool->lock);
-
return true;
}
@@ -1859,7 +1800,6 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
kunmap_atomic(s_addr);
replace_sub_page(class, zspage, newpage, page);
- dec_zspage_isolation(zspage);
/*
* Since we complete the data copy and set up new zspage structure,
* it's okay to release the pool's lock.
@@ -1881,16 +1821,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
static void zs_page_putback(struct page *page)
{
- struct zs_pool *pool;
- struct zspage *zspage;
-
VM_BUG_ON_PAGE(!PageIsolated(page), page);
-
- zspage = get_zspage(page);
- pool = zspage->pool;
- spin_lock(&pool->lock);
- dec_zspage_isolation(zspage);
- spin_unlock(&pool->lock);
}
static const struct movable_operations zsmalloc_mops = {
@@ -1907,8 +1838,6 @@ static void async_free_zspage(struct work_struct *work)
{
int i;
struct size_class *class;
- unsigned int class_idx;
- int fullness;
struct zspage *zspage, *tmp;
LIST_HEAD(free_pages);
struct zs_pool *pool = container_of(work, struct zs_pool,
@@ -1929,10 +1858,8 @@ static void async_free_zspage(struct work_struct *work)
list_del(&zspage->list);
lock_zspage(zspage);
- get_zspage_mapping(zspage, &class_idx, &fullness);
- VM_BUG_ON(fullness != ZS_INUSE_RATIO_0);
- class = pool->size_class[class_idx];
spin_lock(&pool->lock);
+ class = zspage_class(pool, zspage);
__free_zspage(pool, class, zspage);
spin_unlock(&pool->lock);
}
@@ -2006,19 +1933,17 @@ static unsigned long __zs_compact(struct zs_pool *pool,
dst_zspage = isolate_dst_zspage(class);
if (!dst_zspage)
break;
- migrate_write_lock(dst_zspage);
}
src_zspage = isolate_src_zspage(class);
if (!src_zspage)
break;
- migrate_write_lock_nested(src_zspage);
-
+ migrate_write_lock(src_zspage);
migrate_zspage(pool, src_zspage, dst_zspage);
- fg = putback_zspage(class, src_zspage);
migrate_write_unlock(src_zspage);
+ fg = putback_zspage(class, src_zspage);
if (fg == ZS_INUSE_RATIO_0) {
free_zspage(pool, class, src_zspage);
pages_freed += class->pages_per_zspage;
@@ -2028,7 +1953,6 @@ static unsigned long __zs_compact(struct zs_pool *pool,
if (get_fullness_group(class, dst_zspage) == ZS_INUSE_RATIO_100
|| spin_is_contended(&pool->lock)) {
putback_zspage(class, dst_zspage);
- migrate_write_unlock(dst_zspage);
dst_zspage = NULL;
spin_unlock(&pool->lock);
@@ -2037,15 +1961,12 @@ static unsigned long __zs_compact(struct zs_pool *pool,
}
}
- if (src_zspage) {
+ if (src_zspage)
putback_zspage(class, src_zspage);
- migrate_write_unlock(src_zspage);
- }
- if (dst_zspage) {
+ if (dst_zspage)
putback_zspage(class, dst_zspage);
- migrate_write_unlock(dst_zspage);
- }
+
spin_unlock(&pool->lock);
return pages_freed;
diff --git a/mm/zswap.c b/mm/zswap.c
index 69766f2c5a..6f8850c44b 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -71,8 +71,6 @@ static u64 zswap_reject_compress_poor;
static u64 zswap_reject_alloc_fail;
/* Store failed because the entry metadata could not be allocated (rare) */
static u64 zswap_reject_kmemcache_fail;
-/* Duplicate store was encountered (rare) */
-static u64 zswap_duplicate_entry;
/* Shrinker work queue */
static struct workqueue_struct *shrink_wq;
@@ -141,10 +139,6 @@ static bool zswap_non_same_filled_pages_enabled = true;
module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled,
bool, 0644);
-static bool zswap_exclusive_loads_enabled = IS_ENABLED(
- CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON);
-module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644);
-
/* Number of zpools in zswap_pool (empirically determined for scalability) */
#define ZSWAP_NR_ZPOOLS 32
@@ -168,6 +162,7 @@ struct crypto_acomp_ctx {
struct crypto_wait wait;
u8 *buffer;
struct mutex mutex;
+ bool is_sleepable;
};
/*
@@ -179,18 +174,24 @@ struct crypto_acomp_ctx {
struct zswap_pool {
struct zpool *zpools[ZSWAP_NR_ZPOOLS];
struct crypto_acomp_ctx __percpu *acomp_ctx;
- struct kref kref;
+ struct percpu_ref ref;
struct list_head list;
struct work_struct release_work;
- struct work_struct shrink_work;
struct hlist_node node;
char tfm_name[CRYPTO_MAX_ALG_NAME];
- struct list_lru list_lru;
- struct mem_cgroup *next_shrink;
- struct shrinker *shrinker;
- atomic_t nr_stored;
};
+/* Global LRU lists shared by all zswap pools. */
+static struct list_lru zswap_list_lru;
+/* counter of pages stored in all zswap pools. */
+static atomic_t zswap_nr_stored = ATOMIC_INIT(0);
+
+/* The lock protects zswap_next_shrink updates. */
+static DEFINE_SPINLOCK(zswap_shrink_lock);
+static struct mem_cgroup *zswap_next_shrink;
+static struct work_struct zswap_shrink_work;
+static struct shrinker *zswap_shrinker;
+
/*
* struct zswap_entry
*
@@ -199,12 +200,6 @@ struct zswap_pool {
*
* rbnode - links the entry into red-black tree for the appropriate swap type
* swpentry - associated swap entry, the offset indexes into the red-black tree
- * refcount - the number of outstanding reference to the entry. This is needed
- * to protect against premature freeing of the entry by code
- * concurrent calls to load, invalidate, and writeback. The lock
- * for the zswap_tree structure that contains the entry must
- * be held while changing the refcount. Since the lock must
- * be held, there is no reason to also make refcount atomic.
* length - the length in bytes of the compressed page data. Needed during
* decompression. For a same value filled page length is 0, and both
* pool and lru are invalid and must be ignored.
@@ -217,7 +212,6 @@ struct zswap_pool {
struct zswap_entry {
struct rb_node rbnode;
swp_entry_t swpentry;
- int refcount;
unsigned int length;
struct zswap_pool *pool;
union {
@@ -228,17 +222,13 @@ struct zswap_entry {
struct list_head lru;
};
-/*
- * The tree lock in the zswap_tree struct protects a few things:
- * - the rbtree
- * - the refcount field of each entry in the tree
- */
struct zswap_tree {
struct rb_root rbroot;
spinlock_t lock;
};
static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
+static unsigned int nr_zswap_trees[MAX_SWAPFILES];
/* RCU-protected iteration */
static LIST_HEAD(zswap_pools);
@@ -265,15 +255,16 @@ static bool zswap_has_pool;
* helpers and fwd declarations
**********************************/
+static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp)
+{
+ return &zswap_trees[swp_type(swp)][swp_offset(swp)
+ >> SWAP_ADDRESS_SPACE_SHIFT];
+}
+
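/*
 * A minimal sketch of the per-range lookup above, assuming
 * SWAP_ADDRESS_SPACE_SHIFT == 14 as in mainline (one tree per 2^14 swap
 * slots, i.e. 64MB of 4K pages): a swap entry of type 1 and offset 0x12345
 * maps to zswap_trees[1][0x12345 >> 14], i.e. tree 4 of the
 * nr_zswap_trees[1] trees for that swap device, spreading tree-lock
 * contention across offset ranges instead of one tree per swap type.
 */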
#define zswap_pool_debug(msg, p) \
pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \
zpool_get_type((p)->zpools[0]))
-static int zswap_writeback_entry(struct zswap_entry *entry,
- struct zswap_tree *tree);
-static int zswap_pool_get(struct zswap_pool *pool);
-static void zswap_pool_put(struct zswap_pool *pool);
-
static bool zswap_is_full(void)
{
return totalram_pages() * zswap_max_pool_percent / 100 <
@@ -313,717 +304,10 @@ static void zswap_update_total_size(void)
zswap_pool_total_size = total;
}
-/* should be called under RCU */
-#ifdef CONFIG_MEMCG
-static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry)
-{
- return entry->objcg ? obj_cgroup_memcg(entry->objcg) : NULL;
-}
-#else
-static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry)
-{
- return NULL;
-}
-#endif
-
-static inline int entry_to_nid(struct zswap_entry *entry)
-{
- return page_to_nid(virt_to_page(entry));
-}
-
-void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg)
-{
- struct zswap_pool *pool;
-
- /* lock out zswap pools list modification */
- spin_lock(&zswap_pools_lock);
- list_for_each_entry(pool, &zswap_pools, list) {
- if (pool->next_shrink == memcg)
- pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
- }
- spin_unlock(&zswap_pools_lock);
-}
-
-/*********************************
-* zswap entry functions
-**********************************/
-static struct kmem_cache *zswap_entry_cache;
-
-static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid)
-{
- struct zswap_entry *entry;
- entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid);
- if (!entry)
- return NULL;
- entry->refcount = 1;
- RB_CLEAR_NODE(&entry->rbnode);
- return entry;
-}
-
-static void zswap_entry_cache_free(struct zswap_entry *entry)
-{
- kmem_cache_free(zswap_entry_cache, entry);
-}
-
-/*********************************
-* zswap lruvec functions
-**********************************/
-void zswap_lruvec_state_init(struct lruvec *lruvec)
-{
- atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0);
-}
-
-void zswap_folio_swapin(struct folio *folio)
-{
- struct lruvec *lruvec;
-
- VM_WARN_ON_ONCE(!folio_test_locked(folio));
- lruvec = folio_lruvec(folio);
- atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected);
-}
-
-/*********************************
-* lru functions
-**********************************/
-static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
-{
- atomic_long_t *nr_zswap_protected;
- unsigned long lru_size, old, new;
- int nid = entry_to_nid(entry);
- struct mem_cgroup *memcg;
- struct lruvec *lruvec;
-
- /*
- * Note that it is safe to use rcu_read_lock() here, even in the face of
- * concurrent memcg offlining. Thanks to the memcg->kmemcg_id indirection
- * used in list_lru lookup, only two scenarios are possible:
- *
- * 1. list_lru_add() is called before memcg->kmemcg_id is updated. The
- * new entry will be reparented to memcg's parent's list_lru.
- * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The
- * new entry will be added directly to memcg's parent's list_lru.
- *
- * Similar reasoning holds for list_lru_del() and list_lru_putback().
- */
- rcu_read_lock();
- memcg = mem_cgroup_from_entry(entry);
- /* will always succeed */
- list_lru_add(list_lru, &entry->lru, nid, memcg);
-
- /* Update the protection area */
- lru_size = list_lru_count_one(list_lru, nid, memcg);
- lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
- nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected;
- old = atomic_long_inc_return(nr_zswap_protected);
- /*
- * Decay to avoid overflow and adapt to changing workloads.
- * This is based on LRU reclaim cost decaying heuristics.
- */
- do {
- new = old > lru_size / 4 ? old / 2 : old;
- } while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new));
- rcu_read_unlock();
-}
-
-static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry)
-{
- int nid = entry_to_nid(entry);
- struct mem_cgroup *memcg;
-
- rcu_read_lock();
- memcg = mem_cgroup_from_entry(entry);
- /* will always succeed */
- list_lru_del(list_lru, &entry->lru, nid, memcg);
- rcu_read_unlock();
-}
-
-static void zswap_lru_putback(struct list_lru *list_lru,
- struct zswap_entry *entry)
-{
- int nid = entry_to_nid(entry);
- spinlock_t *lock = &list_lru->node[nid].lock;
- struct mem_cgroup *memcg;
- struct lruvec *lruvec;
-
- rcu_read_lock();
- memcg = mem_cgroup_from_entry(entry);
- spin_lock(lock);
- /* we cannot use list_lru_add here, because it increments node's lru count */
- list_lru_putback(list_lru, &entry->lru, nid, memcg);
- spin_unlock(lock);
-
- lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(entry_to_nid(entry)));
- /* increment the protection area to account for the LRU rotation. */
- atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected);
- rcu_read_unlock();
-}
-
-/*********************************
-* rbtree functions
-**********************************/
-static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
-{
- struct rb_node *node = root->rb_node;
- struct zswap_entry *entry;
- pgoff_t entry_offset;
-
- while (node) {
- entry = rb_entry(node, struct zswap_entry, rbnode);
- entry_offset = swp_offset(entry->swpentry);
- if (entry_offset > offset)
- node = node->rb_left;
- else if (entry_offset < offset)
- node = node->rb_right;
- else
- return entry;
- }
- return NULL;
-}
-
-/*
- * In the case that a entry with the same offset is found, a pointer to
- * the existing entry is stored in dupentry and the function returns -EEXIST
- */
-static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
- struct zswap_entry **dupentry)
-{
- struct rb_node **link = &root->rb_node, *parent = NULL;
- struct zswap_entry *myentry;
- pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry);
-
- while (*link) {
- parent = *link;
- myentry = rb_entry(parent, struct zswap_entry, rbnode);
- myentry_offset = swp_offset(myentry->swpentry);
- if (myentry_offset > entry_offset)
- link = &(*link)->rb_left;
- else if (myentry_offset < entry_offset)
- link = &(*link)->rb_right;
- else {
- *dupentry = myentry;
- return -EEXIST;
- }
- }
- rb_link_node(&entry->rbnode, parent, link);
- rb_insert_color(&entry->rbnode, root);
- return 0;
-}
-
-static bool zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
-{
- if (!RB_EMPTY_NODE(&entry->rbnode)) {
- rb_erase(&entry->rbnode, root);
- RB_CLEAR_NODE(&entry->rbnode);
- return true;
- }
- return false;
-}
-
-static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
-{
- int i = 0;
-
- if (ZSWAP_NR_ZPOOLS > 1)
- i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS));
-
- return entry->pool->zpools[i];
-}
-
-/*
- * Carries out the common pattern of freeing and entry's zpool allocation,
- * freeing the entry itself, and decrementing the number of stored pages.
- */
-static void zswap_free_entry(struct zswap_entry *entry)
-{
- if (!entry->length)
- atomic_dec(&zswap_same_filled_pages);
- else {
- zswap_lru_del(&entry->pool->list_lru, entry);
- zpool_free(zswap_find_zpool(entry), entry->handle);
- atomic_dec(&entry->pool->nr_stored);
- zswap_pool_put(entry->pool);
- }
- if (entry->objcg) {
- obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
- obj_cgroup_put(entry->objcg);
- }
- zswap_entry_cache_free(entry);
- atomic_dec(&zswap_stored_pages);
- zswap_update_total_size();
-}
-
-/* caller must hold the tree lock */
-static void zswap_entry_get(struct zswap_entry *entry)
-{
- entry->refcount++;
-}
-
-/* caller must hold the tree lock
-* remove from the tree and free it, if nobody reference the entry
-*/
-static void zswap_entry_put(struct zswap_tree *tree,
- struct zswap_entry *entry)
-{
- int refcount = --entry->refcount;
-
- WARN_ON_ONCE(refcount < 0);
- if (refcount == 0) {
- WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode));
- zswap_free_entry(entry);
- }
-}
-
-/* caller must hold the tree lock */
-static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
- pgoff_t offset)
-{
- struct zswap_entry *entry;
-
- entry = zswap_rb_search(root, offset);
- if (entry)
- zswap_entry_get(entry);
-
- return entry;
-}
-
-/*********************************
-* shrinker functions
-**********************************/
-static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
- spinlock_t *lock, void *arg);
-
-static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
- struct shrink_control *sc)
-{
- struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
- unsigned long shrink_ret, nr_protected, lru_size;
- struct zswap_pool *pool = shrinker->private_data;
- bool encountered_page_in_swapcache = false;
-
- if (!zswap_shrinker_enabled ||
- !mem_cgroup_zswap_writeback_enabled(sc->memcg)) {
- sc->nr_scanned = 0;
- return SHRINK_STOP;
- }
-
- nr_protected =
- atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
- lru_size = list_lru_shrink_count(&pool->list_lru, sc);
-
- /*
- * Abort if we are shrinking into the protected region.
- *
- * This short-circuiting is necessary because if we have too many multiple
- * concurrent reclaimers getting the freeable zswap object counts at the
- * same time (before any of them made reasonable progress), the total
- * number of reclaimed objects might be more than the number of unprotected
- * objects (i.e the reclaimers will reclaim into the protected area of the
- * zswap LRU).
- */
- if (nr_protected >= lru_size - sc->nr_to_scan) {
- sc->nr_scanned = 0;
- return SHRINK_STOP;
- }
-
- shrink_ret = list_lru_shrink_walk(&pool->list_lru, sc, &shrink_memcg_cb,
- &encountered_page_in_swapcache);
-
- if (encountered_page_in_swapcache)
- return SHRINK_STOP;
-
- return shrink_ret ? shrink_ret : SHRINK_STOP;
-}
-
-static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
- struct shrink_control *sc)
-{
- struct zswap_pool *pool = shrinker->private_data;
- struct mem_cgroup *memcg = sc->memcg;
- struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid));
- unsigned long nr_backing, nr_stored, nr_freeable, nr_protected;
-
- if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg))
- return 0;
-
- /*
- * The shrinker resumes swap writeback, which will enter block
- * and may enter fs. XXX: Harmonize with vmscan.c __GFP_FS
- * rules (may_enter_fs()), which apply on a per-folio basis.
- */
- if (!gfp_has_io_fs(sc->gfp_mask))
- return 0;
-
- /*
- * For memcg, use the cgroup-wide ZSWAP stats since we don't
- * have them per-node and thus per-lruvec. Careful if memcg is
- * runtime-disabled: we can get sc->memcg == NULL, which is ok
- * for the lruvec, but not for memcg_page_state().
- *
- * Without memcg, use the zswap pool-wide metrics.
- */
- if (!mem_cgroup_disabled()) {
- mem_cgroup_flush_stats(memcg);
- nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT;
- nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
- } else {
- nr_backing = get_zswap_pool_size(pool) >> PAGE_SHIFT;
- nr_stored = atomic_read(&pool->nr_stored);
- }
-
- if (!nr_stored)
- return 0;
-
- nr_protected =
- atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
- nr_freeable = list_lru_shrink_count(&pool->list_lru, sc);
- /*
- * Subtract the lru size by an estimate of the number of pages
- * that should be protected.
- */
- nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0;
-
- /*
- * Scale the number of freeable pages by the memory saving factor.
- * This ensures that the better zswap compresses memory, the fewer
- * pages we will evict to swap (as it will otherwise incur IO for
- * relatively small memory saving).
- */
- return mult_frac(nr_freeable, nr_backing, nr_stored);
-}
-
-static void zswap_alloc_shrinker(struct zswap_pool *pool)
-{
- pool->shrinker =
- shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-zswap");
- if (!pool->shrinker)
- return;
-
- pool->shrinker->private_data = pool;
- pool->shrinker->scan_objects = zswap_shrinker_scan;
- pool->shrinker->count_objects = zswap_shrinker_count;
- pool->shrinker->batch = 0;
- pool->shrinker->seeks = DEFAULT_SEEKS;
-}
-
-/*********************************
-* per-cpu code
-**********************************/
-static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
-{
- struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
- struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
- struct crypto_acomp *acomp;
- struct acomp_req *req;
- int ret;
-
- mutex_init(&acomp_ctx->mutex);
-
- acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
- if (!acomp_ctx->buffer)
- return -ENOMEM;
-
- acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
- if (IS_ERR(acomp)) {
- pr_err("could not alloc crypto acomp %s : %ld\n",
- pool->tfm_name, PTR_ERR(acomp));
- ret = PTR_ERR(acomp);
- goto acomp_fail;
- }
- acomp_ctx->acomp = acomp;
-
- req = acomp_request_alloc(acomp_ctx->acomp);
- if (!req) {
- pr_err("could not alloc crypto acomp_request %s\n",
- pool->tfm_name);
- ret = -ENOMEM;
- goto req_fail;
- }
- acomp_ctx->req = req;
-
- crypto_init_wait(&acomp_ctx->wait);
- /*
- * if the backend of acomp is async zip, crypto_req_done() will wakeup
- * crypto_wait_req(); if the backend of acomp is scomp, the callback
- * won't be called, crypto_wait_req() will return without blocking.
- */
- acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
- crypto_req_done, &acomp_ctx->wait);
-
- return 0;
-
-req_fail:
- crypto_free_acomp(acomp_ctx->acomp);
-acomp_fail:
- kfree(acomp_ctx->buffer);
- return ret;
-}
-
-static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
-{
- struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
- struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
-
- if (!IS_ERR_OR_NULL(acomp_ctx)) {
- if (!IS_ERR_OR_NULL(acomp_ctx->req))
- acomp_request_free(acomp_ctx->req);
- if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
- crypto_free_acomp(acomp_ctx->acomp);
- kfree(acomp_ctx->buffer);
- }
-
- return 0;
-}
-
/*********************************
* pool functions
**********************************/
-
-static struct zswap_pool *__zswap_pool_current(void)
-{
- struct zswap_pool *pool;
-
- pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
- WARN_ONCE(!pool && zswap_has_pool,
- "%s: no page storage pool!\n", __func__);
-
- return pool;
-}
-
-static struct zswap_pool *zswap_pool_current(void)
-{
- assert_spin_locked(&zswap_pools_lock);
-
- return __zswap_pool_current();
-}
-
-static struct zswap_pool *zswap_pool_current_get(void)
-{
- struct zswap_pool *pool;
-
- rcu_read_lock();
-
- pool = __zswap_pool_current();
- if (!zswap_pool_get(pool))
- pool = NULL;
-
- rcu_read_unlock();
-
- return pool;
-}
-
-static struct zswap_pool *zswap_pool_last_get(void)
-{
- struct zswap_pool *pool, *last = NULL;
-
- rcu_read_lock();
-
- list_for_each_entry_rcu(pool, &zswap_pools, list)
- last = pool;
- WARN_ONCE(!last && zswap_has_pool,
- "%s: no page storage pool!\n", __func__);
- if (!zswap_pool_get(last))
- last = NULL;
-
- rcu_read_unlock();
-
- return last;
-}
-
-/* type and compressor must be null-terminated */
-static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
-{
- struct zswap_pool *pool;
-
- assert_spin_locked(&zswap_pools_lock);
-
- list_for_each_entry_rcu(pool, &zswap_pools, list) {
- if (strcmp(pool->tfm_name, compressor))
- continue;
- /* all zpools share the same type */
- if (strcmp(zpool_get_type(pool->zpools[0]), type))
- continue;
- /* if we can't get it, it's about to be destroyed */
- if (!zswap_pool_get(pool))
- continue;
- return pool;
- }
-
- return NULL;
-}
-
-/*
- * If the entry is still valid in the tree, drop the initial ref and remove it
- * from the tree. This function must be called with an additional ref held,
- * otherwise it may race with another invalidation freeing the entry.
- */
-static void zswap_invalidate_entry(struct zswap_tree *tree,
- struct zswap_entry *entry)
-{
- if (zswap_rb_erase(&tree->rbroot, entry))
- zswap_entry_put(tree, entry);
-}
-
-static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
- spinlock_t *lock, void *arg)
-{
- struct zswap_entry *entry = container_of(item, struct zswap_entry, lru);
- bool *encountered_page_in_swapcache = (bool *)arg;
- struct zswap_tree *tree;
- pgoff_t swpoffset;
- enum lru_status ret = LRU_REMOVED_RETRY;
- int writeback_result;
-
- /*
- * Once the lru lock is dropped, the entry might get freed. The
- * swpoffset is copied to the stack, and entry isn't deref'd again
- * until the entry is verified to still be alive in the tree.
- */
- swpoffset = swp_offset(entry->swpentry);
- tree = zswap_trees[swp_type(entry->swpentry)];
- list_lru_isolate(l, item);
- /*
- * It's safe to drop the lock here because we return either
- * LRU_REMOVED_RETRY or LRU_RETRY.
- */
- spin_unlock(lock);
-
- /* Check for invalidate() race */
- spin_lock(&tree->lock);
- if (entry != zswap_rb_search(&tree->rbroot, swpoffset))
- goto unlock;
-
- /* Hold a reference to prevent a free during writeback */
- zswap_entry_get(entry);
- spin_unlock(&tree->lock);
-
- writeback_result = zswap_writeback_entry(entry, tree);
-
- spin_lock(&tree->lock);
- if (writeback_result) {
- zswap_reject_reclaim_fail++;
- zswap_lru_putback(&entry->pool->list_lru, entry);
- ret = LRU_RETRY;
-
- /*
- * Encountering a page already in swap cache is a sign that we are shrinking
- * into the warmer region. We should terminate shrinking (if we're in the dynamic
- * shrinker context).
- */
- if (writeback_result == -EEXIST && encountered_page_in_swapcache)
- *encountered_page_in_swapcache = true;
-
- goto put_unlock;
- }
- zswap_written_back_pages++;
-
- if (entry->objcg)
- count_objcg_event(entry->objcg, ZSWPWB);
-
- count_vm_event(ZSWPWB);
- /*
- * Writeback started successfully, the page now belongs to the
- * swapcache. Drop the entry from zswap - unless invalidate already
- * took it out while we had the tree->lock released for IO.
- */
- zswap_invalidate_entry(tree, entry);
-
-put_unlock:
- /* Drop local reference */
- zswap_entry_put(tree, entry);
-unlock:
- spin_unlock(&tree->lock);
- spin_lock(lock);
- return ret;
-}
-
-static int shrink_memcg(struct mem_cgroup *memcg)
-{
- struct zswap_pool *pool;
- int nid, shrunk = 0;
-
- if (!mem_cgroup_zswap_writeback_enabled(memcg))
- return -EINVAL;
-
- /*
- * Skip zombies because their LRUs are reparented and we would be
- * reclaiming from the parent instead of the dead memcg.
- */
- if (memcg && !mem_cgroup_online(memcg))
- return -ENOENT;
-
- pool = zswap_pool_current_get();
- if (!pool)
- return -EINVAL;
-
- for_each_node_state(nid, N_NORMAL_MEMORY) {
- unsigned long nr_to_walk = 1;
-
- shrunk += list_lru_walk_one(&pool->list_lru, nid, memcg,
- &shrink_memcg_cb, NULL, &nr_to_walk);
- }
- zswap_pool_put(pool);
- return shrunk ? 0 : -EAGAIN;
-}
-
-static void shrink_worker(struct work_struct *w)
-{
- struct zswap_pool *pool = container_of(w, typeof(*pool),
- shrink_work);
- struct mem_cgroup *memcg;
- int ret, failures = 0;
-
- /* global reclaim will select cgroup in a round-robin fashion. */
- do {
- spin_lock(&zswap_pools_lock);
- pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
- memcg = pool->next_shrink;
-
- /*
- * We need to retry if we have gone through a full round trip, or if we
- * got an offline memcg (or else we risk undoing the effect of the
- * zswap memcg offlining cleanup callback). This is not catastrophic
- * per se, but it will keep the now offlined memcg hostage for a while.
- *
- * Note that if we got an online memcg, we will keep the extra
- * reference in case the original reference obtained by mem_cgroup_iter
- * is dropped by the zswap memcg offlining callback, ensuring that the
- * memcg is not killed when we are reclaiming.
- */
- if (!memcg) {
- spin_unlock(&zswap_pools_lock);
- if (++failures == MAX_RECLAIM_RETRIES)
- break;
-
- goto resched;
- }
-
- if (!mem_cgroup_tryget_online(memcg)) {
- /* drop the reference from mem_cgroup_iter() */
- mem_cgroup_iter_break(NULL, memcg);
- pool->next_shrink = NULL;
- spin_unlock(&zswap_pools_lock);
-
- if (++failures == MAX_RECLAIM_RETRIES)
- break;
-
- goto resched;
- }
- spin_unlock(&zswap_pools_lock);
-
- ret = shrink_memcg(memcg);
- /* drop the extra reference */
- mem_cgroup_put(memcg);
-
- if (ret == -EINVAL)
- break;
- if (ret && ++failures == MAX_RECLAIM_RETRIES)
- break;
-
-resched:
- cond_resched();
- } while (!zswap_can_accept());
- zswap_pool_put(pool);
-}
+static void __zswap_pool_empty(struct percpu_ref *ref);
static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
{
@@ -1074,30 +358,21 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
if (ret)
goto error;
- zswap_alloc_shrinker(pool);
- if (!pool->shrinker)
- goto error;
-
- pr_debug("using %s compressor\n", pool->tfm_name);
-
/* being the current pool takes 1 ref; this func expects the
* caller to always add the new pool as the current pool
*/
- kref_init(&pool->kref);
+ ret = percpu_ref_init(&pool->ref, __zswap_pool_empty,
+ PERCPU_REF_ALLOW_REINIT, GFP_KERNEL);
+ if (ret)
+ goto ref_fail;
INIT_LIST_HEAD(&pool->list);
- if (list_lru_init_memcg(&pool->list_lru, pool->shrinker))
- goto lru_fail;
- shrinker_register(pool->shrinker);
- INIT_WORK(&pool->shrink_work, shrink_worker);
- atomic_set(&pool->nr_stored, 0);
zswap_pool_debug("created", pool);
return pool;
-lru_fail:
- list_lru_destroy(&pool->list_lru);
- shrinker_free(pool->shrinker);
+ref_fail:
+ cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
error:
if (pool->acomp_ctx)
free_percpu(pool->acomp_ctx);
@@ -1155,29 +430,14 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
zswap_pool_debug("destroying", pool);
- shrinker_free(pool->shrinker);
cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
free_percpu(pool->acomp_ctx);
- list_lru_destroy(&pool->list_lru);
-
- spin_lock(&zswap_pools_lock);
- mem_cgroup_iter_break(NULL, pool->next_shrink);
- pool->next_shrink = NULL;
- spin_unlock(&zswap_pools_lock);
for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
zpool_destroy_pool(pool->zpools[i]);
kfree(pool);
}
-static int __must_check zswap_pool_get(struct zswap_pool *pool)
-{
- if (!pool)
- return 0;
-
- return kref_get_unless_zero(&pool->kref);
-}
-
static void __zswap_pool_release(struct work_struct *work)
{
struct zswap_pool *pool = container_of(work, typeof(*pool),
@@ -1185,20 +445,23 @@ static void __zswap_pool_release(struct work_struct *work)
synchronize_rcu();
- /* nobody should have been able to get a kref... */
- WARN_ON(kref_get_unless_zero(&pool->kref));
+ /* nobody should have been able to get a ref... */
+ WARN_ON(!percpu_ref_is_zero(&pool->ref));
+ percpu_ref_exit(&pool->ref);
/* pool is now off zswap_pools list and has no references. */
zswap_pool_destroy(pool);
}
-static void __zswap_pool_empty(struct kref *kref)
+static struct zswap_pool *zswap_pool_current(void);
+
+static void __zswap_pool_empty(struct percpu_ref *ref)
{
struct zswap_pool *pool;
- pool = container_of(kref, typeof(*pool), kref);
+ pool = container_of(ref, typeof(*pool), ref);
- spin_lock(&zswap_pools_lock);
+ spin_lock_bh(&zswap_pools_lock);
WARN_ON(pool == zswap_pool_current());
@@ -1207,12 +470,75 @@ static void __zswap_pool_empty(struct kref *kref)
INIT_WORK(&pool->release_work, __zswap_pool_release);
schedule_work(&pool->release_work);
- spin_unlock(&zswap_pools_lock);
+ spin_unlock_bh(&zswap_pools_lock);
+}
+
+static int __must_check zswap_pool_get(struct zswap_pool *pool)
+{
+ if (!pool)
+ return 0;
+
+ return percpu_ref_tryget(&pool->ref);
}
static void zswap_pool_put(struct zswap_pool *pool)
{
- kref_put(&pool->kref, __zswap_pool_empty);
+ percpu_ref_put(&pool->ref);
+}
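/*
 * A rough sketch of the percpu_ref lifecycle that replaces the old kref,
 * using only the calls visible in this diff (init in zswap_pool_create(),
 * kill when a pool is retired by a parameter change, resurrect if the same
 * pool is selected again):
 *
 *	percpu_ref_init(&pool->ref, __zswap_pool_empty,
 *			PERCPU_REF_ALLOW_REINIT, GFP_KERNEL);
 *	...
 *	if (percpu_ref_tryget(&pool->ref))	/* zswap_pool_get() */
 *		percpu_ref_put(&pool->ref);	/* zswap_pool_put() */
 *	...
 *	percpu_ref_kill(&pool->ref);		/* drop the initial ref; once all
 *						 * users drain, __zswap_pool_empty()
 *						 * schedules __zswap_pool_release() */
 *	percpu_ref_resurrect(&pool->ref);	/* only if the pool is re-adopted */
 */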
+
+static struct zswap_pool *__zswap_pool_current(void)
+{
+ struct zswap_pool *pool;
+
+ pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
+ WARN_ONCE(!pool && zswap_has_pool,
+ "%s: no page storage pool!\n", __func__);
+
+ return pool;
+}
+
+static struct zswap_pool *zswap_pool_current(void)
+{
+ assert_spin_locked(&zswap_pools_lock);
+
+ return __zswap_pool_current();
+}
+
+static struct zswap_pool *zswap_pool_current_get(void)
+{
+ struct zswap_pool *pool;
+
+ rcu_read_lock();
+
+ pool = __zswap_pool_current();
+ if (!zswap_pool_get(pool))
+ pool = NULL;
+
+ rcu_read_unlock();
+
+ return pool;
+}
+
+/* type and compressor must be null-terminated */
+static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
+{
+ struct zswap_pool *pool;
+
+ assert_spin_locked(&zswap_pools_lock);
+
+ list_for_each_entry_rcu(pool, &zswap_pools, list) {
+ if (strcmp(pool->tfm_name, compressor))
+ continue;
+ /* all zpools share the same type */
+ if (strcmp(zpool_get_type(pool->zpools[0]), type))
+ continue;
+ /* if we can't get it, it's about to be destroyed */
+ if (!zswap_pool_get(pool))
+ continue;
+ return pool;
+ }
+
+ return NULL;
}
/*********************************
@@ -1274,7 +600,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
return -EINVAL;
}
- spin_lock(&zswap_pools_lock);
+ spin_lock_bh(&zswap_pools_lock);
pool = zswap_pool_find_get(type, compressor);
if (pool) {
@@ -1283,17 +609,28 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
list_del_rcu(&pool->list);
}
- spin_unlock(&zswap_pools_lock);
+ spin_unlock_bh(&zswap_pools_lock);
if (!pool)
pool = zswap_pool_create(type, compressor);
+ else {
+ /*
+ * Restore the initial ref dropped by percpu_ref_kill()
+ * when the pool was decommissioned and switch it again
+ * to percpu mode.
+ */
+ percpu_ref_resurrect(&pool->ref);
+
+ /* Drop the ref from zswap_pool_find_get(). */
+ zswap_pool_put(pool);
+ }
if (pool)
ret = param_set_charp(s, kp);
else
ret = -EINVAL;
- spin_lock(&zswap_pools_lock);
+ spin_lock_bh(&zswap_pools_lock);
if (!ret) {
put_pool = zswap_pool_current();
@@ -1308,7 +645,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
put_pool = pool;
}
- spin_unlock(&zswap_pools_lock);
+ spin_unlock_bh(&zswap_pools_lock);
if (!zswap_has_pool && !pool) {
/* if initial pool creation failed, and this pool creation also
@@ -1325,7 +662,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
* or the new pool we failed to add
*/
if (put_pool)
- zswap_pool_put(put_pool);
+ percpu_ref_kill(&put_pool->ref);
return ret;
}
@@ -1371,7 +708,368 @@ static int zswap_enabled_param_set(const char *val,
return ret;
}
-static void __zswap_load(struct zswap_entry *entry, struct page *page)
+/*********************************
+* lru functions
+**********************************/
+
+/* should be called under RCU */
+#ifdef CONFIG_MEMCG
+static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry)
+{
+ return entry->objcg ? obj_cgroup_memcg(entry->objcg) : NULL;
+}
+#else
+static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry)
+{
+ return NULL;
+}
+#endif
+
+static inline int entry_to_nid(struct zswap_entry *entry)
+{
+ return page_to_nid(virt_to_page(entry));
+}
+
+static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
+{
+ atomic_long_t *nr_zswap_protected;
+ unsigned long lru_size, old, new;
+ int nid = entry_to_nid(entry);
+ struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
+
+ /*
+ * Note that it is safe to use rcu_read_lock() here, even in the face of
+ * concurrent memcg offlining. Thanks to the memcg->kmemcg_id indirection
+ * used in list_lru lookup, only two scenarios are possible:
+ *
+ * 1. list_lru_add() is called before memcg->kmemcg_id is updated. The
+ * new entry will be reparented to memcg's parent's list_lru.
+ * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The
+ * new entry will be added directly to memcg's parent's list_lru.
+ *
+ * Similar reasoning holds for list_lru_del().
+ */
+ rcu_read_lock();
+ memcg = mem_cgroup_from_entry(entry);
+ /* will always succeed */
+ list_lru_add(list_lru, &entry->lru, nid, memcg);
+
+ /* Update the protection area */
+ lru_size = list_lru_count_one(list_lru, nid, memcg);
+ lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+ nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected;
+ old = atomic_long_inc_return(nr_zswap_protected);
+ /*
+ * Decay to avoid overflow and adapt to changing workloads.
+ * This is based on LRU reclaim cost decaying heuristics.
+ */
+ do {
+ new = old > lru_size / 4 ? old / 2 : old;
+ } while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new));
+ rcu_read_unlock();
+}
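/*
 * Worked example of the decay above (illustrative numbers only): with
 * lru_size == 1024, nr_zswap_protected grows by one per zswap_lru_add();
 * once the incremented value exceeds lru_size / 4 == 256, the cmpxchg loop
 * halves it in the same call (e.g. 300 -> 150), so the protected estimate
 * tracks recent stores without overflowing or marking the entire LRU as
 * protected.
 */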
+
+static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry)
+{
+ int nid = entry_to_nid(entry);
+ struct mem_cgroup *memcg;
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_entry(entry);
+ /* will always succeed */
+ list_lru_del(list_lru, &entry->lru, nid, memcg);
+ rcu_read_unlock();
+}
+
+void zswap_lruvec_state_init(struct lruvec *lruvec)
+{
+ atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0);
+}
+
+void zswap_folio_swapin(struct folio *folio)
+{
+ struct lruvec *lruvec;
+
+ if (folio) {
+ lruvec = folio_lruvec(folio);
+ atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected);
+ }
+}
+
+void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg)
+{
+ /* lock out zswap shrinker walking memcg tree */
+ spin_lock(&zswap_shrink_lock);
+ if (zswap_next_shrink == memcg)
+ zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL);
+ spin_unlock(&zswap_shrink_lock);
+}
+
+/*********************************
+* rbtree functions
+**********************************/
+static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
+{
+ struct rb_node *node = root->rb_node;
+ struct zswap_entry *entry;
+ pgoff_t entry_offset;
+
+ while (node) {
+ entry = rb_entry(node, struct zswap_entry, rbnode);
+ entry_offset = swp_offset(entry->swpentry);
+ if (entry_offset > offset)
+ node = node->rb_left;
+ else if (entry_offset < offset)
+ node = node->rb_right;
+ else
+ return entry;
+ }
+ return NULL;
+}
+
+/*
+ * If an entry with the same offset is found, a pointer to the existing
+ * entry is stored in dupentry and the function returns -EEXIST.
+ */
+static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
+ struct zswap_entry **dupentry)
+{
+ struct rb_node **link = &root->rb_node, *parent = NULL;
+ struct zswap_entry *myentry;
+ pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry);
+
+ while (*link) {
+ parent = *link;
+ myentry = rb_entry(parent, struct zswap_entry, rbnode);
+ myentry_offset = swp_offset(myentry->swpentry);
+ if (myentry_offset > entry_offset)
+ link = &(*link)->rb_left;
+ else if (myentry_offset < entry_offset)
+ link = &(*link)->rb_right;
+ else {
+ *dupentry = myentry;
+ return -EEXIST;
+ }
+ }
+ rb_link_node(&entry->rbnode, parent, link);
+ rb_insert_color(&entry->rbnode, root);
+ return 0;
+}
+
+static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
+{
+ rb_erase(&entry->rbnode, root);
+ RB_CLEAR_NODE(&entry->rbnode);
+}
+
+/*********************************
+* zswap entry functions
+**********************************/
+static struct kmem_cache *zswap_entry_cache;
+
+static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid)
+{
+ struct zswap_entry *entry;
+ entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid);
+ if (!entry)
+ return NULL;
+ RB_CLEAR_NODE(&entry->rbnode);
+ return entry;
+}
+
+static void zswap_entry_cache_free(struct zswap_entry *entry)
+{
+ kmem_cache_free(zswap_entry_cache, entry);
+}
+
+static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
+{
+ int i = 0;
+
+ if (ZSWAP_NR_ZPOOLS > 1)
+ i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS));
+
+ return entry->pool->zpools[i];
+}
+
+/*
+ * Carries out the common pattern of freeing an entry's zpool allocation,
+ * freeing the entry itself, and decrementing the number of stored pages.
+ */
+static void zswap_entry_free(struct zswap_entry *entry)
+{
+ if (!entry->length)
+ atomic_dec(&zswap_same_filled_pages);
+ else {
+ zswap_lru_del(&zswap_list_lru, entry);
+ zpool_free(zswap_find_zpool(entry), entry->handle);
+ atomic_dec(&zswap_nr_stored);
+ zswap_pool_put(entry->pool);
+ }
+ if (entry->objcg) {
+ obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
+ obj_cgroup_put(entry->objcg);
+ }
+ zswap_entry_cache_free(entry);
+ atomic_dec(&zswap_stored_pages);
+ zswap_update_total_size();
+}
+
+/*
+ * The caller holds the tree lock and has found the entry in the tree,
+ * so it must be on the tree; remove it from the tree and free it.
+ */
+static void zswap_invalidate_entry(struct zswap_tree *tree,
+ struct zswap_entry *entry)
+{
+ zswap_rb_erase(&tree->rbroot, entry);
+ zswap_entry_free(entry);
+}
+
+/*********************************
+* compressed storage functions
+**********************************/
+static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
+{
+ struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
+ struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
+ struct crypto_acomp *acomp;
+ struct acomp_req *req;
+ int ret;
+
+ mutex_init(&acomp_ctx->mutex);
+
+ acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
+ if (!acomp_ctx->buffer)
+ return -ENOMEM;
+
+ acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
+ if (IS_ERR(acomp)) {
+ pr_err("could not alloc crypto acomp %s : %ld\n",
+ pool->tfm_name, PTR_ERR(acomp));
+ ret = PTR_ERR(acomp);
+ goto acomp_fail;
+ }
+ acomp_ctx->acomp = acomp;
+ acomp_ctx->is_sleepable = acomp_is_async(acomp);
+
+ req = acomp_request_alloc(acomp_ctx->acomp);
+ if (!req) {
+ pr_err("could not alloc crypto acomp_request %s\n",
+ pool->tfm_name);
+ ret = -ENOMEM;
+ goto req_fail;
+ }
+ acomp_ctx->req = req;
+
+ crypto_init_wait(&acomp_ctx->wait);
+ /*
+ * if the backend of acomp is async zip, crypto_req_done() will wakeup
+ * crypto_wait_req(); if the backend of acomp is scomp, the callback
+ * won't be called, crypto_wait_req() will return without blocking.
+ */
+ acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+ crypto_req_done, &acomp_ctx->wait);
+
+ return 0;
+
+req_fail:
+ crypto_free_acomp(acomp_ctx->acomp);
+acomp_fail:
+ kfree(acomp_ctx->buffer);
+ return ret;
+}
+
+static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
+{
+ struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
+ struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
+
+ if (!IS_ERR_OR_NULL(acomp_ctx)) {
+ if (!IS_ERR_OR_NULL(acomp_ctx->req))
+ acomp_request_free(acomp_ctx->req);
+ if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
+ crypto_free_acomp(acomp_ctx->acomp);
+ kfree(acomp_ctx->buffer);
+ }
+
+ return 0;
+}
+
+static bool zswap_compress(struct folio *folio, struct zswap_entry *entry)
+{
+ struct crypto_acomp_ctx *acomp_ctx;
+ struct scatterlist input, output;
+ int comp_ret = 0, alloc_ret = 0;
+ unsigned int dlen = PAGE_SIZE;
+ unsigned long handle;
+ struct zpool *zpool;
+ char *buf;
+ gfp_t gfp;
+ u8 *dst;
+
+ acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
+
+ mutex_lock(&acomp_ctx->mutex);
+
+ dst = acomp_ctx->buffer;
+ sg_init_table(&input, 1);
+ sg_set_page(&input, &folio->page, PAGE_SIZE, 0);
+
+ /*
+	 * We need PAGE_SIZE * 2 here since over-compression is possible, and
+	 * hardware accelerators may not check the dst buffer size, so give the
+	 * dst buffer enough length to avoid a buffer overflow.
+ */
+ sg_init_one(&output, dst, PAGE_SIZE * 2);
+ acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
+
+ /*
+	 * It may look a little silly that we send an asynchronous request and
+	 * then wait for its completion synchronously; in effect the operation
+	 * is synchronous.
+	 * In theory, acomp lets users submit multiple requests on one acomp
+	 * instance and have them completed concurrently. But zswap stores and
+	 * loads page by page: within a single thread doing zswap there is no
+	 * way to send a second page before the first one is done.
+	 * Different threads running on different CPUs use different acomp
+	 * instances, however, so multiple threads can (de)compress in parallel.
+ */
+ comp_ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
+ dlen = acomp_ctx->req->dlen;
+ if (comp_ret)
+ goto unlock;
+
+ zpool = zswap_find_zpool(entry);
+ gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
+ if (zpool_malloc_support_movable(zpool))
+ gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
+ alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle);
+ if (alloc_ret)
+ goto unlock;
+
+ buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
+ memcpy(buf, dst, dlen);
+ zpool_unmap_handle(zpool, handle);
+
+ entry->handle = handle;
+ entry->length = dlen;
+
+unlock:
+ if (comp_ret == -ENOSPC || alloc_ret == -ENOSPC)
+ zswap_reject_compress_poor++;
+ else if (comp_ret)
+ zswap_reject_compress_fail++;
+ else if (alloc_ret)
+ zswap_reject_alloc_fail++;
+
+ mutex_unlock(&acomp_ctx->mutex);
+ return comp_ret == 0 && alloc_ret == 0;
+}
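
zswap_compress() above hands the compressor a destination buffer of PAGE_SIZE * 2 because compressing incompressible data can produce output larger than the input, and such over-compression is accounted as zswap_reject_compress_poor. A small userspace illustration of that worst case using zlib (zswap's compressors are kernel crypto acomp drivers, not zlib; this only shows that the worst-case bound exceeds the page size). Build with: cc demo.c -lz

#include <stdio.h>
#include <zlib.h>

int main(void)
{
        unsigned long page = 4096;

        /*
         * compressBound() reports the worst-case compressed size for a
         * given input length; it is slightly larger than the input, which
         * is exactly the over-compression case the dst buffer must absorb.
         */
        printf("input %lu bytes, worst-case output %lu bytes\n",
               page, (unsigned long)compressBound(page));
        return 0;
}
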
+
+static void zswap_decompress(struct zswap_entry *entry, struct page *page)
{
struct zpool *zpool = zswap_find_zpool(entry);
struct scatterlist input, output;
@@ -1382,7 +1080,17 @@ static void __zswap_load(struct zswap_entry *entry, struct page *page)
mutex_lock(&acomp_ctx->mutex);
src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
- if (!zpool_can_sleep_mapped(zpool)) {
+ /*
+	 * If zpool_map_handle() is atomic, we cannot reliably use its mapped
+	 * buffer for crypto_acomp_decompress(), which might sleep. In that
+	 * case we must copy the buffer into a temporary one.
+	 * zpool_map_handle() may also return a non-linearly mapped buffer,
+	 * such as a kmap address of high memory or even a vmap address, while
+	 * sg_init_one() can only handle linearly mapped low memory. In that
+	 * case, too, we must copy the buffer into a temporary, lowmem one.
+ */
+ if ((acomp_ctx->is_sleepable && !zpool_can_sleep_mapped(zpool)) ||
+ !virt_addr_valid(src)) {
memcpy(acomp_ctx->buffer, src, entry->length);
src = acomp_ctx->buffer;
zpool_unmap_handle(zpool, entry->handle);
@@ -1396,7 +1104,7 @@ static void __zswap_load(struct zswap_entry *entry, struct page *page)
BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE);
mutex_unlock(&acomp_ctx->mutex);
- if (zpool_can_sleep_mapped(zpool))
+ if (src != acomp_ctx->buffer)
zpool_unmap_handle(zpool, entry->handle);
}
@@ -1416,9 +1124,9 @@ static void __zswap_load(struct zswap_entry *entry, struct page *page)
* freed.
*/
static int zswap_writeback_entry(struct zswap_entry *entry,
- struct zswap_tree *tree)
+ swp_entry_t swpentry)
{
- swp_entry_t swpentry = entry->swpentry;
+ struct zswap_tree *tree;
struct folio *folio;
struct mempolicy *mpol;
bool folio_was_allocated;
@@ -1434,9 +1142,11 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
return -ENOMEM;
/*
- * Found an existing folio, we raced with load/swapin. We generally
- * writeback cold folios from zswap, and swapin means the folio just
- * became hot. Skip this folio and let the caller find another one.
+ * Found an existing folio, we raced with swapin or concurrent
+ * shrinker. We generally writeback cold folios from zswap, and
+ * swapin means the folio just became hot, so skip this folio.
+	 * In the unlikely case of a concurrent shrinker, the entry will be
+	 * unlinked and freed when invalidated by that shrinker anyway.
*/
if (!folio_was_allocated) {
folio_put(folio);
@@ -1445,22 +1155,34 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
/*
* folio is locked, and the swapcache is now secured against
- * concurrent swapping to and from the slot. Verify that the
- * swap entry hasn't been invalidated and recycled behind our
- * backs (our zswap_entry reference doesn't prevent that), to
- * avoid overwriting a new swap folio with old compressed data.
+	 * concurrent swapping to and from the slot, and against concurrent
+	 * swapoff, so we can safely dereference the zswap tree here.
+ * Verify that the swap entry hasn't been invalidated and recycled
+ * behind our backs, to avoid overwriting a new swap folio with
+ * old compressed data. Only when this is successful can the entry
+ * be dereferenced.
*/
+ tree = swap_zswap_tree(swpentry);
spin_lock(&tree->lock);
- if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) {
+ if (zswap_rb_search(&tree->rbroot, swp_offset(swpentry)) != entry) {
spin_unlock(&tree->lock);
delete_from_swap_cache(folio);
folio_unlock(folio);
folio_put(folio);
return -ENOMEM;
}
+
+ /* Safe to deref entry after the entry is verified above. */
+ zswap_rb_erase(&tree->rbroot, entry);
spin_unlock(&tree->lock);
- __zswap_load(entry, &folio->page);
+ zswap_decompress(entry, &folio->page);
+
+ count_vm_event(ZSWPWB);
+ if (entry->objcg)
+ count_objcg_event(entry->objcg, ZSWPWB);
+
+ zswap_entry_free(entry);
/* folio is up to date */
folio_mark_uptodate(folio);
@@ -1475,6 +1197,274 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
return 0;
}
+/*********************************
+* shrinker functions
+**********************************/
+static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
+ spinlock_t *lock, void *arg)
+{
+ struct zswap_entry *entry = container_of(item, struct zswap_entry, lru);
+ bool *encountered_page_in_swapcache = (bool *)arg;
+ swp_entry_t swpentry;
+ enum lru_status ret = LRU_REMOVED_RETRY;
+ int writeback_result;
+
+ /*
+ * As soon as we drop the LRU lock, the entry can be freed by
+ * a concurrent invalidation. This means the following:
+ *
+ * 1. We extract the swp_entry_t to the stack, allowing
+ * zswap_writeback_entry() to pin the swap entry and
+	 *    then validate the zswap entry against that swap entry's
+ * tree using pointer value comparison. Only when that
+ * is successful can the entry be dereferenced.
+ *
+ * 2. Usually, objects are taken off the LRU for reclaim. In
+ * this case this isn't possible, because if reclaim fails
+ * for whatever reason, we have no means of knowing if the
+ * entry is alive to put it back on the LRU.
+ *
+ * So rotate it before dropping the lock. If the entry is
+ * written back or invalidated, the free path will unlink
+ * it. For failures, rotation is the right thing as well.
+ *
+ * Temporary failures, where the same entry should be tried
+ * again immediately, almost never happen for this shrinker.
+ * We don't do any trylocking; -ENOMEM comes closest,
+ * but that's extremely rare and doesn't happen spuriously
+ * either. Don't bother distinguishing this case.
+ */
+ list_move_tail(item, &l->list);
+
+ /*
+ * Once the lru lock is dropped, the entry might get freed. The
+ * swpentry is copied to the stack, and entry isn't deref'd again
+ * until the entry is verified to still be alive in the tree.
+ */
+ swpentry = entry->swpentry;
+
+ /*
+ * It's safe to drop the lock here because we return either
+ * LRU_REMOVED_RETRY or LRU_RETRY.
+ */
+ spin_unlock(lock);
+
+ writeback_result = zswap_writeback_entry(entry, swpentry);
+
+ if (writeback_result) {
+ zswap_reject_reclaim_fail++;
+ ret = LRU_RETRY;
+
+ /*
+ * Encountering a page already in swap cache is a sign that we are shrinking
+ * into the warmer region. We should terminate shrinking (if we're in the dynamic
+ * shrinker context).
+ */
+ if (writeback_result == -EEXIST && encountered_page_in_swapcache) {
+ ret = LRU_STOP;
+ *encountered_page_in_swapcache = true;
+ }
+ } else {
+ zswap_written_back_pages++;
+ }
+
+ spin_lock(lock);
+ return ret;
+}
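
The comments in shrink_memcg_cb() above describe a recurring discipline: copy the lookup key to the stack while the LRU lock is held, drop the lock for the slow work, and dereference the entry again only after a fresh lookup under the tree lock returns the same pointer. A minimal single-threaded sketch of that pattern; struct item, table_lookup() and the two mutexes are illustrative stand-ins, not kernel interfaces:

#include <pthread.h>
#include <stdio.h>

struct item {
        long key;
        int data;
};

static pthread_mutex_t lru_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
static struct item *table[16];          /* toy "tree": key -> item */

static struct item *table_lookup(long key)
{
        return table[key % 16];
}

static int reclaim_one(struct item *it)
{
        long key;

        pthread_mutex_lock(&lru_lock);
        key = it->key;                  /* copy the key to the stack */
        pthread_mutex_unlock(&lru_lock);/* 'it' may now be freed by others */

        /* ...slow work (folio allocation, I/O setup) happens unlocked... */

        pthread_mutex_lock(&tree_lock);
        if (table_lookup(key) != it) {  /* revalidate by pointer comparison */
                pthread_mutex_unlock(&tree_lock);
                return -1;              /* raced with an invalidation */
        }
        it->data = 0;                   /* only now is 'it' safe to touch */
        pthread_mutex_unlock(&tree_lock);
        return 0;
}

int main(void)
{
        static struct item a = { .key = 5, .data = 42 };

        table[5] = &a;
        printf("reclaim: %d\n", reclaim_one(&a));
        return 0;
}
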
+
+static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
+ unsigned long shrink_ret, nr_protected, lru_size;
+ bool encountered_page_in_swapcache = false;
+
+ if (!zswap_shrinker_enabled ||
+ !mem_cgroup_zswap_writeback_enabled(sc->memcg)) {
+ sc->nr_scanned = 0;
+ return SHRINK_STOP;
+ }
+
+ nr_protected =
+ atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
+ lru_size = list_lru_shrink_count(&zswap_list_lru, sc);
+
+ /*
+ * Abort if we are shrinking into the protected region.
+ *
+	 * This short-circuiting is necessary because if there are too many
+	 * concurrent reclaimers getting the freeable zswap object counts at the
+	 * same time (before any of them has made reasonable progress), the total
+	 * number of reclaimed objects might exceed the number of unprotected
+	 * objects (i.e. the reclaimers will reclaim into the protected area of the
+ * zswap LRU).
+ */
+ if (nr_protected >= lru_size - sc->nr_to_scan) {
+ sc->nr_scanned = 0;
+ return SHRINK_STOP;
+ }
+
+ shrink_ret = list_lru_shrink_walk(&zswap_list_lru, sc, &shrink_memcg_cb,
+ &encountered_page_in_swapcache);
+
+ if (encountered_page_in_swapcache)
+ return SHRINK_STOP;
+
+ return shrink_ret ? shrink_ret : SHRINK_STOP;
+}
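
The abort condition above, nr_protected >= lru_size - sc->nr_to_scan, stops a scan batch that would otherwise reach into the protected tail of the LRU. A trivial worked check with made-up numbers:

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
        /*
         * Made-up values: 1000 objects on the LRU, 900 of them protected,
         * and a scan batch of 128. 1000 - 128 = 872 <= 900, so the batch
         * would reach protected entries and the scan is aborted.
         */
        unsigned long lru_size = 1000, nr_protected = 900, nr_to_scan = 128;
        bool abort_scan = nr_protected >= lru_size - nr_to_scan;

        printf("abort scan: %s\n", abort_scan ? "yes" : "no");
        return 0;
}
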
+
+static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ struct mem_cgroup *memcg = sc->memcg;
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid));
+ unsigned long nr_backing, nr_stored, nr_freeable, nr_protected;
+
+ if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg))
+ return 0;
+
+ /*
+	 * The shrinker resumes swap writeback, which will enter the block
+	 * layer and may enter the filesystem. XXX: Harmonize with vmscan.c __GFP_FS
+ * rules (may_enter_fs()), which apply on a per-folio basis.
+ */
+ if (!gfp_has_io_fs(sc->gfp_mask))
+ return 0;
+
+ /*
+ * For memcg, use the cgroup-wide ZSWAP stats since we don't
+ * have them per-node and thus per-lruvec. Careful if memcg is
+ * runtime-disabled: we can get sc->memcg == NULL, which is ok
+ * for the lruvec, but not for memcg_page_state().
+ *
+ * Without memcg, use the zswap pool-wide metrics.
+ */
+ if (!mem_cgroup_disabled()) {
+ mem_cgroup_flush_stats(memcg);
+ nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT;
+ nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
+ } else {
+ nr_backing = zswap_pool_total_size >> PAGE_SHIFT;
+ nr_stored = atomic_read(&zswap_nr_stored);
+ }
+
+ if (!nr_stored)
+ return 0;
+
+ nr_protected =
+ atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
+ nr_freeable = list_lru_shrink_count(&zswap_list_lru, sc);
+ /*
+	 * Subtract from the lru size an estimate of the number of pages
+	 * that should be protected.
+ */
+ nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0;
+
+ /*
+ * Scale the number of freeable pages by the memory saving factor.
+ * This ensures that the better zswap compresses memory, the fewer
+	 * pages we will evict to swap (since writeback would otherwise incur
+	 * IO for a relatively small memory saving).
+ */
+ return mult_frac(nr_freeable, nr_backing, nr_stored);
+}
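
The return value above is mult_frac(nr_freeable, nr_backing, nr_stored), i.e. the freeable count scaled by the pool's compression ratio. A small userspace sketch with hypothetical numbers; mult_frac_sketch() mimics the divide-first ordering of the kernel's mult_frac() but is not the kernel macro:

#include <stdio.h>

/*
 * Illustrative stand-in for mult_frac(): x * numer / denom, ordered to
 * reduce the risk of overflow in the intermediate product.
 */
static unsigned long mult_frac_sketch(unsigned long x, unsigned long numer,
                                      unsigned long denom)
{
        unsigned long q = x / denom;
        unsigned long r = x % denom;

        return q * numer + r * numer / denom;
}

int main(void)
{
        /*
         * Hypothetical numbers: 1000 freeable objects, 300 pages of pool
         * memory backing 1200 stored pages (a 4:1 saving). The shrinker
         * then reports only a quarter of the LRU as freeable, so
         * well-compressed memory is written back less aggressively.
         */
        unsigned long nr_freeable = 1000, nr_backing = 300, nr_stored = 1200;

        printf("reported freeable = %lu\n",
               mult_frac_sketch(nr_freeable, nr_backing, nr_stored));
        return 0;
}
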
+
+static struct shrinker *zswap_alloc_shrinker(void)
+{
+ struct shrinker *shrinker;
+
+ shrinker =
+ shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-zswap");
+ if (!shrinker)
+ return NULL;
+
+ shrinker->scan_objects = zswap_shrinker_scan;
+ shrinker->count_objects = zswap_shrinker_count;
+ shrinker->batch = 0;
+ shrinker->seeks = DEFAULT_SEEKS;
+ return shrinker;
+}
+
+static int shrink_memcg(struct mem_cgroup *memcg)
+{
+ int nid, shrunk = 0;
+
+ if (!mem_cgroup_zswap_writeback_enabled(memcg))
+ return -EINVAL;
+
+ /*
+ * Skip zombies because their LRUs are reparented and we would be
+ * reclaiming from the parent instead of the dead memcg.
+ */
+ if (memcg && !mem_cgroup_online(memcg))
+ return -ENOENT;
+
+ for_each_node_state(nid, N_NORMAL_MEMORY) {
+ unsigned long nr_to_walk = 1;
+
+ shrunk += list_lru_walk_one(&zswap_list_lru, nid, memcg,
+ &shrink_memcg_cb, NULL, &nr_to_walk);
+ }
+ return shrunk ? 0 : -EAGAIN;
+}
+
+static void shrink_worker(struct work_struct *w)
+{
+ struct mem_cgroup *memcg;
+ int ret, failures = 0;
+
+ /* global reclaim will select cgroup in a round-robin fashion. */
+ do {
+ spin_lock(&zswap_shrink_lock);
+ zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL);
+ memcg = zswap_next_shrink;
+
+ /*
+ * We need to retry if we have gone through a full round trip, or if we
+ * got an offline memcg (or else we risk undoing the effect of the
+ * zswap memcg offlining cleanup callback). This is not catastrophic
+ * per se, but it will keep the now offlined memcg hostage for a while.
+ *
+ * Note that if we got an online memcg, we will keep the extra
+ * reference in case the original reference obtained by mem_cgroup_iter
+ * is dropped by the zswap memcg offlining callback, ensuring that the
+ * memcg is not killed when we are reclaiming.
+ */
+ if (!memcg) {
+ spin_unlock(&zswap_shrink_lock);
+ if (++failures == MAX_RECLAIM_RETRIES)
+ break;
+
+ goto resched;
+ }
+
+ if (!mem_cgroup_tryget_online(memcg)) {
+ /* drop the reference from mem_cgroup_iter() */
+ mem_cgroup_iter_break(NULL, memcg);
+ zswap_next_shrink = NULL;
+ spin_unlock(&zswap_shrink_lock);
+
+ if (++failures == MAX_RECLAIM_RETRIES)
+ break;
+
+ goto resched;
+ }
+ spin_unlock(&zswap_shrink_lock);
+
+ ret = shrink_memcg(memcg);
+ /* drop the extra reference */
+ mem_cgroup_put(memcg);
+
+ if (ret == -EINVAL)
+ break;
+ if (ret && ++failures == MAX_RECLAIM_RETRIES)
+ break;
+
+resched:
+ cond_resched();
+ } while (!zswap_can_accept());
+}
+
static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
{
unsigned long *page;
@@ -1508,23 +1498,11 @@ static void zswap_fill_page(void *ptr, unsigned long value)
bool zswap_store(struct folio *folio)
{
swp_entry_t swp = folio->swap;
- int type = swp_type(swp);
pgoff_t offset = swp_offset(swp);
- struct page *page = &folio->page;
- struct zswap_tree *tree = zswap_trees[type];
+ struct zswap_tree *tree = swap_zswap_tree(swp);
struct zswap_entry *entry, *dupentry;
- struct scatterlist input, output;
- struct crypto_acomp_ctx *acomp_ctx;
struct obj_cgroup *objcg = NULL;
struct mem_cgroup *memcg = NULL;
- struct zswap_pool *pool;
- struct zpool *zpool;
- unsigned int dlen = PAGE_SIZE;
- unsigned long handle, value;
- char *buf;
- u8 *src, *dst;
- gfp_t gfp;
- int ret;
VM_WARN_ON_ONCE(!folio_test_locked(folio));
VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
@@ -1533,24 +1511,8 @@ bool zswap_store(struct folio *folio)
if (folio_test_large(folio))
return false;
- if (!tree)
- return false;
-
- /*
- * If this is a duplicate, it must be removed before attempting to store
- * it, otherwise, if the store fails the old page won't be removed from
- * the tree, and it might be written back overriding the new data.
- */
- spin_lock(&tree->lock);
- dupentry = zswap_rb_search(&tree->rbroot, offset);
- if (dupentry) {
- zswap_duplicate_entry++;
- zswap_invalidate_entry(tree, dupentry);
- }
- spin_unlock(&tree->lock);
-
if (!zswap_enabled)
- return false;
+ goto check_old;
objcg = get_obj_cgroup_from_folio(folio);
if (objcg && !obj_cgroup_may_zswap(objcg)) {
@@ -1577,17 +1539,19 @@ bool zswap_store(struct folio *folio)
}
/* allocate entry */
- entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
+ entry = zswap_entry_cache_alloc(GFP_KERNEL, folio_nid(folio));
if (!entry) {
zswap_reject_kmemcache_fail++;
goto reject;
}
if (zswap_same_filled_pages_enabled) {
- src = kmap_local_page(page);
+ unsigned long value;
+ u8 *src;
+
+ src = kmap_local_folio(folio, 0);
if (zswap_is_page_same_filled(src, &value)) {
kunmap_local(src);
- entry->swpentry = swp_entry(type, offset);
entry->length = 0;
entry->value = value;
atomic_inc(&zswap_same_filled_pages);
@@ -1606,74 +1570,18 @@ bool zswap_store(struct folio *folio)
if (objcg) {
memcg = get_mem_cgroup_from_objcg(objcg);
- if (memcg_list_lru_alloc(memcg, &entry->pool->list_lru, GFP_KERNEL)) {
+ if (memcg_list_lru_alloc(memcg, &zswap_list_lru, GFP_KERNEL)) {
mem_cgroup_put(memcg);
goto put_pool;
}
mem_cgroup_put(memcg);
}
- /* compress */
- acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
-
- mutex_lock(&acomp_ctx->mutex);
-
- dst = acomp_ctx->buffer;
- sg_init_table(&input, 1);
- sg_set_page(&input, &folio->page, PAGE_SIZE, 0);
-
- /*
- * We need PAGE_SIZE * 2 here since there maybe over-compression case,
- * and hardware-accelerators may won't check the dst buffer size, so
- * giving the dst buffer with enough length to avoid buffer overflow.
- */
- sg_init_one(&output, dst, PAGE_SIZE * 2);
- acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
- /*
- * it maybe looks a little bit silly that we send an asynchronous request,
- * then wait for its completion synchronously. This makes the process look
- * synchronous in fact.
- * Theoretically, acomp supports users send multiple acomp requests in one
- * acomp instance, then get those requests done simultaneously. but in this
- * case, zswap actually does store and load page by page, there is no
- * existing method to send the second page before the first page is done
- * in one thread doing zwap.
- * but in different threads running on different cpu, we have different
- * acomp instance, so multiple threads can do (de)compression in parallel.
- */
- ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
- dlen = acomp_ctx->req->dlen;
-
- if (ret) {
- zswap_reject_compress_fail++;
- goto put_dstmem;
- }
-
- /* store */
- zpool = zswap_find_zpool(entry);
- gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
- if (zpool_malloc_support_movable(zpool))
- gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
- ret = zpool_malloc(zpool, dlen, gfp, &handle);
- if (ret == -ENOSPC) {
- zswap_reject_compress_poor++;
- goto put_dstmem;
- }
- if (ret) {
- zswap_reject_alloc_fail++;
- goto put_dstmem;
- }
- buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
- memcpy(buf, dst, dlen);
- zpool_unmap_handle(zpool, handle);
- mutex_unlock(&acomp_ctx->mutex);
-
- /* populate entry */
- entry->swpentry = swp_entry(type, offset);
- entry->handle = handle;
- entry->length = dlen;
+ if (!zswap_compress(folio, entry))
+ goto put_pool;
insert_entry:
+ entry->swpentry = swp;
entry->objcg = objcg;
if (objcg) {
obj_cgroup_charge_zswap(objcg, entry->length);
@@ -1684,20 +1592,17 @@ insert_entry:
/* map */
spin_lock(&tree->lock);
/*
- * A duplicate entry should have been removed at the beginning of this
- * function. Since the swap entry should be pinned, if a duplicate is
- * found again here it means that something went wrong in the swap
- * cache.
+ * The folio may have been dirtied again, invalidate the
+ * possibly stale entry before inserting the new entry.
*/
- while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) {
- WARN_ON(1);
- zswap_duplicate_entry++;
+ if (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) {
zswap_invalidate_entry(tree, dupentry);
+ WARN_ON(zswap_rb_insert(&tree->rbroot, entry, &dupentry));
}
if (entry->length) {
INIT_LIST_HEAD(&entry->lru);
- zswap_lru_add(&entry->pool->list_lru, entry);
- atomic_inc(&entry->pool->nr_stored);
+ zswap_lru_add(&zswap_list_lru, entry);
+ atomic_inc(&zswap_nr_stored);
}
spin_unlock(&tree->lock);
@@ -1708,8 +1613,6 @@ insert_entry:
return true;
-put_dstmem:
- mutex_unlock(&acomp_ctx->mutex);
put_pool:
zswap_pool_put(entry->pool);
freepage:
@@ -1717,38 +1620,60 @@ freepage:
reject:
if (objcg)
obj_cgroup_put(objcg);
+check_old:
+ /*
+ * If the zswap store fails or zswap is disabled, we must invalidate the
+ * possibly stale entry which was previously stored at this offset.
+ * Otherwise, writeback could overwrite the new data in the swapfile.
+ */
+ spin_lock(&tree->lock);
+ entry = zswap_rb_search(&tree->rbroot, offset);
+ if (entry)
+ zswap_invalidate_entry(tree, entry);
+ spin_unlock(&tree->lock);
return false;
shrink:
- pool = zswap_pool_last_get();
- if (pool && !queue_work(shrink_wq, &pool->shrink_work))
- zswap_pool_put(pool);
+ queue_work(shrink_wq, &zswap_shrink_work);
goto reject;
}
bool zswap_load(struct folio *folio)
{
swp_entry_t swp = folio->swap;
- int type = swp_type(swp);
pgoff_t offset = swp_offset(swp);
struct page *page = &folio->page;
- struct zswap_tree *tree = zswap_trees[type];
+ bool swapcache = folio_test_swapcache(folio);
+ struct zswap_tree *tree = swap_zswap_tree(swp);
struct zswap_entry *entry;
u8 *dst;
VM_WARN_ON_ONCE(!folio_test_locked(folio));
- /* find */
spin_lock(&tree->lock);
- entry = zswap_entry_find_get(&tree->rbroot, offset);
+ entry = zswap_rb_search(&tree->rbroot, offset);
if (!entry) {
spin_unlock(&tree->lock);
return false;
}
+ /*
+ * When reading into the swapcache, invalidate our entry. The
+ * swapcache can be the authoritative owner of the page and
+ * its mappings, and the pressure that results from having two
+ * in-memory copies outweighs any benefits of caching the
+ * compression work.
+ *
+ * (Most swapins go through the swapcache. The notable
+ * exception is the singleton fault on SWP_SYNCHRONOUS_IO
+ * files, which reads into a private page and may free it if
+ * the fault fails. We remain the primary owner of the entry.)
+ */
+ if (swapcache)
+ zswap_rb_erase(&tree->rbroot, entry);
spin_unlock(&tree->lock);
if (entry->length)
- __zswap_load(entry, page);
+ zswap_decompress(entry, page);
else {
dst = kmap_local_page(page);
zswap_fill_page(dst, entry->value);
@@ -1759,67 +1684,64 @@ bool zswap_load(struct folio *folio)
if (entry->objcg)
count_objcg_event(entry->objcg, ZSWPIN);
- spin_lock(&tree->lock);
- if (zswap_exclusive_loads_enabled) {
- zswap_invalidate_entry(tree, entry);
+ if (swapcache) {
+ zswap_entry_free(entry);
folio_mark_dirty(folio);
- } else if (entry->length) {
- zswap_lru_del(&entry->pool->list_lru, entry);
- zswap_lru_add(&entry->pool->list_lru, entry);
}
- zswap_entry_put(tree, entry);
- spin_unlock(&tree->lock);
return true;
}
-void zswap_invalidate(int type, pgoff_t offset)
+void zswap_invalidate(swp_entry_t swp)
{
- struct zswap_tree *tree = zswap_trees[type];
+ pgoff_t offset = swp_offset(swp);
+ struct zswap_tree *tree = swap_zswap_tree(swp);
struct zswap_entry *entry;
- /* find */
spin_lock(&tree->lock);
entry = zswap_rb_search(&tree->rbroot, offset);
- if (!entry) {
- /* entry was written back */
- spin_unlock(&tree->lock);
- return;
- }
- zswap_invalidate_entry(tree, entry);
+ if (entry)
+ zswap_invalidate_entry(tree, entry);
spin_unlock(&tree->lock);
}
-void zswap_swapon(int type)
+int zswap_swapon(int type, unsigned long nr_pages)
{
- struct zswap_tree *tree;
+ struct zswap_tree *trees, *tree;
+ unsigned int nr, i;
- tree = kzalloc(sizeof(*tree), GFP_KERNEL);
- if (!tree) {
+ nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
+ trees = kvcalloc(nr, sizeof(*tree), GFP_KERNEL);
+ if (!trees) {
pr_err("alloc failed, zswap disabled for swap type %d\n", type);
- return;
+ return -ENOMEM;
}
- tree->rbroot = RB_ROOT;
- spin_lock_init(&tree->lock);
- zswap_trees[type] = tree;
+ for (i = 0; i < nr; i++) {
+ tree = trees + i;
+ tree->rbroot = RB_ROOT;
+ spin_lock_init(&tree->lock);
+ }
+
+ nr_zswap_trees[type] = nr;
+ zswap_trees[type] = trees;
+ return 0;
}
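
zswap_swapon() above allocates one tree per SWAP_ADDRESS_SPACE_PAGES-sized chunk of the swap device, and swap_zswap_tree() selects a tree by dividing the swap offset by that chunk size. A small userspace sketch of the arithmetic, assuming the usual chunk size of 2^14 pages and purely illustrative device numbers:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))
/* Assumed chunk size: 2^14 pages per tree (SWAP_ADDRESS_SPACE_PAGES). */
#define SWAP_ADDRESS_SPACE_PAGES        (1UL << 14)

int main(void)
{
        unsigned long nr_pages = 2097152;   /* e.g. an 8 GiB device, 4 KiB pages */
        unsigned long nr_trees = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
        unsigned long offset = 123456;      /* some swap offset on that device */

        printf("trees allocated: %lu\n", nr_trees);             /* 128 */
        printf("offset %lu -> tree index %lu\n",
               offset, offset / SWAP_ADDRESS_SPACE_PAGES);      /* 7 */
        return 0;
}
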
void zswap_swapoff(int type)
{
- struct zswap_tree *tree = zswap_trees[type];
- struct zswap_entry *entry, *n;
+ struct zswap_tree *trees = zswap_trees[type];
+ unsigned int i;
- if (!tree)
+ if (!trees)
return;
- /* walk the tree and free everything */
- spin_lock(&tree->lock);
- rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
- zswap_free_entry(entry);
- tree->rbroot = RB_ROOT;
- spin_unlock(&tree->lock);
- kfree(tree);
+ /* try_to_unuse() invalidated all the entries already */
+ for (i = 0; i < nr_zswap_trees[type]; i++)
+ WARN_ON_ONCE(!RB_EMPTY_ROOT(&trees[i].rbroot));
+
+ kvfree(trees);
+ nr_zswap_trees[type] = 0;
zswap_trees[type] = NULL;
}
@@ -1852,8 +1774,6 @@ static int zswap_debugfs_init(void)
zswap_debugfs_root, &zswap_reject_compress_poor);
debugfs_create_u64("written_back_pages", 0444,
zswap_debugfs_root, &zswap_written_back_pages);
- debugfs_create_u64("duplicate_entry", 0444,
- zswap_debugfs_root, &zswap_duplicate_entry);
debugfs_create_u64("pool_total_size", 0444,
zswap_debugfs_root, &zswap_pool_total_size);
debugfs_create_atomic_t("stored_pages", 0444,
@@ -1891,6 +1811,20 @@ static int zswap_setup(void)
if (ret)
goto hp_fail;
+ shrink_wq = alloc_workqueue("zswap-shrink",
+ WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
+ if (!shrink_wq)
+ goto shrink_wq_fail;
+
+ zswap_shrinker = zswap_alloc_shrinker();
+ if (!zswap_shrinker)
+ goto shrinker_fail;
+ if (list_lru_init_memcg(&zswap_list_lru, zswap_shrinker))
+ goto lru_fail;
+ shrinker_register(zswap_shrinker);
+
+ INIT_WORK(&zswap_shrink_work, shrink_worker);
+
pool = __zswap_pool_create_fallback();
if (pool) {
pr_info("loaded using pool %s/%s\n", pool->tfm_name,
@@ -1902,18 +1836,17 @@ static int zswap_setup(void)
zswap_enabled = false;
}
- shrink_wq = create_workqueue("zswap-shrink");
- if (!shrink_wq)
- goto fallback_fail;
-
if (zswap_debugfs_init())
pr_warn("debugfs initialization failed\n");
zswap_init_state = ZSWAP_INIT_SUCCEED;
return 0;
-fallback_fail:
- if (pool)
- zswap_pool_destroy(pool);
+lru_fail:
+ shrinker_free(zswap_shrinker);
+shrinker_fail:
+ destroy_workqueue(shrink_wq);
+shrink_wq_fail:
+ cpuhp_remove_multi_state(CPUHP_MM_ZSWP_POOL_PREPARE);
hp_fail:
kmem_cache_destroy(zswap_entry_cache);
cache_fail: