diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-08-07 13:17:52 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-08-07 13:17:52 +0000 |
commit | 3afb00d3f86d3d924f88b56fa8285d4e9db85852 (patch) | |
tree | 95a985d3019522cea546b7d8df621369bc44fc6c /mm | |
parent | Adding debian version 6.9.12-1. (diff) | |
download | linux-3afb00d3f86d3d924f88b56fa8285d4e9db85852.tar.xz linux-3afb00d3f86d3d924f88b56fa8285d4e9db85852.zip |
Merging upstream version 6.10.3.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'mm')
85 files changed, 5730 insertions, 4231 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index b1448aa81e..b4cb45255a 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -333,10 +333,9 @@ config SHUFFLE_PAGE_ALLOCATOR While the randomization improves cache utilization it may negatively impact workloads on platforms without a cache. For - this reason, by default, the randomization is enabled only - after runtime detection of a direct-mapped memory-side-cache. - Otherwise, the randomization may be force enabled with the - 'page_alloc.shuffle' kernel command line parameter. + this reason, by default, the randomization is not enabled even + if SHUFFLE_PAGE_ALLOCATOR=y. The randomization may be force enabled + with the 'page_alloc.shuffle' kernel command line parameter. Say Y if unsure. @@ -473,7 +472,7 @@ config ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP config HAVE_MEMBLOCK_PHYS_MAP bool -config HAVE_FAST_GUP +config HAVE_GUP_FAST depends on MMU bool @@ -850,6 +849,12 @@ config READ_ONLY_THP_FOR_FS endif # TRANSPARENT_HUGEPAGE # +# The architecture supports pgtable leaves that is larger than PAGE_SIZE +# +config PGTABLE_HAS_HUGE_LEAVES + def_bool TRANSPARENT_HUGEPAGE || HUGETLB_PAGE + +# # UP and nommu archs use km based percpu allocator # config NEED_PER_CPU_KM @@ -1241,6 +1246,9 @@ config LOCK_MM_AND_FIND_VMA config IOMMU_MM_DATA bool +config EXECMEM + bool + source "mm/damon/Kconfig" endmenu diff --git a/mm/Makefile b/mm/Makefile index 4abb40b911..8fb85acda1 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -5,6 +5,7 @@ KASAN_SANITIZE_slab_common.o := n KASAN_SANITIZE_slub.o := n +KASAN_SANITIZE_kmemleak.o := n KCSAN_SANITIZE_kmemleak.o := n # These produce frequent data race reports: most of them are due to races on @@ -42,6 +43,10 @@ ifdef CONFIG_CROSS_MEMORY_ATTACH mmu-$(CONFIG_MMU) += process_vm_access.o endif +ifdef CONFIG_64BIT +mmu-$(CONFIG_MMU) += mseal.o +endif + obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page-writeback.o folio-compat.o \ readahead.o swap.o truncate.o vmscan.o shrinker.o \ @@ -133,3 +138,4 
@@ obj-$(CONFIG_IO_MAPPING) += io-mapping.o obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o +obj-$(CONFIG_EXECMEM) += execmem.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 5f2be8c8df..e61bbb1bd6 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -39,6 +39,19 @@ struct workqueue_struct *bdi_wq; #include <linux/debugfs.h> #include <linux/seq_file.h> +struct wb_stats { + unsigned long nr_dirty; + unsigned long nr_io; + unsigned long nr_more_io; + unsigned long nr_dirty_time; + unsigned long nr_writeback; + unsigned long nr_reclaimable; + unsigned long nr_dirtied; + unsigned long nr_written; + unsigned long dirty_thresh; + unsigned long wb_thresh; +}; + static struct dentry *bdi_debug_root; static void bdi_debug_init(void) @@ -46,31 +59,68 @@ static void bdi_debug_init(void) bdi_debug_root = debugfs_create_dir("bdi", NULL); } -static int bdi_debug_stats_show(struct seq_file *m, void *v) +static void collect_wb_stats(struct wb_stats *stats, + struct bdi_writeback *wb) { - struct backing_dev_info *bdi = m->private; - struct bdi_writeback *wb = &bdi->wb; - unsigned long background_thresh; - unsigned long dirty_thresh; - unsigned long wb_thresh; - unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time; struct inode *inode; - nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0; spin_lock(&wb->list_lock); list_for_each_entry(inode, &wb->b_dirty, i_io_list) - nr_dirty++; + stats->nr_dirty++; list_for_each_entry(inode, &wb->b_io, i_io_list) - nr_io++; + stats->nr_io++; list_for_each_entry(inode, &wb->b_more_io, i_io_list) - nr_more_io++; + stats->nr_more_io++; list_for_each_entry(inode, &wb->b_dirty_time, i_io_list) if (inode->i_state & I_DIRTY_TIME) - nr_dirty_time++; + stats->nr_dirty_time++; spin_unlock(&wb->list_lock); + stats->nr_writeback += wb_stat(wb, WB_WRITEBACK); + stats->nr_reclaimable += wb_stat(wb, WB_RECLAIMABLE); + stats->nr_dirtied += wb_stat(wb, 
WB_DIRTIED); + stats->nr_written += wb_stat(wb, WB_WRITTEN); + stats->wb_thresh += wb_calc_thresh(wb, stats->dirty_thresh); +} + +#ifdef CONFIG_CGROUP_WRITEBACK +static void bdi_collect_stats(struct backing_dev_info *bdi, + struct wb_stats *stats) +{ + struct bdi_writeback *wb; + + rcu_read_lock(); + list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) { + if (!wb_tryget(wb)) + continue; + + collect_wb_stats(stats, wb); + wb_put(wb); + } + rcu_read_unlock(); +} +#else +static void bdi_collect_stats(struct backing_dev_info *bdi, + struct wb_stats *stats) +{ + collect_wb_stats(stats, &bdi->wb); +} +#endif + +static int bdi_debug_stats_show(struct seq_file *m, void *v) +{ + struct backing_dev_info *bdi = m->private; + unsigned long background_thresh; + unsigned long dirty_thresh; + struct wb_stats stats; + unsigned long tot_bw; + global_dirty_limits(&background_thresh, &dirty_thresh); - wb_thresh = wb_calc_thresh(wb, dirty_thresh); + + memset(&stats, 0, sizeof(stats)); + stats.dirty_thresh = dirty_thresh; + bdi_collect_stats(bdi, &stats); + tot_bw = atomic_long_read(&bdi->tot_write_bandwidth); seq_printf(m, "BdiWriteback: %10lu kB\n" @@ -87,37 +137,114 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) "b_dirty_time: %10lu\n" "bdi_list: %10u\n" "state: %10lx\n", - (unsigned long) K(wb_stat(wb, WB_WRITEBACK)), - (unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)), - K(wb_thresh), + K(stats.nr_writeback), + K(stats.nr_reclaimable), + K(stats.wb_thresh), K(dirty_thresh), K(background_thresh), - (unsigned long) K(wb_stat(wb, WB_DIRTIED)), - (unsigned long) K(wb_stat(wb, WB_WRITTEN)), - (unsigned long) K(wb->write_bandwidth), - nr_dirty, - nr_io, - nr_more_io, - nr_dirty_time, + K(stats.nr_dirtied), + K(stats.nr_written), + K(tot_bw), + stats.nr_dirty, + stats.nr_io, + stats.nr_more_io, + stats.nr_dirty_time, !list_empty(&bdi->bdi_list), bdi->wb.state); return 0; } DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats); +static void wb_stats_show(struct seq_file *m, struct 
bdi_writeback *wb, + struct wb_stats *stats) +{ + + seq_printf(m, + "WbCgIno: %10lu\n" + "WbWriteback: %10lu kB\n" + "WbReclaimable: %10lu kB\n" + "WbDirtyThresh: %10lu kB\n" + "WbDirtied: %10lu kB\n" + "WbWritten: %10lu kB\n" + "WbWriteBandwidth: %10lu kBps\n" + "b_dirty: %10lu\n" + "b_io: %10lu\n" + "b_more_io: %10lu\n" + "b_dirty_time: %10lu\n" + "state: %10lx\n\n", +#ifdef CONFIG_CGROUP_WRITEBACK + cgroup_ino(wb->memcg_css->cgroup), +#else + 1ul, +#endif + K(stats->nr_writeback), + K(stats->nr_reclaimable), + K(stats->wb_thresh), + K(stats->nr_dirtied), + K(stats->nr_written), + K(wb->avg_write_bandwidth), + stats->nr_dirty, + stats->nr_io, + stats->nr_more_io, + stats->nr_dirty_time, + wb->state); +} + +static int cgwb_debug_stats_show(struct seq_file *m, void *v) +{ + struct backing_dev_info *bdi = m->private; + unsigned long background_thresh; + unsigned long dirty_thresh; + struct bdi_writeback *wb; + + global_dirty_limits(&background_thresh, &dirty_thresh); + + rcu_read_lock(); + list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) { + struct wb_stats stats = { .dirty_thresh = dirty_thresh }; + + if (!wb_tryget(wb)) + continue; + + collect_wb_stats(&stats, wb); + + /* + * Calculate thresh of wb in writeback cgroup which is min of + * thresh in global domain and thresh in cgroup domain. Drop + * rcu lock because cgwb_calc_thresh may sleep in + * cgroup_rstat_flush. We can do so here because we have a ref. 
+ */ + if (mem_cgroup_wb_domain(wb)) { + rcu_read_unlock(); + stats.wb_thresh = min(stats.wb_thresh, cgwb_calc_thresh(wb)); + rcu_read_lock(); + } + + wb_stats_show(m, wb, &stats); + + wb_put(wb); + } + rcu_read_unlock(); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(cgwb_debug_stats); + static void bdi_debug_register(struct backing_dev_info *bdi, const char *name) { bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root); debugfs_create_file("stats", 0444, bdi->debug_dir, bdi, &bdi_debug_stats_fops); + debugfs_create_file("wb_stats", 0444, bdi->debug_dir, bdi, + &cgwb_debug_stats_fops); } static void bdi_debug_unregister(struct backing_dev_info *bdi) { debugfs_remove_recursive(bdi->debug_dir); } -#else +#else /* CONFIG_DEBUG_FS */ static inline void bdi_debug_init(void) { } @@ -128,7 +255,7 @@ static inline void bdi_debug_register(struct backing_dev_info *bdi, static inline void bdi_debug_unregister(struct backing_dev_info *bdi) { } -#endif +#endif /* CONFIG_DEBUG_FS */ static ssize_t read_ahead_kb_store(struct device *dev, struct device_attribute *attr, @@ -388,7 +515,7 @@ static void wb_update_bandwidth_workfn(struct work_struct *work) static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, gfp_t gfp) { - int i, err; + int err; memset(wb, 0, sizeof(*wb)); @@ -416,18 +543,10 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, if (err) return err; - for (i = 0; i < NR_WB_STAT_ITEMS; i++) { - err = percpu_counter_init(&wb->stat[i], 0, gfp); - if (err) - goto out_destroy_stat; - } - - return 0; + err = percpu_counter_init_many(wb->stat, 0, gfp, NR_WB_STAT_ITEMS); + if (err) + fprop_local_destroy_percpu(&wb->completions); -out_destroy_stat: - while (i--) - percpu_counter_destroy(&wb->stat[i]); - fprop_local_destroy_percpu(&wb->completions); return err; } @@ -460,13 +579,8 @@ static void wb_shutdown(struct bdi_writeback *wb) static void wb_exit(struct bdi_writeback *wb) { - int i; - WARN_ON(delayed_work_pending(&wb->dwork)); - 
- for (i = 0; i < NR_WB_STAT_ITEMS; i++) - percpu_counter_destroy(&wb->stat[i]); - + percpu_counter_destroy_many(wb->stat, NR_WB_STAT_ITEMS); fprop_local_destroy_percpu(&wb->completions); } diff --git a/mm/compaction.c b/mm/compaction.c index 807b58e6eb..739b1bf3d6 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -79,6 +79,13 @@ static inline bool is_via_compact_memory(int order) { return false; } #define COMPACTION_HPAGE_ORDER (PMD_SHIFT - PAGE_SHIFT) #endif +static struct page *mark_allocated_noprof(struct page *page, unsigned int order, gfp_t gfp_flags) +{ + post_alloc_hook(page, order, __GFP_MOVABLE); + return page; +} +#define mark_allocated(...) alloc_hooks(mark_allocated_noprof(__VA_ARGS__)) + static void split_map_pages(struct list_head *freepages) { unsigned int i, order; @@ -93,7 +100,7 @@ static void split_map_pages(struct list_head *freepages) nr_pages = 1 << order; - post_alloc_hook(page, order, __GFP_MOVABLE); + mark_allocated(page, order, __GFP_MOVABLE); if (order) split_page(page, order); @@ -122,7 +129,7 @@ static unsigned long release_free_list(struct list_head *freepages) * Convert free pages into post allocation pages, so * that we can free them via __free_page. */ - post_alloc_hook(page, order, __GFP_MOVABLE); + mark_allocated(page, order, __GFP_MOVABLE); __free_pages(page, order); if (pfn > high_pfn) high_pfn = pfn; @@ -1851,7 +1858,7 @@ static void isolate_freepages(struct compact_control *cc) * This is a migrate-callback that "allocates" freepages by taking pages * from the isolated freelists in the block we are migrating to. 
*/ -static struct folio *compaction_alloc(struct folio *src, unsigned long data) +static struct folio *compaction_alloc_noprof(struct folio *src, unsigned long data) { struct compact_control *cc = (struct compact_control *)data; struct folio *dst; @@ -1898,6 +1905,11 @@ again: return page_rmappable_folio(&dst->page); } +static struct folio *compaction_alloc(struct folio *src, unsigned long data) +{ + return alloc_hooks(compaction_alloc_noprof(src, data)); +} + /* * This is a migrate-callback that "frees" freepages back to the isolated * freelist. All pages on the freelist are from the same zone, so there is no @@ -3345,7 +3357,6 @@ static struct ctl_table vm_compaction[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - { } }; static int __init kcompactd_init(void) diff --git a/mm/damon/core.c b/mm/damon/core.c index d6f7e14abd..e66823d6b1 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -346,6 +346,7 @@ static struct damos_quota *damos_quota_init(struct damos_quota *quota) quota->charged_from = 0; quota->charge_target_from = NULL; quota->charge_addr_from = 0; + quota->esz_bp = 0; return quota; } @@ -1497,12 +1498,14 @@ static bool kdamond_need_stop(struct damon_ctx *ctx) return true; } -static unsigned long damos_wmark_metric_value(enum damos_wmark_metric metric) +static int damos_get_wmark_metric_value(enum damos_wmark_metric metric, + unsigned long *metric_value) { switch (metric) { case DAMOS_WMARK_FREE_MEM_RATE: - return global_zone_page_state(NR_FREE_PAGES) * 1000 / + *metric_value = global_zone_page_state(NR_FREE_PAGES) * 1000 / totalram_pages(); + return 0; default: break; } @@ -1517,10 +1520,9 @@ static unsigned long damos_wmark_wait_us(struct damos *scheme) { unsigned long metric; - if (scheme->wmarks.metric == DAMOS_WMARK_NONE) + if (damos_get_wmark_metric_value(scheme->wmarks.metric, &metric)) return 0; - metric = damos_wmark_metric_value(scheme->wmarks.metric); /* higher than high watermark or lower than low watermark */ if (metric > 
scheme->wmarks.high || scheme->wmarks.low > metric) { if (scheme->wmarks.activated) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 5e6dc31207..18797c1b41 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -16,8 +16,8 @@ #include "../internal.h" #include "ops-common.h" -static bool __damon_pa_mkold(struct folio *folio, struct vm_area_struct *vma, - unsigned long addr, void *arg) +static bool damon_folio_mkold_one(struct folio *folio, + struct vm_area_struct *vma, unsigned long addr, void *arg) { DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0); @@ -31,33 +31,38 @@ static bool __damon_pa_mkold(struct folio *folio, struct vm_area_struct *vma, return true; } -static void damon_pa_mkold(unsigned long paddr) +static void damon_folio_mkold(struct folio *folio) { - struct folio *folio = damon_get_folio(PHYS_PFN(paddr)); struct rmap_walk_control rwc = { - .rmap_one = __damon_pa_mkold, + .rmap_one = damon_folio_mkold_one, .anon_lock = folio_lock_anon_vma_read, }; bool need_lock; - if (!folio) - return; - if (!folio_mapped(folio) || !folio_raw_mapping(folio)) { folio_set_idle(folio); - goto out; + return; } need_lock = !folio_test_anon(folio) || folio_test_ksm(folio); if (need_lock && !folio_trylock(folio)) - goto out; + return; rmap_walk(folio, &rwc); if (need_lock) folio_unlock(folio); -out: +} + +static void damon_pa_mkold(unsigned long paddr) +{ + struct folio *folio = damon_get_folio(PHYS_PFN(paddr)); + + if (!folio) + return; + + damon_folio_mkold(folio); folio_put(folio); } @@ -79,8 +84,8 @@ static void damon_pa_prepare_access_checks(struct damon_ctx *ctx) } } -static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma, - unsigned long addr, void *arg) +static bool damon_folio_young_one(struct folio *folio, + struct vm_area_struct *vma, unsigned long addr, void *arg) { bool *accessed = arg; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0); @@ -111,38 +116,44 @@ static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma, 
return *accessed == false; } -static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz) +static bool damon_folio_young(struct folio *folio) { - struct folio *folio = damon_get_folio(PHYS_PFN(paddr)); bool accessed = false; struct rmap_walk_control rwc = { .arg = &accessed, - .rmap_one = __damon_pa_young, + .rmap_one = damon_folio_young_one, .anon_lock = folio_lock_anon_vma_read, }; bool need_lock; - if (!folio) - return false; - if (!folio_mapped(folio) || !folio_raw_mapping(folio)) { if (folio_test_idle(folio)) - accessed = false; + return false; else - accessed = true; - goto out; + return true; } need_lock = !folio_test_anon(folio) || folio_test_ksm(folio); if (need_lock && !folio_trylock(folio)) - goto out; + return false; rmap_walk(folio, &rwc); if (need_lock) folio_unlock(folio); -out: + return accessed; +} + +static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz) +{ + struct folio *folio = damon_get_folio(PHYS_PFN(paddr)); + bool accessed; + + if (!folio) + return false; + + accessed = damon_folio_young(folio); *folio_sz = folio_size(folio); folio_put(folio); return accessed; @@ -203,6 +214,11 @@ static bool __damos_pa_filter_out(struct damos_filter *filter, matched = filter->memcg_id == mem_cgroup_id(memcg); rcu_read_unlock(); break; + case DAMOS_FILTER_TYPE_YOUNG: + matched = damon_folio_young(folio); + if (matched) + damon_folio_mkold(folio); + break; default: break; } @@ -228,6 +244,22 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s) { unsigned long addr, applied; LIST_HEAD(folio_list); + bool install_young_filter = true; + struct damos_filter *filter; + + /* check access in page level again by default */ + damos_for_each_filter(filter, s) { + if (filter->type == DAMOS_FILTER_TYPE_YOUNG) { + install_young_filter = false; + break; + } + } + if (install_young_filter) { + filter = damos_new_filter(DAMOS_FILTER_TYPE_YOUNG, true); + if (!filter) + return 0; + damos_add_filter(s, filter); + } 
for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { struct folio *folio = damon_get_folio(PHYS_PFN(addr)); @@ -249,7 +281,9 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s) put_folio: folio_put(folio); } - applied = reclaim_pages(&folio_list, false); + if (install_young_filter) + damos_destroy_filter(filter); + applied = reclaim_pages(&folio_list); cond_resched(); return applied * PAGE_SIZE; } diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 53a90ac678..bea5bc5284 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -343,6 +343,7 @@ static struct damon_sysfs_scheme_filter *damon_sysfs_scheme_filter_alloc(void) static const char * const damon_sysfs_scheme_filter_type_strs[] = { "anon", "memcg", + "young", "addr", "target", }; diff --git a/mm/debug.c b/mm/debug.c index c1c1a6a484..69e524c3e6 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -55,25 +55,17 @@ static void __dump_folio(struct folio *folio, struct page *page, unsigned long pfn, unsigned long idx) { struct address_space *mapping = folio_mapping(folio); - int mapcount = 0; + int mapcount = atomic_read(&page->_mapcount); char *type = ""; - /* - * page->_mapcount space in struct page is used by slab pages to - * encode own info, and we must avoid calling page_folio() again. - */ - if (!folio_test_slab(folio)) { - mapcount = atomic_read(&page->_mapcount) + 1; - if (folio_test_large(folio)) - mapcount += folio_entire_mapcount(folio); - } - + mapcount = page_type_has_type(mapcount) ? 
0 : mapcount + 1; pr_warn("page: refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n", folio_ref_count(folio), mapcount, mapping, folio->index + idx, pfn); if (folio_test_large(folio)) { - pr_warn("head: order:%u entire_mapcount:%d nr_pages_mapped:%d pincount:%d\n", + pr_warn("head: order:%u mapcount:%d entire_mapcount:%d nr_pages_mapped:%d pincount:%d\n", folio_order(folio), + folio_mapcount(folio), folio_entire_mapcount(folio), folio_nr_pages_mapped(folio), atomic_read(&folio->_pincount)); @@ -99,7 +91,8 @@ static void __dump_folio(struct folio *folio, struct page *page, */ pr_warn("%sflags: %pGp%s\n", type, &folio->flags, is_migrate_cma_folio(folio, pfn) ? " CMA" : ""); - pr_warn("page_type: %pGt\n", &folio->page.page_type); + if (page_has_type(&folio->page)) + pr_warn("page_type: %pGt\n", &folio->page.page_type); print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32, sizeof(unsigned long), page, @@ -180,9 +173,6 @@ EXPORT_SYMBOL(dump_vma); void dump_mm(const struct mm_struct *mm) { pr_emerg("mm %px task_size %lu\n" -#ifdef CONFIG_MMU - "get_unmapped_area %px\n" -#endif "mmap_base %lu mmap_legacy_base %lu\n" "pgd %px mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" @@ -208,9 +198,6 @@ void dump_mm(const struct mm_struct *mm) "def_flags: %#lx(%pGv)\n", mm, mm->task_size, -#ifdef CONFIG_MMU - mm->get_unmapped_area, -#endif mm->mmap_base, mm->mmap_legacy_base, mm->pgd, atomic_read(&mm->mm_users), atomic_read(&mm->mm_count), diff --git a/mm/debug_page_alloc.c b/mm/debug_page_alloc.c index 6755f0c9d4..d46acf989d 100644 --- a/mm/debug_page_alloc.c +++ b/mm/debug_page_alloc.c @@ -32,8 +32,7 @@ static int __init debug_guardpage_minorder_setup(char *buf) } early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup); -bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order, - int migratetype) +bool __set_page_guard(struct zone *zone, struct page *page, 
unsigned int order) { if (order >= debug_guardpage_minorder()) return false; @@ -41,19 +40,12 @@ bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order, __SetPageGuard(page); INIT_LIST_HEAD(&page->buddy_list); set_page_private(page, order); - /* Guard pages are not available for any usage */ - if (!is_migrate_isolate(migratetype)) - __mod_zone_freepage_state(zone, -(1 << order), migratetype); return true; } -void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order, - int migratetype) +void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order) { __ClearPageGuard(page); - set_page_private(page, 0); - if (!is_migrate_isolate(migratetype)) - __mod_zone_freepage_state(zone, (1 << order), migratetype); } diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 65c19025da..e4969fb54d 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -30,6 +30,7 @@ #include <linux/start_kernel.h> #include <linux/sched/mm.h> #include <linux/io.h> +#include <linux/vmalloc.h> #include <asm/cacheflush.h> #include <asm/pgalloc.h> @@ -39,22 +40,7 @@ * Please refer Documentation/mm/arch_pgtable_helpers.rst for the semantics * expectations that are being validated here. All future changes in here * or the documentation need to be in sync. - * - * On s390 platform, the lower 4 bits are used to identify given page table - * entry type. But these bits might affect the ability to clear entries with - * pxx_clear() because of how dynamic page table folding works on s390. So - * while loading up the entries do not change the lower 4 bits. It does not - * have affect any other platform. Also avoid the 62nd bit on ppc64 that is - * used to mark a pte entry. 
*/ -#define S390_SKIP_MASK GENMASK(3, 0) -#if __BITS_PER_LONG == 64 -#define PPC64_SKIP_MASK GENMASK(62, 62) -#else -#define PPC64_SKIP_MASK 0x0 -#endif -#define ARCH_SKIP_MASK (S390_SKIP_MASK | PPC64_SKIP_MASK) -#define RANDOM_ORVALUE (GENMASK(BITS_PER_LONG - 1, 0) & ~ARCH_SKIP_MASK) #define RANDOM_NZVALUE GENMASK(7, 0) struct pgtable_debug_args { @@ -510,8 +496,7 @@ static void __init pud_clear_tests(struct pgtable_debug_args *args) return; pr_debug("Validating PUD clear\n"); - pud = __pud(pud_val(pud) | RANDOM_ORVALUE); - WRITE_ONCE(*args->pudp, pud); + WARN_ON(pud_none(pud)); pud_clear(args->pudp); pud = READ_ONCE(*args->pudp); WARN_ON(!pud_none(pud)); @@ -547,8 +532,7 @@ static void __init p4d_clear_tests(struct pgtable_debug_args *args) return; pr_debug("Validating P4D clear\n"); - p4d = __p4d(p4d_val(p4d) | RANDOM_ORVALUE); - WRITE_ONCE(*args->p4dp, p4d); + WARN_ON(p4d_none(p4d)); p4d_clear(args->p4dp); p4d = READ_ONCE(*args->p4dp); WARN_ON(!p4d_none(p4d)); @@ -581,8 +565,7 @@ static void __init pgd_clear_tests(struct pgtable_debug_args *args) return; pr_debug("Validating PGD clear\n"); - pgd = __pgd(pgd_val(pgd) | RANDOM_ORVALUE); - WRITE_ONCE(*args->pgdp, pgd); + WARN_ON(pgd_none(pgd)); pgd_clear(args->pgdp); pgd = READ_ONCE(*args->pgdp); WARN_ON(!pgd_none(pgd)); @@ -633,10 +616,8 @@ static void __init pte_clear_tests(struct pgtable_debug_args *args) if (WARN_ON(!args->ptep)) return; -#ifndef CONFIG_RISCV - pte = __pte(pte_val(pte) | RANDOM_ORVALUE); -#endif set_pte_at(args->mm, args->vaddr, args->ptep, pte); + WARN_ON(pte_none(pte)); flush_dcache_page(page); barrier(); ptep_clear(args->mm, args->vaddr, args->ptep); @@ -649,8 +630,7 @@ static void __init pmd_clear_tests(struct pgtable_debug_args *args) pmd_t pmd = READ_ONCE(*args->pmdp); pr_debug("Validating PMD clear\n"); - pmd = __pmd(pmd_val(pmd) | RANDOM_ORVALUE); - WRITE_ONCE(*args->pmdp, pmd); + WARN_ON(pmd_none(pmd)); pmd_clear(args->pmdp); pmd = READ_ONCE(*args->pmdp); WARN_ON(!pmd_none(pmd)); @@ 
-981,6 +961,7 @@ static void __init pmd_thp_tests(struct pgtable_debug_args *args) #ifndef __HAVE_ARCH_PMDP_INVALIDATE WARN_ON(!pmd_trans_huge(pmd_mkinvalid(pmd_mkhuge(pmd)))); WARN_ON(!pmd_present(pmd_mkinvalid(pmd_mkhuge(pmd)))); + WARN_ON(!pmd_leaf(pmd_mkinvalid(pmd_mkhuge(pmd)))); #endif /* __HAVE_ARCH_PMDP_INVALIDATE */ } diff --git a/mm/execmem.c b/mm/execmem.c new file mode 100644 index 0000000000..0c4b36bc6d --- /dev/null +++ b/mm/execmem.c @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2002 Richard Henderson + * Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. + * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org> + * Copyright (C) 2024 Mike Rapoport IBM. + */ + +#include <linux/mm.h> +#include <linux/vmalloc.h> +#include <linux/execmem.h> +#include <linux/moduleloader.h> + +static struct execmem_info *execmem_info __ro_after_init; +static struct execmem_info default_execmem_info __ro_after_init; + +static void *__execmem_alloc(struct execmem_range *range, size_t size) +{ + bool kasan = range->flags & EXECMEM_KASAN_SHADOW; + unsigned long vm_flags = VM_FLUSH_RESET_PERMS; + gfp_t gfp_flags = GFP_KERNEL | __GFP_NOWARN; + unsigned long start = range->start; + unsigned long end = range->end; + unsigned int align = range->alignment; + pgprot_t pgprot = range->pgprot; + void *p; + + if (kasan) + vm_flags |= VM_DEFER_KMEMLEAK; + + p = __vmalloc_node_range(size, align, start, end, gfp_flags, + pgprot, vm_flags, NUMA_NO_NODE, + __builtin_return_address(0)); + if (!p && range->fallback_start) { + start = range->fallback_start; + end = range->fallback_end; + p = __vmalloc_node_range(size, align, start, end, gfp_flags, + pgprot, vm_flags, NUMA_NO_NODE, + __builtin_return_address(0)); + } + + if (!p) { + pr_warn_ratelimited("execmem: unable to allocate memory\n"); + return NULL; + } + + if (kasan && (kasan_alloc_module_shadow(p, size, GFP_KERNEL) < 0)) { + vfree(p); + return NULL; + } + + return kasan_reset_tag(p); +} + 
+void *execmem_alloc(enum execmem_type type, size_t size) +{ + struct execmem_range *range = &execmem_info->ranges[type]; + + return __execmem_alloc(range, size); +} + +void execmem_free(void *ptr) +{ + /* + * This memory may be RO, and freeing RO memory in an interrupt is not + * supported by vmalloc. + */ + WARN_ON(in_interrupt()); + vfree(ptr); +} + +static bool execmem_validate(struct execmem_info *info) +{ + struct execmem_range *r = &info->ranges[EXECMEM_DEFAULT]; + + if (!r->alignment || !r->start || !r->end || !pgprot_val(r->pgprot)) { + pr_crit("Invalid parameters for execmem allocator, module loading will fail"); + return false; + } + + return true; +} + +static void execmem_init_missing(struct execmem_info *info) +{ + struct execmem_range *default_range = &info->ranges[EXECMEM_DEFAULT]; + + for (int i = EXECMEM_DEFAULT + 1; i < EXECMEM_TYPE_MAX; i++) { + struct execmem_range *r = &info->ranges[i]; + + if (!r->start) { + if (i == EXECMEM_MODULE_DATA) + r->pgprot = PAGE_KERNEL; + else + r->pgprot = default_range->pgprot; + r->alignment = default_range->alignment; + r->start = default_range->start; + r->end = default_range->end; + r->flags = default_range->flags; + r->fallback_start = default_range->fallback_start; + r->fallback_end = default_range->fallback_end; + } + } +} + +struct execmem_info * __weak execmem_arch_setup(void) +{ + return NULL; +} + +static void __init __execmem_init(void) +{ + struct execmem_info *info = execmem_arch_setup(); + + if (!info) { + info = execmem_info = &default_execmem_info; + info->ranges[EXECMEM_DEFAULT].start = VMALLOC_START; + info->ranges[EXECMEM_DEFAULT].end = VMALLOC_END; + info->ranges[EXECMEM_DEFAULT].pgprot = PAGE_KERNEL_EXEC; + info->ranges[EXECMEM_DEFAULT].alignment = 1; + } + + if (!execmem_validate(info)) + return; + + execmem_init_missing(info); + + execmem_info = info; +} + +#ifdef CONFIG_ARCH_WANTS_EXECMEM_LATE +static int __init execmem_late_init(void) +{ + __execmem_init(); + return 0; +} 
+core_initcall(execmem_late_init); +#else +void __init execmem_init(void) +{ + __execmem_init(); +} +#endif diff --git a/mm/filemap.c b/mm/filemap.c index 196f701665..657bcd887f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -168,7 +168,7 @@ static void filemap_unaccount_folio(struct address_space *mapping, add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); if (mapping_exiting(mapping) && !folio_test_large(folio)) { - int mapcount = page_mapcount(&folio->page); + int mapcount = folio_mapcount(folio); if (folio_ref_count(folio) >= mapcount + 2) { /* @@ -852,23 +852,18 @@ noinline int __filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp) { XA_STATE(xas, &mapping->i_pages, index); - bool huge = folio_test_hugetlb(folio); - bool charged = false; - long nr = 1; + void *alloced_shadow = NULL; + int alloced_order = 0; + bool huge; + long nr; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio); mapping_set_update(&xas, mapping); - if (!huge) { - int error = mem_cgroup_charge(folio, NULL, gfp); - if (error) - return error; - charged = true; - } - VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio); xas_set_order(&xas, index, folio_order(folio)); + huge = folio_test_hugetlb(folio); nr = folio_nr_pages(folio); gfp &= GFP_RECLAIM_MASK; @@ -876,13 +871,10 @@ noinline int __filemap_add_folio(struct address_space *mapping, folio->mapping = mapping; folio->index = xas.xa_index; - do { - unsigned int order = xa_get_order(xas.xa, xas.xa_index); + for (;;) { + int order = -1, split_order = 0; void *entry, *old = NULL; - if (order > folio_order(folio)) - xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index), - order, gfp); xas_lock_irq(&xas); xas_for_each_conflict(&xas, entry) { old = entry; @@ -890,19 +882,33 @@ noinline int __filemap_add_folio(struct address_space *mapping, xas_set_err(&xas, -EEXIST); goto unlock; } + /* + * If a larger entry exists, + * it will be 
the first and only entry iterated. + */ + if (order == -1) + order = xas_get_order(&xas); + } + + /* entry may have changed before we re-acquire the lock */ + if (alloced_order && (old != alloced_shadow || order != alloced_order)) { + xas_destroy(&xas); + alloced_order = 0; } if (old) { - if (shadowp) - *shadowp = old; - /* entry may have been split before we acquired lock */ - order = xa_get_order(xas.xa, xas.xa_index); - if (order > folio_order(folio)) { + if (order > 0 && order > folio_order(folio)) { /* How to handle large swap entries? */ BUG_ON(shmem_mapping(mapping)); + if (!alloced_order) { + split_order = order; + goto unlock; + } xas_split(&xas, old, order); xas_reset(&xas); } + if (shadowp) + *shadowp = old; } xas_store(&xas, folio); @@ -918,9 +924,24 @@ noinline int __filemap_add_folio(struct address_space *mapping, __lruvec_stat_mod_folio(folio, NR_FILE_THPS, nr); } + unlock: xas_unlock_irq(&xas); - } while (xas_nomem(&xas, gfp)); + + /* split needed, alloc here and retry. */ + if (split_order) { + xas_split_alloc(&xas, old, split_order, gfp); + if (xas_error(&xas)) + goto error; + alloced_shadow = old; + alloced_order = split_order; + xas_reset(&xas); + continue; + } + + if (!xas_nomem(&xas, gfp)) + break; + } if (xas_error(&xas)) goto error; @@ -928,8 +949,6 @@ unlock: trace_mm_filemap_add_to_page_cache(folio); return 0; error: - if (charged) - mem_cgroup_uncharge(folio); folio->mapping = NULL; /* Leave page->index set: truncation relies upon it */ folio_put_refs(folio, nr); @@ -943,11 +962,16 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio, void *shadow = NULL; int ret; + ret = mem_cgroup_charge(folio, NULL, gfp); + if (ret) + return ret; + __folio_set_locked(folio); ret = __filemap_add_folio(mapping, folio, index, gfp, &shadow); - if (unlikely(ret)) + if (unlikely(ret)) { + mem_cgroup_uncharge(folio); __folio_clear_locked(folio); - else { + } else { /* * The folio might have been evicted from cache only * recently, in 
which case it should be activated like @@ -966,7 +990,7 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio, EXPORT_SYMBOL_GPL(filemap_add_folio); #ifdef CONFIG_NUMA -struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order) +struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order) { int n; struct folio *folio; @@ -976,14 +1000,14 @@ struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order) do { cpuset_mems_cookie = read_mems_allowed_begin(); n = cpuset_mem_spread_node(); - folio = __folio_alloc_node(gfp, order, n); + folio = __folio_alloc_node_noprof(gfp, order, n); } while (!folio && read_mems_allowed_retry(cpuset_mems_cookie)); return folio; } - return folio_alloc(gfp, order); + return folio_alloc_noprof(gfp, order); } -EXPORT_SYMBOL(filemap_alloc_folio); +EXPORT_SYMBOL(filemap_alloc_folio_noprof); #endif /* @@ -1540,7 +1564,7 @@ EXPORT_SYMBOL(folio_end_private_2); * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio. * @folio: The folio to wait on. * - * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio. + * Wait for PG_private_2 to be cleared on a folio. */ void folio_wait_private_2(struct folio *folio) { @@ -1553,8 +1577,8 @@ EXPORT_SYMBOL(folio_wait_private_2); * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio. * @folio: The folio to wait on. * - * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio or until a - * fatal signal is received by the calling task. + * Wait for PG_private_2 to be cleared on a folio or until a fatal signal is + * received by the calling task. * * Return: * - 0 if successful. @@ -1786,7 +1810,7 @@ EXPORT_SYMBOL(page_cache_prev_miss); * C. 
Return the page to the page allocator * * This means that any page may have its reference count temporarily - * increased by a speculative page cache (or fast GUP) lookup as it can + * increased by a speculative page cache (or GUP-fast) lookup as it can * be allocated by another user before the RCU grace period expires. * Because the refcount temporarily acquired here may end up being the * last refcount on the page, any page allocation must be freeable by @@ -3482,7 +3506,7 @@ skip: static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, struct folio *folio, unsigned long start, unsigned long addr, unsigned int nr_pages, - unsigned int *mmap_miss) + unsigned long *rss, unsigned int *mmap_miss) { vm_fault_t ret = 0; struct page *page = folio_page(folio, start); @@ -3493,7 +3517,15 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, if (PageHWPoison(page + count)) goto skip; - (*mmap_miss)++; + /* + * If there are too many folios that are recently evicted + * in a file, they will probably continue to be evicted. + * In such situation, read-ahead is only a waste of IO. + * Don't decrease mmap_miss in this scenario to make sure + * we can stop read-ahead. 
+ */ + if (!folio_test_workingset(folio)) + (*mmap_miss)++; /* * NOTE: If there're PTE markers, we'll leave them to be @@ -3508,6 +3540,7 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, skip: if (count) { set_pte_range(vmf, folio, page, count, addr); + *rss += count; folio_ref_add(folio, count); if (in_range(vmf->address, addr, count * PAGE_SIZE)) ret = VM_FAULT_NOPAGE; @@ -3522,6 +3555,7 @@ skip: if (count) { set_pte_range(vmf, folio, page, count, addr); + *rss += count; folio_ref_add(folio, count); if (in_range(vmf->address, addr, count * PAGE_SIZE)) ret = VM_FAULT_NOPAGE; @@ -3534,7 +3568,7 @@ skip: static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf, struct folio *folio, unsigned long addr, - unsigned int *mmap_miss) + unsigned long *rss, unsigned int *mmap_miss) { vm_fault_t ret = 0; struct page *page = &folio->page; @@ -3542,7 +3576,9 @@ static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf, if (PageHWPoison(page)) return ret; - (*mmap_miss)++; + /* See comment of filemap_map_folio_range() */ + if (!folio_test_workingset(folio)) + (*mmap_miss)++; /* * NOTE: If there're PTE markers, we'll leave them to be @@ -3556,6 +3592,7 @@ static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf, ret = VM_FAULT_NOPAGE; set_pte_range(vmf, folio, page, 1, addr); + (*rss)++; folio_ref_inc(folio); return ret; @@ -3572,7 +3609,8 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, XA_STATE(xas, &mapping->i_pages, start_pgoff); struct folio *folio; vm_fault_t ret = 0; - unsigned int nr_pages = 0, mmap_miss = 0, mmap_miss_saved; + unsigned long rss = 0; + unsigned int nr_pages = 0, mmap_miss = 0, mmap_miss_saved, folio_type; rcu_read_lock(); folio = next_uptodate_folio(&xas, mapping, end_pgoff); @@ -3591,6 +3629,8 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, folio_put(folio); goto out; } + + folio_type = mm_counter_file(folio); do { unsigned long end; @@ -3602,15 +3642,16 @@ vm_fault_t filemap_map_pages(struct vm_fault 
*vmf, if (!folio_test_large(folio)) ret |= filemap_map_order0_folio(vmf, - folio, addr, &mmap_miss); + folio, addr, &rss, &mmap_miss); else ret |= filemap_map_folio_range(vmf, folio, xas.xa_index - folio->index, addr, - nr_pages, &mmap_miss); + nr_pages, &rss, &mmap_miss); folio_unlock(folio); folio_put(folio); } while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL); + add_mm_counter(vma->vm_mm, folio_type, rss); pte_unmap_unlock(vmf->pte, vmf->ptl); out: rcu_read_unlock(); @@ -4135,6 +4176,60 @@ bool filemap_release_folio(struct folio *folio, gfp_t gfp) } EXPORT_SYMBOL(filemap_release_folio); +/** + * filemap_invalidate_inode - Invalidate/forcibly write back a range of an inode's pagecache + * @inode: The inode to flush + * @flush: Set to write back rather than simply invalidate. + * @start: First byte to in range. + * @end: Last byte in range (inclusive), or LLONG_MAX for everything from start + * onwards. + * + * Invalidate all the folios on an inode that contribute to the specified + * range, possibly writing them back first. Whilst the operation is + * undertaken, the invalidate lock is held to prevent new folios from being + * installed. + */ +int filemap_invalidate_inode(struct inode *inode, bool flush, + loff_t start, loff_t end) +{ + struct address_space *mapping = inode->i_mapping; + pgoff_t first = start >> PAGE_SHIFT; + pgoff_t last = end >> PAGE_SHIFT; + pgoff_t nr = end == LLONG_MAX ? ULONG_MAX : last - first + 1; + + if (!mapping || !mapping->nrpages || end < start) + goto out; + + /* Prevent new folios from being added to the inode. */ + filemap_invalidate_lock(mapping); + + if (!mapping->nrpages) + goto unlock; + + unmap_mapping_pages(mapping, first, nr, false); + + /* Write back the data if we're asked to. 
*/ + if (flush) { + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, + .range_start = start, + .range_end = end, + }; + + filemap_fdatawrite_wbc(mapping, &wbc); + } + + /* Wait for writeback to complete on all folios and discard. */ + truncate_inode_pages_range(mapping, start, end); + +unlock: + filemap_invalidate_unlock(mapping); +out: + return filemap_check_errors(mapping); +} +EXPORT_SYMBOL_GPL(filemap_invalidate_inode); + #ifdef CONFIG_CACHESTAT_SYSCALL /** * filemap_cachestat() - compute the page cache statistics of a mapping diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 50412014f1..f31e0ce65b 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -58,12 +58,6 @@ bool set_page_dirty(struct page *page) } EXPORT_SYMBOL(set_page_dirty); -int __set_page_dirty_nobuffers(struct page *page) -{ - return filemap_dirty_folio(page_mapping(page), page_folio(page)); -} -EXPORT_SYMBOL(__set_page_dirty_nobuffers); - bool clear_page_dirty_for_io(struct page *page) { return folio_clear_dirty_for_io(page_folio(page)); @@ -89,7 +89,7 @@ retry: * belongs to this folio. */ if (unlikely(page_folio(page) != folio)) { - if (!put_devmap_managed_page_refs(&folio->page, refs)) + if (!put_devmap_managed_folio_refs(folio, refs)) folio_put_refs(folio, refs); goto retry; } @@ -97,95 +97,6 @@ retry: return folio; } -/** - * try_grab_folio() - Attempt to get or pin a folio. - * @page: pointer to page to be grabbed - * @refs: the value to (effectively) add to the folio's refcount - * @flags: gup flags: these are the FOLL_* flag values. - * - * "grab" names in this file mean, "look at flags to decide whether to use - * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount. - * - * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the - * same time. (That's true throughout the get_user_pages*() and - * pin_user_pages*() APIs.) Cases: - * - * FOLL_GET: folio's refcount will be incremented by @refs. 
- * - * FOLL_PIN on large folios: folio's refcount will be incremented by - * @refs, and its pincount will be incremented by @refs. - * - * FOLL_PIN on single-page folios: folio's refcount will be incremented by - * @refs * GUP_PIN_COUNTING_BIAS. - * - * Return: The folio containing @page (with refcount appropriately - * incremented) for success, or NULL upon failure. If neither FOLL_GET - * nor FOLL_PIN was set, that's considered failure, and furthermore, - * a likely bug in the caller, so a warning is also emitted. - */ -struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags) -{ - struct folio *folio; - - if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0)) - return NULL; - - if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page))) - return NULL; - - if (flags & FOLL_GET) - return try_get_folio(page, refs); - - /* FOLL_PIN is set */ - - /* - * Don't take a pin on the zero page - it's not going anywhere - * and it is used in a *lot* of places. - */ - if (is_zero_page(page)) - return page_folio(page); - - folio = try_get_folio(page, refs); - if (!folio) - return NULL; - - /* - * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a - * right zone, so fail and let the caller fall back to the slow - * path. - */ - if (unlikely((flags & FOLL_LONGTERM) && - !folio_is_longterm_pinnable(folio))) { - if (!put_devmap_managed_page_refs(&folio->page, refs)) - folio_put_refs(folio, refs); - return NULL; - } - - /* - * When pinning a large folio, use an exact count to track it. - * - * However, be sure to *also* increment the normal folio - * refcount field at least once, so that the folio really - * is pinned. That's why the refcount from the earlier - * try_get_folio() is left intact. - */ - if (folio_test_large(folio)) - atomic_add(refs, &folio->_pincount); - else - folio_ref_add(folio, - refs * (GUP_PIN_COUNTING_BIAS - 1)); - /* - * Adjust the pincount before re-checking the PTE for changes. 
- * This is essentially a smp_mb() and is paired with a memory - * barrier in folio_try_share_anon_rmap_*(). - */ - smp_mb__after_atomic(); - - node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs); - - return folio; -} - static void gup_put_folio(struct folio *folio, int refs, unsigned int flags) { if (flags & FOLL_PIN) { @@ -198,63 +109,64 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags) refs *= GUP_PIN_COUNTING_BIAS; } - if (!put_devmap_managed_page_refs(&folio->page, refs)) + if (!put_devmap_managed_folio_refs(folio, refs)) folio_put_refs(folio, refs); } /** - * try_grab_page() - elevate a page's refcount by a flag-dependent amount - * @page: pointer to page to be grabbed - * @flags: gup flags: these are the FOLL_* flag values. + * try_grab_folio() - add a folio's refcount by a flag-dependent amount + * @folio: pointer to folio to be grabbed + * @refs: the value to (effectively) add to the folio's refcount + * @flags: gup flags: these are the FOLL_* flag values * * This might not do anything at all, depending on the flags argument. * * "grab" names in this file mean, "look at flags to decide whether to use - * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount. + * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount. * * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same - * time. Cases: please see the try_grab_folio() documentation, with - * "refs=1". + * time. * * Return: 0 for success, or if no action was required (if neither FOLL_PIN * nor FOLL_GET was set, nothing is done). A negative error code for failure: * - * -ENOMEM FOLL_GET or FOLL_PIN was set, but the page could not + * -ENOMEM FOLL_GET or FOLL_PIN was set, but the folio could not * be grabbed. + * + * It is called when we have a stable reference for the folio, typically in + * GUP slow path. 
*/ -int __must_check try_grab_page(struct page *page, unsigned int flags) +int __must_check try_grab_folio(struct folio *folio, int refs, + unsigned int flags) { - struct folio *folio = page_folio(page); - if (WARN_ON_ONCE(folio_ref_count(folio) <= 0)) return -ENOMEM; - if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page))) + if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(&folio->page))) return -EREMOTEIO; if (flags & FOLL_GET) - folio_ref_inc(folio); + folio_ref_add(folio, refs); else if (flags & FOLL_PIN) { /* * Don't take a pin on the zero page - it's not going anywhere * and it is used in a *lot* of places. */ - if (is_zero_page(page)) + if (is_zero_folio(folio)) return 0; /* - * Similar to try_grab_folio(): be sure to *also* - * increment the normal page refcount field at least once, + * Increment the normal page refcount field at least once, * so that the page really is pinned. */ if (folio_test_large(folio)) { - folio_ref_add(folio, 1); - atomic_add(1, &folio->_pincount); + folio_ref_add(folio, refs); + atomic_add(refs, &folio->_pincount); } else { - folio_ref_add(folio, GUP_PIN_COUNTING_BIAS); + folio_ref_add(folio, refs * GUP_PIN_COUNTING_BIAS); } - node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, 1); + node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs); } return 0; @@ -440,7 +352,7 @@ void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, } EXPORT_SYMBOL(unpin_user_page_range_dirty_lock); -static void unpin_user_pages_lockless(struct page **pages, unsigned long npages) +static void gup_fast_unpin_user_pages(struct page **pages, unsigned long npages) { unsigned long i; struct folio *folio; @@ -500,22 +412,446 @@ static inline void mm_set_has_pinned_flag(unsigned long *mm_flags) } #ifdef CONFIG_MMU + +#if defined(CONFIG_ARCH_HAS_HUGEPD) || defined(CONFIG_HAVE_GUP_FAST) +static int record_subpages(struct page *page, unsigned long sz, + unsigned long addr, unsigned long end, + struct page **pages) +{ + 
struct page *start_page; + int nr; + + start_page = nth_page(page, (addr & (sz - 1)) >> PAGE_SHIFT); + for (nr = 0; addr != end; nr++, addr += PAGE_SIZE) + pages[nr] = nth_page(start_page, nr); + + return nr; +} + +/** + * try_grab_folio_fast() - Attempt to get or pin a folio in fast path. + * @page: pointer to page to be grabbed + * @refs: the value to (effectively) add to the folio's refcount + * @flags: gup flags: these are the FOLL_* flag values. + * + * "grab" names in this file mean, "look at flags to decide whether to use + * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount. + * + * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the + * same time. (That's true throughout the get_user_pages*() and + * pin_user_pages*() APIs.) Cases: + * + * FOLL_GET: folio's refcount will be incremented by @refs. + * + * FOLL_PIN on large folios: folio's refcount will be incremented by + * @refs, and its pincount will be incremented by @refs. + * + * FOLL_PIN on single-page folios: folio's refcount will be incremented by + * @refs * GUP_PIN_COUNTING_BIAS. + * + * Return: The folio containing @page (with refcount appropriately + * incremented) for success, or NULL upon failure. If neither FOLL_GET + * nor FOLL_PIN was set, that's considered failure, and furthermore, + * a likely bug in the caller, so a warning is also emitted. + * + * It uses add ref unless zero to elevate the folio refcount and must be called + * in fast path only. 
+ */ +static struct folio *try_grab_folio_fast(struct page *page, int refs, + unsigned int flags) +{ + struct folio *folio; + + /* Raise warn if it is not called in fast GUP */ + VM_WARN_ON_ONCE(!irqs_disabled()); + + if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0)) + return NULL; + + if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page))) + return NULL; + + if (flags & FOLL_GET) + return try_get_folio(page, refs); + + /* FOLL_PIN is set */ + + /* + * Don't take a pin on the zero page - it's not going anywhere + * and it is used in a *lot* of places. + */ + if (is_zero_page(page)) + return page_folio(page); + + folio = try_get_folio(page, refs); + if (!folio) + return NULL; + + /* + * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a + * right zone, so fail and let the caller fall back to the slow + * path. + */ + if (unlikely((flags & FOLL_LONGTERM) && + !folio_is_longterm_pinnable(folio))) { + if (!put_devmap_managed_folio_refs(folio, refs)) + folio_put_refs(folio, refs); + return NULL; + } + + /* + * When pinning a large folio, use an exact count to track it. + * + * However, be sure to *also* increment the normal folio + * refcount field at least once, so that the folio really + * is pinned. That's why the refcount from the earlier + * try_get_folio() is left intact. + */ + if (folio_test_large(folio)) + atomic_add(refs, &folio->_pincount); + else + folio_ref_add(folio, + refs * (GUP_PIN_COUNTING_BIAS - 1)); + /* + * Adjust the pincount before re-checking the PTE for changes. + * This is essentially a smp_mb() and is paired with a memory + * barrier in folio_try_share_anon_rmap_*(). 
+ */ + smp_mb__after_atomic(); + + node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs); + + return folio; +} +#endif /* CONFIG_ARCH_HAS_HUGEPD || CONFIG_HAVE_GUP_FAST */ + +#ifdef CONFIG_ARCH_HAS_HUGEPD +static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, + unsigned long sz) +{ + unsigned long __boundary = (addr + sz) & ~(sz-1); + return (__boundary - 1 < end - 1) ? __boundary : end; +} + +/* + * Returns 1 if succeeded, 0 if failed, -EMLINK if unshare needed. + * + * NOTE: for the same entry, gup-fast and gup-slow can return different + * results (0 v.s. -EMLINK) depending on whether vma is available. This is + * the expected behavior, where we simply want gup-fast to fallback to + * gup-slow to take the vma reference first. + */ +static int gup_hugepte(struct vm_area_struct *vma, pte_t *ptep, unsigned long sz, + unsigned long addr, unsigned long end, unsigned int flags, + struct page **pages, int *nr, bool fast) +{ + unsigned long pte_end; + struct page *page; + struct folio *folio; + pte_t pte; + int refs; + + pte_end = (addr + sz) & ~(sz-1); + if (pte_end < end) + end = pte_end; + + pte = huge_ptep_get(ptep); + + if (!pte_access_permitted(pte, flags & FOLL_WRITE)) + return 0; + + /* hugepages are never "special" */ + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + + page = pte_page(pte); + refs = record_subpages(page, sz, addr, end, pages + *nr); + + if (fast) { + folio = try_grab_folio_fast(page, refs, flags); + if (!folio) + return 0; + } else { + folio = page_folio(page); + if (try_grab_folio(folio, refs, flags)) + return 0; + } + + if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) { + gup_put_folio(folio, refs, flags); + return 0; + } + + if (!pte_write(pte) && gup_must_unshare(vma, flags, &folio->page)) { + gup_put_folio(folio, refs, flags); + return -EMLINK; + } + + *nr += refs; + folio_set_referenced(folio); + return 1; +} + +/* + * NOTE: currently GUP for a hugepd is only possible on hugetlbfs file + * systems on Power, which does 
not have issue with folio writeback against + * GUP updates. When hugepd will be extended to support non-hugetlbfs or + * even anonymous memory, we need to do extra check as what we do with most + * of the other folios. See writable_file_mapping_allowed() and + * gup_fast_folio_allowed() for more information. + */ +static int gup_hugepd(struct vm_area_struct *vma, hugepd_t hugepd, + unsigned long addr, unsigned int pdshift, + unsigned long end, unsigned int flags, + struct page **pages, int *nr, bool fast) +{ + pte_t *ptep; + unsigned long sz = 1UL << hugepd_shift(hugepd); + unsigned long next; + int ret; + + ptep = hugepte_offset(hugepd, addr, pdshift); + do { + next = hugepte_addr_end(addr, end, sz); + ret = gup_hugepte(vma, ptep, sz, addr, end, flags, pages, nr, + fast); + if (ret != 1) + return ret; + } while (ptep++, addr = next, addr != end); + + return 1; +} + +static struct page *follow_hugepd(struct vm_area_struct *vma, hugepd_t hugepd, + unsigned long addr, unsigned int pdshift, + unsigned int flags, + struct follow_page_context *ctx) +{ + struct page *page; + struct hstate *h; + spinlock_t *ptl; + int nr = 0, ret; + pte_t *ptep; + + /* Only hugetlb supports hugepd */ + if (WARN_ON_ONCE(!is_vm_hugetlb_page(vma))) + return ERR_PTR(-EFAULT); + + h = hstate_vma(vma); + ptep = hugepte_offset(hugepd, addr, pdshift); + ptl = huge_pte_lock(h, vma->vm_mm, ptep); + ret = gup_hugepd(vma, hugepd, addr, pdshift, addr + PAGE_SIZE, + flags, &page, &nr, false); + spin_unlock(ptl); + + if (ret == 1) { + /* GUP succeeded */ + WARN_ON_ONCE(nr != 1); + ctx->page_mask = (1U << huge_page_order(h)) - 1; + return page; + } + + /* ret can be either 0 (translates to NULL) or negative */ + return ERR_PTR(ret); +} +#else /* CONFIG_ARCH_HAS_HUGEPD */ +static inline int gup_hugepd(struct vm_area_struct *vma, hugepd_t hugepd, + unsigned long addr, unsigned int pdshift, + unsigned long end, unsigned int flags, + struct page **pages, int *nr, bool fast) +{ + return 0; +} + +static 
struct page *follow_hugepd(struct vm_area_struct *vma, hugepd_t hugepd, + unsigned long addr, unsigned int pdshift, + unsigned int flags, + struct follow_page_context *ctx) +{ + return NULL; +} +#endif /* CONFIG_ARCH_HAS_HUGEPD */ + + static struct page *no_page_table(struct vm_area_struct *vma, - unsigned int flags) + unsigned int flags, unsigned long address) { + if (!(flags & FOLL_DUMP)) + return NULL; + /* - * When core dumping an enormous anonymous area that nobody - * has touched so far, we don't want to allocate unnecessary pages or + * When core dumping, we don't want to allocate unnecessary pages or * page tables. Return error instead of NULL to skip handle_mm_fault, * then get_dump_page() will return NULL to leave a hole in the dump. * But we can only make this optimization where a hole would surely * be zero-filled if handle_mm_fault() actually did handle it. */ - if ((flags & FOLL_DUMP) && - (vma_is_anonymous(vma) || !vma->vm_ops->fault)) + if (is_vm_hugetlb_page(vma)) { + struct hstate *h = hstate_vma(vma); + + if (!hugetlbfs_pagecache_present(h, vma, address)) + return ERR_PTR(-EFAULT); + } else if ((vma_is_anonymous(vma) || !vma->vm_ops->fault)) { return ERR_PTR(-EFAULT); + } + + return NULL; +} + +#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES +static struct page *follow_huge_pud(struct vm_area_struct *vma, + unsigned long addr, pud_t *pudp, + int flags, struct follow_page_context *ctx) +{ + struct mm_struct *mm = vma->vm_mm; + struct page *page; + pud_t pud = *pudp; + unsigned long pfn = pud_pfn(pud); + int ret; + + assert_spin_locked(pud_lockptr(mm, pudp)); + + if ((flags & FOLL_WRITE) && !pud_write(pud)) + return NULL; + + if (!pud_present(pud)) + return NULL; + + pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT; + + if (IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) && + pud_devmap(pud)) { + /* + * device mapped pages can only be returned if the caller + * will manage the page reference count. 
+ * + * At least one of FOLL_GET | FOLL_PIN must be set, so + * assert that here: + */ + if (!(flags & (FOLL_GET | FOLL_PIN))) + return ERR_PTR(-EEXIST); + + if (flags & FOLL_TOUCH) + touch_pud(vma, addr, pudp, flags & FOLL_WRITE); + + ctx->pgmap = get_dev_pagemap(pfn, ctx->pgmap); + if (!ctx->pgmap) + return ERR_PTR(-EFAULT); + } + + page = pfn_to_page(pfn); + + if (!pud_devmap(pud) && !pud_write(pud) && + gup_must_unshare(vma, flags, page)) + return ERR_PTR(-EMLINK); + + ret = try_grab_folio(page_folio(page), 1, flags); + if (ret) + page = ERR_PTR(ret); + else + ctx->page_mask = HPAGE_PUD_NR - 1; + + return page; +} + +/* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */ +static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page, + struct vm_area_struct *vma, + unsigned int flags) +{ + /* If the pmd is writable, we can write to the page. */ + if (pmd_write(pmd)) + return true; + + /* Maybe FOLL_FORCE is set to override it? */ + if (!(flags & FOLL_FORCE)) + return false; + + /* But FOLL_FORCE has no effect on shared mappings */ + if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) + return false; + + /* ... or read-only private ones */ + if (!(vma->vm_flags & VM_MAYWRITE)) + return false; + + /* ... or already writable ones that just need to take a write fault */ + if (vma->vm_flags & VM_WRITE) + return false; + + /* + * See can_change_pte_writable(): we broke COW and could map the page + * writable if we have an exclusive anonymous page ... + */ + if (!page || !PageAnon(page) || !PageAnonExclusive(page)) + return false; + + /* ... and a write-fault isn't required for other reasons. 
*/ + if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd)) + return false; + return !userfaultfd_huge_pmd_wp(vma, pmd); +} + +static struct page *follow_huge_pmd(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmd, + unsigned int flags, + struct follow_page_context *ctx) +{ + struct mm_struct *mm = vma->vm_mm; + pmd_t pmdval = *pmd; + struct page *page; + int ret; + + assert_spin_locked(pmd_lockptr(mm, pmd)); + + page = pmd_page(pmdval); + if ((flags & FOLL_WRITE) && + !can_follow_write_pmd(pmdval, page, vma, flags)) + return NULL; + + /* Avoid dumping huge zero page */ + if ((flags & FOLL_DUMP) && is_huge_zero_pmd(pmdval)) + return ERR_PTR(-EFAULT); + + if (pmd_protnone(*pmd) && !gup_can_follow_protnone(vma, flags)) + return NULL; + + if (!pmd_write(pmdval) && gup_must_unshare(vma, flags, page)) + return ERR_PTR(-EMLINK); + + VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && + !PageAnonExclusive(page), page); + + ret = try_grab_folio(page_folio(page), 1, flags); + if (ret) + return ERR_PTR(ret); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (pmd_trans_huge(pmdval) && (flags & FOLL_TOUCH)) + touch_pmd(vma, addr, pmd, flags & FOLL_WRITE); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + + page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; + ctx->page_mask = HPAGE_PMD_NR - 1; + + return page; +} + +#else /* CONFIG_PGTABLE_HAS_HUGE_LEAVES */ +static struct page *follow_huge_pud(struct vm_area_struct *vma, + unsigned long addr, pud_t *pudp, + int flags, struct follow_page_context *ctx) +{ + return NULL; +} + +static struct page *follow_huge_pmd(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmd, + unsigned int flags, + struct follow_page_context *ctx) +{ return NULL; } +#endif /* CONFIG_PGTABLE_HAS_HUGE_LEAVES */ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, pte_t *pte, unsigned int flags) @@ -593,7 +929,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, ptep = pte_offset_map_lock(mm, pmd, address, &ptl); 
if (!ptep) - return no_page_table(vma, flags); + return no_page_table(vma, flags, address); pte = ptep_get(ptep); if (!pte_present(pte)) goto no_page; @@ -647,8 +983,8 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && !PageAnonExclusive(page), page); - /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */ - ret = try_grab_page(page, flags); + /* try_grab_folio() does nothing unless FOLL_GET or FOLL_PIN is set. */ + ret = try_grab_folio(page_folio(page), 1, flags); if (unlikely(ret)) { page = ERR_PTR(ret); goto out; @@ -685,7 +1021,7 @@ no_page: pte_unmap_unlock(ptep, ptl); if (!pte_none(pte)) return NULL; - return no_page_table(vma, flags); + return no_page_table(vma, flags, address); } static struct page *follow_pmd_mask(struct vm_area_struct *vma, @@ -701,42 +1037,45 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, pmd = pmd_offset(pudp, address); pmdval = pmdp_get_lockless(pmd); if (pmd_none(pmdval)) - return no_page_table(vma, flags); + return no_page_table(vma, flags, address); if (!pmd_present(pmdval)) - return no_page_table(vma, flags); + return no_page_table(vma, flags, address); + if (unlikely(is_hugepd(__hugepd(pmd_val(pmdval))))) + return follow_hugepd(vma, __hugepd(pmd_val(pmdval)), + address, PMD_SHIFT, flags, ctx); if (pmd_devmap(pmdval)) { ptl = pmd_lock(mm, pmd); page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap); spin_unlock(ptl); if (page) return page; - return no_page_table(vma, flags); + return no_page_table(vma, flags, address); } - if (likely(!pmd_trans_huge(pmdval))) + if (likely(!pmd_leaf(pmdval))) return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); if (pmd_protnone(pmdval) && !gup_can_follow_protnone(vma, flags)) - return no_page_table(vma, flags); + return no_page_table(vma, flags, address); ptl = pmd_lock(mm, pmd); - if (unlikely(!pmd_present(*pmd))) { + pmdval = *pmd; + if (unlikely(!pmd_present(pmdval))) { 
spin_unlock(ptl); - return no_page_table(vma, flags); + return no_page_table(vma, flags, address); } - if (unlikely(!pmd_trans_huge(*pmd))) { + if (unlikely(!pmd_leaf(pmdval))) { spin_unlock(ptl); return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); } - if (flags & FOLL_SPLIT_PMD) { + if (pmd_trans_huge(pmdval) && (flags & FOLL_SPLIT_PMD)) { spin_unlock(ptl); split_huge_pmd(vma, pmd, address); /* If pmd was left empty, stuff a page table in there quickly */ return pte_alloc(mm, pmd) ? ERR_PTR(-ENOMEM) : follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); } - page = follow_trans_huge_pmd(vma, address, pmd, flags); + page = follow_huge_pmd(vma, address, pmd, flags, ctx); spin_unlock(ptl); - ctx->page_mask = HPAGE_PMD_NR - 1; return page; } @@ -745,26 +1084,30 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma, unsigned int flags, struct follow_page_context *ctx) { - pud_t *pud; + pud_t *pudp, pud; spinlock_t *ptl; struct page *page; struct mm_struct *mm = vma->vm_mm; - pud = pud_offset(p4dp, address); - if (pud_none(*pud)) - return no_page_table(vma, flags); - if (pud_devmap(*pud)) { - ptl = pud_lock(mm, pud); - page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap); + pudp = pud_offset(p4dp, address); + pud = READ_ONCE(*pudp); + if (!pud_present(pud)) + return no_page_table(vma, flags, address); + if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) + return follow_hugepd(vma, __hugepd(pud_val(pud)), + address, PUD_SHIFT, flags, ctx); + if (pud_leaf(pud)) { + ptl = pud_lock(mm, pudp); + page = follow_huge_pud(vma, address, pudp, flags, ctx); spin_unlock(ptl); if (page) return page; - return no_page_table(vma, flags); + return no_page_table(vma, flags, address); } - if (unlikely(pud_bad(*pud))) - return no_page_table(vma, flags); + if (unlikely(pud_bad(pud))) + return no_page_table(vma, flags, address); - return follow_pmd_mask(vma, address, pud, flags, ctx); + return follow_pmd_mask(vma, address, pudp, flags, ctx); } static struct 
page *follow_p4d_mask(struct vm_area_struct *vma, @@ -772,16 +1115,20 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, unsigned int flags, struct follow_page_context *ctx) { - p4d_t *p4d; + p4d_t *p4dp, p4d; - p4d = p4d_offset(pgdp, address); - if (p4d_none(*p4d)) - return no_page_table(vma, flags); - BUILD_BUG_ON(p4d_huge(*p4d)); - if (unlikely(p4d_bad(*p4d))) - return no_page_table(vma, flags); + p4dp = p4d_offset(pgdp, address); + p4d = READ_ONCE(*p4dp); + BUILD_BUG_ON(p4d_leaf(p4d)); + + if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) + return follow_hugepd(vma, __hugepd(p4d_val(p4d)), + address, P4D_SHIFT, flags, ctx); - return follow_pud_mask(vma, address, p4d, flags, ctx); + if (!p4d_present(p4d) || p4d_bad(p4d)) + return no_page_table(vma, flags, address); + + return follow_pud_mask(vma, address, p4dp, flags, ctx); } /** @@ -814,24 +1161,24 @@ static struct page *follow_page_mask(struct vm_area_struct *vma, { pgd_t *pgd; struct mm_struct *mm = vma->vm_mm; + struct page *page; - ctx->page_mask = 0; - - /* - * Call hugetlb_follow_page_mask for hugetlb vmas as it will use - * special hugetlb page table walking code. This eliminates the - * need to check for hugetlb entries in the general walking code. 
- */ - if (is_vm_hugetlb_page(vma)) - return hugetlb_follow_page_mask(vma, address, flags, - &ctx->page_mask); + vma_pgtable_walk_begin(vma); + ctx->page_mask = 0; pgd = pgd_offset(mm, address); - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - return no_page_table(vma, flags); + if (unlikely(is_hugepd(__hugepd(pgd_val(*pgd))))) + page = follow_hugepd(vma, __hugepd(pgd_val(*pgd)), + address, PGDIR_SHIFT, flags, ctx); + else if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + page = no_page_table(vma, flags, address); + else + page = follow_p4d_mask(vma, address, pgd, flags, ctx); + + vma_pgtable_walk_end(vma); - return follow_p4d_mask(vma, address, pgd, flags, ctx); + return page; } struct page *follow_page(struct vm_area_struct *vma, unsigned long address, @@ -901,7 +1248,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, goto unmap; *page = pte_page(entry); } - ret = try_grab_page(*page, gup_flags); + ret = try_grab_folio(page_folio(*page), 1, gup_flags); if (unlikely(ret)) goto unmap; out: @@ -1304,20 +1651,19 @@ next_page: * pages. */ if (page_increm > 1) { - struct folio *folio; + struct folio *folio = page_folio(page); /* * Since we already hold refcount on the * large folio, this should never fail. */ - folio = try_grab_folio(page, page_increm - 1, - foll_flags); - if (WARN_ON_ONCE(!folio)) { + if (try_grab_folio(folio, page_increm - 1, + foll_flags)) { /* * Release the 1st page ref if the * folio is problematic, fail hard. 
*/ - gup_put_folio(page_folio(page), 1, + gup_put_folio(folio, 1, foll_flags); ret = -EFAULT; goto out; @@ -2144,6 +2490,7 @@ static int migrate_longterm_unpinnable_pages( struct migration_target_control mtc = { .nid = NUMA_NO_NODE, .gfp_mask = GFP_USER | __GFP_NOWARN, + .reason = MR_LONGTERM_PIN, }; if (migrate_pages(movable_page_list, alloc_migration_target, @@ -2431,7 +2778,7 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, EXPORT_SYMBOL(get_user_pages_unlocked); /* - * Fast GUP + * GUP-fast * * get_user_pages_fast attempts to pin user pages by walking the page * tables directly and avoids taking locks. Thus the walker needs to be @@ -2445,7 +2792,7 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * * Another way to achieve this is to batch up page table containing pages * belonging to more than one mm_user, then rcu_sched a callback to free those - * pages. Disabling interrupts will allow the fast_gup walker to both block + * pages. Disabling interrupts will allow the gup_fast() walker to both block * the rcu_sched callback, and an IPI that we broadcast for splitting THPs * (which is a relatively rare event). The code below adopts this strategy. * @@ -2463,15 +2810,16 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * * This code is based heavily on the PowerPC implementation by Nick Piggin. */ -#ifdef CONFIG_HAVE_FAST_GUP - +#ifdef CONFIG_HAVE_GUP_FAST /* - * Used in the GUP-fast path to determine whether a pin is permitted for a - * specific folio. + * Used in the GUP-fast path to determine whether GUP is permitted to work on + * a specific folio. * * This call assumes the caller has pinned the folio, that the lowest page table * level still points to this folio, and that interrupts have been disabled. * + * GUP-fast must reject all secretmem folios. + * * Writing to pinned file-backed dirty tracked folios is inherently problematic * (see comment describing the writable_file_mapping_allowed() function). 
We * therefore try to avoid the most egregious case of a long-term mapping doing @@ -2481,25 +2829,34 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * in the fast path, so instead we whitelist known good cases and if in doubt, * fall back to the slow path. */ -static bool folio_fast_pin_allowed(struct folio *folio, unsigned int flags) +static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags) { + bool reject_file_backed = false; struct address_space *mapping; + bool check_secretmem = false; unsigned long mapping_flags; /* * If we aren't pinning then no problematic write can occur. A long term * pin is the most egregious case so this is the one we disallow. */ - if ((flags & (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) != + if ((flags & (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) == (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) - return true; + reject_file_backed = true; + + /* We hold a folio reference, so we can safely access folio fields. */ - /* The folio is pinned, so we can safely access folio fields. */ + /* secretmem folios are always order-0 folios. */ + if (IS_ENABLED(CONFIG_SECRETMEM) && !folio_test_large(folio)) + check_secretmem = true; + + if (!reject_file_backed && !check_secretmem) + return true; if (WARN_ON_ONCE(folio_test_slab(folio))) return false; - /* hugetlb mappings do not require dirty-tracking. */ + /* hugetlb neither requires dirty-tracking nor can be secretmem. */ if (folio_test_hugetlb(folio)) return true; @@ -2535,50 +2892,48 @@ static bool folio_fast_pin_allowed(struct folio *folio, unsigned int flags) /* * At this point, we know the mapping is non-null and points to an - * address_space object. The only remaining whitelisted file system is - * shmem. + * address_space object. */ - return shmem_mapping(mapping); + if (check_secretmem && secretmem_mapping(mapping)) + return false; + /* The only remaining allowed file system is shmem. 
*/ + return !reject_file_backed || shmem_mapping(mapping); } -static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start, - unsigned int flags, - struct page **pages) +static void __maybe_unused gup_fast_undo_dev_pagemap(int *nr, int nr_start, + unsigned int flags, struct page **pages) { while ((*nr) - nr_start) { - struct page *page = pages[--(*nr)]; + struct folio *folio = page_folio(pages[--(*nr)]); - ClearPageReferenced(page); - if (flags & FOLL_PIN) - unpin_user_page(page); - else - put_page(page); + folio_clear_referenced(folio); + gup_put_folio(folio, 1, flags); } } #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL /* - * Fast-gup relies on pte change detection to avoid concurrent pgtable + * GUP-fast relies on pte change detection to avoid concurrent pgtable * operations. * - * To pin the page, fast-gup needs to do below in order: + * To pin the page, GUP-fast needs to do below in order: * (1) pin the page (by prefetching pte), then (2) check pte not changed. * * For the rest of pgtable operations where pgtable updates can be racy - * with fast-gup, we need to do (1) clear pte, then (2) check whether page + * with GUP-fast, we need to do (1) clear pte, then (2) check whether page * is pinned. * * Above will work for all pte-level operations, including THP split. * - * For THP collapse, it's a bit more complicated because fast-gup may be + * For THP collapse, it's a bit more complicated because GUP-fast may be * walking a pgtable page that is being freed (pte is still valid but pmd * can be cleared already). To avoid race in such condition, we need to * also check pmd here to make sure pmd doesn't change (corresponds to * pmdp_collapse_flush() in the THP collapse code path). 
*/ -static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, - unsigned long end, unsigned int flags, - struct page **pages, int *nr) +static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, + unsigned long end, unsigned int flags, struct page **pages, + int *nr) { struct dev_pagemap *pgmap = NULL; int nr_start = *nr, ret = 0; @@ -2611,7 +2966,7 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); if (unlikely(!pgmap)) { - undo_dev_pagemap(nr, nr_start, flags, pages); + gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); goto pte_unmap; } } else if (pte_special(pte)) @@ -2620,22 +2975,17 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); - folio = try_grab_folio(page, 1, flags); + folio = try_grab_folio_fast(page, 1, flags); if (!folio) goto pte_unmap; - if (unlikely(folio_is_secretmem(folio))) { - gup_put_folio(folio, 1, flags); - goto pte_unmap; - } - if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) || unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) { gup_put_folio(folio, 1, flags); goto pte_unmap; } - if (!folio_fast_pin_allowed(folio, flags)) { + if (!gup_fast_folio_allowed(folio, flags)) { gup_put_folio(folio, 1, flags); goto pte_unmap; } @@ -2680,44 +3030,45 @@ pte_unmap: * * For a futex to be placed on a THP tail page, get_futex_key requires a * get_user_pages_fast_only implementation that can pin pages. Thus it's still - * useful to have gup_huge_pmd even if we can't operate on ptes. + * useful to have gup_fast_pmd_leaf even if we can't operate on ptes. 
*/ -static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, - unsigned long end, unsigned int flags, - struct page **pages, int *nr) +static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, + unsigned long end, unsigned int flags, struct page **pages, + int *nr) { return 0; } #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */ #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) -static int __gup_device_huge(unsigned long pfn, unsigned long addr, - unsigned long end, unsigned int flags, - struct page **pages, int *nr) +static int gup_fast_devmap_leaf(unsigned long pfn, unsigned long addr, + unsigned long end, unsigned int flags, struct page **pages, int *nr) { int nr_start = *nr; struct dev_pagemap *pgmap = NULL; do { + struct folio *folio; struct page *page = pfn_to_page(pfn); pgmap = get_dev_pagemap(pfn, pgmap); if (unlikely(!pgmap)) { - undo_dev_pagemap(nr, nr_start, flags, pages); + gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); break; } if (!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)) { - undo_dev_pagemap(nr, nr_start, flags, pages); + gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); break; } - SetPageReferenced(page); - pages[*nr] = page; - if (unlikely(try_grab_page(page, flags))) { - undo_dev_pagemap(nr, nr_start, flags, pages); + folio = try_grab_folio_fast(page, 1, flags); + if (!folio) { + gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); break; } + folio_set_referenced(folio); + pages[*nr] = page; (*nr)++; pfn++; } while (addr += PAGE_SIZE, addr != end); @@ -2726,156 +3077,62 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, return addr == end; } -static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, - unsigned long end, unsigned int flags, - struct page **pages, int *nr) +static int gup_fast_devmap_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, + unsigned long end, unsigned int flags, struct page **pages, + int *nr) { 
unsigned long fault_pfn; int nr_start = *nr; fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr)) + if (!gup_fast_devmap_leaf(fault_pfn, addr, end, flags, pages, nr)) return 0; if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { - undo_dev_pagemap(nr, nr_start, flags, pages); + gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); return 0; } return 1; } -static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, - unsigned long end, unsigned int flags, - struct page **pages, int *nr) +static int gup_fast_devmap_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr, + unsigned long end, unsigned int flags, struct page **pages, + int *nr) { unsigned long fault_pfn; int nr_start = *nr; fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr)) + if (!gup_fast_devmap_leaf(fault_pfn, addr, end, flags, pages, nr)) return 0; if (unlikely(pud_val(orig) != pud_val(*pudp))) { - undo_dev_pagemap(nr, nr_start, flags, pages); + gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); return 0; } return 1; } #else -static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, - unsigned long end, unsigned int flags, - struct page **pages, int *nr) +static int gup_fast_devmap_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, + unsigned long end, unsigned int flags, struct page **pages, + int *nr) { BUILD_BUG(); return 0; } -static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr, - unsigned long end, unsigned int flags, - struct page **pages, int *nr) +static int gup_fast_devmap_pud_leaf(pud_t pud, pud_t *pudp, unsigned long addr, + unsigned long end, unsigned int flags, struct page **pages, + int *nr) { BUILD_BUG(); return 0; } #endif -static int record_subpages(struct page *page, unsigned long addr, - unsigned long end, struct page **pages) -{ - int nr; - - for (nr = 0; addr != 
end; nr++, addr += PAGE_SIZE) - pages[nr] = nth_page(page, nr); - - return nr; -} - -#ifdef CONFIG_ARCH_HAS_HUGEPD -static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, - unsigned long sz) -{ - unsigned long __boundary = (addr + sz) & ~(sz-1); - return (__boundary - 1 < end - 1) ? __boundary : end; -} - -static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, - unsigned long end, unsigned int flags, - struct page **pages, int *nr) -{ - unsigned long pte_end; - struct page *page; - struct folio *folio; - pte_t pte; - int refs; - - pte_end = (addr + sz) & ~(sz-1); - if (pte_end < end) - end = pte_end; - - pte = huge_ptep_get(ptep); - - if (!pte_access_permitted(pte, flags & FOLL_WRITE)) - return 0; - - /* hugepages are never "special" */ - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - - page = nth_page(pte_page(pte), (addr & (sz - 1)) >> PAGE_SHIFT); - refs = record_subpages(page, addr, end, pages + *nr); - - folio = try_grab_folio(page, refs, flags); - if (!folio) - return 0; - - if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) { - gup_put_folio(folio, refs, flags); - return 0; - } - - if (!folio_fast_pin_allowed(folio, flags)) { - gup_put_folio(folio, refs, flags); - return 0; - } - - if (!pte_write(pte) && gup_must_unshare(NULL, flags, &folio->page)) { - gup_put_folio(folio, refs, flags); - return 0; - } - - *nr += refs; - folio_set_referenced(folio); - return 1; -} - -static int gup_huge_pd(hugepd_t hugepd, unsigned long addr, - unsigned int pdshift, unsigned long end, unsigned int flags, - struct page **pages, int *nr) -{ - pte_t *ptep; - unsigned long sz = 1UL << hugepd_shift(hugepd); - unsigned long next; - - ptep = hugepte_offset(hugepd, addr, pdshift); - do { - next = hugepte_addr_end(addr, end, sz); - if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr)) - return 0; - } while (ptep++, addr = next, addr != end); - - return 1; -} -#else -static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr, - unsigned 
int pdshift, unsigned long end, unsigned int flags, - struct page **pages, int *nr) -{ - return 0; -} -#endif /* CONFIG_ARCH_HAS_HUGEPD */ - -static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, - unsigned long end, unsigned int flags, - struct page **pages, int *nr) +static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, + unsigned long end, unsigned int flags, struct page **pages, + int *nr) { struct page *page; struct folio *folio; @@ -2887,14 +3144,14 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, if (pmd_devmap(orig)) { if (unlikely(flags & FOLL_LONGTERM)) return 0; - return __gup_device_huge_pmd(orig, pmdp, addr, end, flags, - pages, nr); + return gup_fast_devmap_pmd_leaf(orig, pmdp, addr, end, flags, + pages, nr); } - page = nth_page(pmd_page(orig), (addr & ~PMD_MASK) >> PAGE_SHIFT); - refs = record_subpages(page, addr, end, pages + *nr); + page = pmd_page(orig); + refs = record_subpages(page, PMD_SIZE, addr, end, pages + *nr); - folio = try_grab_folio(page, refs, flags); + folio = try_grab_folio_fast(page, refs, flags); if (!folio) return 0; @@ -2903,7 +3160,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, return 0; } - if (!folio_fast_pin_allowed(folio, flags)) { + if (!gup_fast_folio_allowed(folio, flags)) { gup_put_folio(folio, refs, flags); return 0; } @@ -2917,9 +3174,9 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, return 1; } -static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, - unsigned long end, unsigned int flags, - struct page **pages, int *nr) +static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr, + unsigned long end, unsigned int flags, struct page **pages, + int *nr) { struct page *page; struct folio *folio; @@ -2931,14 +3188,14 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, if (pud_devmap(orig)) { if (unlikely(flags & FOLL_LONGTERM)) return 0; - return 
__gup_device_huge_pud(orig, pudp, addr, end, flags, - pages, nr); + return gup_fast_devmap_pud_leaf(orig, pudp, addr, end, flags, + pages, nr); } - page = nth_page(pud_page(orig), (addr & ~PUD_MASK) >> PAGE_SHIFT); - refs = record_subpages(page, addr, end, pages + *nr); + page = pud_page(orig); + refs = record_subpages(page, PUD_SIZE, addr, end, pages + *nr); - folio = try_grab_folio(page, refs, flags); + folio = try_grab_folio_fast(page, refs, flags); if (!folio) return 0; @@ -2947,7 +3204,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, return 0; } - if (!folio_fast_pin_allowed(folio, flags)) { + if (!gup_fast_folio_allowed(folio, flags)) { gup_put_folio(folio, refs, flags); return 0; } @@ -2962,9 +3219,9 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, return 1; } -static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, - unsigned long end, unsigned int flags, - struct page **pages, int *nr) +static int gup_fast_pgd_leaf(pgd_t orig, pgd_t *pgdp, unsigned long addr, + unsigned long end, unsigned int flags, struct page **pages, + int *nr) { int refs; struct page *page; @@ -2975,10 +3232,10 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, BUILD_BUG_ON(pgd_devmap(orig)); - page = nth_page(pgd_page(orig), (addr & ~PGDIR_MASK) >> PAGE_SHIFT); - refs = record_subpages(page, addr, end, pages + *nr); + page = pgd_page(orig); + refs = record_subpages(page, PGDIR_SIZE, addr, end, pages + *nr); - folio = try_grab_folio(page, refs, flags); + folio = try_grab_folio_fast(page, refs, flags); if (!folio) return 0; @@ -2992,7 +3249,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, return 0; } - if (!folio_fast_pin_allowed(folio, flags)) { + if (!gup_fast_folio_allowed(folio, flags)) { gup_put_folio(folio, refs, flags); return 0; } @@ -3002,8 +3259,9 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, return 1; } -static int gup_pmd_range(pud_t *pudp, 
pud_t pud, unsigned long addr, unsigned long end, - unsigned int flags, struct page **pages, int *nr) +static int gup_fast_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, + unsigned long end, unsigned int flags, struct page **pages, + int *nr) { unsigned long next; pmd_t *pmdp; @@ -3016,13 +3274,12 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo if (!pmd_present(pmd)) return 0; - if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) || - pmd_devmap(pmd))) { - /* See gup_pte_range() */ + if (unlikely(pmd_leaf(pmd))) { + /* See gup_fast_pte_range() */ if (pmd_protnone(pmd)) return 0; - if (!gup_huge_pmd(pmd, pmdp, addr, next, flags, + if (!gup_fast_pmd_leaf(pmd, pmdp, addr, next, flags, pages, nr)) return 0; @@ -3031,18 +3288,21 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo * architecture have different format for hugetlbfs * pmd format and THP pmd format */ - if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr, - PMD_SHIFT, next, flags, pages, nr)) + if (gup_hugepd(NULL, __hugepd(pmd_val(pmd)), addr, + PMD_SHIFT, next, flags, pages, nr, + true) != 1) return 0; - } else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr)) + } else if (!gup_fast_pte_range(pmd, pmdp, addr, next, flags, + pages, nr)) return 0; } while (pmdp++, addr = next, addr != end); return 1; } -static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end, - unsigned int flags, struct page **pages, int *nr) +static int gup_fast_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, + unsigned long end, unsigned int flags, struct page **pages, + int *nr) { unsigned long next; pud_t *pudp; @@ -3054,23 +3314,26 @@ static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned lo next = pud_addr_end(addr, end); if (unlikely(!pud_present(pud))) return 0; - if (unlikely(pud_huge(pud) || pud_devmap(pud))) { - if (!gup_huge_pud(pud, pudp, addr, next, flags, - pages, nr)) + if 
(unlikely(pud_leaf(pud))) { + if (!gup_fast_pud_leaf(pud, pudp, addr, next, flags, + pages, nr)) return 0; } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) { - if (!gup_huge_pd(__hugepd(pud_val(pud)), addr, - PUD_SHIFT, next, flags, pages, nr)) + if (gup_hugepd(NULL, __hugepd(pud_val(pud)), addr, + PUD_SHIFT, next, flags, pages, nr, + true) != 1) return 0; - } else if (!gup_pmd_range(pudp, pud, addr, next, flags, pages, nr)) + } else if (!gup_fast_pmd_range(pudp, pud, addr, next, flags, + pages, nr)) return 0; } while (pudp++, addr = next, addr != end); return 1; } -static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end, - unsigned int flags, struct page **pages, int *nr) +static int gup_fast_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, + unsigned long end, unsigned int flags, struct page **pages, + int *nr) { unsigned long next; p4d_t *p4dp; @@ -3080,21 +3343,23 @@ static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned lo p4d_t p4d = READ_ONCE(*p4dp); next = p4d_addr_end(addr, end); - if (p4d_none(p4d)) + if (!p4d_present(p4d)) return 0; - BUILD_BUG_ON(p4d_huge(p4d)); + BUILD_BUG_ON(p4d_leaf(p4d)); if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) { - if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr, - P4D_SHIFT, next, flags, pages, nr)) + if (gup_hugepd(NULL, __hugepd(p4d_val(p4d)), addr, + P4D_SHIFT, next, flags, pages, nr, + true) != 1) return 0; - } else if (!gup_pud_range(p4dp, p4d, addr, next, flags, pages, nr)) + } else if (!gup_fast_pud_range(p4dp, p4d, addr, next, flags, + pages, nr)) return 0; } while (p4dp++, addr = next, addr != end); return 1; } -static void gup_pgd_range(unsigned long addr, unsigned long end, +static void gup_fast_pgd_range(unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; @@ -3107,24 +3372,26 @@ static void gup_pgd_range(unsigned long addr, unsigned long end, next = pgd_addr_end(addr, end); if 
(pgd_none(pgd)) return; - if (unlikely(pgd_huge(pgd))) { - if (!gup_huge_pgd(pgd, pgdp, addr, next, flags, - pages, nr)) + if (unlikely(pgd_leaf(pgd))) { + if (!gup_fast_pgd_leaf(pgd, pgdp, addr, next, flags, + pages, nr)) return; } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) { - if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr, - PGDIR_SHIFT, next, flags, pages, nr)) + if (gup_hugepd(NULL, __hugepd(pgd_val(pgd)), addr, + PGDIR_SHIFT, next, flags, pages, nr, + true) != 1) return; - } else if (!gup_p4d_range(pgdp, pgd, addr, next, flags, pages, nr)) + } else if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags, + pages, nr)) return; } while (pgdp++, addr = next, addr != end); } #else -static inline void gup_pgd_range(unsigned long addr, unsigned long end, +static inline void gup_fast_pgd_range(unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { } -#endif /* CONFIG_HAVE_FAST_GUP */ +#endif /* CONFIG_HAVE_GUP_FAST */ #ifndef gup_fast_permitted /* @@ -3137,16 +3404,14 @@ static bool gup_fast_permitted(unsigned long start, unsigned long end) } #endif -static unsigned long lockless_pages_from_mm(unsigned long start, - unsigned long end, - unsigned int gup_flags, - struct page **pages) +static unsigned long gup_fast(unsigned long start, unsigned long end, + unsigned int gup_flags, struct page **pages) { unsigned long flags; int nr_pinned = 0; unsigned seq; - if (!IS_ENABLED(CONFIG_HAVE_FAST_GUP) || + if (!IS_ENABLED(CONFIG_HAVE_GUP_FAST) || !gup_fast_permitted(start, end)) return 0; @@ -3168,16 +3433,16 @@ static unsigned long lockless_pages_from_mm(unsigned long start, * that come from THPs splitting. */ local_irq_save(flags); - gup_pgd_range(start, end, gup_flags, pages, &nr_pinned); + gup_fast_pgd_range(start, end, gup_flags, pages, &nr_pinned); local_irq_restore(flags); /* * When pinning pages for DMA there could be a concurrent write protect - * from fork() via copy_page_range(), in this case always fail fast GUP. 
+ * from fork() via copy_page_range(), in this case always fail GUP-fast. */ if (gup_flags & FOLL_PIN) { if (read_seqcount_retry(&current->mm->write_protect_seq, seq)) { - unpin_user_pages_lockless(pages, nr_pinned); + gup_fast_unpin_user_pages(pages, nr_pinned); return 0; } else { sanity_check_pinned_pages(pages, nr_pinned); @@ -3186,10 +3451,8 @@ static unsigned long lockless_pages_from_mm(unsigned long start, return nr_pinned; } -static int internal_get_user_pages_fast(unsigned long start, - unsigned long nr_pages, - unsigned int gup_flags, - struct page **pages) +static int gup_fast_fallback(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages) { unsigned long len, end; unsigned long nr_pinned; @@ -3217,7 +3480,7 @@ static int internal_get_user_pages_fast(unsigned long start, if (unlikely(!access_ok((void __user *)start, len))) return -EFAULT; - nr_pinned = lockless_pages_from_mm(start, end, gup_flags, pages); + nr_pinned = gup_fast(start, end, gup_flags, pages); if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY) return nr_pinned; @@ -3271,7 +3534,7 @@ int get_user_pages_fast_only(unsigned long start, int nr_pages, FOLL_GET | FOLL_FAST_ONLY)) return -EINVAL; - return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); + return gup_fast_fallback(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(get_user_pages_fast_only); @@ -3302,7 +3565,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, */ if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_GET)) return -EINVAL; - return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); + return gup_fast_fallback(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(get_user_pages_fast); @@ -3330,7 +3593,7 @@ int pin_user_pages_fast(unsigned long start, int nr_pages, { if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN)) return -EINVAL; - return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); + return
gup_fast_fallback(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(pin_user_pages_fast); @@ -424,22 +424,17 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, walk->action = ACTION_CONTINUE; pud = READ_ONCE(*pudp); - if (pud_none(pud)) { + if (!pud_present(pud)) { spin_unlock(ptl); return hmm_vma_walk_hole(start, end, -1, walk); } - if (pud_huge(pud) && pud_devmap(pud)) { + if (pud_leaf(pud) && pud_devmap(pud)) { unsigned long i, npages, pfn; unsigned int required_fault; unsigned long *hmm_pfns; unsigned long cpu_flags; - if (!pud_present(pud)) { - spin_unlock(ptl); - return hmm_vma_walk_hole(start, end, -1, walk); - } - i = (addr - range->start) >> PAGE_SHIFT; npages = (end - addr) >> PAGE_SHIFT; hmm_pfns = &range->hmm_pfns[i]; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 769e8a125f..374a0d54b0 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -38,6 +38,7 @@ #include <linux/sched/sysctl.h> #include <linux/memory-tiers.h> #include <linux/compat.h> +#include <linux/pgalloc_tag.h> #include <asm/tlb.h> #include <asm/pgalloc.h> @@ -73,20 +74,31 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, struct shrink_control *sc); static atomic_t huge_zero_refcount; -struct page *huge_zero_page __read_mostly; +struct folio *huge_zero_folio __read_mostly; unsigned long huge_zero_pfn __read_mostly = ~0UL; unsigned long huge_anon_orders_always __read_mostly; unsigned long huge_anon_orders_madvise __read_mostly; unsigned long huge_anon_orders_inherit __read_mostly; unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, - unsigned long vm_flags, bool smaps, - bool in_pf, bool enforce_sysfs, + unsigned long vm_flags, + unsigned long tva_flags, unsigned long orders) { + bool smaps = tva_flags & TVA_SMAPS; + bool in_pf = tva_flags & TVA_IN_PF; + bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS; + unsigned long supported_orders; + /* Check the intersection of requested and supported orders. 
*/ - orders &= vma_is_anonymous(vma) ? - THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE; + if (vma_is_anonymous(vma)) + supported_orders = THP_ORDERS_ALL_ANON; + else if (vma_is_dax(vma)) + supported_orders = THP_ORDERS_ALL_FILE_DAX; + else + supported_orders = THP_ORDERS_ALL_FILE_DEFAULT; + + orders &= supported_orders; if (!orders) return 0; @@ -191,24 +203,24 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, static bool get_huge_zero_page(void) { - struct page *zero_page; + struct folio *zero_folio; retry: if (likely(atomic_inc_not_zero(&huge_zero_refcount))) return true; - zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, + zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, HPAGE_PMD_ORDER); - if (!zero_page) { + if (!zero_folio) { count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); return false; } preempt_disable(); - if (cmpxchg(&huge_zero_page, NULL, zero_page)) { + if (cmpxchg(&huge_zero_folio, NULL, zero_folio)) { preempt_enable(); - __free_pages(zero_page, compound_order(zero_page)); + folio_put(zero_folio); goto retry; } - WRITE_ONCE(huge_zero_pfn, page_to_pfn(zero_page)); + WRITE_ONCE(huge_zero_pfn, folio_pfn(zero_folio)); /* We take additional reference here. 
It will be put back by shrinker */ atomic_set(&huge_zero_refcount, 2); @@ -226,10 +238,10 @@ static void put_huge_zero_page(void) BUG_ON(atomic_dec_and_test(&huge_zero_refcount)); } -struct page *mm_get_huge_zero_page(struct mm_struct *mm) +struct folio *mm_get_huge_zero_folio(struct mm_struct *mm) { if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) - return READ_ONCE(huge_zero_page); + return READ_ONCE(huge_zero_folio); if (!get_huge_zero_page()) return NULL; @@ -237,10 +249,10 @@ struct page *mm_get_huge_zero_page(struct mm_struct *mm) if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) put_huge_zero_page(); - return READ_ONCE(huge_zero_page); + return READ_ONCE(huge_zero_folio); } -void mm_put_huge_zero_page(struct mm_struct *mm) +void mm_put_huge_zero_folio(struct mm_struct *mm) { if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) put_huge_zero_page(); @@ -257,10 +269,10 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink, struct shrink_control *sc) { if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { - struct page *zero_page = xchg(&huge_zero_page, NULL); - BUG_ON(zero_page == NULL); + struct folio *zero_folio = xchg(&huge_zero_folio, NULL); + BUG_ON(zero_folio == NULL); WRITE_ONCE(huge_zero_pfn, ~0UL); - __free_pages(zero_page, compound_order(zero_page)); + folio_put(zero_folio); return HPAGE_PMD_NR; } @@ -525,6 +537,52 @@ static const struct kobj_type thpsize_ktype = { .sysfs_ops = &kobj_sysfs_ops, }; +DEFINE_PER_CPU(struct mthp_stat, mthp_stats) = {{{0}}}; + +static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item) +{ + unsigned long sum = 0; + int cpu; + + for_each_possible_cpu(cpu) { + struct mthp_stat *this = &per_cpu(mthp_stats, cpu); + + sum += this->stats[order][item]; + } + + return sum; +} + +#define DEFINE_MTHP_STAT_ATTR(_name, _index) \ +static ssize_t _name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *buf) \ +{ \ + int order = to_thpsize(kobj)->order; \ + \ + return sysfs_emit(buf, "%lu\n", 
sum_mthp_stat(order, _index)); \ +} \ +static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + +DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC); +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); +DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT); +DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK); + +static struct attribute *stats_attrs[] = { + &anon_fault_alloc_attr.attr, + &anon_fault_fallback_attr.attr, + &anon_fault_fallback_charge_attr.attr, + &swpout_attr.attr, + &swpout_fallback_attr.attr, + NULL, +}; + +static struct attribute_group stats_attr_group = { + .name = "stats", + .attrs = stats_attrs, +}; + static struct thpsize *thpsize_create(int order, struct kobject *parent) { unsigned long size = (PAGE_SIZE << order) / SZ_1K; @@ -548,6 +606,12 @@ static struct thpsize *thpsize_create(int order, struct kobject *parent) return ERR_PTR(ret); } + ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group); + if (ret) { + kobject_put(&thpsize->kobj); + return ERR_PTR(ret); + } + thpsize->order = order; return thpsize; } @@ -684,11 +748,6 @@ static int __init hugepage_init(void) * hugepages can't be allocated by the buddy allocator */ MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_PAGE_ORDER); - /* - * we use page->mapping and page->index in second tail page - * as list_head: assuming THP order >= 2 - */ - MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2); err = hugepage_init_sysfs(&hugepage_kobj); if (err) @@ -788,33 +847,25 @@ struct deferred_split *get_deferred_split_queue(struct folio *folio) } #endif -void folio_prep_large_rmappable(struct folio *folio) -{ - if (!folio || !folio_test_large(folio)) - return; - if (folio_order(folio) > 1) - INIT_LIST_HEAD(&folio->_deferred_list); - folio_set_large_rmappable(folio); -} - -static inline bool is_transparent_hugepage(struct folio *folio) +static inline bool 
is_transparent_hugepage(const struct folio *folio) { if (!folio_test_large(folio)) return false; - return is_huge_zero_page(&folio->page) || + return is_huge_zero_folio(folio) || folio_test_large_rmappable(folio); } static unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, - loff_t off, unsigned long flags, unsigned long size) + loff_t off, unsigned long flags, unsigned long size, + vm_flags_t vm_flags) { loff_t off_end = off + len; loff_t off_align = round_up(off, size); unsigned long len_pad, ret, off_sub; - if (IS_ENABLED(CONFIG_32BIT) || in_compat_syscall()) + if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall()) return 0; if (off_end <= off_align || (off_end - off_align) < size) @@ -824,8 +875,8 @@ static unsigned long __thp_get_unmapped_area(struct file *filp, if (len_pad < len || (off + len_pad) < off) return 0; - ret = current->mm->get_unmapped_area(filp, addr, len_pad, - off >> PAGE_SHIFT, flags); + ret = mm_get_unmapped_area_vmflags(current->mm, filp, addr, len_pad, + off >> PAGE_SHIFT, flags, vm_flags); /* * The failure might be due to length padding. 
The caller will retry @@ -843,25 +894,32 @@ static unsigned long __thp_get_unmapped_area(struct file *filp, off_sub = (off - ret) & (size - 1); - if (current->mm->get_unmapped_area == arch_get_unmapped_area_topdown && - !off_sub) + if (test_bit(MMF_TOPDOWN, &current->mm->flags) && !off_sub) return ret + size; ret += off_sub; return ret; } -unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) +unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags, + vm_flags_t vm_flags) { unsigned long ret; loff_t off = (loff_t)pgoff << PAGE_SHIFT; - ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE); + ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE, vm_flags); if (ret) return ret; - return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); + return mm_get_unmapped_area_vmflags(current->mm, filp, addr, len, pgoff, flags, + vm_flags); +} + +unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + return thp_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, 0); } EXPORT_SYMBOL_GPL(thp_get_unmapped_area); @@ -880,6 +938,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, folio_put(folio); count_vm_event(THP_FAULT_FALLBACK); count_vm_event(THP_FAULT_FALLBACK_CHARGE); + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); return VM_FAULT_FALLBACK; } folio_throttle_swaprate(folio, gfp); @@ -929,6 +989,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, mm_inc_nr_ptes(vma->vm_mm); spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); } @@ 
-979,14 +1040,14 @@ gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma) } /* Caller must hold page table lock. */ -static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, +static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, - struct page *zero_page) + struct folio *zero_folio) { pmd_t entry; if (!pmd_none(*pmd)) return; - entry = mk_pmd(zero_page, vma->vm_page_prot); + entry = mk_pmd(&zero_folio->page, vma->vm_page_prot); entry = pmd_mkhuge(entry); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); @@ -999,24 +1060,27 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) gfp_t gfp; struct folio *folio; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + vm_fault_t ret; if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) return VM_FAULT_FALLBACK; - if (unlikely(anon_vma_prepare(vma))) - return VM_FAULT_OOM; + ret = vmf_anon_prepare(vmf); + if (ret) + return ret; khugepaged_enter_vma(vma, vma->vm_flags); if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm) && transparent_hugepage_use_zero_page()) { pgtable_t pgtable; - struct page *zero_page; + struct folio *zero_folio; vm_fault_t ret; + pgtable = pte_alloc_one(vma->vm_mm); if (unlikely(!pgtable)) return VM_FAULT_OOM; - zero_page = mm_get_huge_zero_page(vma->vm_mm); - if (unlikely(!zero_page)) { + zero_folio = mm_get_huge_zero_folio(vma->vm_mm); + if (unlikely(!zero_folio)) { pte_free(vma->vm_mm, pgtable); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -1034,8 +1098,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) ret = handle_userfault(vmf, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); } else { - set_huge_zero_page(pgtable, vma->vm_mm, vma, - haddr, vmf->pmd, zero_page); + set_huge_zero_folio(pgtable, vma->vm_mm, vma, + haddr, vmf->pmd, zero_folio); update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); 
spin_unlock(vmf->ptl); } @@ -1049,6 +1113,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true); if (unlikely(!folio)) { count_vm_event(THP_FAULT_FALLBACK); + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp); @@ -1228,8 +1293,8 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write) EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud); #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ -static void touch_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, bool write) +void touch_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, bool write) { pmd_t _pmd; @@ -1274,7 +1339,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, if (!*pgmap) return ERR_PTR(-EFAULT); page = pfn_to_page(pfn); - ret = try_grab_page(page, flags); + ret = try_grab_folio(page_folio(page), 1, flags); if (ret) page = ERR_PTR(ret); @@ -1344,11 +1409,11 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, */ if (is_huge_zero_pmd(pmd)) { /* - * get_huge_zero_page() will never allocate a new page here, - * since we already have a zero page to copy. It just takes a - * reference. + * mm_get_huge_zero_folio() will never allocate a new + * folio here, since we already have a zero page to + * copy. It just takes a reference. 
*/ - mm_get_huge_zero_page(dst_mm); + mm_get_huge_zero_folio(dst_mm); goto out_zero_page; } @@ -1385,8 +1450,8 @@ out: } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD -static void touch_pud(struct vm_area_struct *vma, unsigned long addr, - pud_t *pud, bool write) +void touch_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t *pud, bool write) { pud_t _pud; @@ -1398,49 +1463,6 @@ static void touch_pud(struct vm_area_struct *vma, unsigned long addr, update_mmu_cache_pud(vma, addr, pud); } -struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, - pud_t *pud, int flags, struct dev_pagemap **pgmap) -{ - unsigned long pfn = pud_pfn(*pud); - struct mm_struct *mm = vma->vm_mm; - struct page *page; - int ret; - - assert_spin_locked(pud_lockptr(mm, pud)); - - if (flags & FOLL_WRITE && !pud_write(*pud)) - return NULL; - - if (pud_present(*pud) && pud_devmap(*pud)) - /* pass */; - else - return NULL; - - if (flags & FOLL_TOUCH) - touch_pud(vma, addr, pud, flags & FOLL_WRITE); - - /* - * device mapped pages can only be returned if the - * caller will manage the page reference count. - * - * At least one of FOLL_GET | FOLL_PIN must be set, so assert that here: - */ - if (!(flags & (FOLL_GET | FOLL_PIN))) - return ERR_PTR(-EEXIST); - - pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT; - *pgmap = get_dev_pagemap(pfn, *pgmap); - if (!*pgmap) - return ERR_PTR(-EFAULT); - page = pfn_to_page(pfn); - - ret = try_grab_page(page, flags); - if (ret) - page = ERR_PTR(ret); - - return page; -} - int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, pud_t *dst_pud, pud_t *src_pud, unsigned long addr, struct vm_area_struct *vma) @@ -1627,88 +1649,6 @@ static inline bool can_change_pmd_writable(struct vm_area_struct *vma, return pmd_dirty(pmd); } -/* FOLL_FORCE can write to even unwritable PMDs in COW mappings. 
*/ -static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page, - struct vm_area_struct *vma, - unsigned int flags) -{ - /* If the pmd is writable, we can write to the page. */ - if (pmd_write(pmd)) - return true; - - /* Maybe FOLL_FORCE is set to override it? */ - if (!(flags & FOLL_FORCE)) - return false; - - /* But FOLL_FORCE has no effect on shared mappings */ - if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) - return false; - - /* ... or read-only private ones */ - if (!(vma->vm_flags & VM_MAYWRITE)) - return false; - - /* ... or already writable ones that just need to take a write fault */ - if (vma->vm_flags & VM_WRITE) - return false; - - /* - * See can_change_pte_writable(): we broke COW and could map the page - * writable if we have an exclusive anonymous page ... - */ - if (!page || !PageAnon(page) || !PageAnonExclusive(page)) - return false; - - /* ... and a write-fault isn't required for other reasons. */ - if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd)) - return false; - return !userfaultfd_huge_pmd_wp(vma, pmd); -} - -struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, - unsigned long addr, - pmd_t *pmd, - unsigned int flags) -{ - struct mm_struct *mm = vma->vm_mm; - struct page *page; - int ret; - - assert_spin_locked(pmd_lockptr(mm, pmd)); - - page = pmd_page(*pmd); - VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page); - - if ((flags & FOLL_WRITE) && - !can_follow_write_pmd(*pmd, page, vma, flags)) - return NULL; - - /* Avoid dumping huge zero page */ - if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd)) - return ERR_PTR(-EFAULT); - - if (pmd_protnone(*pmd) && !gup_can_follow_protnone(vma, flags)) - return NULL; - - if (!pmd_write(*pmd) && gup_must_unshare(vma, flags, page)) - return ERR_PTR(-EMLINK); - - VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && - !PageAnonExclusive(page), page); - - ret = try_grab_page(page, flags); - if (ret) - return ERR_PTR(ret); - - if (flags & FOLL_TOUCH) - 
touch_pmd(vma, addr, pmd, flags & FOLL_WRITE); - - page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; - VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page); - - return page; -} - /* NUMA hinting page fault entry point for trans huge pmds */ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) { @@ -1754,7 +1694,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) */ if (node_is_toptier(nid)) last_cpupid = folio_last_cpupid(folio); - target_nid = numa_migrate_prep(folio, vma, haddr, nid, &flags); + target_nid = numa_migrate_prep(folio, vmf, haddr, nid, &flags); if (target_nid == NUMA_NO_NODE) { folio_put(folio); goto out_map; @@ -1824,12 +1764,12 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, goto out; } - folio = pfn_folio(pmd_pfn(orig_pmd)); + folio = pmd_folio(orig_pmd); /* * If other processes are mapping this folio, we couldn't discard * the folio unless they all do MADV_FREE so let's skip the folio. */ - if (folio_estimated_sharers(folio) != 1) + if (folio_likely_mapped_shared(folio)) goto out; if (!folio_trylock(folio)) @@ -1915,7 +1855,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, folio = page_folio(page); folio_remove_rmap_pmd(folio, page, vma); - VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); + WARN_ON_ONCE(folio_mapcount(folio) < 0); VM_BUG_ON_PAGE(!PageHead(page), page); } else if (thp_migration_supported()) { swp_entry_t entry; @@ -2094,7 +2034,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (pmd_protnone(*pmd)) goto unlock; - folio = page_folio(pmd_page(*pmd)); + folio = pmd_folio(*pmd); toptier = node_is_toptier(folio_nid(folio)); /* * Skip scanning top tier node if normal numa @@ -2268,7 +2208,7 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm } folio_move_anon_rmap(src_folio, dst_vma); - WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr)); + src_folio->index = 
linear_page_index(dst_vma, dst_addr); _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot); /* Follow mremap() behavior and treat the entry dirty after the move */ @@ -2674,7 +2614,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, * It's safe to call pmd_page when folio is set because it's * guaranteed that pmd is present. */ - if (folio && folio != page_folio(pmd_page(*pmd))) + if (folio && folio != pmd_folio(*pmd)) goto out; __split_huge_pmd_locked(vma, pmd, range.start, freeze); } @@ -2866,7 +2806,7 @@ static void __split_huge_page_tail(struct folio *folio, int tail, clear_compound_head(page_tail); if (new_order) { prep_compound_page(page_tail, new_order); - folio_prep_large_rmappable(new_folio); + folio_set_large_rmappable(new_folio); } /* Finally unfreeze refcount. Additional reference from page cache. */ @@ -2949,6 +2889,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, /* Caller disabled irqs, so they are still disabled here */ split_page_owner(head, order, new_order); + pgalloc_tag_split(head, 1 << order); /* See comment in __split_huge_page_tail() */ if (folio_test_anon(folio)) { @@ -2970,9 +2911,6 @@ static void __split_huge_page(struct page *page, struct list_head *list, shmem_uncharge(folio->mapping->host, nr_dropped); remap_page(folio, nr); - if (folio_test_swapcache(folio)) - split_swap_cluster(folio->swap); - /* * set page to its compound_head when split to non order-0 pages, so * we can skip unlocking it below, since PG_locked is transferred to @@ -3016,28 +2954,48 @@ bool can_split_folio(struct folio *folio, int *pextra_pins) } /* - * This function splits huge page into pages in @new_order. @page can point to - * any subpage of huge page to split. Split doesn't change the position of - * @page. + * This function splits a large folio into smaller folios of order @new_order. + * @page can point to any page of the large folio to split. 
The split operation + * does not change the position of @page. + * + * Prerequisites: + * + * 1) The caller must hold a reference on the @page's owning folio, also known + * as the large folio. + * + * 2) The large folio must be locked. * - * NOTE: order-1 anonymous folio is not supported because _deferred_list, - * which is used by partially mapped folios, is stored in subpage 2 and an - * order-1 folio only has subpage 0 and 1. File-backed order-1 folios are OK, - * since they do not use _deferred_list. + * 3) The folio must not be pinned. Any unexpected folio references, including + * GUP pins, will result in the folio not getting split; instead, the caller + * will receive an -EAGAIN. * - * Only caller must hold pin on the @page, otherwise split fails with -EBUSY. - * The huge page must be locked. + * 4) @new_order > 1, usually. Splitting to order-1 anonymous folios is not + * supported for non-file-backed folios, because folio->_deferred_list, which + * is used by partially mapped folios, is stored in subpage 2, but an order-1 + * folio only has subpages 0 and 1. File-backed order-1 folios are supported, + * since they do not use _deferred_list. + * + * After splitting, the caller's folio reference will be transferred to @page, + * resulting in a raised refcount of @page after this call. The other pages may + * be freed if they are not mapped. * * If @list is null, tail pages will be added to LRU list, otherwise, to @list. * - * Pages in new_order will inherit mapping, flags, and so on from the hugepage. + * Pages in @new_order will inherit the mapping, flags, and so on from the + * huge page. + * + * Returns 0 if the huge page was split successfully. + * + * Returns -EAGAIN if the folio has unexpected reference (e.g., GUP) or if + * the folio was concurrently removed from the page cache. * - * GUP pin and PG_locked transferred to @page or the compound page @page belongs - * to. Rest subpages can be freed if they are not mapped. 
+ * Returns -EBUSY when trying to split the huge zeropage, if the folio is + * under writeback, if fs-specific folio metadata cannot currently be + * released, or if some unexpected race happened (e.g., anon VMA disappeared, + * truncation). * - * Returns 0 if the hugepage is split successfully. - * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under - * us. + * Returns -EINVAL when trying to split to an order that is incompatible + * with the folio. Splitting to order 0 is compatible with all folios. */ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, unsigned int new_order) @@ -3048,6 +3006,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; + bool is_thp = folio_test_pmd_mappable(folio); int extra_pins, ret; pgoff_t end; bool is_hzp; @@ -3089,7 +3048,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, if (folio_test_swapcache(folio) && new_order) return -EINVAL; - is_hzp = is_huge_zero_page(&folio->page); + is_hzp = is_huge_zero_folio(folio); if (is_hzp) { pr_warn_ratelimited("Called split_huge_page for huge zero page\n"); return -EBUSY; @@ -3232,7 +3191,8 @@ out_unlock: i_mmap_unlock_read(mapping); out: xas_destroy(&xas); - count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); + if (is_thp) + count_vm_event(!ret ? 
THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); return ret; } @@ -3294,7 +3254,8 @@ void deferred_split_folio(struct folio *folio) spin_lock_irqsave(&ds_queue->split_queue_lock, flags); if (list_empty(&folio->_deferred_list)) { - count_vm_event(THP_DEFERRED_SPLIT_PAGE); + if (folio_test_pmd_mappable(folio)) + count_vm_event(THP_DEFERRED_SPLIT_PAGE); list_add_tail(&folio->_deferred_list, &ds_queue->split_queue); ds_queue->split_queue_len++; #ifdef CONFIG_MEMCG diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c445e6fd85..92a2e8dcb7 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1517,7 +1517,7 @@ static void __destroy_compound_gigantic_folio(struct folio *folio, struct page *p; atomic_set(&folio->_entire_mapcount, 0); - atomic_set(&folio->_nr_pages_mapped, 0); + atomic_set(&folio->_large_mapcount, 0); atomic_set(&folio->_pincount, 0); for (i = 1; i < nr_pages; i++) { @@ -1619,27 +1619,16 @@ static inline void destroy_compound_gigantic_folio(struct folio *folio, unsigned int order) { } #endif -static inline void __clear_hugetlb_destructor(struct hstate *h, - struct folio *folio) -{ - lockdep_assert_held(&hugetlb_lock); - - __folio_clear_hugetlb(folio); -} - /* * Remove hugetlb folio from lists. - * If vmemmap exists for the folio, update dtor so that the folio appears - * as just a compound page. Otherwise, wait until after allocating vmemmap - * to update dtor. - * - * A reference is held on the folio, except in the case of demote. + * If vmemmap exists for the folio, clear the hugetlb flag so that the + * folio appears as just a compound page. Otherwise, wait until after + * allocating vmemmap to clear the flag. * * Must be called with hugetlb lock held. 
*/ -static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio, - bool adjust_surplus, - bool demote) +static void remove_hugetlb_folio(struct hstate *h, struct folio *folio, + bool adjust_surplus) { int nid = folio_nid(folio); @@ -1653,6 +1642,7 @@ static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio, list_del(&folio->lru); if (folio_test_hugetlb_freed(folio)) { + folio_clear_hugetlb_freed(folio); h->free_huge_pages--; h->free_huge_pages_node[nid]--; } @@ -1662,40 +1652,20 @@ static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio, } /* - * We can only clear the hugetlb destructor after allocating vmemmap + * We can only clear the hugetlb flag after allocating vmemmap * pages. Otherwise, someone (memory error handling) may try to write * to tail struct pages. */ if (!folio_test_hugetlb_vmemmap_optimized(folio)) - __clear_hugetlb_destructor(h, folio); - - /* - * In the case of demote we do not ref count the page as it will soon - * be turned into a page of smaller size. - */ - if (!demote) - folio_ref_unfreeze(folio, 1); + __folio_clear_hugetlb(folio); h->nr_huge_pages--; h->nr_huge_pages_node[nid]--; } -static void remove_hugetlb_folio(struct hstate *h, struct folio *folio, - bool adjust_surplus) -{ - __remove_hugetlb_folio(h, folio, adjust_surplus, false); -} - -static void remove_hugetlb_folio_for_demote(struct hstate *h, struct folio *folio, - bool adjust_surplus) -{ - __remove_hugetlb_folio(h, folio, adjust_surplus, true); -} - static void add_hugetlb_folio(struct hstate *h, struct folio *folio, bool adjust_surplus) { - int zeroed; int nid = folio_nid(folio); VM_BUG_ON_FOLIO(!folio_test_hugetlb_vmemmap_optimized(folio), folio); @@ -1719,29 +1689,14 @@ static void add_hugetlb_folio(struct hstate *h, struct folio *folio, */ folio_set_hugetlb_vmemmap_optimized(folio); - /* - * This folio is about to be managed by the hugetlb allocator and - * should have no users. 
Drop our reference, and check for others - * just in case. - */ - zeroed = folio_put_testzero(folio); - if (unlikely(!zeroed)) - /* - * It is VERY unlikely soneone else has taken a ref - * on the folio. In this case, we simply return as - * free_huge_folio() will be called when this other ref - * is dropped. - */ - return; - - arch_clear_hugepage_flags(&folio->page); + arch_clear_hugetlb_flags(folio); enqueue_hugetlb_folio(h, folio); } static void __update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio) { - bool clear_dtor = folio_test_hugetlb_vmemmap_optimized(folio); + bool clear_flag = folio_test_hugetlb_vmemmap_optimized(folio); if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; @@ -1754,11 +1709,11 @@ static void __update_and_free_hugetlb_folio(struct hstate *h, return; /* - * If folio is not vmemmap optimized (!clear_dtor), then the folio + * If folio is not vmemmap optimized (!clear_flag), then the folio * is no longer identified as a hugetlb page. hugetlb_vmemmap_restore_folio * can only be passed hugetlb pages and will BUG otherwise. */ - if (clear_dtor && hugetlb_vmemmap_restore_folio(h, folio)) { + if (clear_flag && hugetlb_vmemmap_restore_folio(h, folio)) { spin_lock_irq(&hugetlb_lock); /* * If we cannot allocate vmemmap pages, just refuse to free the @@ -1771,23 +1726,25 @@ static void __update_and_free_hugetlb_folio(struct hstate *h, } /* - * Move PageHWPoison flag from head page to the raw error pages, - * which makes any healthy subpages reusable. - */ - if (unlikely(folio_test_hwpoison(folio))) - folio_clear_hugetlb_hwpoison(folio); - - /* * If vmemmap pages were allocated above, then we need to clear the - * hugetlb destructor under the hugetlb lock. + * hugetlb flag under the hugetlb lock. 
*/ if (folio_test_hugetlb(folio)) { spin_lock_irq(&hugetlb_lock); - __clear_hugetlb_destructor(h, folio); + __folio_clear_hugetlb(folio); spin_unlock_irq(&hugetlb_lock); } /* + * Move PageHWPoison flag from head page to the raw error pages, + * which makes any healthy subpages reusable. + */ + if (unlikely(folio_test_hwpoison(folio))) + folio_clear_hugetlb_hwpoison(folio); + + folio_ref_unfreeze(folio, 1); + + /* * Non-gigantic pages demoted from CMA allocated gigantic pages * need to be given back to CMA in free_gigantic_folio. */ @@ -1796,7 +1753,8 @@ static void __update_and_free_hugetlb_folio(struct hstate *h, destroy_compound_gigantic_folio(folio, huge_page_order(h)); free_gigantic_folio(folio, huge_page_order(h)); } else { - __free_pages(&folio->page, huge_page_order(h)); + INIT_LIST_HEAD(&folio->_deferred_list); + folio_put(folio); } } @@ -1884,7 +1842,7 @@ static void bulk_vmemmap_restore_error(struct hstate *h, list_for_each_entry_safe(folio, t_folio, non_hvo_folios, lru) { list_del(&folio->lru); spin_lock_irq(&hugetlb_lock); - __clear_hugetlb_destructor(h, folio); + __folio_clear_hugetlb(folio); spin_unlock_irq(&hugetlb_lock); update_and_free_hugetlb_folio(h, folio, false); cond_resched(); @@ -1909,7 +1867,7 @@ static void bulk_vmemmap_restore_error(struct hstate *h, } else { list_del(&folio->lru); spin_lock_irq(&hugetlb_lock); - __clear_hugetlb_destructor(h, folio); + __folio_clear_hugetlb(folio); spin_unlock_irq(&hugetlb_lock); update_and_free_hugetlb_folio(h, folio, false); cond_resched(); @@ -1942,14 +1900,14 @@ retry: * should only be pages on the non_hvo_folios list. * Do note that the non_hvo_folios list could be empty. * Without HVO enabled, ret will be 0 and there is no need to call - * __clear_hugetlb_destructor as this was done previously. + * __folio_clear_hugetlb as this was done previously. 
*/ VM_WARN_ON(!list_empty(folio_list)); VM_WARN_ON(ret < 0); if (!list_empty(&non_hvo_folios) && ret) { spin_lock_irq(&hugetlb_lock); list_for_each_entry(folio, &non_hvo_folios, lru) - __clear_hugetlb_destructor(h, folio); + __folio_clear_hugetlb(folio); spin_unlock_irq(&hugetlb_lock); } @@ -1974,7 +1932,7 @@ void free_huge_folio(struct folio *folio) { /* * Can't pass hstate in here because it is called from the - * compound page destructor. + * generic mm code. */ struct hstate *h = folio_hstate(folio); int nid = folio_nid(folio); @@ -2031,7 +1989,7 @@ void free_huge_folio(struct folio *folio) spin_unlock_irqrestore(&hugetlb_lock, flags); update_and_free_hugetlb_folio(h, folio, true); } else { - arch_clear_hugepage_flags(&folio->page); + arch_clear_hugetlb_flags(folio); enqueue_hugetlb_folio(h, folio); spin_unlock_irqrestore(&hugetlb_lock, flags); } @@ -2124,10 +2082,10 @@ static bool __prep_compound_gigantic_folio(struct folio *folio, set_compound_head(p, &folio->page); } __folio_set_head(folio); - /* we rely on prep_new_hugetlb_folio to set the destructor */ + /* we rely on prep_new_hugetlb_folio to set the hugetlb flag */ folio_set_order(folio, order); atomic_set(&folio->_entire_mapcount, -1); - atomic_set(&folio->_nr_pages_mapped, 0); + atomic_set(&folio->_large_mapcount, -1); atomic_set(&folio->_pincount, 0); return true; @@ -2162,13 +2120,13 @@ static bool prep_compound_gigantic_folio_for_demote(struct folio *folio, /* * Find and lock address space (mapping) in write mode. * - * Upon entry, the page is locked which means that page_mapping() is + * Upon entry, the folio is locked which means that folio_mapping() is * stable. Due to locking order, we can only trylock_write. If we can * not get the lock, simply return NULL to caller. 
*/ -struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage) +struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio) { - struct address_space *mapping = page_mapping(hpage); + struct address_space *mapping = folio_mapping(folio); if (!mapping) return mapping; @@ -2184,13 +2142,13 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, nodemask_t *node_alloc_noretry) { int order = huge_page_order(h); - struct page *page; + struct folio *folio; bool alloc_try_hard = true; bool retry = true; /* - * By default we always try hard to allocate the page with - * __GFP_RETRY_MAYFAIL flag. However, if we are allocating pages in + * By default we always try hard to allocate the folio with + * __GFP_RETRY_MAYFAIL flag. However, if we are allocating folios in * a loop (to adjust global huge page counts) and previous allocation * failed, do not continue to try hard on the same node. Use the * node_alloc_noretry bitmap to manage this state information. @@ -2203,43 +2161,45 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, if (nid == NUMA_NO_NODE) nid = numa_mem_id(); retry: - page = __alloc_pages(gfp_mask, order, nid, nmask); + folio = __folio_alloc(gfp_mask, order, nid, nmask); + /* Ensure hugetlb folio won't have large_rmappable flag set. */ + if (folio) + folio_clear_large_rmappable(folio); - /* Freeze head page */ - if (page && !page_ref_freeze(page, 1)) { - __free_pages(page, order); + if (folio && !folio_ref_freeze(folio, 1)) { + folio_put(folio); if (retry) { /* retry once */ retry = false; goto retry; } /* WOW! twice in a row. */ - pr_warn("HugeTLB head page unexpected inflated ref count\n"); - page = NULL; + pr_warn("HugeTLB unexpected inflated folio ref count\n"); + folio = NULL; } /* - * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this - * indicates an overall state change. Clear bit so that we resume - * normal 'try hard' allocations. 
+ * If we did not specify __GFP_RETRY_MAYFAIL, but still got a + * folio this indicates an overall state change. Clear bit so + * that we resume normal 'try hard' allocations. */ - if (node_alloc_noretry && page && !alloc_try_hard) + if (node_alloc_noretry && folio && !alloc_try_hard) node_clear(nid, *node_alloc_noretry); /* - * If we tried hard to get a page but failed, set bit so that + * If we tried hard to get a folio but failed, set bit so that * subsequent attempts will not try as hard until there is an * overall state change. */ - if (node_alloc_noretry && !page && alloc_try_hard) + if (node_alloc_noretry && !folio && alloc_try_hard) node_set(nid, *node_alloc_noretry); - if (!page) { + if (!folio) { __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); return NULL; } __count_vm_event(HTLB_BUDDY_PGALLOC); - return page_folio(page); + return folio; } static struct folio *__alloc_fresh_hugetlb_folio(struct hstate *h, @@ -2385,8 +2345,8 @@ static struct folio *remove_pool_hugetlb_folio(struct hstate *h, } /* - * Dissolve a given free hugepage into free buddy pages. This function does - * nothing for in-use hugepages and non-hugepages. + * Dissolve a given free hugetlb folio into free buddy pages. This function + * does nothing for in-use hugetlb folios and non-hugetlb folios. * This function returns values like below: * * -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages @@ -2398,10 +2358,9 @@ static struct folio *remove_pool_hugetlb_folio(struct hstate *h, * 0: successfully dissolved free hugepages or the page is not a * hugepage (considered as already dissolved) */ -int dissolve_free_huge_page(struct page *page) +int dissolve_free_hugetlb_folio(struct folio *folio) { int rc = -EBUSY; - struct folio *folio = page_folio(page); retry: /* Not to disrupt normal path by vainly holding hugetlb_lock */ @@ -2478,13 +2437,13 @@ out: * make specified memory blocks removable from the system. 
* Note that this will dissolve a free gigantic hugepage completely, if any * part of it lies within the given range. - * Also note that if dissolve_free_huge_page() returns with an error, all - * free hugepages that were dissolved before that error are lost. + * Also note that if dissolve_free_hugetlb_folio() returns with an error, all + * free hugetlb folios that were dissolved before that error are lost. */ -int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) +int dissolve_free_hugetlb_folios(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn; - struct page *page; + struct folio *folio; int rc = 0; unsigned int order; struct hstate *h; @@ -2497,8 +2456,8 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) order = min(order, huge_page_order(h)); for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) { - page = pfn_to_page(pfn); - rc = dissolve_free_huge_page(page); + folio = pfn_folio(pfn); + rc = dissolve_free_hugetlb_folio(folio); if (rc) break; } @@ -2605,7 +2564,7 @@ struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h, /* folio migration callback function */ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, - nodemask_t *nmask, gfp_t gfp_mask) + nodemask_t *nmask, gfp_t gfp_mask, bool allow_alloc_fallback) { spin_lock_irq(&hugetlb_lock); if (available_huge_pages(h)) { @@ -2620,9 +2579,30 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, } spin_unlock_irq(&hugetlb_lock); + /* We cannot fallback to other nodes, as we could break the per-node pool. 
*/ + if (!allow_alloc_fallback) + gfp_mask |= __GFP_THISNODE; + return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask); } +static nodemask_t *policy_mbind_nodemask(gfp_t gfp) +{ +#ifdef CONFIG_NUMA + struct mempolicy *mpol = get_task_policy(current); + + /* + * Only enforce MPOL_BIND policy which overlaps with cpuset policy + * (from policy_nodemask) specifically for hugetlb case + */ + if (mpol->mode == MPOL_BIND && + (apply_policy_zone(mpol, gfp_zone(gfp)) && + cpuset_nodemask_valid_mems_allowed(&mpol->nodes))) + return &mpol->nodes; +#endif + return NULL; +} + /* * Increase the hugetlb pool such that it can accommodate a reservation * of size 'delta'. @@ -2636,6 +2616,8 @@ static int gather_surplus_pages(struct hstate *h, long delta) long i; long needed, allocated; bool alloc_ok = true; + int node; + nodemask_t *mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h)); lockdep_assert_held(&hugetlb_lock); needed = (h->resv_huge_pages + delta) - h->free_huge_pages; @@ -2650,8 +2632,15 @@ static int gather_surplus_pages(struct hstate *h, long delta) retry: spin_unlock_irq(&hugetlb_lock); for (i = 0; i < needed; i++) { - folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), - NUMA_NO_NODE, NULL); + folio = NULL; + for_each_node_mask(node, cpuset_current_mems_allowed) { + if (!mbind_nodemask || node_isset(node, *mbind_nodemask)) { + folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), + node, NULL); + if (folio) + break; + } + } if (!folio) { alloc_ok = false; break; @@ -3084,11 +3073,8 @@ retry: free_new: spin_unlock_irq(&hugetlb_lock); - if (new_folio) { - /* Folio has a zero ref count, but needs a ref to be freed */ - folio_ref_unfreeze(new_folio, 1); + if (new_folio) update_and_free_hugetlb_folio(h, new_folio, false); - } return ret; } @@ -3943,7 +3929,7 @@ static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio) target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order); - remove_hugetlb_folio_for_demote(h, folio, 
false); + remove_hugetlb_folio(h, folio, false); spin_unlock_irq(&hugetlb_lock); /* @@ -3957,7 +3943,6 @@ static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio) if (rc) { /* Allocation of vmemmmap failed, we can not demote folio */ spin_lock_irq(&hugetlb_lock); - folio_ref_unfreeze(folio, 1); add_hugetlb_folio(h, folio, false); return rc; } @@ -4658,7 +4643,7 @@ void __init hugetlb_add_hstate(unsigned int order) BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); BUG_ON(order < order_base_2(__NR_USED_SUBPAGE)); h = &hstates[hugetlb_max_hstate++]; - mutex_init(&h->resize_lock); + __mutex_init(&h->resize_lock, "resize mutex", &h->resize_key); h->order = order; h->mask = ~(huge_page_size(h) - 1); for (i = 0; i < MAX_NUMNODES; ++i) @@ -4881,23 +4866,6 @@ static int __init default_hugepagesz_setup(char *s) } __setup("default_hugepagesz=", default_hugepagesz_setup); -static nodemask_t *policy_mbind_nodemask(gfp_t gfp) -{ -#ifdef CONFIG_NUMA - struct mempolicy *mpol = get_task_policy(current); - - /* - * Only enforce MPOL_BIND policy which overlaps with cpuset policy - * (from policy_nodemask) specifically for hugetlb case - */ - if (mpol->mode == MPOL_BIND && - (apply_policy_zone(mpol, gfp_zone(gfp)) && - cpuset_nodemask_valid_mems_allowed(&mpol->nodes))) - return &mpol->nodes; -#endif - return NULL; -} - static unsigned int allowed_mems_nr(struct hstate *h) { int node; @@ -5032,7 +5000,6 @@ static struct ctl_table hugetlb_table[] = { .mode = 0644, .proc_handler = hugetlb_overcommit_handler, }, - { } }; static void hugetlb_sysctl_init(void) @@ -5935,19 +5902,18 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * cannot race with other handlers or page migration. * Keep the pte_same checks anyway to make transition from the mutex easier. 
*/ -static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, unsigned int flags, - struct folio *pagecache_folio, spinlock_t *ptl, +static vm_fault_t hugetlb_wp(struct folio *pagecache_folio, struct vm_fault *vmf) { - const bool unshare = flags & FAULT_FLAG_UNSHARE; - pte_t pte = huge_ptep_get(ptep); + struct vm_area_struct *vma = vmf->vma; + struct mm_struct *mm = vma->vm_mm; + const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; + pte_t pte = huge_ptep_get(vmf->pte); struct hstate *h = hstate_vma(vma); struct folio *old_folio; struct folio *new_folio; int outside_reserve = 0; vm_fault_t ret = 0; - unsigned long haddr = address & huge_page_mask(h); struct mmu_notifier_range range; /* @@ -5970,7 +5936,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, /* Let's take out MAP_SHARED mappings first. */ if (vma->vm_flags & VM_MAYSHARE) { - set_huge_ptep_writable(vma, haddr, ptep); + set_huge_ptep_writable(vma, vmf->address, vmf->pte); return 0; } @@ -5982,6 +5948,13 @@ retry_avoidcopy: /* * If no-one else is actually using this page, we're the exclusive * owner and can reuse this page. + * + * Note that we don't rely on the (safer) folio refcount here, because + * copying the hugetlb folio when there are unexpected (temporary) + * folio references could harm simple fork()+exit() users when + * we run out of free hugetlb folios: we would have to kill processes + * in scenarios that used to work. As a side effect, there can still + * be leaks between processes, for example, with FOLL_GET users. 
*/ if (folio_mapcount(old_folio) == 1 && folio_test_anon(old_folio)) { if (!PageAnonExclusive(&old_folio->page)) { @@ -5989,7 +5962,7 @@ retry_avoidcopy: SetPageAnonExclusive(&old_folio->page); } if (likely(!unshare)) - set_huge_ptep_writable(vma, haddr, ptep); + set_huge_ptep_writable(vma, vmf->address, vmf->pte); delayacct_wpcopy_end(); return 0; @@ -6016,8 +5989,8 @@ retry_avoidcopy: * Drop page table lock as buddy allocator may be called. It will * be acquired again before returning to the caller, as expected. */ - spin_unlock(ptl); - new_folio = alloc_hugetlb_folio(vma, haddr, outside_reserve); + spin_unlock(vmf->ptl); + new_folio = alloc_hugetlb_folio(vma, vmf->address, outside_reserve); if (IS_ERR(new_folio)) { /* @@ -6042,19 +6015,21 @@ retry_avoidcopy: * * Reacquire both after unmap operation. */ - idx = vma_hugecache_offset(h, vma, haddr); + idx = vma_hugecache_offset(h, vma, vmf->address); hash = hugetlb_fault_mutex_hash(mapping, idx); hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - unmap_ref_private(mm, vma, &old_folio->page, haddr); + unmap_ref_private(mm, vma, &old_folio->page, + vmf->address); mutex_lock(&hugetlb_fault_mutex_table[hash]); hugetlb_vma_lock_read(vma); - spin_lock(ptl); - ptep = hugetlb_walk(vma, haddr, huge_page_size(h)); - if (likely(ptep && - pte_same(huge_ptep_get(ptep), pte))) + spin_lock(vmf->ptl); + vmf->pte = hugetlb_walk(vma, vmf->address, + huge_page_size(h)); + if (likely(vmf->pte && + pte_same(huge_ptep_get(vmf->pte), pte))) goto retry_avoidcopy; /* * race occurs while re-acquiring page table @@ -6076,37 +6051,38 @@ retry_avoidcopy: if (unlikely(ret)) goto out_release_all; - if (copy_user_large_folio(new_folio, old_folio, address, vma)) { - ret = VM_FAULT_HWPOISON_LARGE; + if (copy_user_large_folio(new_folio, old_folio, vmf->real_address, vma)) { + ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto out_release_all; } __folio_mark_uptodate(new_folio); - 
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr, - haddr + huge_page_size(h)); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, vmf->address, + vmf->address + huge_page_size(h)); mmu_notifier_invalidate_range_start(&range); /* * Retake the page table lock to check for racing updates * before the page tables are altered */ - spin_lock(ptl); - ptep = hugetlb_walk(vma, haddr, huge_page_size(h)); - if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { + spin_lock(vmf->ptl); + vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h)); + if (likely(vmf->pte && pte_same(huge_ptep_get(vmf->pte), pte))) { pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare); /* Break COW or unshare */ - huge_ptep_clear_flush(vma, haddr, ptep); + huge_ptep_clear_flush(vma, vmf->address, vmf->pte); hugetlb_remove_rmap(old_folio); - hugetlb_add_new_anon_rmap(new_folio, vma, haddr); + hugetlb_add_new_anon_rmap(new_folio, vma, vmf->address); if (huge_pte_uffd_wp(pte)) newpte = huge_pte_mkuffd_wp(newpte); - set_huge_pte_at(mm, haddr, ptep, newpte, huge_page_size(h)); + set_huge_pte_at(mm, vmf->address, vmf->pte, newpte, + huge_page_size(h)); folio_set_hugetlb_migratable(new_folio); /* Make the old page be freed below */ new_folio = old_folio; } - spin_unlock(ptl); + spin_unlock(vmf->ptl); mmu_notifier_invalidate_range_end(&range); out_release_all: /* @@ -6114,12 +6090,12 @@ out_release_all: * unshare) */ if (new_folio != old_folio) - restore_reserve_on_error(h, vma, haddr, new_folio); + restore_reserve_on_error(h, vma, vmf->address, new_folio); folio_put(new_folio); out_release_old: folio_put(old_folio); - spin_lock(ptl); /* Caller expects lock to be held */ + spin_lock(vmf->ptl); /* Caller expects lock to be held */ delayacct_wpcopy_end(); return ret; @@ -6128,8 +6104,8 @@ out_release_old: /* * Return whether there is a pagecache page to back given address within VMA. 
*/ -static bool hugetlbfs_pagecache_present(struct hstate *h, - struct vm_area_struct *vma, unsigned long address) +bool hugetlbfs_pagecache_present(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) { struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx = linear_page_index(vma, address); @@ -6205,23 +6181,19 @@ static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, return same; } -static vm_fault_t hugetlb_no_page(struct mm_struct *mm, - struct vm_area_struct *vma, - struct address_space *mapping, pgoff_t idx, - unsigned long address, pte_t *ptep, - pte_t old_pte, unsigned int flags, +static vm_fault_t hugetlb_no_page(struct address_space *mapping, struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; + struct mm_struct *mm = vma->vm_mm; struct hstate *h = hstate_vma(vma); vm_fault_t ret = VM_FAULT_SIGBUS; int anon_rmap = 0; unsigned long size; struct folio *folio; pte_t new_pte; - spinlock_t *ptl; - unsigned long haddr = address & huge_page_mask(h); bool new_folio, new_pagecache_folio = false; - u32 hash = hugetlb_fault_mutex_hash(mapping, idx); + u32 hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff); /* * Currently, we are forced to kill the process in the event the @@ -6240,10 +6212,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * before we get page_table_lock. */ new_folio = false; - folio = filemap_lock_hugetlb_folio(h, mapping, idx); + folio = filemap_lock_hugetlb_folio(h, mapping, vmf->pgoff); if (IS_ERR(folio)) { size = i_size_read(mapping->host) >> huge_page_shift(h); - if (idx >= size) + if (vmf->pgoff >= size) goto out; /* Check for page in userfault range */ if (userfaultfd_missing(vma)) { @@ -6264,7 +6236,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * never happen on the page after UFFDIO_COPY has * correctly installed the page and returned. 
*/ - if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { + if (!hugetlb_pte_stable(h, mm, vmf->pte, vmf->orig_pte)) { ret = 0; goto out; } @@ -6279,7 +6251,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, goto out; } - folio = alloc_hugetlb_folio(vma, haddr, 0); + folio = alloc_hugetlb_folio(vma, vmf->address, 0); if (IS_ERR(folio)) { /* * Returning error will result in faulting task being @@ -6293,18 +6265,20 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * here. Before returning error, get ptl and make * sure there really is no pte entry. */ - if (hugetlb_pte_stable(h, mm, ptep, old_pte)) + if (hugetlb_pte_stable(h, mm, vmf->pte, vmf->orig_pte)) ret = vmf_error(PTR_ERR(folio)); else ret = 0; goto out; } - clear_huge_page(&folio->page, address, pages_per_huge_page(h)); + clear_huge_page(&folio->page, vmf->real_address, + pages_per_huge_page(h)); __folio_mark_uptodate(folio); new_folio = true; if (vma->vm_flags & VM_MAYSHARE) { - int err = hugetlb_add_to_page_cache(folio, mapping, idx); + int err = hugetlb_add_to_page_cache(folio, mapping, + vmf->pgoff); if (err) { /* * err can't be -EEXIST which implies someone @@ -6313,7 +6287,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * to the page cache. So it's safe to call * restore_reserve_on_error() here. */ - restore_reserve_on_error(h, vma, haddr, folio); + restore_reserve_on_error(h, vma, vmf->address, + folio); folio_put(folio); ret = VM_FAULT_SIGBUS; goto out; @@ -6340,7 +6315,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, folio_unlock(folio); folio_put(folio); /* See comment in userfaultfd_missing() block above */ - if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { + if (!hugetlb_pte_stable(h, mm, vmf->pte, vmf->orig_pte)) { ret = 0; goto out; } @@ -6355,23 +6330,23 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * any allocations necessary to record that reservation occur outside * the spinlock. 
*/ - if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { - if (vma_needs_reservation(h, vma, haddr) < 0) { + if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { + if (vma_needs_reservation(h, vma, vmf->address) < 0) { ret = VM_FAULT_OOM; goto backout_unlocked; } /* Just decrements count, does not deallocate */ - vma_end_reservation(h, vma, haddr); + vma_end_reservation(h, vma, vmf->address); } - ptl = huge_pte_lock(h, mm, ptep); + vmf->ptl = huge_pte_lock(h, mm, vmf->pte); ret = 0; /* If pte changed from under us, retry */ - if (!pte_same(huge_ptep_get(ptep), old_pte)) + if (!pte_same(huge_ptep_get(vmf->pte), vmf->orig_pte)) goto backout; if (anon_rmap) - hugetlb_add_new_anon_rmap(folio, vma, haddr); + hugetlb_add_new_anon_rmap(folio, vma, vmf->address); else hugetlb_add_file_rmap(folio); new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE) @@ -6380,17 +6355,17 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * If this pte was previously wr-protected, keep it wr-protected even * if populated. */ - if (unlikely(pte_marker_uffd_wp(old_pte))) + if (unlikely(pte_marker_uffd_wp(vmf->orig_pte))) new_pte = huge_pte_mkuffd_wp(new_pte); - set_huge_pte_at(mm, haddr, ptep, new_pte, huge_page_size(h)); + set_huge_pte_at(mm, vmf->address, vmf->pte, new_pte, huge_page_size(h)); hugetlb_count_add(pages_per_huge_page(h), mm); - if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { + if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_wp(mm, vma, address, ptep, flags, folio, ptl, vmf); + ret = hugetlb_wp(folio, vmf); } - spin_unlock(ptl); + spin_unlock(vmf->ptl); /* * Only set hugetlb_migratable in newly allocated pages. 
Existing pages @@ -6407,10 +6382,10 @@ out: return ret; backout: - spin_unlock(ptl); + spin_unlock(vmf->ptl); backout_unlocked: if (new_folio && !new_pagecache_folio) - restore_reserve_on_error(h, vma, haddr, folio); + restore_reserve_on_error(h, vma, vmf->address, folio); folio_unlock(folio); folio_put(folio); @@ -6444,8 +6419,6 @@ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { - pte_t *ptep, entry; - spinlock_t *ptl; vm_fault_t ret; u32 hash; struct folio *folio = NULL; @@ -6453,13 +6426,13 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, struct hstate *h = hstate_vma(vma); struct address_space *mapping; int need_wait_lock = 0; - unsigned long haddr = address & huge_page_mask(h); struct vm_fault vmf = { .vma = vma, - .address = haddr, + .address = address & huge_page_mask(h), .real_address = address, .flags = flags, - .pgoff = vma_hugecache_offset(h, vma, haddr), + .pgoff = vma_hugecache_offset(h, vma, + address & huge_page_mask(h)), /* TODO: Track hugetlb faults using vm_fault */ /* @@ -6479,25 +6452,26 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* * Acquire vma lock before calling huge_pte_alloc and hold - * until finished with ptep. This prevents huge_pmd_unshare from - * being called elsewhere and making the ptep no longer valid. + * until finished with vmf.pte. This prevents huge_pmd_unshare from + * being called elsewhere and making the vmf.pte no longer valid. 
*/ hugetlb_vma_lock_read(vma); - ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); - if (!ptep) { + vmf.pte = huge_pte_alloc(mm, vma, vmf.address, huge_page_size(h)); + if (!vmf.pte) { hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); return VM_FAULT_OOM; } - entry = huge_ptep_get(ptep); - if (huge_pte_none_mostly(entry)) { - if (is_pte_marker(entry)) { + vmf.orig_pte = huge_ptep_get(vmf.pte); + if (huge_pte_none_mostly(vmf.orig_pte)) { + if (is_pte_marker(vmf.orig_pte)) { pte_marker marker = - pte_marker_get(pte_to_swp_entry(entry)); + pte_marker_get(pte_to_swp_entry(vmf.orig_pte)); if (marker & PTE_MARKER_POISONED) { - ret = VM_FAULT_HWPOISON_LARGE; + ret = VM_FAULT_HWPOISON_LARGE | + VM_FAULT_SET_HINDEX(hstate_index(h)); goto out_mutex; } } @@ -6508,21 +6482,20 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * hugetlb_no_page will drop vma lock and hugetlb fault * mutex internally, which make us return immediately. */ - return hugetlb_no_page(mm, vma, mapping, vmf.pgoff, address, - ptep, entry, flags, &vmf); + return hugetlb_no_page(mapping, &vmf); } ret = 0; /* - * entry could be a migration/hwpoison entry at this point, so this - * check prevents the kernel from going below assuming that we have - * an active hugepage in pagecache. This goto expects the 2nd page - * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will - * properly handle it. + * vmf.orig_pte could be a migration/hwpoison vmf.orig_pte at this + * point, so this check prevents the kernel from going below assuming + * that we have an active hugepage in pagecache. This goto expects + * the 2nd page fault, and is_hugetlb_entry_(migration|hwpoisoned) + * check will properly handle it. 
*/ - if (!pte_present(entry)) { - if (unlikely(is_hugetlb_entry_migration(entry))) { + if (!pte_present(vmf.orig_pte)) { + if (unlikely(is_hugetlb_entry_migration(vmf.orig_pte))) { /* * Release the hugetlb fault lock now, but retain * the vma lock, because it is needed to guard the @@ -6531,9 +6504,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * be released there. */ mutex_unlock(&hugetlb_fault_mutex_table[hash]); - migration_entry_wait_huge(vma, ptep); + migration_entry_wait_huge(vma, vmf.pte); return 0; - } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) + } else if (unlikely(is_hugetlb_entry_hwpoisoned(vmf.orig_pte))) ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto out_mutex; @@ -6547,13 +6520,13 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * determine if a reservation has been consumed. */ if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && - !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(entry)) { - if (vma_needs_reservation(h, vma, haddr) < 0) { + !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(vmf.orig_pte)) { + if (vma_needs_reservation(h, vma, vmf.address) < 0) { ret = VM_FAULT_OOM; goto out_mutex; } /* Just decrements count, does not deallocate */ - vma_end_reservation(h, vma, haddr); + vma_end_reservation(h, vma, vmf.address); pagecache_folio = filemap_lock_hugetlb_folio(h, mapping, vmf.pgoff); @@ -6561,17 +6534,17 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, pagecache_folio = NULL; } - ptl = huge_pte_lock(h, mm, ptep); + vmf.ptl = huge_pte_lock(h, mm, vmf.pte); /* Check for a racing update before calling hugetlb_wp() */ - if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) + if (unlikely(!pte_same(vmf.orig_pte, huge_ptep_get(vmf.pte)))) goto out_ptl; /* Handle userfault-wp first, before trying to lock more pages */ - if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) && - (flags & FAULT_FLAG_WRITE) 
&& !huge_pte_write(entry)) { + if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(vmf.pte)) && + (flags & FAULT_FLAG_WRITE) && !huge_pte_write(vmf.orig_pte)) { if (!userfaultfd_wp_async(vma)) { - spin_unlock(ptl); + spin_unlock(vmf.ptl); if (pagecache_folio) { folio_unlock(pagecache_folio); folio_put(pagecache_folio); @@ -6581,18 +6554,18 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, return handle_userfault(&vmf, VM_UFFD_WP); } - entry = huge_pte_clear_uffd_wp(entry); - set_huge_pte_at(mm, haddr, ptep, entry, + vmf.orig_pte = huge_pte_clear_uffd_wp(vmf.orig_pte); + set_huge_pte_at(mm, vmf.address, vmf.pte, vmf.orig_pte, huge_page_size(hstate_vma(vma))); /* Fallthrough to CoW */ } /* - * hugetlb_wp() requires page locks of pte_page(entry) and + * hugetlb_wp() requires page locks of pte_page(vmf.orig_pte) and * pagecache_folio, so here we need take the former one * when folio != pagecache_folio or !pagecache_folio. */ - folio = page_folio(pte_page(entry)); + folio = page_folio(pte_page(vmf.orig_pte)); if (folio != pagecache_folio) if (!folio_trylock(folio)) { need_wait_lock = 1; @@ -6602,24 +6575,23 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, folio_get(folio); if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { - if (!huge_pte_write(entry)) { - ret = hugetlb_wp(mm, vma, address, ptep, flags, - pagecache_folio, ptl, &vmf); + if (!huge_pte_write(vmf.orig_pte)) { + ret = hugetlb_wp(pagecache_folio, &vmf); goto out_put_page; } else if (likely(flags & FAULT_FLAG_WRITE)) { - entry = huge_pte_mkdirty(entry); + vmf.orig_pte = huge_pte_mkdirty(vmf.orig_pte); } } - entry = pte_mkyoung(entry); - if (huge_ptep_set_access_flags(vma, haddr, ptep, entry, + vmf.orig_pte = pte_mkyoung(vmf.orig_pte); + if (huge_ptep_set_access_flags(vma, vmf.address, vmf.pte, vmf.orig_pte, flags & FAULT_FLAG_WRITE)) - update_mmu_cache(vma, haddr, ptep); + update_mmu_cache(vma, vmf.address, vmf.pte); out_put_page: if (folio != 
pagecache_folio) folio_unlock(folio); folio_put(folio); out_ptl: - spin_unlock(ptl); + spin_unlock(vmf.ptl); if (pagecache_folio) { folio_unlock(pagecache_folio); @@ -6655,7 +6627,13 @@ static struct folio *alloc_hugetlb_folio_vma(struct hstate *h, gfp_mask = htlb_alloc_mask(h); node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); - folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask); + /* + * This is used to allocate a temporary hugetlb to hold the copied + * content, which will then be copied again to the final hugetlb + * consuming a reservation. Set the alloc_fallback to false to indicate + * that breaking the per-node hugetlb pool is not allowed in this case. + */ + folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask, false); mpol_cond_put(mpol); return folio; @@ -6885,77 +6863,6 @@ out_release_nounlock: } #endif /* CONFIG_USERFAULTFD */ -struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, - unsigned long address, unsigned int flags, - unsigned int *page_mask) -{ - struct hstate *h = hstate_vma(vma); - struct mm_struct *mm = vma->vm_mm; - unsigned long haddr = address & huge_page_mask(h); - struct page *page = NULL; - spinlock_t *ptl; - pte_t *pte, entry; - int ret; - - hugetlb_vma_lock_read(vma); - pte = hugetlb_walk(vma, haddr, huge_page_size(h)); - if (!pte) - goto out_unlock; - - ptl = huge_pte_lock(h, mm, pte); - entry = huge_ptep_get(pte); - if (pte_present(entry)) { - page = pte_page(entry); - - if (!huge_pte_write(entry)) { - if (flags & FOLL_WRITE) { - page = NULL; - goto out; - } - - if (gup_must_unshare(vma, flags, page)) { - /* Tell the caller to do unsharing */ - page = ERR_PTR(-EMLINK); - goto out; - } - } - - page = nth_page(page, ((address & ~huge_page_mask(h)) >> PAGE_SHIFT)); - - /* - * Note that page may be a sub-page, and with vmemmap - * optimizations the page struct may be read only. - * try_grab_page() will increase the ref count on the - * head page, so this will be OK. 
- * - * try_grab_page() should always be able to get the page here, - * because we hold the ptl lock and have verified pte_present(). - */ - ret = try_grab_page(page, flags); - - if (WARN_ON_ONCE(ret)) { - page = ERR_PTR(ret); - goto out; - } - - *page_mask = (1U << huge_page_order(h)) - 1; - } -out: - spin_unlock(ptl); -out_unlock: - hugetlb_vma_unlock_read(vma); - - /* - * Fixup retval for dump requests: if pagecache doesn't exist, - * don't try to allocate a new page but just skip it. - */ - if (!page && (flags & FOLL_DUMP) && - !hugetlbfs_pagecache_present(h, vma, address)) - page = ERR_PTR(-EFAULT); - - return page; -} - long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags) @@ -7879,7 +7786,7 @@ void __init hugetlb_cma_reserve(int order) * huge page demotion. */ res = cma_declare_contiguous_nid(0, size, 0, - PAGE_SIZE << HUGETLB_PAGE_ORDER, + PAGE_SIZE << order, HUGETLB_PAGE_ORDER, false, name, &hugetlb_cma[nid], nid); if (res) { diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index aa4486bd39..e20339a346 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -308,7 +308,7 @@ static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, { if (hugetlb_cgroup_disabled() || !h_cg) return; - + lockdep_assert_held(&hugetlb_lock); __set_hugetlb_cgroup(folio, h_cg, rsvd); if (!rsvd) { unsigned long usage = diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index da177e49d9..8193906515 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -446,6 +446,8 @@ static int __hugetlb_vmemmap_restore_folio(const struct hstate *h, unsigned long vmemmap_reuse; VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio); + if (!folio_test_hugetlb_vmemmap_optimized(folio)) return 0; @@ -481,6 +483,9 @@ static int __hugetlb_vmemmap_restore_folio(const struct hstate *h, */ int 
hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio) { + /* avoid writes from page_ref_add_unless() while unfolding vmemmap */ + synchronize_rcu(); + return __hugetlb_vmemmap_restore_folio(h, folio, 0); } @@ -505,6 +510,9 @@ long hugetlb_vmemmap_restore_folios(const struct hstate *h, long restored = 0; long ret = 0; + /* avoid writes from page_ref_add_unless() while unfolding vmemmap */ + synchronize_rcu(); + list_for_each_entry_safe(folio, t_folio, folio_list, lru) { if (folio_test_hugetlb_vmemmap_optimized(folio)) { ret = __hugetlb_vmemmap_restore_folio(h, folio, @@ -550,6 +558,8 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, unsigned long vmemmap_reuse; VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio); + if (!vmemmap_should_optimize_folio(h, folio)) return ret; @@ -601,6 +611,9 @@ void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio) { LIST_HEAD(vmemmap_pages); + /* avoid writes from page_ref_add_unless() while folding vmemmap */ + synchronize_rcu(); + __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, 0); free_vmemmap_page_list(&vmemmap_pages); } @@ -644,6 +657,9 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l flush_tlb_all(); + /* avoid writes from page_ref_add_unless() while folding vmemmap */ + synchronize_rcu(); + list_for_each_entry(folio, folio_list, lru) { int ret; @@ -679,7 +695,6 @@ static struct ctl_table hugetlb_vmemmap_sysctls[] = { .mode = 0644, .proc_handler = proc_dobool, }, - { } }; static int __init hugetlb_vmemmap_init(void) diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index d0548e382b..c9d653f51e 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -15,7 +15,7 @@ static int hwpoison_inject(void *data, u64 val) { unsigned long pfn = val; struct page *p; - struct page *hpage; + struct folio *folio; int err; if (!capable(CAP_SYS_ADMIN)) @@ 
-25,16 +25,17 @@ static int hwpoison_inject(void *data, u64 val) return -ENXIO; p = pfn_to_page(pfn); - hpage = compound_head(p); + folio = page_folio(p); if (!hwpoison_filter_enable) goto inject; - shake_page(hpage); + shake_folio(folio); /* * This implies unable to support non-LRU pages except free page. */ - if (!PageLRU(hpage) && !PageHuge(p) && !is_free_buddy_page(p)) + if (!folio_test_lru(folio) && !folio_test_hugetlb(folio) && + !is_free_buddy_page(p)) return 0; /* @@ -42,7 +43,7 @@ static int hwpoison_inject(void *data, u64 val) * the targeted owner (or on a free page). * memory_failure() will redo the check reliably inside page lock. */ - err = hwpoison_filter(hpage); + err = hwpoison_filter(&folio->page); if (err) return 0; diff --git a/mm/internal.h b/mm/internal.h index 07ad2675a8..cc2c5e07fa 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -11,6 +11,8 @@ #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/rmap.h> +#include <linux/swap.h> +#include <linux/swapops.h> #include <linux/tracepoint-defs.h> struct folio_batch; @@ -70,13 +72,30 @@ void page_writeback_init(void); /* * How many individual pages have an elevated _mapcount. Excludes * the folio's entire_mapcount. + * + * Don't use this function outside of debugging code. */ -static inline int folio_nr_pages_mapped(struct folio *folio) +static inline int folio_nr_pages_mapped(const struct folio *folio) { return atomic_read(&folio->_nr_pages_mapped) & FOLIO_PAGES_MAPPED; } -static inline void *folio_raw_mapping(struct folio *folio) +/* + * Retrieve the first entry of a folio based on a provided entry within the + * folio. We cannot rely on folio->swap as there is no guarantee that it has + * been initialized. 
Used for calling arch_swap_restore() + */ +static inline swp_entry_t folio_swap(swp_entry_t entry, + const struct folio *folio) +{ + swp_entry_t swap = { + .val = ALIGN_DOWN(entry.val, folio_nr_pages(folio)), + }; + + return swap; +} + +static inline void *folio_raw_mapping(const struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; @@ -113,6 +132,10 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) * @flags: Flags to modify the PTE batch semantics. * @any_writable: Optional pointer to indicate whether any entry except the * first one is writable. + * @any_young: Optional pointer to indicate whether any entry except the + * first one is young. + * @any_dirty: Optional pointer to indicate whether any entry except the + * first one is dirty. * * Detect a PTE batch: consecutive (present) PTEs that map consecutive * pages of the same large folio. @@ -128,16 +151,20 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) */ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags, - bool *any_writable) + bool *any_writable, bool *any_young, bool *any_dirty) { unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio); const pte_t *end_ptep = start_ptep + max_nr; pte_t expected_pte, *ptep; - bool writable; + bool writable, young, dirty; int nr; if (any_writable) *any_writable = false; + if (any_young) + *any_young = false; + if (any_dirty) + *any_dirty = false; VM_WARN_ON_FOLIO(!pte_present(pte), folio); VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio); @@ -151,6 +178,10 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, pte = ptep_get(ptep); if (any_writable) writable = !!pte_write(pte); + if (any_young) + young = !!pte_young(pte); + if (any_dirty) + dirty = !!pte_dirty(pte); pte = __pte_batch_clear_ignored(pte, flags); if (!pte_same(pte, expected_pte)) @@ -166,6 +197,10 @@ static 
inline int folio_pte_batch(struct folio *folio, unsigned long addr, if (any_writable) *any_writable |= writable; + if (any_young) + *any_young |= young; + if (any_dirty) + *any_dirty |= dirty; nr = pte_batch_hint(ptep, pte); expected_pte = pte_advance_pfn(expected_pte, nr); @@ -174,6 +209,68 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, return min(ptep - start_ptep, max_nr); } + +/** + * pte_next_swp_offset - Increment the swap entry offset field of a swap pte. + * @pte: The initial pte state; is_swap_pte(pte) must be true and + * non_swap_entry() must be false. + * + * Increments the swap offset, while maintaining all other fields, including + * swap type, and any swp pte bits. The resulting pte is returned. + */ +static inline pte_t pte_next_swp_offset(pte_t pte) +{ + swp_entry_t entry = pte_to_swp_entry(pte); + pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry), + (swp_offset(entry) + 1))); + + if (pte_swp_soft_dirty(pte)) + new = pte_swp_mksoft_dirty(new); + if (pte_swp_exclusive(pte)) + new = pte_swp_mkexclusive(new); + if (pte_swp_uffd_wp(pte)) + new = pte_swp_mkuffd_wp(new); + + return new; +} + +/** + * swap_pte_batch - detect a PTE batch for a set of contiguous swap entries + * @start_ptep: Page table pointer for the first entry. + * @max_nr: The maximum number of table entries to consider. + * @pte: Page table entry for the first entry. + * + * Detect a batch of contiguous swap entries: consecutive (non-present) PTEs + * containing swap entries all with consecutive offsets and targeting the same + * swap type, all with matching swp pte bits. + * + * max_nr must be at least one and must be limited by the caller so scanning + * cannot exceed a single page table. + * + * Return: the number of table entries in the batch. 
+ */ +static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte) +{ + pte_t expected_pte = pte_next_swp_offset(pte); + const pte_t *end_ptep = start_ptep + max_nr; + pte_t *ptep = start_ptep + 1; + + VM_WARN_ON(max_nr < 1); + VM_WARN_ON(!is_swap_pte(pte)); + VM_WARN_ON(non_swap_entry(pte_to_swp_entry(pte))); + + while (ptep < end_ptep) { + pte = ptep_get(ptep); + + if (!pte_same(pte, expected_pte)) + break; + + expected_pte = pte_next_swp_offset(expected_pte); + ptep++; + } + + return ptep - start_ptep; +} #endif /* CONFIG_MMU */ void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio, @@ -513,7 +610,8 @@ static inline struct folio *page_rmappable_folio(struct page *page) { struct folio *folio = (struct folio *)page; - folio_prep_large_rmappable(folio); + if (folio && folio_test_large(folio)) + folio_set_large_rmappable(folio); return folio; } @@ -522,9 +620,12 @@ static inline void prep_compound_head(struct page *page, unsigned int order) struct folio *folio = (struct folio *)page; folio_set_order(folio, order); + atomic_set(&folio->_large_mapcount, -1); atomic_set(&folio->_entire_mapcount, -1); atomic_set(&folio->_nr_pages_mapped, 0); atomic_set(&folio->_pincount, 0); + if (order > 1) + INIT_LIST_HEAD(&folio->_deferred_list); } static inline void prep_compound_tail(struct page *head, int tail_idx) @@ -559,10 +660,6 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, void memmap_init_range(unsigned long, int, unsigned long, unsigned long, unsigned long, enum meminit_context, struct vmem_altmap *, int); - -int split_free_page(struct page *free_page, - unsigned int order, unsigned long split_pfn_offset); - #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* @@ -789,13 +886,17 @@ void mlock_drain_remote(int cpu); extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); -/* - * Return the start of user virtual address at the specific offset within - * a vma. 
+/** + * vma_address - Find the virtual address a page range is mapped at + * @vma: The vma which maps this object. + * @pgoff: The page offset within its object. + * @nr_pages: The number of pages to consider. + * + * If any page in this range is mapped by this VMA, return the first address + * where any of these pages appear. Otherwise, return -EFAULT. */ -static inline unsigned long -vma_pgoff_address(pgoff_t pgoff, unsigned long nr_pages, - struct vm_area_struct *vma) +static inline unsigned long vma_address(struct vm_area_struct *vma, + pgoff_t pgoff, unsigned long nr_pages) { unsigned long address; @@ -815,18 +916,6 @@ vma_pgoff_address(pgoff_t pgoff, unsigned long nr_pages, } /* - * Return the start of user virtual address of a page within a vma. - * Returns -EFAULT if all of the page is outside the range of vma. - * If page is a compound head, the entire compound page is considered. - */ -static inline unsigned long -vma_address(struct page *page, struct vm_area_struct *vma) -{ - VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */ - return vma_pgoff_address(page_to_pgoff(page), compound_nr(page), vma); -} - -/* * Then at what user virtual address will none of the range be found in vma? * Assumes that vma_address() already returned a good starting address. 
*/ @@ -947,6 +1036,7 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask) /* * mm/memory-failure.c */ +void shake_folio(struct folio *folio); extern int hwpoison_filter(struct page *p); extern u32 hwpoison_filter_dev_major; @@ -961,7 +1051,7 @@ extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long); extern void set_pageblock_order(void); -unsigned long reclaim_pages(struct list_head *folio_list, bool ignore_references); +unsigned long reclaim_pages(struct list_head *folio_list); unsigned int reclaim_clean_pages_from_list(struct zone *zone, struct list_head *folio_list); /* The ALLOC_WMARK bits are used as an index to zone->watermark */ @@ -1040,17 +1130,13 @@ static inline bool is_migrate_highatomic(enum migratetype migratetype) return migratetype == MIGRATE_HIGHATOMIC; } -static inline bool is_migrate_highatomic_page(struct page *page) -{ - return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC; -} - void setup_zone_pageset(struct zone *zone); struct migration_target_control { int nid; /* preferred node id */ nodemask_t *nmask; gfp_t gfp_mask; + enum migrate_reason reason; }; /* @@ -1087,24 +1173,25 @@ void vunmap_range_noflush(unsigned long start, unsigned long end); void __vunmap_range_noflush(unsigned long start, unsigned long end); -int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma, +int numa_migrate_prep(struct folio *folio, struct vm_fault *vmf, unsigned long addr, int page_nid, int *flags); -void free_zone_device_page(struct page *page); +void free_zone_device_folio(struct folio *folio); int migrate_device_coherent_page(struct page *page); /* * mm/gup.c */ -struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags); -int __must_check try_grab_page(struct page *page, unsigned int flags); +int __must_check try_grab_folio(struct folio *folio, int refs, + unsigned int flags); /* * mm/huge_memory.c */ -struct page 
*follow_trans_huge_pmd(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmd, - unsigned int flags); +void touch_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t *pud, bool write); +void touch_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, bool write); /* * mm/mmap.c @@ -1189,20 +1276,10 @@ static inline bool gup_must_unshare(struct vm_area_struct *vma, } /* Paired with a memory barrier in folio_try_share_anon_rmap_*(). */ - if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) + if (IS_ENABLED(CONFIG_HAVE_GUP_FAST)) smp_rmb(); /* - * During GUP-fast we might not get called on the head page for a - * hugetlb page that is mapped using cont-PTE, because GUP-fast does - * not work with the abstracted hugetlb PTEs that always point at the - * head page. For hugetlb, PageAnonExclusive only applies on the head - * page (as it cannot be partially COW-shared), so lookup the head page. - */ - if (unlikely(!PageHead(page) && PageHuge(page))) - page = compound_head(page); - - /* * Note that PageKsm() pages cannot be exclusive, and consequently, * cannot get pinned. 
*/ @@ -1245,6 +1322,35 @@ static inline void vma_iter_config(struct vma_iterator *vmi, __mas_set_range(&vmi->mas, index, last - 1); } +static inline void vma_iter_reset(struct vma_iterator *vmi) +{ + mas_reset(&vmi->mas); +} + +static inline +struct vm_area_struct *vma_iter_prev_range_limit(struct vma_iterator *vmi, unsigned long min) +{ + return mas_prev_range(&vmi->mas, min); +} + +static inline +struct vm_area_struct *vma_iter_next_range_limit(struct vma_iterator *vmi, unsigned long max) +{ + return mas_next_range(&vmi->mas, max); +} + +static inline int vma_iter_area_lowest(struct vma_iterator *vmi, unsigned long min, + unsigned long max, unsigned long size) +{ + return mas_empty_area(&vmi->mas, min, max - 1, size); +} + +static inline int vma_iter_area_highest(struct vma_iterator *vmi, unsigned long min, + unsigned long max, unsigned long size) +{ + return mas_empty_area_rev(&vmi->mas, min, max - 1, size); +} + /* * VMA Iterator functions shared between nommu and mmap */ @@ -1328,6 +1434,38 @@ void __meminit __init_single_page(struct page *page, unsigned long pfn, unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority); +#ifdef CONFIG_64BIT +static inline int can_do_mseal(unsigned long flags) +{ + if (flags) + return -EINVAL; + + return 0; +} + +bool can_modify_mm(struct mm_struct *mm, unsigned long start, + unsigned long end); +bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, + unsigned long end, int behavior); +#else +static inline int can_do_mseal(unsigned long flags) +{ + return -EPERM; +} + +static inline bool can_modify_mm(struct mm_struct *mm, unsigned long start, + unsigned long end) +{ + return true; +} + +static inline bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, + unsigned long end, int behavior) +{ + return true; +} +#endif + #ifdef CONFIG_SHRINKER_DEBUG static inline __printf(2, 0) int shrinker_debugfs_name_alloc( struct shrinker *shrinker, const char *fmt, va_list ap) diff 
--git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 2b994092a2..9958ebc15d 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -16,6 +16,7 @@ #include <linux/static_key.h> #include <linux/string.h> #include <linux/types.h> +#include <linux/vmalloc.h> #include "kasan.h" diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 8350f5c06f..964b848227 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -595,9 +595,9 @@ static unsigned long kfence_init_pool(void) continue; __folio_set_slab(slab_folio(slab)); -#ifdef CONFIG_MEMCG - slab->memcg_data = (unsigned long)&kfence_metadata_init[i / 2 - 1].objcg | - MEMCG_DATA_OBJCGS; +#ifdef CONFIG_MEMCG_KMEM + slab->obj_exts = (unsigned long)&kfence_metadata_init[i / 2 - 1].obj_exts | + MEMCG_DATA_OBJEXTS; #endif } @@ -645,8 +645,8 @@ reset_slab: if (!i || (i % 2)) continue; -#ifdef CONFIG_MEMCG - slab->memcg_data = 0; +#ifdef CONFIG_MEMCG_KMEM + slab->obj_exts = 0; #endif __folio_clear_slab(slab_folio(slab)); } @@ -1139,8 +1139,8 @@ void __kfence_free(void *addr) { struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr); -#ifdef CONFIG_MEMCG - KFENCE_WARN_ON(meta->objcg); +#ifdef CONFIG_MEMCG_KMEM + KFENCE_WARN_ON(meta->obj_exts.objcg); #endif /* * If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h index f46fbb0306..084f5f36e8 100644 --- a/mm/kfence/kfence.h +++ b/mm/kfence/kfence.h @@ -97,8 +97,8 @@ struct kfence_metadata { struct kfence_track free_track; /* For updating alloc_covered on frees. 
*/ u32 alloc_stack_hash; -#ifdef CONFIG_MEMCG - struct obj_cgroup *objcg; +#ifdef CONFIG_MEMCG_KMEM + struct slabobj_ext obj_exts; #endif }; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 3883017460..774a97e6e2 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -453,7 +453,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma, { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && hugepage_flags_enabled()) { - if (thp_vma_allowable_order(vma, vm_flags, false, false, true, + if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS, PMD_ORDER)) __khugepaged_enter(vma->vm_mm); } @@ -583,7 +583,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, folio = page_folio(page); VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio); - if (page_mapcount(page) > 1) { + /* See hpage_collapse_scan_pmd(). */ + if (folio_likely_mapped_shared(folio)) { ++shared; if (cc->is_khugepaged && shared > khugepaged_max_ptes_shared) { @@ -767,7 +768,7 @@ static void __collapse_huge_page_copy_failed(pte_t *pte, * Returns SCAN_SUCCEED if copying succeeds, otherwise returns SCAN_COPY_MC. 
* * @pte: starting of the PTEs to copy from - * @page: the new hugepage to copy contents to + * @folio: the new hugepage to copy contents to * @pmd: pointer to the new hugepage's PMD * @orig_pmd: the original raw pages' PMD * @vma: the original raw pages' virtual memory area @@ -775,33 +776,29 @@ static void __collapse_huge_page_copy_failed(pte_t *pte, * @ptl: lock on raw pages' PTEs * @compound_pagelist: list that stores compound pages */ -static int __collapse_huge_page_copy(pte_t *pte, - struct page *page, - pmd_t *pmd, - pmd_t orig_pmd, - struct vm_area_struct *vma, - unsigned long address, - spinlock_t *ptl, - struct list_head *compound_pagelist) +static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio, + pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma, + unsigned long address, spinlock_t *ptl, + struct list_head *compound_pagelist) { - struct page *src_page; - pte_t *_pte; - pte_t pteval; - unsigned long _address; + unsigned int i; int result = SCAN_SUCCEED; /* * Copying pages' contents is subject to memory poison at any iteration. 
*/ - for (_pte = pte, _address = address; _pte < pte + HPAGE_PMD_NR; - _pte++, page++, _address += PAGE_SIZE) { - pteval = ptep_get(_pte); + for (i = 0; i < HPAGE_PMD_NR; i++) { + pte_t pteval = ptep_get(pte + i); + struct page *page = folio_page(folio, i); + unsigned long src_addr = address + i * PAGE_SIZE; + struct page *src_page; + if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { - clear_user_highpage(page, _address); + clear_user_highpage(page, src_addr); continue; } src_page = pte_page(pteval); - if (copy_mc_user_highpage(page, src_page, _address, vma) > 0) { + if (copy_mc_user_highpage(page, src_page, src_addr, vma) > 0) { result = SCAN_COPY_MC; break; } @@ -891,20 +888,6 @@ static int hpage_collapse_find_target_node(struct collapse_control *cc) } #endif -static bool hpage_collapse_alloc_folio(struct folio **folio, gfp_t gfp, int node, - nodemask_t *nmask) -{ - *folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, nmask); - - if (unlikely(!*folio)) { - count_vm_event(THP_COLLAPSE_ALLOC_FAILED); - return false; - } - - count_vm_event(THP_COLLAPSE_ALLOC); - return true; -} - /* * If mmap_lock temporarily dropped, revalidate vma * before taking mmap_lock. @@ -917,6 +900,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, struct collapse_control *cc) { struct vm_area_struct *vma; + unsigned long tva_flags = cc->is_khugepaged ? 
TVA_ENFORCE_SYSFS : 0; if (unlikely(hpage_collapse_test_exit_or_disable(mm))) return SCAN_ANY_PROCESS; @@ -927,8 +911,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, if (!thp_vma_suitable_order(vma, address, PMD_ORDER)) return SCAN_ADDRESS_RANGE; - if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, - cc->is_khugepaged, PMD_ORDER)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, tva_flags, PMD_ORDER)) return SCAN_VMA_CHECK; /* * Anon VMA expected, the address may be unmapped then @@ -1059,7 +1042,7 @@ out: return result; } -static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm, +static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm, struct collapse_control *cc) { gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() : @@ -1067,20 +1050,23 @@ static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm, int node = hpage_collapse_find_target_node(cc); struct folio *folio; - if (!hpage_collapse_alloc_folio(&folio, gfp, node, &cc->alloc_nmask)) { - *hpage = NULL; + folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask); + if (!folio) { + *foliop = NULL; + count_vm_event(THP_COLLAPSE_ALLOC_FAILED); return SCAN_ALLOC_HUGE_PAGE_FAIL; } + count_vm_event(THP_COLLAPSE_ALLOC); if (unlikely(mem_cgroup_charge(folio, mm, gfp))) { folio_put(folio); - *hpage = NULL; + *foliop = NULL; return SCAN_CGROUP_CHARGE_FAIL; } count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1); - *hpage = folio_page(folio, 0); + *foliop = folio; return SCAN_SUCCEED; } @@ -1093,7 +1079,6 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, pte_t *pte; pgtable_t pgtable; struct folio *folio; - struct page *hpage; spinlock_t *pmd_ptl, *pte_ptl; int result = SCAN_FAIL; struct vm_area_struct *vma; @@ -1109,7 +1094,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, */ mmap_read_unlock(mm); - result = alloc_charge_hpage(&hpage, mm, 
cc); + result = alloc_charge_folio(&folio, mm, cc); if (result != SCAN_SUCCEED) goto out_nolock; @@ -1169,7 +1154,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, * huge and small TLB entries for the same virtual address to * avoid the risk of CPU bugs in that area. * - * Parallel fast GUP is fine since fast GUP will back off when + * Parallel GUP-fast is fine since GUP-fast will back off when * it detects PMD is changed. */ _pmd = pmdp_collapse_flush(vma, address, pmd); @@ -1208,14 +1193,13 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, */ anon_vma_unlock_write(vma->anon_vma); - result = __collapse_huge_page_copy(pte, hpage, pmd, _pmd, + result = __collapse_huge_page_copy(pte, folio, pmd, _pmd, vma, address, pte_ptl, &compound_pagelist); pte_unmap(pte); if (unlikely(result != SCAN_SUCCEED)) goto out_up_write; - folio = page_folio(hpage); /* * The smp_wmb() inside __folio_mark_uptodate() ensures the * copy_huge_page writes become visible before the set_pmd_at() @@ -1224,7 +1208,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, __folio_mark_uptodate(folio); pgtable = pmd_pgtable(_pmd); - _pmd = mk_huge_pmd(hpage, vma->vm_page_prot); + _pmd = mk_huge_pmd(&folio->page, vma->vm_page_prot); _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); spin_lock(pmd_ptl); @@ -1236,14 +1220,14 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, update_mmu_cache_pmd(vma, address, pmd); spin_unlock(pmd_ptl); - hpage = NULL; + folio = NULL; result = SCAN_SUCCEED; out_up_write: mmap_write_unlock(mm); out_nolock: - if (hpage) - put_page(hpage); + if (folio) + folio_put(folio); trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result); return result; } @@ -1334,8 +1318,20 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, result = SCAN_PAGE_NULL; goto out_unmap; } + folio = page_folio(page); + + if (!folio_test_anon(folio)) { + result = SCAN_PAGE_ANON; + goto 
out_unmap; + } - if (page_mapcount(page) > 1) { + /* + * We treat a single page as shared if any part of the THP + * is shared. "False negatives" from + * folio_likely_mapped_shared() are not expected to matter + * much in practice. + */ + if (folio_likely_mapped_shared(folio)) { ++shared; if (cc->is_khugepaged && shared > khugepaged_max_ptes_shared) { @@ -1345,7 +1341,6 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, } } - folio = page_folio(page); /* * Record which node the original page is from and save this * information to cc->node_load[]. @@ -1366,16 +1361,12 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, result = SCAN_PAGE_LOCK; goto out_unmap; } - if (!folio_test_anon(folio)) { - result = SCAN_PAGE_ANON; - goto out_unmap; - } /* * Check if the page has any GUP (or other external) pins. * * Here the check may be racy: - * it may see total_mapcount > refcount in some cases? + * it may see folio_mapcount() > folio_ref_count(). * But such case is ephemeral we could always retry collapse * later. However it may report false positive if the page * has excessive GUP pins (i.e. 512). Anyway the same check @@ -1510,8 +1501,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, * and map it by a PMD, regardless of sysfs THP settings. As such, let's * analogously elide sysfs THP settings here. 
*/ - if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, false, - PMD_ORDER)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER)) return SCAN_VMA_CHECK; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ @@ -1797,29 +1787,26 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, struct collapse_control *cc) { struct address_space *mapping = file->f_mapping; - struct page *hpage; - struct page *page; - struct page *tmp; - struct folio *folio; + struct page *dst; + struct folio *folio, *tmp, *new_folio; pgoff_t index = 0, end = start + HPAGE_PMD_NR; LIST_HEAD(pagelist); XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); int nr_none = 0, result = SCAN_SUCCEED; bool is_shmem = shmem_file(file); - int nr = 0; VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); - result = alloc_charge_hpage(&hpage, mm, cc); + result = alloc_charge_folio(&new_folio, mm, cc); if (result != SCAN_SUCCEED) goto out; - __SetPageLocked(hpage); + __folio_set_locked(new_folio); if (is_shmem) - __SetPageSwapBacked(hpage); - hpage->index = start; - hpage->mapping = mapping; + __folio_set_swapbacked(new_folio); + new_folio->index = start; + new_folio->mapping = mapping; /* * Ensure we have slots for all the pages in the range. 
This is @@ -1839,11 +1826,11 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, for (index = start; index < end; index++) { xas_set(&xas, index); - page = xas_load(&xas); + folio = xas_load(&xas); VM_BUG_ON(index != xas.xa_index); if (is_shmem) { - if (!page) { + if (!folio) { /* * Stop if extent has been truncated or * hole-punched, and is now completely @@ -1859,7 +1846,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, continue; } - if (xa_is_value(page) || !PageUptodate(page)) { + if (xa_is_value(folio) || !folio_test_uptodate(folio)) { xas_unlock_irq(&xas); /* swap in or instantiate fallocated page */ if (shmem_get_folio(mapping->host, index, @@ -1869,28 +1856,27 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, } /* drain lru cache to help isolate_lru_page() */ lru_add_drain(); - page = folio_file_page(folio, index); - } else if (trylock_page(page)) { - get_page(page); + } else if (folio_trylock(folio)) { + folio_get(folio); xas_unlock_irq(&xas); } else { result = SCAN_PAGE_LOCK; goto xa_locked; } } else { /* !is_shmem */ - if (!page || xa_is_value(page)) { + if (!folio || xa_is_value(folio)) { xas_unlock_irq(&xas); page_cache_sync_readahead(mapping, &file->f_ra, file, index, end - index); /* drain lru cache to help isolate_lru_page() */ lru_add_drain(); - page = find_lock_page(mapping, index); - if (unlikely(page == NULL)) { + folio = filemap_lock_folio(mapping, index); + if (IS_ERR(folio)) { result = SCAN_FAIL; goto xa_unlocked; } - } else if (PageDirty(page)) { + } else if (folio_test_dirty(folio)) { /* * khugepaged only works on read-only fd, * so this page is dirty because it hasn't @@ -1908,12 +1894,12 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, filemap_flush(mapping); result = SCAN_FAIL; goto xa_unlocked; - } else if (PageWriteback(page)) { + } else if (folio_test_writeback(folio)) { xas_unlock_irq(&xas); result = SCAN_FAIL; goto xa_unlocked; - } else if 
(trylock_page(page)) { - get_page(page); + } else if (folio_trylock(folio)) { + folio_get(folio); xas_unlock_irq(&xas); } else { result = SCAN_PAGE_LOCK; @@ -1922,35 +1908,31 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, } /* - * The page must be locked, so we can drop the i_pages lock + * The folio must be locked, so we can drop the i_pages lock * without racing with truncate. */ - VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - /* make sure the page is up to date */ - if (unlikely(!PageUptodate(page))) { + /* make sure the folio is up to date */ + if (unlikely(!folio_test_uptodate(folio))) { result = SCAN_FAIL; goto out_unlock; } /* * If file was truncated then extended, or hole-punched, before - * we locked the first page, then a THP might be there already. + * we locked the first folio, then a THP might be there already. * This will be discovered on the first iteration. */ - if (PageTransCompound(page)) { - struct page *head = compound_head(page); - - result = compound_order(head) == HPAGE_PMD_ORDER && - head->index == start + if (folio_test_large(folio)) { + result = folio_order(folio) == HPAGE_PMD_ORDER && + folio->index == start /* Maybe PMD-mapped */ ? SCAN_PTE_MAPPED_HUGEPAGE : SCAN_PAGE_COMPOUND; goto out_unlock; } - folio = page_folio(page); - if (folio_mapping(folio) != mapping) { result = SCAN_TRUNCATED; goto out_unlock; @@ -1960,7 +1942,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, folio_test_writeback(folio))) { /* * khugepaged only works on read-only fd, so this - * page is dirty because it hasn't been flushed + * folio is dirty because it hasn't been flushed * since first write. 
*/ result = SCAN_FAIL; @@ -1984,33 +1966,34 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, xas_lock_irq(&xas); - VM_BUG_ON_PAGE(page != xa_load(xas.xa, index), page); + VM_BUG_ON_FOLIO(folio != xa_load(xas.xa, index), folio); /* - * We control three references to the page: + * We control three references to the folio: * - we hold a pin on it; * - one reference from page cache; - * - one from isolate_lru_page; - * If those are the only references, then any new usage of the - * page will have to fetch it from the page cache. That requires - * locking the page to handle truncate, so any new usage will be - * blocked until we unlock page after collapse/during rollback. + * - one from lru_isolate_folio; + * If those are the only references, then any new usage + * of the folio will have to fetch it from the page + * cache. That requires locking the folio to handle + * truncate, so any new usage will be blocked until we + * unlock folio after collapse/during rollback. */ - if (page_count(page) != 3) { + if (folio_ref_count(folio) != 3) { result = SCAN_PAGE_COUNT; xas_unlock_irq(&xas); - putback_lru_page(page); + folio_putback_lru(folio); goto out_unlock; } /* - * Accumulate the pages that are being collapsed. + * Accumulate the folios that are being collapsed. */ - list_add_tail(&page->lru, &pagelist); + list_add_tail(&folio->lru, &pagelist); continue; out_unlock: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); goto xa_unlocked; } @@ -2049,23 +2032,27 @@ xa_unlocked: } /* - * The old pages are locked, so they won't change anymore. + * The old folios are locked, so they won't change anymore. 
*/ index = start; - list_for_each_entry(page, &pagelist, lru) { - while (index < page->index) { - clear_highpage(hpage + (index % HPAGE_PMD_NR)); + dst = folio_page(new_folio, 0); + list_for_each_entry(folio, &pagelist, lru) { + while (index < folio->index) { + clear_highpage(dst); index++; + dst++; } - if (copy_mc_highpage(hpage + (page->index % HPAGE_PMD_NR), page) > 0) { + if (copy_mc_highpage(dst, folio_page(folio, 0)) > 0) { result = SCAN_COPY_MC; goto rollback; } index++; + dst++; } while (index < end) { - clear_highpage(hpage + (index % HPAGE_PMD_NR)); + clear_highpage(dst); index++; + dst++; } if (nr_none) { @@ -2093,16 +2080,17 @@ xa_unlocked: } /* - * If userspace observed a missing page in a VMA with a MODE_MISSING - * userfaultfd, then it might expect a UFFD_EVENT_PAGEFAULT for that - * page. If so, we need to roll back to avoid suppressing such an - * event. Since wp/minor userfaultfds don't give userspace any - * guarantees that the kernel doesn't fill a missing page with a zero - * page, so they don't matter here. + * If userspace observed a missing page in a VMA with + * a MODE_MISSING userfaultfd, then it might expect a + * UFFD_EVENT_PAGEFAULT for that page. If so, we need to + * roll back to avoid suppressing such an event. Since + * wp/minor userfaultfds don't give userspace any + * guarantees that the kernel doesn't fill a missing + * page with a zero page, so they don't matter here. * - * Any userfaultfds registered after this point will not be able to - * observe any missing pages due to the previously inserted retry - * entries. + * Any userfaultfds registered after this point will + * not be able to observe any missing pages due to the + * previously inserted retry entries. 
*/ vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) { if (userfaultfd_missing(vma)) { @@ -2127,33 +2115,32 @@ immap_locked: xas_lock_irq(&xas); } - folio = page_folio(hpage); - nr = folio_nr_pages(folio); if (is_shmem) - __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr); + __lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR); else - __lruvec_stat_mod_folio(folio, NR_FILE_THPS, nr); + __lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR); if (nr_none) { - __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_none); + __lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none); /* nr_none is always 0 for non-shmem. */ - __lruvec_stat_mod_folio(folio, NR_SHMEM, nr_none); + __lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none); } /* - * Mark hpage as uptodate before inserting it into the page cache so - * that it isn't mistaken for an fallocated but unwritten page. + * Mark new_folio as uptodate before inserting it into the + * page cache so that it isn't mistaken for an fallocated but + * unwritten page. */ - folio_mark_uptodate(folio); - folio_ref_add(folio, HPAGE_PMD_NR - 1); + folio_mark_uptodate(new_folio); + folio_ref_add(new_folio, HPAGE_PMD_NR - 1); if (is_shmem) - folio_mark_dirty(folio); - folio_add_lru(folio); + folio_mark_dirty(new_folio); + folio_add_lru(new_folio); /* Join all the small entries into a single multi-index entry. */ xas_set_order(&xas, start, HPAGE_PMD_ORDER); - xas_store(&xas, folio); + xas_store(&xas, new_folio); WARN_ON_ONCE(xas_error(&xas)); xas_unlock_irq(&xas); @@ -2164,18 +2151,18 @@ immap_locked: retract_page_tables(mapping, start); if (cc && !cc->is_khugepaged) result = SCAN_PTE_MAPPED_HUGEPAGE; - folio_unlock(folio); + folio_unlock(new_folio); /* - * The collapse has succeeded, so free the old pages. + * The collapse has succeeded, so free the old folios. 
*/ - list_for_each_entry_safe(page, tmp, &pagelist, lru) { - list_del(&page->lru); - page->mapping = NULL; - ClearPageActive(page); - ClearPageUnevictable(page); - unlock_page(page); - folio_put_refs(page_folio(page), 3); + list_for_each_entry_safe(folio, tmp, &pagelist, lru) { + list_del(&folio->lru); + folio->mapping = NULL; + folio_clear_active(folio); + folio_clear_unevictable(folio); + folio_unlock(folio); + folio_put_refs(folio, 3); } goto out; @@ -2189,11 +2176,11 @@ rollback: shmem_uncharge(mapping->host, nr_none); } - list_for_each_entry_safe(page, tmp, &pagelist, lru) { - list_del(&page->lru); - unlock_page(page); - putback_lru_page(page); - put_page(page); + list_for_each_entry_safe(folio, tmp, &pagelist, lru) { + list_del(&folio->lru); + folio_unlock(folio); + folio_putback_lru(folio); + folio_put(folio); } /* * Undo the updates of filemap_nr_thps_inc for non-SHMEM @@ -2209,13 +2196,13 @@ rollback: smp_mb(); } - hpage->mapping = NULL; + new_folio->mapping = NULL; - unlock_page(hpage); - put_page(hpage); + folio_unlock(new_folio); + folio_put(new_folio); out: VM_BUG_ON(!list_empty(&pagelist)); - trace_mm_khugepaged_collapse_file(mm, hpage, index, is_shmem, addr, file, nr, result); + trace_mm_khugepaged_collapse_file(mm, new_folio, index, is_shmem, addr, file, HPAGE_PMD_NR, result); return result; } @@ -2223,7 +2210,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, struct file *file, pgoff_t start, struct collapse_control *cc) { - struct page *page = NULL; + struct folio *folio = NULL; struct address_space *mapping = file->f_mapping; XA_STATE(xas, &mapping->i_pages, start); int present, swap; @@ -2235,11 +2222,11 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, memset(cc->node_load, 0, sizeof(cc->node_load)); nodes_clear(cc->alloc_nmask); rcu_read_lock(); - xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) { - if (xas_retry(&xas, page)) + xas_for_each(&xas, folio, start + HPAGE_PMD_NR - 1) 
{ + if (xas_retry(&xas, folio)) continue; - if (xa_is_value(page)) { + if (xa_is_value(folio)) { ++swap; if (cc->is_khugepaged && swap > khugepaged_max_ptes_swap) { @@ -2254,11 +2241,9 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, * TODO: khugepaged should compact smaller compound pages * into a PMD sized page */ - if (PageTransCompound(page)) { - struct page *head = compound_head(page); - - result = compound_order(head) == HPAGE_PMD_ORDER && - head->index == start + if (folio_test_large(folio)) { + result = folio_order(folio) == HPAGE_PMD_ORDER && + folio->index == start /* Maybe PMD-mapped */ ? SCAN_PTE_MAPPED_HUGEPAGE : SCAN_PAGE_COMPOUND; @@ -2271,28 +2256,29 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, break; } - node = page_to_nid(page); + node = folio_nid(folio); if (hpage_collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; break; } cc->node_load[node]++; - if (!PageLRU(page)) { + if (!folio_test_lru(folio)) { result = SCAN_PAGE_LRU; break; } - if (page_count(page) != - 1 + page_mapcount(page) + page_has_private(page)) { + if (folio_ref_count(folio) != + 1 + folio_mapcount(folio) + folio_test_private(folio)) { result = SCAN_PAGE_COUNT; break; } /* - * We probably should check if the page is referenced here, but - * nobody would transfer pte_young() to PageReferenced() for us. - * And rmap walk here is just too costly... + * We probably should check if the folio is referenced + * here, but nobody would transfer pte_young() to + * folio_test_referenced() for us. And rmap walk here + * is just too costly... 
*/ present++; @@ -2314,7 +2300,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, } } - trace_mm_khugepaged_scan_file(mm, page, file, present, swap, result); + trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result); return result; } #else @@ -2376,8 +2362,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, progress++; break; } - if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, - true, PMD_ORDER)) { + if (!thp_vma_allowable_order(vma, vma->vm_flags, + TVA_ENFORCE_SYSFS, PMD_ORDER)) { skip: progress++; continue; @@ -2714,8 +2700,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, *prev = vma; - if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, false, - PMD_ORDER)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER)) return -EINVAL; cc = kmalloc(sizeof(*cc), GFP_KERNEL); diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 6a540c2b27..d5b6fba44f 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -114,12 +114,6 @@ #define BYTES_PER_POINTER sizeof(void *) -/* GFP bitmask for kmemleak internal allocations */ -#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC | \ - __GFP_NOLOCKDEP)) | \ - __GFP_NORETRY | __GFP_NOMEMALLOC | \ - __GFP_NOWARN) - /* scanning area inside a memory block */ struct kmemleak_scan_area { struct hlist_node node; @@ -158,9 +152,9 @@ struct kmemleak_object { int count; /* checksum for detecting modified objects */ u32 checksum; + depot_stack_handle_t trace_handle; /* memory ranges to be scanned inside an object (empty for all) */ struct hlist_head area_list; - depot_stack_handle_t trace_handle; unsigned long jiffies; /* creation timestamp */ pid_t pid; /* pid of the current task */ char comm[TASK_COMM_LEN]; /* executable name */ @@ -463,7 +457,8 @@ static struct kmemleak_object *mem_pool_alloc(gfp_t gfp) /* try the slab allocator first */ if (object_cache) { - object = 
kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); + object = kmem_cache_alloc_noprof(object_cache, + gfp_nested_mask(gfp)); if (object) return object; } @@ -947,7 +942,8 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) untagged_objp = (unsigned long)kasan_reset_tag((void *)object->pointer); if (scan_area_cache) - area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp)); + area = kmem_cache_alloc_noprof(scan_area_cache, + gfp_nested_mask(gfp)); raw_spin_lock_irqsave(&object->lock, flags); if (!area) { diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 0b09daa188..22e8657800 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -285,6 +285,17 @@ void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy, } EXPORT_SYMBOL(kmsan_copy_to_user); +void kmsan_memmove(void *to, const void *from, size_t size) +{ + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + kmsan_enter_runtime(); + kmsan_internal_memmove_metadata(to, (void *)from, size); + kmsan_leave_runtime(); +} +EXPORT_SYMBOL(kmsan_memmove); + /* Helper function to check an URB. */ void kmsan_handle_urb(const struct urb *urb, bool is_out) { @@ -890,14 +890,14 @@ static void remove_node_from_stable_tree(struct ksm_stable_node *stable_node) free_stable_node(stable_node); } -enum get_ksm_page_flags { - GET_KSM_PAGE_NOLOCK, - GET_KSM_PAGE_LOCK, - GET_KSM_PAGE_TRYLOCK +enum ksm_get_folio_flags { + KSM_GET_FOLIO_NOLOCK, + KSM_GET_FOLIO_LOCK, + KSM_GET_FOLIO_TRYLOCK }; /* - * get_ksm_page: checks if the page indicated by the stable node + * ksm_get_folio: checks if the page indicated by the stable node * is still its ksm page, despite having held no reference to it. * In which case we can trust the content of the page, and it * returns the gotten page; but if the page has now been zapped, @@ -915,10 +915,10 @@ enum get_ksm_page_flags { * a page to put something that might look like our key in page->mapping. 
* is on its way to being freed; but it is an anomaly to bear in mind. */ -static struct page *get_ksm_page(struct ksm_stable_node *stable_node, - enum get_ksm_page_flags flags) +static struct folio *ksm_get_folio(struct ksm_stable_node *stable_node, + enum ksm_get_folio_flags flags) { - struct page *page; + struct folio *folio; void *expected_mapping; unsigned long kpfn; @@ -926,8 +926,8 @@ static struct page *get_ksm_page(struct ksm_stable_node *stable_node, PAGE_MAPPING_KSM); again: kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */ - page = pfn_to_page(kpfn); - if (READ_ONCE(page->mapping) != expected_mapping) + folio = pfn_folio(kpfn); + if (READ_ONCE(folio->mapping) != expected_mapping) goto stale; /* @@ -940,41 +940,41 @@ again: * in folio_migrate_mapping(), it might still be our page, * in which case it's essential to keep the node. */ - while (!get_page_unless_zero(page)) { + while (!folio_try_get(folio)) { /* * Another check for page->mapping != expected_mapping would * work here too. We have chosen the !PageSwapCache test to * optimize the common case, when the page is or is about to * be freed: PageSwapCache is cleared (under spin_lock_irq) * in the ref_freeze section of __remove_mapping(); but Anon - * page->mapping reset to NULL later, in free_pages_prepare(). + * folio->mapping reset to NULL later, in free_pages_prepare(). 
*/ - if (!PageSwapCache(page)) + if (!folio_test_swapcache(folio)) goto stale; cpu_relax(); } - if (READ_ONCE(page->mapping) != expected_mapping) { - put_page(page); + if (READ_ONCE(folio->mapping) != expected_mapping) { + folio_put(folio); goto stale; } - if (flags == GET_KSM_PAGE_TRYLOCK) { - if (!trylock_page(page)) { - put_page(page); + if (flags == KSM_GET_FOLIO_TRYLOCK) { + if (!folio_trylock(folio)) { + folio_put(folio); return ERR_PTR(-EBUSY); } - } else if (flags == GET_KSM_PAGE_LOCK) - lock_page(page); + } else if (flags == KSM_GET_FOLIO_LOCK) + folio_lock(folio); - if (flags != GET_KSM_PAGE_NOLOCK) { - if (READ_ONCE(page->mapping) != expected_mapping) { - unlock_page(page); - put_page(page); + if (flags != KSM_GET_FOLIO_NOLOCK) { + if (READ_ONCE(folio->mapping) != expected_mapping) { + folio_unlock(folio); + folio_put(folio); goto stale; } } - return page; + return folio; stale: /* @@ -998,16 +998,16 @@ static void remove_rmap_item_from_tree(struct ksm_rmap_item *rmap_item) { if (rmap_item->address & STABLE_FLAG) { struct ksm_stable_node *stable_node; - struct page *page; + struct folio *folio; stable_node = rmap_item->head; - page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK); - if (!page) + folio = ksm_get_folio(stable_node, KSM_GET_FOLIO_LOCK); + if (!folio) goto out; hlist_del(&rmap_item->hlist); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); if (!hlist_empty(&stable_node->hlist)) ksm_pages_sharing--; @@ -1094,11 +1094,11 @@ static inline struct ksm_stable_node *page_stable_node(struct page *page) return folio_stable_node(page_folio(page)); } -static inline void set_page_stable_node(struct page *page, - struct ksm_stable_node *stable_node) +static inline void folio_set_stable_node(struct folio *folio, + struct ksm_stable_node *stable_node) { - VM_BUG_ON_PAGE(PageAnon(page) && PageAnonExclusive(page), page); - page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); + 
VM_WARN_ON_FOLIO(folio_test_anon(folio) && PageAnonExclusive(&folio->page), folio); + folio->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); } #ifdef CONFIG_SYSFS @@ -1107,13 +1107,13 @@ static inline void set_page_stable_node(struct page *page, */ static int remove_stable_node(struct ksm_stable_node *stable_node) { - struct page *page; + struct folio *folio; int err; - page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK); - if (!page) { + folio = ksm_get_folio(stable_node, KSM_GET_FOLIO_LOCK); + if (!folio) { /* - * get_ksm_page did remove_node_from_stable_tree itself. + * ksm_get_folio did remove_node_from_stable_tree itself. */ return 0; } @@ -1124,22 +1124,22 @@ static int remove_stable_node(struct ksm_stable_node *stable_node) * merge_across_nodes/max_page_sharing be switched. */ err = -EBUSY; - if (!page_mapped(page)) { + if (!folio_mapped(folio)) { /* - * The stable node did not yet appear stale to get_ksm_page(), - * since that allows for an unmapped ksm page to be recognized + * The stable node did not yet appear stale to ksm_get_folio(), + * since that allows for an unmapped ksm folio to be recognized * right up until it is freed; but the node is safe to remove. - * This page might be in an LRU cache waiting to be freed, - * or it might be PageSwapCache (perhaps under writeback), + * This folio might be in an LRU cache waiting to be freed, + * or it might be in the swapcache (perhaps under writeback), * or it might have been removed from swapcache a moment ago. 
*/ - set_page_stable_node(page, NULL); + folio_set_stable_node(folio, NULL); remove_node_from_stable_tree(stable_node); err = 0; } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return err; } @@ -1275,23 +1275,24 @@ static u32 calc_checksum(struct page *page) return checksum; } -static int write_protect_page(struct vm_area_struct *vma, struct page *page, +static int write_protect_page(struct vm_area_struct *vma, struct folio *folio, pte_t *orig_pte) { struct mm_struct *mm = vma->vm_mm; - DEFINE_PAGE_VMA_WALK(pvmw, page, vma, 0, 0); + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, 0, 0); int swapped; int err = -EFAULT; struct mmu_notifier_range range; bool anon_exclusive; pte_t entry; - pvmw.address = page_address_in_vma(page, vma); + if (WARN_ON_ONCE(folio_test_large(folio))) + return err; + + pvmw.address = page_address_in_vma(&folio->page, vma); if (pvmw.address == -EFAULT) goto out; - BUG_ON(PageTransCompound(page)); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, pvmw.address, pvmw.address + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -1301,12 +1302,12 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?")) goto out_unlock; - anon_exclusive = PageAnonExclusive(page); + anon_exclusive = PageAnonExclusive(&folio->page); entry = ptep_get(pvmw.pte); if (pte_write(entry) || pte_dirty(entry) || anon_exclusive || mm_tlb_flush_pending(mm)) { - swapped = PageSwapCache(page); - flush_cache_page(vma, pvmw.address, page_to_pfn(page)); + swapped = folio_test_swapcache(folio); + flush_cache_page(vma, pvmw.address, folio_pfn(folio)); /* * Ok this is tricky, when get_user_pages_fast() run it doesn't * take any lock, therefore the check that we are going to make @@ -1326,26 +1327,26 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, * Check that no O_DIRECT or similar I/O is in progress on the * page */ - if 
(page_mapcount(page) + 1 + swapped != page_count(page)) { + if (folio_mapcount(folio) + 1 + swapped != folio_ref_count(folio)) { set_pte_at(mm, pvmw.address, pvmw.pte, entry); goto out_unlock; } /* See folio_try_share_anon_rmap_pte(): clear PTE first. */ if (anon_exclusive && - folio_try_share_anon_rmap_pte(page_folio(page), page)) { + folio_try_share_anon_rmap_pte(folio, &folio->page)) { set_pte_at(mm, pvmw.address, pvmw.pte, entry); goto out_unlock; } if (pte_dirty(entry)) - set_page_dirty(page); + folio_mark_dirty(folio); entry = pte_mkclean(entry); if (pte_write(entry)) entry = pte_wrprotect(entry); - set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry); + set_pte_at(mm, pvmw.address, pvmw.pte, entry); } *orig_pte = entry; err = 0; @@ -1446,7 +1447,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, * See Documentation/mm/mmu_notifier.rst */ ptep_clear_flush(vma, addr, ptep); - set_pte_at_notify(mm, addr, ptep, newpte); + set_pte_at(mm, addr, ptep, newpte); folio = page_folio(page); folio_remove_rmap_pte(folio, page, vma); @@ -1504,14 +1505,14 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, * ptes are necessarily already write-protected. But in either * case, we need to lock and check page_count is not raised. */ - if (write_protect_page(vma, page, &orig_pte) == 0) { + if (write_protect_page(vma, page_folio(page), &orig_pte) == 0) { if (!kpage) { /* * While we hold page lock, upgrade page from * PageAnon+anon_vma to PageKsm+NULL stable_node: * stable_tree_insert() will update stable_node. 
*/ - set_page_stable_node(page, NULL); + folio_set_stable_node(page_folio(page), NULL); mark_page_accessed(page); /* * Page reclaim just frees a clean page with no dirty @@ -1616,14 +1617,14 @@ bool is_page_sharing_candidate(struct ksm_stable_node *stable_node) return __is_page_sharing_candidate(stable_node, 0); } -static struct page *stable_node_dup(struct ksm_stable_node **_stable_node_dup, - struct ksm_stable_node **_stable_node, - struct rb_root *root, - bool prune_stale_stable_nodes) +static struct folio *stable_node_dup(struct ksm_stable_node **_stable_node_dup, + struct ksm_stable_node **_stable_node, + struct rb_root *root, + bool prune_stale_stable_nodes) { struct ksm_stable_node *dup, *found = NULL, *stable_node = *_stable_node; struct hlist_node *hlist_safe; - struct page *_tree_page, *tree_page = NULL; + struct folio *folio, *tree_folio = NULL; int nr = 0; int found_rmap_hlist_len; @@ -1642,24 +1643,24 @@ static struct page *stable_node_dup(struct ksm_stable_node **_stable_node_dup, * We must walk all stable_node_dup to prune the stale * stable nodes during lookup. * - * get_ksm_page can drop the nodes from the + * ksm_get_folio can drop the nodes from the * stable_node->hlist if they point to freed pages * (that's why we do a _safe walk). The "dup" * stable_node parameter itself will be freed from * under us if it returns NULL. 
*/ - _tree_page = get_ksm_page(dup, GET_KSM_PAGE_NOLOCK); - if (!_tree_page) + folio = ksm_get_folio(dup, KSM_GET_FOLIO_NOLOCK); + if (!folio) continue; nr += 1; if (is_page_sharing_candidate(dup)) { if (!found || dup->rmap_hlist_len > found_rmap_hlist_len) { if (found) - put_page(tree_page); + folio_put(tree_folio); found = dup; found_rmap_hlist_len = found->rmap_hlist_len; - tree_page = _tree_page; + tree_folio = folio; /* skip put_page for found dup */ if (!prune_stale_stable_nodes) @@ -1667,7 +1668,7 @@ static struct page *stable_node_dup(struct ksm_stable_node **_stable_node_dup, continue; } } - put_page(_tree_page); + folio_put(folio); } if (found) { @@ -1732,7 +1733,7 @@ static struct page *stable_node_dup(struct ksm_stable_node **_stable_node_dup, } *_stable_node_dup = found; - return tree_page; + return tree_folio; } static struct ksm_stable_node *stable_node_dup_any(struct ksm_stable_node *stable_node, @@ -1749,7 +1750,7 @@ static struct ksm_stable_node *stable_node_dup_any(struct ksm_stable_node *stabl } /* - * Like for get_ksm_page, this function can free the *_stable_node and + * Like for ksm_get_folio, this function can free the *_stable_node and * *_stable_node_dup if the returned tree_page is NULL. * * It can also free and overwrite *_stable_node with the found @@ -1762,16 +1763,16 @@ static struct ksm_stable_node *stable_node_dup_any(struct ksm_stable_node *stabl * function and will be overwritten in all cases, the caller doesn't * need to initialize it. 
*/ -static struct page *__stable_node_chain(struct ksm_stable_node **_stable_node_dup, - struct ksm_stable_node **_stable_node, - struct rb_root *root, - bool prune_stale_stable_nodes) +static struct folio *__stable_node_chain(struct ksm_stable_node **_stable_node_dup, + struct ksm_stable_node **_stable_node, + struct rb_root *root, + bool prune_stale_stable_nodes) { struct ksm_stable_node *stable_node = *_stable_node; if (!is_stable_node_chain(stable_node)) { if (is_page_sharing_candidate(stable_node)) { *_stable_node_dup = stable_node; - return get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK); + return ksm_get_folio(stable_node, KSM_GET_FOLIO_NOLOCK); } /* * _stable_node_dup set to NULL means the stable_node @@ -1784,24 +1785,24 @@ static struct page *__stable_node_chain(struct ksm_stable_node **_stable_node_du prune_stale_stable_nodes); } -static __always_inline struct page *chain_prune(struct ksm_stable_node **s_n_d, - struct ksm_stable_node **s_n, - struct rb_root *root) +static __always_inline struct folio *chain_prune(struct ksm_stable_node **s_n_d, + struct ksm_stable_node **s_n, + struct rb_root *root) { return __stable_node_chain(s_n_d, s_n, root, true); } -static __always_inline struct page *chain(struct ksm_stable_node **s_n_d, - struct ksm_stable_node *s_n, - struct rb_root *root) +static __always_inline struct folio *chain(struct ksm_stable_node **s_n_d, + struct ksm_stable_node *s_n, + struct rb_root *root) { struct ksm_stable_node *old_stable_node = s_n; - struct page *tree_page; + struct folio *tree_folio; - tree_page = __stable_node_chain(s_n_d, &s_n, root, false); + tree_folio = __stable_node_chain(s_n_d, &s_n, root, false); /* not pruning dups so s_n cannot have changed */ VM_BUG_ON(s_n != old_stable_node); - return tree_page; + return tree_folio; } /* @@ -1821,28 +1822,30 @@ static struct page *stable_tree_search(struct page *page) struct rb_node *parent; struct ksm_stable_node *stable_node, *stable_node_dup, *stable_node_any; struct 
ksm_stable_node *page_node; + struct folio *folio; - page_node = page_stable_node(page); + folio = page_folio(page); + page_node = folio_stable_node(folio); if (page_node && page_node->head != &migrate_nodes) { /* ksm page forked */ - get_page(page); - return page; + folio_get(folio); + return &folio->page; } - nid = get_kpfn_nid(page_to_pfn(page)); + nid = get_kpfn_nid(folio_pfn(folio)); root = root_stable_tree + nid; again: new = &root->rb_node; parent = NULL; while (*new) { - struct page *tree_page; + struct folio *tree_folio; int ret; cond_resched(); stable_node = rb_entry(*new, struct ksm_stable_node, node); stable_node_any = NULL; - tree_page = chain_prune(&stable_node_dup, &stable_node, root); + tree_folio = chain_prune(&stable_node_dup, &stable_node, root); /* * NOTE: stable_node may have been freed by * chain_prune() if the returned stable_node_dup is @@ -1876,14 +1879,14 @@ again: * write protected at all times. Any will work * fine to continue the walk. */ - tree_page = get_ksm_page(stable_node_any, - GET_KSM_PAGE_NOLOCK); + tree_folio = ksm_get_folio(stable_node_any, + KSM_GET_FOLIO_NOLOCK); } VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); - if (!tree_page) { + if (!tree_folio) { /* * If we walked over a stale stable_node, - * get_ksm_page() will call rb_erase() and it + * ksm_get_folio() will call rb_erase() and it * may rebalance the tree from under us. So * restart the search from scratch. Returning * NULL would be safe too, but we'd generate @@ -1893,8 +1896,8 @@ again: goto again; } - ret = memcmp_pages(page, tree_page); - put_page(tree_page); + ret = memcmp_pages(page, &tree_folio->page); + folio_put(tree_folio); parent = *new; if (ret < 0) @@ -1905,12 +1908,15 @@ again: if (page_node) { VM_BUG_ON(page_node->head != &migrate_nodes); /* - * Test if the migrated page should be merged - * into a stable node dup. If the mapcount is - * 1 we can migrate it with another KSM page - * without adding it to the chain. 
+ * If the mapcount of our migrated KSM folio is + * at most 1, we can merge it with another + * KSM folio where we know that we have space + * for one more mapping without exceeding the + * ksm_max_page_sharing limit: see + * chain_prune(). This way, we can avoid adding + * this stable node to the chain. */ - if (page_mapcount(page) > 1) + if (folio_mapcount(folio) > 1) goto chain_append; } @@ -1937,26 +1943,26 @@ again: * It would be more elegant to return stable_node * than kpage, but that involves more changes. */ - tree_page = get_ksm_page(stable_node_dup, - GET_KSM_PAGE_TRYLOCK); + tree_folio = ksm_get_folio(stable_node_dup, + KSM_GET_FOLIO_TRYLOCK); - if (PTR_ERR(tree_page) == -EBUSY) + if (PTR_ERR(tree_folio) == -EBUSY) return ERR_PTR(-EBUSY); - if (unlikely(!tree_page)) + if (unlikely(!tree_folio)) /* * The tree may have been rebalanced, * so re-evaluate parent and new. */ goto again; - unlock_page(tree_page); + folio_unlock(tree_folio); if (get_kpfn_nid(stable_node_dup->kpfn) != NUMA(stable_node_dup->nid)) { - put_page(tree_page); + folio_put(tree_folio); goto replace; } - return tree_page; + return &tree_folio->page; } } @@ -1969,8 +1975,8 @@ again: rb_insert_color(&page_node->node, root); out: if (is_page_sharing_candidate(page_node)) { - get_page(page); - return page; + folio_get(folio); + return &folio->page; } else return NULL; @@ -1995,12 +2001,12 @@ replace: &page_node->node, root); if (is_page_sharing_candidate(page_node)) - get_page(page); + folio_get(folio); else - page = NULL; + folio = NULL; } else { rb_erase(&stable_node_dup->node, root); - page = NULL; + folio = NULL; } } else { VM_BUG_ON(!is_stable_node_chain(stable_node)); @@ -2011,16 +2017,16 @@ replace: DO_NUMA(page_node->nid = nid); stable_node_chain_add_dup(page_node, stable_node); if (is_page_sharing_candidate(page_node)) - get_page(page); + folio_get(folio); else - page = NULL; + folio = NULL; } else { - page = NULL; + folio = NULL; } } stable_node_dup->head = &migrate_nodes; 
list_add(&stable_node_dup->list, stable_node_dup->head); - return page; + return &folio->page; chain_append: /* stable_node_dup could be null if it reached the limit */ @@ -2063,7 +2069,7 @@ chain_append: * This function returns the stable tree node just allocated on success, * NULL otherwise. */ -static struct ksm_stable_node *stable_tree_insert(struct page *kpage) +static struct ksm_stable_node *stable_tree_insert(struct folio *kfolio) { int nid; unsigned long kpfn; @@ -2073,7 +2079,7 @@ static struct ksm_stable_node *stable_tree_insert(struct page *kpage) struct ksm_stable_node *stable_node, *stable_node_dup, *stable_node_any; bool need_chain = false; - kpfn = page_to_pfn(kpage); + kpfn = folio_pfn(kfolio); nid = get_kpfn_nid(kpfn); root = root_stable_tree + nid; again: @@ -2081,13 +2087,13 @@ again: new = &root->rb_node; while (*new) { - struct page *tree_page; + struct folio *tree_folio; int ret; cond_resched(); stable_node = rb_entry(*new, struct ksm_stable_node, node); stable_node_any = NULL; - tree_page = chain(&stable_node_dup, stable_node, root); + tree_folio = chain(&stable_node_dup, stable_node, root); if (!stable_node_dup) { /* * Either all stable_node dups were full in @@ -2109,14 +2115,14 @@ again: * write protected at all times. Any will work * fine to continue the walk. */ - tree_page = get_ksm_page(stable_node_any, - GET_KSM_PAGE_NOLOCK); + tree_folio = ksm_get_folio(stable_node_any, + KSM_GET_FOLIO_NOLOCK); } VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); - if (!tree_page) { + if (!tree_folio) { /* * If we walked over a stale stable_node, - * get_ksm_page() will call rb_erase() and it + * ksm_get_folio() will call rb_erase() and it * may rebalance the tree from under us. So * restart the search from scratch. 
Returning * NULL would be safe too, but we'd generate @@ -2126,8 +2132,8 @@ again: goto again; } - ret = memcmp_pages(kpage, tree_page); - put_page(tree_page); + ret = memcmp_pages(&kfolio->page, &tree_folio->page); + folio_put(tree_folio); parent = *new; if (ret < 0) @@ -2146,7 +2152,6 @@ again: INIT_HLIST_HEAD(&stable_node_dup->hlist); stable_node_dup->kpfn = kpfn; - set_page_stable_node(kpage, stable_node_dup); stable_node_dup->rmap_hlist_len = 0; DO_NUMA(stable_node_dup->nid = nid); if (!need_chain) { @@ -2165,6 +2170,8 @@ again: stable_node_chain_add_dup(stable_node_dup, stable_node); } + folio_set_stable_node(kfolio, stable_node_dup); + return stable_node_dup; } @@ -2424,7 +2431,7 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite * node in the stable tree and add both rmap_items. */ lock_page(kpage); - stable_node = stable_tree_insert(kpage); + stable_node = stable_tree_insert(page_folio(kpage)); if (stable_node) { stable_tree_append(tree_rmap_item, stable_node, false); @@ -2596,14 +2603,14 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) */ if (!ksm_merge_across_nodes) { struct ksm_stable_node *stable_node, *next; - struct page *page; + struct folio *folio; list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) { - page = get_ksm_page(stable_node, - GET_KSM_PAGE_NOLOCK); - if (page) - put_page(page); + folio = ksm_get_folio(stable_node, + KSM_GET_FOLIO_NOLOCK); + if (folio) + folio_put(folio); cond_resched(); } } @@ -3169,12 +3176,11 @@ again: /* * Collect processes when the error hit an ksm page. 
*/ -void collect_procs_ksm(struct page *page, struct list_head *to_kill, - int force_early) +void collect_procs_ksm(struct folio *folio, struct page *page, + struct list_head *to_kill, int force_early) { struct ksm_stable_node *stable_node; struct ksm_rmap_item *rmap_item; - struct folio *folio = page_folio(page); struct vm_area_struct *vma; struct task_struct *tsk; @@ -3226,11 +3232,11 @@ void folio_migrate_ksm(struct folio *newfolio, struct folio *folio) /* * newfolio->mapping was set in advance; now we need smp_wmb() * to make sure that the new stable_node->kpfn is visible - * to get_ksm_page() before it can see that folio->mapping + * to ksm_get_folio() before it can see that folio->mapping * has gone stale (or that folio_test_swapcache has been cleared). */ smp_wmb(); - set_page_stable_node(&folio->page, NULL); + folio_set_stable_node(folio, NULL); } } #endif /* CONFIG_MIGRATION */ @@ -3253,7 +3259,7 @@ static bool stable_node_dup_remove_range(struct ksm_stable_node *stable_node, if (stable_node->kpfn >= start_pfn && stable_node->kpfn < end_pfn) { /* - * Don't get_ksm_page, page has already gone: + * Don't ksm_get_folio, page has already gone: * which is why we keep kpfn instead of page* */ remove_node_from_stable_tree(stable_node); @@ -3341,7 +3347,7 @@ static int ksm_memory_callback(struct notifier_block *self, * Most of the work is done by page migration; but there might * be a few stable_nodes left over, still pointing to struct * pages which have been offlined: prune those from the tree, - * otherwise get_ksm_page() might later try to access a + * otherwise ksm_get_folio() might later try to access a * non-existent struct page. 
*/ ksm_check_stable_tree(mn->start_pfn, diff --git a/mm/madvise.c b/mm/madvise.c index 1a073fcc4c..a77893462b 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -321,6 +321,18 @@ static inline bool can_do_file_pageout(struct vm_area_struct *vma) file_permission(vma->vm_file, MAY_WRITE) == 0; } +static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end, + struct folio *folio, pte_t *ptep, + pte_t pte, bool *any_young, + bool *any_dirty) +{ + const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; + int max_nr = (end - addr) / PAGE_SIZE; + + return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL, + any_young, any_dirty); +} + static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -336,6 +348,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, LIST_HEAD(folio_list); bool pageout_anon_only_filter; unsigned int batch_count = 0; + int nr; if (fatal_signal_pending(current)) return -EINTR; @@ -363,10 +376,10 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, goto huge_unlock; } - folio = pfn_folio(pmd_pfn(orig_pmd)); + folio = pmd_folio(orig_pmd); /* Do not interfere with other mappings of this folio */ - if (folio_estimated_sharers(folio) != 1) + if (folio_likely_mapped_shared(folio)) goto huge_unlock; if (pageout_anon_only_filter && !folio_test_anon(folio)) @@ -410,7 +423,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, huge_unlock: spin_unlock(ptl); if (pageout) - reclaim_pages(&folio_list, true); + reclaim_pages(&folio_list); return 0; } @@ -423,7 +436,8 @@ restart: return 0; flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); - for (; addr < end; pte++, addr += PAGE_SIZE) { + for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) { + nr = 1; ptent = ptep_get(pte); if (++batch_count == SWAP_CLUSTER_MAX) { @@ -447,55 +461,64 @@ restart: continue; /* - * Creating a THP page is expensive so split it only if we - * are sure 
it's worth. Split it if we are only owner. + * If we encounter a large folio, only split it if it is not + * fully mapped within the range we are operating on. Otherwise + * leave it as is so that it can be swapped out whole. If we + * fail to split a folio, leave it in place and advance to the + * next pte in the range. */ if (folio_test_large(folio)) { - int err; - - if (folio_estimated_sharers(folio) > 1) - break; - if (pageout_anon_only_filter && !folio_test_anon(folio)) - break; - if (!folio_trylock(folio)) - break; - folio_get(folio); - arch_leave_lazy_mmu_mode(); - pte_unmap_unlock(start_pte, ptl); - start_pte = NULL; - err = split_folio(folio); - folio_unlock(folio); - folio_put(folio); - if (err) - break; - start_pte = pte = - pte_offset_map_lock(mm, pmd, addr, &ptl); - if (!start_pte) - break; - arch_enter_lazy_mmu_mode(); - pte--; - addr -= PAGE_SIZE; - continue; + bool any_young; + + nr = madvise_folio_pte_batch(addr, end, folio, pte, + ptent, &any_young, NULL); + if (any_young) + ptent = pte_mkyoung(ptent); + + if (nr < folio_nr_pages(folio)) { + int err; + + if (folio_likely_mapped_shared(folio)) + continue; + if (pageout_anon_only_filter && !folio_test_anon(folio)) + continue; + if (!folio_trylock(folio)) + continue; + folio_get(folio); + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(start_pte, ptl); + start_pte = NULL; + err = split_folio(folio); + folio_unlock(folio); + folio_put(folio); + start_pte = pte = + pte_offset_map_lock(mm, pmd, addr, &ptl); + if (!start_pte) + break; + arch_enter_lazy_mmu_mode(); + if (!err) + nr = 0; + continue; + } } /* * Do not interfere with other mappings of this folio and - * non-LRU folio. + * non-LRU folio. If we have a large folio at this point, we + * know it is fully mapped so if its mapcount is the same as its + * number of pages, it must be exclusive. 
*/ - if (!folio_test_lru(folio) || folio_mapcount(folio) != 1) + if (!folio_test_lru(folio) || + folio_mapcount(folio) != folio_nr_pages(folio)) continue; if (pageout_anon_only_filter && !folio_test_anon(folio)) continue; - VM_BUG_ON_FOLIO(folio_test_large(folio), folio); - if (!pageout && pte_young(ptent)) { - ptent = ptep_get_and_clear_full(mm, addr, pte, - tlb->fullmm); - ptent = pte_mkold(ptent); - set_pte_at(mm, addr, pte, ptent); - tlb_remove_tlb_entry(tlb, pte, addr); + clear_young_dirty_ptes(vma, addr, pte, nr, + CYDP_CLEAR_YOUNG); + tlb_remove_tlb_entries(tlb, pte, nr, addr); } /* @@ -524,7 +547,7 @@ restart: pte_unmap_unlock(start_pte, ptl); } if (pageout) - reclaim_pages(&folio_list, true); + reclaim_pages(&folio_list); cond_resched(); return 0; @@ -620,6 +643,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { + const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY; struct mmu_gather *tlb = walk->private; struct mm_struct *mm = tlb->mm; struct vm_area_struct *vma = walk->vma; @@ -628,6 +652,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, struct folio *folio; int nr_swap = 0; unsigned long next; + int nr, max_nr; next = pmd_addr_end(addr, end); if (pmd_trans_huge(*pmd)) @@ -640,7 +665,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, return 0; flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); - for (; addr != end; pte++, addr += PAGE_SIZE) { + for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) { + nr = 1; ptent = ptep_get(pte); if (pte_none(ptent)) @@ -655,9 +681,11 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, entry = pte_to_swp_entry(ptent); if (!non_swap_entry(entry)) { - nr_swap--; - free_swap_and_cache(entry); - pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); + max_nr = (end - addr) / PAGE_SIZE; + nr = swap_pte_batch(pte, max_nr, ptent); + nr_swap -= nr; + free_swap_and_cache_nr(entry, nr); + 
clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm); } else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) { pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); @@ -670,44 +698,57 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, continue; /* - * If pmd isn't transhuge but the folio is large and - * is owned by only this process, split it and - * deactivate all pages. + * If we encounter a large folio, only split it if it is not + * fully mapped within the range we are operating on. Otherwise + * leave it as is so that it can be marked as lazyfree. If we + * fail to split a folio, leave it in place and advance to the + * next pte in the range. */ if (folio_test_large(folio)) { - int err; + bool any_young, any_dirty; - if (folio_estimated_sharers(folio) != 1) - break; - if (!folio_trylock(folio)) - break; - folio_get(folio); - arch_leave_lazy_mmu_mode(); - pte_unmap_unlock(start_pte, ptl); - start_pte = NULL; - err = split_folio(folio); - folio_unlock(folio); - folio_put(folio); - if (err) - break; - start_pte = pte = - pte_offset_map_lock(mm, pmd, addr, &ptl); - if (!start_pte) - break; - arch_enter_lazy_mmu_mode(); - pte--; - addr -= PAGE_SIZE; - continue; + nr = madvise_folio_pte_batch(addr, end, folio, pte, + ptent, &any_young, &any_dirty); + + if (nr < folio_nr_pages(folio)) { + int err; + + if (folio_likely_mapped_shared(folio)) + continue; + if (!folio_trylock(folio)) + continue; + folio_get(folio); + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(start_pte, ptl); + start_pte = NULL; + err = split_folio(folio); + folio_unlock(folio); + folio_put(folio); + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + start_pte = pte; + if (!start_pte) + break; + arch_enter_lazy_mmu_mode(); + if (!err) + nr = 0; + continue; + } + + if (any_young) + ptent = pte_mkyoung(ptent); + if (any_dirty) + ptent = pte_mkdirty(ptent); } if (folio_test_swapcache(folio) || folio_test_dirty(folio)) { if (!folio_trylock(folio)) continue; /* - * 
If folio is shared with others, we mustn't clear - * the folio's dirty flag. + * If we have a large folio at this point, we know it is + * fully mapped so if its mapcount is the same as its + * number of pages, it must be exclusive. */ - if (folio_mapcount(folio) != 1) { + if (folio_mapcount(folio) != folio_nr_pages(folio)) { folio_unlock(folio); continue; } @@ -723,19 +764,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, } if (pte_young(ptent) || pte_dirty(ptent)) { - /* - * Some of architecture(ex, PPC) don't update TLB - * with set_pte_at and tlb_remove_tlb_entry so for - * the portability, remap the pte with old|clean - * after pte clearing. - */ - ptent = ptep_get_and_clear_full(mm, addr, pte, - tlb->fullmm); - - ptent = pte_mkold(ptent); - ptent = pte_mkclean(ptent); - set_pte_at(mm, addr, pte, ptent); - tlb_remove_tlb_entry(tlb, pte, addr); + clear_young_dirty_ptes(vma, addr, pte, nr, cydp_flags); + tlb_remove_tlb_entries(tlb, pte, nr, addr); } folio_mark_lazyfree(folio); } @@ -901,26 +931,19 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, return -EINVAL; } -static long madvise_populate(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, unsigned long end, - int behavior) +static long madvise_populate(struct mm_struct *mm, unsigned long start, + unsigned long end, int behavior) { const bool write = behavior == MADV_POPULATE_WRITE; - struct mm_struct *mm = vma->vm_mm; int locked = 1; long pages; - *prev = vma; - while (start < end) { /* Populate (prefault) page tables readable/writable. 
*/ pages = faultin_page_range(mm, start, end, write, &locked); if (!locked) { mmap_read_lock(mm); locked = 1; - *prev = NULL; - vma = NULL; } if (pages < 0) { switch (pages) { @@ -1021,9 +1044,6 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: return madvise_dontneed_free(vma, prev, start, end, behavior); - case MADV_POPULATE_READ: - case MADV_POPULATE_WRITE: - return madvise_populate(vma, prev, start, end, behavior); case MADV_NORMAL: new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; break; @@ -1381,6 +1401,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, * -EIO - an I/O error occurred while paging in data. * -EBADF - map exists, but area maps something that isn't a file. * -EAGAIN - a kernel resource was temporarily unavailable. + * -EPERM - memory is sealed. */ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) { @@ -1424,10 +1445,29 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh start = untagged_addr_remote(mm, start); end = start + len; + /* + * Check if the address range is sealed for do_madvise(). + * can_modify_mm_madv assumes we have acquired the lock on MM. 
+ */ + if (unlikely(!can_modify_mm_madv(mm, start, end, behavior))) { + error = -EPERM; + goto out; + } + blk_start_plug(&plug); - error = madvise_walk_vmas(mm, start, end, behavior, - madvise_vma_behavior); + switch (behavior) { + case MADV_POPULATE_READ: + case MADV_POPULATE_WRITE: + error = madvise_populate(mm, start, end, behavior); + break; + default: + error = madvise_walk_vmas(mm, start, end, behavior, + madvise_vma_behavior); + break; + } blk_finish_plug(&plug); + +out: if (write) mmap_write_unlock(mm); else diff --git a/mm/memblock.c b/mm/memblock.c index 08e9806b1c..e81fb68f7f 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -754,7 +754,7 @@ bool __init_memblock memblock_validate_numa_coverage(unsigned long threshold_byt /* calculate lose page */ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { - if (nid == NUMA_NO_NODE) + if (!numa_valid_node(nid)) nr_pages += end_pfn - start_pfn; } @@ -1061,7 +1061,7 @@ static bool should_skip_region(struct memblock_type *type, return false; /* only memory regions are associated with nodes, check it */ - if (nid != NUMA_NO_NODE && nid != m_nid) + if (numa_valid_node(nid) && nid != m_nid) return true; /* skip hotpluggable memory regions if needed */ @@ -1118,10 +1118,6 @@ void __next_mem_range(u64 *idx, int nid, enum memblock_flags flags, int idx_a = *idx & 0xffffffff; int idx_b = *idx >> 32; - if (WARN_ONCE(nid == MAX_NUMNODES, - "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) - nid = NUMA_NO_NODE; - for (; idx_a < type_a->cnt; idx_a++) { struct memblock_region *m = &type_a->regions[idx_a]; @@ -1215,9 +1211,6 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, int idx_a = *idx & 0xffffffff; int idx_b = *idx >> 32; - if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. 
Use NUMA_NO_NODE instead\n")) - nid = NUMA_NO_NODE; - if (*idx == (u64)ULLONG_MAX) { idx_a = type_a->cnt - 1; if (type_b != NULL) @@ -1303,7 +1296,7 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, if (PFN_UP(r->base) >= PFN_DOWN(r->base + r->size)) continue; - if (nid == MAX_NUMNODES || nid == r_nid) + if (!numa_valid_node(nid) || nid == r_nid) break; } if (*idx >= type->cnt) { @@ -1339,10 +1332,6 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, int start_rgn, end_rgn; int i, ret; - if (WARN_ONCE(nid == MAX_NUMNODES, - "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) - nid = NUMA_NO_NODE; - ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); if (ret) return ret; @@ -1452,9 +1441,6 @@ phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, enum memblock_flags flags = choose_memblock_flags(); phys_addr_t found; - if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) - nid = NUMA_NO_NODE; - if (!align) { /* Can't use WARNs this early in boot on powerpc */ dump_stack(); @@ -1467,7 +1453,7 @@ again: if (found && !memblock_reserve(found, size)) goto done; - if (nid != NUMA_NO_NODE && !exact_nid) { + if (numa_valid_node(nid) && !exact_nid) { found = memblock_find_in_range_node(size, align, start, end, NUMA_NO_NODE, flags); @@ -1987,7 +1973,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type) end = base + size - 1; flags = rgn->flags; #ifdef CONFIG_NUMA - if (memblock_get_region_node(rgn) != MAX_NUMNODES) + if (numa_valid_node(memblock_get_region_node(rgn))) snprintf(nid_buf, sizeof(nid_buf), " on node %d", memblock_get_region_node(rgn)); #endif @@ -2181,7 +2167,7 @@ static void __init memmap_init_reserved_pages(void) start = region->base; end = start + region->size; - if (nid == NUMA_NO_NODE || nid >= MAX_NUMNODES) + if (!numa_valid_node(nid)) nid = early_pfn_to_nid(PFN_DOWN(start)); reserve_bootmem_region(start, 
end, nid); @@ -2272,7 +2258,7 @@ static int memblock_debug_show(struct seq_file *m, void *private) seq_printf(m, "%4d: ", i); seq_printf(m, "%pa..%pa ", &reg->base, &end); - if (nid != MAX_NUMNODES) + if (numa_valid_node(nid)) seq_printf(m, "%4d ", nid); else seq_printf(m, "%4c ", 'x'); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d960151da5..8f2f1bb18c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -350,7 +350,7 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg, /* * A lot of the calls to the cache allocation functions are expected to be - * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are + * inlined by the compiler. Since the calls to memcg_slab_post_alloc_hook() are * conditional to this static branch, we'll have to allow modules that does * kmem_cache_alloc and the such to see this symbol as well */ @@ -575,6 +575,136 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) return mz; } +/* Subset of node_stat_item for memcg stats */ +static const unsigned int memcg_node_stat_items[] = { + NR_INACTIVE_ANON, + NR_ACTIVE_ANON, + NR_INACTIVE_FILE, + NR_ACTIVE_FILE, + NR_UNEVICTABLE, + NR_SLAB_RECLAIMABLE_B, + NR_SLAB_UNRECLAIMABLE_B, + WORKINGSET_REFAULT_ANON, + WORKINGSET_REFAULT_FILE, + WORKINGSET_ACTIVATE_ANON, + WORKINGSET_ACTIVATE_FILE, + WORKINGSET_RESTORE_ANON, + WORKINGSET_RESTORE_FILE, + WORKINGSET_NODERECLAIM, + NR_ANON_MAPPED, + NR_FILE_MAPPED, + NR_FILE_PAGES, + NR_FILE_DIRTY, + NR_WRITEBACK, + NR_SHMEM, + NR_SHMEM_THPS, + NR_FILE_THPS, + NR_ANON_THPS, + NR_KERNEL_STACK_KB, + NR_PAGETABLE, + NR_SECONDARY_PAGETABLE, +#ifdef CONFIG_SWAP + NR_SWAPCACHE, +#endif +}; + +static const unsigned int memcg_stat_items[] = { + MEMCG_SWAP, + MEMCG_SOCK, + MEMCG_PERCPU_B, + MEMCG_VMALLOC, + MEMCG_KMEM, + MEMCG_ZSWAP_B, + MEMCG_ZSWAPPED, +}; + +#define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items) +#define MEMCG_VMSTAT_SIZE (NR_MEMCG_NODE_STAT_ITEMS + \ + 
ARRAY_SIZE(memcg_stat_items)) +static int8_t mem_cgroup_stats_index[MEMCG_NR_STAT] __read_mostly; + +static void init_memcg_stats(void) +{ + int8_t i, j = 0; + + BUILD_BUG_ON(MEMCG_NR_STAT >= S8_MAX); + + for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; ++i) + mem_cgroup_stats_index[memcg_node_stat_items[i]] = ++j; + + for (i = 0; i < ARRAY_SIZE(memcg_stat_items); ++i) + mem_cgroup_stats_index[memcg_stat_items[i]] = ++j; +} + +static inline int memcg_stats_index(int idx) +{ + return mem_cgroup_stats_index[idx] - 1; +} + +struct lruvec_stats_percpu { + /* Local (CPU and cgroup) state */ + long state[NR_MEMCG_NODE_STAT_ITEMS]; + + /* Delta calculation for lockless upward propagation */ + long state_prev[NR_MEMCG_NODE_STAT_ITEMS]; +}; + +struct lruvec_stats { + /* Aggregated (CPU and subtree) state */ + long state[NR_MEMCG_NODE_STAT_ITEMS]; + + /* Non-hierarchical (CPU aggregated) state */ + long state_local[NR_MEMCG_NODE_STAT_ITEMS]; + + /* Pending child counts during tree propagation */ + long state_pending[NR_MEMCG_NODE_STAT_ITEMS]; +}; + +unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) +{ + struct mem_cgroup_per_node *pn; + long x; + int i; + + if (mem_cgroup_disabled()) + return node_page_state(lruvec_pgdat(lruvec), idx); + + i = memcg_stats_index(idx); + if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx)) + return 0; + + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + x = READ_ONCE(pn->lruvec_stats->state[i]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; +} + +unsigned long lruvec_page_state_local(struct lruvec *lruvec, + enum node_stat_item idx) +{ + struct mem_cgroup_per_node *pn; + long x; + int i; + + if (mem_cgroup_disabled()) + return node_page_state(lruvec_pgdat(lruvec), idx); + + i = memcg_stats_index(idx); + if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx)) + return 0; + + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + x = 
READ_ONCE(pn->lruvec_stats->state_local[i]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; +} + /* Subset of vm_event_item to report for memcg event stats */ static const unsigned int memcg_vm_event_stat[] = { PGPGIN, @@ -606,11 +736,13 @@ static const unsigned int memcg_vm_event_stat[] = { }; #define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat) -static int mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly; +static int8_t mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly; static void init_memcg_events(void) { - int i; + int8_t i; + + BUILD_BUG_ON(NR_VM_EVENT_ITEMS >= S8_MAX); for (i = 0; i < NR_MEMCG_EVENTS; ++i) mem_cgroup_events_index[memcg_vm_event_stat[i]] = i + 1; @@ -632,11 +764,11 @@ struct memcg_vmstats_percpu { /* The above should fit a single cacheline for memcg_rstat_updated() */ /* Local (CPU and cgroup) page state & events */ - long state[MEMCG_NR_STAT]; + long state[MEMCG_VMSTAT_SIZE]; unsigned long events[NR_MEMCG_EVENTS]; /* Delta calculation for lockless upward propagation */ - long state_prev[MEMCG_NR_STAT]; + long state_prev[MEMCG_VMSTAT_SIZE]; unsigned long events_prev[NR_MEMCG_EVENTS]; /* Cgroup1: threshold notifications & softlimit tree updates */ @@ -646,15 +778,15 @@ struct memcg_vmstats_percpu { struct memcg_vmstats { /* Aggregated (CPU and subtree) page state & events */ - long state[MEMCG_NR_STAT]; + long state[MEMCG_VMSTAT_SIZE]; unsigned long events[NR_MEMCG_EVENTS]; /* Non-hierarchical (CPU aggregated) page state & events */ - long state_local[MEMCG_NR_STAT]; + long state_local[MEMCG_VMSTAT_SIZE]; unsigned long events_local[NR_MEMCG_EVENTS]; /* Pending child counts during tree propagation */ - long state_pending[MEMCG_NR_STAT]; + long state_pending[MEMCG_VMSTAT_SIZE]; unsigned long events_pending[NR_MEMCG_EVENTS]; /* Stats updates since the last flush */ @@ -715,6 +847,7 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) { struct memcg_vmstats_percpu *statc; int cpu = 
smp_processor_id(); + unsigned int stats_updates; if (!val) return; @@ -722,8 +855,9 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) cgroup_rstat_updated(memcg->css.cgroup, cpu); statc = this_cpu_ptr(memcg->vmstats_percpu); for (; statc; statc = statc->parent) { - statc->stats_updates += abs(val); - if (statc->stats_updates < MEMCG_CHARGE_BATCH) + stats_updates = READ_ONCE(statc->stats_updates) + abs(val); + WRITE_ONCE(statc->stats_updates, stats_updates); + if (stats_updates < MEMCG_CHARGE_BATCH) continue; /* @@ -731,9 +865,9 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) * redundant. Avoid the overhead of the atomic update. */ if (!memcg_vmstats_needs_flush(statc->vmstats)) - atomic64_add(statc->stats_updates, + atomic64_add(stats_updates, &statc->vmstats->stats_updates); - statc->stats_updates = 0; + WRITE_ONCE(statc->stats_updates, 0); } } @@ -785,7 +919,13 @@ static void flush_memcg_stats_dwork(struct work_struct *w) unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { - long x = READ_ONCE(memcg->vmstats->state[idx]); + long x; + int i = memcg_stats_index(idx); + + if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx)) + return 0; + + x = READ_ONCE(memcg->vmstats->state[i]); #ifdef CONFIG_SMP if (x < 0) x = 0; @@ -815,20 +955,31 @@ static int memcg_state_val_in_pages(int idx, int val) * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item * @val: delta to add to the counter, can be negative */ -void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) +void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, + int val) { + int i = memcg_stats_index(idx); + if (mem_cgroup_disabled()) return; - __this_cpu_add(memcg->vmstats_percpu->state[idx], val); + if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx)) + return; + + __this_cpu_add(memcg->vmstats_percpu->state[i], val); memcg_rstat_updated(memcg, 
memcg_state_val_in_pages(idx, val)); } /* idx can be of type enum memcg_stat_item or node_stat_item. */ static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) { - long x = READ_ONCE(memcg->vmstats->state_local[idx]); + long x; + int i = memcg_stats_index(idx); + + if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx)) + return 0; + x = READ_ONCE(memcg->vmstats->state_local[i]); #ifdef CONFIG_SMP if (x < 0) x = 0; @@ -836,11 +987,16 @@ static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) return x; } -void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, - int val) +static void __mod_memcg_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx, + int val) { struct mem_cgroup_per_node *pn; struct mem_cgroup *memcg; + int i = memcg_stats_index(idx); + + if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx)) + return; pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); memcg = pn->memcg; @@ -857,8 +1013,6 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, case NR_ANON_MAPPED: case NR_FILE_MAPPED: case NR_ANON_THPS: - case NR_SHMEM_PMDMAPPED: - case NR_FILE_PMDMAPPED: WARN_ON_ONCE(!in_task()); break; default: @@ -867,10 +1021,10 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, } /* Update memcg */ - __this_cpu_add(memcg->vmstats_percpu->state[idx], val); + __this_cpu_add(memcg->vmstats_percpu->state[i], val); /* Update lruvec */ - __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val); + __this_cpu_add(pn->lruvec_stats_percpu->state[i], val); memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val)); memcg_stats_unlock(); @@ -952,34 +1106,38 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { - int index = memcg_events_index(idx); + int i = memcg_events_index(idx); + + 
if (mem_cgroup_disabled()) + return; - if (mem_cgroup_disabled() || index < 0) + if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx)) return; memcg_stats_lock(); - __this_cpu_add(memcg->vmstats_percpu->events[index], count); + __this_cpu_add(memcg->vmstats_percpu->events[i], count); memcg_rstat_updated(memcg, count); memcg_stats_unlock(); } static unsigned long memcg_events(struct mem_cgroup *memcg, int event) { - int index = memcg_events_index(event); + int i = memcg_events_index(event); - if (index < 0) + if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, event)) return 0; - return READ_ONCE(memcg->vmstats->events[index]); + + return READ_ONCE(memcg->vmstats->events[i]); } static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) { - int index = memcg_events_index(event); + int i = memcg_events_index(event); - if (index < 0) + if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, event)) return 0; - return READ_ONCE(memcg->vmstats->events_local[index]); + return READ_ONCE(memcg->vmstats->events_local[i]); } static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, @@ -2030,8 +2188,6 @@ static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) if (current->in_user_fault) { css_get(&memcg->css); current->memcg_in_oom = memcg; - current->memcg_oom_gfp_mask = mask; - current->memcg_oom_order = order; } return false; } @@ -2310,6 +2466,7 @@ static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages) static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { struct memcg_stock_pcp *stock; + unsigned int stock_pages; unsigned long flags; bool ret = false; @@ -2319,8 +2476,9 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) local_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); - if (memcg == READ_ONCE(stock->cached) && stock->nr_pages >= nr_pages) { - stock->nr_pages -= nr_pages; + stock_pages = 
READ_ONCE(stock->nr_pages); + if (memcg == READ_ONCE(stock->cached) && stock_pages >= nr_pages) { + WRITE_ONCE(stock->nr_pages, stock_pages - nr_pages); ret = true; } @@ -2334,16 +2492,18 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) */ static void drain_stock(struct memcg_stock_pcp *stock) { + unsigned int stock_pages = READ_ONCE(stock->nr_pages); struct mem_cgroup *old = READ_ONCE(stock->cached); if (!old) return; - if (stock->nr_pages) { - page_counter_uncharge(&old->memory, stock->nr_pages); + if (stock_pages) { + page_counter_uncharge(&old->memory, stock_pages); if (do_memsw_account()) - page_counter_uncharge(&old->memsw, stock->nr_pages); - stock->nr_pages = 0; + page_counter_uncharge(&old->memsw, stock_pages); + + WRITE_ONCE(stock->nr_pages, 0); } css_put(&old->css); @@ -2369,8 +2529,7 @@ static void drain_local_stock(struct work_struct *dummy) clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); local_unlock_irqrestore(&memcg_stock.stock_lock, flags); - if (old) - obj_cgroup_put(old); + obj_cgroup_put(old); } /* @@ -2380,6 +2539,7 @@ static void drain_local_stock(struct work_struct *dummy) static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { struct memcg_stock_pcp *stock; + unsigned int stock_pages; stock = this_cpu_ptr(&memcg_stock); if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */ @@ -2387,9 +2547,10 @@ static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) css_get(&memcg->css); WRITE_ONCE(stock->cached, memcg); } - stock->nr_pages += nr_pages; + stock_pages = READ_ONCE(stock->nr_pages) + nr_pages; + WRITE_ONCE(stock->nr_pages, stock_pages); - if (stock->nr_pages > MEMCG_CHARGE_BATCH) + if (stock_pages > MEMCG_CHARGE_BATCH) drain_stock(stock); } @@ -2428,7 +2589,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) rcu_read_lock(); memcg = READ_ONCE(stock->cached); - if (memcg && stock->nr_pages && + if (memcg && READ_ONCE(stock->nr_pages) && 
mem_cgroup_is_descendant(memcg, root_memcg)) flush = true; else if (obj_stock_flush_required(stock, root_memcg)) @@ -2978,21 +3139,10 @@ void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg) } #ifdef CONFIG_MEMCG_KMEM -/* - * The allocated objcg pointers array is not accounted directly. - * Moreover, it should not come from DMA buffer and is not readily - * reclaimable. So those GFP bits should be masked off. - */ -#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \ - __GFP_ACCOUNT | __GFP_NOFAIL) -/* - * mod_objcg_mlstate() may be called with irq enabled, so - * mod_memcg_lruvec_state() should be used. - */ -static inline void mod_objcg_mlstate(struct obj_cgroup *objcg, - struct pglist_data *pgdat, - enum node_stat_item idx, int nr) +static inline void __mod_objcg_mlstate(struct obj_cgroup *objcg, + struct pglist_data *pgdat, + enum node_stat_item idx, int nr) { struct mem_cgroup *memcg; struct lruvec *lruvec; @@ -3000,66 +3150,31 @@ static inline void mod_objcg_mlstate(struct obj_cgroup *objcg, rcu_read_lock(); memcg = obj_cgroup_memcg(objcg); lruvec = mem_cgroup_lruvec(memcg, pgdat); - mod_memcg_lruvec_state(lruvec, idx, nr); + __mod_memcg_lruvec_state(lruvec, idx, nr); rcu_read_unlock(); } -int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, - gfp_t gfp, bool new_slab) -{ - unsigned int objects = objs_per_slab(s, slab); - unsigned long memcg_data; - void *vec; - - gfp &= ~OBJCGS_CLEAR_MASK; - vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp, - slab_nid(slab)); - if (!vec) - return -ENOMEM; - - memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS; - if (new_slab) { - /* - * If the slab is brand new and nobody can yet access its - * memcg_data, no synchronization is required and memcg_data can - * be simply assigned. 
- */ - slab->memcg_data = memcg_data; - } else if (cmpxchg(&slab->memcg_data, 0, memcg_data)) { - /* - * If the slab is already in use, somebody can allocate and - * assign obj_cgroups in parallel. In this case the existing - * objcg vector should be reused. - */ - kfree(vec); - return 0; - } - - kmemleak_not_leak(vec); - return 0; -} - static __always_inline struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p) { /* * Slab objects are accounted individually, not per-page. * Memcg membership data for each individual object is saved in - * slab->memcg_data. + * slab->obj_exts. */ if (folio_test_slab(folio)) { - struct obj_cgroup **objcgs; + struct slabobj_ext *obj_exts; struct slab *slab; unsigned int off; slab = folio_slab(folio); - objcgs = slab_objcgs(slab); - if (!objcgs) + obj_exts = slab_obj_exts(slab); + if (!obj_exts) return NULL; off = obj_to_index(slab->slab_cache, slab, p); - if (objcgs[off]) - return obj_cgroup_memcg(objcgs[off]); + if (obj_exts[off].objcg) + return obj_cgroup_memcg(obj_exts[off].objcg); return NULL; } @@ -3067,7 +3182,7 @@ struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p) /* * folio_memcg_check() is used here, because in theory we can encounter * a folio where the slab flag has been cleared already, but - * slab->memcg_data has not been freed yet + * slab->obj_exts has not been freed yet * folio_memcg_check() will guarantee that a proper memory * cgroup pointer or NULL will be returned. 
*/ @@ -3145,8 +3260,7 @@ static struct obj_cgroup *current_objcg_update(void) if (old) { old = (struct obj_cgroup *) ((unsigned long)old & ~CURRENT_OBJCG_UPDATE_FLAG); - if (old) - obj_cgroup_put(old); + obj_cgroup_put(old); old = NULL; } @@ -3356,7 +3470,7 @@ void __memcg_kmem_uncharge_page(struct page *page, int order) obj_cgroup_put(objcg); } -void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, +static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, enum node_stat_item idx, int nr) { struct memcg_stock_pcp *stock; @@ -3384,12 +3498,12 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, struct pglist_data *oldpg = stock->cached_pgdat; if (stock->nr_slab_reclaimable_b) { - mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B, + __mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B, stock->nr_slab_reclaimable_b); stock->nr_slab_reclaimable_b = 0; } if (stock->nr_slab_unreclaimable_b) { - mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B, + __mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B, stock->nr_slab_unreclaimable_b); stock->nr_slab_unreclaimable_b = 0; } @@ -3415,11 +3529,10 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, } } if (nr) - mod_objcg_mlstate(objcg, pgdat, idx, nr); + __mod_objcg_mlstate(objcg, pgdat, idx, nr); local_unlock_irqrestore(&memcg_stock.stock_lock, flags); - if (old) - obj_cgroup_put(old); + obj_cgroup_put(old); } static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) @@ -3482,13 +3595,13 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) */ if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) { if (stock->nr_slab_reclaimable_b) { - mod_objcg_mlstate(old, stock->cached_pgdat, + __mod_objcg_mlstate(old, stock->cached_pgdat, NR_SLAB_RECLAIMABLE_B, stock->nr_slab_reclaimable_b); stock->nr_slab_reclaimable_b = 0; } if (stock->nr_slab_unreclaimable_b) { - 
mod_objcg_mlstate(old, stock->cached_pgdat, + __mod_objcg_mlstate(old, stock->cached_pgdat, NR_SLAB_UNRECLAIMABLE_B, stock->nr_slab_unreclaimable_b); stock->nr_slab_unreclaimable_b = 0; @@ -3546,8 +3659,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, } local_unlock_irqrestore(&memcg_stock.stock_lock, flags); - if (old) - obj_cgroup_put(old); + obj_cgroup_put(old); if (nr_pages) obj_cgroup_uncharge_pages(objcg, nr_pages); @@ -3602,6 +3714,96 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) refill_obj_stock(objcg, size, true); } +static inline size_t obj_full_size(struct kmem_cache *s) +{ + /* + * For each accounted object there is an extra space which is used + * to store obj_cgroup membership. Charge it too. + */ + return s->size + sizeof(struct obj_cgroup *); +} + +bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, + gfp_t flags, size_t size, void **p) +{ + struct obj_cgroup *objcg; + struct slab *slab; + unsigned long off; + size_t i; + + /* + * The obtained objcg pointer is safe to use within the current scope, + * defined by current task or set_active_memcg() pair. + * obj_cgroup_get() is used to get a permanent reference. + */ + objcg = current_obj_cgroup(); + if (!objcg) + return true; + + /* + * slab_alloc_node() avoids the NULL check, so we might be called with a + * single NULL object. kmem_cache_alloc_bulk() aborts if it can't fill + * the whole requested size. 
+ * return success as there's nothing to free back + */ + if (unlikely(*p == NULL)) + return true; + + flags &= gfp_allowed_mask; + + if (lru) { + int ret; + struct mem_cgroup *memcg; + + memcg = get_mem_cgroup_from_objcg(objcg); + ret = memcg_list_lru_alloc(memcg, lru, flags); + css_put(&memcg->css); + + if (ret) + return false; + } + + if (obj_cgroup_charge(objcg, flags, size * obj_full_size(s))) + return false; + + for (i = 0; i < size; i++) { + slab = virt_to_slab(p[i]); + + if (!slab_obj_exts(slab) && + alloc_slab_obj_exts(slab, s, flags, false)) { + obj_cgroup_uncharge(objcg, obj_full_size(s)); + continue; + } + + off = obj_to_index(s, slab, p[i]); + obj_cgroup_get(objcg); + slab_obj_exts(slab)[off].objcg = objcg; + mod_objcg_state(objcg, slab_pgdat(slab), + cache_vmstat_idx(s), obj_full_size(s)); + } + + return true; +} + +void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, + void **p, int objects, struct slabobj_ext *obj_exts) +{ + for (int i = 0; i < objects; i++) { + struct obj_cgroup *objcg; + unsigned int off; + + off = obj_to_index(s, slab, p[i]); + objcg = obj_exts[off].objcg; + if (!objcg) + continue; + + obj_exts[off].objcg = NULL; + obj_cgroup_uncharge(objcg, obj_full_size(s)); + mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s), + -obj_full_size(s)); + obj_cgroup_put(objcg); + } +} #endif /* CONFIG_MEMCG_KMEM */ /* @@ -5431,26 +5633,33 @@ struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) } #endif -static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) +static bool alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn; pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node); if (!pn) - return 1; + return false; + + pn->lruvec_stats = kzalloc_node(sizeof(struct lruvec_stats), + GFP_KERNEL_ACCOUNT, node); + if (!pn->lruvec_stats) + goto fail; pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu, GFP_KERNEL_ACCOUNT); - if 
(!pn->lruvec_stats_percpu) { - kfree(pn); - return 1; - } + if (!pn->lruvec_stats_percpu) + goto fail; lruvec_init(&pn->lruvec); pn->memcg = memcg; memcg->nodeinfo[node] = pn; - return 0; + return true; +fail: + kfree(pn->lruvec_stats); + kfree(pn); + return false; } static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) @@ -5461,6 +5670,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) return; free_percpu(pn->lruvec_stats_percpu); + kfree(pn->lruvec_stats); kfree(pn); } @@ -5468,8 +5678,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) { int node; - if (memcg->orig_objcg) - obj_cgroup_put(memcg->orig_objcg); + obj_cgroup_put(memcg->orig_objcg); for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); @@ -5504,7 +5713,8 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) goto fail; } - memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats), GFP_KERNEL); + memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats), + GFP_KERNEL_ACCOUNT); if (!memcg->vmstats) goto fail; @@ -5522,7 +5732,7 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) } for_each_node(node) - if (alloc_mem_cgroup_per_node_info(memcg, node)) + if (!alloc_mem_cgroup_per_node_info(memcg, node)) goto fail; if (memcg_wb_domain_init(memcg, GFP_KERNEL)) @@ -5588,6 +5798,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) page_counter_init(&memcg->kmem, &parent->kmem); page_counter_init(&memcg->tcpmem, &parent->tcpmem); } else { + init_memcg_stats(); init_memcg_events(); page_counter_init(&memcg->memory, NULL); page_counter_init(&memcg->swap, NULL); @@ -5759,7 +5970,7 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); - for (i = 0; i < MEMCG_NR_STAT; i++) { + for (i = 0; i < MEMCG_VMSTAT_SIZE; i++) { /* * Collect the aggregated propagation counts of groups * below us. 
We're in a per-cpu loop here and this is @@ -5814,18 +6025,19 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) for_each_node_state(nid, N_MEMORY) { struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; - struct mem_cgroup_per_node *ppn = NULL; + struct lruvec_stats *lstats = pn->lruvec_stats; + struct lruvec_stats *plstats = NULL; struct lruvec_stats_percpu *lstatc; if (parent) - ppn = parent->nodeinfo[nid]; + plstats = parent->nodeinfo[nid]->lruvec_stats; lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu); - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { - delta = pn->lruvec_stats.state_pending[i]; + for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; i++) { + delta = lstats->state_pending[i]; if (delta) - pn->lruvec_stats.state_pending[i] = 0; + lstats->state_pending[i] = 0; delta_cpu = 0; v = READ_ONCE(lstatc->state[i]); @@ -5836,16 +6048,16 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) } if (delta_cpu) - pn->lruvec_stats.state_local[i] += delta_cpu; + lstats->state_local[i] += delta_cpu; if (delta) { - pn->lruvec_stats.state[i] += delta; - if (ppn) - ppn->lruvec_stats.state_pending[i] += delta; + lstats->state[i] += delta; + if (plstats) + plstats->state_pending[i] += delta; } } } - statc->stats_updates = 0; + WRITE_ONCE(statc->stats_updates, 0); /* We are in a per-cpu loop here, only do the atomic write once */ if (atomic64_read(&memcg->vmstats->stats_updates)) atomic64_set(&memcg->vmstats->stats_updates, 0); @@ -6620,8 +6832,7 @@ static void mem_cgroup_exit(struct task_struct *task) objcg = (struct obj_cgroup *) ((unsigned long)objcg & ~CURRENT_OBJCG_UPDATE_FLAG); - if (objcg) - obj_cgroup_put(objcg); + obj_cgroup_put(objcg); /* * Some kernel allocations can happen after this point, @@ -7448,6 +7659,9 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) struct obj_cgroup *objcg; VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); + VM_BUG_ON_FOLIO(folio_order(folio) > 1 && + 
!folio_test_hugetlb(folio) && + !list_empty(&folio->_deferred_list), folio); /* * Nobody should be changing or seriously looking at diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 7751bd78fb..d3c830e817 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -141,7 +141,6 @@ static struct ctl_table memory_failure_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - { } }; /* @@ -156,7 +155,7 @@ static int __page_handle_poison(struct page *page) /* * zone_pcp_disable() can't be used here. It will - * hold pcp_batch_high_lock and dissolve_free_huge_page() might hold + * hold pcp_batch_high_lock and dissolve_free_hugetlb_folio() might hold * cpu_hotplug_lock via static_key_slow_dec() when hugetlb vmemmap * optimization is enabled. This will break current lock dependency * chain and leads to deadlock. @@ -166,7 +165,7 @@ static int __page_handle_poison(struct page *page) * but nothing guarantees that those pages do not get back to a PCP * queue if we need to refill those. */ - ret = dissolve_free_huge_page(page); + ret = dissolve_free_hugetlb_folio(page_folio(page)); if (!ret) { drain_all_pages(page_zone(page)); ret = take_page_off_buddy(page); @@ -179,8 +178,8 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo { if (hugepage_or_freepage) { /* - * Doing this check for free pages is also fine since dissolve_free_huge_page - * returns 0 for non-hugetlb pages as well. + * Doing this check for free pages is also fine since + * dissolve_free_hugetlb_folio() returns 0 for non-hugetlb folios as well. 
*/ if (__page_handle_poison(page) <= 0) /* @@ -217,6 +216,7 @@ EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value); static int hwpoison_filter_dev(struct page *p) { + struct folio *folio = page_folio(p); struct address_space *mapping; dev_t dev; @@ -224,7 +224,7 @@ static int hwpoison_filter_dev(struct page *p) hwpoison_filter_dev_minor == ~0U) return 0; - mapping = page_mapping(p); + mapping = folio_mapping(folio); if (mapping == NULL || mapping->host == NULL) return -EINVAL; @@ -370,20 +370,25 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) * Unknown page type encountered. Try to check whether it can turn PageLRU by * lru_add_drain_all. */ -void shake_page(struct page *p) +void shake_folio(struct folio *folio) { - if (PageHuge(p)) + if (folio_test_hugetlb(folio)) return; /* * TODO: Could shrink slab caches here if a lightweight range-based * shrinker will be available. */ - if (PageSlab(p)) + if (folio_test_slab(folio)) return; lru_add_drain_all(); } -EXPORT_SYMBOL_GPL(shake_page); +EXPORT_SYMBOL_GPL(shake_folio); + +static void shake_page(struct page *page) +{ + shake_folio(page_folio(page)); +} static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma, unsigned long address) @@ -428,21 +433,13 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma, * not much we can do. We just print a message and ignore otherwise. */ -#define FSDAX_INVALID_PGOFF ULONG_MAX - /* * Schedule a process for later kill. * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM. - * - * Note: @fsdax_pgoff is used only when @p is a fsdax page and a - * filesystem with a memory failure handler has claimed the - * memory_failure event. In all other cases, page->index and - * page->mapping are sufficient for mapping the page back to its - * corresponding user virtual address. 
*/ static void __add_to_kill(struct task_struct *tsk, struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, - unsigned long ksm_addr, pgoff_t fsdax_pgoff) + unsigned long addr) { struct to_kill *tk; @@ -452,12 +449,10 @@ static void __add_to_kill(struct task_struct *tsk, struct page *p, return; } - tk->addr = ksm_addr ? ksm_addr : page_address_in_vma(p, vma); - if (is_zone_device_page(p)) { - if (fsdax_pgoff != FSDAX_INVALID_PGOFF) - tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma); + tk->addr = addr; + if (is_zone_device_page(p)) tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr); - } else + else tk->size_shift = page_shift(compound_head(p)); /* @@ -484,10 +479,12 @@ static void __add_to_kill(struct task_struct *tsk, struct page *p, } static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p, - struct vm_area_struct *vma, - struct list_head *to_kill) + struct vm_area_struct *vma, struct list_head *to_kill, + unsigned long addr) { - __add_to_kill(tsk, p, vma, to_kill, 0, FSDAX_INVALID_PGOFF); + if (addr == -EFAULT) + return; + __add_to_kill(tsk, p, vma, to_kill, addr); } #ifdef CONFIG_KSM @@ -503,12 +500,13 @@ static bool task_in_to_kill_list(struct list_head *to_kill, return false; } + void add_to_kill_ksm(struct task_struct *tsk, struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, - unsigned long ksm_addr) + unsigned long addr) { if (!task_in_to_kill_list(to_kill, tsk)) - __add_to_kill(tsk, p, vma, to_kill, ksm_addr, FSDAX_INVALID_PGOFF); + __add_to_kill(tsk, p, vma, to_kill, addr); } #endif /* @@ -610,7 +608,6 @@ struct task_struct *task_early_kill(struct task_struct *tsk, int force_early) static void collect_procs_anon(struct folio *folio, struct page *page, struct list_head *to_kill, int force_early) { - struct vm_area_struct *vma; struct task_struct *tsk; struct anon_vma *av; pgoff_t pgoff; @@ -622,8 +619,10 @@ static void collect_procs_anon(struct folio *folio, struct page *page, pgoff = 
page_to_pgoff(page); rcu_read_lock(); for_each_process(tsk) { + struct vm_area_struct *vma; struct anon_vma_chain *vmac; struct task_struct *t = task_early_kill(tsk, force_early); + unsigned long addr; if (!t) continue; @@ -632,9 +631,8 @@ static void collect_procs_anon(struct folio *folio, struct page *page, vma = vmac->vma; if (vma->vm_mm != t->mm) continue; - if (!page_mapped_in_vma(page, vma)) - continue; - add_to_kill_anon_file(t, page, vma, to_kill); + addr = page_mapped_in_vma(page, vma); + add_to_kill_anon_file(t, page, vma, to_kill, addr); } } rcu_read_unlock(); @@ -657,6 +655,7 @@ static void collect_procs_file(struct folio *folio, struct page *page, pgoff = page_to_pgoff(page); for_each_process(tsk) { struct task_struct *t = task_early_kill(tsk, force_early); + unsigned long addr; if (!t) continue; @@ -669,8 +668,10 @@ static void collect_procs_file(struct folio *folio, struct page *page, * Assume applications who requested early kill want * to be informed of all such data corruptions. 
*/ - if (vma->vm_mm == t->mm) - add_to_kill_anon_file(t, page, vma, to_kill); + if (vma->vm_mm != t->mm) + continue; + addr = page_address_in_vma(page, vma); + add_to_kill_anon_file(t, page, vma, to_kill, addr); } } rcu_read_unlock(); @@ -682,7 +683,8 @@ static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, pgoff_t pgoff) { - __add_to_kill(tsk, p, vma, to_kill, 0, pgoff); + unsigned long addr = vma_address(vma, pgoff, 1); + __add_to_kill(tsk, p, vma, to_kill, addr); } /* @@ -727,9 +729,9 @@ static void collect_procs(struct folio *folio, struct page *page, { if (!folio->mapping) return; - if (unlikely(PageKsm(page))) - collect_procs_ksm(page, tokill, force_early); - else if (PageAnon(page)) + if (unlikely(folio_test_ksm(folio))) + collect_procs_ksm(folio, page, tokill, force_early); + else if (folio_test_anon(folio)) collect_procs_anon(folio, page, tokill, force_early); else collect_procs_file(folio, page, tokill, force_early); @@ -1089,7 +1091,8 @@ out: */ static int me_pagecache_dirty(struct page_state *ps, struct page *p) { - struct address_space *mapping = page_mapping(p); + struct folio *folio = page_folio(p); + struct address_space *mapping = folio_mapping(folio); SetPageError(p); /* TBD: print more information about the file. */ @@ -1251,7 +1254,6 @@ static int me_huge_page(struct page_state *ps, struct page *p) #define mlock (1UL << PG_mlocked) #define lru (1UL << PG_lru) #define head (1UL << PG_head) -#define slab (1UL << PG_slab) #define reserved (1UL << PG_reserved) static struct page_state error_states[] = { @@ -1261,13 +1263,6 @@ static struct page_state error_states[] = { * PG_buddy pages only make a small fraction of all free pages. */ - /* - * Could in theory check if slab page is free or if we can drop - * currently unused objects without touching them. But just - * treat it as standard kernel for now. 
- */ - { slab, slab, MF_MSG_SLAB, me_kernel }, - { head, head, MF_MSG_HUGE, me_huge_page }, { sc|dirty, sc|dirty, MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty }, @@ -1294,7 +1289,6 @@ static struct page_state error_states[] = { #undef mlock #undef lru #undef head -#undef slab #undef reserved static void update_per_node_mf_stats(unsigned long pfn, @@ -1567,24 +1561,24 @@ static int get_hwpoison_page(struct page *p, unsigned long flags) * Do all that is necessary to remove user space mappings. Unmap * the pages and send SIGBUS to the processes if the data was dirty. */ -static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, - int flags, struct page *hpage) +static bool hwpoison_user_mappings(struct folio *folio, struct page *p, + unsigned long pfn, int flags) { - struct folio *folio = page_folio(hpage); enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON; struct address_space *mapping; LIST_HEAD(tokill); bool unmap_success; int forcekill; - bool mlocked = PageMlocked(hpage); + bool mlocked = folio_test_mlocked(folio); /* * Here we are interested only in user-mapped pages, so skip any * other types of pages. */ - if (PageReserved(p) || PageSlab(p) || PageTable(p) || PageOffline(p)) + if (folio_test_reserved(folio) || folio_test_slab(folio) || + folio_test_pgtable(folio) || folio_test_offline(folio)) return true; - if (!(PageLRU(hpage) || PageHuge(p))) + if (!(folio_test_lru(folio) || folio_test_hugetlb(folio))) return true; /* @@ -1594,7 +1588,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, if (!page_mapped(p)) return true; - if (PageSwapCache(p)) { + if (folio_test_swapcache(folio)) { pr_err("%#lx: keeping poisoned page in swap cache\n", pfn); ttu &= ~TTU_HWPOISON; } @@ -1605,11 +1599,11 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, * XXX: the dirty test could be racy: set_page_dirty() may not always * be called inside page lock (it's recommended but not enforced). 
*/ - mapping = page_mapping(hpage); - if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping && + mapping = folio_mapping(folio); + if (!(flags & MF_MUST_KILL) && !folio_test_dirty(folio) && mapping && mapping_can_writeback(mapping)) { - if (page_mkclean(hpage)) { - SetPageDirty(hpage); + if (folio_mkclean(folio)) { + folio_set_dirty(folio); } else { ttu &= ~TTU_HWPOISON; pr_info("%#lx: corrupted page was clean: dropped without side effects\n", @@ -1624,7 +1618,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, */ collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED); - if (PageHuge(hpage) && !PageAnon(hpage)) { + if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) { /* * For hugetlb pages in shared mappings, try_to_unmap * could potentially call huge_pmd_unshare. Because of @@ -1632,7 +1626,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, * TTU_RMAP_LOCKED to indicate we have taken the lock * at this higher level. */ - mapping = hugetlb_page_mapping_lock_write(hpage); + mapping = hugetlb_folio_mapping_lock_write(folio); if (mapping) { try_to_unmap(folio, ttu|TTU_RMAP_LOCKED); i_mmap_unlock_write(mapping); @@ -1644,15 +1638,15 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, unmap_success = !page_mapped(p); if (!unmap_success) - pr_err("%#lx: failed to unmap page (mapcount=%d)\n", - pfn, page_mapcount(p)); + pr_err("%#lx: failed to unmap page (folio mapcount=%d)\n", + pfn, folio_mapcount(page_folio(p))); /* * try_to_unmap() might put mlocked page in lru cache, so call * shake_page() again to ensure that it's flushed. */ if (mlocked) - shake_page(hpage); + shake_folio(folio); /* * Now that the dirty bit has been propagated to the @@ -1664,7 +1658,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, * use a more force-full uncatchable kill to prevent * any accesses to the poisoned memory. 
*/ - forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL) || + forcekill = folio_test_dirty(folio) || (flags & MF_MUST_KILL) || !unmap_success; kill_procs(&tokill, forcekill, !unmap_success, pfn, flags); @@ -2108,7 +2102,7 @@ retry: page_flags = folio->flags; - if (!hwpoison_user_mappings(p, pfn, flags, &folio->page)) { + if (!hwpoison_user_mappings(folio, p, pfn, flags)) { folio_unlock(folio); return action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); } @@ -2197,7 +2191,7 @@ out: int memory_failure(unsigned long pfn, int flags) { struct page *p; - struct page *hpage; + struct folio *folio; struct dev_pagemap *pgmap; int res = 0; unsigned long page_flags; @@ -2285,8 +2279,8 @@ try_again: } } - hpage = compound_head(p); - if (PageTransHuge(hpage)) { + folio = page_folio(p); + if (folio_test_large(folio)) { /* * The flag must be set after the refcount is bumped * otherwise it may race with THP split. @@ -2300,12 +2294,13 @@ try_again: * or unhandlable page. The refcount is bumped iff the * page is a valid handlable page. */ - SetPageHasHWPoisoned(hpage); + folio_set_has_hwpoisoned(folio); if (try_to_split_thp_page(p) < 0) { res = action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED); goto unlock_mutex; } VM_BUG_ON_PAGE(!page_count(p), p); + folio = page_folio(p); } /* @@ -2316,9 +2311,9 @@ try_again: * The check (unnecessarily) ignores LRU pages being isolated and * walked by the page reclaim code, however that's not a big loss. */ - shake_page(p); + shake_folio(folio); - lock_page(p); + folio_lock(folio); /* * We're only intended to deal with the non-Compound page here. @@ -2326,11 +2321,11 @@ try_again: * race window. If this happens, we could try again to hopefully * handle the page next round. 
*/ - if (PageCompound(p)) { + if (folio_test_large(folio)) { if (retry) { ClearPageHWPoison(p); - unlock_page(p); - put_page(p); + folio_unlock(folio); + folio_put(folio); flags &= ~MF_COUNT_INCREASED; retry = false; goto try_again; @@ -2346,35 +2341,35 @@ try_again: * folio_remove_rmap_*() in try_to_unmap_one(). So to determine page * status correctly, we save a copy of the page flags at this time. */ - page_flags = p->flags; + page_flags = folio->flags; if (hwpoison_filter(p)) { ClearPageHWPoison(p); - unlock_page(p); - put_page(p); + folio_unlock(folio); + folio_put(folio); res = -EOPNOTSUPP; goto unlock_mutex; } /* - * __munlock_folio() may clear a writeback page's LRU flag without - * page_lock. We need wait writeback completion for this page or it - * may trigger vfs BUG while evict inode. + * __munlock_folio() may clear a writeback folio's LRU flag without + * the folio lock. We need to wait for writeback completion for this + * folio or it may trigger a vfs BUG while evicting inode. */ - if (!PageLRU(p) && !PageWriteback(p)) + if (!folio_test_lru(folio) && !folio_test_writeback(folio)) goto identify_page_state; /* * It's very difficult to mess with pages currently under IO * and in many cases impossible, so we just avoid it here. */ - wait_on_page_writeback(p); + folio_wait_writeback(folio); /* * Now take care of user space mappings. * Abort on fail: __filemap_remove_folio() assumes unmapped page. */ - if (!hwpoison_user_mappings(p, pfn, flags, p)) { + if (!hwpoison_user_mappings(folio, p, pfn, flags)) { res = action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); goto unlock_page; } @@ -2382,7 +2377,8 @@ try_again: /* * Torn down by someone else? 
*/ - if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { + if (folio_test_lru(folio) && !folio_test_swapcache(folio) && + folio->mapping == NULL) { res = action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED); goto unlock_page; } @@ -2392,7 +2388,7 @@ identify_page_state: mutex_unlock(&mf_mutex); return res; unlock_page: - unlock_page(p); + folio_unlock(folio); unlock_mutex: mutex_unlock(&mf_mutex); return res; @@ -2550,7 +2546,7 @@ int unpoison_memory(unsigned long pfn) goto unlock_mutex; } - if (is_huge_zero_page(&folio->page)) { + if (is_huge_zero_folio(folio)) { unpoison_pr_info("Unpoison: huge zero page is not supported %#lx\n", pfn, &unpoison_rs); ret = -EOPNOTSUPP; @@ -2569,8 +2565,8 @@ int unpoison_memory(unsigned long pfn) goto unlock_mutex; } - if (folio_test_slab(folio) || PageTable(&folio->page) || - folio_test_reserved(folio) || PageOffline(&folio->page)) + if (folio_test_slab(folio) || folio_test_pgtable(folio) || + folio_test_reserved(folio) || folio_test_offline(folio)) goto unlock_mutex; /* @@ -2591,7 +2587,7 @@ int unpoison_memory(unsigned long pfn) ghp = get_hwpoison_page(p, MF_UNPOISON); if (!ghp) { - if (PageHuge(p)) { + if (folio_test_hugetlb(folio)) { huge = true; count = folio_free_raw_hwp(folio, false); if (count == 0) @@ -2607,7 +2603,7 @@ int unpoison_memory(unsigned long pfn) pfn, &unpoison_rs); } } else { - if (PageHuge(p)) { + if (folio_test_hugetlb(folio)) { huge = true; count = folio_free_raw_hwp(folio, false); if (count == 0) { @@ -2685,6 +2681,7 @@ static int soft_offline_in_use_page(struct page *page) struct migration_target_control mtc = { .nid = NUMA_NO_NODE, .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, + .reason = MR_MEMORY_FAILURE, }; if (!huge && folio_test_large(folio)) { diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 0537664620..6632102bd5 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -36,6 +36,11 @@ struct node_memory_type_map { static DEFINE_MUTEX(memory_tier_lock); static 
LIST_HEAD(memory_tiers); +/* + * The list is used to store all memory types that are not created + * by a device driver. + */ +static LIST_HEAD(default_memory_types); static struct node_memory_type_map node_memory_types[MAX_NUMNODES]; struct memory_dev_type *default_dram_type; @@ -108,6 +113,8 @@ static struct demotion_nodes *node_demotion __read_mostly; static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms); +/* The lock is used to protect `default_dram_perf*` info and nid. */ +static DEFINE_MUTEX(default_dram_perf_lock); static bool default_dram_perf_error; static struct access_coordinate default_dram_perf; static int default_dram_perf_ref_nid = NUMA_NO_NODE; @@ -505,7 +512,8 @@ static inline void __init_node_memory_type(int node, struct memory_dev_type *mem static struct memory_tier *set_node_memory_tier(int node) { struct memory_tier *memtier; - struct memory_dev_type *memtype; + struct memory_dev_type *memtype = default_dram_type; + int adist = MEMTIER_ADISTANCE_DRAM; pg_data_t *pgdat = NODE_DATA(node); @@ -514,7 +522,16 @@ static struct memory_tier *set_node_memory_tier(int node) if (!node_state(node, N_MEMORY)) return ERR_PTR(-EINVAL); - __init_node_memory_type(node, default_dram_type); + mt_calc_adistance(node, &adist); + if (!node_memory_types[node].memtype) { + memtype = mt_find_alloc_memory_type(adist, &default_memory_types); + if (IS_ERR(memtype)) { + memtype = default_dram_type; + pr_info("Failed to allocate a memory type. 
Fall back.\n"); + } + } + + __init_node_memory_type(node, memtype); memtype = node_memory_types[node].memtype; node_set(node, memtype->nodes); @@ -623,6 +640,64 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype) } EXPORT_SYMBOL_GPL(clear_node_memory_type); +struct memory_dev_type *mt_find_alloc_memory_type(int adist, struct list_head *memory_types) +{ + struct memory_dev_type *mtype; + + list_for_each_entry(mtype, memory_types, list) + if (mtype->adistance == adist) + return mtype; + + mtype = alloc_memory_type(adist); + if (IS_ERR(mtype)) + return mtype; + + list_add(&mtype->list, memory_types); + + return mtype; +} +EXPORT_SYMBOL_GPL(mt_find_alloc_memory_type); + +void mt_put_memory_types(struct list_head *memory_types) +{ + struct memory_dev_type *mtype, *mtn; + + list_for_each_entry_safe(mtype, mtn, memory_types, list) { + list_del(&mtype->list); + put_memory_type(mtype); + } +} +EXPORT_SYMBOL_GPL(mt_put_memory_types); + +/* + * This is invoked via `late_initcall()` to initialize memory tiers for + * CPU-less memory nodes after driver initialization, which is + * expected to provide `adistance` algorithms. + */ +static int __init memory_tier_late_init(void) +{ + int nid; + + guard(mutex)(&memory_tier_lock); + for_each_node_state(nid, N_MEMORY) { + /* + * Some device drivers may have initialized memory tiers + * between `memory_tier_init()` and `memory_tier_late_init()`, + * potentially bringing online memory nodes and + * configuring memory tiers. Exclude them here. 
+ */ + if (node_memory_types[nid].memtype) + continue; + + set_node_memory_tier(nid); + } + + establish_demotion_targets(); + + return 0; +} +late_initcall(memory_tier_late_init); + static void dump_hmem_attrs(struct access_coordinate *coord, const char *prefix) { pr_info( @@ -634,25 +709,19 @@ static void dump_hmem_attrs(struct access_coordinate *coord, const char *prefix) int mt_set_default_dram_perf(int nid, struct access_coordinate *perf, const char *source) { - int rc = 0; - - mutex_lock(&memory_tier_lock); - if (default_dram_perf_error) { - rc = -EIO; - goto out; - } + guard(mutex)(&default_dram_perf_lock); + if (default_dram_perf_error) + return -EIO; if (perf->read_latency + perf->write_latency == 0 || - perf->read_bandwidth + perf->write_bandwidth == 0) { - rc = -EINVAL; - goto out; - } + perf->read_bandwidth + perf->write_bandwidth == 0) + return -EINVAL; if (default_dram_perf_ref_nid == NUMA_NO_NODE) { default_dram_perf = *perf; default_dram_perf_ref_nid = nid; default_dram_perf_ref_source = kstrdup(source, GFP_KERNEL); - goto out; + return 0; } /* @@ -680,27 +749,25 @@ int mt_set_default_dram_perf(int nid, struct access_coordinate *perf, pr_info( " disable default DRAM node performance based abstract distance algorithm.\n"); default_dram_perf_error = true; - rc = -EINVAL; + return -EINVAL; } -out: - mutex_unlock(&memory_tier_lock); - return rc; + return 0; } int mt_perf_to_adistance(struct access_coordinate *perf, int *adist) { + guard(mutex)(&default_dram_perf_lock); if (default_dram_perf_error) return -EIO; - if (default_dram_perf_ref_nid == NUMA_NO_NODE) - return -ENOENT; - if (perf->read_latency + perf->write_latency == 0 || perf->read_bandwidth + perf->write_bandwidth == 0) return -EINVAL; - mutex_lock(&memory_tier_lock); + if (default_dram_perf_ref_nid == NUMA_NO_NODE) + return -ENOENT; + /* * The abstract distance of a memory node is in direct proportion to * its memory latency (read + write) and inversely proportional to its @@ -713,7 +780,6 @@ 
int mt_perf_to_adistance(struct access_coordinate *perf, int *adist) (default_dram_perf.read_latency + default_dram_perf.write_latency) * (default_dram_perf.read_bandwidth + default_dram_perf.write_bandwidth) / (perf->read_bandwidth + perf->write_bandwidth); - mutex_unlock(&memory_tier_lock); return 0; } @@ -826,7 +892,8 @@ static int __init memory_tier_init(void) * For now we can have 4 faster memory tiers with smaller adistance * than default DRAM tier. */ - default_dram_type = alloc_memory_type(MEMTIER_ADISTANCE_DRAM); + default_dram_type = mt_find_alloc_memory_type(MEMTIER_ADISTANCE_DRAM, + &default_memory_types); if (IS_ERR(default_dram_type)) panic("%s() failed to allocate default DRAM tier\n", __func__); @@ -836,6 +903,14 @@ static int __init memory_tier_init(void) * types assigned. */ for_each_node_state(node, N_MEMORY) { + if (!node_state(node, N_CPU)) + /* + * Defer memory tier initialization on + * CPUless numa nodes. These will be initialized + * after firmware and devices are initialized. + */ + continue; + memtier = set_node_memory_tier(node); if (IS_ERR(memtier)) /* diff --git a/mm/memory.c b/mm/memory.c index 4bd6d68f1b..f81760c938 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -112,8 +112,10 @@ static bool vmf_pte_changed(struct vm_fault *vmf); * Return true if the original pte was a uffd-wp pte marker (so the pte was * wr-protected). 
*/ -static bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf) +static __always_inline bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf) { + if (!userfaultfd_wp(vmf->vma)) + return false; if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)) return false; @@ -989,7 +991,7 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma flags |= FPB_IGNORE_SOFT_DIRTY; nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags, - &any_writable); + &any_writable, NULL, NULL); folio_ref_add(folio, nr); if (folio_test_anon(folio)) { if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, @@ -1502,8 +1504,7 @@ static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb, if (!delay_rmap) { folio_remove_rmap_ptes(folio, page, nr, vma); - /* Only sanity-check the first page in a batch. */ - if (unlikely(page_mapcount(page) < 0)) + if (unlikely(folio_mapcount(folio) < 0)) print_bad_pte(vma, addr, ptent, page); } if (unlikely(__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))) { @@ -1553,7 +1554,7 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, */ if (unlikely(folio_test_large(folio) && max_nr != 1)) { nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags, - NULL); + NULL, NULL, NULL); zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr, addr, details, rss, force_flush, @@ -1631,12 +1632,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, folio_remove_rmap_pte(folio, page, vma); folio_put(folio); } else if (!non_swap_entry(entry)) { - /* Genuine swap entry, hence a private anon page */ + max_nr = (end - addr) / PAGE_SIZE; + nr = swap_pte_batch(pte, max_nr, ptent); + /* Genuine swap entries, hence a private anon pages */ if (!should_zap_cows(details)) continue; - rss[MM_SWAPENTS]--; - if (unlikely(!free_swap_and_cache(entry))) - print_bad_pte(vma, addr, ptent, NULL); + rss[MM_SWAPENTS] -= nr; + free_swap_and_cache_nr(entry, nr); } else if (is_migration_entry(entry)) { folio = pfn_swap_entry_folio(entry); if 
(!should_zap_folio(details, folio)) @@ -1659,8 +1661,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, pr_alert("unrecognized swap entry 0x%lx\n", entry.val); WARN_ON_ONCE(1); } - pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); - zap_install_uffd_wp_if_needed(vma, addr, pte, 1, details, ptent); + clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm); + zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent); } while (pte += nr, addr += PAGE_SIZE * nr, addr != end); add_mm_rss_vec(mm, rss); @@ -2765,7 +2767,7 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, unsigned long next; int err = 0; - BUG_ON(pud_huge(*pud)); + BUG_ON(pud_leaf(*pud)); if (create) { pmd = pmd_alloc_track(mm, pud, addr, mask); @@ -3206,19 +3208,39 @@ static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf) return VM_FAULT_RETRY; } +/** + * vmf_anon_prepare - Prepare to handle an anonymous fault. + * @vmf: The vm_fault descriptor passed from the fault handler. + * + * When preparing to insert an anonymous page into a VMA from a + * fault handler, call this function rather than anon_vma_prepare(). + * If this vma does not already have an associated anon_vma and we are + * only protected by the per-VMA lock, the caller must retry with the + * mmap_lock held. __anon_vma_prepare() will look at adjacent VMAs to + * determine if this VMA can share its anon_vma, and that's not safe to + * do with only the per-VMA lock held for this VMA. + * + * Return: 0 if fault handling can proceed. Any other value should be + * returned to the caller. 
+ */ vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; + vm_fault_t ret = 0; if (likely(vma->anon_vma)) return 0; if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vma); - return VM_FAULT_RETRY; + if (!mmap_read_trylock(vma->vm_mm)) { + vma_end_read(vma); + return VM_FAULT_RETRY; + } } if (__anon_vma_prepare(vma)) - return VM_FAULT_OOM; - return 0; + ret = VM_FAULT_OOM; + if (vmf->flags & FAULT_FLAG_VMA_LOCK) + mmap_read_unlock(vma->vm_mm); + return ret; } /* @@ -3329,13 +3351,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) ptep_clear_flush(vma, vmf->address, vmf->pte); folio_add_new_anon_rmap(new_folio, vma, vmf->address); folio_add_lru_vma(new_folio, vma); - /* - * We call the notify macro here because, when using secondary - * mmu page tables (such as kvm shadow page tables), we want the - * new page to be mapped directly into the secondary page table. - */ BUG_ON(unshare && pte_write(entry)); - set_pte_at_notify(mm, vmf->address, vmf->pte, entry); + set_pte_at(mm, vmf->address, vmf->pte, entry); update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); if (old_folio) { /* @@ -4190,7 +4207,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * when reading from swap. This metadata may be indexed by swap entry * so this must be called before swap_free(). */ - arch_swap_restore(entry, folio); + arch_swap_restore(folio_swap(entry, folio), folio); /* * Remove the swap entry and conditionally try to free up the swapcache. @@ -4326,8 +4343,8 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) * for this vma. Then filter out the orders that can't be allocated over * the faulting address and still be fully contained in the vma. 
*/ - orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true, - BIT(PMD_ORDER) - 1); + orders = thp_vma_allowable_orders(vma, vma->vm_flags, + TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1); orders = thp_vma_suitable_orders(vma, vmf->address, orders); if (!orders) @@ -4352,6 +4369,9 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) pte_unmap(pte); + if (!orders) + goto fallback; + /* Try allocating the highest of the remaining orders. */ gfp = vma_thp_gfp_mask(vma); while (orders) { @@ -4359,6 +4379,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) folio = vma_alloc_folio(gfp, order, vma, addr, true); if (folio) { if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); folio_put(folio); goto next; } @@ -4367,6 +4388,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) return folio; } next: + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); order = next_order(&orders, order); } @@ -4382,7 +4404,6 @@ fallback: */ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) { - bool uffd_wp = vmf_orig_pte_uffd_wp(vmf); struct vm_area_struct *vma = vmf->vma; unsigned long addr = vmf->address; struct folio *folio; @@ -4427,8 +4448,9 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) } /* Allocate our own private page. 
*/ - if (unlikely(anon_vma_prepare(vma))) - goto oom; + ret = vmf_anon_prepare(vmf); + if (ret) + return ret; /* Returns NULL on OOM or ERR_PTR(-EAGAIN) if we must retry the fault */ folio = alloc_anon_folio(vmf); if (IS_ERR(folio)) @@ -4476,10 +4498,13 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) folio_ref_add(folio, nr_pages - 1); add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC); +#endif folio_add_new_anon_rmap(folio, vma, addr); folio_add_lru_vma(folio, vma); setpte: - if (uffd_wp) + if (vmf_orig_pte_uffd_wp(vmf)) entry = pte_mkuffd_wp(entry); set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages); @@ -4655,9 +4680,8 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio, struct page *page, unsigned int nr, unsigned long addr) { struct vm_area_struct *vma = vmf->vma; - bool uffd_wp = vmf_orig_pte_uffd_wp(vmf); bool write = vmf->flags & FAULT_FLAG_WRITE; - bool prefault = in_range(vmf->address, addr, nr * PAGE_SIZE); + bool prefault = !in_range(vmf->address, addr, nr * PAGE_SIZE); pte_t entry; flush_icache_pages(vma, page, nr); @@ -4670,16 +4694,14 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio, if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); - if (unlikely(uffd_wp)) + if (unlikely(vmf_orig_pte_uffd_wp(vmf))) entry = pte_mkuffd_wp(entry); /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { - add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr); VM_BUG_ON_FOLIO(nr != 1, folio); folio_add_new_anon_rmap(folio, vma, addr); folio_add_lru_vma(folio, vma); } else { - add_mm_counter(vma->vm_mm, mm_counter_file(folio), nr); folio_add_file_rmap_ptes(folio, page, nr, vma); } set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr); @@ -4716,9 +4738,11 @@ vm_fault_t finish_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct page *page; vm_fault_t ret; + bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) 
&& + !(vma->vm_flags & VM_SHARED); /* Did we COW the page? */ - if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) + if (is_cow) page = vmf->cow_page; else page = vmf->page; @@ -4754,8 +4778,10 @@ vm_fault_t finish_fault(struct vm_fault *vmf) /* Re-check under ptl */ if (likely(!vmf_pte_changed(vmf))) { struct folio *folio = page_folio(page); + int type = is_cow ? MM_ANONPAGES : mm_counter_file(folio); set_pte_range(vmf, folio, page, 1, vmf->address); + add_mm_counter(vma->vm_mm, type, 1); ret = 0; } else { update_mmu_tlb(vma, vmf->address, vmf->pte); @@ -5036,9 +5062,11 @@ static vm_fault_t do_fault(struct vm_fault *vmf) return ret; } -int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma, +int numa_migrate_prep(struct folio *folio, struct vm_fault *vmf, unsigned long addr, int page_nid, int *flags) { + struct vm_area_struct *vma = vmf->vma; + folio_get(folio); /* Record the current PID acceesing VMA */ @@ -5050,7 +5078,61 @@ int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma, *flags |= TNF_FAULT_LOCAL; } - return mpol_misplaced(folio, vma, addr); + return mpol_misplaced(folio, vmf, addr); +} + +static void numa_rebuild_single_mapping(struct vm_fault *vmf, struct vm_area_struct *vma, + unsigned long fault_addr, pte_t *fault_pte, + bool writable) +{ + pte_t pte, old_pte; + + old_pte = ptep_modify_prot_start(vma, fault_addr, fault_pte); + pte = pte_modify(old_pte, vma->vm_page_prot); + pte = pte_mkyoung(pte); + if (writable) + pte = pte_mkwrite(pte, vma); + ptep_modify_prot_commit(vma, fault_addr, fault_pte, old_pte, pte); + update_mmu_cache_range(vmf, vma, fault_addr, fault_pte, 1); +} + +static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_struct *vma, + struct folio *folio, pte_t fault_pte, + bool ignore_writable, bool pte_write_upgrade) +{ + int nr = pte_pfn(fault_pte) - folio_pfn(folio); + unsigned long start, end, addr = vmf->address; + unsigned long addr_start = addr - (nr << 
PAGE_SHIFT); + unsigned long pt_start = ALIGN_DOWN(addr, PMD_SIZE); + pte_t *start_ptep; + + /* Stay within the VMA and within the page table. */ + start = max3(addr_start, pt_start, vma->vm_start); + end = min3(addr_start + folio_size(folio), pt_start + PMD_SIZE, + vma->vm_end); + start_ptep = vmf->pte - ((addr - start) >> PAGE_SHIFT); + + /* Restore all PTEs' mapping of the large folio */ + for (addr = start; addr != end; start_ptep++, addr += PAGE_SIZE) { + pte_t ptent = ptep_get(start_ptep); + bool writable = false; + + if (!pte_present(ptent) || !pte_protnone(ptent)) + continue; + + if (pfn_folio(pte_pfn(ptent)) != folio) + continue; + + if (!ignore_writable) { + ptent = pte_modify(ptent, vma->vm_page_prot); + writable = pte_write(ptent); + if (!writable && pte_write_upgrade && + can_change_pte_writable(vma, addr, ptent)) + writable = true; + } + + numa_rebuild_single_mapping(vmf, vma, addr, start_ptep, writable); + } } static vm_fault_t do_numa_page(struct vm_fault *vmf) @@ -5058,11 +5140,12 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct folio *folio = NULL; int nid = NUMA_NO_NODE; - bool writable = false; + bool writable = false, ignore_writable = false; + bool pte_write_upgrade = vma_wants_manual_pte_write_upgrade(vma); int last_cpupid; int target_nid; pte_t pte, old_pte; - int flags = 0; + int flags = 0, nr_pages; /* * The pte cannot be used safely until we verify, while holding the page @@ -5084,7 +5167,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) * is only valid while holding the PT lock. 
*/ writable = pte_write(pte); - if (!writable && vma_wants_manual_pte_write_upgrade(vma) && + if (!writable && pte_write_upgrade && can_change_pte_writable(vma, vmf->address, pte)) writable = true; @@ -5092,10 +5175,6 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) if (!folio || folio_is_zone_device(folio)) goto out_map; - /* TODO: handle PTE-mapped THP */ - if (folio_test_large(folio)) - goto out_map; - /* * Avoid grouping on RO pages in general. RO pages shouldn't hurt as * much anyway since they can be in shared cache state. This misses @@ -5111,10 +5190,11 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) * Flag if the folio is shared between multiple address spaces. This * is later used when determining whether to group tasks together */ - if (folio_estimated_sharers(folio) > 1 && (vma->vm_flags & VM_SHARED)) + if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED)) flags |= TNF_SHARED; nid = folio_nid(folio); + nr_pages = folio_nr_pages(folio); /* * For memory tiering mode, cpupid of slow memory page is used * to record page access time. So use default value. 
@@ -5124,13 +5204,14 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) last_cpupid = (-1 & LAST_CPUPID_MASK); else last_cpupid = folio_last_cpupid(folio); - target_nid = numa_migrate_prep(folio, vma, vmf->address, nid, &flags); + target_nid = numa_migrate_prep(folio, vmf, vmf->address, nid, &flags); if (target_nid == NUMA_NO_NODE) { folio_put(folio); goto out_map; } pte_unmap_unlock(vmf->pte, vmf->ptl); writable = false; + ignore_writable = true; /* Migrate to the requested node */ if (migrate_misplaced_folio(folio, vma, target_nid)) { @@ -5151,20 +5232,19 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) out: if (nid != NUMA_NO_NODE) - task_numa_fault(last_cpupid, nid, 1, flags); + task_numa_fault(last_cpupid, nid, nr_pages, flags); return 0; out_map: /* * Make it present again, depending on how arch implements * non-accessible ptes, some can allow access by kernel mode. */ - old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte); - pte = pte_modify(old_pte, vma->vm_page_prot); - pte = pte_mkyoung(pte); - if (writable) - pte = pte_mkwrite(pte, vma); - ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte); - update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); + if (folio && folio_test_large(folio)) + numa_rebuild_large_mapping(vmf, vma, folio, pte, ignore_writable, + pte_write_upgrade); + else + numa_rebuild_single_mapping(vmf, vma, vmf->address, vmf->pte, + writable); pte_unmap_unlock(vmf->pte, vmf->ptl); goto out; } @@ -5375,7 +5455,8 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, return VM_FAULT_OOM; retry_pud: if (pud_none(*vmf.pud) && - thp_vma_allowable_order(vma, vm_flags, false, true, true, PUD_ORDER)) { + thp_vma_allowable_order(vma, vm_flags, + TVA_IN_PF | TVA_ENFORCE_SYSFS, PUD_ORDER)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -5409,7 +5490,8 @@ retry_pud: goto retry_pud; if (pmd_none(*vmf.pmd) && - thp_vma_allowable_order(vma, vm_flags, false, true, true, 
PMD_ORDER)) { + thp_vma_allowable_order(vma, vm_flags, + TVA_IN_PF | TVA_ENFORCE_SYSFS, PMD_ORDER)) { ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -5763,15 +5845,6 @@ retry: if (!vma_start_read(vma)) goto inval; - /* - * find_mergeable_anon_vma uses adjacent vmas which are not locked. - * This check must happen after vma_start_read(); otherwise, a - * concurrent mremap() with MREMAP_DONTUNMAP could dissociate the VMA - * from its anon_vma. - */ - if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) - goto inval_end_read; - /* Check since vm_start/vm_end might change before we lock the VMA */ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) goto inval_end_read; @@ -5869,34 +5942,48 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) /** * follow_pte - look up PTE at a user virtual address - * @mm: the mm_struct of the target address space + * @vma: the memory mapping * @address: user virtual address * @ptepp: location to store found PTE * @ptlp: location to store the lock for the PTE * * On a successful return, the pointer to the PTE is stored in @ptepp; * the corresponding lock is taken and its location is stored in @ptlp. - * The contents of the PTE are only stable until @ptlp is released; - * any further use, if any, must be protected against invalidation - * with MMU notifiers. + * + * The contents of the PTE are only stable until @ptlp is released using + * pte_unmap_unlock(). This function will fail if the PTE is non-present. + * Present PTEs may include PTEs that map refcounted pages, such as + * anonymous folios in COW mappings. + * + * Callers must be careful when relying on PTE content after + * pte_unmap_unlock(). Especially if the PTE maps a refcounted page, + * callers must protect against invalidation with MMU notifiers; otherwise + * access to the PFN at a later point in time can trigger use-after-free. * * Only IO mappings and raw PFN mappings are allowed. 
The mmap semaphore * should be taken for read. * - * KVM uses this function. While it is arguably less bad than ``follow_pfn``, - * it is not a good general-purpose API. + * This function must not be used to modify PTE content. * * Return: zero on success, -ve otherwise. */ -int follow_pte(struct mm_struct *mm, unsigned long address, +int follow_pte(struct vm_area_struct *vma, unsigned long address, pte_t **ptepp, spinlock_t **ptlp) { + struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *ptep; + mmap_assert_locked(mm); + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) + goto out; + + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + goto out; + pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) goto out; @@ -5926,71 +6013,7 @@ out: } EXPORT_SYMBOL_GPL(follow_pte); -/** - * follow_pfn - look up PFN at a user virtual address - * @vma: memory mapping - * @address: user virtual address - * @pfn: location to store found PFN - * - * Only IO mappings and raw PFN mappings are allowed. - * - * This function does not allow the caller to read the permissions - * of the PTE. Do not use it. - * - * Return: zero and the pfn at @pfn on success, -ve otherwise. 
- */ -int follow_pfn(struct vm_area_struct *vma, unsigned long address, - unsigned long *pfn) -{ - int ret = -EINVAL; - spinlock_t *ptl; - pte_t *ptep; - - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) - return ret; - - ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); - if (ret) - return ret; - *pfn = pte_pfn(ptep_get(ptep)); - pte_unmap_unlock(ptep, ptl); - return 0; -} -EXPORT_SYMBOL(follow_pfn); - #ifdef CONFIG_HAVE_IOREMAP_PROT -int follow_phys(struct vm_area_struct *vma, - unsigned long address, unsigned int flags, - unsigned long *prot, resource_size_t *phys) -{ - int ret = -EINVAL; - pte_t *ptep, pte; - spinlock_t *ptl; - - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) - goto out; - - if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) - goto out; - pte = ptep_get(ptep); - - /* Never return PFNs of anon folios in COW mappings. */ - if (vm_normal_folio(vma, address, pte)) - goto unlock; - - if ((flags & FOLL_WRITE) && !pte_write(pte)) - goto unlock; - - *prot = pgprot_val(pte_pgprot(pte)); - *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; - - ret = 0; -unlock: - pte_unmap_unlock(ptep, ptl); -out: - return ret; -} - /** * generic_access_phys - generic implementation for iomem mmap access * @vma: the vma to access @@ -6014,11 +6037,8 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, int offset = offset_in_page(addr); int ret = -EINVAL; - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) - return -EINVAL; - retry: - if (follow_pte(vma->vm_mm, addr, &ptep, &ptl)) + if (follow_pte(vma, addr, &ptep, &ptl)) return -EINVAL; pte = ptep_get(ptep); pte_unmap_unlock(ptep, ptl); @@ -6033,7 +6053,7 @@ retry: if (!maddr) return -ENOMEM; - if (follow_pte(vma->vm_mm, addr, &ptep, &ptl)) + if (follow_pte(vma, addr, &ptep, &ptl)) goto out_unmap; if (!pte_same(pte, ptep_get(ptep))) { @@ -6191,21 +6211,14 @@ void print_vma_addr(char *prefix, unsigned long ip) if (!mmap_read_trylock(mm)) return; - vma = find_vma(mm, ip); + vma = vma_lookup(mm, ip); if (vma && 
vma->vm_file) { struct file *f = vma->vm_file; - char *buf = (char *)__get_free_page(GFP_NOWAIT); - if (buf) { - char *p; - - p = file_path(f, buf, PAGE_SIZE); - if (IS_ERR(p)) - p = "?"; - printk("%s%s[%lx+%lx]", prefix, kbasename(p), - vma->vm_start, - vma->vm_end - vma->vm_start); - free_page((unsigned long)buf); - } + ip -= vma->vm_start; + ip += vma->vm_pgoff << PAGE_SHIFT; + printk("%s%pD[%lx,%lx+%lx]", prefix, f, ip, + vma->vm_start, + vma->vm_end - vma->vm_start); } mmap_read_unlock(mm); } @@ -6441,3 +6454,15 @@ void ptlock_free(struct ptdesc *ptdesc) kmem_cache_free(page_ptl_cachep, ptdesc->ptl); } #endif + +void vma_pgtable_walk_begin(struct vm_area_struct *vma) +{ + if (is_vm_hugetlb_page(vma)) + hugetlb_vma_lock_read(vma); +} + +void vma_pgtable_walk_end(struct vm_area_struct *vma) +{ + if (is_vm_hugetlb_page(vma)) + hugetlb_vma_unlock_read(vma); +} diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a444e2d7dd..431b1f6753 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1841,6 +1841,7 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) struct migration_target_control mtc = { .nmask = &nmask, .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, + .reason = MR_MEMORY_HOTPLUG, }; int ret; @@ -2050,11 +2051,11 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, } /* - * Dissolve free hugepages in the memory block before doing + * Dissolve free hugetlb folios in the memory block before doing * offlining actually in order to make hugetlbfs's object * counting consistent. 
*/ - ret = dissolve_free_huge_pages(start_pfn, end_pfn); + ret = dissolve_free_hugetlb_folios(start_pfn, end_pfn); if (ret) { reason = "failure to dissolve huge pages"; goto failed_removal_isolated; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0fe77738d9..a1bf9aa15c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -509,8 +509,8 @@ static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk) qp->nr_failed++; return; } - folio = pfn_folio(pmd_pfn(*pmd)); - if (is_huge_zero_page(&folio->page)) { + folio = pmd_folio(*pmd); + if (is_huge_zero_folio(folio)) { walk->action = ACTION_CONTINUE; return; } @@ -642,12 +642,11 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. * Choosing not to migrate a shared folio is not counted as a failure. * - * To check if the folio is shared, ideally we want to make sure - * every page is mapped to the same process. Doing that is very - * expensive, so check the estimated sharers of the folio instead. + * See folio_likely_mapped_shared() on possible imprecision when we + * cannot easily detect if a folio is shared. */ if ((flags & MPOL_MF_MOVE_ALL) || - (folio_estimated_sharers(folio) == 1 && !hugetlb_pmd_shared(pte))) + (!folio_likely_mapped_shared(folio) && !hugetlb_pmd_shared(pte))) if (!isolate_hugetlb(folio, qp->pagelist)) qp->nr_failed++; unlock: @@ -1032,11 +1031,10 @@ static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. * Choosing not to migrate a shared folio is not counted as a failure. * - * To check if the folio is shared, ideally we want to make sure - * every page is mapped to the same process. Doing that is very - * expensive, so check the estimated sharers of the folio instead. + * See folio_likely_mapped_shared() on possible imprecision when we + * cannot easily detect if a folio is shared. 
*/ - if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) { + if ((flags & MPOL_MF_MOVE_ALL) || !folio_likely_mapped_shared(folio)) { if (folio_isolate_lru(folio)) { list_add_tail(&folio->lru, foliolist); node_stat_mod_folio(folio, @@ -1070,6 +1068,7 @@ static long migrate_to_node(struct mm_struct *mm, int source, int dest, struct migration_target_control mtc = { .nid = dest, .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, + .reason = MR_SYSCALL, }; nodes_clear(nmask); @@ -1227,7 +1226,8 @@ static struct folio *alloc_migration_target_by_mpol(struct folio *src, h = folio_hstate(src); gfp = htlb_alloc_mask(h); nodemask = policy_nodemask(gfp, pol, ilx, &nid); - return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp); + return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp, + htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND)); } if (folio_test_large(src)) @@ -1504,9 +1504,10 @@ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags) if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES)) return -EINVAL; if (*flags & MPOL_F_NUMA_BALANCING) { - if (*mode != MPOL_BIND) + if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY) + *flags |= (MPOL_F_MOF | MPOL_F_MORON); + else return -EINVAL; - *flags |= (MPOL_F_MOF | MPOL_F_MORON); } return 0; } @@ -2200,9 +2201,9 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, */ preferred_gfp = gfp | __GFP_NOWARN; preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - page = __alloc_pages(preferred_gfp, order, nid, nodemask); + page = __alloc_pages_noprof(preferred_gfp, order, nid, nodemask); if (!page) - page = __alloc_pages(gfp, order, nid, NULL); + page = __alloc_pages_noprof(gfp, order, nid, NULL); return page; } @@ -2217,7 +2218,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, * * Return: The page on success or NULL if allocation fails. 
*/ -struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, +struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *pol, pgoff_t ilx, int nid) { nodemask_t *nodemask; @@ -2248,7 +2249,7 @@ struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, * First, try to allocate THP only on local node, but * don't reclaim unnecessarily, just compact. */ - page = __alloc_pages_node(nid, + page = __alloc_pages_node_noprof(nid, gfp | __GFP_THISNODE | __GFP_NORETRY, order); if (page || !(gfp & __GFP_DIRECT_RECLAIM)) return page; @@ -2261,7 +2262,7 @@ struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, } } - page = __alloc_pages(gfp, order, nid, nodemask); + page = __alloc_pages_noprof(gfp, order, nid, nodemask); if (unlikely(pol->mode == MPOL_INTERLEAVE) && page) { /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */ @@ -2292,7 +2293,7 @@ struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, * * Return: The folio on success or NULL if allocation fails. */ -struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, +struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, bool hugepage) { struct mempolicy *pol; @@ -2300,12 +2301,12 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, struct page *page; pol = get_vma_policy(vma, addr, order, &ilx); - page = alloc_pages_mpol(gfp | __GFP_COMP, order, - pol, ilx, numa_node_id()); + page = alloc_pages_mpol_noprof(gfp | __GFP_COMP, order, + pol, ilx, numa_node_id()); mpol_cond_put(pol); return page_rmappable_folio(page); } -EXPORT_SYMBOL(vma_alloc_folio); +EXPORT_SYMBOL(vma_alloc_folio_noprof); /** * alloc_pages - Allocate pages. @@ -2321,7 +2322,7 @@ EXPORT_SYMBOL(vma_alloc_folio); * flags are used. * Return: The page on success or NULL if allocation fails. 
*/ -struct page *alloc_pages(gfp_t gfp, unsigned int order) +struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order) { struct mempolicy *pol = &default_policy; @@ -2332,16 +2333,16 @@ struct page *alloc_pages(gfp_t gfp, unsigned int order) if (!in_interrupt() && !(gfp & __GFP_THISNODE)) pol = get_task_policy(current); - return alloc_pages_mpol(gfp, order, - pol, NO_INTERLEAVE_INDEX, numa_node_id()); + return alloc_pages_mpol_noprof(gfp, order, pol, NO_INTERLEAVE_INDEX, + numa_node_id()); } -EXPORT_SYMBOL(alloc_pages); +EXPORT_SYMBOL(alloc_pages_noprof); -struct folio *folio_alloc(gfp_t gfp, unsigned int order) +struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) { - return page_rmappable_folio(alloc_pages(gfp | __GFP_COMP, order)); + return page_rmappable_folio(alloc_pages_noprof(gfp | __GFP_COMP, order)); } -EXPORT_SYMBOL(folio_alloc); +EXPORT_SYMBOL(folio_alloc_noprof); static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, struct mempolicy *pol, unsigned long nr_pages, @@ -2360,13 +2361,13 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, for (i = 0; i < nodes; i++) { if (delta) { - nr_allocated = __alloc_pages_bulk(gfp, + nr_allocated = alloc_pages_bulk_noprof(gfp, interleave_nodes(pol), NULL, nr_pages_per_node + 1, NULL, page_array); delta--; } else { - nr_allocated = __alloc_pages_bulk(gfp, + nr_allocated = alloc_pages_bulk_noprof(gfp, interleave_nodes(pol), NULL, nr_pages_per_node, NULL, page_array); } @@ -2503,11 +2504,11 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, preferred_gfp = gfp | __GFP_NOWARN; preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - nr_allocated = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes, + nr_allocated = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes, nr_pages, NULL, page_array); if (nr_allocated < nr_pages) - nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL, + nr_allocated += alloc_pages_bulk_noprof(gfp, 
numa_node_id(), NULL, nr_pages - nr_allocated, NULL, page_array + nr_allocated); return nr_allocated; @@ -2519,7 +2520,7 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, * It can accelerate memory allocation especially interleaving * allocate memory. */ -unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, +unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp, unsigned long nr_pages, struct page **page_array) { struct mempolicy *pol = &default_policy; @@ -2543,8 +2544,8 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, nid = numa_node_id(); nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid); - return __alloc_pages_bulk(gfp, nid, nodemask, - nr_pages, NULL, page_array); + return alloc_pages_bulk_noprof(gfp, nid, nodemask, + nr_pages, NULL, page_array); } int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) @@ -2718,7 +2719,7 @@ static void sp_free(struct sp_node *n) * mpol_misplaced - check whether current folio node is valid in policy * * @folio: folio to be checked - * @vma: vm area where folio mapped + * @vmf: structure describing the fault * @addr: virtual address in @vma for shared policy lookup and interleave policy * * Lookup current policy node id for vma,addr and "compare to" folio's @@ -2728,18 +2729,24 @@ static void sp_free(struct sp_node *n) * Return: NUMA_NO_NODE if the page is in a node that is valid for this * policy, or a suitable node ID to allocate a replacement folio from. 
*/ -int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma, +int mpol_misplaced(struct folio *folio, struct vm_fault *vmf, unsigned long addr) { struct mempolicy *pol; pgoff_t ilx; struct zoneref *z; int curnid = folio_nid(folio); + struct vm_area_struct *vma = vmf->vma; int thiscpu = raw_smp_processor_id(); - int thisnid = cpu_to_node(thiscpu); + int thisnid = numa_node_id(); int polnid = NUMA_NO_NODE; int ret = NUMA_NO_NODE; + /* + * Make sure ptl is held so that we don't preempt and we + * have a stable smp processor id + */ + lockdep_assert_held(vmf->ptl); pol = get_vma_policy(vma, addr, folio_order(folio), &ilx); if (!(pol->flags & MPOL_F_MOF)) goto out; @@ -2764,15 +2771,26 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma, break; case MPOL_BIND: - /* Optimize placement among multiple nodes via NUMA balancing */ + case MPOL_PREFERRED_MANY: + /* + * Even though MPOL_PREFERRED_MANY can allocate pages outside + * policy nodemask we don't allow numa migration to nodes + * outside policy nodemask for now. This is done so that if we + * want demotion to slow memory to happen, before allocating + * from some DRAM node say 'x', we will end up using a + * MPOL_PREFERRED_MANY mask excluding node 'x'. In such scenario + * we should not promote to node 'x' from slow memory node. + */ if (pol->flags & MPOL_F_MORON) { + /* + * Optimize placement among multiple nodes + * via NUMA balancing + */ if (node_isset(thisnid, pol->nodes)) break; goto out; } - fallthrough; - case MPOL_PREFERRED_MANY: /* * use current page if in policy nodemask, * else select nearest allowed node, if any. 
@@ -2781,7 +2799,7 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma, if (node_isset(curnid, pol->nodes)) goto out; z = first_zones_zonelist( - node_zonelist(numa_node_id(), GFP_HIGHUSER), + node_zonelist(thisnid, GFP_HIGHUSER), gfp_zone(GFP_HIGHUSER), &pol->nodes); polnid = zone_to_nid(z->zone); @@ -3275,8 +3293,9 @@ out: * @pol: pointer to mempolicy to be formatted * * Convert @pol into a string. If @buffer is too short, truncate the string. - * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the - * longest flag, "relative", and to display at least a few node ids. + * Recommend a @maxlen of at least 51 for the longest mode, "weighted + * interleave", plus the longest flag flags, "relative|balancing", and to + * display at least a few node ids. */ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) { @@ -3285,7 +3304,10 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) unsigned short mode = MPOL_DEFAULT; unsigned short flags = 0; - if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) { + if (pol && + pol != &default_policy && + !(pol >= &preferred_node_policy[0] && + pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - 1])) { mode = pol->mode; flags = pol->flags; } @@ -3313,12 +3335,18 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) p += snprintf(p, buffer + maxlen - p, "="); /* - * Currently, the only defined flags are mutually exclusive + * Static and relative are mutually exclusive. 
*/ if (flags & MPOL_F_STATIC_NODES) p += snprintf(p, buffer + maxlen - p, "static"); else if (flags & MPOL_F_RELATIVE_NODES) p += snprintf(p, buffer + maxlen - p, "relative"); + + if (flags & MPOL_F_NUMA_BALANCING) { + if (!is_power_of_2(flags & MPOL_MODE_FLAGS)) + p += snprintf(p, buffer + maxlen - p, "|"); + p += snprintf(p, buffer + maxlen - p, "balancing"); + } } if (!nodes_empty(nodes)) diff --git a/mm/mempool.c b/mm/mempool.c index 076c736f5f..3223337135 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -240,22 +240,24 @@ EXPORT_SYMBOL(mempool_init_node); * * Return: %0 on success, negative error code otherwise. */ -int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data) +int mempool_init_noprof(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data) { return mempool_init_node(pool, min_nr, alloc_fn, free_fn, pool_data, GFP_KERNEL, NUMA_NO_NODE); } -EXPORT_SYMBOL(mempool_init); +EXPORT_SYMBOL(mempool_init_noprof); /** - * mempool_create - create a memory pool + * mempool_create_node - create a memory pool * @min_nr: the minimum number of elements guaranteed to be * allocated for this pool. * @alloc_fn: user-defined element-allocation function. * @free_fn: user-defined element-freeing function. * @pool_data: optional private data available to the user-defined functions. + * @gfp_mask: memory allocation flags + * @node_id: numa node to allocate on * * this function creates and allocates a guaranteed size, preallocated * memory pool. The pool can be used from the mempool_alloc() and mempool_free() @@ -265,21 +267,13 @@ EXPORT_SYMBOL(mempool_init); * * Return: pointer to the created memory pool object or %NULL on error. 
*/ -mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data) -{ - return mempool_create_node(min_nr, alloc_fn, free_fn, pool_data, - GFP_KERNEL, NUMA_NO_NODE); -} -EXPORT_SYMBOL(mempool_create); - -mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data, - gfp_t gfp_mask, int node_id) +mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data, + gfp_t gfp_mask, int node_id) { mempool_t *pool; - pool = kzalloc_node(sizeof(*pool), gfp_mask, node_id); + pool = kmalloc_node_noprof(sizeof(*pool), gfp_mask | __GFP_ZERO, node_id); if (!pool) return NULL; @@ -291,7 +285,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, return pool; } -EXPORT_SYMBOL(mempool_create_node); +EXPORT_SYMBOL(mempool_create_node_noprof); /** * mempool_resize - resize an existing memory pool @@ -387,7 +381,7 @@ EXPORT_SYMBOL(mempool_resize); * * Return: pointer to the allocated element or %NULL on error. 
*/ -void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) +void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) { void *element; unsigned long flags; @@ -454,7 +448,7 @@ repeat_alloc: finish_wait(&pool->wait, &wait); goto repeat_alloc; } -EXPORT_SYMBOL(mempool_alloc); +EXPORT_SYMBOL(mempool_alloc_noprof); /** * mempool_alloc_preallocated - allocate an element from preallocated elements @@ -562,7 +556,7 @@ void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) { struct kmem_cache *mem = pool_data; VM_BUG_ON(mem->ctor); - return kmem_cache_alloc(mem, gfp_mask); + return kmem_cache_alloc_noprof(mem, gfp_mask); } EXPORT_SYMBOL(mempool_alloc_slab); @@ -580,7 +574,7 @@ EXPORT_SYMBOL(mempool_free_slab); void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data) { size_t size = (size_t)pool_data; - return kmalloc(size, gfp_mask); + return kmalloc_noprof(size, gfp_mask); } EXPORT_SYMBOL(mempool_kmalloc); @@ -610,7 +604,7 @@ EXPORT_SYMBOL(mempool_kvfree); void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data) { int order = (int)(long)pool_data; - return alloc_pages(gfp_mask, order); + return alloc_pages_noprof(gfp_mask, order); } EXPORT_SYMBOL(mempool_alloc_pages); diff --git a/mm/memremap.c b/mm/memremap.c index 9e9fb1972f..40d4547ce5 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -456,21 +456,23 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, } EXPORT_SYMBOL_GPL(get_dev_pagemap); -void free_zone_device_page(struct page *page) +void free_zone_device_folio(struct folio *folio) { - if (WARN_ON_ONCE(!page->pgmap->ops || !page->pgmap->ops->page_free)) + if (WARN_ON_ONCE(!folio->page.pgmap->ops || + !folio->page.pgmap->ops->page_free)) return; - mem_cgroup_uncharge(page_folio(page)); + mem_cgroup_uncharge(folio); /* * Note: we don't expect anonymous compound pages yet. Once supported * and we could PTE-map them similar to THP, we'd have to clear * PG_anon_exclusive on all tail pages. 
*/ - VM_BUG_ON_PAGE(PageAnon(page) && PageCompound(page), page); - if (PageAnon(page)) - __ClearPageAnonExclusive(page); + if (folio_test_anon(folio)) { + VM_BUG_ON_FOLIO(folio_test_large(folio), folio); + __ClearPageAnonExclusive(folio_page(folio, 0)); + } /* * When a device managed page is freed, the folio->mapping field @@ -481,20 +483,20 @@ void free_zone_device_page(struct page *page) * * For other types of ZONE_DEVICE pages, migration is either * handled differently or not done at all, so there is no need - * to clear page->mapping. + * to clear folio->mapping. */ - page->mapping = NULL; - page->pgmap->ops->page_free(page); + folio->mapping = NULL; + folio->page.pgmap->ops->page_free(folio_page(folio, 0)); - if (page->pgmap->type != MEMORY_DEVICE_PRIVATE && - page->pgmap->type != MEMORY_DEVICE_COHERENT) + if (folio->page.pgmap->type != MEMORY_DEVICE_PRIVATE && + folio->page.pgmap->type != MEMORY_DEVICE_COHERENT) /* - * Reset the page count to 1 to prepare for handing out the page + * Reset the refcount to 1 to prepare for handing out the page * again. */ - set_page_count(page, 1); + folio_set_count(folio, 1); else - put_dev_pagemap(page->pgmap); + put_dev_pagemap(folio->page.pgmap); } void zone_device_page_init(struct page *page) @@ -510,9 +512,9 @@ void zone_device_page_init(struct page *page) EXPORT_SYMBOL_GPL(zone_device_page_init); #ifdef CONFIG_FS_DAX -bool __put_devmap_managed_page_refs(struct page *page, int refs) +bool __put_devmap_managed_folio_refs(struct folio *folio, int refs) { - if (page->pgmap->type != MEMORY_DEVICE_FS_DAX) + if (folio->page.pgmap->type != MEMORY_DEVICE_FS_DAX) return false; /* @@ -520,9 +522,9 @@ bool __put_devmap_managed_page_refs(struct page *page, int refs) * refcount is 1, then the page is free and the refcount is * stable because nobody holds a reference on the page. 
*/ - if (page_ref_sub_return(page, refs) == 1) - wake_up_var(&page->_refcount); + if (folio_ref_sub_return(folio, refs) == 1) + wake_up_var(&folio->_refcount); return true; } -EXPORT_SYMBOL(__put_devmap_managed_page_refs); +EXPORT_SYMBOL(__put_devmap_managed_folio_refs); #endif /* CONFIG_FS_DAX */ diff --git a/mm/migrate.c b/mm/migrate.c index 8f99fcea99..a8c6f466e3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -113,7 +113,7 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode) if (!mops->isolate_page(&folio->page, mode)) goto out_no_isolated; - /* Driver shouldn't use PG_isolated bit of page->flags */ + /* Driver shouldn't use the isolated flag */ WARN_ON_ONCE(folio_test_isolated(folio)); folio_set_isolated(folio); folio_unlock(folio); @@ -629,7 +629,7 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio) folio_migrate_ksm(newfolio, folio); /* * Please do not reorder this without considering how mm/ksm.c's - * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache(). + * ksm_get_folio() depends upon ksm_migrate_page() and PageSwapCache(). */ if (folio_test_swapcache(folio)) folio_clear_swapcache(folio); @@ -1438,7 +1438,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio, * semaphore in write mode here and set TTU_RMAP_LOCKED * to let lower levels know we have taken the lock. */ - mapping = hugetlb_page_mapping_lock_write(&src->page); + mapping = hugetlb_folio_mapping_lock_write(src); if (unlikely(!mapping)) goto unlock_put_anon; @@ -1666,6 +1666,40 @@ static int migrate_pages_batch(struct list_head *from, cond_resched(); /* + * The rare folio on the deferred split list should + * be split now. It should not count as a failure: + * but increment nr_failed because, without doing so, + * migrate_pages() may report success with (split but + * unmigrated) pages still on its fromlist; whereas it + * always reports success when its fromlist is empty. 
+ * stats->nr_thp_failed should be increased too, + * otherwise stats inconsistency will happen when + * migrate_pages_batch is called via migrate_pages() + * with MIGRATE_SYNC and MIGRATE_ASYNC. + * + * Only check it without removing it from the list. + * Since the folio can be on deferred_split_scan() + * local list and removing it can cause the local list + * corruption. Folio split process below can handle it + * with the help of folio_ref_freeze(). + * + * nr_pages > 2 is needed to avoid checking order-1 + * page cache folios. They exist, in contrast to + * non-existent order-1 anonymous folios, and do not + * use _deferred_list. + */ + if (nr_pages > 2 && + !list_empty(&folio->_deferred_list)) { + if (try_split_folio(folio, split_folios) == 0) { + nr_failed++; + stats->nr_thp_failed += is_thp; + stats->nr_thp_split += is_thp; + stats->nr_split++; + continue; + } + } + + /* * Large folio migration might be unsupported or * the allocation might be failed so we should retry * on the same folio with the large folio split @@ -2035,7 +2069,8 @@ struct folio *alloc_migration_target(struct folio *src, unsigned long private) gfp_mask = htlb_modify_alloc_mask(h, gfp_mask); return alloc_hugetlb_folio_nodemask(h, nid, - mtc->nmask, gfp_mask); + mtc->nmask, gfp_mask, + htlb_allow_alloc_fallback(mtc->reason)); } if (folio_test_large(src)) { @@ -2073,6 +2108,7 @@ static int do_move_pages_to_node(struct list_head *pagelist, int node) struct migration_target_control mtc = { .nid = node, .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, + .reason = MR_SYSCALL, }; err = migrate_pages(pagelist, alloc_migration_target, NULL, @@ -2128,7 +2164,7 @@ static int add_page_for_migration(struct mm_struct *mm, const void __user *p, goto out_putfolio; err = -EACCES; - if (page_mapcount(page) > 1 && !migrate_all) + if (folio_likely_mapped_shared(folio) && !migrate_all) goto out_putfolio; err = -EBUSY; @@ -2581,11 +2617,11 @@ int migrate_misplaced_folio(struct folio *folio, struct 
vm_area_struct *vma, /* * Don't migrate file folios that are mapped in multiple processes * with execute permissions as they are probably shared libraries. - * To check if the folio is shared, ideally we want to make sure - * every page is mapped to the same process. Doing that is very - * expensive, so check the estimated mapcount of the folio instead. + * + * See folio_likely_mapped_shared() on possible imprecision when we + * cannot easily detect if a folio is shared. */ - if (folio_estimated_sharers(folio) != 1 && folio_is_file_lru(folio) && + if (folio_likely_mapped_shared(folio) && folio_is_file_lru(folio) && (vma->vm_flags & VM_EXEC)) goto out; diff --git a/mm/migrate_device.c b/mm/migrate_device.c index b6c27c76e1..aecc71972a 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -71,7 +71,7 @@ again: return migrate_vma_collect_hole(start, end, -1, walk); if (pmd_trans_huge(*pmdp)) { - struct page *page; + struct folio *folio; ptl = pmd_lock(mm, pmdp); if (unlikely(!pmd_trans_huge(*pmdp))) { @@ -79,21 +79,21 @@ again: goto again; } - page = pmd_page(*pmdp); - if (is_huge_zero_page(page)) { + folio = pmd_folio(*pmdp); + if (is_huge_zero_folio(folio)) { spin_unlock(ptl); split_huge_pmd(vma, pmdp, addr); } else { int ret; - get_page(page); + folio_get(folio); spin_unlock(ptl); - if (unlikely(!trylock_page(page))) + if (unlikely(!folio_trylock(folio))) return migrate_vma_collect_skip(start, end, walk); - ret = split_huge_page(page); - unlock_page(page); - put_page(page); + ret = split_folio(folio); + folio_unlock(folio); + folio_put(folio); if (ret) return migrate_vma_collect_skip(start, end, walk); @@ -324,6 +324,8 @@ static void migrate_vma_collect(struct migrate_vma *migrate) */ static bool migrate_vma_check_page(struct page *page, struct page *fault_page) { + struct folio *folio = page_folio(page); + /* * One extra ref because caller holds an extra reference, either from * isolate_lru_page() for a regular page, or migrate_vma_collect() for @@ -336,18 
+338,18 @@ static bool migrate_vma_check_page(struct page *page, struct page *fault_page) * check them than regular pages, because they can be mapped with a pmd * or with a pte (split pte mapping). */ - if (PageCompound(page)) + if (folio_test_large(folio)) return false; /* Page from ZONE_DEVICE have one extra reference */ - if (is_zone_device_page(page)) + if (folio_is_zone_device(folio)) extra++; /* For file back page */ - if (page_mapping(page)) - extra += 1 + page_has_private(page); + if (folio_mapping(folio)) + extra += 1 + folio_has_private(folio); - if ((page_count(page) - extra) > page_mapcount(page)) + if ((folio_ref_count(folio) - extra) > folio_mapcount(folio)) return false; return true; @@ -664,13 +666,9 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, if (flush) { flush_cache_page(vma, addr, pte_pfn(orig_pte)); ptep_clear_flush(vma, addr, ptep); - set_pte_at_notify(mm, addr, ptep, entry); - update_mmu_cache(vma, addr, ptep); - } else { - /* No need to invalidate - it was non-present before */ - set_pte_at(mm, addr, ptep, entry); - update_mmu_cache(vma, addr, ptep); } + set_pte_at(mm, addr, ptep, entry); + update_mmu_cache(vma, addr, ptep); pte_unmap_unlock(ptep, ptl); *src = MIGRATE_PFN_MIGRATE; @@ -694,6 +692,7 @@ static void __migrate_device_pages(unsigned long *src_pfns, struct page *newpage = migrate_pfn_to_page(dst_pfns[i]); struct page *page = migrate_pfn_to_page(src_pfns[i]); struct address_space *mapping; + struct folio *folio; int r; if (!newpage) { @@ -728,15 +727,12 @@ static void __migrate_device_pages(unsigned long *src_pfns, continue; } - mapping = page_mapping(page); + folio = page_folio(page); + mapping = folio_mapping(folio); if (is_device_private_page(newpage) || is_device_coherent_page(newpage)) { if (mapping) { - struct folio *folio; - - folio = page_folio(page); - /* * For now only support anonymous memory migrating to * device private or coherent memory. 
@@ -759,11 +755,10 @@ static void __migrate_device_pages(unsigned long *src_pfns, if (migrate && migrate->fault_page == page) r = migrate_folio_extra(mapping, page_folio(newpage), - page_folio(page), - MIGRATE_SYNC_NO_COPY, 1); + folio, MIGRATE_SYNC_NO_COPY, 1); else r = migrate_folio(mapping, page_folio(newpage), - page_folio(page), MIGRATE_SYNC_NO_COPY); + folio, MIGRATE_SYNC_NO_COPY); if (r != MIGRATEPAGE_SUCCESS) src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; } diff --git a/mm/mlock.c b/mm/mlock.c index 1ed2f2ab37..30b51cdea8 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -378,7 +378,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, goto out; if (is_huge_zero_pmd(*pmd)) goto out; - folio = page_folio(pmd_page(*pmd)); + folio = pmd_folio(*pmd); if (vma->vm_flags & VM_LOCKED) mlock_folio(folio); else diff --git a/mm/mm_init.c b/mm/mm_init.c index 549e76af8f..3ec04933f7 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -24,9 +24,11 @@ #include <linux/page_ext.h> #include <linux/pti.h> #include <linux/pgtable.h> +#include <linux/stackdepot.h> #include <linux/swap.h> #include <linux/cma.h> #include <linux/crash_dump.h> +#include <linux/execmem.h> #include "internal.h" #include "slab.h" #include "shuffle.h" @@ -226,7 +228,6 @@ static unsigned long required_movablecore_percent __initdata; static unsigned long nr_kernel_pages __initdata; static unsigned long nr_all_pages __initdata; -static unsigned long dma_reserve __initdata; static bool deferred_struct_pages __meminitdata; @@ -1144,7 +1145,7 @@ static void __init adjust_zone_range_for_zone_movable(int nid, * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, * then all holes in the requested range will be accounted for. 
*/ -unsigned long __init __absent_pages_in_range(int nid, +static unsigned long __init __absent_pages_in_range(int nid, unsigned long range_start_pfn, unsigned long range_end_pfn) { @@ -1265,6 +1266,30 @@ static void __init reset_memoryless_node_totalpages(struct pglist_data *pgdat) pr_debug("On node %d totalpages: 0\n", pgdat->node_id); } +static void __init calc_nr_kernel_pages(void) +{ + unsigned long start_pfn, end_pfn; + phys_addr_t start_addr, end_addr; + u64 u; +#ifdef CONFIG_HIGHMEM + unsigned long high_zone_low = arch_zone_lowest_possible_pfn[ZONE_HIGHMEM]; +#endif + + for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start_addr, &end_addr, NULL) { + start_pfn = PFN_UP(start_addr); + end_pfn = PFN_DOWN(end_addr); + + if (start_pfn < end_pfn) { + nr_all_pages += end_pfn - start_pfn; +#ifdef CONFIG_HIGHMEM + start_pfn = clamp(start_pfn, 0, high_zone_low); + end_pfn = clamp(end_pfn, 0, high_zone_low); +#endif + nr_kernel_pages += end_pfn - start_pfn; + } + } +} + static void __init calculate_node_totalpages(struct pglist_data *pgdat, unsigned long node_start_pfn, unsigned long node_end_pfn) @@ -1308,26 +1333,6 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat, pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); } -static unsigned long __init calc_memmap_size(unsigned long spanned_pages, - unsigned long present_pages) -{ - unsigned long pages = spanned_pages; - - /* - * Provide a more accurate estimation if there are holes within - * the zone and SPARSEMEM is in use. If there are holes within the - * zone, each populated memory region may cost us one or two extra - * memmap pages due to alignment because memmap pages for each - * populated regions may not be naturally aligned on page boundary. - * So the (present_pages >> 4) heuristic is a tradeoff for that. 
- */ - if (spanned_pages > present_pages + (present_pages >> 4) && - IS_ENABLED(CONFIG_SPARSEMEM)) - pages = present_pages; - - return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; -} - #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void pgdat_init_split_queue(struct pglist_data *pgdat) { @@ -1542,15 +1547,6 @@ void __ref free_area_init_core_hotplug(struct pglist_data *pgdat) } #endif -/* - * Set up the zone data structures: - * - mark all pages reserved - * - mark all memory queues empty - * - clear the memory bitmaps - * - * NOTE: pgdat should get zeroed by caller. - * NOTE: this function is only called during early init. - */ static void __init free_area_init_core(struct pglist_data *pgdat) { enum zone_type j; @@ -1561,47 +1557,13 @@ static void __init free_area_init_core(struct pglist_data *pgdat) for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; - unsigned long size, freesize, memmap_pages; - - size = zone->spanned_pages; - freesize = zone->present_pages; - - /* - * Adjust freesize so that it accounts for how much memory - * is used by this zone for memmap. 
This affects the watermark - * and per-cpu initialisations - */ - memmap_pages = calc_memmap_size(size, freesize); - if (!is_highmem_idx(j)) { - if (freesize >= memmap_pages) { - freesize -= memmap_pages; - if (memmap_pages) - pr_debug(" %s zone: %lu pages used for memmap\n", - zone_names[j], memmap_pages); - } else - pr_warn(" %s zone: %lu memmap pages exceeds freesize %lu\n", - zone_names[j], memmap_pages, freesize); - } - - /* Account for reserved pages */ - if (j == 0 && freesize > dma_reserve) { - freesize -= dma_reserve; - pr_debug(" %s zone: %lu pages reserved\n", zone_names[0], dma_reserve); - } - - if (!is_highmem_idx(j)) - nr_kernel_pages += freesize; - /* Charge for highmem memmap if there are enough kernel pages */ - else if (nr_kernel_pages > memmap_pages * 2) - nr_kernel_pages -= memmap_pages; - nr_all_pages += freesize; + unsigned long size = zone->spanned_pages; /* - * Set an approximate value for lowmem here, it will be adjusted - * when the bootmem allocator frees pages into the buddy system. - * And all highmem pages will be managed by the buddy system. + * Initialize zone->managed_pages as 0 , it will be reset + * when memblock allocator frees pages into buddy system. */ - zone_init_internals(zone, j, nid, freesize); + zone_init_internals(zone, j, nid, zone->present_pages); if (!size) continue; @@ -1874,30 +1836,26 @@ void __init free_area_init(unsigned long *max_zone_pfn) panic("Cannot allocate %zuB for node %d.\n", sizeof(*pgdat), nid); arch_refresh_nodedata(nid, pgdat); - free_area_init_node(nid); - - /* - * We do not want to confuse userspace by sysfs - * files/directories for node without any memory - * attached to it, so this node is not marked as - * N_MEMORY and not marked online so that no sysfs - * hierarchy will be created via register_one_node for - * it. The pgdat will get fully initialized by - * hotadd_init_pgdat() when memory is hotplugged into - * this node. 
- */ - continue; } pgdat = NODE_DATA(nid); free_area_init_node(nid); - /* Any memory on that node */ - if (pgdat->node_present_pages) + /* + * No sysfs hierarcy will be created via register_one_node() + *for memory-less node because here it's not marked as N_MEMORY + *and won't be set online later. The benefit is userspace + *program won't be confused by sysfs files/directories of + *memory-less node. The pgdat will get fully initialized by + *hotadd_init_pgdat() when memory is hotplugged into this node. + */ + if (pgdat->node_present_pages) { node_set_state(nid, N_MEMORY); - check_for_memory(pgdat); + check_for_memory(pgdat); + } } + calc_nr_kernel_pages(); memmap_init(); /* disable hash distribution for systems with a single node */ @@ -2057,7 +2015,7 @@ static unsigned long __init deferred_init_pages(struct zone *zone, __init_single_page(page, pfn, zid, nid); nr_pages++; } - return (nr_pages); + return nr_pages; } /* @@ -2259,10 +2217,6 @@ zone_empty: * Return true when zone was grown, otherwise return false. We return true even * when we grow less than requested, to let the caller decide if there are * enough pages to satisfy the allocation. - * - * Note: We use noinline because this function is needed only during boot, and - * it is called from a __ref function _deferred_grow_zone. This way we are - * making sure that it is not inlined into permanent text section. */ bool __init deferred_grow_zone(struct zone *zone, unsigned int order) { @@ -2412,17 +2366,6 @@ void __init page_alloc_init_late(void) page_alloc_sysctl_init(); } -#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES -/* - * Returns the number of pages that arch has reserved but - * is not known to alloc_large_system_hash(). - */ -static unsigned long __init arch_reserved_kernel_pages(void) -{ - return 0; -} -#endif - /* * Adaptive scale is meant to reduce sizes of hash tables on large memory * machines. 
As memory size is increased the scale is also increased but at @@ -2465,7 +2408,6 @@ void *__init alloc_large_system_hash(const char *tablename, if (!numentries) { /* round applicable memory size up to nearest megabyte */ numentries = nr_kernel_pages; - numentries -= arch_reserved_kernel_pages(); /* It isn't necessary when PAGE_SIZE >= 1MB */ if (PAGE_SIZE < SZ_1M) @@ -2547,26 +2489,9 @@ void *__init alloc_large_system_hash(const char *tablename, return table; } -/** - * set_dma_reserve - set the specified number of pages reserved in the first zone - * @new_dma_reserve: The number of pages to mark reserved - * - * The per-cpu batchsize and zone watermarks are determined by managed_pages. - * In the DMA zone, a significant percentage may be consumed by kernel image - * and other unfreeable allocations which can skew the watermarks badly. This - * function may optionally be used to account for unfreeable pages in the - * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and - * smaller per-cpu batchsize. - */ -void __init set_dma_reserve(unsigned long new_dma_reserve) -{ - dma_reserve = new_dma_reserve; -} - void __init memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order) { - if (IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) { int nid = early_pfn_to_nid(pfn); @@ -2578,6 +2503,17 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn, /* KMSAN will take care of these pages. 
*/ return; } + + /* pages were reserved and not allocated */ + if (mem_alloc_profiling_enabled()) { + union codetag_ref *ref = get_page_tag_ref(page); + + if (ref) { + set_codetag_empty(ref); + put_page_tag_ref(ref); + } + } + __free_pages_core(page, order); } @@ -2793,4 +2729,5 @@ void __init mm_core_init(void) pti_init(); kmsan_init_runtime(); mm_cache_init(); + execmem_init(); } @@ -1114,21 +1114,21 @@ static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_ */ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) { - MA_STATE(mas, &vma->vm_mm->mm_mt, vma->vm_end, vma->vm_end); struct anon_vma *anon_vma = NULL; struct vm_area_struct *prev, *next; + VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end); /* Try next first. */ - next = mas_walk(&mas); + next = vma_iter_load(&vmi); if (next) { anon_vma = reusable_anon_vma(next, vma, next); if (anon_vma) return anon_vma; } - prev = mas_prev(&mas, 0); + prev = vma_prev(&vmi); VM_BUG_ON_VMA(prev != vma, vma); - prev = mas_prev(&mas, 0); + prev = vma_prev(&vmi); /* Try prev next. */ if (prev) anon_vma = reusable_anon_vma(prev, prev, vma); @@ -1255,17 +1255,15 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (mm->map_count > sysctl_max_map_count) return -ENOMEM; - /* Obtain the address to map to. we verify (or select) it and ensure - * that it represents a valid section of the address space. + /* + * addr is returned from get_unmapped_area, + * There are two cases: + * 1> MAP_FIXED == false + * unallocated memory, no need to check sealing. + * 1> MAP_FIXED == true + * sealing is checked inside mmap_region when + * do_vmi_munmap is called. 
*/ - addr = get_unmapped_area(file, addr, len, pgoff, flags); - if (IS_ERR_VALUE(addr)) - return addr; - - if (flags & MAP_FIXED_NOREPLACE) { - if (find_vma_intersection(mm, addr, addr + len)) - return -EEXIST; - } if (prot == PROT_EXEC) { pkey = execute_only_pkey(mm); @@ -1280,6 +1278,18 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + /* Obtain the address to map to. we verify (or select) it and ensure + * that it represents a valid section of the address space. + */ + addr = __get_unmapped_area(file, addr, len, pgoff, flags, vm_flags); + if (IS_ERR_VALUE(addr)) + return addr; + + if (flags & MAP_FIXED_NOREPLACE) { + if (find_vma_intersection(mm, addr, addr + len)) + return -EEXIST; + } + if (flags & MAP_LOCKED) if (!can_do_mlock()) return -EPERM; @@ -1294,7 +1304,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (!file_mmap_ok(file, inode, pgoff, len)) return -EOVERFLOW; - flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags; + flags_mask = LEGACY_MAP_MASK; + if (file->f_op->fop_flags & FOP_MMAP_SYNC) + flags_mask |= MAP_SYNC; switch (flags & MAP_TYPE) { case MAP_SHARED: @@ -1514,32 +1526,32 @@ bool vma_needs_dirty_tracking(struct vm_area_struct *vma) * to the private version (using protection_map[] without the * VM_SHARED bit). */ -int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) +bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) { /* If it was private or non-writable, the write bit is already clear */ if (!vma_is_shared_writable(vma)) - return 0; + return false; /* The backer wishes to know when pages are first written to? */ if (vm_ops_needs_writenotify(vma->vm_ops)) - return 1; + return true; /* The open routine did something to the protections that pgprot_modify * won't preserve? 
*/ if (pgprot_val(vm_page_prot) != pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags))) - return 0; + return false; /* * Do we need to track softdirty? hugetlb does not support softdirty * tracking yet. */ if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma)) - return 1; + return true; /* Do we need write faults for uffd-wp tracking? */ if (userfaultfd_wp(vma)) - return 1; + return true; /* Can the mapping track the dirty pages? */ return vma_fs_can_writeback(vma); @@ -1549,14 +1561,14 @@ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) * We account for memory if it's a private writeable mapping, * not hugepages and VM_NORESERVE wasn't set. */ -static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) +static inline bool accountable_mapping(struct file *file, vm_flags_t vm_flags) { /* * hugetlb has its own accounting separate from the core VM * VM_HUGETLB may not be set yet so we cannot check for that flag. */ if (file && is_file_hugepages(file)) - return 0; + return false; return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; } @@ -1576,11 +1588,10 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) unsigned long length, gap; unsigned long low_limit, high_limit; struct vm_area_struct *tmp; - - MA_STATE(mas, &current->mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, current->mm, 0); /* Adjust search length to account for worst case alignment overhead */ - length = info->length + info->align_mask; + length = info->length + info->align_mask + info->start_gap; if (length < info->length) return -ENOMEM; @@ -1589,23 +1600,29 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) low_limit = mmap_min_addr; high_limit = info->high_limit; retry: - if (mas_empty_area(&mas, low_limit, high_limit - 1, length)) + if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length)) return -ENOMEM; - gap = mas.index; + /* + * Adjust for the gap first so it doesn't interfere with the +
* later alignment. The first step is the minimum needed to + * fulill the start gap, the next steps is the minimum to align + * that. It is the minimum needed to fulill both. + */ + gap = vma_iter_addr(&vmi) + info->start_gap; gap += (info->align_offset - gap) & info->align_mask; - tmp = mas_next(&mas, ULONG_MAX); + tmp = vma_next(&vmi); if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ if (vm_start_gap(tmp) < gap + length - 1) { low_limit = tmp->vm_end; - mas_reset(&mas); + vma_iter_reset(&vmi); goto retry; } } else { - tmp = mas_prev(&mas, 0); + tmp = vma_prev(&vmi); if (tmp && vm_end_gap(tmp) > gap) { low_limit = vm_end_gap(tmp); - mas_reset(&mas); + vma_iter_reset(&vmi); goto retry; } } @@ -1628,10 +1645,10 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) unsigned long length, gap, gap_end; unsigned long low_limit, high_limit; struct vm_area_struct *tmp; + VMA_ITERATOR(vmi, current->mm, 0); - MA_STATE(mas, &current->mm->mm_mt, 0, 0); /* Adjust search length to account for worst case alignment overhead */ - length = info->length + info->align_mask; + length = info->length + info->align_mask + info->start_gap; if (length < info->length) return -ENOMEM; @@ -1640,24 +1657,24 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) low_limit = mmap_min_addr; high_limit = info->high_limit; retry: - if (mas_empty_area_rev(&mas, low_limit, high_limit - 1, length)) + if (vma_iter_area_highest(&vmi, low_limit, high_limit, length)) return -ENOMEM; - gap = mas.last + 1 - info->length; + gap = vma_iter_end(&vmi) - info->length; gap -= (gap - info->align_offset) & info->align_mask; - gap_end = mas.last; - tmp = mas_next(&mas, ULONG_MAX); + gap_end = vma_iter_end(&vmi); + tmp = vma_next(&vmi); if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ - if (vm_start_gap(tmp) <= gap_end) { + if (vm_start_gap(tmp) < gap_end) { high_limit = vm_start_gap(tmp); -
mas_reset(&mas); + vma_iter_reset(&vmi); goto retry; } } else { - tmp = mas_prev(&mas, 0); + tmp = vma_prev(&vmi); if (tmp && vm_end_gap(tmp) > gap) { high_limit = tmp->vm_start; - mas_reset(&mas); + vma_iter_reset(&vmi); goto retry; } } @@ -1705,7 +1722,7 @@ generic_get_unmapped_area(struct file *filp, unsigned long addr, { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; - struct vm_unmapped_area_info info; + struct vm_unmapped_area_info info = {}; const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); if (len > mmap_end - mmap_min_addr) @@ -1723,12 +1740,9 @@ generic_get_unmapped_area(struct file *filp, unsigned long addr, return addr; } - info.flags = 0; info.length = len; info.low_limit = mm->mmap_base; info.high_limit = mmap_end; - info.align_mask = 0; - info.align_offset = 0; return vm_unmapped_area(&info); } @@ -1753,7 +1767,7 @@ generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, { struct vm_area_struct *vma, *prev; struct mm_struct *mm = current->mm; - struct vm_unmapped_area_info info; + struct vm_unmapped_area_info info = {}; const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); /* requested length too big for entire address space */ @@ -1777,8 +1791,6 @@ generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, info.length = len; info.low_limit = PAGE_SIZE; info.high_limit = arch_get_mmap_base(addr, mm->mmap_base); - info.align_mask = 0; - info.align_offset = 0; addr = vm_unmapped_area(&info); /* @@ -1808,12 +1820,41 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, } #endif +#ifndef HAVE_ARCH_UNMAPPED_AREA_VMFLAGS +unsigned long +arch_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) +{ + return arch_get_unmapped_area(filp, addr, len, pgoff, flags); +} + unsigned long -get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, - 
unsigned long pgoff, unsigned long flags) +arch_get_unmapped_area_topdown_vmflags(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags, vm_flags_t vm_flags) +{ + return arch_get_unmapped_area_topdown(filp, addr, len, pgoff, flags); +} +#endif + +unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *filp, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags, + vm_flags_t vm_flags) +{ + if (test_bit(MMF_TOPDOWN, &mm->flags)) + return arch_get_unmapped_area_topdown_vmflags(filp, addr, len, pgoff, + flags, vm_flags); + return arch_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, vm_flags); +} + +unsigned long +__get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { unsigned long (*get_area)(struct file *, unsigned long, - unsigned long, unsigned long, unsigned long); + unsigned long, unsigned long, unsigned long) + = NULL; unsigned long error = arch_mmap_check(addr, len, flags); if (error) @@ -1823,7 +1864,6 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, if (len > TASK_SIZE) return -ENOMEM; - get_area = current->mm->get_unmapped_area; if (file) { if (file->f_op->get_unmapped_area) get_area = file->f_op->get_unmapped_area; @@ -1833,16 +1873,22 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, * so use shmem's get_unmapped_area in case it can be huge. */ get_area = shmem_get_unmapped_area; - } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { - /* Ensures that larger anonymous mappings are THP aligned. */ - get_area = thp_get_unmapped_area; } /* Always treat pgoff as zero for anonymous memory. 
*/ if (!file) pgoff = 0; - addr = get_area(file, addr, len, pgoff, flags); + if (get_area) { + addr = get_area(file, addr, len, pgoff, flags); + } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { + /* Ensures that larger anonymous mappings are THP aligned. */ + addr = thp_get_unmapped_area_vmflags(file, addr, len, + pgoff, flags, vm_flags); + } else { + addr = mm_get_unmapped_area_vmflags(current->mm, file, addr, len, + pgoff, flags, vm_flags); + } if (IS_ERR_VALUE(addr)) return addr; @@ -1855,7 +1901,16 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, return error ? error : addr; } -EXPORT_SYMBOL(get_unmapped_area); +unsigned long +mm_get_unmapped_area(struct mm_struct *mm, struct file *file, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + if (test_bit(MMF_TOPDOWN, &mm->flags)) + return arch_get_unmapped_area_topdown(file, addr, len, pgoff, flags); + return arch_get_unmapped_area(file, addr, len, pgoff, flags); +} +EXPORT_SYMBOL(mm_get_unmapped_area); /** * find_vma_intersection() - Look up the first VMA which intersects the interval @@ -1912,12 +1967,12 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr, struct vm_area_struct **pprev) { struct vm_area_struct *vma; - MA_STATE(mas, &mm->mm_mt, addr, addr); + VMA_ITERATOR(vmi, mm, addr); - vma = mas_walk(&mas); - *pprev = mas_prev(&mas, 0); + vma = vma_iter_load(&vmi); + *pprev = vma_prev(&vmi); if (!vma) - vma = mas_next(&mas, ULONG_MAX); + vma = vma_next(&vmi); return vma; } @@ -1971,7 +2026,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address) struct vm_area_struct *next; unsigned long gap_addr; int error = 0; - MA_STATE(mas, &mm->mm_mt, vma->vm_start, address); + VMA_ITERATOR(vmi, mm, vma->vm_start); if (!(vma->vm_flags & VM_GROWSUP)) return -EFAULT; @@ -1997,15 +2052,15 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address) } if (next) - mas_prev_range(&mas, address); + 
vma_iter_prev_range_limit(&vmi, address); - __mas_set_range(&mas, vma->vm_start, address - 1); - if (mas_preallocate(&mas, vma, GFP_KERNEL)) + vma_iter_config(&vmi, vma->vm_start, address); + if (vma_iter_prealloc(&vmi, vma)) return -ENOMEM; /* We must make sure the anon_vma is allocated. */ if (unlikely(anon_vma_prepare(vma))) { - mas_destroy(&mas); + vma_iter_free(&vmi); return -ENOMEM; } @@ -2045,7 +2100,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address) anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; /* Overwrite old entry in mtree. */ - mas_store_prealloc(&mas, vma); + vma_iter_store(&vmi, vma); anon_vma_interval_tree_post_update_vma(vma); spin_unlock(&mm->page_table_lock); @@ -2054,7 +2109,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address) } } anon_vma_unlock_write(vma->anon_vma); - mas_destroy(&mas); + vma_iter_free(&vmi); validate_mm(mm); return error; } @@ -2067,9 +2122,9 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address) int expand_downwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; - MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start); struct vm_area_struct *prev; int error = 0; + VMA_ITERATOR(vmi, mm, vma->vm_start); if (!(vma->vm_flags & VM_GROWSDOWN)) return -EFAULT; @@ -2079,7 +2134,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) return -EPERM; /* Enforce stack_guard_gap */ - prev = mas_prev(&mas, 0); + prev = vma_prev(&vmi); /* Check that both stack segments have the same anon_vma? 
*/ if (prev) { if (!(prev->vm_flags & VM_GROWSDOWN) && @@ -2089,15 +2144,15 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) } if (prev) - mas_next_range(&mas, vma->vm_start); + vma_iter_next_range_limit(&vmi, vma->vm_start); - __mas_set_range(&mas, address, vma->vm_end - 1); - if (mas_preallocate(&mas, vma, GFP_KERNEL)) + vma_iter_config(&vmi, address, vma->vm_end); + if (vma_iter_prealloc(&vmi, vma)) return -ENOMEM; /* We must make sure the anon_vma is allocated. */ if (unlikely(anon_vma_prepare(vma))) { - mas_destroy(&mas); + vma_iter_free(&vmi); return -ENOMEM; } @@ -2138,7 +2193,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) vma->vm_start = address; vma->vm_pgoff -= grow; /* Overwrite old entry in mtree. */ - mas_store_prealloc(&mas, vma); + vma_iter_store(&vmi, vma); anon_vma_interval_tree_post_update_vma(vma); spin_unlock(&mm->page_table_lock); @@ -2147,7 +2202,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) } } anon_vma_unlock_write(vma->anon_vma); - mas_destroy(&mas); + vma_iter_free(&vmi); validate_mm(mm); return error; } @@ -2682,6 +2737,14 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, if (end == start) return -EINVAL; + /* + * Check if memory is sealed before arch_unmap. + * Prevent unmapping a sealed VMA. + * can_modify_mm assumes we have acquired the lock on MM. + */ + if (unlikely(!can_modify_mm(mm, start, end))) + return -EPERM; + /* arch_unmap() might do unmaps itself. 
*/ arch_unmap(mm, start, end); @@ -2744,7 +2807,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr, } /* Unmap any existing mapping in the area */ - if (do_vmi_munmap(&vmi, mm, addr, len, uf, false)) + error = do_vmi_munmap(&vmi, mm, addr, len, uf, false); + if (error == -EPERM) + return error; + else if (error) return -ENOMEM; /* @@ -3094,6 +3160,14 @@ int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; + /* + * Check if memory is sealed before arch_unmap. + * Prevent unmapping a sealed VMA. + * can_modify_mm assumes we have acquired the lock on MM. + */ + if (unlikely(!can_modify_mm(mm, start, end))) + return -EPERM; + arch_unmap(mm, start, end); return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock); } @@ -3242,7 +3316,7 @@ void exit_mmap(struct mm_struct *mm) struct mmu_gather tlb; struct vm_area_struct *vma; unsigned long nr_accounted = 0; - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, mm, 0); int count = 0; /* mm's last user has gone, and its about to be pulled down */ @@ -3251,7 +3325,7 @@ void exit_mmap(struct mm_struct *mm) mmap_read_lock(mm); arch_exit_mmap(mm); - vma = mas_find(&mas, ULONG_MAX); + vma = vma_next(&vmi); if (!vma || unlikely(xa_is_zero(vma))) { /* Can happen if dup_mmap() received an OOM */ mmap_read_unlock(mm); @@ -3264,7 +3338,7 @@ void exit_mmap(struct mm_struct *mm) tlb_gather_mmu_fullmm(&tlb, mm); /* update_hiwater_rss(mm) here? 
but nobody should be looking */ /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */ - unmap_vmas(&tlb, &mas, vma, 0, ULONG_MAX, ULONG_MAX, false); + unmap_vmas(&tlb, &vmi.mas, vma, 0, ULONG_MAX, ULONG_MAX, false); mmap_read_unlock(mm); /* @@ -3274,8 +3348,8 @@ void exit_mmap(struct mm_struct *mm) set_bit(MMF_OOM_SKIP, &mm->flags); mmap_write_lock(mm); mt_clear_in_rcu(&mm->mm_mt); - mas_set(&mas, vma->vm_end); - free_pgtables(&tlb, &mas, vma, FIRST_USER_ADDRESS, + vma_iter_set(&vmi, vma->vm_end); + free_pgtables(&tlb, &vmi.mas, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING, true); tlb_finish_mmu(&tlb); @@ -3284,14 +3358,14 @@ void exit_mmap(struct mm_struct *mm) * enabled, without holding any MM locks besides the unreachable * mmap_write_lock. */ - mas_set(&mas, vma->vm_end); + vma_iter_set(&vmi, vma->vm_end); do { if (vma->vm_flags & VM_ACCOUNT) nr_accounted += vma_pages(vma); remove_vma(vma, true); count++; cond_resched(); - vma = mas_find(&mas, ULONG_MAX); + vma = vma_next(&vmi); } while (vma && likely(!xa_is_zero(vma))); BUG_ON(count != mm->map_count); @@ -3713,7 +3787,7 @@ int mm_take_all_locks(struct mm_struct *mm) { struct vm_area_struct *vma; struct anon_vma_chain *avc; - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, mm, 0); mmap_assert_write_locked(mm); @@ -3725,14 +3799,14 @@ int mm_take_all_locks(struct mm_struct *mm) * being written to until mmap_write_unlock() or mmap_write_downgrade() * is reached. 
*/ - mas_for_each(&mas, vma, ULONG_MAX) { + for_each_vma(vmi, vma) { if (signal_pending(current)) goto out_unlock; vma_start_write(vma); } - mas_set(&mas, 0); - mas_for_each(&mas, vma, ULONG_MAX) { + vma_iter_init(&vmi, mm, 0); + for_each_vma(vmi, vma) { if (signal_pending(current)) goto out_unlock; if (vma->vm_file && vma->vm_file->f_mapping && @@ -3740,8 +3814,8 @@ int mm_take_all_locks(struct mm_struct *mm) vm_lock_mapping(mm, vma->vm_file->f_mapping); } - mas_set(&mas, 0); - mas_for_each(&mas, vma, ULONG_MAX) { + vma_iter_init(&vmi, mm, 0); + for_each_vma(vmi, vma) { if (signal_pending(current)) goto out_unlock; if (vma->vm_file && vma->vm_file->f_mapping && @@ -3749,8 +3823,8 @@ int mm_take_all_locks(struct mm_struct *mm) vm_lock_mapping(mm, vma->vm_file->f_mapping); } - mas_set(&mas, 0); - mas_for_each(&mas, vma, ULONG_MAX) { + vma_iter_init(&vmi, mm, 0); + for_each_vma(vmi, vma) { if (signal_pending(current)) goto out_unlock; if (vma->anon_vma) @@ -3809,12 +3883,12 @@ void mm_drop_all_locks(struct mm_struct *mm) { struct vm_area_struct *vma; struct anon_vma_chain *avc; - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, mm, 0); mmap_assert_write_locked(mm); BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); - mas_for_each(&mas, vma, ULONG_MAX) { + for_each_vma(vmi, vma) { if (vma->anon_vma) list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) vm_unlock_anon_vma(avc->anon_vma); diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index 1854850b4b..368b840e75 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -19,14 +19,7 @@ EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released); #ifdef CONFIG_MEMCG -/* - * Our various events all share the same buffer (because we don't want or need - * to allocate a set of buffers *per event type*), so we need to protect against - * concurrent _reg() and _unreg() calls, and count how many _reg() calls have - * been made. - */ -static DEFINE_MUTEX(reg_lock); -static int reg_refcount; /* Protected by reg_lock. 
*/ +static atomic_t reg_refcount; /* * Size of the buffer for memcg path names. Ignoring stack trace support, @@ -34,136 +27,22 @@ static int reg_refcount; /* Protected by reg_lock. */ */ #define MEMCG_PATH_BUF_SIZE MAX_FILTER_STR_VAL -/* - * How many contexts our trace events might be called in: normal, softirq, irq, - * and NMI. - */ -#define CONTEXT_COUNT 4 - -struct memcg_path { - local_lock_t lock; - char __rcu *buf; - local_t buf_idx; -}; -static DEFINE_PER_CPU(struct memcg_path, memcg_paths) = { - .lock = INIT_LOCAL_LOCK(lock), - .buf_idx = LOCAL_INIT(0), -}; - -static char **tmp_bufs; - -/* Called with reg_lock held. */ -static void free_memcg_path_bufs(void) -{ - struct memcg_path *memcg_path; - int cpu; - char **old = tmp_bufs; - - for_each_possible_cpu(cpu) { - memcg_path = per_cpu_ptr(&memcg_paths, cpu); - *(old++) = rcu_dereference_protected(memcg_path->buf, - lockdep_is_held(®_lock)); - rcu_assign_pointer(memcg_path->buf, NULL); - } - - /* Wait for inflight memcg_path_buf users to finish. */ - synchronize_rcu(); - - old = tmp_bufs; - for_each_possible_cpu(cpu) { - kfree(*(old++)); - } - - kfree(tmp_bufs); - tmp_bufs = NULL; -} - int trace_mmap_lock_reg(void) { - int cpu; - char *new; - - mutex_lock(®_lock); - - /* If the refcount is going 0->1, proceed with allocating buffers. */ - if (reg_refcount++) - goto out; - - tmp_bufs = kmalloc_array(num_possible_cpus(), sizeof(*tmp_bufs), - GFP_KERNEL); - if (tmp_bufs == NULL) - goto out_fail; - - for_each_possible_cpu(cpu) { - new = kmalloc(MEMCG_PATH_BUF_SIZE * CONTEXT_COUNT, GFP_KERNEL); - if (new == NULL) - goto out_fail_free; - rcu_assign_pointer(per_cpu_ptr(&memcg_paths, cpu)->buf, new); - /* Don't need to wait for inflights, they'd have gotten NULL. */ - } - -out: - mutex_unlock(®_lock); + atomic_inc(®_refcount); return 0; - -out_fail_free: - free_memcg_path_bufs(); -out_fail: - /* Since we failed, undo the earlier ref increment. 
*/ - --reg_refcount; - - mutex_unlock(®_lock); - return -ENOMEM; } void trace_mmap_lock_unreg(void) { - mutex_lock(®_lock); - - /* If the refcount is going 1->0, proceed with freeing buffers. */ - if (--reg_refcount) - goto out; - - free_memcg_path_bufs(); - -out: - mutex_unlock(®_lock); -} - -static inline char *get_memcg_path_buf(void) -{ - struct memcg_path *memcg_path = this_cpu_ptr(&memcg_paths); - char *buf; - int idx; - - rcu_read_lock(); - buf = rcu_dereference(memcg_path->buf); - if (buf == NULL) { - rcu_read_unlock(); - return NULL; - } - idx = local_add_return(MEMCG_PATH_BUF_SIZE, &memcg_path->buf_idx) - - MEMCG_PATH_BUF_SIZE; - return &buf[idx]; + atomic_dec(®_refcount); } -static inline void put_memcg_path_buf(void) -{ - local_sub(MEMCG_PATH_BUF_SIZE, &this_cpu_ptr(&memcg_paths)->buf_idx); - rcu_read_unlock(); -} - -#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ - do { \ - const char *memcg_path; \ - local_lock(&memcg_paths.lock); \ - memcg_path = get_mm_memcg_path(mm); \ - trace_mmap_lock_##type(mm, \ - memcg_path != NULL ? memcg_path : "", \ - ##__VA_ARGS__); \ - if (likely(memcg_path != NULL)) \ - put_memcg_path_buf(); \ - local_unlock(&memcg_paths.lock); \ +#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ + do { \ + char buf[MEMCG_PATH_BUF_SIZE]; \ + get_mm_memcg_path(mm, buf, sizeof(buf)); \ + trace_mmap_lock_##type(mm, buf, ##__VA_ARGS__); \ } while (0) #else /* !CONFIG_MEMCG */ @@ -185,37 +64,23 @@ void trace_mmap_lock_unreg(void) #ifdef CONFIG_TRACING #ifdef CONFIG_MEMCG /* - * Write the given mm_struct's memcg path to a percpu buffer, and return a - * pointer to it. If the path cannot be determined, or no buffer was available - * (because the trace event is being unregistered), NULL is returned. - * - * Note: buffers are allocated per-cpu to avoid locking, so preemption must be - * disabled by the caller before calling us, and re-enabled only after the - * caller is done with the pointer. 
- * - * The caller must call put_memcg_path_buf() once the buffer is no longer - * needed. This must be done while preemption is still disabled. + * Write the given mm_struct's memcg path to a buffer. If the path cannot be + * determined or the trace event is being unregistered, empty string is written. */ -static const char *get_mm_memcg_path(struct mm_struct *mm) +static void get_mm_memcg_path(struct mm_struct *mm, char *buf, size_t buflen) { - char *buf = NULL; - struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm); + struct mem_cgroup *memcg; + buf[0] = '\0'; + /* No need to get path if no trace event is registered. */ + if (!atomic_read(®_refcount)) + return; + memcg = get_mem_cgroup_from_mm(mm); if (memcg == NULL) - goto out; - if (unlikely(memcg->css.cgroup == NULL)) - goto out_put; - - buf = get_memcg_path_buf(); - if (buf == NULL) - goto out_put; - - cgroup_path(memcg->css.cgroup, buf, MEMCG_PATH_BUF_SIZE); - -out_put: + return; + if (memcg->css.cgroup) + cgroup_path(memcg->css.cgroup, buf, buflen); css_put(&memcg->css); -out: - return buf; } #endif /* CONFIG_MEMCG */ diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index ec3b068cbb..8982e6139d 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -424,23 +424,6 @@ int __mmu_notifier_test_young(struct mm_struct *mm, return young; } -void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, - pte_t pte) -{ - struct mmu_notifier *subscription; - int id; - - id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(subscription, - &mm->notifier_subscriptions->list, hlist, - srcu_read_lock_held(&srcu)) { - if (subscription->ops->change_pte) - subscription->ops->change_pte(subscription, mm, address, - pte); - } - srcu_read_unlock(&srcu, id); -} - static int mn_itree_invalidate(struct mmu_notifier_subscriptions *subscriptions, const struct mmu_notifier_range *range) { diff --git a/mm/mprotect.c b/mm/mprotect.c index f8a4544b46..8c6cd88252 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ 
-32,6 +32,7 @@ #include <linux/sched/sysctl.h> #include <linux/userfaultfd_k.h> #include <linux/memory-tiers.h> +#include <uapi/linux/mman.h> #include <asm/cacheflush.h> #include <asm/mmu_context.h> #include <asm/tlbflush.h> @@ -129,7 +130,8 @@ static long change_pte_range(struct mmu_gather *tlb, /* Also skip shared copy-on-write pages */ if (is_cow_mapping(vma->vm_flags) && - folio_ref_count(folio) != 1) + (folio_maybe_dma_pinned(folio) || + folio_likely_mapped_shared(folio))) continue; /* @@ -743,6 +745,15 @@ static int do_mprotect_pkey(unsigned long start, size_t len, } } + /* + * checking if memory is sealed. + * can_modify_mm assumes we have acquired the lock on MM. + */ + if (unlikely(!can_modify_mm(current->mm, start, end))) { + error = -EPERM; + goto out; + } + prev = vma_prev(&vmi); if (start > vma->vm_start) prev = vma; diff --git a/mm/mremap.c b/mm/mremap.c index 38d98465f3..5f96bc5ee9 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -205,7 +205,7 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, */ if (pte_present(pte)) force_flush = true; - pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); + pte = move_pte(pte, old_addr, new_addr); pte = move_soft_dirty_pte(pte); set_pte_at(mm, new_addr, new_pte, pte); } @@ -902,7 +902,25 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, if ((mm->map_count + 2) >= sysctl_max_map_count - 3) return -ENOMEM; + /* + * In mremap_to(). + * Move a VMA to another location, check if src addr is sealed. + * + * Place can_modify_mm here because mremap_to() + * does its own checking for address range, and we only + * check the sealing after passing those checks. + * + * can_modify_mm assumes we have acquired the lock on MM. + */ + if (unlikely(!can_modify_mm(mm, addr, addr + old_len))) + return -EPERM; + if (flags & MREMAP_FIXED) { + /* + * In mremap_to(). + * VMA is moved to dst address, and munmap dst first. + * do_munmap will check if dst is sealed. 
+ */ ret = do_munmap(mm, new_addr, new_len, uf_unmap_early); if (ret) goto out; @@ -1062,6 +1080,19 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, } /* + * Below is shrink/expand case (not mremap_to()) + * Check if src address is sealed, if so, reject. + * In other words, prevent shrinking or expanding a sealed VMA. + * + * Place can_modify_mm here so we can keep the logic related to + * shrink/expand together. + */ + if (unlikely(!can_modify_mm(mm, addr, addr + old_len))) { + ret = -EPERM; + goto out; + } + + /* * Always allow a shrinking remap: that just unmaps * the unnecessary pages.. * do_vmi_munmap does all the needed commit accounting, and diff --git a/mm/mseal.c b/mm/mseal.c new file mode 100644 index 0000000000..bf783bba8e --- /dev/null +++ b/mm/mseal.c @@ -0,0 +1,307 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Implement mseal() syscall. + * + * Copyright (c) 2023,2024 Google, Inc. + * + * Author: Jeff Xu <jeffxu@chromium.org> + */ + +#include <linux/mempolicy.h> +#include <linux/mman.h> +#include <linux/mm.h> +#include <linux/mm_inline.h> +#include <linux/mmu_context.h> +#include <linux/syscalls.h> +#include <linux/sched.h> +#include "internal.h" + +static inline bool vma_is_sealed(struct vm_area_struct *vma) +{ + return (vma->vm_flags & VM_SEALED); +} + +static inline void set_vma_sealed(struct vm_area_struct *vma) +{ + vm_flags_set(vma, VM_SEALED); +} + +/* + * check if a vma is sealed for modification. + * return true, if modification is allowed. + */ +static bool can_modify_vma(struct vm_area_struct *vma) +{ + if (unlikely(vma_is_sealed(vma))) + return false; + + return true; +} + +static bool is_madv_discard(int behavior) +{ + return behavior & + (MADV_FREE | MADV_DONTNEED | MADV_DONTNEED_LOCKED | + MADV_REMOVE | MADV_DONTFORK | MADV_WIPEONFORK); +} + +static bool is_ro_anon(struct vm_area_struct *vma) +{ + /* check anonymous mapping. 
*/ + if (vma->vm_file || vma->vm_flags & VM_SHARED) + return false; + + /* + * check for non-writable: + * PROT=RO or PKRU is not writeable. + */ + if (!(vma->vm_flags & VM_WRITE) || + !arch_vma_access_permitted(vma, true, false, false)) + return true; + + return false; +} + +/* + * Check if the vmas of a memory range are allowed to be modified. + * the memory ranger can have a gap (unallocated memory). + * return true, if it is allowed. + */ +bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end) +{ + struct vm_area_struct *vma; + + VMA_ITERATOR(vmi, mm, start); + + /* going through each vma to check. */ + for_each_vma_range(vmi, vma, end) { + if (unlikely(!can_modify_vma(vma))) + return false; + } + + /* Allow by default. */ + return true; +} + +/* + * Check if the vmas of a memory range are allowed to be modified by madvise. + * the memory ranger can have a gap (unallocated memory). + * return true, if it is allowed. + */ +bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long end, + int behavior) +{ + struct vm_area_struct *vma; + + VMA_ITERATOR(vmi, mm, start); + + if (!is_madv_discard(behavior)) + return true; + + /* going through each vma to check. */ + for_each_vma_range(vmi, vma, end) + if (unlikely(is_ro_anon(vma) && !can_modify_vma(vma))) + return false; + + /* Allow by default. */ + return true; +} + +static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, + unsigned long end, vm_flags_t newflags) +{ + int ret = 0; + vm_flags_t oldflags = vma->vm_flags; + + if (newflags == oldflags) + goto out; + + vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto out; + } + + set_vma_sealed(vma); +out: + *prev = vma; + return ret; +} + +/* + * Check for do_mseal: + * 1> start is part of a valid vma. + * 2> end is part of a valid vma. 
+ * 3> No gap (unallocated address) between start and end. + * 4> map is sealable. + */ +static int check_mm_seal(unsigned long start, unsigned long end) +{ + struct vm_area_struct *vma; + unsigned long nstart = start; + + VMA_ITERATOR(vmi, current->mm, start); + + /* going through each vma to check. */ + for_each_vma_range(vmi, vma, end) { + if (vma->vm_start > nstart) + /* unallocated memory found. */ + return -ENOMEM; + + if (vma->vm_end >= end) + return 0; + + nstart = vma->vm_end; + } + + return -ENOMEM; +} + +/* + * Apply sealing. + */ +static int apply_mm_seal(unsigned long start, unsigned long end) +{ + unsigned long nstart; + struct vm_area_struct *vma, *prev; + + VMA_ITERATOR(vmi, current->mm, start); + + vma = vma_iter_load(&vmi); + /* + * Note: check_mm_seal should already checked ENOMEM case. + * so vma should not be null, same for the other ENOMEM cases. + */ + prev = vma_prev(&vmi); + if (start > vma->vm_start) + prev = vma; + + nstart = start; + for_each_vma_range(vmi, vma, end) { + int error; + unsigned long tmp; + vm_flags_t newflags; + + newflags = vma->vm_flags | VM_SEALED; + tmp = vma->vm_end; + if (tmp > end) + tmp = end; + error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags); + if (error) + return error; + nstart = vma_iter_end(&vmi); + } + + return 0; +} + +/* + * mseal(2) seals the VM's meta data from + * selected syscalls. + * + * addr/len: VM address range. + * + * The address range by addr/len must meet: + * start (addr) must be in a valid VMA. + * end (addr + len) must be in a valid VMA. + * no gap (unallocated memory) between start and end. + * start (addr) must be page aligned. + * + * len: len will be page aligned implicitly. + * + * Below VMA operations are blocked after sealing. + * 1> Unmapping, moving to another location, and shrinking + * the size, via munmap() and mremap(), can leave an empty + * space, therefore can be replaced with a VMA with a new + * set of attributes. 
+ * 2> Moving or expanding a different vma into the current location, + * via mremap(). + * 3> Modifying a VMA via mmap(MAP_FIXED). + * 4> Size expansion, via mremap(), does not appear to pose any + * specific risks to sealed VMAs. It is included anyway because + * the use case is unclear. In any case, users can rely on + * merging to expand a sealed VMA. + * 5> mprotect and pkey_mprotect. + * 6> Some destructive madvice() behavior (e.g. MADV_DONTNEED) + * for anonymous memory, when users don't have write permission to the + * memory. Those behaviors can alter region contents by discarding pages, + * effectively a memset(0) for anonymous memory. + * + * flags: reserved. + * + * return values: + * zero: success. + * -EINVAL: + * invalid input flags. + * start address is not page aligned. + * Address arange (start + len) overflow. + * -ENOMEM: + * addr is not a valid address (not allocated). + * end (start + len) is not a valid address. + * a gap (unallocated memory) between start and end. + * -EPERM: + * - In 32 bit architecture, sealing is not supported. + * Note: + * user can call mseal(2) multiple times, adding a seal on an + * already sealed memory is a no-action (no error). + * + * unseal() is not supported. + */ +static int do_mseal(unsigned long start, size_t len_in, unsigned long flags) +{ + size_t len; + int ret = 0; + unsigned long end; + struct mm_struct *mm = current->mm; + + ret = can_do_mseal(flags); + if (ret) + return ret; + + start = untagged_addr(start); + if (!PAGE_ALIGNED(start)) + return -EINVAL; + + len = PAGE_ALIGN(len_in); + /* Check to see whether len was rounded up from small -ve to zero. */ + if (len_in && !len) + return -EINVAL; + + end = start + len; + if (end < start) + return -EINVAL; + + if (end == start) + return 0; + + if (mmap_write_lock_killable(mm)) + return -EINTR; + + /* + * First pass, this helps to avoid + * partial sealing in case of error in input address range, + * e.g. ENOMEM error. 
+ */ + ret = check_mm_seal(start, end); + if (ret) + goto out; + + /* + * Second pass, this should success, unless there are errors + * from vma_modify_flags, e.g. merge/split error, or process + * reaching the max supported VMAs, however, those cases shall + * be rare. + */ + ret = apply_mm_seal(start, end); + +out: + mmap_write_unlock(current->mm); + return ret; +} + +SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long, + flags) +{ + return do_mseal(start, len, flags); +} diff --git a/mm/nommu.c b/mm/nommu.c index 5ec8f44e7c..7296e775e0 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -110,55 +110,34 @@ unsigned int kobjsize(const void *objp) return page_size(page); } -/** - * follow_pfn - look up PFN at a user virtual address - * @vma: memory mapping - * @address: user virtual address - * @pfn: location to store found PFN - * - * Only IO mappings and raw PFN mappings are allowed. - * - * Returns zero and the pfn at @pfn on success, -ve otherwise. - */ -int follow_pfn(struct vm_area_struct *vma, unsigned long address, - unsigned long *pfn) -{ - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) - return -EINVAL; - - *pfn = address >> PAGE_SHIFT; - return 0; -} -EXPORT_SYMBOL(follow_pfn); - void vfree(const void *addr) { kfree(addr); } EXPORT_SYMBOL(vfree); -void *__vmalloc(unsigned long size, gfp_t gfp_mask) +void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask) { /* * You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc() * returns only a logical address. 
*/ - return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM); + return kmalloc_noprof(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM); } -EXPORT_SYMBOL(__vmalloc); +EXPORT_SYMBOL(__vmalloc_noprof); -void *__vmalloc_node_range(unsigned long size, unsigned long align, +void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller) { - return __vmalloc(size, gfp_mask); + return __vmalloc_noprof(size, gfp_mask); } -void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, +void *__vmalloc_node_noprof(unsigned long size, unsigned long align, gfp_t gfp_mask, int node, const void *caller) { - return __vmalloc(size, gfp_mask); + return __vmalloc_noprof(size, gfp_mask); } static void *__vmalloc_user_flags(unsigned long size, gfp_t flags) @@ -179,11 +158,11 @@ static void *__vmalloc_user_flags(unsigned long size, gfp_t flags) return ret; } -void *vmalloc_user(unsigned long size) +void *vmalloc_user_noprof(unsigned long size) { return __vmalloc_user_flags(size, GFP_KERNEL | __GFP_ZERO); } -EXPORT_SYMBOL(vmalloc_user); +EXPORT_SYMBOL(vmalloc_user_noprof); struct page *vmalloc_to_page(const void *addr) { @@ -217,13 +196,13 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) * For tight control over page level allocator and protection flags * use __vmalloc() instead. 
*/ -void *vmalloc(unsigned long size) +void *vmalloc_noprof(unsigned long size) { - return __vmalloc(size, GFP_KERNEL); + return __vmalloc_noprof(size, GFP_KERNEL); } -EXPORT_SYMBOL(vmalloc); +EXPORT_SYMBOL(vmalloc_noprof); -void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc); +void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc_noprof); /* * vzalloc - allocate virtually contiguous memory with zero fill @@ -237,11 +216,11 @@ void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc) * For tight control over page level allocator and protection flags * use __vmalloc() instead. */ -void *vzalloc(unsigned long size) +void *vzalloc_noprof(unsigned long size) { - return __vmalloc(size, GFP_KERNEL | __GFP_ZERO); + return __vmalloc_noprof(size, GFP_KERNEL | __GFP_ZERO); } -EXPORT_SYMBOL(vzalloc); +EXPORT_SYMBOL(vzalloc_noprof); /** * vmalloc_node - allocate memory on a specific node @@ -254,11 +233,11 @@ EXPORT_SYMBOL(vzalloc); * For tight control over page level allocator and protection flags * use __vmalloc() instead. */ -void *vmalloc_node(unsigned long size, int node) +void *vmalloc_node_noprof(unsigned long size, int node) { - return vmalloc(size); + return vmalloc_noprof(size); } -EXPORT_SYMBOL(vmalloc_node); +EXPORT_SYMBOL(vmalloc_node_noprof); /** * vzalloc_node - allocate memory on a specific node with zero fill @@ -272,11 +251,11 @@ EXPORT_SYMBOL(vmalloc_node); * For tight control over page level allocator and protection flags * use __vmalloc() instead. 
*/ -void *vzalloc_node(unsigned long size, int node) +void *vzalloc_node_noprof(unsigned long size, int node) { - return vzalloc(size); + return vzalloc_noprof(size); } -EXPORT_SYMBOL(vzalloc_node); +EXPORT_SYMBOL(vzalloc_node_noprof); /** * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) @@ -285,11 +264,11 @@ EXPORT_SYMBOL(vzalloc_node); * Allocate enough 32bit PA addressable pages to cover @size from the * page level allocator and map them into contiguous kernel virtual space. */ -void *vmalloc_32(unsigned long size) +void *vmalloc_32_noprof(unsigned long size) { - return __vmalloc(size, GFP_KERNEL); + return __vmalloc_noprof(size, GFP_KERNEL); } -EXPORT_SYMBOL(vmalloc_32); +EXPORT_SYMBOL(vmalloc_32_noprof); /** * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory @@ -301,15 +280,15 @@ EXPORT_SYMBOL(vmalloc_32); * VM_USERMAP is set on the corresponding VMA so that subsequent calls to * remap_vmalloc_range() are permissible. */ -void *vmalloc_32_user(unsigned long size) +void *vmalloc_32_user_noprof(unsigned long size) { /* * We'll have to sort out the ZONE_DMA bits for 64-bit, * but for now this can simply use vmalloc_user() directly. 
*/ - return vmalloc_user(size); + return vmalloc_user_noprof(size); } -EXPORT_SYMBOL(vmalloc_32_user); +EXPORT_SYMBOL(vmalloc_32_user_noprof); void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot) { @@ -355,6 +334,13 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vm_insert_page); +int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, + struct page **pages, unsigned long *num) +{ + return -EINVAL; +} +EXPORT_SYMBOL(vm_insert_pages); + int vm_map_pages(struct vm_area_struct *vma, struct page **pages, unsigned long num) { diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 8d6a207c3c..4d7a0004df 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -724,7 +724,6 @@ static struct ctl_table vm_oom_kill_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - {} }; #endif diff --git a/mm/page-writeback.c b/mm/page-writeback.c index c9af72f292..8a1c920901 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -860,13 +860,15 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc, } /** - * __wb_calc_thresh - @wb's share of dirty throttling threshold + * __wb_calc_thresh - @wb's share of dirty threshold * @dtc: dirty_throttle_context of interest + * @thresh: dirty throttling or dirty background threshold of wb_domain in @dtc * - * Note that balance_dirty_pages() will only seriously take it as a hard limit - * when sleeping max_pause per page is not enough to keep the dirty pages under - * control. For example, when the device is completely stalled due to some error - * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key. + * Note that balance_dirty_pages() will only seriously take dirty throttling + * threshold as a hard limit when sleeping max_pause per page is not enough + * to keep the dirty pages under control. 
For example, when the device is + * completely stalled due to some error conditions, or when there are 1000 + * dd tasks writing to a slow 10MB/s USB key. * In the other normal situations, it acts more gently by throttling the tasks * more (rather than completely block them) when the wb dirty pages go high. * @@ -877,19 +879,20 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc, * The wb's share of dirty limit will be adapting to its throughput and * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. * - * Return: @wb's dirty limit in pages. The term "dirty" in the context of - * dirty balancing includes all PG_dirty and PG_writeback pages. + * Return: @wb's dirty limit in pages. For dirty throttling limit, the term + * "dirty" in the context of dirty balancing includes all PG_dirty and + * PG_writeback pages. */ -static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) +static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc, + unsigned long thresh) { struct wb_domain *dom = dtc_dom(dtc); - unsigned long thresh = dtc->thresh; u64 wb_thresh; unsigned long numerator, denominator; unsigned long wb_min_ratio, wb_max_ratio; /* - * Calculate this BDI's share of the thresh ratio. + * Calculate this wb's share of the thresh ratio. 
*/ fprop_fraction_percpu(&dom->completions, dtc->wb_completions, &numerator, &denominator); @@ -909,9 +912,28 @@ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh) { - struct dirty_throttle_control gdtc = { GDTC_INIT(wb), - .thresh = thresh }; - return __wb_calc_thresh(&gdtc); + struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; + + return __wb_calc_thresh(&gdtc, thresh); +} + +unsigned long cgwb_calc_thresh(struct bdi_writeback *wb) +{ + struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB }; + struct dirty_throttle_control mdtc = { MDTC_INIT(wb, &gdtc) }; + unsigned long filepages = 0, headroom = 0, writeback = 0; + + gdtc.avail = global_dirtyable_memory(); + gdtc.dirty = global_node_page_state(NR_FILE_DIRTY) + + global_node_page_state(NR_WRITEBACK); + + mem_cgroup_wb_stats(wb, &filepages, &headroom, + &mdtc.dirty, &writeback); + mdtc.dirty += writeback; + mdtc_calc_avail(&mdtc, filepages, headroom); + domain_dirty_limits(&mdtc); + + return __wb_calc_thresh(&mdtc, mdtc.thresh); } /* @@ -1658,7 +1680,7 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) * wb_position_ratio() will let the dirtier task progress * at some rate <= (write_bw / 2) for bringing down wb_dirty. */ - dtc->wb_thresh = __wb_calc_thresh(dtc); + dtc->wb_thresh = __wb_calc_thresh(dtc, dtc->thresh); dtc->wb_bg_thresh = dtc->thresh ? div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0; @@ -1697,7 +1719,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb, struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? 
&mdtc_stor : NULL; struct dirty_throttle_control *sdtc; - unsigned long nr_reclaimable; /* = file_dirty */ + unsigned long nr_dirty; long period; long pause; long max_pause; @@ -1718,9 +1740,9 @@ static int balance_dirty_pages(struct bdi_writeback *wb, unsigned long m_thresh = 0; unsigned long m_bg_thresh = 0; - nr_reclaimable = global_node_page_state(NR_FILE_DIRTY); + nr_dirty = global_node_page_state(NR_FILE_DIRTY); gdtc->avail = global_dirtyable_memory(); - gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK); + gdtc->dirty = nr_dirty + global_node_page_state(NR_WRITEBACK); domain_dirty_limits(gdtc); @@ -1771,7 +1793,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb, * In normal mode, we start background writeout at the lower * background_thresh, to keep the amount of dirty memory low. */ - if (!laptop_mode && nr_reclaimable > gdtc->bg_thresh && + if (!laptop_mode && nr_dirty > gdtc->bg_thresh && !writeback_in_progress(wb)) wb_start_background_writeback(wb); @@ -2117,7 +2139,7 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) if (gdtc->dirty > gdtc->bg_thresh) return true; - thresh = wb_calc_thresh(gdtc->wb, gdtc->bg_thresh); + thresh = __wb_calc_thresh(gdtc, gdtc->bg_thresh); if (thresh < 2 * wb_stat_error()) reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); else @@ -2137,7 +2159,7 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) if (mdtc->dirty > mdtc->bg_thresh) return true; - thresh = wb_calc_thresh(mdtc->wb, mdtc->bg_thresh); + thresh = __wb_calc_thresh(mdtc, mdtc->bg_thresh); if (thresh < 2 * wb_stat_error()) reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); else @@ -2313,7 +2335,6 @@ static struct ctl_table vm_page_writeback_sysctls[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - {} }; #endif @@ -2568,6 +2589,7 @@ done: folio_batch_release(&wbc->fbatch); return NULL; } +EXPORT_SYMBOL_GPL(writeback_iter); /** * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. 
@@ -2722,17 +2744,20 @@ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb) } /* - * Mark the folio dirty, and set it dirty in the page cache, and mark - * the inode dirty. + * Mark the folio dirty, and set it dirty in the page cache. * * If warn is true, then emit a warning if the folio is not uptodate and has * not been truncated. * - * The caller must hold folio_memcg_lock(). Most callers have the folio - * locked. A few have the folio blocked from truncation through other - * means (eg zap_vma_pages() has it mapped and is holding the page table - * lock). This can also be called from mark_buffer_dirty(), which I - * cannot prove is always protected against truncate. + * The caller must hold folio_memcg_lock(). It is the caller's + * responsibility to prevent the folio from being truncated while + * this function is in progress, although it may have been truncated + * before this function is called. Most callers have the folio locked. + * A few have the folio blocked from truncation through other means (e.g. + * zap_vma_pages() has it mapped and is holding the page table lock). + * When called from mark_buffer_dirty(), the filesystem should hold a + * reference to the buffer_head that is being marked dirty, which causes + * try_to_free_buffers() to fail. */ void __folio_mark_dirty(struct folio *folio, struct address_space *mapping, int warn) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 00fafda76b..df2c442f1c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -54,6 +54,7 @@ #include <linux/khugepaged.h> #include <linux/delayacct.h> #include <linux/cacheinfo.h> +#include <linux/pgalloc_tag.h> #include <asm/div64.h> #include "internal.h" #include "shuffle.h" @@ -206,24 +207,6 @@ EXPORT_SYMBOL(node_states); gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; -/* - * A cached value of the page's pageblock's migratetype, used when the page is - * put on a pcplist. 
Used to avoid the pageblock migratetype lookup when - * freeing from pcplists in most cases, at the cost of possibly becoming stale. - * Also the migratetype set in the page does not necessarily match the pcplist - * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any - * other index - this ensures that it will be put on the correct CMA freelist. - */ -static inline int get_pcppage_migratetype(struct page *page) -{ - return page->index; -} - -static inline void set_pcppage_migratetype(struct page *page, int migratetype) -{ - page->index = migratetype; -} - #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE unsigned int pageblock_order __read_mostly; #endif @@ -332,7 +315,7 @@ static inline bool deferred_pages_enabled(void) static bool __ref _deferred_grow_zone(struct zone *zone, unsigned int order) { - return deferred_grow_zone(zone, order); + return deferred_grow_zone(zone, order); } #else static inline bool deferred_pages_enabled(void) @@ -525,7 +508,7 @@ static inline unsigned int order_to_pindex(int migratetype, int order) #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (order > PAGE_ALLOC_COSTLY_ORDER) { - VM_BUG_ON(order != pageblock_order); + VM_BUG_ON(order != HPAGE_PMD_ORDER); movable = migratetype == MIGRATE_MOVABLE; @@ -544,7 +527,7 @@ static inline int pindex_to_order(unsigned int pindex) #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (pindex >= NR_LOWORDER_PCP_LISTS) - order = pageblock_order; + order = HPAGE_PMD_ORDER; #else VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); #endif @@ -557,20 +540,12 @@ static inline bool pcp_allowed_order(unsigned int order) if (order <= PAGE_ALLOC_COSTLY_ORDER) return true; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (order == pageblock_order) + if (order == HPAGE_PMD_ORDER) return true; #endif return false; } -static inline void free_the_page(struct page *page, unsigned int order) -{ - if (pcp_allowed_order(order)) /* Via pcp? 
*/ - free_unref_page(page, order); - else - __free_pages_ok(page, order, FPI_NONE); -} - /* * Higher-order pages are called "compound pages". They are structured thusly: * @@ -595,20 +570,6 @@ void prep_compound_page(struct page *page, unsigned int order) prep_compound_head(page, order); } -void destroy_large_folio(struct folio *folio) -{ - if (folio_test_hugetlb(folio)) { - free_huge_folio(folio); - return; - } - - if (folio_test_large_rmappable(folio)) - folio_undo_large_rmappable(folio); - - mem_cgroup_uncharge(folio); - free_the_page(&folio->page, folio_order(folio)); -} - static inline void set_buddy_order(struct page *page, unsigned int order) { set_page_private(page, order); @@ -639,12 +600,14 @@ compaction_capture(struct capture_control *capc, struct page *page, return false; /* - * Do not let lower order allocations pollute a movable pageblock. + * Do not let lower order allocations pollute a movable pageblock + * unless compaction is also requesting movable pages. * This might let an unmovable request use a reclaimable pageblock * and vice-versa but no more than normal fallback logic which can * have trouble finding a high-order free page. 
*/ - if (order < pageblock_order && migratetype == MIGRATE_MOVABLE) + if (order < pageblock_order && migratetype == MIGRATE_MOVABLE && + capc->cc->migratetype != MIGRATE_MOVABLE) return false; capc->page = page; @@ -665,23 +628,33 @@ compaction_capture(struct capture_control *capc, struct page *page, } #endif /* CONFIG_COMPACTION */ -/* Used for pages not on another list */ -static inline void add_to_free_list(struct page *page, struct zone *zone, - unsigned int order, int migratetype) +static inline void account_freepages(struct zone *zone, int nr_pages, + int migratetype) { - struct free_area *area = &zone->free_area[order]; + if (is_migrate_isolate(migratetype)) + return; - list_add(&page->buddy_list, &area->free_list[migratetype]); - area->nr_free++; + __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages); + + if (is_migrate_cma(migratetype)) + __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); } /* Used for pages not on another list */ -static inline void add_to_free_list_tail(struct page *page, struct zone *zone, - unsigned int order, int migratetype) +static inline void __add_to_free_list(struct page *page, struct zone *zone, + unsigned int order, int migratetype, + bool tail) { struct free_area *area = &zone->free_area[order]; - list_add_tail(&page->buddy_list, &area->free_list[migratetype]); + VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype, + "page type is %lu, passed migratetype is %d (nr=%d)\n", + get_pageblock_migratetype(page), migratetype, 1 << order); + + if (tail) + list_add_tail(&page->buddy_list, &area->free_list[migratetype]); + else + list_add(&page->buddy_list, &area->free_list[migratetype]); area->nr_free++; } @@ -691,16 +664,28 @@ static inline void add_to_free_list_tail(struct page *page, struct zone *zone, * allocation again (e.g., optimization for memory onlining). 
*/ static inline void move_to_free_list(struct page *page, struct zone *zone, - unsigned int order, int migratetype) + unsigned int order, int old_mt, int new_mt) { struct free_area *area = &zone->free_area[order]; - list_move_tail(&page->buddy_list, &area->free_list[migratetype]); + /* Free page moving can fail, so it happens before the type update */ + VM_WARN_ONCE(get_pageblock_migratetype(page) != old_mt, + "page type is %lu, passed migratetype is %d (nr=%d)\n", + get_pageblock_migratetype(page), old_mt, 1 << order); + + list_move_tail(&page->buddy_list, &area->free_list[new_mt]); + + account_freepages(zone, -(1 << order), old_mt); + account_freepages(zone, 1 << order, new_mt); } -static inline void del_page_from_free_list(struct page *page, struct zone *zone, - unsigned int order) +static inline void __del_page_from_free_list(struct page *page, struct zone *zone, + unsigned int order, int migratetype) { + VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype, + "page type is %lu, passed migratetype is %d (nr=%d)\n", + get_pageblock_migratetype(page), migratetype, 1 << order); + /* clear reported state and update reported page count */ if (page_reported(page)) __ClearPageReported(page); @@ -711,6 +696,13 @@ static inline void del_page_from_free_list(struct page *page, struct zone *zone, zone->free_area[order].nr_free--; } +static inline void del_page_from_free_list(struct page *page, struct zone *zone, + unsigned int order, int migratetype) +{ + __del_page_from_free_list(page, zone, order, migratetype); + account_freepages(zone, -(1 << order), migratetype); +} + static inline struct page *get_page_from_free_area(struct free_area *area, int migratetype) { @@ -782,16 +774,16 @@ static inline void __free_one_page(struct page *page, VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); VM_BUG_ON(migratetype == -1); - if (likely(!is_migrate_isolate(migratetype))) - __mod_zone_freepage_state(zone, 1 << order, migratetype); - VM_BUG_ON_PAGE(pfn & ((1 << 
order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); + account_freepages(zone, 1 << order, migratetype); + while (order < MAX_PAGE_ORDER) { + int buddy_mt = migratetype; + if (compaction_capture(capc, page, order, migratetype)) { - __mod_zone_freepage_state(zone, -(1 << order), - migratetype); + account_freepages(zone, -(1 << order), migratetype); return; } @@ -806,11 +798,11 @@ static inline void __free_one_page(struct page *page, * pageblock isolation could cause incorrect freepage or CMA * accounting or HIGHATOMIC accounting. */ - int buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn); + buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn); - if (migratetype != buddy_mt - && (!migratetype_is_mergeable(migratetype) || - !migratetype_is_mergeable(buddy_mt))) + if (migratetype != buddy_mt && + (!migratetype_is_mergeable(migratetype) || + !migratetype_is_mergeable(buddy_mt))) goto done_merging; } @@ -819,9 +811,19 @@ static inline void __free_one_page(struct page *page, * merge with it and move up one order. */ if (page_is_guard(buddy)) - clear_page_guard(zone, buddy, order, migratetype); + clear_page_guard(zone, buddy, order); else - del_page_from_free_list(buddy, zone, order); + __del_page_from_free_list(buddy, zone, order, buddy_mt); + + if (unlikely(buddy_mt != migratetype)) { + /* + * Match buddy type. This ensures that an + * expand() down the line puts the sub-blocks + * on the right freelists. 
+ */ + set_pageblock_migratetype(buddy, migratetype); + } + combined_pfn = buddy_pfn & pfn; page = page + (combined_pfn - pfn); pfn = combined_pfn; @@ -838,74 +840,13 @@ done_merging: else to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order); - if (to_tail) - add_to_free_list_tail(page, zone, order, migratetype); - else - add_to_free_list(page, zone, order, migratetype); + __add_to_free_list(page, zone, order, migratetype, to_tail); /* Notify page reporting subsystem of freed page */ if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY)) page_reporting_notify_free(order); } -/** - * split_free_page() -- split a free page at split_pfn_offset - * @free_page: the original free page - * @order: the order of the page - * @split_pfn_offset: split offset within the page - * - * Return -ENOENT if the free page is changed, otherwise 0 - * - * It is used when the free page crosses two pageblocks with different migratetypes - * at split_pfn_offset within the page. The split free page will be put into - * separate migratetype lists afterwards. Otherwise, the function achieves - * nothing. - */ -int split_free_page(struct page *free_page, - unsigned int order, unsigned long split_pfn_offset) -{ - struct zone *zone = page_zone(free_page); - unsigned long free_page_pfn = page_to_pfn(free_page); - unsigned long pfn; - unsigned long flags; - int free_page_order; - int mt; - int ret = 0; - - if (split_pfn_offset == 0) - return ret; - - spin_lock_irqsave(&zone->lock, flags); - - if (!PageBuddy(free_page) || buddy_order(free_page) != order) { - ret = -ENOENT; - goto out; - } - - mt = get_pfnblock_migratetype(free_page, free_page_pfn); - if (likely(!is_migrate_isolate(mt))) - __mod_zone_freepage_state(zone, -(1UL << order), mt); - - del_page_from_free_list(free_page, zone, order); - for (pfn = free_page_pfn; - pfn < free_page_pfn + (1UL << order);) { - int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn); - - free_page_order = min_t(unsigned int, - pfn ? 
__ffs(pfn) : order, - __fls(split_pfn_offset)); - __free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order, - mt, FPI_NONE); - pfn += 1UL << free_page_order; - split_pfn_offset -= (1UL << free_page_order); - /* we have done the first part, now switch to second part */ - if (split_pfn_offset == 0) - split_pfn_offset = (1UL << order) - (pfn - free_page_pfn); - } -out: - spin_unlock_irqrestore(&zone->lock, flags); - return ret; -} /* * A bad page could be due to a number of fields. Instead of multiple branches, * try and check multiple fields with one check. The caller must do a detailed @@ -1001,6 +942,10 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page) bad_page(page, "nonzero entire_mapcount"); goto out; } + if (unlikely(folio_large_mapcount(folio))) { + bad_page(page, "nonzero large_mapcount"); + goto out; + } if (unlikely(atomic_read(&folio->_nr_pages_mapped))) { bad_page(page, "nonzero nr_pages_mapped"); goto out; @@ -1011,10 +956,11 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page) } break; case 2: - /* - * the second tail page: ->mapping is - * deferred_list.next -- ignore value. 
- */ + /* the second tail page: deferred_list overlaps ->mapping */ + if (unlikely(!list_empty(&folio->_deferred_list))) { + bad_page(page, "on deferred list"); + goto out; + } break; default: if (page->mapping != TAIL_MAPPING) { @@ -1106,6 +1052,7 @@ __always_inline bool free_pages_prepare(struct page *page, /* Do not let hwpoison pages hit pcplists/buddy */ reset_page_owner(page, order); page_table_check_free(page, order); + pgalloc_tag_sub(page, 1 << order); return false; } @@ -1145,6 +1092,7 @@ __always_inline bool free_pages_prepare(struct page *page, page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; reset_page_owner(page, order); page_table_check_free(page, order); + pgalloc_tag_sub(page, 1 << order); if (!PageHighMem(page)) { debug_check_no_locks_freed(page_address(page), @@ -1196,7 +1144,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, { unsigned long flags; unsigned int order; - bool isolated_pageblocks; struct page *page; /* @@ -1209,7 +1156,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, pindex = pindex - 1; spin_lock_irqsave(&zone->lock, flags); - isolated_pageblocks = has_isolate_pageblock(zone); while (count > 0) { struct list_head *list; @@ -1225,23 +1171,19 @@ static void free_pcppages_bulk(struct zone *zone, int count, order = pindex_to_order(pindex); nr_pages = 1 << order; do { + unsigned long pfn; int mt; page = list_last_entry(list, struct page, pcp_list); - mt = get_pcppage_migratetype(page); + pfn = page_to_pfn(page); + mt = get_pfnblock_migratetype(page, pfn); /* must delete to avoid corrupting pcp list */ list_del(&page->pcp_list); count -= nr_pages; pcp->count -= nr_pages; - /* MIGRATE_ISOLATE page should not go to pcplists */ - VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); - /* Pageblock could have been isolated meanwhile */ - if (unlikely(isolated_pageblocks)) - mt = get_pageblock_migratetype(page); - - __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE); + __free_one_page(page, pfn, zone, order, 
mt, FPI_NONE); trace_mm_page_pcpu_drain(page, order, mt); } while (count > 0 && !list_empty(list)); } @@ -1249,18 +1191,15 @@ static void free_pcppages_bulk(struct zone *zone, int count, spin_unlock_irqrestore(&zone->lock, flags); } -static void free_one_page(struct zone *zone, - struct page *page, unsigned long pfn, - unsigned int order, - int migratetype, fpi_t fpi_flags) +static void free_one_page(struct zone *zone, struct page *page, + unsigned long pfn, unsigned int order, + fpi_t fpi_flags) { unsigned long flags; + int migratetype; spin_lock_irqsave(&zone->lock, flags); - if (unlikely(has_isolate_pageblock(zone) || - is_migrate_isolate(migratetype))) { - migratetype = get_pfnblock_migratetype(page, pfn); - } + migratetype = get_pfnblock_migratetype(page, pfn); __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); spin_unlock_irqrestore(&zone->lock, flags); } @@ -1268,21 +1207,13 @@ static void free_one_page(struct zone *zone, static void __free_pages_ok(struct page *page, unsigned int order, fpi_t fpi_flags) { - int migratetype; unsigned long pfn = page_to_pfn(page); struct zone *zone = page_zone(page); if (!free_pages_prepare(page, order)) return; - /* - * Calling get_pfnblock_migratetype() without spin_lock_irqsave() here - * is used to avoid calling get_pfnblock_migratetype() under the lock. - * This will reduce the lock holding time. 
- */ - migratetype = get_pfnblock_migratetype(page, pfn); - - free_one_page(zone, page, pfn, order, migratetype, fpi_flags); + free_one_page(zone, page, pfn, order, fpi_flags); __count_vm_events(PGFREE, 1 << order); } @@ -1393,6 +1324,7 @@ static inline void expand(struct zone *zone, struct page *page, int low, int high, int migratetype) { unsigned long size = 1 << high; + unsigned long nr_added = 0; while (high > low) { high--; @@ -1405,12 +1337,14 @@ static inline void expand(struct zone *zone, struct page *page, * Corresponding page table entries will not be touched, * pages will stay not present in virtual address space */ - if (set_page_guard(zone, &page[size], high, migratetype)) + if (set_page_guard(zone, &page[size], high)) continue; - add_to_free_list(&page[size], zone, high, migratetype); + __add_to_free_list(&page[size], zone, high, migratetype, false); set_buddy_order(&page[size], high); + nr_added += size; } + account_freepages(zone, nr_added, migratetype); } static void check_new_page_bad(struct page *page) @@ -1538,6 +1472,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, set_page_owner(page, order, gfp_flags); page_table_check_alloc(page, order); + pgalloc_tag_add(page, current, 1 << order); } static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, @@ -1578,9 +1513,8 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, page = get_page_from_free_area(area, migratetype); if (!page) continue; - del_page_from_free_list(page, zone, current_order); + del_page_from_free_list(page, zone, current_order, migratetype); expand(zone, page, order, current_order, migratetype); - set_pcppage_migratetype(page, migratetype); trace_mm_page_alloc_zone_locked(page, order, migratetype, pcp_allowed_order(order) && migratetype < MIGRATE_PCPTYPES); @@ -1597,7 +1531,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, * * The other migratetypes do not have fallbacks. 
*/ -static int fallbacks[MIGRATE_TYPES][MIGRATE_PCPTYPES - 1] = { +static int fallbacks[MIGRATE_PCPTYPES][MIGRATE_PCPTYPES - 1] = { [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE }, [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE }, [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE }, @@ -1615,30 +1549,23 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, #endif /* - * Move the free pages in a range to the freelist tail of the requested type. - * Note that start_page and end_pages are not aligned on a pageblock - * boundary. If alignment is required, use move_freepages_block() + * Change the type of a block and move all its free pages to that + * type's freelist. */ -static int move_freepages(struct zone *zone, - unsigned long start_pfn, unsigned long end_pfn, - int migratetype, int *num_movable) +static int __move_freepages_block(struct zone *zone, unsigned long start_pfn, + int old_mt, int new_mt) { struct page *page; - unsigned long pfn; + unsigned long pfn, end_pfn; unsigned int order; int pages_moved = 0; - for (pfn = start_pfn; pfn <= end_pfn;) { + VM_WARN_ON(start_pfn & (pageblock_nr_pages - 1)); + end_pfn = pageblock_end_pfn(start_pfn); + + for (pfn = start_pfn; pfn < end_pfn;) { page = pfn_to_page(pfn); if (!PageBuddy(page)) { - /* - * We assume that pages that could be isolated for - * migration are movable. But we don't actually try - * isolating, as that would be expensive. 
- */ - if (num_movable && - (PageLRU(page) || __PageMovable(page))) - (*num_movable)++; pfn++; continue; } @@ -1648,36 +1575,187 @@ static int move_freepages(struct zone *zone, VM_BUG_ON_PAGE(page_zone(page) != zone, page); order = buddy_order(page); - move_to_free_list(page, zone, order, migratetype); + + move_to_free_list(page, zone, order, old_mt, new_mt); + pfn += 1 << order; pages_moved += 1 << order; } + set_pageblock_migratetype(pfn_to_page(start_pfn), new_mt); + return pages_moved; } -int move_freepages_block(struct zone *zone, struct page *page, - int migratetype, int *num_movable) +static bool prep_move_freepages_block(struct zone *zone, struct page *page, + unsigned long *start_pfn, + int *num_free, int *num_movable) { - unsigned long start_pfn, end_pfn, pfn; + unsigned long pfn, start, end; + + pfn = page_to_pfn(page); + start = pageblock_start_pfn(pfn); + end = pageblock_end_pfn(pfn); - if (num_movable) + /* + * The caller only has the lock for @zone, don't touch ranges + * that straddle into other zones. While we could move part of + * the range that's inside the zone, this call is usually + * accompanied by other operations such as migratetype updates + * which also should be locked. + */ + if (!zone_spans_pfn(zone, start)) + return false; + if (!zone_spans_pfn(zone, end - 1)) + return false; + + *start_pfn = start; + + if (num_free) { + *num_free = 0; *num_movable = 0; + for (pfn = start; pfn < end;) { + page = pfn_to_page(pfn); + if (PageBuddy(page)) { + int nr = 1 << buddy_order(page); - pfn = page_to_pfn(page); - start_pfn = pageblock_start_pfn(pfn); - end_pfn = pageblock_end_pfn(pfn) - 1; + *num_free += nr; + pfn += nr; + continue; + } + /* + * We assume that pages that could be isolated for + * migration are movable. But we don't actually try + * isolating, as that would be expensive. 
+ */ + if (PageLRU(page) || __PageMovable(page)) + (*num_movable)++; + pfn++; + } + } - /* Do not cross zone boundaries */ - if (!zone_spans_pfn(zone, start_pfn)) - start_pfn = pfn; - if (!zone_spans_pfn(zone, end_pfn)) - return 0; + return true; +} + +static int move_freepages_block(struct zone *zone, struct page *page, + int old_mt, int new_mt) +{ + unsigned long start_pfn; - return move_freepages(zone, start_pfn, end_pfn, migratetype, - num_movable); + if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL)) + return -1; + + return __move_freepages_block(zone, start_pfn, old_mt, new_mt); } +#ifdef CONFIG_MEMORY_ISOLATION +/* Look for a buddy that straddles start_pfn */ +static unsigned long find_large_buddy(unsigned long start_pfn) +{ + int order = 0; + struct page *page; + unsigned long pfn = start_pfn; + + while (!PageBuddy(page = pfn_to_page(pfn))) { + /* Nothing found */ + if (++order > MAX_PAGE_ORDER) + return start_pfn; + pfn &= ~0UL << order; + } + + /* + * Found a preceding buddy, but does it straddle? + */ + if (pfn + (1 << buddy_order(page)) > start_pfn) + return pfn; + + /* Nothing found */ + return start_pfn; +} + +/* Split a multi-block free page into its individual pageblocks */ +static void split_large_buddy(struct zone *zone, struct page *page, + unsigned long pfn, int order) +{ + unsigned long end_pfn = pfn + (1 << order); + + VM_WARN_ON_ONCE(order <= pageblock_order); + VM_WARN_ON_ONCE(pfn & (pageblock_nr_pages - 1)); + + /* Caller removed page from freelist, buddy info cleared! 
*/ + VM_WARN_ON_ONCE(PageBuddy(page)); + + while (pfn != end_pfn) { + int mt = get_pfnblock_migratetype(page, pfn); + + __free_one_page(page, pfn, zone, pageblock_order, mt, FPI_NONE); + pfn += pageblock_nr_pages; + page = pfn_to_page(pfn); + } +} + +/** + * move_freepages_block_isolate - move free pages in block for page isolation + * @zone: the zone + * @page: the pageblock page + * @migratetype: migratetype to set on the pageblock + * + * This is similar to move_freepages_block(), but handles the special + * case encountered in page isolation, where the block of interest + * might be part of a larger buddy spanning multiple pageblocks. + * + * Unlike the regular page allocator path, which moves pages while + * stealing buddies off the freelist, page isolation is interested in + * arbitrary pfn ranges that may have overlapping buddies on both ends. + * + * This function handles that. Straddling buddies are split into + * individual pageblocks. Only the block of interest is moved. + * + * Returns %true if pages could be moved, %false otherwise. 
+ */ +bool move_freepages_block_isolate(struct zone *zone, struct page *page, + int migratetype) +{ + unsigned long start_pfn, pfn; + + if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL)) + return false; + + /* No splits needed if buddies can't span multiple blocks */ + if (pageblock_order == MAX_PAGE_ORDER) + goto move; + + /* We're a tail block in a larger buddy */ + pfn = find_large_buddy(start_pfn); + if (pfn != start_pfn) { + struct page *buddy = pfn_to_page(pfn); + int order = buddy_order(buddy); + + del_page_from_free_list(buddy, zone, order, + get_pfnblock_migratetype(buddy, pfn)); + set_pageblock_migratetype(page, migratetype); + split_large_buddy(zone, buddy, pfn, order); + return true; + } + + /* We're the starting block of a larger buddy */ + if (PageBuddy(page) && buddy_order(page) > pageblock_order) { + int order = buddy_order(page); + + del_page_from_free_list(page, zone, order, + get_pfnblock_migratetype(page, pfn)); + set_pageblock_migratetype(page, migratetype); + split_large_buddy(zone, page, pfn, order); + return true; + } +move: + __move_freepages_block(zone, start_pfn, + get_pfnblock_migratetype(page, start_pfn), + migratetype); + return true; +} +#endif /* CONFIG_MEMORY_ISOLATION */ + static void change_pageblock_range(struct page *pageblock_page, int start_order, int migratetype) { @@ -1760,33 +1838,37 @@ static inline bool boost_watermark(struct zone *zone) } /* - * This function implements actual steal behaviour. If order is large enough, - * we can steal whole pageblock. If not, we first move freepages in this - * pageblock to our migratetype and determine how many already-allocated pages - * are there in the pageblock with a compatible migratetype. If at least half - * of pages are free or compatible, we can change migratetype of the pageblock - * itself, so pages freed in the future will be put on the correct free list. + * This function implements actual steal behaviour. 
If order is large enough, we + * can claim the whole pageblock for the requested migratetype. If not, we check + * the pageblock for constituent pages; if at least half of the pages are free + * or compatible, we can still claim the whole block, so pages freed in the + * future will be put on the correct free list. Otherwise, we isolate exactly + * the order we need from the fallback block and leave its migratetype alone. */ -static void steal_suitable_fallback(struct zone *zone, struct page *page, - unsigned int alloc_flags, int start_type, bool whole_block) +static struct page * +steal_suitable_fallback(struct zone *zone, struct page *page, + int current_order, int order, int start_type, + unsigned int alloc_flags, bool whole_block) { - unsigned int current_order = buddy_order(page); int free_pages, movable_pages, alike_pages; - int old_block_type; + unsigned long start_pfn; + int block_type; - old_block_type = get_pageblock_migratetype(page); + block_type = get_pageblock_migratetype(page); /* * This can happen due to races and we want to prevent broken * highatomic accounting. 
*/ - if (is_migrate_highatomic(old_block_type)) + if (is_migrate_highatomic(block_type)) goto single_page; /* Take ownership for orders >= pageblock_order */ if (current_order >= pageblock_order) { + del_page_from_free_list(page, zone, current_order, block_type); change_pageblock_range(page, current_order, start_type); - goto single_page; + expand(zone, page, order, current_order, start_type); + return page; } /* @@ -1801,10 +1883,9 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, if (!whole_block) goto single_page; - free_pages = move_freepages_block(zone, page, start_type, - &movable_pages); /* moving whole block can fail due to zone boundary conditions */ - if (!free_pages) + if (!prep_move_freepages_block(zone, page, &start_pfn, &free_pages, + &movable_pages)) goto single_page; /* @@ -1822,7 +1903,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, * vice versa, be conservative since we can't distinguish the * exact migratetype of non-movable pages. */ - if (old_block_type == MIGRATE_MOVABLE) + if (block_type == MIGRATE_MOVABLE) alike_pages = pageblock_nr_pages - (free_pages + movable_pages); else @@ -1833,13 +1914,15 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, * compatible migratability as our allocation, claim the whole block. 
*/ if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || - page_group_by_mobility_disabled) - set_pageblock_migratetype(page, start_type); - - return; + page_group_by_mobility_disabled) { + __move_freepages_block(zone, start_pfn, block_type, start_type); + return __rmqueue_smallest(zone, order, start_type); + } single_page: - move_to_free_list(page, zone, current_order, start_type); + del_page_from_free_list(page, zone, current_order, block_type); + expand(zone, page, order, current_order, block_type); + return page; } /* @@ -1877,10 +1960,12 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, } /* - * Reserve a pageblock for exclusive use of high-order atomic allocations if - * there are no empty page blocks that contain a page with a suitable order + * Reserve the pageblock(s) surrounding an allocation request for + * exclusive use of high-order atomic allocations if there are no + * empty page blocks that contain a page with a suitable order */ -static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) +static void reserve_highatomic_pageblock(struct page *page, int order, + struct zone *zone) { int mt; unsigned long max_managed, flags; @@ -1906,10 +1991,16 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) /* Yoink! 
*/ mt = get_pageblock_migratetype(page); /* Only reserve normal pageblocks (i.e., they can merge with others) */ - if (migratetype_is_mergeable(mt)) { + if (!migratetype_is_mergeable(mt)) + goto out_unlock; + + if (order < pageblock_order) { + if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1) + goto out_unlock; zone->nr_reserved_highatomic += pageblock_nr_pages; - set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); - move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL); + } else { + change_pageblock_range(page, order, MIGRATE_HIGHATOMIC); + zone->nr_reserved_highatomic += 1 << order; } out_unlock: @@ -1922,7 +2013,7 @@ out_unlock: * intense memory pressure but failed atomic allocations should be easier * to recover from than an OOM. * - * If @force is true, try to unreserve a pageblock even though highatomic + * If @force is true, try to unreserve pageblocks even though highatomic * pageblock is exhausted. */ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, @@ -1934,7 +2025,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, struct zone *zone; struct page *page; int order; - bool ret; + int ret; for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, ac->nodemask) { @@ -1949,11 +2040,13 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < NR_PAGE_ORDERS; order++) { struct free_area *area = &(zone->free_area[order]); + int mt; page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); if (!page) continue; + mt = get_pageblock_migratetype(page); /* * In page freeing path, migratetype change is racy so * we can counter several free pages in a pageblock @@ -1961,7 +2054,8 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * from highatomic to ac->migratetype. So we should * adjust the count once. 
*/ - if (is_migrate_highatomic_page(page)) { + if (is_migrate_highatomic(mt)) { + unsigned long size; /* * It should never happen but changes to * locking could inadvertently allow a per-cpu @@ -1969,9 +2063,9 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * while unreserving so be safe and watch for * underflows. */ - zone->nr_reserved_highatomic -= min( - pageblock_nr_pages, - zone->nr_reserved_highatomic); + size = max(pageblock_nr_pages, 1UL << order); + size = min(size, zone->nr_reserved_highatomic); + zone->nr_reserved_highatomic -= size; } /* @@ -1983,10 +2077,22 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * of pageblocks that cannot be completely freed * may increase. */ - set_pageblock_migratetype(page, ac->migratetype); - ret = move_freepages_block(zone, page, ac->migratetype, - NULL); - if (ret) { + if (order < pageblock_order) + ret = move_freepages_block(zone, page, mt, + ac->migratetype); + else { + move_to_free_list(page, zone, order, mt, + ac->migratetype); + change_pageblock_range(page, order, + ac->migratetype); + ret = 1; + } + /* + * Reserving the block(s) already succeeded, + * so this should not fail on zone boundaries. + */ + WARN_ON_ONCE(ret == -1); + if (ret > 0) { spin_unlock_irqrestore(&zone->lock, flags); return ret; } @@ -2007,7 +2113,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * deviation from the rest of this file, to make the for loop * condition simpler. 
*/ -static __always_inline bool +static __always_inline struct page * __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, unsigned int alloc_flags) { @@ -2054,7 +2160,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, goto do_steal; } - return false; + return NULL; find_smallest: for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) { @@ -2074,14 +2180,14 @@ find_smallest: do_steal: page = get_page_from_free_area(area, fallback_mt); - steal_suitable_fallback(zone, page, alloc_flags, start_migratetype, - can_steal); + /* take off list, maybe claim block, expand remainder */ + page = steal_suitable_fallback(zone, page, current_order, order, + start_migratetype, alloc_flags, can_steal); trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, fallback_mt); - return true; - + return page; } /* @@ -2108,15 +2214,15 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, return page; } } -retry: + page = __rmqueue_smallest(zone, order, migratetype); if (unlikely(!page)) { if (alloc_flags & ALLOC_CMA) page = __rmqueue_cma_fallback(zone, order); - if (!page && __rmqueue_fallback(zone, order, migratetype, - alloc_flags)) - goto retry; + if (!page) + page = __rmqueue_fallback(zone, order, migratetype, + alloc_flags); } return page; } @@ -2151,12 +2257,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * pages are ordered properly. 
*/ list_add_tail(&page->pcp_list, list); - if (is_migrate_cma(get_pcppage_migratetype(page))) - __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, - -(1 << order)); } - - __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); spin_unlock_irqrestore(&zone->lock, flags); return i; @@ -2221,14 +2322,21 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) */ static void drain_pages_zone(unsigned int cpu, struct zone *zone) { - struct per_cpu_pages *pcp; + struct per_cpu_pages *pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); + int count; - pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); - if (pcp->count) { + do { spin_lock(&pcp->lock); - free_pcppages_bulk(zone, pcp->count, pcp, 0); + count = pcp->count; + if (count) { + int to_drain = min(count, + pcp->batch << CONFIG_PCP_BATCH_SCALE_MAX); + + free_pcppages_bulk(zone, to_drain, pcp, 0); + count -= to_drain; + } spin_unlock(&pcp->lock); - } + } while (count); } /* @@ -2344,19 +2452,6 @@ void drain_all_pages(struct zone *zone) __drain_all_pages(zone, false); } -static bool free_unref_page_prepare(struct page *page, unsigned long pfn, - unsigned int order) -{ - int migratetype; - - if (!free_pages_prepare(page, order)) - return false; - - migratetype = get_pfnblock_migratetype(page, pfn); - set_pcppage_migratetype(page, migratetype); - return true; -} - static int nr_pcp_free(struct per_cpu_pages *pcp, int batch, int high, bool free_high) { int min_nr_free, max_nr_free; @@ -2487,9 +2582,14 @@ void free_unref_page(struct page *page, unsigned int order) struct per_cpu_pages *pcp; struct zone *zone; unsigned long pfn = page_to_pfn(page); - int migratetype, pcpmigratetype; + int migratetype; - if (!free_unref_page_prepare(page, pfn, order)) + if (!pcp_allowed_order(order)) { + __free_pages_ok(page, order, FPI_NONE); + return; + } + + if (!free_pages_prepare(page, order)) return; /* @@ -2499,23 +2599,23 @@ void free_unref_page(struct page *page, unsigned int order) * get those areas back if necessary. 
Otherwise, we may have to free * excessively into the page allocator */ - migratetype = pcpmigratetype = get_pcppage_migratetype(page); + migratetype = get_pfnblock_migratetype(page, pfn); if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { if (unlikely(is_migrate_isolate(migratetype))) { - free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE); + free_one_page(page_zone(page), page, pfn, order, FPI_NONE); return; } - pcpmigratetype = MIGRATE_MOVABLE; + migratetype = MIGRATE_MOVABLE; } zone = page_zone(page); pcp_trylock_prepare(UP_flags); pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (pcp) { - free_unref_page_commit(zone, pcp, page, pcpmigratetype, order); + free_unref_page_commit(zone, pcp, page, migratetype, order); pcp_spin_unlock(pcp); } else { - free_one_page(zone, page, pfn, order, migratetype, FPI_NONE); + free_one_page(zone, page, pfn, order, FPI_NONE); } pcp_trylock_finish(UP_flags); } @@ -2528,7 +2628,7 @@ void free_unref_folios(struct folio_batch *folios) unsigned long __maybe_unused UP_flags; struct per_cpu_pages *pcp = NULL; struct zone *locked_zone = NULL; - int i, j, migratetype; + int i, j; /* Prepare folios for freeing */ for (i = 0, j = 0; i < folios->nr; i++) { @@ -2538,18 +2638,15 @@ void free_unref_folios(struct folio_batch *folios) if (order > 0 && folio_test_large_rmappable(folio)) folio_undo_large_rmappable(folio); - if (!free_unref_page_prepare(&folio->page, pfn, order)) + if (!free_pages_prepare(&folio->page, order)) continue; - /* - * Free isolated folios and orders not handled on the PCP - * directly to the allocator, see comment in free_unref_page. + * Free orders not handled on the PCP directly to the + * allocator. 
*/ - migratetype = get_pcppage_migratetype(&folio->page); - if (!pcp_allowed_order(order) || - is_migrate_isolate(migratetype)) { - free_one_page(folio_zone(folio), &folio->page, pfn, - order, migratetype, FPI_NONE); + if (!pcp_allowed_order(order)) { + free_one_page(folio_zone(folio), &folio->page, + pfn, order, FPI_NONE); continue; } folio->private = (void *)(unsigned long)order; @@ -2562,16 +2659,31 @@ void free_unref_folios(struct folio_batch *folios) for (i = 0; i < folios->nr; i++) { struct folio *folio = folios->folios[i]; struct zone *zone = folio_zone(folio); + unsigned long pfn = folio_pfn(folio); unsigned int order = (unsigned long)folio->private; + int migratetype; folio->private = NULL; - migratetype = get_pcppage_migratetype(&folio->page); + migratetype = get_pfnblock_migratetype(&folio->page, pfn); /* Different zone requires a different pcp lock */ - if (zone != locked_zone) { + if (zone != locked_zone || + is_migrate_isolate(migratetype)) { if (pcp) { pcp_spin_unlock(pcp); pcp_trylock_finish(UP_flags); + locked_zone = NULL; + pcp = NULL; + } + + /* + * Free isolated pages directly to the + * allocator, see comment in free_unref_page. 
+ */ + if (is_migrate_isolate(migratetype)) { + free_one_page(zone, &folio->page, pfn, + order, FPI_NONE); + continue; } /* @@ -2582,10 +2694,8 @@ void free_unref_folios(struct folio_batch *folios) pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (unlikely(!pcp)) { pcp_trylock_finish(UP_flags); - free_one_page(zone, &folio->page, - folio_pfn(folio), order, - migratetype, FPI_NONE); - locked_zone = NULL; + free_one_page(zone, &folio->page, pfn, + order, FPI_NONE); continue; } locked_zone = zone; @@ -2628,6 +2738,7 @@ void split_page(struct page *page, unsigned int order) for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); split_page_owner(page, order, 0); + pgalloc_tag_split(page, 1 << order); split_page_memcg(page, order, 0); } EXPORT_SYMBOL_GPL(split_page); @@ -2648,11 +2759,9 @@ int __isolate_free_page(struct page *page, unsigned int order) watermark = zone->_watermark[WMARK_MIN] + (1UL << order); if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) return 0; - - __mod_zone_freepage_state(zone, -(1UL << order), mt); } - del_page_from_free_list(page, zone, order); + del_page_from_free_list(page, zone, order, mt); /* * Set the pageblock if the isolated page is at least half of a @@ -2667,8 +2776,8 @@ int __isolate_free_page(struct page *page, unsigned int order) * with others) */ if (migratetype_is_mergeable(mt)) - set_pageblock_migratetype(page, - MIGRATE_MOVABLE); + move_freepages_block(zone, page, mt, + MIGRATE_MOVABLE); } } @@ -2752,8 +2861,6 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, return NULL; } } - __mod_zone_freepage_state(zone, -(1 << order), - get_pcppage_migratetype(page)); spin_unlock_irqrestore(&zone->lock, flags); } while (check_new_pages(page, order)); @@ -3326,7 +3433,7 @@ try_this_zone: * if the pageblock should be reserved for the future */ if (unlikely(alloc_flags & ALLOC_HIGHATOMIC)) - reserve_highatomic_pageblock(page, zone); + reserve_highatomic_pageblock(page, order, zone); return page; 
} else { @@ -4389,7 +4496,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, * * Returns the number of pages on the list or array. */ -unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, +unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, nodemask_t *nodemask, int nr_pages, struct list_head *page_list, struct page **page_array) @@ -4525,7 +4632,7 @@ failed_irq: pcp_trylock_finish(UP_flags); failed: - page = __alloc_pages(gfp, 0, preferred_nid, nodemask); + page = __alloc_pages_noprof(gfp, 0, preferred_nid, nodemask); if (page) { if (page_list) list_add(&page->lru, page_list); @@ -4536,13 +4643,13 @@ failed: goto out; } -EXPORT_SYMBOL_GPL(__alloc_pages_bulk); +EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof); /* * This is the 'heart' of the zoned buddy allocator. */ -struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, - nodemask_t *nodemask) +struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, + int preferred_nid, nodemask_t *nodemask) { struct page *page; unsigned int alloc_flags = ALLOC_WMARK_LOW; @@ -4604,38 +4711,38 @@ out: return page; } -EXPORT_SYMBOL(__alloc_pages); +EXPORT_SYMBOL(__alloc_pages_noprof); -struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid, +struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask) { - struct page *page = __alloc_pages(gfp | __GFP_COMP, order, + struct page *page = __alloc_pages_noprof(gfp | __GFP_COMP, order, preferred_nid, nodemask); return page_rmappable_folio(page); } -EXPORT_SYMBOL(__folio_alloc); +EXPORT_SYMBOL(__folio_alloc_noprof); /* * Common helper functions. Never use with __GFP_HIGHMEM because the returned * address cannot represent highmem pages. Use alloc_pages and then kmap if * you need to access high mem. 
*/ -unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) +unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order) { struct page *page; - page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order); + page = alloc_pages_noprof(gfp_mask & ~__GFP_HIGHMEM, order); if (!page) return 0; return (unsigned long) page_address(page); } -EXPORT_SYMBOL(__get_free_pages); +EXPORT_SYMBOL(get_free_pages_noprof); -unsigned long get_zeroed_page(gfp_t gfp_mask) +unsigned long get_zeroed_page_noprof(gfp_t gfp_mask) { - return __get_free_page(gfp_mask | __GFP_ZERO); + return get_free_pages_noprof(gfp_mask | __GFP_ZERO, 0); } -EXPORT_SYMBOL(get_zeroed_page); +EXPORT_SYMBOL(get_zeroed_page_noprof); /** * __free_pages - Free pages allocated with alloc_pages(). @@ -4661,12 +4768,15 @@ void __free_pages(struct page *page, unsigned int order) { /* get PageHead before we drop reference */ int head = PageHead(page); + struct alloc_tag *tag = pgalloc_tag_get(page); if (put_page_testzero(page)) - free_the_page(page, order); - else if (!head) + free_unref_page(page, order); + else if (!head) { + pgalloc_tag_sub_pages(tag, (1 << order) - 1); while (order-- > 0) - free_the_page(page + (1 << order), order); + free_unref_page(page + (1 << order), order); + } } EXPORT_SYMBOL(__free_pages); @@ -4727,7 +4837,7 @@ void __page_frag_cache_drain(struct page *page, unsigned int count) VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); if (page_ref_sub_and_test(page, count)) - free_the_page(page, compound_order(page)); + free_unref_page(page, compound_order(page)); } EXPORT_SYMBOL(__page_frag_cache_drain); @@ -4768,7 +4878,7 @@ refill: goto refill; if (unlikely(nc->pfmemalloc)) { - free_the_page(page, compound_order(page)); + free_unref_page(page, compound_order(page)); goto refill; } @@ -4812,7 +4922,7 @@ void page_frag_free(void *addr) struct page *page = virt_to_head_page(addr); if (unlikely(put_page_testzero(page))) - free_the_page(page, compound_order(page)); + free_unref_page(page, 
compound_order(page)); } EXPORT_SYMBOL(page_frag_free); @@ -4825,6 +4935,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, struct page *last = page + nr; split_page_owner(page, order, 0); + pgalloc_tag_split(page, 1 << order); split_page_memcg(page, order, 0); while (page < --last) set_page_refcounted(last); @@ -4851,7 +4962,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, * * Return: pointer to the allocated area or %NULL in case of error. */ -void *alloc_pages_exact(size_t size, gfp_t gfp_mask) +void *alloc_pages_exact_noprof(size_t size, gfp_t gfp_mask) { unsigned int order = get_order(size); unsigned long addr; @@ -4859,10 +4970,10 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask) if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM))) gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM); - addr = __get_free_pages(gfp_mask, order); + addr = get_free_pages_noprof(gfp_mask, order); return make_alloc_exact(addr, order, size); } -EXPORT_SYMBOL(alloc_pages_exact); +EXPORT_SYMBOL(alloc_pages_exact_noprof); /** * alloc_pages_exact_nid - allocate an exact number of physically-contiguous @@ -4876,7 +4987,7 @@ EXPORT_SYMBOL(alloc_pages_exact); * * Return: pointer to the allocated area or %NULL in case of error. 
*/ -void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) +void * __meminit alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mask) { unsigned int order = get_order(size); struct page *p; @@ -4884,7 +4995,7 @@ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM))) gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM); - p = alloc_pages_node(nid, gfp_mask, order); + p = alloc_pages_node_noprof(nid, gfp_mask, order); if (!p) return NULL; return make_alloc_exact((unsigned long)page_address(p), order, size); @@ -5185,37 +5296,13 @@ static void setup_min_slab_ratio(void); static void build_zonelists(pg_data_t *pgdat) { - int node, local_node; struct zoneref *zonerefs; int nr_zones; - local_node = pgdat->node_id; - zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; nr_zones = build_zonerefs_node(pgdat, zonerefs); zonerefs += nr_zones; - /* - * Now we build the zonelist so that it contains the zones - * of all the other nodes. 
- * We don't want to pressure a particular node, so when - * building the zones for node N, we make sure that the - * zones coming right after the local ones are those from - * node N+1 (modulo N) - */ - for (node = local_node + 1; node < MAX_NUMNODES; node++) { - if (!node_online(node)) - continue; - nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs); - zonerefs += nr_zones; - } - for (node = 0; node < local_node; node++) { - if (!node_online(node)) - continue; - nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs); - zonerefs += nr_zones; - } - zonerefs->zone = NULL; zonerefs->zone_idx = 0; } @@ -5722,6 +5809,23 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char return pages; } +void free_reserved_page(struct page *page) +{ + if (mem_alloc_profiling_enabled()) { + union codetag_ref *ref = get_page_tag_ref(page); + + if (ref) { + set_codetag_empty(ref); + put_page_tag_ref(ref); + } + } + ClearPageReserved(page); + init_page_count(page); + __free_page(page); + adjust_managed_page_count(page, 1); +} +EXPORT_SYMBOL(free_reserved_page); + static int page_alloc_cpu_dead(unsigned int cpu) { struct zone *zone; @@ -5832,10 +5936,11 @@ static void setup_per_zone_lowmem_reserve(void) for (j = i + 1; j < MAX_NR_ZONES; j++) { struct zone *upper_zone = &pgdat->node_zones[j]; + bool empty = !zone_managed_pages(upper_zone); managed_pages += zone_managed_pages(upper_zone); - if (clear) + if (clear || empty) zone->lowmem_reserve[j] = 0; else zone->lowmem_reserve[j] = managed_pages / ratio; @@ -6216,7 +6321,6 @@ static struct ctl_table page_alloc_sysctl_table[] = { .extra2 = SYSCTL_ONE_HUNDRED, }, #endif - {} }; void __init page_alloc_sysctl_init(void) @@ -6256,6 +6360,7 @@ int __alloc_contig_migrate_range(struct compact_control *cc, struct migration_target_control mtc = { .nid = zone_to_nid(cc->zone), .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, + .reason = MR_CONTIG_RANGE, }; struct page *page; unsigned long total_mapped 
= 0; @@ -6288,8 +6393,12 @@ int __alloc_contig_migrate_range(struct compact_control *cc, if (trace_mm_alloc_contig_migrate_range_info_enabled()) { total_reclaimed += nr_reclaimed; - list_for_each_entry(page, &cc->migratepages, lru) - total_mapped += page_mapcount(page); + list_for_each_entry(page, &cc->migratepages, lru) { + struct folio *folio = page_folio(page); + + total_mapped += folio_mapped(folio) * + folio_nr_pages(folio); + } } ret = migrate_pages(&cc->migratepages, alloc_migration_target, @@ -6341,11 +6450,10 @@ int __alloc_contig_migrate_range(struct compact_control *cc, * pages which PFN is in [start, end) are allocated for the caller and * need to be freed with free_contig_range(). */ -int alloc_contig_range(unsigned long start, unsigned long end, +int alloc_contig_range_noprof(unsigned long start, unsigned long end, unsigned migratetype, gfp_t gfp_mask) { unsigned long outer_start, outer_end; - int order; int ret = 0; struct compact_control cc = { @@ -6418,29 +6526,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, * We don't have to hold zone->lock here because the pages are * isolated thus they won't get removed from buddy. */ - - order = 0; - outer_start = start; - while (!PageBuddy(pfn_to_page(outer_start))) { - if (++order > MAX_PAGE_ORDER) { - outer_start = start; - break; - } - outer_start &= ~0UL << order; - } - - if (outer_start != start) { - order = buddy_order(pfn_to_page(outer_start)); - - /* - * outer_start page could be small order buddy page and - * it doesn't include start page. Adjust outer_start - * in this case to report failed page properly - * on tracepoint in test_pages_isolated() - */ - if (outer_start + (1UL << order) <= start) - outer_start = start; - } + outer_start = find_large_buddy(start); /* Make sure the range is really isolated. 
*/ if (test_pages_isolated(outer_start, end, 0)) { @@ -6465,15 +6551,15 @@ done: undo_isolate_page_range(start, end, migratetype); return ret; } -EXPORT_SYMBOL(alloc_contig_range); +EXPORT_SYMBOL(alloc_contig_range_noprof); static int __alloc_contig_pages(unsigned long start_pfn, unsigned long nr_pages, gfp_t gfp_mask) { unsigned long end_pfn = start_pfn + nr_pages; - return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE, - gfp_mask); + return alloc_contig_range_noprof(start_pfn, end_pfn, MIGRATE_MOVABLE, + gfp_mask); } static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, @@ -6528,8 +6614,8 @@ static bool zone_spans_last_pfn(const struct zone *zone, * * Return: pointer to contiguous pages on success, or NULL if not successful. */ -struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, - int nid, nodemask_t *nodemask) +struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, + int nid, nodemask_t *nodemask) { unsigned long ret, pfn, flags; struct zonelist *zonelist; @@ -6660,8 +6746,9 @@ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) BUG_ON(page_count(page)); BUG_ON(!PageBuddy(page)); + VM_WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE); order = buddy_order(page); - del_page_from_free_list(page, zone, order); + del_page_from_free_list(page, zone, order, MIGRATE_ISOLATE); pfn += (1 << order); } spin_unlock_irqrestore(&zone->lock, flags); @@ -6671,16 +6758,16 @@ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) /* * This function returns a stable result only if called under zone lock. 
*/ -bool is_free_buddy_page(struct page *page) +bool is_free_buddy_page(const struct page *page) { unsigned long pfn = page_to_pfn(page); unsigned int order; for (order = 0; order < NR_PAGE_ORDERS; order++) { - struct page *page_head = page - (pfn & ((1 << order) - 1)); + const struct page *head = page - (pfn & ((1 << order) - 1)); - if (PageBuddy(page_head) && - buddy_order_unsafe(page_head) >= order) + if (PageBuddy(head) && + buddy_order_unsafe(head) >= order) break; } @@ -6689,6 +6776,14 @@ bool is_free_buddy_page(struct page *page) EXPORT_SYMBOL(is_free_buddy_page); #ifdef CONFIG_MEMORY_FAILURE +static inline void add_to_free_list(struct page *page, struct zone *zone, + unsigned int order, int migratetype, + bool tail) +{ + __add_to_free_list(page, zone, order, migratetype, tail); + account_freepages(zone, 1 << order, migratetype); +} + /* * Break down a higher-order page in sub-pages, and keep our target out of * buddy allocator. @@ -6711,10 +6806,10 @@ static void break_down_buddy_pages(struct zone *zone, struct page *page, current_buddy = page + size; } - if (set_page_guard(zone, current_buddy, high, migratetype)) + if (set_page_guard(zone, current_buddy, high)) continue; - add_to_free_list(current_buddy, zone, high, migratetype); + add_to_free_list(current_buddy, zone, high, migratetype, false); set_buddy_order(current_buddy, high); } } @@ -6740,12 +6835,11 @@ bool take_page_off_buddy(struct page *page) int migratetype = get_pfnblock_migratetype(page_head, pfn_head); - del_page_from_free_list(page_head, zone, page_order); + del_page_from_free_list(page_head, zone, page_order, + migratetype); break_down_buddy_pages(zone, page_head, page, 0, page_order, migratetype); SetPageHWPoisonTakenOff(page); - if (!is_migrate_isolate(migratetype)) - __mod_zone_freepage_state(zone, -1, migratetype); ret = true; break; } @@ -6762,13 +6856,14 @@ bool take_page_off_buddy(struct page *page) bool put_page_back_buddy(struct page *page) { struct zone *zone = page_zone(page); - 
unsigned long pfn = page_to_pfn(page); unsigned long flags; - int migratetype = get_pfnblock_migratetype(page, pfn); bool ret = false; spin_lock_irqsave(&zone->lock, flags); if (put_page_testzero(page)) { + unsigned long pfn = page_to_pfn(page); + int migratetype = get_pfnblock_migratetype(page, pfn); + ClearPageHWPoisonTakenOff(page); __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE); if (TestClearPageHWPoison(page)) { @@ -6852,7 +6947,7 @@ static bool try_to_accept_memory_one(struct zone *zone) list_del(&page->lru); last = list_empty(&zone->unaccepted_pages); - __mod_zone_freepage_state(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); + account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES); spin_unlock_irqrestore(&zone->lock, flags); @@ -6904,7 +6999,7 @@ static bool __free_unaccepted(struct page *page) spin_lock_irqsave(&zone->lock, flags); first = list_empty(&zone->unaccepted_pages); list_add_tail(&page->lru, &zone->unaccepted_pages); - __mod_zone_freepage_state(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); + account_freepages(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); __mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES); spin_unlock_irqrestore(&zone->lock, flags); diff --git a/mm/page_ext.c b/mm/page_ext.c index 4548fcc66d..95dd8ffeaf 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -10,6 +10,7 @@ #include <linux/page_idle.h> #include <linux/page_table_check.h> #include <linux/rcupdate.h> +#include <linux/pgalloc_tag.h> /* * struct page extension @@ -82,6 +83,9 @@ static struct page_ext_operations *page_ext_ops[] __initdata = { #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) &page_idle_ops, #endif +#ifdef CONFIG_MEM_ALLOC_PROFILING + &page_alloc_tagging_ops, +#endif #ifdef CONFIG_PAGE_TABLE_CHECK &page_table_check_ops, #endif @@ -91,7 +95,16 @@ unsigned long page_ext_size; static unsigned long total_usage; +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG +/* + * To 
ensure correct allocation tagging for pages, page_ext should be available + * before the first page allocation. Otherwise early task stacks will be + * allocated before page_ext initialization and missing tags will be flagged. + */ +bool early_page_ext __meminitdata = true; +#else bool early_page_ext __meminitdata; +#endif static int __init setup_early_page_ext(char *str) { early_page_ext = true; @@ -501,7 +514,7 @@ void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) * Context: Any context. Caller may not sleep until they have called * page_ext_put(). */ -struct page_ext *page_ext_get(struct page *page) +struct page_ext *page_ext_get(const struct page *page) { struct page_ext *page_ext; diff --git a/mm/page_io.c b/mm/page_io.c index ae2b49055e..0a150c240b 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -189,7 +189,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) * Arch code may have to preserve more data than just the page * contents, e.g. memory tags. 
*/ - ret = arch_prepare_to_swap(&folio->page); + ret = arch_prepare_to_swap(folio); if (ret) { folio_mark_dirty(folio); folio_unlock(folio); @@ -217,6 +217,7 @@ static inline void count_swpout_vm_event(struct folio *folio) count_memcg_folio_events(folio, THP_SWPOUT, 1); count_vm_event(THP_SWPOUT); } + count_mthp_stat(folio_order(folio), MTHP_STAT_SWPOUT); #endif count_vm_events(PSWPOUT, folio_nr_pages(folio)); } diff --git a/mm/page_isolation.c b/mm/page_isolation.c index a5c8fa4c2a..042937d5ab 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -178,15 +178,11 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end, migratetype, isol_flags); if (!unmovable) { - unsigned long nr_pages; - int mt = get_pageblock_migratetype(page); - - set_pageblock_migratetype(page, MIGRATE_ISOLATE); + if (!move_freepages_block_isolate(zone, page, MIGRATE_ISOLATE)) { + spin_unlock_irqrestore(&zone->lock, flags); + return -EBUSY; + } zone->nr_isolate_pageblock++; - nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE, - NULL); - - __mod_zone_freepage_state(zone, -nr_pages, mt); spin_unlock_irqrestore(&zone->lock, flags); return 0; } @@ -206,7 +202,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ static void unset_migratetype_isolate(struct page *page, int migratetype) { struct zone *zone; - unsigned long flags, nr_pages; + unsigned long flags; bool isolated_page = false; unsigned int order; struct page *buddy; @@ -252,12 +248,15 @@ static void unset_migratetype_isolate(struct page *page, int migratetype) * allocation. */ if (!isolated_page) { - nr_pages = move_freepages_block(zone, page, migratetype, NULL); - __mod_zone_freepage_state(zone, nr_pages, migratetype); - } - set_pageblock_migratetype(page, migratetype); - if (isolated_page) + /* + * Isolating this block already succeeded, so this + * should not fail on zone boundaries. 
+ */ + WARN_ON_ONCE(!move_freepages_block_isolate(zone, page, migratetype)); + } else { + set_pageblock_migratetype(page, migratetype); __putback_isolated_page(page, order, migratetype); + } zone->nr_isolate_pageblock--; out: spin_unlock_irqrestore(&zone->lock, flags); @@ -367,26 +366,29 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, VM_BUG_ON(!page); pfn = page_to_pfn(page); - /* - * start_pfn is MAX_ORDER_NR_PAGES aligned, if there is any - * free pages in [start_pfn, boundary_pfn), its head page will - * always be in the range. - */ + if (PageBuddy(page)) { int order = buddy_order(page); - if (pfn + (1UL << order) > boundary_pfn) { - /* free page changed before split, check it again */ - if (split_free_page(page, order, boundary_pfn - pfn)) - continue; - } + /* move_freepages_block_isolate() handled this */ + VM_WARN_ON_ONCE(pfn + (1 << order) > boundary_pfn); pfn += 1UL << order; continue; } + /* - * migrate compound pages then let the free page handling code - * above do the rest. If migration is not possible, just fail. + * If a compound page is straddling our block, attempt + * to migrate it out of the way. + * + * We don't have to worry about this creating a large + * free page that straddles into our block: gigantic + * pages are freed as order-0 chunks, and LRU pages + * (currently) do not exceed pageblock_order. + * + * The block of interest has already been marked + * MIGRATE_ISOLATE above, so when migration is done it + * will free its pages onto the correct freelists. */ if (PageCompound(page)) { struct page *head = compound_head(page); @@ -397,16 +399,10 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, pfn = head_pfn + nr_pages; continue; } + #if defined CONFIG_COMPACTION || defined CONFIG_CMA - /* - * hugetlb, lru compound (THP), and movable compound pages - * can be migrated. Otherwise, fail the isolation. 
- */ - if (PageHuge(page) || PageLRU(page) || __PageMovable(page)) { - int order; - unsigned long outer_pfn; + if (PageHuge(page)) { int page_mt = get_pageblock_migratetype(page); - bool isolate_page = !is_migrate_isolate_page(page); struct compact_control cc = { .nr_migratepages = 0, .order = -1, @@ -419,56 +415,25 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, }; INIT_LIST_HEAD(&cc.migratepages); - /* - * XXX: mark the page as MIGRATE_ISOLATE so that - * no one else can grab the freed page after migration. - * Ideally, the page should be freed as two separate - * pages to be added into separate migratetype free - * lists. - */ - if (isolate_page) { - ret = set_migratetype_isolate(page, page_mt, - flags, head_pfn, head_pfn + nr_pages); - if (ret) - goto failed; - } - ret = __alloc_contig_migrate_range(&cc, head_pfn, head_pfn + nr_pages, page_mt); - - /* - * restore the page's migratetype so that it can - * be split into separate migratetype free lists - * later. - */ - if (isolate_page) - unset_migratetype_isolate(page, page_mt); - if (ret) goto failed; - /* - * reset pfn to the head of the free page, so - * that the free page handling code above can split - * the free page to the right migratetype list. - * - * head_pfn is not used here as a hugetlb page order - * can be bigger than MAX_PAGE_ORDER, but after it is - * freed, the free page order is not. Use pfn within - * the range to find the head of the free page. - */ - order = 0; - outer_pfn = pfn; - while (!PageBuddy(pfn_to_page(outer_pfn))) { - /* stop if we cannot find the free page */ - if (++order > MAX_PAGE_ORDER) - goto failed; - outer_pfn &= ~0UL << order; - } - pfn = outer_pfn; + pfn = head_pfn + nr_pages; continue; - } else + } + + /* + * These pages are movable too, but they're + * not expected to exceed pageblock_order. + * + * Let us know when they do, so we can add + * proper free and split handling for them. 
+ */ + VM_WARN_ON_ONCE_PAGE(PageLRU(page), page); + VM_WARN_ON_ONCE_PAGE(__PageMovable(page), page); #endif - goto failed; + goto failed; } pfn++; diff --git a/mm/page_owner.c b/mm/page_owner.c index 8eed0f3dc0..2d6360eacc 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -168,13 +168,8 @@ static void add_stack_record_to_list(struct stack_record *stack_record, unsigned long flags; struct stack *stack; - /* Filter gfp_mask the same way stackdepot does, for consistency */ - gfp_mask &= ~GFP_ZONEMASK; - gfp_mask &= (GFP_ATOMIC | GFP_KERNEL | __GFP_NOLOCKDEP); - gfp_mask |= __GFP_NOWARN; - set_current_in_page_owner(); - stack = kmalloc(sizeof(*stack), gfp_mask); + stack = kmalloc(sizeof(*stack), gfp_nested_mask(gfp_mask)); if (!stack) { unset_current_in_page_owner(); return; @@ -515,7 +510,7 @@ static inline int print_page_owner_memcg(char *kbuf, size_t count, int ret, if (!memcg_data) goto out_unlock; - if (memcg_data & MEMCG_DATA_OBJCGS) + if (memcg_data & MEMCG_DATA_OBJEXTS) ret += scnprintf(kbuf + ret, count - ret, "Slab cache page\n"); diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 6363f93a47..509c6ef8de 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -7,6 +7,8 @@ #include <linux/kstrtox.h> #include <linux/mm.h> #include <linux/page_table_check.h> +#include <linux/swap.h> +#include <linux/swapops.h> #undef pr_fmt #define pr_fmt(fmt) "page_table_check: " fmt @@ -191,6 +193,22 @@ void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) } EXPORT_SYMBOL(__page_table_check_pud_clear); +/* Whether the swap entry cached writable information */ +static inline bool swap_cached_writable(swp_entry_t entry) +{ + return is_writable_device_exclusive_entry(entry) || + is_writable_device_private_entry(entry) || + is_writable_migration_entry(entry); +} + +static inline void page_table_check_pte_flags(pte_t pte) +{ + if (pte_present(pte) && pte_uffd_wp(pte)) + WARN_ON_ONCE(pte_write(pte)); + else if (is_swap_pte(pte) && 
pte_swp_uffd_wp(pte)) + WARN_ON_ONCE(swap_cached_writable(pte_to_swp_entry(pte))); +} + void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, unsigned int nr) { @@ -199,6 +217,8 @@ void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, if (&init_mm == mm) return; + page_table_check_pte_flags(pte); + for (i = 0; i < nr; i++) __page_table_check_pte_clear(mm, ptep_get(ptep + i)); if (pte_user_accessible_page(pte)) @@ -206,11 +226,21 @@ void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, } EXPORT_SYMBOL(__page_table_check_ptes_set); +static inline void page_table_check_pmd_flags(pmd_t pmd) +{ + if (pmd_present(pmd) && pmd_uffd_wp(pmd)) + WARN_ON_ONCE(pmd_write(pmd)); + else if (is_swap_pmd(pmd) && pmd_swp_uffd_wp(pmd)) + WARN_ON_ONCE(swap_cached_writable(pmd_to_swp_entry(pmd))); +} + void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd) { if (&init_mm == mm) return; + page_table_check_pmd_flags(pmd); + __page_table_check_pmd_clear(mm, *pmdp); if (pmd_user_accessible_page(pmd)) { page_table_check_set(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT, diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 74d2de15fb..ae5cc42aa2 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -314,17 +314,21 @@ next_pte: return false; } +#ifdef CONFIG_MEMORY_FAILURE /** * page_mapped_in_vma - check whether a page is really mapped in a VMA * @page: the page to test * @vma: the VMA to test * - * Returns 1 if the page is mapped into the page tables of the VMA, 0 - * if the page is not mapped into the page tables of this VMA. Only - * valid for normal file or anonymous VMAs. + * Return: The address the page is mapped at if the page is in the range + * covered by the VMA and present in the page table. If the page is + * outside the VMA or not present, returns -EFAULT. + * Only valid for normal file or anonymous VMAs. 
*/ -int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) +unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) { + struct folio *folio = page_folio(page); + pgoff_t pgoff = folio->index + folio_page_idx(folio, page); struct page_vma_mapped_walk pvmw = { .pfn = page_to_pfn(page), .nr_pages = 1, @@ -332,11 +336,13 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) .flags = PVMW_SYNC, }; - pvmw.address = vma_address(page, vma); + pvmw.address = vma_address(vma, pgoff, 1); if (pvmw.address == -EFAULT) - return 0; + goto out; if (!page_vma_mapped_walk(&pvmw)) - return 0; + return -EFAULT; page_vma_mapped_walk_done(&pvmw); - return 1; +out: + return pvmw.address; } +#endif diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index cdd0aa597a..7e42f0ca3b 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -32,6 +32,19 @@ struct pcpu_block_md { int nr_bits; /* total bits responsible for */ }; +struct pcpuobj_ext { +#ifdef CONFIG_MEMCG_KMEM + struct obj_cgroup *cgroup; +#endif +#ifdef CONFIG_MEM_ALLOC_PROFILING + union codetag_ref tag; +#endif +}; + +#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MEM_ALLOC_PROFILING) +#define NEED_PCPUOBJ_EXT +#endif + struct pcpu_chunk { #ifdef CONFIG_PERCPU_STATS int nr_alloc; /* # of allocations */ @@ -64,8 +77,8 @@ struct pcpu_chunk { int end_offset; /* additional area required to have the region end page aligned */ -#ifdef CONFIG_MEMCG_KMEM - struct obj_cgroup **obj_cgroups; /* vector of object cgroups */ +#ifdef NEED_PCPUOBJ_EXT + struct pcpuobj_ext *obj_exts; /* vector of object cgroups */ #endif int nr_pages; /* # of pages served by this chunk */ @@ -74,6 +87,15 @@ struct pcpu_chunk { unsigned long populated[]; /* populated bitmap */ }; +static inline bool need_pcpuobj_ext(void) +{ + if (IS_ENABLED(CONFIG_MEM_ALLOC_PROFILING)) + return true; + if (!mem_cgroup_kmem_disabled()) + return true; + return false; +} + extern spinlock_t pcpu_lock; extern 
struct list_head *pcpu_chunk_lists; diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 2054c9213c..cd69caf6aa 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -231,10 +231,10 @@ static int pcpu_map_pages(struct pcpu_chunk *chunk, return 0; err: for_each_possible_cpu(tcpu) { - if (tcpu == cpu) - break; __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start), page_end - page_start); + if (tcpu == cpu) + break; } pcpu_post_unmap_tlb_flush(chunk, page_start, page_end); return err; diff --git a/mm/percpu.c b/mm/percpu.c index 4e11fc1e6d..474e3683b7 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1392,9 +1392,9 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, panic("%s: Failed to allocate %zu bytes\n", __func__, alloc_size); -#ifdef CONFIG_MEMCG_KMEM +#ifdef NEED_PCPUOBJ_EXT /* first chunk is free to use */ - chunk->obj_cgroups = NULL; + chunk->obj_exts = NULL; #endif pcpu_init_md_blocks(chunk); @@ -1463,12 +1463,12 @@ static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp) if (!chunk->md_blocks) goto md_blocks_fail; -#ifdef CONFIG_MEMCG_KMEM - if (!mem_cgroup_kmem_disabled()) { - chunk->obj_cgroups = +#ifdef NEED_PCPUOBJ_EXT + if (need_pcpuobj_ext()) { + chunk->obj_exts = pcpu_mem_zalloc(pcpu_chunk_map_bits(chunk) * - sizeof(struct obj_cgroup *), gfp); - if (!chunk->obj_cgroups) + sizeof(struct pcpuobj_ext), gfp); + if (!chunk->obj_exts) goto objcg_fail; } #endif @@ -1480,7 +1480,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp) return chunk; -#ifdef CONFIG_MEMCG_KMEM +#ifdef NEED_PCPUOBJ_EXT objcg_fail: pcpu_mem_free(chunk->md_blocks); #endif @@ -1498,8 +1498,8 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) { if (!chunk) return; -#ifdef CONFIG_MEMCG_KMEM - pcpu_mem_free(chunk->obj_cgroups); +#ifdef NEED_PCPUOBJ_EXT + pcpu_mem_free(chunk->obj_exts); #endif pcpu_mem_free(chunk->md_blocks); pcpu_mem_free(chunk->bound_map); @@ -1646,9 +1646,9 @@ static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg, if 
(!objcg) return; - if (likely(chunk && chunk->obj_cgroups)) { + if (likely(chunk && chunk->obj_exts)) { obj_cgroup_get(objcg); - chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg; + chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup = objcg; rcu_read_lock(); mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B, @@ -1663,13 +1663,13 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) { struct obj_cgroup *objcg; - if (unlikely(!chunk->obj_cgroups)) + if (unlikely(!chunk->obj_exts)) return; - objcg = chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT]; + objcg = chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup; if (!objcg) return; - chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL; + chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup = NULL; obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size)); @@ -1699,6 +1699,32 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) } #endif /* CONFIG_MEMCG_KMEM */ +#ifdef CONFIG_MEM_ALLOC_PROFILING +static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off, + size_t size) +{ + if (mem_alloc_profiling_enabled() && likely(chunk->obj_exts)) { + alloc_tag_add(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag, + current->alloc_tag, size); + } +} + +static void pcpu_alloc_tag_free_hook(struct pcpu_chunk *chunk, int off, size_t size) +{ + if (mem_alloc_profiling_enabled() && likely(chunk->obj_exts)) + alloc_tag_sub(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag, size); +} +#else +static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off, + size_t size) +{ +} + +static void pcpu_alloc_tag_free_hook(struct pcpu_chunk *chunk, int off, size_t size) +{ +} +#endif + /** * pcpu_alloc - the percpu allocator * @size: size of area to allocate in bytes @@ -1714,7 +1740,7 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. 
*/ -static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, +void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved, gfp_t gfp) { gfp_t pcpu_gfp; @@ -1881,6 +1907,8 @@ area_found: pcpu_memcg_post_alloc_hook(objcg, chunk, off, size); + pcpu_alloc_tag_alloc_hook(chunk, off, size); + return ptr; fail_unlock: @@ -1909,61 +1937,7 @@ fail: return NULL; } - -/** - * __alloc_percpu_gfp - allocate dynamic percpu area - * @size: size of area to allocate in bytes - * @align: alignment of area (max PAGE_SIZE) - * @gfp: allocation flags - * - * Allocate zero-filled percpu area of @size bytes aligned at @align. If - * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can - * be called from any context but is a lot more likely to fail. If @gfp - * has __GFP_NOWARN then no warning will be triggered on invalid or failed - * allocation requests. - * - * RETURNS: - * Percpu pointer to the allocated area on success, NULL on failure. - */ -void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) -{ - return pcpu_alloc(size, align, false, gfp); -} -EXPORT_SYMBOL_GPL(__alloc_percpu_gfp); - -/** - * __alloc_percpu - allocate dynamic percpu area - * @size: size of area to allocate in bytes - * @align: alignment of area (max PAGE_SIZE) - * - * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL). - */ -void __percpu *__alloc_percpu(size_t size, size_t align) -{ - return pcpu_alloc(size, align, false, GFP_KERNEL); -} -EXPORT_SYMBOL_GPL(__alloc_percpu); - -/** - * __alloc_reserved_percpu - allocate reserved percpu area - * @size: size of area to allocate in bytes - * @align: alignment of area (max PAGE_SIZE) - * - * Allocate zero-filled percpu area of @size bytes aligned at @align - * from reserved percpu area if arch has set it up; otherwise, - * allocation is served from the same dynamic area. Might sleep. - * Might trigger writeouts. - * - * CONTEXT: - * Does GFP_KERNEL allocation. 
- * - * RETURNS: - * Percpu pointer to the allocated area on success, NULL on failure. - */ -void __percpu *__alloc_reserved_percpu(size_t size, size_t align) -{ - return pcpu_alloc(size, align, true, GFP_KERNEL); -} +EXPORT_SYMBOL_GPL(pcpu_alloc_noprof); /** * pcpu_balance_free - manage the amount of free chunks @@ -2302,6 +2276,8 @@ void free_percpu(void __percpu *ptr) spin_lock_irqsave(&pcpu_lock, flags); size = pcpu_free_area(chunk, off); + pcpu_alloc_tag_free_hook(chunk, off, size); + pcpu_memcg_free_hook(chunk, off, size); /* diff --git a/mm/readahead.c b/mm/readahead.c index e5d0a56218..817b2a352d 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -228,6 +228,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, */ for (i = 0; i < nr_to_read; i++) { struct folio *folio = xa_load(&mapping->i_pages, index + i); + int ret; if (folio && !xa_is_value(folio)) { /* @@ -247,9 +248,12 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, folio = filemap_alloc_folio(gfp_mask, 0); if (!folio) break; - if (filemap_add_folio(mapping, folio, index + i, - gfp_mask) < 0) { + + ret = filemap_add_folio(mapping, folio, index + i, gfp_mask); + if (ret < 0) { folio_put(folio); + if (ret == -ENOMEM) + break; read_pages(ractl); ractl->_index++; i = ractl->_index + ractl->_nr_pages - index - 1; @@ -23,7 +23,7 @@ * inode->i_rwsem (while writing or truncating, not reading or faulting) * mm->mmap_lock * mapping->invalidate_lock (in filemap_fault) - * page->flags PG_locked (lock_page) + * folio_lock * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below) * vma_start_write * mapping->i_mmap_rwsem @@ -50,7 +50,7 @@ * hugetlb_fault_mutex (hugetlbfs specific page fault mutex) * vma_lock (hugetlb specific lock for pmd_sharing) * mapping->i_mmap_rwsem (also used for hugetlb pmd sharing) - * page->flags PG_locked (lock_page) + * folio_lock */ #include <linux/mm.h> @@ -182,8 +182,6 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, * for the 
new allocation. At the same time, we do not want * to do any locking for the common case of already having * an anon_vma. - * - * This must be called with the mmap_lock held for reading. */ int __anon_vma_prepare(struct vm_area_struct *vma) { @@ -191,6 +189,7 @@ int __anon_vma_prepare(struct vm_area_struct *vma) struct anon_vma *anon_vma, *allocated; struct anon_vma_chain *avc; + mmap_assert_locked(mm); might_sleep(); avc = anon_vma_chain_alloc(GFP_KERNEL); @@ -775,6 +774,8 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) { struct folio *folio = page_folio(page); + pgoff_t pgoff; + if (folio_test_anon(folio)) { struct anon_vma *page__anon_vma = folio_anon_vma(folio); /* @@ -790,7 +791,9 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) return -EFAULT; } - return vma_address(page, vma); + /* The !page__anon_vma above handles KSM folios */ + pgoff = folio->index + folio_page_idx(folio, page); + return vma_address(vma, pgoff, 1); } /* @@ -961,7 +964,7 @@ static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg) int folio_referenced(struct folio *folio, int is_locked, struct mem_cgroup *memcg, unsigned long *vm_flags) { - int we_locked = 0; + bool we_locked = false; struct folio_referenced_arg pra = { .mapcount = folio_mapcount(folio), .memcg = memcg, @@ -1128,56 +1131,38 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, if (invalid_mkclean_vma(vma, NULL)) return 0; - pvmw.address = vma_pgoff_address(pgoff, nr_pages, vma); + pvmw.address = vma_address(vma, pgoff, nr_pages); VM_BUG_ON_VMA(pvmw.address == -EFAULT, vma); return page_vma_mkclean_one(&pvmw); } -int folio_total_mapcount(struct folio *folio) -{ - int mapcount = folio_entire_mapcount(folio); - int nr_pages; - int i; - - /* In the common case, avoid the loop when no pages mapped by PTE */ - if 
(folio_nr_pages_mapped(folio) == 0) - return mapcount; - /* - * Add all the PTE mappings of those pages mapped by PTE. - * Limit the loop to folio_nr_pages_mapped()? - * Perhaps: given all the raciness, that may be a good or a bad idea. - */ - nr_pages = folio_nr_pages(folio); - for (i = 0; i < nr_pages; i++) - mapcount += atomic_read(&folio_page(folio, i)->_mapcount); - - /* But each of those _mapcounts was based on -1 */ - mapcount += nr_pages; - return mapcount; -} - static __always_inline unsigned int __folio_add_rmap(struct folio *folio, struct page *page, int nr_pages, enum rmap_level level, int *nr_pmdmapped) { atomic_t *mapped = &folio->_nr_pages_mapped; + const int orig_nr_pages = nr_pages; int first, nr = 0; __folio_rmap_sanity_checks(folio, page, nr_pages, level); switch (level) { case RMAP_LEVEL_PTE: + if (!folio_test_large(folio)) { + nr = atomic_inc_and_test(&page->_mapcount); + break; + } + do { first = atomic_inc_and_test(&page->_mapcount); - if (first && folio_test_large(folio)) { + if (first) { first = atomic_inc_return_relaxed(mapped); - first = (first < ENTIRELY_MAPPED); + if (first < ENTIRELY_MAPPED) + nr++; } - - if (first) - nr++; } while (page++, --nr_pages > 0); + atomic_add(orig_nr_pages, &folio->_large_mapcount); break; case RMAP_LEVEL_PMD: first = atomic_inc_and_test(&folio->_entire_mapcount); @@ -1194,6 +1179,7 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio, nr = 0; } } + atomic_inc(&folio->_large_mapcount); break; } return nr; @@ -1429,10 +1415,14 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, SetPageAnonExclusive(page); } + /* increment count (starts at -1) */ + atomic_set(&folio->_large_mapcount, nr - 1); atomic_set(&folio->_nr_pages_mapped, nr); } else { /* increment count (starts at -1) */ atomic_set(&folio->_entire_mapcount, 0); + /* increment count (starts at -1) */ + atomic_set(&folio->_large_mapcount, 0); atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED); 
SetPageAnonExclusive(&folio->page); __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr); @@ -1445,13 +1435,14 @@ static __always_inline void __folio_add_file_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, enum rmap_level level) { + pg_data_t *pgdat = folio_pgdat(folio); int nr, nr_pmdmapped = 0; VM_WARN_ON_FOLIO(folio_test_anon(folio), folio); nr = __folio_add_rmap(folio, page, nr_pages, level, &nr_pmdmapped); if (nr_pmdmapped) - __lruvec_stat_mod_folio(folio, folio_test_swapbacked(folio) ? + __mod_node_page_state(pgdat, folio_test_swapbacked(folio) ? NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED, nr_pmdmapped); if (nr) __lruvec_stat_mod_folio(folio, NR_FILE_MAPPED, nr); @@ -1503,25 +1494,34 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, enum rmap_level level) { atomic_t *mapped = &folio->_nr_pages_mapped; + pg_data_t *pgdat = folio_pgdat(folio); int last, nr = 0, nr_pmdmapped = 0; + bool partially_mapped = false; enum node_stat_item idx; __folio_rmap_sanity_checks(folio, page, nr_pages, level); switch (level) { case RMAP_LEVEL_PTE: + if (!folio_test_large(folio)) { + nr = atomic_add_negative(-1, &page->_mapcount); + break; + } + + atomic_sub(nr_pages, &folio->_large_mapcount); do { last = atomic_add_negative(-1, &page->_mapcount); - if (last && folio_test_large(folio)) { + if (last) { last = atomic_dec_return_relaxed(mapped); - last = (last < ENTIRELY_MAPPED); + if (last < ENTIRELY_MAPPED) + nr++; } - - if (last) - nr++; } while (page++, --nr_pages > 0); + + partially_mapped = nr && atomic_read(mapped); break; case RMAP_LEVEL_PMD: + atomic_dec(&folio->_large_mapcount); last = atomic_add_negative(-1, &folio->_entire_mapcount); if (last) { nr = atomic_sub_return_relaxed(ENTIRELY_MAPPED, mapped); @@ -1536,17 +1536,20 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, nr = 0; } } + + partially_mapped = nr < nr_pmdmapped; break; } if (nr_pmdmapped) { + /* NR_{FILE/SHMEM}_PMDMAPPED are not 
maintained per-memcg */ if (folio_test_anon(folio)) - idx = NR_ANON_THPS; - else if (folio_test_swapbacked(folio)) - idx = NR_SHMEM_PMDMAPPED; + __lruvec_stat_mod_folio(folio, NR_ANON_THPS, -nr_pmdmapped); else - idx = NR_FILE_PMDMAPPED; - __lruvec_stat_mod_folio(folio, idx, -nr_pmdmapped); + __mod_node_page_state(pgdat, + folio_test_swapbacked(folio) ? + NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED, + -nr_pmdmapped); } if (nr) { idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED; @@ -1556,10 +1559,12 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, * Queue anon large folio for deferred split if at least one * page of the folio is unmapped and at least one page * is still mapped. + * + * Check partially_mapped first to ensure it is a large folio. */ - if (folio_test_large(folio) && folio_test_anon(folio)) - if (level == RMAP_LEVEL_PTE || nr < nr_pmdmapped) - deferred_split_folio(folio); + if (folio_test_anon(folio) && partially_mapped && + list_empty(&folio->_deferred_list)) + deferred_split_folio(folio); } /* @@ -2588,7 +2593,8 @@ static void rmap_walk_anon(struct folio *folio, anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff_start, pgoff_end) { struct vm_area_struct *vma = avc->vma; - unsigned long address = vma_address(&folio->page, vma); + unsigned long address = vma_address(vma, pgoff_start, + folio_nr_pages(folio)); VM_BUG_ON_VMA(address == -EFAULT, vma); cond_resched(); @@ -2649,7 +2655,8 @@ static void rmap_walk_file(struct folio *folio, lookup: vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff_start, pgoff_end) { - unsigned long address = vma_address(&folio->page, vma); + unsigned long address = vma_address(vma, pgoff_start, + folio_nr_pages(folio)); VM_BUG_ON_VMA(address == -EFAULT, vma); cond_resched(); @@ -2702,6 +2709,7 @@ void hugetlb_add_anon_rmap(struct folio *folio, struct vm_area_struct *vma, VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); atomic_inc(&folio->_entire_mapcount); + 
atomic_inc(&folio->_large_mapcount); if (flags & RMAP_EXCLUSIVE) SetPageAnonExclusive(&folio->page); VM_WARN_ON_FOLIO(folio_entire_mapcount(folio) > 1 && @@ -2716,6 +2724,7 @@ void hugetlb_add_new_anon_rmap(struct folio *folio, BUG_ON(address < vma->vm_start || address >= vma->vm_end); /* increment count (starts at -1) */ atomic_set(&folio->_entire_mapcount, 0); + atomic_set(&folio->_large_mapcount, 0); folio_clear_hugetlb_restore_reserve(folio); __folio_set_anon(folio, vma, address, true); SetPageAnonExclusive(&folio->page); diff --git a/mm/shmem.c b/mm/shmem.c index 3248432246..c1befe046c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1918,7 +1918,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, * Some architectures may have to restore extra metadata to the * folio after reading from swap. */ - arch_swap_restore(swap, folio); + arch_swap_restore(folio_swap(swap, folio), folio); if (shmem_should_replace_folio(folio, gfp)) { error = shmem_replace_folio(&folio, gfp, info, index); @@ -2278,8 +2278,6 @@ unsigned long shmem_get_unmapped_area(struct file *file, unsigned long uaddr, unsigned long len, unsigned long pgoff, unsigned long flags) { - unsigned long (*get_area)(struct file *, - unsigned long, unsigned long, unsigned long, unsigned long); unsigned long addr; unsigned long offset; unsigned long inflated_len; @@ -2289,8 +2287,8 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (len > TASK_SIZE) return -ENOMEM; - get_area = current->mm->get_unmapped_area; - addr = get_area(file, uaddr, len, pgoff, flags); + addr = mm_get_unmapped_area(current->mm, file, uaddr, len, pgoff, + flags); if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return addr; @@ -2347,7 +2345,8 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (inflated_len < len) return addr; - inflated_addr = get_area(NULL, uaddr, inflated_len, 0, flags); + inflated_addr = mm_get_unmapped_area(current->mm, NULL, uaddr, + inflated_len, 0, flags); if 
(IS_ERR_VALUE(inflated_addr)) return addr; if (inflated_addr & ~PAGE_MASK) @@ -4811,7 +4810,7 @@ unsigned long shmem_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); + return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); } #endif diff --git a/mm/show_mem.c b/mm/show_mem.c index 8dcfafbd28..bdb439551e 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -423,4 +423,30 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) #ifdef CONFIG_MEMORY_FAILURE printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); #endif +#ifdef CONFIG_MEM_ALLOC_PROFILING + { + struct codetag_bytes tags[10]; + size_t i, nr; + + nr = alloc_tag_top_users(tags, ARRAY_SIZE(tags), false); + if (nr) { + pr_notice("Memory allocations:\n"); + for (i = 0; i < nr; i++) { + struct codetag *ct = tags[i].ct; + struct alloc_tag *tag = ct_to_alloc_tag(ct); + struct alloc_tag_counters counter = alloc_tag_read(tag); + + /* Same as alloc_tag_to_text() but w/o intermediate buffer */ + if (ct->modname) + pr_notice("%12lli %8llu %s:%u [%s] func:%s\n", + counter.bytes, counter.calls, ct->filename, + ct->lineno, ct->modname, ct->function); + else + pr_notice("%12lli %8llu %s:%u func:%s\n", + counter.bytes, counter.calls, ct->filename, + ct->lineno, ct->function); + } + } + } +#endif } @@ -84,11 +84,11 @@ struct slab { }; struct rcu_head rcu_head; }; - unsigned int __unused; + unsigned int __page_type; atomic_t __page_refcount; -#ifdef CONFIG_MEMCG - unsigned long memcg_data; +#ifdef CONFIG_SLAB_OBJ_EXT + unsigned long obj_exts; #endif }; @@ -97,8 +97,8 @@ struct slab { SLAB_MATCH(flags, __page_flags); SLAB_MATCH(compound_head, slab_cache); /* Ensure bit 0 is clear */ SLAB_MATCH(_refcount, __page_refcount); -#ifdef CONFIG_MEMCG -SLAB_MATCH(memcg_data, memcg_data); +#ifdef CONFIG_SLAB_OBJ_EXT +SLAB_MATCH(memcg_data, 
obj_exts); #endif #undef SLAB_MATCH static_assert(sizeof(struct slab) <= sizeof(struct page)); @@ -496,9 +496,6 @@ struct slabinfo { }; void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo); -void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); -ssize_t slabinfo_write(struct file *file, const char __user *buffer, - size_t count, loff_t *ppos); #ifdef CONFIG_SLUB_DEBUG #ifdef CONFIG_SLUB_DEBUG_ON @@ -536,42 +533,52 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla return false; } -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_SLAB_OBJ_EXT + /* - * slab_objcgs - get the object cgroups vector associated with a slab + * slab_obj_exts - get the pointer to the slab object extension vector + * associated with a slab. * @slab: a pointer to the slab struct * - * Returns a pointer to the object cgroups vector associated with the slab, + * Returns a pointer to the object extension vector associated with the slab, * or NULL if no such vector has been associated yet. 
*/ -static inline struct obj_cgroup **slab_objcgs(struct slab *slab) +static inline struct slabobj_ext *slab_obj_exts(struct slab *slab) { - unsigned long memcg_data = READ_ONCE(slab->memcg_data); + unsigned long obj_exts = READ_ONCE(slab->obj_exts); - VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS), +#ifdef CONFIG_MEMCG + VM_BUG_ON_PAGE(obj_exts && !(obj_exts & MEMCG_DATA_OBJEXTS), slab_page(slab)); - VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, slab_page(slab)); - - return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); + VM_BUG_ON_PAGE(obj_exts & MEMCG_DATA_KMEM, slab_page(slab)); +#endif + return (struct slabobj_ext *)(obj_exts & ~OBJEXTS_FLAGS_MASK); } -int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, - gfp_t gfp, bool new_slab); -void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, - enum node_stat_item idx, int nr); -#else /* CONFIG_MEMCG_KMEM */ -static inline struct obj_cgroup **slab_objcgs(struct slab *slab) +int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, + gfp_t gfp, bool new_slab); + +#else /* CONFIG_SLAB_OBJ_EXT */ + +static inline struct slabobj_ext *slab_obj_exts(struct slab *slab) { return NULL; } -static inline int memcg_alloc_slab_cgroups(struct slab *slab, - struct kmem_cache *s, gfp_t gfp, - bool new_slab) +#endif /* CONFIG_SLAB_OBJ_EXT */ + +static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s) { - return 0; + return (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
+ NR_SLAB_RECLAIMABLE_B : NR_SLAB_UNRECLAIMABLE_B; } -#endif /* CONFIG_MEMCG_KMEM */ + +#ifdef CONFIG_MEMCG_KMEM +bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, + gfp_t flags, size_t size, void **p); +void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, + void **p, int objects, struct slabobj_ext *obj_exts); +#endif size_t __ksize(const void *objp); diff --git a/mm/slab_common.c b/mm/slab_common.c index f5234672f0..1560a1546b 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -916,22 +916,15 @@ void __init create_kmalloc_caches(void) * Including KMALLOC_CGROUP if CONFIG_MEMCG_KMEM defined */ for (type = KMALLOC_NORMAL; type < NR_KMALLOC_TYPES; type++) { - for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { - if (!kmalloc_caches[type][i]) - new_kmalloc_cache(i, type); - - /* - * Caches that are not of the two-to-the-power-of size. - * These have to be created immediately after the - * earlier power of two caches - */ - if (KMALLOC_MIN_SIZE <= 32 && i == 6 && - !kmalloc_caches[type][1]) - new_kmalloc_cache(1, type); - if (KMALLOC_MIN_SIZE <= 64 && i == 7 && - !kmalloc_caches[type][2]) - new_kmalloc_cache(2, type); - } + /* Caches that are NOT of the two-to-the-power-of size. */ + if (KMALLOC_MIN_SIZE <= 32) + new_kmalloc_cache(1, type); + if (KMALLOC_MIN_SIZE <= 64) + new_kmalloc_cache(2, type); + + /* Caches that are of the two-to-the-power-of size. 
*/ + for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) + new_kmalloc_cache(i, type); } #ifdef CONFIG_RANDOM_KMALLOC_CACHES random_kmalloc_seed = get_random_u64(); @@ -1078,7 +1071,6 @@ static void cache_show(struct kmem_cache *s, struct seq_file *m) sinfo.limit, sinfo.batchcount, sinfo.shared); seq_printf(m, " : slabdata %6lu %6lu %6lu", sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail); - slabinfo_show_stats(m, s); seq_putc(m, '\n'); } @@ -1155,7 +1147,6 @@ static const struct proc_ops slabinfo_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = slabinfo_open, .proc_read = seq_read, - .proc_write = slabinfo_write, .proc_lseek = seq_lseek, .proc_release = seq_release, }; @@ -1189,7 +1180,7 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags) return (void *)p; } - ret = kmalloc_track_caller(new_size, flags); + ret = kmalloc_node_track_caller_noprof(new_size, flags, NUMA_NO_NODE, _RET_IP_); if (ret && p) { /* Disable KASAN checks as the object's redzone is accessed. 
*/ kasan_disable_current(); @@ -1213,7 +1204,7 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags) * * Return: pointer to the allocated memory or %NULL in case of error */ -void *krealloc(const void *p, size_t new_size, gfp_t flags) +void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags) { void *ret; @@ -1228,7 +1219,7 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags) return ret; } -EXPORT_SYMBOL(krealloc); +EXPORT_SYMBOL(krealloc_noprof); /** * kfree_sensitive - Clear sensitive information in memory before freeing @@ -624,11 +624,21 @@ static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo)); s->cpu_partial_slabs = nr_slabs; } + +static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) +{ + return s->cpu_partial_slabs; +} #else static inline void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) { } + +static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) +{ + return 0; +} #endif /* CONFIG_SLUB_CPU_PARTIAL */ /* @@ -636,18 +646,12 @@ slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) */ static __always_inline void slab_lock(struct slab *slab) { - struct page *page = slab_page(slab); - - VM_BUG_ON_PAGE(PageTail(page), page); - bit_spin_lock(PG_locked, &page->flags); + bit_spin_lock(PG_locked, &slab->__page_flags); } static __always_inline void slab_unlock(struct slab *slab) { - struct page *page = slab_page(slab); - - VM_BUG_ON_PAGE(PageTail(page), page); - bit_spin_unlock(PG_locked, &page->flags); + bit_spin_unlock(PG_locked, &slab->__page_flags); } static inline bool @@ -1865,198 +1869,279 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, #endif #endif /* CONFIG_SLUB_DEBUG */ -static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s) +#ifdef CONFIG_SLAB_OBJ_EXT + +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG + +static inline void 
mark_objexts_empty(struct slabobj_ext *obj_exts) { - return (s->flags & SLAB_RECLAIM_ACCOUNT) ? - NR_SLAB_RECLAIMABLE_B : NR_SLAB_UNRECLAIMABLE_B; + struct slabobj_ext *slab_exts; + struct slab *obj_exts_slab; + + obj_exts_slab = virt_to_slab(obj_exts); + slab_exts = slab_obj_exts(obj_exts_slab); + if (slab_exts) { + unsigned int offs = obj_to_index(obj_exts_slab->slab_cache, + obj_exts_slab, obj_exts); + /* codetag should be NULL */ + WARN_ON(slab_exts[offs].ref.ct); + set_codetag_empty(&slab_exts[offs].ref); + } } -#ifdef CONFIG_MEMCG_KMEM -static inline void memcg_free_slab_cgroups(struct slab *slab) +static inline void mark_failed_objexts_alloc(struct slab *slab) { - kfree(slab_objcgs(slab)); - slab->memcg_data = 0; + slab->obj_exts = OBJEXTS_ALLOC_FAIL; } -static inline size_t obj_full_size(struct kmem_cache *s) +static inline void handle_failed_objexts_alloc(unsigned long obj_exts, + struct slabobj_ext *vec, unsigned int objects) { /* - * For each accounted object there is an extra space which is used - * to store obj_cgroup membership. Charge it too. + * If vector previously failed to allocate then we have live + * objects with no tag reference. Mark all references in this + * vector as empty to avoid warnings later on. */ - return s->size + sizeof(struct obj_cgroup *); + if (obj_exts & OBJEXTS_ALLOC_FAIL) { + unsigned int i; + + for (i = 0; i < objects; i++) + set_codetag_empty(&vec[i].ref); + } } +#else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ + +static inline void mark_objexts_empty(struct slabobj_ext *obj_exts) {} +static inline void mark_failed_objexts_alloc(struct slab *slab) {} +static inline void handle_failed_objexts_alloc(unsigned long obj_exts, + struct slabobj_ext *vec, unsigned int objects) {} + +#endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ + /* - * Returns false if the allocation should fail. + * The allocated objcg pointers array is not accounted directly. + * Moreover, it should not come from DMA buffer and is not readily + * reclaimable. 
So those GFP bits should be masked off. */ -static bool __memcg_slab_pre_alloc_hook(struct kmem_cache *s, - struct list_lru *lru, - struct obj_cgroup **objcgp, - size_t objects, gfp_t flags) +#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \ + __GFP_ACCOUNT | __GFP_NOFAIL) + +int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, + gfp_t gfp, bool new_slab) +{ + unsigned int objects = objs_per_slab(s, slab); + unsigned long new_exts; + unsigned long old_exts; + struct slabobj_ext *vec; + + gfp &= ~OBJCGS_CLEAR_MASK; + /* Prevent recursive extension vector allocation */ + gfp |= __GFP_NO_OBJ_EXT; + vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp, + slab_nid(slab)); + if (!vec) { + /* Mark vectors which failed to allocate */ + if (new_slab) + mark_failed_objexts_alloc(slab); + + return -ENOMEM; + } + + new_exts = (unsigned long)vec; +#ifdef CONFIG_MEMCG + new_exts |= MEMCG_DATA_OBJEXTS; +#endif + old_exts = READ_ONCE(slab->obj_exts); + handle_failed_objexts_alloc(old_exts, vec, objects); + if (new_slab) { + /* + * If the slab is brand new and nobody can yet access its + * obj_exts, no synchronization is required and obj_exts can + * be simply assigned. + */ + slab->obj_exts = new_exts; + } else if ((old_exts & ~OBJEXTS_FLAGS_MASK) || + cmpxchg(&slab->obj_exts, old_exts, new_exts) != old_exts) { + /* + * If the slab is already in use, somebody can allocate and + * assign slabobj_exts in parallel. In this case the existing + * objcg vector should be reused. + */ + mark_objexts_empty(vec); + kfree(vec); + return 0; + } + + kmemleak_not_leak(vec); + return 0; +} + +static inline void free_slab_obj_exts(struct slab *slab) { + struct slabobj_ext *obj_exts; + + obj_exts = slab_obj_exts(slab); + if (!obj_exts) + return; + /* - * The obtained objcg pointer is safe to use within the current scope, - * defined by current task or set_active_memcg() pair. - * obj_cgroup_get() is used to get a permanent reference. 
+ * obj_exts was created with __GFP_NO_OBJ_EXT flag, therefore its + * corresponding extension will be NULL. alloc_tag_sub() will throw a + * warning if slab has extensions but the extension of an object is + * NULL, therefore replace NULL with CODETAG_EMPTY to indicate that + * the extension for obj_exts is expected to be NULL. */ - struct obj_cgroup *objcg = current_obj_cgroup(); - if (!objcg) + mark_objexts_empty(obj_exts); + kfree(obj_exts); + slab->obj_exts = 0; +} + +static inline bool need_slab_obj_ext(void) +{ + if (mem_alloc_profiling_enabled()) return true; - if (lru) { - int ret; - struct mem_cgroup *memcg; + /* + * CONFIG_MEMCG_KMEM creates vector of obj_cgroup objects conditionally + * inside memcg_slab_post_alloc_hook. No other users for now. + */ + return false; +} - memcg = get_mem_cgroup_from_objcg(objcg); - ret = memcg_list_lru_alloc(memcg, lru, flags); - css_put(&memcg->css); +static inline struct slabobj_ext * +prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p) +{ + struct slab *slab; - if (ret) - return false; - } + if (!p) + return NULL; - if (obj_cgroup_charge(objcg, flags, objects * obj_full_size(s))) - return false; + if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE)) + return NULL; - *objcgp = objcg; - return true; + if (flags & __GFP_NO_OBJ_EXT) + return NULL; + + slab = virt_to_slab(p); + if (!slab_obj_exts(slab) && + WARN(alloc_slab_obj_exts(slab, s, flags, false), + "%s, %s: Failed to create slab extension vector!\n", + __func__, s->name)) + return NULL; + + return slab_obj_exts(slab) + obj_to_index(s, slab, p); } -/* - * Returns false if the allocation should fail. 
- */ -static __fastpath_inline -bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, struct list_lru *lru, - struct obj_cgroup **objcgp, size_t objects, - gfp_t flags) +static inline void +alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, + int objects) { - if (!memcg_kmem_online()) - return true; +#ifdef CONFIG_MEM_ALLOC_PROFILING + struct slabobj_ext *obj_exts; + int i; - if (likely(!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT))) - return true; + if (!mem_alloc_profiling_enabled()) + return; + + obj_exts = slab_obj_exts(slab); + if (!obj_exts) + return; - return likely(__memcg_slab_pre_alloc_hook(s, lru, objcgp, objects, - flags)); + for (i = 0; i < objects; i++) { + unsigned int off = obj_to_index(s, slab, p[i]); + + alloc_tag_sub(&obj_exts[off].ref, s->size); + } +#endif } -static void __memcg_slab_post_alloc_hook(struct kmem_cache *s, - struct obj_cgroup *objcg, - gfp_t flags, size_t size, - void **p) +#else /* CONFIG_SLAB_OBJ_EXT */ + +static int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, + gfp_t gfp, bool new_slab) { - struct slab *slab; - unsigned long off; - size_t i; + return 0; +} - flags &= gfp_allowed_mask; +static inline void free_slab_obj_exts(struct slab *slab) +{ +} - for (i = 0; i < size; i++) { - if (likely(p[i])) { - slab = virt_to_slab(p[i]); +static inline bool need_slab_obj_ext(void) +{ + return false; +} - if (!slab_objcgs(slab) && - memcg_alloc_slab_cgroups(slab, s, flags, false)) { - obj_cgroup_uncharge(objcg, obj_full_size(s)); - continue; - } +static inline struct slabobj_ext * +prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p) +{ + return NULL; +} - off = obj_to_index(s, slab, p[i]); - obj_cgroup_get(objcg); - slab_objcgs(slab)[off] = objcg; - mod_objcg_state(objcg, slab_pgdat(slab), - cache_vmstat_idx(s), obj_full_size(s)); - } else { - obj_cgroup_uncharge(objcg, obj_full_size(s)); - } - } +static inline void +alloc_tagging_slab_free_hook(struct kmem_cache 
*s, struct slab *slab, void **p, + int objects) +{ } +#endif /* CONFIG_SLAB_OBJ_EXT */ + +#ifdef CONFIG_MEMCG_KMEM + +static void memcg_alloc_abort_single(struct kmem_cache *s, void *object); + static __fastpath_inline -void memcg_slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg, +bool memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, gfp_t flags, size_t size, void **p) { - if (likely(!memcg_kmem_online() || !objcg)) - return; - - return __memcg_slab_post_alloc_hook(s, objcg, flags, size, p); -} + if (likely(!memcg_kmem_online())) + return true; -static void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, - void **p, int objects, - struct obj_cgroup **objcgs) -{ - for (int i = 0; i < objects; i++) { - struct obj_cgroup *objcg; - unsigned int off; + if (likely(!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT))) + return true; - off = obj_to_index(s, slab, p[i]); - objcg = objcgs[off]; - if (!objcg) - continue; + if (likely(__memcg_slab_post_alloc_hook(s, lru, flags, size, p))) + return true; - objcgs[off] = NULL; - obj_cgroup_uncharge(objcg, obj_full_size(s)); - mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s), - -obj_full_size(s)); - obj_cgroup_put(objcg); + if (likely(size == 1)) { + memcg_alloc_abort_single(s, *p); + *p = NULL; + } else { + kmem_cache_free_bulk(s, size, p); } + + return false; } static __fastpath_inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects) { - struct obj_cgroup **objcgs; + struct slabobj_ext *obj_exts; if (!memcg_kmem_online()) return; - objcgs = slab_objcgs(slab); - if (likely(!objcgs)) + obj_exts = slab_obj_exts(slab); + if (likely(!obj_exts)) return; - __memcg_slab_free_hook(s, slab, p, objects, objcgs); -} - -static inline -void memcg_slab_alloc_error_hook(struct kmem_cache *s, int objects, - struct obj_cgroup *objcg) -{ - if (objcg) - obj_cgroup_uncharge(objcg, objects * obj_full_size(s)); + __memcg_slab_free_hook(s, 
slab, p, objects, obj_exts); } #else /* CONFIG_MEMCG_KMEM */ -static inline void memcg_free_slab_cgroups(struct slab *slab) -{ -} - -static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, - struct list_lru *lru, - struct obj_cgroup **objcgp, - size_t objects, gfp_t flags) -{ - return true; -} - -static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, - struct obj_cgroup *objcg, +static inline bool memcg_slab_post_alloc_hook(struct kmem_cache *s, + struct list_lru *lru, gfp_t flags, size_t size, void **p) { + return true; } static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects) { } - -static inline -void memcg_slab_alloc_error_hook(struct kmem_cache *s, int objects, - struct obj_cgroup *objcg) -{ -} #endif /* CONFIG_MEMCG_KMEM */ /* @@ -2111,9 +2196,9 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init) return !kasan_slab_free(s, x, init); } -static inline bool slab_free_freelist_hook(struct kmem_cache *s, - void **head, void **tail, - int *cnt) +static __fastpath_inline +bool slab_free_freelist_hook(struct kmem_cache *s, void **head, void **tail, + int *cnt) { void *object; @@ -2303,7 +2388,7 @@ static __always_inline void account_slab(struct slab *slab, int order, struct kmem_cache *s, gfp_t gfp) { if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT)) - memcg_alloc_slab_cgroups(slab, s, gfp, true); + alloc_slab_obj_exts(slab, s, gfp, true); mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), PAGE_SIZE << order); @@ -2312,8 +2397,8 @@ static __always_inline void account_slab(struct slab *slab, int order, static __always_inline void unaccount_slab(struct slab *slab, int order, struct kmem_cache *s) { - if (memcg_kmem_online()) - memcg_free_slab_cgroups(slab); + if (memcg_kmem_online() || need_slab_obj_ext()) + free_slab_obj_exts(slab); mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), -(PAGE_SIZE << order)); @@ -2609,19 +2694,18 @@ static struct slab 
*get_partial_node(struct kmem_cache *s, if (!partial) { partial = slab; stat(s, ALLOC_FROM_PARTIAL); + + if ((slub_get_cpu_partial(s) == 0)) { + break; + } } else { put_cpu_partial(s, slab, 0); stat(s, CPU_PARTIAL_NODE); - partial_slabs++; - } -#ifdef CONFIG_SLUB_CPU_PARTIAL - if (!kmem_cache_has_cpu_partial(s) - || partial_slabs > s->cpu_partial_slabs / 2) - break; -#else - break; -#endif + if (++partial_slabs > slub_get_cpu_partial(s) / 2) { + break; + } + } } spin_unlock_irqrestore(&n->list_lock, flags); return partial; @@ -2704,7 +2788,7 @@ static struct slab *get_partial(struct kmem_cache *s, int node, searchnode = numa_mem_id(); slab = get_partial_node(s, get_node(s, searchnode), pc); - if (slab || node != NUMA_NO_NODE) + if (slab || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE))) return slab; return get_any_partial(s, pc); @@ -2802,7 +2886,7 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab, struct slab new; struct slab old; - if (slab->freelist) { + if (READ_ONCE(slab->freelist)) { stat(s, DEACTIVATE_REMOTE_FREES); tail = DEACTIVATE_TO_TAIL; } @@ -3234,6 +3318,43 @@ static unsigned long count_partial(struct kmem_cache_node *n, #endif /* CONFIG_SLUB_DEBUG || SLAB_SUPPORTS_SYSFS */ #ifdef CONFIG_SLUB_DEBUG +#define MAX_PARTIAL_TO_SCAN 10000 + +static unsigned long count_partial_free_approx(struct kmem_cache_node *n) +{ + unsigned long flags; + unsigned long x = 0; + struct slab *slab; + + spin_lock_irqsave(&n->list_lock, flags); + if (n->nr_partial <= MAX_PARTIAL_TO_SCAN) { + list_for_each_entry(slab, &n->partial, slab_list) + x += slab->objects - slab->inuse; + } else { + /* + * For a long list, approximate the total count of objects in + * it to meet the limit on the number of slabs to scan. + * Scan from both the list's head and tail for better accuracy. 
+ */ + unsigned long scanned = 0; + + list_for_each_entry(slab, &n->partial, slab_list) { + x += slab->objects - slab->inuse; + if (++scanned == MAX_PARTIAL_TO_SCAN / 2) + break; + } + list_for_each_entry_reverse(slab, &n->partial, slab_list) { + x += slab->objects - slab->inuse; + if (++scanned == MAX_PARTIAL_TO_SCAN) + break; + } + x = mult_frac(x, n->nr_partial, scanned); + x = min(x, node_nr_objs(n)); + } + spin_unlock_irqrestore(&n->list_lock, flags); + return x; +} + static noinline void slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { @@ -3260,7 +3381,7 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) unsigned long nr_objs; unsigned long nr_free; - nr_free = count_partial(n, count_free); + nr_free = count_partial_free_approx(n); nr_slabs = node_nr_slabs(n); nr_objs = node_nr_objs(n); @@ -3380,6 +3501,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, struct slab *slab; unsigned long flags; struct partial_context pc; + bool try_thisnode = true; stat(s, ALLOC_SLOWPATH); @@ -3506,6 +3628,21 @@ new_slab: new_objects: pc.flags = gfpflags; + /* + * When a preferred node is indicated but no __GFP_THISNODE + * + * 1) try to get a partial slab from target node only by having + * __GFP_THISNODE in pc.flags for get_partial() + * 2) if 1) failed, try to allocate a new slab from target node with + * GPF_NOWAIT | __GFP_THISNODE opportunistically + * 3) if 2) failed, retry with original gfpflags which will allow + * get_partial() try partial lists of other nodes before potentially + * allocating new page from other nodes + */ + if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) + && try_thisnode)) + pc.flags = GFP_NOWAIT | __GFP_THISNODE; + pc.orig_size = orig_size; slab = get_partial(s, node, &pc); if (slab) { @@ -3527,10 +3664,15 @@ new_objects: } slub_put_cpu_ptr(s->cpu_slab); - slab = new_slab(s, gfpflags, node); + slab = new_slab(s, pc.flags, node); c = slub_get_cpu_ptr(s->cpu_slab); if 
(unlikely(!slab)) { + if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) + && try_thisnode) { + try_thisnode = false; + goto new_objects; + } slab_out_of_memory(s, gfpflags, node); return NULL; } @@ -3742,10 +3884,7 @@ noinline int should_failslab(struct kmem_cache *s, gfp_t gfpflags) ALLOW_ERROR_INJECTION(should_failslab, ERRNO); static __fastpath_inline -struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, - struct list_lru *lru, - struct obj_cgroup **objcgp, - size_t size, gfp_t flags) +struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) { flags &= gfp_allowed_mask; @@ -3754,14 +3893,11 @@ struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, if (unlikely(should_failslab(s, flags))) return NULL; - if (unlikely(!memcg_slab_pre_alloc_hook(s, lru, objcgp, size, flags))) - return NULL; - return s; } static __fastpath_inline -void slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg, +bool slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, gfp_t flags, size_t size, void **p, bool init, unsigned int orig_size) { @@ -3808,9 +3944,23 @@ void slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg, kmemleak_alloc_recursive(p[i], s->object_size, 1, s->flags, init_flags); kmsan_slab_alloc(s, p[i], init_flags); +#ifdef CONFIG_MEM_ALLOC_PROFILING + if (need_slab_obj_ext()) { + struct slabobj_ext *obj_exts; + + obj_exts = prepare_slab_obj_exts_hook(s, flags, p[i]); + /* + * Currently obj_exts is used only for allocation profiling. + * If other users appear then mem_alloc_profiling_enabled() + * check should be added before alloc_tag_add(). 
+ */ + if (likely(obj_exts)) + alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size); + } +#endif } - memcg_slab_post_alloc_hook(s, objcg, flags, size, p); + return memcg_slab_post_alloc_hook(s, lru, flags, size, p); } /* @@ -3827,10 +3977,9 @@ static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) { void *object; - struct obj_cgroup *objcg = NULL; bool init = false; - s = slab_pre_alloc_hook(s, lru, &objcg, 1, gfpflags); + s = slab_pre_alloc_hook(s, gfpflags); if (unlikely(!s)) return NULL; @@ -3847,13 +3996,15 @@ out: /* * When init equals 'true', like for kzalloc() family, only * @orig_size bytes might be zeroed instead of s->object_size + * In case this fails due to memcg_slab_post_alloc_hook(), + * object is set to NULL */ - slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init, orig_size); + slab_post_alloc_hook(s, lru, gfpflags, 1, &object, init, orig_size); return object; } -void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) +void *kmem_cache_alloc_noprof(struct kmem_cache *s, gfp_t gfpflags) { void *ret = slab_alloc_node(s, NULL, gfpflags, NUMA_NO_NODE, _RET_IP_, s->object_size); @@ -3862,9 +4013,9 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) return ret; } -EXPORT_SYMBOL(kmem_cache_alloc); +EXPORT_SYMBOL(kmem_cache_alloc_noprof); -void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, +void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru, gfp_t gfpflags) { void *ret = slab_alloc_node(s, lru, gfpflags, NUMA_NO_NODE, _RET_IP_, @@ -3874,7 +4025,7 @@ void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, return ret; } -EXPORT_SYMBOL(kmem_cache_alloc_lru); +EXPORT_SYMBOL(kmem_cache_alloc_lru_noprof); /** * kmem_cache_alloc_node - Allocate an object on the specified node @@ -3889,7 +4040,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_lru); * * Return: pointer to the new object or %NULL in 
case of error */ -void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) +void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int node) { void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size); @@ -3897,7 +4048,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) return ret; } -EXPORT_SYMBOL(kmem_cache_alloc_node); +EXPORT_SYMBOL(kmem_cache_alloc_node_noprof); /* * To avoid unnecessary overhead, we pass through large allocation requests @@ -3914,7 +4065,7 @@ static void *__kmalloc_large_node(size_t size, gfp_t flags, int node) flags = kmalloc_fix_flags(flags); flags |= __GFP_COMP; - folio = (struct folio *)alloc_pages_node(node, flags, order); + folio = (struct folio *)alloc_pages_node_noprof(node, flags, order); if (folio) { ptr = folio_address(folio); lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, @@ -3929,7 +4080,7 @@ static void *__kmalloc_large_node(size_t size, gfp_t flags, int node) return ptr; } -void *kmalloc_large(size_t size, gfp_t flags) +void *kmalloc_large_noprof(size_t size, gfp_t flags) { void *ret = __kmalloc_large_node(size, flags, NUMA_NO_NODE); @@ -3937,9 +4088,9 @@ void *kmalloc_large(size_t size, gfp_t flags) flags, NUMA_NO_NODE); return ret; } -EXPORT_SYMBOL(kmalloc_large); +EXPORT_SYMBOL(kmalloc_large_noprof); -void *kmalloc_large_node(size_t size, gfp_t flags, int node) +void *kmalloc_large_node_noprof(size_t size, gfp_t flags, int node) { void *ret = __kmalloc_large_node(size, flags, node); @@ -3947,7 +4098,7 @@ void *kmalloc_large_node(size_t size, gfp_t flags, int node) flags, node); return ret; } -EXPORT_SYMBOL(kmalloc_large_node); +EXPORT_SYMBOL(kmalloc_large_node_noprof); static __always_inline void *__do_kmalloc_node(size_t size, gfp_t flags, int node, @@ -3974,26 +4125,26 @@ void *__do_kmalloc_node(size_t size, gfp_t flags, int node, return ret; } -void *__kmalloc_node(size_t size, gfp_t flags, int node) +void 
*__kmalloc_node_noprof(size_t size, gfp_t flags, int node) { return __do_kmalloc_node(size, flags, node, _RET_IP_); } -EXPORT_SYMBOL(__kmalloc_node); +EXPORT_SYMBOL(__kmalloc_node_noprof); -void *__kmalloc(size_t size, gfp_t flags) +void *__kmalloc_noprof(size_t size, gfp_t flags) { return __do_kmalloc_node(size, flags, NUMA_NO_NODE, _RET_IP_); } -EXPORT_SYMBOL(__kmalloc); +EXPORT_SYMBOL(__kmalloc_noprof); -void *__kmalloc_node_track_caller(size_t size, gfp_t flags, - int node, unsigned long caller) +void *kmalloc_node_track_caller_noprof(size_t size, gfp_t flags, + int node, unsigned long caller) { return __do_kmalloc_node(size, flags, node, caller); } -EXPORT_SYMBOL(__kmalloc_node_track_caller); +EXPORT_SYMBOL(kmalloc_node_track_caller_noprof); -void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) +void *kmalloc_trace_noprof(struct kmem_cache *s, gfp_t gfpflags, size_t size) { void *ret = slab_alloc_node(s, NULL, gfpflags, NUMA_NO_NODE, _RET_IP_, size); @@ -4003,9 +4154,9 @@ void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) ret = kasan_kmalloc(s, ret, size, gfpflags); return ret; } -EXPORT_SYMBOL(kmalloc_trace); +EXPORT_SYMBOL(kmalloc_trace_noprof); -void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, +void *kmalloc_node_trace_noprof(struct kmem_cache *s, gfp_t gfpflags, int node, size_t size) { void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, size); @@ -4015,7 +4166,7 @@ void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, ret = kasan_kmalloc(s, ret, size, gfpflags); return ret; } -EXPORT_SYMBOL(kmalloc_node_trace); +EXPORT_SYMBOL(kmalloc_node_trace_noprof); static noinline void free_to_partial_list( struct kmem_cache *s, struct slab *slab, @@ -4232,7 +4383,7 @@ redo: c = raw_cpu_ptr(s->cpu_slab); tid = READ_ONCE(c->tid); - /* Same with comment on barrier() in slab_alloc_node() */ + /* Same with comment on barrier() in __slab_alloc_node() */ barrier(); if (unlikely(slab != c->slab)) { @@ 
-4282,16 +4433,28 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object, unsigned long addr) { memcg_slab_free_hook(s, slab, &object, 1); + alloc_tagging_slab_free_hook(s, slab, &object, 1); if (likely(slab_free_hook(s, object, slab_want_init_on_free(s)))) do_slab_free(s, slab, object, object, 1, addr); } +#ifdef CONFIG_MEMCG_KMEM +/* Do not inline the rare memcg charging failed path into the allocation path */ +static noinline +void memcg_alloc_abort_single(struct kmem_cache *s, void *object) +{ + if (likely(slab_free_hook(s, object, slab_want_init_on_free(s)))) + do_slab_free(s, virt_to_slab(object), object, object, 1, _RET_IP_); +} +#endif + static __fastpath_inline void slab_free_bulk(struct kmem_cache *s, struct slab *slab, void *head, void *tail, void **p, int cnt, unsigned long addr) { memcg_slab_free_hook(s, slab, p, cnt); + alloc_tagging_slab_free_hook(s, slab, p, cnt); /* * With KASAN enabled slab_free_freelist_hook modifies the freelist * to remove objects, whose reuse must be delayed. @@ -4618,36 +4781,33 @@ error: #endif /* CONFIG_SLUB_TINY */ /* Note that interrupts must be enabled when calling this function. */ -int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - void **p) +int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) { int i; - struct obj_cgroup *objcg = NULL; if (!size) return 0; - /* memcg and kmem_cache debug support */ - s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags); + s = slab_pre_alloc_hook(s, flags); if (unlikely(!s)) return 0; i = __kmem_cache_alloc_bulk(s, flags, size, p); + if (unlikely(i == 0)) + return 0; /* * memcg and kmem_cache debug support and memory initialization. * Done outside of the IRQ disabled fastpath loop. 
*/ - if (likely(i != 0)) { - slab_post_alloc_hook(s, objcg, flags, size, p, - slab_want_init_on_alloc(flags, s), s->object_size); - } else { - memcg_slab_alloc_error_hook(s, size, objcg); + if (unlikely(!slab_post_alloc_hook(s, NULL, flags, size, p, + slab_want_init_on_alloc(flags, s), s->object_size))) { + return 0; } - return i; } -EXPORT_SYMBOL(kmem_cache_alloc_bulk); +EXPORT_SYMBOL(kmem_cache_alloc_bulk_noprof); /* @@ -4853,7 +5013,6 @@ static void early_kmem_cache_node_alloc(int node) BUG_ON(!n); #ifdef CONFIG_SLUB_DEBUG init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); - init_tracking(kmem_cache_node, n); #endif n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false); slab->freelist = get_freepointer(kmem_cache_node, n); @@ -5066,9 +5225,7 @@ static int calculate_sizes(struct kmem_cache *s) if ((int)order < 0) return 0; - s->allocflags = 0; - if (order) - s->allocflags |= __GFP_COMP; + s->allocflags = __GFP_COMP; if (s->flags & SLAB_CACHE_DMA) s->allocflags |= GFP_DMA; @@ -5636,7 +5793,8 @@ void __init kmem_cache_init(void) node_set(node, slab_nodes); create_boot_cache(kmem_cache_node, "kmem_cache_node", - sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0); + sizeof(struct kmem_cache_node), + SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0); hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); @@ -5646,7 +5804,7 @@ void __init kmem_cache_init(void) create_boot_cache(kmem_cache, "kmem_cache", offsetof(struct kmem_cache, node) + nr_node_ids * sizeof(struct kmem_cache_node *), - SLAB_HWCACHE_ALIGN, 0, 0); + SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0); kmem_cache = bootstrap(&boot_kmem_cache); kmem_cache_node = bootstrap(&boot_kmem_cache_node); @@ -6042,7 +6200,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, else if (flags & SO_OBJECTS) WARN_ON_ONCE(1); else - x = slab->slabs; + x = data_race(slab->slabs); total += x; nodes[node] += x; } @@ -6247,7 +6405,7 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) 
slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); if (slab) - slabs += slab->slabs; + slabs += data_race(slab->slabs); } #endif @@ -6261,7 +6419,7 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); if (slab) { - slabs = READ_ONCE(slab->slabs); + slabs = data_race(slab->slabs); objects = (slabs * oo_objects(s->oo)) / 2; len += sysfs_emit_at(buf, len, " C%d=%d(%d)", cpu, objects, slabs); @@ -7095,7 +7253,7 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) for_each_kmem_cache_node(s, node, n) { nr_slabs += node_nr_slabs(n); nr_objs += node_nr_objs(n); - nr_free += count_partial(n, count_free); + nr_free += count_partial_free_approx(n); } sinfo->active_objs = nr_objs - nr_free; @@ -7105,14 +7263,4 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) sinfo->objects_per_slab = oo_objects(s->oo); sinfo->cache_order = oo_order(s->oo); } - -void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s) -{ -} - -ssize_t slabinfo_write(struct file *file, const char __user *buffer, - size_t count, loff_t *ppos) -{ - return -EIO; -} #endif /* CONFIG_SLUB_DEBUG */ diff --git a/mm/sparse.c b/mm/sparse.c index aed0951b87..de40b2c734 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -226,19 +226,6 @@ static void __init memory_present(int nid, unsigned long start, unsigned long en { unsigned long pfn; -#ifdef CONFIG_SPARSEMEM_EXTREME - if (unlikely(!mem_section)) { - unsigned long size, align; - - size = sizeof(struct mem_section *) * NR_SECTION_ROOTS; - align = 1 << (INTERNODE_CACHE_SHIFT); - mem_section = memblock_alloc(size, align); - if (!mem_section) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, size, align); - } -#endif - start &= PAGE_SECTION_MASK; mminit_validate_memmodel_limits(&start, &end); for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { @@ -267,6 +254,19 @@ static void __init memblocks_present(void) unsigned long 
start, end; int i, nid; +#ifdef CONFIG_SPARSEMEM_EXTREME + if (unlikely(!mem_section)) { + unsigned long size, align; + + size = sizeof(struct mem_section *) * NR_SECTION_ROOTS; + align = 1 << (INTERNODE_CACHE_SHIFT); + mem_section = memblock_alloc(size, align); + if (!mem_section) + panic("%s: Failed to allocate %lu bytes align=0x%lx\n", + __func__, size, align); + } +#endif + for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) memory_present(nid, start, end); } @@ -560,6 +560,8 @@ void __init sparse_init(void) unsigned long pnum_end, pnum_begin, map_count = 1; int nid_begin; + /* see include/linux/mmzone.h 'struct mem_section' definition */ + BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section))); memblocks_present(); pnum_begin = first_present_section_nr(); @@ -112,34 +112,21 @@ static void page_cache_release(struct folio *folio) unlock_page_lruvec_irqrestore(lruvec, flags); } -static void __folio_put_small(struct folio *folio) +void __folio_put(struct folio *folio) { + if (unlikely(folio_is_zone_device(folio))) { + free_zone_device_folio(folio); + return; + } else if (folio_test_hugetlb(folio)) { + free_huge_folio(folio); + return; + } + page_cache_release(folio); + if (folio_test_large(folio) && folio_test_large_rmappable(folio)) + folio_undo_large_rmappable(folio); mem_cgroup_uncharge(folio); - free_unref_page(&folio->page, 0); -} - -static void __folio_put_large(struct folio *folio) -{ - /* - * __page_cache_release() is supposed to be called for thp, not for - * hugetlb. This is because hugetlb page does never have PageLRU set - * (it's never listed to any LRU lists) and no memcg routines should - * be called for hugetlb (it has a separate hugetlb_cgroup.) 
- */ - if (!folio_test_hugetlb(folio)) - page_cache_release(folio); - destroy_large_folio(folio); -} - -void __folio_put(struct folio *folio) -{ - if (unlikely(folio_is_zone_device(folio))) - free_zone_device_page(&folio->page); - else if (unlikely(folio_test_large(folio))) - __folio_put_large(folio); - else - __folio_put_small(folio); + free_unref_page(&folio->page, folio_order(folio)); } EXPORT_SYMBOL(__folio_put); @@ -158,8 +145,8 @@ void put_pages_list(struct list_head *pages) list_for_each_entry_safe(folio, next, pages, lru) { if (!folio_put_testzero(folio)) continue; - if (folio_test_large(folio)) { - __folio_put_large(folio); + if (folio_test_hugetlb(folio)) { + free_huge_folio(folio); continue; } /* LRU flag must be clear because it's passed using the lru */ @@ -460,15 +447,18 @@ static void folio_inc_refs(struct folio *folio) } #endif /* CONFIG_LRU_GEN */ -/* - * Mark a page as having seen activity. +/** + * folio_mark_accessed - Mark a folio as having seen activity. + * @folio: The folio to mark. + * + * This function will perform one of the following transitions: * - * inactive,unreferenced -> inactive,referenced - * inactive,referenced -> active,unreferenced - * active,unreferenced -> active,referenced + * * inactive,unreferenced -> inactive,referenced + * * inactive,referenced -> active,unreferenced + * * active,unreferenced -> active,referenced * - * When a newly allocated page is not yet visible, so safe for non-atomic ops, - * __SetPageReferenced(page) may be substituted for mark_page_accessed(page). + * When a newly allocated folio is not yet visible, so safe for non-atomic ops, + * __folio_set_referenced() may be substituted for folio_mark_accessed(). */ void folio_mark_accessed(struct folio *folio) { @@ -985,7 +975,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) struct folio *folio = folios->folios[i]; unsigned int nr_refs = refs ? 
refs[i] : 1; - if (is_huge_zero_page(&folio->page)) + if (is_huge_zero_folio(folio)) continue; if (folio_is_zone_device(folio)) { @@ -993,10 +983,10 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } - if (put_devmap_managed_page_refs(&folio->page, nr_refs)) + if (put_devmap_managed_folio_refs(folio, nr_refs)) continue; if (folio_ref_sub_and_test(folio, nr_refs)) - free_zone_device_page(&folio->page); + free_zone_device_folio(folio); continue; } diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 90973ce788..13ab3b7714 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -264,7 +264,7 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache) cache->cur = 0; if (swap_slot_cache_active) cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, - cache->slots, 1); + cache->slots, 0); return cache->nr; } @@ -310,8 +310,8 @@ swp_entry_t folio_alloc_swap(struct folio *folio) entry.val = 0; if (folio_test_large(folio)) { - if (IS_ENABLED(CONFIG_THP_SWAP) && arch_thp_swp_supported()) - get_swap_pages(1, &entry, folio_nr_pages(folio)); + if (IS_ENABLED(CONFIG_THP_SWAP)) + get_swap_pages(1, &entry, folio_order(folio)); goto out; } @@ -343,7 +343,7 @@ repeat: goto out; } - get_swap_pages(1, &entry, 1); + get_swap_pages(1, &entry, 0); out: if (mem_cgroup_try_charge_swap(folio, entry)) { put_swap_folio(folio, entry); diff --git a/mm/swap_state.c b/mm/swap_state.c index bfc7e8c58a..642c30d837 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -73,11 +73,11 @@ void *get_shadow_from_swap_cache(swp_entry_t entry) { struct address_space *address_space = swap_address_space(entry); pgoff_t idx = swp_offset(entry); - struct page *page; + void *shadow; - page = xa_load(&address_space->i_pages, idx); - if (xa_is_value(page)) - return page; + shadow = xa_load(&address_space->i_pages, idx); + if (xa_is_value(shadow)) + return shadow; return NULL; } @@ -301,7 +301,7 @@ void 
free_page_and_swap_cache(struct page *page) struct folio *folio = page_folio(page); free_swap_cache(folio); - if (!is_huge_zero_page(page)) + if (!is_huge_zero_folio(folio)) folio_put(folio); } diff --git a/mm/swapfile.c b/mm/swapfile.c index 4919423cce..b3e5e384e3 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -130,7 +130,11 @@ static inline unsigned char swap_count(unsigned char ent) /* Reclaim the swap entry if swap is getting full*/ #define TTRS_FULL 0x4 -/* returns 1 if swap entry is freed */ +/* + * returns number of pages in the folio that backs the swap entry. If positive, + * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no + * folio was associated with the swap entry. + */ static int __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset, unsigned long flags) { @@ -155,6 +159,7 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, ret = folio_free_swap(folio); folio_unlock(folio); } + ret = ret ? folio_nr_pages(folio) : -folio_nr_pages(folio); folio_put(folio); return ret; } @@ -273,15 +278,15 @@ static void discard_swap_cluster(struct swap_info_struct *si, #ifdef CONFIG_THP_SWAP #define SWAPFILE_CLUSTER HPAGE_PMD_NR -#define swap_entry_size(size) (size) +#define swap_entry_order(order) (order) #else #define SWAPFILE_CLUSTER 256 /* - * Define swap_entry_size() as constant to let compiler to optimize + * Define swap_entry_order() as constant to let compiler to optimize * out some code if !CONFIG_THP_SWAP */ -#define swap_entry_size(size) 1 +#define swap_entry_order(order) 0 #endif #define LATENCY_LIMIT 256 @@ -343,18 +348,6 @@ static inline void cluster_set_null(struct swap_cluster_info *info) info->data = 0; } -static inline bool cluster_is_huge(struct swap_cluster_info *info) -{ - if (IS_ENABLED(CONFIG_THP_SWAP)) - return info->flags & CLUSTER_FLAG_HUGE; - return false; -} - -static inline void cluster_clear_huge(struct swap_cluster_info *info) -{ - info->flags &= ~CLUSTER_FLAG_HUGE; -} - static 
inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, unsigned long offset) { @@ -558,10 +551,12 @@ static void free_cluster(struct swap_info_struct *si, unsigned long idx) /* * The cluster corresponding to page_nr will be used. The cluster will be - * removed from free cluster list and its usage counter will be increased. + * removed from free cluster list and its usage counter will be increased by + * count. */ -static void inc_cluster_info_page(struct swap_info_struct *p, - struct swap_cluster_info *cluster_info, unsigned long page_nr) +static void add_cluster_info_page(struct swap_info_struct *p, + struct swap_cluster_info *cluster_info, unsigned long page_nr, + unsigned long count) { unsigned long idx = page_nr / SWAPFILE_CLUSTER; @@ -570,9 +565,19 @@ static void inc_cluster_info_page(struct swap_info_struct *p, if (cluster_is_free(&cluster_info[idx])) alloc_cluster(p, idx); - VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER); + VM_BUG_ON(cluster_count(&cluster_info[idx]) + count > SWAPFILE_CLUSTER); cluster_set_count(&cluster_info[idx], - cluster_count(&cluster_info[idx]) + 1); + cluster_count(&cluster_info[idx]) + count); +} + +/* + * The cluster corresponding to page_nr will be used. The cluster will be + * removed from free cluster list and its usage counter will be increased by 1. 
+ */ +static void inc_cluster_info_page(struct swap_info_struct *p, + struct swap_cluster_info *cluster_info, unsigned long page_nr) +{ + add_cluster_info_page(p, cluster_info, page_nr, 1); } /* @@ -602,7 +607,7 @@ static void dec_cluster_info_page(struct swap_info_struct *p, */ static bool scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, - unsigned long offset) + unsigned long offset, int order) { struct percpu_cluster *percpu_cluster; bool conflict; @@ -616,27 +621,42 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, return false; percpu_cluster = this_cpu_ptr(si->percpu_cluster); - cluster_set_null(&percpu_cluster->index); + percpu_cluster->next[order] = SWAP_NEXT_INVALID; + return true; +} + +static inline bool swap_range_empty(char *swap_map, unsigned int start, + unsigned int nr_pages) +{ + unsigned int i; + + for (i = 0; i < nr_pages; i++) { + if (swap_map[start + i]) + return false; + } + return true; } /* - * Try to get a swap entry from current cpu's swap entry pool (a cluster). This - * might involve allocating a new cluster for current CPU too. + * Try to get swap entries with specified order from current cpu's swap entry + * pool (a cluster). This might involve allocating a new cluster for current CPU + * too. 
*/ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, - unsigned long *offset, unsigned long *scan_base) + unsigned long *offset, unsigned long *scan_base, int order) { + unsigned int nr_pages = 1 << order; struct percpu_cluster *cluster; struct swap_cluster_info *ci; - unsigned long tmp, max; + unsigned int tmp, max; new_cluster: cluster = this_cpu_ptr(si->percpu_cluster); - if (cluster_is_null(&cluster->index)) { + tmp = cluster->next[order]; + if (tmp == SWAP_NEXT_INVALID) { if (!cluster_list_empty(&si->free_clusters)) { - cluster->index = si->free_clusters.head; - cluster->next = cluster_next(&cluster->index) * + tmp = cluster_next(&si->free_clusters.head) * SWAPFILE_CLUSTER; } else if (!cluster_list_empty(&si->discard_clusters)) { /* @@ -654,27 +674,27 @@ new_cluster: /* * Other CPUs can use our cluster if they can't find a free cluster, - * check if there is still free entry in the cluster + * check if there is still free entry in the cluster, maintaining + * natural alignment. */ - tmp = cluster->next; - max = min_t(unsigned long, si->max, - (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER); + max = min_t(unsigned long, si->max, ALIGN(tmp + 1, SWAPFILE_CLUSTER)); if (tmp < max) { ci = lock_cluster(si, tmp); while (tmp < max) { - if (!si->swap_map[tmp]) + if (swap_range_empty(si->swap_map, tmp, nr_pages)) break; - tmp++; + tmp += nr_pages; } unlock_cluster(ci); } if (tmp >= max) { - cluster_set_null(&cluster->index); + cluster->next[order] = SWAP_NEXT_INVALID; goto new_cluster; } - cluster->next = tmp + 1; *offset = tmp; *scan_base = tmp; + tmp += nr_pages; + cluster->next[order] = tmp < max ? 
tmp : SWAP_NEXT_INVALID; return true; } @@ -804,13 +824,14 @@ static bool swap_offset_available_and_locked(struct swap_info_struct *si, static int scan_swap_map_slots(struct swap_info_struct *si, unsigned char usage, int nr, - swp_entry_t slots[]) + swp_entry_t slots[], int order) { struct swap_cluster_info *ci; unsigned long offset; unsigned long scan_base; unsigned long last_in_cluster = 0; int latency_ration = LATENCY_LIMIT; + unsigned int nr_pages = 1 << order; int n_ret = 0; bool scanned_many = false; @@ -825,6 +846,25 @@ static int scan_swap_map_slots(struct swap_info_struct *si, * And we let swap pages go all over an SSD partition. Hugh */ + if (order > 0) { + /* + * Should not even be attempting large allocations when huge + * page swap is disabled. Warn and fail the allocation. + */ + if (!IS_ENABLED(CONFIG_THP_SWAP) || + nr_pages > SWAPFILE_CLUSTER) { + VM_WARN_ON_ONCE(1); + return 0; + } + + /* + * Swapfile is not block device or not using clusters so unable + * to allocate large entries. 
+ */ + if (!(si->flags & SWP_BLKDEV) || !si->cluster_info) + return 0; + } + si->flags += SWP_SCANNING; /* * Use percpu scan base for SSD to reduce lock contention on @@ -839,8 +879,11 @@ static int scan_swap_map_slots(struct swap_info_struct *si, /* SSD algorithm */ if (si->cluster_info) { - if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) + if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) { + if (order > 0) + goto no_page; goto scan; + } } else if (unlikely(!si->cluster_nr--)) { if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { si->cluster_nr = SWAPFILE_CLUSTER - 1; @@ -859,7 +902,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si, last_in_cluster = offset + SWAPFILE_CLUSTER - 1; /* Locate the first empty (unaligned) cluster */ - for (; last_in_cluster <= si->highest_bit; offset++) { + for (; last_in_cluster <= READ_ONCE(si->highest_bit); offset++) { if (si->swap_map[offset]) last_in_cluster = offset + SWAPFILE_CLUSTER; else if (offset == last_in_cluster) { @@ -882,13 +925,16 @@ static int scan_swap_map_slots(struct swap_info_struct *si, checks: if (si->cluster_info) { - while (scan_swap_map_ssd_cluster_conflict(si, offset)) { + while (scan_swap_map_ssd_cluster_conflict(si, offset, order)) { /* take a break if we already got some slots */ if (n_ret) goto done; if (!scan_swap_map_try_ssd_cluster(si, &offset, - &scan_base)) + &scan_base, order)) { + if (order > 0) + goto no_page; goto scan; + } } } if (!(si->flags & SWP_WRITEOK)) @@ -907,7 +953,7 @@ checks: swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); spin_lock(&si->lock); /* entry was freed successfully, try to use this again */ - if (swap_was_freed) + if (swap_was_freed > 0) goto checks; goto scan; /* check next one */ } @@ -919,11 +965,11 @@ checks: else goto done; } - WRITE_ONCE(si->swap_map[offset], usage); - inc_cluster_info_page(si, si->cluster_info, offset); + memset(si->swap_map + offset, usage, nr_pages); + add_cluster_info_page(si, 
si->cluster_info, offset, nr_pages); unlock_cluster(ci); - swap_range_alloc(si, offset, 1); + swap_range_alloc(si, offset, nr_pages); slots[n_ret++] = swp_entry(si->type, offset); /* got enough slots or reach max slots? */ @@ -944,8 +990,10 @@ checks: /* try to get more slots in cluster */ if (si->cluster_info) { - if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) + if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) goto checks; + if (order > 0) + goto done; } else if (si->cluster_nr && !si->swap_map[++offset]) { /* non-ssd case, still more slots in cluster? */ --si->cluster_nr; @@ -972,11 +1020,13 @@ checks: } done: - set_cluster_next(si, offset + 1); + if (order == 0) + set_cluster_next(si, offset + 1); si->flags -= SWP_SCANNING; return n_ret; scan: + VM_WARN_ON(order > 0); spin_unlock(&si->lock); while (++offset <= READ_ONCE(si->highest_bit)) { if (unlikely(--latency_ration < 0)) { @@ -1005,38 +1055,6 @@ no_page: return n_ret; } -static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) -{ - unsigned long idx; - struct swap_cluster_info *ci; - unsigned long offset; - - /* - * Should not even be attempting cluster allocations when huge - * page swap is disabled. Warn and fail the allocation. 
- */ - if (!IS_ENABLED(CONFIG_THP_SWAP)) { - VM_WARN_ON_ONCE(1); - return 0; - } - - if (cluster_list_empty(&si->free_clusters)) - return 0; - - idx = cluster_list_first(&si->free_clusters); - offset = idx * SWAPFILE_CLUSTER; - ci = lock_cluster(si, offset); - alloc_cluster(si, idx); - cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE); - - memset(si->swap_map + offset, SWAP_HAS_CACHE, SWAPFILE_CLUSTER); - unlock_cluster(ci); - swap_range_alloc(si, offset, SWAPFILE_CLUSTER); - *slot = swp_entry(si->type, offset); - - return 1; -} - static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) { unsigned long offset = idx * SWAPFILE_CLUSTER; @@ -1050,17 +1068,15 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) swap_range_free(si, offset, SWAPFILE_CLUSTER); } -int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size) +int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order) { - unsigned long size = swap_entry_size(entry_size); + int order = swap_entry_order(entry_order); + unsigned long size = 1 << order; struct swap_info_struct *si, *next; long avail_pgs; int n_ret = 0; int node; - /* Only single cluster request supported */ - WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER); - spin_lock(&swap_avail_lock); avail_pgs = atomic_long_read(&nr_swap_pages) / size; @@ -1096,14 +1112,10 @@ start_over: spin_unlock(&si->lock); goto nextsi; } - if (size == SWAPFILE_CLUSTER) { - if (si->flags & SWP_BLKDEV) - n_ret = swap_alloc_cluster(si, swp_entries); - } else - n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, - n_goal, swp_entries); + n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, + n_goal, swp_entries, order); spin_unlock(&si->lock); - if (n_ret || size == SWAPFILE_CLUSTER) + if (n_ret || size > 1) goto check_out; cond_resched(); @@ -1226,16 +1238,15 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p, /* * When we get a swap entry, if there aren't some 
other ways to - * prevent swapoff, such as the folio in swap cache is locked, page - * table lock is held, etc., the swap entry may become invalid because - * of swapoff. Then, we need to enclose all swap related functions - * with get_swap_device() and put_swap_device(), unless the swap - * functions call get/put_swap_device() by themselves. + * prevent swapoff, such as the folio in swap cache is locked, RCU + * reader side is locked, etc., the swap entry may become invalid + * because of swapoff. Then, we need to enclose all swap related + * functions with get_swap_device() and put_swap_device(), unless the + * swap functions call get/put_swap_device() by themselves. * - * Note that when only holding the PTL, swapoff might succeed immediately - * after freeing a swap entry. Therefore, immediately after - * __swap_entry_free(), the swap info might become stale and should not - * be touched without a prior get_swap_device(). + * RCU reader side lock (including any spinlock) is sufficient to + * prevent swapoff, because synchronize_rcu() is called in swapoff() + * before freeing data structures. * * Check whether swap entry is valid in the swap device. 
If so, * return pointer to swap_info_struct, and keep the swap entry valid @@ -1357,7 +1368,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) unsigned char *map; unsigned int i, free_entries = 0; unsigned char val; - int size = swap_entry_size(folio_nr_pages(folio)); + int size = 1 << swap_entry_order(folio_order(folio)); si = _swap_info_get(entry); if (!si) @@ -1365,7 +1376,6 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) ci = lock_cluster_or_swap_info(si, offset); if (size == SWAPFILE_CLUSTER) { - VM_BUG_ON(!cluster_is_huge(ci)); map = si->swap_map + offset; for (i = 0; i < SWAPFILE_CLUSTER; i++) { val = map[i]; @@ -1373,7 +1383,6 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) if (val == SWAP_HAS_CACHE) free_entries++; } - cluster_clear_huge(ci); if (free_entries == SWAPFILE_CLUSTER) { unlock_cluster_or_swap_info(si, ci); spin_lock(&si->lock); @@ -1395,23 +1404,6 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) unlock_cluster_or_swap_info(si, ci); } -#ifdef CONFIG_THP_SWAP -int split_swap_cluster(swp_entry_t entry) -{ - struct swap_info_struct *si; - struct swap_cluster_info *ci; - unsigned long offset = swp_offset(entry); - - si = _swap_info_get(entry); - if (!si) - return -EBUSY; - ci = lock_cluster(si, offset); - cluster_clear_huge(ci); - unlock_cluster(ci); - return 0; -} -#endif - static int swp_entry_cmp(const void *ent1, const void *ent2) { const swp_entry_t *e1 = ent1, *e2 = ent2; @@ -1519,22 +1511,23 @@ out: } static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, - swp_entry_t entry) + swp_entry_t entry, int order) { struct swap_cluster_info *ci; unsigned char *map = si->swap_map; + unsigned int nr_pages = 1 << order; unsigned long roffset = swp_offset(entry); - unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER); + unsigned long offset = round_down(roffset, nr_pages); int i; bool ret = false; ci = lock_cluster_or_swap_info(si, offset); - if (!ci || 
!cluster_is_huge(ci)) { + if (!ci || nr_pages == 1) { if (swap_count(map[roffset])) ret = true; goto unlock_out; } - for (i = 0; i < SWAPFILE_CLUSTER; i++) { + for (i = 0; i < nr_pages; i++) { if (swap_count(map[offset + i])) { ret = true; break; @@ -1556,7 +1549,7 @@ static bool folio_swapped(struct folio *folio) if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio))) return swap_swapcount(si, entry) != 0; - return swap_page_trans_huge_swapped(si, entry); + return swap_page_trans_huge_swapped(si, entry, folio_order(folio)); } /** @@ -1602,33 +1595,88 @@ bool folio_free_swap(struct folio *folio) return true; } -/* - * Free the swap entry like above, but also try to - * free the page cache entry if it is the last user. +/** + * free_swap_and_cache_nr() - Release reference on range of swap entries and + * reclaim their cache if no more references remain. + * @entry: First entry of range. + * @nr: Number of entries in range. + * + * For each swap entry in the contiguous range, release a reference. If any swap + * entries become free, try to reclaim their underlying folios, if present. The + * offset range is defined by [entry.offset, entry.offset + nr). */ -int free_swap_and_cache(swp_entry_t entry) +void free_swap_and_cache_nr(swp_entry_t entry, int nr) { - struct swap_info_struct *p; + const unsigned long start_offset = swp_offset(entry); + const unsigned long end_offset = start_offset + nr; + unsigned int type = swp_type(entry); + struct swap_info_struct *si; + bool any_only_cache = false; + unsigned long offset; unsigned char count; if (non_swap_entry(entry)) - return 1; + return; - p = get_swap_device(entry); - if (p) { - if (WARN_ON(data_race(!p->swap_map[swp_offset(entry)]))) { - put_swap_device(p); - return 0; + si = get_swap_device(entry); + if (!si) + return; + + if (WARN_ON(end_offset > si->max)) + goto out; + + /* + * First free all entries in the range. 
+ */ + for (offset = start_offset; offset < end_offset; offset++) { + if (data_race(si->swap_map[offset])) { + count = __swap_entry_free(si, swp_entry(type, offset)); + if (count == SWAP_HAS_CACHE) + any_only_cache = true; + } else { + WARN_ON_ONCE(1); } + } + + /* + * Short-circuit the below loop if none of the entries had their + * reference drop to zero. + */ + if (!any_only_cache) + goto out; - count = __swap_entry_free(p, entry); - if (count == SWAP_HAS_CACHE && - !swap_page_trans_huge_swapped(p, entry)) - __try_to_reclaim_swap(p, swp_offset(entry), + /* + * Now go back over the range trying to reclaim the swap cache. This is + * more efficient for large folios because we will only try to reclaim + * the swap once per folio in the common case. If we do + * __swap_entry_free() and __try_to_reclaim_swap() in the same loop, the + * latter will get a reference and lock the folio for every individual + * page but will only succeed once the swap slot for every subpage is + * zero. + */ + for (offset = start_offset; offset < end_offset; offset += nr) { + nr = 1; + if (READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { + /* + * Folios are always naturally aligned in swap so + * advance forward to the next boundary. Zero means no + * folio was found for the swap entry, so advance by 1 + * in this case. Negative value means folio was found + * but could not be reclaimed. Here we can still advance + * to the next boundary. 
+ */ + nr = __try_to_reclaim_swap(si, offset, TTRS_UNMAPPED | TTRS_FULL); - put_swap_device(p); + if (nr == 0) + nr = 1; + else if (nr < 0) + nr = -nr; + nr = ALIGN(offset + 1, nr) - offset; + } } - return p != NULL; + +out: + put_swap_device(si); } #ifdef CONFIG_HIBERNATION @@ -1643,7 +1691,7 @@ swp_entry_t get_swap_page_of_type(int type) /* This is called for allocating swap entry, not cache */ spin_lock(&si->lock); - if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry)) + if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0)) atomic_long_dec(&nr_swap_pages); spin_unlock(&si->lock); fail: @@ -1806,7 +1854,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, * when reading from swap. This metadata may be indexed by swap entry * so this must be called before swap_free(). */ - arch_swap_restore(entry, folio); + arch_swap_restore(folio_swap(entry, folio), folio); dec_mm_counter(vma->vm_mm, MM_SWAPENTS); inc_mm_counter(vma->vm_mm, MM_ANONPAGES); @@ -2396,13 +2444,17 @@ static void reinsert_swap_info(struct swap_info_struct *p) spin_unlock(&swap_lock); } +static bool __has_usable_swap(void) +{ + return !plist_head_empty(&swap_active_head); +} + bool has_usable_swap(void) { - bool ret = true; + bool ret; spin_lock(&swap_lock); - if (plist_head_empty(&swap_active_head)) - ret = false; + ret = __has_usable_swap(); spin_unlock(&swap_lock); return ret; } @@ -2417,7 +2469,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) struct inode *inode; struct filename *pathname; int err, found = 0; - unsigned int old_block_size; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -2495,10 +2546,11 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) /* * Wait for swap operations protected by get/put_swap_device() - * to complete. - * - * We need synchronize_rcu() here to protect the accessing to - * the swap cache data structure. + * to complete. 
Because of synchronize_rcu() here, all swap + * operations protected by RCU reader side lock (including any + * spinlock) will be waited too. This makes it easy to + * prevent folio_test_swapcache() and the following swap cache + * operations from racing with swapoff. */ percpu_ref_kill(&p->users); synchronize_rcu(); @@ -2529,7 +2581,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) } swap_file = p->swap_file; - old_block_size = p->old_block_size; p->swap_file = NULL; p->max = 0; swap_map = p->swap_map; @@ -2552,11 +2603,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) exit_swap_address_space(p->type); inode = mapping->host; - if (p->bdev_file) { - set_blocksize(p->bdev, old_block_size); - fput(p->bdev_file); - p->bdev_file = NULL; - } inode_lock(inode); inode->i_flags &= ~S_SWAPFILE; @@ -2782,21 +2828,8 @@ static struct swap_info_struct *alloc_swap_info(void) static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) { - int error; - if (S_ISBLK(inode->i_mode)) { - p->bdev_file = bdev_file_open_by_dev(inode->i_rdev, - BLK_OPEN_READ | BLK_OPEN_WRITE, p, NULL); - if (IS_ERR(p->bdev_file)) { - error = PTR_ERR(p->bdev_file); - p->bdev_file = NULL; - return error; - } - p->bdev = file_bdev(p->bdev_file); - p->old_block_size = block_size(p->bdev); - error = set_blocksize(p->bdev, PAGE_SIZE); - if (error < 0) - return error; + p->bdev = I_BDEV(inode); /* * Zoned block devices contain zones that have a sequential * write only restriction. 
Hence zoned block devices are not @@ -3037,7 +3070,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) name = NULL; goto bad_swap; } - swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0); + swap_file = file_open_name(name, O_RDWR | O_LARGEFILE | O_EXCL, 0); if (IS_ERR(swap_file)) { error = PTR_ERR(swap_file); swap_file = NULL; @@ -3097,7 +3130,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->flags |= SWP_SYNCHRONOUS_IO; if (p->bdev && bdev_nonrot(p->bdev)) { - int cpu; + int cpu, i; unsigned long ci, nr_cluster; p->flags |= SWP_SOLIDSTATE; @@ -3133,8 +3166,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } for_each_possible_cpu(cpu) { struct percpu_cluster *cluster; + cluster = per_cpu_ptr(p->percpu_cluster, cpu); - cluster_set_null(&cluster->index); + for (i = 0; i < SWAP_NR_ORDERS; i++) + cluster->next[i] = SWAP_NEXT_INVALID; } } else { atomic_inc(&nr_rotate_swap); @@ -3234,11 +3269,6 @@ bad_swap: p->percpu_cluster = NULL; free_percpu(p->cluster_next_cpu); p->cluster_next_cpu = NULL; - if (p->bdev_file) { - set_blocksize(p->bdev, p->old_block_size); - fput(p->bdev_file); - p->bdev_file = NULL; - } inode = NULL; destroy_swap_extents(p); swap_cgroup_swapoff(p->type); @@ -3659,6 +3689,9 @@ void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp) if (!(gfp & __GFP_IO)) return; + if (!__has_usable_swap()) + return; + if (!blk_cgroup_congested()) return; diff --git a/mm/truncate.c b/mm/truncate.c index 725b150e47..e99085bf3d 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -764,15 +764,15 @@ EXPORT_SYMBOL(truncate_setsize); * @from: original inode size * @to: new inode size * - * Handle extension of inode size either caused by extending truncate or by - * write starting after current i_size. We mark the page straddling current - * i_size RO so that page_mkwrite() is called on the nearest write access to - * the page. 
This way filesystem can be sure that page_mkwrite() is called on - * the page before user writes to the page via mmap after the i_size has been - * changed. + * Handle extension of inode size either caused by extending truncate or + * by write starting after current i_size. We mark the page straddling + * current i_size RO so that page_mkwrite() is called on the first + * write access to the page. The filesystem will update its per-block + * information before user writes to the page via mmap after the i_size + * has been changed. * * The function must be called after i_size is updated so that page fault - * coming after we unlock the page will already see the new i_size. + * coming after we unlock the folio will already see the new i_size. * The function must be called while we still hold i_rwsem - this not only * makes sure i_size is stable but also that userspace cannot observe new * i_size value before we are prepared to store mmap writes at new inode size. @@ -781,31 +781,29 @@ void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to) { int bsize = i_blocksize(inode); loff_t rounded_from; - struct page *page; - pgoff_t index; + struct folio *folio; WARN_ON(to > inode->i_size); - if (from >= to || bsize == PAGE_SIZE) + if (from >= to || bsize >= PAGE_SIZE) return; /* Page straddling @from will not have any hole block created? */ rounded_from = round_up(from, bsize); if (to <= rounded_from || !(rounded_from & (PAGE_SIZE - 1))) return; - index = from >> PAGE_SHIFT; - page = find_lock_page(inode->i_mapping, index); - /* Page not cached? Nothing to do */ - if (!page) + folio = filemap_lock_folio(inode->i_mapping, from / PAGE_SIZE); + /* Folio not cached? Nothing to do */ + if (IS_ERR(folio)) return; /* - * See clear_page_dirty_for_io() for details why set_page_dirty() + * See folio_clear_dirty_for_io() for details why folio_mark_dirty() * is needed. 
*/ - if (page_mkclean(page)) - set_page_dirty(page); - unlock_page(page); - put_page(page); + if (folio_mkclean(folio)) + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); } EXPORT_SYMBOL(pagecache_isize_extended); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 829f7b1089..defa5109cc 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -56,17 +56,16 @@ struct vm_area_struct *find_vma_and_prepare_anon(struct mm_struct *mm, #ifdef CONFIG_PER_VMA_LOCK /* - * lock_vma() - Lookup and lock vma corresponding to @address. + * uffd_lock_vma() - Lookup and lock vma corresponding to @address. * @mm: mm to search vma in. * @address: address that the vma should contain. * - * Should be called without holding mmap_lock. vma should be unlocked after use - * with unlock_vma(). + * Should be called without holding mmap_lock. * * Return: A locked vma containing @address, -ENOENT if no vma is found, or * -ENOMEM if anon_vma couldn't be allocated. */ -static struct vm_area_struct *lock_vma(struct mm_struct *mm, +static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm, unsigned long address) { struct vm_area_struct *vma; @@ -74,9 +73,8 @@ static struct vm_area_struct *lock_vma(struct mm_struct *mm, vma = lock_vma_under_rcu(mm, address); if (vma) { /* - * lock_vma_under_rcu() only checks anon_vma for private - * anonymous mappings. But we need to ensure it is assigned in - * private file-backed vmas as well. + * We know we're going to need to use anon_vma, so check + * that early. 
*/ if (!(vma->vm_flags & VM_SHARED) && unlikely(!vma->anon_vma)) vma_end_read(vma); @@ -107,7 +105,7 @@ static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm, { struct vm_area_struct *dst_vma; - dst_vma = lock_vma(dst_mm, dst_start); + dst_vma = uffd_lock_vma(dst_mm, dst_start); if (IS_ERR(dst_vma) || validate_dst_vma(dst_vma, dst_start + len)) return dst_vma; @@ -180,9 +178,9 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, pte_t _dst_pte, *dst_pte; bool writable = dst_vma->vm_flags & VM_WRITE; bool vm_shared = dst_vma->vm_flags & VM_SHARED; - bool page_in_cache = page_mapping(page); spinlock_t *ptl; - struct folio *folio; + struct folio *folio = page_folio(page); + bool page_in_cache = folio_mapping(folio); _dst_pte = mk_pte(page, dst_vma->vm_page_prot); _dst_pte = pte_mkdirty(_dst_pte); @@ -212,7 +210,6 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, if (!pte_none_mostly(ptep_get(dst_pte))) goto out_unlock; - folio = page_folio(page); if (page_in_cache) { /* Usually, cache pages are already added to LRU */ if (newly_allocated) @@ -1061,7 +1058,7 @@ static int move_present_pte(struct mm_struct *mm, } folio_move_anon_rmap(src_folio, dst_vma); - WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr)); + src_folio->index = linear_page_index(dst_vma, dst_addr); orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot); /* Follow mremap() behavior and treat the entry dirty after the move */ @@ -1437,7 +1434,7 @@ static int uffd_move_lock(struct mm_struct *mm, struct vm_area_struct *vma; int err; - vma = lock_vma(mm, dst_start); + vma = uffd_lock_vma(mm, dst_start); if (IS_ERR(vma)) return PTR_ERR(vma); @@ -1452,7 +1449,7 @@ static int uffd_move_lock(struct mm_struct *mm, } /* - * Using lock_vma() to get src_vma can lead to following deadlock: + * Using uffd_lock_vma() to get src_vma can lead to following deadlock: * * Thread1 Thread2 * ------- ------- @@ -1474,7 +1471,7 @@ static int uffd_move_lock(struct mm_struct *mm, err = 
find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap); if (!err) { /* - * See comment in lock_vma() as to why not using + * See comment in uffd_lock_vma() as to why not using * vma_start_read() here. */ down_read(&(*dst_vmap)->vm_lock->lock); @@ -1697,9 +1694,9 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, /* Check if we can move the pmd without splitting it. */ if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) || !pmd_none(dst_pmdval)) { - struct folio *folio = pfn_folio(pmd_pfn(*src_pmd)); + struct folio *folio = pmd_folio(*src_pmd); - if (!folio || (!is_huge_zero_page(&folio->page) && + if (!folio || (!is_huge_zero_folio(folio) && !PageAnonExclusive(&folio->page))) { spin_unlock(ptl); err = -EBUSY; @@ -124,29 +124,29 @@ EXPORT_SYMBOL(kstrndup); * Return: newly allocated copy of @src or %NULL in case of error, * result is physically contiguous. Use kfree() to free. */ -void *kmemdup(const void *src, size_t len, gfp_t gfp) +void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp) { void *p; - p = kmalloc_track_caller(len, gfp); + p = kmalloc_node_track_caller_noprof(len, gfp, NUMA_NO_NODE, _RET_IP_); if (p) memcpy(p, src, len); return p; } -EXPORT_SYMBOL(kmemdup); +EXPORT_SYMBOL(kmemdup_noprof); /** * kmemdup_array - duplicate a given array. * * @src: array to duplicate. - * @element_size: size of each element of array. * @count: number of elements to duplicate from array. + * @element_size: size of each element of array. * @gfp: GFP mask to use. * * Return: duplicated array of @src or %NULL in case of error, * result is physically contiguous. Use kfree() to free. 
*/ -void *kmemdup_array(const void *src, size_t element_size, size_t count, gfp_t gfp) +void *kmemdup_array(const void *src, size_t count, size_t element_size, gfp_t gfp) { return kmemdup(src, size_mul(element_size, count), gfp); } @@ -469,17 +469,17 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; - mm->get_unmapped_area = arch_get_unmapped_area; + clear_bit(MMF_TOPDOWN, &mm->flags); } else { mm->mmap_base = mmap_base(random_factor, rlim_stack); - mm->get_unmapped_area = arch_get_unmapped_area_topdown; + set_bit(MMF_TOPDOWN, &mm->flags); } } #elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { mm->mmap_base = TASK_UNMAPPED_BASE; - mm->get_unmapped_area = arch_get_unmapped_area; + clear_bit(MMF_TOPDOWN, &mm->flags); } #endif @@ -609,7 +609,7 @@ EXPORT_SYMBOL(vm_mmap); * * Return: pointer to the allocated memory of %NULL in case of failure */ -void *kvmalloc_node(size_t size, gfp_t flags, int node) +void *kvmalloc_node_noprof(size_t size, gfp_t flags, int node) { gfp_t kmalloc_flags = flags; void *ret; @@ -631,7 +631,7 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) kmalloc_flags &= ~__GFP_NOFAIL; } - ret = kmalloc_node(size, kmalloc_flags, node); + ret = kmalloc_node_noprof(size, kmalloc_flags, node); /* * It doesn't really make sense to fallback to vmalloc for sub page @@ -656,11 +656,11 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) * about the resulting pointer, and cannot play * protection games. */ - return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, node, __builtin_return_address(0)); } -EXPORT_SYMBOL(kvmalloc_node); +EXPORT_SYMBOL(kvmalloc_node_noprof); /** * kvfree() - Free memory. 
@@ -699,20 +699,20 @@ void kvfree_sensitive(const void *addr, size_t len) } EXPORT_SYMBOL(kvfree_sensitive); -void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags) +void *kvrealloc_noprof(const void *p, size_t oldsize, size_t newsize, gfp_t flags) { void *newp; if (oldsize >= newsize) return (void *)p; - newp = kvmalloc(newsize, flags); + newp = kvmalloc_noprof(newsize, flags); if (!newp) return NULL; memcpy(newp, p, oldsize); kvfree(p); return newp; } -EXPORT_SYMBOL(kvrealloc); +EXPORT_SYMBOL(kvrealloc_noprof); /** * __vmalloc_array - allocate memory for a virtually contiguous array. @@ -720,26 +720,26 @@ EXPORT_SYMBOL(kvrealloc); * @size: element size. * @flags: the type of memory to allocate (see kmalloc). */ -void *__vmalloc_array(size_t n, size_t size, gfp_t flags) +void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags) { size_t bytes; if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL; - return __vmalloc(bytes, flags); + return __vmalloc_noprof(bytes, flags); } -EXPORT_SYMBOL(__vmalloc_array); +EXPORT_SYMBOL(__vmalloc_array_noprof); /** * vmalloc_array - allocate memory for a virtually contiguous array. * @n: number of elements. * @size: element size. */ -void *vmalloc_array(size_t n, size_t size) +void *vmalloc_array_noprof(size_t n, size_t size) { - return __vmalloc_array(n, size, GFP_KERNEL); + return __vmalloc_array_noprof(n, size, GFP_KERNEL); } -EXPORT_SYMBOL(vmalloc_array); +EXPORT_SYMBOL(vmalloc_array_noprof); /** * __vcalloc - allocate and zero memory for a virtually contiguous array. @@ -747,22 +747,22 @@ EXPORT_SYMBOL(vmalloc_array); * @size: element size. * @flags: the type of memory to allocate (see kmalloc). 
*/ -void *__vcalloc(size_t n, size_t size, gfp_t flags) +void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) { - return __vmalloc_array(n, size, flags | __GFP_ZERO); + return __vmalloc_array_noprof(n, size, flags | __GFP_ZERO); } -EXPORT_SYMBOL(__vcalloc); +EXPORT_SYMBOL(__vcalloc_noprof); /** * vcalloc - allocate and zero memory for a virtually contiguous array. * @n: number of elements. * @size: element size. */ -void *vcalloc(size_t n, size_t size) +void *vcalloc_noprof(size_t n, size_t size) { - return __vmalloc_array(n, size, GFP_KERNEL | __GFP_ZERO); + return __vmalloc_array_noprof(n, size, GFP_KERNEL | __GFP_ZERO); } -EXPORT_SYMBOL(vcalloc); +EXPORT_SYMBOL(vcalloc_noprof); struct anon_vma *folio_anon_vma(struct folio *folio) { diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 03c78fae06..e34ea86015 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -42,6 +42,7 @@ #include <linux/sched/mm.h> #include <asm/tlbflush.h> #include <asm/shmparam.h> +#include <linux/page_owner.h> #define CREATE_TRACE_POINTS #include <trace/events/vmalloc.h> @@ -96,6 +97,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, { pte_t *pte; u64 pfn; + struct page *page; unsigned long size = PAGE_SIZE; pfn = phys_addr >> PAGE_SHIFT; @@ -103,7 +105,13 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (!pte) return -ENOMEM; do { - BUG_ON(!pte_none(ptep_get(pte))); + if (!pte_none(ptep_get(pte))) { + if (pfn_valid(pfn)) { + page = pfn_to_page(pfn); + dump_page(page, "remapping already mapped page"); + } + BUG(); + } #ifdef CONFIG_HUGETLB_PAGE size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift); @@ -714,7 +722,7 @@ int is_vmalloc_or_module_addr(const void *x) * and fall back on vmalloc() if that fails. Others * just put it in the vmalloc space. 
*/ -#if defined(CONFIG_MODULES) && defined(MODULES_VADDR) +#if defined(CONFIG_EXECMEM) && defined(MODULES_VADDR) unsigned long addr = (unsigned long)kasan_reset_tag(x); if (addr >= MODULES_VADDR && addr < MODULES_END) return 1; @@ -1926,15 +1934,25 @@ node_alloc(unsigned long size, unsigned long align, return va; } +static inline void setup_vmalloc_vm(struct vm_struct *vm, + struct vmap_area *va, unsigned long flags, const void *caller) +{ + vm->flags = flags; + vm->addr = (void *)va->va_start; + vm->size = va->va_end - va->va_start; + vm->caller = caller; + va->vm = vm; +} + /* * Allocate a region of KVA of the specified size and alignment, within the - * vstart and vend. + * vstart and vend. If vm is passed in, the two will also be bound. */ static struct vmap_area *alloc_vmap_area(unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend, int node, gfp_t gfp_mask, - unsigned long va_flags) + unsigned long va_flags, struct vm_struct *vm) { struct vmap_node *vn; struct vmap_area *va; @@ -1997,6 +2015,12 @@ retry: va->vm = NULL; va->flags = (va_flags | vn_id); + if (vm) { + vm->addr = (void *)va->va_start; + vm->size = va->va_end - va->va_start; + va->vm = vm; + } + vn = addr_to_node(va->va_start); spin_lock(&vn->busy.lock); @@ -2583,7 +2607,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, VMALLOC_START, VMALLOC_END, node, gfp_mask, - VMAP_RAM|VMAP_BLOCK); + VMAP_RAM|VMAP_BLOCK, NULL); if (IS_ERR(va)) { kfree(vb); return ERR_CAST(va); @@ -2948,7 +2972,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node) struct vmap_area *va; va = alloc_vmap_area(size, PAGE_SIZE, VMALLOC_START, VMALLOC_END, - node, GFP_KERNEL, VMAP_RAM); + node, GFP_KERNEL, VMAP_RAM, + NULL); if (IS_ERR(va)) return NULL; @@ -3051,26 +3076,6 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t align) kasan_populate_early_vm_area_shadow(vm->addr, vm->size); } -static 
inline void setup_vmalloc_vm_locked(struct vm_struct *vm, - struct vmap_area *va, unsigned long flags, const void *caller) -{ - vm->flags = flags; - vm->addr = (void *)va->va_start; - vm->size = va->va_end - va->va_start; - vm->caller = caller; - va->vm = vm; -} - -static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, - unsigned long flags, const void *caller) -{ - struct vmap_node *vn = addr_to_node(va->va_start); - - spin_lock(&vn->busy.lock); - setup_vmalloc_vm_locked(vm, va, flags, caller); - spin_unlock(&vn->busy.lock); -} - static void clear_vm_uninitialized_flag(struct vm_struct *vm) { /* @@ -3107,14 +3112,15 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, if (!(flags & VM_NO_GUARD)) size += PAGE_SIZE; - va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0); + area->flags = flags; + area->caller = caller; + + va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area); if (IS_ERR(va)) { kfree(area); return NULL; } - setup_vmalloc_vm(area, va, flags, caller); - /* * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a * best-effort approach, as they can be mapped outside of vmalloc code. @@ -3540,12 +3546,12 @@ vm_area_alloc_pages(gfp_t gfp, int nid, * but mempolicy wants to alloc memory by interleaving. 
*/ if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE) - nr = alloc_pages_bulk_array_mempolicy(bulk_gfp, + nr = alloc_pages_bulk_array_mempolicy_noprof(bulk_gfp, nr_pages_request, pages + nr_allocated); else - nr = alloc_pages_bulk_array_node(bulk_gfp, nid, + nr = alloc_pages_bulk_array_node_noprof(bulk_gfp, nid, nr_pages_request, pages + nr_allocated); @@ -3574,9 +3580,9 @@ vm_area_alloc_pages(gfp_t gfp, int nid, break; if (nid == NUMA_NO_NODE) - page = alloc_pages(alloc_gfp, order); + page = alloc_pages_noprof(alloc_gfp, order); else - page = alloc_pages_node(nid, alloc_gfp, order); + page = alloc_pages_node_noprof(nid, alloc_gfp, order); if (unlikely(!page)) { if (!nofail) break; @@ -3633,10 +3639,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { - area->pages = __vmalloc_node(array_size, 1, nested_gfp, node, + area->pages = __vmalloc_node_noprof(array_size, 1, nested_gfp, node, area->caller); } else { - area->pages = kmalloc_node(array_size, nested_gfp, node); + area->pages = kmalloc_node_noprof(array_size, nested_gfp, node); } if (!area->pages) { @@ -3746,7 +3752,7 @@ fail: * * Return: the address of the area or %NULL on failure */ -void *__vmalloc_node_range(unsigned long size, unsigned long align, +void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller) @@ -3893,10 +3899,10 @@ fail: * * Return: pointer to the allocated memory or %NULL on error */ -void *__vmalloc_node(unsigned long size, unsigned long align, +void *__vmalloc_node_noprof(unsigned long size, unsigned long align, gfp_t gfp_mask, int node, const void *caller) { - return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, + return __vmalloc_node_range_noprof(size, align, VMALLOC_START, VMALLOC_END, gfp_mask, PAGE_KERNEL, 0, node, 
caller); } /* @@ -3905,15 +3911,15 @@ void *__vmalloc_node(unsigned long size, unsigned long align, * than that. */ #ifdef CONFIG_TEST_VMALLOC_MODULE -EXPORT_SYMBOL_GPL(__vmalloc_node); +EXPORT_SYMBOL_GPL(__vmalloc_node_noprof); #endif -void *__vmalloc(unsigned long size, gfp_t gfp_mask) +void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask) { - return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE, + return __vmalloc_node_noprof(size, 1, gfp_mask, NUMA_NO_NODE, __builtin_return_address(0)); } -EXPORT_SYMBOL(__vmalloc); +EXPORT_SYMBOL(__vmalloc_noprof); /** * vmalloc - allocate virtually contiguous memory @@ -3927,12 +3933,12 @@ EXPORT_SYMBOL(__vmalloc); * * Return: pointer to the allocated memory or %NULL on error */ -void *vmalloc(unsigned long size) +void *vmalloc_noprof(unsigned long size) { - return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE, + return __vmalloc_node_noprof(size, 1, GFP_KERNEL, NUMA_NO_NODE, __builtin_return_address(0)); } -EXPORT_SYMBOL(vmalloc); +EXPORT_SYMBOL(vmalloc_noprof); /** * vmalloc_huge - allocate virtually contiguous memory, allow huge pages @@ -3946,13 +3952,13 @@ EXPORT_SYMBOL(vmalloc); * * Return: pointer to the allocated memory or %NULL on error */ -void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) +void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) { - return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, NUMA_NO_NODE, __builtin_return_address(0)); } -EXPORT_SYMBOL_GPL(vmalloc_huge); +EXPORT_SYMBOL_GPL(vmalloc_huge_noprof); /** * vzalloc - allocate virtually contiguous memory with zero fill @@ -3967,12 +3973,12 @@ EXPORT_SYMBOL_GPL(vmalloc_huge); * * Return: pointer to the allocated memory or %NULL on error */ -void *vzalloc(unsigned long size) +void *vzalloc_noprof(unsigned long size) { - return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, + return 
__vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, __builtin_return_address(0)); } -EXPORT_SYMBOL(vzalloc); +EXPORT_SYMBOL(vzalloc_noprof); /** * vmalloc_user - allocate zeroed virtually contiguous memory for userspace @@ -3983,14 +3989,14 @@ EXPORT_SYMBOL(vzalloc); * * Return: pointer to the allocated memory or %NULL on error */ -void *vmalloc_user(unsigned long size) +void *vmalloc_user_noprof(unsigned long size) { - return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, + return __vmalloc_node_range_noprof(size, SHMLBA, VMALLOC_START, VMALLOC_END, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, VM_USERMAP, NUMA_NO_NODE, __builtin_return_address(0)); } -EXPORT_SYMBOL(vmalloc_user); +EXPORT_SYMBOL(vmalloc_user_noprof); /** * vmalloc_node - allocate memory on a specific node @@ -4005,12 +4011,12 @@ EXPORT_SYMBOL(vmalloc_user); * * Return: pointer to the allocated memory or %NULL on error */ -void *vmalloc_node(unsigned long size, int node) +void *vmalloc_node_noprof(unsigned long size, int node) { - return __vmalloc_node(size, 1, GFP_KERNEL, node, + return __vmalloc_node_noprof(size, 1, GFP_KERNEL, node, __builtin_return_address(0)); } -EXPORT_SYMBOL(vmalloc_node); +EXPORT_SYMBOL(vmalloc_node_noprof); /** * vzalloc_node - allocate memory on a specific node with zero fill @@ -4023,12 +4029,12 @@ EXPORT_SYMBOL(vmalloc_node); * * Return: pointer to the allocated memory or %NULL on error */ -void *vzalloc_node(unsigned long size, int node) +void *vzalloc_node_noprof(unsigned long size, int node) { - return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node, + return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, node, __builtin_return_address(0)); } -EXPORT_SYMBOL(vzalloc_node); +EXPORT_SYMBOL(vzalloc_node_noprof); #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) @@ -4051,12 +4057,12 @@ EXPORT_SYMBOL(vzalloc_node); * * Return: pointer to the allocated memory or %NULL on error */ 
-void *vmalloc_32(unsigned long size) +void *vmalloc_32_noprof(unsigned long size) { - return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE, + return __vmalloc_node_noprof(size, 1, GFP_VMALLOC32, NUMA_NO_NODE, __builtin_return_address(0)); } -EXPORT_SYMBOL(vmalloc_32); +EXPORT_SYMBOL(vmalloc_32_noprof); /** * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory @@ -4067,14 +4073,14 @@ EXPORT_SYMBOL(vmalloc_32); * * Return: pointer to the allocated memory or %NULL on error */ -void *vmalloc_32_user(unsigned long size) +void *vmalloc_32_user_noprof(unsigned long size) { - return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, + return __vmalloc_node_range_noprof(size, SHMLBA, VMALLOC_START, VMALLOC_END, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, VM_USERMAP, NUMA_NO_NODE, __builtin_return_address(0)); } -EXPORT_SYMBOL(vmalloc_32_user); +EXPORT_SYMBOL(vmalloc_32_user_noprof); /* * Atomically zero bytes in the iterator. @@ -4688,7 +4694,7 @@ retry: spin_lock(&vn->busy.lock); insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head); - setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC, + setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC, pcpu_get_vm_areas); spin_unlock(&vn->busy.lock); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 3ef654addd..68ac33bea3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -967,7 +967,8 @@ static unsigned int demote_folio_list(struct list_head *demote_folios, .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN | __GFP_NOMEMALLOC | GFP_NOWAIT, .nid = target_nid, - .nmask = &allowed_mask + .nmask = &allowed_mask, + .reason = MR_DEMOTION, }; if (list_empty(demote_folios)) @@ -1205,25 +1206,28 @@ retry: if (!can_split_folio(folio, NULL)) goto activate_locked; /* - * Split folios without a PMD map right - * away. Chances are some or all of the - * tail pages can be freed without IO. + * Split partially mapped folios right away. + * We can free the unmapped pages without IO. 
*/ - if (!folio_entire_mapcount(folio) && - split_folio_to_list(folio, - folio_list)) + if (data_race(!list_empty(&folio->_deferred_list)) && + split_folio_to_list(folio, folio_list)) goto activate_locked; } if (!add_to_swap(folio)) { + int __maybe_unused order = folio_order(folio); + if (!folio_test_large(folio)) goto activate_locked_split; /* Fallback to swap normal pages */ - if (split_folio_to_list(folio, - folio_list)) + if (split_folio_to_list(folio, folio_list)) goto activate_locked; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - count_memcg_folio_events(folio, THP_SWPOUT_FALLBACK, 1); - count_vm_event(THP_SWPOUT_FALLBACK); + if (nr_pages >= HPAGE_PMD_NR) { + count_memcg_folio_events(folio, + THP_SWPOUT_FALLBACK, 1); + count_vm_event(THP_SWPOUT_FALLBACK); + } + count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK); #endif if (!add_to_swap(folio)) goto activate_locked_split; @@ -1256,6 +1260,20 @@ retry: if (folio_test_pmd_mappable(folio)) flags |= TTU_SPLIT_HUGE_PMD; + /* + * Without TTU_SYNC, try_to_unmap will only begin to + * hold PTL from the first present PTE within a large + * folio. Some initial PTEs might be skipped due to + * races with parallel PTE writes in which PTEs can be + * cleared temporarily before being written new present + * values. This will lead to a large folio is still + * mapped while some subpages have been partially + * unmapped after try_to_unmap; TTU_SYNC helps + * try_to_unmap acquire PTL from the first PTE, + * eliminating the influence of temporary PTE values. 
+ */ + if (folio_test_large(folio) && list_empty(&folio->_deferred_list)) + flags |= TTU_SYNC; try_to_unmap(folio, flags); if (folio_mapped(folio)) { @@ -2091,8 +2109,7 @@ static void shrink_active_list(unsigned long nr_to_scan, } static unsigned int reclaim_folio_list(struct list_head *folio_list, - struct pglist_data *pgdat, - bool ignore_references) + struct pglist_data *pgdat) { struct reclaim_stat dummy_stat; unsigned int nr_reclaimed; @@ -2105,7 +2122,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list, .no_demotion = 1, }; - nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, ignore_references); + nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, true); while (!list_empty(folio_list)) { folio = lru_to_folio(folio_list); list_del(&folio->lru); @@ -2115,7 +2132,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list, return nr_reclaimed; } -unsigned long reclaim_pages(struct list_head *folio_list, bool ignore_references) +unsigned long reclaim_pages(struct list_head *folio_list) { int nid; unsigned int nr_reclaimed = 0; @@ -2137,12 +2154,11 @@ unsigned long reclaim_pages(struct list_head *folio_list, bool ignore_references continue; } - nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid), - ignore_references); + nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid)); nid = folio_nid(lru_to_folio(folio_list)); } while (!list_empty(folio_list)); - nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid), ignore_references); + nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid)); memalloc_noreclaim_restore(noreclaim_flag); @@ -3884,6 +3900,32 @@ done: * working set protection ******************************************************************************/ +static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc) +{ + int priority; + unsigned long reclaimable; + + if (sc->priority != DEF_PRIORITY || 
sc->nr_to_reclaim < MIN_LRU_BATCH) + return; + /* + * Determine the initial priority based on + * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, + * where reclaimed_to_scanned_ratio = inactive / total. + */ + reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE); + if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc)) + reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON); + + /* round down reclaimable and round up sc->nr_to_reclaim */ + priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1); + + /* + * The estimation is based on LRU pages only, so cap it to prevent + * overshoots of shrinker objects by large margins. + */ + sc->priority = clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY); +} + static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) { int gen, type, zone; @@ -3917,19 +3959,17 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MIN_SEQ(lruvec); - /* see the comment on lru_gen_folio */ - gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); - birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); - - if (time_is_after_jiffies(birth + min_ttl)) + if (mem_cgroup_below_min(NULL, memcg)) return false; if (!lruvec_is_sizable(lruvec, sc)) return false; - mem_cgroup_calculate_protection(NULL, memcg); + /* see the comment on lru_gen_folio */ + gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); + birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); - return !mem_cgroup_below_min(NULL, memcg); + return time_is_before_jiffies(birth + min_ttl); } /* to protect the working set of the last N jiffies */ @@ -3939,23 +3979,20 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) { struct mem_cgroup *memcg; unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl); + bool reclaimable = !min_ttl; VM_WARN_ON_ONCE(!current_is_kswapd()); - /* check the order to exclude compaction-induced reclaim */ - if (!min_ttl || 
sc->order || sc->priority == DEF_PRIORITY) - return; + set_initial_priority(pgdat, sc); memcg = mem_cgroup_iter(NULL, NULL, NULL); do { struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); - if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) { - mem_cgroup_iter_break(NULL, memcg); - return; - } + mem_cgroup_calculate_protection(NULL, memcg); - cond_resched(); + if (!reclaimable) + reclaimable = lruvec_is_reclaimable(lruvec, sc, min_ttl); } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); /* @@ -3963,7 +4000,7 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) * younger than min_ttl. However, another possibility is all memcgs are * either too small or below min. */ - if (mutex_trylock(&oom_lock)) { + if (!reclaimable && mutex_trylock(&oom_lock)) { struct oom_control oc = { .gfp_mask = sc->gfp_mask, }; @@ -4566,7 +4603,6 @@ retry: /* retry folios that may have missed folio_rotate_reclaimable() */ list_move(&folio->lru, &clean); - sc->nr_scanned -= folio_nr_pages(folio); } spin_lock_irq(&lruvec->lru_lock); @@ -4756,8 +4792,7 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct pglist_data *pgdat = lruvec_pgdat(lruvec); - mem_cgroup_calculate_protection(NULL, memcg); - + /* lru_gen_age_node() called mem_cgroup_calculate_protection() */ if (mem_cgroup_below_min(NULL, memcg)) return MEMCG_LRU_YOUNG; @@ -4881,28 +4916,6 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc blk_finish_plug(&plug); } -static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc) -{ - int priority; - unsigned long reclaimable; - - if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH) - return; - /* - * Determine the initial priority based on - * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, - * where reclaimed_to_scanned_ratio = inactive / total. 
- */ - reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE); - if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc)) - reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON); - - /* round down reclaimable and round up sc->nr_to_reclaim */ - priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1); - - sc->priority = clamp(priority, 0, DEF_PRIORITY); -} - static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) { struct blk_plug plug; @@ -6686,6 +6699,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat, { struct zone *zone; int z; + unsigned long nr_reclaimed = sc->nr_reclaimed; /* Reclaim a number of pages proportional to the number of zones */ sc->nr_to_reclaim = 0; @@ -6713,7 +6727,8 @@ static bool kswapd_shrink_node(pg_data_t *pgdat, if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order)) sc->order = 0; - return sc->nr_scanned >= sc->nr_to_reclaim; + /* account for progress from mm_account_reclaimed_pages() */ + return max(sc->nr_scanned, sc->nr_reclaimed - nr_reclaimed) >= sc->nr_to_reclaim; } /* Page allocator PCP high watermark is lowered if reclaim is active. 
*/ diff --git a/mm/vmstat.c b/mm/vmstat.c index db79935e4a..8507c49721 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1242,6 +1242,9 @@ const char * const vmstat_text[] = { #endif "nr_page_table_pages", "nr_sec_page_table_pages", +#ifdef CONFIG_IOMMU_SUPPORT + "nr_iommu_pages", +#endif #ifdef CONFIG_SWAP "nr_swapcached", #endif diff --git a/mm/workingset.c b/mm/workingset.c index 8a044921ed..a2b28e356e 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -626,6 +626,7 @@ struct list_lru shadow_nodes; void workingset_update_node(struct xa_node *node) { struct address_space *mapping; + struct page *page = virt_to_page(node); /* * Track non-empty nodes that contain only shadow entries; @@ -641,12 +642,12 @@ void workingset_update_node(struct xa_node *node) if (node->count && node->count == node->nr_values) { if (list_empty(&node->private_list)) { list_lru_add_obj(&shadow_nodes, &node->private_list); - __inc_lruvec_kmem_state(node, WORKINGSET_NODES); + __inc_node_page_state(page, WORKINGSET_NODES); } } else { if (!list_empty(&node->private_list)) { list_lru_del_obj(&shadow_nodes, &node->private_list); - __dec_lruvec_kmem_state(node, WORKINGSET_NODES); + __dec_node_page_state(page, WORKINGSET_NODES); } } } @@ -750,7 +751,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, } list_lru_isolate(lru, item); - __dec_lruvec_kmem_state(node, WORKINGSET_NODES); + __dec_node_page_state(virt_to_page(node), WORKINGSET_NODES); spin_unlock(lru_lock); diff --git a/mm/z3fold.c b/mm/z3fold.c index 7ab0562105..2ebfed3287 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -1237,12 +1237,12 @@ static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle) } /** - * z3fold_get_pool_size() - gets the z3fold pool size in pages + * z3fold_get_pool_pages() - gets the z3fold pool size in pages * @pool: pool whose size is being queried * * Returns: size in pages of the given pool. 
*/ -static u64 z3fold_get_pool_size(struct z3fold_pool *pool) +static u64 z3fold_get_pool_pages(struct z3fold_pool *pool) { return atomic64_read(&pool->pages_nr); } @@ -1402,9 +1402,9 @@ static void z3fold_zpool_unmap(void *pool, unsigned long handle) z3fold_unmap(pool, handle); } -static u64 z3fold_zpool_total_size(void *pool) +static u64 z3fold_zpool_total_pages(void *pool) { - return z3fold_get_pool_size(pool) * PAGE_SIZE; + return z3fold_get_pool_pages(pool); } static struct zpool_driver z3fold_zpool_driver = { @@ -1417,7 +1417,7 @@ static struct zpool_driver z3fold_zpool_driver = { .free = z3fold_zpool_free, .map = z3fold_zpool_map, .unmap = z3fold_zpool_unmap, - .total_size = z3fold_zpool_total_size, + .total_pages = z3fold_zpool_total_pages, }; MODULE_ALIAS("zpool-z3fold"); diff --git a/mm/zbud.c b/mm/zbud.c --- a/mm/zbud.c +++ b/mm/zbud.c @@ -365,13 +365,13 @@ static void zbud_unmap(struct zbud_pool *pool, unsigned long handle) } /** - * zbud_get_pool_size() - gets the zbud pool size in pages + * zbud_get_pool_pages() - gets the zbud pool size in pages * @pool: pool whose size is being queried * * Returns: size in pages of the given pool. The pool lock need not be * taken to access pages_nr.
*/ -static u64 zbud_get_pool_size(struct zbud_pool *pool) +static u64 zbud_get_pool_pages(struct zbud_pool *pool) { return pool->pages_nr; } @@ -410,9 +410,9 @@ static void zbud_zpool_unmap(void *pool, unsigned long handle) zbud_unmap(pool, handle); } -static u64 zbud_zpool_total_size(void *pool) +static u64 zbud_zpool_total_pages(void *pool) { - return zbud_get_pool_size(pool) * PAGE_SIZE; + return zbud_get_pool_pages(pool); } static struct zpool_driver zbud_zpool_driver = { @@ -425,7 +425,7 @@ static struct zpool_driver zbud_zpool_driver = { .free = zbud_zpool_free, .map = zbud_zpool_map, .unmap = zbud_zpool_unmap, - .total_size = zbud_zpool_total_size, + .total_pages = zbud_zpool_total_pages, }; MODULE_ALIAS("zpool-zbud"); diff --git a/mm/zpool.c b/mm/zpool.c index 846410479c..b9fda1fa85 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -321,16 +321,16 @@ void zpool_unmap_handle(struct zpool *zpool, unsigned long handle) } /** - * zpool_get_total_size() - The total size of the pool + * zpool_get_total_pages() - The total size of the pool * @zpool: The zpool to check * - * This returns the total size in bytes of the pool. + * This returns the total size in pages of the pool. * - * Returns: Total size of the zpool in bytes. + * Returns: Total size of the zpool in pages. 
*/ -u64 zpool_get_total_size(struct zpool *zpool) +u64 zpool_get_total_pages(struct zpool *zpool) { - return zpool->driver->total_size(zpool->pool); + return zpool->driver->total_pages(zpool->pool); } /** diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 7d7cb3eaab..b42d3545ca 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -399,9 +399,9 @@ static void zs_zpool_unmap(void *pool, unsigned long handle) zs_unmap_object(pool, handle); } -static u64 zs_zpool_total_size(void *pool) +static u64 zs_zpool_total_pages(void *pool) { - return zs_get_total_pages(pool) << PAGE_SHIFT; + return zs_get_total_pages(pool); } static struct zpool_driver zs_zpool_driver = { @@ -414,7 +414,7 @@ static struct zpool_driver zs_zpool_driver = { .free = zs_zpool_free, .map = zs_zpool_map, .unmap = zs_zpool_unmap, - .total_size = zs_zpool_total_size, + .total_pages = zs_zpool_total_pages, }; MODULE_ALIAS("zpool-zsmalloc"); diff --git a/mm/zswap.c b/mm/zswap.c index 6f8850c44b..a50e2986cd 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -20,7 +20,6 @@ #include <linux/spinlock.h> #include <linux/types.h> #include <linux/atomic.h> -#include <linux/rbtree.h> #include <linux/swap.h> #include <linux/crypto.h> #include <linux/scatterlist.h> @@ -43,8 +42,6 @@ /********************************* * statistics **********************************/ -/* Total bytes used by the compressed storage */ -u64 zswap_pool_total_size; /* The number of compressed pages currently stored in zswap */ atomic_t zswap_stored_pages = ATOMIC_INIT(0); /* The number of same-value filled pages currently stored in zswap */ @@ -126,19 +123,6 @@ static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */ module_param_named(accept_threshold_percent, zswap_accept_thr_percent, uint, 0644); -/* - * Enable/disable handling same-value filled pages (enabled by default). - * If disabled every page is considered non-same-value filled. 
- */ -static bool zswap_same_filled_pages_enabled = true; -module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled, - bool, 0644); - -/* Enable/disable handling non-same-value filled pages (enabled by default) */ -static bool zswap_non_same_filled_pages_enabled = true; -module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled, - bool, 0644); - /* Number of zpools in zswap_pool (empirically determined for scalability) */ #define ZSWAP_NR_ZPOOLS 32 @@ -183,8 +167,6 @@ struct zswap_pool { /* Global LRU lists shared by all zswap pools. */ static struct list_lru zswap_list_lru; -/* counter of pages stored in all zswap pools. */ -static atomic_t zswap_nr_stored = ATOMIC_INIT(0); /* The lock protects zswap_next_shrink updates. */ static DEFINE_SPINLOCK(zswap_shrink_lock); @@ -198,7 +180,6 @@ static struct shrinker *zswap_shrinker; * This structure contains the metadata for tracking a single compressed * page within zswap. * - * rbnode - links the entry into red-black tree for the appropriate swap type * swpentry - associated swap entry, the offset indexes into the red-black tree * length - the length in bytes of the compressed page data. Needed during * decompression. For a same value filled page length is 0, and both @@ -210,7 +191,6 @@ static struct shrinker *zswap_shrinker; * lru - handle to the pool's lru used to evict pages. 
*/ struct zswap_entry { - struct rb_node rbnode; swp_entry_t swpentry; unsigned int length; struct zswap_pool *pool; @@ -222,12 +202,7 @@ struct zswap_entry { struct list_head lru; }; -struct zswap_tree { - struct rb_root rbroot; - spinlock_t lock; -}; - -static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; +static struct xarray *zswap_trees[MAX_SWAPFILES]; static unsigned int nr_zswap_trees[MAX_SWAPFILES]; /* RCU-protected iteration */ @@ -255,7 +230,7 @@ static bool zswap_has_pool; * helpers and fwd declarations **********************************/ -static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp) +static inline struct xarray *swap_zswap_tree(swp_entry_t swp) { return &zswap_trees[swp_type(swp)][swp_offset(swp) >> SWAP_ADDRESS_SPACE_SHIFT]; @@ -265,45 +240,6 @@ static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp) pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \ zpool_get_type((p)->zpools[0])) -static bool zswap_is_full(void) -{ - return totalram_pages() * zswap_max_pool_percent / 100 < - DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); -} - -static bool zswap_can_accept(void) -{ - return totalram_pages() * zswap_accept_thr_percent / 100 * - zswap_max_pool_percent / 100 > - DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); -} - -static u64 get_zswap_pool_size(struct zswap_pool *pool) -{ - u64 pool_size = 0; - int i; - - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) - pool_size += zpool_get_total_size(pool->zpools[i]); - - return pool_size; -} - -static void zswap_update_total_size(void) -{ - struct zswap_pool *pool; - u64 total = 0; - - rcu_read_lock(); - - list_for_each_entry_rcu(pool, &zswap_pools, list) - total += get_zswap_pool_size(pool); - - rcu_read_unlock(); - - zswap_pool_total_size = total; -} - /********************************* * pool functions **********************************/ @@ -541,6 +477,48 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) return NULL; } +static unsigned long zswap_max_pages(void) 
+{ + return totalram_pages() * zswap_max_pool_percent / 100; +} + +static unsigned long zswap_accept_thr_pages(void) +{ + return zswap_max_pages() * zswap_accept_thr_percent / 100; +} + +unsigned long zswap_total_pages(void) +{ + struct zswap_pool *pool; + unsigned long total = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(pool, &zswap_pools, list) { + int i; + + for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) + total += zpool_get_total_pages(pool->zpools[i]); + } + rcu_read_unlock(); + + return total; +} + +static bool zswap_check_limits(void) +{ + unsigned long cur_pages = zswap_total_pages(); + unsigned long max_pages = zswap_max_pages(); + + if (cur_pages >= max_pages) { + zswap_pool_limit_hit++; + zswap_pool_reached_full = true; + } else if (zswap_pool_reached_full && + cur_pages <= zswap_accept_thr_pages()) { + zswap_pool_reached_full = false; + } + return zswap_pool_reached_full; +} + /********************************* * param callbacks **********************************/ @@ -807,63 +785,6 @@ void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) } /********************************* -* rbtree functions -**********************************/ -static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset) -{ - struct rb_node *node = root->rb_node; - struct zswap_entry *entry; - pgoff_t entry_offset; - - while (node) { - entry = rb_entry(node, struct zswap_entry, rbnode); - entry_offset = swp_offset(entry->swpentry); - if (entry_offset > offset) - node = node->rb_left; - else if (entry_offset < offset) - node = node->rb_right; - else - return entry; - } - return NULL; -} - -/* - * In the case that a entry with the same offset is found, a pointer to - * the existing entry is stored in dupentry and the function returns -EEXIST - */ -static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, - struct zswap_entry **dupentry) -{ - struct rb_node **link = &root->rb_node, *parent = NULL; - struct zswap_entry *myentry; - pgoff_t 
myentry_offset, entry_offset = swp_offset(entry->swpentry); - - while (*link) { - parent = *link; - myentry = rb_entry(parent, struct zswap_entry, rbnode); - myentry_offset = swp_offset(myentry->swpentry); - if (myentry_offset > entry_offset) - link = &(*link)->rb_left; - else if (myentry_offset < entry_offset) - link = &(*link)->rb_right; - else { - *dupentry = myentry; - return -EEXIST; - } - } - rb_link_node(&entry->rbnode, parent, link); - rb_insert_color(&entry->rbnode, root); - return 0; -} - -static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) -{ - rb_erase(&entry->rbnode, root); - RB_CLEAR_NODE(&entry->rbnode); -} - -/********************************* * zswap entry functions **********************************/ static struct kmem_cache *zswap_entry_cache; @@ -874,7 +795,6 @@ static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); if (!entry) return NULL; - RB_CLEAR_NODE(&entry->rbnode); return entry; } @@ -885,12 +805,7 @@ static void zswap_entry_cache_free(struct zswap_entry *entry) static struct zpool *zswap_find_zpool(struct zswap_entry *entry) { - int i = 0; - - if (ZSWAP_NR_ZPOOLS > 1) - i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS)); - - return entry->pool->zpools[i]; + return entry->pool->zpools[hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS))]; } /* @@ -904,7 +819,6 @@ static void zswap_entry_free(struct zswap_entry *entry) else { zswap_lru_del(&zswap_list_lru, entry); zpool_free(zswap_find_zpool(entry), entry->handle); - atomic_dec(&zswap_nr_stored); zswap_pool_put(entry->pool); } if (entry->objcg) { @@ -913,18 +827,6 @@ static void zswap_entry_free(struct zswap_entry *entry) } zswap_entry_cache_free(entry); atomic_dec(&zswap_stored_pages); - zswap_update_total_size(); -} - -/* - * The caller hold the tree lock and search the entry from the tree, - * so it must be on the tree, remove it from the tree and free it. 
- */ -static void zswap_invalidate_entry(struct zswap_tree *tree, - struct zswap_entry *entry) -{ - zswap_rb_erase(&tree->rbroot, entry); - zswap_entry_free(entry); } /********************************* @@ -1126,7 +1028,8 @@ static void zswap_decompress(struct zswap_entry *entry, struct page *page) static int zswap_writeback_entry(struct zswap_entry *entry, swp_entry_t swpentry) { - struct zswap_tree *tree; + struct xarray *tree; + pgoff_t offset = swp_offset(swpentry); struct folio *folio; struct mempolicy *mpol; bool folio_was_allocated; @@ -1163,19 +1066,13 @@ static int zswap_writeback_entry(struct zswap_entry *entry, * be dereferenced. */ tree = swap_zswap_tree(swpentry); - spin_lock(&tree->lock); - if (zswap_rb_search(&tree->rbroot, swp_offset(swpentry)) != entry) { - spin_unlock(&tree->lock); + if (entry != xa_cmpxchg(tree, offset, entry, NULL, GFP_KERNEL)) { delete_from_swap_cache(folio); folio_unlock(folio); folio_put(folio); return -ENOMEM; } - /* Safe to deref entry after the entry is verified above. */ - zswap_rb_erase(&tree->rbroot, entry); - spin_unlock(&tree->lock); - zswap_decompress(entry, &folio->page); count_vm_event(ZSWPWB); @@ -1344,8 +1241,8 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT; nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED); } else { - nr_backing = zswap_pool_total_size >> PAGE_SHIFT; - nr_stored = atomic_read(&zswap_nr_stored); + nr_backing = zswap_total_pages(); + nr_stored = atomic_read(&zswap_stored_pages); } if (!nr_stored) @@ -1365,6 +1262,11 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, * This ensures that the better zswap compresses memory, the fewer * pages we will evict to swap (as it will otherwise incur IO for * relatively small memory saving). + * + * The memory saving factor calculated here takes same-filled pages into + * account, but those are not freeable since they almost occupy no + * space. 
Hence, we may scale nr_freeable down a little bit more than we + * should if we have a lot of same-filled pages. */ return mult_frac(nr_freeable, nr_backing, nr_stored); } @@ -1412,6 +1314,10 @@ static void shrink_worker(struct work_struct *w) { struct mem_cgroup *memcg; int ret, failures = 0; + unsigned long thr; + + /* Reclaim down to the accept threshold */ + thr = zswap_accept_thr_pages(); /* global reclaim will select cgroup in a round-robin fashion. */ do { @@ -1459,32 +1365,37 @@ static void shrink_worker(struct work_struct *w) break; if (ret && ++failures == MAX_RECLAIM_RETRIES) break; - resched: cond_resched(); - } while (!zswap_can_accept()); + } while (zswap_total_pages() > thr); } -static int zswap_is_page_same_filled(void *ptr, unsigned long *value) +/********************************* +* same-filled functions +**********************************/ +static bool zswap_is_folio_same_filled(struct folio *folio, unsigned long *value) { unsigned long *page; unsigned long val; unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1; + bool ret = false; - page = (unsigned long *)ptr; + page = kmap_local_folio(folio, 0); val = page[0]; if (val != page[last_pos]) - return 0; + goto out; for (pos = 1; pos < last_pos; pos++) { if (val != page[pos]) - return 0; + goto out; } *value = val; - - return 1; + ret = true; +out: + kunmap_local(page); + return ret; } static void zswap_fill_page(void *ptr, unsigned long value) @@ -1495,14 +1406,18 @@ static void zswap_fill_page(void *ptr, unsigned long value) memset_l(page, value, PAGE_SIZE / sizeof(unsigned long)); } +/********************************* +* main API +**********************************/ bool zswap_store(struct folio *folio) { swp_entry_t swp = folio->swap; pgoff_t offset = swp_offset(swp); - struct zswap_tree *tree = swap_zswap_tree(swp); - struct zswap_entry *entry, *dupentry; + struct xarray *tree = swap_zswap_tree(swp); + struct zswap_entry *entry, *old; struct obj_cgroup *objcg = NULL; struct mem_cgroup 
*memcg = NULL; + unsigned long value; VM_WARN_ON_ONCE(!folio_test_locked(folio)); VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); @@ -1514,6 +1429,7 @@ bool zswap_store(struct folio *folio) if (!zswap_enabled) goto check_old; + /* Check cgroup limits */ objcg = get_obj_cgroup_from_folio(folio); if (objcg && !obj_cgroup_may_zswap(objcg)) { memcg = get_mem_cgroup_from_objcg(objcg); @@ -1524,19 +1440,8 @@ bool zswap_store(struct folio *folio) mem_cgroup_put(memcg); } - /* reclaim space if needed */ - if (zswap_is_full()) { - zswap_pool_limit_hit++; - zswap_pool_reached_full = true; - goto shrink; - } - - if (zswap_pool_reached_full) { - if (!zswap_can_accept()) - goto shrink; - else - zswap_pool_reached_full = false; - } + if (zswap_check_limits()) + goto reject; /* allocate entry */ entry = zswap_entry_cache_alloc(GFP_KERNEL, folio_nid(folio)); @@ -1545,24 +1450,13 @@ bool zswap_store(struct folio *folio) goto reject; } - if (zswap_same_filled_pages_enabled) { - unsigned long value; - u8 *src; - - src = kmap_local_folio(folio, 0); - if (zswap_is_page_same_filled(src, &value)) { - kunmap_local(src); - entry->length = 0; - entry->value = value; - atomic_inc(&zswap_same_filled_pages); - goto insert_entry; - } - kunmap_local(src); + if (zswap_is_folio_same_filled(folio, &value)) { + entry->length = 0; + entry->value = value; + atomic_inc(&zswap_same_filled_pages); + goto store_entry; } - if (!zswap_non_same_filled_pages_enabled) - goto freepage; - /* if entry is successfully added, it keeps the reference */ entry->pool = zswap_pool_current_get(); if (!entry->pool) @@ -1580,62 +1474,77 @@ bool zswap_store(struct folio *folio) if (!zswap_compress(folio, entry)) goto put_pool; -insert_entry: +store_entry: entry->swpentry = swp; entry->objcg = objcg; + + old = xa_store(tree, offset, entry, GFP_KERNEL); + if (xa_is_err(old)) { + int err = xa_err(old); + + WARN_ONCE(err != -ENOMEM, "unexpected xarray error: %d\n", err); + zswap_reject_alloc_fail++; + goto store_failed; + } + + 
/* + * We may have had an existing entry that became stale when + * the folio was redirtied and now the new version is being + * swapped out. Get rid of the old. + */ + if (old) + zswap_entry_free(old); + if (objcg) { obj_cgroup_charge_zswap(objcg, entry->length); - /* Account before objcg ref is moved to tree */ count_objcg_event(objcg, ZSWPOUT); } - /* map */ - spin_lock(&tree->lock); /* - * The folio may have been dirtied again, invalidate the - * possibly stale entry before inserting the new entry. + * We finish initializing the entry while it's already in xarray. + * This is safe because: + * + * 1. Concurrent stores and invalidations are excluded by folio lock. + * + * 2. Writeback is excluded by the entry not being on the LRU yet. + * The publishing order matters to prevent writeback from seeing + * an incoherent entry. */ - if (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) { - zswap_invalidate_entry(tree, dupentry); - WARN_ON(zswap_rb_insert(&tree->rbroot, entry, &dupentry)); - } if (entry->length) { INIT_LIST_HEAD(&entry->lru); zswap_lru_add(&zswap_list_lru, entry); - atomic_inc(&zswap_nr_stored); } - spin_unlock(&tree->lock); /* update stats */ atomic_inc(&zswap_stored_pages); - zswap_update_total_size(); count_vm_event(ZSWPOUT); return true; +store_failed: + if (!entry->length) + atomic_dec(&zswap_same_filled_pages); + else { + zpool_free(zswap_find_zpool(entry), entry->handle); put_pool: - zswap_pool_put(entry->pool); + zswap_pool_put(entry->pool); + } freepage: zswap_entry_cache_free(entry); reject: - if (objcg) - obj_cgroup_put(objcg); + obj_cgroup_put(objcg); + if (zswap_pool_reached_full) + queue_work(shrink_wq, &zswap_shrink_work); check_old: /* * If the zswap store fails or zswap is disabled, we must invalidate the * possibly stale entry which was previously stored at this offset. * Otherwise, writeback could overwrite the new data in the swapfile. 
*/ - spin_lock(&tree->lock); - entry = zswap_rb_search(&tree->rbroot, offset); + entry = xa_erase(tree, offset); if (entry) - zswap_invalidate_entry(tree, entry); - spin_unlock(&tree->lock); + zswap_entry_free(entry); return false; - -shrink: - queue_work(shrink_wq, &zswap_shrink_work); - goto reject; } bool zswap_load(struct folio *folio) @@ -1644,18 +1553,12 @@ bool zswap_load(struct folio *folio) pgoff_t offset = swp_offset(swp); struct page *page = &folio->page; bool swapcache = folio_test_swapcache(folio); - struct zswap_tree *tree = swap_zswap_tree(swp); + struct xarray *tree = swap_zswap_tree(swp); struct zswap_entry *entry; u8 *dst; VM_WARN_ON_ONCE(!folio_test_locked(folio)); - spin_lock(&tree->lock); - entry = zswap_rb_search(&tree->rbroot, offset); - if (!entry) { - spin_unlock(&tree->lock); - return false; - } /* * When reading into the swapcache, invalidate our entry. The * swapcache can be the authoritative owner of the page and @@ -1669,8 +1572,12 @@ bool zswap_load(struct folio *folio) * the fault fails. We remain the primary owner of the entry.) 
*/ if (swapcache) - zswap_rb_erase(&tree->rbroot, entry); - spin_unlock(&tree->lock); + entry = xa_erase(tree, offset); + else + entry = xa_load(tree, offset); + + if (!entry) + return false; if (entry->length) zswap_decompress(entry, page); @@ -1695,19 +1602,17 @@ bool zswap_load(struct folio *folio) void zswap_invalidate(swp_entry_t swp) { pgoff_t offset = swp_offset(swp); - struct zswap_tree *tree = swap_zswap_tree(swp); + struct xarray *tree = swap_zswap_tree(swp); struct zswap_entry *entry; - spin_lock(&tree->lock); - entry = zswap_rb_search(&tree->rbroot, offset); + entry = xa_erase(tree, offset); if (entry) - zswap_invalidate_entry(tree, entry); - spin_unlock(&tree->lock); + zswap_entry_free(entry); } int zswap_swapon(int type, unsigned long nr_pages) { - struct zswap_tree *trees, *tree; + struct xarray *trees, *tree; unsigned int nr, i; nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES); @@ -1717,11 +1622,8 @@ int zswap_swapon(int type, unsigned long nr_pages) return -ENOMEM; } - for (i = 0; i < nr; i++) { - tree = trees + i; - tree->rbroot = RB_ROOT; - spin_lock_init(&tree->lock); - } + for (i = 0; i < nr; i++) + xa_init(trees + i); nr_zswap_trees[type] = nr; zswap_trees[type] = trees; @@ -1730,7 +1632,7 @@ int zswap_swapon(int type, unsigned long nr_pages) void zswap_swapoff(int type) { - struct zswap_tree *trees = zswap_trees[type]; + struct xarray *trees = zswap_trees[type]; unsigned int i; if (!trees) @@ -1738,7 +1640,7 @@ void zswap_swapoff(int type) /* try_to_unuse() invalidated all the entries already */ for (i = 0; i < nr_zswap_trees[type]; i++) - WARN_ON_ONCE(!RB_EMPTY_ROOT(&trees[i].rbroot)); + WARN_ON_ONCE(!xa_empty(trees + i)); kvfree(trees); nr_zswap_trees[type] = 0; @@ -1753,6 +1655,13 @@ void zswap_swapoff(int type) static struct dentry *zswap_debugfs_root; +static int debugfs_get_total_size(void *data, u64 *val) +{ + *val = zswap_total_pages() * PAGE_SIZE; + return 0; +} +DEFINE_DEBUGFS_ATTRIBUTE(total_size_fops, debugfs_get_total_size, 
NULL, "%llu\n"); + static int zswap_debugfs_init(void) { if (!debugfs_initialized()) @@ -1774,8 +1683,8 @@ static int zswap_debugfs_init(void) zswap_debugfs_root, &zswap_reject_compress_poor); debugfs_create_u64("written_back_pages", 0444, zswap_debugfs_root, &zswap_written_back_pages); - debugfs_create_u64("pool_total_size", 0444, - zswap_debugfs_root, &zswap_pool_total_size); + debugfs_create_file("pool_total_size", 0444, + zswap_debugfs_root, NULL, &total_size_fops); debugfs_create_atomic_t("stored_pages", 0444, zswap_debugfs_root, &zswap_stored_pages); debugfs_create_atomic_t("same_filled_pages", 0444, |