From 50ba0232fd5312410f1b65247e774244f89a628e Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 18 May 2024 20:50:36 +0200 Subject: Merging upstream version 6.8.9. Signed-off-by: Daniel Baumann --- mm/memcontrol.c | 376 ++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 242 insertions(+), 134 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 27c9f451d4..61932c9215 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -64,6 +64,7 @@ #include #include #include +#include #include "internal.h" #include #include @@ -573,116 +574,6 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) return mz; } -/* - * memcg and lruvec stats flushing - * - * Many codepaths leading to stats update or read are performance sensitive and - * adding stats flushing in such codepaths is not desirable. So, to optimize the - * flushing the kernel does: - * - * 1) Periodically and asynchronously flush the stats every 2 seconds to not let - * rstat update tree grow unbounded. - * - * 2) Flush the stats synchronously on reader side only when there are more than - * (MEMCG_CHARGE_BATCH * nr_cpus) update events. Though this optimization - * will let stats be out of sync by atmost (MEMCG_CHARGE_BATCH * nr_cpus) but - * only for 2 seconds due to (1). - */ -static void flush_memcg_stats_dwork(struct work_struct *w); -static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); -static DEFINE_PER_CPU(unsigned int, stats_updates); -static atomic_t stats_flush_ongoing = ATOMIC_INIT(0); -static atomic_t stats_flush_threshold = ATOMIC_INIT(0); -static u64 flush_next_time; - -#define FLUSH_TIME (2UL*HZ) - -/* - * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can - * not rely on this as part of an acquired spinlock_t lock. These functions are - * never used in hardirq context on PREEMPT_RT and therefore disabling preemtion - * is sufficient. - */ -static void memcg_stats_lock(void) -{ - preempt_disable_nested(); - VM_WARN_ON_IRQS_ENABLED(); -} - -static void __memcg_stats_lock(void) -{ - preempt_disable_nested(); -} - -static void memcg_stats_unlock(void) -{ - preempt_enable_nested(); -} - -static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) -{ - unsigned int x; - - if (!val) - return; - - cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); - - x = __this_cpu_add_return(stats_updates, abs(val)); - if (x > MEMCG_CHARGE_BATCH) { - /* - * If stats_flush_threshold exceeds the threshold - * (>num_online_cpus()), cgroup stats update will be triggered - * in __mem_cgroup_flush_stats(). Increasing this var further - * is redundant and simply adds overhead in atomic update. - */ - if (atomic_read(&stats_flush_threshold) <= num_online_cpus()) - atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold); - __this_cpu_write(stats_updates, 0); - } -} - -static void do_flush_stats(void) -{ - /* - * We always flush the entire tree, so concurrent flushers can just - * skip. This avoids a thundering herd problem on the rstat global lock - * from memcg flushers (e.g. reclaim, refault, etc). - */ - if (atomic_read(&stats_flush_ongoing) || - atomic_xchg(&stats_flush_ongoing, 1)) - return; - - WRITE_ONCE(flush_next_time, jiffies_64 + 2*FLUSH_TIME); - - cgroup_rstat_flush(root_mem_cgroup->css.cgroup); - - atomic_set(&stats_flush_threshold, 0); - atomic_set(&stats_flush_ongoing, 0); -} - -void mem_cgroup_flush_stats(void) -{ - if (atomic_read(&stats_flush_threshold) > num_online_cpus()) - do_flush_stats(); -} - -void mem_cgroup_flush_stats_ratelimited(void) -{ - if (time_after64(jiffies_64, READ_ONCE(flush_next_time))) - mem_cgroup_flush_stats(); -} - -static void flush_memcg_stats_dwork(struct work_struct *w) -{ - /* - * Always flush here so that flushing in latency-sensitive paths is - * as cheap as possible. - */ - do_flush_stats(); - queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); -} - /* Subset of vm_event_item to report for memcg event stats */ static const unsigned int memcg_vm_event_stat[] = { PGPGIN, @@ -703,6 +594,7 @@ static const unsigned int memcg_vm_event_stat[] = { #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) ZSWPIN, ZSWPOUT, + ZSWPWB, #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE THP_FAULT_ALLOC, @@ -729,6 +621,15 @@ static inline int memcg_events_index(enum vm_event_item idx) } struct memcg_vmstats_percpu { + /* Stats updates since the last flush */ + unsigned int stats_updates; + + /* Cached pointers for fast iteration in memcg_rstat_updated() */ + struct memcg_vmstats_percpu *parent; + struct memcg_vmstats *vmstats; + + /* The above should fit a single cacheline for memcg_rstat_updated() */ + /* Local (CPU and cgroup) page state & events */ long state[MEMCG_NR_STAT]; unsigned long events[NR_MEMCG_EVENTS]; @@ -740,7 +641,7 @@ struct memcg_vmstats_percpu { /* Cgroup1: threshold notifications & softlimit tree updates */ unsigned long nr_page_events; unsigned long targets[MEM_CGROUP_NTARGETS]; -}; +} ____cacheline_aligned; struct memcg_vmstats { /* Aggregated (CPU and subtree) page state & events */ @@ -754,8 +655,133 @@ struct memcg_vmstats { /* Pending child counts during tree propagation */ long state_pending[MEMCG_NR_STAT]; unsigned long events_pending[NR_MEMCG_EVENTS]; + + /* Stats updates since the last flush */ + atomic64_t stats_updates; }; +/* + * memcg and lruvec stats flushing + * + * Many codepaths leading to stats update or read are performance sensitive and + * adding stats flushing in such codepaths is not desirable. So, to optimize the + * flushing the kernel does: + * + * 1) Periodically and asynchronously flush the stats every 2 seconds to not let + * rstat update tree grow unbounded. + * + * 2) Flush the stats synchronously on reader side only when there are more than + * (MEMCG_CHARGE_BATCH * nr_cpus) update events. Though this optimization + * will let stats be out of sync by atmost (MEMCG_CHARGE_BATCH * nr_cpus) but + * only for 2 seconds due to (1). + */ +static void flush_memcg_stats_dwork(struct work_struct *w); +static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); +static u64 flush_last_time; + +#define FLUSH_TIME (2UL*HZ) + +/* + * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can + * not rely on this as part of an acquired spinlock_t lock. These functions are + * never used in hardirq context on PREEMPT_RT and therefore disabling preemtion + * is sufficient. + */ +static void memcg_stats_lock(void) +{ + preempt_disable_nested(); + VM_WARN_ON_IRQS_ENABLED(); +} + +static void __memcg_stats_lock(void) +{ + preempt_disable_nested(); +} + +static void memcg_stats_unlock(void) +{ + preempt_enable_nested(); +} + + +static bool memcg_vmstats_needs_flush(struct memcg_vmstats *vmstats) +{ + return atomic64_read(&vmstats->stats_updates) > + MEMCG_CHARGE_BATCH * num_online_cpus(); +} + +static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) +{ + struct memcg_vmstats_percpu *statc; + int cpu = smp_processor_id(); + + if (!val) + return; + + cgroup_rstat_updated(memcg->css.cgroup, cpu); + statc = this_cpu_ptr(memcg->vmstats_percpu); + for (; statc; statc = statc->parent) { + statc->stats_updates += abs(val); + if (statc->stats_updates < MEMCG_CHARGE_BATCH) + continue; + + /* + * If @memcg is already flush-able, increasing stats_updates is + * redundant. Avoid the overhead of the atomic update. + */ + if (!memcg_vmstats_needs_flush(statc->vmstats)) + atomic64_add(statc->stats_updates, + &statc->vmstats->stats_updates); + statc->stats_updates = 0; + } +} + +static void do_flush_stats(struct mem_cgroup *memcg) +{ + if (mem_cgroup_is_root(memcg)) + WRITE_ONCE(flush_last_time, jiffies_64); + + cgroup_rstat_flush(memcg->css.cgroup); +} + +/* + * mem_cgroup_flush_stats - flush the stats of a memory cgroup subtree + * @memcg: root of the subtree to flush + * + * Flushing is serialized by the underlying global rstat lock. There is also a + * minimum amount of work to be done even if there are no stat updates to flush. + * Hence, we only flush the stats if the updates delta exceeds a threshold. This + * avoids unnecessary work and contention on the underlying lock. + */ +void mem_cgroup_flush_stats(struct mem_cgroup *memcg) +{ + if (mem_cgroup_disabled()) + return; + + if (!memcg) + memcg = root_mem_cgroup; + + if (memcg_vmstats_needs_flush(memcg->vmstats)) + do_flush_stats(memcg); +} + +void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg) +{ + /* Only flush if the periodic flusher is one full cycle late */ + if (time_after64(jiffies_64, READ_ONCE(flush_last_time) + 2*FLUSH_TIME)) + mem_cgroup_flush_stats(memcg); +} + +static void flush_memcg_stats_dwork(struct work_struct *w) +{ + /* + * Deliberately ignore memcg_vmstats_needs_flush() here so that flushing + * in latency-sensitive paths is as cheap as possible. + */ + do_flush_stats(root_mem_cgroup); + queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); +} + unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { long x = READ_ONCE(memcg->vmstats->state[idx]); @@ -870,16 +896,15 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, __mod_memcg_lruvec_state(lruvec, idx, val); } -void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, +void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val) { - struct page *head = compound_head(page); /* rmap on tail pages */ struct mem_cgroup *memcg; - pg_data_t *pgdat = page_pgdat(page); + pg_data_t *pgdat = folio_pgdat(folio); struct lruvec *lruvec; rcu_read_lock(); - memcg = page_memcg(head); + memcg = folio_memcg(folio); /* Untracked pages have no memcg, no lruvec. Update only the node */ if (!memcg) { rcu_read_unlock(); @@ -891,7 +916,7 @@ void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, __mod_lruvec_state(lruvec, idx, val); rcu_read_unlock(); } -EXPORT_SYMBOL(__mod_lruvec_page_state); +EXPORT_SYMBOL(__lruvec_stat_mod_folio); void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { @@ -1627,7 +1652,7 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) * * Current memory state: */ - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats(memcg); for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { u64 size; @@ -2603,8 +2628,9 @@ static unsigned long calculate_high_delay(struct mem_cgroup *memcg, } /* - * Scheduled by try_charge() to be executed from the userland return path - * and reclaims memory over the high limit. + * Reclaims memory over the high limit. Called directly from + * try_charge() (context permitting), as well as from the userland + * return path where reclaim is always able to block. */ void mem_cgroup_handle_over_high(gfp_t gfp_mask) { @@ -2623,6 +2649,17 @@ void mem_cgroup_handle_over_high(gfp_t gfp_mask) current->memcg_nr_pages_over_high = 0; retry_reclaim: + /* + * Bail if the task is already exiting. Unlike memory.max, + * memory.high enforcement isn't as strict, and there is no + * OOM killer involved, which means the excess could already + * be much bigger (and still growing) than it could for + * memory.max; the dying task could get stuck in fruitless + * reclaim for a long time, which isn't desirable. + */ + if (task_is_dying()) + goto out; + /* * The allocating task should reclaim at least the batch size, but for * subsequent retries we only want to do what's necessary to prevent oom @@ -2673,6 +2710,9 @@ retry_reclaim: } /* + * Reclaim didn't manage to push usage below the limit, slow + * this allocating task down. + * * If we exit early, we're guaranteed to die (since * schedule_timeout_killable sets TASK_KILLABLE). This means we don't * need to account for any ill-begotten jiffies to pay them off later. @@ -2867,11 +2907,17 @@ done_restock: } } while ((memcg = parent_mem_cgroup(memcg))); + /* + * Reclaim is set up above to be called from the userland + * return path. But also attempt synchronous reclaim to avoid + * excessive overrun while the task is still inside the + * kernel. If this is successful, the return path will see it + * when it rechecks the overage and simply bail out. + */ if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH && !(current->flags & PF_MEMALLOC) && - gfpflags_allow_blocking(gfp_mask)) { + gfpflags_allow_blocking(gfp_mask)) mem_cgroup_handle_over_high(gfp_mask); - } return 0; } @@ -4177,7 +4223,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) int nid; struct mem_cgroup *memcg = mem_cgroup_from_seq(m); - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats(memcg); for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { seq_printf(m, "%s=%lu", stat->name, @@ -4258,7 +4304,7 @@ static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats(memcg); for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { unsigned long nr; @@ -4754,7 +4800,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); struct mem_cgroup *parent; - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats(memcg); *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); @@ -5150,7 +5196,7 @@ out_kfree: return ret; } -#if defined(CONFIG_MEMCG_KMEM) && (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_SLUB_DEBUG) static int mem_cgroup_slab_show(struct seq_file *m, void *p) { /* @@ -5259,8 +5305,7 @@ static struct cftype mem_cgroup_legacy_files[] = { .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, -#if defined(CONFIG_MEMCG_KMEM) && \ - (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_SLUB_DEBUG) { .name = "kmem.slabinfo", .seq_show = mem_cgroup_slab_show, @@ -5437,10 +5482,11 @@ static void mem_cgroup_free(struct mem_cgroup *memcg) __mem_cgroup_free(memcg); } -static struct mem_cgroup *mem_cgroup_alloc(void) +static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) { + struct memcg_vmstats_percpu *statc, *pstatc; struct mem_cgroup *memcg; - int node; + int node, cpu; int __maybe_unused i; long error = -ENOMEM; @@ -5464,6 +5510,14 @@ static struct mem_cgroup *mem_cgroup_alloc(void) if (!memcg->vmstats_percpu) goto fail; + for_each_possible_cpu(cpu) { + if (parent) + pstatc = per_cpu_ptr(parent->vmstats_percpu, cpu); + statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); + statc->parent = parent ? pstatc : NULL; + statc->vmstats = memcg->vmstats; + } + for_each_node(node) if (alloc_mem_cgroup_per_node_info(memcg, node)) goto fail; @@ -5509,7 +5563,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) struct mem_cgroup *memcg, *old_memcg; old_memcg = set_active_memcg(parent); - memcg = mem_cgroup_alloc(); + memcg = mem_cgroup_alloc(parent); set_active_memcg(old_memcg); if (IS_ERR(memcg)) return ERR_CAST(memcg); @@ -5518,6 +5572,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX); #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) memcg->zswap_max = PAGE_COUNTER_MAX; + WRITE_ONCE(memcg->zswap_writeback, + !parent || READ_ONCE(parent->zswap_writeback)); #endif page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); if (parent) { @@ -5614,6 +5670,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) page_counter_set_min(&memcg->memory, 0); page_counter_set_low(&memcg->memory, 0); + zswap_memcg_offline_cleanup(memcg); + memcg_offline_kmem(memcg); reparent_shrinker_deferred(memcg); wb_memcg_offline(memcg); @@ -5784,6 +5842,10 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) } } } + statc->stats_updates = 0; + /* We are in a per-cpu loop here, only do the atomic write once */ + if (atomic64_read(&memcg->vmstats->stats_updates)) + atomic64_set(&memcg->vmstats->stats_updates, 0); } #ifdef CONFIG_MMU @@ -6783,6 +6845,10 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, return nbytes; } +/* + * Note: don't forget to update the 'samples/cgroup/memcg_event_listener' + * if any new events become available. + */ static void __memory_events_show(struct seq_file *m, atomic_long_t *events) { seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW])); @@ -6839,7 +6905,7 @@ static int memory_numa_stat_show(struct seq_file *m, void *v) int i; struct mem_cgroup *memcg = mem_cgroup_from_seq(m); - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats(memcg); for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { int nid; @@ -8085,7 +8151,11 @@ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg) break; } - cgroup_rstat_flush(memcg->css.cgroup); + /* + * mem_cgroup_flush_stats() ignores small changes. Use + * do_flush_stats() directly to get accurate stats for charging. + */ + do_flush_stats(memcg); pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE; if (pages < max) continue; @@ -8147,11 +8217,19 @@ void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size) rcu_read_unlock(); } +bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg) +{ + /* if zswap is disabled, do not block pages going to the swapping device */ + return !is_zswap_enabled() || !memcg || READ_ONCE(memcg->zswap_writeback); +} + static u64 zswap_current_read(struct cgroup_subsys_state *css, struct cftype *cft) { - cgroup_rstat_flush(css->cgroup); - return memcg_page_state(mem_cgroup_from_css(css), MEMCG_ZSWAP_B); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + mem_cgroup_flush_stats(memcg); + return memcg_page_state(memcg, MEMCG_ZSWAP_B); } static int zswap_max_show(struct seq_file *m, void *v) @@ -8177,6 +8255,31 @@ static ssize_t zswap_max_write(struct kernfs_open_file *of, return nbytes; } +static int zswap_writeback_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + + seq_printf(m, "%d\n", READ_ONCE(memcg->zswap_writeback)); + return 0; +} + +static ssize_t zswap_writeback_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + int zswap_writeback; + ssize_t parse_ret = kstrtoint(strstrip(buf), 0, &zswap_writeback); + + if (parse_ret) + return parse_ret; + + if (zswap_writeback != 0 && zswap_writeback != 1) + return -EINVAL; + + WRITE_ONCE(memcg->zswap_writeback, zswap_writeback); + return nbytes; +} + static struct cftype zswap_files[] = { { .name = "zswap.current", @@ -8189,6 +8292,11 @@ static struct cftype zswap_files[] = { .seq_show = zswap_max_show, .write = zswap_max_write, }, + { + .name = "zswap.writeback", + .seq_show = zswap_writeback_show, + .write = zswap_writeback_write, + }, { } /* terminate */ }; #endif /* CONFIG_MEMCG_KMEM && CONFIG_ZSWAP */ -- cgit v1.2.3