summary refs log tree commit diff stats
path: root/mm/vmscan.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r-- mm/vmscan.c 135
1 files changed, 75 insertions, 60 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3ef654addd..68ac33bea3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -967,7 +967,8 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN |
__GFP_NOMEMALLOC | GFP_NOWAIT,
.nid = target_nid,
- .nmask = &allowed_mask
+ .nmask = &allowed_mask,
+ .reason = MR_DEMOTION,
};
if (list_empty(demote_folios))
@@ -1205,25 +1206,28 @@ retry:
if (!can_split_folio(folio, NULL))
goto activate_locked;
/*
- * Split folios without a PMD map right
- * away. Chances are some or all of the
- * tail pages can be freed without IO.
+ * Split partially mapped folios right away.
+ * We can free the unmapped pages without IO.
*/
- if (!folio_entire_mapcount(folio) &&
- split_folio_to_list(folio,
- folio_list))
+ if (data_race(!list_empty(&folio->_deferred_list)) &&
+ split_folio_to_list(folio, folio_list))
goto activate_locked;
}
if (!add_to_swap(folio)) {
+ int __maybe_unused order = folio_order(folio);
+
if (!folio_test_large(folio))
goto activate_locked_split;
/* Fallback to swap normal pages */
- if (split_folio_to_list(folio,
- folio_list))
+ if (split_folio_to_list(folio, folio_list))
goto activate_locked;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- count_memcg_folio_events(folio, THP_SWPOUT_FALLBACK, 1);
- count_vm_event(THP_SWPOUT_FALLBACK);
+ if (nr_pages >= HPAGE_PMD_NR) {
+ count_memcg_folio_events(folio,
+ THP_SWPOUT_FALLBACK, 1);
+ count_vm_event(THP_SWPOUT_FALLBACK);
+ }
+ count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK);
#endif
if (!add_to_swap(folio))
goto activate_locked_split;
@@ -1256,6 +1260,20 @@ retry:
if (folio_test_pmd_mappable(folio))
flags |= TTU_SPLIT_HUGE_PMD;
+ /*
+ * Without TTU_SYNC, try_to_unmap will only begin to
+ * hold PTL from the first present PTE within a large
+ * folio. Some initial PTEs might be skipped due to
+ * races with parallel PTE writes in which PTEs can be
+ * cleared temporarily before being written new present
+ * values. This can leave a large folio still mapped
+ * after try_to_unmap even though some of its subpages
+ * have been unmapped; TTU_SYNC makes try_to_unmap
+ * acquire the PTL from the first PTE, eliminating
+ * the influence of temporary PTE values.
+ */
+ if (folio_test_large(folio) && list_empty(&folio->_deferred_list))
+ flags |= TTU_SYNC;
try_to_unmap(folio, flags);
if (folio_mapped(folio)) {
@@ -2091,8 +2109,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
}
static unsigned int reclaim_folio_list(struct list_head *folio_list,
- struct pglist_data *pgdat,
- bool ignore_references)
+ struct pglist_data *pgdat)
{
struct reclaim_stat dummy_stat;
unsigned int nr_reclaimed;
@@ -2105,7 +2122,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list,
.no_demotion = 1,
};
- nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, ignore_references);
+ nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, true);
while (!list_empty(folio_list)) {
folio = lru_to_folio(folio_list);
list_del(&folio->lru);
@@ -2115,7 +2132,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list,
return nr_reclaimed;
}
-unsigned long reclaim_pages(struct list_head *folio_list, bool ignore_references)
+unsigned long reclaim_pages(struct list_head *folio_list)
{
int nid;
unsigned int nr_reclaimed = 0;
@@ -2137,12 +2154,11 @@ unsigned long reclaim_pages(struct list_head *folio_list, bool ignore_references
continue;
}
- nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid),
- ignore_references);
+ nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid));
nid = folio_nid(lru_to_folio(folio_list));
} while (!list_empty(folio_list));
- nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid), ignore_references);
+ nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid));
memalloc_noreclaim_restore(noreclaim_flag);
@@ -3884,6 +3900,32 @@ done:
* working set protection
******************************************************************************/
+static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ int priority;
+ unsigned long reclaimable;
+
+ if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
+ return;
+ /*
+ * Determine the initial priority based on
+ * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim,
+ * where reclaimed_to_scanned_ratio = inactive / total.
+ */
+ reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
+ if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
+ reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
+
+ /* round down reclaimable and round up sc->nr_to_reclaim */
+ priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
+
+ /*
+ * The estimation is based on LRU pages only, so cap it to prevent
+ * overshoots of shrinker objects by large margins.
+ */
+ sc->priority = clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY);
+}
+
static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
{
int gen, type, zone;
@@ -3917,19 +3959,17 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MIN_SEQ(lruvec);
- /* see the comment on lru_gen_folio */
- gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
- birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
-
- if (time_is_after_jiffies(birth + min_ttl))
+ if (mem_cgroup_below_min(NULL, memcg))
return false;
if (!lruvec_is_sizable(lruvec, sc))
return false;
- mem_cgroup_calculate_protection(NULL, memcg);
+ /* see the comment on lru_gen_folio */
+ gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
+ birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
- return !mem_cgroup_below_min(NULL, memcg);
+ return time_is_before_jiffies(birth + min_ttl);
}
/* to protect the working set of the last N jiffies */
@@ -3939,23 +3979,20 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
{
struct mem_cgroup *memcg;
unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
+ bool reclaimable = !min_ttl;
VM_WARN_ON_ONCE(!current_is_kswapd());
- /* check the order to exclude compaction-induced reclaim */
- if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
- return;
+ set_initial_priority(pgdat, sc);
memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
- if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) {
- mem_cgroup_iter_break(NULL, memcg);
- return;
- }
+ mem_cgroup_calculate_protection(NULL, memcg);
- cond_resched();
+ if (!reclaimable)
+ reclaimable = lruvec_is_reclaimable(lruvec, sc, min_ttl);
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
/*
@@ -3963,7 +4000,7 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
* younger than min_ttl. However, another possibility is all memcgs are
* either too small or below min.
*/
- if (mutex_trylock(&oom_lock)) {
+ if (!reclaimable && mutex_trylock(&oom_lock)) {
struct oom_control oc = {
.gfp_mask = sc->gfp_mask,
};
@@ -4566,7 +4603,6 @@ retry:
/* retry folios that may have missed folio_rotate_reclaimable() */
list_move(&folio->lru, &clean);
- sc->nr_scanned -= folio_nr_pages(folio);
}
spin_lock_irq(&lruvec->lru_lock);
@@ -4756,8 +4792,7 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- mem_cgroup_calculate_protection(NULL, memcg);
-
+ /* lru_gen_age_node() called mem_cgroup_calculate_protection() */
if (mem_cgroup_below_min(NULL, memcg))
return MEMCG_LRU_YOUNG;
@@ -4881,28 +4916,6 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
blk_finish_plug(&plug);
}
-static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
-{
- int priority;
- unsigned long reclaimable;
-
- if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
- return;
- /*
- * Determine the initial priority based on
- * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim,
- * where reclaimed_to_scanned_ratio = inactive / total.
- */
- reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
- if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
- reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
-
- /* round down reclaimable and round up sc->nr_to_reclaim */
- priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
-
- sc->priority = clamp(priority, 0, DEF_PRIORITY);
-}
-
static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
{
struct blk_plug plug;
@@ -6686,6 +6699,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
{
struct zone *zone;
int z;
+ unsigned long nr_reclaimed = sc->nr_reclaimed;
/* Reclaim a number of pages proportional to the number of zones */
sc->nr_to_reclaim = 0;
@@ -6713,7 +6727,8 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
sc->order = 0;
- return sc->nr_scanned >= sc->nr_to_reclaim;
+ /* account for progress from mm_account_reclaimed_pages() */
+ return max(sc->nr_scanned, sc->nr_reclaimed - nr_reclaimed) >= sc->nr_to_reclaim;
}
/* Page allocator PCP high watermark is lowered if reclaim is active. */