summary refs log tree commit diff stats
path: root/mm/huge_memory.c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r-- mm/huge_memory.c 150
1 files changed, 92 insertions, 58 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 874000f97b..3f50578eb9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -37,6 +37,7 @@
#include <linux/page_owner.h>
#include <linux/sched/sysctl.h>
#include <linux/memory-tiers.h>
+#include <linux/compat.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
@@ -65,7 +66,11 @@ unsigned long transparent_hugepage_flags __read_mostly =
(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
-static struct shrinker deferred_split_shrinker;
+static struct shrinker *deferred_split_shrinker;
+static unsigned long deferred_split_count(struct shrinker *shrink,
+ struct shrink_control *sc);
+static unsigned long deferred_split_scan(struct shrinker *shrink,
+ struct shrink_control *sc);
static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
@@ -96,11 +101,11 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
return in_pf;
/*
- * Special VMA and hugetlb VMA.
+ * khugepaged special VMA and hugetlb VMA.
* Must be checked after dax since some dax mappings may have
* VM_MIXEDMAP set.
*/
- if (vm_flags & VM_NO_KHUGEPAGED)
+ if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED))
return false;
/*
@@ -128,12 +133,18 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
!hugepage_flags_always())))
return false;
- /* Only regular file is valid */
- if (!in_pf && file_thp_enabled(vma))
- return true;
-
- if (!vma_is_anonymous(vma))
+ if (!vma_is_anonymous(vma)) {
+ /*
+ * Trust that ->huge_fault() handlers know what they are doing
+ * in fault path.
+ */
+ if (((in_pf || smaps)) && vma->vm_ops->huge_fault)
+ return true;
+ /* Only regular file is valid in collapse path */
+ if (((!in_pf || smaps)) && file_thp_enabled(vma))
+ return true;
return false;
+ }
if (vma_is_temporary_stack(vma))
return false;
@@ -229,11 +240,7 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
return 0;
}
-static struct shrinker huge_zero_page_shrinker = {
- .count_objects = shrink_huge_zero_page_count,
- .scan_objects = shrink_huge_zero_page_scan,
- .seeks = DEFAULT_SEEKS,
-};
+static struct shrinker *huge_zero_page_shrinker;
#ifdef CONFIG_SYSFS
static ssize_t enabled_show(struct kobject *kobj,
@@ -454,6 +461,38 @@ static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
}
#endif /* CONFIG_SYSFS */
+static int __init thp_shrinker_init(void)
+{ /*
+   * Allocate both THP shrinkers, then attach their count/scan callbacks
+   * and register them. Returns 0 on success, or -ENOMEM when either
+   * shrinker_alloc() call returns NULL.
+   */
+ huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
+ if (!huge_zero_page_shrinker)
+ return -ENOMEM;
+ /* Same flag set the removed static deferred_split_shrinker initializer used. */
+ deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
+ SHRINKER_MEMCG_AWARE |
+ SHRINKER_NONSLAB,
+ "thp-deferred_split");
+ if (!deferred_split_shrinker) { /* second allocation failed: release the first */
+ shrinker_free(huge_zero_page_shrinker);
+ return -ENOMEM;
+ }
+ /*
+  * Both allocations succeeded, so nothing below has a failure path:
+  * each shrinker gets its callbacks assigned before it is registered.
+  * NOTE(review): .seeks is no longer set here, whereas the removed static
+  * initializers used DEFAULT_SEEKS; presumably shrinker_alloc() supplies
+  * that default - confirm.
+  */
+ huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count;
+ huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan;
+ shrinker_register(huge_zero_page_shrinker);
+
+ deferred_split_shrinker->count_objects = deferred_split_count;
+ deferred_split_shrinker->scan_objects = deferred_split_scan;
+ shrinker_register(deferred_split_shrinker);
+
+ return 0;
+}
+/*
+ * Free both THP shrinkers set up by thp_shrinker_init(). In this diff the
+ * only caller is the err_khugepaged unwind path of hugepage_init(), where
+ * it replaces the two removed unregister_shrinker() calls.
+ * NOTE(review): shrinker_free() is the only teardown call made here, so it
+ * presumably also unregisters an already-registered shrinker - confirm.
+ */
+static void __init thp_shrinker_exit(void)
+{
+ shrinker_free(huge_zero_page_shrinker);
+ shrinker_free(deferred_split_shrinker);
+}
+
static int __init hugepage_init(void)
{
int err;
@@ -482,12 +521,9 @@ static int __init hugepage_init(void)
if (err)
goto err_slab;
- err = register_shrinker(&huge_zero_page_shrinker, "thp-zero");
- if (err)
- goto err_hzp_shrinker;
- err = register_shrinker(&deferred_split_shrinker, "thp-deferred_split");
+ err = thp_shrinker_init();
if (err)
- goto err_split_shrinker;
+ goto err_shrinker;
/*
* By default disable transparent hugepages on smaller systems,
@@ -505,10 +541,8 @@ static int __init hugepage_init(void)
return 0;
err_khugepaged:
- unregister_shrinker(&deferred_split_shrinker);
-err_split_shrinker:
- unregister_shrinker(&huge_zero_page_shrinker);
-err_hzp_shrinker:
+ thp_shrinker_exit();
+err_shrinker:
khugepaged_destroy();
err_slab:
hugepage_exit_sysfs(hugepage_kobj);
@@ -599,7 +633,10 @@ static unsigned long __thp_get_unmapped_area(struct file *filp,
{
loff_t off_end = off + len;
loff_t off_align = round_up(off, size);
- unsigned long len_pad, ret;
+ unsigned long len_pad, ret, off_sub;
+
+ if (IS_ENABLED(CONFIG_32BIT) || in_compat_syscall())
+ return 0;
if (off_end <= off_align || (off_end - off_align) < size)
return 0;
@@ -625,7 +662,13 @@ static unsigned long __thp_get_unmapped_area(struct file *filp,
if (ret == addr)
return addr;
- ret += (off - ret) & (size - 1);
+ off_sub = (off - ret) & (size - 1);
+
+ if (current->mm->get_unmapped_area == arch_get_unmapped_area_topdown &&
+ !off_sub)
+ return ret + size;
+
+ ret += off_sub;
return ret;
}
@@ -1349,7 +1392,8 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
if (folio_ref_count(folio) == 1) {
pmd_t entry;
- page_move_anon_rmap(page, vma);
+ folio_move_anon_rmap(folio, vma);
+ SetPageAnonExclusive(page);
folio_unlock(folio);
reuse:
if (unlikely(unshare)) {
@@ -1490,9 +1534,9 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
pmd_t oldpmd = vmf->orig_pmd;
pmd_t pmd;
- struct page *page;
+ struct folio *folio;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
- int page_nid = NUMA_NO_NODE;
+ int nid = NUMA_NO_NODE;
int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK);
bool migrated = false, writable = false;
int flags = 0;
@@ -1514,36 +1558,34 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
can_change_pmd_writable(vma, vmf->address, pmd))
writable = true;
- page = vm_normal_page_pmd(vma, haddr, pmd);
- if (!page)
+ folio = vm_normal_folio_pmd(vma, haddr, pmd);
+ if (!folio)
goto out_map;
/* See similar comment in do_numa_page for explanation */
if (!writable)
flags |= TNF_NO_GROUP;
- page_nid = page_to_nid(page);
+ nid = folio_nid(folio);
/*
* For memory tiering mode, cpupid of slow memory page is used
* to record page access time. So use default value.
*/
- if (node_is_toptier(page_nid))
- last_cpupid = page_cpupid_last(page);
- target_nid = numa_migrate_prep(page, vma, haddr, page_nid,
- &flags);
-
+ if (node_is_toptier(nid))
+ last_cpupid = folio_last_cpupid(folio);
+ target_nid = numa_migrate_prep(folio, vma, haddr, nid, &flags);
if (target_nid == NUMA_NO_NODE) {
- put_page(page);
+ folio_put(folio);
goto out_map;
}
spin_unlock(vmf->ptl);
writable = false;
- migrated = migrate_misplaced_page(page, vma, target_nid);
+ migrated = migrate_misplaced_folio(folio, vma, target_nid);
if (migrated) {
flags |= TNF_MIGRATED;
- page_nid = target_nid;
+ nid = target_nid;
} else {
flags |= TNF_MIGRATE_FAIL;
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
@@ -1555,9 +1597,8 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
}
out:
- if (page_nid != NUMA_NO_NODE)
- task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
- flags);
+ if (nid != NUMA_NO_NODE)
+ task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
return 0;
@@ -1825,7 +1866,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
if (is_swap_pmd(*pmd)) {
swp_entry_t entry = pmd_to_swp_entry(*pmd);
- struct page *page = pfn_swap_entry_to_page(entry);
+ struct folio *folio = page_folio(pfn_swap_entry_to_page(entry));
pmd_t newpmd;
VM_BUG_ON(!is_pmd_migration_entry(*pmd));
@@ -1834,7 +1875,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
* A protection check is difficult so
* just be safe and disable write
*/
- if (PageAnon(page))
+ if (folio_test_anon(folio))
entry = make_readable_exclusive_migration_entry(swp_offset(entry));
else
entry = make_readable_migration_entry(swp_offset(entry));
@@ -1856,7 +1897,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
#endif
if (prot_numa) {
- struct page *page;
+ struct folio *folio;
bool toptier;
/*
* Avoid trapping faults against the zero page. The read-only
@@ -1869,8 +1910,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (pmd_protnone(*pmd))
goto unlock;
- page = pmd_page(*pmd);
- toptier = node_is_toptier(page_to_nid(page));
+ folio = page_folio(pmd_page(*pmd));
+ toptier = node_is_toptier(folio_nid(folio));
/*
* Skip scanning top tier node if normal numa
* balancing is disabled
@@ -1881,7 +1922,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
!toptier)
- xchg_page_access_time(page, jiffies_to_msecs(jiffies));
+ folio_xchg_access_time(folio,
+ jiffies_to_msecs(jiffies));
}
/*
* In case prot_numa, we are under mmap_read_lock(mm). It's critical
@@ -2483,7 +2525,7 @@ static void __split_huge_page_tail(struct folio *folio, int tail,
if (page_is_idle(head))
set_page_idle(page_tail);
- page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
+ folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio));
/*
* always add to the tail because some iterators expect new
@@ -2791,7 +2833,7 @@ void folio_undo_large_rmappable(struct folio *folio)
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
if (!list_empty(&folio->_deferred_list)) {
ds_queue->split_queue_len--;
- list_del(&folio->_deferred_list);
+ list_del_init(&folio->_deferred_list);
}
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
}
@@ -2830,7 +2872,7 @@ void deferred_split_folio(struct folio *folio)
#ifdef CONFIG_MEMCG
if (memcg)
set_shrinker_bit(memcg, folio_nid(folio),
- deferred_split_shrinker.id);
+ deferred_split_shrinker->id);
#endif
}
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
@@ -2904,14 +2946,6 @@ next:
return split;
}
-static struct shrinker deferred_split_shrinker = {
- .count_objects = deferred_split_count,
- .scan_objects = deferred_split_scan,
- .seeks = DEFAULT_SEEKS,
- .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE |
- SHRINKER_NONSLAB,
-};
-
#ifdef CONFIG_DEBUG_FS
static void split_huge_pages_all(void)
{