summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/cma.c4
-rw-r--r--mm/huge_memory.c77
-rw-r--r--mm/hugetlb.c22
-rw-r--r--mm/kmsan/core.c15
-rw-r--r--mm/ksm.c17
-rw-r--r--mm/memblock.c4
-rw-r--r--mm/memcontrol.c3
-rw-r--r--mm/memory-failure.c11
-rw-r--r--mm/page_table_check.c11
-rw-r--r--mm/pgtable-generic.c2
-rw-r--r--mm/shmem.c5
-rw-r--r--mm/userfaultfd.c35
-rw-r--r--mm/vmalloc.c5
13 files changed, 143 insertions, 68 deletions
diff --git a/mm/cma.c b/mm/cma.c
index 01f5a8f71d..3e9724716b 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -182,10 +182,6 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
if (!size || !memblock_is_region_reserved(base, size))
return -EINVAL;
- /* alignment should be aligned with order_per_bit */
- if (!IS_ALIGNED(CMA_MIN_ALIGNMENT_PAGES, 1 << order_per_bit))
- return -EINVAL;
-
/* ensure minimal alignment required by mm core */
if (!IS_ALIGNED(base | size, CMA_MIN_ALIGNMENT_BYTES))
return -EINVAL;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 89f58c7603..769e8a125f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2493,32 +2493,11 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
return __split_huge_zero_page_pmd(vma, haddr, pmd);
}
- /*
- * Up to this point the pmd is present and huge and userland has the
- * whole access to the hugepage during the split (which happens in
- * place). If we overwrite the pmd with the not-huge version pointing
- * to the pte here (which of course we could if all CPUs were bug
- * free), userland could trigger a small page size TLB miss on the
- * small sized TLB while the hugepage TLB entry is still established in
- * the huge TLB. Some CPU doesn't like that.
- * See http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
- * 383 on page 105. Intel should be safe but is also warns that it's
- * only safe if the permission and cache attributes of the two entries
- * loaded in the two TLB is identical (which should be the case here).
- * But it is generally safer to never allow small and huge TLB entries
- * for the same virtual address to be loaded simultaneously. So instead
- * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
- * current pmd notpresent (atomically because here the pmd_trans_huge
- * must remain set at all times on the pmd until the split is complete
- * for this pmd), then we flush the SMP TLB and finally we write the
- * non-huge version of the pmd entry with pmd_populate.
- */
- old_pmd = pmdp_invalidate(vma, haddr, pmd);
-
- pmd_migration = is_pmd_migration_entry(old_pmd);
+ pmd_migration = is_pmd_migration_entry(*pmd);
if (unlikely(pmd_migration)) {
swp_entry_t entry;
+ old_pmd = *pmd;
entry = pmd_to_swp_entry(old_pmd);
page = pfn_swap_entry_to_page(entry);
write = is_writable_migration_entry(entry);
@@ -2529,6 +2508,30 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
soft_dirty = pmd_swp_soft_dirty(old_pmd);
uffd_wp = pmd_swp_uffd_wp(old_pmd);
} else {
+ /*
+ * Up to this point the pmd is present and huge and userland has
+ * the whole access to the hugepage during the split (which
+ * happens in place). If we overwrite the pmd with the not-huge
+ * version pointing to the pte here (which of course we could if
+ * all CPUs were bug free), userland could trigger a small page
+ * size TLB miss on the small sized TLB while the hugepage TLB
+ * entry is still established in the huge TLB. Some CPU doesn't
+ * like that. See
+ * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
+ * 383 on page 105. Intel should be safe but is also warns that
+ * it's only safe if the permission and cache attributes of the
+ * two entries loaded in the two TLB is identical (which should
+ * be the case here). But it is generally safer to never allow
+ * small and huge TLB entries for the same virtual address to be
+ * loaded simultaneously. So instead of doing "pmd_populate();
+ * flush_pmd_tlb_range();" we first mark the current pmd
+ * notpresent (atomically because here the pmd_trans_huge must
+ * remain set at all times on the pmd until the split is
+ * complete for this pmd), then we flush the SMP TLB and finally
+ * we write the non-huge version of the pmd entry with
+ * pmd_populate.
+ */
+ old_pmd = pmdp_invalidate(vma, haddr, pmd);
page = pmd_page(old_pmd);
folio = page_folio(page);
if (pmd_dirty(old_pmd)) {
@@ -3055,30 +3058,36 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
if (new_order >= folio_order(folio))
return -EINVAL;
- /* Cannot split anonymous THP to order-1 */
- if (new_order == 1 && folio_test_anon(folio)) {
- VM_WARN_ONCE(1, "Cannot split to order-1 folio");
- return -EINVAL;
- }
-
- if (new_order) {
- /* Only swapping a whole PMD-mapped folio is supported */
- if (folio_test_swapcache(folio))
+ if (folio_test_anon(folio)) {
+ /* order-1 is not supported for anonymous THP. */
+ if (new_order == 1) {
+ VM_WARN_ONCE(1, "Cannot split to order-1 folio");
return -EINVAL;
+ }
+ } else if (new_order) {
/* Split shmem folio to non-zero order not supported */
if (shmem_mapping(folio->mapping)) {
VM_WARN_ONCE(1,
"Cannot split shmem folio to non-0 order");
return -EINVAL;
}
- /* No split if the file system does not support large folio */
- if (!mapping_large_folio_support(folio->mapping)) {
+ /*
+ * No split if the file system does not support large folio.
+ * Note that we might still have THPs in such mappings due to
+ * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping
+ * does not actually support large folios properly.
+ */
+ if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
+ !mapping_large_folio_support(folio->mapping)) {
VM_WARN_ONCE(1,
"Cannot split file folio to non-0 order");
return -EINVAL;
}
}
+ /* Only swapping a whole PMD-mapped folio is supported */
+ if (folio_test_swapcache(folio) && new_order)
+ return -EINVAL;
is_hzp = is_huge_zero_page(&folio->page);
if (is_hzp) {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ce7be5c244..c445e6fd85 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5774,8 +5774,20 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
* do_exit() will not see it, and will keep the reservation
* forever.
*/
- if (adjust_reservation && vma_needs_reservation(h, vma, address))
- vma_add_reservation(h, vma, address);
+ if (adjust_reservation) {
+ int rc = vma_needs_reservation(h, vma, address);
+
+ if (rc < 0)
+ /* Pressumably allocate_file_region_entries failed
+ * to allocate a file_region struct. Clear
+ * hugetlb_restore_reserve so that global reserve
+ * count will not be incremented by free_huge_folio.
+ * Act as if we consumed the reservation.
+ */
+ folio_clear_hugetlb_restore_reserve(page_folio(page));
+ else if (rc)
+ vma_add_reservation(h, vma, address);
+ }
tlb_remove_page_size(tlb, page, huge_page_size(h));
/*
@@ -7867,9 +7879,9 @@ void __init hugetlb_cma_reserve(int order)
* huge page demotion.
*/
res = cma_declare_contiguous_nid(0, size, 0,
- PAGE_SIZE << HUGETLB_PAGE_ORDER,
- 0, false, name,
- &hugetlb_cma[nid], nid);
+ PAGE_SIZE << HUGETLB_PAGE_ORDER,
+ HUGETLB_PAGE_ORDER, false, name,
+ &hugetlb_cma[nid], nid);
if (res) {
pr_warn("hugetlb_cma: reservation failed: err %d, node %d",
res, nid);
diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c
index cf2d70e9c9..95f859e38c 100644
--- a/mm/kmsan/core.c
+++ b/mm/kmsan/core.c
@@ -196,8 +196,7 @@ void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b,
u32 origin, bool checked)
{
u64 address = (u64)addr;
- void *shadow_start;
- u32 *origin_start;
+ u32 *shadow_start, *origin_start;
size_t pad = 0;
KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(addr, size));
@@ -225,8 +224,16 @@ void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b,
origin_start =
(u32 *)kmsan_get_metadata((void *)address, KMSAN_META_ORIGIN);
- for (int i = 0; i < size / KMSAN_ORIGIN_SIZE; i++)
- origin_start[i] = origin;
+ /*
+ * If the new origin is non-zero, assume that the shadow byte is also non-zero,
+ * and unconditionally overwrite the old origin slot.
+ * If the new origin is zero, overwrite the old origin slot iff the
+ * corresponding shadow slot is zero.
+ */
+ for (int i = 0; i < size / KMSAN_ORIGIN_SIZE; i++) {
+ if (origin || !shadow_start[i])
+ origin_start[i] = origin;
+ }
}
struct page *kmsan_vmalloc_to_page_or_null(void *vaddr)
diff --git a/mm/ksm.c b/mm/ksm.c
index 8c001819cf..ddd482c718 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -296,7 +296,7 @@ static bool ksm_use_zero_pages __read_mostly;
static bool ksm_smart_scan = true;
/* The number of zero pages which is placed by KSM */
-unsigned long ksm_zero_pages;
+atomic_long_t ksm_zero_pages = ATOMIC_LONG_INIT(0);
/* The number of pages that have been skipped due to "smart scanning" */
static unsigned long ksm_pages_skipped;
@@ -1428,8 +1428,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
* the dirty bit in zero page's PTE is set.
*/
newpte = pte_mkdirty(pte_mkspecial(pfn_pte(page_to_pfn(kpage), vma->vm_page_prot)));
- ksm_zero_pages++;
- mm->ksm_zero_pages++;
+ ksm_map_zero_page(mm);
/*
* We're replacing an anonymous page with a zero page, which is
* not anonymous. We need to do proper accounting otherwise we
@@ -2747,18 +2746,16 @@ static void ksm_do_scan(unsigned int scan_npages)
{
struct ksm_rmap_item *rmap_item;
struct page *page;
- unsigned int npages = scan_npages;
- while (npages-- && likely(!freezing(current))) {
+ while (scan_npages-- && likely(!freezing(current))) {
cond_resched();
rmap_item = scan_get_next_rmap_item(&page);
if (!rmap_item)
return;
cmp_and_merge_page(page, rmap_item);
put_page(page);
+ ksm_pages_scanned++;
}
-
- ksm_pages_scanned += scan_npages - npages;
}
static int ksmd_should_run(void)
@@ -3370,7 +3367,7 @@ static void wait_while_offlining(void)
#ifdef CONFIG_PROC_FS
long ksm_process_profit(struct mm_struct *mm)
{
- return (long)(mm->ksm_merging_pages + mm->ksm_zero_pages) * PAGE_SIZE -
+ return (long)(mm->ksm_merging_pages + mm_ksm_zero_pages(mm)) * PAGE_SIZE -
mm->ksm_rmap_items * sizeof(struct ksm_rmap_item);
}
#endif /* CONFIG_PROC_FS */
@@ -3659,7 +3656,7 @@ KSM_ATTR_RO(pages_skipped);
static ssize_t ksm_zero_pages_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sysfs_emit(buf, "%ld\n", ksm_zero_pages);
+ return sysfs_emit(buf, "%ld\n", atomic_long_read(&ksm_zero_pages));
}
KSM_ATTR_RO(ksm_zero_pages);
@@ -3668,7 +3665,7 @@ static ssize_t general_profit_show(struct kobject *kobj,
{
long general_profit;
- general_profit = (ksm_pages_sharing + ksm_zero_pages) * PAGE_SIZE -
+ general_profit = (ksm_pages_sharing + atomic_long_read(&ksm_zero_pages)) * PAGE_SIZE -
ksm_rmap_items * sizeof(struct ksm_rmap_item);
return sysfs_emit(buf, "%ld\n", general_profit);
diff --git a/mm/memblock.c b/mm/memblock.c
index d09136e040..08e9806b1c 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1339,6 +1339,10 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
int start_rgn, end_rgn;
int i, ret;
+ if (WARN_ONCE(nid == MAX_NUMNODES,
+ "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
+ nid = NUMA_NO_NODE;
+
ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
if (ret)
return ret;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fabce2b50c..612558f306 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -7531,8 +7531,7 @@ void __mem_cgroup_uncharge_folios(struct folio_batch *folios)
* @new: Replacement folio.
*
* Charge @new as a replacement folio for @old. @old will
- * be uncharged upon free. This is only used by the page cache
- * (in replace_page_cache_folio()).
+ * be uncharged upon free.
*
* Both folios must be locked, @new->mapping must be set up.
*/
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 9e62a00b46..7751bd78fb 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1218,7 +1218,7 @@ static int me_huge_page(struct page_state *ps, struct page *p)
* subpages.
*/
folio_put(folio);
- if (__page_handle_poison(p) >= 0) {
+ if (__page_handle_poison(p) > 0) {
page_ref_inc(p);
res = MF_RECOVERED;
} else {
@@ -2097,7 +2097,7 @@ retry:
*/
if (res == 0) {
folio_unlock(folio);
- if (__page_handle_poison(p) >= 0) {
+ if (__page_handle_poison(p) > 0) {
page_ref_inc(p);
res = MF_RECOVERED;
} else {
@@ -2550,6 +2550,13 @@ int unpoison_memory(unsigned long pfn)
goto unlock_mutex;
}
+ if (is_huge_zero_page(&folio->page)) {
+ unpoison_pr_info("Unpoison: huge zero page is not supported %#lx\n",
+ pfn, &unpoison_rs);
+ ret = -EOPNOTSUPP;
+ goto unlock_mutex;
+ }
+
if (!PageHWPoison(p)) {
unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
pfn, &unpoison_rs);
diff --git a/mm/page_table_check.c b/mm/page_table_check.c
index af69c3c8f7..6363f93a47 100644
--- a/mm/page_table_check.c
+++ b/mm/page_table_check.c
@@ -71,6 +71,9 @@ static void page_table_check_clear(unsigned long pfn, unsigned long pgcnt)
page = pfn_to_page(pfn);
page_ext = page_ext_get(page);
+ if (!page_ext)
+ return;
+
BUG_ON(PageSlab(page));
anon = PageAnon(page);
@@ -108,6 +111,9 @@ static void page_table_check_set(unsigned long pfn, unsigned long pgcnt,
page = pfn_to_page(pfn);
page_ext = page_ext_get(page);
+ if (!page_ext)
+ return;
+
BUG_ON(PageSlab(page));
anon = PageAnon(page);
@@ -138,7 +144,10 @@ void __page_table_check_zero(struct page *page, unsigned int order)
BUG_ON(PageSlab(page));
page_ext = page_ext_get(page);
- BUG_ON(!page_ext);
+
+ if (!page_ext)
+ return;
+
for (i = 0; i < (1ul << order); i++) {
struct page_table_check *ptc = get_page_table_check(page_ext);
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 4fcd959dcc..a78a4adf71 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -198,6 +198,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return old;
@@ -208,6 +209,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
return pmdp_invalidate(vma, address, pmdp);
}
#endif
diff --git a/mm/shmem.c b/mm/shmem.c
index 94ab99b6b5..b66e99e2c7 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1786,7 +1786,7 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
xa_lock_irq(&swap_mapping->i_pages);
error = shmem_replace_entry(swap_mapping, swap_index, old, new);
if (!error) {
- mem_cgroup_migrate(old, new);
+ mem_cgroup_replace_folio(old, new);
__lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1);
__lruvec_stat_mod_folio(new, NR_SHMEM, 1);
__lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1);
@@ -3467,8 +3467,7 @@ static int shmem_rename2(struct mnt_idmap *idmap,
return error;
}
- simple_offset_remove(shmem_get_offset_ctx(old_dir), old_dentry);
- error = simple_offset_add(shmem_get_offset_ctx(new_dir), old_dentry);
+ error = simple_offset_rename(old_dir, old_dentry, new_dir, new_dentry);
if (error)
return error;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 3c3539c573..829f7b1089 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -316,6 +316,38 @@ out_release:
goto out;
}
+static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr)
+{
+ struct folio *folio;
+ int ret = -ENOMEM;
+
+ folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr);
+ if (!folio)
+ return ret;
+
+ if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
+ goto out_put;
+
+ /*
+ * The memory barrier inside __folio_mark_uptodate makes sure that
+ * zeroing out the folio become visible before mapping the page
+ * using set_pte_at(). See do_anonymous_page().
+ */
+ __folio_mark_uptodate(folio);
+
+ ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
+ &folio->page, true, 0);
+ if (ret)
+ goto out_put;
+
+ return 0;
+out_put:
+ folio_put(folio);
+ return ret;
+}
+
static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr)
@@ -324,6 +356,9 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
spinlock_t *ptl;
int ret;
+ if (mm_forbids_zeropage(dst_vma->vm_mm))
+ return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr);
+
_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
dst_vma->vm_page_prot));
ret = -EAGAIN;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 125427cbdb..109272b8ee 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3492,7 +3492,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
{
unsigned int nr_allocated = 0;
gfp_t alloc_gfp = gfp;
- bool nofail = false;
+ bool nofail = gfp & __GFP_NOFAIL;
struct page *page;
int i;
@@ -3549,12 +3549,11 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
* and compaction etc.
*/
alloc_gfp &= ~__GFP_NOFAIL;
- nofail = true;
}
/* High-order pages or fallback path if "bulk" fails. */
while (nr_allocated < nr_pages) {
- if (fatal_signal_pending(current))
+ if (!nofail && fatal_signal_pending(current))
break;
if (nid == NUMA_NO_NODE)