author		Daniel Baumann <daniel.baumann@progress-linux.org>	2024-08-07 13:11:40 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-08-07 13:11:40 +0000
commit		8b0a8165cdad0f4133837d753649ef4682e42c3b (patch)
tree		5c58f869f31ddb1f7bd6e8bdea269b680b36c5b6 /mm/memory.c
parent		Releasing progress-linux version 6.8.12-1~progress7.99u1. (diff)
Merging upstream version 6.9.7.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'mm/memory.c')
-rw-r--r--	mm/memory.c	401
1 file changed, 269 insertions, 132 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 296e4f05e8..d2155ced45 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -806,9 +806,9 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		}
 		rss[MM_SWAPENTS]++;
 	} else if (is_migration_entry(entry)) {
-		page = pfn_swap_entry_to_page(entry);
+		folio = pfn_swap_entry_folio(entry);

-		rss[mm_counter(page)]++;
+		rss[mm_counter(folio)]++;

 		if (!is_readable_migration_entry(entry) &&
 				is_cow_mapping(vm_flags)) {
@@ -840,7 +840,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		 * keep things as they are.
 		 */
 		folio_get(folio);
-		rss[mm_counter(page)]++;
+		rss[mm_counter(folio)]++;

 		/* Cannot fail as these pages cannot get pinned. */
 		folio_try_dup_anon_rmap_pte(folio, page, src_vma);
@@ -930,68 +930,111 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
 	return 0;
 }

+static __always_inline void __copy_present_ptes(struct vm_area_struct *dst_vma,
+		struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte,
+		pte_t pte, unsigned long addr, int nr)
+{
+	struct mm_struct *src_mm = src_vma->vm_mm;
+
+	/* If it's a COW mapping, write protect it both processes. */
+	if (is_cow_mapping(src_vma->vm_flags) && pte_write(pte)) {
+		wrprotect_ptes(src_mm, addr, src_pte, nr);
+		pte = pte_wrprotect(pte);
+	}
+
+	/* If it's a shared mapping, mark it clean in the child. */
+	if (src_vma->vm_flags & VM_SHARED)
+		pte = pte_mkclean(pte);
+	pte = pte_mkold(pte);
+
+	if (!userfaultfd_wp(dst_vma))
+		pte = pte_clear_uffd_wp(pte);
+
+	set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr);
+}
+
 /*
- * Copy one pte.  Returns 0 if succeeded, or -EAGAIN if one preallocated page
- * is required to copy this pte.
+ * Copy one present PTE, trying to batch-process subsequent PTEs that map
+ * consecutive pages of the same folio by copying them as well.
+ *
+ * Returns -EAGAIN if one preallocated page is required to copy the next PTE.
+ * Otherwise, returns the number of copied PTEs (at least 1).
  */
 static inline int
-copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
-		 pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
-		 struct folio **prealloc)
+copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+		 pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr,
+		 int max_nr, int *rss, struct folio **prealloc)
 {
-	struct mm_struct *src_mm = src_vma->vm_mm;
-	unsigned long vm_flags = src_vma->vm_flags;
-	pte_t pte = ptep_get(src_pte);
 	struct page *page;
 	struct folio *folio;
+	bool any_writable;
+	fpb_t flags = 0;
+	int err, nr;

 	page = vm_normal_page(src_vma, addr, pte);
-	if (page)
-		folio = page_folio(page);
-	if (page && folio_test_anon(folio)) {
+	if (unlikely(!page))
+		goto copy_pte;
+
+	folio = page_folio(page);
+
+	/*
+	 * If we likely have to copy, just don't bother with batching. Make
+	 * sure that the common "small folio" case is as fast as possible
+	 * by keeping the batching logic separate.
+	 */
+	if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) {
+		if (src_vma->vm_flags & VM_SHARED)
+			flags |= FPB_IGNORE_DIRTY;
+		if (!vma_soft_dirty_enabled(src_vma))
+			flags |= FPB_IGNORE_SOFT_DIRTY;
+
+		nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags,
+				     &any_writable);
+		folio_ref_add(folio, nr);
+		if (folio_test_anon(folio)) {
+			if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page,
+								  nr, src_vma))) {
+				folio_ref_sub(folio, nr);
+				return -EAGAIN;
+			}
+			rss[MM_ANONPAGES] += nr;
+			VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
+		} else {
+			folio_dup_file_rmap_ptes(folio, page, nr);
+			rss[mm_counter_file(folio)] += nr;
+		}
+		if (any_writable)
+			pte = pte_mkwrite(pte, src_vma);
+		__copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte,
+				    addr, nr);
+		return nr;
+	}
+
+	folio_get(folio);
+	if (folio_test_anon(folio)) {
 		/*
 		 * If this page may have been pinned by the parent process,
 		 * copy the page immediately for the child so that we'll always
 		 * guarantee the pinned page won't be randomly replaced in the
 		 * future.
 		 */
-		folio_get(folio);
 		if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, src_vma))) {
 			/* Page may be pinned, we have to copy. */
 			folio_put(folio);
-			return copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
-						 addr, rss, prealloc, page);
+			err = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
+						addr, rss, prealloc, page);
+			return err ? err : 1;
 		}
 		rss[MM_ANONPAGES]++;
-	} else if (page) {
-		folio_get(folio);
+		VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
+	} else {
 		folio_dup_file_rmap_pte(folio, page);
-		rss[mm_counter_file(page)]++;
-	}
-
-	/*
-	 * If it's a COW mapping, write protect it both
-	 * in the parent and the child
-	 */
-	if (is_cow_mapping(vm_flags) && pte_write(pte)) {
-		ptep_set_wrprotect(src_mm, addr, src_pte);
-		pte = pte_wrprotect(pte);
+		rss[mm_counter_file(folio)]++;
 	}
-	VM_BUG_ON(page && folio_test_anon(folio) && PageAnonExclusive(page));

-	/*
-	 * If it's a shared mapping, mark it clean in
-	 * the child
-	 */
-	if (vm_flags & VM_SHARED)
-		pte = pte_mkclean(pte);
-	pte = pte_mkold(pte);
-
-	if (!userfaultfd_wp(dst_vma))
-		pte = pte_clear_uffd_wp(pte);
-
-	set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
-	return 0;
+copy_pte:
+	__copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, 1);
+	return 1;
 }

 static inline struct folio *folio_prealloc(struct mm_struct *src_mm,
@@ -1028,10 +1071,11 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 	pte_t *src_pte, *dst_pte;
 	pte_t ptent;
 	spinlock_t *src_ptl, *dst_ptl;
-	int progress, ret = 0;
+	int progress, max_nr, ret = 0;
 	int rss[NR_MM_COUNTERS];
 	swp_entry_t entry = (swp_entry_t){0};
 	struct folio *prealloc = NULL;
+	int nr;

 again:
 	progress = 0;
@@ -1062,6 +1106,8 @@ again:
 	arch_enter_lazy_mmu_mode();

 	do {
+		nr = 1;
+
 		/*
 		 * We are holding two locks at this point - either of them
 		 * could generate latencies in another task on another CPU.
@@ -1091,6 +1137,8 @@ again:
 				progress += 8;
 				continue;
 			}
+			ptent = ptep_get(src_pte);
+			VM_WARN_ON_ONCE(!pte_present(ptent));

 			/*
 			 * Device exclusive entry restored, continue by copying
@@ -1098,9 +1146,10 @@ again:
 			 */
 			WARN_ON_ONCE(ret != -ENOENT);
 		}
-		/* copy_present_pte() will clear `*prealloc' if consumed */
-		ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
-				       addr, rss, &prealloc);
+		/* copy_present_ptes() will clear `*prealloc' if consumed */
+		max_nr = (end - addr) / PAGE_SIZE;
+		ret = copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte,
+					ptent, addr, max_nr, rss, &prealloc);
 		/*
 		 * If we need a pre-allocated page for this pte, drop the
 		 * locks, allocate, and try again.
@@ -1117,8 +1166,10 @@ again:
 			folio_put(prealloc);
 			prealloc = NULL;
 		}
-		progress += 8;
-	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
+		nr = ret;
+		progress += 8 * nr;
+	} while (dst_pte += nr, src_pte += nr, addr += PAGE_SIZE * nr,
+		 addr != end);

 	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(orig_src_pte, src_ptl);
@@ -1139,7 +1190,7 @@ again:
 		prealloc = folio_prealloc(src_mm, src_vma, addr, false);
 		if (!prealloc)
 			return -ENOMEM;
-	} else if (ret) {
+	} else if (ret < 0) {
 		VM_WARN_ON_ONCE(1);
 	}

@@ -1369,19 +1420,16 @@ static inline bool should_zap_cows(struct zap_details *details)
 	return details->even_cows;
 }

-/* Decides whether we should zap this page with the page pointer specified */
-static inline bool should_zap_page(struct zap_details *details, struct page *page)
+/* Decides whether we should zap this folio with the folio pointer specified */
+static inline bool should_zap_folio(struct zap_details *details,
+				    struct folio *folio)
 {
-	/* If we can make a decision without *page.. */
+	/* If we can make a decision without *folio.. */
 	if (should_zap_cows(details))
 		return true;

-	/* E.g. the caller passes NULL for the case of a zero page */
-	if (!page)
-		return true;
-
-	/* Otherwise we should only zap non-anon pages */
-	return !PageAnon(page);
+	/* Otherwise we should only zap non-anon folios */
+	return !folio_test_anon(folio);
 }

 static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
@@ -1398,7 +1446,7 @@ static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
  */
 static inline void
 zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
-			      unsigned long addr, pte_t *pte,
+			      unsigned long addr, pte_t *pte, int nr,
 			      struct zap_details *details, pte_t pteval)
 {
 	/* Zap on anonymous always means dropping everything */
@@ -1408,7 +1456,113 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
 	if (zap_drop_file_uffd_wp(details))
 		return;

-	pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
+	for (;;) {
+		/* the PFN in the PTE is irrelevant. */
+		pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
+		if (--nr == 0)
+			break;
+		pte++;
+		addr += PAGE_SIZE;
+	}
+}
+
+static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
+		struct vm_area_struct *vma, struct folio *folio,
+		struct page *page, pte_t *pte, pte_t ptent, unsigned int nr,
+		unsigned long addr, struct zap_details *details, int *rss,
+		bool *force_flush, bool *force_break)
+{
+	struct mm_struct *mm = tlb->mm;
+	bool delay_rmap = false;
+
+	if (!folio_test_anon(folio)) {
+		ptent = get_and_clear_full_ptes(mm, addr, pte, nr, tlb->fullmm);
+		if (pte_dirty(ptent)) {
+			folio_mark_dirty(folio);
+			if (tlb_delay_rmap(tlb)) {
+				delay_rmap = true;
+				*force_flush = true;
+			}
+		}
+		if (pte_young(ptent) && likely(vma_has_recency(vma)))
+			folio_mark_accessed(folio);
+		rss[mm_counter(folio)] -= nr;
+	} else {
+		/* We don't need up-to-date accessed/dirty bits. */
+		clear_full_ptes(mm, addr, pte, nr, tlb->fullmm);
+		rss[MM_ANONPAGES] -= nr;
+	}
+	/* Checking a single PTE in a batch is sufficient. */
+	arch_check_zapped_pte(vma, ptent);
+	tlb_remove_tlb_entries(tlb, pte, nr, addr);
+	if (unlikely(userfaultfd_pte_wp(vma, ptent)))
+		zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details,
+					      ptent);
+
+	if (!delay_rmap) {
+		folio_remove_rmap_ptes(folio, page, nr, vma);
+
+		/* Only sanity-check the first page in a batch. */
+		if (unlikely(page_mapcount(page) < 0))
+			print_bad_pte(vma, addr, ptent, page);
+	}
+	if (unlikely(__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))) {
+		*force_flush = true;
+		*force_break = true;
+	}
+}
+
+/*
+ * Zap or skip at least one present PTE, trying to batch-process subsequent
+ * PTEs that map consecutive pages of the same folio.
+ *
+ * Returns the number of processed (skipped or zapped) PTEs (at least 1).
+ */
+static inline int zap_present_ptes(struct mmu_gather *tlb,
+		struct vm_area_struct *vma, pte_t *pte, pte_t ptent,
+		unsigned int max_nr, unsigned long addr,
+		struct zap_details *details, int *rss, bool *force_flush,
+		bool *force_break)
+{
+	const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
+	struct mm_struct *mm = tlb->mm;
+	struct folio *folio;
+	struct page *page;
+	int nr;
+
+	page = vm_normal_page(vma, addr, ptent);
+	if (!page) {
+		/* We don't need up-to-date accessed/dirty bits. */
+		ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
+		arch_check_zapped_pte(vma, ptent);
+		tlb_remove_tlb_entry(tlb, pte, addr);
+		if (userfaultfd_pte_wp(vma, ptent))
+			zap_install_uffd_wp_if_needed(vma, addr, pte, 1,
+						      details, ptent);
+		ksm_might_unmap_zero_page(mm, ptent);
+		return 1;
+	}
+
+	folio = page_folio(page);
+	if (unlikely(!should_zap_folio(details, folio)))
+		return 1;
+
+	/*
+	 * Make sure that the common "small folio" case is as fast as possible
+	 * by keeping the batching logic separate.
+	 */
+	if (unlikely(folio_test_large(folio) && max_nr != 1)) {
+		nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags,
+				     NULL);
+
+		zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr,
+				       addr, details, rss, force_flush,
+				       force_break);
+		return nr;
+	}
+
+	zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, 1, addr,
+			       details, rss, force_flush, force_break);
+	return 1;
 }

 static unsigned long zap_pte_range(struct mmu_gather *tlb,
@@ -1416,13 +1570,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 				unsigned long addr, unsigned long end,
 				struct zap_details *details)
 {
+	bool force_flush = false, force_break = false;
 	struct mm_struct *mm = tlb->mm;
-	int force_flush = 0;
 	int rss[NR_MM_COUNTERS];
 	spinlock_t *ptl;
 	pte_t *start_pte;
 	pte_t *pte;
 	swp_entry_t entry;
+	int nr;

 	tlb_change_page_size(tlb, PAGE_SIZE);
 	init_rss_vec(rss);
@@ -1436,7 +1591,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 		pte_t ptent = ptep_get(pte);
 		struct folio *folio;
 		struct page *page;
+		int max_nr;

+		nr = 1;
 		if (pte_none(ptent))
 			continue;

@@ -1444,44 +1601,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			break;

 		if (pte_present(ptent)) {
-			unsigned int delay_rmap;
-
-			page = vm_normal_page(vma, addr, ptent);
-			if (unlikely(!should_zap_page(details, page)))
-				continue;
-			ptent = ptep_get_and_clear_full(mm, addr, pte,
-							tlb->fullmm);
-			arch_check_zapped_pte(vma, ptent);
-			tlb_remove_tlb_entry(tlb, pte, addr);
-			zap_install_uffd_wp_if_needed(vma, addr, pte, details,
-						      ptent);
-			if (unlikely(!page)) {
-				ksm_might_unmap_zero_page(mm, ptent);
-				continue;
-			}
-
-			folio = page_folio(page);
-			delay_rmap = 0;
-			if (!folio_test_anon(folio)) {
-				if (pte_dirty(ptent)) {
-					folio_mark_dirty(folio);
-					if (tlb_delay_rmap(tlb)) {
-						delay_rmap = 1;
-						force_flush = 1;
-					}
-				}
-				if (pte_young(ptent) && likely(vma_has_recency(vma)))
-					folio_mark_accessed(folio);
-			}
-			rss[mm_counter(page)]--;
-			if (!delay_rmap) {
-				folio_remove_rmap_pte(folio, page, vma);
-				if (unlikely(page_mapcount(page) < 0))
-					print_bad_pte(vma, addr, ptent, page);
-			}
-			if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) {
-				force_flush = 1;
-				addr += PAGE_SIZE;
+			max_nr = (end - addr) / PAGE_SIZE;
+			nr = zap_present_ptes(tlb, vma, pte, ptent, max_nr,
+					      addr, details, rss, &force_flush,
+					      &force_break);
+			if (unlikely(force_break)) {
+				addr += nr * PAGE_SIZE;
 				break;
 			}
 			continue;
@@ -1492,7 +1617,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 		    is_device_exclusive_entry(entry)) {
 			page = pfn_swap_entry_to_page(entry);
 			folio = page_folio(page);
-			if (unlikely(!should_zap_page(details, page)))
+			if (unlikely(!should_zap_folio(details, folio)))
 				continue;
 			/*
 			 * Both device private/exclusive mappings should only
@@ -1501,7 +1626,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			 * see zap_install_uffd_wp_if_needed().
 			 */
 			WARN_ON_ONCE(!vma_is_anonymous(vma));
-			rss[mm_counter(page)]--;
+			rss[mm_counter(folio)]--;
 			if (is_device_private_entry(entry))
 				folio_remove_rmap_pte(folio, page, vma);
 			folio_put(folio);
@@ -1513,10 +1638,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			if (unlikely(!free_swap_and_cache(entry)))
 				print_bad_pte(vma, addr, ptent, NULL);
 		} else if (is_migration_entry(entry)) {
-			page = pfn_swap_entry_to_page(entry);
-			if (!should_zap_page(details, page))
+			folio = pfn_swap_entry_folio(entry);
+			if (!should_zap_folio(details, folio))
 				continue;
-			rss[mm_counter(page)]--;
+			rss[mm_counter(folio)]--;
 		} else if (pte_marker_entry_uffd_wp(entry)) {
 			/*
 			 * For anon: always drop the marker; for file: only
@@ -1535,8 +1660,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			WARN_ON_ONCE(1);
 		}
 		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
-		zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent);
-	} while (pte++, addr += PAGE_SIZE, addr != end);
+		zap_install_uffd_wp_if_needed(vma, addr, pte, 1, details, ptent);
+	} while (pte += nr, addr += PAGE_SIZE * nr, addr != end);

 	add_mm_rss_vec(mm, rss);
 	arch_leave_lazy_mmu_mode();
@@ -1870,7 +1995,7 @@ static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
 		return -EBUSY;
 	/* Ok, finally just insert the thing.. */
 	folio_get(folio);
-	inc_mm_counter(vma->vm_mm, mm_counter_file(page));
+	inc_mm_counter(vma->vm_mm, mm_counter_file(folio));
 	folio_add_file_rmap_pte(folio, page, vma);
 	set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot));
 	return 0;
@@ -3081,7 +3206,7 @@ static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf)
 	return VM_FAULT_RETRY;
 }

-static vm_fault_t vmf_anon_prepare(struct vm_fault *vmf)
+vm_fault_t vmf_anon_prepare(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;

@@ -3175,7 +3300,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 	if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
 		if (old_folio) {
 			if (!folio_test_anon(old_folio)) {
-				dec_mm_counter(mm, mm_counter_file(&old_folio->page));
+				dec_mm_counter(mm, mm_counter_file(old_folio));
 				inc_mm_counter(mm, MM_ANONPAGES);
 			}
 		} else {
@@ -3253,7 +3378,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 		folio_put(new_folio);
 	if (old_folio) {
 		if (page_copied)
-			free_swap_cache(&old_folio->page);
+			free_swap_cache(old_folio);
 		folio_put(old_folio);
 	}

@@ -3376,6 +3501,16 @@ static bool wp_can_reuse_anon_folio(struct folio *folio,
 				    struct vm_area_struct *vma)
 {
 	/*
+	 * We could currently only reuse a subpage of a large folio if no
+	 * other subpages of the large folios are still mapped. However,
+	 * let's just consistently not reuse subpages even if we could
+	 * reuse in that scenario, and give back a large folio a bit
+	 * sooner.
+	 */
+	if (folio_test_large(folio))
+		return false;
+
+	/*
 	 * We have to verify under folio lock: these early checks are
 	 * just an optimization to avoid locking the folio and freeing
 	 * the swapcache if there is little hope that we can reuse.
@@ -4170,8 +4305,8 @@ static bool pte_range_none(pte_t *pte, int nr_pages)

 static struct folio *alloc_anon_folio(struct vm_fault *vmf)
 {
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	struct vm_area_struct *vma = vmf->vma;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	unsigned long orders;
 	struct folio *folio;
 	unsigned long addr;
@@ -4223,15 +4358,21 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
 		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
 		folio = vma_alloc_folio(gfp, order, vma, addr, true);
 		if (folio) {
+			if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
+				folio_put(folio);
+				goto next;
+			}
+			folio_throttle_swaprate(folio, gfp);
 			clear_huge_page(&folio->page, vmf->address, 1 << order);
 			return folio;
 		}
+next:
 		order = next_order(&orders, order);
 	}

 fallback:
 #endif
-	return vma_alloc_zeroed_movable_folio(vmf->vma, vmf->address);
+	return folio_prealloc(vma->vm_mm, vma, vmf->address, true);
 }

 /*
@@ -4298,10 +4439,6 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 	nr_pages = folio_nr_pages(folio);
 	addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);

-	if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL))
-		goto oom_free_page;
-	folio_throttle_swaprate(folio, GFP_KERNEL);
-
 	/*
 	 * The memory barrier inside __folio_mark_uptodate makes sure that
 	 * preceding stores to the page contents become visible before
@@ -4355,8 +4492,6 @@ unlock:
 release:
 	folio_put(folio);
 	goto unlock;
-oom_free_page:
-	folio_put(folio);
 oom:
 	return VM_FAULT_OOM;
 }
@@ -4480,7 +4615,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
 	if (write)
 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);

-	add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
+	add_mm_counter(vma->vm_mm, mm_counter_file(folio), HPAGE_PMD_NR);
 	folio_add_file_rmap_pmd(folio, page, vma);

 	/*
@@ -4543,7 +4678,7 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio,
 		folio_add_new_anon_rmap(folio, vma, addr);
 		folio_add_lru_vma(folio, vma);
 	} else {
-		add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
+		add_mm_counter(vma->vm_mm, mm_counter_file(folio), nr);
 		folio_add_file_rmap_ptes(folio, page, nr, vma);
 	}
 	set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr);
@@ -4653,7 +4788,8 @@ static int fault_around_bytes_set(void *data, u64 val)
 	 * The minimum value is 1 page, however this results in no fault-around
 	 * at all. See should_fault_around().
 	 */
-	fault_around_pages = max(rounddown_pow_of_two(val) >> PAGE_SHIFT, 1UL);
+	val = max(val, PAGE_SIZE);
+	fault_around_pages = rounddown_pow_of_two(val) >> PAGE_SHIFT;

 	return 0;
 }
@@ -4928,18 +5064,18 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
 	int flags = 0;

 	/*
-	 * The "pte" at this point cannot be used safely without
-	 * validation through pte_unmap_same(). It's of NUMA type but
-	 * the pfn may be screwed if the read is non atomic.
+	 * The pte cannot be used safely until we verify, while holding the page
+	 * table lock, that its contents have not changed during fault handling.
 	 */
 	spin_lock(vmf->ptl);
-	if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
+	/* Read the live PTE from the page tables: */
+	old_pte = ptep_get(vmf->pte);
+
+	if (unlikely(!pte_same(old_pte, vmf->orig_pte))) {
 		pte_unmap_unlock(vmf->pte, vmf->ptl);
 		goto out;
 	}

-	/* Get the normal PTE */
-	old_pte = ptep_get(vmf->pte);
 	pte = pte_modify(old_pte, vma->vm_page_prot);

 	/*
@@ -6167,7 +6303,7 @@ static int clear_subpage(unsigned long addr, int idx, void *arg)
 {
 	struct page *page = arg;

-	clear_user_highpage(page + idx, addr);
+	clear_user_highpage(nth_page(page, idx), addr);
 	return 0;
 }

@@ -6217,10 +6353,11 @@ struct copy_subpage_arg {
 static int copy_subpage(unsigned long addr, int idx, void *arg)
 {
 	struct copy_subpage_arg *copy_arg = arg;
+	struct page *dst = nth_page(copy_arg->dst, idx);
+	struct page *src = nth_page(copy_arg->src, idx);

-	if (copy_mc_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
-				  addr, copy_arg->vma)) {
-		memory_failure_queue(page_to_pfn(copy_arg->src + idx), 0);
+	if (copy_mc_user_highpage(dst, src, addr, copy_arg->vma)) {
+		memory_failure_queue(page_to_pfn(src), 0);
 		return -EHWPOISON;
 	}
 	return 0;
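The core control-flow change shared by the fork (copy_present_ptes) and zap (zap_present_ptes) hunks above is the same loop transformation: a helper consumes up to max_nr consecutive PTEs that map pages of one folio, reports how many it handled, and the caller advances its cursor by that count instead of by one. Purely as an illustration, here is a minimal user-space sketch of that pattern; the names (entry, owner, batch_len, process_batch) are made up for the example and are not kernel APIs.

/*
 * Sketch of the "process a batch, advance by nr" loop used by the patch.
 * Consecutive entries with the same owner stand in for consecutive PTEs
 * mapping pages of the same folio.
 */
#include <stdio.h>
#include <stddef.h>

struct entry {
	int owner;	/* stand-in for "belongs to the same folio" */
};

/* Count how many consecutive entries share the first entry's owner. */
static size_t batch_len(const struct entry *e, size_t max_nr)
{
	size_t nr = 1;

	while (nr < max_nr && e[nr].owner == e[0].owner)
		nr++;
	return nr;
}

/* Handle one batch; returns the number of entries processed (at least 1). */
static size_t process_batch(const struct entry *e, size_t max_nr)
{
	size_t nr = batch_len(e, max_nr);

	printf("processed %zu entr%s of owner %d\n",
	       nr, nr == 1 ? "y" : "ies", e[0].owner);
	return nr;
}

int main(void)
{
	struct entry entries[] = {
		{1}, {1}, {1}, {2}, {3}, {3}, {3}, {3},
	};
	size_t i = 0, total = sizeof(entries) / sizeof(entries[0]);

	/* Mirrors: } while (pte += nr, addr += PAGE_SIZE * nr, addr != end); */
	while (i < total)
		i += process_batch(&entries[i], total - i);
	return 0;
}

In the diff itself, the role of batch_len() is played by folio_pte_batch() inside copy_present_ptes() and zap_present_ptes(), and the cursor advance is the new "pte += nr, addr += PAGE_SIZE * nr" step in the do/while loops of copy_pte_range() and zap_pte_range().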