author		Daniel Baumann <daniel.baumann@progress-linux.org>	2024-08-07 13:17:46 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-08-07 13:17:46 +0000
commit		7f3a4257159dea8e7ef66d1a539dc6df708b8ed3 (patch)
tree		bcc69b5f4609f348fac49e2f59e210b29eaea783 /mm/memory.c
parent		Adding upstream version 6.9.12. (diff)
Adding upstream version 6.10.3. (upstream/6.10.3)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'mm/memory.c')
-rw-r--r--	mm/memory.c	333
1 file changed, 179 insertions(+), 154 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 4bd6d68f1b..f81760c938 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -112,8 +112,10 @@ static bool vmf_pte_changed(struct vm_fault *vmf);
  * Return true if the original pte was a uffd-wp pte marker (so the pte was
  * wr-protected).
  */
-static bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
+static __always_inline bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
 {
+	if (!userfaultfd_wp(vmf->vma))
+		return false;
 	if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
 		return false;
 
@@ -989,7 +991,7 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
 			flags |= FPB_IGNORE_SOFT_DIRTY;
 
 		nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags,
-				     &any_writable);
+				     &any_writable, NULL, NULL);
 		folio_ref_add(folio, nr);
 		if (folio_test_anon(folio)) {
 			if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page,
@@ -1502,8 +1504,7 @@ static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
 
 	if (!delay_rmap) {
 		folio_remove_rmap_ptes(folio, page, nr, vma);
-		/* Only sanity-check the first page in a batch. */
-		if (unlikely(page_mapcount(page) < 0))
+		if (unlikely(folio_mapcount(folio) < 0))
 			print_bad_pte(vma, addr, ptent, page);
 	}
 	if (unlikely(__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))) {
@@ -1553,7 +1554,7 @@ static inline int zap_present_ptes(struct mmu_gather *tlb,
 	 */
 	if (unlikely(folio_test_large(folio) && max_nr != 1)) {
 		nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags,
-				     NULL);
+				     NULL, NULL, NULL);
 
 		zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr,
 				       addr, details, rss, force_flush,
@@ -1631,12 +1632,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			folio_remove_rmap_pte(folio, page, vma);
 			folio_put(folio);
 		} else if (!non_swap_entry(entry)) {
-			/* Genuine swap entry, hence a private anon page */
+			max_nr = (end - addr) / PAGE_SIZE;
+			nr = swap_pte_batch(pte, max_nr, ptent);
+			/* Genuine swap entries, hence a private anon pages */
 			if (!should_zap_cows(details))
 				continue;
-			rss[MM_SWAPENTS]--;
-			if (unlikely(!free_swap_and_cache(entry)))
-				print_bad_pte(vma, addr, ptent, NULL);
+			rss[MM_SWAPENTS] -= nr;
+			free_swap_and_cache_nr(entry, nr);
 		} else if (is_migration_entry(entry)) {
 			folio = pfn_swap_entry_folio(entry);
 			if (!should_zap_folio(details, folio))
@@ -1659,8 +1661,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			pr_alert("unrecognized swap entry 0x%lx\n", entry.val);
 			WARN_ON_ONCE(1);
 		}
-		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
-		zap_install_uffd_wp_if_needed(vma, addr, pte, 1, details, ptent);
+		clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
+		zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent);
 	} while (pte += nr, addr += PAGE_SIZE * nr, addr != end);
 
 	add_mm_rss_vec(mm, rss);
@@ -2765,7 +2767,7 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
 	unsigned long next;
 	int err = 0;
 
-	BUG_ON(pud_huge(*pud));
+	BUG_ON(pud_leaf(*pud));
 
 	if (create) {
 		pmd = pmd_alloc_track(mm, pud, addr, mask);
@@ -3206,19 +3208,39 @@ static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf)
 	return VM_FAULT_RETRY;
 }
 
+/**
+ * vmf_anon_prepare - Prepare to handle an anonymous fault.
+ * @vmf: The vm_fault descriptor passed from the fault handler.
+ *
+ * When preparing to insert an anonymous page into a VMA from a
+ * fault handler, call this function rather than anon_vma_prepare().
+ * If this vma does not already have an associated anon_vma and we are
+ * only protected by the per-VMA lock, the caller must retry with the
+ * mmap_lock held. __anon_vma_prepare() will look at adjacent VMAs to
+ * determine if this VMA can share its anon_vma, and that's not safe to
+ * do with only the per-VMA lock held for this VMA.
+ *
+ * Return: 0 if fault handling can proceed. Any other value should be
+ * returned to the caller.
+ */
 vm_fault_t vmf_anon_prepare(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
+	vm_fault_t ret = 0;
 
 	if (likely(vma->anon_vma))
 		return 0;
 	if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
-		vma_end_read(vma);
-		return VM_FAULT_RETRY;
+		if (!mmap_read_trylock(vma->vm_mm)) {
+			vma_end_read(vma);
+			return VM_FAULT_RETRY;
+		}
 	}
 	if (__anon_vma_prepare(vma))
-		return VM_FAULT_OOM;
-	return 0;
+		ret = VM_FAULT_OOM;
+	if (vmf->flags & FAULT_FLAG_VMA_LOCK)
+		mmap_read_unlock(vma->vm_mm);
+	return ret;
 }
@@ -3329,13 +3351,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 		ptep_clear_flush(vma, vmf->address, vmf->pte);
 		folio_add_new_anon_rmap(new_folio, vma, vmf->address);
 		folio_add_lru_vma(new_folio, vma);
-		/*
-		 * We call the notify macro here because, when using secondary
-		 * mmu page tables (such as kvm shadow page tables), we want the
-		 * new page to be mapped directly into the secondary page table.
-		 */
 		BUG_ON(unshare && pte_write(entry));
-		set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
+		set_pte_at(mm, vmf->address, vmf->pte, entry);
 		update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
 		if (old_folio) {
 			/*
@@ -4190,7 +4207,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	 * when reading from swap. This metadata may be indexed by swap entry
 	 * so this must be called before swap_free().
 	 */
-	arch_swap_restore(entry, folio);
+	arch_swap_restore(folio_swap(entry, folio), folio);
 
 	/*
 	 * Remove the swap entry and conditionally try to free up the swapcache.
@@ -4326,8 +4343,8 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
 	 * for this vma. Then filter out the orders that can't be allocated over
 	 * the faulting address and still be fully contained in the vma.
 	 */
-	orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true,
-					  BIT(PMD_ORDER) - 1);
+	orders = thp_vma_allowable_orders(vma, vma->vm_flags,
+			TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
 	orders = thp_vma_suitable_orders(vma, vmf->address, orders);
 
 	if (!orders)
@@ -4352,6 +4369,9 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
 
 	pte_unmap(pte);
 
+	if (!orders)
+		goto fallback;
+
 	/* Try allocating the highest of the remaining orders. */
 	gfp = vma_thp_gfp_mask(vma);
 	while (orders) {
@@ -4359,6 +4379,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
 		folio = vma_alloc_folio(gfp, order, vma, addr, true);
 		if (folio) {
 			if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
+				count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
 				folio_put(folio);
 				goto next;
 			}
@@ -4367,6 +4388,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
 			return folio;
 		}
 next:
+		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
 		order = next_order(&orders, order);
 	}
@@ -4382,7 +4404,6 @@ fallback:
  */
 static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 {
-	bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
 	struct vm_area_struct *vma = vmf->vma;
 	unsigned long addr = vmf->address;
 	struct folio *folio;
@@ -4427,8 +4448,9 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 	}
 
 	/* Allocate our own private page. */
-	if (unlikely(anon_vma_prepare(vma)))
-		goto oom;
+	ret = vmf_anon_prepare(vmf);
+	if (ret)
+		return ret;
 	/* Returns NULL on OOM or ERR_PTR(-EAGAIN) if we must retry the fault */
 	folio = alloc_anon_folio(vmf);
 	if (IS_ERR(folio))
@@ -4476,10 +4498,13 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 	folio_ref_add(folio, nr_pages - 1);
 	add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC);
+#endif
 	folio_add_new_anon_rmap(folio, vma, addr);
 	folio_add_lru_vma(folio, vma);
 setpte:
-	if (uffd_wp)
+	if (vmf_orig_pte_uffd_wp(vmf))
 		entry = pte_mkuffd_wp(entry);
 	set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages);
@@ -4655,9 +4680,8 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio,
 		struct page *page, unsigned int nr, unsigned long addr)
 {
 	struct vm_area_struct *vma = vmf->vma;
-	bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
-	bool prefault = in_range(vmf->address, addr, nr * PAGE_SIZE);
+	bool prefault = !in_range(vmf->address, addr, nr * PAGE_SIZE);
 	pte_t entry;
 
 	flush_icache_pages(vma, page, nr);
@@ -4670,16 +4694,14 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio,
 
 	if (write)
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-	if (unlikely(uffd_wp))
+	if (unlikely(vmf_orig_pte_uffd_wp(vmf)))
 		entry = pte_mkuffd_wp(entry);
 	/* copy-on-write page */
 	if (write && !(vma->vm_flags & VM_SHARED)) {
-		add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr);
 		VM_BUG_ON_FOLIO(nr != 1, folio);
 		folio_add_new_anon_rmap(folio, vma, addr);
 		folio_add_lru_vma(folio, vma);
 	} else {
-		add_mm_counter(vma->vm_mm, mm_counter_file(folio), nr);
 		folio_add_file_rmap_ptes(folio, page, nr, vma);
 	}
 	set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr);
@@ -4716,9 +4738,11 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
 	struct vm_area_struct *vma = vmf->vma;
 	struct page *page;
 	vm_fault_t ret;
+	bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
+		      !(vma->vm_flags & VM_SHARED);
 
 	/* Did we COW the page? */
-	if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
+	if (is_cow)
 		page = vmf->cow_page;
 	else
 		page = vmf->page;
@@ -4754,8 +4778,10 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
 	/* Re-check under ptl */
 	if (likely(!vmf_pte_changed(vmf))) {
 		struct folio *folio = page_folio(page);
+		int type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);
 
 		set_pte_range(vmf, folio, page, 1, vmf->address);
+		add_mm_counter(vma->vm_mm, type, 1);
 		ret = 0;
 	} else {
 		update_mmu_tlb(vma, vmf->address, vmf->pte);
@@ -5036,9 +5062,11 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
 	return ret;
 }
 
-int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma,
+int numa_migrate_prep(struct folio *folio, struct vm_fault *vmf,
 		      unsigned long addr, int page_nid, int *flags)
 {
+	struct vm_area_struct *vma = vmf->vma;
+
 	folio_get(folio);
 
 	/* Record the current PID acceesing VMA */
@@ -5050,7 +5078,61 @@ int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma,
 		*flags |= TNF_FAULT_LOCAL;
 	}
 
-	return mpol_misplaced(folio, vma, addr);
+	return mpol_misplaced(folio, vmf, addr);
+}
+
+static void numa_rebuild_single_mapping(struct vm_fault *vmf, struct vm_area_struct *vma,
+					unsigned long fault_addr, pte_t *fault_pte,
+					bool writable)
+{
+	pte_t pte, old_pte;
+
+	old_pte = ptep_modify_prot_start(vma, fault_addr, fault_pte);
+	pte = pte_modify(old_pte, vma->vm_page_prot);
+	pte = pte_mkyoung(pte);
+	if (writable)
+		pte = pte_mkwrite(pte, vma);
+	ptep_modify_prot_commit(vma, fault_addr, fault_pte, old_pte, pte);
+	update_mmu_cache_range(vmf, vma, fault_addr, fault_pte, 1);
+}
+
+static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_struct *vma,
+				       struct folio *folio, pte_t fault_pte,
+				       bool ignore_writable, bool pte_write_upgrade)
+{
+	int nr = pte_pfn(fault_pte) - folio_pfn(folio);
+	unsigned long start, end, addr = vmf->address;
+	unsigned long addr_start = addr - (nr << PAGE_SHIFT);
+	unsigned long pt_start = ALIGN_DOWN(addr, PMD_SIZE);
+	pte_t *start_ptep;
+
+	/* Stay within the VMA and within the page table. */
+	start = max3(addr_start, pt_start, vma->vm_start);
+	end = min3(addr_start + folio_size(folio), pt_start + PMD_SIZE,
+		   vma->vm_end);
+	start_ptep = vmf->pte - ((addr - start) >> PAGE_SHIFT);
+
+	/* Restore all PTEs' mapping of the large folio */
+	for (addr = start; addr != end; start_ptep++, addr += PAGE_SIZE) {
+		pte_t ptent = ptep_get(start_ptep);
+		bool writable = false;
+
+		if (!pte_present(ptent) || !pte_protnone(ptent))
+			continue;
+
+		if (pfn_folio(pte_pfn(ptent)) != folio)
+			continue;
+
+		if (!ignore_writable) {
+			ptent = pte_modify(ptent, vma->vm_page_prot);
+			writable = pte_write(ptent);
+			if (!writable && pte_write_upgrade &&
+			    can_change_pte_writable(vma, addr, ptent))
+				writable = true;
+		}
+
+		numa_rebuild_single_mapping(vmf, vma, addr, start_ptep, writable);
+	}
 }
 
 static vm_fault_t do_numa_page(struct vm_fault *vmf)
@@ -5058,11 +5140,12 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
 	struct vm_area_struct *vma = vmf->vma;
 	struct folio *folio = NULL;
 	int nid = NUMA_NO_NODE;
-	bool writable = false;
+	bool writable = false, ignore_writable = false;
+	bool pte_write_upgrade = vma_wants_manual_pte_write_upgrade(vma);
 	int last_cpupid;
 	int target_nid;
 	pte_t pte, old_pte;
-	int flags = 0;
+	int flags = 0, nr_pages;
 
 	/*
 	 * The pte cannot be used safely until we verify, while holding the page
@@ -5084,7 +5167,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
 	 * is only valid while holding the PT lock.
 	 */
 	writable = pte_write(pte);
-	if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
+	if (!writable && pte_write_upgrade &&
 	    can_change_pte_writable(vma, vmf->address, pte))
 		writable = true;
 
@@ -5092,10 +5175,6 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
 	if (!folio || folio_is_zone_device(folio))
 		goto out_map;
 
-	/* TODO: handle PTE-mapped THP */
-	if (folio_test_large(folio))
-		goto out_map;
-
 	/*
 	 * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
 	 * much anyway since they can be in shared cache state. This misses
@@ -5111,10 +5190,11 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
 	 * Flag if the folio is shared between multiple address spaces. This
 	 * is later used when determining whether to group tasks together
 	 */
-	if (folio_estimated_sharers(folio) > 1 && (vma->vm_flags & VM_SHARED))
+	if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED))
 		flags |= TNF_SHARED;
 
 	nid = folio_nid(folio);
+	nr_pages = folio_nr_pages(folio);
 	/*
 	 * For memory tiering mode, cpupid of slow memory page is used
 	 * to record page access time. So use default value.
@@ -5124,13 +5204,14 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
 		last_cpupid = (-1 & LAST_CPUPID_MASK);
 	else
 		last_cpupid = folio_last_cpupid(folio);
-	target_nid = numa_migrate_prep(folio, vma, vmf->address, nid, &flags);
+	target_nid = numa_migrate_prep(folio, vmf, vmf->address, nid, &flags);
 	if (target_nid == NUMA_NO_NODE) {
 		folio_put(folio);
 		goto out_map;
 	}
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
 	writable = false;
+	ignore_writable = true;
 
 	/* Migrate to the requested node */
 	if (migrate_misplaced_folio(folio, vma, target_nid)) {
@@ -5151,20 +5232,19 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
 
 out:
 	if (nid != NUMA_NO_NODE)
-		task_numa_fault(last_cpupid, nid, 1, flags);
+		task_numa_fault(last_cpupid, nid, nr_pages, flags);
 	return 0;
 out_map:
 	/*
 	 * Make it present again, depending on how arch implements
 	 * non-accessible ptes, some can allow access by kernel mode.
 	 */
-	old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
-	pte = pte_modify(old_pte, vma->vm_page_prot);
-	pte = pte_mkyoung(pte);
-	if (writable)
-		pte = pte_mkwrite(pte, vma);
-	ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
-	update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
+	if (folio && folio_test_large(folio))
+		numa_rebuild_large_mapping(vmf, vma, folio, pte, ignore_writable,
+					   pte_write_upgrade);
+	else
+		numa_rebuild_single_mapping(vmf, vma, vmf->address, vmf->pte,
+					    writable);
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
 	goto out;
 }
@@ -5375,7 +5455,8 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
 		return VM_FAULT_OOM;
 retry_pud:
 	if (pud_none(*vmf.pud) &&
-	    thp_vma_allowable_order(vma, vm_flags, false, true, true, PUD_ORDER)) {
+	    thp_vma_allowable_order(vma, vm_flags,
+				    TVA_IN_PF | TVA_ENFORCE_SYSFS, PUD_ORDER)) {
 		ret = create_huge_pud(&vmf);
 		if (!(ret & VM_FAULT_FALLBACK))
 			return ret;
@@ -5409,7 +5490,8 @@ retry_pud:
 		goto retry_pud;
 
 	if (pmd_none(*vmf.pmd) &&
-	    thp_vma_allowable_order(vma, vm_flags, false, true, true, PMD_ORDER)) {
+	    thp_vma_allowable_order(vma, vm_flags,
+				    TVA_IN_PF | TVA_ENFORCE_SYSFS, PMD_ORDER)) {
 		ret = create_huge_pmd(&vmf);
 		if (!(ret & VM_FAULT_FALLBACK))
 			return ret;
@@ -5763,15 +5845,6 @@ retry:
 	if (!vma_start_read(vma))
 		goto inval;
 
-	/*
-	 * find_mergeable_anon_vma uses adjacent vmas which are not locked.
-	 * This check must happen after vma_start_read(); otherwise, a
-	 * concurrent mremap() with MREMAP_DONTUNMAP could dissociate the VMA
-	 * from its anon_vma.
-	 */
-	if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma))
-		goto inval_end_read;
-
 	/* Check since vm_start/vm_end might change before we lock the VMA */
 	if (unlikely(address < vma->vm_start || address >= vma->vm_end))
 		goto inval_end_read;
@@ -5869,34 +5942,48 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 
 /**
  * follow_pte - look up PTE at a user virtual address
- * @mm: the mm_struct of the target address space
+ * @vma: the memory mapping
  * @address: user virtual address
  * @ptepp: location to store found PTE
  * @ptlp: location to store the lock for the PTE
  *
  * On a successful return, the pointer to the PTE is stored in @ptepp;
 * the corresponding lock is taken and its location is stored in @ptlp.
- * The contents of the PTE are only stable until @ptlp is released;
- * any further use, if any, must be protected against invalidation
- * with MMU notifiers.
+ *
+ * The contents of the PTE are only stable until @ptlp is released using
+ * pte_unmap_unlock(). This function will fail if the PTE is non-present.
+ * Present PTEs may include PTEs that map refcounted pages, such as
+ * anonymous folios in COW mappings.
+ *
+ * Callers must be careful when relying on PTE content after
+ * pte_unmap_unlock(). Especially if the PTE maps a refcounted page,
+ * callers must protect against invalidation with MMU notifiers; otherwise
+ * access to the PFN at a later point in time can trigger use-after-free.
 *
 * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore
 * should be taken for read.
 *
- * KVM uses this function. While it is arguably less bad than ``follow_pfn``,
- * it is not a good general-purpose API.
+ * This function must not be used to modify PTE content.
 *
 * Return: zero on success, -ve otherwise.
 */
-int follow_pte(struct mm_struct *mm, unsigned long address,
+int follow_pte(struct vm_area_struct *vma, unsigned long address,
 	       pte_t **ptepp, spinlock_t **ptlp)
 {
+	struct mm_struct *mm = vma->vm_mm;
 	pgd_t *pgd;
 	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *ptep;
 
+	mmap_assert_locked(mm);
+	if (unlikely(address < vma->vm_start || address >= vma->vm_end))
+		goto out;
+
+	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+		goto out;
+
 	pgd = pgd_offset(mm, address);
 	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
 		goto out;
@@ -5926,71 +6013,7 @@ out:
 }
 EXPORT_SYMBOL_GPL(follow_pte);
 
-/**
- * follow_pfn - look up PFN at a user virtual address
- * @vma: memory mapping
- * @address: user virtual address
- * @pfn: location to store found PFN
- *
- * Only IO mappings and raw PFN mappings are allowed.
- *
- * This function does not allow the caller to read the permissions
- * of the PTE. Do not use it.
- *
- * Return: zero and the pfn at @pfn on success, -ve otherwise.
- */
-int follow_pfn(struct vm_area_struct *vma, unsigned long address,
-	unsigned long *pfn)
-{
-	int ret = -EINVAL;
-	spinlock_t *ptl;
-	pte_t *ptep;
-
-	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
-		return ret;
-
-	ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
-	if (ret)
-		return ret;
-	*pfn = pte_pfn(ptep_get(ptep));
-	pte_unmap_unlock(ptep, ptl);
-	return 0;
-}
-EXPORT_SYMBOL(follow_pfn);
-
 #ifdef CONFIG_HAVE_IOREMAP_PROT
-int follow_phys(struct vm_area_struct *vma,
-		unsigned long address, unsigned int flags,
-		unsigned long *prot, resource_size_t *phys)
-{
-	int ret = -EINVAL;
-	pte_t *ptep, pte;
-	spinlock_t *ptl;
-
-	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
-		goto out;
-
-	if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
-		goto out;
-	pte = ptep_get(ptep);
-
-	/* Never return PFNs of anon folios in COW mappings. */
-	if (vm_normal_folio(vma, address, pte))
-		goto unlock;
-
-	if ((flags & FOLL_WRITE) && !pte_write(pte))
-		goto unlock;
-
-	*prot = pgprot_val(pte_pgprot(pte));
-	*phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
-
-	ret = 0;
-unlock:
-	pte_unmap_unlock(ptep, ptl);
-out:
-	return ret;
-}
-
 /**
  * generic_access_phys - generic implementation for iomem mmap access
  * @vma: the vma to access
@@ -6014,11 +6037,8 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
 	int offset = offset_in_page(addr);
 	int ret = -EINVAL;
 
-	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
-		return -EINVAL;
-
 retry:
-	if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
+	if (follow_pte(vma, addr, &ptep, &ptl))
 		return -EINVAL;
 	pte = ptep_get(ptep);
 	pte_unmap_unlock(ptep, ptl);
@@ -6033,7 +6053,7 @@ retry:
 	if (!maddr)
 		return -ENOMEM;
 
-	if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
+	if (follow_pte(vma, addr, &ptep, &ptl))
 		goto out_unmap;
 
 	if (!pte_same(pte, ptep_get(ptep))) {
@@ -6191,21 +6211,14 @@ void print_vma_addr(char *prefix, unsigned long ip)
 	if (!mmap_read_trylock(mm))
 		return;
 
-	vma = find_vma(mm, ip);
+	vma = vma_lookup(mm, ip);
 	if (vma && vma->vm_file) {
 		struct file *f = vma->vm_file;
-		char *buf = (char *)__get_free_page(GFP_NOWAIT);
-		if (buf) {
-			char *p;
-
-			p = file_path(f, buf, PAGE_SIZE);
-			if (IS_ERR(p))
-				p = "?";
-			printk("%s%s[%lx+%lx]", prefix, kbasename(p),
-					vma->vm_start,
-					vma->vm_end - vma->vm_start);
-			free_page((unsigned long)buf);
-		}
+		ip -= vma->vm_start;
+		ip += vma->vm_pgoff << PAGE_SHIFT;
+		printk("%s%pD[%lx,%lx+%lx]", prefix, f, ip,
+				vma->vm_start,
+				vma->vm_end - vma->vm_start);
 	}
 	mmap_read_unlock(mm);
 }
@@ -6441,3 +6454,15 @@ void ptlock_free(struct ptdesc *ptdesc)
 	kmem_cache_free(page_ptl_cachep, ptdesc->ptl);
 }
 #endif
+
+void vma_pgtable_walk_begin(struct vm_area_struct *vma)
+{
+	if (is_vm_hugetlb_page(vma))
+		hugetlb_vma_lock_read(vma);
+}
+
+void vma_pgtable_walk_end(struct vm_area_struct *vma)
+{
+	if (is_vm_hugetlb_page(vma))
+		hugetlb_vma_unlock_read(vma);
+}
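
Porting note on the follow_pte() change above: the patch makes follow_pte() take the VMA instead of the mm_struct, folds the VM_IO/VM_PFNMAP and address-range checks into the helper, and removes follow_pfn() and follow_phys() outright. The following is a minimal, hypothetical sketch of how a caller that previously open-coded a follow_pfn()-style lookup could be written against the new interface; example_read_pfn() and its name are not part of this patch, only follow_pte(), ptep_get(), pte_pfn(), pte_unmap_unlock() and mmap_assert_locked() are taken from the diff.

#include <linux/mm.h>

/*
 * Hypothetical helper mirroring what the removed follow_pfn() did,
 * ported to the 6.10 follow_pte() signature. follow_pte() now takes
 * the VMA and itself rejects addresses outside the VMA as well as
 * VMAs without VM_IO/VM_PFNMAP, so those caller-side checks go away.
 * Must run with mmap_lock held for read; the PTE value is only stable
 * until pte_unmap_unlock() drops the page-table lock.
 */
static int example_read_pfn(struct vm_area_struct *vma, unsigned long addr,
			    unsigned long *pfn)
{
	spinlock_t *ptl;
	pte_t *ptep;
	int ret;

	mmap_assert_locked(vma->vm_mm);

	ret = follow_pte(vma, addr, &ptep, &ptl);	/* was: follow_pte(vma->vm_mm, ...) */
	if (ret)
		return ret;

	*pfn = pte_pfn(ptep_get(ptep));			/* read while ptl is held */
	pte_unmap_unlock(ptep, ptl);
	return 0;
}

As the updated kernel-doc in the hunk stresses, a PFN obtained this way must not be relied on after the lock is released unless the caller protects against invalidation with MMU notifiers.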