diff options
Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r-- | mm/mempolicy.c | 122 |
1 files changed, 75 insertions, 47 deletions
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0fe77738d9..a1bf9aa15c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -509,8 +509,8 @@ static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk) qp->nr_failed++; return; } - folio = pfn_folio(pmd_pfn(*pmd)); - if (is_huge_zero_page(&folio->page)) { + folio = pmd_folio(*pmd); + if (is_huge_zero_folio(folio)) { walk->action = ACTION_CONTINUE; return; } @@ -642,12 +642,11 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. * Choosing not to migrate a shared folio is not counted as a failure. * - * To check if the folio is shared, ideally we want to make sure - * every page is mapped to the same process. Doing that is very - * expensive, so check the estimated sharers of the folio instead. + * See folio_likely_mapped_shared() on possible imprecision when we + * cannot easily detect if a folio is shared. */ if ((flags & MPOL_MF_MOVE_ALL) || - (folio_estimated_sharers(folio) == 1 && !hugetlb_pmd_shared(pte))) + (!folio_likely_mapped_shared(folio) && !hugetlb_pmd_shared(pte))) if (!isolate_hugetlb(folio, qp->pagelist)) qp->nr_failed++; unlock: @@ -1032,11 +1031,10 @@ static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. * Choosing not to migrate a shared folio is not counted as a failure. * - * To check if the folio is shared, ideally we want to make sure - * every page is mapped to the same process. Doing that is very - * expensive, so check the estimated sharers of the folio instead. + * See folio_likely_mapped_shared() on possible imprecision when we + * cannot easily detect if a folio is shared. */ - if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) { + if ((flags & MPOL_MF_MOVE_ALL) || !folio_likely_mapped_shared(folio)) { if (folio_isolate_lru(folio)) { list_add_tail(&folio->lru, foliolist); node_stat_mod_folio(folio, @@ -1070,6 +1068,7 @@ static long migrate_to_node(struct mm_struct *mm, int source, int dest, struct migration_target_control mtc = { .nid = dest, .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, + .reason = MR_SYSCALL, }; nodes_clear(nmask); @@ -1227,7 +1226,8 @@ static struct folio *alloc_migration_target_by_mpol(struct folio *src, h = folio_hstate(src); gfp = htlb_alloc_mask(h); nodemask = policy_nodemask(gfp, pol, ilx, &nid); - return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp); + return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp, + htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND)); } if (folio_test_large(src)) @@ -1504,9 +1504,10 @@ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags) if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES)) return -EINVAL; if (*flags & MPOL_F_NUMA_BALANCING) { - if (*mode != MPOL_BIND) + if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY) + *flags |= (MPOL_F_MOF | MPOL_F_MORON); + else return -EINVAL; - *flags |= (MPOL_F_MOF | MPOL_F_MORON); } return 0; } @@ -2200,9 +2201,9 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, */ preferred_gfp = gfp | __GFP_NOWARN; preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - page = __alloc_pages(preferred_gfp, order, nid, nodemask); + page = __alloc_pages_noprof(preferred_gfp, order, nid, nodemask); if (!page) - page = __alloc_pages(gfp, order, nid, NULL); + page = __alloc_pages_noprof(gfp, order, nid, NULL); return page; } @@ -2217,7 +2218,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, * * Return: The page on success or NULL if allocation fails. */ -struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, +struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *pol, pgoff_t ilx, int nid) { nodemask_t *nodemask; @@ -2248,7 +2249,7 @@ struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, * First, try to allocate THP only on local node, but * don't reclaim unnecessarily, just compact. */ - page = __alloc_pages_node(nid, + page = __alloc_pages_node_noprof(nid, gfp | __GFP_THISNODE | __GFP_NORETRY, order); if (page || !(gfp & __GFP_DIRECT_RECLAIM)) return page; @@ -2261,7 +2262,7 @@ struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, } } - page = __alloc_pages(gfp, order, nid, nodemask); + page = __alloc_pages_noprof(gfp, order, nid, nodemask); if (unlikely(pol->mode == MPOL_INTERLEAVE) && page) { /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */ @@ -2292,7 +2293,7 @@ struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, * * Return: The folio on success or NULL if allocation fails. */ -struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, +struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, bool hugepage) { struct mempolicy *pol; @@ -2300,12 +2301,12 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, struct page *page; pol = get_vma_policy(vma, addr, order, &ilx); - page = alloc_pages_mpol(gfp | __GFP_COMP, order, - pol, ilx, numa_node_id()); + page = alloc_pages_mpol_noprof(gfp | __GFP_COMP, order, + pol, ilx, numa_node_id()); mpol_cond_put(pol); return page_rmappable_folio(page); } -EXPORT_SYMBOL(vma_alloc_folio); +EXPORT_SYMBOL(vma_alloc_folio_noprof); /** * alloc_pages - Allocate pages. @@ -2321,7 +2322,7 @@ EXPORT_SYMBOL(vma_alloc_folio); * flags are used. * Return: The page on success or NULL if allocation fails. */ -struct page *alloc_pages(gfp_t gfp, unsigned int order) +struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order) { struct mempolicy *pol = &default_policy; @@ -2332,16 +2333,16 @@ struct page *alloc_pages(gfp_t gfp, unsigned int order) if (!in_interrupt() && !(gfp & __GFP_THISNODE)) pol = get_task_policy(current); - return alloc_pages_mpol(gfp, order, - pol, NO_INTERLEAVE_INDEX, numa_node_id()); + return alloc_pages_mpol_noprof(gfp, order, pol, NO_INTERLEAVE_INDEX, + numa_node_id()); } -EXPORT_SYMBOL(alloc_pages); +EXPORT_SYMBOL(alloc_pages_noprof); -struct folio *folio_alloc(gfp_t gfp, unsigned int order) +struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) { - return page_rmappable_folio(alloc_pages(gfp | __GFP_COMP, order)); + return page_rmappable_folio(alloc_pages_noprof(gfp | __GFP_COMP, order)); } -EXPORT_SYMBOL(folio_alloc); +EXPORT_SYMBOL(folio_alloc_noprof); static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, struct mempolicy *pol, unsigned long nr_pages, @@ -2360,13 +2361,13 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, for (i = 0; i < nodes; i++) { if (delta) { - nr_allocated = __alloc_pages_bulk(gfp, + nr_allocated = alloc_pages_bulk_noprof(gfp, interleave_nodes(pol), NULL, nr_pages_per_node + 1, NULL, page_array); delta--; } else { - nr_allocated = __alloc_pages_bulk(gfp, + nr_allocated = alloc_pages_bulk_noprof(gfp, interleave_nodes(pol), NULL, nr_pages_per_node, NULL, page_array); } @@ -2503,11 +2504,11 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, preferred_gfp = gfp | __GFP_NOWARN; preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - nr_allocated = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes, + nr_allocated = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes, nr_pages, NULL, page_array); if (nr_allocated < nr_pages) - nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL, + nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL, nr_pages - nr_allocated, NULL, page_array + nr_allocated); return nr_allocated; @@ -2519,7 +2520,7 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, * It can accelerate memory allocation especially interleaving * allocate memory. */ -unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, +unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp, unsigned long nr_pages, struct page **page_array) { struct mempolicy *pol = &default_policy; @@ -2543,8 +2544,8 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, nid = numa_node_id(); nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid); - return __alloc_pages_bulk(gfp, nid, nodemask, - nr_pages, NULL, page_array); + return alloc_pages_bulk_noprof(gfp, nid, nodemask, + nr_pages, NULL, page_array); } int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) @@ -2718,7 +2719,7 @@ static void sp_free(struct sp_node *n) * mpol_misplaced - check whether current folio node is valid in policy * * @folio: folio to be checked - * @vma: vm area where folio mapped + * @vmf: structure describing the fault * @addr: virtual address in @vma for shared policy lookup and interleave policy * * Lookup current policy node id for vma,addr and "compare to" folio's @@ -2728,18 +2729,24 @@ static void sp_free(struct sp_node *n) * Return: NUMA_NO_NODE if the page is in a node that is valid for this * policy, or a suitable node ID to allocate a replacement folio from. */ -int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma, +int mpol_misplaced(struct folio *folio, struct vm_fault *vmf, unsigned long addr) { struct mempolicy *pol; pgoff_t ilx; struct zoneref *z; int curnid = folio_nid(folio); + struct vm_area_struct *vma = vmf->vma; int thiscpu = raw_smp_processor_id(); - int thisnid = cpu_to_node(thiscpu); + int thisnid = numa_node_id(); int polnid = NUMA_NO_NODE; int ret = NUMA_NO_NODE; + /* + * Make sure ptl is held so that we don't preempt and we + * have a stable smp processor id + */ + lockdep_assert_held(vmf->ptl); pol = get_vma_policy(vma, addr, folio_order(folio), &ilx); if (!(pol->flags & MPOL_F_MOF)) goto out; @@ -2764,15 +2771,26 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma, break; case MPOL_BIND: - /* Optimize placement among multiple nodes via NUMA balancing */ + case MPOL_PREFERRED_MANY: + /* + * Even though MPOL_PREFERRED_MANY can allocate pages outside + * policy nodemask we don't allow numa migration to nodes + * outside policy nodemask for now. This is done so that if we + * want demotion to slow memory to happen, before allocating + * from some DRAM node say 'x', we will end up using a + * MPOL_PREFERRED_MANY mask excluding node 'x'. In such scenario + * we should not promote to node 'x' from slow memory node. + */ if (pol->flags & MPOL_F_MORON) { + /* + * Optimize placement among multiple nodes + * via NUMA balancing + */ if (node_isset(thisnid, pol->nodes)) break; goto out; } - fallthrough; - case MPOL_PREFERRED_MANY: /* * use current page if in policy nodemask, * else select nearest allowed node, if any. @@ -2781,7 +2799,7 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma, if (node_isset(curnid, pol->nodes)) goto out; z = first_zones_zonelist( - node_zonelist(numa_node_id(), GFP_HIGHUSER), + node_zonelist(thisnid, GFP_HIGHUSER), gfp_zone(GFP_HIGHUSER), &pol->nodes); polnid = zone_to_nid(z->zone); @@ -3275,8 +3293,9 @@ out: * @pol: pointer to mempolicy to be formatted * * Convert @pol into a string. If @buffer is too short, truncate the string. - * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the - * longest flag, "relative", and to display at least a few node ids. + * Recommend a @maxlen of at least 51 for the longest mode, "weighted + * interleave", plus the longest flag flags, "relative|balancing", and to + * display at least a few node ids. */ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) { @@ -3285,7 +3304,10 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) unsigned short mode = MPOL_DEFAULT; unsigned short flags = 0; - if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) { + if (pol && + pol != &default_policy && + !(pol >= &preferred_node_policy[0] && + pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - 1])) { mode = pol->mode; flags = pol->flags; } @@ -3313,12 +3335,18 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) p += snprintf(p, buffer + maxlen - p, "="); /* - * Currently, the only defined flags are mutually exclusive + * Static and relative are mutually exclusive. */ if (flags & MPOL_F_STATIC_NODES) p += snprintf(p, buffer + maxlen - p, "static"); else if (flags & MPOL_F_RELATIVE_NODES) p += snprintf(p, buffer + maxlen - p, "relative"); + + if (flags & MPOL_F_NUMA_BALANCING) { + if (!is_power_of_2(flags & MPOL_MODE_FLAGS)) + p += snprintf(p, buffer + maxlen - p, "|"); + p += snprintf(p, buffer + maxlen - p, "balancing"); + } } if (!nodes_empty(nodes)) |