From b20732900e4636a467c0183a47f7396700f5f743 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 7 Aug 2024 15:11:22 +0200 Subject: Adding upstream version 6.9.7. Signed-off-by: Daniel Baumann --- fs/xfs/xfs_buf.c | 330 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 205 insertions(+), 125 deletions(-) (limited to 'fs/xfs/xfs_buf.c') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 8e5bd50d29..f0fa02264e 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -21,6 +21,7 @@ #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_ag.h" +#include "xfs_buf_mem.h" struct kmem_cache *xfs_buf_cache; @@ -60,6 +61,11 @@ xfs_buf_submit( return __xfs_buf_submit(bp, !(bp->b_flags & XBF_ASYNC)); } +static inline bool xfs_buf_is_uncached(struct xfs_buf *bp) +{ + return bp->b_rhash_key == XFS_BUF_DADDR_NULL; +} + static inline int xfs_buf_is_vmapped( struct xfs_buf *bp) @@ -189,8 +195,8 @@ xfs_buf_get_maps( return 0; } - bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map), - KM_NOFS); + bp->b_maps = kzalloc(map_count * sizeof(struct xfs_buf_map), + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); if (!bp->b_maps) return -ENOMEM; return 0; @@ -204,7 +210,7 @@ xfs_buf_free_maps( struct xfs_buf *bp) { if (bp->b_maps != &bp->__b_map) { - kmem_free(bp->b_maps); + kfree(bp->b_maps); bp->b_maps = NULL; } } @@ -222,7 +228,8 @@ _xfs_buf_alloc( int i; *bpp = NULL; - bp = kmem_cache_zalloc(xfs_buf_cache, GFP_NOFS | __GFP_NOFAIL); + bp = kmem_cache_zalloc(xfs_buf_cache, + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); /* * We don't want certain flags to appear in b_flags unless they are @@ -289,7 +296,7 @@ xfs_buf_free_pages( mm_account_reclaimed_pages(bp->b_page_count); if (bp->b_pages != bp->b_page_array) - kmem_free(bp->b_pages); + kfree(bp->b_pages); bp->b_pages = NULL; bp->b_flags &= ~_XBF_PAGES; } @@ -312,10 +319,12 @@ xfs_buf_free( ASSERT(list_empty(&bp->b_lru)); - if (bp->b_flags & _XBF_PAGES) + if (xfs_buftarg_is_mem(bp->b_target)) + xmbuf_unmap_page(bp); + else if (bp->b_flags & _XBF_PAGES) xfs_buf_free_pages(bp); else if (bp->b_flags & _XBF_KMEM) - kmem_free(bp->b_addr); + kfree(bp->b_addr); call_rcu(&bp->b_rcu, xfs_buf_free_callback); } @@ -325,21 +334,21 @@ xfs_buf_alloc_kmem( struct xfs_buf *bp, xfs_buf_flags_t flags) { - xfs_km_flags_t kmflag_mask = KM_NOFS; + gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL; size_t size = BBTOB(bp->b_length); /* Assure zeroed buffer for non-read cases. */ if (!(flags & XBF_READ)) - kmflag_mask |= KM_ZERO; + gfp_mask |= __GFP_ZERO; - bp->b_addr = kmem_alloc(size, kmflag_mask); + bp->b_addr = kmalloc(size, gfp_mask); if (!bp->b_addr) return -ENOMEM; if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) != ((unsigned long)bp->b_addr & PAGE_MASK)) { /* b_addr spans two pages - use alloc_page instead */ - kmem_free(bp->b_addr); + kfree(bp->b_addr); bp->b_addr = NULL; return -ENOMEM; } @@ -356,13 +365,11 @@ xfs_buf_alloc_pages( struct xfs_buf *bp, xfs_buf_flags_t flags) { - gfp_t gfp_mask = __GFP_NOWARN; + gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN; long filled = 0; if (flags & XBF_READ_AHEAD) gfp_mask |= __GFP_NORETRY; - else - gfp_mask |= GFP_NOFS; /* Make sure that we have a page list */ bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE); @@ -429,11 +436,18 @@ _xfs_buf_map_pages( /* * vm_map_ram() will allocate auxiliary structures (e.g. - * pagetables) with GFP_KERNEL, yet we are likely to be under - * GFP_NOFS context here. Hence we need to tell memory reclaim - * that we are in such a context via PF_MEMALLOC_NOFS to prevent - * memory reclaim re-entering the filesystem here and - * potentially deadlocking. + * pagetables) with GFP_KERNEL, yet we often under a scoped nofs + * context here. Mixing GFP_KERNEL with GFP_NOFS allocations + * from the same call site that can be run from both above and + * below memory reclaim causes lockdep false positives. Hence we + * always need to force this allocation to nofs context because + * we can't pass __GFP_NOLOCKDEP down to auxillary structures to + * prevent false positive lockdep reports. + * + * XXX(dgc): I think dquot reclaim is the only place we can get + * to this function from memory reclaim context now. If we fix + * that like we've fixed inode reclaim to avoid writeback from + * reclaim, this nofs wrapping can go away. */ nofs_flag = memalloc_nofs_save(); do { @@ -499,18 +513,18 @@ static const struct rhashtable_params xfs_buf_hash_params = { }; int -xfs_buf_hash_init( - struct xfs_perag *pag) +xfs_buf_cache_init( + struct xfs_buf_cache *bch) { - spin_lock_init(&pag->pag_buf_lock); - return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params); + spin_lock_init(&bch->bc_lock); + return rhashtable_init(&bch->bc_hash, &xfs_buf_hash_params); } void -xfs_buf_hash_destroy( - struct xfs_perag *pag) +xfs_buf_cache_destroy( + struct xfs_buf_cache *bch) { - rhashtable_destroy(&pag->pag_buf_hash); + rhashtable_destroy(&bch->bc_hash); } static int @@ -573,7 +587,7 @@ xfs_buf_find_lock( static inline int xfs_buf_lookup( - struct xfs_perag *pag, + struct xfs_buf_cache *bch, struct xfs_buf_map *map, xfs_buf_flags_t flags, struct xfs_buf **bpp) @@ -582,7 +596,7 @@ xfs_buf_lookup( int error; rcu_read_lock(); - bp = rhashtable_lookup(&pag->pag_buf_hash, map, xfs_buf_hash_params); + bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params); if (!bp || !atomic_inc_not_zero(&bp->b_hold)) { rcu_read_unlock(); return -ENOENT; @@ -607,6 +621,7 @@ xfs_buf_lookup( static int xfs_buf_find_insert( struct xfs_buftarg *btp, + struct xfs_buf_cache *bch, struct xfs_perag *pag, struct xfs_buf_map *cmap, struct xfs_buf_map *map, @@ -622,31 +637,33 @@ xfs_buf_find_insert( if (error) goto out_drop_pag; - /* - * For buffers that fit entirely within a single page, first attempt to - * allocate the memory from the heap to minimise memory usage. If we - * can't get heap memory for these small buffers, we fall back to using - * the page allocator. - */ - if (BBTOB(new_bp->b_length) >= PAGE_SIZE || - xfs_buf_alloc_kmem(new_bp, flags) < 0) { + if (xfs_buftarg_is_mem(new_bp->b_target)) { + error = xmbuf_map_page(new_bp); + } else if (BBTOB(new_bp->b_length) >= PAGE_SIZE || + xfs_buf_alloc_kmem(new_bp, flags) < 0) { + /* + * For buffers that fit entirely within a single page, first + * attempt to allocate the memory from the heap to minimise + * memory usage. If we can't get heap memory for these small + * buffers, we fall back to using the page allocator. + */ error = xfs_buf_alloc_pages(new_bp, flags); - if (error) - goto out_free_buf; } + if (error) + goto out_free_buf; - spin_lock(&pag->pag_buf_lock); - bp = rhashtable_lookup_get_insert_fast(&pag->pag_buf_hash, + spin_lock(&bch->bc_lock); + bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash, &new_bp->b_rhash_head, xfs_buf_hash_params); if (IS_ERR(bp)) { error = PTR_ERR(bp); - spin_unlock(&pag->pag_buf_lock); + spin_unlock(&bch->bc_lock); goto out_free_buf; } if (bp) { /* found an existing buffer */ atomic_inc(&bp->b_hold); - spin_unlock(&pag->pag_buf_lock); + spin_unlock(&bch->bc_lock); error = xfs_buf_find_lock(bp, flags); if (error) xfs_buf_rele(bp); @@ -657,17 +674,40 @@ xfs_buf_find_insert( /* The new buffer keeps the perag reference until it is freed. */ new_bp->b_pag = pag; - spin_unlock(&pag->pag_buf_lock); + spin_unlock(&bch->bc_lock); *bpp = new_bp; return 0; out_free_buf: xfs_buf_free(new_bp); out_drop_pag: - xfs_perag_put(pag); + if (pag) + xfs_perag_put(pag); return error; } +static inline struct xfs_perag * +xfs_buftarg_get_pag( + struct xfs_buftarg *btp, + const struct xfs_buf_map *map) +{ + struct xfs_mount *mp = btp->bt_mount; + + if (xfs_buftarg_is_mem(btp)) + return NULL; + return xfs_perag_get(mp, xfs_daddr_to_agno(mp, map->bm_bn)); +} + +static inline struct xfs_buf_cache * +xfs_buftarg_buf_cache( + struct xfs_buftarg *btp, + struct xfs_perag *pag) +{ + if (pag) + return &pag->pag_bcache; + return btp->bt_cache; +} + /* * Assembles a buffer covering the specified range. The code is optimised for * cache hits, as metadata intensive workloads will see 3 orders of magnitude @@ -681,6 +721,7 @@ xfs_buf_get_map( xfs_buf_flags_t flags, struct xfs_buf **bpp) { + struct xfs_buf_cache *bch; struct xfs_perag *pag; struct xfs_buf *bp = NULL; struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn }; @@ -696,10 +737,10 @@ xfs_buf_get_map( if (error) return error; - pag = xfs_perag_get(btp->bt_mount, - xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn)); + pag = xfs_buftarg_get_pag(btp, &cmap); + bch = xfs_buftarg_buf_cache(btp, pag); - error = xfs_buf_lookup(pag, &cmap, flags, &bp); + error = xfs_buf_lookup(bch, &cmap, flags, &bp); if (error && error != -ENOENT) goto out_put_perag; @@ -711,13 +752,14 @@ xfs_buf_get_map( goto out_put_perag; /* xfs_buf_find_insert() consumes the perag reference. */ - error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps, + error = xfs_buf_find_insert(btp, bch, pag, &cmap, map, nmaps, flags, &bp); if (error) return error; } else { XFS_STATS_INC(btp->bt_mount, xb_get_locked); - xfs_perag_put(pag); + if (pag) + xfs_perag_put(pag); } /* We do not hold a perag reference anymore. */ @@ -745,7 +787,8 @@ xfs_buf_get_map( return 0; out_put_perag: - xfs_perag_put(pag); + if (pag) + xfs_perag_put(pag); return error; } @@ -892,6 +935,13 @@ xfs_buf_readahead_map( { struct xfs_buf *bp; + /* + * Currently we don't have a good means or justification for performing + * xmbuf_map_page asynchronously, so we don't do readahead. + */ + if (xfs_buftarg_is_mem(target)) + return; + xfs_buf_read_map(target, map, nmaps, XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops, __this_address); @@ -957,7 +1007,10 @@ xfs_buf_get_uncached( if (error) return error; - error = xfs_buf_alloc_pages(bp, flags); + if (xfs_buftarg_is_mem(bp->b_target)) + error = xmbuf_map_page(bp); + else + error = xfs_buf_alloc_pages(bp, flags); if (error) goto fail_free_buf; @@ -990,29 +1043,29 @@ xfs_buf_hold( atomic_inc(&bp->b_hold); } -/* - * Release a hold on the specified buffer. If the hold count is 1, the buffer is - * placed on LRU or freed (depending on b_lru_ref). - */ -void -xfs_buf_rele( +static void +xfs_buf_rele_uncached( + struct xfs_buf *bp) +{ + ASSERT(list_empty(&bp->b_lru)); + if (atomic_dec_and_test(&bp->b_hold)) { + xfs_buf_ioacct_dec(bp); + xfs_buf_free(bp); + } +} + +static void +xfs_buf_rele_cached( struct xfs_buf *bp) { + struct xfs_buftarg *btp = bp->b_target; struct xfs_perag *pag = bp->b_pag; + struct xfs_buf_cache *bch = xfs_buftarg_buf_cache(btp, pag); bool release; bool freebuf = false; trace_xfs_buf_rele(bp, _RET_IP_); - if (!pag) { - ASSERT(list_empty(&bp->b_lru)); - if (atomic_dec_and_test(&bp->b_hold)) { - xfs_buf_ioacct_dec(bp); - xfs_buf_free(bp); - } - return; - } - ASSERT(atomic_read(&bp->b_hold) > 0); /* @@ -1026,7 +1079,7 @@ xfs_buf_rele( * leading to a use-after-free scenario. */ spin_lock(&bp->b_lock); - release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock); + release = atomic_dec_and_lock(&bp->b_hold, &bch->bc_lock); if (!release) { /* * Drop the in-flight state if the buffer is already on the LRU @@ -1047,11 +1100,11 @@ xfs_buf_rele( * buffer for the LRU and clear the (now stale) dispose list * state flag */ - if (list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru)) { + if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru)) { bp->b_state &= ~XFS_BSTATE_DISPOSE; atomic_inc(&bp->b_hold); } - spin_unlock(&pag->pag_buf_lock); + spin_unlock(&bch->bc_lock); } else { /* * most of the time buffers will already be removed from the @@ -1060,16 +1113,17 @@ xfs_buf_rele( * was on was the disposal list */ if (!(bp->b_state & XFS_BSTATE_DISPOSE)) { - list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru); + list_lru_del_obj(&btp->bt_lru, &bp->b_lru); } else { ASSERT(list_empty(&bp->b_lru)); } ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); - rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head, - xfs_buf_hash_params); - spin_unlock(&pag->pag_buf_lock); - xfs_perag_put(pag); + rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head, + xfs_buf_hash_params); + spin_unlock(&bch->bc_lock); + if (pag) + xfs_perag_put(pag); freebuf = true; } @@ -1080,6 +1134,19 @@ out_unlock: xfs_buf_free(bp); } +/* + * Release a hold on the specified buffer. + */ +void +xfs_buf_rele( + struct xfs_buf *bp) +{ + trace_xfs_buf_rele(bp, _RET_IP_); + if (xfs_buf_is_uncached(bp)) + xfs_buf_rele_uncached(bp); + else + xfs_buf_rele_cached(bp); +} /* * Lock a buffer object, if it is not already locked. @@ -1585,6 +1652,12 @@ _xfs_buf_ioapply( /* we only use the buffer cache for meta-data */ op |= REQ_META; + /* in-memory targets are directly mapped, no IO required. */ + if (xfs_buftarg_is_mem(bp->b_target)) { + xfs_buf_ioend(bp); + return; + } + /* * Walk all the vectors issuing IO on them. Set up the initial offset * into the buffer and the desired IO size before we start - @@ -1940,25 +2013,30 @@ xfs_buftarg_shrink_count( } void -xfs_free_buftarg( +xfs_destroy_buftarg( struct xfs_buftarg *btp) { shrinker_free(btp->bt_shrinker); ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); percpu_counter_destroy(&btp->bt_io_count); list_lru_destroy(&btp->bt_lru); +} +void +xfs_free_buftarg( + struct xfs_buftarg *btp) +{ + xfs_destroy_buftarg(btp); fs_put_dax(btp->bt_daxdev, btp->bt_mount); /* the main block device is closed by kill_block_super */ if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev) - bdev_release(btp->bt_bdev_handle); - - kmem_free(btp); + bdev_fput(btp->bt_bdev_file); + kfree(btp); } int xfs_setsize_buftarg( - xfs_buftarg_t *btp, + struct xfs_buftarg *btp, unsigned int sectorsize) { /* Set up metadata sector size info */ @@ -1972,80 +2050,82 @@ xfs_setsize_buftarg( return -EINVAL; } - /* Set up device logical sector size mask */ - btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev); - btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1; - return 0; } -/* - * When allocating the initial buffer target we have not yet - * read in the superblock, so don't know what sized sectors - * are being used at this early stage. Play safe. - */ -STATIC int -xfs_setsize_buftarg_early( - xfs_buftarg_t *btp) +int +xfs_init_buftarg( + struct xfs_buftarg *btp, + size_t logical_sectorsize, + const char *descr) { - return xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev)); + /* Set up device logical sector size mask */ + btp->bt_logical_sectorsize = logical_sectorsize; + btp->bt_logical_sectormask = logical_sectorsize - 1; + + /* + * Buffer IO error rate limiting. Limit it to no more than 10 messages + * per 30 seconds so as to not spam logs too much on repeated errors. + */ + ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ, + DEFAULT_RATELIMIT_BURST); + + if (list_lru_init(&btp->bt_lru)) + return -ENOMEM; + if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL)) + goto out_destroy_lru; + + btp->bt_shrinker = + shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s", descr); + if (!btp->bt_shrinker) + goto out_destroy_io_count; + btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count; + btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan; + btp->bt_shrinker->private_data = btp; + shrinker_register(btp->bt_shrinker); + return 0; + +out_destroy_io_count: + percpu_counter_destroy(&btp->bt_io_count); +out_destroy_lru: + list_lru_destroy(&btp->bt_lru); + return -ENOMEM; } struct xfs_buftarg * xfs_alloc_buftarg( struct xfs_mount *mp, - struct bdev_handle *bdev_handle) + struct file *bdev_file) { - xfs_buftarg_t *btp; + struct xfs_buftarg *btp; const struct dax_holder_operations *ops = NULL; #if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE) ops = &xfs_dax_holder_operations; #endif - btp = kmem_zalloc(sizeof(*btp), KM_NOFS); + btp = kzalloc(sizeof(*btp), GFP_KERNEL | __GFP_NOFAIL); btp->bt_mount = mp; - btp->bt_bdev_handle = bdev_handle; - btp->bt_dev = bdev_handle->bdev->bd_dev; - btp->bt_bdev = bdev_handle->bdev; + btp->bt_bdev_file = bdev_file; + btp->bt_bdev = file_bdev(bdev_file); + btp->bt_dev = btp->bt_bdev->bd_dev; btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off, mp, ops); /* - * Buffer IO error rate limiting. Limit it to no more than 10 messages - * per 30 seconds so as to not spam logs too much on repeated errors. + * When allocating the buftargs we have not yet read the super block and + * thus don't know the file system sector size yet. */ - ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ, - DEFAULT_RATELIMIT_BURST); - - if (xfs_setsize_buftarg_early(btp)) + if (xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev))) goto error_free; - - if (list_lru_init(&btp->bt_lru)) + if (xfs_init_buftarg(btp, bdev_logical_block_size(btp->bt_bdev), + mp->m_super->s_id)) goto error_free; - if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL)) - goto error_lru; - - btp->bt_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s", - mp->m_super->s_id); - if (!btp->bt_shrinker) - goto error_pcpu; - - btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count; - btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan; - btp->bt_shrinker->private_data = btp; - - shrinker_register(btp->bt_shrinker); - return btp; -error_pcpu: - percpu_counter_destroy(&btp->bt_io_count); -error_lru: - list_lru_destroy(&btp->bt_lru); error_free: - kmem_free(btp); + kfree(btp); return NULL; } -- cgit v1.2.3