Diffstat (limited to 'fs/btrfs/extent_io.c')
-rw-r--r-- | fs/btrfs/extent_io.c | 217
1 file changed, 127 insertions, 90 deletions
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 1e02062074..958155cc43 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -396,15 +396,14 @@ again:
 	/* then test to make sure it is all still delalloc */
 	ret = test_range_bit(tree, delalloc_start, delalloc_end,
 			     EXTENT_DELALLOC, cached_state);
+
+	unlock_extent(tree, delalloc_start, delalloc_end, &cached_state);
 	if (!ret) {
-		unlock_extent(tree, delalloc_start, delalloc_end,
-			      &cached_state);
 		__unlock_for_delalloc(inode, locked_page,
				      delalloc_start, delalloc_end);
 		cond_resched();
 		goto again;
 	}
-	free_extent_state(cached_state);
 	*start = delalloc_start;
 	*end = delalloc_end;
 out_failed:
@@ -413,9 +412,10 @@ out_failed:
 
 void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
				  struct page *locked_page,
+				  struct extent_state **cached,
				  u32 clear_bits, unsigned long page_ops)
 {
-	clear_extent_bit(&inode->io_tree, start, end, clear_bits, NULL);
+	clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached);
 
 	__process_pages_contig(inode->vfs_inode.i_mapping, locked_page, start, end,
			       page_ops);
@@ -667,6 +667,37 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
 }
 
 /*
+ * Populate every free slot in a provided array with folios.
+ *
+ * @nr_folios:   number of folios to allocate
+ * @folio_array: the array to fill with folios; any existing non-NULL entries in
+ *               the array will be skipped
+ * @extra_gfp:   the extra GFP flags for the allocation
+ *
+ * Return: 0        if all folios were able to be allocated;
+ *         -ENOMEM  otherwise, the partially allocated folios would be freed and
+ *                  the array slots zeroed
+ */
+int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array,
+			    gfp_t extra_gfp)
+{
+	for (int i = 0; i < nr_folios; i++) {
+		if (folio_array[i])
+			continue;
+		folio_array[i] = folio_alloc(GFP_NOFS | extra_gfp, 0);
+		if (!folio_array[i])
+			goto error;
+	}
+	return 0;
+error:
+	for (int i = 0; i < nr_folios; i++) {
+		if (folio_array[i])
+			folio_put(folio_array[i]);
+	}
+	return -ENOMEM;
+}
+
+/*
  * Populate every free slot in a provided array with pages.
  *
  * @nr_pages:   number of pages to allocate
@@ -1571,7 +1602,7 @@ static void set_btree_ioerr(struct extent_buffer *eb)
	 * can be no longer dirty nor marked anymore for writeback (if a
	 * subsequent modification to the extent buffer didn't happen before the
	 * transaction commit), which makes filemap_fdata[write|wait]_range not
-	 * able to find the pages tagged with SetPageError at transaction
+	 * able to find the pages which contain errors at transaction
	 * commit time. So if this happens we must abort the transaction,
	 * otherwise we commit a super block with btree roots that point to
	 * btree nodes/leafs whose content on disk is invalid - either garbage
@@ -2246,8 +2277,7 @@ next_page:
 	submit_write_bio(&bio_ctrl, found_error ? ret : 0);
 }
 
-int extent_writepages(struct address_space *mapping,
-		      struct writeback_control *wbc)
+int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
 	struct inode *inode = mapping->host;
 	int ret = 0;
@@ -2267,7 +2297,7 @@ int extent_writepages(struct address_space *mapping,
 	return ret;
 }
 
-void extent_readahead(struct readahead_control *rac)
+void btrfs_readahead(struct readahead_control *rac)
 {
 	struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD };
 	struct page *pagepool[16];
@@ -2325,19 +2355,20 @@ int extent_invalidate_folio(struct extent_io_tree *tree,
  * are locked or under IO and drops the related state bits if it is safe
  * to drop the page.
  */
-static int try_release_extent_state(struct extent_io_tree *tree,
+static bool try_release_extent_state(struct extent_io_tree *tree,
				    struct page *page, gfp_t mask)
 {
 	u64 start = page_offset(page);
 	u64 end = start + PAGE_SIZE - 1;
-	int ret = 1;
+	bool ret;
 
 	if (test_range_bit_exists(tree, start, end, EXTENT_LOCKED)) {
-		ret = 0;
+		ret = false;
 	} else {
 		u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM |
				   EXTENT_DELALLOC_NEW | EXTENT_CTLBITS |
				   EXTENT_QGROUP_RESERVED);
+		int ret2;
 
 		/*
 		 * At this point we can safely clear everything except the
@@ -2345,15 +2376,15 @@ static int try_release_extent_state(struct extent_io_tree *tree,
		 * The delalloc new bit will be cleared by ordered extent
		 * completion.
		 */
-		ret = __clear_extent_bit(tree, start, end, clear_bits, NULL, NULL);
+		ret2 = __clear_extent_bit(tree, start, end, clear_bits, NULL, NULL);
 
		/* if clear_extent_bit failed for enomem reasons,
		 * we can't allow the release to continue.
		 */
-		if (ret < 0)
-			ret = 0;
+		if (ret2 < 0)
+			ret = false;
 		else
-			ret = 1;
+			ret = true;
 	}
 	return ret;
 }
@@ -2363,84 +2394,80 @@ static int try_release_extent_state(struct extent_io_tree *tree,
  * in the range corresponding to the page, both state records and extent
  * map records are removed
  */
-int try_release_extent_mapping(struct page *page, gfp_t mask)
+bool try_release_extent_mapping(struct page *page, gfp_t mask)
 {
-	struct extent_map *em;
 	u64 start = page_offset(page);
 	u64 end = start + PAGE_SIZE - 1;
-	struct btrfs_inode *btrfs_inode = page_to_inode(page);
-	struct extent_io_tree *tree = &btrfs_inode->io_tree;
-	struct extent_map_tree *map = &btrfs_inode->extent_tree;
-
-	if (gfpflags_allow_blocking(mask) &&
-	    page->mapping->host->i_size > SZ_16M) {
-		u64 len;
-		while (start <= end) {
-			struct btrfs_fs_info *fs_info;
-			u64 cur_gen;
-
-			len = end - start + 1;
-			write_lock(&map->lock);
-			em = lookup_extent_mapping(map, start, len);
-			if (!em) {
-				write_unlock(&map->lock);
-				break;
-			}
-			if ((em->flags & EXTENT_FLAG_PINNED) ||
-			    em->start != start) {
-				write_unlock(&map->lock);
-				free_extent_map(em);
-				break;
-			}
-			if (test_range_bit_exists(tree, em->start,
-						  extent_map_end(em) - 1,
-						  EXTENT_LOCKED))
-				goto next;
-			/*
-			 * If it's not in the list of modified extents, used
-			 * by a fast fsync, we can remove it. If it's being
-			 * logged we can safely remove it since fsync took an
-			 * extra reference on the em.
-			 */
-			if (list_empty(&em->list) ||
-			    (em->flags & EXTENT_FLAG_LOGGING))
-				goto remove_em;
-			/*
-			 * If it's in the list of modified extents, remove it
-			 * only if its generation is older then the current one,
-			 * in which case we don't need it for a fast fsync.
-			 * Otherwise don't remove it, we could be racing with an
-			 * ongoing fast fsync that could miss the new extent.
-			 */
-			fs_info = btrfs_inode->root->fs_info;
-			spin_lock(&fs_info->trans_lock);
-			cur_gen = fs_info->generation;
-			spin_unlock(&fs_info->trans_lock);
-			if (em->generation >= cur_gen)
-				goto next;
-remove_em:
-			/*
-			 * We only remove extent maps that are not in the list of
-			 * modified extents or that are in the list but with a
-			 * generation lower then the current generation, so there
-			 * is no need to set the full fsync flag on the inode (it
-			 * hurts the fsync performance for workloads with a data
-			 * size that exceeds or is close to the system's memory).
-			 */
-			remove_extent_mapping(map, em);
-			/* once for the rb tree */
+	struct btrfs_inode *inode = page_to_inode(page);
+	struct extent_io_tree *io_tree = &inode->io_tree;
+
+	while (start <= end) {
+		const u64 cur_gen = btrfs_get_fs_generation(inode->root->fs_info);
+		const u64 len = end - start + 1;
+		struct extent_map_tree *extent_tree = &inode->extent_tree;
+		struct extent_map *em;
+
+		write_lock(&extent_tree->lock);
+		em = lookup_extent_mapping(extent_tree, start, len);
+		if (!em) {
+			write_unlock(&extent_tree->lock);
+			break;
+		}
+		if ((em->flags & EXTENT_FLAG_PINNED) || em->start != start) {
+			write_unlock(&extent_tree->lock);
 			free_extent_map(em);
+			break;
+		}
+		if (test_range_bit_exists(io_tree, em->start,
+					  extent_map_end(em) - 1, EXTENT_LOCKED))
+			goto next;
+		/*
+		 * If it's not in the list of modified extents, used by a fast
+		 * fsync, we can remove it. If it's being logged we can safely
+		 * remove it since fsync took an extra reference on the em.
+		 */
+		if (list_empty(&em->list) || (em->flags & EXTENT_FLAG_LOGGING))
+			goto remove_em;
+		/*
+		 * If it's in the list of modified extents, remove it only if
+		 * its generation is older then the current one, in which case
+		 * we don't need it for a fast fsync. Otherwise don't remove it,
+		 * we could be racing with an ongoing fast fsync that could miss
+		 * the new extent.
+		 */
+		if (em->generation >= cur_gen)
+			goto next;
+remove_em:
+		/*
+		 * We only remove extent maps that are not in the list of
+		 * modified extents or that are in the list but with a
+		 * generation lower then the current generation, so there is no
+		 * need to set the full fsync flag on the inode (it hurts the
+		 * fsync performance for workloads with a data size that exceeds
+		 * or is close to the system's memory).
+		 */
+		remove_extent_mapping(inode, em);
+		/* Once for the inode's extent map tree. */
+		free_extent_map(em);
 next:
-			start = extent_map_end(em);
-			write_unlock(&map->lock);
+		start = extent_map_end(em);
+		write_unlock(&extent_tree->lock);
 
-			/* once for us */
-			free_extent_map(em);
+		/* Once for us, for the lookup_extent_mapping() reference. */
+		free_extent_map(em);
+
+		if (need_resched()) {
+			/*
+			 * If we need to resched but we can't block just exit
+			 * and leave any remaining extent maps.
+			 */
+			if (!gfpflags_allow_blocking(mask))
+				break;
 
-			cond_resched(); /* Allow large-extent preemption. */
+			cond_resched();
 		}
 	}
-	return try_release_extent_state(tree, page, mask);
+	return try_release_extent_state(io_tree, page, mask);
 }
 
 struct btrfs_fiemap_entry {
@@ -4269,6 +4296,13 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb)
 	}
 }
 
+static void clear_extent_buffer_reading(struct extent_buffer *eb)
+{
+	clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
+	smp_mb__after_atomic();
+	wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
+}
+
 static void end_bbio_meta_read(struct btrfs_bio *bbio)
 {
 	struct extent_buffer *eb = bbio->private;
@@ -4277,6 +4311,13 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio)
 	struct folio_iter fi;
 	u32 bio_offset = 0;
 
+	/*
+	 * If the extent buffer is marked UPTODATE before the read operation
+	 * completes, other calls to read_extent_buffer_pages() will return
+	 * early without waiting for the read to finish, causing data races.
+	 */
+	WARN_ON(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags));
+
 	eb->read_mirror = bbio->mirror_num;
 
 	if (uptodate &&
@@ -4303,9 +4344,7 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio)
 		bio_offset += len;
 	}
 
-	clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
-	smp_mb__after_atomic();
-	wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
+	clear_extent_buffer_reading(eb);
 
 	free_extent_buffer(eb);
 	bio_put(&bbio->bio);
@@ -4339,9 +4378,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
	 * will now be set, and we shouldn't read it in again.
	 */
 	if (unlikely(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))) {
-		clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
-		smp_mb__after_atomic();
-		wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
+		clear_extent_buffer_reading(eb);
 		return 0;
 	}
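
A note on the new btrfs_alloc_folio_array() helper added in this diff: it mirrors the calling convention of the existing btrfs_alloc_page_array(), filling only the NULL slots of a caller-provided array and releasing whatever it allocated if any allocation fails. The sketch below is illustrative only and is not part of this commit; the caller name demo_fill_folios() is hypothetical, and it assumes the usual btrfs kernel context (kcalloc(), kfree() and folio_put() are standard kernel APIs). Passing 0 as extra_gfp means the helper allocates with plain GFP_NOFS, per its folio_alloc(GFP_NOFS | extra_gfp, 0) call.

/*
 * Illustrative sketch only - not part of this commit. Shows how a caller
 * might use btrfs_alloc_folio_array(); the function name is hypothetical.
 */
static int demo_fill_folios(unsigned int nr_folios)
{
	struct folio **folios;
	int ret;

	/* kcalloc() zeroes the array, so every slot starts out NULL. */
	folios = kcalloc(nr_folios, sizeof(struct folio *), GFP_NOFS);
	if (!folios)
		return -ENOMEM;

	/*
	 * Fill every NULL slot with a newly allocated folio. On failure the
	 * helper itself drops the folios it managed to allocate.
	 */
	ret = btrfs_alloc_folio_array(nr_folios, folios, 0);
	if (ret < 0)
		goto out;

	/* ... use the folios ... */

	for (unsigned int i = 0; i < nr_folios; i++)
		folio_put(folios[i]);
out:
	kfree(folios);
	return ret;
}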