Diffstat:
 fs/btrfs/extent_io.c | 217 ++++++++++++++++++++++++++++++++-----------------
 1 file changed, 127 insertions(+), 90 deletions(-)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 1e02062074..958155cc43 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -396,15 +396,14 @@ again:
/* then test to make sure it is all still delalloc */
ret = test_range_bit(tree, delalloc_start, delalloc_end,
EXTENT_DELALLOC, cached_state);
+
+ unlock_extent(tree, delalloc_start, delalloc_end, &cached_state);
if (!ret) {
- unlock_extent(tree, delalloc_start, delalloc_end,
- &cached_state);
__unlock_for_delalloc(inode, locked_page,
delalloc_start, delalloc_end);
cond_resched();
goto again;
}
- free_extent_state(cached_state);
*start = delalloc_start;
*end = delalloc_end;
out_failed:
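
The separate free_extent_state() call can go because unlock_extent() consumes the cached_state reference on both paths. For context, a sketch of that helper as mainline defines it in extent-io-tree.h (shown for reference, not part of this patch):

	static inline int unlock_extent(struct extent_io_tree *tree, u64 start,
					u64 end, struct extent_state **cached)
	{
		/* Clearing EXTENT_LOCKED also drops the cached state reference. */
		return __clear_extent_bit(tree, start, end, EXTENT_LOCKED,
					  cached, NULL);
	}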
@@ -413,9 +412,10 @@ out_failed:
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
+ struct extent_state **cached,
u32 clear_bits, unsigned long page_ops)
{
- clear_extent_bit(&inode->io_tree, start, end, clear_bits, NULL);
+ clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached);
__process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
start, end, page_ops);
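
With the extra parameter, a caller that already holds a cached extent_state can hand it through and spare clear_extent_bit() a tree search. A hedged sketch of such a call site (the surrounding code is illustrative, not from this patch):

	struct extent_state *cached_state = NULL;

	lock_extent(&inode->io_tree, start, end, &cached_state);
	/* ... set up the delalloc range, or bail out on error ... */
	extent_clear_unlock_delalloc(inode, start, end, locked_page,
				     &cached_state,
				     EXTENT_LOCKED | EXTENT_DELALLOC,
				     PAGE_UNLOCK | PAGE_START_WRITEBACK);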
@@ -667,6 +667,37 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
}
/*
+ * Populate every free slot in a provided array with folios.
+ *
+ * @nr_folios: number of folios to allocate
+ * @folio_array: the array to fill with folios; any existing non-NULL entries in
+ * the array will be skipped
+ * @extra_gfp: the extra GFP flags for the allocation
+ *
+ * Return: 0 if all folios were able to be allocated;
+ * -ENOMEM otherwise, in which case the partially allocated folios are
+ * freed and their array slots reset to NULL
+ */
+int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array,
+ gfp_t extra_gfp)
+{
+ for (int i = 0; i < nr_folios; i++) {
+ if (folio_array[i])
+ continue;
+ folio_array[i] = folio_alloc(GFP_NOFS | extra_gfp, 0);
+ if (!folio_array[i])
+ goto error;
+ }
+ return 0;
+error:
+ for (int i = 0; i < nr_folios; i++) {
+ if (folio_array[i]) {
+ folio_put(folio_array[i]);
+ /* Reset the slot so it matches the documented contract. */
+ folio_array[i] = NULL;
+ }
+ }
+ return -ENOMEM;
+}
+
+/*
* Populate every free slot in a provided array with pages.
*
* @nr_pages: number of pages to allocate
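
A hedged usage sketch for the new folio helper (the caller shape is assumed, not taken from this patch):

	struct folio *folios[4] = { NULL };
	int ret;

	/* Fill every slot with an order-0 folio; GFP_NOFS is implied. */
	ret = btrfs_alloc_folio_array(ARRAY_SIZE(folios), folios, 0);
	if (ret)
		return ret;	/* -ENOMEM, and all slots are back to NULL */

	/* ... use the folios, then release each with folio_put() ... */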
@@ -1571,7 +1602,7 @@ static void set_btree_ioerr(struct extent_buffer *eb)
* can be no longer dirty nor marked anymore for writeback (if a
* subsequent modification to the extent buffer didn't happen before the
* transaction commit), which makes filemap_fdata[write|wait]_range not
- * able to find the pages tagged with SetPageError at transaction
+ * able to find the pages which contain errors at transaction
* commit time. So if this happens we must abort the transaction,
* otherwise we commit a super block with btree roots that point to
* btree nodes/leafs whose content on disk is invalid - either garbage
@@ -2246,8 +2277,7 @@ next_page:
submit_write_bio(&bio_ctrl, found_error ? ret : 0);
}
-int extent_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
+int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
int ret = 0;
@@ -2267,7 +2297,7 @@ int extent_writepages(struct address_space *mapping,
return ret;
}
-void extent_readahead(struct readahead_control *rac)
+void btrfs_readahead(struct readahead_control *rac)
{
struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD };
struct page *pagepool[16];
@@ -2325,19 +2355,20 @@ int extent_invalidate_folio(struct extent_io_tree *tree,
* are locked or under IO and drops the related state bits if it is safe
* to drop the page.
*/
-static int try_release_extent_state(struct extent_io_tree *tree,
+static bool try_release_extent_state(struct extent_io_tree *tree,
struct page *page, gfp_t mask)
{
u64 start = page_offset(page);
u64 end = start + PAGE_SIZE - 1;
- int ret = 1;
+ bool ret;
if (test_range_bit_exists(tree, start, end, EXTENT_LOCKED)) {
- ret = 0;
+ ret = false;
} else {
u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM |
EXTENT_DELALLOC_NEW | EXTENT_CTLBITS |
EXTENT_QGROUP_RESERVED);
+ int ret2;
/*
* At this point we can safely clear everything except the
@@ -2345,15 +2376,15 @@ static int try_release_extent_state(struct extent_io_tree *tree,
* The delalloc new bit will be cleared by ordered extent
* completion.
*/
- ret = __clear_extent_bit(tree, start, end, clear_bits, NULL, NULL);
+ ret2 = __clear_extent_bit(tree, start, end, clear_bits, NULL, NULL);
/*
 * If clear_extent_bit() failed for -ENOMEM reasons, we can't
 * allow the release to continue.
 */
- if (ret < 0)
- ret = 0;
+ if (ret2 < 0)
+ ret = false;
else
- ret = 1;
+ ret = true;
}
return ret;
}
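
The switch to bool matches how the result is consumed; in mainline the ->release_folio address_space op reduces to roughly this (simplified sketch of the fs/btrfs/inode.c helper):

	static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
	{
		bool ret = try_release_extent_mapping(&folio->page, gfp_flags);

		/* Only detach the private state once everything was released. */
		if (ret)
			clear_page_extent_mapped(&folio->page);
		return ret;
	}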
@@ -2363,84 +2394,80 @@ static int try_release_extent_state(struct extent_io_tree *tree,
* in the range corresponding to the page, both state records and extent
* map records are removed
*/
-int try_release_extent_mapping(struct page *page, gfp_t mask)
+bool try_release_extent_mapping(struct page *page, gfp_t mask)
{
- struct extent_map *em;
u64 start = page_offset(page);
u64 end = start + PAGE_SIZE - 1;
- struct btrfs_inode *btrfs_inode = page_to_inode(page);
- struct extent_io_tree *tree = &btrfs_inode->io_tree;
- struct extent_map_tree *map = &btrfs_inode->extent_tree;
-
- if (gfpflags_allow_blocking(mask) &&
- page->mapping->host->i_size > SZ_16M) {
- u64 len;
- while (start <= end) {
- struct btrfs_fs_info *fs_info;
- u64 cur_gen;
-
- len = end - start + 1;
- write_lock(&map->lock);
- em = lookup_extent_mapping(map, start, len);
- if (!em) {
- write_unlock(&map->lock);
- break;
- }
- if ((em->flags & EXTENT_FLAG_PINNED) ||
- em->start != start) {
- write_unlock(&map->lock);
- free_extent_map(em);
- break;
- }
- if (test_range_bit_exists(tree, em->start,
- extent_map_end(em) - 1,
- EXTENT_LOCKED))
- goto next;
- /*
- * If it's not in the list of modified extents, used
- * by a fast fsync, we can remove it. If it's being
- * logged we can safely remove it since fsync took an
- * extra reference on the em.
- */
- if (list_empty(&em->list) ||
- (em->flags & EXTENT_FLAG_LOGGING))
- goto remove_em;
- /*
- * If it's in the list of modified extents, remove it
- * only if its generation is older then the current one,
- * in which case we don't need it for a fast fsync.
- * Otherwise don't remove it, we could be racing with an
- * ongoing fast fsync that could miss the new extent.
- */
- fs_info = btrfs_inode->root->fs_info;
- spin_lock(&fs_info->trans_lock);
- cur_gen = fs_info->generation;
- spin_unlock(&fs_info->trans_lock);
- if (em->generation >= cur_gen)
- goto next;
-remove_em:
- /*
- * We only remove extent maps that are not in the list of
- * modified extents or that are in the list but with a
- * generation lower then the current generation, so there
- * is no need to set the full fsync flag on the inode (it
- * hurts the fsync performance for workloads with a data
- * size that exceeds or is close to the system's memory).
- */
- remove_extent_mapping(map, em);
- /* once for the rb tree */
+ struct btrfs_inode *inode = page_to_inode(page);
+ struct extent_io_tree *io_tree = &inode->io_tree;
+
+ while (start <= end) {
+ const u64 cur_gen = btrfs_get_fs_generation(inode->root->fs_info);
+ const u64 len = end - start + 1;
+ struct extent_map_tree *extent_tree = &inode->extent_tree;
+ struct extent_map *em;
+
+ write_lock(&extent_tree->lock);
+ em = lookup_extent_mapping(extent_tree, start, len);
+ if (!em) {
+ write_unlock(&extent_tree->lock);
+ break;
+ }
+ if ((em->flags & EXTENT_FLAG_PINNED) || em->start != start) {
+ write_unlock(&extent_tree->lock);
free_extent_map(em);
+ break;
+ }
+ if (test_range_bit_exists(io_tree, em->start,
+ extent_map_end(em) - 1, EXTENT_LOCKED))
+ goto next;
+ /*
+ * If it's not in the list of modified extents, used by a fast
+ * fsync, we can remove it. If it's being logged we can safely
+ * remove it since fsync took an extra reference on the em.
+ */
+ if (list_empty(&em->list) || (em->flags & EXTENT_FLAG_LOGGING))
+ goto remove_em;
+ /*
+ * If it's in the list of modified extents, remove it only if
+ * its generation is older than the current one, in which case
+ * we don't need it for a fast fsync. Otherwise don't remove it,
+ * we could be racing with an ongoing fast fsync that could miss
+ * the new extent.
+ */
+ if (em->generation >= cur_gen)
+ goto next;
+remove_em:
+ /*
+ * We only remove extent maps that are not in the list of
+ * modified extents or that are in the list but with a
+ * generation lower than the current generation, so there is no
+ * need to set the full fsync flag on the inode (it hurts the
+ * fsync performance for workloads with a data size that exceeds
+ * or is close to the system's memory).
+ */
+ remove_extent_mapping(inode, em);
+ /* Once for the inode's extent map tree. */
+ free_extent_map(em);
next:
- start = extent_map_end(em);
- write_unlock(&map->lock);
+ start = extent_map_end(em);
+ write_unlock(&extent_tree->lock);
- /* once for us */
- free_extent_map(em);
+ /* Once for us, for the lookup_extent_mapping() reference. */
+ free_extent_map(em);
+
+ if (need_resched()) {
+ /*
+ * If we need to reschedule but we can't block, just exit
+ * and leave any remaining extent maps.
+ */
+ if (!gfpflags_allow_blocking(mask))
+ break;
- cond_resched(); /* Allow large-extent preemption. */
+ cond_resched();
}
}
- return try_release_extent_state(tree, page, mask);
+ return try_release_extent_state(io_tree, page, mask);
}
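
The rewritten loop reads the transaction generation through btrfs_get_fs_generation() instead of taking trans_lock; for context, a sketch of that helper as mainline defines it in fs/btrfs/fs.h (not part of this hunk):

	static inline u64 btrfs_get_fs_generation(const struct btrfs_fs_info *fs_info)
	{
		/* Pairs with the WRITE_ONCE() in btrfs_set_fs_generation(). */
		return READ_ONCE(fs_info->generation);
	}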
struct btrfs_fiemap_entry {
@@ -4269,6 +4296,13 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb)
}
}
+static void clear_extent_buffer_reading(struct extent_buffer *eb)
+{
+ clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
+ smp_mb__after_atomic();
+ wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
+}
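
The wake_up_bit() in this helper pairs with the waiter side in read_extent_buffer_pages(), which in mainline sleeps on the same flag roughly like this (hedged sketch):

	/* Another read is already in flight: wait for it to finish. */
	if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags)) {
		wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING,
			       TASK_UNINTERRUPTIBLE);
		return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) ? 0 : -EIO;
	}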
+
static void end_bbio_meta_read(struct btrfs_bio *bbio)
{
struct extent_buffer *eb = bbio->private;
@@ -4277,6 +4311,13 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio)
struct folio_iter fi;
u32 bio_offset = 0;
+ /*
+ * If the extent buffer is marked UPTODATE before the read operation
+ * completes, other calls to read_extent_buffer_pages() will return
+ * early without waiting for the read to finish, causing data races.
+ */
+ WARN_ON(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags));
+
eb->read_mirror = bbio->mirror_num;
if (uptodate &&
@@ -4303,9 +4344,7 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio)
bio_offset += len;
}
- clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
- smp_mb__after_atomic();
- wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
+ clear_extent_buffer_reading(eb);
free_extent_buffer(eb);
bio_put(&bbio->bio);
@@ -4339,9 +4378,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
* will now be set, and we shouldn't read it in again.
*/
if (unlikely(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))) {
- clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
- smp_mb__after_atomic();
- wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
+ clear_extent_buffer_reading(eb);
return 0;
}
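
Taken together, the read-side protocol for the two flags condenses to roughly this (sketch with an assumed label, not the literal function body):

	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
		return 0;		/* already valid, nothing to read */

	if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags))
		goto wait_for_reader;	/* someone else is reading it */

	/* Recheck: a read may have completed between the two tests above. */
	if (unlikely(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))) {
		clear_extent_buffer_reading(eb);
		return 0;
	}
	/* ... submit the bio; end_bbio_meta_read() clears READING ... */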