diff options
Diffstat (limited to 'fs/btrfs/ordered-data.c')
-rw-r--r-- | fs/btrfs/ordered-data.c | 45 |
1 files changed, 39 insertions, 6 deletions
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 81f67ebf74..35a413ce93 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -19,7 +19,6 @@ #include "qgroup.h" #include "subpage.h" #include "file.h" -#include "super.h" static struct kmem_cache *btrfs_ordered_extent_cache; @@ -295,6 +294,12 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, spin_unlock_irq(&inode->ordered_tree_lock); } +void btrfs_mark_ordered_extent_error(struct btrfs_ordered_extent *ordered) +{ + if (!test_and_set_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) + mapping_set_error(ordered->inode->i_mapping, -EIO); +} + static void finish_ordered_fn(struct btrfs_work *work) { struct btrfs_ordered_extent *ordered_extent; @@ -333,7 +338,7 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, if (WARN_ON_ONCE(len > ordered->bytes_left)) { btrfs_crit(fs_info, "bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%llu left=%llu", - inode->root->root_key.objectid, btrfs_ino(inode), + btrfs_root_id(inode->root), btrfs_ino(inode), ordered->file_offset, ordered->num_bytes, len, ordered->bytes_left); ordered->bytes_left = 0; @@ -383,6 +388,37 @@ bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate); spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); + /* + * If this is a COW write it means we created new extent maps for the + * range and they point to unwritten locations if we got an error either + * before submitting a bio or during IO. + * + * We have marked the ordered extent with BTRFS_ORDERED_IOERR, and we + * are queuing its completion below. During completion, at + * btrfs_finish_one_ordered(), we will drop the extent maps for the + * unwritten extents. + * + * However because completion runs in a work queue we can end up having + * a fast fsync running before that. In the case of direct IO, once we + * unlock the inode the fsync might start, and we queue the completion + * before unlocking the inode. In the case of buffered IO when writeback + * finishes (end_bbio_data_write()) we queue the completion, so if the + * writeback was triggered by a fast fsync, the fsync might start + * logging before ordered extent completion runs in the work queue. + * + * The fast fsync will log file extent items based on the extent maps it + * finds, so if by the time it collects extent maps the ordered extent + * completion didn't happen yet, it will log file extent items that + * point to unwritten extents, resulting in a corruption if a crash + * happens and the log tree is replayed. Note that a fast fsync does not + * wait for completion of ordered extents in order to reduce latency. + * + * Set a flag in the inode so that the next fast fsync will wait for + * ordered extents to complete before starting to log. + */ + if (!uptodate && !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) + set_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags); + if (ret) btrfs_queue_ordered_fn(ordered); return ret; @@ -1237,10 +1273,7 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( int __init ordered_data_init(void) { - btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", - sizeof(struct btrfs_ordered_extent), 0, - SLAB_MEM_SPREAD, - NULL); + btrfs_ordered_extent_cache = KMEM_CACHE(btrfs_ordered_extent, 0); if (!btrfs_ordered_extent_cache) return -ENOMEM; |