summaryrefslogtreecommitdiffstats
path: root/fs/btrfs/ordered-data.c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--fs/btrfs/ordered-data.c31
1 files changed, 31 insertions, 0 deletions
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index c2a42bcde9..7dbf4162c7 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -382,6 +382,37 @@ bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate);
spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
+ /*
+ * If this is a COW write it means we created new extent maps for the
+ * range and they point to unwritten locations if we got an error either
+ * before submitting a bio or during IO.
+ *
+ * We have marked the ordered extent with BTRFS_ORDERED_IOERR, and we
+ * are queuing its completion below. During completion, at
+ * btrfs_finish_one_ordered(), we will drop the extent maps for the
+ * unwritten extents.
+ *
+ * However because completion runs in a work queue we can end up having
+ * a fast fsync running before that. In the case of direct IO, once we
+ * unlock the inode the fsync might start, and we queue the completion
+ * before unlocking the inode. In the case of buffered IO when writeback
+ * finishes (end_bbio_data_write()) we queue the completion, so if the
+ * writeback was triggered by a fast fsync, the fsync might start
+ * logging before ordered extent completion runs in the work queue.
+ *
+ * The fast fsync will log file extent items based on the extent maps it
+ * finds, so if by the time it collects extent maps the ordered extent
+ * completion didn't happen yet, it will log file extent items that
+ * point to unwritten extents, resulting in a corruption if a crash
+ * happens and the log tree is replayed. Note that a fast fsync does not
+ * wait for completion of ordered extents in order to reduce latency.
+ *
+ * Set a flag in the inode so that the next fast fsync will wait for
+ * ordered extents to complete before starting to log.
+ */
+ if (!uptodate && !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
+ set_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags);
+
if (ret)
btrfs_queue_ordered_fn(ordered);
return ret;