Diffstat (limited to 'fs/btrfs/file.c')
-rw-r--r-- | fs/btrfs/file.c | 331
1 file changed, 254 insertions, 77 deletions
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 97f6133b6e..d90138683a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -128,7 +128,7 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
 		      struct extent_state **cached, bool noreserve)
 {
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
-	int err = 0;
+	int ret = 0;
 	int i;
 	u64 num_bytes;
 	u64 start_pos;
@@ -158,10 +158,10 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
 			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
 			 EXTENT_DEFRAG, cached);
 
-	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
+	ret = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
 					extra_bits, cached);
-	if (err)
-		return err;
+	if (ret)
+		return ret;
 
 	for (i = 0; i < num_pages; i++) {
 		struct page *p = pages[i];
@@ -206,7 +206,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct extent_buffer *leaf;
 	struct btrfs_file_extent_item *fi;
-	struct btrfs_ref ref = { 0 };
 	struct btrfs_key key;
 	struct btrfs_key new_key;
 	u64 ino = btrfs_ino(inode);
@@ -246,7 +245,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	if (args->start >= inode->disk_i_size && !args->replace_extent)
 		modify_tree = 0;
 
-	update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
+	update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
 	while (1) {
 		recow = 0;
 		ret = btrfs_lookup_file_extent(trans, root, path, ino,
@@ -373,15 +372,17 @@ next_slot:
 			btrfs_mark_buffer_dirty(trans, leaf);
 
 			if (update_refs && disk_bytenr > 0) {
-				btrfs_init_generic_ref(&ref,
-						BTRFS_ADD_DELAYED_REF,
-						disk_bytenr, num_bytes, 0,
-						root->root_key.objectid);
-				btrfs_init_data_ref(&ref,
-						root->root_key.objectid,
-						new_key.objectid,
-						args->start - extent_offset,
-						0, false);
+				struct btrfs_ref ref = {
+					.action = BTRFS_ADD_DELAYED_REF,
+					.bytenr = disk_bytenr,
+					.num_bytes = num_bytes,
+					.parent = 0,
+					.owning_root = btrfs_root_id(root),
+					.ref_root = btrfs_root_id(root),
+				};
+				btrfs_init_data_ref(&ref, new_key.objectid,
+						    args->start - extent_offset,
+						    0, false);
 				ret = btrfs_inc_extent_ref(trans, &ref);
 				if (ret) {
 					btrfs_abort_transaction(trans, ret);
@@ -464,15 +465,17 @@ delete_extent_item:
 				extent_end = ALIGN(extent_end,
 						   fs_info->sectorsize);
 			} else if (update_refs && disk_bytenr > 0) {
-				btrfs_init_generic_ref(&ref,
-						BTRFS_DROP_DELAYED_REF,
-						disk_bytenr, num_bytes, 0,
-						root->root_key.objectid);
-				btrfs_init_data_ref(&ref,
-						root->root_key.objectid,
-						key.objectid,
-						key.offset - extent_offset, 0,
-						false);
+				struct btrfs_ref ref = {
+					.action = BTRFS_DROP_DELAYED_REF,
+					.bytenr = disk_bytenr,
+					.num_bytes = num_bytes,
+					.parent = 0,
+					.owning_root = btrfs_root_id(root),
+					.ref_root = btrfs_root_id(root),
+				};
+				btrfs_init_data_ref(&ref, key.objectid,
+						    key.offset - extent_offset,
+						    0, false);
 				ret = btrfs_free_extent(trans, &ref);
 				if (ret) {
 					btrfs_abort_transaction(trans, ret);
@@ -748,10 +751,13 @@ again:
 						extent_end - split);
 		btrfs_mark_buffer_dirty(trans, leaf);
 
-		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
-				       num_bytes, 0, root->root_key.objectid);
-		btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
-				    orig_offset, 0, false);
+		ref.action = BTRFS_ADD_DELAYED_REF;
+		ref.bytenr = bytenr;
+		ref.num_bytes = num_bytes;
+		ref.parent = 0;
+		ref.owning_root = btrfs_root_id(root);
+		ref.ref_root = btrfs_root_id(root);
+		btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
 		ret = btrfs_inc_extent_ref(trans, &ref);
 		if (ret) {
 			btrfs_abort_transaction(trans, ret);
@@ -774,10 +780,14 @@ again:
 
 	other_start = end;
 	other_end = 0;
-	btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
-			       num_bytes, 0, root->root_key.objectid);
-	btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset,
-			    0, false);
+
+	ref.action = BTRFS_DROP_DELAYED_REF;
+	ref.bytenr = bytenr;
+	ref.num_bytes = num_bytes;
+	ref.parent = 0;
+	ref.owning_root = btrfs_root_id(root);
+	ref.ref_root = btrfs_root_id(root);
+	btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
 	if (extent_mergeable(leaf, path->slots[0] + 1, ino, bytenr,
 			     orig_offset, &other_start, &other_end)) {
@@ -915,7 +925,7 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages,
 	unsigned long index = pos >> PAGE_SHIFT;
 	gfp_t mask = get_prepare_gfp_flags(inode, nowait);
 	fgf_t fgp_flags = get_prepare_fgp_flags(nowait);
-	int err = 0;
+	int ret = 0;
 	int faili;
 
 	for (i = 0; i < num_pages; i++) {
@@ -925,28 +935,28 @@ again:
 		if (!pages[i]) {
 			faili = i - 1;
 			if (nowait)
-				err = -EAGAIN;
+				ret = -EAGAIN;
 			else
-				err = -ENOMEM;
+				ret = -ENOMEM;
 			goto fail;
 		}
 
-		err = set_page_extent_mapped(pages[i]);
-		if (err < 0) {
+		ret = set_page_extent_mapped(pages[i]);
+		if (ret < 0) {
 			faili = i;
 			goto fail;
 		}
 
 		if (i == 0)
-			err = prepare_uptodate_page(inode, pages[i], pos,
+			ret = prepare_uptodate_page(inode, pages[i], pos,
 						    force_uptodate);
-		if (!err && i == num_pages - 1)
-			err = prepare_uptodate_page(inode, pages[i],
+		if (!ret && i == num_pages - 1)
+			ret = prepare_uptodate_page(inode, pages[i],
 						    pos + write_bytes, false);
-		if (err) {
+		if (ret) {
 			put_page(pages[i]);
-			if (!nowait && err == -EAGAIN) {
-				err = 0;
+			if (!nowait && ret == -EAGAIN) {
+				ret = 0;
 				goto again;
 			}
 			faili = i - 1;
@@ -962,7 +972,7 @@ fail:
 		put_page(pages[faili]);
 		faili--;
 	}
-	return err;
+	return ret;
 
 }
 
@@ -1465,7 +1475,7 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
 	ssize_t written_buffered;
 	size_t prev_left = 0;
 	loff_t endbyte;
-	ssize_t err;
+	ssize_t ret;
 	unsigned int ilock_flags = 0;
 	struct iomap_dio *dio;
 
@@ -1482,9 +1492,9 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
 		ilock_flags |= BTRFS_ILOCK_SHARED;
 
 relock:
-	err = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
-	if (err < 0)
-		return err;
+	ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
+	if (ret < 0)
+		return ret;
 
 	/* Shared lock cannot be used with security bits set. */
 	if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
@@ -1493,14 +1503,14 @@ relock:
 		goto relock;
 	}
 
-	err = generic_write_checks(iocb, from);
-	if (err <= 0) {
+	ret = generic_write_checks(iocb, from);
+	if (ret <= 0) {
 		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
-		return err;
+		return ret;
 	}
 
-	err = btrfs_write_check(iocb, from, err);
-	if (err < 0) {
+	ret = btrfs_write_check(iocb, from, ret);
+	if (ret < 0) {
 		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
 		goto out;
 	}
@@ -1552,15 +1562,15 @@ relock:
 	btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
 
 	if (IS_ERR_OR_NULL(dio))
-		err = PTR_ERR_OR_ZERO(dio);
+		ret = PTR_ERR_OR_ZERO(dio);
 	else
-		err = iomap_dio_complete(dio);
+		ret = iomap_dio_complete(dio);
 
 	/* No increment (+=) because iomap returns a cumulative value. */
-	if (err > 0)
-		written = err;
+	if (ret > 0)
+		written = ret;
 
-	if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) {
+	if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) {
 		const size_t left = iov_iter_count(from);
 		/*
 		 * We have more data left to write. Try to fault in as many as
@@ -1577,7 +1587,7 @@ relock:
 		 * to buffered IO in case we haven't made any progress.
 		 */
 		if (left == prev_left) {
-			err = -ENOTBLK;
+			ret = -ENOTBLK;
 		} else {
 			fault_in_iov_iter_readable(from, left);
 			prev_left = left;
@@ -1586,10 +1596,10 @@ relock:
 	}
 
 	/*
-	 * If 'err' is -ENOTBLK or we have not written all data, then it means
+	 * If 'ret' is -ENOTBLK or we have not written all data, then it means
 	 * we must fallback to buffered IO.
 	 */
-	if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from))
+	if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from))
 		goto out;
 
 buffered:
@@ -1600,14 +1610,14 @@ buffered:
 	 * below, we will block when flushing and waiting for the IO.
 	 */
 	if (iocb->ki_flags & IOCB_NOWAIT) {
-		err = -EAGAIN;
+		ret = -EAGAIN;
 		goto out;
 	}
 
 	pos = iocb->ki_pos;
 	written_buffered = btrfs_buffered_write(iocb, from);
 	if (written_buffered < 0) {
-		err = written_buffered;
+		ret = written_buffered;
 		goto out;
 	}
 	/*
@@ -1615,18 +1625,18 @@ buffered:
 	 * able to read what was just written.
 	 */
 	endbyte = pos + written_buffered - 1;
-	err = btrfs_fdatawrite_range(inode, pos, endbyte);
-	if (err)
+	ret = btrfs_fdatawrite_range(inode, pos, endbyte);
+	if (ret)
 		goto out;
-	err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
-	if (err)
+	ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
+	if (ret)
 		goto out;
 	written += written_buffered;
 	iocb->ki_pos = pos + written_buffered;
 	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
 				 endbyte >> PAGE_SHIFT);
 out:
-	return err < 0 ? err : written;
+	return ret < 0 ? ret : written;
 }
 
 static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
@@ -2045,6 +2055,172 @@ out_release_extents:
 	goto out;
 }
 
+/*
+ * btrfs_page_mkwrite() is not allowed to change the file size as it gets
+ * called from a page fault handler when a page is first dirtied. Hence we must
+ * be careful to check for EOF conditions here. We set the page up correctly
+ * for a written page which means we get ENOSPC checking when writing into
+ * holes and correct delalloc and unwritten extent mapping on filesystems that
+ * support these features.
+ *
+ * We are not allowed to take the i_mutex here so we have to play games to
+ * protect against truncate races as the page could now be beyond EOF. Because
+ * truncate_setsize() writes the inode size before removing pages, once we have
+ * the page lock we can determine safely if the page is beyond EOF. If it is
+ * not beyond EOF, then the page is guaranteed safe against truncation until
+ * we unlock the page.
+ */
+static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
+{
+	struct page *page = vmf->page;
+	struct folio *folio = page_folio(page);
+	struct inode *inode = file_inode(vmf->vma->vm_file);
+	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct btrfs_ordered_extent *ordered;
+	struct extent_state *cached_state = NULL;
+	struct extent_changeset *data_reserved = NULL;
+	unsigned long zero_start;
+	loff_t size;
+	vm_fault_t ret;
+	int ret2;
+	int reserved = 0;
+	u64 reserved_space;
+	u64 page_start;
+	u64 page_end;
+	u64 end;
+
+	ASSERT(folio_order(folio) == 0);
+
+	reserved_space = PAGE_SIZE;
+
+	sb_start_pagefault(inode->i_sb);
+	page_start = page_offset(page);
+	page_end = page_start + PAGE_SIZE - 1;
+	end = page_end;
+
+	/*
+	 * Reserving delalloc space after obtaining the page lock can lead to
+	 * deadlock. For example, if a dirty page is locked by this function
+	 * and the call to btrfs_delalloc_reserve_space() ends up triggering
+	 * dirty page write out, then the btrfs_writepages() function could
+	 * end up waiting indefinitely to get a lock on the page currently
+	 * being processed by btrfs_page_mkwrite() function.
+	 */
+	ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
+					    page_start, reserved_space);
+	if (!ret2) {
+		ret2 = file_update_time(vmf->vma->vm_file);
+		reserved = 1;
+	}
+	if (ret2) {
+		ret = vmf_error(ret2);
+		if (reserved)
+			goto out;
+		goto out_noreserve;
+	}
+
+	/* Make the VM retry the fault. */
+	ret = VM_FAULT_NOPAGE;
+again:
+	down_read(&BTRFS_I(inode)->i_mmap_lock);
+	lock_page(page);
+	size = i_size_read(inode);
+
+	if ((page->mapping != inode->i_mapping) ||
+	    (page_start >= size)) {
+		/* Page got truncated out from underneath us. */
+		goto out_unlock;
+	}
+	wait_on_page_writeback(page);
+
+	lock_extent(io_tree, page_start, page_end, &cached_state);
+	ret2 = set_page_extent_mapped(page);
+	if (ret2 < 0) {
+		ret = vmf_error(ret2);
+		unlock_extent(io_tree, page_start, page_end, &cached_state);
+		goto out_unlock;
+	}
+
+	/*
+	 * We can't set the delalloc bits if there are pending ordered
+	 * extents. Drop our locks and wait for them to finish.
+	 */
+	ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE);
+	if (ordered) {
+		unlock_extent(io_tree, page_start, page_end, &cached_state);
+		unlock_page(page);
+		up_read(&BTRFS_I(inode)->i_mmap_lock);
+		btrfs_start_ordered_extent(ordered);
+		btrfs_put_ordered_extent(ordered);
+		goto again;
+	}
+
+	if (page->index == ((size - 1) >> PAGE_SHIFT)) {
+		reserved_space = round_up(size - page_start, fs_info->sectorsize);
+		if (reserved_space < PAGE_SIZE) {
+			end = page_start + reserved_space - 1;
+			btrfs_delalloc_release_space(BTRFS_I(inode),
+					data_reserved, page_start,
+					PAGE_SIZE - reserved_space, true);
+		}
+	}
+
+	/*
+	 * page_mkwrite gets called when the page is firstly dirtied after it's
+	 * faulted in, but write(2) could also dirty a page and set delalloc
+	 * bits, thus in this case for space account reason, we still need to
+	 * clear any delalloc bits within this page range since we have to
+	 * reserve data&meta space before lock_page() (see above comments).
+	 */
+	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
+			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+			 EXTENT_DEFRAG, &cached_state);
+
+	ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
+					 &cached_state);
+	if (ret2) {
+		unlock_extent(io_tree, page_start, page_end, &cached_state);
+		ret = VM_FAULT_SIGBUS;
+		goto out_unlock;
+	}
+
+	/* Page is wholly or partially inside EOF. */
+	if (page_start + PAGE_SIZE > size)
+		zero_start = offset_in_page(size);
+	else
+		zero_start = PAGE_SIZE;
+
+	if (zero_start != PAGE_SIZE)
+		memzero_page(page, zero_start, PAGE_SIZE - zero_start);
+
+	btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE);
+	btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start);
+	btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start);
+
+	btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
+
+	unlock_extent(io_tree, page_start, page_end, &cached_state);
+	up_read(&BTRFS_I(inode)->i_mmap_lock);
+
+	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+	sb_end_pagefault(inode->i_sb);
+	extent_changeset_free(data_reserved);
+	return VM_FAULT_LOCKED;
+
+out_unlock:
+	unlock_page(page);
+	up_read(&BTRFS_I(inode)->i_mmap_lock);
+out:
+	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+	btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
+				     reserved_space, (ret != 0));
+out_noreserve:
+	sb_end_pagefault(inode->i_sb);
+	extent_changeset_free(data_reserved);
+	return ret;
+}
+
 static const struct vm_operations_struct btrfs_file_vm_ops = {
 	.fault		= filemap_fault,
 	.map_pages	= filemap_map_pages,
@@ -2274,7 +2450,6 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
 	struct extent_buffer *leaf;
 	struct btrfs_key key;
 	int slot;
-	struct btrfs_ref ref = { 0 };
 	int ret;
 
 	if (replace_len == 0)
@@ -2330,15 +2505,17 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
 						 extent_info->qgroup_reserved,
 						 &key);
 	} else {
+		struct btrfs_ref ref = {
+			.action = BTRFS_ADD_DELAYED_REF,
+			.bytenr = extent_info->disk_offset,
+			.num_bytes = extent_info->disk_len,
+			.owning_root = btrfs_root_id(root),
+			.ref_root = btrfs_root_id(root),
+		};
 		u64 ref_offset;
 
-		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
-				       extent_info->disk_offset,
-				       extent_info->disk_len, 0,
-				       root->root_key.objectid);
 		ref_offset = extent_info->file_offset - extent_info->data_offset;
-		btrfs_init_data_ref(&ref, root->root_key.objectid,
-				    btrfs_ino(inode), ref_offset, 0, false);
+		btrfs_init_data_ref(&ref, btrfs_ino(inode), ref_offset, 0, false);
 		ret = btrfs_inc_extent_ref(trans, &ref);
 	}
 
@@ -3735,8 +3912,7 @@ static int btrfs_file_open(struct inode *inode, struct file *filp)
 {
 	int ret;
 
-	filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC |
-			FMODE_CAN_ODIRECT;
+	filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
 
 	ret = fsverity_file_open(inode, filp);
 	if (ret)
@@ -3866,6 +4042,7 @@ const struct file_operations btrfs_file_operations = {
 	.compat_ioctl	= btrfs_compat_ioctl,
 #endif
 	.remap_file_range = btrfs_remap_file_range,
+	.fop_flags	= FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
};

int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
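The recurring change above, in btrfs_drop_extents(), the extent-split path and btrfs_insert_replace_extent(), is the same conversion: btrfs_init_generic_ref() is removed, struct btrfs_ref is filled in directly, and btrfs_root_id(root) replaces the open-coded root->root_key.objectid. A minimal sketch of the resulting call pattern, using only the fields and helpers visible in the hunks above; the wrapper function and its name are hypothetical, not a helper that exists in the tree:

/*
 * Illustrative sketch only: mirrors the initialization pattern of the
 * converted call sites above.
 */
static int add_data_ref_sketch(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root, u64 disk_bytenr,
			       u64 num_bytes, u64 objectid, u64 offset)
{
	struct btrfs_ref ref = {
		.action = BTRFS_ADD_DELAYED_REF,
		.bytenr = disk_bytenr,
		.num_bytes = num_bytes,
		.parent = 0,	/* 0: not a shared (parent-block) backref */
		.owning_root = btrfs_root_id(root),
		.ref_root = btrfs_root_id(root),
	};

	/* Attach the data-extent specifics, as in the hunks above. */
	btrfs_init_data_ref(&ref, objectid, offset, 0, false);
	return btrfs_inc_extent_ref(trans, &ref);
}

Designated initializers zero every field that is not named, which is why the converted sites can drop the old struct btrfs_ref ref = { 0 }; declarations without leaving stale state behind.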
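The last two hunks are an independent cleanup: the buffered-I/O async capability bits move from per-open f_mode flags set in ->open() (FMODE_BUF_RASYNC | FMODE_BUF_WASYNC) to the static fop_flags member of struct file_operations (FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC). A before/after sketch of that shape, with hypothetical example names and all unrelated members elided:

/* Before: every successful open() had to set the capability bits. */
static int example_open_before(struct inode *inode, struct file *filp)
{
	filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC |
			FMODE_CAN_ODIRECT;
	return 0;
}

/* After: only genuinely per-open state remains in ->open()... */
static int example_open_after(struct inode *inode, struct file *filp)
{
	filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
	return 0;
}

/* ...and the constant capabilities are declared once on the fops. */
static const struct file_operations example_fops = {
	.open		= example_open_after,
	.fop_flags	= FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
};

Since these flags describe the file_operations implementation rather than any particular open file, declaring them statically avoids redundant per-open work and keeps f_mode for true per-open state.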