Diffstat:
-rw-r--r--  fs/bcachefs/fs-io-pagecache.c  791
1 file changed, 791 insertions, 0 deletions
diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c
new file mode 100644
index 0000000000..ff664fd0d8
--- /dev/null
+++ b/fs/bcachefs/fs-io-pagecache.c
@@ -0,0 +1,791 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "btree_iter.h"
+#include "extents.h"
+#include "fs-io.h"
+#include "fs-io-pagecache.h"
+#include "subvolume.h"
+
+#include <linux/pagevec.h>
+#include <linux/writeback.h>
+
+int bch2_filemap_get_contig_folios_d(struct address_space *mapping,
+                                     loff_t start, u64 end,
+                                     fgf_t fgp_flags, gfp_t gfp,
+                                     folios *fs)
+{
+        struct folio *f;
+        u64 pos = start;
+        int ret = 0;
+
+        while (pos < end) {
+                if ((u64) pos >= (u64) start + (1ULL << 20))
+                        fgp_flags &= ~FGP_CREAT;
+
+                ret = darray_make_room_gfp(fs, 1, gfp & GFP_KERNEL);
+                if (ret)
+                        break;
+
+                f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp);
+                if (IS_ERR_OR_NULL(f))
+                        break;
+
+                BUG_ON(fs->nr && folio_pos(f) != pos);
+
+                pos = folio_end_pos(f);
+                darray_push(fs, f);
+        }
+
+        if (!fs->nr && !ret && (fgp_flags & FGP_CREAT))
+                ret = -ENOMEM;
+
+        return fs->nr ? 0 : ret;
+}
+
+/* pagecache_block must be held */
+int bch2_write_invalidate_inode_pages_range(struct address_space *mapping,
+                                            loff_t start, loff_t end)
+{
+        int ret;
+
+        /*
+         * XXX: the way this is currently implemented, we can spin if a process
+         * is continually redirtying a specific page
+         */
+        do {
+                if (!mapping->nrpages)
+                        return 0;
+
+                ret = filemap_write_and_wait_range(mapping, start, end);
+                if (ret)
+                        break;
+
+                if (!mapping->nrpages)
+                        return 0;
+
+                ret = invalidate_inode_pages2_range(mapping,
+                                start >> PAGE_SHIFT,
+                                end >> PAGE_SHIFT);
+        } while (ret == -EBUSY);
+
+        return ret;
+}
+
+#if 0
+/* Useful for debug tracing: */
+static const char * const bch2_folio_sector_states[] = {
+#define x(n)    #n,
+        BCH_FOLIO_SECTOR_STATE()
+#undef x
+        NULL
+};
+#endif
+
+static inline enum bch_folio_sector_state
+folio_sector_dirty(enum bch_folio_sector_state state)
+{
+        switch (state) {
+        case SECTOR_unallocated:
+                return SECTOR_dirty;
+        case SECTOR_reserved:
+                return SECTOR_dirty_reserved;
+        default:
+                return state;
+        }
+}
+
+static inline enum bch_folio_sector_state
+folio_sector_undirty(enum bch_folio_sector_state state)
+{
+        switch (state) {
+        case SECTOR_dirty:
+                return SECTOR_unallocated;
+        case SECTOR_dirty_reserved:
+                return SECTOR_reserved;
+        default:
+                return state;
+        }
+}
+
+static inline enum bch_folio_sector_state
+folio_sector_reserve(enum bch_folio_sector_state state)
+{
+        switch (state) {
+        case SECTOR_unallocated:
+                return SECTOR_reserved;
+        case SECTOR_dirty:
+                return SECTOR_dirty_reserved;
+        default:
+                return state;
+        }
+}
+
+/* for newly allocated folios: */
+struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp)
+{
+        struct bch_folio *s;
+
+        s = kzalloc(sizeof(*s) +
+                    sizeof(struct bch_folio_sector) *
+                    folio_sectors(folio), gfp);
+        if (!s)
+                return NULL;
+
+        spin_lock_init(&s->lock);
+        folio_attach_private(folio, s);
+        return s;
+}
+
+struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp)
+{
+        return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp);
+}
+
+static unsigned bkey_to_sector_state(struct bkey_s_c k)
+{
+        if (bkey_extent_is_reservation(k))
+                return SECTOR_reserved;
+        if (bkey_extent_is_allocation(k.k))
+                return SECTOR_allocated;
+        return SECTOR_unallocated;
+}
+
+static void __bch2_folio_set(struct folio *folio,
+                             unsigned pg_offset, unsigned pg_len,
+                             unsigned nr_ptrs, unsigned state)
+{
+        struct bch_folio *s = bch2_folio(folio);
+        unsigned i, sectors = folio_sectors(folio);
+
+        BUG_ON(pg_offset >= sectors);
+        BUG_ON(pg_offset + pg_len > sectors);
+
+        spin_lock(&s->lock);
+
+        for (i = pg_offset; i < pg_offset + pg_len; i++) {
+                s->s[i].nr_replicas = nr_ptrs;
+                bch2_folio_sector_set(folio, s, i, state);
+        }
+
+        if (i == sectors)
+                s->uptodate = true;
+
+        spin_unlock(&s->lock);
+}
+
+/*
+ * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the
+ * extents btree:
+ */
+int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
+                   struct folio **fs, unsigned nr_folios)
+{
+        struct btree_trans *trans;
+        struct btree_iter iter;
+        struct bkey_s_c k;
+        struct bch_folio *s;
+        u64 offset = folio_sector(fs[0]);
+        unsigned folio_idx;
+        u32 snapshot;
+        bool need_set = false;
+        int ret;
+
+        for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) {
+                s = bch2_folio_create(fs[folio_idx], GFP_KERNEL);
+                if (!s)
+                        return -ENOMEM;
+
+                need_set |= !s->uptodate;
+        }
+
+        if (!need_set)
+                return 0;
+
+        folio_idx = 0;
+        trans = bch2_trans_get(c);
+retry:
+        bch2_trans_begin(trans);
+
+        ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+        if (ret)
+                goto err;
+
+        for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
+                           SPOS(inum.inum, offset, snapshot),
+                           BTREE_ITER_SLOTS, k, ret) {
+                unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
+                unsigned state = bkey_to_sector_state(k);
+
+                while (folio_idx < nr_folios) {
+                        struct folio *folio = fs[folio_idx];
+                        u64 folio_start = folio_sector(folio);
+                        u64 folio_end = folio_end_sector(folio);
+                        unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) -
+                                folio_start;
+                        unsigned folio_len = min(k.k->p.offset, folio_end) -
+                                folio_offset - folio_start;
+
+                        BUG_ON(k.k->p.offset < folio_start);
+                        BUG_ON(bkey_start_offset(k.k) > folio_end);
+
+                        if (!bch2_folio(folio)->uptodate)
+                                __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state);
+
+                        if (k.k->p.offset < folio_end)
+                                break;
+                        folio_idx++;
+                }
+
+                if (folio_idx == nr_folios)
+                        break;
+        }
+
+        offset = iter.pos.offset;
+        bch2_trans_iter_exit(trans, &iter);
+err:
+        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+                goto retry;
+        bch2_trans_put(trans);
+
+        return ret;
+}
+
+void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k)
+{
+        struct bvec_iter iter;
+        struct folio_vec fv;
+        unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
+                ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k);
+        unsigned state = bkey_to_sector_state(k);
+
+        bio_for_each_folio(fv, bio, iter)
+                __bch2_folio_set(fv.fv_folio,
+                                 fv.fv_offset >> 9,
+                                 fv.fv_len >> 9,
+                                 nr_ptrs, state);
+}
+
+void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode,
+                                     u64 start, u64 end)
+{
+        pgoff_t index = start >> PAGE_SECTORS_SHIFT;
+        pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
+        struct folio_batch fbatch;
+        unsigned i, j;
+
+        if (end <= start)
+                return;
+
+        folio_batch_init(&fbatch);
+
+        while (filemap_get_folios(inode->v.i_mapping,
+                                  &index, end_index, &fbatch)) {
+                for (i = 0; i < folio_batch_count(&fbatch); i++) {
+                        struct folio *folio = fbatch.folios[i];
+                        u64 folio_start = folio_sector(folio);
+                        u64 folio_end = folio_end_sector(folio);
+                        unsigned folio_offset = max(start, folio_start) - folio_start;
+                        unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
+                        struct bch_folio *s;
+
+                        BUG_ON(end <= folio_start);
+
+                        folio_lock(folio);
+                        s = bch2_folio(folio);
+
+                        if (s) {
+                                spin_lock(&s->lock);
+                                for (j = folio_offset; j < folio_offset + folio_len; j++)
+                                        s->s[j].nr_replicas = 0;
+                                spin_unlock(&s->lock);
+                        }
+
+                        folio_unlock(folio);
+                }
+                folio_batch_release(&fbatch);
+                cond_resched();
+        }
+}
+
+void bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
+                                  u64 start, u64 end)
+{
+        struct bch_fs *c = inode->v.i_sb->s_fs_info;
+        pgoff_t index = start >> PAGE_SECTORS_SHIFT;
+        pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
+        struct folio_batch fbatch;
+        s64 i_sectors_delta = 0;
+        unsigned i, j;
+
+        if (end <= start)
+                return;
+
+        folio_batch_init(&fbatch);
+
+        while (filemap_get_folios(inode->v.i_mapping,
+                                  &index, end_index, &fbatch)) {
+                for (i = 0; i < folio_batch_count(&fbatch); i++) {
+                        struct folio *folio = fbatch.folios[i];
+                        u64 folio_start = folio_sector(folio);
+                        u64 folio_end = folio_end_sector(folio);
+                        unsigned folio_offset = max(start, folio_start) - folio_start;
+                        unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
+                        struct bch_folio *s;
+
+                        BUG_ON(end <= folio_start);
+
+                        folio_lock(folio);
+                        s = bch2_folio(folio);
+
+                        if (s) {
+                                spin_lock(&s->lock);
+                                for (j = folio_offset; j < folio_offset + folio_len; j++) {
+                                        i_sectors_delta -= s->s[j].state == SECTOR_dirty;
+                                        bch2_folio_sector_set(folio, s, j,
+                                                folio_sector_reserve(s->s[j].state));
+                                }
+                                spin_unlock(&s->lock);
+                        }
+
+                        folio_unlock(folio);
+                }
+                folio_batch_release(&fbatch);
+                cond_resched();
+        }
+
+        bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
+}
+
+static inline unsigned sectors_to_reserve(struct bch_folio_sector *s,
+                                          unsigned nr_replicas)
+{
+        return max(0, (int) nr_replicas -
+                   s->nr_replicas -
+                   s->replicas_reserved);
+}
+
+int bch2_get_folio_disk_reservation(struct bch_fs *c,
+                                struct bch_inode_info *inode,
+                                struct folio *folio, bool check_enospc)
+{
+        struct bch_folio *s = bch2_folio_create(folio, 0);
+        unsigned nr_replicas = inode_nr_replicas(c, inode);
+        struct disk_reservation disk_res = { 0 };
+        unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0;
+        int ret;
+
+        if (!s)
+                return -ENOMEM;
+
+        for (i = 0; i < sectors; i++)
+                disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas);
+
+        if (!disk_res_sectors)
+                return 0;
+
+        ret = bch2_disk_reservation_get(c, &disk_res,
+                                        disk_res_sectors, 1,
+                                        !check_enospc
+                                        ? BCH_DISK_RESERVATION_NOFAIL
+                                        : 0);
+        if (unlikely(ret))
+                return ret;
+
+        for (i = 0; i < sectors; i++)
+                s->s[i].replicas_reserved +=
+                        sectors_to_reserve(&s->s[i], nr_replicas);
+
+        return 0;
+}
+
+void bch2_folio_reservation_put(struct bch_fs *c,
+                        struct bch_inode_info *inode,
+                        struct bch2_folio_reservation *res)
+{
+        bch2_disk_reservation_put(c, &res->disk);
+        bch2_quota_reservation_put(c, inode, &res->quota);
+}
+
+int bch2_folio_reservation_get(struct bch_fs *c,
+                        struct bch_inode_info *inode,
+                        struct folio *folio,
+                        struct bch2_folio_reservation *res,
+                        unsigned offset, unsigned len)
+{
+        struct bch_folio *s = bch2_folio_create(folio, 0);
+        unsigned i, disk_sectors = 0, quota_sectors = 0;
+        int ret;
+
+        if (!s)
+                return -ENOMEM;
+
+        BUG_ON(!s->uptodate);
+
+        for (i = round_down(offset, block_bytes(c)) >> 9;
+             i < round_up(offset + len, block_bytes(c)) >> 9;
+             i++) {
+                disk_sectors += sectors_to_reserve(&s->s[i],
+                                                res->disk.nr_replicas);
+                quota_sectors += s->s[i].state == SECTOR_unallocated;
+        }
+
+        if (disk_sectors) {
+                ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0);
+                if (unlikely(ret))
+                        return ret;
+        }
+
+        if (quota_sectors) {
+                ret = bch2_quota_reservation_add(c, inode, &res->quota,
+                                                 quota_sectors, true);
+                if (unlikely(ret)) {
+                        struct disk_reservation tmp = {
+                                .sectors = disk_sectors
+                        };
+
+                        bch2_disk_reservation_put(c, &tmp);
+                        res->disk.sectors -= disk_sectors;
+                        return ret;
+                }
+        }
+
+        return 0;
+}
+
+static void bch2_clear_folio_bits(struct folio *folio)
+{
+        struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
+        struct bch_fs *c = inode->v.i_sb->s_fs_info;
+        struct bch_folio *s = bch2_folio(folio);
+        struct disk_reservation disk_res = { 0 };
+        int i, sectors = folio_sectors(folio), dirty_sectors = 0;
+
+        if (!s)
+                return;
+
+        EBUG_ON(!folio_test_locked(folio));
+        EBUG_ON(folio_test_writeback(folio));
+
+        for (i = 0; i < sectors; i++) {
+                disk_res.sectors += s->s[i].replicas_reserved;
+                s->s[i].replicas_reserved = 0;
+
+                dirty_sectors -= s->s[i].state == SECTOR_dirty;
+                bch2_folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state));
+        }
+
+        bch2_disk_reservation_put(c, &disk_res);
+
+        bch2_i_sectors_acct(c, inode, NULL, dirty_sectors);
+
+        bch2_folio_release(folio);
+}
+
+void bch2_set_folio_dirty(struct bch_fs *c,
+                          struct bch_inode_info *inode,
+                          struct folio *folio,
+                          struct bch2_folio_reservation *res,
+                          unsigned offset, unsigned len)
+{
+        struct bch_folio *s = bch2_folio(folio);
+        unsigned i, dirty_sectors = 0;
+
+        WARN_ON((u64) folio_pos(folio) + offset + len >
+                round_up((u64) i_size_read(&inode->v), block_bytes(c)));
+
+        BUG_ON(!s->uptodate);
+
+        spin_lock(&s->lock);
+
+        for (i = round_down(offset, block_bytes(c)) >> 9;
+             i < round_up(offset + len, block_bytes(c)) >> 9;
+             i++) {
+                unsigned sectors = sectors_to_reserve(&s->s[i],
+                                                res->disk.nr_replicas);
+
+                /*
+                 * This can happen if we race with the error path in
+                 * bch2_writepage_io_done():
+                 */
+                sectors = min_t(unsigned, sectors, res->disk.sectors);
+
+                s->s[i].replicas_reserved += sectors;
+                res->disk.sectors -= sectors;
+
+                dirty_sectors += s->s[i].state == SECTOR_unallocated;
+
+                bch2_folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state));
+        }
+
+        spin_unlock(&s->lock);
+
+        bch2_i_sectors_acct(c, inode, &res->quota, dirty_sectors);
+
+        if (!folio_test_dirty(folio))
+                filemap_dirty_folio(inode->v.i_mapping, folio);
+}
+
+vm_fault_t bch2_page_fault(struct vm_fault *vmf)
+{
+        struct file *file = vmf->vma->vm_file;
+        struct address_space *mapping = file->f_mapping;
+        struct address_space *fdm = faults_disabled_mapping();
+        struct bch_inode_info *inode = file_bch_inode(file);
+        vm_fault_t ret;
+
+        if (fdm == mapping)
+                return VM_FAULT_SIGBUS;
+
+        /* Lock ordering: */
+        if (fdm > mapping) {
+                struct bch_inode_info *fdm_host = to_bch_ei(fdm->host);
+
+                if (bch2_pagecache_add_tryget(inode))
+                        goto got_lock;
+
+                bch2_pagecache_block_put(fdm_host);
+
+                bch2_pagecache_add_get(inode);
+                bch2_pagecache_add_put(inode);
+
+                bch2_pagecache_block_get(fdm_host);
+
+                /* Signal that lock has been dropped: */
+                set_fdm_dropped_locks();
+                return VM_FAULT_SIGBUS;
+        }
+
+        bch2_pagecache_add_get(inode);
+got_lock:
+        ret = filemap_fault(vmf);
+        bch2_pagecache_add_put(inode);
+
+        return ret;
+}
+
+vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
+{
+        struct folio *folio = page_folio(vmf->page);
+        struct file *file = vmf->vma->vm_file;
+        struct bch_inode_info *inode = file_bch_inode(file);
+        struct address_space *mapping = file->f_mapping;
+        struct bch_fs *c = inode->v.i_sb->s_fs_info;
+        struct bch2_folio_reservation res;
+        unsigned len;
+        loff_t isize;
+        vm_fault_t ret;
+
+        bch2_folio_reservation_init(c, inode, &res);
+
+        sb_start_pagefault(inode->v.i_sb);
+        file_update_time(file);
+
+        /*
+         * Not strictly necessary, but helps avoid dio writes livelocking in
+         * bch2_write_invalidate_inode_pages_range() - can drop this if/when we get
+         * a bch2_write_invalidate_inode_pages_range() that works without dropping
+         * page lock before invalidating page
+         */
+        bch2_pagecache_add_get(inode);
+
+        folio_lock(folio);
+        isize = i_size_read(&inode->v);
+
+        if (folio->mapping != mapping || folio_pos(folio) >= isize) {
+                folio_unlock(folio);
+                ret = VM_FAULT_NOPAGE;
+                goto out;
+        }
+
+        len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio));
+
+        if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?:
+            bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) {
+                folio_unlock(folio);
+                ret = VM_FAULT_SIGBUS;
+                goto out;
+        }
+
+        bch2_set_folio_dirty(c, inode, folio, &res, 0, len);
+        bch2_folio_reservation_put(c, inode, &res);
+
+        folio_wait_stable(folio);
+        ret = VM_FAULT_LOCKED;
+out:
+        bch2_pagecache_add_put(inode);
+        sb_end_pagefault(inode->v.i_sb);
+
+        return ret;
+}
+
+void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length)
+{
+        if (offset || length < folio_size(folio))
+                return;
+
+        bch2_clear_folio_bits(folio);
+}
+
+bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask)
+{
+        if (folio_test_dirty(folio) || folio_test_writeback(folio))
+                return false;
+
+        bch2_clear_folio_bits(folio);
+        return true;
+}
+
+/* fseek: */
+
+static int folio_data_offset(struct folio *folio, loff_t pos,
+                             unsigned min_replicas)
+{
+        struct bch_folio *s = bch2_folio(folio);
+        unsigned i, sectors = folio_sectors(folio);
+
+        if (s)
+                for (i = folio_pos_to_s(folio, pos); i < sectors; i++)
+                        if (s->s[i].state >= SECTOR_dirty &&
+                            s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas)
+                                return i << SECTOR_SHIFT;
+
+        return -1;
+}
+
+loff_t bch2_seek_pagecache_data(struct inode *vinode,
+                                loff_t start_offset,
+                                loff_t end_offset,
+                                unsigned min_replicas,
+                                bool nonblock)
+{
+        struct folio_batch fbatch;
+        pgoff_t start_index = start_offset >> PAGE_SHIFT;
+        pgoff_t end_index = end_offset >> PAGE_SHIFT;
+        pgoff_t index = start_index;
+        unsigned i;
+        loff_t ret;
+        int offset;
+
+        folio_batch_init(&fbatch);
+
+        while (filemap_get_folios(vinode->i_mapping,
+                                  &index, end_index, &fbatch)) {
+                for (i = 0; i < folio_batch_count(&fbatch); i++) {
+                        struct folio *folio = fbatch.folios[i];
+
+                        if (!nonblock) {
+                                folio_lock(folio);
+                        } else if (!folio_trylock(folio)) {
+                                folio_batch_release(&fbatch);
+                                return -EAGAIN;
+                        }
+
+                        offset = folio_data_offset(folio,
+                                        max(folio_pos(folio), start_offset),
+                                        min_replicas);
+                        if (offset >= 0) {
+                                ret = clamp(folio_pos(folio) + offset,
+                                            start_offset, end_offset);
+                                folio_unlock(folio);
+                                folio_batch_release(&fbatch);
+                                return ret;
+                        }
+                        folio_unlock(folio);
+                }
+                folio_batch_release(&fbatch);
+                cond_resched();
+        }
+
+        return end_offset;
+}
+
+/*
+ * Search for a hole in a folio.
+ *
+ * The filemap layer returns -ENOENT if no folio exists, so reuse the same error
+ * code to indicate a pagecache hole exists at the returned offset. Otherwise
+ * return 0 if the folio is filled with data, or an error code. This function
+ * can return -EAGAIN if nonblock is specified.
+ */
+static int folio_hole_offset(struct address_space *mapping, loff_t *offset,
+                              unsigned min_replicas, bool nonblock)
+{
+        struct folio *folio;
+        struct bch_folio *s;
+        unsigned i, sectors;
+        int ret = -ENOENT;
+
+        folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT,
+                                    FGP_LOCK|(nonblock ? FGP_NOWAIT : 0), 0);
+        if (IS_ERR(folio))
+                return PTR_ERR(folio);
+
+        s = bch2_folio(folio);
+        if (!s)
+                goto unlock;
+
+        sectors = folio_sectors(folio);
+        for (i = folio_pos_to_s(folio, *offset); i < sectors; i++)
+                if (s->s[i].state < SECTOR_dirty ||
+                    s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) {
+                        *offset = max(*offset,
+                                      folio_pos(folio) + (i << SECTOR_SHIFT));
+                        goto unlock;
+                }
+
+        *offset = folio_end_pos(folio);
+        ret = 0;
+unlock:
+        folio_unlock(folio);
+        folio_put(folio);
+        return ret;
+}
+
+loff_t bch2_seek_pagecache_hole(struct inode *vinode,
+                                loff_t start_offset,
+                                loff_t end_offset,
+                                unsigned min_replicas,
+                                bool nonblock)
+{
+        struct address_space *mapping = vinode->i_mapping;
+        loff_t offset = start_offset;
+        loff_t ret = 0;
+
+        while (!ret && offset < end_offset)
+                ret = folio_hole_offset(mapping, &offset, min_replicas, nonblock);
+
+        if (ret && ret != -ENOENT)
+                return ret;
+        return min(offset, end_offset);
+}
+
+int bch2_clamp_data_hole(struct inode *inode,
+                         u64 *hole_start,
+                         u64 *hole_end,
+                         unsigned min_replicas,
+                         bool nonblock)
+{
+        loff_t ret;
+
+        ret = bch2_seek_pagecache_hole(inode,
+                *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
+        if (ret < 0)
+                return ret;
+
+        *hole_start = ret;
+
+        if (*hole_start == *hole_end)
+                return 0;
+
+        ret = bch2_seek_pagecache_data(inode,
+                *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
+        if (ret < 0)
+                return ret;
+
+        *hole_end = ret;
+        return 0;
+}
+
+#endif /* NO_BCACHEFS_FS */
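A few standalone sketches follow for readers tracing the accounting in this file. First, the sector-state helpers: folio_sector_dirty(), folio_sector_undirty() and folio_sector_reserve() form a small state machine over bch_folio_sector.state. Below is a minimal userspace model of those transitions, not kernel code; the enum ordering is inferred from folio_data_offset()'s ">= SECTOR_dirty" test (dirty, dirty_reserved and allocated sort above unallocated and reserved), and main() is purely illustrative.

#include <stdio.h>

/* Models the kernel's bch_folio_sector_state; ordering is an assumption,
 * chosen so that states >= SECTOR_dirty count as data for fseek. */
enum sector_state {
        SECTOR_unallocated,
        SECTOR_reserved,
        SECTOR_dirty,
        SECTOR_dirty_reserved,
        SECTOR_allocated,
};

static const char * const state_name[] = {
        "unallocated", "reserved", "dirty", "dirty_reserved", "allocated",
};

/* Same transitions as folio_sector_dirty() above: */
static enum sector_state sector_dirty(enum sector_state s)
{
        switch (s) {
        case SECTOR_unallocated:        return SECTOR_dirty;
        case SECTOR_reserved:           return SECTOR_dirty_reserved;
        default:                        return s;
        }
}

/* Same transitions as folio_sector_undirty(): */
static enum sector_state sector_undirty(enum sector_state s)
{
        switch (s) {
        case SECTOR_dirty:              return SECTOR_unallocated;
        case SECTOR_dirty_reserved:     return SECTOR_reserved;
        default:                        return s;
        }
}

/* Same transitions as folio_sector_reserve(): */
static enum sector_state sector_reserve(enum sector_state s)
{
        switch (s) {
        case SECTOR_unallocated:        return SECTOR_reserved;
        case SECTOR_dirty:              return SECTOR_dirty_reserved;
        default:                        return s;
        }
}

int main(void)
{
        /* reserve then dirty: unallocated -> reserved -> dirty_reserved */
        enum sector_state s = SECTOR_unallocated;

        s = sector_reserve(s);
        s = sector_dirty(s);
        printf("after reserve+dirty: %s\n", state_name[s]);

        /* undirtying a dirty_reserved sector keeps the reservation */
        s = sector_undirty(s);
        printf("after undirty:       %s\n", state_name[s]);
        return 0;
}

Note how undirtying never discards a reservation: writeback failure or folio teardown returns the sector to reserved, not unallocated, which is what lets bch2_clear_folio_bits() hand the space back to the disk-reservation pool.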
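In bch2_folio_set() and the two bch2_mark_pagecache_*() walkers, the folio_offset/folio_len pair is simply the intersection of the extent or request range with the folio's sector range, expressed relative to the folio start. A self-contained check of that arithmetic, using a hypothetical helper name of my own:

#include <stdio.h>

/* Intersect the half-open sector range [start, end) with a folio covering
 * [folio_start, folio_end); stores the overlap's offset relative to the
 * folio start and returns its length. Mirrors the folio_offset/folio_len
 * computations in bch2_folio_set(). */
static unsigned range_in_folio(unsigned long long start,
                               unsigned long long end,
                               unsigned long long folio_start,
                               unsigned long long folio_end,
                               unsigned *folio_offset)
{
        unsigned long long lo = start > folio_start ? start : folio_start;
        unsigned long long hi = end < folio_end ? end : folio_end;

        *folio_offset = lo - folio_start;
        /* equivalent to min(end, folio_end) - folio_offset - folio_start */
        return hi - lo;
}

int main(void)
{
        unsigned off;
        /* extent sectors [10, 100) against a folio of sectors [8, 16): */
        unsigned len = range_in_folio(10, 100, 8, 16, &off);

        printf("offset %u len %u\n", off, len);  /* offset 2 len 6 */
        return 0;
}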
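The reservation arithmetic is easiest to follow with concrete numbers: sectors_to_reserve() reports how many replica-sectors a sector still needs, bch2_folio_reservation_get() sums that over a range to size the up-front disk reservation, and bch2_set_folio_dirty() later moves the pooled reservation into per-sector replicas_reserved counts. A sketch of that flow under made-up values (the struct fields mirror bch_folio_sector; everything else is invented for the demo):

#include <stdio.h>

/* Mirrors the two bch_folio_sector fields used for accounting: */
struct folio_sector {
        unsigned nr_replicas;           /* replicas already on disk */
        unsigned replicas_reserved;     /* replicas reserved, not yet written */
};

/* Same arithmetic as the kernel's sectors_to_reserve(): how many more
 * replica-sectors must be reserved to reach nr_replicas in total. */
static unsigned sectors_to_reserve(const struct folio_sector *s,
                                   unsigned nr_replicas)
{
        int d = (int) nr_replicas - s->nr_replicas - s->replicas_reserved;
        return d > 0 ? d : 0;
}

int main(void)
{
        /* Target 2 replicas: one sector fully unallocated, one written
         * once already, one already fully covered: */
        struct folio_sector s[3] = {
                { .nr_replicas = 0, .replicas_reserved = 0 },
                { .nr_replicas = 1, .replicas_reserved = 0 },
                { .nr_replicas = 1, .replicas_reserved = 1 },
        };
        unsigned disk_sectors = 0;

        /* bch2_folio_reservation_get() phase: size the pooled reservation */
        for (int i = 0; i < 3; i++)
                disk_sectors += sectors_to_reserve(&s[i], 2);
        printf("disk_sectors = %u\n", disk_sectors);    /* 2 + 1 + 0 = 3 */

        /* bch2_set_folio_dirty() phase: move the pool into per-sector
         * replicas_reserved counts as sectors are dirtied */
        for (int i = 0; i < 3; i++) {
                unsigned got = sectors_to_reserve(&s[i], 2);

                s[i].replicas_reserved += got;
                disk_sectors -= got;
        }
        printf("remaining pool = %u\n", disk_sectors);  /* 0 */
        return 0;
}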
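Finally, the -ENOENT convention documented above folio_hole_offset() is what terminates the loop in bch2_seek_pagecache_hole(): a missing folio is itself a hole, so the scan stops there and the result is clamped to the search range. A toy model of that control flow, ignoring locking, partial folios and the nonblock/-EAGAIN path; the one-flag-per-page "pagecache" and the names are invented:

#include <stdio.h>
#include <errno.h>

#define PAGE_SIZE 4096
#define NR_PAGES  8

/* Toy pagecache: page_has_data[i] says whether page i is all data.
 * A 0 models "no folio in the pagecache at this index". */
static const int page_has_data[NR_PAGES] = { 1, 1, 0, 1, 0, 0, 1, 1 };

/* Models folio_hole_offset(): -ENOENT if no folio (a hole by the seek
 * convention), 0 if the folio is all data, advancing *offset past it. */
static int page_hole_offset(long *offset)
{
        long idx = *offset / PAGE_SIZE;

        if (idx >= NR_PAGES || !page_has_data[idx])
                return -ENOENT;

        *offset = (idx + 1) * PAGE_SIZE;        /* folio_end_pos() */
        return 0;
}

/* Models the loop in bch2_seek_pagecache_hole(): walk forward until a
 * hole is reported, then clamp to the end of the search range. */
static long seek_hole(long start, long end)
{
        long offset = start;
        int ret = 0;

        while (!ret && offset < end)
                ret = page_hole_offset(&offset);

        return offset < end ? offset : end;
}

int main(void)
{
        printf("hole from 0:    %ld\n", seek_hole(0, NR_PAGES * PAGE_SIZE));
        printf("hole from 3*4k: %ld\n",
               seek_hole(3 * PAGE_SIZE, NR_PAGES * PAGE_SIZE));
        return 0;
}

The first call reports 8192 (page 2 is absent), the second 16384; bch2_clamp_data_hole() then runs the mirror-image data scan to trim the other end of the candidate hole.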