diff options
Diffstat (limited to 'fs/bcachefs/fs.c')
-rw-r--r-- | fs/bcachefs/fs.c | 351 |
1 files changed, 232 insertions, 119 deletions
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 77ae65542d..fa1fee05cf 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -90,7 +90,7 @@ retry: bch2_trans_begin(trans); ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), - BTREE_ITER_INTENT) ?: + BTREE_ITER_intent) ?: (set ? set(trans, inode, &inode_u, p) : 0) ?: bch2_inode_write(trans, &iter, &inode_u) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); @@ -108,7 +108,8 @@ retry: goto retry; bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c, - "inode %u:%llu not found when updating", + "%s: inode %u:%llu not found when updating", + bch2_err_str(ret), inode_inum(inode).subvol, inode_inum(inode).inum); @@ -176,45 +177,130 @@ static unsigned bch2_inode_hash(subvol_inum inum) return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL); } -struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) +static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode) { - struct bch_inode_unpacked inode_u; - struct bch_inode_info *inode; - struct btree_trans *trans; - struct bch_subvolume subvol; - int ret; + subvol_inum inum = inode_inum(inode); + struct bch_inode_info *old = to_bch_ei(inode_insert5(&inode->v, + bch2_inode_hash(inum), + bch2_iget5_test, + bch2_iget5_set, + &inum)); + BUG_ON(!old); - inode = to_bch_ei(iget5_locked(c->vfs_sb, - bch2_inode_hash(inum), - bch2_iget5_test, - bch2_iget5_set, - &inum)); - if (unlikely(!inode)) - return ERR_PTR(-ENOMEM); - if (!(inode->v.i_state & I_NEW)) - return &inode->v; + if (unlikely(old != inode)) { + /* + * bcachefs doesn't use I_NEW; we have no use for it since we + * only insert fully created inodes in the inode hash table. But + * discard_new_inode() expects it to be set... + */ + inode->v.i_flags |= I_NEW; + /* + * We don't want bch2_evict_inode() to delete the inode on disk, + * we just raced and had another inode in cache. Normally new + * inodes don't have nlink == 0 - except tmpfiles do... + */ + set_nlink(&inode->v, 1); + discard_new_inode(&inode->v); + inode = old; + } else { + mutex_lock(&c->vfs_inodes_lock); + list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); + mutex_unlock(&c->vfs_inodes_lock); + /* + * Again, I_NEW makes no sense for bcachefs. This is only needed + * for clearing I_NEW, but since the inode was already fully + * created and initialized we didn't actually want + * inode_insert5() to set it for us. + */ + unlock_new_inode(&inode->v); + } - trans = bch2_trans_get(c); - ret = lockrestart_do(trans, - bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: - bch2_inode_find_by_inum_trans(trans, inum, &inode_u)); + return inode; +} - if (!ret) - bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); - bch2_trans_put(trans); +#define memalloc_flags_do(_flags, _do) \ +({ \ + unsigned _saved_flags = memalloc_flags_save(_flags); \ + typeof(_do) _ret = _do; \ + memalloc_noreclaim_restore(_saved_flags); \ + _ret; \ +}) - if (ret) { - iget_failed(&inode->v); - return ERR_PTR(bch2_err_class(ret)); +static struct inode *bch2_alloc_inode(struct super_block *sb) +{ + BUG(); +} + +static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c) +{ + struct bch_inode_info *inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS); + if (!inode) + return NULL; + + inode_init_once(&inode->v); + mutex_init(&inode->ei_update_lock); + two_state_lock_init(&inode->ei_pagecache_lock); + INIT_LIST_HEAD(&inode->ei_vfs_inode_list); + inode->ei_flags = 0; + mutex_init(&inode->ei_quota_lock); + memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); + inode->v.i_state = 0; + + if (unlikely(inode_init_always(c->vfs_sb, &inode->v))) { + kmem_cache_free(bch2_inode_cache, inode); + return NULL; } - mutex_lock(&c->vfs_inodes_lock); - list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); - mutex_unlock(&c->vfs_inodes_lock); + return inode; +} + +/* + * Allocate a new inode, dropping/retaking btree locks if necessary: + */ +static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans) +{ + struct bch_inode_info *inode = + memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN, + __bch2_new_inode(trans->c)); + + if (unlikely(!inode)) { + int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c)) ? 0 : -ENOMEM); + if (ret && inode) { + __destroy_inode(&inode->v); + kmem_cache_free(bch2_inode_cache, inode); + } + if (ret) + return ERR_PTR(ret); + } + + return inode; +} + +struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) +{ + struct bch_inode_info *inode = + to_bch_ei(ilookup5_nowait(c->vfs_sb, + bch2_inode_hash(inum), + bch2_iget5_test, + &inum)); + if (inode) + return &inode->v; + + struct btree_trans *trans = bch2_trans_get(c); - unlock_new_inode(&inode->v); + struct bch_inode_unpacked inode_u; + struct bch_subvolume subvol; + int ret = lockrestart_do(trans, + bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: + bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?: + PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans)); + if (!ret) { + bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); + inode = bch2_inode_insert(c, inode); + } + bch2_trans_put(trans); - return &inode->v; + return ret ? ERR_PTR(ret) : &inode->v; } struct bch_inode_info * @@ -226,7 +312,7 @@ __bch2_create(struct mnt_idmap *idmap, struct bch_fs *c = dir->v.i_sb->s_fs_info; struct btree_trans *trans; struct bch_inode_unpacked dir_u; - struct bch_inode_info *inode, *old; + struct bch_inode_info *inode; struct bch_inode_unpacked inode_u; struct posix_acl *default_acl = NULL, *acl = NULL; subvol_inum inum; @@ -243,7 +329,7 @@ __bch2_create(struct mnt_idmap *idmap, if (ret) return ERR_PTR(ret); #endif - inode = to_bch_ei(new_inode(c->vfs_sb)); + inode = __bch2_new_inode(c); if (unlikely(!inode)) { inode = ERR_PTR(-ENOMEM); goto err; @@ -276,7 +362,7 @@ retry: inum.inum = inode_u.bi_inum; ret = bch2_subvolume_get(trans, inum.subvol, true, - BTREE_ITER_WITH_UPDATES, &subvol) ?: + BTREE_ITER_with_updates, &subvol) ?: bch2_trans_commit(trans, NULL, &journal_seq, 0); if (unlikely(ret)) { bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, @@ -293,7 +379,6 @@ err_before_quota: mutex_unlock(&dir->ei_update_lock); } - bch2_iget5_set(&inode->v, &inum); bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); @@ -304,36 +389,7 @@ err_before_quota: * bch2_trans_exit() and dropping locks, else we could race with another * thread pulling the inode in and modifying it: */ - - inode->v.i_state |= I_CREATING; - - old = to_bch_ei(inode_insert5(&inode->v, - bch2_inode_hash(inum), - bch2_iget5_test, - bch2_iget5_set, - &inum)); - BUG_ON(!old); - - if (unlikely(old != inode)) { - /* - * We raced, another process pulled the new inode into cache - * before us: - */ - make_bad_inode(&inode->v); - iput(&inode->v); - - inode = old; - } else { - mutex_lock(&c->vfs_inodes_lock); - list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); - mutex_unlock(&c->vfs_inodes_lock); - /* - * we really don't want insert_inode_locked2() to be setting - * I_NEW... - */ - unlock_new_inode(&inode->v); - } - + inode = bch2_inode_insert(c, inode); bch2_trans_put(trans); err: posix_acl_release(default_acl); @@ -352,23 +408,86 @@ err_trans: /* methods */ +static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, + subvol_inum dir, struct bch_hash_info *dir_hash_info, + const struct qstr *name) +{ + struct bch_fs *c = trans->c; + struct btree_iter dirent_iter = {}; + subvol_inum inum = {}; + struct printbuf buf = PRINTBUF; + + struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc, + dir_hash_info, dir, name, 0); + int ret = bkey_err(k); + if (ret) + return ERR_PTR(ret); + + ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum); + if (ret > 0) + ret = -ENOENT; + if (ret) + goto err; + + struct bch_inode_info *inode = + to_bch_ei(ilookup5_nowait(c->vfs_sb, + bch2_inode_hash(inum), + bch2_iget5_test, + &inum)); + if (inode) + goto out; + + struct bch_subvolume subvol; + struct bch_inode_unpacked inode_u; + ret = bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: + bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?: + PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans)); + + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), + c, "dirent to missing inode:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + if (ret) + goto err; + + /* regular files may have hardlinks: */ + if (bch2_fs_inconsistent_on(bch2_inode_should_have_bp(&inode_u) && + !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)), + c, + "dirent points to inode that does not point back:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), + prt_printf(&buf, "\n "), + bch2_inode_unpacked_to_text(&buf, &inode_u), + buf.buf))) { + ret = -ENOENT; + goto err; + } + + bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); + inode = bch2_inode_insert(c, inode); +out: + bch2_trans_iter_exit(trans, &dirent_iter); + printbuf_exit(&buf); + return inode; +err: + inode = ERR_PTR(ret); + goto out; +} + static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, unsigned int flags) { struct bch_fs *c = vdir->i_sb->s_fs_info; struct bch_inode_info *dir = to_bch_ei(vdir); struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode); - struct inode *vinode = NULL; - subvol_inum inum = { .subvol = 1 }; - int ret; - - ret = bch2_dirent_lookup(c, inode_inum(dir), &hash, - &dentry->d_name, &inum); - if (!ret) - vinode = bch2_vfs_inode_get(c, inum); + struct bch_inode_info *inode; + bch2_trans_do(c, NULL, NULL, 0, + PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir), + &hash, &dentry->d_name))); + if (IS_ERR(inode)) + inode = NULL; - return d_splice_alias(vinode, dentry); + return d_splice_alias(&inode->v, dentry); } static int bch2_mknod(struct mnt_idmap *idmap, @@ -715,7 +834,7 @@ retry: acl = NULL; ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode), - BTREE_ITER_INTENT); + BTREE_ITER_intent); if (ret) goto btree_err; @@ -772,6 +891,9 @@ static int bch2_getattr(struct mnt_idmap *idmap, stat->blksize = block_bytes(c); stat->blocks = inode->v.i_blocks; + stat->subvol = inode->ei_subvol; + stat->result_mask |= STATX_SUBVOL; + if (request_mask & STATX_BTIME) { stat->result_mask |= STATX_BTIME; stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime); @@ -892,7 +1014,6 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, struct btree_iter iter; struct bkey_s_c k; struct bkey_buf cur, prev; - struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); unsigned offset_into_extent, sectors; bool have_extent = false; u32 snapshot; @@ -902,6 +1023,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, if (ret) return ret; + struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); if (start + len < start) return -EINVAL; @@ -968,6 +1090,10 @@ retry: bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, iter.pos.offset + sectors)); + + ret = bch2_trans_relock(trans); + if (ret) + break; } start = iter.pos.offset; bch2_trans_iter_exit(trans, &iter); @@ -1034,6 +1160,8 @@ static int bch2_open(struct inode *vinode, struct file *file) return ret; } + file->f_mode |= FMODE_CAN_ODIRECT; + return generic_file_open(vinode, file); } @@ -1043,6 +1171,7 @@ static const struct file_operations bch_file_operations = { .read_iter = bch2_read_iter, .write_iter = bch2_write_iter, .mmap = bch2_mmap, + .get_unmapped_area = thp_get_unmapped_area, .fsync = bch2_fsync, .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, @@ -1126,7 +1255,6 @@ static const struct address_space_operations bch_address_space_operations = { .write_end = bch2_write_end, .invalidate_folio = bch2_invalidate_folio, .release_folio = bch2_release_folio, - .direct_IO = noop_direct_IO, #ifdef CONFIG_MIGRATION .migrate_folio = filemap_migrate_folio, #endif @@ -1372,13 +1500,9 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, struct bch_inode_unpacked *bi, struct bch_subvolume *subvol) { + bch2_iget5_set(&inode->v, &inum); bch2_inode_update_after_write(trans, inode, bi, ~0); - if (BCH_SUBVOLUME_SNAP(subvol)) - set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); - else - clear_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); - inode->v.i_blocks = bi->bi_sectors; inode->v.i_ino = bi->bi_inum; inode->v.i_rdev = bi->bi_dev; @@ -1390,6 +1514,9 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, inode->ei_qid = bch_qid(bi); inode->ei_subvol = inum.subvol; + if (BCH_SUBVOLUME_SNAP(subvol)) + set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); + inode->v.i_mapping->a_ops = &bch_address_space_operations; switch (inode->v.i_mode & S_IFMT) { @@ -1414,34 +1541,9 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, mapping_set_large_folios(inode->v.i_mapping); } -static struct inode *bch2_alloc_inode(struct super_block *sb) -{ - struct bch_inode_info *inode; - - inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS); - if (!inode) - return NULL; - - inode_init_once(&inode->v); - mutex_init(&inode->ei_update_lock); - two_state_lock_init(&inode->ei_pagecache_lock); - INIT_LIST_HEAD(&inode->ei_vfs_inode_list); - mutex_init(&inode->ei_quota_lock); - - return &inode->v; -} - -static void bch2_i_callback(struct rcu_head *head) +static void bch2_free_inode(struct inode *vinode) { - struct inode *vinode = container_of(head, struct inode, i_rcu); - struct bch_inode_info *inode = to_bch_ei(vinode); - - kmem_cache_free(bch2_inode_cache, inode); -} - -static void bch2_destroy_inode(struct inode *vinode) -{ - call_rcu(&vinode->i_rcu, bch2_i_callback); + kmem_cache_free(bch2_inode_cache, to_bch_ei(vinode)); } static int inode_update_times_fn(struct btree_trans *trans, @@ -1572,7 +1674,6 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) * number: */ u64 avail_inodes = ((usage.capacity - usage.used) << 3); - u64 fsid; buf->f_type = BCACHEFS_STATFS_MAGIC; buf->f_bsize = sb->s_blocksize; @@ -1583,10 +1684,7 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = usage.nr_inodes + avail_inodes; buf->f_ffree = avail_inodes; - fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ - le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64)); - buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; - buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; + buf->f_fsid = uuid_to_fsid(c->sb.user_uuid.b); buf->f_namelen = BCH_NAME_MAX; return 0; @@ -1753,7 +1851,7 @@ static int bch2_unfreeze(struct super_block *sb) static const struct super_operations bch_super_operations = { .alloc_inode = bch2_alloc_inode, - .destroy_inode = bch2_destroy_inode, + .free_inode = bch2_free_inode, .write_inode = bch2_vfs_write_inode, .evict_inode = bch2_evict_inode, .sync_fs = bch2_sync_fs, @@ -1805,8 +1903,10 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, opt_set(opts, read_only, (flags & SB_RDONLY) != 0); ret = bch2_parse_mount_opts(NULL, &opts, data); - if (ret) + if (ret) { + ret = bch2_err_class(ret); return ERR_PTR(ret); + } if (!dev_name || strlen(dev_name) == 0) return ERR_PTR(-EINVAL); @@ -1854,8 +1954,7 @@ got_sb: if (IS_ERR(sb)) { ret = PTR_ERR(sb); - ret = bch2_err_class(ret); - return ERR_PTR(ret); + goto err; } c = sb->s_fs_info; @@ -1882,6 +1981,8 @@ got_sb: sb->s_time_gran = c->sb.nsec_per_time_unit; sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); + sb->s_uuid = c->sb.user_uuid; + sb->s_shrink->seeks = 0; c->vfs_sb = sb; strscpy(sb->s_id, c->name, sizeof(sb->s_id)); @@ -1928,7 +2029,19 @@ out: return dget(sb->s_root); err_put_super: + __bch2_fs_stop(c); deactivate_locked_super(sb); +err: + if (ret) + pr_err("error: %s", bch2_err_str(ret)); + /* + * On an inconsistency error in recovery we might see an -EROFS derived + * errorcode (from the journal), but we don't want to return that to + * userspace as that causes util-linux to retry the mount RO - which is + * confusing: + */ + if (bch2_err_matches(ret, EROFS) && ret != -EROFS) + ret = -EIO; return ERR_PTR(bch2_err_class(ret)); } |