diff options
Diffstat (limited to 'fs/bcachefs')
135 files changed, 6026 insertions, 5386 deletions
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 5c180fdc3e..250d6c6d3a 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -282,18 +282,12 @@ struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap, struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; struct posix_acl *acl = NULL; - struct bkey_s_c k; - int ret; retry: bch2_trans_begin(trans); - ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, - &hash, inode_inum(inode), &search, 0); - if (ret) - goto err; - - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); + struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, + &hash, inode_inum(inode), &search, 0); + int ret = bkey_err(k); if (ret) goto err; @@ -366,7 +360,7 @@ retry: ret = bch2_subvol_is_ro_trans(trans, inode->ei_subvol) ?: bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode), - BTREE_ITER_INTENT); + BTREE_ITER_intent); if (ret) goto btree_err; @@ -414,39 +408,30 @@ int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode); struct xattr_search_key search = X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0); struct btree_iter iter; - struct bkey_s_c_xattr xattr; - struct bkey_i_xattr *new; struct posix_acl *acl = NULL; - struct bkey_s_c k; - int ret; - ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, - &hash_info, inum, &search, BTREE_ITER_INTENT); + struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, + &hash_info, inum, &search, BTREE_ITER_intent); + int ret = bkey_err(k); if (ret) return bch2_err_matches(ret, ENOENT) ? 
0 : ret; - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto err; - xattr = bkey_s_c_to_xattr(k); + struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); acl = bch2_acl_from_disk(trans, xattr_val(xattr.v), le16_to_cpu(xattr.v->x_val_len)); ret = PTR_ERR_OR_ZERO(acl); - if (IS_ERR_OR_NULL(acl)) + if (ret) goto err; - ret = allocate_dropping_locks_errcode(trans, - __posix_acl_chmod(&acl, _gfp, mode)); + ret = allocate_dropping_locks_errcode(trans, __posix_acl_chmod(&acl, _gfp, mode)); if (ret) goto err; - new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS); - if (IS_ERR(new)) { - ret = PTR_ERR(new); + struct bkey_i_xattr *new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS); + ret = PTR_ERR_OR_ZERO(new); + if (ret) goto err; - } new->k.p = iter.pos; ret = bch2_trans_update(trans, &iter, &new->k_i, 0); diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 534ba2b02b..658f11aebd 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -3,6 +3,7 @@ #include "alloc_background.h" #include "alloc_foreground.h" #include "backpointers.h" +#include "bkey_buf.h" #include "btree_cache.h" #include "btree_io.h" #include "btree_key_cache.h" @@ -29,7 +30,7 @@ #include <linux/sched/task.h> #include <linux/sort.h> -static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket); +static void bch2_discard_one_bucket_fast(struct bch_dev *, u64); /* Persistent alloc info: */ @@ -195,7 +196,7 @@ static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) } int bch2_alloc_v1_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); @@ -211,7 +212,7 @@ fsck_err: } int bch2_alloc_v2_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_alloc_unpacked u; @@ -225,7 +226,7 @@ 
fsck_err: } int bch2_alloc_v3_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_alloc_unpacked u; @@ -239,7 +240,7 @@ fsck_err: } int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); int ret = 0; @@ -259,11 +260,19 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, "invalid data type (got %u should be %u)", a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); + for (unsigned i = 0; i < 2; i++) + bkey_fsck_err_on(a.v->io_time[i] > LRU_TIME_MAX, + c, err, + alloc_key_io_time_bad, + "invalid io_time[%s]: %llu, max %llu", + i == READ ? "read" : "write", + a.v->io_time[i], LRU_TIME_MAX); + switch (a.v->data_type) { case BCH_DATA_free: case BCH_DATA_need_gc_gens: case BCH_DATA_need_discard: - bkey_fsck_err_on(bch2_bucket_sectors(*a.v) || a.v->stripe, + bkey_fsck_err_on(bch2_bucket_sectors_total(*a.v) || a.v->stripe, c, err, alloc_key_empty_but_have_data, "empty data type free but have data"); break; @@ -330,27 +339,17 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen); bch2_prt_data_type(out, a->data_type); prt_newline(out); - prt_printf(out, "journal_seq %llu", a->journal_seq); - prt_newline(out); - prt_printf(out, "need_discard %llu", BCH_ALLOC_V4_NEED_DISCARD(a)); - prt_newline(out); - prt_printf(out, "need_inc_gen %llu", BCH_ALLOC_V4_NEED_INC_GEN(a)); - prt_newline(out); - prt_printf(out, "dirty_sectors %u", a->dirty_sectors); - prt_newline(out); - prt_printf(out, "cached_sectors %u", a->cached_sectors); - prt_newline(out); - prt_printf(out, "stripe %u", a->stripe); - prt_newline(out); - prt_printf(out, "stripe_redundancy %u", a->stripe_redundancy); - prt_newline(out); - 
prt_printf(out, "io_time[READ] %llu", a->io_time[READ]); - prt_newline(out); - prt_printf(out, "io_time[WRITE] %llu", a->io_time[WRITE]); - prt_newline(out); - prt_printf(out, "fragmentation %llu", a->fragmentation_lru); - prt_newline(out); - prt_printf(out, "bp_start %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a)); + prt_printf(out, "journal_seq %llu\n", a->journal_seq); + prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a)); + prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a)); + prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors); + prt_printf(out, "cached_sectors %u\n", a->cached_sectors); + prt_printf(out, "stripe %u\n", a->stripe); + prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy); + prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]); + prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]); + prt_printf(out, "fragmentation %llu\n", a->fragmentation_lru); + prt_printf(out, "bp_start %llu\n", BCH_ALLOC_V4_BACKPOINTERS_START(a)); printbuf_indent_sub(out, 2); } @@ -439,22 +438,18 @@ struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct b } struct bkey_i_alloc_v4 * -bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, - struct bpos pos) +bch2_trans_start_alloc_update_noupdate(struct btree_trans *trans, struct btree_iter *iter, + struct bpos pos) { - struct bkey_s_c k; - struct bkey_i_alloc_v4 *a; - int ret; - - k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos, - BTREE_ITER_WITH_UPDATES| - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); - ret = bkey_err(k); + struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos, + BTREE_ITER_with_updates| + BTREE_ITER_cached| + BTREE_ITER_intent); + int ret = bkey_err(k); if (unlikely(ret)) return ERR_PTR(ret); - a = bch2_alloc_to_v4_mut_inlined(trans, k); + struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k); ret = PTR_ERR_OR_ZERO(a); if (unlikely(ret)) goto err; @@ -464,6 +459,20 @@ 
err: return ERR_PTR(ret); } +__flatten +struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos) +{ + struct btree_iter iter; + struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, pos); + int ret = PTR_ERR_OR_ZERO(a); + if (ret) + return ERR_PTR(ret); + + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); + bch2_trans_iter_exit(trans, &iter); + return unlikely(ret) ? ERR_PTR(ret) : a; +} + static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset) { *offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK; @@ -487,7 +496,7 @@ static unsigned alloc_gen(struct bkey_s_c k, unsigned offset) } int bch2_bucket_gens_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { int ret = 0; @@ -520,7 +529,7 @@ int bch2_bucket_gens_init(struct bch_fs *c) int ret; ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ({ + BTREE_ITER_prefetch, k, ({ /* * Not a fsck error because this is checked/repaired by * bch2_check_alloc_key() which runs later: @@ -567,29 +576,31 @@ iter_err: int bch2_alloc_read(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); + struct bch_dev *ca = NULL; int ret; down_read(&c->gc_lock); if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) { ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN, - BTREE_ITER_PREFETCH, k, ({ + BTREE_ITER_prefetch, k, ({ u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; if (k.k->type != KEY_TYPE_bucket_gens) continue; - const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v; - + ca = bch2_dev_iterate(c, ca, k.k->p.inode); /* * Not a fsck error because this is checked/repaired by * bch2_check_alloc_key() which runs later: */ - if (!bch2_dev_exists2(c, k.k->p.inode)) + if (!ca) { + 
bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); continue; + } - struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode); + const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v; for (u64 b = max_t(u64, ca->mi.first_bucket, start); b < min_t(u64, ca->mi.nbuckets, end); @@ -599,15 +610,16 @@ int bch2_alloc_read(struct bch_fs *c) })); } else { ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ({ + BTREE_ITER_prefetch, k, ({ + ca = bch2_dev_iterate(c, ca, k.k->p.inode); /* * Not a fsck error because this is checked/repaired by * bch2_check_alloc_key() which runs later: */ - if (!bch2_dev_bucket_exists(c, k.k->p)) + if (!ca) { + bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); continue; - - struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode); + } struct bch_alloc_v4 a; *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen; @@ -615,6 +627,7 @@ int bch2_alloc_read(struct bch_fs *c) })); } + bch2_dev_put(ca); bch2_trans_put(trans); up_read(&c->gc_lock); @@ -625,12 +638,12 @@ int bch2_alloc_read(struct bch_fs *c) /* Free space/discard btree: */ static int bch2_bucket_do_index(struct btree_trans *trans, + struct bch_dev *ca, struct bkey_s_c alloc_k, const struct bch_alloc_v4 *a, bool set) { struct bch_fs *c = trans->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode); struct btree_iter iter; struct bkey_s_c old; struct bkey_i *k; @@ -667,7 +680,7 @@ static int bch2_bucket_do_index(struct btree_trans *trans, old = bch2_bkey_get_iter(trans, &iter, btree, bkey_start_pos(&k->k), - BTREE_ITER_INTENT); + BTREE_ITER_intent); ret = bkey_err(old); if (ret) return ret; @@ -711,8 +724,8 @@ static noinline int bch2_bucket_gen_update(struct btree_trans *trans, return ret; k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos, - BTREE_ITER_INTENT| - BTREE_ITER_WITH_UPDATES); + BTREE_ITER_intent| + BTREE_ITER_with_updates); ret = bkey_err(k); if (ret) return ret; @@ -734,28 +747,27 
@@ static noinline int bch2_bucket_gen_update(struct btree_trans *trans, int bch2_trigger_alloc(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; int ret = 0; - if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans, - "alloc key for invalid device or bucket")) + struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p); + if (!ca) return -EIO; - struct bch_dev *ca = bch_dev_bkey_exists(c, new.k->p.inode); - struct bch_alloc_v4 old_a_convert; const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert); - if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + if (flags & BTREE_TRIGGER_transactional) { struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v; - new_a->data_type = alloc_data_type(*new_a, new_a->data_type); + alloc_data_type_set(new_a, new_a->data_type); - if (bch2_bucket_sectors(*new_a) > bch2_bucket_sectors(*old_a)) { - new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); - new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); + if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) { + new_a->io_time[READ] = bch2_current_io_time(c, READ); + new_a->io_time[WRITE]= bch2_current_io_time(c, WRITE); SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true); } @@ -765,20 +777,21 @@ int bch2_trigger_alloc(struct btree_trans *trans, !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) { new_a->gen++; SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); + alloc_data_type_set(new_a, new_a->data_type); } if (old_a->data_type != new_a->data_type || (new_a->data_type == BCH_DATA_free && alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) { - ret = bch2_bucket_do_index(trans, old, old_a, false) ?: - bch2_bucket_do_index(trans, new.s_c, new_a, true); 
+ ret = bch2_bucket_do_index(trans, ca, old, old_a, false) ?: + bch2_bucket_do_index(trans, ca, new.s_c, new_a, true); if (ret) - return ret; + goto err; } if (new_a->data_type == BCH_DATA_cached && !new_a->io_time[READ]) - new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); + new_a->io_time[READ] = bch2_current_io_time(c, READ); u64 old_lru = alloc_lru_idx_read(*old_a); u64 new_lru = alloc_lru_idx_read(*new_a); @@ -787,24 +800,23 @@ int bch2_trigger_alloc(struct btree_trans *trans, bucket_to_u64(new.k->p), old_lru, new_lru); if (ret) - return ret; + goto err; } - new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a, - bch_dev_bkey_exists(c, new.k->p.inode)); + new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a, ca); if (old_a->fragmentation_lru != new_a->fragmentation_lru) { ret = bch2_lru_change(trans, BCH_LRU_FRAGMENTATION_START, bucket_to_u64(new.k->p), old_a->fragmentation_lru, new_a->fragmentation_lru); if (ret) - return ret; + goto err; } if (old_a->gen != new_a->gen) { ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen); if (ret) - return ret; + goto err; } /* @@ -812,21 +824,21 @@ int bch2_trigger_alloc(struct btree_trans *trans, * not: */ - if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && + if ((flags & BTREE_TRIGGER_bucket_invalidate) && old_a->cached_sectors) { ret = bch2_update_cached_sectors_list(trans, new.k->p.inode, -((s64) old_a->cached_sectors)); if (ret) - return ret; + goto err; } } - if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) { + if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v; u64 journal_seq = trans->journal_res.seq; u64 bucket_journal_seq = new_a->journal_seq; - if ((flags & BTREE_TRIGGER_INSERT) && + if ((flags & BTREE_TRIGGER_insert) && data_type_is_empty(old_a->data_type) != data_type_is_empty(new_a->data_type) && new.k->type == KEY_TYPE_alloc_v4) { @@ -854,13 +866,19 @@ int 
bch2_trigger_alloc(struct btree_trans *trans, if (ret) { bch2_fs_fatal_error(c, "setting bucket_needs_journal_commit: %s", bch2_err_str(ret)); - return ret; + goto err; } } percpu_down_read(&c->mark_lock); - if (new_a->gen != old_a->gen) - *bucket_gen(ca, new.k->p.offset) = new_a->gen; + if (new_a->gen != old_a->gen) { + u8 *gen = bucket_gen(ca, new.k->p.offset); + if (unlikely(!gen)) { + percpu_up_read(&c->mark_lock); + goto invalid_bucket; + } + *gen = new_a->gen; + } bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false); percpu_up_read(&c->mark_lock); @@ -874,26 +892,31 @@ int bch2_trigger_alloc(struct btree_trans *trans, closure_wake_up(&c->freelist_wait); if (statechange(a->data_type == BCH_DATA_need_discard) && - !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) && + !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset) && bucket_flushed(new_a)) - bch2_discard_one_bucket_fast(c, new.k->p); + bch2_discard_one_bucket_fast(ca, new.k->p.offset); if (statechange(a->data_type == BCH_DATA_cached) && !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) && should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) - bch2_do_invalidates(c); + bch2_dev_do_invalidates(ca); if (statechange(a->data_type == BCH_DATA_need_gc_gens)) - bch2_do_gc_gens(c); + bch2_gc_gens_async(c); } - if ((flags & BTREE_TRIGGER_GC) && - (flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) { + if ((flags & BTREE_TRIGGER_gc) && + (flags & BTREE_TRIGGER_bucket_invalidate)) { struct bch_alloc_v4 new_a_convert; const struct bch_alloc_v4 *new_a = bch2_alloc_to_v4(new.s_c, &new_a_convert); percpu_down_read(&c->mark_lock); struct bucket *g = gc_bucket(ca, new.k->p.offset); + if (unlikely(!g)) { + percpu_up_read(&c->mark_lock); + goto invalid_bucket; + } + g->gen_valid = 1; bucket_lock(g); @@ -908,12 +931,19 @@ int bch2_trigger_alloc(struct btree_trans *trans, bucket_unlock(g); percpu_up_read(&c->mark_lock); } - - return 0; +err: + printbuf_exit(&buf); + bch2_dev_put(ca); + return ret; 
+invalid_bucket: + bch2_fs_inconsistent(c, "reference to invalid bucket\n %s", + (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf)); + ret = -EIO; + goto err; } /* - * This synthesizes deleted extents for holes, similar to BTREE_ITER_SLOTS for + * This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for * extents style btrees, but works on non-extents btrees: */ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole) @@ -958,35 +988,34 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos } } -static bool next_bucket(struct bch_fs *c, struct bpos *bucket) +static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *bucket) { - struct bch_dev *ca; - - if (bch2_dev_bucket_exists(c, *bucket)) - return true; - - if (bch2_dev_exists2(c, bucket->inode)) { - ca = bch_dev_bkey_exists(c, bucket->inode); + if (*ca) { + if (bucket->offset < (*ca)->mi.first_bucket) + bucket->offset = (*ca)->mi.first_bucket; - if (bucket->offset < ca->mi.first_bucket) { - bucket->offset = ca->mi.first_bucket; + if (bucket->offset < (*ca)->mi.nbuckets) return true; - } + bch2_dev_put(*ca); + *ca = NULL; bucket->inode++; bucket->offset = 0; } rcu_read_lock(); - ca = __bch2_next_dev_idx(c, bucket->inode, NULL); - if (ca) - *bucket = POS(ca->dev_idx, ca->mi.first_bucket); + *ca = __bch2_next_dev_idx(c, bucket->inode, NULL); + if (*ca) { + *bucket = POS((*ca)->dev_idx, (*ca)->mi.first_bucket); + bch2_dev_get(*ca); + } rcu_read_unlock(); - return ca != NULL; + return *ca != NULL; } -static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, struct bkey *hole) +static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, + struct bch_dev **ca, struct bkey *hole) { struct bch_fs *c = iter->trans->c; struct bkey_s_c k; @@ -995,22 +1024,21 @@ again: if (bkey_err(k)) return k; + *ca = bch2_dev_iterate_noerror(c, *ca, k.k->p.inode); + if (!k.k->type) { 
- struct bpos bucket = bkey_start_pos(k.k); + struct bpos hole_start = bkey_start_pos(k.k); - if (!bch2_dev_bucket_exists(c, bucket)) { - if (!next_bucket(c, &bucket)) + if (!*ca || !bucket_valid(*ca, hole_start.offset)) { + if (!next_bucket(c, ca, &hole_start)) return bkey_s_c_null; - bch2_btree_iter_set_pos(iter, bucket); + bch2_btree_iter_set_pos(iter, hole_start); goto again; } - if (!bch2_dev_bucket_exists(c, k.k->p)) { - struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); - - bch2_key_resize(hole, ca->mi.nbuckets - bucket.offset); - } + if (k.k->p.offset > (*ca)->mi.nbuckets) + bch2_key_resize(hole, (*ca)->mi.nbuckets - hole_start.offset); } return k; @@ -1025,24 +1053,25 @@ int bch2_check_alloc_key(struct btree_trans *trans, struct btree_iter *bucket_gens_iter) { struct bch_fs *c = trans->c; - struct bch_dev *ca; struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a; unsigned discard_key_type, freespace_key_type; unsigned gens_offset; struct bkey_s_c k; struct printbuf buf = PRINTBUF; - int ret; + int ret = 0; - if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c, - alloc_key_to_missing_dev_bucket, + struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_k.k->p); + if (fsck_err_on(!ca, + c, alloc_key_to_missing_dev_bucket, "alloc key for invalid device:bucket %llu:%llu", alloc_k.k->p.inode, alloc_k.k->p.offset)) - return bch2_btree_delete_at(trans, alloc_iter, 0); + ret = bch2_btree_delete_at(trans, alloc_iter, 0); + if (!ca) + return ret; - ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode); if (!ca->mi.freespace_initialized) - return 0; + goto out; a = bch2_alloc_to_v4(alloc_k, &a_convert); @@ -1141,25 +1170,26 @@ int bch2_check_alloc_key(struct btree_trans *trans, if (ret) goto err; } +out: err: fsck_err: + bch2_dev_put(ca); printbuf_exit(&buf); return ret; } static noinline_for_stack int bch2_check_alloc_hole_freespace(struct btree_trans *trans, + struct bch_dev *ca, struct bpos start, struct bpos *end, struct btree_iter 
*freespace_iter) { struct bch_fs *c = trans->c; - struct bch_dev *ca; struct bkey_s_c k; struct printbuf buf = PRINTBUF; int ret; - ca = bch_dev_bkey_exists(c, start.inode); if (!ca->mi.freespace_initialized) return 0; @@ -1313,7 +1343,7 @@ static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_tran goto delete; out: fsck_err: - set_btree_iter_dontneed(&alloc_iter); + bch2_set_btree_iter_dontneed(&alloc_iter); bch2_trans_iter_exit(trans, &alloc_iter); printbuf_exit(&buf); return ret; @@ -1337,30 +1367,25 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bkey_i_bucket_gens g; - struct bch_dev *ca; u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; u64 b; - bool need_update = false, dev_exists; + bool need_update = false; struct printbuf buf = PRINTBUF; int ret = 0; BUG_ON(k.k->type != KEY_TYPE_bucket_gens); bkey_reassemble(&g.k_i, k); - /* if no bch_dev, skip out whether we repair or not */ - dev_exists = bch2_dev_exists2(c, k.k->p.inode); - if (!dev_exists) { - if (fsck_err_on(!dev_exists, c, - bucket_gens_to_invalid_dev, - "bucket_gens key for invalid device:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode); + if (!ca) { + if (fsck_err(c, bucket_gens_to_invalid_dev, + "bucket_gens key for invalid device:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = bch2_btree_delete_at(trans, iter, 0); - } goto out; } - ca = bch_dev_bkey_exists(c, k.k->p.inode); if (fsck_err_on(end <= ca->mi.first_bucket || start >= ca->mi.nbuckets, c, bucket_gens_to_invalid_buckets, @@ -1398,6 +1423,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans, } out: fsck_err: + bch2_dev_put(ca); printbuf_exit(&buf); return ret; } @@ -1406,25 +1432,26 @@ int bch2_check_alloc_info(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); struct 
btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter; + struct bch_dev *ca = NULL; struct bkey hole; struct bkey_s_c k; int ret = 0; bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH); + BTREE_ITER_prefetch); bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN, - BTREE_ITER_PREFETCH); + BTREE_ITER_prefetch); bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN, - BTREE_ITER_PREFETCH); + BTREE_ITER_prefetch); bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN, - BTREE_ITER_PREFETCH); + BTREE_ITER_prefetch); while (1) { struct bpos next; bch2_trans_begin(trans); - k = bch2_get_key_or_real_bucket_hole(&iter, &hole); + k = bch2_get_key_or_real_bucket_hole(&iter, &ca, &hole); ret = bkey_err(k); if (ret) goto bkey_err; @@ -1445,7 +1472,7 @@ int bch2_check_alloc_info(struct bch_fs *c) } else { next = k.k->p; - ret = bch2_check_alloc_hole_freespace(trans, + ret = bch2_check_alloc_hole_freespace(trans, ca, bkey_start_pos(k.k), &next, &freespace_iter) ?: @@ -1473,19 +1500,21 @@ bkey_err: bch2_trans_iter_exit(trans, &freespace_iter); bch2_trans_iter_exit(trans, &discard_iter); bch2_trans_iter_exit(trans, &iter); + bch2_dev_put(ca); + ca = NULL; if (ret < 0) goto err; ret = for_each_btree_key(trans, iter, BTREE_ID_need_discard, POS_MIN, - BTREE_ITER_PREFETCH, k, + BTREE_ITER_prefetch, k, bch2_check_discard_freespace_key(trans, &iter)); if (ret) goto err; bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN, - BTREE_ITER_PREFETCH); + BTREE_ITER_prefetch); while (1) { bch2_trans_begin(trans); k = bch2_btree_iter_peek(&iter); @@ -1515,7 +1544,7 @@ bkey_err: ret = for_each_btree_key_commit(trans, iter, BTREE_ID_bucket_gens, POS_MIN, - BTREE_ITER_PREFETCH, k, + BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_check_bucket_gens_key(trans, &iter, k)); err: @@ -1525,13 +1554,13 @@ err: } static int bch2_check_alloc_to_lru_ref(struct btree_trans 
*trans, - struct btree_iter *alloc_iter) + struct btree_iter *alloc_iter, + struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; - struct btree_iter lru_iter; struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a; - struct bkey_s_c alloc_k, lru_k; + struct bkey_s_c alloc_k; struct printbuf buf = PRINTBUF; int ret; @@ -1545,6 +1574,14 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, a = bch2_alloc_to_v4(alloc_k, &a_convert); + if (a->fragmentation_lru) { + ret = bch2_lru_check_set(trans, BCH_LRU_FRAGMENTATION_START, + a->fragmentation_lru, + alloc_k, last_flushed); + if (ret) + return ret; + } + if (a->data_type != BCH_DATA_cached) return 0; @@ -1560,82 +1597,75 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, if (ret) goto err; - a_mut->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); + a_mut->v.io_time[READ] = bch2_current_io_time(c, READ); ret = bch2_trans_update(trans, alloc_iter, - &a_mut->k_i, BTREE_TRIGGER_NORUN); + &a_mut->k_i, BTREE_TRIGGER_norun); if (ret) goto err; a = &a_mut->v; } - lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru, - lru_pos(alloc_k.k->p.inode, - bucket_to_u64(alloc_k.k->p), - a->io_time[READ]), 0); - ret = bkey_err(lru_k); + ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, a->io_time[READ], + alloc_k, last_flushed); if (ret) - return ret; - - if (fsck_err_on(lru_k.k->type != KEY_TYPE_set, c, - alloc_key_to_missing_lru_entry, - "missing lru entry\n" - " %s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { - ret = bch2_lru_set(trans, - alloc_k.k->p.inode, - bucket_to_u64(alloc_k.k->p), - a->io_time[READ]); - if (ret) - goto err; - } + goto err; err: fsck_err: - bch2_trans_iter_exit(trans, &lru_iter); printbuf_exit(&buf); return ret; } int bch2_check_alloc_to_lru_refs(struct bch_fs *c) { + struct bkey_buf last_flushed; + + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + int ret = bch2_trans_run(c, 
for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, - POS_MIN, BTREE_ITER_PREFETCH, k, + POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_check_alloc_to_lru_ref(trans, &iter))); + bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed))); + + bch2_bkey_buf_exit(&last_flushed, c); bch_err_fn(c, ret); return ret; } -static int discard_in_flight_add(struct bch_fs *c, struct bpos bucket) +static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress) { int ret; - mutex_lock(&c->discard_buckets_in_flight_lock); - darray_for_each(c->discard_buckets_in_flight, i) - if (bkey_eq(*i, bucket)) { - ret = -EEXIST; + mutex_lock(&ca->discard_buckets_in_flight_lock); + darray_for_each(ca->discard_buckets_in_flight, i) + if (i->bucket == bucket) { + ret = -BCH_ERR_EEXIST_discard_in_flight_add; goto out; } - ret = darray_push(&c->discard_buckets_in_flight, bucket); + ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) { + .in_progress = in_progress, + .bucket = bucket, + })); out: - mutex_unlock(&c->discard_buckets_in_flight_lock); + mutex_unlock(&ca->discard_buckets_in_flight_lock); return ret; } -static void discard_in_flight_remove(struct bch_fs *c, struct bpos bucket) +static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket) { - mutex_lock(&c->discard_buckets_in_flight_lock); - darray_for_each(c->discard_buckets_in_flight, i) - if (bkey_eq(*i, bucket)) { - darray_remove_item(&c->discard_buckets_in_flight, i); + mutex_lock(&ca->discard_buckets_in_flight_lock); + darray_for_each(ca->discard_buckets_in_flight, i) + if (i->bucket == bucket) { + BUG_ON(!i->in_progress); + darray_remove_item(&ca->discard_buckets_in_flight, i); goto found; } BUG(); found: - mutex_unlock(&c->discard_buckets_in_flight_lock); + mutex_unlock(&ca->discard_buckets_in_flight_lock); } struct discard_buckets_state { @@ -1643,28 +1673,11 @@ struct discard_buckets_state { u64 open; u64 need_journal_commit; u64 discarded; 
- struct bch_dev *ca; u64 need_journal_commit_this_dev; }; -static void discard_buckets_next_dev(struct bch_fs *c, struct discard_buckets_state *s, struct bch_dev *ca) -{ - if (s->ca == ca) - return; - - if (s->ca && s->need_journal_commit_this_dev > - bch2_dev_usage_read(s->ca).d[BCH_DATA_free].buckets) - bch2_journal_flush_async(&c->journal, NULL); - - if (s->ca) - percpu_ref_put(&s->ca->ref); - if (ca) - percpu_ref_get(&ca->ref); - s->ca = ca; - s->need_journal_commit_this_dev = 0; -} - static int bch2_discard_one_bucket(struct btree_trans *trans, + struct bch_dev *ca, struct btree_iter *need_discard_iter, struct bpos *discard_pos_done, struct discard_buckets_state *s) @@ -1673,21 +1686,11 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, struct bpos pos = need_discard_iter->pos; struct btree_iter iter = { NULL }; struct bkey_s_c k; - struct bch_dev *ca; struct bkey_i_alloc_v4 *a; struct printbuf buf = PRINTBUF; bool discard_locked = false; int ret = 0; - ca = bch_dev_bkey_exists(c, pos.inode); - - if (!percpu_ref_tryget(&ca->io_ref)) { - bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0)); - return 0; - } - - discard_buckets_next_dev(c, s, ca); - if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) { s->open++; goto out; @@ -1703,7 +1706,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, need_discard_iter->pos, - BTREE_ITER_CACHED); + BTREE_ITER_cached); ret = bkey_err(k); if (ret) goto out; @@ -1713,7 +1716,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, if (ret) goto out; - if (a->v.dirty_sectors) { + if (bch2_bucket_sectors_total(a->v)) { if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, trans, "attempting to discard bucket with dirty data\n%s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) @@ -1747,7 +1750,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, goto out; } - if 
(discard_in_flight_add(c, SPOS(iter.pos.inode, iter.pos.offset, true))) + if (discard_in_flight_add(ca, iter.pos.offset, true)) goto out; discard_locked = true; @@ -1771,8 +1774,9 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, } SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); - a->v.data_type = alloc_data_type(a->v, a->v.data_type); write: + alloc_data_type_set(&a->v, a->v.data_type); + ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: bch2_trans_commit(trans, NULL, NULL, BCH_WATERMARK_btree| @@ -1784,17 +1788,17 @@ write: s->discarded++; out: if (discard_locked) - discard_in_flight_remove(c, iter.pos); + discard_in_flight_remove(ca, iter.pos.offset); s->seen++; bch2_trans_iter_exit(trans, &iter); - percpu_ref_put(&ca->io_ref); printbuf_exit(&buf); return ret; } static void bch2_do_discards_work(struct work_struct *work) { - struct bch_fs *c = container_of(work, struct bch_fs, discard_work); + struct bch_dev *ca = container_of(work, struct bch_dev, discard_work); + struct bch_fs *c = ca->fs; struct discard_buckets_state s = {}; struct bpos discard_pos_done = POS_MAX; int ret; @@ -1805,29 +1809,47 @@ static void bch2_do_discards_work(struct work_struct *work) * successful commit: */ ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, - BTREE_ID_need_discard, POS_MIN, 0, k, - bch2_discard_one_bucket(trans, &iter, &discard_pos_done, &s))); - - discard_buckets_next_dev(c, &s, NULL); + for_each_btree_key_upto(trans, iter, + BTREE_ID_need_discard, + POS(ca->dev_idx, 0), + POS(ca->dev_idx, U64_MAX), 0, k, + bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s))); trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); bch2_write_ref_put(c, BCH_WRITE_REF_discard); + percpu_ref_put(&ca->io_ref); +} + +void bch2_dev_do_discards(struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + return; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard)) + 
goto put_ioref; + + if (queue_work(c->write_ref_wq, &ca->discard_work)) + return; + + bch2_write_ref_put(c, BCH_WRITE_REF_discard); +put_ioref: + percpu_ref_put(&ca->io_ref); } void bch2_do_discards(struct bch_fs *c) { - if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) && - !queue_work(c->write_ref_wq, &c->discard_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_discard); + for_each_member_device(c, ca) + bch2_dev_do_discards(ca); } static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket) { struct btree_iter iter; - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_INTENT); + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_intent); struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); int ret = bkey_err(k); if (ret) @@ -1840,7 +1862,7 @@ static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpo BUG_ON(a->v.dirty_sectors); SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); - a->v.data_type = alloc_data_type(a->v, a->v.data_type); + alloc_data_type_set(&a->v, a->v.data_type); ret = bch2_trans_update(trans, &iter, &a->k_i, 0); err: @@ -1850,66 +1872,69 @@ err: static void bch2_do_discards_fast_work(struct work_struct *work) { - struct bch_fs *c = container_of(work, struct bch_fs, discard_fast_work); + struct bch_dev *ca = container_of(work, struct bch_dev, discard_fast_work); + struct bch_fs *c = ca->fs; while (1) { bool got_bucket = false; - struct bpos bucket; - struct bch_dev *ca; + u64 bucket; - mutex_lock(&c->discard_buckets_in_flight_lock); - darray_for_each(c->discard_buckets_in_flight, i) { - if (i->snapshot) + mutex_lock(&ca->discard_buckets_in_flight_lock); + darray_for_each(ca->discard_buckets_in_flight, i) { + if (i->in_progress) continue; - ca = bch_dev_bkey_exists(c, i->inode); - - if (!percpu_ref_tryget(&ca->io_ref)) { - darray_remove_item(&c->discard_buckets_in_flight, i); - continue; - } - got_bucket = true; - bucket = *i; - i->snapshot = true; + bucket = 
i->bucket; + i->in_progress = true; break; } - mutex_unlock(&c->discard_buckets_in_flight_lock); + mutex_unlock(&ca->discard_buckets_in_flight_lock); if (!got_bucket) break; if (ca->mi.discard && !c->opts.nochanges) blkdev_issue_discard(ca->disk_sb.bdev, - bucket.offset * ca->mi.bucket_size, + bucket_to_sector(ca, bucket), ca->mi.bucket_size, GFP_KERNEL); int ret = bch2_trans_do(c, NULL, NULL, - BCH_WATERMARK_btree| - BCH_TRANS_COMMIT_no_enospc, - bch2_clear_bucket_needs_discard(trans, bucket)); + BCH_WATERMARK_btree| + BCH_TRANS_COMMIT_no_enospc, + bch2_clear_bucket_needs_discard(trans, POS(ca->dev_idx, bucket))); bch_err_fn(c, ret); - percpu_ref_put(&ca->io_ref); - discard_in_flight_remove(c, bucket); + discard_in_flight_remove(ca, bucket); if (ret) break; } bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); + percpu_ref_put(&ca->io_ref); } -static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket) +static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket) { - struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); + struct bch_fs *c = ca->fs; - if (!percpu_ref_is_dying(&ca->io_ref) && - !discard_in_flight_add(c, bucket) && - bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast) && - !queue_work(c->write_ref_wq, &c->discard_fast_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); + if (discard_in_flight_add(ca, bucket, false)) + return; + + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + return; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast)) + goto put_ioref; + + if (queue_work(c->write_ref_wq, &ca->discard_fast_work)) + return; + + bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); +put_ioref: + percpu_ref_put(&ca->io_ref); } static int invalidate_one_bucket(struct btree_trans *trans, @@ -1918,7 +1943,6 @@ static int invalidate_one_bucket(struct btree_trans *trans, s64 *nr_to_invalidate) { struct bch_fs *c = trans->c; - struct btree_iter alloc_iter = { NULL }; struct bkey_i_alloc_v4 *a = NULL; 
struct printbuf buf = PRINTBUF; struct bpos bucket = u64_to_bucket(lru_k.k->p.offset); @@ -1936,7 +1960,7 @@ static int invalidate_one_bucket(struct btree_trans *trans, if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset)) return 0; - a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket); + a = bch2_trans_start_alloc_update(trans, bucket); ret = PTR_ERR_OR_ZERO(a); if (ret) goto out; @@ -1958,21 +1982,18 @@ static int invalidate_one_bucket(struct btree_trans *trans, a->v.data_type = 0; a->v.dirty_sectors = 0; a->v.cached_sectors = 0; - a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); - a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now); + a->v.io_time[READ] = bch2_current_io_time(c, READ); + a->v.io_time[WRITE] = bch2_current_io_time(c, WRITE); - ret = bch2_trans_update(trans, &alloc_iter, &a->k_i, - BTREE_TRIGGER_BUCKET_INVALIDATE) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_WATERMARK_btree| - BCH_TRANS_COMMIT_no_enospc); + ret = bch2_trans_commit(trans, NULL, NULL, + BCH_WATERMARK_btree| + BCH_TRANS_COMMIT_no_enospc); if (ret) goto out; trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors); --*nr_to_invalidate; out: - bch2_trans_iter_exit(trans, &alloc_iter); printbuf_exit(&buf); return ret; err: @@ -1997,9 +2018,25 @@ err: goto out; } +static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter *iter, + struct bch_dev *ca, bool *wrapped) +{ + struct bkey_s_c k; +again: + k = bch2_btree_iter_peek_upto(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX)); + if (!k.k && !*wrapped) { + bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0)); + *wrapped = true; + goto again; + } + + return k; +} + static void bch2_do_invalidates_work(struct work_struct *work) { - struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work); + struct bch_dev *ca = container_of(work, struct bch_dev, invalidate_work); + struct bch_fs *c = ca->fs; struct btree_trans *trans = 
bch2_trans_get(c); int ret = 0; @@ -2007,31 +2044,63 @@ static void bch2_do_invalidates_work(struct work_struct *work) if (ret) goto err; - for_each_member_device(c, ca) { - s64 nr_to_invalidate = - should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); + s64 nr_to_invalidate = + should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); + struct btree_iter iter; + bool wrapped = false; - ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru, - lru_pos(ca->dev_idx, 0, 0), - lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX), - BTREE_ITER_INTENT, k, - invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate)); + bch2_trans_iter_init(trans, &iter, BTREE_ID_lru, + lru_pos(ca->dev_idx, 0, + ((bch2_current_io_time(c, READ) + U32_MAX) & + LRU_TIME_MAX)), 0); - if (ret < 0) { - percpu_ref_put(&ca->ref); + while (true) { + bch2_trans_begin(trans); + + struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped); + ret = bkey_err(k); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) break; - } + if (!k.k) + break; + + ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate); + if (ret) + break; + + bch2_btree_iter_advance(&iter); } + bch2_trans_iter_exit(trans, &iter); err: bch2_trans_put(trans); bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); + percpu_ref_put(&ca->io_ref); +} + +void bch2_dev_do_invalidates(struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + return; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate)) + goto put_ioref; + + if (queue_work(c->write_ref_wq, &ca->invalidate_work)) + return; + + bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); +put_ioref: + percpu_ref_put(&ca->io_ref); } void bch2_do_invalidates(struct bch_fs *c) { - if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) && - !queue_work(c->write_ref_wq, &c->invalidate_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); + for_each_member_device(c, ca) + bch2_dev_do_invalidates(ca); } int 
bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, @@ -2051,7 +2120,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)), - BTREE_ITER_PREFETCH); + BTREE_ITER_prefetch); /* * Scan the alloc btree for every bucket on @ca, and add buckets to the * freespace/need_discard/need_gc_gens btrees as needed: @@ -2083,7 +2152,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); - ret = bch2_bucket_do_index(trans, k, a, true) ?: + ret = bch2_bucket_do_index(trans, ca, k, a, true) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); if (ret) @@ -2155,7 +2224,7 @@ int bch2_fs_freespace_init(struct bch_fs *c) ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets); if (ret) { - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); bch_err_fn(c, ret); return ret; } @@ -2182,12 +2251,15 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, u64 now; int ret = 0; - a = bch2_trans_start_alloc_update(trans, &iter, POS(dev, bucket_nr)); + if (bch2_trans_relock(trans)) + bch2_trans_begin(trans); + + a = bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(dev, bucket_nr)); ret = PTR_ERR_OR_ZERO(a); if (ret) return ret; - now = atomic64_read(&c->io_clock[rw].now); + now = bch2_current_io_time(c, rw); if (a->v.io_time[rw] == now) goto out; @@ -2344,16 +2416,20 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) set_bit(ca->dev_idx, c->rw_devs[i].d); } -void bch2_fs_allocator_background_exit(struct bch_fs *c) +void bch2_dev_allocator_background_exit(struct bch_dev *ca) +{ + darray_exit(&ca->discard_buckets_in_flight); +} + +void bch2_dev_allocator_background_init(struct bch_dev *ca) { - darray_exit(&c->discard_buckets_in_flight); + mutex_init(&ca->discard_buckets_in_flight_lock); + 
INIT_WORK(&ca->discard_work, bch2_do_discards_work); + INIT_WORK(&ca->discard_fast_work, bch2_do_discards_fast_work); + INIT_WORK(&ca->invalidate_work, bch2_do_invalidates_work); } void bch2_fs_allocator_background_init(struct bch_fs *c) { spin_lock_init(&c->freelist_lock); - mutex_init(&c->discard_buckets_in_flight_lock); - INIT_WORK(&c->discard_work, bch2_do_discards_work); - INIT_WORK(&c->discard_fast_work, bch2_do_discards_fast_work); - INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work); } diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 2790e51638..ba2c5557a3 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -8,21 +8,18 @@ #include "debug.h" #include "super.h" -enum bkey_invalid_flags; +enum bch_validate_flags; /* How out of date a pointer gen is allowed to be: */ #define BUCKET_GC_GEN_MAX 96U static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos) { - struct bch_dev *ca; - - if (!bch2_dev_exists2(c, pos.inode)) - return false; - - ca = bch_dev_bkey_exists(c, pos.inode); - return pos.offset >= ca->mi.first_bucket && - pos.offset < ca->mi.nbuckets; + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, pos.inode); + bool ret = ca && bucket_valid(ca, pos.offset); + rcu_read_unlock(); + return ret; } static inline u64 bucket_to_u64(struct bpos bucket) @@ -40,38 +37,50 @@ static inline u8 alloc_gc_gen(struct bch_alloc_v4 a) return a.gen - a.oldest_gen; } -static inline enum bch_data_type __alloc_data_type(u32 dirty_sectors, - u32 cached_sectors, - u32 stripe, - struct bch_alloc_v4 a, - enum bch_data_type data_type) +static inline void alloc_to_bucket(struct bucket *dst, struct bch_alloc_v4 src) { - if (stripe) - return data_type == BCH_DATA_parity ? 
data_type : BCH_DATA_stripe; - if (dirty_sectors) - return data_type; - if (cached_sectors) - return BCH_DATA_cached; - if (BCH_ALLOC_V4_NEED_DISCARD(&a)) - return BCH_DATA_need_discard; - if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX) - return BCH_DATA_need_gc_gens; - return BCH_DATA_free; + dst->gen = src.gen; + dst->data_type = src.data_type; + dst->dirty_sectors = src.dirty_sectors; + dst->cached_sectors = src.cached_sectors; + dst->stripe = src.stripe; } -static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a, - enum bch_data_type data_type) +static inline void __bucket_m_to_alloc(struct bch_alloc_v4 *dst, struct bucket src) +{ + dst->gen = src.gen; + dst->data_type = src.data_type; + dst->dirty_sectors = src.dirty_sectors; + dst->cached_sectors = src.cached_sectors; + dst->stripe = src.stripe; +} + +static inline struct bch_alloc_v4 bucket_m_to_alloc(struct bucket b) { - return __alloc_data_type(a.dirty_sectors, a.cached_sectors, - a.stripe, a, data_type); + struct bch_alloc_v4 ret = {}; + __bucket_m_to_alloc(&ret, b); + return ret; } static inline enum bch_data_type bucket_data_type(enum bch_data_type data_type) { - return data_type == BCH_DATA_stripe ? BCH_DATA_user : data_type; + switch (data_type) { + case BCH_DATA_cached: + case BCH_DATA_stripe: + return BCH_DATA_user; + default: + return data_type; + } +} + +static inline bool bucket_data_type_mismatch(enum bch_data_type bucket, + enum bch_data_type ptr) +{ + return !data_type_is_empty(bucket) && + bucket_data_type(bucket) != bucket_data_type(ptr); } -static inline unsigned bch2_bucket_sectors(struct bch_alloc_v4 a) +static inline unsigned bch2_bucket_sectors_total(struct bch_alloc_v4 a) { return a.dirty_sectors + a.cached_sectors; } @@ -89,6 +98,27 @@ static inline unsigned bch2_bucket_sectors_fragmented(struct bch_dev *ca, return d ? 
max(0, ca->mi.bucket_size - d) : 0; } +static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a, + enum bch_data_type data_type) +{ + if (a.stripe) + return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe; + if (a.dirty_sectors) + return data_type; + if (a.cached_sectors) + return BCH_DATA_cached; + if (BCH_ALLOC_V4_NEED_DISCARD(&a)) + return BCH_DATA_need_discard; + if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX) + return BCH_DATA_need_gc_gens; + return BCH_DATA_free; +} + +static inline void alloc_data_type_set(struct bch_alloc_v4 *a, enum bch_data_type data_type) +{ + a->data_type = alloc_data_type(*a, data_type); +} + static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a) { return a.data_type == BCH_DATA_cached ? a.io_time[READ] : 0; @@ -111,7 +141,13 @@ static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a, !bch2_bucket_sectors_fragmented(ca, a)) return 0; - u64 d = bch2_bucket_sectors_dirty(a); + /* + * avoid overflowing LRU_TIME_BITS on a corrupted fs, when + * bucket_sectors_dirty is (much) bigger than bucket_size + */ + u64 d = min(bch2_bucket_sectors_dirty(a), + ca->mi.bucket_size); + return div_u64(d * (1ULL << 31), ca->mi.bucket_size); } @@ -147,7 +183,9 @@ static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a) } struct bkey_i_alloc_v4 * -bch2_trans_start_alloc_update(struct btree_trans *, struct btree_iter *, struct bpos); +bch2_trans_start_alloc_update_noupdate(struct btree_trans *, struct btree_iter *, struct bpos); +struct bkey_i_alloc_v4 * +bch2_trans_start_alloc_update(struct btree_trans *, struct bpos); void __bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *); @@ -173,13 +211,13 @@ struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); int bch2_alloc_v1_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); int 
bch2_alloc_v2_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); int bch2_alloc_v3_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); int bch2_alloc_v4_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_alloc_v4_swab(struct bkey_s); void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -213,7 +251,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); }) int bch2_bucket_gens_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \ @@ -233,9 +271,11 @@ static inline bool bkey_is_alloc(const struct bkey *k) int bch2_alloc_read(struct bch_fs *); int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); int bch2_check_alloc_info(struct bch_fs *); int bch2_check_alloc_to_lru_refs(struct bch_fs *); +void bch2_dev_do_discards(struct bch_dev *); void bch2_do_discards(struct bch_fs *); static inline u64 should_invalidate_buckets(struct bch_dev *ca, @@ -250,6 +290,7 @@ static inline u64 should_invalidate_buckets(struct bch_dev *ca, return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets); } +void bch2_dev_do_invalidates(struct bch_dev *); void bch2_do_invalidates(struct bch_fs *); static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a) @@ -273,7 +314,9 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *); void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); void 
bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); -void bch2_fs_allocator_background_exit(struct bch_fs *); +void bch2_dev_allocator_background_exit(struct bch_dev *); +void bch2_dev_allocator_background_init(struct bch_dev *); + void bch2_fs_allocator_background_init(struct bch_fs *); #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index a1fc30adf9..27d97c22ae 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -71,7 +71,7 @@ void bch2_reset_alloc_cursors(struct bch_fs *c) { rcu_read_lock(); for_each_member_device_rcu(c, ca, NULL) - ca->alloc_cursor = 0; + memset(ca->alloc_cursor, 0, sizeof(ca->alloc_cursor)); rcu_read_unlock(); } @@ -100,7 +100,7 @@ static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *o void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + struct bch_dev *ca = ob_dev(c, ob); if (ob->ec) { ec_stripe_new_put(c, ob->ec, STRIPE_REF_io); @@ -300,7 +300,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, b), - BTREE_ITER_CACHED); + BTREE_ITER_cached); ret = bkey_err(k); if (ret) { ob = ERR_PTR(ret); @@ -342,9 +342,9 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc struct bch_backpointer bp; struct bpos bp_pos = POS_MIN; - ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1, + ret = bch2_get_next_backpointer(trans, ca, POS(ca->dev_idx, b), -1, &bp_pos, &bp, - BTREE_ITER_NOPRESERVE); + BTREE_ITER_nopreserve); if (ret) { ob = ERR_PTR(ret); goto err; @@ -363,10 +363,10 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl); if (!ob) - set_btree_iter_dontneed(&iter); + bch2_set_btree_iter_dontneed(&iter); err: if 
(iter.path) - set_btree_iter_dontneed(&iter); + bch2_set_btree_iter_dontneed(&iter); bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); return ob; @@ -389,7 +389,8 @@ bch2_bucket_alloc_early(struct btree_trans *trans, struct bkey_s_c k, ck; struct open_bucket *ob = NULL; u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx); - u64 alloc_start = max(first_bucket, READ_ONCE(ca->alloc_cursor)); + u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap]; + u64 alloc_start = max(first_bucket, *dev_alloc_cursor); u64 alloc_cursor = alloc_start; int ret; @@ -404,9 +405,8 @@ bch2_bucket_alloc_early(struct btree_trans *trans, */ again: for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor), - BTREE_ITER_SLOTS, k, ret) { - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a; + BTREE_ITER_slots, k, ret) { + u64 bucket = k.k->p.offset; if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets))) break; @@ -415,12 +415,29 @@ again: is_superblock_bucket(ca, k.k->p.offset)) continue; - a = bch2_alloc_to_v4(k, &a_convert); + if (s->btree_bitmap != BTREE_BITMAP_ANY && + s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, + bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { + if (s->btree_bitmap == BTREE_BITMAP_YES && + bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift) + break; + + bucket = sector_to_bucket(ca, + round_up(bucket_to_sector(ca, bucket) + 1, + 1ULL << ca->mi.btree_bitmap_shift)); + bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, bucket)); + s->buckets_seen++; + s->skipped_mi_btree_bitmap++; + continue; + } + + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); if (a->data_type != BCH_DATA_free) continue; /* now check the cached key to serialize concurrent allocs of the bucket */ - ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_CACHED); + ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, 
BTREE_ITER_cached); ret = bkey_err(ck); if (ret) break; @@ -433,7 +450,7 @@ again: ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl); next: - set_btree_iter_dontneed(&citer); + bch2_set_btree_iter_dontneed(&citer); bch2_trans_iter_exit(trans, &citer); if (ob) break; @@ -441,7 +458,6 @@ next: bch2_trans_iter_exit(trans, &iter); alloc_cursor = iter.pos.offset; - ca->alloc_cursor = alloc_cursor; if (!ob && ret) ob = ERR_PTR(ret); @@ -451,6 +467,8 @@ next: goto again; } + *dev_alloc_cursor = alloc_cursor; + return ob; } @@ -463,7 +481,8 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, struct btree_iter iter; struct bkey_s_c k; struct open_bucket *ob = NULL; - u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(ca->alloc_cursor)); + u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap]; + u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor)); u64 alloc_cursor = alloc_start; int ret; @@ -485,10 +504,30 @@ again: s->buckets_seen++; + u64 bucket = alloc_cursor & ~(~0ULL << 56); + if (s->btree_bitmap != BTREE_BITMAP_ANY && + s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, + bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { + if (s->btree_bitmap == BTREE_BITMAP_YES && + bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift) + goto fail; + + bucket = sector_to_bucket(ca, + round_up(bucket_to_sector(ca, bucket) + 1, + 1ULL << ca->mi.btree_bitmap_shift)); + u64 genbits = alloc_cursor >> 56; + alloc_cursor = bucket | (genbits << 56); + + if (alloc_cursor > k.k->p.offset) + bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor)); + s->skipped_mi_btree_bitmap++; + continue; + } + ob = try_alloc_bucket(trans, ca, watermark, alloc_cursor, s, k, cl); if (ob) { - set_btree_iter_dontneed(&iter); + bch2_set_btree_iter_dontneed(&iter); break; } } @@ -496,10 +535,9 @@ again: if (ob || ret) break; } +fail: bch2_trans_iter_exit(trans, &iter); - ca->alloc_cursor 
= alloc_cursor; - if (!ob && ret) ob = ERR_PTR(ret); @@ -508,14 +546,56 @@ again: goto again; } + *dev_alloc_cursor = alloc_cursor; + return ob; } +static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, + enum bch_watermark watermark, + enum bch_data_type data_type, + struct closure *cl, + struct bch_dev_usage *usage, + struct bucket_alloc_state *s, + struct open_bucket *ob) +{ + struct printbuf buf = PRINTBUF; + + printbuf_tabstop_push(&buf, 24); + + prt_printf(&buf, "dev\t%s (%u)\n", ca->name, ca->dev_idx); + prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[watermark]); + prt_printf(&buf, "data type\t%s\n", __bch2_data_types[data_type]); + prt_printf(&buf, "blocking\t%u\n", cl != NULL); + prt_printf(&buf, "free\t%llu\n", usage->d[BCH_DATA_free].buckets); + prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(ca, *usage, watermark)); + prt_printf(&buf, "copygc_wait\t%lu/%lli\n", + bch2_copygc_wait_amount(c), + c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now)); + prt_printf(&buf, "seen\t%llu\n", s->buckets_seen); + prt_printf(&buf, "open\t%llu\n", s->skipped_open); + prt_printf(&buf, "need journal commit\t%llu\n", s->skipped_need_journal_commit); + prt_printf(&buf, "nocow\t%llu\n", s->skipped_nocow); + prt_printf(&buf, "nouse\t%llu\n", s->skipped_nouse); + prt_printf(&buf, "mi_btree_bitmap\t%llu\n", s->skipped_mi_btree_bitmap); + + if (!IS_ERR(ob)) { + prt_printf(&buf, "allocated\t%llu\n", ob->bucket); + trace_bucket_alloc(c, buf.buf); + } else { + prt_printf(&buf, "err\t%s\n", bch2_err_str(PTR_ERR(ob))); + trace_bucket_alloc_fail(c, buf.buf); + } + + printbuf_exit(&buf); +} + /** * bch2_bucket_alloc_trans - allocate a single bucket from a specific device * @trans: transaction object * @ca: device to allocate from * @watermark: how important is this allocation? + * @data_type: BCH_DATA_journal, btree, user... 
* @cl: if not NULL, closure to be used to wait if buckets not available * @usage: for secondarily also returning the current device usage * @@ -524,6 +604,7 @@ again: static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, struct bch_dev *ca, enum bch_watermark watermark, + enum bch_data_type data_type, struct closure *cl, struct bch_dev_usage *usage) { @@ -531,20 +612,22 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, struct open_bucket *ob = NULL; bool freespace = READ_ONCE(ca->mi.freespace_initialized); u64 avail; - struct bucket_alloc_state s = { 0 }; + struct bucket_alloc_state s = { + .btree_bitmap = data_type == BCH_DATA_btree, + }; bool waiting = false; again: bch2_dev_usage_read_fast(ca, usage); avail = dev_buckets_free(ca, *usage, watermark); if (usage->d[BCH_DATA_need_discard].buckets > avail) - bch2_do_discards(c); + bch2_dev_do_discards(ca); if (usage->d[BCH_DATA_need_gc_gens].buckets > avail) - bch2_do_gc_gens(c); + bch2_gc_gens_async(c); if (should_invalidate_buckets(ca, *usage)) - bch2_do_invalidates(c); + bch2_dev_do_invalidates(ca); if (!avail) { if (cl && !waiting) { @@ -569,6 +652,11 @@ alloc: if (s.skipped_need_journal_commit * 2 > avail) bch2_journal_flush_async(&c->journal, NULL); + if (!ob && s.btree_bitmap != BTREE_BITMAP_ANY) { + s.btree_bitmap = BTREE_BITMAP_ANY; + goto alloc; + } + if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) { freespace = false; goto alloc; @@ -578,33 +666,24 @@ err: ob = ERR_PTR(-BCH_ERR_no_buckets_found); if (!IS_ERR(ob)) - trace_and_count(c, bucket_alloc, ca, - bch2_watermarks[watermark], - ob->bucket, - usage->d[BCH_DATA_free].buckets, - avail, - bch2_copygc_wait_amount(c), - c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now), - &s, - cl == NULL, - ""); + ob->data_type = data_type; + + if (!IS_ERR(ob)) + count_event(c, bucket_alloc); else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart)) - 
trace_and_count(c, bucket_alloc_fail, ca, - bch2_watermarks[watermark], - 0, - usage->d[BCH_DATA_free].buckets, - avail, - bch2_copygc_wait_amount(c), - c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now), - &s, - cl == NULL, - bch2_err_str(PTR_ERR(ob))); + count_event(c, bucket_alloc_fail); + + if (!IS_ERR(ob) + ? trace_bucket_alloc_enabled() + : trace_bucket_alloc_fail_enabled()) + trace_bucket_alloc2(c, ca, watermark, data_type, cl, usage, &s, ob); return ob; } struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, enum bch_watermark watermark, + enum bch_data_type data_type, struct closure *cl) { struct bch_dev_usage usage; @@ -612,7 +691,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, bch2_trans_do(c, NULL, NULL, 0, PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark, - cl, &usage))); + data_type, cl, &usage))); return ob; } @@ -678,8 +757,7 @@ static int add_new_bucket(struct bch_fs *c, unsigned flags, struct open_bucket *ob) { - unsigned durability = - bch_dev_bkey_exists(c, ob->dev)->mi.durability; + unsigned durability = ob_dev(c, ob)->mi.durability; BUG_ON(*nr_effective >= nr_replicas); @@ -711,37 +789,28 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, struct bch_fs *c = trans->c; struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc); - unsigned dev; - struct bch_dev *ca; int ret = -BCH_ERR_insufficient_devices; - unsigned i; BUG_ON(*nr_effective >= nr_replicas); - for (i = 0; i < devs_sorted.nr; i++) { + for (unsigned i = 0; i < devs_sorted.nr; i++) { struct bch_dev_usage usage; struct open_bucket *ob; - dev = devs_sorted.devs[i]; - - rcu_read_lock(); - ca = rcu_dereference(c->devs[dev]); - if (ca) - percpu_ref_get(&ca->ref); - rcu_read_unlock(); - + unsigned dev = devs_sorted.devs[i]; + struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev); if (!ca) continue; if (!ca->mi.durability && *have_cache) { - percpu_ref_put(&ca->ref); + 
bch2_dev_put(ca); continue; } - ob = bch2_bucket_alloc_trans(trans, ca, watermark, cl, &usage); + ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, cl, &usage); if (!IS_ERR(ob)) bch2_dev_stripe_increment_inlined(ca, stripe, &usage); - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); if (IS_ERR(ob)) { ret = PTR_ERR(ob); @@ -750,8 +819,6 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, continue; } - ob->data_type = data_type; - if (add_new_bucket(c, ptrs, devs_may_alloc, nr_replicas, nr_effective, have_cache, flags, ob)) { @@ -836,7 +903,7 @@ static bool want_bucket(struct bch_fs *c, bool *have_cache, bool ec, struct open_bucket *ob) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + struct bch_dev *ca = ob_dev(c, ob); if (!test_bit(ob->dev, devs_may_alloc->d)) return false; @@ -906,7 +973,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c, struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i]; if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + struct bch_dev *ca = ob_dev(c, ob); struct bch_dev_usage usage; u64 avail; @@ -1291,7 +1358,7 @@ deallocate_extra_replicas(struct bch_fs *c, unsigned i; open_bucket_for_each(c, ptrs, ob, i) { - unsigned d = bch_dev_bkey_exists(c, ob->dev)->mi.durability; + unsigned d = ob_dev(c, ob)->mi.durability; if (d && d <= extra_replicas) { extra_replicas -= d; @@ -1342,6 +1409,10 @@ retry: *wp_ret = wp = writepoint_find(trans, write_point.v); + ret = bch2_trans_relock(trans); + if (ret) + goto err; + /* metadata may not allocate on cache devices: */ if (wp->data_type != BCH_DATA_user) have_cache = true; @@ -1444,7 +1515,7 @@ err: struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + struct bch_dev *ca = ob_dev(c, ob); return (struct bch_extent_ptr) { .type = 1 << BCH_EXTENT_ENTRY_ptr, @@ -1520,7 +1591,7 @@ void 
bch2_fs_allocator_foreground_init(struct bch_fs *c) static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + struct bch_dev *ca = ob_dev(c, ob); unsigned data_type = ob->data_type; barrier(); /* READ_ONCE() doesn't work on bitfields */ @@ -1622,3 +1693,106 @@ void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c) prt_str(out, "Btree write point\n"); bch2_write_point_to_text(out, c, &c->btree_write_point); } + +void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) +{ + unsigned nr[BCH_DATA_NR]; + + memset(nr, 0, sizeof(nr)); + + for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++) + nr[c->open_buckets[i].data_type]++; + + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 24); + + percpu_down_read(&c->mark_lock); + prt_printf(out, "hidden\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.hidden)); + prt_printf(out, "btree\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.btree)); + prt_printf(out, "data\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.data)); + prt_printf(out, "cached\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.cached)); + prt_printf(out, "reserved\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.reserved)); + prt_printf(out, "online_reserved\t%llu\n", percpu_u64_get(c->online_reserved)); + prt_printf(out, "nr_inodes\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.nr_inodes)); + percpu_up_read(&c->mark_lock); + + prt_newline(out); + prt_printf(out, "freelist_wait\t%s\n", c->freelist_wait.list.first ? "waiting" : "empty"); + prt_printf(out, "open buckets allocated\t%i\n", OPEN_BUCKETS_COUNT - c->open_buckets_nr_free); + prt_printf(out, "open buckets total\t%u\n", OPEN_BUCKETS_COUNT); + prt_printf(out, "open_buckets_wait\t%s\n", c->open_buckets_wait.list.first ? 
"waiting" : "empty"); + prt_printf(out, "open_buckets_btree\t%u\n", nr[BCH_DATA_btree]); + prt_printf(out, "open_buckets_user\t%u\n", nr[BCH_DATA_user]); + prt_printf(out, "btree reserve cache\t%u\n", c->btree_reserve_cache_nr); +} + +void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + struct bch_dev_usage stats = bch2_dev_usage_read(ca); + unsigned nr[BCH_DATA_NR]; + + memset(nr, 0, sizeof(nr)); + + for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++) + nr[c->open_buckets[i].data_type]++; + + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 12); + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 16); + + bch2_dev_usage_to_text(out, &stats); + + prt_newline(out); + + prt_printf(out, "reserves:\n"); + for (unsigned i = 0; i < BCH_WATERMARK_NR; i++) + prt_printf(out, "%s\t%llu\r\n", bch2_watermarks[i], bch2_dev_buckets_reserved(ca, i)); + + prt_newline(out); + + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 12); + printbuf_tabstop_push(out, 16); + + prt_printf(out, "open buckets\t%i\r\n", ca->nr_open_buckets); + prt_printf(out, "buckets to invalidate\t%llu\r\n", should_invalidate_buckets(ca, stats)); +} + +void bch2_print_allocator_stuck(struct bch_fs *c) +{ + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, "Allocator stuck? 
Waited for 10 seconds\n"); + + prt_printf(&buf, "Allocator debug:\n"); + printbuf_indent_add(&buf, 2); + bch2_fs_alloc_debug_to_text(&buf, c); + printbuf_indent_sub(&buf, 2); + prt_newline(&buf); + + for_each_online_member(c, ca) { + prt_printf(&buf, "Dev %u:\n", ca->dev_idx); + printbuf_indent_add(&buf, 2); + bch2_dev_alloc_debug_to_text(&buf, ca); + printbuf_indent_sub(&buf, 2); + prt_newline(&buf); + } + + prt_printf(&buf, "Copygc debug:\n"); + printbuf_indent_add(&buf, 2); + bch2_copygc_wait_to_text(&buf, c); + printbuf_indent_sub(&buf, 2); + prt_newline(&buf); + + prt_printf(&buf, "Journal debug:\n"); + printbuf_indent_add(&buf, 2); + bch2_journal_debug_to_text(&buf, &c->journal); + printbuf_indent_sub(&buf, 2); + + bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); +} diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 7aaeec44c7..a42c9730d3 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -30,8 +30,14 @@ void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *); long bch2_bucket_alloc_new_fs(struct bch_dev *); +static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob) +{ + return bch2_dev_have_ref(c, ob->dev); +} + struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, - enum bch_watermark, struct closure *); + enum bch_watermark, enum bch_data_type, + struct closure *); static inline void ob_push(struct bch_fs *c, struct open_buckets *obs, struct open_bucket *ob) @@ -184,7 +190,7 @@ bch2_alloc_sectors_append_ptrs_inlined(struct bch_fs *c, struct write_point *wp, wp->sectors_allocated += sectors; open_bucket_for_each(c, &wp->ptrs, ob, i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + struct bch_dev *ca = ob_dev(c, ob); struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob); ptr.cached = cached || @@ -221,4 +227,9 @@ void bch2_open_buckets_partial_to_text(struct printbuf *, struct bch_fs *); void 
bch2_write_points_to_text(struct printbuf *, struct bch_fs *); +void bch2_fs_alloc_debug_to_text(struct printbuf *, struct bch_fs *); +void bch2_dev_alloc_debug_to_text(struct printbuf *, struct bch_dev *); + +void bch2_print_allocator_stuck(struct bch_fs *); + #endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index c2226e947c..9bbb28e90b 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -9,11 +9,18 @@ #include "fifo.h" struct bucket_alloc_state { + enum { + BTREE_BITMAP_NO, + BTREE_BITMAP_YES, + BTREE_BITMAP_ANY, + } btree_bitmap; + u64 buckets_seen; u64 skipped_open; u64 skipped_need_journal_commit; u64 skipped_nocow; u64 skipped_nouse; + u64 skipped_mi_btree_bitmap; }; #define BCH_WATERMARKS() \ diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index af7a71de1b..6d8b1bc90b 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -23,6 +23,7 @@ static bool extent_matches_bp(struct bch_fs *c, const union bch_extent_entry *entry; struct extent_ptr_decoded p; + rcu_read_lock(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { struct bpos bucket2; struct bch_backpointer bp2; @@ -30,31 +31,43 @@ static bool extent_matches_bp(struct bch_fs *c, if (p.ptr.cached) continue; - bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bucket2, &bp2); + struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); + if (!ca) + continue; + + bch2_extent_ptr_to_bp(c, ca, btree_id, level, k, p, entry, &bucket2, &bp2); if (bpos_eq(bucket, bucket2) && - !memcmp(&bp, &bp2, sizeof(bp))) + !memcmp(&bp, &bp2, sizeof(bp))) { + rcu_read_unlock(); return true; + } } + rcu_read_unlock(); return false; } int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); - /* these will be caught by fsck */ - if (!bch2_dev_exists2(c, 
bp.k->p.inode)) + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, bp.k->p.inode); + if (!ca) { + /* these will be caught by fsck */ + rcu_read_unlock(); return 0; + } - struct bch_dev *ca = bch_dev_bkey_exists(c, bp.k->p.inode); - struct bpos bucket = bp_pos_to_bucket(c, bp.k->p); + struct bpos bucket = bp_pos_to_bucket(ca, bp.k->p); + struct bpos bp_pos = bucket_pos_to_bp_noerror(ca, bucket, bp.v->bucket_offset); + rcu_read_unlock(); int ret = 0; bkey_fsck_err_on((bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) >= ca->mi.bucket_size || - !bpos_eq(bp.k->p, bucket_pos_to_bp_noerror(ca, bucket, bp.v->bucket_offset)), + !bpos_eq(bp.k->p, bp_pos), c, err, backpointer_bucket_offset_wrong, "backpointer bucket_offset wrong"); @@ -75,10 +88,16 @@ void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - if (bch2_dev_exists2(c, k.k->p.inode)) { + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, k.k->p.inode); + if (ca) { + struct bpos bucket = bp_pos_to_bucket(ca, k.k->p); + rcu_read_unlock(); prt_str(out, "bucket="); - bch2_bpos_to_text(out, bp_pos_to_bucket(c, k.k->p)); + bch2_bpos_to_text(out, bucket); prt_str(out, " "); + } else { + rcu_read_unlock(); } bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v); @@ -117,8 +136,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, bch_err(c, "%s", buf.buf); } else if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { - prt_printf(&buf, "backpointer not found when deleting"); - prt_newline(&buf); + prt_printf(&buf, "backpointer not found when deleting\n"); printbuf_indent_add(&buf, 2); prt_printf(&buf, "searching for "); @@ -145,6 +163,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, } int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, + struct bch_dev *ca, struct bpos bucket, struct 
bch_backpointer bp, struct bkey_s_c orig_k, @@ -161,7 +180,7 @@ int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, return ret; bkey_backpointer_init(&bp_k->k_i); - bp_k->k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset); + bp_k->k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset); bp_k->v = bp; if (!insert) { @@ -171,9 +190,9 @@ int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp_k->k.p, - BTREE_ITER_INTENT| - BTREE_ITER_SLOTS| - BTREE_ITER_WITH_UPDATES); + BTREE_ITER_intent| + BTREE_ITER_slots| + BTREE_ITER_with_updates); ret = bkey_err(k); if (ret) goto err; @@ -197,13 +216,13 @@ err: * Find the next backpointer >= *bp_offset: */ int bch2_get_next_backpointer(struct btree_trans *trans, + struct bch_dev *ca, struct bpos bucket, int gen, struct bpos *bp_pos, struct bch_backpointer *bp, unsigned iter_flags) { - struct bch_fs *c = trans->c; - struct bpos bp_end_pos = bucket_pos_to_bp(c, bpos_nosnap_successor(bucket), 0); + struct bpos bp_end_pos = bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket), 0); struct btree_iter alloc_iter = { NULL }, bp_iter = { NULL }; struct bkey_s_c k; int ret = 0; @@ -213,7 +232,7 @@ int bch2_get_next_backpointer(struct btree_trans *trans, if (gen >= 0) { k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, - bucket, BTREE_ITER_CACHED|iter_flags); + bucket, BTREE_ITER_cached|iter_flags); ret = bkey_err(k); if (ret) goto out; @@ -223,7 +242,7 @@ int bch2_get_next_backpointer(struct btree_trans *trans, goto done; } - *bp_pos = bpos_max(*bp_pos, bucket_pos_to_bp(c, bucket, 0)); + *bp_pos = bpos_max(*bp_pos, bucket_pos_to_bp(ca, bucket, 0)); for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers, *bp_pos, iter_flags, k, ret) { @@ -249,7 +268,6 @@ static void backpointer_not_found(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; - struct bpos bucket = 
bp_pos_to_bucket(c, bp_pos); /* * If we're using the btree write buffer, the backpointer we were @@ -259,6 +277,10 @@ static void backpointer_not_found(struct btree_trans *trans, if (likely(!bch2_backpointers_no_use_write_buffer)) return; + struct bpos bucket; + if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket)) + return; + prt_printf(&buf, "backpointer doesn't match %s it points to:\n ", bp.level ? "btree node" : "extent"); prt_printf(&buf, "bucket: "); @@ -288,15 +310,17 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, { if (likely(!bp.level)) { struct bch_fs *c = trans->c; - struct bpos bucket = bp_pos_to_bucket(c, bp_pos); - struct bkey_s_c k; + + struct bpos bucket; + if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket)) + return bkey_s_c_err(-EIO); bch2_trans_node_iter_init(trans, iter, bp.btree_id, bp.pos, 0, 0, iter_flags); - k = bch2_btree_iter_peek_slot(iter); + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); if (bkey_err(k)) { bch2_trans_iter_exit(trans, iter); return k; @@ -325,18 +349,20 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans, struct bch_backpointer bp) { struct bch_fs *c = trans->c; - struct bpos bucket = bp_pos_to_bucket(c, bp_pos); - struct btree *b; BUG_ON(!bp.level); + struct bpos bucket; + if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket)) + return ERR_PTR(-EIO); + bch2_trans_node_iter_init(trans, iter, bp.btree_id, bp.pos, 0, bp.level - 1, 0); - b = bch2_btree_iter_peek_node(iter); + struct btree *b = bch2_btree_iter_peek_node(iter); if (IS_ERR_OR_NULL(b)) goto err; @@ -367,16 +393,16 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ struct printbuf buf = PRINTBUF; int ret = 0; - if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c, - backpointer_to_missing_device, - "backpointer for missing device:\n%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch2_btree_delete_at(trans, bp_iter, 0); + struct bpos bucket; + if 
(!bp_pos_to_bucket_nodev_noerror(c, k.k->p, &bucket)) { + if (fsck_err(c, backpointer_to_missing_device, + "backpointer for missing device:\n%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = bch2_btree_delete_at(trans, bp_iter, 0); goto out; } - alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, - bp_pos_to_bucket(c, k.k->p), 0); + alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, bucket, 0); ret = bkey_err(alloc_k); if (ret) goto out; @@ -408,13 +434,6 @@ int bch2_check_btree_backpointers(struct bch_fs *c) return ret; } -static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r) -{ - return bpos_eq(l.k->p, r.k->p) && - bkey_bytes(l.k) == bkey_bytes(r.k) && - !memcmp(l.v, r.v, bkey_val_bytes(l.k)); -} - struct extents_to_bp_state { struct bpos bucket_start; struct bpos bucket_end; @@ -460,8 +479,8 @@ found: bytes = p.crc.compressed_size << 9; - struct bch_dev *ca = bch_dev_bkey_exists(c, dev); - if (!bch2_dev_get_ioref(ca, READ)) + struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ); + if (!ca) return false; data_buf = kvmalloc(bytes, GFP_KERNEL); @@ -510,26 +529,25 @@ static int check_bp_exists(struct btree_trans *trans, struct btree_iter other_extent_iter = {}; struct printbuf buf = PRINTBUF; struct bkey_s_c bp_k; - struct bkey_buf tmp; - int ret; - - bch2_bkey_buf_init(&tmp); + int ret = 0; - if (!bch2_dev_bucket_exists(c, bucket)) { + struct bch_dev *ca = bch2_dev_bucket_tryget(c, bucket); + if (!ca) { prt_str(&buf, "extent for nonexistent device:bucket "); bch2_bpos_to_text(&buf, bucket); prt_str(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, orig_k); bch_err(c, "%s", buf.buf); - return -BCH_ERR_fsck_repair_unimplemented; + ret = -BCH_ERR_fsck_repair_unimplemented; + goto err; } if (bpos_lt(bucket, s->bucket_start) || bpos_gt(bucket, s->bucket_end)) - return 0; + goto out; bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, - bucket_pos_to_bp(c, bucket, bp.bucket_offset), + 
bucket_pos_to_bp(ca, bucket, bp.bucket_offset), 0); ret = bkey_err(bp_k); if (ret) @@ -537,22 +555,9 @@ static int check_bp_exists(struct btree_trans *trans, if (bp_k.k->type != KEY_TYPE_backpointer || memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) { - bch2_bkey_buf_reassemble(&tmp, c, orig_k); - - if (!bkey_and_val_eq(orig_k, bkey_i_to_s_c(s->last_flushed.k))) { - if (bp.level) { - bch2_trans_unlock(trans); - bch2_btree_interior_updates_flush(c); - } - - ret = bch2_btree_write_buffer_flush_sync(trans); - if (ret) - goto err; - - bch2_bkey_buf_copy(&s->last_flushed, c, tmp.k); - ret = -BCH_ERR_transaction_restart_write_buffer_flush; - goto out; - } + ret = bch2_btree_write_buffer_maybe_flush(trans, orig_k, &s->last_flushed); + if (ret) + goto err; goto check_existing_bp; } @@ -561,7 +566,7 @@ err: fsck_err: bch2_trans_iter_exit(trans, &other_extent_iter); bch2_trans_iter_exit(trans, &bp_iter); - bch2_bkey_buf_exit(&tmp, c); + bch2_dev_put(ca); printbuf_exit(&buf); return ret; check_existing_bp: @@ -637,13 +642,13 @@ missing: struct bkey_i_backpointer n_bp_k; bkey_backpointer_init(&n_bp_k.k_i); - n_bp_k.k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset); + n_bp_k.k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset); n_bp_k.v = bp; prt_printf(&buf, "\n want: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&n_bp_k.k_i)); if (fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf)) - ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true); + ret = bch2_bucket_backpointer_mod(trans, ca, bucket, bp, orig_k, true); goto out; } @@ -661,13 +666,20 @@ static int check_extent_to_backpointers(struct btree_trans *trans, ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bpos bucket_pos; + struct bpos bucket_pos = POS_MIN; struct bch_backpointer bp; if (p.ptr.cached) continue; - bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bucket_pos, &bp); + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, 
p.ptr.dev); + if (ca) + bch2_extent_ptr_to_bp(c, ca, btree, level, k, p, entry, &bucket_pos, &bp); + rcu_read_unlock(); + + if (!ca) + continue; ret = check_bp_exists(trans, s, bucket_pos, bp, k); if (ret) @@ -758,9 +770,11 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, !((1U << btree) & btree_interior_mask)) continue; + bch2_trans_begin(trans); + __for_each_btree_node(trans, iter, btree, btree == start.btree ? start.pos : POS_MIN, - 0, depth, BTREE_ITER_PREFETCH, b, ret) { + 0, depth, BTREE_ITER_prefetch, b, ret) { mem_may_pin -= btree_buf_bytes(b); if (mem_may_pin <= 0) { c->btree_cache.pinned_nodes_end = *end = @@ -794,31 +808,13 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, while (level >= depth) { struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, - level, - BTREE_ITER_PREFETCH); - while (1) { - bch2_trans_begin(trans); - - struct bkey_s_c k = bch2_btree_iter_peek(&iter); - if (!k.k) - break; - ret = bkey_err(k) ?: - check_extent_to_backpointers(trans, s, btree_id, level, k) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - ret = 0; - continue; - } - if (ret) - break; - if (bpos_eq(iter.pos, SPOS_MAX)) - break; - bch2_btree_iter_advance(&iter); - } - bch2_trans_iter_exit(trans, &iter); + bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, level, + BTREE_ITER_prefetch); + ret = for_each_btree_key_continue(trans, iter, 0, k, ({ + check_extent_to_backpointers(trans, s, btree_id, level, k) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + })); if (ret) return ret; @@ -887,7 +883,7 @@ static int check_one_backpointer(struct btree_trans *trans, struct bbpos start, struct bbpos end, struct bkey_s_c_backpointer bp, - struct bpos *last_flushed_pos) + struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; struct btree_iter iter; @@ -907,20 +903,18 @@ 
static int check_one_backpointer(struct btree_trans *trans, if (ret) return ret; - if (!k.k && !bpos_eq(*last_flushed_pos, bp.k->p)) { - *last_flushed_pos = bp.k->p; - ret = bch2_btree_write_buffer_flush_sync(trans) ?: - -BCH_ERR_transaction_restart_write_buffer_flush; - goto out; - } + if (!k.k) { + ret = bch2_btree_write_buffer_maybe_flush(trans, bp.s_c, last_flushed); + if (ret) + goto out; - if (fsck_err_on(!k.k, c, - backpointer_to_missing_ptr, - "backpointer for missing %s\n %s", - bp.v->level ? "btree node" : "extent", - (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) { - ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p); - goto out; + if (fsck_err(c, backpointer_to_missing_ptr, + "backpointer for missing %s\n %s", + bp.v->level ? "btree node" : "extent", + (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) { + ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p); + goto out; + } } out: fsck_err: @@ -933,14 +927,20 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, struct bbpos start, struct bbpos end) { - struct bpos last_flushed_pos = SPOS_MAX; + struct bkey_buf last_flushed; - return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, - POS_MIN, BTREE_ITER_PREFETCH, k, + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + + int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, + POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_one_backpointer(trans, start, end, bkey_s_c_to_backpointer(k), - &last_flushed_pos)); + &last_flushed)); + + bch2_bkey_buf_exit(&last_flushed, trans->c); + return ret; } int bch2_check_backpointers_to_extents(struct bch_fs *c) diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index c1b274eadd..6021de1c5e 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -6,6 +6,7 @@ #include "btree_iter.h" #include "btree_update.h" #include 
"buckets.h" +#include "error.h" #include "super.h" static inline u64 swab40(u64 x) @@ -18,7 +19,7 @@ static inline u64 swab40(u64 x) } int bch2_backpointer_invalid(struct bch_fs *, struct bkey_s_c k, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *); void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_backpointer_swab(struct bkey_s); @@ -36,15 +37,29 @@ void bch2_backpointer_swab(struct bkey_s); * Convert from pos in backpointer btree to pos of corresponding bucket in alloc * btree: */ -static inline struct bpos bp_pos_to_bucket(const struct bch_fs *c, - struct bpos bp_pos) +static inline struct bpos bp_pos_to_bucket(const struct bch_dev *ca, struct bpos bp_pos) { - struct bch_dev *ca = bch_dev_bkey_exists(c, bp_pos.inode); u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector)); } +static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket) +{ + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, bp_pos.inode); + if (ca) + *bucket = bp_pos_to_bucket(ca, bp_pos); + rcu_read_unlock(); + return ca != NULL; +} + +static inline bool bp_pos_to_bucket_nodev(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket) +{ + return !bch2_fs_inconsistent_on(!bp_pos_to_bucket_nodev_noerror(c, bp_pos, bucket), + c, "backpointer for missing device %llu", bp_pos.inode); +} + static inline struct bpos bucket_pos_to_bp_noerror(const struct bch_dev *ca, struct bpos bucket, u64 bucket_offset) @@ -57,32 +72,32 @@ static inline struct bpos bucket_pos_to_bp_noerror(const struct bch_dev *ca, /* * Convert from pos in alloc btree + bucket offset to pos in backpointer btree: */ -static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c, +static inline struct bpos bucket_pos_to_bp(const 
struct bch_dev *ca, struct bpos bucket, u64 bucket_offset) { - struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); struct bpos ret = bucket_pos_to_bp_noerror(ca, bucket, bucket_offset); - EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(c, ret))); + EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(ca, ret))); return ret; } -int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bpos bucket, - struct bch_backpointer, struct bkey_s_c, bool); +int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bch_dev *, + struct bpos bucket, struct bch_backpointer, struct bkey_s_c, bool); static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, + struct bch_dev *ca, struct bpos bucket, struct bch_backpointer bp, struct bkey_s_c orig_k, bool insert) { if (unlikely(bch2_backpointers_no_use_write_buffer)) - return bch2_bucket_backpointer_mod_nowritebuffer(trans, bucket, bp, orig_k, insert); + return bch2_bucket_backpointer_mod_nowritebuffer(trans, ca, bucket, bp, orig_k, insert); struct bkey_i_backpointer bp_k; bkey_backpointer_init(&bp_k.k_i); - bp_k.k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset); + bp_k.k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset); bp_k.v = bp; if (!insert) { @@ -120,7 +135,7 @@ static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k, } } -static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, +static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca, enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct extent_ptr_decoded p, const union bch_extent_entry *entry, @@ -130,7 +145,7 @@ static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, s64 sectors = level ? 
btree_sectors(c) : k.k->size; u32 bucket_offset; - *bucket_pos = PTR_BUCKET_POS_OFFSET(c, &p.ptr, &bucket_offset); + *bucket_pos = PTR_BUCKET_POS_OFFSET(ca, &p.ptr, &bucket_offset); *bp = (struct bch_backpointer) { .btree_id = btree_id, .level = level, @@ -142,7 +157,7 @@ static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, }; } -int bch2_get_next_backpointer(struct btree_trans *, struct bpos, int, +int bch2_get_next_backpointer(struct btree_trans *, struct bch_dev *ca, struct bpos, int, struct bpos *, struct bch_backpointer *, unsigned); struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *, struct bpos, struct bch_backpointer, diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index e6bcf74b0d..1106fec6e1 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -359,6 +359,8 @@ do { \ #define BCH_DEBUG_PARAMS_ALWAYS() \ BCH_DEBUG_PARAM(key_merging_disabled, \ "Disables merging of extents") \ + BCH_DEBUG_PARAM(btree_node_merging_disabled, \ + "Disables merging of btree nodes") \ BCH_DEBUG_PARAM(btree_gc_always_rewrite, \ "Causes mark and sweep to compact and rewrite every " \ "btree node it traverses") \ @@ -469,6 +471,7 @@ enum bch_time_stats { #include "quota_types.h" #include "rebalance_types.h" #include "replicas_types.h" +#include "sb-members_types.h" #include "subvolume_types.h" #include "super_types.h" #include "thread_with_file_types.h" @@ -490,9 +493,20 @@ struct io_count { u64 sectors[2][BCH_DATA_NR]; }; +struct discard_in_flight { + bool in_progress:1; + u64 bucket:63; +}; + struct bch_dev { struct kobject kobj; +#ifdef CONFIG_BCACHEFS_DEBUG + atomic_long_t ref; + bool dying; + unsigned long last_put; +#else struct percpu_ref ref; +#endif struct completion ref_completion; struct percpu_ref io_ref; struct completion io_ref_completion; @@ -518,14 +532,11 @@ struct bch_dev { struct bch_devs_mask self; - /* biosets used in cloned bios for writing multiple replicas */ - struct bio_set replica_set; - /* 
* Buckets: * Per-bucket arrays are protected by c->mark_lock, bucket_lock and * gc_lock, for device resize - holding any is sufficient for access: - * Or rcu_read_lock(), but only for ptr_stale(): + * Or rcu_read_lock(), but only for dev_ptr_stale(): */ struct bucket_array __rcu *buckets_gc; struct bucket_gens __rcu *bucket_gens; @@ -539,7 +550,7 @@ struct bch_dev { /* Allocator: */ u64 new_fs_bucket_idx; - u64 alloc_cursor; + u64 alloc_cursor[3]; unsigned nr_open_buckets; unsigned nr_btree_reserve; @@ -548,6 +559,12 @@ struct bch_dev { size_t inc_gen_really_needs_gc; size_t buckets_waiting_on_journal; + struct work_struct invalidate_work; + struct work_struct discard_work; + struct mutex discard_buckets_in_flight_lock; + DARRAY(struct discard_in_flight) discard_buckets_in_flight; + struct work_struct discard_fast_work; + atomic64_t rebalance_work; struct journal_device journal; @@ -585,12 +602,12 @@ struct bch_dev { x(clean_shutdown) \ x(fsck_running) \ x(initial_gc_unfixed) \ - x(need_another_gc) \ x(need_delete_dead_snapshots) \ x(error) \ x(topology_error) \ x(errors_fixed) \ - x(errors_not_fixed) + x(errors_not_fixed) \ + x(no_invalid_checks) enum bch_fs_flags { #define x(n) BCH_FS_##n, @@ -673,6 +690,7 @@ struct btree_trans_buf { x(discard_fast) \ x(invalidate) \ x(delete_dead_snapshots) \ + x(gc_gens) \ x(snapshot_delete_pagecache) \ x(sysfs) \ x(btree_write_buffer) @@ -783,7 +801,8 @@ struct bch_fs { /* BTREE CACHE */ struct bio_set btree_bio; - struct workqueue_struct *io_complete_wq; + struct workqueue_struct *btree_read_complete_wq; + struct workqueue_struct *btree_write_submit_wq; struct btree_root btree_roots_known[BTREE_ID_NR]; DARRAY(struct btree_root) btree_roots_extra; @@ -884,7 +903,6 @@ struct bch_fs { /* JOURNAL SEQ BLACKLIST */ struct journal_seq_blacklist_table * journal_seq_blacklist_table; - struct work_struct journal_seq_blacklist_gc_work; /* ALLOCATOR */ spinlock_t freelist_lock; @@ -908,15 +926,9 @@ struct bch_fs { unsigned 
write_points_nr; struct buckets_waiting_for_journal buckets_waiting_for_journal; - struct work_struct invalidate_work; - struct work_struct discard_work; - struct mutex discard_buckets_in_flight_lock; - DARRAY(struct bpos) discard_buckets_in_flight; - struct work_struct discard_fast_work; /* GARBAGE COLLECTION */ - struct task_struct *gc_thread; - atomic_t kick_gc; + struct work_struct gc_gens_work; unsigned long gc_count; enum btree_id gc_gens_btree; @@ -946,6 +958,7 @@ struct bch_fs { struct bio_set bio_read; struct bio_set bio_read_split; struct bio_set bio_write; + struct bio_set replica_set; struct mutex bio_bounce_pages_lock; mempool_t bio_bounce_pages; struct bucket_nocow_lock_table @@ -1073,7 +1086,6 @@ struct bch_fs { u64 counters_on_mount[BCH_COUNTER_NR]; u64 __percpu *counters; - unsigned btree_gc_periodic:1; unsigned copy_gc_enabled:1; bool promote_whole_extents; @@ -1208,9 +1220,9 @@ static inline s64 bch2_current_time(const struct bch_fs *c) return timespec_to_bch2_time(c, now); } -static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev) +static inline u64 bch2_current_io_time(const struct bch_fs *c, int rw) { - return dev < c->sb.nr_devices && c->devs[dev]; + return max(1ULL, (u64) atomic64_read(&c->io_clock[rw].now) & LRU_TIME_MAX); } static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 2e8b1a489c..e3b1bde489 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -76,6 +76,7 @@ #include <asm/byteorder.h> #include <linux/kernel.h> #include <linux/uuid.h> +#include <uapi/linux/magic.h> #include "vstructs.h" #ifdef __KERNEL__ @@ -475,6 +476,9 @@ struct bch_lru { #define LRU_ID_STRIPES (1U << 16) +#define LRU_TIME_BITS 48 +#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1) + /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -502,16 +506,22 @@ struct bch_sb_field { #include 
"alloc_background_format.h" #include "extents_format.h" -#include "reflink_format.h" #include "ec_format.h" -#include "inode_format.h" #include "dirent_format.h" -#include "xattr_format.h" -#include "quota_format.h" +#include "disk_groups_format.h" +#include "inode_format.h" +#include "journal_seq_blacklist_format.h" #include "logged_ops_format.h" +#include "quota_format.h" +#include "reflink_format.h" +#include "replicas_format.h" #include "snapshot_format.h" #include "subvolume_format.h" #include "sb-counters_format.h" +#include "sb-downgrade_format.h" +#include "sb-errors_format.h" +#include "sb-members_format.h" +#include "xattr_format.h" enum bch_sb_field_type { #define x(f, nr) BCH_SB_FIELD_##f = nr, @@ -544,100 +554,6 @@ struct bch_sb_field_journal_v2 { } d[]; }; -/* BCH_SB_FIELD_members_v1: */ - -#define BCH_MIN_NR_NBUCKETS (1 << 6) - -#define BCH_IOPS_MEASUREMENTS() \ - x(seqread, 0) \ - x(seqwrite, 1) \ - x(randread, 2) \ - x(randwrite, 3) - -enum bch_iops_measurement { -#define x(t, n) BCH_IOPS_##t = n, - BCH_IOPS_MEASUREMENTS() -#undef x - BCH_IOPS_NR -}; - -#define BCH_MEMBER_ERROR_TYPES() \ - x(read, 0) \ - x(write, 1) \ - x(checksum, 2) - -enum bch_member_error_type { -#define x(t, n) BCH_MEMBER_ERROR_##t = n, - BCH_MEMBER_ERROR_TYPES() -#undef x - BCH_MEMBER_ERROR_NR -}; - -struct bch_member { - __uuid_t uuid; - __le64 nbuckets; /* device size */ - __le16 first_bucket; /* index of first bucket used */ - __le16 bucket_size; /* sectors */ - __u8 btree_bitmap_shift; - __u8 pad[3]; - __le64 last_mount; /* time_t */ - - __le64 flags; - __le32 iops[4]; - __le64 errors[BCH_MEMBER_ERROR_NR]; - __le64 errors_at_reset[BCH_MEMBER_ERROR_NR]; - __le64 errors_reset_time; - __le64 seq; - __le64 btree_allocated_bitmap; -}; - -/* - * This limit comes from the bucket_gens array - it's a single allocation, and - * kernel allocation are limited to INT_MAX - */ -#define BCH_MEMBER_NBUCKETS_MAX (INT_MAX - 64) - -#define BCH_MEMBER_V1_BYTES 56 - 
-LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4) -/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */ -LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15) -LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags, 15, 20) -LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28) -LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30) -LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED, - struct bch_member, flags, 30, 31) - -#if 0 -LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); -LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); -#endif - -#define BCH_MEMBER_STATES() \ - x(rw, 0) \ - x(ro, 1) \ - x(failed, 2) \ - x(spare, 3) - -enum bch_member_state { -#define x(t, n) BCH_MEMBER_STATE_##t = n, - BCH_MEMBER_STATES() -#undef x - BCH_MEMBER_STATE_NR -}; - -struct bch_sb_field_members_v1 { - struct bch_sb_field field; - struct bch_member _members[]; //Members are now variable size -}; - -struct bch_sb_field_members_v2 { - struct bch_sb_field field; - __le16 member_bytes; //size of single member entry - u8 pad[6]; - struct bch_member _members[]; -}; - /* BCH_SB_FIELD_crypt: */ struct nonce { @@ -686,8 +602,6 @@ LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16); LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32); LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48); -/* BCH_SB_FIELD_replicas: */ - #define BCH_DATA_TYPES() \ x(free, 0) \ x(sb, 1) \ @@ -730,50 +644,6 @@ static inline bool data_type_is_hidden(enum bch_data_type type) } } -struct bch_replicas_entry_v0 { - __u8 data_type; - __u8 nr_devs; - __u8 devs[]; -} __packed; - -struct bch_sb_field_replicas_v0 { - struct bch_sb_field field; - struct bch_replicas_entry_v0 entries[]; -} __packed __aligned(8); - -struct bch_replicas_entry_v1 { - __u8 data_type; - __u8 nr_devs; - __u8 nr_required; - __u8 devs[]; -} __packed; - -#define 
replicas_entry_bytes(_i) \ - (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) - -struct bch_sb_field_replicas { - struct bch_sb_field field; - struct bch_replicas_entry_v1 entries[]; -} __packed __aligned(8); - -/* BCH_SB_FIELD_disk_groups: */ - -#define BCH_SB_LABEL_SIZE 32 - -struct bch_disk_group { - __u8 label[BCH_SB_LABEL_SIZE]; - __le64 flags[2]; -} __packed __aligned(8); - -LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1) -LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6) -LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24) - -struct bch_sb_field_disk_groups { - struct bch_sb_field field; - struct bch_disk_group entries[]; -} __packed __aligned(8); - /* * On clean shutdown, store btree roots and current journal sequence number in * the superblock: @@ -801,27 +671,6 @@ struct bch_sb_field_clean { __u64 _data[]; }; -struct journal_seq_blacklist_entry { - __le64 start; - __le64 end; -}; - -struct bch_sb_field_journal_seq_blacklist { - struct bch_sb_field field; - struct journal_seq_blacklist_entry start[]; -}; - -struct bch_sb_field_errors { - struct bch_sb_field field; - struct bch_sb_field_error_entry { - __le64 v; - __le64 last_error_time; - } entries[]; -}; - -LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16); -LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64); - struct bch_sb_field_ext { struct bch_sb_field field; __le64 recovery_passes_required[2]; @@ -829,18 +678,6 @@ struct bch_sb_field_ext { __le64 btrees_lost_data; }; -struct bch_sb_field_downgrade_entry { - __le16 version; - __le64 recovery_passes[2]; - __le16 nr_errors; - __le16 errors[] __counted_by(nr_errors); -} __packed __aligned(2); - -struct bch_sb_field_downgrade { - struct bch_sb_field field; - struct bch_sb_field_downgrade_entry entries[]; -}; - /* Superblock: */ /* @@ -901,7 +738,6 @@ unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_re 
#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) #define BCH_SB_SECTOR 8 -#define BCH_SB_MEMBERS_MAX 64 /* XXX kill */ #define BCH_SB_LAYOUT_SIZE_BITS_MAX 16 /* 32 MB */ @@ -1154,8 +990,9 @@ enum bch_version_upgrade_opts { #define BCH_ERROR_ACTIONS() \ x(continue, 0) \ - x(ro, 1) \ - x(panic, 2) + x(fix_safe, 1) \ + x(panic, 2) \ + x(ro, 3) enum bch_error_actions { #define x(t, n) BCH_ON_ERROR_##t = n, @@ -1283,7 +1120,7 @@ enum bch_compression_opts { UUID_INIT(0xc68573f6, 0x66ce, 0x90a9, \ 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef) -#define BCACHEFS_STATFS_MAGIC 0xca451a4e +#define BCACHEFS_STATFS_MAGIC BCACHEFS_SUPER_MAGIC #define JSET_MAGIC __cpu_to_le64(0x245235c1a3625032ULL) #define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL) @@ -1547,6 +1384,13 @@ enum btree_id { BTREE_ID_NR }; +/* + * Maximum number of btrees that we will _ever_ have under the current scheme, + * where we refer to them with 64 bit bitfields - and we also need a bit for + * the interior btree node type: + */ +#define BTREE_ID_NR_MAX 63 + static inline bool btree_id_is_alloc(enum btree_id id) { switch (id) { diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index 76e79a15ba..587d7318a2 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -640,7 +640,7 @@ struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s) int bch2_bkey_format_invalid(struct bch_fs *c, struct bkey_format *f, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { unsigned i, bits = KEY_PACKED_BITS_START; @@ -656,20 +656,18 @@ int bch2_bkey_format_invalid(struct bch_fs *c, * unpacked format: */ for (i = 0; i < f->nr_fields; i++) { - if (!c || c->sb.version_min >= bcachefs_metadata_version_snapshot) { + if ((!c || c->sb.version_min >= bcachefs_metadata_version_snapshot) && + bch2_bkey_format_field_overflows(f, i)) { unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; u64 unpacked_max = ~((~0ULL << 1) << 
(unpacked_bits - 1)); - u64 packed_max = f->bits_per_field[i] - ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1)) + unsigned packed_bits = min(64, f->bits_per_field[i]); + u64 packed_max = packed_bits + ? ~((~0ULL << 1) << (packed_bits - 1)) : 0; - u64 field_offset = le64_to_cpu(f->field_offset[i]); - if (packed_max + field_offset < packed_max || - packed_max + field_offset > unpacked_max) { - prt_printf(err, "field %u too large: %llu + %llu > %llu", - i, packed_max, field_offset, unpacked_max); - return -BCH_ERR_invalid; - } + prt_printf(err, "field %u too large: %llu + %llu > %llu", + i, packed_max, le64_to_cpu(f->field_offset[i]), unpacked_max); + return -BCH_ERR_invalid; } bits += f->bits_per_field[i]; @@ -1067,7 +1065,7 @@ void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k) { const struct bkey_format *f = bkey_packed(k) ? _f : &bch2_bkey_format_current; u8 *l = k->key_start; - u8 *h = (u8 *) (k->_data + f->key_u64s) - 1; + u8 *h = (u8 *) ((u64 *) k->_data + f->key_u64s) - 1; while (l < h) { swap(*l, *h); diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 3a45d128f6..936357149c 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -9,10 +9,10 @@ #include "util.h" #include "vstructs.h" -enum bkey_invalid_flags { - BKEY_INVALID_WRITE = (1U << 0), - BKEY_INVALID_COMMIT = (1U << 1), - BKEY_INVALID_JOURNAL = (1U << 2), +enum bch_validate_flags { + BCH_VALIDATE_write = (1U << 0), + BCH_VALIDATE_commit = (1U << 1), + BCH_VALIDATE_journal = (1U << 2), }; #if 0 @@ -194,6 +194,13 @@ static inline struct bpos bkey_max(struct bpos l, struct bpos r) return bkey_gt(l, r) ? 
l : r; } +static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r) +{ + return bpos_eq(l.k->p, r.k->p) && + bkey_bytes(l.k) == bkey_bytes(r.k) && + !memcmp(l.v, r.v, bkey_val_bytes(l.k)); +} + void bch2_bpos_swab(struct bpos *); void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); @@ -574,8 +581,31 @@ static inline void bch2_bkey_format_add_key(struct bkey_format_state *s, const s void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos); struct bkey_format bch2_bkey_format_done(struct bkey_format_state *); + +static inline bool bch2_bkey_format_field_overflows(struct bkey_format *f, unsigned i) +{ + unsigned f_bits = f->bits_per_field[i]; + unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; + u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1)); + u64 field_offset = le64_to_cpu(f->field_offset[i]); + + if (f_bits > unpacked_bits) + return true; + + if ((f_bits == unpacked_bits) && field_offset) + return true; + + u64 f_mask = f_bits + ? 
~((~0ULL << (f_bits - 1)) << 1) + : 0; + + if (((field_offset + f_mask) & unpacked_mask) < field_offset) + return true; + return false; +} + int bch2_bkey_format_invalid(struct bch_fs *, struct bkey_format *, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_bkey_format_to_text(struct printbuf *, const struct bkey_format *); #endif /* _BCACHEFS_BKEY_H */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index a275a9e8e3..bd32aac051 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -27,7 +27,7 @@ const char * const bch2_bkey_types[] = { }; static int deleted_key_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { return 0; } @@ -41,7 +41,7 @@ static int deleted_key_invalid(struct bch_fs *c, struct bkey_s_c k, }) static int empty_val_key_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { int ret = 0; @@ -58,7 +58,7 @@ fsck_err: }) static int key_type_cookie_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { return 0; } @@ -82,7 +82,7 @@ static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c, }) static int key_type_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { return 0; } @@ -123,9 +123,12 @@ const struct bkey_ops bch2_bkey_null_ops = { }; int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { + if (test_bit(BCH_FS_no_invalid_checks, &c->flags)) + return 0; + const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); int 
ret = 0; @@ -159,9 +162,12 @@ const char *bch2_btree_node_type_str(enum btree_node_type type) int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, enum btree_node_type type, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { + if (test_bit(BCH_FS_no_invalid_checks, &c->flags)) + return 0; + int ret = 0; bkey_fsck_err_on(k.k->u64s < BKEY_U64s, c, err, @@ -172,7 +178,7 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, return 0; bkey_fsck_err_on(k.k->type < KEY_TYPE_MAX && - (type == BKEY_TYPE_btree || (flags & BKEY_INVALID_COMMIT)) && + (type == BKEY_TYPE_btree || (flags & BCH_VALIDATE_commit)) && !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, err, bkey_invalid_type_for_btree, "invalid key type for btree %s (%s)", @@ -224,7 +230,7 @@ fsck_err: int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, enum btree_node_type type, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { return __bch2_bkey_invalid(c, k, type, flags, err) ?: @@ -392,8 +398,12 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, for (i = 0; i < nr_compat; i++) switch (!write ? 
i : nr_compat - 1 - i) { case 0: - if (big_endian != CPU_BIG_ENDIAN) + if (big_endian != CPU_BIG_ENDIAN) { + bch2_bkey_swab_key(f, k); + } else if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { bch2_bkey_swab_key(f, k); + bch2_bkey_swab_key(f, k); + } break; case 1: if (version < bcachefs_metadata_version_bkey_renumber) diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 03efe8ee56..baef0722f5 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -22,14 +22,15 @@ extern const struct bkey_ops bch2_bkey_null_ops; */ struct bkey_ops { int (*key_invalid)(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, struct printbuf *err); + enum bch_validate_flags flags, struct printbuf *err); void (*val_to_text)(struct printbuf *, struct bch_fs *, struct bkey_s_c); void (*swab)(struct bkey_s); bool (*key_normalize)(struct bch_fs *, struct bkey_s); bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c); int (*trigger)(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); void (*compat)(enum btree_id id, unsigned version, unsigned big_endian, int write, struct bkey_s); @@ -48,11 +49,11 @@ static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type) } int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *, struct bkey_s_c, struct printbuf *); @@ -76,56 +77,10 @@ static inline bool 
bch2_bkey_maybe_mergable(const struct bkey *l, const struct b bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); -enum btree_update_flags { - __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE = __BTREE_ITER_FLAGS_END, - __BTREE_UPDATE_NOJOURNAL, - __BTREE_UPDATE_KEY_CACHE_RECLAIM, - - __BTREE_TRIGGER_NORUN, - __BTREE_TRIGGER_TRANSACTIONAL, - __BTREE_TRIGGER_ATOMIC, - __BTREE_TRIGGER_GC, - __BTREE_TRIGGER_INSERT, - __BTREE_TRIGGER_OVERWRITE, - __BTREE_TRIGGER_BUCKET_INVALIDATE, -}; - -#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) -#define BTREE_UPDATE_NOJOURNAL (1U << __BTREE_UPDATE_NOJOURNAL) -#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM) - -/* Don't run triggers at all */ -#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) - -/* - * If set, we're running transactional triggers as part of a transaction commit: - * triggers may generate new updates - * - * If cleared, and either BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE are set, - * we're running atomic triggers during a transaction commit: we have our - * journal reservation, we're holding btree node write locks, and we know the - * transaction is going to commit (returning an error here is a fatal error, - * causing us to go emergency read-only) - */ -#define BTREE_TRIGGER_TRANSACTIONAL (1U << __BTREE_TRIGGER_TRANSACTIONAL) -#define BTREE_TRIGGER_ATOMIC (1U << __BTREE_TRIGGER_ATOMIC) - -/* We're in gc/fsck: running triggers to recalculate e.g. 
disk usage */ -#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) - -/* @new is entering the btree */ -#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) - -/* @old is leaving the btree */ -#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) - -/* signal from bucket invalidate path to alloc trigger */ -#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) - static inline int bch2_key_trigger(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type); @@ -135,8 +90,9 @@ static inline int bch2_key_trigger(struct btree_trans *trans, } static inline int bch2_key_trigger_old(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, unsigned flags) + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, + enum btree_iter_update_trigger_flags flags) { struct bkey_i deleted; @@ -144,12 +100,13 @@ static inline int bch2_key_trigger_old(struct btree_trans *trans, deleted.k.p = old.k->p; return bch2_key_trigger(trans, btree_id, level, old, bkey_i_to_s(&deleted), - BTREE_TRIGGER_OVERWRITE|flags); + BTREE_TRIGGER_overwrite|flags); } static inline int bch2_key_trigger_new(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s new, unsigned flags) + enum btree_id btree_id, unsigned level, + struct bkey_s new, + enum btree_iter_update_trigger_flags flags) { struct bkey_i deleted; @@ -157,7 +114,7 @@ static inline int bch2_key_trigger_new(struct btree_trans *trans, deleted.k.p = new.k->p; return bch2_key_trigger(trans, btree_id, level, bkey_i_to_s_c(&deleted), new, - BTREE_TRIGGER_INSERT|flags); + BTREE_TRIGGER_insert|flags); } void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); @@ -172,7 +129,8 @@ static inline void bch2_bkey_compat(unsigned level, enum 
btree_id btree_id, struct bkey_packed *k) { if (version < bcachefs_metadata_version_current || - big_endian != CPU_BIG_ENDIAN) + big_endian != CPU_BIG_ENDIAN || + IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) __bch2_bkey_compat(level, btree_id, version, big_endian, write, f, k); diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index bcca9e76a0..4536eb50fc 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -6,9 +6,9 @@ #include "bset.h" #include "extents.h" -typedef int (*sort_cmp_fn)(struct btree *, - struct bkey_packed *, - struct bkey_packed *); +typedef int (*sort_cmp_fn)(const struct btree *, + const struct bkey_packed *, + const struct bkey_packed *); static inline bool sort_iter_end(struct sort_iter *iter) { @@ -70,9 +70,9 @@ static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter, /* * If keys compare equal, compare by pointer order: */ -static inline int key_sort_fix_overlapping_cmp(struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) +static inline int key_sort_fix_overlapping_cmp(const struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r) { return bch2_bkey_cmp_packed(b, l, r) ?: cmp_int((unsigned long) l, (unsigned long) r); @@ -154,46 +154,59 @@ bch2_sort_repack(struct bset *dst, struct btree *src, return nr; } -static inline int sort_keys_cmp(struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) +static inline int keep_unwritten_whiteouts_cmp(const struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r) { return bch2_bkey_cmp_packed_inlined(b, l, r) ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: - (int) l->needs_whiteout - (int) r->needs_whiteout; + (long) l - (long) r; } -unsigned bch2_sort_keys(struct bkey_packed *dst, - struct sort_iter *iter, - bool filter_whiteouts) +#include "btree_update_interior.h" + +/* + * For sorting in the btree node write path: whiteouts not in the unwritten + * whiteouts area are dropped, whiteouts in the 
unwritten whiteouts area are + * dropped if overwritten by real keys: + */ +unsigned bch2_sort_keys_keep_unwritten_whiteouts(struct bkey_packed *dst, struct sort_iter *iter) { - const struct bkey_format *f = &iter->b->format; struct bkey_packed *in, *next, *out = dst; - sort_iter_sort(iter, sort_keys_cmp); + sort_iter_sort(iter, keep_unwritten_whiteouts_cmp); - while ((in = sort_iter_next(iter, sort_keys_cmp))) { - bool needs_whiteout = false; + while ((in = sort_iter_next(iter, keep_unwritten_whiteouts_cmp))) { + if (bkey_deleted(in) && in < unwritten_whiteouts_start(iter->b)) + continue; - if (bkey_deleted(in) && - (filter_whiteouts || !in->needs_whiteout)) + if ((next = sort_iter_peek(iter)) && + !bch2_bkey_cmp_packed_inlined(iter->b, in, next)) continue; - while ((next = sort_iter_peek(iter)) && - !bch2_bkey_cmp_packed_inlined(iter->b, in, next)) { - BUG_ON(in->needs_whiteout && - next->needs_whiteout); - needs_whiteout |= in->needs_whiteout; - in = sort_iter_next(iter, sort_keys_cmp); - } + bkey_p_copy(out, in); + out = bkey_p_next(out); + } - if (bkey_deleted(in)) { - memcpy_u64s_small(out, in, bkeyp_key_u64s(f, in)); - set_bkeyp_val_u64s(f, out, 0); - } else { - bkey_p_copy(out, in); - } - out->needs_whiteout |= needs_whiteout; + return (u64 *) out - (u64 *) dst; +} + +/* + * Main sort routine for compacting a btree node in memory: we always drop + * whiteouts because any whiteouts that need to be written are in the unwritten + * whiteouts area: + */ +unsigned bch2_sort_keys(struct bkey_packed *dst, struct sort_iter *iter) +{ + struct bkey_packed *in, *out = dst; + + sort_iter_sort(iter, bch2_bkey_cmp_packed_inlined); + + while ((in = sort_iter_next(iter, bch2_bkey_cmp_packed_inlined))) { + if (bkey_deleted(in)) + continue; + + bkey_p_copy(out, in); out = bkey_p_next(out); } diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h index 7c0f0b160f..9be969d468 100644 --- a/fs/bcachefs/bkey_sort.h +++ b/fs/bcachefs/bkey_sort.h @@ -48,7 +48,7 @@ 
bch2_sort_repack(struct bset *, struct btree *, struct btree_node_iter *, struct bkey_format *, bool); -unsigned bch2_sort_keys(struct bkey_packed *, - struct sort_iter *, bool); +unsigned bch2_sort_keys_keep_unwritten_whiteouts(struct bkey_packed *, struct sort_iter *); +unsigned bch2_sort_keys(struct bkey_packed *, struct sort_iter *); #endif /* _BCACHEFS_BKEY_SORT_H */ diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 3bb477840e..575e1d0b6e 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -103,8 +103,6 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b, void bch2_dump_btree_node(struct bch_fs *c, struct btree *b) { - struct bset_tree *t; - console_lock(); for_each_bset(b, t) bch2_dump_bset(c, b, bset(b, t), t - b->set); @@ -136,7 +134,6 @@ void bch2_dump_btree_node_iter(struct btree *b, struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b) { - struct bset_tree *t; struct bkey_packed *k; struct btree_nr_keys nr = {}; @@ -198,7 +195,6 @@ void bch2_btree_node_iter_verify(struct btree_node_iter *iter, { struct btree_node_iter_set *set, *s2; struct bkey_packed *k, *p; - struct bset_tree *t; if (bch2_btree_node_iter_end(iter)) return; @@ -213,12 +209,14 @@ void bch2_btree_node_iter_verify(struct btree_node_iter *iter, /* Verify that set->end is correct: */ btree_node_iter_for_each(iter, set) { for_each_bset(b, t) - if (set->end == t->end_offset) + if (set->end == t->end_offset) { + BUG_ON(set->k < btree_bkey_first_offset(t) || + set->k >= t->end_offset); goto found; + } BUG(); found: - BUG_ON(set->k < btree_bkey_first_offset(t) || - set->k >= t->end_offset); + do {} while (0); } /* Verify iterator is sorted: */ @@ -377,11 +375,9 @@ static struct bkey_float *bkey_float(const struct btree *b, return ro_aux_tree_base(b, t)->f + idx; } -static void bset_aux_tree_verify(const struct btree *b) +static void bset_aux_tree_verify(struct btree *b) { #ifdef CONFIG_BCACHEFS_DEBUG - const struct bset_tree *t; - for_each_bset(b, t) { if 
(t->aux_data_offset == U16_MAX) continue; @@ -685,20 +681,20 @@ static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t, } /* bytes remaining - only valid for last bset: */ -static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t) +static unsigned __bset_tree_capacity(struct btree *b, const struct bset_tree *t) { bset_aux_tree_verify(b); return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64); } -static unsigned bset_ro_tree_capacity(const struct btree *b, const struct bset_tree *t) +static unsigned bset_ro_tree_capacity(struct btree *b, const struct bset_tree *t) { return __bset_tree_capacity(b, t) / (sizeof(struct bkey_float) + sizeof(u8)); } -static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_tree *t) +static unsigned bset_rw_tree_capacity(struct btree *b, const struct bset_tree *t) { return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree); } @@ -1374,8 +1370,6 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter, void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter, struct btree *b) { - struct bset_tree *t; - memset(iter, 0, sizeof(*iter)); for_each_bset(b, t) @@ -1481,7 +1475,6 @@ struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, { struct bkey_packed *k, *prev = NULL; struct btree_node_iter_set *set; - struct bset_tree *t; unsigned end = 0; if (bch2_expensive_debug_checks) @@ -1550,9 +1543,7 @@ struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter, void bch2_btree_keys_stats(const struct btree *b, struct bset_stats *stats) { - const struct bset_tree *t; - - for_each_bset(b, t) { + for_each_bset_c(b, t) { enum bset_aux_tree_type type = bset_aux_tree_type(t); size_t j; diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 120a79fd45..5c6c7a14fa 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -206,7 +206,10 @@ static inline size_t btree_aux_data_u64s(const struct btree 
*b) } #define for_each_bset(_b, _t) \ - for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) + for (struct bset_tree *_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) + +#define for_each_bset_c(_b, _t) \ + for (const struct bset_tree *_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) #define bset_tree_for_each_key(_b, _t, _k) \ for (_k = btree_bkey_first(_b, _t); \ @@ -294,7 +297,6 @@ static inline struct bset_tree * bch2_bkey_to_bset_inlined(struct btree *b, struct bkey_packed *k) { unsigned offset = __btree_node_key_to_offset(b, k); - struct bset_tree *t; for_each_bset(b, t) if (offset <= t->end_offset) { diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 02c70e813f..4f5e411771 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -16,6 +16,12 @@ #include <linux/prefetch.h> #include <linux/sched/mm.h> +#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \ +do { \ + if (shrinker_counter) \ + bc->not_freed_##counter++; \ +} while (0) + const char * const bch2_btree_node_flags[] = { #define x(f) #f, BTREE_FLAGS() @@ -85,10 +91,11 @@ static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, } static const struct rhashtable_params bch_btree_cache_params = { - .head_offset = offsetof(struct btree, hash), - .key_offset = offsetof(struct btree, hash_val), - .key_len = sizeof(u64), - .obj_cmpfn = bch2_btree_cache_cmp_fn, + .head_offset = offsetof(struct btree, hash), + .key_offset = offsetof(struct btree, hash_val), + .key_len = sizeof(u64), + .obj_cmpfn = bch2_btree_cache_cmp_fn, + .automatic_shrinking = true, }; static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) @@ -162,6 +169,9 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) /* Cause future lookups for this node to fail: */ b->hash_val = 0; + + if (b->c.btree_id < BTREE_ID_NR) + --bc->used_by_btree[b->c.btree_id]; } int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) @@ -169,8 
+179,11 @@ int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) BUG_ON(b->hash_val); b->hash_val = btree_ptr_hash_val(&b->key); - return rhashtable_lookup_insert_fast(&bc->table, &b->hash, - bch_btree_cache_params); + int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash, + bch_btree_cache_params); + if (!ret && b->c.btree_id < BTREE_ID_NR) + bc->used_by_btree[b->c.btree_id]++; + return ret; } int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, @@ -190,6 +203,35 @@ int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, return ret; } +void bch2_btree_node_update_key_early(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_s_c old, struct bkey_i *new) +{ + struct bch_fs *c = trans->c; + struct btree *b; + struct bkey_buf tmp; + int ret; + + bch2_bkey_buf_init(&tmp); + bch2_bkey_buf_reassemble(&tmp, c, old); + + b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true); + if (!IS_ERR_OR_NULL(b)) { + mutex_lock(&c->btree_cache.lock); + + bch2_btree_node_hash_remove(&c->btree_cache, b); + + bkey_copy(&b->key, new); + ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); + BUG_ON(ret); + + mutex_unlock(&c->btree_cache.lock); + six_unlock_read(&b->c.lock); + } + + bch2_bkey_buf_exit(&tmp, c); +} + __flatten static inline struct btree *btree_cache_find(struct btree_cache *bc, const struct bkey_i *k) @@ -203,7 +245,7 @@ static inline struct btree *btree_cache_find(struct btree_cache *bc, * this version is for btree nodes that have already been freed (we're not * reaping a real btree node) */ -static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) +static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter) { struct btree_cache *bc = &c->btree_cache; int ret = 0; @@ -225,38 +267,64 @@ wait_on_io: if (b->flags & ((1U << BTREE_NODE_dirty)| (1U << BTREE_NODE_read_in_flight)| (1U << 
BTREE_NODE_write_in_flight))) { - if (!flush) + if (!flush) { + if (btree_node_dirty(b)) + BTREE_CACHE_NOT_FREED_INCREMENT(dirty); + else if (btree_node_read_in_flight(b)) + BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight); + else if (btree_node_write_in_flight(b)) + BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight); return -BCH_ERR_ENOMEM_btree_node_reclaim; + } /* XXX: waiting on IO with btree cache lock held */ bch2_btree_node_wait_on_read(b); bch2_btree_node_wait_on_write(b); } - if (!six_trylock_intent(&b->c.lock)) + if (!six_trylock_intent(&b->c.lock)) { + BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent); return -BCH_ERR_ENOMEM_btree_node_reclaim; + } - if (!six_trylock_write(&b->c.lock)) + if (!six_trylock_write(&b->c.lock)) { + BTREE_CACHE_NOT_FREED_INCREMENT(lock_write); goto out_unlock_intent; + } /* recheck under lock */ if (b->flags & ((1U << BTREE_NODE_read_in_flight)| (1U << BTREE_NODE_write_in_flight))) { - if (!flush) + if (!flush) { + if (btree_node_read_in_flight(b)) + BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight); + else if (btree_node_write_in_flight(b)) + BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight); goto out_unlock; + } six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); goto wait_on_io; } - if (btree_node_noevict(b) || - btree_node_write_blocked(b) || - btree_node_will_make_reachable(b)) + if (btree_node_noevict(b)) { + BTREE_CACHE_NOT_FREED_INCREMENT(noevict); + goto out_unlock; + } + if (btree_node_write_blocked(b)) { + BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked); goto out_unlock; + } + if (btree_node_will_make_reachable(b)) { + BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable); + goto out_unlock; + } if (btree_node_dirty(b)) { - if (!flush) + if (!flush) { + BTREE_CACHE_NOT_FREED_INCREMENT(dirty); goto out_unlock; + } /* * Using the underscore version because we don't want to compact * bsets after the write, since this node is about to be evicted @@ -286,14 +354,14 @@ out_unlock_intent: goto out; } -static int 
btree_node_reclaim(struct bch_fs *c, struct btree *b) +static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter) { - return __btree_node_reclaim(c, b, false); + return __btree_node_reclaim(c, b, false, shrinker_counter); } static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) { - return __btree_node_reclaim(c, b, true); + return __btree_node_reclaim(c, b, true, false); } static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, @@ -341,11 +409,12 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, if (touched >= nr) goto out; - if (!btree_node_reclaim(c, b)) { + if (!btree_node_reclaim(c, b, true)) { btree_node_data_free(c, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); freed++; + bc->freed++; } } restart: @@ -354,9 +423,11 @@ restart: if (btree_node_accessed(b)) { clear_btree_node_accessed(b); - } else if (!btree_node_reclaim(c, b)) { + bc->not_freed_access_bit++; + } else if (!btree_node_reclaim(c, b, true)) { freed++; btree_node_data_free(c, b); + bc->freed++; bch2_btree_node_hash_remove(bc, b); six_unlock_write(&b->c.lock); @@ -564,7 +635,7 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c) struct btree *b; list_for_each_entry_reverse(b, &bc->live, list) - if (!btree_node_reclaim(c, b)) + if (!btree_node_reclaim(c, b, false)) return b; while (1) { @@ -600,7 +671,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea * disk node. Check the freed list before allocating a new one: */ list_for_each_entry(b, freed, list) - if (!btree_node_reclaim(c, b)) { + if (!btree_node_reclaim(c, b, false)) { list_del_init(&b->list); goto got_node; } @@ -626,7 +697,7 @@ got_node: * the list. 
Check if there's any freed nodes there: */ list_for_each_entry(b2, &bc->freeable, list) - if (!btree_node_reclaim(c, b2)) { + if (!btree_node_reclaim(c, b2, false)) { swap(b->data, b2->data); swap(b->aux_data, b2->aux_data); btree_node_to_freedlist(bc, b2); @@ -846,7 +917,6 @@ static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btr struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct btree *b; - struct bset_tree *t; bool need_relock = false; int ret; @@ -966,7 +1036,6 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * { struct bch_fs *c = trans->c; struct btree *b; - struct bset_tree *t; int ret; EBUG_ON(level >= BTREE_MAX_DEPTH); @@ -1043,7 +1112,6 @@ struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct btree *b; - struct bset_tree *t; int ret; EBUG_ON(level >= BTREE_MAX_DEPTH); @@ -1240,9 +1308,39 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struc stats.failed); } -void bch2_btree_cache_to_text(struct printbuf *out, const struct bch_fs *c) +static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c, + const char *label, unsigned nr) { - prt_printf(out, "nr nodes:\t\t%u\n", c->btree_cache.used); - prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty)); - prt_printf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock); + prt_printf(out, "%s\t", label); + prt_human_readable_u64(out, nr * c->opts.btree_node_size); + prt_printf(out, " (%u)\n", nr); +} + +void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc) +{ + struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache); + + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 32); + + prt_btree_cache_line(out, c, "total:", bc->used); + prt_btree_cache_line(out, c, "nr dirty:", atomic_read(&bc->dirty)); + prt_printf(out, "cannibalize 
lock:\t%p\n", bc->alloc_lock); + prt_newline(out); + + for (unsigned i = 0; i < ARRAY_SIZE(bc->used_by_btree); i++) + prt_btree_cache_line(out, c, bch2_btree_id_str(i), bc->used_by_btree[i]); + + prt_newline(out); + prt_printf(out, "freed:\t%u\n", bc->freed); + prt_printf(out, "not freed:\n"); + prt_printf(out, " dirty\t%u\n", bc->not_freed_dirty); + prt_printf(out, " write in flight\t%u\n", bc->not_freed_write_in_flight); + prt_printf(out, " read in flight\t%u\n", bc->not_freed_read_in_flight); + prt_printf(out, " lock intent failed\t%u\n", bc->not_freed_lock_intent); + prt_printf(out, " lock write failed\t%u\n", bc->not_freed_lock_write); + prt_printf(out, " access bit\t%u\n", bc->not_freed_access_bit); + prt_printf(out, " no evict failed\t%u\n", bc->not_freed_noevict); + prt_printf(out, " write blocked\t%u\n", bc->not_freed_write_blocked); + prt_printf(out, " will make reachable\t%u\n", bc->not_freed_will_make_reachable); } diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 6d33885fdb..fed35de3e4 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -17,6 +17,9 @@ int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *); int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, unsigned, enum btree_id); +void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_i *); + void bch2_btree_cache_cannibalize_unlock(struct btree_trans *); int bch2_btree_cache_cannibalize_lock(struct btree_trans *, struct closure *); @@ -131,6 +134,6 @@ static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b) const char *bch2_btree_id_str(enum btree_id); void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *); void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *); -void bch2_btree_cache_to_text(struct printbuf *, const struct bch_fs *); +void bch2_btree_cache_to_text(struct 
printbuf *, const struct btree_cache *); #endif /* _BCACHEFS_BTREE_CACHE_H */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 58f7c99e0e..a0deb82660 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -52,12 +52,6 @@ static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k) }}}; } -static bool should_restart_for_topology_repair(struct bch_fs *c) -{ - return c->opts.fix_errors != FSCK_FIX_no && - !(c->recovery_passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology)); -} - static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) { preempt_disable(); @@ -69,7 +63,7 @@ static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) { - BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0); + BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) < 0); __gc_pos_set(c, new_pos); } @@ -97,35 +91,6 @@ static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst) } } -static void bch2_btree_node_update_key_early(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_s_c old, struct bkey_i *new) -{ - struct bch_fs *c = trans->c; - struct btree *b; - struct bkey_buf tmp; - int ret; - - bch2_bkey_buf_init(&tmp); - bch2_bkey_buf_reassemble(&tmp, c, old); - - b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true); - if (!IS_ERR_OR_NULL(b)) { - mutex_lock(&c->btree_cache.lock); - - bch2_btree_node_hash_remove(&c->btree_cache, b); - - bkey_copy(&b->key, new); - ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); - BUG_ON(ret); - - mutex_unlock(&c->btree_cache.lock); - six_unlock_read(&b->c.lock); - } - - bch2_bkey_buf_exit(&tmp, c); -} - static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) { struct bkey_i_btree_ptr_v2 *new; @@ -546,9 +511,9 @@ reconstruct_root: if (!bch2_btree_has_scanned_nodes(c, i)) { mustfix_fsck_err(c, btree_root_unreadable_and_scan_found_nothing, "no nodes found for 
btree %s, continue?", bch2_btree_id_str(i)); - bch2_btree_root_alloc_fake(c, i, 0); + bch2_btree_root_alloc_fake_trans(trans, i, 0); } else { - bch2_btree_root_alloc_fake(c, i, 1); + bch2_btree_root_alloc_fake_trans(trans, i, 1); bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX); if (ret) @@ -576,7 +541,7 @@ reconstruct_root: goto reconstruct_root; bch_err(c, "empty btree root %s", bch2_btree_id_str(i)); - bch2_btree_root_alloc_fake(c, i, 0); + bch2_btree_root_alloc_fake_trans(trans, i, 0); r->alive = false; ret = 0; } @@ -586,495 +551,138 @@ fsck_err: return ret; } -static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id, - unsigned level, bool is_root, - struct bkey_s_c *k) +/* marking of btree keys/nodes: */ + +static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, + unsigned level, struct btree **prev, + struct btree_iter *iter, struct bkey_s_c k, + bool initial) { struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(*k); - const union bch_extent_entry *entry_c; - struct extent_ptr_decoded p = { 0 }; - bool do_update = false; - struct printbuf buf = PRINTBUF; - int ret = 0; - - /* - * XXX - * use check_bucket_ref here - */ - bkey_for_each_ptr_decode(k->k, ptrs_c, p, entry_c) { - struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); - enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, p, entry_c); - - if (fsck_err_on(!g->gen_valid, - c, ptr_to_missing_alloc_key, - "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), - p.ptr.gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { - if (!p.ptr.cached) { - g->gen_valid = true; - g->gen = p.ptr.gen; - } else { - do_update = true; - } - } - if 
(fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, - c, ptr_gen_newer_than_bucket_gen, - "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), - p.ptr.gen, g->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { - if (!p.ptr.cached) { - g->gen_valid = true; - g->gen = p.ptr.gen; - g->data_type = 0; - g->dirty_sectors = 0; - g->cached_sectors = 0; - set_bit(BCH_FS_need_another_gc, &c->flags); - } else { - do_update = true; - } - } - - if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, - c, ptr_gen_newer_than_bucket_gen, - "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, - bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), - p.ptr.gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) - do_update = true; - - if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0, - c, stale_dirty_ptr, - "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), - p.ptr.gen, g->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) - do_update = true; - - if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) - continue; + if (iter) { + struct btree_path *path = btree_iter_path(trans, iter); + struct btree *b = path_l(path)->b; - if (fsck_err_on(bucket_data_type(g->data_type) && - bucket_data_type(g->data_type) != - bucket_data_type(data_type), c, - ptr_bucket_data_type_mismatch, - "bucket %u:%zu different types of data in same bucket: %s, %s\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(g->data_type), - bch2_data_type_str(data_type), - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { - if (data_type == BCH_DATA_btree) { - 
g->data_type = data_type; - set_bit(BCH_FS_need_another_gc, &c->flags); - } else { - do_update = true; - } - } - - if (p.has_ec) { - struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx); - - if (fsck_err_on(!m || !m->alive, c, - ptr_to_missing_stripe, - "pointer to nonexistent stripe %llu\n" - "while marking %s", - (u64) p.ec.idx, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) - do_update = true; - - if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c, - ptr_to_incorrect_stripe, - "pointer does not match stripe %llu\n" - "while marking %s", - (u64) p.ec.idx, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) - do_update = true; + if (*prev != b) { + int ret = bch2_btree_node_check_topology(trans, b); + if (ret) + return ret; } + *prev = b; } - if (do_update) { - if (is_root) { - bch_err(c, "cannot update btree roots yet"); - ret = -EINVAL; - goto err; - } - - struct bkey_i *new = kmalloc(bkey_bytes(k->k), GFP_KERNEL); - if (!new) { - ret = -BCH_ERR_ENOMEM_gc_repair_key; - bch_err_msg(c, ret, "allocating new key"); - goto err; - } - - bkey_reassemble(new, *k); - - if (level) { - /* - * We don't want to drop btree node pointers - if the - * btree node isn't there anymore, the read path will - * sort it out: - */ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_GC_BUCKET(ca, ptr); - - ptr->gen = g->gen; - } - } else { - struct bkey_ptrs ptrs; - union bch_extent_entry *entry; -restart_drop_ptrs: - ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); - bkey_for_each_ptr_decode(bkey_i_to_s(new).k, ptrs, p, entry) { - struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); - enum bch_data_type data_type = bch2_bkey_ptr_data_type(bkey_i_to_s_c(new), p, entry); - - if ((p.ptr.cached && - (!g->gen_valid || gen_cmp(p.ptr.gen, g->gen) > 0)) || 
- (!p.ptr.cached && - gen_cmp(p.ptr.gen, g->gen) < 0) || - gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX || - (g->data_type && - g->data_type != data_type)) { - bch2_bkey_drop_ptr(bkey_i_to_s(new), &entry->ptr); - goto restart_drop_ptrs; - } - } -again: - ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); - bkey_extent_entry_for_each(ptrs, entry) { - if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) { - struct gc_stripe *m = genradix_ptr(&c->gc_stripes, - entry->stripe_ptr.idx); - union bch_extent_entry *next_ptr; - - bkey_extent_entry_for_each_from(ptrs, next_ptr, entry) - if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr) - goto found; - next_ptr = NULL; -found: - if (!next_ptr) { - bch_err(c, "aieee, found stripe ptr with no data ptr"); - continue; - } - - if (!m || !m->alive || - !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block], - &next_ptr->ptr, - m->sectors)) { - bch2_bkey_extent_entry_drop(new, entry); - goto again; - } - } - } - } - - if (level) - bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new); - - if (0) { - printbuf_reset(&buf); - bch2_bkey_val_to_text(&buf, c, *k); - bch_info(c, "updated %s", buf.buf); - - printbuf_reset(&buf); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); - bch_info(c, "new key %s", buf.buf); - } - - ret = bch2_journal_key_insert_take(c, btree_id, level, new); - if (ret) { - kfree(new); - goto err; - } - - *k = bkey_i_to_s_c(new); - } -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -/* marking of btree keys/nodes: */ - -static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, - unsigned level, bool is_root, - struct bkey_s_c *k, - bool initial) -{ - struct bch_fs *c = trans->c; struct bkey deleted = KEY(0, 0, 0); struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; struct printbuf buf = PRINTBUF; int ret = 0; - deleted.p = k->k->p; + deleted.p = k.k->p; if (initial) { BUG_ON(bch2_journal_seq_verify && - k->k->version.lo > 
atomic64_read(&c->journal.seq)); + k.k->version.lo > atomic64_read(&c->journal.seq)); - if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c, + if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c, bkey_version_in_future, - "key version number higher than recorded: %llu > %llu", - k->k->version.lo, - atomic64_read(&c->key_version))) - atomic64_set(&c->key_version, k->k->version.lo); + "key version number higher than recorded %llu\n %s", + atomic64_read(&c->key_version), + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + atomic64_set(&c->key_version, k.k->version.lo); } - ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k); - if (ret) - goto err; - - if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, *k), + if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k), c, btree_bitmap_not_marked, "btree ptr not marked in member info btree allocated bitmap\n %s", - (bch2_bkey_val_to_text(&buf, c, *k), + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { mutex_lock(&c->sb_lock); - bch2_dev_btree_bitmap_mark(c, *k); + bch2_dev_btree_bitmap_mark(c, k); bch2_write_super(c); mutex_unlock(&c->sb_lock); } - ret = commit_do(trans, NULL, NULL, 0, - bch2_key_trigger(trans, btree_id, level, old, - unsafe_bkey_s_c_to_s(*k), BTREE_TRIGGER_GC)); -fsck_err: -err: - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - -static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial) -{ - struct btree_node_iter iter; - struct bkey unpacked; - struct bkey_s_c k; - int ret = 0; + /* + * We require a commit before key_trigger() because + * key_trigger(BTREE_TRIGGER_GC) is not idempotant; we'll calculate the + * wrong result if we run it multiple times. + */ + unsigned flags = !iter ? 
BTREE_TRIGGER_is_root : 0; - ret = bch2_btree_node_check_topology(trans, b); + ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k), + BTREE_TRIGGER_check_repair|flags); if (ret) - return ret; - - if (!btree_node_type_needs_gc(btree_node_type(b))) - return 0; - - bch2_btree_node_iter_init_from_start(&iter, b); - - while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { - ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false, - &k, initial); - if (ret) - return ret; + goto out; - bch2_btree_node_iter_advance(&iter, b); + if (trans->nr_updates) { + ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: + -BCH_ERR_transaction_restart_nested; + goto out; } - return 0; + ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k), + BTREE_TRIGGER_gc|flags); +out: +fsck_err: + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; } -static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id, - bool initial, bool metadata_only) +static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool initial) { struct bch_fs *c = trans->c; - struct btree_iter iter; - struct btree *b; - unsigned depth = metadata_only ? 1 : 0; + int level = 0, target_depth = btree_node_type_needs_gc(__btree_node_type(0, btree)) ? 
0 : 1; int ret = 0; - gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); - - __for_each_btree_node(trans, iter, btree_id, POS_MIN, - 0, depth, BTREE_ITER_PREFETCH, b, ret) { - bch2_verify_btree_nr_keys(b); - - gc_pos_set(c, gc_pos_btree_node(b)); + /* We need to make sure every leaf node is readable before going RW */ + if (initial) + target_depth = 0; - ret = btree_gc_mark_node(trans, b, initial); + /* root */ + do { +retry_root: + bch2_trans_begin(trans); + + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, + 0, bch2_btree_id_root(c, btree)->b->c.level, 0); + struct btree *b = bch2_btree_iter_peek_node(&iter); + ret = PTR_ERR_OR_ZERO(b); if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); + goto err_root; - if (ret) - return ret; + if (b != btree_node_root(c, b)) { + bch2_trans_iter_exit(trans, &iter); + goto retry_root; + } - mutex_lock(&c->btree_root_lock); - b = bch2_btree_id_root(c, btree_id)->b; - if (!btree_node_fake(b)) { + gc_pos_set(c, gc_pos_btree(btree, b->c.level + 1, SPOS_MAX)); struct bkey_s_c k = bkey_i_to_s_c(&b->key); + ret = bch2_gc_mark_key(trans, btree, b->c.level + 1, NULL, NULL, k, initial); + level = b->c.level; +err_root: + bch2_trans_iter_exit(trans, &iter); + } while (bch2_err_matches(ret, BCH_ERR_transaction_restart)); - ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1, - true, &k, initial); - } - gc_pos_set(c, gc_pos_btree_root(b->c.btree_id)); - mutex_unlock(&c->btree_root_lock); - - return ret; -} - -static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b, - unsigned target_depth) -{ - struct bch_fs *c = trans->c; - struct btree_and_journal_iter iter; - struct bkey_s_c k; - struct bkey_buf cur; - struct printbuf buf = PRINTBUF; - int ret = 0; - - ret = bch2_btree_node_check_topology(trans, b); if (ret) return ret; - bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); - bch2_bkey_buf_init(&cur); + for (; level >= target_depth; --level) { + struct 
btree *prev = NULL; + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, level, + BTREE_ITER_prefetch); - while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { - BUG_ON(bpos_lt(k.k->p, b->data->min_key)); - BUG_ON(bpos_gt(k.k->p, b->data->max_key)); - - ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, - false, &k, true); + ret = for_each_btree_key_continue(trans, iter, 0, k, ({ + gc_pos_set(c, gc_pos_btree(btree, level, k.k->p)); + bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial); + })); if (ret) - goto fsck_err; - - bch2_btree_and_journal_iter_advance(&iter); - } - - if (b->c.level > target_depth) { - bch2_btree_and_journal_iter_exit(&iter); - bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); - iter.prefetch = true; - - while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { - struct btree *child; - - bch2_bkey_buf_reassemble(&cur, c, k); - bch2_btree_and_journal_iter_advance(&iter); - - child = bch2_btree_node_get_noiter(trans, cur.k, - b->c.btree_id, b->c.level - 1, - false); - ret = PTR_ERR_OR_ZERO(child); - - if (bch2_err_matches(ret, EIO)) { - bch2_topology_error(c); - - if (__fsck_err(c, - FSCK_CAN_FIX| - FSCK_CAN_IGNORE| - FSCK_NO_RATELIMIT, - btree_node_read_error, - "Unreadable btree node at btree %s level %u:\n" - " %s", - bch2_btree_id_str(b->c.btree_id), - b->c.level - 1, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) && - should_restart_for_topology_repair(c)) { - bch_info(c, "Halting mark and sweep to start topology repair pass"); - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); - goto fsck_err; - } else { - /* Continue marking when opted to not - * fix the error: */ - ret = 0; - set_bit(BCH_FS_initial_gc_unfixed, &c->flags); - continue; - } - } else if (ret) { - bch_err_msg(c, ret, "getting btree node"); - break; - } - - ret = bch2_gc_btree_init_recurse(trans, child, - target_depth); - 
six_unlock_read(&child->c.lock); - - if (ret) - break; - } - } -fsck_err: - bch2_bkey_buf_exit(&cur, c); - bch2_btree_and_journal_iter_exit(&iter); - printbuf_exit(&buf); - return ret; -} - -static int bch2_gc_btree_init(struct btree_trans *trans, - enum btree_id btree_id, - bool metadata_only) -{ - struct bch_fs *c = trans->c; - struct btree *b; - unsigned target_depth = metadata_only ? 1 : 0; - struct printbuf buf = PRINTBUF; - int ret = 0; - - b = bch2_btree_id_root(c, btree_id)->b; - - six_lock_read(&b->c.lock, NULL, NULL); - printbuf_reset(&buf); - bch2_bpos_to_text(&buf, b->data->min_key); - if (mustfix_fsck_err_on(!bpos_eq(b->data->min_key, POS_MIN), c, - btree_root_bad_min_key, - "btree root with incorrect min_key: %s", buf.buf)) { - bch_err(c, "repair unimplemented"); - ret = -BCH_ERR_fsck_repair_unimplemented; - goto fsck_err; - } - - printbuf_reset(&buf); - bch2_bpos_to_text(&buf, b->data->max_key); - if (mustfix_fsck_err_on(!bpos_eq(b->data->max_key, SPOS_MAX), c, - btree_root_bad_max_key, - "btree root with incorrect max_key: %s", buf.buf)) { - bch_err(c, "repair unimplemented"); - ret = -BCH_ERR_fsck_repair_unimplemented; - goto fsck_err; - } - - if (b->c.level >= target_depth) - ret = bch2_gc_btree_init_recurse(trans, b, target_depth); - - if (!ret) { - struct bkey_s_c k = bkey_i_to_s_c(&b->key); - - ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1, true, - &k, true); + break; } -fsck_err: - six_unlock_read(&b->c.lock); - bch_err_fn(c, ret); - printbuf_exit(&buf); return ret; } @@ -1083,7 +691,7 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) return cmp_int(gc_btree_order(l), gc_btree_order(r)); } -static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) +static int bch2_gc_btrees(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); enum btree_id ids[BTREE_ID_NR]; @@ -1094,98 +702,36 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) ids[i] = i; 
bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); - for (i = 0; i < BTREE_ID_NR && !ret; i++) - ret = initial - ? bch2_gc_btree_init(trans, ids[i], metadata_only) - : bch2_gc_btree(trans, ids[i], initial, metadata_only); + for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) { + unsigned btree = i < BTREE_ID_NR ? ids[i] : i; - for (i = BTREE_ID_NR; i < btree_id_nr_alive(c) && !ret; i++) { - if (!bch2_btree_id_root(c, i)->alive) + if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b)) continue; - ret = initial - ? bch2_gc_btree_init(trans, i, metadata_only) - : bch2_gc_btree(trans, i, initial, metadata_only); - } + ret = bch2_gc_btree(trans, btree, true); + if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), + c, btree_node_read_error, + "btree node read error for %s", + bch2_btree_id_str(btree))) + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); + } +fsck_err: bch2_trans_put(trans); bch_err_fn(c, ret); return ret; } -static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca, - u64 start, u64 end, - enum bch_data_type type, - unsigned flags) -{ - u64 b = sector_to_bucket(ca, start); - - do { - unsigned sectors = - min_t(u64, bucket_to_sector(ca, b + 1), end) - start; - - bch2_mark_metadata_bucket(c, ca, b, type, sectors, - gc_phase(GC_PHASE_sb), flags); - b++; - start += sectors; - } while (start < end); -} - -static void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, - unsigned flags) -{ - struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; - unsigned i; - u64 b; - - for (i = 0; i < layout->nr_superblocks; i++) { - u64 offset = le64_to_cpu(layout->sb_offset[i]); - - if (offset == BCH_SB_SECTOR) - mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR, - BCH_DATA_sb, flags); - - mark_metadata_sectors(c, ca, offset, - offset + (1 << layout->sb_max_size_bits), - BCH_DATA_sb, flags); - } - - for (i = 0; i < ca->journal.nr; i++) { - b = ca->journal.buckets[i]; - bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal, - 
ca->mi.bucket_size, - gc_phase(GC_PHASE_sb), flags); - } -} - -static void bch2_mark_superblocks(struct bch_fs *c) +static int bch2_mark_superblocks(struct bch_fs *c) { mutex_lock(&c->sb_lock); gc_pos_set(c, gc_phase(GC_PHASE_sb)); - for_each_online_member(c, ca) - bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC); + int ret = bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc); mutex_unlock(&c->sb_lock); + return ret; } -#if 0 -/* Also see bch2_pending_btree_node_free_insert_done() */ -static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) -{ - struct btree_update *as; - struct pending_btree_node_free *d; - - mutex_lock(&c->btree_interior_update_lock); - gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE)); - - for_each_pending_btree_node_free(c, as, d) - if (d->index_update_done) - bch2_mark_key(c, bkey_i_to_s_c(&d->key), BTREE_TRIGGER_GC); - - mutex_unlock(&c->btree_interior_update_lock); -} -#endif - static void bch2_gc_free(struct bch_fs *c) { genradix_free(&c->reflink_gc_table); @@ -1203,28 +749,23 @@ static void bch2_gc_free(struct bch_fs *c) c->usage_gc = NULL; } -static int bch2_gc_done(struct bch_fs *c, - bool initial, bool metadata_only) +static int bch2_gc_done(struct bch_fs *c) { struct bch_dev *ca = NULL; struct printbuf buf = PRINTBUF; - bool verify = !metadata_only && - !c->opts.reconstruct_alloc && - (!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info))); unsigned i; int ret = 0; percpu_down_write(&c->mark_lock); -#define copy_field(_err, _f, _msg, ...) \ - if (dst->_f != src->_f && \ - (!verify || \ - fsck_err(c, _err, _msg ": got %llu, should be %llu" \ - , ##__VA_ARGS__, dst->_f, src->_f))) \ +#define copy_field(_err, _f, _msg, ...) \ + if (fsck_err_on(dst->_f != src->_f, c, _err, \ + _msg ": got %llu, should be %llu" , ##__VA_ARGS__, \ + dst->_f, src->_f)) \ dst->_f = src->_f -#define copy_dev_field(_err, _f, _msg, ...) \ +#define copy_dev_field(_err, _f, _msg, ...) 
\ copy_field(_err, _f, "dev %u has wrong " _msg, ca->dev_idx, ##__VA_ARGS__) -#define copy_fs_field(_err, _f, _msg, ...) \ +#define copy_fs_field(_err, _f, _msg, ...) \ copy_field(_err, _f, "fs has wrong " _msg, ##__VA_ARGS__) for (i = 0; i < ARRAY_SIZE(c->usage); i++) @@ -1257,31 +798,24 @@ static int bch2_gc_done(struct bch_fs *c, copy_fs_field(fs_usage_btree_wrong, b.btree, "btree"); - if (!metadata_only) { - copy_fs_field(fs_usage_data_wrong, - b.data, "data"); - copy_fs_field(fs_usage_cached_wrong, - b.cached, "cached"); - copy_fs_field(fs_usage_reserved_wrong, - b.reserved, "reserved"); - copy_fs_field(fs_usage_nr_inodes_wrong, - b.nr_inodes,"nr_inodes"); - - for (i = 0; i < BCH_REPLICAS_MAX; i++) - copy_fs_field(fs_usage_persistent_reserved_wrong, - persistent_reserved[i], - "persistent_reserved[%i]", i); - } + copy_fs_field(fs_usage_data_wrong, + b.data, "data"); + copy_fs_field(fs_usage_cached_wrong, + b.cached, "cached"); + copy_fs_field(fs_usage_reserved_wrong, + b.reserved, "reserved"); + copy_fs_field(fs_usage_nr_inodes_wrong, + b.nr_inodes,"nr_inodes"); + + for (i = 0; i < BCH_REPLICAS_MAX; i++) + copy_fs_field(fs_usage_persistent_reserved_wrong, + persistent_reserved[i], + "persistent_reserved[%i]", i); for (i = 0; i < c->replicas.nr; i++) { struct bch_replicas_entry_v1 *e = cpu_replicas_entry(&c->replicas, i); - if (metadata_only && - (e->data_type == BCH_DATA_user || - e->data_type == BCH_DATA_cached)) - continue; - printbuf_reset(&buf); bch2_replicas_entry_to_text(&buf, e); @@ -1295,10 +829,8 @@ static int bch2_gc_done(struct bch_fs *c, #undef copy_stripe_field #undef copy_field fsck_err: - if (ca) - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); bch_err_fn(c, ret); - percpu_up_write(&c->mark_lock); printbuf_exit(&buf); return ret; @@ -1321,7 +853,7 @@ static int bch2_gc_start(struct bch_fs *c) ca->usage_gc = alloc_percpu(struct bch_dev_usage); if (!ca->usage_gc) { bch_err(c, "error allocating ca->usage_gc"); - percpu_ref_put(&ca->ref); + 
bch2_dev_put(ca); return -BCH_ERR_ENOMEM_gc_start; } @@ -1332,19 +864,6 @@ static int bch2_gc_start(struct bch_fs *c) return 0; } -static int bch2_gc_reset(struct bch_fs *c) -{ - for_each_member_device(c, ca) { - free_percpu(ca->usage_gc); - ca->usage_gc = NULL; - } - - free_percpu(c->usage_gc); - c->usage_gc = NULL; - - return bch2_gc_start(c); -} - /* returns true if not equal */ static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l, struct bch_alloc_v4 r) @@ -1360,55 +879,45 @@ static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l, static int bch2_alloc_write_key(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k, - bool metadata_only) + struct bch_dev *ca, + struct bkey_s_c k) { struct bch_fs *c = trans->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); - struct bucket old_gc, gc, *b; struct bkey_i_alloc_v4 *a; - struct bch_alloc_v4 old_convert, new; + struct bch_alloc_v4 old_gc, gc, old_convert, new; const struct bch_alloc_v4 *old; int ret; + if (!bucket_valid(ca, k.k->p.offset)) + return 0; + old = bch2_alloc_to_v4(k, &old_convert); - new = *old; + gc = new = *old; percpu_down_read(&c->mark_lock); - b = gc_bucket(ca, iter->pos.offset); - old_gc = *b; + __bucket_m_to_alloc(&gc, *gc_bucket(ca, iter->pos.offset)); + + old_gc = gc; if ((old->data_type == BCH_DATA_sb || old->data_type == BCH_DATA_journal) && !bch2_dev_is_online(ca)) { - b->data_type = old->data_type; - b->dirty_sectors = old->dirty_sectors; + gc.data_type = old->data_type; + gc.dirty_sectors = old->dirty_sectors; } /* - * b->data_type doesn't yet include need_discard & need_gc_gen states - + * gc.data_type doesn't yet include need_discard & need_gc_gen states - * fix that here: */ - b->data_type = __alloc_data_type(b->dirty_sectors, - b->cached_sectors, - b->stripe, - *old, - b->data_type); - gc = *b; + alloc_data_type_set(&gc, gc.data_type); if (gc.data_type != old_gc.data_type || gc.dirty_sectors != old_gc.dirty_sectors) - 
bch2_dev_usage_update_m(c, ca, &old_gc, &gc); + bch2_dev_usage_update(c, ca, &old_gc, &gc, 0, true); percpu_up_read(&c->mark_lock); - if (metadata_only && - gc.data_type != BCH_DATA_sb && - gc.data_type != BCH_DATA_journal && - gc.data_type != BCH_DATA_btree) - return 0; - - if (gen_after(old->gen, gc.gen)) - return 0; + gc.fragmentation_lru = alloc_lru_idx_fragmentation(gc, ca); if (fsck_err_on(new.data_type != gc.data_type, c, alloc_key_data_type_wrong, @@ -1423,23 +932,19 @@ static int bch2_alloc_write_key(struct btree_trans *trans, #define copy_bucket_field(_errtype, _f) \ if (fsck_err_on(new._f != gc._f, c, _errtype, \ "bucket %llu:%llu gen %u data type %s has wrong " #_f \ - ": got %u, should be %u", \ + ": got %llu, should be %llu", \ iter->pos.inode, iter->pos.offset, \ gc.gen, \ bch2_data_type_str(gc.data_type), \ - new._f, gc._f)) \ + (u64) new._f, (u64) gc._f)) \ new._f = gc._f; \ - copy_bucket_field(alloc_key_gen_wrong, - gen); - copy_bucket_field(alloc_key_dirty_sectors_wrong, - dirty_sectors); - copy_bucket_field(alloc_key_cached_sectors_wrong, - cached_sectors); - copy_bucket_field(alloc_key_stripe_wrong, - stripe); - copy_bucket_field(alloc_key_stripe_redundancy_wrong, - stripe_redundancy); + copy_bucket_field(alloc_key_gen_wrong, gen); + copy_bucket_field(alloc_key_dirty_sectors_wrong, dirty_sectors); + copy_bucket_field(alloc_key_cached_sectors_wrong, cached_sectors); + copy_bucket_field(alloc_key_stripe_wrong, stripe); + copy_bucket_field(alloc_key_stripe_redundancy_wrong, stripe_redundancy); + copy_bucket_field(alloc_key_fragmentation_lru_wrong, fragmentation_lru); #undef copy_bucket_field if (!bch2_alloc_v4_cmp(*old, new)) @@ -1453,18 +958,18 @@ static int bch2_alloc_write_key(struct btree_trans *trans, a->v = new; /* - * The trigger normally makes sure this is set, but we're not running + * The trigger normally makes sure these are set, but we're not running * triggers: */ if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ]) 
a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); - ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_NORUN); + ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_norun); fsck_err: return ret; } -static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) +static int bch2_gc_alloc_done(struct bch_fs *c) { int ret = 0; @@ -1473,11 +978,11 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) for_each_btree_key_upto_commit(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, ca->mi.first_bucket), POS(ca->dev_idx, ca->mi.nbuckets - 1), - BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k, + BTREE_ITER_slots|BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw, - bch2_alloc_write_key(trans, &iter, k, metadata_only))); + bch2_alloc_write_key(trans, &iter, ca, k))); if (ret) { - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); break; } } @@ -1486,71 +991,50 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) return ret; } -static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) +static int bch2_gc_alloc_start(struct bch_fs *c) { for_each_member_device(c, ca) { struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket), GFP_KERNEL|__GFP_ZERO); if (!buckets) { - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); bch_err(c, "error allocating ca->buckets[gc]"); return -BCH_ERR_ENOMEM_gc_alloc_start; } buckets->first_bucket = ca->mi.first_bucket; buckets->nbuckets = ca->mi.nbuckets; + buckets->nbuckets_minus_first = + buckets->nbuckets - buckets->first_bucket; rcu_assign_pointer(ca->buckets_gc, buckets); } + struct bch_dev *ca = NULL; int ret = bch2_trans_run(c, for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ({ - struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode); - struct bucket *g = gc_bucket(ca, k.k->p.offset); - - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, 
&a_convert); - - g->gen_valid = 1; - g->gen = a->gen; - - if (metadata_only && - (a->data_type == BCH_DATA_user || - a->data_type == BCH_DATA_cached || - a->data_type == BCH_DATA_parity)) { - g->data_type = a->data_type; - g->dirty_sectors = a->dirty_sectors; - g->cached_sectors = a->cached_sectors; - g->stripe = a->stripe; - g->stripe_redundancy = a->stripe_redundancy; + BTREE_ITER_prefetch, k, ({ + ca = bch2_dev_iterate(c, ca, k.k->p.inode); + if (!ca) { + bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); + continue; } + if (bucket_valid(ca, k.k->p.offset)) { + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); + + struct bucket *g = gc_bucket(ca, k.k->p.offset); + g->gen_valid = 1; + g->gen = a->gen; + } 0; }))); + bch2_dev_put(ca); bch_err_fn(c, ret); return ret; } -static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only) -{ - for_each_member_device(c, ca) { - struct bucket_array *buckets = gc_bucket_array(ca); - struct bucket *g; - - for_each_bucket(g, buckets) { - if (metadata_only && - (g->data_type == BCH_DATA_user || - g->data_type == BCH_DATA_cached || - g->data_type == BCH_DATA_parity)) - continue; - g->data_type = 0; - g->dirty_sectors = 0; - g->cached_sectors = 0; - } - } -} - static int bch2_gc_write_reflink_key(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, @@ -1600,35 +1084,27 @@ fsck_err: return ret; } -static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only) +static int bch2_gc_reflink_done(struct bch_fs *c) { size_t idx = 0; - if (metadata_only) - return 0; - int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_reflink, POS_MIN, - BTREE_ITER_PREFETCH, k, + BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_gc_write_reflink_key(trans, &iter, k, &idx))); c->reflink_gc_nr = 0; return ret; } -static int bch2_gc_reflink_start(struct bch_fs *c, - bool metadata_only) +static int bch2_gc_reflink_start(struct 
bch_fs *c) { - - if (metadata_only) - return 0; - c->reflink_gc_nr = 0; int ret = bch2_trans_run(c, for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN, - BTREE_ITER_PREFETCH, k, ({ + BTREE_ITER_prefetch, k, ({ const __le64 *refcount = bkey_refcount_c(k); if (!refcount) @@ -1651,15 +1127,6 @@ static int bch2_gc_reflink_start(struct bch_fs *c, return ret; } -static void bch2_gc_reflink_reset(struct bch_fs *c, bool metadata_only) -{ - struct genradix_iter iter; - struct reflink_gc *r; - - genradix_for_each(&c->reflink_gc_table, iter, r) - r->refcount = 0; -} - static int bch2_gc_write_stripes_key(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) @@ -1713,30 +1180,20 @@ fsck_err: return ret; } -static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only) +static int bch2_gc_stripes_done(struct bch_fs *c) { - if (metadata_only) - return 0; - return bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_stripes, POS_MIN, - BTREE_ITER_PREFETCH, k, + BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_gc_write_stripes_key(trans, &iter, k))); } -static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only) -{ - genradix_free(&c->gc_stripes); -} - /** - * bch2_gc - walk _all_ references to buckets, and recompute them: + * bch2_check_allocations - walk all references to buckets, and recompute them: * * @c: filesystem object - * @initial: are we in recovery? - * @metadata_only: are we just checking metadata references, or everything? 
* * Returns: 0 on success, or standard errcode on failure * @@ -1755,9 +1212,8 @@ static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only) * move around - if references move backwards in the ordering GC * uses, GC could skip past them */ -int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) +int bch2_check_allocations(struct bch_fs *c) { - unsigned iter = 0; int ret; lockdep_assert_held(&c->state_lock); @@ -1767,62 +1223,30 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) bch2_btree_interior_updates_flush(c); ret = bch2_gc_start(c) ?: - bch2_gc_alloc_start(c, metadata_only) ?: - bch2_gc_reflink_start(c, metadata_only); + bch2_gc_alloc_start(c) ?: + bch2_gc_reflink_start(c); if (ret) goto out; -again: - gc_pos_set(c, gc_phase(GC_PHASE_start)); - bch2_mark_superblocks(c); + gc_pos_set(c, gc_phase(GC_PHASE_start)); - ret = bch2_gc_btrees(c, initial, metadata_only); + ret = bch2_mark_superblocks(c); + BUG_ON(ret); + ret = bch2_gc_btrees(c); if (ret) goto out; -#if 0 - bch2_mark_pending_btree_node_frees(c); -#endif c->gc_count++; - if (test_bit(BCH_FS_need_another_gc, &c->flags) || - (!iter && bch2_test_restart_gc)) { - if (iter++ > 2) { - bch_info(c, "Unable to fix bucket gens, looping"); - ret = -EINVAL; - goto out; - } - - /* - * XXX: make sure gens we fixed got saved - */ - bch_info(c, "Second GC pass needed, restarting:"); - clear_bit(BCH_FS_need_another_gc, &c->flags); - __gc_pos_set(c, gc_phase(GC_PHASE_not_running)); - - bch2_gc_stripes_reset(c, metadata_only); - bch2_gc_alloc_reset(c, metadata_only); - bch2_gc_reflink_reset(c, metadata_only); - ret = bch2_gc_reset(c); - if (ret) - goto out; - - /* flush fsck errors, reset counters */ - bch2_flush_fsck_errs(c); - goto again; - } + bch2_journal_block(&c->journal); out: - if (!ret) { - bch2_journal_block(&c->journal); + ret = bch2_gc_alloc_done(c) ?: + bch2_gc_done(c) ?: + bch2_gc_stripes_done(c) ?: + bch2_gc_reflink_done(c); - ret = bch2_gc_alloc_done(c, metadata_only) 
?: - bch2_gc_done(c, initial, metadata_only) ?: - bch2_gc_stripes_done(c, metadata_only) ?: - bch2_gc_reflink_done(c, metadata_only); - - bch2_journal_unblock(&c->journal); - } + bch2_journal_unblock(&c->journal); percpu_down_write(&c->mark_lock); /* Indicates that gc is no longer in progress: */ @@ -1851,23 +1275,33 @@ static int gc_btree_gens_key(struct btree_trans *trans, struct bkey_i *u; int ret; + if (unlikely(test_bit(BCH_FS_going_ro, &c->flags))) + return -EROFS; + percpu_down_read(&c->mark_lock); + rcu_read_lock(); bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + if (!ca) + continue; - if (ptr_stale(ca, ptr) > 16) { + if (dev_ptr_stale(ca, ptr) > 16) { + rcu_read_unlock(); percpu_up_read(&c->mark_lock); goto update; } } bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)]; + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + if (!ca) + continue; + u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)]; if (gen_after(*gen, ptr->gen)) *gen = ptr->gen; } + rcu_read_unlock(); percpu_up_read(&c->mark_lock); return 0; update: @@ -1880,10 +1314,9 @@ update: return 0; } -static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k) +static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct bch_dev *ca, + struct btree_iter *iter, struct bkey_s_c k) { - struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode); struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); struct bkey_i_alloc_v4 *a_mut; @@ -1898,7 +1331,7 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_i return ret; a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset]; - a_mut->v.data_type = alloc_data_type(a_mut->v, a_mut->v.data_type); + alloc_data_type_set(&a_mut->v, a_mut->v.data_type); 
return bch2_trans_update(trans, iter, &a_mut->k_i, 0); } @@ -1926,7 +1359,7 @@ int bch2_gc_gens(struct bch_fs *c) ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL); if (!ca->oldest_gen) { - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); ret = -BCH_ERR_ENOMEM_gc_gens; goto err; } @@ -1944,7 +1377,7 @@ int bch2_gc_gens(struct bch_fs *c) ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, i, POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, @@ -1953,14 +1386,23 @@ int bch2_gc_gens(struct bch_fs *c) goto err; } + struct bch_dev *ca = NULL; ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, + BTREE_ITER_prefetch, k, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc, - bch2_alloc_write_oldest_gen(trans, &iter, k))); + BCH_TRANS_COMMIT_no_enospc, ({ + ca = bch2_dev_iterate(c, ca, k.k->p.inode); + if (!ca) { + bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); + continue; + } + bch2_alloc_write_oldest_gen(trans, ca, &iter, k); + }))); + bch2_dev_put(ca); + if (ret) goto err; @@ -1984,87 +1426,23 @@ err: return ret; } -static int bch2_gc_thread(void *arg) +static void bch2_gc_gens_work(struct work_struct *work) { - struct bch_fs *c = arg; - struct io_clock *clock = &c->io_clock[WRITE]; - unsigned long last = atomic64_read(&clock->now); - unsigned last_kick = atomic_read(&c->kick_gc); - - set_freezable(); - - while (1) { - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - - if (kthread_should_stop()) { - __set_current_state(TASK_RUNNING); - return 0; - } - - if (atomic_read(&c->kick_gc) != last_kick) - break; - - if (c->btree_gc_periodic) { - unsigned long next = last + c->capacity / 16; - - if (atomic64_read(&clock->now) >= next) - break; - - bch2_io_clock_schedule_timeout(clock, next); - } else { - schedule(); - } - - try_to_freeze(); - } - __set_current_state(TASK_RUNNING); - - last = 
atomic64_read(&clock->now); - last_kick = atomic_read(&c->kick_gc); - - /* - * Full gc is currently incompatible with btree key cache: - */ -#if 0 - ret = bch2_gc(c, false, false); -#else - bch2_gc_gens(c); -#endif - debug_check_no_locks_held(); - } - - return 0; + struct bch_fs *c = container_of(work, struct bch_fs, gc_gens_work); + bch2_gc_gens(c); + bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens); } -void bch2_gc_thread_stop(struct bch_fs *c) +void bch2_gc_gens_async(struct bch_fs *c) { - struct task_struct *p; - - p = c->gc_thread; - c->gc_thread = NULL; - - if (p) { - kthread_stop(p); - put_task_struct(p); - } + if (bch2_write_ref_tryget(c, BCH_WRITE_REF_gc_gens) && + !queue_work(c->write_ref_wq, &c->gc_gens_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens); } -int bch2_gc_thread_start(struct bch_fs *c) +void bch2_fs_gc_init(struct bch_fs *c) { - struct task_struct *p; + seqcount_init(&c->gc_pos_lock); - if (c->gc_thread) - return 0; - - p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name); - if (IS_ERR(p)) { - bch_err_fn(c, PTR_ERR(p)); - return PTR_ERR(p); - } - - get_task_struct(p); - c->gc_thread = p; - wake_up_process(p); - return 0; + INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work); } diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 5c97eb6a38..876d81e201 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -7,10 +7,7 @@ #include "btree_types.h" int bch2_check_topology(struct bch_fs *); -int bch2_gc(struct bch_fs *, bool, bool); -int bch2_gc_gens(struct bch_fs *); -void bch2_gc_thread_stop(struct bch_fs *); -int bch2_gc_thread_start(struct bch_fs *); +int bch2_check_allocations(struct bch_fs *); /* * For concurrent mark and sweep (with other index updates), we define a total @@ -39,8 +36,8 @@ static inline struct gc_pos gc_phase(enum gc_phase phase) return (struct gc_pos) { .phase = phase, }; } -static inline struct gc_pos gc_pos_btree(enum btree_id btree, - struct bpos pos, unsigned level) +static inline struct 
gc_pos gc_pos_btree(enum btree_id btree, unsigned level, + struct bpos pos) { return (struct gc_pos) { .phase = GC_PHASE_btree, @@ -56,19 +53,7 @@ static inline struct gc_pos gc_pos_btree(enum btree_id btree, */ static inline struct gc_pos gc_pos_btree_node(struct btree *b) { - return gc_pos_btree(b->c.btree_id, b->key.k.p, b->c.level); -} - -/* - * GC position of the pointer to a btree root: we don't use - * gc_pos_pointer_to_btree_node() here to avoid a potential race with - * btree_split() increasing the tree depth - the new root will have level > the - * old root and thus have a greater gc position than the old root, but that - * would be incorrect since once gc has marked the root it's not coming back. - */ -static inline struct gc_pos gc_pos_btree_root(enum btree_id id) -{ - return gc_pos_btree(id, SPOS_MAX, BTREE_MAX_DEPTH); + return gc_pos_btree(b->c.btree_id, b->c.level, b->key.k.p); } static inline int gc_btree_order(enum btree_id btree) @@ -100,11 +85,8 @@ static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) return ret; } -static inline void bch2_do_gc_gens(struct bch_fs *c) -{ - atomic_inc(&c->kick_gc); - if (c->gc_thread) - wake_up_process(c->gc_thread); -} +int bch2_gc_gens(struct bch_fs *); +void bch2_gc_gens_async(struct bch_fs *); +void bch2_fs_gc_init(struct bch_fs *); #endif /* _BCACHEFS_BTREE_GC_H */ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index debb0edc34..7bca15c604 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -23,6 +23,18 @@ #include <linux/sched/mm.h> +static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn) +{ + prt_printf(out, "btree=%s l=%u seq %llux\n", + bch2_btree_id_str(BTREE_NODE_ID(bn)), + (unsigned) BTREE_NODE_LEVEL(bn), bn->keys.seq); + prt_str(out, "min: "); + bch2_bpos_to_text(out, bn->min_key); + prt_newline(out); + prt_str(out, "max: "); + bch2_bpos_to_text(out, bn->max_key); +} + void bch2_btree_node_io_unlock(struct btree *b) { 
EBUG_ON(!btree_node_write_in_flight(b)); @@ -217,7 +229,6 @@ static bool should_compact_bset(struct btree *b, struct bset_tree *t, static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) { - struct bset_tree *t; bool ret = false; for_each_bset(b, t) { @@ -288,8 +299,7 @@ bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, static void btree_node_sort(struct bch_fs *c, struct btree *b, unsigned start_idx, - unsigned end_idx, - bool filter_whiteouts) + unsigned end_idx) { struct btree_node *out; struct sort_iter_stack sort_iter; @@ -320,7 +330,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, start_time = local_clock(); - u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter, filter_whiteouts); + u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter); out->keys.u64s = cpu_to_le16(u64s); @@ -426,13 +436,12 @@ static bool btree_node_compact(struct bch_fs *c, struct btree *b) break; if (b->nsets - unwritten_idx > 1) { - btree_node_sort(c, b, unwritten_idx, - b->nsets, false); + btree_node_sort(c, b, unwritten_idx, b->nsets); ret = true; } if (unwritten_idx > 1) { - btree_node_sort(c, b, 0, unwritten_idx, false); + btree_node_sort(c, b, 0, unwritten_idx); ret = true; } @@ -441,8 +450,6 @@ static bool btree_node_compact(struct bch_fs *c, struct btree *b) void bch2_btree_build_aux_trees(struct btree *b) { - struct bset_tree *t; - for_each_bset(b, t) bch2_bset_build_aux_tree(b, t, !bset_written(b, bset(b, t)) && @@ -512,7 +519,7 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) static void btree_err_msg(struct printbuf *out, struct bch_fs *c, struct bch_dev *ca, - struct btree *b, struct bset *i, + struct btree *b, struct bset *i, struct bkey_packed *k, unsigned offset, int write) { prt_printf(out, bch2_log_msg(c, "%s"), @@ -524,28 +531,36 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, prt_printf(out, "at btree "); bch2_btree_pos_to_text(out, c, b); - prt_printf(out, "\n node offset 
%u/%u", + printbuf_indent_add(out, 2); + + prt_printf(out, "\nnode offset %u/%u", b->written, btree_ptr_sectors_written(&b->key)); if (i) prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s)); + if (k) + prt_printf(out, " bset byte offset %lu", + (unsigned long)(void *)k - + ((unsigned long)(void *)i & ~511UL)); prt_str(out, ": "); } -__printf(9, 10) +__printf(10, 11) static int __btree_err(int ret, struct bch_fs *c, struct bch_dev *ca, struct btree *b, struct bset *i, + struct bkey_packed *k, int write, bool have_retry, enum bch_sb_error_id err_type, const char *fmt, ...) { struct printbuf out = PRINTBUF; + bool silent = c->curr_recovery_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes; va_list args; - btree_err_msg(&out, c, ca, b, i, b->written, write); + btree_err_msg(&out, c, ca, b, i, k, b->written, write); va_start(args, fmt); prt_vprintf(&out, fmt, args); @@ -564,12 +579,14 @@ static int __btree_err(int ret, if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry) ret = -BCH_ERR_btree_node_read_err_bad_node; - if (ret != -BCH_ERR_btree_node_read_err_fixable) + if (!silent && ret != -BCH_ERR_btree_node_read_err_fixable) bch2_sb_error_count(c, err_type); switch (ret) { case -BCH_ERR_btree_node_read_err_fixable: - ret = bch2_fsck_err(c, FSCK_CAN_FIX, err_type, "%s", out.buf); + ret = !silent + ? 
bch2_fsck_err(c, FSCK_CAN_FIX, err_type, "%s", out.buf) + : -BCH_ERR_fsck_fix; if (ret != -BCH_ERR_fsck_fix && ret != -BCH_ERR_fsck_ignore) goto fsck_err; @@ -577,14 +594,17 @@ static int __btree_err(int ret, break; case -BCH_ERR_btree_node_read_err_want_retry: case -BCH_ERR_btree_node_read_err_must_retry: - bch2_print_string_as_lines(KERN_ERR, out.buf); + if (!silent) + bch2_print_string_as_lines(KERN_ERR, out.buf); break; case -BCH_ERR_btree_node_read_err_bad_node: - bch2_print_string_as_lines(KERN_ERR, out.buf); + if (!silent) + bch2_print_string_as_lines(KERN_ERR, out.buf); ret = bch2_topology_error(c); break; case -BCH_ERR_btree_node_read_err_incompatible: - bch2_print_string_as_lines(KERN_ERR, out.buf); + if (!silent) + bch2_print_string_as_lines(KERN_ERR, out.buf); ret = -BCH_ERR_fsck_errors_not_fixed; break; default: @@ -596,9 +616,9 @@ fsck_err: return ret; } -#define btree_err(type, c, ca, b, i, _err_type, msg, ...) \ +#define btree_err(type, c, ca, b, i, k, _err_type, msg, ...) 
\ ({ \ - int _ret = __btree_err(type, c, ca, b, i, write, have_retry, \ + int _ret = __btree_err(type, c, ca, b, i, k, write, have_retry, \ BCH_FSCK_ERR_##_err_type, \ msg, ##__VA_ARGS__); \ \ @@ -619,8 +639,6 @@ fsck_err: __cold void bch2_btree_node_drop_keys_outside_node(struct btree *b) { - struct bset_tree *t; - for_each_bset(b, t) { struct bset *i = bset(b, t); struct bkey_packed *k; @@ -677,7 +695,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_err_on(!bch2_version_compatible(version), -BCH_ERR_btree_node_read_err_incompatible, - c, ca, b, i, + c, ca, b, i, NULL, btree_node_unsupported_version, "unsupported bset version %u.%u", BCH_VERSION_MAJOR(version), @@ -685,7 +703,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, if (btree_err_on(version < c->sb.version_min, -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, NULL, btree_node_bset_older_than_sb_min, "bset version %u older than superblock version_min %u", version, c->sb.version_min)) { @@ -698,7 +716,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, if (btree_err_on(BCH_VERSION_MAJOR(version) > BCH_VERSION_MAJOR(c->sb.version), -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, NULL, btree_node_bset_newer_than_sb, "bset version %u newer than superblock version %u", version, c->sb.version)) { @@ -710,13 +728,13 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_err_on(BSET_SEPARATE_WHITEOUTS(i), -BCH_ERR_btree_node_read_err_incompatible, - c, ca, b, i, + c, ca, b, i, NULL, btree_node_unsupported_version, "BSET_SEPARATE_WHITEOUTS no longer supported"); if (btree_err_on(offset + sectors > btree_sectors(c), -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, + c, ca, b, i, NULL, bset_past_end_of_btree_node, "bset past end of btree node")) { i->u64s = 0; @@ -726,13 +744,13 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_err_on(offset && !i->u64s, 
-BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, + c, ca, b, i, NULL, bset_empty, "empty bset"); btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset, -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, + c, ca, b, i, NULL, bset_wrong_sector_offset, "bset at wrong sector offset"); @@ -748,20 +766,20 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, /* XXX endianness */ btree_err_on(bp->seq != bn->keys.seq, -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, bset_bad_seq, "incorrect sequence number (wrong btree node)"); } btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id, -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, i, + c, ca, b, i, NULL, btree_node_bad_btree, "incorrect btree id"); btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level, -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, i, + c, ca, b, i, NULL, btree_node_bad_level, "incorrect level"); @@ -780,7 +798,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_err_on(!bpos_eq(b->data->min_key, bp->min_key), -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, btree_node_bad_min_key, "incorrect min_key: got %s should be %s", (printbuf_reset(&buf1), @@ -791,7 +809,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_err_on(!bpos_eq(bn->max_key, b->key.k.p), -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, i, + c, ca, b, i, NULL, btree_node_bad_max_key, "incorrect max key %s", (printbuf_reset(&buf1), @@ -803,7 +821,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1), -BCH_ERR_btree_node_read_err_bad_node, - c, ca, b, i, + c, ca, b, i, NULL, btree_node_bad_format, "invalid bkey format: %s\n %s", buf1.buf, (printbuf_reset(&buf2), @@ -870,7 +888,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, if (btree_err_on(bkey_p_next(k) > vstruct_last(i), 
-BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, k, btree_node_bkey_past_bset_end, "key extends past end of bset")) { i->u64s = cpu_to_le16((u64 *) k - i->_data); @@ -879,14 +897,14 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, if (btree_err_on(k->format > KEY_FORMAT_CURRENT, -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, k, btree_node_bkey_bad_format, "invalid bkey format %u", k->format)) goto drop_this_key; if (btree_err_on(!bkeyp_u64s_valid(&b->format, k), -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, k, btree_node_bkey_bad_u64s, "bad k->u64s %u (min %u max %zu)", k->u64s, bkeyp_key_u64s(&b->format, k), @@ -908,7 +926,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, bch2_bkey_val_to_text(&buf, c, u.s_c); btree_err(-BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, k, btree_node_bad_bkey, "invalid bkey: %s", buf.buf); goto drop_this_key; @@ -929,7 +947,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, bch2_bkey_to_text(&buf, u.k); if (btree_err(-BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, k, btree_node_bkey_out_of_order, "%s", buf.buf)) goto drop_this_key; @@ -998,13 +1016,13 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, if (bch2_meta_read_fault("btree")) btree_err(-BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, btree_node_fault_injected, "dynamic fault"); btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, btree_node_bad_magic, "bad magic: want %llx, got %llx", bset_magic(c), le64_to_cpu(b->data->magic)); @@ -1019,20 +1037,21 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(b->data->keys.seq != bp->seq, -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, 
btree_node_bad_seq, - "got wrong btree node (want %llx got %llx)\n" - "got btree %s level %llu pos %s", - bp->seq, b->data->keys.seq, - bch2_btree_id_str(BTREE_NODE_ID(b->data)), - BTREE_NODE_LEVEL(b->data), - buf.buf); + "got wrong btree node: got\n%s", + (printbuf_reset(&buf), + bch2_btree_node_header_to_text(&buf, b->data), + buf.buf)); } else { btree_err_on(!b->data->keys.seq, -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, btree_node_bad_seq, - "bad btree header: seq 0"); + "bad btree header: seq 0\n%s", + (printbuf_reset(&buf), + bch2_btree_node_header_to_text(&buf, b->data), + buf.buf)); } while (b->written < (ptr_written ?: btree_sectors(c))) { @@ -1046,7 +1065,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, + c, ca, b, i, NULL, bset_unknown_csum, "unknown checksum type %llu", BSET_CSUM_TYPE(i)); @@ -1059,7 +1078,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(csum_bad, -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, + c, ca, b, i, NULL, bset_bad_csum, "%s", (printbuf_reset(&buf), @@ -1074,7 +1093,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), -BCH_ERR_btree_node_read_err_incompatible, - c, NULL, b, NULL, + c, NULL, b, NULL, NULL, btree_node_unsupported_version, "btree node does not have NEW_EXTENT_OVERWRITE set"); @@ -1088,19 +1107,19 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, + c, ca, b, i, NULL, bset_unknown_csum, "unknown checksum type %llu", BSET_CSUM_TYPE(i)); nonce = btree_nonce(i, b->written << 9); struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), 
nonce, bne); csum_bad = bch2_crc_cmp(bne->csum, csum); - if (csum_bad) + if (ca && csum_bad) bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); btree_err_on(csum_bad, -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, + c, ca, b, i, NULL, bset_bad_csum, "%s", (printbuf_reset(&buf), @@ -1138,14 +1157,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(blacklisted && first, -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, + c, ca, b, i, NULL, bset_blacklisted_journal_seq, "first btree node bset has blacklisted journal seq (%llu)", le64_to_cpu(i->journal_seq)); btree_err_on(blacklisted && ptr_written, -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, + c, ca, b, i, NULL, first_bset_blacklisted_journal_seq, "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u", le64_to_cpu(i->journal_seq), @@ -1164,7 +1183,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, if (ptr_written) { btree_err_on(b->written < ptr_written, -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, btree_node_data_missing, "btree node data missing: expected %u sectors, found %u", ptr_written, b->written); @@ -1177,7 +1196,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, le64_to_cpu(bne->keys.journal_seq), true), -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, btree_node_bset_after_end, "found bset signature after last bset"); } @@ -1221,7 +1240,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, bch2_bkey_val_to_text(&buf, c, u.s_c); btree_err(-BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, k, btree_node_bad_bkey, "%s", buf.buf); @@ -1249,12 +1268,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_node_reset_sib_u64s(b); + rcu_read_lock(); bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { - struct bch_dev *ca2 = bch_dev_bkey_exists(c, 
ptr->dev); + struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev); - if (ca2->mi.state != BCH_MEMBER_STATE_rw) + if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) set_btree_node_need_rewrite(b); } + rcu_read_unlock(); if (!ptr_written) set_btree_node_need_rewrite(b); @@ -1279,8 +1300,8 @@ static void btree_node_read_work(struct work_struct *work) struct btree_read_bio *rb = container_of(work, struct btree_read_bio, work); struct bch_fs *c = rb->c; + struct bch_dev *ca = rb->have_ioref ? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL; struct btree *b = rb->b; - struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); struct bio *bio = &rb->bio; struct bch_io_failures failed = { .nr = 0 }; struct printbuf buf = PRINTBUF; @@ -1292,8 +1313,8 @@ static void btree_node_read_work(struct work_struct *work) while (1) { retry = true; bch_info(c, "retrying read"); - ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); - rb->have_ioref = bch2_dev_get_ioref(ca, READ); + ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ); + rb->have_ioref = ca != NULL; bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = rb->pick.ptr.offset; bio->bi_iter.bi_size = btree_buf_bytes(b); @@ -1307,7 +1328,7 @@ static void btree_node_read_work(struct work_struct *work) start: printbuf_reset(&buf); bch2_btree_pos_to_text(&buf, c, b); - bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read, + bch2_dev_io_err_on(ca && bio->bi_status, ca, BCH_MEMBER_ERROR_read, "btree read error %s for %s", bch2_blk_status_to_str(bio->bi_status), buf.buf); if (rb->have_ioref) @@ -1363,12 +1384,12 @@ static void btree_node_read_endio(struct bio *bio) struct bch_fs *c = rb->c; if (rb->have_ioref) { - struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); + struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev); bch2_latency_acct(ca, rb->start_time, READ); } - queue_work(c->io_complete_wq, &rb->work); + queue_work(c->btree_read_complete_wq, &rb->work); } struct btree_node_read_all { @@ 
-1455,18 +1476,18 @@ static CLOSURE_CALLBACK(btree_node_read_all_replicas_done) written2 = btree_node_sectors_written(c, ra->buf[i]); if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, NULL, + c, NULL, b, NULL, NULL, btree_node_replicas_sectors_written_mismatch, "btree node sectors written mismatch: %u != %u", written, written2) || btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]), -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, NULL, + c, NULL, b, NULL, NULL, btree_node_bset_after_end, "found bset signature after last bset") || btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9), -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, NULL, + c, NULL, b, NULL, NULL, btree_node_replicas_data_mismatch, "btree node replicas content mismatch")) dump_bset_maps = true; @@ -1560,7 +1581,7 @@ static void btree_node_read_all_replicas_endio(struct bio *bio) struct btree_node_read_all *ra = rb->ra; if (rb->have_ioref) { - struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); + struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev); bch2_latency_acct(ca, rb->start_time, READ); } @@ -1602,14 +1623,14 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool i = 0; bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) { - struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev); + struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); struct btree_read_bio *rb = container_of(ra->bio[i], struct btree_read_bio, bio); rb->c = c; rb->b = b; rb->ra = ra; rb->start_time = local_clock(); - rb->have_ioref = bch2_dev_get_ioref(ca, READ); + rb->have_ioref = ca != NULL; rb->idx = i; rb->pick = pick; rb->bio.bi_iter.bi_sector = pick.ptr.offset; @@ -1635,7 +1656,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool btree_node_read_all_replicas_done(&ra->cl.work); } else { continue_at(&ra->cl, btree_node_read_all_replicas_done, - c->io_complete_wq); + 
c->btree_read_complete_wq); } return 0; @@ -1679,7 +1700,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, return; } - ca = bch_dev_bkey_exists(c, pick.ptr.dev); + ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); bio = bio_alloc_bioset(NULL, buf_pages(b->data, btree_buf_bytes(b)), @@ -1691,7 +1712,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, rb->b = b; rb->ra = NULL; rb->start_time = local_clock(); - rb->have_ioref = bch2_dev_get_ioref(ca, READ); + rb->have_ioref = ca != NULL; rb->pick = pick; INIT_WORK(&rb->work, btree_node_read_work); bio->bi_iter.bi_sector = pick.ptr.offset; @@ -1716,7 +1737,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, if (sync) btree_node_read_work(&rb->work); else - queue_work(c->io_complete_wq, &rb->work); + queue_work(c->btree_read_complete_wq, &rb->work); } } @@ -1846,7 +1867,6 @@ static void btree_node_write_work(struct work_struct *work) container_of(work, struct btree_write_bio, work); struct bch_fs *c = wbio->wbio.c; struct btree *b = wbio->wbio.bio.bi_private; - struct bch_extent_ptr *ptr; int ret = 0; btree_bounce_free(c, @@ -1896,13 +1916,14 @@ static void btree_node_write_endio(struct bio *bio) struct btree_write_bio *wb = container_of(orig, struct btree_write_bio, wbio); struct bch_fs *c = wbio->c; struct btree *b = wbio->bio.bi_private; - struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); + struct bch_dev *ca = wbio->have_ioref ? 
bch2_dev_have_ref(c, wbio->dev) : NULL; unsigned long flags; if (wbio->have_ioref) bch2_latency_acct(ca, wbio->submit_time, WRITE); - if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, + if (!ca || + bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, "btree write error: %s", bch2_blk_status_to_str(bio->bi_status)) || bch2_meta_write_fault("btree")) { @@ -1969,7 +1990,6 @@ static void btree_write_submit(struct work_struct *work) void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) { struct btree_write_bio *wbio; - struct bset_tree *t; struct bset *i; struct btree_node *bn = NULL; struct btree_node_entry *bne = NULL; @@ -2095,11 +2115,11 @@ do_write: unwritten_whiteouts_end(b)); SET_BSET_SEPARATE_WHITEOUTS(i, false); - b->whiteout_u64s = 0; - - u64s = bch2_sort_keys(i->start, &sort_iter.iter, false); + u64s = bch2_sort_keys_keep_unwritten_whiteouts(i->start, &sort_iter.iter); le16_add_cpu(&i->u64s, u64s); + b->whiteout_u64s = 0; + BUG_ON(!b->written && i->u64s != b->data->keys.u64s); set_needs_whiteout(i, false); @@ -2209,7 +2229,7 @@ do_write: atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes); INIT_WORK(&wbio->work, btree_write_submit); - queue_work(c->io_complete_wq, &wbio->work); + queue_work(c->btree_write_submit_wq, &wbio->work); return; err: set_btree_node_noevict(b); @@ -2226,7 +2246,6 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) { bool invalidated_iter = false; struct btree_node_entry *bne; - struct bset_tree *t; if (!btree_node_just_written(b)) return false; @@ -2249,7 +2268,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) * single bset: */ if (b->nsets > 1) { - btree_node_sort(c, b, 0, b->nsets, true); + btree_node_sort(c, b, 0, b->nsets); invalidated_iter = true; } else { invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL); @@ -2346,20 +2365,13 @@ void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c) 
printbuf_tabstop_push(out, 20); printbuf_tabstop_push(out, 10); - prt_tab(out); - prt_str(out, "nr"); - prt_tab(out); - prt_str(out, "size"); - prt_newline(out); + prt_printf(out, "\tnr\tsize\n"); for (unsigned i = 0; i < BTREE_WRITE_TYPE_NR; i++) { u64 nr = atomic64_read(&c->btree_write_stats[i].nr); u64 bytes = atomic64_read(&c->btree_write_stats[i].bytes); - prt_printf(out, "%s:", bch2_btree_write_types[i]); - prt_tab(out); - prt_u64(out, nr); - prt_tab(out); + prt_printf(out, "%s:\t%llu\t", bch2_btree_write_types[i], nr); prt_human_readable_u64(out, nr ? div64_u64(bytes, nr) : 0); prt_newline(out); } diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index e251cb6b96..2b8b564fc5 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -81,8 +81,6 @@ static inline bool should_compact_bset_lazy(struct btree *b, static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b) { - struct bset_tree *t; - for_each_bset(b, t) if (should_compact_bset_lazy(b, t)) return bch2_compact_whiteouts(c, b, COMPACT_LAZY); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 2a211a4beb..19352a08ea 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -61,7 +61,7 @@ static inline int btree_path_cmp(const struct btree_path *l, static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) { /* Are we iterating over keys in all snapshots? */ - if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { + if (iter->flags & BTREE_ITER_all_snapshots) { p = bpos_successor(p); } else { p = bpos_nosnap_successor(p); @@ -74,7 +74,7 @@ static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p) { /* Are we iterating over keys in all snapshots? 
*/ - if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { + if (iter->flags & BTREE_ITER_all_snapshots) { p = bpos_predecessor(p); } else { p = bpos_nosnap_predecessor(p); @@ -88,7 +88,7 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter) { struct bpos pos = iter->pos; - if ((iter->flags & BTREE_ITER_IS_EXTENTS) && + if ((iter->flags & BTREE_ITER_is_extents) && !bkey_eq(pos, POS_MAX)) pos = bkey_successor(iter, pos); return pos; @@ -221,11 +221,8 @@ static void bch2_btree_path_verify(struct btree_trans *trans, struct btree_path *path) { struct bch_fs *c = trans->c; - unsigned i; - - EBUG_ON(path->btree_id >= BTREE_ID_NR); - for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) { + for (unsigned i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) { if (!path->l[i].b) { BUG_ON(!path->cached && bch2_btree_id_root(c, path->btree_id)->b->c.level > i); @@ -251,15 +248,13 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) { struct btree_trans *trans = iter->trans; - BUG_ON(iter->btree_id >= BTREE_ID_NR); - - BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != btree_iter_path(trans, iter)->cached); + BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached); - BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && - (iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); + BUG_ON((iter->flags & BTREE_ITER_is_extents) && + (iter->flags & BTREE_ITER_all_snapshots)); - BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) && - (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && + BUG_ON(!(iter->flags & BTREE_ITER_snapshot_field) && + (iter->flags & BTREE_ITER_all_snapshots) && !btree_type_has_snapshot_field(iter->btree_id)); if (iter->update_path) @@ -269,10 +264,10 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) { - BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && + BUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && !iter->pos.snapshot); - 
BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && + BUG_ON(!(iter->flags & BTREE_ITER_all_snapshots) && iter->pos.snapshot != iter->snapshot); BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) || @@ -289,7 +284,7 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k if (!bch2_debug_check_iterators) return 0; - if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) + if (!(iter->flags & BTREE_ITER_filter_snapshots)) return 0; if (bkey_err(k) || !k.k) @@ -300,8 +295,8 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k k.k->p.snapshot)); bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos, - BTREE_ITER_NOPRESERVE| - BTREE_ITER_ALL_SNAPSHOTS); + BTREE_ITER_nopreserve| + BTREE_ITER_all_snapshots); prev = bch2_btree_iter_prev(&copy); if (!prev.k) goto out; @@ -332,6 +327,8 @@ out: void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, struct bpos pos, bool key_cache) { + bch2_trans_verify_not_unlocked(trans); + struct btree_path *path; struct trans_for_each_path_inorder_iter iter; struct printbuf buf = PRINTBUF; @@ -897,7 +894,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, bch2_bkey_buf_reassemble(out, c, k); - if ((flags & BTREE_ITER_PREFETCH) && + if ((flags & BTREE_ITER_prefetch) && c->opts.btree_node_prefetch) ret = btree_path_prefetch_j(trans, path, &jiter); @@ -944,7 +941,7 @@ static __always_inline int btree_path_down(struct btree_trans *trans, bch2_bkey_buf_unpack(&tmp, c, l->b, k); - if ((flags & BTREE_ITER_PREFETCH) && + if ((flags & BTREE_ITER_prefetch) && c->opts.btree_node_prefetch) { ret = btree_path_prefetch(trans, path); if (ret) @@ -999,6 +996,7 @@ retry_all: bch2_trans_unlock(trans); cond_resched(); + trans_set_locked(trans); if (unlikely(trans->memory_allocation_failure)) { struct closure cl; @@ -1162,6 +1160,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans, goto out_uptodate; path->level = 
btree_path_up_until_good_node(trans, path, 0); + unsigned max_level = path->level; EBUG_ON(btree_path_node(path, path->level) && !btree_node_locked(path, path->level)); @@ -1192,6 +1191,16 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans, goto out; } } + + if (unlikely(max_level > path->level)) { + struct btree_path *linked; + unsigned iter; + + trans_for_each_path_with_node(trans, path_l(path)->b, linked, iter) + for (unsigned j = path->level + 1; j < max_level; j++) + linked->l[j] = path->l[j]; + } + out_uptodate: path->uptodate = BTREE_ITER_UPTODATE; out: @@ -1221,11 +1230,14 @@ static inline void btree_path_copy(struct btree_trans *trans, struct btree_path } static btree_path_idx_t btree_path_clone(struct btree_trans *trans, btree_path_idx_t src, - bool intent) + bool intent, unsigned long ip) { btree_path_idx_t new = btree_path_alloc(trans, src); btree_path_copy(trans, trans->paths + new, trans->paths + src); __btree_path_get(trans->paths + new, intent); +#ifdef TRACK_PATH_ALLOCATED + trans->paths[new].ip_allocated = ip; +#endif return new; } @@ -1234,7 +1246,7 @@ btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *trans, btree_path_idx_t path, bool intent, unsigned long ip) { __btree_path_put(trans->paths + path, intent); - path = btree_path_clone(trans, path, intent); + path = btree_path_clone(trans, path, intent, ip); trans->paths[path].preserve = false; return path; } @@ -1334,6 +1346,26 @@ static inline void __bch2_path_free(struct btree_trans *trans, btree_path_idx_t __clear_bit(path, trans->paths_allocated); } +static bool bch2_btree_path_can_relock(struct btree_trans *trans, struct btree_path *path) +{ + unsigned l = path->level; + + do { + if (!btree_path_node(path, l)) + break; + + if (!is_btree_node(path, l)) + return false; + + if (path->l[l].lock_seq != path->l[l].b->c.lock.seq) + return false; + + l++; + } while (l < path->locks_want); + + return true; +} + void bch2_path_put(struct btree_trans *trans, btree_path_idx_t 
path_idx, bool intent) { struct btree_path *path = trans->paths + path_idx, *dup; @@ -1348,10 +1380,15 @@ void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool in if (!dup && !(!path->preserve && !is_btree_node(path, path->level))) return; - if (path->should_be_locked && - !trans->restarted && - (!dup || !bch2_btree_path_relock_norestart(trans, dup))) - return; + if (path->should_be_locked && !trans->restarted) { + if (!dup) + return; + + if (!(trans->locked + ? bch2_btree_path_relock_norestart(trans, dup) + : bch2_btree_path_can_relock(trans, dup))) + return; + } if (dup) { dup->preserve |= path->preserve; @@ -1384,22 +1421,26 @@ void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans) (void *) trans->last_restarted_ip); } +void __noreturn bch2_trans_unlocked_error(struct btree_trans *trans) +{ + panic("trans should be locked, unlocked by %pS\n", + (void *) trans->last_unlock_ip); +} + noinline __cold void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) { - prt_printf(buf, "transaction updates for %s journal seq %llu", + prt_printf(buf, "transaction updates for %s journal seq %llu\n", trans->fn, trans->journal_res.seq); - prt_newline(buf); printbuf_indent_add(buf, 2); trans_for_each_update(trans, i) { struct bkey_s_c old = { &i->old_k, i->old_v }; - prt_printf(buf, "update: btree=%s cached=%u %pS", + prt_printf(buf, "update: btree=%s cached=%u %pS\n", bch2_btree_id_str(i->btree_id), i->cached, (void *) i->ip_allocated); - prt_newline(buf); prt_printf(buf, " old "); bch2_bkey_val_to_text(buf, trans->c, old); @@ -1428,23 +1469,63 @@ void bch2_dump_trans_updates(struct btree_trans *trans) printbuf_exit(&buf); } -static void bch2_btree_path_to_text(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx) +static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx) { struct btree_path *path = trans->paths + path_idx; - 
prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ", + prt_printf(out, "path: idx %2u ref %u:%u %c %c %c btree=%s l=%u pos ", path_idx, path->ref, path->intent_ref, path->preserve ? 'P' : ' ', path->should_be_locked ? 'S' : ' ', + path->cached ? 'C' : 'B', bch2_btree_id_str(path->btree_id), path->level); bch2_bpos_to_text(out, path->pos); - prt_printf(out, " locks %u", path->nodes_locked); #ifdef TRACK_PATH_ALLOCATED prt_printf(out, " %pS", (void *) path->ip_allocated); #endif +} + +static const char *btree_node_locked_str(enum btree_node_locked_type t) +{ + switch (t) { + case BTREE_NODE_UNLOCKED: + return "unlocked"; + case BTREE_NODE_READ_LOCKED: + return "read"; + case BTREE_NODE_INTENT_LOCKED: + return "intent"; + case BTREE_NODE_WRITE_LOCKED: + return "write"; + default: + return NULL; + } +} + +void bch2_btree_path_to_text(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx) +{ + bch2_btree_path_to_text_short(out, trans, path_idx); + + struct btree_path *path = trans->paths + path_idx; + + prt_printf(out, " uptodate %u locks_want %u", path->uptodate, path->locks_want); prt_newline(out); + + printbuf_indent_add(out, 2); + for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) { + prt_printf(out, "l=%u locks %s seq %u node ", l, + btree_node_locked_str(btree_node_locked_type(path, l)), + path->l[l].lock_seq); + + int ret = PTR_ERR_OR_ZERO(path->l[l].b); + if (ret) + prt_str(out, bch2_err_str(ret)); + else + prt_printf(out, "%px", path->l[l].b); + prt_newline(out); + } + printbuf_indent_sub(out, 2); } static noinline __cold @@ -1456,8 +1537,10 @@ void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans, if (!nosort) btree_trans_sort_paths(trans); - trans_for_each_path_idx_inorder(trans, iter) - bch2_btree_path_to_text(out, trans, iter.path_idx); + trans_for_each_path_idx_inorder(trans, iter) { + bch2_btree_path_to_text_short(out, trans, iter.path_idx); + prt_newline(out); + } } noinline __cold @@ -1608,11 
+1691,12 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, unsigned flags, unsigned long ip) { struct btree_path *path; - bool cached = flags & BTREE_ITER_CACHED; - bool intent = flags & BTREE_ITER_INTENT; + bool cached = flags & BTREE_ITER_cached; + bool intent = flags & BTREE_ITER_intent; struct trans_for_each_path_inorder_iter iter; btree_path_idx_t path_pos = 0, path_idx; + bch2_trans_verify_not_unlocked(trans); bch2_trans_verify_not_in_restart(trans); bch2_trans_verify_locks(trans); @@ -1657,7 +1741,7 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, trans->paths_sorted = false; } - if (!(flags & BTREE_ITER_NOPRESERVE)) + if (!(flags & BTREE_ITER_nopreserve)) path->preserve = true; if (path->intent_ref) @@ -1678,6 +1762,22 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, return path_idx; } +btree_path_idx_t bch2_path_get_unlocked_mut(struct btree_trans *trans, + enum btree_id btree_id, + unsigned level, + struct bpos pos) +{ + btree_path_idx_t path_idx = bch2_path_get(trans, btree_id, pos, level + 1, level, + BTREE_ITER_nopreserve| + BTREE_ITER_intent, _RET_IP_); + path_idx = bch2_btree_path_make_mut(trans, path_idx, true, _RET_IP_); + + struct btree_path *path = trans->paths + path_idx; + bch2_btree_path_downgrade(trans, path); + __bch2_btree_path_unlock(trans, path); + return path_idx; +} + struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u) { @@ -1719,6 +1819,19 @@ hole: return (struct bkey_s_c) { u, NULL }; } + +void bch2_set_btree_iter_dontneed(struct btree_iter *iter) +{ + struct btree_trans *trans = iter->trans; + + if (!iter->path || trans->restarted) + return; + + struct btree_path *path = btree_iter_path(trans, iter); + path->preserve = false; + if (path->ref == 1) + path->should_be_locked = false; +} /* Btree iterators: */ int __must_check @@ -1733,9 +1846,11 @@ bch2_btree_iter_traverse(struct btree_iter *iter) struct btree_trans *trans = iter->trans; int ret; + 
bch2_trans_verify_not_unlocked(trans); + iter->path = bch2_btree_path_set_pos(trans, iter->path, btree_iter_search_key(iter), - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); @@ -1774,7 +1889,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) iter->k.p = iter->pos = b->key.k.p; iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); btree_path_set_should_be_locked(btree_iter_path(trans, iter)); out: @@ -1835,13 +1950,16 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) if (bpos_eq(iter->pos, b->key.k.p)) { __btree_path_set_level_up(trans, path, path->level++); } else { + if (btree_lock_want(path, path->level + 1) == BTREE_NODE_UNLOCKED) + btree_node_unlock(trans, path, path->level + 1); + /* * Haven't gotten to the end of the parent node: go back down to * the next child node */ iter->path = bch2_btree_path_set_pos(trans, iter->path, bpos_successor(iter->pos), - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); path = btree_iter_path(trans, iter); @@ -1859,7 +1977,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) iter->k.p = iter->pos = b->key.k.p; iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); btree_path_set_should_be_locked(btree_iter_path(trans, iter)); EBUG_ON(btree_iter_path(trans, iter)->uptodate); @@ -1878,11 +1996,11 @@ err: inline bool bch2_btree_iter_advance(struct btree_iter *iter) { struct bpos pos = iter->k.p; - bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS + bool ret = !(iter->flags & BTREE_ITER_all_snapshots ? 
bpos_eq(pos, SPOS_MAX) : bkey_eq(pos, SPOS_MAX)); - if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + if (ret && !(iter->flags & BTREE_ITER_is_extents)) pos = bkey_successor(iter, pos); bch2_btree_iter_set_pos(iter, pos); return ret; @@ -1891,11 +2009,11 @@ inline bool bch2_btree_iter_advance(struct btree_iter *iter) inline bool bch2_btree_iter_rewind(struct btree_iter *iter) { struct bpos pos = bkey_start_pos(&iter->k); - bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS + bool ret = !(iter->flags & BTREE_ITER_all_snapshots ? bpos_eq(pos, POS_MIN) : bkey_eq(pos, POS_MIN)); - if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + if (ret && !(iter->flags & BTREE_ITER_is_extents)) pos = bkey_predecessor(iter, pos); bch2_btree_iter_set_pos(iter, pos); return ret; @@ -2006,7 +2124,10 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos struct bkey_s_c k; int ret; - if ((iter->flags & BTREE_ITER_KEY_CACHE_FILL) && + bch2_trans_verify_not_in_restart(trans); + bch2_trans_verify_not_unlocked(trans); + + if ((iter->flags & BTREE_ITER_key_cache_fill) && bpos_eq(iter->pos, pos)) return bkey_s_c_null; @@ -2015,17 +2136,17 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos if (!iter->key_cache_path) iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos, - iter->flags & BTREE_ITER_INTENT, 0, - iter->flags|BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL, + iter->flags & BTREE_ITER_intent, 0, + iter->flags|BTREE_ITER_cached| + BTREE_ITER_cached_nofill, _THIS_IP_); iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos, - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); ret = bch2_btree_path_traverse(trans, iter->key_cache_path, - iter->flags|BTREE_ITER_CACHED) ?: + iter->flags|BTREE_ITER_cached) ?: bch2_btree_path_relock(trans, btree_iter_path(trans, iter), _THIS_IP_); if (unlikely(ret)) return bkey_s_c_err(ret); @@ -2053,7 +2174,7 
@@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp struct btree_path_level *l; iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); @@ -2078,7 +2199,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp k = btree_path_level_peek_all(trans->c, l, &iter->k); - if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && + if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && k.k && (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { k = k2; @@ -2089,10 +2210,10 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp } } - if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL)) + if (unlikely(iter->flags & BTREE_ITER_with_journal)) k = btree_trans_peek_journal(trans, iter, k); - if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) && + if (unlikely((iter->flags & BTREE_ITER_with_updates) && trans->nr_updates)) bch2_btree_trans_peek_updates(trans, iter, &k); @@ -2144,11 +2265,12 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e struct bpos iter_pos; int ret; - EBUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && bkey_eq(end, POS_MAX)); + bch2_trans_verify_not_unlocked(trans); + EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX)); if (iter->update_path) { bch2_path_put_nokeep(trans, iter->update_path, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_intent); iter->update_path = 0; } @@ -2171,7 +2293,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e * isn't monotonically increasing before FILTER_SNAPSHOTS, and * that's what we check against in extents mode: */ - if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS) + if (unlikely(!(iter->flags & BTREE_ITER_is_extents) ? 
bkey_gt(k.k->p, end) : k.k->p.inode > end.inode)) goto end; @@ -2179,13 +2301,13 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e if (iter->update_path && !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) { bch2_path_put_nokeep(trans, iter->update_path, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_intent); iter->update_path = 0; } - if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && - (iter->flags & BTREE_ITER_INTENT) && - !(iter->flags & BTREE_ITER_IS_EXTENTS) && + if ((iter->flags & BTREE_ITER_filter_snapshots) && + (iter->flags & BTREE_ITER_intent) && + !(iter->flags & BTREE_ITER_is_extents) && !iter->update_path) { struct bpos pos = k.k->p; @@ -2200,12 +2322,12 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e * advance, same as on exit for iter->path, but only up * to snapshot */ - __btree_path_get(trans->paths + iter->path, iter->flags & BTREE_ITER_INTENT); + __btree_path_get(trans->paths + iter->path, iter->flags & BTREE_ITER_intent); iter->update_path = iter->path; iter->update_path = bch2_btree_path_set_pos(trans, iter->update_path, pos, - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, _THIS_IP_); ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags); if (unlikely(ret)) { @@ -2218,7 +2340,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e * We can never have a key in a leaf node at POS_MAX, so * we don't have to check these successor() calls: */ - if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && + if ((iter->flags & BTREE_ITER_filter_snapshots) && !bch2_snapshot_is_ancestor(trans->c, iter->snapshot, k.k->p.snapshot)) { @@ -2227,7 +2349,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e } if (bkey_whiteout(k.k) && - !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { + !(iter->flags & BTREE_ITER_all_snapshots)) { search_key = bkey_successor(iter, k.k->p); continue; } 
@@ -2237,12 +2359,12 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e * equal to the key we just returned - except extents can * straddle iter->pos: */ - if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) + if (!(iter->flags & BTREE_ITER_is_extents)) iter_pos = k.k->p; else iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k)); - if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS) + if (unlikely(!(iter->flags & BTREE_ITER_is_extents) ? bkey_gt(iter_pos, end) : bkey_ge(iter_pos, end))) goto end; @@ -2253,7 +2375,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e iter->pos = iter_pos; iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p, - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); btree_path_set_should_be_locked(btree_iter_path(trans, iter)); @@ -2266,7 +2388,7 @@ out_no_locked: btree_path_set_should_be_locked(trans->paths + iter->update_path); } - if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) + if (!(iter->flags & BTREE_ITER_all_snapshots)) iter->pos.snapshot = iter->snapshot; ret = bch2_btree_iter_verify_ret(iter, k); @@ -2316,21 +2438,22 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) btree_path_idx_t saved_path = 0; int ret; + bch2_trans_verify_not_unlocked(trans); EBUG_ON(btree_iter_path(trans, iter)->cached || btree_iter_path(trans, iter)->level); - if (iter->flags & BTREE_ITER_WITH_JOURNAL) + if (iter->flags & BTREE_ITER_with_journal) return bkey_s_c_err(-BCH_ERR_btree_iter_with_journal_not_supported); bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); - if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) + if (iter->flags & BTREE_ITER_filter_snapshots) search_key.snapshot = U32_MAX; while (1) { iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); ret = 
bch2_btree_path_traverse(trans, iter->path, iter->flags); @@ -2345,17 +2468,17 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) k = btree_path_level_peek(trans, path, &path->l[0], &iter->k); if (!k.k || - ((iter->flags & BTREE_ITER_IS_EXTENTS) + ((iter->flags & BTREE_ITER_is_extents) ? bpos_ge(bkey_start_pos(k.k), search_key) : bpos_gt(k.k->p, search_key))) k = btree_path_level_prev(trans, path, &path->l[0], &iter->k); - if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) && + if (unlikely((iter->flags & BTREE_ITER_with_updates) && trans->nr_updates)) bch2_btree_trans_peek_prev_updates(trans, iter, &k); if (likely(k.k)) { - if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) { + if (iter->flags & BTREE_ITER_filter_snapshots) { if (k.k->p.snapshot == iter->snapshot) goto got_key; @@ -2366,7 +2489,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) */ if (saved_path && !bkey_eq(k.k->p, saved_k.p)) { bch2_path_put_nokeep(trans, iter->path, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_intent); iter->path = saved_path; saved_path = 0; iter->k = saved_k; @@ -2379,9 +2502,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) k.k->p.snapshot)) { if (saved_path) bch2_path_put_nokeep(trans, saved_path, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_intent); saved_path = btree_path_clone(trans, iter->path, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_intent, + _THIS_IP_); path = btree_iter_path(trans, iter); saved_k = *k.k; saved_v = k.v; @@ -2392,9 +2516,9 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) } got_key: if (bkey_whiteout(k.k) && - !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { + !(iter->flags & BTREE_ITER_all_snapshots)) { search_key = bkey_predecessor(iter, k.k->p); - if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) + if (iter->flags & BTREE_ITER_filter_snapshots) search_key.snapshot = U32_MAX; continue; } @@ -2418,11 +2542,11 @@ got_key: 
if (bkey_lt(k.k->p, iter->pos)) iter->pos = k.k->p; - if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) + if (iter->flags & BTREE_ITER_filter_snapshots) iter->pos.snapshot = iter->snapshot; out_no_locked: if (saved_path) - bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT); + bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_intent); bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); @@ -2452,12 +2576,13 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct bkey_s_c k; int ret; + bch2_trans_verify_not_unlocked(trans); bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); - EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE)); + EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache)); /* extents can't span inode numbers: */ - if ((iter->flags & BTREE_ITER_IS_EXTENTS) && + if ((iter->flags & BTREE_ITER_is_extents) && unlikely(iter->pos.offset == KEY_OFFSET_MAX)) { if (iter->pos.inode == KEY_INODE_MAX) return bkey_s_c_null; @@ -2467,7 +2592,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) search_key = btree_iter_search_key(iter); iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); @@ -2476,22 +2601,22 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) goto out_no_locked; } - if ((iter->flags & BTREE_ITER_CACHED) || - !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) { + if ((iter->flags & BTREE_ITER_cached) || + !(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) { k = bkey_s_c_null; - if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) && + if (unlikely((iter->flags & BTREE_ITER_with_updates) && trans->nr_updates)) { 
bch2_btree_trans_peek_slot_updates(trans, iter, &k); if (k.k) goto out; } - if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) && + if (unlikely(iter->flags & BTREE_ITER_with_journal) && (k = btree_trans_peek_slot_journal(trans, iter)).k) goto out; - if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && + if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && (k = btree_trans_peek_key_cache(iter, iter->pos)).k) { if (!bkey_err(k)) iter->k = *k.k; @@ -2506,12 +2631,12 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct bpos next; struct bpos end = iter->pos; - if (iter->flags & BTREE_ITER_IS_EXTENTS) + if (iter->flags & BTREE_ITER_is_extents) end.offset = U64_MAX; EBUG_ON(btree_iter_path(trans, iter)->level); - if (iter->flags & BTREE_ITER_INTENT) { + if (iter->flags & BTREE_ITER_intent) { struct btree_iter iter2; bch2_trans_copy_iter(&iter2, iter); @@ -2542,7 +2667,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) bkey_init(&iter->k); iter->k.p = iter->pos; - if (iter->flags & BTREE_ITER_IS_EXTENTS) { + if (iter->flags & BTREE_ITER_is_extents) { bch2_key_resize(&iter->k, min_t(u64, KEY_SIZE_MAX, (next.inode == iter->pos.inode @@ -2726,13 +2851,13 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) { if (iter->update_path) bch2_path_put_nokeep(trans, iter->update_path, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_intent); if (iter->path) bch2_path_put(trans, iter->path, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_intent); if (iter->key_cache_path) bch2_path_put(trans, iter->key_cache_path, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_intent); iter->path = 0; iter->update_path = 0; iter->key_cache_path = 0; @@ -2757,9 +2882,9 @@ void bch2_trans_node_iter_init(struct btree_trans *trans, unsigned depth, unsigned flags) { - flags |= BTREE_ITER_NOT_EXTENTS; - flags |= __BTREE_ITER_ALL_SNAPSHOTS; - flags |= BTREE_ITER_ALL_SNAPSHOTS; + 
flags |= BTREE_ITER_not_extents; + flags |= BTREE_ITER_snapshot_field; + flags |= BTREE_ITER_all_snapshots; bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth, __bch2_btree_iter_flags(trans, btree_id, flags), @@ -2782,9 +2907,9 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) dst->ip_allocated = _RET_IP_; #endif if (src->path) - __btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_INTENT); + __btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_intent); if (src->update_path) - __btree_path_get(trans->paths + src->update_path, src->flags & BTREE_ITER_INTENT); + __btree_path_get(trans->paths + src->update_path, src->flags & BTREE_ITER_intent); dst->key_cache_path = 0; } @@ -2953,7 +3078,8 @@ u32 bch2_trans_begin(struct btree_trans *trans) if (!trans->restarted && (need_resched() || time_after64(now, trans->last_begin_time + BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS))) { - drop_locks_do(trans, (cond_resched(), 0)); + bch2_trans_unlock(trans); + cond_resched(); now = local_clock(); } trans->last_begin_time = now; @@ -2963,11 +3089,15 @@ u32 bch2_trans_begin(struct btree_trans *trans) bch2_trans_srcu_unlock(trans); trans->last_begin_ip = _RET_IP_; + + trans_set_locked(trans); + if (trans->restarted) { bch2_btree_path_traverse_all(trans); trans->notrace_relock_fail = false; } + bch2_trans_verify_not_unlocked(trans); return trans->restart_count; } @@ -3001,7 +3131,6 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS); memset(trans, 0, sizeof(*trans)); - closure_init_stack(&trans->ref); seqmutex_lock(&c->btree_trans_lock); if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { @@ -3020,16 +3149,11 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) */ BUG_ON(pos_task && pid == pos_task->pid && - bch2_trans_locked(pos)); - - if (pos_task && pid < pos_task->pid) { - list_add_tail(&trans->list, &pos->list); - goto 
list_add_done; - } + pos->locked); } } - list_add_tail(&trans->list, &c->btree_trans_list); -list_add_done: + + list_add(&trans->list, &c->btree_trans_list); seqmutex_unlock(&c->btree_trans_lock); got_trans: trans->c = c; @@ -3037,7 +3161,7 @@ got_trans: trans->fn_idx = fn_idx; trans->locking_wait.task = current; trans->journal_replay_not_finished = - unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) && + unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)) && atomic_inc_not_zero(&c->journal_keys.ref); trans->nr_paths = ARRAY_SIZE(trans->_paths); trans->paths_allocated = trans->_paths_allocated; @@ -3069,6 +3193,9 @@ got_trans: trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); trans->srcu_lock_time = jiffies; trans->srcu_held = true; + trans_set_locked(trans); + + closure_init_stack_release(&trans->ref); return trans; } @@ -3105,7 +3232,6 @@ void bch2_trans_put(struct btree_trans *trans) trans_for_each_update(trans, i) __btree_path_put(trans->paths + i->path, true); trans->nr_updates = 0; - trans->locking_wait.task = NULL; check_btree_paths_leaked(trans); @@ -3126,6 +3252,13 @@ void bch2_trans_put(struct btree_trans *trans) if (unlikely(trans->journal_replay_not_finished)) bch2_journal_keys_put(c); + /* + * trans->ref protects trans->locking_wait.task, btree_paths array; used + * by cycle detector + */ + closure_return_sync(&trans->ref); + trans->locking_wait.task = NULL; + unsigned long *paths_allocated = trans->paths_allocated; trans->paths_allocated = NULL; trans->paths = NULL; @@ -3143,8 +3276,6 @@ void bch2_trans_put(struct btree_trans *trans) trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans); if (trans) { - closure_sync(&trans->ref); - seqmutex_lock(&c->btree_trans_lock); list_del(&trans->list); seqmutex_unlock(&c->btree_trans_lock); @@ -3166,13 +3297,11 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out, pid = owner ? 
owner->pid : 0; rcu_read_unlock(); - prt_tab(out); - prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b', + prt_printf(out, "\t%px %c l=%u %s:", b, b->cached ? 'c' : 'b', b->level, bch2_btree_id_str(b->btree_id)); bch2_bpos_to_text(out, btree_node_pos(b)); - prt_tab(out); - prt_printf(out, " locks %u:%u:%u held by pid %u", + prt_printf(out, "\t locks %u:%u:%u held by pid %u", c.n[0], c.n[1], c.n[2], pid); } @@ -3229,10 +3358,8 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) b = READ_ONCE(trans->locking); if (b) { - prt_printf(out, " blocked for %lluus on", - div_u64(local_clock() - trans->locking_wait.start_time, - 1000)); - prt_newline(out); + prt_printf(out, " blocked for %lluus on\n", + div_u64(local_clock() - trans->locking_wait.start_time, 1000)); prt_printf(out, " %c", lock_types[trans->locking_wait.lock_want]); bch2_btree_bkey_cached_common_to_text(out, b); prt_newline(out); @@ -3254,8 +3381,6 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) per_cpu_ptr(c->btree_trans_bufs, cpu)->trans; if (trans) { - closure_sync(&trans->ref); - seqmutex_lock(&c->btree_trans_lock); list_del(&trans->list); seqmutex_unlock(&c->btree_trans_lock); @@ -3275,8 +3400,10 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) bch2_time_stats_exit(&s->lock_hold_times); } - if (c->btree_trans_barrier_initialized) + if (c->btree_trans_barrier_initialized) { + synchronize_srcu_expedited(&c->btree_trans_barrier); cleanup_srcu_struct(&c->btree_trans_barrier); + } mempool_exit(&c->btree_trans_mem_pool); mempool_exit(&c->btree_trans_pool); } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 1c70836dd7..798eb1c479 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -216,9 +216,13 @@ int __must_check bch2_btree_path_traverse_one(struct btree_trans *, btree_path_idx_t, unsigned, unsigned long); +static inline void bch2_trans_verify_not_unlocked(struct btree_trans *); + static inline int __must_check 
bch2_btree_path_traverse(struct btree_trans *trans, btree_path_idx_t path, unsigned flags) { + bch2_trans_verify_not_unlocked(trans); + if (trans->paths[path].uptodate < BTREE_ITER_NEED_RELOCK) return 0; @@ -227,6 +231,9 @@ static inline int __must_check bch2_btree_path_traverse(struct btree_trans *tran btree_path_idx_t bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, unsigned, unsigned, unsigned, unsigned long); +btree_path_idx_t bch2_path_get_unlocked_mut(struct btree_trans *, enum btree_id, + unsigned, struct bpos); + struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); /* @@ -283,7 +290,6 @@ int bch2_trans_relock(struct btree_trans *); int bch2_trans_relock_notrace(struct btree_trans *); void bch2_trans_unlock(struct btree_trans *); void bch2_trans_unlock_long(struct btree_trans *); -bool bch2_trans_locked(struct btree_trans *); static inline int trans_was_restarted(struct btree_trans *trans, u32 restart_count) { @@ -309,6 +315,14 @@ static inline void bch2_trans_verify_not_in_restart(struct btree_trans *trans) bch2_trans_in_restart_error(trans); } +void __noreturn bch2_trans_unlocked_error(struct btree_trans *); + +static inline void bch2_trans_verify_not_unlocked(struct btree_trans *trans) +{ + if (!trans->locked) + bch2_trans_unlocked_error(trans); +} + __always_inline static int btree_trans_restart_nounlock(struct btree_trans *trans, int err) { @@ -386,10 +400,10 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos if (unlikely(iter->update_path)) bch2_path_put(trans, iter->update_path, - iter->flags & BTREE_ITER_INTENT); + iter->flags & BTREE_ITER_intent); iter->update_path = 0; - if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) + if (!(iter->flags & BTREE_ITER_all_snapshots)) new_pos.snapshot = iter->snapshot; __bch2_btree_iter_set_pos(iter, new_pos); @@ -397,7 +411,7 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos static inline void 
bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter) { - BUG_ON(!(iter->flags & BTREE_ITER_IS_EXTENTS)); + BUG_ON(!(iter->flags & BTREE_ITER_is_extents)); iter->pos = bkey_start_pos(&iter->k); } @@ -416,20 +430,20 @@ static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans, unsigned btree_id, unsigned flags) { - if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) && + if (!(flags & (BTREE_ITER_all_snapshots|BTREE_ITER_not_extents)) && btree_id_is_extents(btree_id)) - flags |= BTREE_ITER_IS_EXTENTS; + flags |= BTREE_ITER_is_extents; - if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) && + if (!(flags & BTREE_ITER_snapshot_field) && !btree_type_has_snapshot_field(btree_id)) - flags &= ~BTREE_ITER_ALL_SNAPSHOTS; + flags &= ~BTREE_ITER_all_snapshots; - if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) && + if (!(flags & BTREE_ITER_all_snapshots) && btree_type_has_snapshots(btree_id)) - flags |= BTREE_ITER_FILTER_SNAPSHOTS; + flags |= BTREE_ITER_filter_snapshots; if (trans->journal_replay_not_finished) - flags |= BTREE_ITER_WITH_JOURNAL; + flags |= BTREE_ITER_with_journal; return flags; } @@ -439,10 +453,10 @@ static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans, unsigned flags) { if (!btree_id_cached(trans->c, btree_id)) { - flags &= ~BTREE_ITER_CACHED; - flags &= ~BTREE_ITER_WITH_KEY_CACHE; - } else if (!(flags & BTREE_ITER_CACHED)) - flags |= BTREE_ITER_WITH_KEY_CACHE; + flags &= ~BTREE_ITER_cached; + flags &= ~BTREE_ITER_with_key_cache; + } else if (!(flags & BTREE_ITER_cached)) + flags |= BTREE_ITER_with_key_cache; return __bch2_btree_iter_flags(trans, btree_id, flags); } @@ -494,18 +508,7 @@ void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *, unsigned, unsigned, unsigned); void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *); -static inline void set_btree_iter_dontneed(struct btree_iter *iter) -{ - struct btree_trans *trans = iter->trans; - - if (!iter->path || trans->restarted) - 
return; - - struct btree_path *path = btree_iter_path(trans, iter); - path->preserve = false; - if (path->ref == 1) - path->should_be_locked = false; -} +void bch2_set_btree_iter_dontneed(struct btree_iter *); void *__bch2_trans_kmalloc(struct btree_trans *, size_t); @@ -619,14 +622,14 @@ u32 bch2_trans_begin(struct btree_trans *); static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter, unsigned flags) { - return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) : + return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(iter) : bch2_btree_iter_peek_prev(iter); } static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, unsigned flags) { - return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) : + return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(iter) : bch2_btree_iter_peek(iter); } @@ -634,7 +637,7 @@ static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter * struct bpos end, unsigned flags) { - if (!(flags & BTREE_ITER_SLOTS)) + if (!(flags & BTREE_ITER_slots)) return bch2_btree_iter_peek_upto(iter, end); if (bkey_gt(iter->pos, end)) @@ -699,16 +702,12 @@ transaction_restart: \ _ret2 ?: trans_was_restarted(_trans, _restart_count); \ }) -#define for_each_btree_key_upto(_trans, _iter, _btree_id, \ - _start, _end, _flags, _k, _do) \ +#define for_each_btree_key_upto_continue(_trans, _iter, \ + _end, _flags, _k, _do) \ ({ \ - struct btree_iter _iter; \ struct bkey_s_c _k; \ int _ret3 = 0; \ \ - bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - \ do { \ _ret3 = lockrestart_do(_trans, ({ \ (_k) = bch2_btree_iter_peek_upto_type(&(_iter), \ @@ -724,6 +723,21 @@ transaction_restart: \ _ret3; \ }) +#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _do) \ + for_each_btree_key_upto_continue(_trans, _iter, SPOS_MAX, _flags, _k, _do) + +#define for_each_btree_key_upto(_trans, _iter, _btree_id, \ + _start, _end, _flags, _k, 
_do) \ +({ \ + bch2_trans_begin(trans); \ + \ + struct btree_iter _iter; \ + bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + \ + for_each_btree_key_upto_continue(_trans, _iter, _end, _flags, _k, _do);\ +}) + #define for_each_btree_key(_trans, _iter, _btree_id, \ _start, _flags, _k, _do) \ for_each_btree_key_upto(_trans, _iter, _btree_id, _start, \ @@ -794,14 +808,6 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, return k; } -#define for_each_btree_key_old(_trans, _iter, _btree_id, \ - _start, _flags, _k, _ret) \ - for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\ - !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_advance(&(_iter))) - #define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \ _start, _end, _flags, _k, _ret) \ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ @@ -832,7 +838,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, #define drop_locks_do(_trans, _do) \ ({ \ bch2_trans_unlock(_trans); \ - _do ?: bch2_trans_relock(_trans); \ + (_do) ?: bch2_trans_relock(_trans); \ }) #define allocate_dropping_locks_errcode(_trans, _do) \ @@ -861,6 +867,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, }) void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *); +void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t); void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *); void bch2_dump_trans_updates(struct btree_trans *); void bch2_dump_trans_paths_updates(struct btree_trans *); diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index 1e8cf49a69..332dbf1649 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -623,3 +623,20 @@ void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree, 
keys->data[dst++] = *i; keys->nr = keys->gap = dst; } + +void bch2_journal_keys_dump(struct bch_fs *c) +{ + struct journal_keys *keys = &c->journal_keys; + struct printbuf buf = PRINTBUF; + + pr_info("%zu keys:", keys->nr); + + move_gap(keys, keys->nr); + + darray_for_each(*keys, i) { + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); + pr_err("%s l=%u %s", bch2_btree_id_str(i->btree_id), i->level, buf.buf); + } + printbuf_exit(&buf); +} diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h index af25046ebc..1ba4a79b0e 100644 --- a/fs/bcachefs/btree_journal_iter.h +++ b/fs/bcachefs/btree_journal_iter.h @@ -70,4 +70,6 @@ void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id, unsigned, unsigned, struct bpos, struct bpos); +void bch2_journal_keys_dump(struct bch_fs *); + #endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */ diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 7dafa1acce..2d3c0d45c3 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -32,10 +32,11 @@ static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, } static const struct rhashtable_params bch2_btree_key_cache_params = { - .head_offset = offsetof(struct bkey_cached, hash), - .key_offset = offsetof(struct bkey_cached, key), - .key_len = sizeof(struct bkey_cached_key), - .obj_cmpfn = bch2_btree_key_cache_cmp_fn, + .head_offset = offsetof(struct bkey_cached, hash), + .key_offset = offsetof(struct bkey_cached, key), + .key_len = sizeof(struct bkey_cached_key), + .obj_cmpfn = bch2_btree_key_cache_cmp_fn, + .automatic_shrinking = true, }; __flatten @@ -383,9 +384,9 @@ static int btree_key_cache_fill(struct btree_trans *trans, int ret; bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ck->key.pos, - BTREE_ITER_KEY_CACHE_FILL| - BTREE_ITER_CACHED_NOFILL); - iter.flags &= ~BTREE_ITER_WITH_JOURNAL; + BTREE_ITER_key_cache_fill| + BTREE_ITER_cached_nofill); + iter.flags &= 
~BTREE_ITER_with_journal; k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) @@ -424,16 +425,16 @@ static int btree_key_cache_fill(struct btree_trans *trans, goto err; } - if (!bch2_btree_node_relock(trans, ck_path, 0)) { + ret = bch2_trans_relock(trans); + if (ret) { kfree(new_k); - trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill); goto err; } - ret = bch2_trans_relock(trans); - if (ret) { + if (!bch2_btree_node_relock(trans, ck_path, 0)) { kfree(new_k); + trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill); goto err; } } @@ -456,7 +457,7 @@ static int btree_key_cache_fill(struct btree_trans *trans, bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b); /* We're not likely to need this iterator again: */ - set_btree_iter_dontneed(&iter); + bch2_set_btree_iter_dontneed(&iter); err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -515,23 +516,10 @@ retry: fill: path->uptodate = BTREE_ITER_UPTODATE; - if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) { - /* - * Using the underscore version because we haven't set - * path->uptodate yet: - */ - if (!path->locks_want && - !__bch2_btree_path_upgrade(trans, path, 1, NULL)) { - trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade); - goto err; - } - - ret = btree_key_cache_fill(trans, path, ck); - if (ret) - goto err; - - ret = bch2_btree_path_relock(trans, path, _THIS_IP_); + if (!ck->valid && !(flags & BTREE_ITER_cached_nofill)) { + ret = bch2_btree_path_upgrade(trans, path, 1) ?: + btree_key_cache_fill(trans, path, ck) ?: + bch2_btree_path_relock(trans, path, _THIS_IP_); if (ret) goto err; @@ -622,13 +610,13 @@ static int 
btree_key_cache_flush_pos(struct btree_trans *trans, int ret; bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos, - BTREE_ITER_SLOTS| - BTREE_ITER_INTENT| - BTREE_ITER_ALL_SNAPSHOTS); + BTREE_ITER_slots| + BTREE_ITER_intent| + BTREE_ITER_all_snapshots); bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos, - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); - b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE; + BTREE_ITER_cached| + BTREE_ITER_intent); + b_iter.flags &= ~BTREE_ITER_with_key_cache; ret = bch2_btree_iter_traverse(&c_iter); if (ret) @@ -661,14 +649,14 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, commit_flags |= BCH_WATERMARK_reclaim; if (ck->journal.seq != journal_last_seq(j) || - !test_bit(JOURNAL_SPACE_LOW, &c->journal.flags)) + !test_bit(JOURNAL_space_low, &c->journal.flags)) commit_flags |= BCH_TRANS_COMMIT_no_journal_res; ret = bch2_btree_iter_traverse(&b_iter) ?: bch2_trans_update(trans, &b_iter, ck->k, - BTREE_UPDATE_KEY_CACHE_RECLAIM| - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| - BTREE_TRIGGER_NORUN) ?: + BTREE_UPDATE_key_cache_reclaim| + BTREE_UPDATE_internal_snapshot_node| + BTREE_TRIGGER_norun) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_enospc| @@ -790,7 +778,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, * flushing. The flush callback will not proceed unless ->seq matches * the latest pin, so make sure it starts with a consistent value. 
*/ - if (!(insert_entry->flags & BTREE_UPDATE_NOJOURNAL) || + if (!(insert_entry->flags & BTREE_UPDATE_nojournal) || !journal_pin_active(&ck->journal)) { ck->seq = trans->journal_res.seq; } @@ -835,6 +823,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, int srcu_idx; mutex_lock(&bc->lock); + bc->requested_to_free += sc->nr_to_scan; + srcu_idx = srcu_read_lock(&c->btree_trans_barrier); flags = memalloc_nofs_save(); @@ -851,8 +841,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, six_lock_exit(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); atomic_long_dec(&bc->nr_freed); - freed++; bc->nr_freed_nonpcpu--; + bc->freed++; } list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) { @@ -864,8 +854,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, six_lock_exit(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); atomic_long_dec(&bc->nr_freed); - freed++; bc->nr_freed_pcpu--; + bc->freed++; } rcu_read_lock(); @@ -884,19 +874,23 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, ck = container_of(pos, struct bkey_cached, hash); if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - goto next; + bc->skipped_dirty++; } else if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) { clear_bit(BKEY_CACHED_ACCESSED, &ck->flags); - goto next; - } else if (bkey_cached_lock_for_evict(ck)) { + bc->skipped_accessed++; + } else if (!bkey_cached_lock_for_evict(ck)) { + bc->skipped_lock_fail++; + } else { bkey_cached_evict(bc, ck); bkey_cached_free(bc, ck); + bc->moved_to_freelist++; + freed++; } scanned++; if (scanned >= nr) break; -next: + pos = next; } @@ -921,6 +915,14 @@ static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink, long nr = atomic_long_read(&bc->nr_keys) - atomic_long_read(&bc->nr_dirty); + /* + * Avoid hammering our shrinker too much if it's nearly empty - the + * shrinker code doesn't take into account how big our cache is, if it's + * mostly empty but the 
system is under memory pressure it causes nasty + * lock contention: + */ + nr -= 128; + return max(0L, nr); } @@ -1029,22 +1031,56 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) if (!shrink) return -BCH_ERR_ENOMEM_fs_btree_cache_init; bc->shrink = shrink; - shrink->seeks = 0; shrink->count_objects = bch2_btree_key_cache_count; shrink->scan_objects = bch2_btree_key_cache_scan; + shrink->batch = 1 << 14; + shrink->seeks = 0; shrink->private_data = c; shrinker_register(shrink); return 0; } -void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) +void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *bc) { - prt_printf(out, "nr_freed:\t%lu", atomic_long_read(&c->nr_freed)); - prt_newline(out); - prt_printf(out, "nr_keys:\t%lu", atomic_long_read(&c->nr_keys)); - prt_newline(out); - prt_printf(out, "nr_dirty:\t%lu", atomic_long_read(&c->nr_dirty)); - prt_newline(out); + struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); + + printbuf_tabstop_push(out, 24); + printbuf_tabstop_push(out, 12); + + unsigned flags = memalloc_nofs_save(); + mutex_lock(&bc->lock); + prt_printf(out, "keys:\t%lu\r\n", atomic_long_read(&bc->nr_keys)); + prt_printf(out, "dirty:\t%lu\r\n", atomic_long_read(&bc->nr_dirty)); + prt_printf(out, "freelist:\t%lu\r\n", atomic_long_read(&bc->nr_freed)); + prt_printf(out, "nonpcpu freelist:\t%zu\r\n", bc->nr_freed_nonpcpu); + prt_printf(out, "pcpu freelist:\t%zu\r\n", bc->nr_freed_pcpu); + + prt_printf(out, "\nshrinker:\n"); + prt_printf(out, "requested_to_free:\t%lu\r\n", bc->requested_to_free); + prt_printf(out, "freed:\t%lu\r\n", bc->freed); + prt_printf(out, "moved_to_freelist:\t%lu\r\n", bc->moved_to_freelist); + prt_printf(out, "skipped_dirty:\t%lu\r\n", bc->skipped_dirty); + prt_printf(out, "skipped_accessed:\t%lu\r\n", bc->skipped_accessed); + prt_printf(out, "skipped_lock_fail:\t%lu\r\n", bc->skipped_lock_fail); + + prt_printf(out, "srcu seq:\t%lu\r\n", 
get_state_synchronize_srcu(&c->btree_trans_barrier)); + + struct bkey_cached *ck; + unsigned iter = 0; + list_for_each_entry(ck, &bc->freed_nonpcpu, list) { + prt_printf(out, "freed_nonpcpu:\t%lu\r\n", ck->btree_trans_barrier_seq); + if (++iter > 10) + break; + } + + iter = 0; + list_for_each_entry(ck, &bc->freed_pcpu, list) { + prt_printf(out, "freed_pcpu:\t%lu\r\n", ck->btree_trans_barrier_seq); + if (++iter > 10) + break; + } + mutex_unlock(&bc->lock); + memalloc_flags_restore(flags); } void bch2_btree_key_cache_exit(void) diff --git a/fs/bcachefs/btree_key_cache_types.h b/fs/bcachefs/btree_key_cache_types.h index 290e4e57df..237e8bb3ac 100644 --- a/fs/bcachefs/btree_key_cache_types.h +++ b/fs/bcachefs/btree_key_cache_types.h @@ -24,6 +24,14 @@ struct btree_key_cache { atomic_long_t nr_freed; atomic_long_t nr_keys; atomic_long_t nr_dirty; + + /* shrinker stats */ + unsigned long requested_to_free; + unsigned long freed; + unsigned long moved_to_freelist; + unsigned long skipped_dirty; + unsigned long skipped_accessed; + unsigned long skipped_lock_fail; }; struct bkey_cached_key { diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index f2caf49195..c51826fd55 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -83,8 +83,7 @@ static noinline void print_cycle(struct printbuf *out, struct lock_graph *g) { struct trans_waiting_for_lock *i; - prt_printf(out, "Found lock cycle (%u entries):", g->nr); - prt_newline(out); + prt_printf(out, "Found lock cycle (%u entries):\n", g->nr); for (i = g->g; i < g->g + g->nr; i++) { struct task_struct *task = READ_ONCE(i->trans->locking_wait.task); @@ -216,6 +215,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) if (unlikely(!best)) { struct printbuf buf = PRINTBUF; + buf.atomic++; prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks")); @@ -224,15 +224,14 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) 
bch2_btree_trans_to_text(&buf, trans); - prt_printf(&buf, "backtrace:"); - prt_newline(&buf); + prt_printf(&buf, "backtrace:\n"); printbuf_indent_add(&buf, 2); bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT); printbuf_indent_sub(&buf, 2); prt_newline(&buf); } - bch2_print_string_as_lines(KERN_ERR, buf.buf); + bch2_print_string_as_lines_nonblocking(KERN_ERR, buf.buf); printbuf_exit(&buf); BUG(); } @@ -492,8 +491,6 @@ static inline bool btree_path_get_locks(struct btree_trans *trans, if (path->uptodate == BTREE_ITER_NEED_RELOCK) path->uptodate = BTREE_ITER_UPTODATE; - bch2_trans_verify_locks(trans); - return path->uptodate < BTREE_ITER_NEED_RELOCK; } @@ -609,7 +606,9 @@ bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_pa { struct get_locks_fail f; - return btree_path_get_locks(trans, path, false, &f); + bool ret = btree_path_get_locks(trans, path, false, &f); + bch2_trans_verify_locks(trans); + return ret; } int __bch2_btree_path_relock(struct btree_trans *trans, @@ -632,7 +631,9 @@ bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans, path->locks_want = new_locks_want; - return btree_path_get_locks(trans, path, true, f); + bool ret = btree_path_get_locks(trans, path, true, f); + bch2_trans_verify_locks(trans); + return ret; } bool __bch2_btree_path_upgrade(struct btree_trans *trans, @@ -640,8 +641,9 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans, unsigned new_locks_want, struct get_locks_fail *f) { - if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f)) - return true; + bool ret = bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f); + if (ret) + goto out; /* * XXX: this is ugly - we'd prefer to not be mucking with other @@ -675,8 +677,9 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans, btree_path_get_locks(trans, linked, true, NULL); } } - - return false; +out: + bch2_trans_verify_locks(trans); + return ret; } void 
__bch2_btree_path_downgrade(struct btree_trans *trans, @@ -725,82 +728,98 @@ void bch2_trans_downgrade(struct btree_trans *trans) bch2_btree_path_downgrade(trans, path); } -int bch2_trans_relock(struct btree_trans *trans) +static inline void __bch2_trans_unlock(struct btree_trans *trans) { struct btree_path *path; unsigned i; - if (unlikely(trans->restarted)) - return -((int) trans->restarted); + trans_for_each_path(trans, path, i) + __bch2_btree_path_unlock(trans, path); +} - trans_for_each_path(trans, path, i) { - struct get_locks_fail f; +static noinline __cold int bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path, + struct get_locks_fail *f, bool trace) +{ + if (!trace) + goto out; - if (path->should_be_locked && - !btree_path_get_locks(trans, path, false, &f)) { - if (trace_trans_restart_relock_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_bpos_to_text(&buf, path->pos); - prt_printf(&buf, " l=%u seq=%u node seq=", - f.l, path->l[f.l].lock_seq); - if (IS_ERR_OR_NULL(f.b)) { - prt_str(&buf, bch2_err_str(PTR_ERR(f.b))); - } else { - prt_printf(&buf, "%u", f.b->c.lock.seq); - - struct six_lock_count c = - bch2_btree_node_lock_counts(trans, NULL, &f.b->c, f.l); - prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); - - c = six_lock_counts(&f.b->c.lock); - prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); - } + if (trace_trans_restart_relock_enabled()) { + struct printbuf buf = PRINTBUF; - trace_trans_restart_relock(trans, _RET_IP_, buf.buf); - printbuf_exit(&buf); - } + bch2_bpos_to_text(&buf, path->pos); + prt_printf(&buf, " l=%u seq=%u node seq=", f->l, path->l[f->l].lock_seq); + if (IS_ERR_OR_NULL(f->b)) { + prt_str(&buf, bch2_err_str(PTR_ERR(f->b))); + } else { + prt_printf(&buf, "%u", f->b->c.lock.seq); + + struct six_lock_count c = + bch2_btree_node_lock_counts(trans, NULL, &f->b->c, f->l); + prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); - count_event(trans->c, 
trans_restart_relock); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); + c = six_lock_counts(&f->b->c.lock); + prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); } + + trace_trans_restart_relock(trans, _RET_IP_, buf.buf); + printbuf_exit(&buf); } - return 0; + count_event(trans->c, trans_restart_relock); +out: + __bch2_trans_unlock(trans); + bch2_trans_verify_locks(trans); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); } -int bch2_trans_relock_notrace(struct btree_trans *trans) +static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace) { - struct btree_path *path; - unsigned i; + bch2_trans_verify_locks(trans); if (unlikely(trans->restarted)) return -((int) trans->restarted); + if (unlikely(trans->locked)) + goto out; + + struct btree_path *path; + unsigned i; + + trans_for_each_path(trans, path, i) { + struct get_locks_fail f; - trans_for_each_path(trans, path, i) if (path->should_be_locked && - !bch2_btree_path_relock_norestart(trans, path)) { - return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); - } + !btree_path_get_locks(trans, path, false, &f)) + return bch2_trans_relock_fail(trans, path, &f, trace); + } + + trans_set_locked(trans); +out: + bch2_trans_verify_locks(trans); return 0; } +int bch2_trans_relock(struct btree_trans *trans) +{ + return __bch2_trans_relock(trans, true); +} + +int bch2_trans_relock_notrace(struct btree_trans *trans) +{ + return __bch2_trans_relock(trans, false); +} + void bch2_trans_unlock_noassert(struct btree_trans *trans) { - struct btree_path *path; - unsigned i; + __bch2_trans_unlock(trans); - trans_for_each_path(trans, path, i) - __bch2_btree_path_unlock(trans, path); + trans_set_unlocked(trans); } void bch2_trans_unlock(struct btree_trans *trans) { - struct btree_path *path; - unsigned i; + __bch2_trans_unlock(trans); - trans_for_each_path(trans, path, i) - __bch2_btree_path_unlock(trans, path); + trans_set_unlocked(trans); 
} void bch2_trans_unlock_long(struct btree_trans *trans) @@ -809,17 +828,6 @@ void bch2_trans_unlock_long(struct btree_trans *trans) bch2_trans_srcu_unlock(trans); } -bool bch2_trans_locked(struct btree_trans *trans) -{ - struct btree_path *path; - unsigned i; - - trans_for_each_path(trans, path, i) - if (path->nodes_locked) - return true; - return false; -} - int __bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex *lock) { @@ -836,15 +844,19 @@ int __bch2_trans_mutex_lock(struct btree_trans *trans, void bch2_btree_path_verify_locks(struct btree_path *path) { - unsigned l; + /* + * A path may be uptodate and yet have nothing locked if and only if + * there is no node at path->level, which generally means we were + * iterating over all nodes and got to the end of the btree + */ + BUG_ON(path->uptodate == BTREE_ITER_UPTODATE && + btree_path_node(path, path->level) && + !path->nodes_locked); - if (!path->nodes_locked) { - BUG_ON(path->uptodate == BTREE_ITER_UPTODATE && - btree_path_node(path, path->level)); + if (!path->nodes_locked) return; - } - for (l = 0; l < BTREE_MAX_DEPTH; l++) { + for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) { int want = btree_lock_want(path, l); int have = btree_node_locked_type(path, l); @@ -857,8 +869,24 @@ void bch2_btree_path_verify_locks(struct btree_path *path) } } +static bool bch2_trans_locked(struct btree_trans *trans) +{ + struct btree_path *path; + unsigned i; + + trans_for_each_path(trans, path, i) + if (path->nodes_locked) + return true; + return false; +} + void bch2_trans_verify_locks(struct btree_trans *trans) { + if (!trans->locked) { + BUG_ON(bch2_trans_locked(trans)); + return; + } + struct btree_path *path; unsigned i; diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 4bd72c855d..75a6274c7d 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -193,6 +193,28 @@ int bch2_six_check_for_deadlock(struct six_lock *lock, void *p); /* lock: */ +static inline void 
trans_set_locked(struct btree_trans *trans) +{ + if (!trans->locked) { + trans->locked = true; + trans->last_unlock_ip = 0; + + trans->pf_memalloc_nofs = (current->flags & PF_MEMALLOC_NOFS) != 0; + current->flags |= PF_MEMALLOC_NOFS; + } +} + +static inline void trans_set_unlocked(struct btree_trans *trans) +{ + if (trans->locked) { + trans->locked = false; + trans->last_unlock_ip = _RET_IP_; + + if (!trans->pf_memalloc_nofs) + current->flags &= ~PF_MEMALLOC_NOFS; + } +} + static inline int __btree_node_lock_nopath(struct btree_trans *trans, struct btree_bkey_cached_common *b, enum six_lock_type type, @@ -364,14 +386,14 @@ static inline int bch2_btree_path_upgrade(struct btree_trans *trans, struct btree_path *path, unsigned new_locks_want) { - struct get_locks_fail f; + struct get_locks_fail f = {}; unsigned old_locks_want = path->locks_want; new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); if (path->locks_want < new_locks_want ? __bch2_btree_path_upgrade(trans, path, new_locks_want, &f) - : path->uptodate == BTREE_ITER_UPTODATE) + : path->nodes_locked) return 0; trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path, diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 45cb8149d3..2cb0442f6c 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -72,10 +72,11 @@ static bool found_btree_node_is_readable(struct btree_trans *trans, struct btree *b = bch2_btree_node_get_noiter(trans, &k.k, f->btree_id, f->level, false); bool ret = !IS_ERR_OR_NULL(b); - if (ret) { - f->sectors_written = b->written; - six_unlock_read(&b->c.lock); - } + if (!ret) + return ret; + + f->sectors_written = b->written; + six_unlock_read(&b->c.lock); /* * We might update this node's range; if that happens, we need the node diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index bbec91e8e6..74e1ff2256 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ 
-1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "alloc_foreground.h" #include "btree_gc.h" #include "btree_io.h" #include "btree_iter.h" @@ -19,6 +20,26 @@ #include <linux/prefetch.h> +static const char * const trans_commit_flags_strs[] = { +#define x(n, ...) #n, + BCH_TRANS_COMMIT_FLAGS() +#undef x + NULL +}; + +void bch2_trans_commit_flags_to_text(struct printbuf *out, enum bch_trans_commit_flags flags) +{ + enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; + + prt_printf(out, "watermark=%s", bch2_watermarks[watermark]); + + flags >>= BCH_WATERMARK_BITS; + if (flags) { + prt_char(out, ' '); + bch2_prt_bitflags(out, trans_commit_flags_strs, flags); + } +} + static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i) { #ifdef CONFIG_BCACHEFS_DEBUG @@ -315,8 +336,8 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, BUG_ON(i->btree_id != path->btree_id); EBUG_ON(!i->level && btree_type_has_snapshots(i->btree_id) && - !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && - test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) && + !(i->flags & BTREE_UPDATE_internal_snapshot_node) && + test_bit(JOURNAL_replay_done, &trans->c->journal.flags) && i->k->k.p.snapshot && bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot) > 0); } @@ -443,13 +464,13 @@ static int run_one_mem_trigger(struct btree_trans *trans, verify_update_old_key(trans, i); - if (unlikely(flags & BTREE_TRIGGER_NORUN)) + if (unlikely(flags & BTREE_TRIGGER_norun)) return 0; if (old_ops->trigger == new_ops->trigger) { ret = bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(new), - BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); + BTREE_TRIGGER_insert|BTREE_TRIGGER_overwrite|flags); } else { ret = bch2_key_trigger_new(trans, i->btree_id, i->level, bkey_i_to_s(new), flags) ?: @@ -472,11 +493,11 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ struct 
bkey_s_c old = { &old_k, i->old_v }; const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); - unsigned flags = i->flags|BTREE_TRIGGER_TRANSACTIONAL; + unsigned flags = i->flags|BTREE_TRIGGER_transactional; verify_update_old_key(trans, i); - if ((i->flags & BTREE_TRIGGER_NORUN) || + if ((i->flags & BTREE_TRIGGER_norun) || !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) return 0; @@ -486,8 +507,8 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ i->overwrite_trigger_run = true; i->insert_trigger_run = true; return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(i->k), - BTREE_TRIGGER_INSERT| - BTREE_TRIGGER_OVERWRITE|flags) ?: 1; + BTREE_TRIGGER_insert| + BTREE_TRIGGER_overwrite|flags) ?: 1; } else if (overwrite && !i->overwrite_trigger_run) { i->overwrite_trigger_run = true; return bch2_key_trigger_old(trans, i->btree_id, i->level, old, flags) ?: 1; @@ -572,7 +593,7 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) #ifdef CONFIG_BCACHEFS_DEBUG trans_for_each_update(trans, i) - BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && + BUG_ON(!(i->flags & BTREE_TRIGGER_norun) && (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && (!i->insert_trigger_run || !i->overwrite_trigger_run)); #endif @@ -590,7 +611,7 @@ static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) if (btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)) && gc_visited(trans->c, gc_pos_btree_node(insert_l(trans, i)->b))) { - int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); + int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_gc); if (ret) return ret; } @@ -609,6 +630,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, unsigned u64s = 0; int ret; + bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_in_restart(trans); + if 
(race_fault()) { trace_and_count(c, trans_restart_fault_inject, trans, trace_ip); return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject); @@ -686,7 +710,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, trans_for_each_update(trans, i) if (BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS & (1U << i->bkey_type)) { - ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_ATOMIC|i->flags); + ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_atomic|i->flags); if (ret) goto fatal_err; } @@ -705,7 +729,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, if (i->key_cache_already_flushed) continue; - if (i->flags & BTREE_UPDATE_NOJOURNAL) + if (i->flags & BTREE_UPDATE_nojournal) continue; verify_update_old_key(trans, i); @@ -766,16 +790,15 @@ static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans } static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct btree_insert_entry *i, struct printbuf *err) { struct bch_fs *c = trans->c; printbuf_reset(err); - prt_printf(err, "invalid bkey on insert from %s -> %ps", + prt_printf(err, "invalid bkey on insert from %s -> %ps\n", trans->fn, (void *) i->ip_allocated); - prt_newline(err); printbuf_indent_add(err, 2); bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k)); @@ -796,8 +819,7 @@ static noinline int bch2_trans_commit_journal_entry_invalid(struct btree_trans * struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; - prt_printf(&buf, "invalid bkey on insert from %s", trans->fn); - prt_newline(&buf); + prt_printf(&buf, "invalid bkey on insert from %s\n", trans->fn); printbuf_indent_add(&buf, 2); bch2_journal_entry_to_text(&buf, c, i); @@ -988,6 +1010,9 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) struct bch_fs *c = trans->c; int ret = 0; + bch2_trans_verify_not_unlocked(trans); + 
bch2_trans_verify_not_in_restart(trans); + if (!trans->nr_updates && !trans->journal_entries_u64s) goto out_reset; @@ -1000,10 +1025,10 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) trans_for_each_update(trans, i) { struct printbuf buf = PRINTBUF; - enum bkey_invalid_flags invalid_flags = 0; + enum bch_validate_flags invalid_flags = 0; if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) - invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT; + invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit; if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, invalid_flags, &buf))) @@ -1018,10 +1043,10 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) for (struct jset_entry *i = trans->journal_entries; i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); i = vstruct_next(i)) { - enum bkey_invalid_flags invalid_flags = 0; + enum bch_validate_flags invalid_flags = 0; if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) - invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT; + invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit; if (unlikely(bch2_journal_entry_validate(c, NULL, i, bcachefs_metadata_version_current, @@ -1065,7 +1090,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) if (i->key_cache_already_flushed) continue; - if (i->flags & BTREE_UPDATE_NOJOURNAL) + if (i->flags & BTREE_UPDATE_nojournal) continue; /* we're going to journal the key being updated: */ @@ -1086,6 +1111,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) } retry: errored_at = NULL; + bch2_trans_verify_not_unlocked(trans); bch2_trans_verify_not_in_restart(trans); if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) memset(&trans->journal_res, 0, sizeof(trans->journal_res)); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index c69b233c41..48cb1a7d31 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -163,9 +163,21 @@ struct 
btree_cache { /* Number of elements in live + freeable lists */ unsigned used; unsigned reserve; + unsigned freed; + unsigned not_freed_lock_intent; + unsigned not_freed_lock_write; + unsigned not_freed_dirty; + unsigned not_freed_read_in_flight; + unsigned not_freed_write_in_flight; + unsigned not_freed_noevict; + unsigned not_freed_write_blocked; + unsigned not_freed_will_make_reachable; + unsigned not_freed_access_bit; atomic_t dirty; struct shrinker *shrink; + unsigned used_by_btree[BTREE_ID_NR]; + /* * If we need to allocate memory for a new btree node and that * allocation fails, we can cannibalize another node in the btree cache @@ -187,36 +199,89 @@ struct btree_node_iter { } data[MAX_BSETS]; }; +#define BTREE_ITER_FLAGS() \ + x(slots) \ + x(intent) \ + x(prefetch) \ + x(is_extents) \ + x(not_extents) \ + x(cached) \ + x(with_key_cache) \ + x(with_updates) \ + x(with_journal) \ + x(snapshot_field) \ + x(all_snapshots) \ + x(filter_snapshots) \ + x(nopreserve) \ + x(cached_nofill) \ + x(key_cache_fill) \ + +#define STR_HASH_FLAGS() \ + x(must_create) \ + x(must_replace) + +#define BTREE_UPDATE_FLAGS() \ + x(internal_snapshot_node) \ + x(nojournal) \ + x(key_cache_reclaim) + + /* - * Iterate over all possible positions, synthesizing deleted keys for holes: - */ -static const __maybe_unused u16 BTREE_ITER_SLOTS = 1 << 0; -/* - * Indicates that intent locks should be taken on leaf nodes, because we expect - * to be doing updates: - */ -static const __maybe_unused u16 BTREE_ITER_INTENT = 1 << 1; -/* - * Causes the btree iterator code to prefetch additional btree nodes from disk: - */ -static const __maybe_unused u16 BTREE_ITER_PREFETCH = 1 << 2; -/* - * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for - * @pos or the first key strictly greater than @pos + * BTREE_TRIGGER_norun - don't run triggers at all + * + * BTREE_TRIGGER_transactional - we're running transactional triggers as part of + * a transaction commit: triggers may generate 
new updates + * + * BTREE_TRIGGER_atomic - we're running atomic triggers during a transaction + * commit: we have our journal reservation, we're holding btree node write + * locks, and we know the transaction is going to commit (returning an error + * here is a fatal error, causing us to go emergency read-only) + * + * BTREE_TRIGGER_gc - we're in gc/fsck: running triggers to recalculate e.g. disk usage + * + * BTREE_TRIGGER_insert - @new is entering the btree + * BTREE_TRIGGER_overwrite - @old is leaving the btree + * + * BTREE_TRIGGER_bucket_invalidate - signal from bucket invalidate path to alloc + * trigger */ -static const __maybe_unused u16 BTREE_ITER_IS_EXTENTS = 1 << 3; -static const __maybe_unused u16 BTREE_ITER_NOT_EXTENTS = 1 << 4; -static const __maybe_unused u16 BTREE_ITER_CACHED = 1 << 5; -static const __maybe_unused u16 BTREE_ITER_WITH_KEY_CACHE = 1 << 6; -static const __maybe_unused u16 BTREE_ITER_WITH_UPDATES = 1 << 7; -static const __maybe_unused u16 BTREE_ITER_WITH_JOURNAL = 1 << 8; -static const __maybe_unused u16 __BTREE_ITER_ALL_SNAPSHOTS = 1 << 9; -static const __maybe_unused u16 BTREE_ITER_ALL_SNAPSHOTS = 1 << 10; -static const __maybe_unused u16 BTREE_ITER_FILTER_SNAPSHOTS = 1 << 11; -static const __maybe_unused u16 BTREE_ITER_NOPRESERVE = 1 << 12; -static const __maybe_unused u16 BTREE_ITER_CACHED_NOFILL = 1 << 13; -static const __maybe_unused u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 14; -#define __BTREE_ITER_FLAGS_END 15 +#define BTREE_TRIGGER_FLAGS() \ + x(norun) \ + x(transactional) \ + x(atomic) \ + x(check_repair) \ + x(gc) \ + x(insert) \ + x(overwrite) \ + x(is_root) \ + x(bucket_invalidate) + +enum { +#define x(n) BTREE_ITER_FLAG_BIT_##n, + BTREE_ITER_FLAGS() + STR_HASH_FLAGS() + BTREE_UPDATE_FLAGS() + BTREE_TRIGGER_FLAGS() +#undef x +}; + +/* iter flags must fit in a u16: */ +//BUILD_BUG_ON(BTREE_ITER_FLAG_BIT_key_cache_fill > 15); + +enum btree_iter_update_trigger_flags { +#define x(n) BTREE_ITER_##n = 1U << BTREE_ITER_FLAG_BIT_##n, + 
BTREE_ITER_FLAGS() +#undef x +#define x(n) STR_HASH_##n = 1U << BTREE_ITER_FLAG_BIT_##n, + STR_HASH_FLAGS() +#undef x +#define x(n) BTREE_UPDATE_##n = 1U << BTREE_ITER_FLAG_BIT_##n, + BTREE_UPDATE_FLAGS() +#undef x +#define x(n) BTREE_TRIGGER_##n = 1U << BTREE_ITER_FLAG_BIT_##n, + BTREE_TRIGGER_FLAGS() +#undef x +}; enum btree_path_uptodate { BTREE_ITER_UPTODATE = 0, @@ -307,7 +372,7 @@ struct btree_iter { */ struct bkey k; - /* BTREE_ITER_WITH_JOURNAL: */ + /* BTREE_ITER_with_journal: */ size_t journal_idx; #ifdef TRACK_PATH_ALLOCATED unsigned long ip_allocated; @@ -418,6 +483,9 @@ struct btree_trans { u8 lock_must_abort; bool lock_may_not_fail:1; bool srcu_held:1; + bool locked:1; + bool pf_memalloc_nofs:1; + bool write_locked:1; bool used_mempool:1; bool in_traverse_all:1; bool paths_sorted:1; @@ -425,13 +493,13 @@ struct btree_trans { bool journal_transaction_names:1; bool journal_replay_not_finished:1; bool notrace_relock_fail:1; - bool write_locked:1; enum bch_errcode restarted:16; u32 restart_count; u64 last_begin_time; unsigned long last_begin_ip; unsigned long last_restarted_ip; + unsigned long last_unlock_ip; unsigned long srcu_lock_time; const char *fn; @@ -694,13 +762,13 @@ static inline bool btree_node_type_needs_gc(enum btree_node_type type) static inline bool btree_node_type_is_extents(enum btree_node_type type) { - const unsigned mask = 0 + const u64 mask = 0 #define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << (nr + 1)) BCH_BTREE_IDS() #undef x ; - return (1U << type) & mask; + return BIT_ULL(type) & mask; } static inline bool btree_id_is_extents(enum btree_id btree) @@ -710,35 +778,35 @@ static inline bool btree_id_is_extents(enum btree_id btree) static inline bool btree_type_has_snapshots(enum btree_id id) { - const unsigned mask = 0 + const u64 mask = 0 #define x(name, nr, flags, ...) 
|((!!((flags) & BTREE_ID_SNAPSHOTS)) << nr) BCH_BTREE_IDS() #undef x ; - return (1U << id) & mask; + return BIT_ULL(id) & mask; } static inline bool btree_type_has_snapshot_field(enum btree_id id) { - const unsigned mask = 0 + const u64 mask = 0 #define x(name, nr, flags, ...) |((!!((flags) & (BTREE_ID_SNAPSHOT_FIELD|BTREE_ID_SNAPSHOTS))) << nr) BCH_BTREE_IDS() #undef x ; - return (1U << id) & mask; + return BIT_ULL(id) & mask; } static inline bool btree_type_has_ptrs(enum btree_id id) { - const unsigned mask = 0 + const u64 mask = 0 #define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_DATA)) << nr) BCH_BTREE_IDS() #undef x ; - return (1U << id) & mask; + return BIT_ULL(id) & mask; } struct btree_root { diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 8e47e260eb..f3c645a43d 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -25,14 +25,14 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, static int __must_check bch2_trans_update_by_path(struct btree_trans *, btree_path_idx_t, - struct bkey_i *, enum btree_update_flags, + struct bkey_i *, enum btree_iter_update_trigger_flags, unsigned long ip); static noinline int extent_front_merge(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, struct bkey_i **insert, - enum btree_update_flags flags) + enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; struct bkey_i *update; @@ -104,8 +104,8 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans, pos.snapshot++; for_each_btree_key_norestart(trans, iter, btree_id, pos, - BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_NOPRESERVE, k, ret) { + BTREE_ITER_all_snapshots| + BTREE_ITER_nopreserve, k, ret) { if (!bkey_eq(k.k->p, pos)) break; @@ -138,8 +138,8 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, darray_init(&s); bch2_trans_iter_init(trans, &old_iter, id, old_pos, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_ALL_SNAPSHOTS); + 
BTREE_ITER_not_extents| + BTREE_ITER_all_snapshots); while ((old_k = bch2_btree_iter_prev(&old_iter)).k && !(ret = bkey_err(old_k)) && bkey_eq(old_pos, old_k.k->p)) { @@ -151,8 +151,8 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, continue; new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_INTENT); + BTREE_ITER_not_extents| + BTREE_ITER_intent); ret = bkey_err(new_k); if (ret) break; @@ -168,7 +168,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, update->k.type = KEY_TYPE_whiteout; ret = bch2_trans_update(trans, &new_iter, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); } bch2_trans_iter_exit(trans, &new_iter); @@ -185,7 +185,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, int bch2_trans_update_extent_overwrite(struct btree_trans *trans, struct btree_iter *iter, - enum btree_update_flags flags, + enum btree_iter_update_trigger_flags flags, struct bkey_s_c old, struct bkey_s_c new) { @@ -218,7 +218,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans, ret = bch2_insert_snapshot_whiteouts(trans, btree_id, old.k->p, update->k.p) ?: bch2_btree_insert_nonextent(trans, btree_id, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); + BTREE_UPDATE_internal_snapshot_node|flags); if (ret) return ret; } @@ -235,7 +235,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans, ret = bch2_insert_snapshot_whiteouts(trans, btree_id, old.k->p, update->k.p) ?: bch2_btree_insert_nonextent(trans, btree_id, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); + BTREE_UPDATE_internal_snapshot_node|flags); if (ret) return ret; } @@ -260,7 +260,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans, } ret = bch2_btree_insert_nonextent(trans, btree_id, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); + BTREE_UPDATE_internal_snapshot_node|flags); if (ret) return ret; } @@ -273,7 +273,7 @@ 
int bch2_trans_update_extent_overwrite(struct btree_trans *trans, bch2_cut_front(new.k->p, update); ret = bch2_trans_update_by_path(trans, iter->path, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + BTREE_UPDATE_internal_snapshot_node| flags, _RET_IP_); if (ret) return ret; @@ -285,7 +285,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans, static int bch2_trans_update_extent(struct btree_trans *trans, struct btree_iter *orig_iter, struct bkey_i *insert, - enum btree_update_flags flags) + enum btree_iter_update_trigger_flags flags) { struct btree_iter iter; struct bkey_s_c k; @@ -293,9 +293,9 @@ static int bch2_trans_update_extent(struct btree_trans *trans, int ret = 0; bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k), - BTREE_ITER_INTENT| - BTREE_ITER_WITH_UPDATES| - BTREE_ITER_NOT_EXTENTS); + BTREE_ITER_intent| + BTREE_ITER_with_updates| + BTREE_ITER_not_extents); k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); if ((ret = bkey_err(k))) goto err; @@ -346,7 +346,7 @@ err: static noinline int flush_new_cached_update(struct btree_trans *trans, struct btree_insert_entry *i, - enum btree_update_flags flags, + enum btree_iter_update_trigger_flags flags, unsigned long ip) { struct bkey k; @@ -354,7 +354,7 @@ static noinline int flush_new_cached_update(struct btree_trans *trans, btree_path_idx_t path_idx = bch2_path_get(trans, i->btree_id, i->old_k.p, 1, 0, - BTREE_ITER_INTENT, _THIS_IP_); + BTREE_ITER_intent, _THIS_IP_); ret = bch2_btree_path_traverse(trans, path_idx, 0); if (ret) goto out; @@ -372,7 +372,7 @@ static noinline int flush_new_cached_update(struct btree_trans *trans, goto out; i->key_cache_already_flushed = true; - i->flags |= BTREE_TRIGGER_NORUN; + i->flags |= BTREE_TRIGGER_norun; btree_path_set_should_be_locked(btree_path); ret = bch2_trans_update_by_path(trans, path_idx, i->k, flags, ip); @@ -383,7 +383,7 @@ out: static int __must_check bch2_trans_update_by_path(struct btree_trans *trans, 
btree_path_idx_t path_idx, - struct bkey_i *k, enum btree_update_flags flags, + struct bkey_i *k, enum btree_iter_update_trigger_flags flags, unsigned long ip) { struct bch_fs *c = trans->c; @@ -479,15 +479,15 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, if (!iter->key_cache_path) iter->key_cache_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0, - BTREE_ITER_INTENT| - BTREE_ITER_CACHED, _THIS_IP_); + BTREE_ITER_intent| + BTREE_ITER_cached, _THIS_IP_); iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos, - iter->flags & BTREE_ITER_INTENT, + iter->flags & BTREE_ITER_intent, _THIS_IP_); - ret = bch2_btree_path_traverse(trans, iter->key_cache_path, BTREE_ITER_CACHED); + ret = bch2_btree_path_traverse(trans, iter->key_cache_path, BTREE_ITER_cached); if (unlikely(ret)) return ret; @@ -505,17 +505,17 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, } int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *k, enum btree_update_flags flags) + struct bkey_i *k, enum btree_iter_update_trigger_flags flags) { btree_path_idx_t path_idx = iter->update_path ?: iter->path; int ret; - if (iter->flags & BTREE_ITER_IS_EXTENTS) + if (iter->flags & BTREE_ITER_is_extents) return bch2_trans_update_extent(trans, iter, k, flags); if (bkey_deleted(&k->k) && - !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && - (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { + !(flags & BTREE_UPDATE_key_cache_reclaim) && + (iter->flags & BTREE_ITER_filter_snapshots)) { ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p); if (unlikely(ret < 0)) return ret; @@ -528,7 +528,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter * Ensure that updates to cached btrees go to the key cache: */ struct btree_path *path = trans->paths + path_idx; - if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && + if (!(flags & 
BTREE_UPDATE_key_cache_reclaim) && !path->cached && !path->level && btree_id_cached(trans->c, path->btree_id)) { @@ -587,7 +587,7 @@ int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k; int ret = 0; - bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_INTENT); + bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_intent); k = bch2_btree_iter_prev(iter); ret = bkey_err(k); if (ret) @@ -621,15 +621,15 @@ void bch2_trans_commit_hook(struct btree_trans *trans, int bch2_btree_insert_nonextent(struct btree_trans *trans, enum btree_id btree, struct bkey_i *k, - enum btree_update_flags flags) + enum btree_iter_update_trigger_flags flags) { struct btree_iter iter; int ret; bch2_trans_iter_init(trans, &iter, btree, k->k.p, - BTREE_ITER_CACHED| - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_INTENT); + BTREE_ITER_cached| + BTREE_ITER_not_extents| + BTREE_ITER_intent); ret = bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(trans, &iter, k, flags); bch2_trans_iter_exit(trans, &iter); @@ -637,16 +637,13 @@ int bch2_btree_insert_nonextent(struct btree_trans *trans, } int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id, - struct bkey_i *k, enum btree_update_flags flags) + struct bkey_i *k, enum btree_iter_update_trigger_flags flags) { struct btree_iter iter; - int ret; - bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k), - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(trans, &iter, k, flags); + BTREE_ITER_intent|flags); + int ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, k, flags); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -698,8 +695,8 @@ int bch2_btree_delete(struct btree_trans *trans, int ret; bch2_trans_iter_init(trans, &iter, btree, pos, - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); + BTREE_ITER_cached| + BTREE_ITER_intent); ret = bch2_btree_iter_traverse(&iter) ?: bch2_btree_delete_at(trans, 
&iter, update_flags); bch2_trans_iter_exit(trans, &iter); @@ -717,7 +714,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, struct bkey_s_c k; int ret = 0; - bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT); + bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_intent); while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) { struct disk_reservation disk_res = bch2_disk_reservation_init(trans->c, 0); @@ -745,7 +742,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, */ delete.k.p = iter.pos; - if (iter.flags & BTREE_ITER_IS_EXTENTS) + if (iter.flags & BTREE_ITER_is_extents) bch2_key_resize(&delete.k, bpos_min(end, k.k->p).offset - iter.pos.offset); @@ -804,7 +801,7 @@ int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, k->k.p = pos; struct btree_iter iter; - bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT); + bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent); ret = bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(trans, &iter, k, 0); @@ -852,7 +849,7 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, if (ret) goto err; - if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) { + if (!test_bit(JOURNAL_running, &c->journal.flags)) { ret = darray_make_room(&c->journal.early_journal_entries, jset_u64s(u64s)); if (ret) goto err; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index cc7c53e83f..b4894e4d54 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -44,16 +44,18 @@ enum bch_trans_commit_flags { #undef x }; +void bch2_trans_commit_flags_to_text(struct printbuf *, enum bch_trans_commit_flags); + int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *, unsigned, unsigned); int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned); int 
bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id, - struct bkey_i *, enum btree_update_flags); + struct bkey_i *, enum btree_iter_update_trigger_flags); int bch2_btree_insert_trans(struct btree_trans *, enum btree_id, struct bkey_i *, - enum btree_update_flags); + enum btree_iter_update_trigger_flags); int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, struct disk_reservation *, int flags); @@ -94,14 +96,14 @@ static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans, } int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *, - enum btree_update_flags, + enum btree_iter_update_trigger_flags, struct bkey_s_c, struct bkey_s_c); int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *, enum btree_id, struct bpos); int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, - struct bkey_i *, enum btree_update_flags); + struct bkey_i *, enum btree_iter_update_trigger_flags); struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *, unsigned); @@ -276,7 +278,7 @@ static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *tr unsigned flags, unsigned type, unsigned min_bytes) { struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter, - btree_id, pos, flags|BTREE_ITER_INTENT, type); + btree_id, pos, flags|BTREE_ITER_intent, type); struct bkey_i *ret = IS_ERR(k.k) ? 
ERR_CAST(k.k) : __bch2_bkey_make_mut_noupdate(trans, k, 0, min_bytes); @@ -299,7 +301,7 @@ static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans, unsigned flags, unsigned type, unsigned min_bytes) { struct bkey_i *mut = __bch2_bkey_get_mut_noupdate(trans, iter, - btree_id, pos, flags|BTREE_ITER_INTENT, type, min_bytes); + btree_id, pos, flags|BTREE_ITER_intent, type, min_bytes); int ret; if (IS_ERR(mut)) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index b4efd8cc4d..60b8544cea 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -38,22 +38,6 @@ static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, btree_path_idx_t, struct btree *, struct keylist *); static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); -static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans, - enum btree_id btree_id, - unsigned level, - struct bpos pos) -{ - btree_path_idx_t path_idx = bch2_path_get(trans, btree_id, pos, level + 1, level, - BTREE_ITER_NOPRESERVE| - BTREE_ITER_INTENT, _RET_IP_); - path_idx = bch2_btree_path_make_mut(trans, path_idx, true, _RET_IP_); - - struct btree_path *path = trans->paths + path_idx; - bch2_btree_path_downgrade(trans, path); - __bch2_btree_path_unlock(trans, path); - return path_idx; -} - /* * Verify that child nodes correctly span parent node's range: */ @@ -73,6 +57,24 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, b->data->min_key)); + if (b == btree_node_root(c, b)) { + if (!bpos_eq(b->data->min_key, POS_MIN)) { + printbuf_reset(&buf); + bch2_bpos_to_text(&buf, b->data->min_key); + need_fsck_err(c, btree_root_bad_min_key, + "btree root with incorrect min_key: %s", buf.buf); + goto topology_repair; + } + + if (!bpos_eq(b->data->max_key, SPOS_MAX)) { + printbuf_reset(&buf); + bch2_bpos_to_text(&buf, 
b->data->max_key); + need_fsck_err(c, btree_root_bad_max_key, + "btree root with incorrect max_key: %s", buf.buf); + goto topology_repair; + } + } + if (!b->c.level) return 0; @@ -158,7 +160,6 @@ topology_repair: static void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b) { struct bkey_packed *k; - struct bset_tree *t; struct bkey uk; for_each_bset(b, t) @@ -646,7 +647,7 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; ret = bch2_key_trigger_old(trans, as->btree_id, level, bkey_i_to_s_c(k), - BTREE_TRIGGER_TRANSACTIONAL); + BTREE_TRIGGER_transactional); if (ret) return ret; } @@ -655,7 +656,7 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; ret = bch2_key_trigger_new(trans, as->btree_id, level, bkey_i_to_s(k), - BTREE_TRIGGER_TRANSACTIONAL); + BTREE_TRIGGER_transactional); if (ret) return ret; } @@ -735,9 +736,6 @@ err: */ b = READ_ONCE(as->b); if (b) { - btree_path_idx_t path_idx = get_unlocked_mut_path(trans, - as->btree_id, b->c.level, b->key.k.p); - struct btree_path *path = trans->paths + path_idx; /* * @b is the node we did the final insert into: * @@ -755,12 +753,16 @@ err: * btree_node_lock_nopath() (the use of which is always suspect, * we need to work on removing this in the future) * - * It should be, but get_unlocked_mut_path() -> bch2_path_get() + * It should be, but bch2_path_get_unlocked_mut() -> bch2_path_get() * calls bch2_path_upgrade(), before we call path_make_mut(), so * we may rarely end up with a locked path besides the one we * have here: */ bch2_trans_unlock(trans); + bch2_trans_begin(trans); + btree_path_idx_t path_idx = bch2_path_get_unlocked_mut(trans, + as->btree_id, b->c.level, b->key.k.p); + struct btree_path *path = trans->paths + path_idx; btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); mark_btree_node_locked(trans, path, b->c.level, 
BTREE_NODE_INTENT_LOCKED); path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock); @@ -1154,13 +1156,12 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, flags |= watermark; if (watermark < BCH_WATERMARK_reclaim && - test_bit(JOURNAL_SPACE_LOW, &c->journal.flags)) { + test_bit(JOURNAL_space_low, &c->journal.flags)) { if (flags & BCH_TRANS_COMMIT_journal_reclaim) return ERR_PTR(-BCH_ERR_journal_reclaim_would_deadlock); - bch2_trans_unlock(trans); - wait_event(c->journal.wait, !test_bit(JOURNAL_SPACE_LOW, &c->journal.flags)); - ret = bch2_trans_relock(trans); + ret = drop_locks_do(trans, + ({ wait_event(c->journal.wait, !test_bit(JOURNAL_space_low, &c->journal.flags)); 0; })); if (ret) return ERR_PTR(ret); } @@ -1206,7 +1207,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, as->start_time = start_time; as->ip_started = _RET_IP_; as->mode = BTREE_UPDATE_none; - as->watermark = watermark; + as->flags = flags; as->took_gc_lock = true; as->btree_id = path->btree_id; as->update_level_start = level_start; @@ -1360,7 +1361,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 && !btree_ptr_sectors_written(insert)); - if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) + if (unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags))) bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p); if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert), @@ -1619,12 +1620,12 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, six_unlock_write(&n2->c.lock); six_unlock_write(&n1->c.lock); - path1 = get_unlocked_mut_path(trans, as->btree_id, n1->c.level, n1->key.k.p); + path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p); six_lock_increment(&n1->c.lock, SIX_LOCK_intent); mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); 
bch2_btree_path_level_init(trans, trans->paths + path1, n1); - path2 = get_unlocked_mut_path(trans, as->btree_id, n2->c.level, n2->key.k.p); + path2 = bch2_path_get_unlocked_mut(trans, as->btree_id, n2->c.level, n2->key.k.p); six_lock_increment(&n2->c.lock, SIX_LOCK_intent); mark_btree_node_locked(trans, trans->paths + path2, n2->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, trans->paths + path2, n2); @@ -1669,7 +1670,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, bch2_btree_update_add_new_node(as, n1); six_unlock_write(&n1->c.lock); - path1 = get_unlocked_mut_path(trans, as->btree_id, n1->c.level, n1->key.k.p); + path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p); six_lock_increment(&n1->c.lock, SIX_LOCK_intent); mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, trans->paths + path1, n1); @@ -1947,6 +1948,8 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, u64 start_time = local_clock(); int ret = 0; + bch2_trans_verify_not_in_restart(trans); + bch2_trans_verify_not_unlocked(trans); BUG_ON(!trans->paths[path].should_be_locked); BUG_ON(!btree_node_locked(&trans->paths[path], level)); @@ -1979,7 +1982,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, : bpos_successor(b->data->max_key); sib_path = bch2_path_get(trans, btree, sib_pos, - U8_MAX, level, BTREE_ITER_INTENT, _THIS_IP_); + U8_MAX, level, BTREE_ITER_intent, _THIS_IP_); ret = bch2_btree_path_traverse(trans, sib_path, false); if (ret) goto err; @@ -2072,7 +2075,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_btree_update_add_new_node(as, n); six_unlock_write(&n->c.lock); - new_path = get_unlocked_mut_path(trans, btree, n->c.level, n->key.k.p); + new_path = bch2_path_get_unlocked_mut(trans, btree, n->c.level, n->key.k.p); six_lock_increment(&n->c.lock, SIX_LOCK_intent); 
mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, trans->paths + new_path, n); @@ -2150,7 +2153,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, bch2_btree_update_add_new_node(as, n); six_unlock_write(&n->c.lock); - new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p); + new_path = bch2_path_get_unlocked_mut(trans, iter->btree_id, n->c.level, n->key.k.p); six_lock_increment(&n->c.lock, SIX_LOCK_intent); mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); bch2_btree_path_level_init(trans, trans->paths + new_path, n); @@ -2333,10 +2336,10 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, if (!skip_triggers) { ret = bch2_key_trigger_old(trans, b->c.btree_id, b->c.level + 1, bkey_i_to_s_c(&b->key), - BTREE_TRIGGER_TRANSACTIONAL) ?: + BTREE_TRIGGER_transactional) ?: bch2_key_trigger_new(trans, b->c.btree_id, b->c.level + 1, bkey_i_to_s(new_key), - BTREE_TRIGGER_TRANSACTIONAL); + BTREE_TRIGGER_transactional); if (ret) return ret; } @@ -2353,7 +2356,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, bch2_trans_copy_iter(&iter2, iter); iter2.path = bch2_btree_path_make_mut(trans, iter2.path, - iter2.flags & BTREE_ITER_INTENT, + iter2.flags & BTREE_ITER_intent, _THIS_IP_); struct btree_path *path2 = btree_iter_path(trans, &iter2); @@ -2365,7 +2368,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, trans->paths_sorted = false; ret = bch2_btree_iter_traverse(&iter2) ?: - bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN); + bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_norun); if (ret) goto err; } else { @@ -2473,7 +2476,7 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p, BTREE_MAX_DEPTH, b->c.level, - BTREE_ITER_INTENT); + BTREE_ITER_intent); ret = 
bch2_btree_iter_traverse(&iter); if (ret) goto out; @@ -2487,7 +2490,6 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, BUG_ON(!btree_node_hashed(b)); - struct bch_extent_ptr *ptr; bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr, !bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev)); @@ -2511,7 +2513,7 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b) bch2_btree_set_root_inmem(c, b); } -static int __bch2_btree_root_alloc_fake(struct btree_trans *trans, enum btree_id id, unsigned level) +int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id, unsigned level) { struct bch_fs *c = trans->c; struct closure cl; @@ -2559,17 +2561,18 @@ static int __bch2_btree_root_alloc_fake(struct btree_trans *trans, enum btree_id void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level) { - bch2_trans_run(c, __bch2_btree_root_alloc_fake(trans, id, level)); + bch2_trans_run(c, bch2_btree_root_alloc_fake_trans(trans, id, level)); } static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as) { - prt_printf(out, "%ps: btree=%s l=%u-%u watermark=%s mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", - (void *) as->ip_started, + prt_printf(out, "%ps: ", (void *) as->ip_started); + bch2_trans_commit_flags_to_text(out, as->flags); + + prt_printf(out, " btree=%s l=%u-%u mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", bch2_btree_id_str(as->btree_id), as->update_level_start, as->update_level_end, - bch2_watermarks[as->watermark], bch2_btree_update_modes[as->mode], as->nodes_written, closure_nr_remaining(&as->cl), diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index c1a479ebaa..b5b76ce01c 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -52,7 +52,7 @@ struct btree_update { struct list_head unwritten_list; enum btree_update_mode mode; - enum bch_watermark watermark; + enum 
bch_trans_commit_flags flags; unsigned nodes_written:1; unsigned took_gc_lock:1; @@ -144,6 +144,9 @@ static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans, EBUG_ON(!btree_node_locked(path, level)); + if (bch2_btree_node_merging_disabled) + return 0; + b = path->l[level].b; if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold) return 0; @@ -172,6 +175,8 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *, struct bkey_i *, unsigned, bool); void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); + +int bch2_btree_root_alloc_fake_trans(struct btree_trans *, enum btree_id, unsigned); void bch2_btree_root_alloc_fake(struct bch_fs *, enum btree_id, unsigned); static inline unsigned btree_update_reserve_required(struct bch_fs *c, diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 36a6f42aba..d0e92d9480 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -1,11 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey_buf.h" #include "btree_locking.h" #include "btree_update.h" #include "btree_update_interior.h" #include "btree_write_buffer.h" #include "error.h" +#include "extents.h" #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" @@ -122,7 +124,7 @@ static noinline int wb_flush_one_slowpath(struct btree_trans *trans, trans->journal_res.seq = wb->journal_seq; return bch2_trans_update(trans, iter, &wb->k, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + BTREE_UPDATE_internal_snapshot_node) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc| BCH_TRANS_COMMIT_no_check_rw| @@ -191,13 +193,13 @@ btree_write_buffered_insert(struct btree_trans *trans, int ret; bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k), - BTREE_ITER_CACHED|BTREE_ITER_INTENT); + BTREE_ITER_cached|BTREE_ITER_intent); trans->journal_res.seq = wb->journal_seq; ret = 
bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(trans, &iter, &wb->k, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -332,7 +334,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) if (!iter.path || iter.btree_id != k->btree) { bch2_trans_iter_exit(trans, &iter); bch2_trans_iter_init(trans, &iter, k->btree, k->k.k.p, - BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS); + BTREE_ITER_intent|BTREE_ITER_all_snapshots); } bch2_btree_iter_set_pos(&iter, k->k.k.p); @@ -492,6 +494,41 @@ int bch2_btree_write_buffer_tryflush(struct btree_trans *trans) return ret; } +/** + * In check and repair code, when checking references to write buffer btrees we + * need to issue a flush before we have a definitive error: this issues a flush + * if this is a key we haven't yet checked. + */ +int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans, + struct bkey_s_c referring_k, + struct bkey_buf *last_flushed) +{ + struct bch_fs *c = trans->c; + struct bkey_buf tmp; + int ret = 0; + + bch2_bkey_buf_init(&tmp); + + if (!bkey_and_val_eq(referring_k, bkey_i_to_s_c(last_flushed->k))) { + bch2_bkey_buf_reassemble(&tmp, c, referring_k); + + if (bkey_is_btree_ptr(referring_k.k)) { + bch2_trans_unlock(trans); + bch2_btree_interior_updates_flush(c); + } + + ret = bch2_btree_write_buffer_flush_sync(trans); + if (ret) + goto err; + + bch2_bkey_buf_copy(last_flushed, c, tmp.k); + ret = -BCH_ERR_transaction_restart_write_buffer_flush; + } +err: + bch2_bkey_buf_exit(&tmp, c); + return ret; +} + static void bch2_btree_write_buffer_flush_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, btree_write_buffer.flush_work); diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h index eebcd2b152..dd5e64218b 100644 --- a/fs/bcachefs/btree_write_buffer.h +++ b/fs/bcachefs/btree_write_buffer.h @@ -23,6 +23,9 @@ int 
bch2_btree_write_buffer_flush_sync(struct btree_trans *); int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *); int bch2_btree_write_buffer_tryflush(struct btree_trans *); +struct bkey_buf; +int bch2_btree_write_buffer_maybe_flush(struct btree_trans *, struct bkey_s_c, struct bkey_buf *); + struct journal_keys_to_wb { struct btree_write_buffer_keys *wb; size_t room; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 82f1792588..314ee3e018 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -274,25 +274,14 @@ void bch2_dev_usage_init(struct bch_dev *ca) void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage) { - prt_tab(out); - prt_str(out, "buckets"); - prt_tab_rjust(out); - prt_str(out, "sectors"); - prt_tab_rjust(out); - prt_str(out, "fragmented"); - prt_tab_rjust(out); - prt_newline(out); + prt_printf(out, "\tbuckets\rsectors\rfragmented\r\n"); for (unsigned i = 0; i < BCH_DATA_NR; i++) { bch2_prt_data_type(out, i); - prt_tab(out); - prt_u64(out, usage->d[i].buckets); - prt_tab_rjust(out); - prt_u64(out, usage->d[i].sectors); - prt_tab_rjust(out); - prt_u64(out, usage->d[i].fragmented); - prt_tab_rjust(out); - prt_newline(out); + prt_printf(out, "\t%llu\r%llu\r%llu\r\n", + usage->d[i].buckets, + usage->d[i].sectors, + usage->d[i].fragmented); } } @@ -329,26 +318,6 @@ void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, preempt_enable(); } -static inline struct bch_alloc_v4 bucket_m_to_alloc(struct bucket b) -{ - return (struct bch_alloc_v4) { - .gen = b.gen, - .data_type = b.data_type, - .dirty_sectors = b.dirty_sectors, - .cached_sectors = b.cached_sectors, - .stripe = b.stripe, - }; -} - -void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca, - struct bucket *old, struct bucket *new) -{ - struct bch_alloc_v4 old_a = bucket_m_to_alloc(*old); - struct bch_alloc_v4 new_a = bucket_m_to_alloc(*new); - - bch2_dev_usage_update(c, ca, &old_a, &new_a, 0, true); -} - static inline 
int __update_replicas(struct bch_fs *c, struct bch_fs_usage *fs_usage, struct bch_replicas_entry_v1 *r, @@ -496,78 +465,305 @@ int bch2_update_cached_sectors_list(struct btree_trans *trans, unsigned dev, s64 return bch2_update_replicas_list(trans, &r.e, sectors); } -int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, enum bch_data_type data_type, - unsigned sectors, struct gc_pos pos, - unsigned flags) +static int bch2_check_fix_ptr(struct btree_trans *trans, + struct bkey_s_c k, + struct extent_ptr_decoded p, + const union bch_extent_entry *entry, + bool *do_update) { - struct bucket old, new, *g; + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; int ret = 0; - BUG_ON(!(flags & BTREE_TRIGGER_GC)); - BUG_ON(data_type != BCH_DATA_sb && - data_type != BCH_DATA_journal); - - /* - * Backup superblock might be past the end of our normal usable space: - */ - if (b >= ca->mi.nbuckets) + struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); + if (!ca) { + if (fsck_err(c, ptr_to_invalid_device, + "pointer to missing device %u\n" + "while marking %s", + p.ptr.dev, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + *do_update = true; return 0; + } - percpu_down_read(&c->mark_lock); - g = gc_bucket(ca, b); + struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); + if (!g) { + if (fsck_err(c, ptr_to_invalid_device, + "pointer to invalid bucket on device %u\n" + "while marking %s", + p.ptr.dev, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + *do_update = true; + goto out; + } - bucket_lock(g); - old = *g; + enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry); - if (bch2_fs_inconsistent_on(g->data_type && - g->data_type != data_type, c, - "different types of data in same bucket: %s, %s", + if (fsck_err_on(!g->gen_valid, + c, ptr_to_missing_alloc_key, + "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + 
bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), + p.ptr.gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + if (!p.ptr.cached) { + g->gen_valid = true; + g->gen = p.ptr.gen; + } else { + *do_update = true; + } + } + + if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, + c, ptr_gen_newer_than_bucket_gen, + "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), + p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + if (!p.ptr.cached && + (g->data_type != BCH_DATA_btree || + data_type == BCH_DATA_btree)) { + g->gen_valid = true; + g->gen = p.ptr.gen; + g->data_type = 0; + g->dirty_sectors = 0; + g->cached_sectors = 0; + } else { + *do_update = true; + } + } + + if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, + c, ptr_gen_newer_than_bucket_gen, + "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, + bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), + p.ptr.gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + *do_update = true; + + if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0, + c, stale_dirty_ptr, + "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_type_str(ptr_data_type(k.k, &p.ptr)), + p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + *do_update = true; + + if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) + goto out; + + if (fsck_err_on(bucket_data_type_mismatch(g->data_type, data_type), + c, ptr_bucket_data_type_mismatch, + "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, bch2_data_type_str(g->data_type), - 
bch2_data_type_str(data_type))) { - ret = -EIO; - goto err; + bch2_data_type_str(data_type), + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + if (data_type == BCH_DATA_btree) { + g->gen_valid = true; + g->gen = p.ptr.gen; + g->data_type = data_type; + g->dirty_sectors = 0; + g->cached_sectors = 0; + } else { + *do_update = true; + } } - if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c, - "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > bucket size", - ca->dev_idx, b, g->gen, - bch2_data_type_str(g->data_type ?: data_type), - g->dirty_sectors, sectors)) { - ret = -EIO; - goto err; + if (p.has_ec) { + struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx); + + if (fsck_err_on(!m || !m->alive, + c, ptr_to_missing_stripe, + "pointer to nonexistent stripe %llu\n" + "while marking %s", + (u64) p.ec.idx, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + *do_update = true; + + if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), + c, ptr_to_incorrect_stripe, + "pointer does not match stripe %llu\n" + "while marking %s", + (u64) p.ec.idx, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + *do_update = true; + } +out: +fsck_err: + bch2_dev_put(ca); + printbuf_exit(&buf); + return ret; +} + +int bch2_check_fix_ptrs(struct btree_trans *trans, + enum btree_id btree, unsigned level, struct bkey_s_c k, + enum btree_iter_update_trigger_flags flags) +{ + struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry_c; + struct extent_ptr_decoded p = { 0 }; + bool do_update = false; + struct printbuf buf = PRINTBUF; + int ret = 0; + + percpu_down_read(&c->mark_lock); + + bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) { + ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update); + if (ret) + goto err; } - g->data_type = data_type; - g->dirty_sectors += sectors; - new = *g; + 
if (do_update) { + if (flags & BTREE_TRIGGER_is_root) { + bch_err(c, "cannot update btree roots yet"); + ret = -EINVAL; + goto err; + } + + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + goto err; + + rcu_read_lock(); + bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_rcu(c, ptr->dev)); + rcu_read_unlock(); + + if (level) { + /* + * We don't want to drop btree node pointers - if the + * btree node isn't there anymore, the read path will + * sort it out: + */ + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + rcu_read_lock(); + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + struct bucket *g = PTR_GC_BUCKET(ca, ptr); + + ptr->gen = g->gen; + } + rcu_read_unlock(); + } else { + struct bkey_ptrs ptrs; + union bch_extent_entry *entry; + + rcu_read_lock(); +restart_drop_ptrs: + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + bkey_for_each_ptr_decode(bkey_i_to_s(new).k, ptrs, p, entry) { + struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); + struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(bkey_i_to_s_c(new), p, entry); + + if ((p.ptr.cached && + (!g->gen_valid || gen_cmp(p.ptr.gen, g->gen) > 0)) || + (!p.ptr.cached && + gen_cmp(p.ptr.gen, g->gen) < 0) || + gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX || + (g->data_type && + g->data_type != data_type)) { + bch2_bkey_drop_ptr(bkey_i_to_s(new), &entry->ptr); + goto restart_drop_ptrs; + } + } + rcu_read_unlock(); +again: + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + bkey_extent_entry_for_each(ptrs, entry) { + if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) { + struct gc_stripe *m = genradix_ptr(&c->gc_stripes, + entry->stripe_ptr.idx); + union bch_extent_entry *next_ptr; + + bkey_extent_entry_for_each_from(ptrs, next_ptr, entry) + if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr) + goto found; + next_ptr = NULL; +found: + if (!next_ptr) { + bch_err(c, 
"aieee, found stripe ptr with no data ptr"); + continue; + } + + if (!m || !m->alive || + !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block], + &next_ptr->ptr, + m->sectors)) { + bch2_bkey_extent_entry_drop(new, entry); + goto again; + } + } + } + } + + if (0) { + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, k); + bch_info(c, "updated %s", buf.buf); + + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); + bch_info(c, "new key %s", buf.buf); + } + + percpu_up_read(&c->mark_lock); + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level, + BTREE_ITER_intent|BTREE_ITER_all_snapshots); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, new, + BTREE_UPDATE_internal_snapshot_node| + BTREE_TRIGGER_norun); + bch2_trans_iter_exit(trans, &iter); + percpu_down_read(&c->mark_lock); + + if (ret) + goto err; + + if (level) + bch2_btree_node_update_key_early(trans, btree, level - 1, k, new); + } err: - bucket_unlock(g); - if (!ret) - bch2_dev_usage_update_m(c, ca, &old, &new); percpu_up_read(&c->mark_lock); + printbuf_exit(&buf); return ret; } -int bch2_check_bucket_ref(struct btree_trans *trans, - struct bkey_s_c k, - const struct bch_extent_ptr *ptr, - s64 sectors, enum bch_data_type ptr_data_type, - u8 b_gen, u8 bucket_data_type, - u32 bucket_sectors) +int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, + struct bkey_s_c k, + const struct bch_extent_ptr *ptr, + s64 sectors, enum bch_data_type ptr_data_type, + u8 b_gen, u8 bucket_data_type, + u32 *bucket_sectors) { struct bch_fs *c = trans->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); size_t bucket_nr = PTR_BUCKET_NR(ca, ptr); struct printbuf buf = PRINTBUF; + bool inserting = sectors > 0; int ret = 0; - if (bucket_data_type == BCH_DATA_cached) - bucket_data_type = BCH_DATA_user; - - if ((bucket_data_type == BCH_DATA_stripe && ptr_data_type == BCH_DATA_user) || - (bucket_data_type == 
BCH_DATA_user && ptr_data_type == BCH_DATA_stripe)) - bucket_data_type = ptr_data_type = BCH_DATA_stripe; + BUG_ON(!sectors); if (gen_after(ptr->gen, b_gen)) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, @@ -578,8 +774,9 @@ int bch2_check_bucket_ref(struct btree_trans *trans, bch2_data_type_str(bucket_data_type ?: ptr_data_type), ptr->gen, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - ret = -EIO; - goto err; + if (inserting) + goto err; + goto out; } if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { @@ -592,33 +789,33 @@ int bch2_check_bucket_ref(struct btree_trans *trans, ptr->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - ret = -EIO; - goto err; + if (inserting) + goto err; + goto out; + } + + if (b_gen != ptr->gen && ptr->cached) { + ret = 1; + goto out; } - if (b_gen != ptr->gen && !ptr->cached) { + if (b_gen != ptr->gen) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, BCH_FSCK_ERR_stale_dirty_ptr, "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n" "while marking %s", ptr->dev, bucket_nr, b_gen, - *bucket_gen(ca, bucket_nr), + bucket_gen_get(ca, bucket_nr), bch2_data_type_str(bucket_data_type ?: ptr_data_type), ptr->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - ret = -EIO; - goto err; - } - - if (b_gen != ptr->gen) { - ret = 1; + if (inserting) + goto err; goto out; } - if (!data_type_is_empty(bucket_data_type) && - ptr_data_type && - bucket_data_type != ptr_data_type) { + if (bucket_data_type_mismatch(bucket_data_type, ptr_data_type)) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, BCH_FSCK_ERR_ptr_bucket_data_type_mismatch, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" @@ -628,28 +825,33 @@ int bch2_check_bucket_ref(struct btree_trans *trans, bch2_data_type_str(ptr_data_type), (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - ret = -EIO; - goto err; + if (inserting) + goto err; + goto out; } - if ((u64) 
bucket_sectors + sectors > U32_MAX) { + if ((u64) *bucket_sectors + sectors > U32_MAX) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, BCH_FSCK_ERR_bucket_sector_count_overflow, "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n" "while marking %s", ptr->dev, bucket_nr, b_gen, bch2_data_type_str(bucket_data_type ?: ptr_data_type), - bucket_sectors, sectors, + *bucket_sectors, sectors, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - ret = -EIO; - goto err; + if (inserting) + goto err; + sectors = -*bucket_sectors; } + + *bucket_sectors += sectors; out: printbuf_exit(&buf); return ret; err: bch2_dump_trans_updates(trans); + ret = -EIO; goto out; } @@ -786,29 +988,22 @@ need_mark: /* KEY_TYPE_extent: */ -static int __mark_pointer(struct btree_trans *trans, +static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca, struct bkey_s_c k, const struct bch_extent_ptr *ptr, s64 sectors, enum bch_data_type ptr_data_type, - u8 bucket_gen, u8 *bucket_data_type, - u32 *dirty_sectors, u32 *cached_sectors) + struct bch_alloc_v4 *a) { u32 *dst_sectors = !ptr->cached - ? dirty_sectors - : cached_sectors; - int ret = bch2_check_bucket_ref(trans, k, ptr, sectors, ptr_data_type, - bucket_gen, *bucket_data_type, *dst_sectors); + ? 
&a->dirty_sectors + : &a->cached_sectors; + int ret = bch2_bucket_ref_update(trans, ca, k, ptr, sectors, ptr_data_type, + a->gen, a->data_type, dst_sectors); if (ret) return ret; - *dst_sectors += sectors; - - if (!*dirty_sectors && !*cached_sectors) - *bucket_data_type = 0; - else if (*bucket_data_type != BCH_DATA_stripe) - *bucket_data_type = ptr_data_type; - + alloc_data_type_set(a, ptr_data_type); return 0; } @@ -816,81 +1011,79 @@ static int bch2_trigger_pointer(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct extent_ptr_decoded p, const union bch_extent_entry *entry, - s64 *sectors, unsigned flags) + s64 *sectors, + enum btree_iter_update_trigger_flags flags) { - bool insert = !(flags & BTREE_TRIGGER_OVERWRITE); + bool insert = !(flags & BTREE_TRIGGER_overwrite); + struct printbuf buf = PRINTBUF; + int ret = 0; + + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); + if (unlikely(!ca)) { + if (insert) + ret = -EIO; + goto err; + } + struct bpos bucket; struct bch_backpointer bp; - - bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, entry, &bucket, &bp); + bch2_extent_ptr_to_bp(trans->c, ca, btree_id, level, k, p, entry, &bucket, &bp); *sectors = insert ? 
bp.bucket_len : -((s64) bp.bucket_len); - if (flags & BTREE_TRIGGER_TRANSACTIONAL) { - struct btree_iter iter; - struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, &iter, bucket); - int ret = PTR_ERR_OR_ZERO(a); - if (ret) - return ret; - - ret = __mark_pointer(trans, k, &p.ptr, *sectors, bp.data_type, - a->v.gen, &a->v.data_type, - &a->v.dirty_sectors, &a->v.cached_sectors) ?: - bch2_trans_update(trans, &iter, &a->k_i, 0); - bch2_trans_iter_exit(trans, &iter); - + if (flags & BTREE_TRIGGER_transactional) { + struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket); + ret = PTR_ERR_OR_ZERO(a) ?: + __mark_pointer(trans, ca, k, &p.ptr, *sectors, bp.data_type, &a->v); if (ret) - return ret; + goto err; if (!p.ptr.cached) { - ret = bch2_bucket_backpointer_mod(trans, bucket, bp, k, insert); + ret = bch2_bucket_backpointer_mod(trans, ca, bucket, bp, k, insert); if (ret) - return ret; + goto err; } } - if (flags & BTREE_TRIGGER_GC) { - struct bch_fs *c = trans->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry); - + if (flags & BTREE_TRIGGER_gc) { percpu_down_read(&c->mark_lock); - struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); - bucket_lock(g); - struct bucket old = *g; - - u8 bucket_data_type = g->data_type; - int ret = __mark_pointer(trans, k, &p.ptr, *sectors, - data_type, g->gen, - &bucket_data_type, - &g->dirty_sectors, - &g->cached_sectors); - if (ret) { - bucket_unlock(g); - percpu_up_read(&c->mark_lock); - return ret; + struct bucket *g = gc_bucket(ca, bucket.offset); + if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", + p.ptr.dev, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = -EIO; + goto err_unlock; } - g->data_type = bucket_data_type; - struct bucket new = *g; + bucket_lock(g); + struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; + ret = __mark_pointer(trans, ca, k, &p.ptr, *sectors, bp.data_type, 
&new); + if (!ret) { + alloc_to_bucket(g, new); + bch2_dev_usage_update(c, ca, &old, &new, 0, true); + } bucket_unlock(g); - bch2_dev_usage_update_m(c, ca, &old, &new); +err_unlock: percpu_up_read(&c->mark_lock); } - - return 0; +err: + bch2_dev_put(ca); + printbuf_exit(&buf); + return ret; } static int bch2_trigger_stripe_ptr(struct btree_trans *trans, struct bkey_s_c k, struct extent_ptr_decoded p, enum bch_data_type data_type, - s64 sectors, unsigned flags) + s64 sectors, + enum btree_iter_update_trigger_flags flags) { - if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + if (flags & BTREE_TRIGGER_transactional) { struct btree_iter iter; struct bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_stripes, POS(0, p.ec.idx), - BTREE_ITER_WITH_UPDATES, stripe); + BTREE_ITER_with_updates, stripe); int ret = PTR_ERR_OR_ZERO(s); if (unlikely(ret)) { bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans, @@ -920,10 +1113,10 @@ err: return ret; } - if (flags & BTREE_TRIGGER_GC) { + if (flags & BTREE_TRIGGER_gc) { struct bch_fs *c = trans->c; - BUG_ON(!(flags & BTREE_TRIGGER_GC)); + BUG_ON(!(flags & BTREE_TRIGGER_gc)); struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL); if (!m) { @@ -959,9 +1152,10 @@ err: static int __trigger_extent(struct btree_trans *trans, enum btree_id btree_id, unsigned level, - struct bkey_s_c k, unsigned flags) + struct bkey_s_c k, + enum btree_iter_update_trigger_flags flags) { - bool gc = flags & BTREE_TRIGGER_GC; + bool gc = flags & BTREE_TRIGGER_gc; struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ -970,7 +1164,7 @@ static int __trigger_extent(struct btree_trans *trans, enum bch_data_type data_type = bkey_is_btree_ptr(k.k) ? 
BCH_DATA_btree : BCH_DATA_user; - s64 dirty_sectors = 0; + s64 replicas_sectors = 0; int ret = 0; r.e.data_type = data_type; @@ -978,7 +1172,7 @@ static int __trigger_extent(struct btree_trans *trans, r.e.nr_required = 1; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - s64 disk_sectors; + s64 disk_sectors = 0; ret = bch2_trigger_pointer(trans, btree_id, level, k, p, entry, &disk_sectors, flags); if (ret < 0) return ret; @@ -996,7 +1190,7 @@ static int __trigger_extent(struct btree_trans *trans, return ret; } } else if (!p.has_ec) { - dirty_sectors += disk_sectors; + replicas_sectors += disk_sectors; r.e.devs[r.e.nr_devs++] = p.ptr.dev; } else { ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags); @@ -1014,8 +1208,8 @@ static int __trigger_extent(struct btree_trans *trans, if (r.e.nr_devs) { ret = !gc - ? bch2_update_replicas_list(trans, &r.e, dirty_sectors) - : bch2_update_replicas(c, k, &r.e, dirty_sectors, 0, true); + ? bch2_update_replicas_list(trans, &r.e, replicas_sectors) + : bch2_update_replicas(c, k, &r.e, replicas_sectors, 0, true); if (unlikely(ret && gc)) { struct printbuf buf = PRINTBUF; @@ -1031,15 +1225,18 @@ static int __trigger_extent(struct btree_trans *trans, } int bch2_trigger_extent(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, + enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c); struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old); unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start; unsigned old_ptrs_bytes = (void *) old_ptrs.end - (void *) old_ptrs.start; + if (unlikely(flags & BTREE_TRIGGER_check_repair)) + return bch2_check_fix_ptrs(trans, btree, level, new.s_c, flags); + /* if pointers aren't changing - nothing to do: */ if (new_ptrs_bytes == old_ptrs_bytes && !memcmp(new_ptrs.start, @@ -1047,7 +1244,7 @@ int 
bch2_trigger_extent(struct btree_trans *trans, new_ptrs_bytes)) return 0; - if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + if (flags & BTREE_TRIGGER_transactional) { struct bch_fs *c = trans->c; int mod = (int) bch2_bkey_needs_rebalance(c, new.s_c) - (int) bch2_bkey_needs_rebalance(c, old); @@ -1060,8 +1257,8 @@ int bch2_trigger_extent(struct btree_trans *trans, } } - if (flags & (BTREE_TRIGGER_TRANSACTIONAL|BTREE_TRIGGER_GC)) - return trigger_run_overwrite_then_insert(__trigger_extent, trans, btree_id, level, old, new, flags); + if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) + return trigger_run_overwrite_then_insert(__trigger_extent, trans, btree, level, old, new, flags); return 0; } @@ -1069,17 +1266,17 @@ int bch2_trigger_extent(struct btree_trans *trans, /* KEY_TYPE_reservation */ static int __trigger_reservation(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, unsigned flags) + enum btree_id btree_id, unsigned level, struct bkey_s_c k, + enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; s64 sectors = (s64) k.k->size * replicas; - if (flags & BTREE_TRIGGER_OVERWRITE) + if (flags & BTREE_TRIGGER_overwrite) sectors = -sectors; - if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + if (flags & BTREE_TRIGGER_transactional) { int ret = bch2_replicas_deltas_realloc(trans, 0); if (ret) return ret; @@ -1090,7 +1287,7 @@ static int __trigger_reservation(struct btree_trans *trans, d->persistent_reserved[replicas - 1] += sectors; } - if (flags & BTREE_TRIGGER_GC) { + if (flags & BTREE_TRIGGER_gc) { percpu_down_read(&c->mark_lock); preempt_disable(); @@ -1110,7 +1307,7 @@ static int __trigger_reservation(struct btree_trans *trans, int bch2_trigger_reservation(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { return 
trigger_run_overwrite_then_insert(__trigger_reservation, trans, btree_id, level, old, new, flags); } @@ -1118,22 +1315,16 @@ int bch2_trigger_reservation(struct btree_trans *trans, /* Mark superblocks: */ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, - struct bch_dev *ca, size_t b, + struct bch_dev *ca, u64 b, enum bch_data_type type, unsigned sectors) { struct bch_fs *c = trans->c; struct btree_iter iter; - struct bkey_i_alloc_v4 *a; int ret = 0; - /* - * Backup superblock might be past the end of our normal usable space: - */ - if (b >= ca->mi.nbuckets) - return 0; - - a = bch2_trans_start_alloc_update(trans, &iter, POS(ca->dev_idx, b)); + struct bkey_i_alloc_v4 *a = + bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(ca->dev_idx, b)); if (IS_ERR(a)) return PTR_ERR(a); @@ -1161,20 +1352,74 @@ err: return ret; } +static int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, + u64 b, enum bch_data_type data_type, unsigned sectors, + enum btree_iter_update_trigger_flags flags) +{ + percpu_down_read(&c->mark_lock); + struct bucket *g = gc_bucket(ca, b); + if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u when marking metadata type %s", + ca->dev_idx, bch2_data_type_str(data_type))) + goto err_unlock; + + bucket_lock(g); + struct bch_alloc_v4 old = bucket_m_to_alloc(*g); + + if (bch2_fs_inconsistent_on(g->data_type && + g->data_type != data_type, c, + "different types of data in same bucket: %s, %s", + bch2_data_type_str(g->data_type), + bch2_data_type_str(data_type))) + goto err; + + if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c, + "bucket %u:%llu gen %u data type %s sector count overflow: %u + %u > bucket size", + ca->dev_idx, b, g->gen, + bch2_data_type_str(g->data_type ?: data_type), + g->dirty_sectors, sectors)) + goto err; + + g->data_type = data_type; + g->dirty_sectors += sectors; + struct bch_alloc_v4 new = bucket_m_to_alloc(*g); + 
bch2_dev_usage_update(c, ca, &old, &new, 0, true); + percpu_up_read(&c->mark_lock); + return 0; +err: + bucket_unlock(g); +err_unlock: + percpu_up_read(&c->mark_lock); + return -EIO; +} + int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, - struct bch_dev *ca, size_t b, - enum bch_data_type type, - unsigned sectors) + struct bch_dev *ca, u64 b, + enum bch_data_type type, unsigned sectors, + enum btree_iter_update_trigger_flags flags) { - return commit_do(trans, NULL, NULL, 0, - __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors)); + BUG_ON(type != BCH_DATA_free && + type != BCH_DATA_sb && + type != BCH_DATA_journal); + + /* + * Backup superblock might be past the end of our normal usable space: + */ + if (b >= ca->mi.nbuckets) + return 0; + + if (flags & BTREE_TRIGGER_gc) + return bch2_mark_metadata_bucket(trans->c, ca, b, type, sectors, flags); + else if (flags & BTREE_TRIGGER_transactional) + return commit_do(trans, NULL, NULL, 0, + __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors)); + else + BUG(); } static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans, - struct bch_dev *ca, - u64 start, u64 end, - enum bch_data_type type, - u64 *bucket, unsigned *bucket_sectors) + struct bch_dev *ca, u64 start, u64 end, + enum bch_data_type type, u64 *bucket, unsigned *bucket_sectors, + enum btree_iter_update_trigger_flags flags) { do { u64 b = sector_to_bucket(ca, start); @@ -1183,7 +1428,7 @@ static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans, if (b != *bucket && *bucket_sectors) { int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket, - type, *bucket_sectors); + type, *bucket_sectors, flags); if (ret) return ret; @@ -1198,8 +1443,8 @@ static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans, return 0; } -static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, - struct bch_dev *ca) +static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, struct bch_dev *ca, + enum 
btree_iter_update_trigger_flags flags) { struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; u64 bucket = 0; @@ -1212,21 +1457,21 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, if (offset == BCH_SB_SECTOR) { ret = bch2_trans_mark_metadata_sectors(trans, ca, 0, BCH_SB_SECTOR, - BCH_DATA_sb, &bucket, &bucket_sectors); + BCH_DATA_sb, &bucket, &bucket_sectors, flags); if (ret) return ret; } ret = bch2_trans_mark_metadata_sectors(trans, ca, offset, offset + (1 << layout->sb_max_size_bits), - BCH_DATA_sb, &bucket, &bucket_sectors); + BCH_DATA_sb, &bucket, &bucket_sectors, flags); if (ret) return ret; } if (bucket_sectors) { ret = bch2_trans_mark_metadata_bucket(trans, ca, - bucket, BCH_DATA_sb, bucket_sectors); + bucket, BCH_DATA_sb, bucket_sectors, flags); if (ret) return ret; } @@ -1234,7 +1479,7 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, for (i = 0; i < ca->journal.nr; i++) { ret = bch2_trans_mark_metadata_bucket(trans, ca, ca->journal.buckets[i], - BCH_DATA_journal, ca->mi.bucket_size); + BCH_DATA_journal, ca->mi.bucket_size, flags); if (ret) return ret; } @@ -1242,20 +1487,22 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, return 0; } -int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca) +int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca, + enum btree_iter_update_trigger_flags flags) { - int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(trans, ca)); - + int ret = bch2_trans_run(c, + __bch2_trans_mark_dev_sb(trans, ca, flags)); bch_err_fn(c, ret); return ret; } -int bch2_trans_mark_dev_sbs(struct bch_fs *c) +int bch2_trans_mark_dev_sbs_flags(struct bch_fs *c, + enum btree_iter_update_trigger_flags flags) { for_each_online_member(c, ca) { - int ret = bch2_trans_mark_dev_sb(c, ca); + int ret = bch2_trans_mark_dev_sb(c, ca, flags); if (ret) { - percpu_ref_put(&ca->ref); + percpu_ref_put(&ca->io_ref); return ret; } } @@ -1263,6 +1510,11 @@ int bch2_trans_mark_dev_sbs(struct 
bch_fs *c) return 0; } +int bch2_trans_mark_dev_sbs(struct bch_fs *c) +{ + return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_transactional); +} + /* Disk reservations: */ #define SECTORS_CACHE 1024 @@ -1331,6 +1583,31 @@ recalculate: /* Startup/shutdown: */ +void bch2_buckets_nouse_free(struct bch_fs *c) +{ + for_each_member_device(c, ca) { + kvfree_rcu_mightsleep(ca->buckets_nouse); + ca->buckets_nouse = NULL; + } +} + +int bch2_buckets_nouse_alloc(struct bch_fs *c) +{ + for_each_member_device(c, ca) { + BUG_ON(ca->buckets_nouse); + + ca->buckets_nouse = kvmalloc(BITS_TO_LONGS(ca->mi.nbuckets) * + sizeof(unsigned long), + GFP_KERNEL|__GFP_ZERO); + if (!ca->buckets_nouse) { + bch2_dev_put(ca); + return -BCH_ERR_ENOMEM_buckets_nouse; + } + } + + return 0; +} + static void bucket_gens_free_rcu(struct rcu_head *rcu) { struct bucket_gens *buckets = @@ -1342,26 +1619,21 @@ static void bucket_gens_free_rcu(struct rcu_head *rcu) int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) { struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL; - unsigned long *buckets_nouse = NULL; bool resize = ca->bucket_gens != NULL; int ret; + BUG_ON(resize && ca->buckets_nouse); + if (!(bucket_gens = kvmalloc(sizeof(struct bucket_gens) + nbuckets, GFP_KERNEL|__GFP_ZERO))) { ret = -BCH_ERR_ENOMEM_bucket_gens; goto err; } - if ((c->opts.buckets_nouse && - !(buckets_nouse = kvmalloc(BITS_TO_LONGS(nbuckets) * - sizeof(unsigned long), - GFP_KERNEL|__GFP_ZERO)))) { - ret = -BCH_ERR_ENOMEM_buckets_nouse; - goto err; - } - bucket_gens->first_bucket = ca->mi.first_bucket; bucket_gens->nbuckets = nbuckets; + bucket_gens->nbuckets_minus_first = + bucket_gens->nbuckets - bucket_gens->first_bucket; if (resize) { down_write(&c->gc_lock); @@ -1377,17 +1649,11 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) memcpy(bucket_gens->b, old_bucket_gens->b, n); - if (buckets_nouse) - memcpy(buckets_nouse, - ca->buckets_nouse, - 
BITS_TO_LONGS(n) * sizeof(unsigned long)); } rcu_assign_pointer(ca->bucket_gens, bucket_gens); bucket_gens = old_bucket_gens; - swap(ca->buckets_nouse, buckets_nouse); - nbuckets = ca->mi.nbuckets; if (resize) { @@ -1398,7 +1664,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) ret = 0; err: - kvfree(buckets_nouse); if (bucket_gens) call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu); diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index f9af5adabe..8ad4be7386 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -12,7 +12,7 @@ #include "extents.h" #include "sb-members.h" -static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s) +static inline u64 sector_to_bucket(const struct bch_dev *ca, sector_t s) { return div_u64(s, ca->mi.bucket_size); } @@ -30,8 +30,7 @@ static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) return remainder; } -static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, - u32 *offset) +static inline u64 sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, u32 *offset) { return div_u64_rem(s, ca->mi.bucket_size, offset); } @@ -94,7 +93,8 @@ static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) { struct bucket_array *buckets = gc_bucket_array(ca); - BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets); + if (b - buckets->first_bucket >= buckets->nbuckets_minus_first) + return NULL; return buckets->b + b; } @@ -111,30 +111,35 @@ static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) { struct bucket_gens *gens = bucket_gens(ca); - BUG_ON(b < gens->first_bucket || b >= gens->nbuckets); + if (b - gens->first_bucket >= gens->nbuckets_minus_first) + return NULL; return gens->b + b; } +static inline u8 bucket_gen_get(struct bch_dev *ca, size_t b) +{ + rcu_read_lock(); + u8 gen = *bucket_gen(ca, b); + rcu_read_unlock(); + return gen; +} + static inline size_t PTR_BUCKET_NR(const struct 
bch_dev *ca, const struct bch_extent_ptr *ptr) { return sector_to_bucket(ca, ptr->offset); } -static inline struct bpos PTR_BUCKET_POS(const struct bch_fs *c, - const struct bch_extent_ptr *ptr) +static inline struct bpos PTR_BUCKET_POS(const struct bch_dev *ca, + const struct bch_extent_ptr *ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); } -static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_fs *c, +static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_dev *ca, const struct bch_extent_ptr *ptr, u32 *bucket_offset) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - return POS(ptr->dev, sector_to_bucket_and_offset(ca, ptr->offset, bucket_offset)); } @@ -175,17 +180,22 @@ static inline int gen_after(u8 a, u8 b) return r > 0 ? r : 0; } +static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr) +{ + u8 *gen = bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)); + if (!gen) + return -1; + return gen_after(*gen, ptr->gen); +} + /** - * ptr_stale() - check if a pointer points into a bucket that has been + * dev_ptr_stale() - check if a pointer points into a bucket that has been * invalidated. 
*/ -static inline u8 ptr_stale(struct bch_dev *ca, - const struct bch_extent_ptr *ptr) +static inline int dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr) { - u8 ret; - rcu_read_lock(); - ret = gen_after(*bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)), ptr->gen); + int ret = dev_ptr_stale_rcu(ca, ptr); rcu_read_unlock(); return ret; @@ -306,8 +316,6 @@ bch2_fs_usage_read_short(struct bch_fs *); void bch2_dev_usage_update(struct bch_fs *, struct bch_dev *, const struct bch_alloc_v4 *, const struct bch_alloc_v4 *, u64, bool); -void bch2_dev_usage_update_m(struct bch_fs *, struct bch_dev *, - struct bucket *, struct bucket *); /* key/bucket marking: */ @@ -333,27 +341,29 @@ int bch2_replicas_deltas_realloc(struct btree_trans *, unsigned); void bch2_fs_usage_initialize(struct bch_fs *); -int bch2_check_bucket_ref(struct btree_trans *, struct bkey_s_c, - const struct bch_extent_ptr *, - s64, enum bch_data_type, u8, u8, u32); +int bch2_bucket_ref_update(struct btree_trans *, struct bch_dev *, + struct bkey_s_c, const struct bch_extent_ptr *, + s64, enum bch_data_type, u8, u8, u32 *); -int bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, - size_t, enum bch_data_type, unsigned, - struct gc_pos, unsigned); +int bch2_check_fix_ptrs(struct btree_trans *, + enum btree_id, unsigned, struct bkey_s_c, + enum btree_iter_update_trigger_flags); int bch2_trigger_extent(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); int bch2_trigger_reservation(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); #define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\ ({ \ int ret = 0; \ \ if (_old.k->type) \ - ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_INSERT); \ + ret = _fn(_trans, 
_btree_id, _level, _old, _flags & ~BTREE_TRIGGER_insert); \ if (!ret && _new.k->type) \ - ret = _fn(_trans, _btree_id, _level, _new.s_c, _flags & ~BTREE_TRIGGER_OVERWRITE);\ + ret = _fn(_trans, _btree_id, _level, _new.s_c, _flags & ~BTREE_TRIGGER_overwrite);\ ret; \ }) @@ -362,9 +372,13 @@ void bch2_trans_account_disk_usage_change(struct btree_trans *); void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *); int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); -int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, - size_t, enum bch_data_type, unsigned); -int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *); +int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, u64, + enum bch_data_type, unsigned, + enum btree_iter_update_trigger_flags); +int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *, + enum btree_iter_update_trigger_flags); +int bch2_trans_mark_dev_sbs_flags(struct bch_fs *, + enum btree_iter_update_trigger_flags); int bch2_trans_mark_dev_sbs(struct bch_fs *); static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) @@ -464,6 +478,9 @@ static inline u64 avail_factor(u64 r) return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1); } +void bch2_buckets_nouse_free(struct bch_fs *); +int bch2_buckets_nouse_alloc(struct bch_fs *); + int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64); void bch2_dev_buckets_free(struct bch_dev *); int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *); diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 6a31740222..f636e17c4c 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -22,6 +22,7 @@ struct bucket_array { struct rcu_head rcu; u16 first_bucket; size_t nbuckets; + size_t nbuckets_minus_first; struct bucket b[]; }; @@ -29,6 +30,7 @@ struct bucket_gens { struct rcu_head rcu; u16 first_bucket; size_t nbuckets; + size_t 
nbuckets_minus_first; u8 b[]; }; diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 4d14f19f51..6d82e1165a 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -32,12 +32,7 @@ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev, if (dev >= c->sb.nr_devices) return ERR_PTR(-EINVAL); - rcu_read_lock(); - ca = rcu_dereference(c->devs[dev]); - if (ca) - percpu_ref_get(&ca->ref); - rcu_read_unlock(); - + ca = bch2_dev_tryget_noerror(c, dev); if (!ca) return ERR_PTR(-EINVAL); } else { @@ -221,7 +216,8 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a ret = PTR_ERR_OR_ZERO(optstr) ?: bch2_parse_mount_opts(NULL, &thr->opts, optstr); - kfree(optstr); + if (!IS_ERR(optstr)) + kfree(optstr); if (ret) goto err; @@ -324,7 +320,8 @@ static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg) return ret; ret = bch2_dev_add(c, path); - kfree(path); + if (!IS_ERR(path)) + kfree(path); return ret; } @@ -391,7 +388,7 @@ static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg) return PTR_ERR(ca); ret = bch2_dev_offline(c, ca, arg.flags); - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); return ret; } @@ -420,7 +417,7 @@ static long bch2_ioctl_disk_set_state(struct bch_fs *c, if (ret) bch_err(c, "Error setting device state: %s", bch2_err_str(ret)); - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); return ret; } @@ -615,7 +612,7 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, arg.d[i].fragmented = src.d[i].fragmented; } - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); return copy_to_user_errcode(user_arg, &arg, sizeof(arg)); } @@ -667,7 +664,7 @@ static long bch2_ioctl_dev_usage_v2(struct bch_fs *c, goto err; } err: - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); return ret; } @@ -689,11 +686,9 @@ static long bch2_ioctl_read_super(struct bch_fs *c, if (arg.flags & BCH_READ_DEV) { ca = bch2_device_lookup(c, arg.dev, arg.flags); - - if (IS_ERR(ca)) { - ret = PTR_ERR(ca); 
- goto err; - } + ret = PTR_ERR_OR_ZERO(ca); + if (ret) + goto err_unlock; sb = ca->disk_sb.sb; } else { @@ -708,8 +703,8 @@ static long bch2_ioctl_read_super(struct bch_fs *c, ret = copy_to_user_errcode((void __user *)(unsigned long)arg.sb, sb, vstruct_bytes(sb)); err: - if (!IS_ERR_OR_NULL(ca)) - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); +err_unlock: mutex_unlock(&c->sb_lock); return ret; } @@ -753,7 +748,7 @@ static long bch2_ioctl_disk_resize(struct bch_fs *c, ret = bch2_dev_resize(c, ca, arg.nbuckets); - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); return ret; } @@ -779,7 +774,7 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, ret = bch2_set_nr_journal_buckets(c, ca, arg.nbuckets); - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); return ret; } @@ -857,7 +852,8 @@ static long bch2_ioctl_fsck_online(struct bch_fs *c, ret = PTR_ERR_OR_ZERO(optstr) ?: bch2_parse_mount_opts(c, &thr->opts, optstr); - kfree(optstr); + if (!IS_ERR(optstr)) + kfree(optstr); if (ret) goto err; @@ -961,7 +957,9 @@ static const struct file_operations bch_chardev_fops = { }; static int bch_chardev_major; -static struct class *bch_chardev_class; +static const struct class bch_chardev_class = { + .name = "bcachefs", +}; static struct device *bch_chardev; void bch2_fs_chardev_exit(struct bch_fs *c) @@ -978,7 +976,7 @@ int bch2_fs_chardev_init(struct bch_fs *c) if (c->minor < 0) return c->minor; - c->chardev = device_create(bch_chardev_class, NULL, + c->chardev = device_create(&bch_chardev_class, NULL, MKDEV(bch_chardev_major, c->minor), c, "bcachefs%u-ctl", c->minor); if (IS_ERR(c->chardev)) @@ -989,32 +987,39 @@ int bch2_fs_chardev_init(struct bch_fs *c) void bch2_chardev_exit(void) { - if (!IS_ERR_OR_NULL(bch_chardev_class)) - device_destroy(bch_chardev_class, - MKDEV(bch_chardev_major, U8_MAX)); - if (!IS_ERR_OR_NULL(bch_chardev_class)) - class_destroy(bch_chardev_class); + device_destroy(&bch_chardev_class, MKDEV(bch_chardev_major, U8_MAX)); + 
class_unregister(&bch_chardev_class); if (bch_chardev_major > 0) unregister_chrdev(bch_chardev_major, "bcachefs"); } int __init bch2_chardev_init(void) { + int ret; + bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops); if (bch_chardev_major < 0) return bch_chardev_major; - bch_chardev_class = class_create("bcachefs"); - if (IS_ERR(bch_chardev_class)) - return PTR_ERR(bch_chardev_class); + ret = class_register(&bch_chardev_class); + if (ret) + goto major_out; - bch_chardev = device_create(bch_chardev_class, NULL, + bch_chardev = device_create(&bch_chardev_class, NULL, MKDEV(bch_chardev_major, U8_MAX), NULL, "bcachefs-ctl"); - if (IS_ERR(bch_chardev)) - return PTR_ERR(bch_chardev); + if (IS_ERR(bch_chardev)) { + ret = PTR_ERR(bch_chardev); + goto class_out; + } return 0; + +class_out: + class_unregister(&bch_chardev_class); +major_out: + unregister_chrdev(bch_chardev_major, "bcachefs-ctl"); + return ret; } #endif /* NO_BCACHEFS_CHARDEV */ diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 088fd2e7bd..3bd3aba90d 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -233,7 +233,7 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, return ret; } default: - BUG(); + return (struct bch_csum) {}; } } @@ -307,7 +307,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, return ret; } default: - BUG(); + return (struct bch_csum) {}; } } @@ -352,8 +352,12 @@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type, bytes += bv.bv_len; } - sg_mark_end(sg - 1); - return do_encrypt_sg(c->chacha20, nonce, sgl, bytes); + if (sg != sgl) { + sg_mark_end(sg - 1); + return do_encrypt_sg(c->chacha20, nonce, sgl, bytes); + } + + return ret; } struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a, @@ -469,9 +473,8 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, /* BCH_SB_FIELD_crypt: */ -static int bch2_sb_crypt_validate(struct bch_sb *sb, - struct bch_sb_field *f, - 
struct printbuf *err) +static int bch2_sb_crypt_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); @@ -494,14 +497,10 @@ static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb, { struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); - prt_printf(out, "KFD: %llu", BCH_CRYPT_KDF_TYPE(crypt)); - prt_newline(out); - prt_printf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt)); - prt_newline(out); - prt_printf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt)); - prt_newline(out); - prt_printf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt)); - prt_newline(out); + prt_printf(out, "KFD: %llu\n", BCH_CRYPT_KDF_TYPE(crypt)); + prt_printf(out, "scrypt n: %llu\n", BCH_KDF_SCRYPT_N(crypt)); + prt_printf(out, "scrypt r: %llu\n", BCH_KDF_SCRYPT_R(crypt)); + prt_printf(out, "scrypt p: %llu\n", BCH_KDF_SCRYPT_P(crypt)); } const struct bch_sb_field_ops bch_sb_field_ops_crypt = { @@ -653,26 +652,26 @@ err: static int bch2_alloc_ciphers(struct bch_fs *c) { - int ret; - - if (!c->chacha20) - c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); - ret = PTR_ERR_OR_ZERO(c->chacha20); + if (c->chacha20) + return 0; + struct crypto_sync_skcipher *chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); + int ret = PTR_ERR_OR_ZERO(chacha20); if (ret) { bch_err(c, "error requesting chacha20 module: %s", bch2_err_str(ret)); return ret; } - if (!c->poly1305) - c->poly1305 = crypto_alloc_shash("poly1305", 0, 0); - ret = PTR_ERR_OR_ZERO(c->poly1305); - + struct crypto_shash *poly1305 = crypto_alloc_shash("poly1305", 0, 0); + ret = PTR_ERR_OR_ZERO(poly1305); if (ret) { bch_err(c, "error requesting poly1305 module: %s", bch2_err_str(ret)); + crypto_free_sync_skcipher(chacha20); return ret; } + c->chacha20 = chacha20; + c->poly1305 = poly1305; return 0; } @@ -767,11 +766,11 @@ err: void bch2_fs_encryption_exit(struct bch_fs *c) { - if 
(!IS_ERR_OR_NULL(c->poly1305)) + if (c->poly1305) crypto_free_shash(c->poly1305); - if (!IS_ERR_OR_NULL(c->chacha20)) + if (c->chacha20) crypto_free_sync_skcipher(c->chacha20); - if (!IS_ERR_OR_NULL(c->sha256)) + if (c->sha256) crypto_free_shash(c->sha256); } @@ -784,6 +783,7 @@ int bch2_fs_encryption_init(struct bch_fs *c) c->sha256 = crypto_alloc_shash("sha256", 0, 0); ret = PTR_ERR_OR_ZERO(c->sha256); if (ret) { + c->sha256 = NULL; bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret)); goto out; } diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c index 3636444511..0f40b585ce 100644 --- a/fs/bcachefs/clock.c +++ b/fs/bcachefs/clock.c @@ -132,14 +132,9 @@ static struct io_timer *get_expired_timer(struct io_clock *clock, { struct io_timer *ret = NULL; - spin_lock(&clock->timer_lock); - if (clock->timers.used && time_after_eq(now, clock->timers.data[0]->expire)) heap_pop(&clock->timers, ret, io_timer_cmp, NULL); - - spin_unlock(&clock->timer_lock); - return ret; } @@ -148,8 +143,10 @@ void __bch2_increment_clock(struct io_clock *clock, unsigned sectors) struct io_timer *timer; unsigned long now = atomic64_add_return(sectors, &clock->now); + spin_lock(&clock->timer_lock); while ((timer = get_expired_timer(clock, now))) timer->fn(timer); + spin_unlock(&clock->timer_lock); } void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 0022b51ce3..0087b8555e 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -5,7 +5,9 @@ #include "bkey_buf.h" #include "btree_update.h" #include "buckets.h" +#include "compress.h" #include "data_update.h" +#include "disk_groups.h" #include "ec.h" #include "error.h" #include "extents.h" @@ -106,7 +108,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, bch2_trans_iter_init(trans, &iter, m->btree_id, bkey_start_pos(&bch2_keylist_front(keys)->k), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + 
BTREE_ITER_slots|BTREE_ITER_intent); while (1) { struct bkey_s_c k; @@ -202,6 +204,7 @@ restart_drop_conflicting_replicas: bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i)); /* Now, drop excess replicas: */ + rcu_read_lock(); restart_drop_extra_replicas: bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) { unsigned ptr_durability = bch2_extent_ptr_durability(c, &p); @@ -214,6 +217,7 @@ restart_drop_extra_replicas: goto restart_drop_extra_replicas; } } + rcu_read_unlock(); /* Finally, add the pointers we just wrote: */ extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) @@ -288,7 +292,7 @@ restart_drop_extra_replicas: k.k->p, insert->k.p) ?: bch2_bkey_set_needs_rebalance(c, insert, &op->opts) ?: bch2_trans_update(trans, &iter, insert, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + BTREE_UPDATE_internal_snapshot_node) ?: bch2_trans_commit(trans, &op->res, NULL, BCH_TRANS_COMMIT_no_check_rw| @@ -357,10 +361,11 @@ void bch2_data_update_exit(struct data_update *update) bch2_bkey_ptrs_c(bkey_i_to_s_c(update->k.k)); bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); if (c->opts.nocow_enabled) bch2_bucket_nocow_unlock(&c->nocow_locks, - PTR_BUCKET_POS(c, ptr), 0); - percpu_ref_put(&bch_dev_bkey_exists(c, ptr->dev)->ref); + PTR_BUCKET_POS(ca, ptr), 0); + bch2_dev_put(ca); } bch2_bkey_buf_exit(&update->k, c); @@ -386,8 +391,10 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, while (bio_sectors(bio)) { unsigned sectors = bio_sectors(bio); + bch2_trans_begin(trans); + bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos, - BTREE_ITER_SLOTS); + BTREE_ITER_slots); ret = lockrestart_do(trans, ({ k = bch2_btree_iter_peek_slot(&iter); bkey_err(k); @@ -449,6 +456,38 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, } } +void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) 
+{ + printbuf_tabstop_push(out, 20); + prt_str(out, "rewrite ptrs:\t"); + bch2_prt_u64_base2(out, data_opts->rewrite_ptrs); + prt_newline(out); + + prt_str(out, "kill ptrs:\t"); + bch2_prt_u64_base2(out, data_opts->kill_ptrs); + prt_newline(out); + + prt_str(out, "target:\t"); + bch2_target_to_text(out, c, data_opts->target); + prt_newline(out); + + prt_str(out, "compression:\t"); + bch2_compression_opt_to_text(out, background_compression(*io_opts)); + prt_newline(out); + + prt_str(out, "extra replicas:\t"); + prt_u64(out, data_opts->extra_replicas); +} + +void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) +{ + bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); + prt_newline(out); + bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); +} + int bch2_extent_drop_ptrs(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, @@ -465,7 +504,6 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, while (data_opts.kill_ptrs) { unsigned i = 0, drop = __fls(data_opts.kill_ptrs); - struct bch_extent_ptr *ptr; bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop); data_opts.kill_ptrs ^= 1U << drop; @@ -480,15 +518,15 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, /* * Since we're not inserting through an extent iterator - * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), + * (BTREE_ITER_all_snapshots iterators aren't extent iterators), * we aren't using the extent overwrite path to delete, we're * just using the normal key deletion path: */ - if (bkey_deleted(&n->k) && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + if (bkey_deleted(&n->k) && !(iter->flags & BTREE_ITER_is_extents)) n->k.size = 0; return bch2_trans_relock(trans) ?: - bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); } @@ -539,15 +577,26 @@ int 
bch2_data_update_init(struct btree_trans *trans, m->op.compression_opt = background_compression(io_opts); m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; - bkey_for_each_ptr(ptrs, ptr) - percpu_ref_get(&bch_dev_bkey_exists(c, ptr->dev)->ref); + bkey_for_each_ptr(ptrs, ptr) { + if (!bch2_dev_tryget(c, ptr->dev)) { + bkey_for_each_ptr(ptrs, ptr2) { + if (ptr2 == ptr) + break; + bch2_dev_put(bch2_dev_have_ref(c, ptr2->dev)); + } + return -BCH_ERR_data_update_done; + } + } unsigned durability_have = 0, durability_removing = 0; i = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + struct bch_dev *ca = bch2_dev_have_ref(c, p.ptr.dev); + struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr); bool locked; + rcu_read_lock(); if (((1U << i) & m->data_opts.rewrite_ptrs)) { BUG_ON(p.ptr.cached); @@ -561,6 +610,7 @@ int bch2_data_update_init(struct btree_trans *trans, bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); durability_have += bch2_extent_ptr_durability(c, &p); } + rcu_read_unlock(); /* * op->csum_type is normally initialized from the fs/file's @@ -579,15 +629,13 @@ int bch2_data_update_init(struct btree_trans *trans, if (ctxt) { move_ctxt_wait_event(ctxt, (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, - PTR_BUCKET_POS(c, &p.ptr), 0)) || + bucket, 0)) || list_empty(&ctxt->ios)); if (!locked) - bch2_bucket_nocow_lock(&c->nocow_locks, - PTR_BUCKET_POS(c, &p.ptr), 0); + bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0); } else { - if (!bch2_bucket_nocow_trylock(&c->nocow_locks, - PTR_BUCKET_POS(c, &p.ptr), 0)) { + if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) { ret = -BCH_ERR_nocow_lock_blocked; goto err; } @@ -629,6 +677,16 @@ int bch2_data_update_init(struct btree_trans *trans, if (!(durability_have + durability_removing)) m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1); + if (!m->op.nr_replicas) { + struct printbuf buf = PRINTBUF; + + bch2_data_update_to_text(&buf, m); + WARN(1, "trying to move an extent, but 
nr_replicas=0\n%s", buf.buf); + printbuf_exit(&buf); + ret = -BCH_ERR_data_update_done; + goto done; + } + m->op.nr_replicas_required = m->op.nr_replicas; if (reserve_sectors) { @@ -649,10 +707,11 @@ int bch2_data_update_init(struct btree_trans *trans, err: i = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + struct bch_dev *ca = bch2_dev_have_ref(c, p.ptr.dev); + struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr); if ((1U << i) & ptrs_locked) - bch2_bucket_nocow_unlock(&c->nocow_locks, - PTR_BUCKET_POS(c, &p.ptr), 0); - percpu_ref_put(&bch_dev_bkey_exists(c, p.ptr.dev)->ref); + bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); + bch2_dev_put(ca); i++; } diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h index 991095bbd4..8d36365bde 100644 --- a/fs/bcachefs/data_update.h +++ b/fs/bcachefs/data_update.h @@ -17,6 +17,9 @@ struct data_update_opts { unsigned write_flags; }; +void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *, + struct bch_io_opts *, struct data_update_opts *); + struct data_update { /* extent being updated: */ enum btree_id btree_id; @@ -27,6 +30,8 @@ struct data_update { struct bch_write_op op; }; +void bch2_data_update_to_text(struct printbuf *, struct data_update *); + int bch2_data_update_index_update(struct bch_write_op *); void bch2_data_update_read_done(struct data_update *, diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index cd99b73994..ebabab171f 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -37,11 +37,11 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, struct btree_node *n_ondisk = c->verify_ondisk; struct btree_node *n_sorted = c->verify_data->data; struct bset *sorted, *inmemory = &b->data->keys; - struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev); struct bio *bio; bool failed = false, saw_error = false; - if (!bch2_dev_get_ioref(ca, READ)) + struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); + if (!ca) return false; bio = 
bio_alloc_bioset(ca->disk_sb.bdev, @@ -194,8 +194,8 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, return; } - ca = bch_dev_bkey_exists(c, pick.ptr.dev); - if (!bch2_dev_get_ioref(ca, READ)) { + ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); + if (!ca) { prt_printf(out, "error getting device to read from: not online\n"); return; } @@ -375,8 +375,8 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, return flush_buf(i) ?: bch2_trans_run(i->c, for_each_btree_key(trans, iter, i->id, i->from, - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ({ + BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots, k, ({ bch2_bkey_val_to_text(&i->buf, i->c, k); prt_newline(&i->buf); bch2_trans_unlock(trans); @@ -459,8 +459,8 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, return flush_buf(i) ?: bch2_trans_run(i->c, for_each_btree_key(trans, iter, i->id, i->from, - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ({ + BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots, k, ({ struct btree_path_level *l = &btree_iter_path(trans, &iter)->l[0]; struct bkey_packed *_k = @@ -492,51 +492,26 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs * if (!out->nr_tabstops) printbuf_tabstop_push(out, 32); - prt_printf(out, "%px btree=%s l=%u ", - b, - bch2_btree_id_str(b->c.btree_id), - b->c.level); - prt_newline(out); + prt_printf(out, "%px btree=%s l=%u\n", b, bch2_btree_id_str(b->c.btree_id), b->c.level); printbuf_indent_add(out, 2); bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); prt_newline(out); - prt_printf(out, "flags: "); - prt_tab(out); + prt_printf(out, "flags:\t"); prt_bitflags(out, bch2_btree_node_flags, b->flags); prt_newline(out); - prt_printf(out, "pcpu read locks: "); - prt_tab(out); - prt_printf(out, "%u", b->c.lock.readers != NULL); - prt_newline(out); - - prt_printf(out, "written:"); - prt_tab(out); - prt_printf(out, "%u", b->written); - prt_newline(out); - - 
prt_printf(out, "writes blocked:"); - prt_tab(out); - prt_printf(out, "%u", !list_empty_careful(&b->write_blocked)); - prt_newline(out); - - prt_printf(out, "will make reachable:"); - prt_tab(out); - prt_printf(out, "%lx", b->will_make_reachable); - prt_newline(out); - - prt_printf(out, "journal pin %px:", &b->writes[0].journal); - prt_tab(out); - prt_printf(out, "%llu", b->writes[0].journal.seq); - prt_newline(out); + prt_printf(out, "pcpu read locks:\t%u\n", b->c.lock.readers != NULL); + prt_printf(out, "written:\t%u\n", b->written); + prt_printf(out, "writes blocked:\t%u\n", !list_empty_careful(&b->write_blocked)); + prt_printf(out, "will make reachable:\t%lx\n", b->will_make_reachable); - prt_printf(out, "journal pin %px:", &b->writes[1].journal); - prt_tab(out); - prt_printf(out, "%llu", b->writes[1].journal.seq); - prt_newline(out); + prt_printf(out, "journal pin %px:\t%llu\n", + &b->writes[0].journal, b->writes[0].journal.seq); + prt_printf(out, "journal pin %px:\t%llu\n", + &b->writes[1].journal, b->writes[1].journal.seq); printbuf_indent_sub(out, 2); } @@ -593,6 +568,32 @@ static const struct file_operations cached_btree_nodes_ops = { .read = bch2_cached_btree_nodes_read, }; +typedef int (*list_cmp_fn)(const struct list_head *l, const struct list_head *r); + +static void list_sort(struct list_head *head, list_cmp_fn cmp) +{ + struct list_head *pos; + + list_for_each(pos, head) + while (!list_is_last(pos, head) && + cmp(pos, pos->next) > 0) { + struct list_head *pos2, *next = pos->next; + + list_del(next); + list_for_each(pos2, head) + if (cmp(next, pos2) < 0) + goto pos_found; + BUG(); +pos_found: + list_add_tail(next, pos2); + } +} + +static int list_ptr_order_cmp(const struct list_head *l, const struct list_head *r) +{ + return cmp_int(l, r); +} + static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { @@ -600,42 +601,39 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user 
*buf, struct bch_fs *c = i->c; struct btree_trans *trans; ssize_t ret = 0; - u32 seq; i->ubuf = buf; i->size = size; i->ret = 0; restart: seqmutex_lock(&c->btree_trans_lock); - list_for_each_entry(trans, &c->btree_trans_list, list) { - struct task_struct *task = READ_ONCE(trans->locking_wait.task); + list_sort(&c->btree_trans_list, list_ptr_order_cmp); - if (!task || task->pid <= i->iter) + list_for_each_entry(trans, &c->btree_trans_list, list) { + if ((ulong) trans <= i->iter) continue; - closure_get(&trans->ref); - seq = seqmutex_seq(&c->btree_trans_lock); - seqmutex_unlock(&c->btree_trans_lock); + i->iter = (ulong) trans; - ret = flush_buf(i); - if (ret) { - closure_put(&trans->ref); - goto unlocked; - } + if (!closure_get_not_zero(&trans->ref)) + continue; + + u32 seq = seqmutex_unlock(&c->btree_trans_lock); bch2_btree_trans_to_text(&i->buf, trans); - prt_printf(&i->buf, "backtrace:"); - prt_newline(&i->buf); + prt_printf(&i->buf, "backtrace:\n"); printbuf_indent_add(&i->buf, 2); - bch2_prt_task_backtrace(&i->buf, task, 0, GFP_KERNEL); + bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task, 0, GFP_KERNEL); printbuf_indent_sub(&i->buf, 2); prt_newline(&i->buf); - i->iter = task->pid; - closure_put(&trans->ref); + ret = flush_buf(i); + if (ret) + goto unlocked; + if (!seqmutex_relock(&c->btree_trans_lock, seq)) goto restart; } @@ -782,25 +780,20 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf, !bch2_btree_transaction_fns[i->iter]) break; - prt_printf(&i->buf, "%s: ", bch2_btree_transaction_fns[i->iter]); - prt_newline(&i->buf); + prt_printf(&i->buf, "%s:\n", bch2_btree_transaction_fns[i->iter]); printbuf_indent_add(&i->buf, 2); mutex_lock(&s->lock); - prt_printf(&i->buf, "Max mem used: %u", s->max_mem); - prt_newline(&i->buf); - - prt_printf(&i->buf, "Transaction duration:"); - prt_newline(&i->buf); + prt_printf(&i->buf, "Max mem used: %u\n", s->max_mem); + prt_printf(&i->buf, "Transaction duration:\n"); 
printbuf_indent_add(&i->buf, 2); bch2_time_stats_to_text(&i->buf, &s->duration); printbuf_indent_sub(&i->buf, 2); if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) { - prt_printf(&i->buf, "Lock hold times:"); - prt_newline(&i->buf); + prt_printf(&i->buf, "Lock hold times:\n"); printbuf_indent_add(&i->buf, 2); bch2_time_stats_to_text(&i->buf, &s->lock_hold_times); @@ -808,8 +801,7 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf, } if (s->max_paths_text) { - prt_printf(&i->buf, "Maximum allocated btree paths (%u):", s->nr_max_paths); - prt_newline(&i->buf); + prt_printf(&i->buf, "Maximum allocated btree paths (%u):\n", s->nr_max_paths); printbuf_indent_add(&i->buf, 2); prt_str_indented(&i->buf, s->max_paths_text); @@ -836,50 +828,55 @@ static const struct file_operations btree_transaction_stats_op = { .read = btree_transaction_stats_read, }; -static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) +/* walk btree transactions until we find a deadlock and print it */ +static void btree_deadlock_to_text(struct printbuf *out, struct bch_fs *c) { - struct dump_iter *i = file->private_data; - struct bch_fs *c = i->c; struct btree_trans *trans; - ssize_t ret = 0; - u32 seq; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - if (i->iter) - goto out; + ulong iter = 0; restart: seqmutex_lock(&c->btree_trans_lock); - list_for_each_entry(trans, &c->btree_trans_list, list) { - struct task_struct *task = READ_ONCE(trans->locking_wait.task); + list_sort(&c->btree_trans_list, list_ptr_order_cmp); - if (!task || task->pid <= i->iter) + list_for_each_entry(trans, &c->btree_trans_list, list) { + if ((ulong) trans <= iter) continue; - closure_get(&trans->ref); - seq = seqmutex_seq(&c->btree_trans_lock); - seqmutex_unlock(&c->btree_trans_lock); + iter = (ulong) trans; - ret = flush_buf(i); - if (ret) { - closure_put(&trans->ref); - goto out; - } + if (!closure_get_not_zero(&trans->ref)) + continue; - 
bch2_check_for_deadlock(trans, &i->buf); + u32 seq = seqmutex_unlock(&c->btree_trans_lock); - i->iter = task->pid; + bool found = bch2_check_for_deadlock(trans, out) != 0; closure_put(&trans->ref); + if (found) + return; + if (!seqmutex_relock(&c->btree_trans_lock, seq)) goto restart; } seqmutex_unlock(&c->btree_trans_lock); -out: +} + +static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct bch_fs *c = i->c; + ssize_t ret = 0; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + if (!i->iter) { + btree_deadlock_to_text(&i->buf, c); + i->iter++; + } + if (i->buf.allocation_failure) ret = -ENOMEM; diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index d37bd07afb..c67460d820 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -15,6 +15,9 @@ static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) { + if (bkey_val_bytes(d.k) < offsetof(struct bch_dirent, d_name)) + return 0; + unsigned bkey_u64s = bkey_val_u64s(d.k); unsigned bkey_bytes = bkey_u64s * sizeof(u64); u64 last_u64 = ((u64*)d.v)[bkey_u64s - 1]; @@ -98,7 +101,7 @@ const struct bch_hash_desc bch2_dirent_hash_desc = { }; int bch2_dirent_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); @@ -118,7 +121,7 @@ int bch2_dirent_invalid(struct bch_fs *c, struct bkey_s_c k, * Check new keys don't exceed the max length * (older keys may be larger.) 
*/ - bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) && d_name.len > BCH_NAME_MAX, c, err, + bkey_fsck_err_on((flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX, c, err, dirent_name_too_long, "dirent name too big (%u > %u)", d_name.len, BCH_NAME_MAX); @@ -205,7 +208,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans, const struct bch_hash_info *hash_info, u8 type, const struct qstr *name, u64 dst_inum, u64 *dir_offset, - bch_str_hash_flags_t str_hash_flags) + enum btree_iter_update_trigger_flags flags) { subvol_inum dir_inum = { .subvol = dir_subvol, .inum = dir }; struct bkey_i_dirent *dirent; @@ -220,9 +223,8 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans, dirent->k.p.snapshot = snapshot; ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, - dir_inum, snapshot, - &dirent->k_i, str_hash_flags, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + dir_inum, snapshot, &dirent->k_i, + flags|BTREE_UPDATE_internal_snapshot_node); *dir_offset = dirent->k.p.offset; return ret; @@ -232,7 +234,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, const struct bch_hash_info *hash_info, u8 type, const struct qstr *name, u64 dst_inum, u64 *dir_offset, - bch_str_hash_flags_t str_hash_flags) + enum btree_iter_update_trigger_flags flags) { struct bkey_i_dirent *dirent; int ret; @@ -243,7 +245,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, return ret; ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, - dir, &dirent->k_i, str_hash_flags); + dir, &dirent->k_i, flags); *dir_offset = dirent->k.p.offset; return ret; @@ -272,7 +274,7 @@ int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, } else { target->subvol = le32_to_cpu(d.v->d_child_subvol); - ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_CACHED, &s); + ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_cached, &s); target->inum = le64_to_cpu(s.inode); } @@ -301,13 +303,9 @@ int 
bch2_dirent_rename(struct btree_trans *trans, memset(dst_inum, 0, sizeof(*dst_inum)); /* Lookup src: */ - ret = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc, - src_hash, src_dir, src_name, - BTREE_ITER_INTENT); - if (ret) - goto out; - - old_src = bch2_btree_iter_peek_slot(&src_iter); + old_src = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc, + src_hash, src_dir, src_name, + BTREE_ITER_intent); ret = bkey_err(old_src); if (ret) goto out; @@ -329,13 +327,9 @@ int bch2_dirent_rename(struct btree_trans *trans, if (ret) goto out; } else { - ret = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc, - dst_hash, dst_dir, dst_name, - BTREE_ITER_INTENT); - if (ret) - goto out; - - old_dst = bch2_btree_iter_peek_slot(&dst_iter); + old_dst = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc, + dst_hash, dst_dir, dst_name, + BTREE_ITER_intent); ret = bkey_err(old_dst); if (ret) goto out; @@ -450,7 +444,7 @@ out_set_src: if (delete_src) { bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot); ret = bch2_btree_iter_traverse(&src_iter) ?: - bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_internal_snapshot_node); if (ret) goto out; } @@ -458,7 +452,7 @@ out_set_src: if (delete_dst) { bch2_btree_iter_set_snapshot(&dst_iter, old_dst.k->p.snapshot); ret = bch2_btree_iter_traverse(&dst_iter) ?: - bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_internal_snapshot_node); if (ret) goto out; } @@ -479,13 +473,9 @@ int bch2_dirent_lookup_trans(struct btree_trans *trans, const struct qstr *name, subvol_inum *inum, unsigned flags) { - int ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, - hash_info, dir, name, flags); - if (ret) - return ret; - - struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); + struct bkey_s_c k = bch2_hash_lookup(trans, iter, 
bch2_dirent_hash_desc, + hash_info, dir, name, flags); + int ret = bkey_err(k); if (ret) goto err; @@ -541,16 +531,26 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) bch2_empty_dir_snapshot(trans, dir.inum, dir.subvol, snapshot); } +static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subvol_inum target) +{ + struct qstr name = bch2_dirent_get_name(d); + bool ret = dir_emit(ctx, name.name, + name.len, + target.inum, + vfs_d_type(d.v->d_type)); + if (ret) + ctx->pos = d.k->p.offset + 1; + return ret; +} + int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) { struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; - struct bkey_s_c_dirent dirent; subvol_inum target; u32 snapshot; struct bkey_buf sk; - struct qstr name; int ret; bch2_bkey_buf_init(&sk); @@ -567,7 +567,9 @@ retry: if (k.k->type != KEY_TYPE_dirent) continue; - dirent = bkey_s_c_to_dirent(k); + /* dir_emit() can fault and block: */ + bch2_bkey_buf_reassemble(&sk, c, k); + struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k); ret = bch2_dirent_read_target(trans, inum, dirent, &target); if (ret < 0) @@ -575,28 +577,22 @@ retry: if (ret) continue; - /* dir_emit() can fault and block: */ - bch2_bkey_buf_reassemble(&sk, c, k); - dirent = bkey_i_to_s_c_dirent(sk.k); - bch2_trans_unlock(trans); - - name = bch2_dirent_get_name(dirent); - - ctx->pos = dirent.k->p.offset; - if (!dir_emit(ctx, name.name, - name.len, - target.inum, - vfs_d_type(dirent.v->d_type))) - break; - ctx->pos = dirent.k->p.offset + 1; - /* * read_target looks up subvolumes, we can overflow paths if the * directory has many subvolumes in it + * + * XXX: btree_trans_too_many_iters() is something we'd like to + * get rid of, and there's no good reason to be using it here + * except that we don't yet have a for_each_btree_key() helper + * that does subvolume_get_snapshot(). 
*/ - ret = btree_trans_too_many_iters(trans); - if (ret) + ret = drop_locks_do(trans, + bch2_dir_emit(ctx, dirent, target)) ?: + btree_trans_too_many_iters(trans); + if (ret) { + ret = ret < 0 ? ret : 0; break; + } } bch2_trans_iter_exit(trans, &iter); err: diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index bee55cca2a..24037e6e0a 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -4,11 +4,11 @@ #include "str_hash.h" -enum bkey_invalid_flags; +enum bch_validate_flags; extern const struct bch_hash_desc bch2_dirent_hash_desc; int bch2_dirent_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_dirent ((struct bkey_ops) { \ @@ -38,11 +38,11 @@ int bch2_dirent_read_target(struct btree_trans *, subvol_inum, int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32, const struct bch_hash_info *, u8, const struct qstr *, u64, u64 *, - bch_str_hash_flags_t); + enum btree_iter_update_trigger_flags); int bch2_dirent_create(struct btree_trans *, subvol_inum, const struct bch_hash_info *, u8, const struct qstr *, u64, u64 *, - bch_str_hash_flags_t); + enum btree_iter_update_trigger_flags); static inline unsigned vfs_d_type(unsigned type) { diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index 06a7df529b..521a86df5e 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -18,9 +18,8 @@ static int group_cmp(const void *_l, const void *_r) strncmp(l->label, r->label, sizeof(l->label)); } -static int bch2_sb_disk_groups_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) +static int bch2_sb_disk_groups_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_disk_groups *groups = field_to_type(f, disk_groups); @@ -177,7 +176,7 @@ int 
bch2_sb_disk_groups_to_cpu(struct bch_fs *c) struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, i); struct bch_disk_group_cpu *dst; - if (!bch2_member_exists(&m)) + if (!bch2_member_alive(&m)) continue; g = BCH_MEMBER_GROUP(&m); @@ -523,7 +522,7 @@ int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res, ca = bch2_dev_lookup(c, val); if (!IS_ERR(ca)) { *res = dev_to_target(ca->dev_idx); - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); return 0; } @@ -588,7 +587,7 @@ static void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsi case TARGET_DEV: { struct bch_member m = bch2_sb_member_get(sb, t.dev); - if (bch2_dev_exists(sb, t.dev)) { + if (bch2_member_exists(sb, t.dev)) { prt_printf(out, "Device "); pr_uuid(out, m.uuid.b); prt_printf(out, " (%u)", t.dev); diff --git a/fs/bcachefs/disk_groups_format.h b/fs/bcachefs/disk_groups_format.h new file mode 100644 index 0000000000..698990bbf1 --- /dev/null +++ b/fs/bcachefs/disk_groups_format.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DISK_GROUPS_FORMAT_H +#define _BCACHEFS_DISK_GROUPS_FORMAT_H + +#define BCH_SB_LABEL_SIZE 32 + +struct bch_disk_group { + __u8 label[BCH_SB_LABEL_SIZE]; + __le64 flags[2]; +} __packed __aligned(8); + +LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1) +LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6) +LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24) + +struct bch_sb_field_disk_groups { + struct bch_sb_field field; + struct bch_disk_group entries[]; +} __packed __aligned(8); + +#endif /* _BCACHEFS_DISK_GROUPS_FORMAT_H */ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 0a49c2e995..83e279d418 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -107,7 +107,7 @@ struct ec_bio { /* Stripes btree keys: */ int bch2_stripe_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) 
{ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; @@ -163,146 +163,199 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, /* Triggers: */ -static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, - struct bkey_s_c_stripe s, - unsigned idx, bool deleting) +static int __mark_stripe_bucket(struct btree_trans *trans, + struct bch_dev *ca, + struct bkey_s_c_stripe s, + unsigned ptr_idx, bool deleting, + struct bpos bucket, + struct bch_alloc_v4 *a, + enum btree_iter_update_trigger_flags flags) { - struct bch_fs *c = trans->c; - const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; - struct btree_iter iter; - struct bkey_i_alloc_v4 *a; - enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant - ? BCH_DATA_parity : 0; - s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0; + const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx; + unsigned nr_data = s.v->nr_blocks - s.v->nr_redundant; + bool parity = ptr_idx >= nr_data; + enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe; + s64 sectors = parity ? 
le16_to_cpu(s.v->sectors) : 0; + struct printbuf buf = PRINTBUF; int ret = 0; + struct bch_fs *c = trans->c; if (deleting) sectors = -sectors; - a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr)); - if (IS_ERR(a)) - return PTR_ERR(a); - - ret = bch2_check_bucket_ref(trans, s.s_c, ptr, sectors, data_type, - a->v.gen, a->v.data_type, - a->v.dirty_sectors); - if (ret) - goto err; - if (!deleting) { - if (bch2_trans_inconsistent_on(a->v.stripe || - a->v.stripe_redundancy, trans, - "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)", - iter.pos.inode, iter.pos.offset, a->v.gen, - bch2_data_type_str(a->v.data_type), - a->v.dirty_sectors, - a->v.stripe, s.k->p.offset)) { + if (bch2_trans_inconsistent_on(a->stripe || + a->stripe_redundancy, trans, + "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)\n%s", + bucket.inode, bucket.offset, a->gen, + bch2_data_type_str(a->data_type), + a->dirty_sectors, + a->stripe, s.k->p.offset, + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { ret = -EIO; goto err; } - if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans, - "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu", - iter.pos.inode, iter.pos.offset, a->v.gen, - bch2_data_type_str(a->v.data_type), - a->v.dirty_sectors, - s.k->p.offset)) { + if (bch2_trans_inconsistent_on(parity && bch2_bucket_sectors_total(*a), trans, + "bucket %llu:%llu gen %u data type %s dirty_sectors %u cached_sectors %u: data already in parity bucket\n%s", + bucket.inode, bucket.offset, a->gen, + bch2_data_type_str(a->data_type), + a->dirty_sectors, + a->cached_sectors, + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { ret = -EIO; goto err; } - - a->v.stripe = s.k->p.offset; - a->v.stripe_redundancy = s.v->nr_redundant; - a->v.data_type = BCH_DATA_stripe; } else { - if (bch2_trans_inconsistent_on(a->v.stripe != 
s.k->p.offset || - a->v.stripe_redundancy != s.v->nr_redundant, trans, - "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)", - iter.pos.inode, iter.pos.offset, a->v.gen, - s.k->p.offset, a->v.stripe)) { + if (bch2_trans_inconsistent_on(a->stripe != s.k->p.offset || + a->stripe_redundancy != s.v->nr_redundant, trans, + "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe (got %u)\n%s", + bucket.inode, bucket.offset, a->gen, + a->stripe, + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { ret = -EIO; goto err; } - a->v.stripe = 0; - a->v.stripe_redundancy = 0; - a->v.data_type = alloc_data_type(a->v, BCH_DATA_user); + if (bch2_trans_inconsistent_on(a->data_type != data_type, trans, + "bucket %llu:%llu gen %u data type %s: wrong data type when stripe, should be %s\n%s", + bucket.inode, bucket.offset, a->gen, + bch2_data_type_str(a->data_type), + bch2_data_type_str(data_type), + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { + ret = -EIO; + goto err; + } + + if (bch2_trans_inconsistent_on(parity && + (a->dirty_sectors != -sectors || + a->cached_sectors), trans, + "bucket %llu:%llu gen %u dirty_sectors %u cached_sectors %u: wrong sectors when deleting parity block of stripe\n%s", + bucket.inode, bucket.offset, a->gen, + a->dirty_sectors, + a->cached_sectors, + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { + ret = -EIO; + goto err; + } } - a->v.dirty_sectors += sectors; - if (data_type) - a->v.data_type = !deleting ? 
data_type : 0; + if (sectors) { + ret = bch2_bucket_ref_update(trans, ca, s.s_c, ptr, sectors, data_type, + a->gen, a->data_type, &a->dirty_sectors); + if (ret) + goto err; + } - ret = bch2_trans_update(trans, &iter, &a->k_i, 0); - if (ret) - goto err; + if (!deleting) { + a->stripe = s.k->p.offset; + a->stripe_redundancy = s.v->nr_redundant; + } else { + a->stripe = 0; + a->stripe_redundancy = 0; + } + + alloc_data_type_set(a, data_type); err: - bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); return ret; } static int mark_stripe_bucket(struct btree_trans *trans, - struct bkey_s_c k, - unsigned ptr_idx, - unsigned flags) + struct bkey_s_c_stripe s, + unsigned ptr_idx, bool deleting, + enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; - const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; - unsigned nr_data = s->nr_blocks - s->nr_redundant; - bool parity = ptr_idx >= nr_data; - enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe; - s64 sectors = parity ? 
le16_to_cpu(s->sectors) : 0; - const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket old, new, *g; + const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx; struct printbuf buf = PRINTBUF; int ret = 0; - BUG_ON(!(flags & BTREE_TRIGGER_GC)); - - /* * XXX doesn't handle deletion */ - - percpu_down_read(&c->mark_lock); - g = PTR_GC_BUCKET(ca, ptr); - - if (g->dirty_sectors || - (g->stripe && g->stripe != k.k->p.offset)) { - bch2_fs_inconsistent(c, - "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", - ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - ret = -EINVAL; + struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev); + if (unlikely(!ca)) { + if (!(flags & BTREE_TRIGGER_overwrite)) + ret = -EIO; goto err; } - bucket_lock(g); - old = *g; + struct bpos bucket = PTR_BUCKET_POS(ca, ptr); - ret = bch2_check_bucket_ref(trans, k, ptr, sectors, data_type, - g->gen, g->data_type, - g->dirty_sectors); - if (ret) - goto err; + if (flags & BTREE_TRIGGER_transactional) { + struct bkey_i_alloc_v4 *a = + bch2_trans_start_alloc_update(trans, bucket); + ret = PTR_ERR_OR_ZERO(a) ?: + __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags); + } - g->data_type = data_type; - g->dirty_sectors += sectors; + if (flags & BTREE_TRIGGER_gc) { + percpu_down_read(&c->mark_lock); + struct bucket *g = gc_bucket(ca, bucket.offset); + if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", + ptr->dev, + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { + ret = -EIO; + goto err_unlock; + } - g->stripe = k.k->p.offset; - g->stripe_redundancy = s->nr_redundant; - new = *g; + bucket_lock(g); + struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; + ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags); + if (!ret) { + alloc_to_bucket(g, new); + bch2_dev_usage_update(c, ca, &old, &new, 0, true); 
+ } + bucket_unlock(g); +err_unlock: + percpu_up_read(&c->mark_lock); + } err: - bucket_unlock(g); - if (!ret) - bch2_dev_usage_update_m(c, ca, &old, &new); - percpu_up_read(&c->mark_lock); + bch2_dev_put(ca); printbuf_exit(&buf); return ret; } +static int mark_stripe_buckets(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + enum btree_iter_update_trigger_flags flags) +{ + const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe + ? bkey_s_c_to_stripe(old).v : NULL; + const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe + ? bkey_s_c_to_stripe(new).v : NULL; + + BUG_ON(old_s && new_s && old_s->nr_blocks != new_s->nr_blocks); + + unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks; + + for (unsigned i = 0; i < nr_blocks; i++) { + if (new_s && old_s && + !memcmp(&new_s->ptrs[i], + &old_s->ptrs[i], + sizeof(new_s->ptrs[i]))) + continue; + + if (new_s) { + int ret = mark_stripe_bucket(trans, + bkey_s_c_to_stripe(new), i, false, flags); + if (ret) + return ret; + } + + if (old_s) { + int ret = mark_stripe_bucket(trans, + bkey_s_c_to_stripe(old), i, true, flags); + if (ret) + return ret; + } + } + + return 0; +} + int bch2_trigger_stripe(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, + enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s _new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { struct bkey_s_c new = _new.s_c; struct bch_fs *c = trans->c; @@ -312,7 +365,10 @@ int bch2_trigger_stripe(struct btree_trans *trans, const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe ? 
bkey_s_c_to_stripe(new).v : NULL; - if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + if (unlikely(flags & BTREE_TRIGGER_check_repair)) + return bch2_check_fix_ptrs(trans, btree, level, _new.s_c, flags); + + if (flags & BTREE_TRIGGER_transactional) { /* * If the pointers aren't changing, we don't need to do anything: */ @@ -347,31 +403,12 @@ int bch2_trigger_stripe(struct btree_trans *trans, return ret; } - unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks; - for (unsigned i = 0; i < nr_blocks; i++) { - if (new_s && old_s && - !memcmp(&new_s->ptrs[i], - &old_s->ptrs[i], - sizeof(new_s->ptrs[i]))) - continue; - - if (new_s) { - int ret = bch2_trans_mark_stripe_bucket(trans, - bkey_s_c_to_stripe(new), i, false); - if (ret) - return ret; - } - - if (old_s) { - int ret = bch2_trans_mark_stripe_bucket(trans, - bkey_s_c_to_stripe(old), i, true); - if (ret) - return ret; - } - } + int ret = mark_stripe_buckets(trans, old, new, flags); + if (ret) + return ret; } - if (flags & BTREE_TRIGGER_ATOMIC) { + if (flags & BTREE_TRIGGER_atomic) { struct stripe *m = genradix_ptr(&c->stripes, idx); if (!m) { @@ -410,7 +447,7 @@ int bch2_trigger_stripe(struct btree_trans *trans, } } - if (flags & BTREE_TRIGGER_GC) { + if (flags & BTREE_TRIGGER_gc) { struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); @@ -439,13 +476,11 @@ int bch2_trigger_stripe(struct btree_trans *trans, */ memset(m->block_sectors, 0, sizeof(m->block_sectors)); - for (unsigned i = 0; i < new_s->nr_blocks; i++) { - int ret = mark_stripe_bucket(trans, new, i, flags); - if (ret) - return ret; - } + int ret = mark_stripe_buckets(trans, old, new, flags); + if (ret) + return ret; - int ret = bch2_update_replicas(c, new, &m->r.e, + ret = bch2_update_replicas(c, new, &m->r.e, ((s64) m->sectors * m->nr_redundant), 0, true); if (ret) { @@ -608,19 +643,21 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) struct bch_csum got = ec_block_checksum(buf, i, offset); if 
(bch2_crc_cmp(want, got)) { - struct printbuf err = PRINTBUF; - struct bch_dev *ca = bch_dev_bkey_exists(c, v->ptrs[i].dev); + struct bch_dev *ca = bch2_dev_tryget(c, v->ptrs[i].dev); + if (ca) { + struct printbuf err = PRINTBUF; - prt_str(&err, "stripe "); - bch2_csum_err_msg(&err, v->csum_type, want, got); - prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i); - bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key)); - bch_err_ratelimited(ca, "%s", err.buf); - printbuf_exit(&err); + prt_str(&err, "stripe "); + bch2_csum_err_msg(&err, v->csum_type, want, got); + prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i); + bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key)); + bch_err_ratelimited(ca, "%s", err.buf); + printbuf_exit(&err); - clear_bit(i, buf->valid); + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + } - bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + clear_bit(i, buf->valid); break; } @@ -687,10 +724,12 @@ static void ec_block_endio(struct bio *bio) bch2_blk_status_to_str(bio->bi_status))) clear_bit(ec_bio->idx, ec_bio->buf->valid); - if (ptr_stale(ca, ptr)) { + int stale = dev_ptr_stale(ca, ptr); + if (stale) { bch_err_ratelimited(ca->fs, - "error %s stripe: stale pointer after io", - bio_data_dir(bio) == READ ? "reading from" : "writing to"); + "error %s stripe: stale/invalid pointer (%i) after io", + bio_data_dir(bio) == READ ? "reading from" : "writing to", + stale); clear_bit(ec_bio->idx, ec_bio->buf->valid); } @@ -705,25 +744,28 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; unsigned offset = 0, bytes = buf->size << 9; struct bch_extent_ptr *ptr = &v->ptrs[idx]; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant ? 
BCH_DATA_user : BCH_DATA_parity; int rw = op_is_write(opf); - if (ptr_stale(ca, ptr)) { - bch_err_ratelimited(c, - "error %s stripe: stale pointer", - rw == READ ? "reading from" : "writing to"); + struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw); + if (!ca) { clear_bit(idx, buf->valid); return; } - if (!bch2_dev_get_ioref(ca, rw)) { + int stale = dev_ptr_stale(ca, ptr); + if (stale) { + bch_err_ratelimited(c, + "error %s stripe: stale pointer (%i)", + rw == READ ? "reading from" : "writing to", + stale); clear_bit(idx, buf->valid); return; } + this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size); while (offset < bytes) { @@ -769,7 +811,7 @@ static int get_stripe_key_trans(struct btree_trans *trans, u64 idx, int ret; k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, - POS(0, idx), BTREE_ITER_SLOTS); + POS(0, idx), BTREE_ITER_slots); ret = bkey_err(k); if (ret) goto err; @@ -1060,7 +1102,7 @@ static int ec_stripe_delete(struct btree_trans *trans, u64 idx) int ret; k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx), - BTREE_ITER_INTENT); + BTREE_ITER_intent); ret = bkey_err(k); if (ret) goto err; @@ -1131,7 +1173,7 @@ static int ec_stripe_key_update(struct btree_trans *trans, int ret; k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, - new->k.p, BTREE_ITER_INTENT); + new->k.p, BTREE_ITER_intent); ret = bkey_err(k); if (ret) goto err; @@ -1173,6 +1215,7 @@ err: } static int ec_stripe_update_extent(struct btree_trans *trans, + struct bch_dev *ca, struct bpos bucket, u8 gen, struct ec_stripe_buf *s, struct bpos *bp_pos) @@ -1183,13 +1226,13 @@ static int ec_stripe_update_extent(struct btree_trans *trans, struct btree_iter iter; struct bkey_s_c k; const struct bch_extent_ptr *ptr_c; - struct bch_extent_ptr *ptr, *ec_ptr = NULL; + struct bch_extent_ptr *ec_ptr = NULL; struct bch_extent_stripe_ptr stripe_ptr; struct bkey_i *n; int ret, dev, block; - ret = bch2_get_next_backpointer(trans, bucket, gen, - bp_pos, &bp, 
BTREE_ITER_CACHED); + ret = bch2_get_next_backpointer(trans, ca, bucket, gen, + bp_pos, &bp, BTREE_ITER_cached); if (ret) return ret; if (bpos_eq(*bp_pos, SPOS_MAX)) @@ -1214,7 +1257,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, return -EIO; } - k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_INTENT); + k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_intent); ret = bkey_err(k); if (ret) return ret; @@ -1272,17 +1315,21 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b { struct bch_fs *c = trans->c; struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; - struct bch_extent_ptr bucket = v->ptrs[block]; - struct bpos bucket_pos = PTR_BUCKET_POS(c, &bucket); + struct bch_extent_ptr ptr = v->ptrs[block]; struct bpos bp_pos = POS_MIN; int ret = 0; + struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev); + if (!ca) + return -EIO; + + struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr); + while (1) { ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_enospc, - ec_stripe_update_extent(trans, bucket_pos, bucket.gen, - s, &bp_pos)); + ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, &bp_pos)); if (ret) break; if (bkey_eq(bp_pos, POS_MAX)) @@ -1291,6 +1338,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b bp_pos = bpos_nosnap_successor(bp_pos); } + bch2_dev_put(ca); return ret; } @@ -1321,20 +1369,18 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c, unsigned block, struct open_bucket *ob) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); - unsigned offset = ca->mi.bucket_size - ob->sectors_free; - int ret; - - if (!bch2_dev_get_ioref(ca, WRITE)) { + struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE); + if (!ca) { s->err = -BCH_ERR_erofs_no_writes; return; } + unsigned offset = ca->mi.bucket_size - ob->sectors_free; memset(s->new_stripe.data[block] + (offset << 9), 0, ob->sectors_free << 
9); - ret = blkdev_issue_zeroout(ca->disk_sb.bdev, + int ret = blkdev_issue_zeroout(ca->disk_sb.bdev, ob->bucket * ca->mi.bucket_size + offset, ob->sectors_free, GFP_KERNEL, 0); @@ -1519,16 +1565,13 @@ void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) { struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); - struct bch_dev *ca; - unsigned offset; - if (!ob) return NULL; BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]); - ca = bch_dev_bkey_exists(c, ob->dev); - offset = ca->mi.bucket_size - ob->sectors_free; + struct bch_dev *ca = ob_dev(c, ob); + unsigned offset = ca->mi.bucket_size - ob->sectors_free; return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9); } @@ -1937,7 +1980,7 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st } for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + BTREE_ITER_slots|BTREE_ITER_intent, k, ret) { if (bkey_gt(k.k->p, POS(0, U32_MAX))) { if (start_pos.offset) { start_pos = min_pos; @@ -2127,7 +2170,7 @@ int bch2_stripes_read(struct bch_fs *c) { int ret = bch2_trans_run(c, for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN, - BTREE_ITER_PREFETCH, k, ({ + BTREE_ITER_prefetch, k, ({ if (k.k->type != KEY_TYPE_stripe) continue; diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index f042616888..84a23eeb62 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -6,14 +6,15 @@ #include "buckets_types.h" #include "extents_types.h" -enum bkey_invalid_flags; +enum bch_validate_flags; int bch2_stripe_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct 
bkey_s, + enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_stripe ((struct bkey_ops) { \ .key_invalid = bch2_stripe_invalid, \ diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index dbe35b80bc..58612abf79 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -116,6 +116,9 @@ x(ENOENT, ENOENT_dev_idx_not_found) \ x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \ x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \ + x(EEXIST, EEXIST_str_hash_set) \ + x(EEXIST, EEXIST_discard_in_flight_add) \ + x(EEXIST, EEXIST_subvolume_create) \ x(0, open_buckets_empty) \ x(0, freelist_empty) \ x(BCH_ERR_freelist_empty, no_buckets_found) \ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 82a6656c94..d95c40f1b6 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -15,6 +15,7 @@ bool bch2_inconsistent_error(struct bch_fs *c) switch (c->opts.errors) { case BCH_ON_ERROR_continue: return false; + case BCH_ON_ERROR_fix_safe: case BCH_ON_ERROR_ro: if (bch2_fs_emergency_read_only(c)) bch_err(c, "inconsistency detected - emergency read only at journal seq %llu", @@ -176,6 +177,27 @@ static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt) return s; } +/* s/fix?/fixing/ s/recreate?/recreating/ */ +static void prt_actioning(struct printbuf *out, const char *action) +{ + unsigned len = strlen(action); + + BUG_ON(action[len - 1] != '?'); + --len; + + if (action[len - 1] == 'e') + --len; + + prt_bytes(out, action, len); + prt_str(out, "ing"); +} + +static const u8 fsck_flags_extra[] = { +#define x(t, n, flags) [BCH_FSCK_ERR_##t] = flags, + BCH_SB_ERRS() +#undef x +}; + int bch2_fsck_err(struct bch_fs *c, enum bch_fsck_flags flags, enum bch_sb_error_id err, @@ -186,6 +208,10 @@ int bch2_fsck_err(struct bch_fs *c, bool print = true, suppressing = false, inconsistent = false; struct printbuf buf = PRINTBUF, *out = &buf; int ret = -BCH_ERR_fsck_ignore; + const char *action_orig = "fix?", *action = action_orig; + + if (!WARN_ON(err >= 
ARRAY_SIZE(fsck_flags_extra))) + flags |= fsck_flags_extra[err]; if ((flags & FSCK_CAN_FIX) && test_bit(err, c->sb.errors_silent)) @@ -197,6 +223,19 @@ int bch2_fsck_err(struct bch_fs *c, prt_vprintf(out, fmt, args); va_end(args); + /* Custom fix/continue/recreate/etc.? */ + if (out->buf[out->pos - 1] == '?') { + const char *p = strrchr(out->buf, ','); + if (p) { + out->pos = p - out->buf; + action = kstrdup(p + 2, GFP_KERNEL); + if (!action) { + ret = -ENOMEM; + goto err; + } + } + } + mutex_lock(&c->fsck_error_msgs_lock); s = fsck_err_get(c, fmt); if (s) { @@ -208,12 +247,16 @@ int bch2_fsck_err(struct bch_fs *c, if (s->last_msg && !strcmp(buf.buf, s->last_msg)) { ret = s->ret; mutex_unlock(&c->fsck_error_msgs_lock); - printbuf_exit(&buf); - return ret; + goto err; } kfree(s->last_msg); s->last_msg = kstrdup(buf.buf, GFP_KERNEL); + if (!s->last_msg) { + mutex_unlock(&c->fsck_error_msgs_lock); + ret = -ENOMEM; + goto err; + } if (c->opts.ratelimit_errors && !(flags & FSCK_NO_RATELIMIT) && @@ -232,14 +275,22 @@ int bch2_fsck_err(struct bch_fs *c, prt_printf(out, bch2_log_msg(c, "")); #endif - if (!test_bit(BCH_FS_fsck_running, &c->flags)) { + if ((flags & FSCK_CAN_FIX) && + (flags & FSCK_AUTOFIX) && + (c->opts.errors == BCH_ON_ERROR_continue || + c->opts.errors == BCH_ON_ERROR_fix_safe)) { + prt_str(out, ", "); + prt_actioning(out, action); + ret = -BCH_ERR_fsck_fix; + } else if (!test_bit(BCH_FS_fsck_running, &c->flags)) { if (c->opts.errors != BCH_ON_ERROR_continue || !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { prt_str(out, ", shutting down"); inconsistent = true; ret = -BCH_ERR_fsck_errors_not_fixed; } else if (flags & FSCK_CAN_FIX) { - prt_str(out, ", fixing"); + prt_str(out, ", "); + prt_actioning(out, action); ret = -BCH_ERR_fsck_fix; } else { prt_str(out, ", continuing"); @@ -254,16 +305,16 @@ int bch2_fsck_err(struct bch_fs *c, : c->opts.fix_errors; if (fix == FSCK_FIX_ask) { - int ask; + prt_str(out, ", "); + prt_str(out, action); - prt_str(out, ": 
fix?"); if (bch2_fs_stdio_redirect(c)) bch2_print(c, "%s", out->buf); else bch2_print_string_as_lines(KERN_ERR, out->buf); print = false; - ask = bch2_fsck_ask_yn(c); + int ask = bch2_fsck_ask_yn(c); if (ask >= YN_ALLNO && s) s->fix = ask == YN_ALLNO @@ -276,10 +327,12 @@ int bch2_fsck_err(struct bch_fs *c, } else if (fix == FSCK_FIX_yes || (c->opts.nochanges && !(flags & FSCK_CAN_IGNORE))) { - prt_str(out, ", fixing"); + prt_str(out, ", "); + prt_actioning(out, action); ret = -BCH_ERR_fsck_fix; } else { - prt_str(out, ", not fixing"); + prt_str(out, ", not "); + prt_actioning(out, action); } } else if (flags & FSCK_NEED_FSCK) { prt_str(out, " (run fsck to correct)"); @@ -311,8 +364,6 @@ int bch2_fsck_err(struct bch_fs *c, mutex_unlock(&c->fsck_error_msgs_lock); - printbuf_exit(&buf); - if (inconsistent) bch2_inconsistent_error(c); @@ -322,7 +373,10 @@ int bch2_fsck_err(struct bch_fs *c, set_bit(BCH_FS_errors_not_fixed, &c->flags); set_bit(BCH_FS_error, &c->flags); } - +err: + if (action != action_orig) + kfree(action); + printbuf_exit(&buf); return ret; } diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 36caedf72d..777711504c 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -108,13 +108,6 @@ struct fsck_err_state { char *last_msg; }; -enum bch_fsck_flags { - FSCK_CAN_FIX = 1 << 0, - FSCK_CAN_IGNORE = 1 << 1, - FSCK_NEED_FSCK = 1 << 2, - FSCK_NO_RATELIMIT = 1 << 3, -}; - #define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err) __printf(4, 5) __cold diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index b9033bb4f1..5f4fecb358 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -72,7 +72,7 @@ static int count_iters_for_insert(struct btree_trans *trans, for_each_btree_key_norestart(trans, iter, BTREE_ID_reflink, POS(0, idx + offset), - BTREE_ITER_SLOTS, r_k, ret2) { + BTREE_ITER_slots, r_k, ret2) { if (bkey_ge(bkey_start_pos(r_k.k), POS(0, idx + sectors))) break; diff --git 
a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 1a331e5392..410b8bd81b 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -71,6 +71,12 @@ void bch2_mark_io_failure(struct bch_io_failures *failed, } } +static inline u64 dev_latency(struct bch_fs *c, unsigned dev) +{ + struct bch_dev *ca = bch2_dev_rcu(c, dev); + return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX; +} + /* * returns true if p1 is better than p2: */ @@ -79,11 +85,8 @@ static inline bool ptr_better(struct bch_fs *c, const struct extent_ptr_decoded p2) { if (likely(!p1.idx && !p2.idx)) { - struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev); - struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev); - - u64 l1 = atomic64_read(&dev1->cur_latency[READ]); - u64 l2 = atomic64_read(&dev2->cur_latency[READ]); + u64 l1 = dev_latency(c, p1.ptr.dev); + u64 l2 = dev_latency(c, p2.ptr.dev); /* Pick at random, biased in favor of the faster device: */ @@ -109,21 +112,21 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, const union bch_extent_entry *entry; struct extent_ptr_decoded p; struct bch_dev_io_failures *f; - struct bch_dev *ca; int ret = 0; if (k.k->type == KEY_TYPE_error) return -EIO; + rcu_read_lock(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { /* * Unwritten extent: no need to actually read, treat it as a * hole and return 0s: */ - if (p.ptr.unwritten) - return 0; - - ca = bch_dev_bkey_exists(c, p.ptr.dev); + if (p.ptr.unwritten) { + ret = 0; + break; + } /* * If there are any dirty pointers it's an error if we can't @@ -132,7 +135,9 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, if (!ret && !p.ptr.cached) ret = -EIO; - if (p.ptr.cached && ptr_stale(ca, &p.ptr)) + struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); + + if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr))) continue; f = failed ? 
dev_io_failures(failed, p.ptr.dev) : NULL; @@ -141,12 +146,13 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, ? f->idx : f->idx + 1; - if (!p.idx && - !bch2_dev_is_readable(ca)) + if (!p.idx && !ca) p.idx++; - if (bch2_force_reconstruct_read && - !p.idx && p.has_ec) + if (!p.idx && p.has_ec && bch2_force_reconstruct_read) + p.idx++; + + if (!p.idx && !bch2_dev_is_readable(ca)) p.idx++; if (p.idx >= (unsigned) p.has_ec + 1) @@ -158,6 +164,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, *pick = p; ret = 1; } + rcu_read_unlock(); return ret; } @@ -165,7 +172,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, /* KEY_TYPE_btree_ptr: */ int bch2_btree_ptr_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { int ret = 0; @@ -186,7 +193,7 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, } int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); @@ -201,6 +208,11 @@ int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k, c, err, btree_ptr_v2_min_key_bad, "min_key > key"); + if (flags & BCH_VALIDATE_write) + bkey_fsck_err_on(!bp.v->sectors_written, + c, err, btree_ptr_v2_written_0, + "sectors_written == 0"); + ret = bch2_bkey_ptrs_invalid(c, k, flags, err); fsck_err: return ret; @@ -247,7 +259,6 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) const union bch_extent_entry *en_r; struct extent_ptr_decoded lp, rp; bool use_right_ptr; - struct bch_dev *ca; en_l = l_ptrs.start; en_r = r_ptrs.start; @@ -278,8 +289,12 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) return false; /* Extents may not straddle buckets: */ - ca = bch_dev_bkey_exists(c, lp.ptr.dev); - if 
(PTR_BUCKET_NR(ca, &lp.ptr) != PTR_BUCKET_NR(ca, &rp.ptr)) + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, lp.ptr.dev); + bool same_bucket = ca && PTR_BUCKET_NR(ca, &lp.ptr) == PTR_BUCKET_NR(ca, &rp.ptr); + rcu_read_unlock(); + + if (!same_bucket) return false; if (lp.has_ec != rp.has_ec || @@ -385,7 +400,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) /* KEY_TYPE_reservation: */ int bch2_reservation_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); @@ -667,16 +682,16 @@ static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct extent unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p) { - struct bch_dev *ca = bch_dev_bkey_exists(c, p->ptr.dev); + struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev); - return __extent_ptr_durability(ca, p); + return ca ? 
__extent_ptr_durability(ca, p) : 0; } unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p) { - struct bch_dev *ca = bch_dev_bkey_exists(c, p->ptr.dev); + struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev); - if (ca->mi.state == BCH_MEMBER_STATE_failed) + if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed) return 0; return __extent_ptr_durability(ca, p); @@ -689,8 +704,10 @@ unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) struct extent_ptr_decoded p; unsigned durability = 0; + rcu_read_lock(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) durability += bch2_extent_ptr_durability(c, &p); + rcu_read_unlock(); return durability; } @@ -702,9 +719,11 @@ static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k) struct extent_ptr_decoded p; unsigned durability = 0; + rcu_read_lock(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev]) durability += bch2_extent_ptr_durability(c, &p); + rcu_read_unlock(); return durability; } @@ -833,8 +852,6 @@ union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) { - struct bch_extent_ptr *ptr; - bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); } @@ -860,14 +877,21 @@ const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + struct bch_dev *ca; + bool ret = false; + rcu_read_lock(); bkey_for_each_ptr(ptrs, ptr) if (bch2_dev_in_target(c, ptr->dev, target) && + (ca = bch2_dev_rcu(c, ptr->dev)) && (!ptr->cached || - !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) - return true; + !dev_ptr_stale_rcu(ca, ptr))) { + ret = true; + break; + } + rcu_read_unlock(); - return false; + return ret; } bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, @@ -969,21 +993,23 @@ void bch2_extent_ptr_set_cached(struct 
bkey_s k, struct bch_extent_ptr *ptr) */ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) { - struct bch_extent_ptr *ptr; + struct bch_dev *ca; + rcu_read_lock(); bch2_bkey_drop_ptrs(k, ptr, ptr->cached && - ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); + (ca = bch2_dev_rcu(c, ptr->dev)) && + dev_ptr_stale_rcu(ca, ptr) > 0); + rcu_read_unlock(); return bkey_deleted(k.k); } void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr) { - struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] - ? bch_dev_bkey_exists(c, ptr->dev) - : NULL; - + out->atomic++; + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); if (!ca) { prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, (u64) ptr->offset, ptr->gen, @@ -998,11 +1024,14 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc prt_str(out, " cached"); if (ptr->unwritten) prt_str(out, " unwritten"); - if (b >= ca->mi.first_bucket && - b < ca->mi.nbuckets && - ptr_stale(ca, ptr)) + int stale = dev_ptr_stale_rcu(ca, ptr); + if (stale > 0) prt_printf(out, " stale"); + else if (stale) + prt_printf(out, " invalid"); } + rcu_read_unlock(); + --out->atomic; } void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, @@ -1069,55 +1098,50 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, static int extent_ptr_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, const struct bch_extent_ptr *ptr, unsigned size_ondisk, bool metadata, struct printbuf *err) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - u64 bucket; - u32 bucket_offset; - struct bch_dev *ca; int ret = 0; - if (!bch2_dev_exists2(c, ptr->dev)) { - /* - * If we're in the write path this key might have already been - * overwritten, and we could be seeing a device that doesn't - * exist anymore due to racing with device removal: - */ - if (flags & 
BKEY_INVALID_WRITE) - return 0; - - bkey_fsck_err(c, err, ptr_to_invalid_device, - "pointer to invalid device (%u)", ptr->dev); + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + if (!ca) { + rcu_read_unlock(); + return 0; } + u32 bucket_offset; + u64 bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset); + unsigned first_bucket = ca->mi.first_bucket; + u64 nbuckets = ca->mi.nbuckets; + unsigned bucket_size = ca->mi.bucket_size; + rcu_read_unlock(); - ca = bch_dev_bkey_exists(c, ptr->dev); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr(ptrs, ptr2) bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, c, err, ptr_to_duplicate_device, "multiple pointers to same device (%u)", ptr->dev); - bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset); - bkey_fsck_err_on(bucket >= ca->mi.nbuckets, c, err, + bkey_fsck_err_on(bucket >= nbuckets, c, err, ptr_after_last_bucket, - "pointer past last bucket (%llu > %llu)", bucket, ca->mi.nbuckets); - bkey_fsck_err_on(ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket), c, err, + "pointer past last bucket (%llu > %llu)", bucket, nbuckets); + bkey_fsck_err_on(bucket < first_bucket, c, err, ptr_before_first_bucket, - "pointer before first bucket (%llu < %u)", bucket, ca->mi.first_bucket); - bkey_fsck_err_on(bucket_offset + size_ondisk > ca->mi.bucket_size, c, err, + "pointer before first bucket (%llu < %u)", bucket, first_bucket); + bkey_fsck_err_on(bucket_offset + size_ondisk > bucket_size, c, err, ptr_spans_multiple_buckets, "pointer spans multiple buckets (%u + %u > %u)", - bucket_offset, size_ondisk, ca->mi.bucket_size); + bucket_offset, size_ondisk, bucket_size); fsck_err: return ret; } int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -1193,7 +1217,7 @@ int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct 
bkey_s_c k, bkey_fsck_err_on(crc_is_encoded(crc) && (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) && - (flags & (BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT)), c, err, + (flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)), c, err, ptr_crc_uncompressed_size_too_big, "too large encoded extent"); diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 528e817eac..1ade959652 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -8,7 +8,7 @@ struct bch_fs; struct btree_trans; -enum bkey_invalid_flags; +enum bch_validate_flags; /* extent entries: */ @@ -406,12 +406,12 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, /* KEY_TYPE_btree_ptr: */ int bch2_btree_ptr_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_btree_ptr_v2_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, int, struct bkey_s); @@ -448,7 +448,7 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); /* KEY_TYPE_reservation: */ int bch2_reservation_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); @@ -654,7 +654,7 @@ union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, do { \ struct bkey_ptrs _ptrs = bch2_bkey_ptrs(_k); \ \ - _ptr = &_ptrs.start->ptr; \ + struct bch_extent_ptr *_ptr = &_ptrs.start->ptr; \ \ while ((_ptr = bkey_ptr_next(_ptrs, _ptr))) { \ if (_cond) { \ @@ -680,7 +680,7 @@ 
void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_bkey_ptrs_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_ptr_swab(struct bkey_s); diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c index 0f955c3c76..2eaffe37b5 100644 --- a/fs/bcachefs/eytzinger.c +++ b/fs/bcachefs/eytzinger.c @@ -171,7 +171,7 @@ void eytzinger0_sort_r(void *base, size_t n, size_t size, swap_r_func_t swap_func, const void *priv) { - int i, c, r; + int i, j, k; /* called from 'sort' without swap function, let's pick the default */ if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func) @@ -188,17 +188,22 @@ void eytzinger0_sort_r(void *base, size_t n, size_t size, /* heapify */ for (i = n / 2 - 1; i >= 0; --i) { - for (r = i; r * 2 + 1 < n; r = c) { - c = r * 2 + 1; + /* Find the sift-down path all the way to the leaves. */ + for (j = i; k = j * 2 + 1, k + 1 < n;) + j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; - if (c + 1 < n && - eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0) - c++; + /* Special case for the last leaf with no sibling. */ + if (j * 2 + 2 == n) + j = j * 2 + 1; - if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0) - break; + /* Backtrack to the correct location. */ + while (j != i && eytzinger0_do_cmp(base, n, size, cmp_func, priv, i, j) >= 0) + j = (j - 1) / 2; - eytzinger0_do_swap(base, n, size, swap_func, priv, r, c); + /* Shift the element into its correct place. 
*/ + for (k = j; j != i;) { + j = (j - 1) / 2; + eytzinger0_do_swap(base, n, size, swap_func, priv, j, k); } } @@ -206,17 +211,22 @@ void eytzinger0_sort_r(void *base, size_t n, size_t size, for (i = n - 1; i > 0; --i) { eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i); - for (r = 0; r * 2 + 1 < i; r = c) { - c = r * 2 + 1; + /* Find the sift-down path all the way to the leaves. */ + for (j = 0; k = j * 2 + 1, k + 1 < i;) + j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; - if (c + 1 < i && - eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0) - c++; + /* Special case for the last leaf with no sibling. */ + if (j * 2 + 2 == i) + j = j * 2 + 1; - if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0) - break; + /* Backtrack to the correct location. */ + while (j && eytzinger0_do_cmp(base, n, size, cmp_func, priv, 0, j) >= 0) + j = (j - 1) / 2; - eytzinger0_do_swap(base, n, size, swap_func, priv, r, c); + /* Shift the element into its correct place. 
*/ + for (k = j; j;) { + j = (j - 1) / 2; + eytzinger0_do_swap(base, n, size, swap_func, priv, j, k); } } } @@ -232,3 +242,64 @@ void eytzinger0_sort(void *base, size_t n, size_t size, return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w); } + +#if 0 +#include <linux/slab.h> +#include <linux/random.h> +#include <linux/ktime.h> + +static u64 cmp_count; + +static int mycmp(const void *a, const void *b) +{ + u32 _a = *(u32 *)a; + u32 _b = *(u32 *)b; + + cmp_count++; + if (_a < _b) + return -1; + else if (_a > _b) + return 1; + else + return 0; +} + +static int test(void) +{ + size_t N, i; + ktime_t start, end; + s64 delta; + u32 *arr; + + for (N = 10000; N <= 100000; N += 10000) { + arr = kmalloc_array(N, sizeof(u32), GFP_KERNEL); + cmp_count = 0; + + for (i = 0; i < N; i++) + arr[i] = get_random_u32(); + + start = ktime_get(); + eytzinger0_sort(arr, N, sizeof(u32), mycmp, NULL); + end = ktime_get(); + + delta = ktime_us_delta(end, start); + printk(KERN_INFO "time: %lld\n", delta); + printk(KERN_INFO "comparisons: %lld\n", cmp_count); + + u32 prev = 0; + + eytzinger0_for_each(i, N) { + if (prev > arr[i]) + goto err; + prev = arr[i]; + } + + kfree(arr); + } + return 0; + +err: + kfree(arr); + return -1; +} +#endif diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index 24840aee33..795f4fc0ba 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -48,7 +48,7 @@ static inline unsigned eytzinger1_right_child(unsigned i) static inline unsigned eytzinger1_first(unsigned size) { - return rounddown_pow_of_two(size); + return size ? rounddown_pow_of_two(size) : 0; } static inline unsigned eytzinger1_last(unsigned size) @@ -101,7 +101,9 @@ static inline unsigned eytzinger1_prev(unsigned i, unsigned size) static inline unsigned eytzinger1_extra(unsigned size) { - return (size + 1 - rounddown_pow_of_two(size)) << 1; + return size + ? 
(size + 1 - rounddown_pow_of_two(size)) << 1 + : 0; } static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size, diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 624e6f9632..508d029ac5 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -42,7 +42,7 @@ int bch2_create_trans(struct btree_trans *trans, if (ret) goto err; - ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent); if (ret) goto err; @@ -70,7 +70,7 @@ int bch2_create_trans(struct btree_trans *trans, struct bch_subvolume s; ret = bch2_subvolume_get(trans, snapshot_src.subvol, true, - BTREE_ITER_CACHED, &s); + BTREE_ITER_cached, &s); if (ret) goto err; @@ -78,7 +78,7 @@ int bch2_create_trans(struct btree_trans *trans, } ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src, - BTREE_ITER_INTENT); + BTREE_ITER_intent); if (ret) goto err; @@ -163,7 +163,7 @@ int bch2_create_trans(struct btree_trans *trans, name, dir_target, &dir_offset, - BCH_HASH_SET_MUST_CREATE); + STR_HASH_must_create); if (ret) goto err; @@ -171,7 +171,7 @@ int bch2_create_trans(struct btree_trans *trans, new_inode->bi_dir_offset = dir_offset; } - inode_iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS; + inode_iter.flags &= ~BTREE_ITER_all_snapshots; bch2_btree_iter_set_snapshot(&inode_iter, snapshot); ret = bch2_btree_iter_traverse(&inode_iter) ?: @@ -198,16 +198,16 @@ int bch2_link_trans(struct btree_trans *trans, if (dir.subvol != inum.subvol) return -EXDEV; - ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); + ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_intent); if (ret) - goto err; + return ret; inode_u->bi_ctime = now; ret = bch2_inode_nlink_inc(inode_u); if (ret) - return ret; + goto err; - ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent); if 
(ret) goto err; @@ -223,7 +223,7 @@ int bch2_link_trans(struct btree_trans *trans, ret = bch2_dirent_create(trans, dir, &dir_hash, mode_to_type(inode_u->bi_mode), name, inum.inum, &dir_offset, - BCH_HASH_SET_MUST_CREATE); + STR_HASH_must_create); if (ret) goto err; @@ -255,19 +255,19 @@ int bch2_unlink_trans(struct btree_trans *trans, struct bkey_s_c k; int ret; - ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent); if (ret) goto err; dir_hash = bch2_hash_info_init(c, dir_u); ret = bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash, - name, &inum, BTREE_ITER_INTENT); + name, &inum, BTREE_ITER_intent); if (ret) goto err; ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, - BTREE_ITER_INTENT); + BTREE_ITER_intent); if (ret) goto err; @@ -322,7 +322,7 @@ int bch2_unlink_trans(struct btree_trans *trans, ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, &dir_hash, &dirent_iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + BTREE_UPDATE_internal_snapshot_node) ?: bch2_inode_write(trans, &dir_iter, dir_u) ?: bch2_inode_write(trans, &inode_iter, inode_u); err: @@ -363,7 +363,7 @@ static int subvol_update_parent(struct btree_trans *trans, u32 subvol, u32 new_p struct bkey_i_subvolume *s = bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_subvolumes, POS(0, subvol), - BTREE_ITER_CACHED, subvolume); + BTREE_ITER_cached, subvolume); int ret = PTR_ERR_OR_ZERO(s); if (ret) return ret; @@ -394,7 +394,7 @@ int bch2_rename_trans(struct btree_trans *trans, int ret; ret = bch2_inode_peek(trans, &src_dir_iter, src_dir_u, src_dir, - BTREE_ITER_INTENT); + BTREE_ITER_intent); if (ret) goto err; @@ -403,7 +403,7 @@ int bch2_rename_trans(struct btree_trans *trans, if (dst_dir.inum != src_dir.inum || dst_dir.subvol != src_dir.subvol) { ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir, - BTREE_ITER_INTENT); + BTREE_ITER_intent); if (ret) goto err; @@ -423,13 +423,13 @@ 
int bch2_rename_trans(struct btree_trans *trans, goto err; ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inum, - BTREE_ITER_INTENT); + BTREE_ITER_intent); if (ret) goto err; if (dst_inum.inum) { ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inum, - BTREE_ITER_INTENT); + BTREE_ITER_intent); if (ret) goto err; } diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 39292e7ef3..54873ecc63 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -30,15 +30,8 @@ static void bch2_readpages_end_io(struct bio *bio) { struct folio_iter fi; - bio_for_each_folio_all(fi, bio) { - if (!bio->bi_status) { - folio_mark_uptodate(fi.folio); - } else { - folio_clear_uptodate(fi.folio); - folio_set_error(fi.folio); - } - folio_unlock(fi.folio); - } + bio_for_each_folio_all(fi, bio) + folio_end_read(fi.folio, bio->bi_status == BLK_STS_OK); bio_put(bio); } @@ -176,7 +169,7 @@ retry: bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot), - BTREE_ITER_SLOTS); + BTREE_ITER_slots); while (1) { struct bkey_s_c k; unsigned bytes, sectors, offset_into_extent; @@ -264,7 +257,6 @@ void bch2_readahead(struct readahead_control *ractl) struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_io_opts opts; - struct btree_trans *trans = bch2_trans_get(c); struct folio *folio; struct readpages_iter readpages_iter; @@ -276,6 +268,7 @@ void bch2_readahead(struct readahead_control *ractl) bch2_pagecache_add_get(inode); + struct btree_trans *trans = bch2_trans_get(c); while ((folio = readpage_iter_peek(&readpages_iter))) { unsigned n = min_t(unsigned, readpages_iter.folios.nr - @@ -296,10 +289,10 @@ void bch2_readahead(struct readahead_control *ractl) &readpages_iter); bch2_trans_unlock(trans); } + bch2_trans_put(trans); bch2_pagecache_add_put(inode); - bch2_trans_put(trans); 
darray_exit(&readpages_iter.folios); } @@ -408,7 +401,6 @@ static void bch2_writepage_io_done(struct bch_write_op *op) bio_for_each_folio_all(fi, bio) { struct bch_folio *s; - folio_set_error(fi.folio); mapping_set_error(fi.folio->mapping, -EIO); s = __bch2_folio(fi.folio); @@ -445,8 +437,8 @@ static void bch2_writepage_io_done(struct bch_write_op *op) */ /* - * PageWriteback is effectively our ref on the inode - fixup i_blocks - * before calling end_page_writeback: + * The writeback flag is effectively our ref on the inode - + * fixup i_blocks before calling folio_end_writeback: */ bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); @@ -906,7 +898,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, darray_for_each(fs, fi) { f = *fi; f_len = min(end, folio_end_pos(f)) - f_pos; - f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter); + f_copied = copy_folio_from_iter_atomic(f, f_offset, f_len, iter); if (!f_copied) { folios_trunc(&fs, fi); break; diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index b889370a50..049b61bc9a 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -254,7 +254,7 @@ retry: for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, SPOS(inum.inum, offset, snapshot), - BTREE_ITER_SLOTS, k, err) { + BTREE_ITER_slots, k, err) { if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end))) break; @@ -609,8 +609,10 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) if (unlikely(ret)) goto err_put_write_ref; - if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) + if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) { + ret = -EINVAL; goto err_put_write_ref; + } inode_dio_begin(&inode->v); bch2_pagecache_block_get(inode); diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c index d359aa9b33..872283e5bd 100644 --- a/fs/bcachefs/fs-io-pagecache.c +++ b/fs/bcachefs/fs-io-pagecache.c @@ -214,7 +214,7 @@ 
retry: for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, SPOS(inum.inum, offset, snapshot), - BTREE_ITER_SLOTS, k, ret) { + BTREE_ITER_slots, k, ret) { unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); unsigned state = bkey_to_sector_state(k); diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 20b4047742..ef20b64033 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -202,7 +202,10 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) goto out; ret = bch2_flush_inode(c, inode); out: - return bch2_err_class(ret); + ret = bch2_err_class(ret); + if (ret == -EROFS) + ret = -EIO; + return ret; } /* truncate: */ @@ -594,7 +597,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, POS(inode->v.i_ino, start_sector), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + BTREE_ITER_slots|BTREE_ITER_intent); while (!ret && bkey_lt(iter.pos, end_pos)) { s64 i_sectors_delta = 0; @@ -1009,7 +1012,7 @@ retry: for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, SPOS(inode->v.i_ino, offset >> 9, snapshot), - BTREE_ITER_SLOTS, k, ret) { + BTREE_ITER_slots, k, ret) { if (k.k->p.inode != inode->v.i_ino) { next_hole = bch2_seek_pagecache_hole(&inode->v, offset, MAX_LFS_FILESIZE, 0, false); diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 3dc8630ff9..79a0c8732b 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -308,8 +308,8 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) return ret; } -static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, - struct bch_ioctl_subvolume arg) +static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, + struct bch_ioctl_subvolume arg) { struct inode *dir; struct bch_inode_info *inode; @@ -373,7 +373,7 @@ retry: } if (dst_dentry->d_inode) { - error = -EEXIST; + error = -BCH_ERR_EEXIST_subvolume_create; goto err3; } @@ -406,9 +406,12 @@ 
retry: !arg.src_ptr) snapshot_src.subvol = inode_inum(to_bch_ei(dir)).subvol; + down_write(&c->snapshot_create_lock); inode = __bch2_create(file_mnt_idmap(filp), to_bch_ei(dir), dst_dentry, arg.mode|S_IFDIR, 0, snapshot_src, create_flags); + up_write(&c->snapshot_create_lock); + error = PTR_ERR_OR_ZERO(inode); if (error) goto err3; @@ -429,16 +432,6 @@ err1: return error; } -static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, - struct bch_ioctl_subvolume arg) -{ - down_write(&c->snapshot_create_lock); - long ret = __bch2_ioctl_subvolume_create(c, filp, arg); - up_write(&c->snapshot_create_lock); - - return ret; -} - static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, struct bch_ioctl_subvolume arg) { @@ -548,7 +541,7 @@ long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg) { /* These are just misnamed, they actually get/put from/to user an int */ switch (cmd) { - case FS_IOC_GETFLAGS: + case FS_IOC32_GETFLAGS: cmd = FS_IOC_GETFLAGS; break; case FS_IOC32_SETFLAGS: diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 6f114803c6..fa1fee05cf 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -90,7 +90,7 @@ retry: bch2_trans_begin(trans); ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), - BTREE_ITER_INTENT) ?: + BTREE_ITER_intent) ?: (set ? set(trans, inode, &inode_u, p) : 0) ?: bch2_inode_write(trans, &iter, &inode_u) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); @@ -188,16 +188,29 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino BUG_ON(!old); if (unlikely(old != inode)) { - __destroy_inode(&inode->v); - kmem_cache_free(bch2_inode_cache, inode); + /* + * bcachefs doesn't use I_NEW; we have no use for it since we + * only insert fully created inodes in the inode hash table. But + * discard_new_inode() expects it to be set... 
+ */ + inode->v.i_flags |= I_NEW; + /* + * We don't want bch2_evict_inode() to delete the inode on disk, + * we just raced and had another inode in cache. Normally new + * inodes don't have nlink == 0 - except tmpfiles do... + */ + set_nlink(&inode->v, 1); + discard_new_inode(&inode->v); inode = old; } else { mutex_lock(&c->vfs_inodes_lock); list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); mutex_unlock(&c->vfs_inodes_lock); /* - * we really don't want insert_inode_locked2() to be setting - * I_NEW... + * Again, I_NEW makes no sense for bcachefs. This is only needed + * for clearing I_NEW, but since the inode was already fully + * created and initialized we didn't actually want + * inode_insert5() to set it for us. */ unlock_new_inode(&inode->v); } @@ -213,19 +226,45 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino _ret; \ }) +static struct inode *bch2_alloc_inode(struct super_block *sb) +{ + BUG(); +} + +static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c) +{ + struct bch_inode_info *inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS); + if (!inode) + return NULL; + + inode_init_once(&inode->v); + mutex_init(&inode->ei_update_lock); + two_state_lock_init(&inode->ei_pagecache_lock); + INIT_LIST_HEAD(&inode->ei_vfs_inode_list); + inode->ei_flags = 0; + mutex_init(&inode->ei_quota_lock); + memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); + inode->v.i_state = 0; + + if (unlikely(inode_init_always(c->vfs_sb, &inode->v))) { + kmem_cache_free(bch2_inode_cache, inode); + return NULL; + } + + return inode; +} + /* * Allocate a new inode, dropping/retaking btree locks if necessary: */ static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans) { - struct bch_fs *c = trans->c; - struct bch_inode_info *inode = memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN, - to_bch_ei(new_inode(c->vfs_sb))); + __bch2_new_inode(trans->c)); if (unlikely(!inode)) { - int ret = 
drop_locks_do(trans, (inode = to_bch_ei(new_inode(c->vfs_sb))) ? 0 : -ENOMEM); + int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c)) ? 0 : -ENOMEM); if (ret && inode) { __destroy_inode(&inode->v); kmem_cache_free(bch2_inode_cache, inode); @@ -290,7 +329,7 @@ __bch2_create(struct mnt_idmap *idmap, if (ret) return ERR_PTR(ret); #endif - inode = to_bch_ei(new_inode(c->vfs_sb)); + inode = __bch2_new_inode(c); if (unlikely(!inode)) { inode = ERR_PTR(-ENOMEM); goto err; @@ -323,7 +362,7 @@ retry: inum.inum = inode_u.bi_inum; ret = bch2_subvolume_get(trans, inum.subvol, true, - BTREE_ITER_WITH_UPDATES, &subvol) ?: + BTREE_ITER_with_updates, &subvol) ?: bch2_trans_commit(trans, NULL, &journal_seq, 0); if (unlikely(ret)) { bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, @@ -376,17 +415,14 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter dirent_iter = {}; subvol_inum inum = {}; + struct printbuf buf = PRINTBUF; - int ret = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc, - dir_hash_info, dir, name, 0); + struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc, + dir_hash_info, dir, name, 0); + int ret = bkey_err(k); if (ret) return ERR_PTR(ret); - struct bkey_s_c k = bch2_btree_iter_peek_slot(&dirent_iter); - ret = bkey_err(k); - if (ret) - goto err; - ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum); if (ret > 0) ret = -ENOENT; @@ -406,20 +442,31 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, ret = bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?: PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans)); - if (bch2_err_matches(ret, ENOENT)) { - struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&buf, c, k); - bch_err(c, "%s points to missing inode", buf.buf); - printbuf_exit(&buf); - } + 
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), + c, "dirent to missing inode:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); if (ret) goto err; + /* regular files may have hardlinks: */ + if (bch2_fs_inconsistent_on(bch2_inode_should_have_bp(&inode_u) && + !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)), + c, + "dirent points to inode that does not point back:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), + prt_printf(&buf, "\n "), + bch2_inode_unpacked_to_text(&buf, &inode_u), + buf.buf))) { + ret = -ENOENT; + goto err; + } + bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); inode = bch2_inode_insert(c, inode); out: bch2_trans_iter_exit(trans, &dirent_iter); + printbuf_exit(&buf); return inode; err: inode = ERR_PTR(ret); @@ -787,7 +834,7 @@ retry: acl = NULL; ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode), - BTREE_ITER_INTENT); + BTREE_ITER_intent); if (ret) goto btree_err; @@ -844,6 +891,9 @@ static int bch2_getattr(struct mnt_idmap *idmap, stat->blksize = block_bytes(c); stat->blocks = inode->v.i_blocks; + stat->subvol = inode->ei_subvol; + stat->result_mask |= STATX_SUBVOL; + if (request_mask & STATX_BTIME) { stat->result_mask |= STATX_BTIME; stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime); @@ -1040,6 +1090,10 @@ retry: bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, iter.pos.offset + sectors)); + + ret = bch2_trans_relock(trans); + if (ret) + break; } start = iter.pos.offset; bch2_trans_iter_exit(trans, &iter); @@ -1106,6 +1160,8 @@ static int bch2_open(struct inode *vinode, struct file *file) return ret; } + file->f_mode |= FMODE_CAN_ODIRECT; + return generic_file_open(vinode, file); } @@ -1115,6 +1171,7 @@ static const struct file_operations bch_file_operations = { .read_iter = bch2_read_iter, .write_iter = bch2_write_iter, .mmap = bch2_mmap, + .get_unmapped_area = thp_get_unmapped_area, .fsync = bch2_fsync, .splice_read = filemap_splice_read, .splice_write = 
iter_file_splice_write, @@ -1198,7 +1255,6 @@ static const struct address_space_operations bch_address_space_operations = { .write_end = bch2_write_end, .invalidate_folio = bch2_invalidate_folio, .release_folio = bch2_release_folio, - .direct_IO = noop_direct_IO, #ifdef CONFIG_MIGRATION .migrate_folio = filemap_migrate_folio, #endif @@ -1447,11 +1503,6 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, bch2_iget5_set(&inode->v, &inum); bch2_inode_update_after_write(trans, inode, bi, ~0); - if (BCH_SUBVOLUME_SNAP(subvol)) - set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); - else - clear_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); - inode->v.i_blocks = bi->bi_sectors; inode->v.i_ino = bi->bi_inum; inode->v.i_rdev = bi->bi_dev; @@ -1463,6 +1514,9 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, inode->ei_qid = bch_qid(bi); inode->ei_subvol = inum.subvol; + if (BCH_SUBVOLUME_SNAP(subvol)) + set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); + inode->v.i_mapping->a_ops = &bch_address_space_operations; switch (inode->v.i_mode & S_IFMT) { @@ -1487,34 +1541,9 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, mapping_set_large_folios(inode->v.i_mapping); } -static struct inode *bch2_alloc_inode(struct super_block *sb) -{ - struct bch_inode_info *inode; - - inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS); - if (!inode) - return NULL; - - inode_init_once(&inode->v); - mutex_init(&inode->ei_update_lock); - two_state_lock_init(&inode->ei_pagecache_lock); - INIT_LIST_HEAD(&inode->ei_vfs_inode_list); - mutex_init(&inode->ei_quota_lock); - - return &inode->v; -} - -static void bch2_i_callback(struct rcu_head *head) +static void bch2_free_inode(struct inode *vinode) { - struct inode *vinode = container_of(head, struct inode, i_rcu); - struct bch_inode_info *inode = to_bch_ei(vinode); - - kmem_cache_free(bch2_inode_cache, inode); -} - -static void bch2_destroy_inode(struct inode *vinode) -{ - 
call_rcu(&vinode->i_rcu, bch2_i_callback); + kmem_cache_free(bch2_inode_cache, to_bch_ei(vinode)); } static int inode_update_times_fn(struct btree_trans *trans, @@ -1822,7 +1851,7 @@ static int bch2_unfreeze(struct super_block *sb) static const struct super_operations bch_super_operations = { .alloc_inode = bch2_alloc_inode, - .destroy_inode = bch2_destroy_inode, + .free_inode = bch2_free_inode, .write_inode = bch2_vfs_write_inode, .evict_inode = bch2_evict_inode, .sync_fs = bch2_sync_fs, @@ -1925,8 +1954,7 @@ got_sb: if (IS_ERR(sb)) { ret = PTR_ERR(sb); - ret = bch2_err_class(ret); - return ERR_PTR(ret); + goto err; } c = sb->s_fs_info; @@ -1954,6 +1982,7 @@ got_sb: sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); sb->s_uuid = c->sb.user_uuid; + sb->s_shrink->seeks = 0; c->vfs_sb = sb; strscpy(sb->s_id, c->name, sizeof(sb->s_id)); @@ -2002,6 +2031,17 @@ out: err_put_super: __bch2_fs_stop(c); deactivate_locked_super(sb); +err: + if (ret) + pr_err("error: %s", bch2_err_str(ret)); + /* + * On an inconsistency error in recovery we might see an -EROFS derived + * errorcode (from the journal), but we don't want to return that to + * userspace as that causes util-linux to retry the mount RO - which is + * confusing: + */ + if (bch2_err_matches(ret, EROFS) && ret != -EROFS) + ret = -EIO; return ERR_PTR(bch2_err_class(ret)); } diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 8e2010212c..921bcdb3e5 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -77,21 +77,17 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, struct bkey_s_c k; int ret; - bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, - POS(0, inode_nr), - BTREE_ITER_ALL_SNAPSHOTS); - k = bch2_btree_iter_peek(&iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (!k.k || !bkey_eq(k.k->p, POS(0, inode_nr))) { - ret = -BCH_ERR_ENOENT_inode; - goto err; + for_each_btree_key_norestart(trans, iter, 
BTREE_ID_inodes, POS(0, inode_nr), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != inode_nr) + break; + if (!bkey_is_inode(k.k)) + continue; + ret = bch2_inode_unpack(k, inode); + goto found; } - - ret = bch2_inode_unpack(k, inode); -err: + ret = -BCH_ERR_ENOENT_inode; +found: bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); bch2_trans_iter_exit(trans, &iter); return ret; @@ -127,13 +123,13 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans, u64 *target, unsigned *type, u32 snapshot) { struct btree_iter iter; - struct bkey_s_c_dirent d; - int ret = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc, - &hash_info, dir, name, 0, snapshot); + struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc, + &hash_info, dir, name, 0, snapshot); + int ret = bkey_err(k); if (ret) return ret; - d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter)); + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter)); *target = le64_to_cpu(d.v->d_inum); *type = d.v->d_type; bch2_trans_iter_exit(trans, &iter); @@ -154,12 +150,12 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos) dir_hash_info = bch2_hash_info_init(c, &dir_inode); - bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT); + bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent); ret = bch2_btree_iter_traverse(&iter) ?: bch2_hash_delete_at(trans, bch2_dirent_hash_desc, &dir_hash_info, &iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); bch2_trans_iter_exit(trans, &iter); err: bch_err_fn(c, ret); @@ -274,9 +270,9 @@ create_lostfound: &lostfound_str, lostfound->bi_inum, &lostfound->bi_dir_offset, - BCH_HASH_SET_MUST_CREATE) ?: + STR_HASH_must_create) ?: bch2_inode_write_flags(trans, &lostfound_iter, lostfound, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); err: bch_err_msg(c, ret, 
"creating lost+found"); bch2_trans_iter_exit(trans, &lostfound_iter); @@ -333,7 +329,7 @@ static int reattach_inode(struct btree_trans *trans, &name, inode->bi_subvol ?: inode->bi_inum, &dir_offset, - BCH_HASH_SET_MUST_CREATE); + STR_HASH_must_create); if (ret) return ret; @@ -486,14 +482,9 @@ static int reconstruct_reg_inode(struct btree_trans *trans, u32 snapshot, u64 in return reconstruct_inode(trans, snapshot, inum, k.k->p.offset << 9, S_IFREG); } -struct snapshots_seen_entry { - u32 id; - u32 equiv; -}; - struct snapshots_seen { struct bpos pos; - DARRAY(struct snapshots_seen_entry) ids; + snapshot_id_list ids; }; static inline void snapshots_seen_exit(struct snapshots_seen *s) @@ -508,20 +499,15 @@ static inline void snapshots_seen_init(struct snapshots_seen *s) static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id) { - struct snapshots_seen_entry *i, n = { - .id = id, - .equiv = bch2_snapshot_equiv(c, id), - }; - int ret = 0; - + u32 *i; __darray_for_each(s->ids, i) { - if (i->id == id) + if (*i == id) return 0; - if (i->id > id) + if (*i > id) break; } - ret = darray_insert_item(&s->ids, i - s->ids.data, n); + int ret = darray_insert_item(&s->ids, i - s->ids.data, id); if (ret) bch_err(c, "error reallocating snapshots_seen table (size %zu)", s->ids.size); @@ -531,42 +517,11 @@ static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, enum btree_id btree_id, struct bpos pos) { - struct snapshots_seen_entry n = { - .id = pos.snapshot, - .equiv = bch2_snapshot_equiv(c, pos.snapshot), - }; - int ret = 0; - if (!bkey_eq(s->pos, pos)) s->ids.nr = 0; - s->pos = pos; - s->pos.snapshot = n.equiv; - - darray_for_each(s->ids, i) { - if (i->id == n.id) - return 0; - /* - * We currently don't rigorously track for snapshot cleanup - * needing to be run, so it shouldn't be a fsck error yet: - */ - if (i->equiv == n.equiv) { - bch_err(c, 
"snapshot deletion did not finish:\n" - " duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n", - bch2_btree_id_str(btree_id), - pos.inode, pos.offset, - i->id, n.id, n.equiv); - set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); - return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots); - } - } - - ret = darray_push(&s->ids, n); - if (ret) - bch_err(c, "error reallocating snapshots_seen table (size %zu)", - s->ids.size); - return ret; + return snapshot_list_add_nodup(c, &s->ids, pos.snapshot); } /** @@ -586,12 +541,10 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see ssize_t i; EBUG_ON(id > ancestor); - EBUG_ON(!bch2_snapshot_is_equiv(c, id)); - EBUG_ON(!bch2_snapshot_is_equiv(c, ancestor)); /* @ancestor should be the snapshot most recently added to @seen */ EBUG_ON(ancestor != seen->pos.snapshot); - EBUG_ON(ancestor != seen->ids.data[seen->ids.nr - 1].equiv); + EBUG_ON(ancestor != darray_last(seen->ids)); if (id == ancestor) return true; @@ -610,9 +563,9 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see */ for (i = seen->ids.nr - 2; - i >= 0 && seen->ids.data[i].equiv >= id; + i >= 0 && seen->ids.data[i] >= id; --i) - if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i].equiv)) + if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i])) return false; return true; @@ -643,9 +596,6 @@ static int ref_visible2(struct bch_fs *c, u32 src, struct snapshots_seen *src_seen, u32 dst, struct snapshots_seen *dst_seen) { - src = bch2_snapshot_equiv(c, src); - dst = bch2_snapshot_equiv(c, dst); - if (dst > src) { swap(dst, src); swap(dst_seen, src_seen); @@ -692,7 +642,7 @@ static int add_inode(struct bch_fs *c, struct inode_walker *w, return darray_push(&w->inodes, ((struct inode_walker_entry) { .inode = u, - .snapshot = bch2_snapshot_equiv(c, inode.k->p.snapshot), + .snapshot = inode.k->p.snapshot, })); } @@ -708,7 +658,7 @@ static int 
get_inodes_all_snapshots(struct btree_trans *trans, w->inodes.nr = 0; for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum), - BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + BTREE_ITER_all_snapshots, k, ret) { if (k.k->p.offset != inum) break; @@ -728,21 +678,20 @@ static struct inode_walker_entry * lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k) { bool is_whiteout = k.k->type == KEY_TYPE_whiteout; - u32 snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); struct inode_walker_entry *i; __darray_for_each(w->inodes, i) - if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot)) + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot)) goto found; return NULL; found: - BUG_ON(snapshot > i->snapshot); + BUG_ON(k.k->p.snapshot > i->snapshot); - if (snapshot != i->snapshot && !is_whiteout) { + if (k.k->p.snapshot != i->snapshot && !is_whiteout) { struct inode_walker_entry new = *i; - new.snapshot = snapshot; + new.snapshot = k.k->p.snapshot; new.count = 0; struct printbuf buf = PRINTBUF; @@ -751,10 +700,10 @@ found: bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n" "unexpected because we should always update the inode when we update a key in that inode\n" "%s", - w->last_pos.inode, snapshot, i->snapshot, buf.buf); + w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf); printbuf_exit(&buf); - while (i > w->inodes.data && i[-1].snapshot > snapshot) + while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot) --i; size_t pos = i - w->inodes.data; @@ -786,10 +735,10 @@ static struct inode_walker_entry *walk_inode(struct btree_trans *trans, return lookup_inode_for_snapshot(trans->c, w, k); } -static int __get_visible_inodes(struct btree_trans *trans, - struct inode_walker *w, - struct snapshots_seen *s, - u64 inum) +static int get_visible_inodes(struct btree_trans *trans, + struct inode_walker *w, + struct snapshots_seen *s, + u64 inum) { struct bch_fs *c = trans->c; struct 
btree_iter iter; @@ -799,19 +748,17 @@ static int __get_visible_inodes(struct btree_trans *trans, w->inodes.nr = 0; for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum), - BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); - + BTREE_ITER_all_snapshots, k, ret) { if (k.k->p.offset != inum) break; - if (!ref_visible(c, s, s->pos.snapshot, equiv)) + if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) continue; if (bkey_is_inode(k.k)) add_inode(c, w, k); - if (equiv >= s->pos.snapshot) + if (k.k->p.snapshot >= s->pos.snapshot) break; } bch2_trans_iter_exit(trans, &iter); @@ -819,25 +766,6 @@ static int __get_visible_inodes(struct btree_trans *trans, return ret; } -static int check_key_has_snapshot(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - int ret = 0; - - if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c, - bkey_in_missing_snapshot, - "key in missing snapshot: %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1; -fsck_err: - printbuf_exit(&buf); - return ret; -} - static int hash_redo_key(struct btree_trans *trans, const struct bch_hash_desc desc, struct bch_hash_info *hash_info, @@ -861,8 +789,8 @@ static int hash_redo_key(struct btree_trans *trans, bch2_hash_set_in_snapshot(trans, desc, hash_info, (subvol_inum) { 0, k.k->p.inode }, k.k->p.snapshot, tmp, - BCH_HASH_SET_MUST_CREATE, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + STR_HASH_must_create| + BTREE_UPDATE_internal_snapshot_node) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); } @@ -891,7 +819,7 @@ static int hash_check_key(struct btree_trans *trans, for_each_btree_key_norestart(trans, iter, desc.btree_id, SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), - BTREE_ITER_SLOTS, k, ret) { + BTREE_ITER_slots, k, ret) { if 
(bkey_eq(k.k->p, hash_k.k->p)) break; @@ -1032,7 +960,7 @@ static int check_inode(struct btree_trans *trans, bool do_update = false; int ret; - ret = check_key_has_snapshot(trans, iter, k); + ret = bch2_check_key_has_snapshot(trans, iter, k); if (ret < 0) goto err; if (ret) @@ -1233,7 +1161,7 @@ int bch2_check_inodes(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_inode(trans, &iter, k, &prev, &s, full))); @@ -1362,8 +1290,8 @@ static int overlapping_extents_found(struct btree_trans *trans, BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2))); bch2_trans_iter_init(trans, &iter1, btree, pos1, - BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_NOT_EXTENTS); + BTREE_ITER_all_snapshots| + BTREE_ITER_not_extents); k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX)); ret = bkey_err(k1); if (ret) @@ -1425,7 +1353,7 @@ static int overlapping_extents_found(struct btree_trans *trans, trans->extra_disk_res += bch2_bkey_sectors_compressed(k2); ret = bch2_trans_update_extent_overwrite(trans, old_iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, + BTREE_UPDATE_internal_snapshot_node, k1, k2) ?: bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc); bch2_disk_reservation_put(c, &res); @@ -1466,7 +1394,6 @@ static int check_overlapping_extents(struct btree_trans *trans, struct snapshots_seen *seen, struct extent_ends *extent_ends, struct bkey_s_c k, - u32 equiv, struct btree_iter *iter, bool *fixed) { @@ -1535,12 +1462,9 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, struct bch_fs *c = trans->c; struct inode_walker_entry *i; struct printbuf buf = PRINTBUF; - struct bpos equiv = k.k->p; int ret = 0; - equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); - - ret = check_key_has_snapshot(trans, iter, k); + ret = 
bch2_check_key_has_snapshot(trans, iter, k); if (ret) { ret = ret < 0 ? ret : 0; goto out; @@ -1589,8 +1513,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, bch2_bkey_val_to_text(&buf, c, k), buf.buf))) goto delete; - ret = check_overlapping_extents(trans, s, extent_ends, k, - equiv.snapshot, iter, + ret = check_overlapping_extents(trans, s, extent_ends, k, iter, &inode->recalculate_sums); if (ret) goto err; @@ -1607,8 +1530,8 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, for (; inode->inodes.data && i >= inode->inodes.data; --i) { - if (i->snapshot > equiv.snapshot || - !key_visible_in_snapshot(c, s, i->snapshot, equiv.snapshot)) + if (i->snapshot > k.k->p.snapshot || + !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) continue; if (k.k->type != KEY_TYPE_whiteout) { @@ -1625,7 +1548,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, bch2_btree_iter_set_snapshot(&iter2, i->snapshot); ret = bch2_btree_iter_traverse(&iter2) ?: bch2_btree_delete_at(trans, &iter2, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); bch2_trans_iter_exit(trans, &iter2); if (ret) goto err; @@ -1652,7 +1575,7 @@ fsck_err: bch_err_fn(c, ret); return ret; delete: - ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node); goto out; } @@ -1673,7 +1596,7 @@ int bch2_check_extents(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_extents, POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ bch2_disk_reservation_put(c, &res); @@ -1698,7 +1621,7 @@ int bch2_check_indirect_extents(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_reflink, POS_MIN, - 
BTREE_ITER_PREFETCH, k, + BTREE_ITER_prefetch, k, &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ bch2_disk_reservation_put(c, &res); @@ -1754,6 +1677,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) trans_was_restarted(trans, restart_count); } +noinline_for_stack static int check_dirent_inode_dirent(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c_dirent d, @@ -1767,6 +1691,15 @@ static int check_dirent_inode_dirent(struct btree_trans *trans, if (inode_points_to_dirent(target, d)) return 0; + if (bch2_inode_should_have_bp(target) && + !fsck_err(c, inode_wrong_backpointer, + "dirent points to inode that does not point back:\n %s", + (bch2_bkey_val_to_text(&buf, c, d.s_c), + prt_printf(&buf, "\n "), + bch2_inode_unpacked_to_text(&buf, target), + buf.buf))) + goto out_noiter; + if (!target->bi_dir && !target->bi_dir_offset) { target->bi_dir = d.k->p.inode; @@ -1835,11 +1768,13 @@ out: err: fsck_err: bch2_trans_iter_exit(trans, &bp_iter); +out_noiter: printbuf_exit(&buf); bch_err_fn(c, ret); return ret; } +noinline_for_stack static int check_dirent_target(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c_dirent d, @@ -1914,6 +1849,7 @@ found: return ret; } +noinline_for_stack static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c_dirent d) { @@ -2052,18 +1988,14 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, struct bch_fs *c = trans->c; struct inode_walker_entry *i; struct printbuf buf = PRINTBUF; - struct bpos equiv; int ret = 0; - ret = check_key_has_snapshot(trans, iter, k); + ret = bch2_check_key_has_snapshot(trans, iter, k); if (ret) { ret = ret < 0 ? 
ret : 0; goto out; } - equiv = k.k->p; - equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); - ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); if (ret) goto err; @@ -2104,7 +2036,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ret = bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); goto out; } @@ -2140,14 +2072,13 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (ret) goto err; } else { - ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum)); + ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum)); if (ret) goto err; if (fsck_err_on(!target->inodes.nr, c, dirent_to_missing_inode, - "dirent points to missing inode: (equiv %u)\n%s", - equiv.snapshot, + "dirent points to missing inode:\n%s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { @@ -2164,7 +2095,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, } if (d.v->d_type == DT_DIR) - for_each_visible_inode(c, s, dir, equiv.snapshot, i) + for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) i->count++; } out: @@ -2191,7 +2122,7 @@ int bch2_check_dirents(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_dirents, POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, @@ -2214,7 +2145,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, struct inode_walker_entry *i; int ret; - ret = check_key_has_snapshot(trans, iter, k); + ret = bch2_check_key_has_snapshot(trans, iter, k); if (ret < 0) return ret; if (ret) @@ -2255,7 +2186,7 @@ int bch2_check_xattrs(struct bch_fs *c) ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, 
BTREE_ID_xattrs, POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, @@ -2422,7 +2353,7 @@ int bch2_check_subvolume_structure(struct bch_fs *c) { int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, - BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, + BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_subvol_path(trans, &iter, k))); bch_err_fn(c, ret); @@ -2457,7 +2388,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino struct btree_iter inode_iter = {}; struct bch_inode_unpacked inode; struct printbuf buf = PRINTBUF; - u32 snapshot = bch2_snapshot_equiv(c, inode_k.k->p.snapshot); + u32 snapshot = inode_k.k->p.snapshot; int ret = 0; p->nr = 0; @@ -2559,9 +2490,9 @@ int bch2_check_directory_structure(struct bch_fs *c) ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, + BTREE_ITER_intent| + BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ if (!bkey_is_inode(k.k)) continue; @@ -2661,9 +2592,9 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, int ret = bch2_trans_run(c, for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, start), - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ({ + BTREE_ITER_intent| + BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots, k, ({ if (!bkey_is_inode(k.k)) continue; @@ -2704,9 +2635,9 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links int ret = bch2_trans_run(c, for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN, - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ({ + BTREE_ITER_intent| + BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots, k, ({ ret = snapshots_seen_update(c, 
&s, iter.btree_id, k.k->p); if (ret) break; @@ -2717,8 +2648,7 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links if (d.v->d_type != DT_DIR && d.v->d_type != DT_SUBVOL) inc_link(c, &s, links, range_start, range_end, - le64_to_cpu(d.v->d_inum), - bch2_snapshot_equiv(c, d.k->p.snapshot)); + le64_to_cpu(d.v->d_inum), d.k->p.snapshot); } 0; }))); @@ -2781,7 +2711,7 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS(0, range_start), - BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end))); if (ret < 0) { @@ -2849,7 +2779,7 @@ static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter, u->v.front_pad = 0; u->v.back_pad = 0; - return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN); + return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun); } int bch2_fix_reflink_p(struct bch_fs *c) @@ -2860,8 +2790,8 @@ int bch2_fix_reflink_p(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_extents, POS_MIN, - BTREE_ITER_INTENT|BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, + BTREE_ITER_intent|BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, fix_reflink_p_key(trans, &iter, k))); bch_err_fn(c, ret); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 0f95d7fb5e..aafa79fa63 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -339,7 +339,7 @@ int bch2_inode_peek_nowarn(struct btree_trans *trans, k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes, SPOS(0, inum.inum, snapshot), - flags|BTREE_ITER_CACHED); + flags|BTREE_ITER_cached); ret = bkey_err(k); if (ret) return ret; @@ -371,7 +371,7 @@ int bch2_inode_peek(struct 
btree_trans *trans, int bch2_inode_write_flags(struct btree_trans *trans, struct btree_iter *iter, struct bch_inode_unpacked *inode, - enum btree_update_flags flags) + enum btree_iter_update_trigger_flags flags) { struct bkey_inode_buf *inode_p; @@ -399,7 +399,7 @@ int __bch2_fsck_write_inode(struct btree_trans *trans, return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, &inode_p->inode.k_i, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); } int bch2_fsck_write_inode(struct btree_trans *trans, @@ -473,7 +473,7 @@ fsck_err: } int bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); @@ -490,7 +490,7 @@ fsck_err: } int bch2_inode_v2_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); @@ -507,7 +507,7 @@ fsck_err: } int bch2_inode_v3_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); @@ -535,29 +535,19 @@ static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) { printbuf_indent_add(out, 2); - prt_printf(out, "mode=%o", inode->bi_mode); - prt_newline(out); + prt_printf(out, "mode=%o\n", inode->bi_mode); prt_str(out, "flags="); prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1)); - prt_printf(out, " (%x)", inode->bi_flags); - prt_newline(out); + prt_printf(out, " (%x)\n", inode->bi_flags); - prt_printf(out, "journal_seq=%llu", inode->bi_journal_seq); - prt_newline(out); - - prt_printf(out, "bi_size=%llu", inode->bi_size); - prt_newline(out); - - prt_printf(out, "bi_sectors=%llu", inode->bi_sectors); - prt_newline(out); - - prt_printf(out, 
"bi_version=%llu", inode->bi_version); - prt_newline(out); + prt_printf(out, "journal_seq=%llu\n", inode->bi_journal_seq); + prt_printf(out, "bi_size=%llu\n", inode->bi_size); + prt_printf(out, "bi_sectors=%llu\n", inode->bi_sectors); + prt_printf(out, "bi_version=%llu\n", inode->bi_version); #define x(_name, _bits) \ - prt_printf(out, #_name "=%llu", (u64) inode->_name); \ - prt_newline(out); + prt_printf(out, #_name "=%llu\n", (u64) inode->_name); BCH_INODE_FIELDS_v3() #undef x printbuf_indent_sub(out, 2); @@ -604,11 +594,11 @@ int bch2_trigger_inode(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { s64 nr = (s64) bkey_is_inode(new.k) - (s64) bkey_is_inode(old.k); - if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + if (flags & BTREE_TRIGGER_transactional) { if (nr) { int ret = bch2_replicas_deltas_realloc(trans, 0); if (ret) @@ -627,13 +617,13 @@ int bch2_trigger_inode(struct btree_trans *trans, } } - if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) { + if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { BUG_ON(!trans->journal_res.seq); bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq); } - if (flags & BTREE_TRIGGER_GC) { + if (flags & BTREE_TRIGGER_gc) { struct bch_fs *c = trans->c; percpu_down_read(&c->mark_lock); @@ -645,7 +635,7 @@ int bch2_trigger_inode(struct btree_trans *trans, } int bch2_inode_generation_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { int ret = 0; @@ -762,8 +752,8 @@ int bch2_inode_create(struct btree_trans *trans, pos = start; bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos), - BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_INTENT); + BTREE_ITER_all_snapshots| + BTREE_ITER_intent); again: while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k)) && @@ 
-824,7 +814,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, * extent iterator: */ bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0), - BTREE_ITER_INTENT); + BTREE_ITER_intent); while (1) { bch2_trans_begin(trans); @@ -846,7 +836,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, bkey_init(&delete.k); delete.k.p = iter.pos; - if (iter.flags & BTREE_ITER_IS_EXTENTS) + if (iter.flags & BTREE_ITER_is_extents) bch2_key_resize(&delete.k, bpos_min(end, k.k->p).offset - iter.pos.offset); @@ -895,7 +885,7 @@ retry: k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, SPOS(0, inum.inum, snapshot), - BTREE_ITER_INTENT|BTREE_ITER_CACHED); + BTREE_ITER_intent|BTREE_ITER_cached); ret = bkey_err(k); if (ret) goto err; @@ -1055,7 +1045,7 @@ retry: bch2_trans_begin(trans); k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, inum, snapshot), BTREE_ITER_INTENT); + SPOS(0, inum, snapshot), BTREE_ITER_intent); ret = bkey_err(k); if (ret) goto err; @@ -1100,7 +1090,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans, struct bch_inode_unpacked inode; int ret; - k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_CACHED); + k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_cached); ret = bkey_err(k); if (ret) return ret; @@ -1152,7 +1142,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans, inode.bi_flags &= ~BCH_INODE_unlinked; ret = bch2_inode_write_flags(trans, &inode_iter, &inode, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); bch_err_msg(c, ret, "clearing inode unlinked flag"); if (ret) goto out; @@ -1199,7 +1189,7 @@ again: * flushed and we'd spin: */ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ ret = may_delete_deleted_inode(trans, &iter, 
k.k->p, &need_another_pass); if (ret > 0) { diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 0562980505..679f5f5e5d 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -6,19 +6,20 @@ #include "bkey_methods.h" #include "opts.h" -enum bkey_invalid_flags; +enum bch_validate_flags; extern const char * const bch2_inode_opts[]; int bch2_inode_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); int bch2_inode_v2_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); int bch2_inode_v3_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_inode ((struct bkey_ops) { \ .key_invalid = bch2_inode_invalid, \ @@ -49,7 +50,7 @@ static inline bool bkey_is_inode(const struct bkey *k) } int bch2_inode_generation_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \ @@ -101,7 +102,7 @@ int bch2_inode_peek(struct btree_trans *, struct btree_iter *, struct bch_inode_unpacked *, subvol_inum, unsigned); int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *, - struct bch_inode_unpacked *, enum btree_update_flags); + struct bch_inode_unpacked *, enum btree_iter_update_trigger_flags); static inline int bch2_inode_write(struct btree_trans *trans, struct btree_iter *iter, @@ -220,6 +221,14 @@ static 
inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, int bch2_inode_nlink_inc(struct bch_inode_unpacked *); void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *); +static inline bool bch2_inode_should_have_bp(struct bch_inode_unpacked *inode) +{ + bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset; + + return S_ISDIR(inode->bi_mode) || + (!inode->bi_nlink && inode_has_bp); +} + struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *); void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, struct bch_inode_unpacked *); diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index 82f9170dab..4583c9386e 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -125,7 +125,7 @@ err_noprint: bch2_bkey_buf_exit(&old, c); if (closure_nr_remaining(&cl) != 1) { - bch2_trans_unlock(trans); + bch2_trans_unlock_long(trans); closure_sync(&cl); } @@ -198,7 +198,7 @@ int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, POS(inum.inum, start), - BTREE_ITER_INTENT); + BTREE_ITER_intent); ret = bch2_fpunch_at(trans, &iter, inum, end, i_sectors_delta); @@ -230,7 +230,7 @@ static int truncate_set_isize(struct btree_trans *trans, struct bch_inode_unpacked inode_u; int ret; - ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT) ?: + ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent) ?: (inode_u.bi_size = new_i_size, 0) ?: bch2_inode_write(trans, &iter, &inode_u); @@ -256,7 +256,7 @@ static int __bch2_resume_logged_op_truncate(struct btree_trans *trans, bch2_trans_iter_init(trans, &fpunch_iter, BTREE_ID_extents, POS(inum.inum, round_up(new_i_size, block_bytes(c)) >> 9), - BTREE_ITER_INTENT); + BTREE_ITER_intent); ret = bch2_fpunch_at(trans, &fpunch_iter, inum, U64_MAX, i_sectors_delta); bch2_trans_iter_exit(trans, &fpunch_iter); @@ -317,7 +317,7 @@ static int adjust_i_size(struct btree_trans *trans, 
subvol_inum inum, u64 offset offset <<= 9; len <<= 9; - ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT); + ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent); if (ret) return ret; @@ -365,7 +365,7 @@ static int __bch2_resume_logged_op_finsert(struct btree_trans *trans, bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, POS(inum.inum, 0), - BTREE_ITER_INTENT); + BTREE_ITER_intent); switch (op->v.state) { case LOGGED_OP_FINSERT_start: diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 8a556e6d1a..ebf39ef72f 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -84,9 +84,10 @@ struct promote_op { }; static const struct rhashtable_params bch_promote_params = { - .head_offset = offsetof(struct promote_op, hash), - .key_offset = offsetof(struct promote_op, pos), - .key_len = sizeof(struct bpos), + .head_offset = offsetof(struct promote_op, hash), + .key_offset = offsetof(struct promote_op, pos), + .key_len = sizeof(struct bpos), + .automatic_shrinking = true, }; static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, @@ -378,7 +379,7 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio bch2_bkey_buf_init(&sk); bch2_trans_iter_init(trans, &iter, rbio->data_btree, - rbio->read_pos, BTREE_ITER_SLOTS); + rbio->read_pos, BTREE_ITER_slots); retry: rbio->bio.bi_status = 0; @@ -388,7 +389,6 @@ retry: bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - bch2_trans_unlock(trans); if (!bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, @@ -487,7 +487,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, return 0; k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + BTREE_ITER_slots|BTREE_ITER_intent); if ((ret = bkey_err(k))) goto out; @@ -523,7 +523,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, goto out; ret = bch2_trans_update(trans, &iter, new, - 
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); out: bch2_trans_iter_exit(trans, &iter); return ret; @@ -541,7 +541,6 @@ static void __bch2_read_endio(struct work_struct *work) struct bch_read_bio *rbio = container_of(work, struct bch_read_bio, work); struct bch_fs *c = rbio->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); struct bio *src = &rbio->bio; struct bio *dst = &bch2_rbio_parent(rbio)->bio; struct bvec_iter dst_iter = rbio->bvec_iter; @@ -647,13 +646,15 @@ csum_err: prt_str(&buf, "data "); bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum); - bch_err_inum_offset_ratelimited(ca, - rbio->read_pos.inode, - rbio->read_pos.offset << 9, - "data %s", buf.buf); + struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; + if (ca) { + bch_err_inum_offset_ratelimited(ca, + rbio->read_pos.inode, + rbio->read_pos.offset << 9, + "data %s", buf.buf); + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + } printbuf_exit(&buf); - - bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); goto out; decompression_err: @@ -675,7 +676,7 @@ static void bch2_read_endio(struct bio *bio) struct bch_read_bio *rbio = container_of(bio, struct bch_read_bio, bio); struct bch_fs *c = rbio->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); + struct bch_dev *ca = rbio->have_ioref ? 
bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; struct workqueue_struct *wq = NULL; enum rbio_context context = RBIO_CONTEXT_NULL; @@ -687,17 +688,21 @@ static void bch2_read_endio(struct bio *bio) if (!rbio->split) rbio->bio.bi_end_io = rbio->end_io; - if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read, - rbio->read_pos.inode, - rbio->read_pos.offset, - "data read error: %s", - bch2_blk_status_to_str(bio->bi_status))) { + if (bio->bi_status) { + if (ca) { + bch_err_inum_offset_ratelimited(ca, + rbio->read_pos.inode, + rbio->read_pos.offset, + "data read error: %s", + bch2_blk_status_to_str(bio->bi_status)); + bch2_io_error(ca, BCH_MEMBER_ERROR_read); + } bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); return; } if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || - ptr_stale(ca, &rbio->pick.ptr)) { + (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) { trace_and_count(c, read_reuse_race, &rbio->bio); if (rbio->flags & BCH_READ_RETRY_IF_STALE) @@ -758,32 +763,45 @@ err: } static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, + struct bch_dev *ca, struct bkey_s_c k, struct bch_extent_ptr ptr) { struct bch_fs *c = trans->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev); struct btree_iter iter; struct printbuf buf = PRINTBUF; int ret; bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - PTR_BUCKET_POS(c, &ptr), - BTREE_ITER_CACHED); - - prt_printf(&buf, "Attempting to read from stale dirty pointer:"); - printbuf_indent_add(&buf, 2); - prt_newline(&buf); + PTR_BUCKET_POS(ca, &ptr), + BTREE_ITER_cached); - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); + u8 *gen = bucket_gen(ca, iter.pos.offset); + if (gen) { - prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset)); + prt_printf(&buf, "Attempting to read from stale dirty pointer:\n"); + printbuf_indent_add(&buf, 2); - ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); - if (!ret) { + 
bch2_bkey_val_to_text(&buf, c, k); prt_newline(&buf); + + prt_printf(&buf, "memory gen: %u", *gen); + + ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); + if (!ret) { + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, c, k); + } + } else { + prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n", + iter.pos.inode, iter.pos.offset); + printbuf_indent_add(&buf, 2); + + prt_printf(&buf, "first bucket %u nbuckets %llu\n", + ca->mi.first_bucket, ca->mi.nbuckets); + bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); } bch2_fs_inconsistent(c, "%s", buf.buf); @@ -801,7 +819,6 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, struct bch_fs *c = trans->c; struct extent_ptr_decoded pick; struct bch_read_bio *rbio = NULL; - struct bch_dev *ca = NULL; struct promote_op *promote = NULL; bool bounce = false, read_full = false, narrow_crcs = false; struct bpos data_pos = bkey_start_pos(k.k); @@ -832,7 +849,7 @@ retry_pick: goto err; } - ca = bch_dev_bkey_exists(c, pick.ptr.dev); + struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); /* * Stale dirty pointers are treated as IO errors, but @failed isn't @@ -842,9 +859,11 @@ retry_pick: */ if ((flags & BCH_READ_IN_RETRY) && !pick.ptr.cached && - unlikely(ptr_stale(ca, &pick.ptr))) { - read_from_stale_dirty_pointer(trans, k, pick.ptr); + ca && + unlikely(dev_ptr_stale(ca, &pick.ptr))) { + read_from_stale_dirty_pointer(trans, ca, k, pick.ptr); bch2_mark_io_failure(failed, &pick); + percpu_ref_put(&ca->io_ref); goto retry_pick; } @@ -859,8 +878,11 @@ retry_pick: * can happen if we retry, and the extent we were going to read * has been merged in the meantime: */ - if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) + if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) { + if (ca) + percpu_ref_put(&ca->io_ref); goto hole; + } iter.bi_size = pick.crc.compressed_size << 9; goto get_bio; @@ -965,7 +987,7 @@ get_bio: 
rbio->bvec_iter = iter; rbio->offset_into_extent= offset_into_extent; rbio->flags = flags; - rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ); + rbio->have_ioref = ca != NULL; rbio->narrow_crcs = narrow_crcs; rbio->hole = 0; rbio->retry = 0; @@ -981,6 +1003,9 @@ get_bio: rbio->promote = promote; INIT_WORK(&rbio->work, NULL); + if (flags & BCH_READ_NODECODE) + orig->pick = pick; + rbio->bio.bi_opf = orig->bio.bi_opf; rbio->bio.bi_iter.bi_sector = pick.ptr.offset; rbio->bio.bi_end_io = bch2_read_endio; @@ -995,7 +1020,7 @@ get_bio: * If it's being moved internally, we don't want to flag it as a cache * hit: */ - if (pick.ptr.cached && !(flags & BCH_READ_NODECODE)) + if (ca && pick.ptr.cached && !(flags & BCH_READ_NODECODE)) bch2_bucket_io_time_reset(trans, pick.ptr.dev, PTR_BUCKET_NR(ca, &pick.ptr), READ); @@ -1113,7 +1138,7 @@ retry: bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum.inum, bvec_iter.bi_sector, snapshot), - BTREE_ITER_SLOTS); + BTREE_ITER_slots); while (1) { unsigned bytes, sectors, offset_into_extent; enum btree_id data_btree = BTREE_ID_extents; diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 40d7df7607..05e0cbef42 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -166,7 +166,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, bch2_trans_copy_iter(&iter, extent_iter); for_each_btree_key_upto_continue_norestart(iter, - new->k.p, BTREE_ITER_SLOTS, old, ret) { + new->k.p, BTREE_ITER_slots, old, ret) { s64 sectors = min(new->k.p.offset, old.k->p.offset) - max(bkey_start_offset(&new->k), bkey_start_offset(old.k)); @@ -210,14 +210,14 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, * to be journalled - if we crash, the bi_journal_seq update will be * lost, but that's fine. 
*/ - unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL; + unsigned inode_update_flags = BTREE_UPDATE_nojournal; struct btree_iter iter; struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, SPOS(0, extent_iter->pos.inode, extent_iter->snapshot), - BTREE_ITER_CACHED); + BTREE_ITER_cached); int ret = bkey_err(k); if (unlikely(ret)) return ret; @@ -259,7 +259,7 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, } ret = bch2_trans_update(trans, &iter, &inode->k_i, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + BTREE_UPDATE_internal_snapshot_node| inode_update_flags); err: bch2_trans_iter_exit(trans, &iter); @@ -368,7 +368,7 @@ static int bch2_write_index_default(struct bch_write_op *op) bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, bkey_start_pos(&sk.k->k), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + BTREE_ITER_slots|BTREE_ITER_intent); ret = bch2_bkey_set_needs_rebalance(c, sk.k, &op->opts) ?: bch2_extent_update(trans, inum, &iter, sk.k, @@ -407,13 +407,12 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, BUG_ON(c->opts.nochanges); bkey_for_each_ptr(ptrs, ptr) { - BUG_ON(!bch2_dev_exists2(c, ptr->dev)); - - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bch_dev *ca = nocow + ? bch2_dev_have_ref(c, ptr->dev) + : bch2_dev_get_ioref(c, ptr->dev, type == BCH_DATA_btree ? READ : WRITE); if (to_entry(ptr + 1) < ptrs.end) { - n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, - GFP_NOFS, &ca->replica_set)); + n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, GFP_NOFS, &c->replica_set)); n->bio.bi_end_io = wbio->bio.bi_end_io; n->bio.bi_private = wbio->bio.bi_private; @@ -430,11 +429,12 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, n->c = c; n->dev = ptr->dev; - n->have_ioref = nocow || bch2_dev_get_ioref(ca, - type == BCH_DATA_btree ? 
READ : WRITE); + n->have_ioref = ca != NULL; n->nocow = nocow; n->submit_time = local_clock(); n->inode_offset = bkey_start_offset(&k->k); + if (nocow) + n->nocow_bucket = PTR_BUCKET_NR(ca, ptr); n->bio.bi_iter.bi_sector = ptr->offset; if (likely(n->have_ioref)) { @@ -481,7 +481,6 @@ static void bch2_write_done(struct closure *cl) static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) { struct keylist *keys = &op->insert_keys; - struct bch_extent_ptr *ptr; struct bkey_i *src, *dst = keys->keys, *n; for (src = keys->keys; src != keys->top; src = n) { @@ -650,7 +649,9 @@ static void bch2_write_endio(struct bio *bio) struct bch_write_bio *wbio = to_wbio(bio); struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL; struct bch_fs *c = wbio->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); + struct bch_dev *ca = wbio->have_ioref + ? bch2_dev_have_ref(c, wbio->dev) + : NULL; if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, op->pos.inode, @@ -661,8 +662,12 @@ static void bch2_write_endio(struct bio *bio) op->flags |= BCH_WRITE_IO_ERROR; } - if (wbio->nocow) + if (wbio->nocow) { + bch2_bucket_nocow_unlock(&c->nocow_locks, + POS(ca->dev_idx, wbio->nocow_bucket), + BUCKET_NOCOW_LOCK_UPDATE); set_bit(wbio->dev, op->devs_need_flush->d); + } if (wbio->have_ioref) { bch2_latency_acct(ca, wbio->submit_time, WRITE); @@ -1101,30 +1106,21 @@ static bool bch2_extent_is_writeable(struct bch_write_op *op, return false; e = bkey_s_c_to_extent(k); + + rcu_read_lock(); extent_for_each_ptr_decode(e, p, entry) { - if (crc_is_encoded(p.crc) || p.has_ec) + if (crc_is_encoded(p.crc) || p.has_ec) { + rcu_read_unlock(); return false; + } replicas += bch2_extent_ptr_durability(c, &p); } + rcu_read_unlock(); return replicas >= op->opts.data_replicas; } -static inline void bch2_nocow_write_unlock(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - - for_each_keylist_key(&op->insert_keys, k) { - struct bkey_ptrs_c ptrs = 
bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); - - bkey_for_each_ptr(ptrs, ptr) - bch2_bucket_nocow_unlock(&c->nocow_locks, - PTR_BUCKET_POS(c, ptr), - BUCKET_NOCOW_LOCK_UPDATE); - } -} - static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *orig, @@ -1158,7 +1154,7 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, return bch2_extent_update_i_size_sectors(trans, iter, min(new->k.p.offset << 9, new_i_size), 0) ?: bch2_trans_update(trans, iter, new, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); } static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) @@ -1169,7 +1165,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) for_each_keylist_key(&op->insert_keys, orig) { int ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents, bkey_start_pos(&orig->k), orig->k.p, - BTREE_ITER_INTENT, k, + BTREE_ITER_intent, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size); })); @@ -1195,8 +1191,6 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) static void __bch2_nocow_write_done(struct bch_write_op *op) { - bch2_nocow_write_unlock(op); - if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { op->error = -EIO; } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN)) @@ -1226,7 +1220,7 @@ static void bch2_nocow_write(struct bch_write_op *op) DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets; u32 snapshot; struct bucket_to_lock *stale_at; - int ret; + int stale, ret; if (op->flags & BCH_WRITE_MOVE) return; @@ -1242,12 +1236,16 @@ retry: bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(op->pos.inode, op->pos.offset, snapshot), - BTREE_ITER_SLOTS); + BTREE_ITER_slots); while (1) { struct bio *bio = &op->wbio.bio; buckets.nr = 0; + ret = bch2_trans_relock(trans); + if (ret) + break; + k = 
bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) @@ -1267,14 +1265,15 @@ retry: /* Get iorefs before dropping btree locks: */ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr(ptrs, ptr) { - struct bpos b = PTR_BUCKET_POS(c, ptr); + struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE); + if (unlikely(!ca)) + goto err_get_ioref; + + struct bpos b = PTR_BUCKET_POS(ca, ptr); struct nocow_lock_bucket *l = bucket_nocow_lock(&c->nocow_locks, bucket_to_u64(b)); prefetch(l); - if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE))) - goto err_get_ioref; - /* XXX allocating memory with btree locks held - rare */ darray_push_gfp(&buckets, ((struct bucket_to_lock) { .b = b, .gen = ptr->gen, .l = l, @@ -1293,14 +1292,15 @@ retry: bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top); darray_for_each(buckets, i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, i->b.inode); + struct bch_dev *ca = bch2_dev_have_ref(c, i->b.inode); __bch2_bucket_nocow_lock(&c->nocow_locks, i->l, bucket_to_u64(i->b), BUCKET_NOCOW_LOCK_UPDATE); rcu_read_lock(); - bool stale = gen_after(*bucket_gen(ca, i->b.offset), i->gen); + u8 *gen = bucket_gen(ca, i->b.offset); + stale = !gen ? 
-1 : gen_after(*gen, i->gen); rcu_read_unlock(); if (unlikely(stale)) { @@ -1370,7 +1370,7 @@ err: return; err_get_ioref: darray_for_each(buckets, i) - percpu_ref_put(&bch_dev_bkey_exists(c, i->b.inode)->io_ref); + percpu_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref); /* Fall back to COW path: */ goto out; @@ -1381,8 +1381,18 @@ err_bucket_stale: break; } - /* We can retry this: */ - ret = -BCH_ERR_transaction_restart; + struct printbuf buf = PRINTBUF; + if (bch2_fs_inconsistent_on(stale < 0, c, + "pointer to invalid bucket in nocow path on device %llu\n %s", + stale_at->b.inode, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = -EIO; + } else { + /* We can retry this: */ + ret = -BCH_ERR_transaction_restart; + } + printbuf_exit(&buf); + goto err_get_ioref; } @@ -1491,7 +1501,11 @@ err: if ((op->flags & BCH_WRITE_SYNC) || (!(op->flags & BCH_WRITE_DONE) && !(op->flags & BCH_WRITE_IN_WORKER))) { - closure_sync(&op->cl); + if (closure_sync_timeout(&op->cl, HZ * 10)) { + bch2_print_allocator_stuck(c); + closure_sync(&op->cl); + } + __bch2_write_index(op); if (!(op->flags & BCH_WRITE_DONE)) @@ -1649,8 +1663,7 @@ void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) prt_bitflags(out, bch2_write_flags, op->flags); prt_newline(out); - prt_printf(out, "ref: %u", closure_nr_remaining(&op->cl)); - prt_newline(out); + prt_printf(out, "ref: %u\n", closure_nr_remaining(&op->cl)); printbuf_indent_sub(out, 2); } @@ -1658,13 +1671,14 @@ void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) void bch2_fs_io_write_exit(struct bch_fs *c) { mempool_exit(&c->bio_bounce_pages); + bioset_exit(&c->replica_set); bioset_exit(&c->bio_write); } int bch2_fs_io_write_init(struct bch_fs *c) { - if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), - BIOSET_NEED_BVECS)) + if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), BIOSET_NEED_BVECS) || + bioset_init(&c->replica_set, 4, offsetof(struct bch_write_bio, 
bio), 0)) return -BCH_ERR_ENOMEM_bio_write_init; if (mempool_init_page_pool(&c->bio_bounce_pages, diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h index c7f97c2c48..6e878a6f2f 100644 --- a/fs/bcachefs/io_write_types.h +++ b/fs/bcachefs/io_write_types.h @@ -20,6 +20,7 @@ struct bch_write_bio { u64 submit_time; u64 inode_offset; + u64 nocow_bucket; struct bch_devs_list failed; u8 dev; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index a8b08e76d0..10b19791ec 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -53,29 +53,19 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6 unsigned i = seq & JOURNAL_BUF_MASK; struct journal_buf *buf = j->buf + i; - prt_str(out, "seq:"); - prt_tab(out); - prt_printf(out, "%llu", seq); - prt_newline(out); + prt_printf(out, "seq:\t%llu\n", seq); printbuf_indent_add(out, 2); - prt_str(out, "refcount:"); - prt_tab(out); - prt_printf(out, "%u", journal_state_count(s, i)); - prt_newline(out); + prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i)); - prt_str(out, "size:"); - prt_tab(out); + prt_printf(out, "size:\t"); prt_human_readable_u64(out, vstruct_bytes(buf->data)); prt_newline(out); - prt_str(out, "expires:"); - prt_tab(out); - prt_printf(out, "%li jiffies", buf->expires - jiffies); - prt_newline(out); + prt_printf(out, "expires:\t"); + prt_printf(out, "%li jiffies\n", buf->expires - jiffies); - prt_str(out, "flags:"); - prt_tab(out); + prt_printf(out, "flags:\t"); if (buf->noflush) prt_str(out, "noflush "); if (buf->must_flush) @@ -87,9 +77,9 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6 if (buf->write_started) prt_str(out, "write_started "); if (buf->write_allocated) - prt_str(out, "write allocated "); + prt_str(out, "write_allocated "); if (buf->write_done) - prt_str(out, "write done"); + prt_str(out, "write_done"); prt_newline(out); printbuf_indent_sub(out, 2); @@ -948,7 +938,8 @@ static int 
__bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, break; } } else { - ob[nr_got] = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal, cl); + ob[nr_got] = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal, + BCH_DATA_journal, cl); ret = PTR_ERR_OR_ZERO(ob[nr_got]); if (ret) break; @@ -956,7 +947,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, ret = bch2_trans_run(c, bch2_trans_mark_metadata_bucket(trans, ca, ob[nr_got]->bucket, BCH_DATA_journal, - ca->mi.bucket_size)); + ca->mi.bucket_size, BTREE_TRIGGER_transactional)); if (ret) { bch2_open_bucket_put(c, ob[nr_got]); bch_err_msg(c, ret, "marking new journal buckets"); @@ -1036,7 +1027,8 @@ err_unblock: for (i = 0; i < nr_got; i++) bch2_trans_run(c, bch2_trans_mark_metadata_bucket(trans, ca, - bu[i], BCH_DATA_free, 0)); + bu[i], BCH_DATA_free, 0, + BTREE_TRIGGER_transactional)); err_free: if (!new_fs) for (i = 0; i < nr_got; i++) @@ -1103,7 +1095,7 @@ unlock: return ret; } -int bch2_dev_journal_alloc(struct bch_dev *ca) +int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) { unsigned nr; int ret; @@ -1125,7 +1117,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca) min(1 << 13, (1 << 24) / ca->mi.bucket_size)); - ret = __bch2_set_nr_journal_buckets(ca, nr, true, NULL); + ret = __bch2_set_nr_journal_buckets(ca, nr, new_fs, NULL); err: bch_err_fn(ca, ret); return ret; @@ -1137,7 +1129,7 @@ int bch2_fs_journal_alloc(struct bch_fs *c) if (ca->journal.nr) continue; - int ret = bch2_dev_journal_alloc(ca); + int ret = bch2_dev_journal_alloc(ca, true); if (ret) { percpu_ref_put(&ca->io_ref); return ret; @@ -1175,6 +1167,9 @@ void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) void bch2_fs_journal_stop(struct journal *j) { + if (!test_bit(JOURNAL_running, &j->flags)) + return; + bch2_journal_reclaim_stop(j); bch2_journal_flush_all_pins(j); @@ -1187,12 +1182,16 @@ void bch2_fs_journal_stop(struct journal *j) bch2_journal_meta(j); journal_quiesce(j); + 
cancel_delayed_work_sync(&j->write_work); - BUG_ON(!bch2_journal_error(j) && - test_bit(JOURNAL_REPLAY_DONE, &j->flags) && - j->last_empty_seq != journal_cur_seq(j)); + WARN(!bch2_journal_error(j) && + test_bit(JOURNAL_replay_done, &j->flags) && + j->last_empty_seq != journal_cur_seq(j), + "journal shutdown error: cur seq %llu but last empty seq %llu", + journal_cur_seq(j), j->last_empty_seq); - cancel_delayed_work_sync(&j->write_work); + if (!bch2_journal_error(j)) + clear_bit(JOURNAL_running, &j->flags); } int bch2_fs_journal_start(struct journal *j, u64 cur_seq) @@ -1266,7 +1265,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) spin_lock(&j->lock); - set_bit(JOURNAL_STARTED, &j->flags); + set_bit(JOURNAL_running, &j->flags); j->last_flush_write = jiffies; j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j); @@ -1407,6 +1406,13 @@ int bch2_fs_journal_init(struct journal *j) /* debug: */ +static const char * const bch2_journal_flags_strs[] = { +#define x(n) #n, + JOURNAL_FLAGS() +#undef x + NULL +}; + void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); @@ -1414,20 +1420,23 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) unsigned long now = jiffies; u64 nr_writes = j->nr_flush_writes + j->nr_noflush_writes; - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 24); + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 28); out->atomic++; rcu_read_lock(); s = READ_ONCE(j->reservations); + prt_printf(out, "flags:\t"); + prt_bitflags(out, bch2_journal_flags_strs, j->flags); + prt_newline(out); prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size); - prt_printf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j)); - prt_printf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk); - prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); + prt_printf(out, "seq:\t%llu\n", journal_cur_seq(j)); + 
prt_printf(out, "seq_ondisk:\t%llu\n", j->seq_ondisk); + prt_printf(out, "last_seq:\t%llu\n", journal_last_seq(j)); prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); - prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]); + prt_printf(out, "watermark:\t%s\n", bch2_watermarks[j->watermark]); prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); @@ -1436,48 +1445,44 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) prt_newline(out); prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim); prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim); - prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked); + prt_printf(out, "reclaim kicked:\t%u\n", j->reclaim_kicked); prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) ? 
jiffies_to_msecs(j->next_reclaim - jiffies) : 0); - prt_printf(out, "blocked:\t\t%u\n", j->blocked); + prt_printf(out, "blocked:\t%u\n", j->blocked); prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); - prt_printf(out, "current entry:\t\t"); + prt_printf(out, "current entry:\t"); switch (s.cur_entry_offset) { case JOURNAL_ENTRY_ERROR_VAL: - prt_printf(out, "error"); + prt_printf(out, "error\n"); break; case JOURNAL_ENTRY_CLOSED_VAL: - prt_printf(out, "closed"); + prt_printf(out, "closed\n"); break; default: - prt_printf(out, "%u/%u", s.cur_entry_offset, j->cur_entry_u64s); + prt_printf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s); break; } - prt_newline(out); - prt_printf(out, "unwritten entries:"); - prt_newline(out); + prt_printf(out, "unwritten entries:\n"); bch2_journal_bufs_to_text(out, j); - prt_printf(out, - "replay done:\t\t%i\n", - test_bit(JOURNAL_REPLAY_DONE, &j->flags)); - prt_printf(out, "space:\n"); - prt_printf(out, "\tdiscarded\t%u:%u\n", + printbuf_indent_add(out, 2); + prt_printf(out, "discarded\t%u:%u\n", j->space[journal_space_discarded].next_entry, j->space[journal_space_discarded].total); - prt_printf(out, "\tclean ondisk\t%u:%u\n", + prt_printf(out, "clean ondisk\t%u:%u\n", j->space[journal_space_clean_ondisk].next_entry, j->space[journal_space_clean_ondisk].total); - prt_printf(out, "\tclean\t\t%u:%u\n", + prt_printf(out, "clean\t%u:%u\n", j->space[journal_space_clean].next_entry, j->space[journal_space_clean].total); - prt_printf(out, "\ttotal\t\t%u:%u\n", + prt_printf(out, "total\t%u:%u\n", j->space[journal_space_total].next_entry, j->space[journal_space_total].total); + printbuf_indent_sub(out, 2); for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { struct journal_device *ja = &ca->journal; @@ -1488,14 +1493,16 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) if (!ja->nr) 
continue; - prt_printf(out, "dev %u:\n", ca->dev_idx); - prt_printf(out, "\tnr\t\t%u\n", ja->nr); - prt_printf(out, "\tbucket size\t%u\n", ca->mi.bucket_size); - prt_printf(out, "\tavailable\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free); - prt_printf(out, "\tdiscard_idx\t%u\n", ja->discard_idx); - prt_printf(out, "\tdirty_ondisk\t%u (seq %llu)\n", ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]); - prt_printf(out, "\tdirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]); - prt_printf(out, "\tcur_idx\t\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]); + prt_printf(out, "dev %u:\n", ca->dev_idx); + printbuf_indent_add(out, 2); + prt_printf(out, "nr\t%u\n", ja->nr); + prt_printf(out, "bucket size\t%u\n", ca->mi.bucket_size); + prt_printf(out, "available\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free); + prt_printf(out, "discard_idx\t%u\n", ja->discard_idx); + prt_printf(out, "dirty_ondisk\t%u (seq %llu)\n",ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]); + prt_printf(out, "dirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]); + prt_printf(out, "cur_idx\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]); + printbuf_indent_sub(out, 2); } rcu_read_unlock(); @@ -1516,6 +1523,11 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 struct journal_entry_pin *pin; spin_lock(&j->lock); + if (!test_bit(JOURNAL_running, &j->flags)) { + spin_unlock(&j->lock); + return true; + } + *seq = max(*seq, j->pin.front); if (*seq >= j->pin.back) { @@ -1527,25 +1539,18 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 pin_list = journal_seq_pin(j, *seq); - prt_printf(out, "%llu: count %u", *seq, atomic_read(&pin_list->count)); - prt_newline(out); + prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count)); 
printbuf_indent_add(out, 2); for (unsigned i = 0; i < ARRAY_SIZE(pin_list->list); i++) - list_for_each_entry(pin, &pin_list->list[i], list) { - prt_printf(out, "\t%px %ps", pin, pin->flush); - prt_newline(out); - } + list_for_each_entry(pin, &pin_list->list[i], list) + prt_printf(out, "\t%px %ps\n", pin, pin->flush); - if (!list_empty(&pin_list->flushed)) { - prt_printf(out, "flushed:"); - prt_newline(out); - } + if (!list_empty(&pin_list->flushed)) + prt_printf(out, "flushed:\n"); - list_for_each_entry(pin, &pin_list->flushed, list) { - prt_printf(out, "\t%px %ps", pin, pin->flush); - prt_newline(out); - } + list_for_each_entry(pin, &pin_list->flushed, list) + prt_printf(out, "\t%px %ps\n", pin, pin->flush); printbuf_indent_sub(out, 2); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 7c7528f839..bc6b9c39dc 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -372,7 +372,7 @@ static inline int bch2_journal_res_get(struct journal *j, struct journal_res *re int ret; EBUG_ON(res->ref); - EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); + EBUG_ON(!test_bit(JOURNAL_running, &j->flags)); res->u64s = u64s; @@ -418,8 +418,8 @@ struct bch_dev; static inline void bch2_journal_set_replay_done(struct journal *j) { - BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); - set_bit(JOURNAL_REPLAY_DONE, &j->flags); + BUG_ON(!test_bit(JOURNAL_running, &j->flags)); + set_bit(JOURNAL_replay_done, &j->flags); } void bch2_journal_unblock(struct journal *); @@ -433,7 +433,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, unsigned nr); -int bch2_dev_journal_alloc(struct bch_dev *); +int bch2_dev_journal_alloc(struct bch_dev *, bool); int bch2_fs_journal_alloc(struct bch_fs *); void bch2_dev_journal_stop(struct journal *, struct bch_dev *); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index eb1f9d6f5a..2326e2cb9c 100644 --- 
a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -17,15 +17,38 @@ #include "sb-clean.h" #include "trace.h" +void bch2_journal_pos_from_member_info_set(struct bch_fs *c) +{ + lockdep_assert_held(&c->sb_lock); + + for_each_member_device(c, ca) { + struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + + m->last_journal_bucket = cpu_to_le32(ca->journal.cur_idx); + m->last_journal_bucket_offset = cpu_to_le32(ca->mi.bucket_size - ca->journal.sectors_free); + } +} + +void bch2_journal_pos_from_member_info_resume(struct bch_fs *c) +{ + mutex_lock(&c->sb_lock); + for_each_member_device(c, ca) { + struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx); + + unsigned idx = le32_to_cpu(m.last_journal_bucket); + if (idx < ca->journal.nr) + ca->journal.cur_idx = idx; + unsigned offset = le32_to_cpu(m.last_journal_bucket_offset); + if (offset <= ca->mi.bucket_size) + ca->journal.sectors_free = ca->mi.bucket_size - offset; + } + mutex_unlock(&c->sb_lock); +} + void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct journal_replay *j) { darray_for_each(j->ptrs, i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev); - u64 offset; - - div64_u64_rem(i->sector, ca->mi.bucket_size, &offset); - if (i != j->ptrs.data) prt_printf(out, " "); prt_printf(out, "%u:%u:%u (sector %llu)", @@ -122,6 +145,10 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, struct printbuf buf = PRINTBUF; int ret = JOURNAL_ENTRY_ADD_OK; + if (!c->journal.oldest_seq_found_ondisk || + le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk) + c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq); + /* Is this entry older than the range we need? 
*/ if (!c->opts.read_entire_journal && le64_to_cpu(j->seq) < jlist->last_seq) @@ -272,7 +299,7 @@ static void journal_entry_err_msg(struct printbuf *out, journal_entry_err_msg(&_buf, version, jset, entry); \ prt_printf(&_buf, msg, ##__VA_ARGS__); \ \ - switch (flags & BKEY_INVALID_WRITE) { \ + switch (flags & BCH_VALIDATE_write) { \ case READ: \ mustfix_fsck_err(c, _err, "%s", _buf.buf); \ break; \ @@ -301,9 +328,9 @@ static int journal_validate_key(struct bch_fs *c, unsigned level, enum btree_id btree_id, struct bkey_i *k, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { - int write = flags & BKEY_INVALID_WRITE; + int write = flags & BCH_VALIDATE_write; void *next = vstruct_next(entry); struct printbuf buf = PRINTBUF; int ret = 0; @@ -376,7 +403,7 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { struct bkey_i *k = entry->start; @@ -385,9 +412,11 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c, entry->level, entry->btree_id, k, version, big_endian, - flags|BKEY_INVALID_JOURNAL); + flags|BCH_VALIDATE_journal); if (ret == FSCK_DELETED_KEY) continue; + else if (ret) + return ret; k = bkey_next(k); } @@ -416,7 +445,7 @@ static int journal_entry_btree_root_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { struct bkey_i *k = entry->start; int ret = 0; @@ -455,7 +484,7 @@ static int journal_entry_prio_ptrs_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { /* obsolete, don't care: */ return 0; @@ -470,7 +499,7 @@ static int journal_entry_blacklist_validate(struct bch_fs *c, struct jset *jset, struct 
jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { int ret = 0; @@ -497,7 +526,7 @@ static int journal_entry_blacklist_v2_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { struct jset_entry_blacklist_v2 *bl_entry; int ret = 0; @@ -539,7 +568,7 @@ static int journal_entry_usage_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { struct jset_entry_usage *u = container_of(entry, struct jset_entry_usage, entry); @@ -573,7 +602,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { struct jset_entry_data_usage *u = container_of(entry, struct jset_entry_data_usage, entry); @@ -617,7 +646,7 @@ static int journal_entry_clock_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { struct jset_entry_clock *clock = container_of(entry, struct jset_entry_clock, entry); @@ -657,13 +686,12 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { struct jset_entry_dev_usage *u = container_of(entry, struct jset_entry_dev_usage, entry); unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); unsigned expected = sizeof(*u); - unsigned dev; int ret = 0; if (journal_entry_err_on(bytes < expected, @@ -675,16 +703,6 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c, return ret; } - dev = le32_to_cpu(u->dev); - - if 
(journal_entry_err_on(!bch2_dev_exists2(c, dev), - c, version, jset, entry, - journal_entry_dev_usage_bad_dev, - "bad dev")) { - journal_entry_null_range(entry, vstruct_next(entry)); - return ret; - } - if (journal_entry_err_on(u->pad, c, version, jset, entry, journal_entry_dev_usage_bad_pad, @@ -719,7 +737,7 @@ static int journal_entry_log_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { return 0; } @@ -737,7 +755,7 @@ static int journal_entry_overwrite_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { return journal_entry_btree_keys_validate(c, jset, entry, version, big_endian, READ); @@ -753,7 +771,7 @@ static int journal_entry_write_buffer_keys_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { return journal_entry_btree_keys_validate(c, jset, entry, version, big_endian, READ); @@ -769,7 +787,7 @@ static int journal_entry_datetime_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { unsigned bytes = vstruct_bytes(entry); unsigned expected = 16; @@ -799,7 +817,7 @@ static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs * struct jset_entry_ops { int (*validate)(struct bch_fs *, struct jset *, struct jset_entry *, unsigned, int, - enum bkey_invalid_flags); + enum bch_validate_flags); void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); }; @@ -817,7 +835,7 @@ int bch2_journal_entry_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bkey_invalid_flags flags) + enum 
bch_validate_flags flags) { return entry->type < BCH_JSET_ENTRY_NR ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry, @@ -837,7 +855,7 @@ void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, } static int jset_validate_entries(struct bch_fs *c, struct jset *jset, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { unsigned version = le32_to_cpu(jset->version); int ret = 0; @@ -863,7 +881,7 @@ fsck_err: static int jset_validate(struct bch_fs *c, struct bch_dev *ca, struct jset *jset, u64 sector, - enum bkey_invalid_flags flags) + enum bch_validate_flags flags) { unsigned version; int ret = 0; @@ -918,7 +936,7 @@ static int jset_validate_early(struct bch_fs *c, { size_t bytes = vstruct_bytes(jset); unsigned version; - enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL; + enum bch_validate_flags flags = BCH_VALIDATE_journal; int ret = 0; if (le64_to_cpu(jset->magic) != jset_magic(c)) @@ -1057,6 +1075,13 @@ reread: goto err; } + if (le64_to_cpu(j->seq) > ja->highest_seq_found) { + ja->highest_seq_found = le64_to_cpu(j->seq); + ja->cur_idx = bucket; + ja->sectors_free = ca->mi.bucket_size - + bucket_remainder(ca, offset) - sectors; + } + /* * This happens sometimes if we don't have discards on - * when we've partially overwritten a bucket with new @@ -1125,8 +1150,6 @@ static CLOSURE_CALLBACK(bch2_journal_read_device) struct bch_fs *c = ca->fs; struct journal_list *jlist = container_of(cl->parent, struct journal_list, cl); - struct journal_replay *r, **_r; - struct genradix_iter iter; struct journal_read_buf buf = { NULL, 0 }; unsigned i; int ret = 0; @@ -1146,47 +1169,6 @@ static CLOSURE_CALLBACK(bch2_journal_read_device) goto err; } - ja->sectors_free = ca->mi.bucket_size; - - mutex_lock(&jlist->lock); - genradix_for_each_reverse(&c->journal_entries, iter, _r) { - r = *_r; - - if (!r) - continue; - - darray_for_each(r->ptrs, i) - if (i->dev == ca->dev_idx) { - unsigned wrote = bucket_remainder(ca, i->sector) + - 
vstruct_sectors(&r->j, c->block_bits); - - ja->cur_idx = i->bucket; - ja->sectors_free = ca->mi.bucket_size - wrote; - goto found; - } - } -found: - mutex_unlock(&jlist->lock); - - if (ja->bucket_seq[ja->cur_idx] && - ja->sectors_free == ca->mi.bucket_size) { -#if 0 - /* - * Debug code for ZNS support, where we (probably) want to be - * correlated where we stopped in the journal to the zone write - * points: - */ - bch_err(c, "ja->sectors_free == ca->mi.bucket_size"); - bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr); - for (i = 0; i < 3; i++) { - unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr; - - bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]); - } -#endif - ja->sectors_free = 0; - } - /* * Set dirty_idx to indicate the entire journal is full and needs to be * reclaimed - journal reclaim will immediately reclaim whatever isn't @@ -1255,7 +1237,7 @@ int bch2_journal_read(struct bch_fs *c, * those entries will be blacklisted: */ genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { - enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL; + enum bch_validate_flags flags = BCH_VALIDATE_journal; i = *_i; @@ -1366,7 +1348,7 @@ int bch2_journal_read(struct bch_fs *c, fsck_err(c, journal_entries_missing, "journal entries %llu-%llu missing! 
(replaying %llu-%llu)\n" " prev at %s\n" - " next at %s", + " next at %s, continue?", missing_start, missing_end, *last_seq, *blacklist_seq - 1, buf1.buf, buf2.buf); @@ -1390,7 +1372,7 @@ int bch2_journal_read(struct bch_fs *c, continue; darray_for_each(i->ptrs, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); if (!ptr->csum_good) bch_err_dev_offset(ca, ptr->sector, @@ -1400,7 +1382,7 @@ int bch2_journal_read(struct bch_fs *c, } ret = jset_validate(c, - bch_dev_bkey_exists(c, i->ptrs.data[0].dev), + bch2_dev_have_ref(c, i->ptrs.data[0].dev), &i->j, i->ptrs.data[0].sector, READ); @@ -1697,6 +1679,13 @@ static CLOSURE_CALLBACK(journal_write_done) mod_delayed_work(j->wq, &j->write_work, max(0L, delta)); } + /* + * We don't typically trigger journal writes from here - the next journal + * write will be triggered immediately after the previous one is + * allocated, in bch2_journal_write() - but the journal write error path + * is special: + */ + bch2_journal_do_writes(j); spin_unlock(&j->lock); } @@ -1731,10 +1720,8 @@ static CLOSURE_CALLBACK(journal_write_submit) unsigned sectors = vstruct_sectors(w->data, c->block_bits); extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct journal_device *ja = &ca->journal; - - if (!percpu_ref_tryget(&ca->io_ref)) { + struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE); + if (!ca) { /* XXX: fix this */ bch_err(c, "missing device for journal write\n"); continue; @@ -1743,6 +1730,7 @@ static CLOSURE_CALLBACK(journal_write_submit) this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], sectors); + struct journal_device *ja = &ca->journal; struct bio *bio = &ja->bio[w->idx]->bio; bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = ptr->offset; @@ -1776,11 +1764,13 @@ static CLOSURE_CALLBACK(journal_write_preflush) if (j->seq_ondisk + 1 != 
le64_to_cpu(w->data->seq)) { spin_lock(&j->lock); - closure_wait(&j->async_wait, cl); + if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { + closure_wait(&j->async_wait, cl); + spin_unlock(&j->lock); + continue_at(cl, journal_write_preflush, j->wq); + return; + } spin_unlock(&j->lock); - - continue_at(cl, journal_write_preflush, j->wq); - return; } if (w->separate_flush) { @@ -1958,14 +1948,14 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf * * So if we're in an error state, and we're still starting up, we don't * write anything at all. */ - if (error && test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags)) + if (error && test_bit(JOURNAL_need_flush_write, &j->flags)) return -EIO; if (error || w->noflush || (!w->must_flush && (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && - test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) { + test_bit(JOURNAL_may_skip_flush, &j->flags))) { w->noflush = true; SET_JSET_NO_FLUSH(w->data, true); w->data->last_seq = 0; @@ -1976,7 +1966,7 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf * w->must_flush = true; j->last_flush_write = jiffies; j->nr_flush_writes++; - clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags); + clear_bit(JOURNAL_need_flush_write, &j->flags); } return 0; @@ -1988,7 +1978,6 @@ CLOSURE_CALLBACK(bch2_journal_write) struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_replicas_padded replicas; - struct printbuf journal_debug_buf = PRINTBUF; unsigned nr_rw_members = 0; int ret; @@ -2032,11 +2021,15 @@ CLOSURE_CALLBACK(bch2_journal_write) } if (ret) { - __bch2_journal_debug_to_text(&journal_debug_buf, j); + struct printbuf buf = PRINTBUF; + buf.atomic++; + + prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write: %s"), + bch2_err_str(ret)); + __bch2_journal_debug_to_text(&buf, j); spin_unlock(&j->lock); - bch_err(c, "Unable to allocate 
journal write:\n%s", - journal_debug_buf.buf); - printbuf_exit(&journal_debug_buf); + bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); goto err; } diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index 4f1e763ab5..2ca9cde30e 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -4,6 +4,9 @@ #include "darray.h" +void bch2_journal_pos_from_member_info_set(struct bch_fs *); +void bch2_journal_pos_from_member_info_resume(struct bch_fs *); + struct journal_ptr { bool csum_good; u8 dev; @@ -60,7 +63,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, int bch2_journal_entry_validate(struct bch_fs *, struct jset *, struct jset_entry *, unsigned, int, - enum bkey_invalid_flags); + enum bch_validate_flags); void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, struct jset_entry *); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 04a577848b..79be0eaddf 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -67,7 +67,7 @@ void bch2_journal_set_watermark(struct journal *j) track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb)) trace_and_count(c, journal_full, c); - mod_bit(JOURNAL_SPACE_LOW, &j->flags, low_on_space || low_on_pin); + mod_bit(JOURNAL_space_low, &j->flags, low_on_space || low_on_pin); swap(watermark, j->watermark); if (watermark > j->watermark) @@ -225,9 +225,9 @@ void bch2_journal_space_available(struct journal *j) j->space[journal_space_clean_ondisk].total) && (clean - clean_ondisk <= total / 8) && (clean_ondisk * 2 > clean)) - set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); + set_bit(JOURNAL_may_skip_flush, &j->flags); else - clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); + clear_bit(JOURNAL_may_skip_flush, &j->flags); bch2_journal_set_watermark(j); out: @@ -818,7 +818,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, * If journal replay hasn't completed, the 
unreplayed journal entries * hold refs on their corresponding sequence numbers */ - ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) || + ret = !test_bit(JOURNAL_replay_done, &j->flags) || journal_last_seq(j) > seq_to_flush || !fifo_used(&j->pin); @@ -833,7 +833,7 @@ bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) /* time_stats this */ bool did_work = false; - if (!test_bit(JOURNAL_STARTED, &j->flags)) + if (!test_bit(JOURNAL_running, &j->flags)) return false; closure_wait_event(&j->async_wait, diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c index ae4fb8c3a2..db80e506e3 100644 --- a/fs/bcachefs/journal_sb.c +++ b/fs/bcachefs/journal_sb.c @@ -16,9 +16,8 @@ static int u64_cmp(const void *_l, const void *_r) return cmp_int(*l, *r); } -static int bch2_sb_journal_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) +static int bch2_sb_journal_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_journal *journal = field_to_type(f, journal); struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); @@ -99,9 +98,8 @@ static int u64_range_cmp(const void *_l, const void *_r) return cmp_int(l->start, r->start); } -static int bch2_sb_journal_v2_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) +static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index 37a024e034..1f25c111c5 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "btree_iter.h" #include "eytzinger.h" +#include "journal.h" #include 
"journal_seq_blacklist.h" #include "super-io.h" @@ -162,9 +162,8 @@ int bch2_blacklist_table_initialize(struct bch_fs *c) return 0; } -static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) +static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_journal_seq_blacklist *bl = field_to_type(f, journal_seq_blacklist); @@ -217,78 +216,40 @@ const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { .to_text = bch2_sb_journal_seq_blacklist_to_text }; -void bch2_blacklist_entries_gc(struct work_struct *work) +bool bch2_blacklist_entries_gc(struct bch_fs *c) { - struct bch_fs *c = container_of(work, struct bch_fs, - journal_seq_blacklist_gc_work); - struct journal_seq_blacklist_table *t; - struct bch_sb_field_journal_seq_blacklist *bl; struct journal_seq_blacklist_entry *src, *dst; - struct btree_trans *trans = bch2_trans_get(c); - unsigned i, nr, new_nr; - int ret; - - for (i = 0; i < BTREE_ID_NR; i++) { - struct btree_iter iter; - struct btree *b; - - bch2_trans_node_iter_init(trans, &iter, i, POS_MIN, - 0, 0, BTREE_ITER_PREFETCH); -retry: - bch2_trans_begin(trans); - - b = bch2_btree_iter_peek_node(&iter); - - while (!(ret = PTR_ERR_OR_ZERO(b)) && - b && - !test_bit(BCH_FS_stopping, &c->flags)) - b = bch2_btree_iter_next_node(&iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - bch2_trans_iter_exit(trans, &iter); - } - - bch2_trans_put(trans); - if (ret) - return; - - mutex_lock(&c->sb_lock); - bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); + struct bch_sb_field_journal_seq_blacklist *bl = + bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); if (!bl) - goto out; + return false; - nr = blacklist_nr_entries(bl); + unsigned nr = blacklist_nr_entries(bl); dst = bl->start; - t = c->journal_seq_blacklist_table; + struct 
journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; BUG_ON(nr != t->nr); - for (src = bl->start, i = eytzinger0_first(t->nr); + unsigned i; + for (src = bl->start, i = t->nr == 0 ? 0 : eytzinger0_first(t->nr); src < bl->start + nr; src++, i = eytzinger0_next(i, nr)) { BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); - if (t->entries[i].dirty) + if (t->entries[i].dirty || t->entries[i].end >= c->journal.oldest_seq_found_ondisk) *dst++ = *src; } - new_nr = dst - bl->start; - - bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr); - - if (new_nr != nr) { - bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, - new_nr ? sb_blacklist_u64s(new_nr) : 0); - BUG_ON(new_nr && !bl); + unsigned new_nr = dst - bl->start; + if (new_nr == nr) + return false; - if (!new_nr) - c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3)); + bch_verbose(c, "nr blacklist entries was %u, now %u", nr, new_nr); - bch2_write_super(c); - } -out: - mutex_unlock(&c->sb_lock); + bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, + new_nr ? 
sb_blacklist_u64s(new_nr) : 0); + BUG_ON(new_nr && !bl); + return true; } diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h index afb886ec8e..d47636f96f 100644 --- a/fs/bcachefs/journal_seq_blacklist.h +++ b/fs/bcachefs/journal_seq_blacklist.h @@ -17,6 +17,6 @@ int bch2_blacklist_table_initialize(struct bch_fs *); extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist; -void bch2_blacklist_entries_gc(struct work_struct *); +bool bch2_blacklist_entries_gc(struct bch_fs *); #endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ diff --git a/fs/bcachefs/journal_seq_blacklist_format.h b/fs/bcachefs/journal_seq_blacklist_format.h new file mode 100644 index 0000000000..2566b12dbc --- /dev/null +++ b/fs/bcachefs/journal_seq_blacklist_format.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H +#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H + +struct journal_seq_blacklist_entry { + __le64 start; + __le64 end; +}; + +struct bch_sb_field_journal_seq_blacklist { + struct bch_sb_field field; + struct journal_seq_blacklist_entry start[]; +}; + +#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H */ diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index b5161b5d76..19183fcf7a 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -129,12 +129,17 @@ enum journal_space_from { journal_space_nr, }; +#define JOURNAL_FLAGS() \ + x(replay_done) \ + x(running) \ + x(may_skip_flush) \ + x(need_flush_write) \ + x(space_low) + enum journal_flags { - JOURNAL_REPLAY_DONE, - JOURNAL_STARTED, - JOURNAL_MAY_SKIP_FLUSH, - JOURNAL_NEED_FLUSH_WRITE, - JOURNAL_SPACE_LOW, +#define x(n) JOURNAL_##n, + JOURNAL_FLAGS() +#undef x }; /* Reasons we may fail to get a journal reservation: */ @@ -229,6 +234,7 @@ struct journal { u64 last_seq_ondisk; u64 err_seq; u64 last_empty_seq; + u64 oldest_seq_found_ondisk; /* * FIFO of journal entries whose btree 
updates have not yet been @@ -326,6 +332,7 @@ struct journal_device { /* for bch_journal_read_device */ struct closure read; + u64 highest_seq_found; }; /* diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c index b82f820904..f49fdca1d0 100644 --- a/fs/bcachefs/logged_ops.c +++ b/fs/bcachefs/logged_ops.c @@ -56,7 +56,7 @@ int bch2_resume_logged_ops(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key(trans, iter, BTREE_ID_logged_ops, POS_MIN, - BTREE_ITER_PREFETCH, k, + BTREE_ITER_prefetch, k, resume_logged_op(trans, &iter, k))); bch_err_fn(c, ret); return ret; diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 26569043e3..b12894ef44 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -11,7 +11,7 @@ /* KEY_TYPE_lru is obsolete: */ int bch2_lru_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { int ret = 0; @@ -77,6 +77,45 @@ static const char * const bch2_lru_types[] = { NULL }; +int bch2_lru_check_set(struct btree_trans *trans, + u16 lru_id, u64 time, + struct bkey_s_c referring_k, + struct bkey_buf *last_flushed) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + struct btree_iter lru_iter; + struct bkey_s_c lru_k = + bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru, + lru_pos(lru_id, + bucket_to_u64(referring_k.k->p), + time), 0); + int ret = bkey_err(lru_k); + if (ret) + return ret; + + if (lru_k.k->type != KEY_TYPE_set) { + ret = bch2_btree_write_buffer_maybe_flush(trans, referring_k, last_flushed); + if (ret) + goto err; + + if (fsck_err(c, alloc_key_to_missing_lru_entry, + "missing %s lru entry\n" + " %s", + bch2_lru_types[lru_type(lru_k)], + (bch2_bkey_val_to_text(&buf, c, referring_k), buf.buf))) { + ret = bch2_lru_set(trans, lru_id, bucket_to_u64(referring_k.k->p), time); + if (ret) + goto err; + } + } +err: +fsck_err: + bch2_trans_iter_exit(trans, &lru_iter); + printbuf_exit(&buf); + return ret; +} + static int 
bch2_check_lru_key(struct btree_trans *trans, struct btree_iter *lru_iter, struct bkey_s_c lru_k, @@ -149,7 +188,7 @@ int bch2_check_lrus(struct bch_fs *c) struct bpos last_flushed_pos = POS_MIN; int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, - BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k, + BTREE_ID_lru, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw, bch2_check_lru_key(trans, &iter, k, &last_flushed_pos))); bch_err_fn(c, ret); diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index 429dca816d..ed75bcf59d 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -2,9 +2,6 @@ #ifndef _BCACHEFS_LRU_H #define _BCACHEFS_LRU_H -#define LRU_TIME_BITS 48 -#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1) - static inline u64 lru_pos_id(struct bpos pos) { return pos.inode >> LRU_TIME_BITS; @@ -49,7 +46,7 @@ static inline enum bch_lru_type lru_type(struct bkey_s_c l) } int bch2_lru_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_lru_pos_to_text(struct printbuf *, struct bpos); @@ -64,6 +61,9 @@ int bch2_lru_del(struct btree_trans *, u16, u64, u64); int bch2_lru_set(struct btree_trans *, u16, u64, u64); int bch2_lru_change(struct btree_trans *, u16, u64, u64, u64); +struct bkey_buf; +int bch2_lru_check_set(struct btree_trans *, u16, u64, struct bkey_s_c, struct bkey_buf *); + int bch2_check_lrus(struct bch_fs *); #endif /* _BCACHEFS_LRU_H */ diff --git a/fs/bcachefs/mean_and_variance_test.c b/fs/bcachefs/mean_and_variance_test.c index 4c298e7472..e9d9c0212e 100644 --- a/fs/bcachefs/mean_and_variance_test.c +++ b/fs/bcachefs/mean_and_variance_test.c @@ -217,4 +217,5 @@ static struct kunit_suite mean_and_variance_test_suite = { kunit_test_suite(mean_and_variance_test_suite); MODULE_AUTHOR("Daniel B. 
Hill"); +MODULE_DESCRIPTION("bcachefs filesystem mean and variance unit tests"); MODULE_LICENSE("GPL"); diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 69098eeb5d..ddc187fb69 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -49,7 +49,7 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, if (!bch2_bkey_has_device_c(k, dev_idx)) return 0; - n = bch2_bkey_make_mut(trans, iter, &k, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + n = bch2_bkey_make_mut(trans, iter, &k, BTREE_UPDATE_internal_snapshot_node); ret = PTR_ERR_OR_ZERO(n); if (ret) return ret; @@ -67,7 +67,7 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, /* * Since we're not inserting through an extent iterator - * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), + * (BTREE_ITER_all_snapshots iterators aren't extent iterators), * we aren't using the extent overwrite path to delete, we're * just using the normal key deletion path: */ @@ -87,7 +87,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) continue; ret = for_each_btree_key_commit(trans, iter, id, POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags)); if (ret) @@ -119,7 +119,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) for (id = 0; id < BTREE_ID_NR; id++) { bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0, - BTREE_ITER_PREFETCH); + BTREE_ITER_prefetch); retry: ret = 0; while (bch2_trans_begin(trans), diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 4d94b7742d..e714e3bd5b 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -36,36 +36,6 @@ const char * const bch2_data_ops_strs[] = { NULL }; -static void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, - struct bch_io_opts *io_opts, - struct data_update_opts 
*data_opts) -{ - printbuf_tabstop_push(out, 20); - prt_str(out, "rewrite ptrs:"); - prt_tab(out); - bch2_prt_u64_base2(out, data_opts->rewrite_ptrs); - prt_newline(out); - - prt_str(out, "kill ptrs: "); - prt_tab(out); - bch2_prt_u64_base2(out, data_opts->kill_ptrs); - prt_newline(out); - - prt_str(out, "target: "); - prt_tab(out); - bch2_target_to_text(out, c, data_opts->target); - prt_newline(out); - - prt_str(out, "compression: "); - prt_tab(out); - bch2_compression_opt_to_text(out, background_compression(*io_opts)); - prt_newline(out); - - prt_str(out, "extra replicas: "); - prt_tab(out); - prt_u64(out, data_opts->extra_replicas); -} - static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) @@ -421,7 +391,7 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, io_opts->d.nr = 0; ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode), - BTREE_ITER_ALL_SNAPSHOTS, k, ({ + BTREE_ITER_all_snapshots, k, ({ if (k.k->p.offset != extent_k.k->p.inode) break; @@ -467,7 +437,7 @@ int bch2_move_get_io_opts_one(struct btree_trans *trans, k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot), - BTREE_ITER_CACHED); + BTREE_ITER_cached); ret = bkey_err(k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) return ret; @@ -552,9 +522,10 @@ static int bch2_move_data_btree(struct moving_context *ctxt, ctxt->stats->pos = BBPOS(btree_id, start); } + bch2_trans_begin(trans); bch2_trans_iter_init(trans, &iter, btree_id, start, - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS); + BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots); if (ctxt->rate) bch2_ratelimit_reset(ctxt->rate); @@ -695,6 +666,10 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, struct bpos bp_pos = POS_MIN; int ret = 0; + struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode); + if (!ca) + return 0; + trace_bucket_evacuate(c, 
&bucket); bch2_bkey_buf_init(&sk); @@ -705,7 +680,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, bch2_trans_begin(trans); bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - bucket, BTREE_ITER_CACHED); + bucket, BTREE_ITER_cached); ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); bch2_trans_iter_exit(trans, &iter); @@ -716,7 +691,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, a = bch2_alloc_to_v4(k, &a_convert); dirty_sectors = bch2_bucket_sectors_dirty(*a); - bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size; + bucket_size = ca->mi.bucket_size; fragmentation = a->fragmentation_lru; ret = bch2_btree_write_buffer_tryflush(trans); @@ -730,9 +705,9 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, bch2_trans_begin(trans); - ret = bch2_get_next_backpointer(trans, bucket, gen, + ret = bch2_get_next_backpointer(trans, ca, bucket, gen, &bp_pos, &bp, - BTREE_ITER_CACHED); + BTREE_ITER_cached); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) @@ -828,6 +803,7 @@ next: trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret); err: + bch2_dev_put(ca); bch2_bkey_buf_exit(&sk, c); return ret; } @@ -868,7 +844,7 @@ static int bch2_move_btree(struct bch_fs *c, continue; bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0, - BTREE_ITER_PREFETCH); + BTREE_ITER_prefetch); retry: ret = 0; while (bch2_trans_begin(trans), @@ -920,7 +896,20 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg, ? 
c->opts.metadata_replicas : io_opts->data_replicas; - if (!nr_good || nr_good >= replicas) + rcu_read_lock(); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + unsigned i = 0; + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + if (!ptr->cached && + (!ca || !ca->mi.durability)) + data_opts->kill_ptrs |= BIT(i); + i++; + } + rcu_read_unlock(); + + if (!data_opts->kill_ptrs && + (!nr_good || nr_good >= replicas)) return false; data_opts->target = 0; @@ -975,26 +964,10 @@ static bool migrate_btree_pred(struct bch_fs *c, void *arg, */ static bool bformat_needs_redo(struct bkey_format *f) { - for (unsigned i = 0; i < f->nr_fields; i++) { - unsigned f_bits = f->bits_per_field[i]; - unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; - u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1)); - u64 field_offset = le64_to_cpu(f->field_offset[i]); - - if (f_bits > unpacked_bits) + for (unsigned i = 0; i < f->nr_fields; i++) + if (bch2_bkey_format_field_overflows(f, i)) return true; - if ((f_bits == unpacked_bits) && field_offset) - return true; - - u64 f_mask = f_bits - ? 
~((~0ULL << (f_bits - 1)) << 1) - : 0; - - if (((field_offset + f_mask) & unpacked_mask) < field_offset) - return true; - } - return false; } @@ -1049,6 +1022,7 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, struct extent_ptr_decoded p; unsigned i = 0; + rcu_read_lock(); bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) { unsigned d = bch2_extent_ptr_durability(c, &p); @@ -1059,6 +1033,7 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, i++; } + rcu_read_unlock(); return data_opts->kill_ptrs != 0; } @@ -1143,23 +1118,17 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) prt_newline(out); printbuf_indent_add(out, 2); - prt_str(out, "keys moved: "); - prt_u64(out, atomic64_read(&stats->keys_moved)); - prt_newline(out); - - prt_str(out, "keys raced: "); - prt_u64(out, atomic64_read(&stats->keys_raced)); - prt_newline(out); - - prt_str(out, "bytes seen: "); + prt_printf(out, "keys moved: %llu\n", atomic64_read(&stats->keys_moved)); + prt_printf(out, "keys raced: %llu\n", atomic64_read(&stats->keys_raced)); + prt_printf(out, "bytes seen: "); prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9); prt_newline(out); - prt_str(out, "bytes moved: "); + prt_printf(out, "bytes moved: "); prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9); prt_newline(out); - prt_str(out, "bytes raced: "); + prt_printf(out, "bytes raced: "); prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9); prt_newline(out); @@ -1173,19 +1142,17 @@ static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, str bch2_move_stats_to_text(out, ctxt->stats); printbuf_indent_add(out, 2); - prt_printf(out, "reads: ios %u/%u sectors %u/%u", + prt_printf(out, "reads: ios %u/%u sectors %u/%u\n", atomic_read(&ctxt->read_ios), c->opts.move_ios_in_flight, atomic_read(&ctxt->read_sectors), c->opts.move_bytes_in_flight >> 9); - prt_newline(out); - prt_printf(out, 
"writes: ios %u/%u sectors %u/%u", + prt_printf(out, "writes: ios %u/%u sectors %u/%u\n", atomic_read(&ctxt->write_ios), c->opts.move_ios_in_flight, atomic_read(&ctxt->write_sectors), c->opts.move_bytes_in_flight >> 9); - prt_newline(out); printbuf_indent_add(out, 2); diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 0d2b82d8d1..eb49dd045e 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -35,9 +35,10 @@ struct buckets_in_flight { }; static const struct rhashtable_params bch_move_bucket_params = { - .head_offset = offsetof(struct move_bucket_in_flight, hash), - .key_offset = offsetof(struct move_bucket_in_flight, bucket.k), - .key_len = sizeof(struct move_bucket_key), + .head_offset = offsetof(struct move_bucket_in_flight, hash), + .key_offset = offsetof(struct move_bucket_in_flight, bucket.k), + .key_len = sizeof(struct move_bucket_key), + .automatic_shrinking = true, }; static struct move_bucket_in_flight * @@ -84,7 +85,7 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, return 0; k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, - b->k.bucket, BTREE_ITER_CACHED); + b->k.bucket, BTREE_ITER_cached); ret = bkey_err(k); if (ret) return ret; @@ -158,6 +159,8 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret))) return ret; + bch2_trans_begin(trans); + ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru, lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0), lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX), diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 84e452835a..b197ec90d4 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -137,7 +137,7 @@ enum fsck_err_opts { x(errors, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(bch2_error_actions), \ - BCH_SB_ERROR_ACTION, BCH_ON_ERROR_ro, \ + BCH_SB_ERROR_ACTION, BCH_ON_ERROR_fix_safe, \ NULL, "Action to take on filesystem 
error") \ x(metadata_replicas, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ @@ -426,11 +426,6 @@ enum fsck_err_opts { BCH_SB_VERSION_UPGRADE, BCH_VERSION_UPGRADE_compatible, \ NULL, "Set superblock to latest version,\n" \ "allowing any new features to be used") \ - x(buckets_nouse, u8, \ - 0, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Allocate the buckets_nouse bitmap") \ x(stdio, u64, \ 0, \ OPT_UINT(0, S64_MAX), \ @@ -480,7 +475,7 @@ enum fsck_err_opts { OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ - NULL, "BTREE_ITER_PREFETCH casuse btree nodes to be\n"\ + NULL, "BTREE_ITER_prefetch casuse btree nodes to be\n"\ " prefetched sequentially") struct bch_opts { diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c index b27d229259..9f529e4c1b 100644 --- a/fs/bcachefs/printbuf.c +++ b/fs/bcachefs/printbuf.c @@ -10,35 +10,57 @@ #include "printbuf.h" -static inline unsigned printbuf_linelen(struct printbuf *buf) +static inline unsigned __printbuf_linelen(struct printbuf *buf, unsigned pos) { - return buf->pos - buf->last_newline; + return pos - buf->last_newline; } -int bch2_printbuf_make_room(struct printbuf *out, unsigned extra) +static inline unsigned printbuf_linelen(struct printbuf *buf) { - unsigned new_size; - char *buf; + return __printbuf_linelen(buf, buf->pos); +} - if (!out->heap_allocated) - return 0; +/* + * Returns spaces from start of line, if set, or 0 if unset: + */ +static inline unsigned cur_tabstop(struct printbuf *buf) +{ + return buf->cur_tabstop < buf->nr_tabstops + ? 
buf->_tabstops[buf->cur_tabstop] + : 0; +} +int bch2_printbuf_make_room(struct printbuf *out, unsigned extra) +{ /* Reserved space for terminating nul: */ extra += 1; - if (out->pos + extra < out->size) + if (out->pos + extra <= out->size) return 0; - new_size = roundup_pow_of_two(out->size + extra); + if (!out->heap_allocated) { + out->overflow = true; + return 0; + } + + unsigned new_size = roundup_pow_of_two(out->size + extra); + + /* Sanity check... */ + if (new_size > PAGE_SIZE << MAX_PAGE_ORDER) { + out->allocation_failure = true; + out->overflow = true; + return -ENOMEM; + } /* * Note: output buffer must be freeable with kfree(), it's not required * that the user use printbuf_exit(). */ - buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_NOWAIT); + char *buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_NOWAIT); if (!buf) { out->allocation_failure = true; + out->overflow = true; return -ENOMEM; } @@ -47,6 +69,92 @@ int bch2_printbuf_make_room(struct printbuf *out, unsigned extra) return 0; } +static void printbuf_advance_pos(struct printbuf *out, unsigned len) +{ + out->pos += min(len, printbuf_remaining(out)); +} + +static void printbuf_insert_spaces(struct printbuf *out, unsigned pos, unsigned nr) +{ + unsigned move = out->pos - pos; + + bch2_printbuf_make_room(out, nr); + + if (pos + nr < out->size) + memmove(out->buf + pos + nr, + out->buf + pos, + min(move, out->size - 1 - pos - nr)); + + if (pos < out->size) + memset(out->buf + pos, ' ', min(nr, out->size - pos)); + + printbuf_advance_pos(out, nr); + printbuf_nul_terminate_reserved(out); +} + +static void __printbuf_do_indent(struct printbuf *out, unsigned pos) +{ + while (true) { + int pad; + unsigned len = out->pos - pos; + char *p = out->buf + pos; + char *n = memscan(p, '\n', len); + if (cur_tabstop(out)) { + n = min(n, (char *) memscan(p, '\r', len)); + n = min(n, (char *) memscan(p, '\t', len)); + } + + pos = n - out->buf; + if (pos == out->pos) + break; + + 
switch (*n) { + case '\n': + pos++; + out->last_newline = pos; + + printbuf_insert_spaces(out, pos, out->indent); + + pos = min(pos + out->indent, out->pos); + out->last_field = pos; + out->cur_tabstop = 0; + break; + case '\r': + memmove(n, n + 1, out->pos - pos); + --out->pos; + pad = (int) cur_tabstop(out) - (int) __printbuf_linelen(out, pos); + if (pad > 0) { + printbuf_insert_spaces(out, out->last_field, pad); + pos += pad; + } + + out->last_field = pos; + out->cur_tabstop++; + break; + case '\t': + pad = (int) cur_tabstop(out) - (int) __printbuf_linelen(out, pos) - 1; + if (pad > 0) { + *n = ' '; + printbuf_insert_spaces(out, pos, pad - 1); + pos += pad; + } else { + memmove(n, n + 1, out->pos - pos); + --out->pos; + } + + out->last_field = pos; + out->cur_tabstop++; + break; + } + } +} + +static inline void printbuf_do_indent(struct printbuf *out, unsigned pos) +{ + if (out->has_indent_or_tabstops && !out->suppress_indent_tabstop_handling) + __printbuf_do_indent(out, pos); +} + void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args) { int len; @@ -55,14 +163,14 @@ void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args) va_list args2; va_copy(args2, args); - len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args2); + len = vsnprintf(out->buf + out->pos, printbuf_remaining_size(out), fmt, args2); va_end(args2); - } while (len + 1 >= printbuf_remaining(out) && - !bch2_printbuf_make_room(out, len + 1)); + } while (len > printbuf_remaining(out) && + !bch2_printbuf_make_room(out, len)); - len = min_t(size_t, len, - printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0); - out->pos += len; + unsigned indent_pos = out->pos; + printbuf_advance_pos(out, len); + printbuf_do_indent(out, indent_pos); } void bch2_prt_printf(struct printbuf *out, const char *fmt, ...) @@ -72,14 +180,14 @@ void bch2_prt_printf(struct printbuf *out, const char *fmt, ...) 
do { va_start(args, fmt); - len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args); + len = vsnprintf(out->buf + out->pos, printbuf_remaining_size(out), fmt, args); va_end(args); - } while (len + 1 >= printbuf_remaining(out) && - !bch2_printbuf_make_room(out, len + 1)); + } while (len > printbuf_remaining(out) && + !bch2_printbuf_make_room(out, len)); - len = min_t(size_t, len, - printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0); - out->pos += len; + unsigned indent_pos = out->pos; + printbuf_advance_pos(out, len); + printbuf_do_indent(out, indent_pos); } /** @@ -194,33 +302,20 @@ void bch2_printbuf_indent_sub(struct printbuf *buf, unsigned spaces) void bch2_prt_newline(struct printbuf *buf) { - unsigned i; - bch2_printbuf_make_room(buf, 1 + buf->indent); - __prt_char(buf, '\n'); + __prt_char_reserved(buf, '\n'); buf->last_newline = buf->pos; - for (i = 0; i < buf->indent; i++) - __prt_char(buf, ' '); + __prt_chars_reserved(buf, ' ', buf->indent); - printbuf_nul_terminate(buf); + printbuf_nul_terminate_reserved(buf); buf->last_field = buf->pos; buf->cur_tabstop = 0; } -/* - * Returns spaces from start of line, if set, or 0 if unset: - */ -static inline unsigned cur_tabstop(struct printbuf *buf) -{ - return buf->cur_tabstop < buf->nr_tabstops - ? 
buf->_tabstops[buf->cur_tabstop] - : 0; -} - static void __prt_tab(struct printbuf *out) { int spaces = max_t(int, 0, cur_tabstop(out) - printbuf_linelen(out)); @@ -247,24 +342,9 @@ void bch2_prt_tab(struct printbuf *out) static void __prt_tab_rjust(struct printbuf *buf) { - unsigned move = buf->pos - buf->last_field; int pad = (int) cur_tabstop(buf) - (int) printbuf_linelen(buf); - - if (pad > 0) { - bch2_printbuf_make_room(buf, pad); - - if (buf->last_field + pad < buf->size) - memmove(buf->buf + buf->last_field + pad, - buf->buf + buf->last_field, - min(move, buf->size - 1 - buf->last_field - pad)); - - if (buf->last_field < buf->size) - memset(buf->buf + buf->last_field, ' ', - min((unsigned) pad, buf->size - buf->last_field)); - - buf->pos += pad; - printbuf_nul_terminate(buf); - } + if (pad > 0) + printbuf_insert_spaces(buf, buf->last_field, pad); buf->last_field = buf->pos; buf->cur_tabstop++; @@ -301,41 +381,9 @@ void bch2_prt_tab_rjust(struct printbuf *buf) */ void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned count) { - const char *unprinted_start = str; - const char *end = str + count; - - if (!out->has_indent_or_tabstops || out->suppress_indent_tabstop_handling) { - prt_bytes(out, str, count); - return; - } - - while (str != end) { - switch (*str) { - case '\n': - prt_bytes(out, unprinted_start, str - unprinted_start); - unprinted_start = str + 1; - bch2_prt_newline(out); - break; - case '\t': - if (likely(cur_tabstop(out))) { - prt_bytes(out, unprinted_start, str - unprinted_start); - unprinted_start = str + 1; - __prt_tab(out); - } - break; - case '\r': - if (likely(cur_tabstop(out))) { - prt_bytes(out, unprinted_start, str - unprinted_start); - unprinted_start = str + 1; - __prt_tab_rjust(out); - } - break; - } - - str++; - } - - prt_bytes(out, unprinted_start, str - unprinted_start); + unsigned indent_pos = out->pos; + prt_bytes(out, str, count); + printbuf_do_indent(out, indent_pos); } /** @@ -348,9 +396,10 @@ void 
bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned cou void bch2_prt_human_readable_u64(struct printbuf *out, u64 v) { bch2_printbuf_make_room(out, 10); - out->pos += string_get_size(v, 1, !out->si_units, - out->buf + out->pos, - printbuf_remaining_size(out)); + unsigned len = string_get_size(v, 1, !out->si_units, + out->buf + out->pos, + printbuf_remaining_size(out)); + printbuf_advance_pos(out, len); } /** @@ -402,9 +451,7 @@ void bch2_prt_string_option(struct printbuf *out, const char * const list[], size_t selected) { - size_t i; - - for (i = 0; list[i]; i++) + for (size_t i = 0; list[i]; i++) bch2_prt_printf(out, i == selected ? "[%s] " : "%s ", list[i]); } diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h index 9a4a56c409..9ecc56bc96 100644 --- a/fs/bcachefs/printbuf.h +++ b/fs/bcachefs/printbuf.h @@ -86,6 +86,7 @@ struct printbuf { u8 atomic; bool allocation_failure:1; bool heap_allocated:1; + bool overflow:1; enum printbuf_si si_units:1; bool human_readable_units:1; bool has_indent_or_tabstops:1; @@ -142,7 +143,9 @@ void bch2_prt_bitflags_vector(struct printbuf *, const char * const[], */ static inline unsigned printbuf_remaining_size(struct printbuf *out) { - return out->pos < out->size ? out->size - out->pos : 0; + if (WARN_ON(out->size && out->pos >= out->size)) + out->pos = out->size - 1; + return out->size - out->pos; } /* @@ -151,7 +154,7 @@ static inline unsigned printbuf_remaining_size(struct printbuf *out) */ static inline unsigned printbuf_remaining(struct printbuf *out) { - return out->pos < out->size ? out->size - out->pos - 1 : 0; + return out->size ? printbuf_remaining_size(out) - 1 : 0; } static inline unsigned printbuf_written(struct printbuf *out) @@ -159,30 +162,25 @@ static inline unsigned printbuf_written(struct printbuf *out) return out->size ? 
min(out->pos, out->size - 1) : 0; } -/* - * Returns true if output was truncated: - */ -static inline bool printbuf_overflowed(struct printbuf *out) +static inline void printbuf_nul_terminate_reserved(struct printbuf *out) { - return out->pos >= out->size; + if (WARN_ON(out->size && out->pos >= out->size)) + out->pos = out->size - 1; + if (out->size) + out->buf[out->pos] = 0; } static inline void printbuf_nul_terminate(struct printbuf *out) { bch2_printbuf_make_room(out, 1); - - if (out->pos < out->size) - out->buf[out->pos] = 0; - else if (out->size) - out->buf[out->size - 1] = 0; + printbuf_nul_terminate_reserved(out); } /* Doesn't call bch2_printbuf_make_room(), doesn't nul terminate: */ static inline void __prt_char_reserved(struct printbuf *out, char c) { if (printbuf_remaining(out)) - out->buf[out->pos] = c; - out->pos++; + out->buf[out->pos++] = c; } /* Doesn't nul terminate: */ @@ -194,37 +192,34 @@ static inline void __prt_char(struct printbuf *out, char c) static inline void prt_char(struct printbuf *out, char c) { - __prt_char(out, c); - printbuf_nul_terminate(out); + bch2_printbuf_make_room(out, 2); + __prt_char_reserved(out, c); + printbuf_nul_terminate_reserved(out); } static inline void __prt_chars_reserved(struct printbuf *out, char c, unsigned n) { - unsigned i, can_print = min(n, printbuf_remaining(out)); + unsigned can_print = min(n, printbuf_remaining(out)); - for (i = 0; i < can_print; i++) + for (unsigned i = 0; i < can_print; i++) out->buf[out->pos++] = c; - out->pos += n - can_print; } static inline void prt_chars(struct printbuf *out, char c, unsigned n) { bch2_printbuf_make_room(out, n); __prt_chars_reserved(out, c, n); - printbuf_nul_terminate(out); + printbuf_nul_terminate_reserved(out); } static inline void prt_bytes(struct printbuf *out, const void *b, unsigned n) { - unsigned i, can_print; - bch2_printbuf_make_room(out, n); - can_print = min(n, printbuf_remaining(out)); + unsigned can_print = min(n, printbuf_remaining(out)); - for (i 
= 0; i < can_print; i++) + for (unsigned i = 0; i < can_print; i++) out->buf[out->pos++] = ((char *) b)[i]; - out->pos += n - can_print; printbuf_nul_terminate(out); } @@ -241,18 +236,18 @@ static inline void prt_str_indented(struct printbuf *out, const char *str) static inline void prt_hex_byte(struct printbuf *out, u8 byte) { - bch2_printbuf_make_room(out, 2); + bch2_printbuf_make_room(out, 3); __prt_char_reserved(out, hex_asc_hi(byte)); __prt_char_reserved(out, hex_asc_lo(byte)); - printbuf_nul_terminate(out); + printbuf_nul_terminate_reserved(out); } static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte) { - bch2_printbuf_make_room(out, 2); + bch2_printbuf_make_room(out, 3); __prt_char_reserved(out, hex_asc_upper_hi(byte)); __prt_char_reserved(out, hex_asc_upper_lo(byte)); - printbuf_nul_terminate(out); + printbuf_nul_terminate_reserved(out); } /** diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 556da07381..a0cca8b70e 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -20,7 +20,7 @@ static const char * const bch2_quota_counters[] = { }; static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f, - struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_quota *q = field_to_type(f, quota); @@ -60,8 +60,7 @@ const struct bch_sb_field_ops bch_sb_field_ops_quota = { }; int bch2_quota_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, - struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { int ret = 0; @@ -97,45 +96,14 @@ static void qc_info_to_text(struct printbuf *out, struct qc_info *i) printbuf_tabstops_reset(out); printbuf_tabstop_push(out, 20); - prt_str(out, "i_fieldmask"); - prt_tab(out); - prt_printf(out, "%x", i->i_fieldmask); - prt_newline(out); - - prt_str(out, "i_flags"); - prt_tab(out); - prt_printf(out, "%u", i->i_flags); - prt_newline(out); - - prt_str(out, "i_spc_timelimit"); - prt_tab(out); - 
prt_printf(out, "%u", i->i_spc_timelimit); - prt_newline(out); - - prt_str(out, "i_ino_timelimit"); - prt_tab(out); - prt_printf(out, "%u", i->i_ino_timelimit); - prt_newline(out); - - prt_str(out, "i_rt_spc_timelimit"); - prt_tab(out); - prt_printf(out, "%u", i->i_rt_spc_timelimit); - prt_newline(out); - - prt_str(out, "i_spc_warnlimit"); - prt_tab(out); - prt_printf(out, "%u", i->i_spc_warnlimit); - prt_newline(out); - - prt_str(out, "i_ino_warnlimit"); - prt_tab(out); - prt_printf(out, "%u", i->i_ino_warnlimit); - prt_newline(out); - - prt_str(out, "i_rt_spc_warnlimit"); - prt_tab(out); - prt_printf(out, "%u", i->i_rt_spc_warnlimit); - prt_newline(out); + prt_printf(out, "i_fieldmask\t%x\n", i->i_fieldmask); + prt_printf(out, "i_flags\t%u\n", i->i_flags); + prt_printf(out, "i_spc_timelimit\t%u\n", i->i_spc_timelimit); + prt_printf(out, "i_ino_timelimit\t%u\n", i->i_ino_timelimit); + prt_printf(out, "i_rt_spc_timelimit\t%u\n", i->i_rt_spc_timelimit); + prt_printf(out, "i_spc_warnlimit\t%u\n", i->i_spc_warnlimit); + prt_printf(out, "i_ino_warnlimit\t%u\n", i->i_ino_warnlimit); + prt_printf(out, "i_rt_spc_warnlimit\t%u\n", i->i_rt_spc_warnlimit); } static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q) @@ -143,60 +111,17 @@ static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q) printbuf_tabstops_reset(out); printbuf_tabstop_push(out, 20); - prt_str(out, "d_fieldmask"); - prt_tab(out); - prt_printf(out, "%x", q->d_fieldmask); - prt_newline(out); - - prt_str(out, "d_spc_hardlimit"); - prt_tab(out); - prt_printf(out, "%llu", q->d_spc_hardlimit); - prt_newline(out); - - prt_str(out, "d_spc_softlimit"); - prt_tab(out); - prt_printf(out, "%llu", q->d_spc_softlimit); - prt_newline(out); - - prt_str(out, "d_ino_hardlimit"); - prt_tab(out); - prt_printf(out, "%llu", q->d_ino_hardlimit); - prt_newline(out); - - prt_str(out, "d_ino_softlimit"); - prt_tab(out); - prt_printf(out, "%llu", q->d_ino_softlimit); - prt_newline(out); - - prt_str(out, 
"d_space"); - prt_tab(out); - prt_printf(out, "%llu", q->d_space); - prt_newline(out); - - prt_str(out, "d_ino_count"); - prt_tab(out); - prt_printf(out, "%llu", q->d_ino_count); - prt_newline(out); - - prt_str(out, "d_ino_timer"); - prt_tab(out); - prt_printf(out, "%llu", q->d_ino_timer); - prt_newline(out); - - prt_str(out, "d_spc_timer"); - prt_tab(out); - prt_printf(out, "%llu", q->d_spc_timer); - prt_newline(out); - - prt_str(out, "d_ino_warns"); - prt_tab(out); - prt_printf(out, "%i", q->d_ino_warns); - prt_newline(out); - - prt_str(out, "d_spc_warns"); - prt_tab(out); - prt_printf(out, "%i", q->d_spc_warns); - prt_newline(out); + prt_printf(out, "d_fieldmask\t%x\n", q->d_fieldmask); + prt_printf(out, "d_spc_hardlimit\t%llu\n", q->d_spc_hardlimit); + prt_printf(out, "d_spc_softlimit\t%llu\n", q->d_spc_softlimit); + prt_printf(out, "d_ino_hardlimit\t%llu\n", q->d_ino_hardlimit); + prt_printf(out, "d_ino_softlimit\t%llu\n", q->d_ino_softlimit); + prt_printf(out, "d_space\t%llu\n", q->d_space); + prt_printf(out, "d_ino_count\t%llu\n", q->d_ino_count); + prt_printf(out, "d_ino_timer\t%llu\n", q->d_ino_timer); + prt_printf(out, "d_spc_timer\t%llu\n", q->d_spc_timer); + prt_printf(out, "d_ino_warns\t%i\n", q->d_ino_warns); + prt_printf(out, "d_spc_warns\t%i\n", q->d_spc_warns); } static inline unsigned __next_qtype(unsigned i, unsigned qtypes) @@ -610,10 +535,10 @@ int bch2_fs_quota_read(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key(trans, iter, BTREE_ID_quotas, POS_MIN, - BTREE_ITER_PREFETCH, k, + BTREE_ITER_prefetch, k, __bch2_quota_set(c, k, NULL)) ?: for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, bch2_fs_quota_read_inode(trans, &iter, k))); bch_err_fn(c, ret); return ret; @@ -900,7 +825,7 @@ static int bch2_set_quota_trans(struct btree_trans *trans, int ret; k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_quotas, new_quota->k.p, - 
BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + BTREE_ITER_slots|BTREE_ITER_intent); ret = bkey_err(k); if (unlikely(ret)) return ret; diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h index 884f601f41..02d37a3322 100644 --- a/fs/bcachefs/quota.h +++ b/fs/bcachefs/quota.h @@ -5,11 +5,11 @@ #include "inode.h" #include "quota_types.h" -enum bkey_invalid_flags; +enum bch_validate_flags; extern const struct bch_sb_field_ops bch_sb_field_ops_quota; int bch2_quota_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_quota ((struct bkey_ops) { \ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 56336f3dd1..cf81e5128c 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -42,7 +42,7 @@ static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum) bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work, SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), - BTREE_ITER_INTENT); + BTREE_ITER_intent); k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) @@ -89,7 +89,7 @@ static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work, SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), - BTREE_ITER_INTENT); + BTREE_ITER_intent); k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) @@ -140,7 +140,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, bch2_trans_iter_init(trans, extent_iter, work_pos.inode ? 
BTREE_ID_extents : BTREE_ID_reflink, work_pos, - BTREE_ITER_ALL_SNAPSHOTS); + BTREE_ITER_all_snapshots); k = bch2_btree_iter_peek_slot(extent_iter); if (bkey_err(k)) return k; @@ -323,12 +323,14 @@ static int do_rebalance(struct moving_context *ctxt) struct bkey_s_c k; int ret = 0; + bch2_trans_begin(trans); + bch2_move_stats_init(&r->work_stats, "rebalance_work"); bch2_move_stats_init(&r->scan_stats, "rebalance_scan"); bch2_trans_iter_init(trans, &rebalance_work_iter, BTREE_ID_rebalance_work, POS_MIN, - BTREE_ITER_ALL_SNAPSHOTS); + BTREE_ITER_all_snapshots); while (!bch2_move_ratelimit(ctxt)) { if (!r->enabled) { diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 8091d06860..1f9d044ed9 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -35,6 +35,9 @@ void bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) { + if (btree >= BTREE_ID_NR_MAX) + return; + u64 b = BIT_ULL(btree); if (!(c->sb.btrees_lost_data & b)) { @@ -65,9 +68,20 @@ static void bch2_reconstruct_alloc(struct bch_fs *c) __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_alloc_key, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_stale_dirty_ptr, ext->errors_silent); + + __set_bit_le64(BCH_FSCK_ERR_dev_usage_buckets_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_dev_usage_sectors_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_dev_usage_fragmented_wrong, ext->errors_silent); + + __set_bit_le64(BCH_FSCK_ERR_fs_usage_btree_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_fs_usage_cached_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_fs_usage_replicas_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); 
__set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_need_discard_key_wrong, ext->errors_silent); @@ -125,9 +139,9 @@ static int bch2_journal_replay_key(struct btree_trans *trans, { struct btree_iter iter; unsigned iter_flags = - BTREE_ITER_INTENT| - BTREE_ITER_NOT_EXTENTS; - unsigned update_flags = BTREE_TRIGGER_NORUN; + BTREE_ITER_intent| + BTREE_ITER_not_extents; + unsigned update_flags = BTREE_TRIGGER_norun; int ret; if (k->overwritten) @@ -136,17 +150,17 @@ static int bch2_journal_replay_key(struct btree_trans *trans, trans->journal_res.seq = k->journal_seq; /* - * BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to + * BTREE_UPDATE_key_cache_reclaim disables key cache lookup/update to * keep the key cache coherent with the underlying btree. Nothing * besides the allocator is doing updates yet so we don't need key cache * coherency for non-alloc btrees, and key cache fills for snapshots - * btrees use BTREE_ITER_FILTER_SNAPSHOTS, which isn't available until + * btrees use BTREE_ITER_filter_snapshots, which isn't available until * the snapshots recovery pass runs. 
*/ if (!k->level && k->btree_id == BTREE_ID_alloc) - iter_flags |= BTREE_ITER_CACHED; + iter_flags |= BTREE_ITER_cached; else - update_flags |= BTREE_UPDATE_KEY_CACHE_RECLAIM; + update_flags |= BTREE_UPDATE_key_cache_reclaim; bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, BTREE_MAX_DEPTH, k->level, @@ -191,7 +205,7 @@ int bch2_journal_replay(struct bch_fs *c) struct journal *j = &c->journal; u64 start_seq = c->journal_replay_seq_start; u64 end_seq = c->journal_replay_seq_start; - struct btree_trans *trans = bch2_trans_get(c); + struct btree_trans *trans = NULL; bool immediate_flush = false; int ret = 0; @@ -205,6 +219,7 @@ int bch2_journal_replay(struct bch_fs *c) BUG_ON(!atomic_read(&keys->ref)); move_gap(keys, keys->nr); + trans = bch2_trans_get(c); /* * First, attempt to replay keys in sorted order. This is more @@ -311,6 +326,12 @@ static int journal_replay_entry_early(struct bch_fs *c, case BCH_JSET_ENTRY_btree_root: { struct btree_root *r; + if (fsck_err_on(entry->btree_id >= BTREE_ID_NR_MAX, + c, invalid_btree_id, + "invalid btree id %u (max %u)", + entry->btree_id, BTREE_ID_NR_MAX)) + return 0; + while (entry->btree_id >= c->btree_roots_extra.nr + BTREE_ID_NR) { ret = darray_push(&c->btree_roots_extra, (struct btree_root) { NULL }); if (ret) @@ -361,14 +382,17 @@ static int journal_replay_entry_early(struct bch_fs *c, case BCH_JSET_ENTRY_dev_usage: { struct jset_entry_dev_usage *u = container_of(entry, struct jset_entry_dev_usage, entry); - struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev)); - unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); - - for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) { - ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets); - ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors); - ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented); - } + unsigned nr_types = jset_entry_dev_usage_nr_types(u); + + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, 
le32_to_cpu(u->dev)); + if (ca) + for (unsigned i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) { + ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets); + ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors); + ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented); + } + rcu_read_unlock(); break; } @@ -397,7 +421,7 @@ static int journal_replay_entry_early(struct bch_fs *c, atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time)); } } - +fsck_err: return ret; } @@ -597,56 +621,54 @@ int bch2_fs_recovery(struct bch_fs *c) if (c->opts.norecovery) c->opts.recovery_pass_last = BCH_RECOVERY_PASS_journal_replay - 1; - if (!c->opts.nochanges) { - mutex_lock(&c->sb_lock); - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - bool write_sb = false; + mutex_lock(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + bool write_sb = false; - if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) { - ext->recovery_passes_required[0] |= - cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology))); - write_sb = true; - } + if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) { + ext->recovery_passes_required[0] |= + cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology))); + write_sb = true; + } - u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); - if (sb_passes) { - struct printbuf buf = PRINTBUF; - prt_str(&buf, "superblock requires following recovery passes to be run:\n "); - prt_bitflags(&buf, bch2_recovery_passes, sb_passes); - bch_info(c, "%s", buf.buf); - printbuf_exit(&buf); - } + u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + if (sb_passes) { + struct printbuf buf = PRINTBUF; + prt_str(&buf, "superblock requires following recovery passes to be run:\n "); + prt_bitflags(&buf, bch2_recovery_passes, sb_passes); + bch_info(c, "%s", 
buf.buf); + printbuf_exit(&buf); + } - if (bch2_check_version_downgrade(c)) { - struct printbuf buf = PRINTBUF; + if (bch2_check_version_downgrade(c)) { + struct printbuf buf = PRINTBUF; - prt_str(&buf, "Version downgrade required:"); + prt_str(&buf, "Version downgrade required:"); - __le64 passes = ext->recovery_passes_required[0]; - bch2_sb_set_downgrade(c, - BCH_VERSION_MINOR(bcachefs_metadata_version_current), - BCH_VERSION_MINOR(c->sb.version)); - passes = ext->recovery_passes_required[0] & ~passes; - if (passes) { - prt_str(&buf, "\n running recovery passes: "); - prt_bitflags(&buf, bch2_recovery_passes, - bch2_recovery_passes_from_stable(le64_to_cpu(passes))); - } - - bch_info(c, "%s", buf.buf); - printbuf_exit(&buf); - write_sb = true; + __le64 passes = ext->recovery_passes_required[0]; + bch2_sb_set_downgrade(c, + BCH_VERSION_MINOR(bcachefs_metadata_version_current), + BCH_VERSION_MINOR(c->sb.version)); + passes = ext->recovery_passes_required[0] & ~passes; + if (passes) { + prt_str(&buf, "\n running recovery passes: "); + prt_bitflags(&buf, bch2_recovery_passes, + bch2_recovery_passes_from_stable(le64_to_cpu(passes))); } - if (check_version_upgrade(c)) - write_sb = true; + bch_info(c, "%s", buf.buf); + printbuf_exit(&buf); + write_sb = true; + } - if (write_sb) - bch2_write_super(c); + if (check_version_upgrade(c)) + write_sb = true; - c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); - mutex_unlock(&c->sb_lock); - } + c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + + if (write_sb) + bch2_write_super(c); + mutex_unlock(&c->sb_lock); if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); @@ -660,7 +682,9 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } - if (!c->sb.clean || c->opts.fsck || c->opts.retain_recovery_info) { + 
bch2_journal_pos_from_member_info_resume(c); + + if (!c->sb.clean || c->opts.retain_recovery_info) { struct genradix_iter iter; struct journal_replay **i; @@ -793,9 +817,11 @@ use_clean: clear_bit(BCH_FS_fsck_running, &c->flags); /* fsync if we fixed errors */ - if (test_bit(BCH_FS_errors_fixed, &c->flags)) { + if (test_bit(BCH_FS_errors_fixed, &c->flags) && + bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) { bch2_journal_flush_all_pins(&c->journal); bch2_journal_meta(&c->journal); + bch2_write_ref_put(c, BCH_WRITE_REF_fsync); } /* If we fixed errors, verify that fs is actually clean now: */ @@ -832,8 +858,8 @@ use_clean: } mutex_lock(&c->sb_lock); - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - bool write_sb = false; + ext = bch2_sb_field_get(c->disk_sb.sb, ext); + write_sb = false; if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) { SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, le16_to_cpu(c->disk_sb.sb->version)); @@ -868,6 +894,9 @@ use_clean: write_sb = true; } + if (bch2_blacklist_entries_gc(c)) + write_sb = true; + if (write_sb) bch2_write_super(c); mutex_unlock(&c->sb_lock); @@ -890,10 +919,6 @@ use_clean: bch_info(c, "scanning for old btree nodes done"); } - if (c->journal_seq_blacklist_table && - c->journal_seq_blacklist_table->nr > 128) - queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work); - ret = 0; out: bch2_flush_fsck_errs(c); diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 0cec0f7d97..4a9eb9582b 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -26,11 +26,6 @@ const char * const bch2_recovery_passes[] = { NULL }; -static int bch2_check_allocations(struct bch_fs *c) -{ - return bch2_gc(c, true, false); -} - static int bch2_set_may_go_rw(struct bch_fs *c) { struct journal_keys *keys = &c->journal_keys; @@ -227,7 +222,8 @@ int bch2_run_recovery_passes(struct bch_fs *c) if (should_run_recovery_pass(c, 
c->curr_recovery_pass)) { unsigned pass = c->curr_recovery_pass; - ret = bch2_run_recovery_pass(c, c->curr_recovery_pass); + ret = bch2_run_recovery_pass(c, c->curr_recovery_pass) ?: + bch2_journal_flush(&c->journal); if (bch2_err_matches(ret, BCH_ERR_restart_recovery) || (ret && c->curr_recovery_pass < pass)) continue; diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index ff7864731a..9ac6cf21cf 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -30,7 +30,7 @@ static inline unsigned bkey_type_to_indirect(const struct bkey *k) /* reflink pointers */ int bch2_reflink_p_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); @@ -74,20 +74,20 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r } static int trans_trigger_reflink_p_segment(struct btree_trans *trans, - struct bkey_s_c_reflink_p p, - u64 *idx, unsigned flags) + struct bkey_s_c_reflink_p p, u64 *idx, + enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_i *k; __le64 *refcount; - int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; + int add = !(flags & BTREE_TRIGGER_overwrite) ? 
1 : -1; struct printbuf buf = PRINTBUF; int ret; k = bch2_bkey_get_mut_noupdate(trans, &iter, BTREE_ID_reflink, POS(0, *idx), - BTREE_ITER_WITH_UPDATES); + BTREE_ITER_with_updates); ret = PTR_ERR_OR_ZERO(k); if (ret) goto err; @@ -102,7 +102,7 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans, goto err; } - if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) { + if (!*refcount && (flags & BTREE_TRIGGER_overwrite)) { bch2_bkey_val_to_text(&buf, c, p.s_c); bch2_trans_inconsistent(trans, "indirect extent refcount underflow at %llu while marking\n %s", @@ -111,7 +111,7 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans, goto err; } - if (flags & BTREE_TRIGGER_INSERT) { + if (flags & BTREE_TRIGGER_insert) { struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; u64 pad; @@ -141,12 +141,13 @@ err: } static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans, - struct bkey_s_c_reflink_p p, - u64 *idx, unsigned flags, size_t r_idx) + struct bkey_s_c_reflink_p p, u64 *idx, + enum btree_iter_update_trigger_flags flags, + size_t r_idx) { struct bch_fs *c = trans->c; struct reflink_gc *r; - int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; + int add = !(flags & BTREE_TRIGGER_overwrite) ? 
1 : -1; u64 start = le64_to_cpu(p.v->idx); u64 end = le64_to_cpu(p.v->idx) + p.k->size; u64 next_idx = end + le32_to_cpu(p.v->back_pad); @@ -163,10 +164,13 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans, BUG_ON((s64) r->refcount + add < 0); - r->refcount += add; + if (flags & BTREE_TRIGGER_gc) + r->refcount += add; *idx = r->offset; return 0; not_found: + BUG_ON(!(flags & BTREE_TRIGGER_check_repair)); + if (fsck_err(c, reflink_p_to_missing_reflink_v, "pointer to missing indirect extent\n" " %s\n" @@ -189,7 +193,7 @@ not_found: set_bkey_val_u64s(&update->k, 0); } - ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, update, BTREE_TRIGGER_NORUN); + ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, update, BTREE_TRIGGER_norun); } *idx = next_idx; @@ -200,8 +204,8 @@ fsck_err: } static int __trigger_reflink_p(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, unsigned flags) + enum btree_id btree_id, unsigned level, struct bkey_s_c k, + enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); @@ -210,12 +214,12 @@ static int __trigger_reflink_p(struct btree_trans *trans, u64 idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); u64 end = le64_to_cpu(p.v->idx) + p.k->size + le32_to_cpu(p.v->back_pad); - if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + if (flags & BTREE_TRIGGER_transactional) { while (idx < end && !ret) ret = trans_trigger_reflink_p_segment(trans, p, &idx, flags); } - if (flags & BTREE_TRIGGER_GC) { + if (flags & (BTREE_TRIGGER_check_repair|BTREE_TRIGGER_gc)) { size_t l = 0, r = c->reflink_gc_nr; while (l < r) { @@ -238,10 +242,10 @@ int bch2_trigger_reflink_p(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { - if ((flags & BTREE_TRIGGER_TRANSACTIONAL) && - (flags & BTREE_TRIGGER_INSERT)) { + 
if ((flags & BTREE_TRIGGER_transactional) && + (flags & BTREE_TRIGGER_insert)) { struct bch_reflink_p *v = bkey_s_to_reflink_p(new).v; v->front_pad = v->back_pad = 0; @@ -253,7 +257,7 @@ int bch2_trigger_reflink_p(struct btree_trans *trans, /* indirect extents */ int bch2_reflink_v_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { return bch2_bkey_ptrs_invalid(c, k, flags, err); @@ -281,23 +285,25 @@ bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r } #endif -static inline void check_indirect_extent_deleting(struct bkey_s new, unsigned *flags) +static inline void +check_indirect_extent_deleting(struct bkey_s new, + enum btree_iter_update_trigger_flags *flags) { - if ((*flags & BTREE_TRIGGER_INSERT) && !*bkey_refcount(new)) { + if ((*flags & BTREE_TRIGGER_insert) && !*bkey_refcount(new)) { new.k->type = KEY_TYPE_deleted; new.k->size = 0; set_bkey_val_u64s(new.k, 0); - *flags &= ~BTREE_TRIGGER_INSERT; + *flags &= ~BTREE_TRIGGER_insert; } } int bch2_trigger_reflink_v(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { - if ((flags & BTREE_TRIGGER_TRANSACTIONAL) && - (flags & BTREE_TRIGGER_INSERT)) + if ((flags & BTREE_TRIGGER_transactional) && + (flags & BTREE_TRIGGER_insert)) check_indirect_extent_deleting(new, &flags); return bch2_trigger_extent(trans, btree_id, level, old, new, flags); @@ -306,7 +312,7 @@ int bch2_trigger_reflink_v(struct btree_trans *trans, /* indirect inline data */ int bch2_indirect_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { return 0; @@ -326,7 +332,7 @@ void bch2_indirect_inline_data_to_text(struct printbuf *out, int bch2_trigger_indirect_inline_data(struct btree_trans *trans, enum btree_id btree_id, unsigned 
level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { check_indirect_extent_deleting(new, &flags); @@ -349,7 +355,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data); bch2_trans_iter_init(trans, &reflink_iter, BTREE_ID_reflink, POS_MAX, - BTREE_ITER_INTENT); + BTREE_ITER_intent); k = bch2_btree_iter_peek_prev(&reflink_iter); ret = bkey_err(k); if (ret) @@ -394,7 +400,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); err: bch2_trans_iter_exit(trans, &reflink_iter); @@ -455,9 +461,9 @@ s64 bch2_remap_range(struct bch_fs *c, goto err; bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start, - BTREE_ITER_INTENT); + BTREE_ITER_intent); bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start, - BTREE_ITER_INTENT); + BTREE_ITER_intent); while ((ret == 0 || bch2_err_matches(ret, BCH_ERR_transaction_restart)) && @@ -567,7 +573,7 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_trans_begin(trans); ret2 = bch2_inode_peek(trans, &inode_iter, &inode_u, - dst_inum, BTREE_ITER_INTENT); + dst_inum, BTREE_ITER_intent); if (!ret2 && inode_u.bi_size < new_i_size) { diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index 4d88672897..e894f3a2c6 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -2,15 +2,16 @@ #ifndef _BCACHEFS_REFLINK_H #define _BCACHEFS_REFLINK_H -enum bkey_invalid_flags; +enum bch_validate_flags; int bch2_reflink_p_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, 
struct bkey_s_c); int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_reflink_p ((struct bkey_ops) { \ .key_invalid = bch2_reflink_p_invalid, \ @@ -21,11 +22,12 @@ int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned, }) int bch2_reflink_v_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \ .key_invalid = bch2_reflink_v_invalid, \ @@ -36,13 +38,13 @@ int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned, }) int bch2_indirect_inline_data_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_indirect_inline_data_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_indirect_inline_data(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, - unsigned); + enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \ .key_invalid = bch2_indirect_inline_data_invalid, \ diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 678b9c20e2..57a1f09cca 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -23,14 +23,12 @@ static int bch2_memcmp(const void *l, const void *r, const void *priv) static void verify_replicas_entry(struct bch_replicas_entry_v1 *e) { #ifdef CONFIG_BCACHEFS_DEBUG - unsigned i; - BUG_ON(e->data_type >= BCH_DATA_NR); BUG_ON(!e->nr_devs); 
BUG_ON(e->nr_required > 1 && e->nr_required >= e->nr_devs); - for (i = 0; i + 1 < e->nr_devs; i++) + for (unsigned i = 0; i + 1 < e->nr_devs; i++) BUG_ON(e->devs[i] >= e->devs[i + 1]); #endif } @@ -84,7 +82,7 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r, } for (unsigned i = 0; i < r->nr_devs; i++) - if (!bch2_dev_exists(sb, r->devs[i])) { + if (!bch2_member_exists(sb, r->devs[i])) { prt_printf(err, "invalid device %u in entry ", r->devs[i]); goto bad; } @@ -192,24 +190,17 @@ cpu_replicas_add_entry(struct bch_fs *c, struct bch_replicas_cpu *old, struct bch_replicas_entry_v1 *new_entry) { - unsigned i; struct bch_replicas_cpu new = { .nr = old->nr + 1, .entry_size = max_t(unsigned, old->entry_size, replicas_entry_bytes(new_entry)), }; - for (i = 0; i < new_entry->nr_devs; i++) - BUG_ON(!bch2_dev_exists2(c, new_entry->devs[i])); - - BUG_ON(!new_entry->data_type); - verify_replicas_entry(new_entry); - new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL); if (!new.entries) return new; - for (i = 0; i < old->nr; i++) + for (unsigned i = 0; i < old->nr; i++) memcpy(cpu_replicas_entry(&new, i), cpu_replicas_entry(old, i), old->entry_size); @@ -230,8 +221,6 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, if (unlikely(entry_size > r->entry_size)) return -1; - verify_replicas_entry(search); - #define entry_cmp(_l, _r) memcmp(_l, _r, entry_size) idx = eytzinger0_find(r->entries, r->nr, r->entry_size, entry_cmp, search); @@ -524,13 +513,16 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) c->replicas_gc.nr = 0; c->replicas_gc.entry_size = 0; - for_each_cpu_replicas_entry(&c->replicas, e) - if (!((1 << e->data_type) & typemask)) { + for_each_cpu_replicas_entry(&c->replicas, e) { + /* Preserve unknown data types */ + if (e->data_type >= BCH_DATA_NR || + !((1 << e->data_type) & typemask)) { c->replicas_gc.nr++; c->replicas_gc.entry_size = max_t(unsigned, c->replicas_gc.entry_size, replicas_entry_bytes(e)); } + } 
c->replicas_gc.entries = kcalloc(c->replicas_gc.nr, c->replicas_gc.entry_size, @@ -542,7 +534,8 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) } for_each_cpu_replicas_entry(&c->replicas, e) - if (!((1 << e->data_type) & typemask)) + if (e->data_type >= BCH_DATA_NR || + !((1 << e->data_type) & typemask)) memcpy(cpu_replicas_entry(&c->replicas_gc, i++), e, c->replicas_gc.entry_size); @@ -860,7 +853,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, } static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f, - struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); struct bch_replicas_cpu cpu_r; @@ -899,7 +892,7 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas = { }; static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f, - struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); struct bch_replicas_cpu cpu_r; @@ -947,18 +940,20 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, percpu_down_read(&c->mark_lock); for_each_cpu_replicas_entry(&c->replicas, e) { - unsigned i, nr_online = 0, nr_failed = 0, dflags = 0; + unsigned nr_online = 0, nr_failed = 0, dflags = 0; bool metadata = e->data_type < BCH_DATA_user; if (e->data_type == BCH_DATA_cached) continue; - for (i = 0; i < e->nr_devs; i++) { - struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]); - + rcu_read_lock(); + for (unsigned i = 0; i < e->nr_devs; i++) { nr_online += test_bit(e->devs[i], devs.d); - nr_failed += ca->mi.state == BCH_MEMBER_STATE_failed; + + struct bch_dev *ca = bch2_dev_rcu(c, e->devs[i]); + nr_failed += ca && ca->mi.state == BCH_MEMBER_STATE_failed; } + rcu_read_unlock(); if (nr_failed == e->nr_devs) continue; @@ -996,7 +991,7 @@ unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned 
dev) { struct bch_sb_field_replicas *replicas; struct bch_sb_field_replicas_v0 *replicas_v0; - unsigned i, data_has = 0; + unsigned data_has = 0; replicas = bch2_sb_field_get(sb, replicas); replicas_v0 = bch2_sb_field_get(sb, replicas_v0); @@ -1004,17 +999,26 @@ unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) if (replicas) { struct bch_replicas_entry_v1 *r; - for_each_replicas_entry(replicas, r) - for (i = 0; i < r->nr_devs; i++) + for_each_replicas_entry(replicas, r) { + if (r->data_type >= sizeof(data_has) * 8) + continue; + + for (unsigned i = 0; i < r->nr_devs; i++) if (r->devs[i] == dev) data_has |= 1 << r->data_type; + } + } else if (replicas_v0) { struct bch_replicas_entry_v0 *r; - for_each_replicas_entry_v0(replicas_v0, r) - for (i = 0; i < r->nr_devs; i++) + for_each_replicas_entry_v0(replicas_v0, r) { + if (r->data_type >= sizeof(data_has) * 8) + continue; + + for (unsigned i = 0; i < r->nr_devs; i++) if (r->devs[i] == dev) data_has |= 1 << r->data_type; + } } diff --git a/fs/bcachefs/replicas_format.h b/fs/bcachefs/replicas_format.h new file mode 100644 index 0000000000..b97208195d --- /dev/null +++ b/fs/bcachefs/replicas_format.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REPLICAS_FORMAT_H +#define _BCACHEFS_REPLICAS_FORMAT_H + +struct bch_replicas_entry_v0 { + __u8 data_type; + __u8 nr_devs; + __u8 devs[]; +} __packed; + +struct bch_sb_field_replicas_v0 { + struct bch_sb_field field; + struct bch_replicas_entry_v0 entries[]; +} __packed __aligned(8); + +struct bch_replicas_entry_v1 { + __u8 data_type; + __u8 nr_devs; + __u8 nr_required; + __u8 devs[]; +} __packed; + +struct bch_sb_field_replicas { + struct bch_sb_field field; + struct bch_replicas_entry_v1 entries[]; +} __packed __aligned(8); + +#define replicas_entry_bytes(_i) \ + (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) + +#endif /* _BCACHEFS_REPLICAS_FORMAT_H */ diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c index 
194e55b111..47f10ab57f 100644 --- a/fs/bcachefs/sb-clean.c +++ b/fs/bcachefs/sb-clean.c @@ -266,9 +266,8 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, } } -static int bch2_sb_clean_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) +static int bch2_sb_clean_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_clean *clean = field_to_type(f, clean); @@ -283,7 +282,7 @@ static int bch2_sb_clean_validate(struct bch_sb *sb, entry = vstruct_next(entry)) { if ((void *) vstruct_next(entry) > vstruct_end(&clean->field)) { prt_str(err, "entry type "); - bch2_prt_jset_entry_type(err, le16_to_cpu(entry->type)); + bch2_prt_jset_entry_type(err, entry->type); prt_str(err, " overruns end of section"); return -BCH_ERR_invalid_sb_clean; } @@ -298,10 +297,8 @@ static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field_clean *clean = field_to_type(f, clean); struct jset_entry *entry; - prt_printf(out, "flags: %x", le32_to_cpu(clean->flags)); - prt_newline(out); - prt_printf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq)); - prt_newline(out); + prt_printf(out, "flags: %x\n", le32_to_cpu(clean->flags)); + prt_printf(out, "journal_seq: %llu\n", le64_to_cpu(clean->journal_seq)); for (entry = clean->start; entry != vstruct_end(&clean->field); @@ -392,6 +389,8 @@ void bch2_fs_mark_clean(struct bch_fs *c) goto out; } + bch2_journal_pos_from_member_info_set(c); + bch2_write_super(c); out: mutex_unlock(&c->sb_lock); diff --git a/fs/bcachefs/sb-counters.c b/fs/bcachefs/sb-counters.c index 7dc898761b..6992e74691 100644 --- a/fs/bcachefs/sb-counters.c +++ b/fs/bcachefs/sb-counters.c @@ -20,9 +20,8 @@ static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs) return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0]; }; -static int bch2_sb_counters_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct 
printbuf *err) +static int bch2_sb_counters_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { return 0; }; @@ -31,19 +30,12 @@ static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field *f) { struct bch_sb_field_counters *ctrs = field_to_type(f, counters); - unsigned int i; unsigned int nr = bch2_sb_counter_nr_entries(ctrs); - for (i = 0; i < nr; i++) { - if (i < BCH_COUNTER_NR) - prt_printf(out, "%s ", bch2_counter_names[i]); - else - prt_printf(out, "(unknown)"); - - prt_tab(out); - prt_printf(out, "%llu", le64_to_cpu(ctrs->d[i])); - prt_newline(out); - } + for (unsigned i = 0; i < nr; i++) + prt_printf(out, "%s \t%llu\n", + i < BCH_COUNTER_NR ? bch2_counter_names[i] : "(unknown)", + le64_to_cpu(ctrs->d[i])); }; int bch2_sb_counters_to_cpu(struct bch_fs *c) diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index ed6d298bc1..4710b61631 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -139,7 +139,7 @@ downgrade_entry_next_c(const struct bch_sb_field_downgrade_entry *e) _i = downgrade_entry_next_c(_i)) static int bch2_sb_downgrade_validate(struct bch_sb *sb, struct bch_sb_field *f, - struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_downgrade *e = field_to_type(f, downgrade); @@ -154,6 +154,12 @@ static int bch2_sb_downgrade_validate(struct bch_sb *sb, struct bch_sb_field *f, if ((void *) &i->errors[0] > vstruct_end(&e->field)) break; + if (flags & BCH_VALIDATE_write && + (void *) downgrade_entry_next_c(i) > vstruct_end(&e->field)) { + prt_printf(err, "downgrade entry overruns end of superblock section"); + return -BCH_ERR_invalid_sb_downgrade; + } + if (BCH_VERSION_MAJOR(le16_to_cpu(i->version)) != BCH_VERSION_MAJOR(le16_to_cpu(sb->version))) { prt_printf(err, "downgrade entry with mismatched major version (%u != %u)", @@ -175,19 +181,16 @@ static void 
bch2_sb_downgrade_to_text(struct printbuf *out, struct bch_sb *sb, printbuf_tabstop_push(out, 16); for_each_downgrade_entry(e, i) { - prt_str(out, "version:"); - prt_tab(out); + prt_str(out, "version:\t"); bch2_version_to_text(out, le16_to_cpu(i->version)); prt_newline(out); - prt_str(out, "recovery passes:"); - prt_tab(out); + prt_str(out, "recovery passes:\t"); prt_bitflags(out, bch2_recovery_passes, bch2_recovery_passes_from_stable(le64_to_cpu(i->recovery_passes[0]))); prt_newline(out); - prt_str(out, "errors:"); - prt_tab(out); + prt_str(out, "errors:\t"); bool first = true; for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) { if (!first) diff --git a/fs/bcachefs/sb-downgrade_format.h b/fs/bcachefs/sb-downgrade_format.h new file mode 100644 index 0000000000..cffd932be3 --- /dev/null +++ b/fs/bcachefs/sb-downgrade_format.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_DOWNGRADE_FORMAT_H +#define _BCACHEFS_SB_DOWNGRADE_FORMAT_H + +struct bch_sb_field_downgrade_entry { + __le16 version; + __le64 recovery_passes[2]; + __le16 nr_errors; + __le16 errors[] __counted_by(nr_errors); +} __packed __aligned(2); + +struct bch_sb_field_downgrade { + struct bch_sb_field field; + struct bch_sb_field_downgrade_entry entries[]; +}; + +#endif /* _BCACHEFS_SB_DOWNGRADE_FORMAT_H */ diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c index 5f5bcae391..c1270d790e 100644 --- a/fs/bcachefs/sb-errors.c +++ b/fs/bcachefs/sb-errors.c @@ -30,7 +30,7 @@ static inline unsigned bch2_sb_field_errors_u64s(unsigned nr) } static int bch2_sb_errors_validate(struct bch_sb *sb, struct bch_sb_field *f, - struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_errors *e = field_to_type(f, errors); unsigned i, nr = bch2_sb_field_errors_nr_entries(e); @@ -110,19 +110,25 @@ out: void bch2_sb_errors_from_cpu(struct bch_fs *c) { bch_sb_errors_cpu *src = &c->fsck_error_counts; - struct bch_sb_field_errors *dst = - 
bch2_sb_field_resize(&c->disk_sb, errors, - bch2_sb_field_errors_u64s(src->nr)); + struct bch_sb_field_errors *dst; unsigned i; + mutex_lock(&c->fsck_error_counts_lock); + + dst = bch2_sb_field_resize(&c->disk_sb, errors, + bch2_sb_field_errors_u64s(src->nr)); + if (!dst) - return; + goto err; for (i = 0; i < src->nr; i++) { SET_BCH_SB_ERROR_ENTRY_ID(&dst->entries[i], src->data[i].id); SET_BCH_SB_ERROR_ENTRY_NR(&dst->entries[i], src->data[i].nr); dst->entries[i].last_error_time = cpu_to_le64(src->data[i].last_error_time); } + +err: + mutex_unlock(&c->fsck_error_counts_lock); } static int bch2_sb_errors_to_cpu(struct bch_fs *c) diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h new file mode 100644 index 0000000000..d54121ec09 --- /dev/null +++ b/fs/bcachefs/sb-errors_format.h @@ -0,0 +1,310 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_ERRORS_FORMAT_H +#define _BCACHEFS_SB_ERRORS_FORMAT_H + +enum bch_fsck_flags { + FSCK_CAN_FIX = 1 << 0, + FSCK_CAN_IGNORE = 1 << 1, + FSCK_NEED_FSCK = 1 << 2, + FSCK_NO_RATELIMIT = 1 << 3, + FSCK_AUTOFIX = 1 << 4, +}; + +#define BCH_SB_ERRS() \ + x(clean_but_journal_not_empty, 0, 0) \ + x(dirty_but_no_journal_entries, 1, 0) \ + x(dirty_but_no_journal_entries_post_drop_nonflushes, 2, 0) \ + x(sb_clean_journal_seq_mismatch, 3, 0) \ + x(sb_clean_btree_root_mismatch, 4, 0) \ + x(sb_clean_missing, 5, 0) \ + x(jset_unsupported_version, 6, 0) \ + x(jset_unknown_csum, 7, 0) \ + x(jset_last_seq_newer_than_seq, 8, 0) \ + x(jset_past_bucket_end, 9, 0) \ + x(jset_seq_blacklisted, 10, 0) \ + x(journal_entries_missing, 11, 0) \ + x(journal_entry_replicas_not_marked, 12, 0) \ + x(journal_entry_past_jset_end, 13, 0) \ + x(journal_entry_replicas_data_mismatch, 14, 0) \ + x(journal_entry_bkey_u64s_0, 15, 0) \ + x(journal_entry_bkey_past_end, 16, 0) \ + x(journal_entry_bkey_bad_format, 17, 0) \ + x(journal_entry_bkey_invalid, 18, 0) \ + x(journal_entry_btree_root_bad_size, 19, 0) \ + 
x(journal_entry_blacklist_bad_size, 20, 0) \ + x(journal_entry_blacklist_v2_bad_size, 21, 0) \ + x(journal_entry_blacklist_v2_start_past_end, 22, 0) \ + x(journal_entry_usage_bad_size, 23, 0) \ + x(journal_entry_data_usage_bad_size, 24, 0) \ + x(journal_entry_clock_bad_size, 25, 0) \ + x(journal_entry_clock_bad_rw, 26, 0) \ + x(journal_entry_dev_usage_bad_size, 27, 0) \ + x(journal_entry_dev_usage_bad_dev, 28, 0) \ + x(journal_entry_dev_usage_bad_pad, 29, 0) \ + x(btree_node_unreadable, 30, 0) \ + x(btree_node_fault_injected, 31, 0) \ + x(btree_node_bad_magic, 32, 0) \ + x(btree_node_bad_seq, 33, 0) \ + x(btree_node_unsupported_version, 34, 0) \ + x(btree_node_bset_older_than_sb_min, 35, 0) \ + x(btree_node_bset_newer_than_sb, 36, 0) \ + x(btree_node_data_missing, 37, 0) \ + x(btree_node_bset_after_end, 38, 0) \ + x(btree_node_replicas_sectors_written_mismatch, 39, 0) \ + x(btree_node_replicas_data_mismatch, 40, 0) \ + x(bset_unknown_csum, 41, 0) \ + x(bset_bad_csum, 42, 0) \ + x(bset_past_end_of_btree_node, 43, 0) \ + x(bset_wrong_sector_offset, 44, 0) \ + x(bset_empty, 45, 0) \ + x(bset_bad_seq, 46, 0) \ + x(bset_blacklisted_journal_seq, 47, 0) \ + x(first_bset_blacklisted_journal_seq, 48, 0) \ + x(btree_node_bad_btree, 49, 0) \ + x(btree_node_bad_level, 50, 0) \ + x(btree_node_bad_min_key, 51, 0) \ + x(btree_node_bad_max_key, 52, 0) \ + x(btree_node_bad_format, 53, 0) \ + x(btree_node_bkey_past_bset_end, 54, 0) \ + x(btree_node_bkey_bad_format, 55, 0) \ + x(btree_node_bad_bkey, 56, 0) \ + x(btree_node_bkey_out_of_order, 57, 0) \ + x(btree_root_bkey_invalid, 58, 0) \ + x(btree_root_read_error, 59, 0) \ + x(btree_root_bad_min_key, 60, 0) \ + x(btree_root_bad_max_key, 61, 0) \ + x(btree_node_read_error, 62, 0) \ + x(btree_node_topology_bad_min_key, 63, 0) \ + x(btree_node_topology_bad_max_key, 64, 0) \ + x(btree_node_topology_overwritten_by_prev_node, 65, 0) \ + x(btree_node_topology_overwritten_by_next_node, 66, 0) \ + x(btree_node_topology_interior_node_empty, 
67, 0) \ + x(fs_usage_hidden_wrong, 68, FSCK_AUTOFIX) \ + x(fs_usage_btree_wrong, 69, FSCK_AUTOFIX) \ + x(fs_usage_data_wrong, 70, FSCK_AUTOFIX) \ + x(fs_usage_cached_wrong, 71, FSCK_AUTOFIX) \ + x(fs_usage_reserved_wrong, 72, FSCK_AUTOFIX) \ + x(fs_usage_persistent_reserved_wrong, 73, FSCK_AUTOFIX) \ + x(fs_usage_nr_inodes_wrong, 74, FSCK_AUTOFIX) \ + x(fs_usage_replicas_wrong, 75, FSCK_AUTOFIX) \ + x(dev_usage_buckets_wrong, 76, FSCK_AUTOFIX) \ + x(dev_usage_sectors_wrong, 77, FSCK_AUTOFIX) \ + x(dev_usage_fragmented_wrong, 78, FSCK_AUTOFIX) \ + x(dev_usage_buckets_ec_wrong, 79, FSCK_AUTOFIX) \ + x(bkey_version_in_future, 80, 0) \ + x(bkey_u64s_too_small, 81, 0) \ + x(bkey_invalid_type_for_btree, 82, 0) \ + x(bkey_extent_size_zero, 83, 0) \ + x(bkey_extent_size_greater_than_offset, 84, 0) \ + x(bkey_size_nonzero, 85, 0) \ + x(bkey_snapshot_nonzero, 86, 0) \ + x(bkey_snapshot_zero, 87, 0) \ + x(bkey_at_pos_max, 88, 0) \ + x(bkey_before_start_of_btree_node, 89, 0) \ + x(bkey_after_end_of_btree_node, 90, 0) \ + x(bkey_val_size_nonzero, 91, 0) \ + x(bkey_val_size_too_small, 92, 0) \ + x(alloc_v1_val_size_bad, 93, 0) \ + x(alloc_v2_unpack_error, 94, 0) \ + x(alloc_v3_unpack_error, 95, 0) \ + x(alloc_v4_val_size_bad, 96, 0) \ + x(alloc_v4_backpointers_start_bad, 97, 0) \ + x(alloc_key_data_type_bad, 98, 0) \ + x(alloc_key_empty_but_have_data, 99, 0) \ + x(alloc_key_dirty_sectors_0, 100, 0) \ + x(alloc_key_data_type_inconsistency, 101, 0) \ + x(alloc_key_to_missing_dev_bucket, 102, 0) \ + x(alloc_key_cached_inconsistency, 103, 0) \ + x(alloc_key_cached_but_read_time_zero, 104, 0) \ + x(alloc_key_to_missing_lru_entry, 105, 0) \ + x(alloc_key_data_type_wrong, 106, FSCK_AUTOFIX) \ + x(alloc_key_gen_wrong, 107, FSCK_AUTOFIX) \ + x(alloc_key_dirty_sectors_wrong, 108, FSCK_AUTOFIX) \ + x(alloc_key_cached_sectors_wrong, 109, FSCK_AUTOFIX) \ + x(alloc_key_stripe_wrong, 110, FSCK_AUTOFIX) \ + x(alloc_key_stripe_redundancy_wrong, 111, FSCK_AUTOFIX) \ + 
x(bucket_sector_count_overflow, 112, 0) \ + x(bucket_metadata_type_mismatch, 113, 0) \ + x(need_discard_key_wrong, 114, 0) \ + x(freespace_key_wrong, 115, 0) \ + x(freespace_hole_missing, 116, 0) \ + x(bucket_gens_val_size_bad, 117, 0) \ + x(bucket_gens_key_wrong, 118, 0) \ + x(bucket_gens_hole_wrong, 119, 0) \ + x(bucket_gens_to_invalid_dev, 120, 0) \ + x(bucket_gens_to_invalid_buckets, 121, 0) \ + x(bucket_gens_nonzero_for_invalid_buckets, 122, 0) \ + x(need_discard_freespace_key_to_invalid_dev_bucket, 123, 0) \ + x(need_discard_freespace_key_bad, 124, 0) \ + x(backpointer_bucket_offset_wrong, 125, 0) \ + x(backpointer_to_missing_device, 126, 0) \ + x(backpointer_to_missing_alloc, 127, 0) \ + x(backpointer_to_missing_ptr, 128, 0) \ + x(lru_entry_at_time_0, 129, 0) \ + x(lru_entry_to_invalid_bucket, 130, 0) \ + x(lru_entry_bad, 131, 0) \ + x(btree_ptr_val_too_big, 132, 0) \ + x(btree_ptr_v2_val_too_big, 133, 0) \ + x(btree_ptr_has_non_ptr, 134, 0) \ + x(extent_ptrs_invalid_entry, 135, 0) \ + x(extent_ptrs_no_ptrs, 136, 0) \ + x(extent_ptrs_too_many_ptrs, 137, 0) \ + x(extent_ptrs_redundant_crc, 138, 0) \ + x(extent_ptrs_redundant_stripe, 139, 0) \ + x(extent_ptrs_unwritten, 140, 0) \ + x(extent_ptrs_written_and_unwritten, 141, 0) \ + x(ptr_to_invalid_device, 142, 0) \ + x(ptr_to_duplicate_device, 143, 0) \ + x(ptr_after_last_bucket, 144, 0) \ + x(ptr_before_first_bucket, 145, 0) \ + x(ptr_spans_multiple_buckets, 146, 0) \ + x(ptr_to_missing_backpointer, 147, 0) \ + x(ptr_to_missing_alloc_key, 148, 0) \ + x(ptr_to_missing_replicas_entry, 149, 0) \ + x(ptr_to_missing_stripe, 150, 0) \ + x(ptr_to_incorrect_stripe, 151, 0) \ + x(ptr_gen_newer_than_bucket_gen, 152, 0) \ + x(ptr_too_stale, 153, 0) \ + x(stale_dirty_ptr, 154, 0) \ + x(ptr_bucket_data_type_mismatch, 155, 0) \ + x(ptr_cached_and_erasure_coded, 156, 0) \ + x(ptr_crc_uncompressed_size_too_small, 157, 0) \ + x(ptr_crc_csum_type_unknown, 158, 0) \ + x(ptr_crc_compression_type_unknown, 159, 0) \ + 
x(ptr_crc_redundant, 160, 0) \ + x(ptr_crc_uncompressed_size_too_big, 161, 0) \ + x(ptr_crc_nonce_mismatch, 162, 0) \ + x(ptr_stripe_redundant, 163, 0) \ + x(reservation_key_nr_replicas_invalid, 164, 0) \ + x(reflink_v_refcount_wrong, 165, 0) \ + x(reflink_p_to_missing_reflink_v, 166, 0) \ + x(stripe_pos_bad, 167, 0) \ + x(stripe_val_size_bad, 168, 0) \ + x(stripe_sector_count_wrong, 169, 0) \ + x(snapshot_tree_pos_bad, 170, 0) \ + x(snapshot_tree_to_missing_snapshot, 171, 0) \ + x(snapshot_tree_to_missing_subvol, 172, 0) \ + x(snapshot_tree_to_wrong_subvol, 173, 0) \ + x(snapshot_tree_to_snapshot_subvol, 174, 0) \ + x(snapshot_pos_bad, 175, 0) \ + x(snapshot_parent_bad, 176, 0) \ + x(snapshot_children_not_normalized, 177, 0) \ + x(snapshot_child_duplicate, 178, 0) \ + x(snapshot_child_bad, 179, 0) \ + x(snapshot_skiplist_not_normalized, 180, 0) \ + x(snapshot_skiplist_bad, 181, 0) \ + x(snapshot_should_not_have_subvol, 182, 0) \ + x(snapshot_to_bad_snapshot_tree, 183, 0) \ + x(snapshot_bad_depth, 184, 0) \ + x(snapshot_bad_skiplist, 185, 0) \ + x(subvol_pos_bad, 186, 0) \ + x(subvol_not_master_and_not_snapshot, 187, 0) \ + x(subvol_to_missing_root, 188, 0) \ + x(subvol_root_wrong_bi_subvol, 189, 0) \ + x(bkey_in_missing_snapshot, 190, 0) \ + x(inode_pos_inode_nonzero, 191, 0) \ + x(inode_pos_blockdev_range, 192, 0) \ + x(inode_unpack_error, 193, 0) \ + x(inode_str_hash_invalid, 194, 0) \ + x(inode_v3_fields_start_bad, 195, 0) \ + x(inode_snapshot_mismatch, 196, 0) \ + x(inode_unlinked_but_clean, 197, 0) \ + x(inode_unlinked_but_nlink_nonzero, 198, 0) \ + x(inode_checksum_type_invalid, 199, 0) \ + x(inode_compression_type_invalid, 200, 0) \ + x(inode_subvol_root_but_not_dir, 201, 0) \ + x(inode_i_size_dirty_but_clean, 202, 0) \ + x(inode_i_sectors_dirty_but_clean, 203, 0) \ + x(inode_i_sectors_wrong, 204, 0) \ + x(inode_dir_wrong_nlink, 205, 0) \ + x(inode_dir_multiple_links, 206, 0) \ + x(inode_multiple_links_but_nlink_0, 207, 0) \ + x(inode_wrong_backpointer, 
208, 0) \ + x(inode_wrong_nlink, 209, 0) \ + x(inode_unreachable, 210, 0) \ + x(deleted_inode_but_clean, 211, 0) \ + x(deleted_inode_missing, 212, 0) \ + x(deleted_inode_is_dir, 213, 0) \ + x(deleted_inode_not_unlinked, 214, 0) \ + x(extent_overlapping, 215, 0) \ + x(extent_in_missing_inode, 216, 0) \ + x(extent_in_non_reg_inode, 217, 0) \ + x(extent_past_end_of_inode, 218, 0) \ + x(dirent_empty_name, 219, 0) \ + x(dirent_val_too_big, 220, 0) \ + x(dirent_name_too_long, 221, 0) \ + x(dirent_name_embedded_nul, 222, 0) \ + x(dirent_name_dot_or_dotdot, 223, 0) \ + x(dirent_name_has_slash, 224, 0) \ + x(dirent_d_type_wrong, 225, 0) \ + x(inode_bi_parent_wrong, 226, 0) \ + x(dirent_in_missing_dir_inode, 227, 0) \ + x(dirent_in_non_dir_inode, 228, 0) \ + x(dirent_to_missing_inode, 229, 0) \ + x(dirent_to_missing_subvol, 230, 0) \ + x(dirent_to_itself, 231, 0) \ + x(quota_type_invalid, 232, 0) \ + x(xattr_val_size_too_small, 233, 0) \ + x(xattr_val_size_too_big, 234, 0) \ + x(xattr_invalid_type, 235, 0) \ + x(xattr_name_invalid_chars, 236, 0) \ + x(xattr_in_missing_inode, 237, 0) \ + x(root_subvol_missing, 238, 0) \ + x(root_dir_missing, 239, 0) \ + x(root_inode_not_dir, 240, 0) \ + x(dir_loop, 241, 0) \ + x(hash_table_key_duplicate, 242, 0) \ + x(hash_table_key_wrong_offset, 243, 0) \ + x(unlinked_inode_not_on_deleted_list, 244, 0) \ + x(reflink_p_front_pad_bad, 245, 0) \ + x(journal_entry_dup_same_device, 246, 0) \ + x(inode_bi_subvol_missing, 247, 0) \ + x(inode_bi_subvol_wrong, 248, 0) \ + x(inode_points_to_missing_dirent, 249, 0) \ + x(inode_points_to_wrong_dirent, 250, 0) \ + x(inode_bi_parent_nonzero, 251, 0) \ + x(dirent_to_missing_parent_subvol, 252, 0) \ + x(dirent_not_visible_in_parent_subvol, 253, 0) \ + x(subvol_fs_path_parent_wrong, 254, 0) \ + x(subvol_root_fs_path_parent_nonzero, 255, 0) \ + x(subvol_children_not_set, 256, 0) \ + x(subvol_children_bad, 257, 0) \ + x(subvol_loop, 258, 0) \ + x(subvol_unreachable, 259, 0) \ + x(btree_node_bkey_bad_u64s, 260, 
0) \ + x(btree_node_topology_empty_interior_node, 261, 0) \ + x(btree_ptr_v2_min_key_bad, 262, 0) \ + x(btree_root_unreadable_and_scan_found_nothing, 263, 0) \ + x(snapshot_node_missing, 264, 0) \ + x(dup_backpointer_to_bad_csum_extent, 265, 0) \ + x(btree_bitmap_not_marked, 266, 0) \ + x(sb_clean_entry_overrun, 267, 0) \ + x(btree_ptr_v2_written_0, 268, 0) \ + x(subvol_snapshot_bad, 269, 0) \ + x(subvol_inode_bad, 270, 0) \ + x(alloc_key_stripe_sectors_wrong, 271, 0) \ + x(accounting_mismatch, 272, 0) \ + x(accounting_replicas_not_marked, 273, 0) \ + x(invalid_btree_id, 274, 0) \ + x(alloc_key_io_time_bad, 275, 0) \ + x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) + +enum bch_sb_error_id { +#define x(t, n, ...) BCH_FSCK_ERR_##t = n, + BCH_SB_ERRS() +#undef x + BCH_SB_ERR_MAX +}; + +struct bch_sb_field_errors { + struct bch_sb_field field; + struct bch_sb_field_error_entry { + __le64 v; + __le64 last_error_time; + } entries[]; +}; + +LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16); +LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64); + +#endif /* _BCACHEFS_SB_ERRORS_FORMAT_H */ diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index 06c7a644f4..40325239c3 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -4,283 +4,6 @@ #include "darray.h" -#define BCH_SB_ERRS() \ - x(clean_but_journal_not_empty, 0) \ - x(dirty_but_no_journal_entries, 1) \ - x(dirty_but_no_journal_entries_post_drop_nonflushes, 2) \ - x(sb_clean_journal_seq_mismatch, 3) \ - x(sb_clean_btree_root_mismatch, 4) \ - x(sb_clean_missing, 5) \ - x(jset_unsupported_version, 6) \ - x(jset_unknown_csum, 7) \ - x(jset_last_seq_newer_than_seq, 8) \ - x(jset_past_bucket_end, 9) \ - x(jset_seq_blacklisted, 10) \ - x(journal_entries_missing, 11) \ - x(journal_entry_replicas_not_marked, 12) \ - x(journal_entry_past_jset_end, 13) \ - x(journal_entry_replicas_data_mismatch, 14) \ - 
x(journal_entry_bkey_u64s_0, 15) \ - x(journal_entry_bkey_past_end, 16) \ - x(journal_entry_bkey_bad_format, 17) \ - x(journal_entry_bkey_invalid, 18) \ - x(journal_entry_btree_root_bad_size, 19) \ - x(journal_entry_blacklist_bad_size, 20) \ - x(journal_entry_blacklist_v2_bad_size, 21) \ - x(journal_entry_blacklist_v2_start_past_end, 22) \ - x(journal_entry_usage_bad_size, 23) \ - x(journal_entry_data_usage_bad_size, 24) \ - x(journal_entry_clock_bad_size, 25) \ - x(journal_entry_clock_bad_rw, 26) \ - x(journal_entry_dev_usage_bad_size, 27) \ - x(journal_entry_dev_usage_bad_dev, 28) \ - x(journal_entry_dev_usage_bad_pad, 29) \ - x(btree_node_unreadable, 30) \ - x(btree_node_fault_injected, 31) \ - x(btree_node_bad_magic, 32) \ - x(btree_node_bad_seq, 33) \ - x(btree_node_unsupported_version, 34) \ - x(btree_node_bset_older_than_sb_min, 35) \ - x(btree_node_bset_newer_than_sb, 36) \ - x(btree_node_data_missing, 37) \ - x(btree_node_bset_after_end, 38) \ - x(btree_node_replicas_sectors_written_mismatch, 39) \ - x(btree_node_replicas_data_mismatch, 40) \ - x(bset_unknown_csum, 41) \ - x(bset_bad_csum, 42) \ - x(bset_past_end_of_btree_node, 43) \ - x(bset_wrong_sector_offset, 44) \ - x(bset_empty, 45) \ - x(bset_bad_seq, 46) \ - x(bset_blacklisted_journal_seq, 47) \ - x(first_bset_blacklisted_journal_seq, 48) \ - x(btree_node_bad_btree, 49) \ - x(btree_node_bad_level, 50) \ - x(btree_node_bad_min_key, 51) \ - x(btree_node_bad_max_key, 52) \ - x(btree_node_bad_format, 53) \ - x(btree_node_bkey_past_bset_end, 54) \ - x(btree_node_bkey_bad_format, 55) \ - x(btree_node_bad_bkey, 56) \ - x(btree_node_bkey_out_of_order, 57) \ - x(btree_root_bkey_invalid, 58) \ - x(btree_root_read_error, 59) \ - x(btree_root_bad_min_key, 60) \ - x(btree_root_bad_max_key, 61) \ - x(btree_node_read_error, 62) \ - x(btree_node_topology_bad_min_key, 63) \ - x(btree_node_topology_bad_max_key, 64) \ - x(btree_node_topology_overwritten_by_prev_node, 65) \ - 
x(btree_node_topology_overwritten_by_next_node, 66) \ - x(btree_node_topology_interior_node_empty, 67) \ - x(fs_usage_hidden_wrong, 68) \ - x(fs_usage_btree_wrong, 69) \ - x(fs_usage_data_wrong, 70) \ - x(fs_usage_cached_wrong, 71) \ - x(fs_usage_reserved_wrong, 72) \ - x(fs_usage_persistent_reserved_wrong, 73) \ - x(fs_usage_nr_inodes_wrong, 74) \ - x(fs_usage_replicas_wrong, 75) \ - x(dev_usage_buckets_wrong, 76) \ - x(dev_usage_sectors_wrong, 77) \ - x(dev_usage_fragmented_wrong, 78) \ - x(dev_usage_buckets_ec_wrong, 79) \ - x(bkey_version_in_future, 80) \ - x(bkey_u64s_too_small, 81) \ - x(bkey_invalid_type_for_btree, 82) \ - x(bkey_extent_size_zero, 83) \ - x(bkey_extent_size_greater_than_offset, 84) \ - x(bkey_size_nonzero, 85) \ - x(bkey_snapshot_nonzero, 86) \ - x(bkey_snapshot_zero, 87) \ - x(bkey_at_pos_max, 88) \ - x(bkey_before_start_of_btree_node, 89) \ - x(bkey_after_end_of_btree_node, 90) \ - x(bkey_val_size_nonzero, 91) \ - x(bkey_val_size_too_small, 92) \ - x(alloc_v1_val_size_bad, 93) \ - x(alloc_v2_unpack_error, 94) \ - x(alloc_v3_unpack_error, 95) \ - x(alloc_v4_val_size_bad, 96) \ - x(alloc_v4_backpointers_start_bad, 97) \ - x(alloc_key_data_type_bad, 98) \ - x(alloc_key_empty_but_have_data, 99) \ - x(alloc_key_dirty_sectors_0, 100) \ - x(alloc_key_data_type_inconsistency, 101) \ - x(alloc_key_to_missing_dev_bucket, 102) \ - x(alloc_key_cached_inconsistency, 103) \ - x(alloc_key_cached_but_read_time_zero, 104) \ - x(alloc_key_to_missing_lru_entry, 105) \ - x(alloc_key_data_type_wrong, 106) \ - x(alloc_key_gen_wrong, 107) \ - x(alloc_key_dirty_sectors_wrong, 108) \ - x(alloc_key_cached_sectors_wrong, 109) \ - x(alloc_key_stripe_wrong, 110) \ - x(alloc_key_stripe_redundancy_wrong, 111) \ - x(bucket_sector_count_overflow, 112) \ - x(bucket_metadata_type_mismatch, 113) \ - x(need_discard_key_wrong, 114) \ - x(freespace_key_wrong, 115) \ - x(freespace_hole_missing, 116) \ - x(bucket_gens_val_size_bad, 117) \ - x(bucket_gens_key_wrong, 118) \ - 
x(bucket_gens_hole_wrong, 119) \ - x(bucket_gens_to_invalid_dev, 120) \ - x(bucket_gens_to_invalid_buckets, 121) \ - x(bucket_gens_nonzero_for_invalid_buckets, 122) \ - x(need_discard_freespace_key_to_invalid_dev_bucket, 123) \ - x(need_discard_freespace_key_bad, 124) \ - x(backpointer_bucket_offset_wrong, 125) \ - x(backpointer_to_missing_device, 126) \ - x(backpointer_to_missing_alloc, 127) \ - x(backpointer_to_missing_ptr, 128) \ - x(lru_entry_at_time_0, 129) \ - x(lru_entry_to_invalid_bucket, 130) \ - x(lru_entry_bad, 131) \ - x(btree_ptr_val_too_big, 132) \ - x(btree_ptr_v2_val_too_big, 133) \ - x(btree_ptr_has_non_ptr, 134) \ - x(extent_ptrs_invalid_entry, 135) \ - x(extent_ptrs_no_ptrs, 136) \ - x(extent_ptrs_too_many_ptrs, 137) \ - x(extent_ptrs_redundant_crc, 138) \ - x(extent_ptrs_redundant_stripe, 139) \ - x(extent_ptrs_unwritten, 140) \ - x(extent_ptrs_written_and_unwritten, 141) \ - x(ptr_to_invalid_device, 142) \ - x(ptr_to_duplicate_device, 143) \ - x(ptr_after_last_bucket, 144) \ - x(ptr_before_first_bucket, 145) \ - x(ptr_spans_multiple_buckets, 146) \ - x(ptr_to_missing_backpointer, 147) \ - x(ptr_to_missing_alloc_key, 148) \ - x(ptr_to_missing_replicas_entry, 149) \ - x(ptr_to_missing_stripe, 150) \ - x(ptr_to_incorrect_stripe, 151) \ - x(ptr_gen_newer_than_bucket_gen, 152) \ - x(ptr_too_stale, 153) \ - x(stale_dirty_ptr, 154) \ - x(ptr_bucket_data_type_mismatch, 155) \ - x(ptr_cached_and_erasure_coded, 156) \ - x(ptr_crc_uncompressed_size_too_small, 157) \ - x(ptr_crc_csum_type_unknown, 158) \ - x(ptr_crc_compression_type_unknown, 159) \ - x(ptr_crc_redundant, 160) \ - x(ptr_crc_uncompressed_size_too_big, 161) \ - x(ptr_crc_nonce_mismatch, 162) \ - x(ptr_stripe_redundant, 163) \ - x(reservation_key_nr_replicas_invalid, 164) \ - x(reflink_v_refcount_wrong, 165) \ - x(reflink_p_to_missing_reflink_v, 166) \ - x(stripe_pos_bad, 167) \ - x(stripe_val_size_bad, 168) \ - x(stripe_sector_count_wrong, 169) \ - x(snapshot_tree_pos_bad, 170) \ - 
x(snapshot_tree_to_missing_snapshot, 171) \ - x(snapshot_tree_to_missing_subvol, 172) \ - x(snapshot_tree_to_wrong_subvol, 173) \ - x(snapshot_tree_to_snapshot_subvol, 174) \ - x(snapshot_pos_bad, 175) \ - x(snapshot_parent_bad, 176) \ - x(snapshot_children_not_normalized, 177) \ - x(snapshot_child_duplicate, 178) \ - x(snapshot_child_bad, 179) \ - x(snapshot_skiplist_not_normalized, 180) \ - x(snapshot_skiplist_bad, 181) \ - x(snapshot_should_not_have_subvol, 182) \ - x(snapshot_to_bad_snapshot_tree, 183) \ - x(snapshot_bad_depth, 184) \ - x(snapshot_bad_skiplist, 185) \ - x(subvol_pos_bad, 186) \ - x(subvol_not_master_and_not_snapshot, 187) \ - x(subvol_to_missing_root, 188) \ - x(subvol_root_wrong_bi_subvol, 189) \ - x(bkey_in_missing_snapshot, 190) \ - x(inode_pos_inode_nonzero, 191) \ - x(inode_pos_blockdev_range, 192) \ - x(inode_unpack_error, 193) \ - x(inode_str_hash_invalid, 194) \ - x(inode_v3_fields_start_bad, 195) \ - x(inode_snapshot_mismatch, 196) \ - x(inode_unlinked_but_clean, 197) \ - x(inode_unlinked_but_nlink_nonzero, 198) \ - x(inode_checksum_type_invalid, 199) \ - x(inode_compression_type_invalid, 200) \ - x(inode_subvol_root_but_not_dir, 201) \ - x(inode_i_size_dirty_but_clean, 202) \ - x(inode_i_sectors_dirty_but_clean, 203) \ - x(inode_i_sectors_wrong, 204) \ - x(inode_dir_wrong_nlink, 205) \ - x(inode_dir_multiple_links, 206) \ - x(inode_multiple_links_but_nlink_0, 207) \ - x(inode_wrong_backpointer, 208) \ - x(inode_wrong_nlink, 209) \ - x(inode_unreachable, 210) \ - x(deleted_inode_but_clean, 211) \ - x(deleted_inode_missing, 212) \ - x(deleted_inode_is_dir, 213) \ - x(deleted_inode_not_unlinked, 214) \ - x(extent_overlapping, 215) \ - x(extent_in_missing_inode, 216) \ - x(extent_in_non_reg_inode, 217) \ - x(extent_past_end_of_inode, 218) \ - x(dirent_empty_name, 219) \ - x(dirent_val_too_big, 220) \ - x(dirent_name_too_long, 221) \ - x(dirent_name_embedded_nul, 222) \ - x(dirent_name_dot_or_dotdot, 223) \ - x(dirent_name_has_slash, 224) 
\ - x(dirent_d_type_wrong, 225) \ - x(inode_bi_parent_wrong, 226) \ - x(dirent_in_missing_dir_inode, 227) \ - x(dirent_in_non_dir_inode, 228) \ - x(dirent_to_missing_inode, 229) \ - x(dirent_to_missing_subvol, 230) \ - x(dirent_to_itself, 231) \ - x(quota_type_invalid, 232) \ - x(xattr_val_size_too_small, 233) \ - x(xattr_val_size_too_big, 234) \ - x(xattr_invalid_type, 235) \ - x(xattr_name_invalid_chars, 236) \ - x(xattr_in_missing_inode, 237) \ - x(root_subvol_missing, 238) \ - x(root_dir_missing, 239) \ - x(root_inode_not_dir, 240) \ - x(dir_loop, 241) \ - x(hash_table_key_duplicate, 242) \ - x(hash_table_key_wrong_offset, 243) \ - x(unlinked_inode_not_on_deleted_list, 244) \ - x(reflink_p_front_pad_bad, 245) \ - x(journal_entry_dup_same_device, 246) \ - x(inode_bi_subvol_missing, 247) \ - x(inode_bi_subvol_wrong, 248) \ - x(inode_points_to_missing_dirent, 249) \ - x(inode_points_to_wrong_dirent, 250) \ - x(inode_bi_parent_nonzero, 251) \ - x(dirent_to_missing_parent_subvol, 252) \ - x(dirent_not_visible_in_parent_subvol, 253) \ - x(subvol_fs_path_parent_wrong, 254) \ - x(subvol_root_fs_path_parent_nonzero, 255) \ - x(subvol_children_not_set, 256) \ - x(subvol_children_bad, 257) \ - x(subvol_loop, 258) \ - x(subvol_unreachable, 259) \ - x(btree_node_bkey_bad_u64s, 260) \ - x(btree_node_topology_empty_interior_node, 261) \ - x(btree_ptr_v2_min_key_bad, 262) \ - x(btree_root_unreadable_and_scan_found_nothing, 263) \ - x(snapshot_node_missing, 264) \ - x(dup_backpointer_to_bad_csum_extent, 265) \ - x(btree_bitmap_not_marked, 266) \ - x(sb_clean_entry_overrun, 267) - -enum bch_sb_error_id { -#define x(t, n) BCH_FSCK_ERR_##t = n, - BCH_SB_ERRS() -#undef x - BCH_SB_ERR_MAX -}; - struct bch_sb_error_entry_cpu { u64 id:16, nr:48; @@ -290,4 +13,3 @@ struct bch_sb_error_entry_cpu { typedef DARRAY(struct bch_sb_error_entry_cpu) bch_sb_errors_cpu; #endif /* _BCACHEFS_SB_ERRORS_TYPES_H */ - diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index 
44b3f0cb7b..39196f2a41 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -3,11 +3,22 @@ #include "bcachefs.h" #include "btree_cache.h" #include "disk_groups.h" +#include "error.h" #include "opts.h" #include "replicas.h" #include "sb-members.h" #include "super-io.h" +void bch2_dev_missing(struct bch_fs *c, unsigned dev) +{ + bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev); +} + +void bch2_dev_bucket_missing(struct bch_fs *c, struct bpos bucket) +{ + bch2_fs_inconsistent(c, "pointer to nonexistent bucket %llu:%llu", bucket.inode, bucket.offset); +} + #define x(t, n, ...) [n] = #t, static const char * const bch2_iops_measurements[] = { BCH_IOPS_MEASUREMENTS() @@ -164,18 +175,14 @@ static void member_to_text(struct printbuf *out, u64 bucket_size = le16_to_cpu(m.bucket_size); u64 device_size = le64_to_cpu(m.nbuckets) * bucket_size; - if (!bch2_member_exists(&m)) + if (!bch2_member_alive(&m)) return; - prt_printf(out, "Device:"); - prt_tab(out); - prt_printf(out, "%u", i); - prt_newline(out); + prt_printf(out, "Device:\t%u\n", i); printbuf_indent_add(out, 2); - prt_printf(out, "Label:"); - prt_tab(out); + prt_printf(out, "Label:\t"); if (BCH_MEMBER_GROUP(&m)) { unsigned idx = BCH_MEMBER_GROUP(&m) - 1; @@ -189,103 +196,73 @@ static void member_to_text(struct printbuf *out, } prt_newline(out); - prt_printf(out, "UUID:"); - prt_tab(out); + prt_printf(out, "UUID:\t"); pr_uuid(out, m.uuid.b); prt_newline(out); - prt_printf(out, "Size:"); - prt_tab(out); + prt_printf(out, "Size:\t"); prt_units_u64(out, device_size << 9); prt_newline(out); - for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) { - prt_printf(out, "%s errors:", bch2_member_error_strs[i]); - prt_tab(out); - prt_u64(out, le64_to_cpu(m.errors[i])); - prt_newline(out); - } + for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) + prt_printf(out, "%s errors:\t%llu\n", bch2_member_error_strs[i], le64_to_cpu(m.errors[i])); - for (unsigned i = 0; i < BCH_IOPS_NR; i++) { - 
prt_printf(out, "%s iops:", bch2_iops_measurements[i]); - prt_tab(out); - prt_printf(out, "%u", le32_to_cpu(m.iops[i])); - prt_newline(out); - } + for (unsigned i = 0; i < BCH_IOPS_NR; i++) + prt_printf(out, "%s iops:\t%u\n", bch2_iops_measurements[i], le32_to_cpu(m.iops[i])); - prt_printf(out, "Bucket size:"); - prt_tab(out); + prt_printf(out, "Bucket size:\t"); prt_units_u64(out, bucket_size << 9); prt_newline(out); - prt_printf(out, "First bucket:"); - prt_tab(out); - prt_printf(out, "%u", le16_to_cpu(m.first_bucket)); - prt_newline(out); - - prt_printf(out, "Buckets:"); - prt_tab(out); - prt_printf(out, "%llu", le64_to_cpu(m.nbuckets)); - prt_newline(out); + prt_printf(out, "First bucket:\t%u\n", le16_to_cpu(m.first_bucket)); + prt_printf(out, "Buckets:\t%llu\n", le64_to_cpu(m.nbuckets)); - prt_printf(out, "Last mount:"); - prt_tab(out); + prt_printf(out, "Last mount:\t"); if (m.last_mount) bch2_prt_datetime(out, le64_to_cpu(m.last_mount)); else prt_printf(out, "(never)"); prt_newline(out); - prt_printf(out, "Last superblock write:"); - prt_tab(out); - prt_u64(out, le64_to_cpu(m.seq)); - prt_newline(out); + prt_printf(out, "Last superblock write:\t%llu\n", le64_to_cpu(m.seq)); - prt_printf(out, "State:"); - prt_tab(out); - prt_printf(out, "%s", + prt_printf(out, "State:\t%s\n", BCH_MEMBER_STATE(&m) < BCH_MEMBER_STATE_NR ? bch2_member_states[BCH_MEMBER_STATE(&m)] : "unknown"); - prt_newline(out); - prt_printf(out, "Data allowed:"); - prt_tab(out); + prt_printf(out, "Data allowed:\t"); if (BCH_MEMBER_DATA_ALLOWED(&m)) prt_bitflags(out, __bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m)); else prt_printf(out, "(none)"); prt_newline(out); - prt_printf(out, "Has data:"); - prt_tab(out); + prt_printf(out, "Has data:\t"); if (data_have) prt_bitflags(out, __bch2_data_types, data_have); else prt_printf(out, "(none)"); prt_newline(out); - prt_str(out, "Durability:"); - prt_tab(out); - prt_printf(out, "%llu", BCH_MEMBER_DURABILITY(&m) ? 
BCH_MEMBER_DURABILITY(&m) - 1 : 1); + prt_printf(out, "Btree allocated bitmap blocksize:\t"); + prt_units_u64(out, 1ULL << m.btree_bitmap_shift); prt_newline(out); - prt_printf(out, "Discard:"); - prt_tab(out); - prt_printf(out, "%llu", BCH_MEMBER_DISCARD(&m)); + prt_printf(out, "Btree allocated bitmap:\t"); + bch2_prt_u64_base2_nbits(out, le64_to_cpu(m.btree_allocated_bitmap), 64); prt_newline(out); - prt_printf(out, "Freespace initialized:"); - prt_tab(out); - prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(&m)); - prt_newline(out); + prt_printf(out, "Durability:\t%llu\n", BCH_MEMBER_DURABILITY(&m) ? BCH_MEMBER_DURABILITY(&m) - 1 : 1); + + prt_printf(out, "Discard:\t%llu\n", BCH_MEMBER_DISCARD(&m)); + prt_printf(out, "Freespace initialized:\t%llu\n", BCH_MEMBER_FREESPACE_INITIALIZED(&m)); printbuf_indent_sub(out, 2); } -static int bch2_sb_members_v1_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) +static int bch2_sb_members_v1_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1); unsigned i; @@ -333,9 +310,8 @@ static void bch2_sb_members_v2_to_text(struct printbuf *out, struct bch_sb *sb, member_to_text(out, members_v2_get(mi, i), gi, sb, i); } -static int bch2_sb_members_v2_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) +static int bch2_sb_members_v2_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2); size_t mi_bytes = (void *) __bch2_members_v2_get_mut(mi, sb->nr_devices) - @@ -390,12 +366,8 @@ void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca) prt_newline(out); printbuf_indent_add(out, 2); - for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) { - prt_printf(out, "%s:", bch2_member_error_strs[i]); - prt_tab(out); - prt_u64(out, 
atomic64_read(&ca->errors[i])); - prt_newline(out); - } + for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) + prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i], atomic64_read(&ca->errors[i])); printbuf_indent_sub(out, 2); prt_str(out, "IO errors since "); @@ -404,12 +376,9 @@ void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca) prt_newline(out); printbuf_indent_add(out, 2); - for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) { - prt_printf(out, "%s:", bch2_member_error_strs[i]); - prt_tab(out); - prt_u64(out, atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i])); - prt_newline(out); - } + for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) + prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i], + atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i])); printbuf_indent_sub(out, 2); } @@ -437,11 +406,20 @@ void bch2_dev_errors_reset(struct bch_dev *ca) bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k) { - bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) - if (!bch2_dev_btree_bitmap_marked_sectors(bch_dev_bkey_exists(c, ptr->dev), - ptr->offset, btree_sectors(c))) - return false; - return true; + bool ret = true; + rcu_read_lock(); + bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + if (!ca) + continue; + + if (!bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c))) { + ret = false; + break; + } + } + rcu_read_unlock(); + return ret; } static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev, @@ -463,6 +441,9 @@ static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, uns m->btree_bitmap_shift += resize; } + BUG_ON(m->btree_bitmap_shift > 57); + BUG_ON(end > 64ULL << m->btree_bitmap_shift); + for (unsigned bit = start >> m->btree_bitmap_shift; (u64) bit << m->btree_bitmap_shift < end; bit++) @@ -476,6 +457,10 @@ void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k) 
lockdep_assert_held(&c->sb_lock); struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); - bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) + bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { + if (!bch2_member_exists(c->disk_sb.sb, ptr->dev)) + continue; + __bch2_dev_btree_bitmap_mark(mi, ptr->dev, ptr->offset, btree_sectors(c)); + } } diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index 5bf27d30ca..dd93192ec0 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -29,19 +29,6 @@ static inline bool bch2_dev_is_readable(struct bch_dev *ca) ca->mi.state != BCH_MEMBER_STATE_failed; } -static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw) -{ - if (!percpu_ref_tryget(&ca->io_ref)) - return false; - - if (ca->mi.state == BCH_MEMBER_STATE_rw || - (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)) - return true; - - percpu_ref_put(&ca->io_ref); - return false; -} - static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) { return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX); @@ -105,14 +92,41 @@ static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, struct bch_dev * for (struct bch_dev *_ca = NULL; \ (_ca = __bch2_next_dev((_c), _ca, (_mask)));) -static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca) +static inline void bch2_dev_get(struct bch_dev *ca) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + BUG_ON(atomic_long_inc_return(&ca->ref) <= 1L); +#else + percpu_ref_get(&ca->ref); +#endif +} + +static inline void __bch2_dev_put(struct bch_dev *ca) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + long r = atomic_long_dec_return(&ca->ref); + if (r < (long) !ca->dying) + panic("bch_dev->ref underflow, last put: %pS\n", (void *) ca->last_put); + ca->last_put = _THIS_IP_; + if (!r) + complete(&ca->ref_completion); +#else + percpu_ref_put(&ca->ref); +#endif +} + +static inline void bch2_dev_put(struct bch_dev *ca) { - rcu_read_lock(); if (ca) - percpu_ref_put(&ca->ref); + 
__bch2_dev_put(ca); +} +static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca) +{ + rcu_read_lock(); + bch2_dev_put(ca); if ((ca = __bch2_next_dev(c, ca, NULL))) - percpu_ref_get(&ca->ref); + bch2_dev_get(ca); rcu_read_unlock(); return ca; @@ -158,26 +172,113 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, #define for_each_readable_member(c, ca) \ __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro)) -/* - * If a key exists that references a device, the device won't be going away and - * we can omit rcu_read_lock(): - */ -static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx) +static inline bool bch2_dev_exists(const struct bch_fs *c, unsigned dev) { - EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); + return dev < c->sb.nr_devices && c->devs[dev]; +} - return rcu_dereference_check(c->devs[idx], 1); +static inline bool bucket_valid(const struct bch_dev *ca, u64 b) +{ + return b - ca->mi.first_bucket < ca->mi.nbuckets_minus_first; } -static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx) +static inline struct bch_dev *bch2_dev_have_ref(const struct bch_fs *c, unsigned dev) { - EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); + EBUG_ON(!bch2_dev_exists(c, dev)); + + return rcu_dereference_check(c->devs[dev], 1); +} - return rcu_dereference_protected(c->devs[idx], +static inline struct bch_dev *bch2_dev_locked(struct bch_fs *c, unsigned dev) +{ + EBUG_ON(!bch2_dev_exists(c, dev)); + + return rcu_dereference_protected(c->devs[dev], lockdep_is_held(&c->sb_lock) || lockdep_is_held(&c->state_lock)); } +static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev) +{ + return c && dev < c->sb.nr_devices + ? 
rcu_dereference(c->devs[dev]) + : NULL; +} + +static inline struct bch_dev *bch2_dev_tryget_noerror(struct bch_fs *c, unsigned dev) +{ + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, dev); + if (ca) + bch2_dev_get(ca); + rcu_read_unlock(); + return ca; +} + +void bch2_dev_missing(struct bch_fs *, unsigned); + +static inline struct bch_dev *bch2_dev_tryget(struct bch_fs *c, unsigned dev) +{ + struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev); + if (!ca) + bch2_dev_missing(c, dev); + return ca; +} + +static inline struct bch_dev *bch2_dev_bucket_tryget_noerror(struct bch_fs *c, struct bpos bucket) +{ + struct bch_dev *ca = bch2_dev_tryget_noerror(c, bucket.inode); + if (ca && !bucket_valid(ca, bucket.offset)) { + bch2_dev_put(ca); + ca = NULL; + } + return ca; +} + +void bch2_dev_bucket_missing(struct bch_fs *, struct bpos); + +static inline struct bch_dev *bch2_dev_bucket_tryget(struct bch_fs *c, struct bpos bucket) +{ + struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, bucket); + if (!ca) + bch2_dev_bucket_missing(c, bucket); + return ca; +} + +static inline struct bch_dev *bch2_dev_iterate_noerror(struct bch_fs *c, struct bch_dev *ca, unsigned dev_idx) +{ + if (ca && ca->dev_idx == dev_idx) + return ca; + bch2_dev_put(ca); + return bch2_dev_tryget_noerror(c, dev_idx); +} + +static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev *ca, unsigned dev_idx) +{ + if (ca && ca->dev_idx == dev_idx) + return ca; + bch2_dev_put(ca); + return bch2_dev_tryget(c, dev_idx); +} + +static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, int rw) +{ + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, dev); + if (ca && !percpu_ref_tryget(&ca->io_ref)) + ca = NULL; + rcu_read_unlock(); + + if (ca && + (ca->mi.state == BCH_MEMBER_STATE_rw || + (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ))) + return ca; + + if (ca) + percpu_ref_put(&ca->io_ref); + return NULL; +} + /* XXX kill, move to struct bch_fs */ 
static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) { @@ -192,16 +293,16 @@ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) extern const struct bch_sb_field_ops bch_sb_field_ops_members_v1; extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2; -static inline bool bch2_member_exists(struct bch_member *m) +static inline bool bch2_member_alive(struct bch_member *m) { return !bch2_is_zero(&m->uuid, sizeof(m->uuid)); } -static inline bool bch2_dev_exists(struct bch_sb *sb, unsigned dev) +static inline bool bch2_member_exists(struct bch_sb *sb, unsigned dev) { if (dev < sb->nr_devices) { struct bch_member m = bch2_sb_member_get(sb, dev); - return bch2_member_exists(&m); + return bch2_member_alive(&m); } return false; } @@ -210,6 +311,8 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) { return (struct bch_member_cpu) { .nbuckets = le64_to_cpu(mi->nbuckets), + .nbuckets_minus_first = le64_to_cpu(mi->nbuckets) - + le16_to_cpu(mi->first_bucket), .first_bucket = le16_to_cpu(mi->first_bucket), .bucket_size = le16_to_cpu(mi->bucket_size), .group = BCH_MEMBER_GROUP(mi), @@ -220,7 +323,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) ? 
BCH_MEMBER_DURABILITY(mi) - 1 : 1, .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), - .valid = bch2_member_exists(mi), + .valid = bch2_member_alive(mi), .btree_bitmap_shift = mi->btree_bitmap_shift, .btree_allocated_bitmap = le64_to_cpu(mi->btree_allocated_bitmap), }; diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h new file mode 100644 index 0000000000..e2630548c0 --- /dev/null +++ b/fs/bcachefs/sb-members_format.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_MEMBERS_FORMAT_H +#define _BCACHEFS_SB_MEMBERS_FORMAT_H + +/* + * We refer to members with bitmasks in various places - but we need to get rid + * of this limit: + */ +#define BCH_SB_MEMBERS_MAX 64 + +#define BCH_MIN_NR_NBUCKETS (1 << 6) + +#define BCH_IOPS_MEASUREMENTS() \ + x(seqread, 0) \ + x(seqwrite, 1) \ + x(randread, 2) \ + x(randwrite, 3) + +enum bch_iops_measurement { +#define x(t, n) BCH_IOPS_##t = n, + BCH_IOPS_MEASUREMENTS() +#undef x + BCH_IOPS_NR +}; + +#define BCH_MEMBER_ERROR_TYPES() \ + x(read, 0) \ + x(write, 1) \ + x(checksum, 2) + +enum bch_member_error_type { +#define x(t, n) BCH_MEMBER_ERROR_##t = n, + BCH_MEMBER_ERROR_TYPES() +#undef x + BCH_MEMBER_ERROR_NR +}; + +struct bch_member { + __uuid_t uuid; + __le64 nbuckets; /* device size */ + __le16 first_bucket; /* index of first bucket used */ + __le16 bucket_size; /* sectors */ + __u8 btree_bitmap_shift; + __u8 pad[3]; + __le64 last_mount; /* time_t */ + + __le64 flags; + __le32 iops[4]; + __le64 errors[BCH_MEMBER_ERROR_NR]; + __le64 errors_at_reset[BCH_MEMBER_ERROR_NR]; + __le64 errors_reset_time; + __le64 seq; + __le64 btree_allocated_bitmap; + /* + * On recovery from a clean shutdown we don't normally read the journal, + * but we still want to resume writing from where we left off so we + * don't overwrite more than is necessary, for list journal debugging: + */ + __le32 last_journal_bucket; + __le32 last_journal_bucket_offset; +}; + +/* + * This limit 
comes from the bucket_gens array - it's a single allocation, and + * kernel allocation are limited to INT_MAX + */ +#define BCH_MEMBER_NBUCKETS_MAX (INT_MAX - 64) + +#define BCH_MEMBER_V1_BYTES 56 + +LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4) +/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */ +LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15) +LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags, 15, 20) +LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28) +LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30) +LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED, + struct bch_member, flags, 30, 31) + +#if 0 +LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); +LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); +#endif + +#define BCH_MEMBER_STATES() \ + x(rw, 0) \ + x(ro, 1) \ + x(failed, 2) \ + x(spare, 3) + +enum bch_member_state { +#define x(t, n) BCH_MEMBER_STATE_##t = n, + BCH_MEMBER_STATES() +#undef x + BCH_MEMBER_STATE_NR +}; + +struct bch_sb_field_members_v1 { + struct bch_sb_field field; + struct bch_member _members[]; //Members are now variable size +}; + +struct bch_sb_field_members_v2 { + struct bch_sb_field field; + __le16 member_bytes; //size of single member entry + u8 pad[6]; + struct bch_member _members[]; +}; + +#endif /* _BCACHEFS_SB_MEMBERS_FORMAT_H */ diff --git a/fs/bcachefs/sb-members_types.h b/fs/bcachefs/sb-members_types.h new file mode 100644 index 0000000000..c0eda888fe --- /dev/null +++ b/fs/bcachefs/sb-members_types.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_MEMBERS_TYPES_H +#define _BCACHEFS_SB_MEMBERS_TYPES_H + +struct bch_member_cpu { + u64 nbuckets; /* device size */ + u64 nbuckets_minus_first; + u16 first_bucket; /* index of first bucket used */ + u16 bucket_size; /* sectors */ + u16 group; + u8 state; + u8 discard; + u8 data_allowed; + u8 durability; + u8 
freespace_initialized; + u8 valid; + u8 btree_bitmap_shift; + u64 btree_allocated_bitmap; +}; + +#endif /* _BCACHEFS_SB_MEMBERS_H */ diff --git a/fs/bcachefs/seqmutex.h b/fs/bcachefs/seqmutex.h index c1860d8163..c4b3d8d3f4 100644 --- a/fs/bcachefs/seqmutex.h +++ b/fs/bcachefs/seqmutex.h @@ -19,17 +19,14 @@ static inline bool seqmutex_trylock(struct seqmutex *lock) static inline void seqmutex_lock(struct seqmutex *lock) { mutex_lock(&lock->lock); -} - -static inline void seqmutex_unlock(struct seqmutex *lock) -{ lock->seq++; - mutex_unlock(&lock->lock); } -static inline u32 seqmutex_seq(struct seqmutex *lock) +static inline u32 seqmutex_unlock(struct seqmutex *lock) { - return lock->seq; + u32 seq = lock->seq; + mutex_unlock(&lock->lock); + return seq; } static inline bool seqmutex_relock(struct seqmutex *lock, u32 seq) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 544322d5c2..24023d6a96 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -32,7 +32,7 @@ void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, } int bch2_snapshot_tree_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { int ret = 0; @@ -49,7 +49,7 @@ int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id, struct bch_snapshot_tree *s) { int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id), - BTREE_ITER_WITH_UPDATES, snapshot_tree, s); + BTREE_ITER_with_updates, snapshot_tree, s); if (bch2_err_matches(ret, ENOENT)) ret = -BCH_ERR_ENOENT_snapshot_tree; @@ -168,6 +168,9 @@ static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id) size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1)); size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]); + if (unlikely(new_bytes > INT_MAX)) + return NULL; + new = kvzalloc(new_bytes, GFP_KERNEL); if (!new) return NULL; @@ -223,7 +226,7 @@ void 
bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, } int bch2_snapshot_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_s_c_snapshot s; @@ -298,7 +301,7 @@ static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id) static int __bch2_mark_snapshot(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; struct snapshot_t *t; @@ -352,7 +355,7 @@ err: int bch2_mark_snapshot(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags); } @@ -361,7 +364,7 @@ int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, struct bch_snapshot *s) { return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id), - BTREE_ITER_WITH_UPDATES, snapshot, s); + BTREE_ITER_with_updates, snapshot, s); } static int bch2_snapshot_live(struct btree_trans *trans, u32 id) @@ -618,7 +621,7 @@ int bch2_check_snapshot_trees(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_snapshot_trees, POS_MIN, - BTREE_ITER_PREFETCH, k, + BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_snapshot_tree(trans, &iter, k))); bch_err_fn(c, ret); @@ -695,7 +698,7 @@ static int snapshot_tree_ptr_repair(struct btree_trans *trans, root = bch2_bkey_get_iter_typed(trans, &root_iter, BTREE_ID_snapshots, POS(0, root_id), - BTREE_ITER_WITH_UPDATES, snapshot); + BTREE_ITER_with_updates, snapshot); ret = bkey_err(root); if (ret) goto err; @@ -886,7 +889,7 @@ int bch2_check_snapshots(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key_reverse_commit(trans, iter, BTREE_ID_snapshots, POS_MAX, - BTREE_ITER_PREFETCH, k, 
+ BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_snapshot(trans, &iter, k))); bch_err_fn(c, ret); @@ -900,7 +903,8 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) if (bch2_snapshot_equiv(c, id)) return 0; - u32 tree_id; + /* 0 is an invalid tree ID */ + u32 tree_id = 0; int ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id); if (ret) return ret; @@ -1001,7 +1005,7 @@ int bch2_reconstruct_snapshots(struct bch_fs *c) r.btree = btree; ret = for_each_btree_key(trans, iter, btree, POS_MIN, - BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_PREFETCH, k, ({ + BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({ get_snapshot_trees(c, &r, k.k->p); })); if (ret) @@ -1018,7 +1022,7 @@ int bch2_reconstruct_snapshots(struct bch_fs *c) darray_for_each(*t, id) { if (fsck_err_on(!bch2_snapshot_equiv(c, *id), c, snapshot_node_missing, - "snapshot node %u from tree %s missing", *id, buf.buf)) { + "snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) { if (t->nr > 1) { bch_err(c, "cannot reconstruct snapshot trees with multiple nodes"); ret = -BCH_ERR_fsck_repair_unimplemented; @@ -1041,6 +1045,25 @@ err: return ret; } +int bch2_check_key_has_snapshot(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + int ret = 0; + + if (fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c, + bkey_in_missing_snapshot, + "key in missing snapshot %s, delete?", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_internal_snapshot_node) ?: 1; +fsck_err: + printbuf_exit(&buf); + return ret; +} + /* * Mark a snapshot as deleted, for future cleanup: */ @@ -1090,7 +1113,7 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) int ret = 0; s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), - BTREE_ITER_INTENT, snapshot); + BTREE_ITER_intent, snapshot); ret = 
bkey_err(s); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, "missing snapshot %u", id); @@ -1199,7 +1222,7 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, int ret; bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, - POS_MIN, BTREE_ITER_INTENT); + POS_MIN, BTREE_ITER_intent); k = bch2_btree_iter_peek(&iter); ret = bkey_err(k); if (ret) @@ -1350,35 +1373,39 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, * that key to snapshot leaf nodes, where we can mutate it */ -static int snapshot_delete_key(struct btree_trans *trans, +static int delete_dead_snapshots_process_key(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, snapshot_id_list *deleted, snapshot_id_list *equiv_seen, struct bpos *last_pos) { + int ret = bch2_check_key_has_snapshot(trans, iter, k); + if (ret) + return ret < 0 ? ret : 0; + struct bch_fs *c = trans->c; u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); + if (!equiv) /* key for invalid snapshot node, but we chose not to delete */ + return 0; if (!bkey_eq(k.k->p, *last_pos)) equiv_seen->nr = 0; - *last_pos = k.k->p; - if (snapshot_list_has_id(deleted, k.k->p.snapshot) || - snapshot_list_has_id(equiv_seen, equiv)) { + if (snapshot_list_has_id(deleted, k.k->p.snapshot)) return bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); - } else { - return snapshot_list_add(c, equiv_seen, equiv); - } -} + BTREE_UPDATE_internal_snapshot_node); -static int move_key_to_correct_snapshot(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); + if (!bpos_eq(*last_pos, k.k->p) && + snapshot_list_has_id(equiv_seen, equiv)) + return bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_internal_snapshot_node); + + *last_pos = k.k->p; + + ret = snapshot_list_add_nodup(c, equiv_seen, equiv); + if (ret) + return ret; /* * When we have a linear chain of 
snapshot nodes, we consider @@ -1388,31 +1415,30 @@ static int move_key_to_correct_snapshot(struct btree_trans *trans, * * If there are multiple keys in different snapshots at the same * position, we're only going to keep the one in the newest - * snapshot - the rest have been overwritten and are redundant, - * and for the key we're going to keep we need to move it to the - * equivalance class ID if it's not there already. + * snapshot (we delete the others above) - the rest have been + * overwritten and are redundant, and for the key we're going to keep we + * need to move it to the equivalance class ID if it's not there + * already. */ if (equiv != k.k->p.snapshot) { struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); - struct btree_iter new_iter; - int ret; - - ret = PTR_ERR_OR_ZERO(new); + int ret = PTR_ERR_OR_ZERO(new); if (ret) return ret; new->k.p.snapshot = equiv; + struct btree_iter new_iter; bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p, - BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); + BTREE_ITER_all_snapshots| + BTREE_ITER_cached| + BTREE_ITER_intent); ret = bch2_btree_iter_traverse(&new_iter) ?: bch2_trans_update(trans, &new_iter, new, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + BTREE_UPDATE_internal_snapshot_node) ?: bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + BTREE_UPDATE_internal_snapshot_node); bch2_trans_iter_exit(trans, &new_iter); if (ret) return ret; @@ -1537,19 +1563,11 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) struct btree_trans *trans; snapshot_id_list deleted = { 0 }; snapshot_id_list deleted_interior = { 0 }; - u32 id; int ret = 0; if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) return 0; - if (!test_bit(BCH_FS_started, &c->flags)) { - ret = bch2_fs_read_write_early(c); - bch_err_msg(c, ret, "deleting dead snapshots: error going rw"); - if (ret) - return ret; - } - trans = bch2_trans_get(c); /* @@ -1584,33 +1602,20 @@ int 
bch2_delete_dead_snapshots(struct bch_fs *c) if (ret) goto err; - for (id = 0; id < BTREE_ID_NR; id++) { + for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) { struct bpos last_pos = POS_MIN; snapshot_id_list equiv_seen = { 0 }; struct disk_reservation res = { 0 }; - if (!btree_type_has_snapshots(id)) - continue; - - /* - * deleted inodes btree is maintained by a trigger on the inodes - * btree - no work for us to do here, and it's not safe to scan - * it because we'll see out of date keys due to the btree write - * buffer: - */ - if (id == BTREE_ID_deleted_inodes) + if (!btree_type_has_snapshots(btree)) continue; ret = for_each_btree_key_commit(trans, iter, - id, POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + btree, POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, &res, NULL, BCH_TRANS_COMMIT_no_enospc, - snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?: - for_each_btree_key_commit(trans, iter, - id, POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - &res, NULL, BCH_TRANS_COMMIT_no_enospc, - move_key_to_correct_snapshot(trans, &iter, k)); + delete_dead_snapshots_process_key(trans, &iter, k, &deleted, + &equiv_seen, &last_pos)); bch2_disk_reservation_put(c, &res); darray_exit(&equiv_seen); @@ -1643,7 +1648,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) * nodes some depth fields will be off: */ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN, - BTREE_ITER_INTENT, k, + BTREE_ITER_intent, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior)); if (ret) @@ -1678,6 +1683,8 @@ void bch2_delete_dead_snapshots_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); + set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name); + bch2_delete_dead_snapshots(c); bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); } @@ -1699,8 +1706,8 @@ int 
__bch2_key_has_snapshot_overwrites(struct btree_trans *trans, int ret; bch2_trans_iter_init(trans, &iter, id, pos, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_ALL_SNAPSHOTS); + BTREE_ITER_not_extents| + BTREE_ITER_all_snapshots); while (1) { k = bch2_btree_iter_prev(&iter); ret = bkey_err(k); @@ -1752,7 +1759,7 @@ static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans, pos.snapshot = leaf_id; - bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT); + bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent); k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index b7d2fed37c..31b0ee03e9 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -2,11 +2,11 @@ #ifndef _BCACHEFS_SNAPSHOT_H #define _BCACHEFS_SNAPSHOT_H -enum bkey_invalid_flags; +enum bch_validate_flags; void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_snapshot_tree_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); #define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \ .key_invalid = bch2_snapshot_tree_invalid, \ @@ -20,9 +20,10 @@ int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tre void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_snapshot_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_snapshot ((struct bkey_ops) { \ .key_invalid = bch2_snapshot_invalid, \ @@ -77,7 +78,7 @@ static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id) return 0; u32 parent = s->parent; - if 
(IS_ENABLED(CONFIG_BCACHEFS_DEBU) && + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && parent && s->depth != snapshot_t(c, parent)->depth + 1) panic("id %u depth=%u parent %u depth=%u\n", @@ -135,11 +136,6 @@ static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) return id; } -static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id) -{ - return id == bch2_snapshot_equiv(c, id); -} - static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) { rcu_read_lock(); @@ -180,12 +176,9 @@ static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ances static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id) { - const struct snapshot_t *t; - bool ret; - rcu_read_lock(); - t = snapshot_t(c, id); - ret = (t->children[0]|t->children[1]) != 0; + const struct snapshot_t *t = snapshot_t(c, id); + bool ret = t && (t->children[0]|t->children[1]) != 0; rcu_read_unlock(); return ret; @@ -249,6 +242,7 @@ int bch2_snapshot_node_create(struct btree_trans *, u32, int bch2_check_snapshot_trees(struct bch_fs *); int bch2_check_snapshots(struct bch_fs *); int bch2_reconstruct_snapshots(struct bch_fs *); +int bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c); int bch2_snapshot_node_set_deleted(struct btree_trans *, u32); void bch2_delete_dead_snapshots_work(struct work_struct *); diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 3976f80721..c8c266cb57 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -15,16 +15,6 @@ #include <crypto/hash.h> #include <crypto/sha2.h> -typedef unsigned __bitwise bch_str_hash_flags_t; - -enum bch_str_hash_flags { - __BCH_HASH_SET_MUST_CREATE, - __BCH_HASH_SET_MUST_REPLACE, -}; - -#define BCH_HASH_SET_MUST_CREATE (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_CREATE) -#define BCH_HASH_SET_MUST_REPLACE (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_REPLACE) - static inline enum bch_str_hash_type 
bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) { @@ -159,13 +149,14 @@ static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, s desc.is_visible(inum, k)); } -static __always_inline int +static __always_inline struct bkey_s_c bch2_hash_lookup_in_snapshot(struct btree_trans *trans, struct btree_iter *iter, const struct bch_hash_desc desc, const struct bch_hash_info *info, subvol_inum inum, const void *key, - unsigned flags, u32 snapshot) + enum btree_iter_update_trigger_flags flags, + u32 snapshot) { struct bkey_s_c k; int ret; @@ -173,10 +164,10 @@ bch2_hash_lookup_in_snapshot(struct btree_trans *trans, for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, SPOS(inum.inum, desc.hash_key(info, key), snapshot), POS(inum.inum, U64_MAX), - BTREE_ITER_SLOTS|flags, k, ret) { + BTREE_ITER_slots|flags, k, ret) { if (is_visible_key(desc, inum, k)) { if (!desc.cmp_key(k, key)) - return 0; + return k; } else if (k.k->type == KEY_TYPE_hash_whiteout) { ; } else { @@ -186,20 +177,23 @@ bch2_hash_lookup_in_snapshot(struct btree_trans *trans, } bch2_trans_iter_exit(trans, iter); - return ret ?: -BCH_ERR_ENOENT_str_hash_lookup; + return bkey_s_c_err(ret ?: -BCH_ERR_ENOENT_str_hash_lookup); } -static __always_inline int +static __always_inline struct bkey_s_c bch2_hash_lookup(struct btree_trans *trans, struct btree_iter *iter, const struct bch_hash_desc desc, const struct bch_hash_info *info, subvol_inum inum, const void *key, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { u32 snapshot; - return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?: - bch2_hash_lookup_in_snapshot(trans, iter, desc, info, inum, key, flags, snapshot); + int ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return bkey_s_c_err(ret); + + return bch2_hash_lookup_in_snapshot(trans, iter, desc, info, inum, key, flags, snapshot); } static __always_inline int @@ -220,7 +214,7 @@ 
bch2_hash_hole(struct btree_trans *trans, for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, SPOS(inum.inum, desc.hash_key(info, key), snapshot), POS(inum.inum, U64_MAX), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) + BTREE_ITER_slots|BTREE_ITER_intent, k, ret) if (!is_visible_key(desc, inum, k)) return 0; bch2_trans_iter_exit(trans, iter); @@ -242,7 +236,7 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, bch2_btree_iter_advance(&iter); - for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, k, ret) { + for_each_btree_key_continue_norestart(iter, BTREE_ITER_slots, k, ret) { if (k.k->type != desc.key_type && k.k->type != KEY_TYPE_hash_whiteout) break; @@ -264,8 +258,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans, const struct bch_hash_info *info, subvol_inum inum, u32 snapshot, struct bkey_i *insert, - bch_str_hash_flags_t str_hash_flags, - int update_flags) + enum btree_iter_update_trigger_flags flags) { struct btree_iter iter, slot = { NULL }; struct bkey_s_c k; @@ -277,7 +270,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans, desc.hash_bkey(info, bkey_i_to_s_c(insert)), snapshot), POS(insert->k.p.inode, U64_MAX), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + BTREE_ITER_slots|BTREE_ITER_intent, k, ret) { if (is_visible_key(desc, inum, k)) { if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert))) goto found; @@ -286,8 +279,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans, continue; } - if (!slot.path && - !(str_hash_flags & BCH_HASH_SET_MUST_REPLACE)) + if (!slot.path && !(flags & STR_HASH_must_replace)) bch2_trans_copy_iter(&slot, &iter); if (k.k->type != KEY_TYPE_hash_whiteout) @@ -305,16 +297,16 @@ found: found = true; not_found: - if (!found && (str_hash_flags & BCH_HASH_SET_MUST_REPLACE)) { + if (!found && (flags & STR_HASH_must_replace)) { ret = -BCH_ERR_ENOENT_str_hash_set_must_replace; - } else if (found && (str_hash_flags & BCH_HASH_SET_MUST_CREATE)) { - ret = -EEXIST; + } else if (found && 
(flags & STR_HASH_must_create)) { + ret = -BCH_ERR_EEXIST_str_hash_set; } else { if (!found && slot.path) swap(iter, slot); insert->k.p = iter.pos; - ret = bch2_trans_update(trans, &iter, insert, update_flags); + ret = bch2_trans_update(trans, &iter, insert, flags); } goto out; @@ -326,14 +318,14 @@ int bch2_hash_set(struct btree_trans *trans, const struct bch_hash_info *info, subvol_inum inum, struct bkey_i *insert, - bch_str_hash_flags_t str_hash_flags) + enum btree_iter_update_trigger_flags flags) { insert->k.p.inode = inum.inum; u32 snapshot; return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?: bch2_hash_set_in_snapshot(trans, desc, info, inum, - snapshot, insert, str_hash_flags, 0); + snapshot, insert, flags); } static __always_inline @@ -341,7 +333,7 @@ int bch2_hash_delete_at(struct btree_trans *trans, const struct bch_hash_desc desc, const struct bch_hash_info *info, struct btree_iter *iter, - unsigned update_flags) + enum btree_iter_update_trigger_flags flags) { struct bkey_i *delete; int ret; @@ -359,7 +351,7 @@ int bch2_hash_delete_at(struct btree_trans *trans, delete->k.p = iter->pos; delete->k.type = ret ? 
KEY_TYPE_hash_whiteout : KEY_TYPE_deleted; - return bch2_trans_update(trans, iter, delete, update_flags); + return bch2_trans_update(trans, iter, delete, flags); } static __always_inline @@ -369,14 +361,10 @@ int bch2_hash_delete(struct btree_trans *trans, subvol_inum inum, const void *key) { struct btree_iter iter; - int ret; - - ret = bch2_hash_lookup(trans, &iter, desc, info, inum, key, - BTREE_ITER_INTENT); - if (ret) - return ret; - - ret = bch2_hash_delete_at(trans, desc, info, &iter, 0); + struct bkey_s_c k = bch2_hash_lookup(trans, &iter, desc, info, inum, key, + BTREE_ITER_intent); + int ret = bkey_err(k) ?: + bch2_hash_delete_at(trans, desc, info, &iter, 0); bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 88a79c8232..dfc9cf3057 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -162,7 +162,7 @@ int bch2_check_subvols(struct bch_fs *c) { int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, - BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, + BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_subvol(trans, &iter, k))); bch_err_fn(c, ret); @@ -198,7 +198,7 @@ int bch2_check_subvol_children(struct bch_fs *c) { int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, - BTREE_ID_subvolume_children, POS_MIN, BTREE_ITER_PREFETCH, k, + BTREE_ID_subvolume_children, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_subvol_child(trans, &iter, k))); bch_err_fn(c, ret); @@ -208,14 +208,23 @@ int bch2_check_subvol_children(struct bch_fs *c) /* Subvolumes: */ int bch2_subvolume_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { + struct bkey_s_c_subvolume subvol = bkey_s_c_to_subvolume(k); int ret = 0; bkey_fsck_err_on(bkey_lt(k.k->p, SUBVOL_POS_MIN) || bkey_gt(k.k->p, 
SUBVOL_POS_MAX), c, err, subvol_pos_bad, "invalid pos"); + + bkey_fsck_err_on(!subvol.v->snapshot, c, err, + subvol_snapshot_bad, + "invalid snapshot"); + + bkey_fsck_err_on(!subvol.v->inode, c, err, + subvol_inode_bad, + "invalid inode"); fsck_err: return ret; } @@ -245,9 +254,9 @@ static int subvolume_children_mod(struct btree_trans *trans, struct bpos pos, bo int bch2_subvolume_trigger(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s new, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { - if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + if (flags & BTREE_TRIGGER_transactional) { struct bpos children_pos_old = subvolume_children_pos(old); struct bpos children_pos_new = subvolume_children_pos(new.s_c); @@ -333,7 +342,7 @@ int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid, subvol = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_subvolumes, POS(0, subvolid), - BTREE_ITER_CACHED|BTREE_ITER_WITH_UPDATES, + BTREE_ITER_cached|BTREE_ITER_with_updates, subvolume); ret = bkey_err(subvol); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, @@ -383,9 +392,9 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d return lockrestart_do(trans, bch2_subvolume_get(trans, subvolid_to_delete, true, - BTREE_ITER_CACHED, &s)) ?: + BTREE_ITER_cached, &s)) ?: for_each_btree_key_commit(trans, iter, - BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, + BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_subvolume_reparent(trans, &iter, k, subvolid_to_delete, le32_to_cpu(s.creation_parent))); @@ -404,7 +413,7 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) subvol = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_subvolumes, POS(0, subvolid), - BTREE_ITER_CACHED|BTREE_ITER_INTENT, + BTREE_ITER_cached|BTREE_ITER_intent, subvolume); ret = bkey_err(subvol); 
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, @@ -505,7 +514,7 @@ int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid) n = bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_subvolumes, POS(0, subvolid), - BTREE_ITER_CACHED, subvolume); + BTREE_ITER_cached, subvolume); ret = PTR_ERR_OR_ZERO(n); if (unlikely(ret)) { bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, @@ -547,7 +556,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, src_subvol = bch2_bkey_get_mut_typed(trans, &src_iter, BTREE_ID_subvolumes, POS(0, src_subvolid), - BTREE_ITER_CACHED, subvolume); + BTREE_ITER_cached, subvolume); ret = PTR_ERR_OR_ZERO(src_subvol); if (unlikely(ret)) { bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index d2015d549b..afa5e871ef 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -5,16 +5,17 @@ #include "darray.h" #include "subvolume_types.h" -enum bkey_invalid_flags; +enum bch_validate_flags; int bch2_check_subvols(struct bch_fs *); int bch2_check_subvol_children(struct bch_fs *); int bch2_subvolume_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s, unsigned); + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_subvolume ((struct bkey_ops) { \ .key_invalid = bch2_subvolume_invalid, \ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index a337246e91..b156fc85b8 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -76,7 +76,7 @@ const char * const bch2_sb_fields[] = { }; static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *, - struct printbuf *); + enum 
bch_validate_flags, struct printbuf *); struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *sb, enum bch_sb_field_type type) @@ -344,8 +344,8 @@ static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out) return 0; } -static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, - int rw) +static int bch2_sb_validate(struct bch_sb_handle *disk_sb, + enum bch_validate_flags flags, struct printbuf *out) { struct bch_sb *sb = disk_sb->sb; struct bch_sb_field_members_v1 *mi; @@ -401,7 +401,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, return -BCH_ERR_invalid_sb_time_precision; } - if (rw == READ) { + if (!flags) { /* * Been seeing a bug where these are getting inexplicably * zeroed, so we're now validating them, but we have to be @@ -457,7 +457,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, return -BCH_ERR_invalid_sb_members_missing; } - ret = bch2_sb_field_validate(sb, &mi->field, out); + ret = bch2_sb_field_validate(sb, &mi->field, flags, out); if (ret) return ret; @@ -465,12 +465,12 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, if (le32_to_cpu(f->type) == BCH_SB_FIELD_members_v1) continue; - ret = bch2_sb_field_validate(sb, f, out); + ret = bch2_sb_field_validate(sb, f, flags, out); if (ret) return ret; } - if (rw == WRITE && + if ((flags & BCH_VALIDATE_write) && bch2_sb_member_get(sb, sb->dev_idx).seq != sb->seq) { prt_printf(out, "Invalid superblock: member seq %llu != sb seq %llu", le64_to_cpu(bch2_sb_member_get(sb, sb->dev_idx).seq), @@ -649,9 +649,10 @@ reread: bytes = vstruct_bytes(sb->sb); - if (bytes > 512ULL << min(BCH_SB_LAYOUT_SIZE_BITS_MAX, sb->sb->layout.sb_max_size_bits)) { - prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)", - bytes, 512UL << sb->sb->layout.sb_max_size_bits); + u64 sb_size = 512ULL << min(BCH_SB_LAYOUT_SIZE_BITS_MAX, sb->sb->layout.sb_max_size_bits); + if 
(bytes > sb_size) { + prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %llu)", + bytes, sb_size); return -BCH_ERR_invalid_sb_too_big; } @@ -819,7 +820,7 @@ got_super: sb->have_layout = true; - ret = bch2_sb_validate(sb, &err, READ); + ret = bch2_sb_validate(sb, 0, &err); if (ret) { bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n", path, err.buf); @@ -975,7 +976,7 @@ int bch2_write_super(struct bch_fs *c) darray_for_each(online_devices, ca) { printbuf_reset(&err); - ret = bch2_sb_validate(&(*ca)->disk_sb, &err, WRITE); + ret = bch2_sb_validate(&(*ca)->disk_sb, BCH_VALIDATE_write, &err); if (ret) { bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf); goto out; @@ -1020,26 +1021,35 @@ int bch2_write_super(struct bch_fs *c) continue; if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) { - bch2_fs_fatal_error(c, + struct printbuf buf = PRINTBUF; + prt_char(&buf, ' '); + prt_bdevname(&buf, ca->disk_sb.bdev); + prt_printf(&buf, ": Superblock write was silently dropped! 
(seq %llu expected %llu)", le64_to_cpu(ca->sb_read_scratch->seq), ca->disk_sb.seq); - percpu_ref_put(&ca->io_ref); + bch2_fs_fatal_error(c, "%s", buf.buf); + printbuf_exit(&buf); ret = -BCH_ERR_erofs_sb_err; - goto out; } if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) { - bch2_fs_fatal_error(c, + struct printbuf buf = PRINTBUF; + prt_char(&buf, ' '); + prt_bdevname(&buf, ca->disk_sb.bdev); + prt_printf(&buf, ": Superblock modified by another process (seq %llu expected %llu)", le64_to_cpu(ca->sb_read_scratch->seq), ca->disk_sb.seq); - percpu_ref_put(&ca->io_ref); + bch2_fs_fatal_error(c, "%s", buf.buf); + printbuf_exit(&buf); ret = -BCH_ERR_erofs_sb_err; - goto out; } } + if (ret) + goto out; + do { wrote = false; darray_for_each(online_devices, cap) { @@ -1146,7 +1156,7 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version) } static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f, - struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { if (vstruct_bytes(f) < 88) { prt_printf(err, "field too small (%zu < %u)", vstruct_bytes(f), 88); @@ -1161,8 +1171,7 @@ static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb, { struct bch_sb_field_ext *e = field_to_type(f, ext); - prt_printf(out, "Recovery passes required:"); - prt_tab(out); + prt_printf(out, "Recovery passes required:\t"); prt_bitflags(out, bch2_recovery_passes, bch2_recovery_passes_from_stable(le64_to_cpu(e->recovery_passes_required[0]))); prt_newline(out); @@ -1171,16 +1180,14 @@ static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb, if (errors_silent) { le_bitvector_to_cpu(errors_silent, (void *) e->errors_silent, sizeof(e->errors_silent) * 8); - prt_printf(out, "Errors to silently fix:"); - prt_tab(out); + prt_printf(out, "Errors to silently fix:\t"); prt_bitflags_vector(out, bch2_sb_error_strs, errors_silent, sizeof(e->errors_silent) * 8); prt_newline(out); kfree(errors_silent); } - prt_printf(out, "Btrees 
with missing data:"); - prt_tab(out); + prt_printf(out, "Btrees with missing data:\t"); prt_bitflags(out, __bch2_btree_ids, le64_to_cpu(e->btrees_lost_data)); prt_newline(out); } @@ -1207,14 +1214,14 @@ static const struct bch_sb_field_ops *bch2_sb_field_type_ops(unsigned type) } static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f, - struct printbuf *err) + enum bch_validate_flags flags, struct printbuf *err) { unsigned type = le32_to_cpu(f->type); struct printbuf field_err = PRINTBUF; const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type); int ret; - ret = ops->validate ? ops->validate(sb, f, &field_err) : 0; + ret = ops->validate ? ops->validate(sb, f, flags, &field_err) : 0; if (ret) { prt_printf(err, "Invalid superblock section %s: %s", bch2_sb_fields[type], field_err.buf); @@ -1288,97 +1295,73 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, printbuf_tabstop_push(out, 44); for (int i = 0; i < sb->nr_devices; i++) - nr_devices += bch2_dev_exists(sb, i); + nr_devices += bch2_member_exists(sb, i); - prt_printf(out, "External UUID:"); - prt_tab(out); + prt_printf(out, "External UUID:\t"); pr_uuid(out, sb->user_uuid.b); prt_newline(out); - prt_printf(out, "Internal UUID:"); - prt_tab(out); + prt_printf(out, "Internal UUID:\t"); pr_uuid(out, sb->uuid.b); prt_newline(out); - prt_printf(out, "Magic number:"); - prt_tab(out); + prt_printf(out, "Magic number:\t"); pr_uuid(out, sb->magic.b); prt_newline(out); - prt_str(out, "Device index:"); - prt_tab(out); - prt_printf(out, "%u", sb->dev_idx); - prt_newline(out); + prt_printf(out, "Device index:\t%u\n", sb->dev_idx); - prt_str(out, "Label:"); - prt_tab(out); + prt_printf(out, "Label:\t"); prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label); prt_newline(out); - prt_str(out, "Version:"); - prt_tab(out); + prt_printf(out, "Version:\t"); bch2_version_to_text(out, le16_to_cpu(sb->version)); prt_newline(out); - prt_str(out, "Version upgrade complete:"); - 
prt_tab(out); + prt_printf(out, "Version upgrade complete:\t"); bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb)); prt_newline(out); - prt_printf(out, "Oldest version on disk:"); - prt_tab(out); + prt_printf(out, "Oldest version on disk:\t"); bch2_version_to_text(out, le16_to_cpu(sb->version_min)); prt_newline(out); - prt_printf(out, "Created:"); - prt_tab(out); + prt_printf(out, "Created:\t"); if (sb->time_base_lo) bch2_prt_datetime(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC)); else prt_printf(out, "(not set)"); prt_newline(out); - prt_printf(out, "Sequence number:"); - prt_tab(out); + prt_printf(out, "Sequence number:\t"); prt_printf(out, "%llu", le64_to_cpu(sb->seq)); prt_newline(out); - prt_printf(out, "Time of last write:"); - prt_tab(out); + prt_printf(out, "Time of last write:\t"); bch2_prt_datetime(out, le64_to_cpu(sb->write_time)); prt_newline(out); - prt_printf(out, "Superblock size:"); - prt_tab(out); + prt_printf(out, "Superblock size:\t"); prt_units_u64(out, vstruct_bytes(sb)); prt_str(out, "/"); prt_units_u64(out, 512ULL << sb->layout.sb_max_size_bits); prt_newline(out); - prt_printf(out, "Clean:"); - prt_tab(out); - prt_printf(out, "%llu", BCH_SB_CLEAN(sb)); - prt_newline(out); - - prt_printf(out, "Devices:"); - prt_tab(out); - prt_printf(out, "%u", nr_devices); - prt_newline(out); + prt_printf(out, "Clean:\t%llu\n", BCH_SB_CLEAN(sb)); + prt_printf(out, "Devices:\t%u\n", nr_devices); - prt_printf(out, "Sections:"); + prt_printf(out, "Sections:\t"); vstruct_for_each(sb, f) fields_have |= 1 << le32_to_cpu(f->type); - prt_tab(out); prt_bitflags(out, bch2_sb_fields, fields_have); prt_newline(out); - prt_printf(out, "Features:"); - prt_tab(out); + prt_printf(out, "Features:\t"); prt_bitflags(out, bch2_sb_features, le64_to_cpu(sb->features[0])); prt_newline(out); - prt_printf(out, "Compat features:"); - prt_tab(out); + prt_printf(out, "Compat features:\t"); prt_bitflags(out, bch2_sb_compat, le64_to_cpu(sb->compat[0])); 
prt_newline(out); @@ -1395,8 +1378,7 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, if (opt->get_sb != BCH2_NO_SB_OPT) { u64 v = bch2_opt_from_sb(sb, id); - prt_printf(out, "%s:", opt->attr.name); - prt_tab(out); + prt_printf(out, "%s:\t", opt->attr.name); bch2_opt_to_text(out, NULL, sb, opt, v, OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST); prt_newline(out); diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index 95e80e0631..fadd364e28 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -51,7 +51,8 @@ void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type); extern const char * const bch2_sb_fields[]; struct bch_sb_field_ops { - int (*validate)(struct bch_sb *, struct bch_sb_field *, struct printbuf *); + int (*validate)(struct bch_sb *, struct bch_sb_field *, + enum bch_validate_flags, struct printbuf *); void (*to_text)(struct printbuf *, struct bch_sb *, struct bch_sb_field *); }; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index dddf57ec45..da735608d4 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -264,7 +264,6 @@ static void __bch2_fs_read_only(struct bch_fs *c) bch2_open_buckets_stop(c, NULL, true); bch2_rebalance_stop(c); bch2_copygc_stop(c); - bch2_gc_thread_stop(c); bch2_fs_ec_flush(c); bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu", @@ -285,7 +284,7 @@ static void __bch2_fs_read_only(struct bch_fs *c) bch_verbose(c, "flushing journal and stopping allocators complete, journal seq %llu", journal_cur_seq(&c->journal)); - if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) && + if (test_bit(JOURNAL_replay_done, &c->journal.flags) && !test_bit(BCH_FS_emergency_ro, &c->flags)) set_bit(BCH_FS_clean_shutdown, &c->flags); @@ -467,7 +466,8 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) * overwriting whatever was there previously, and there must always be * at least one non-flush write in the journal or recovery will fail: */ - 
set_bit(JOURNAL_NEED_FLUSH_WRITE, &c->journal.flags); + set_bit(JOURNAL_need_flush_write, &c->journal.flags); + set_bit(JOURNAL_running, &c->journal.flags); for_each_rw_member(c, ca) bch2_dev_allocator_add(c, ca); @@ -485,12 +485,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) } #endif - ret = bch2_gc_thread_start(c); - if (ret) { - bch_err(c, "error starting gc thread"); - return ret; - } - ret = bch2_journal_reclaim_start(&c->journal); if (ret) goto err; @@ -537,14 +531,11 @@ int bch2_fs_read_write_early(struct bch_fs *c) static void __bch2_fs_free(struct bch_fs *c) { - unsigned i; - - for (i = 0; i < BCH_TIME_STAT_NR; i++) + for (unsigned i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_exit(&c->times[i]); bch2_find_btree_nodes_exit(&c->found_btree_nodes); bch2_free_pending_node_rewrites(c); - bch2_fs_allocator_background_exit(c); bch2_fs_sb_errors_exit(c); bch2_fs_counters_exit(c); bch2_fs_snapshots_exit(c); @@ -559,9 +550,9 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_fs_io_read_exit(c); bch2_fs_buckets_waiting_for_journal_exit(c); bch2_fs_btree_interior_update_exit(c); - bch2_fs_btree_iter_exit(c); bch2_fs_btree_key_cache_exit(&c->btree_key_cache); bch2_fs_btree_cache_exit(c); + bch2_fs_btree_iter_exit(c); bch2_fs_replicas_exit(c); bch2_fs_journal_exit(&c->journal); bch2_io_clock_exit(&c->io_clock[WRITE]); @@ -572,7 +563,11 @@ static void __bch2_fs_free(struct bch_fs *c) BUG_ON(atomic_read(&c->journal_keys.ref)); bch2_fs_btree_write_buffer_exit(c); percpu_free_rwsem(&c->mark_lock); - free_percpu(c->online_reserved); + if (c->online_reserved) { + u64 v = percpu_u64_get(c->online_reserved); + WARN(v, "online_reserved not 0 at shutdown: %lli", v); + free_percpu(c->online_reserved); + } darray_exit(&c->btree_roots_extra); free_percpu(c->pcpu); @@ -589,8 +584,10 @@ static void __bch2_fs_free(struct bch_fs *c) if (c->write_ref_wq) destroy_workqueue(c->write_ref_wq); - if (c->io_complete_wq) - destroy_workqueue(c->io_complete_wq); + if 
(c->btree_write_submit_wq) + destroy_workqueue(c->btree_write_submit_wq); + if (c->btree_read_complete_wq) + destroy_workqueue(c->btree_read_complete_wq); if (c->copygc_wq) destroy_workqueue(c->copygc_wq); if (c->btree_io_complete_wq) @@ -616,8 +613,6 @@ void __bch2_fs_stop(struct bch_fs *c) set_bit(BCH_FS_stopping, &c->flags); - cancel_work_sync(&c->journal_seq_blacklist_gc_work); - down_write(&c->state_lock); bch2_fs_read_only(c); up_write(&c->state_lock); @@ -665,6 +660,7 @@ void bch2_fs_free(struct bch_fs *c) struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true); if (ca) { + EBUG_ON(atomic_long_read(&ca->ref) != 1); bch2_free_super(&ca->disk_sb); bch2_dev_free(ca); } @@ -719,7 +715,7 @@ static int bch2_fs_online(struct bch_fs *c) ret = bch2_dev_sysfs_online(c, ca); if (ret) { bch_err(c, "error creating sysfs objects"); - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); goto err; } } @@ -778,6 +774,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_init(&c->times[i]); + bch2_fs_gc_init(c); bch2_fs_copygc_init(c); bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); bch2_fs_btree_iter_init_early(c); @@ -800,16 +797,11 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) spin_lock_init(&c->btree_write_error_lock); - INIT_WORK(&c->journal_seq_blacklist_gc_work, - bch2_blacklist_entries_gc); - INIT_LIST_HEAD(&c->journal_iters); INIT_LIST_HEAD(&c->fsck_error_msgs); mutex_init(&c->fsck_error_msgs_lock); - seqcount_init(&c->gc_pos_lock); - seqcount_init(&c->usage_lock); sema_init(&c->io_in_flight, 128); @@ -890,8 +882,10 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || - !(c->io_complete_wq = alloc_workqueue("bcachefs_io", + !(c->btree_read_complete_wq = 
alloc_workqueue("bcachefs_btree_read_complete", WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) || + !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_sumit", + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", WQ_FREEZABLE, 0)) || #ifndef BCH_WRITE_REF_DEBUG @@ -920,9 +914,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_io_clock_init(&c->io_clock[WRITE]) ?: bch2_fs_journal_init(&c->journal) ?: bch2_fs_replicas_init(c) ?: + bch2_fs_btree_iter_init(c) ?: bch2_fs_btree_cache_init(c) ?: bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: - bch2_fs_btree_iter_init(c) ?: bch2_fs_btree_interior_update_init(c) ?: bch2_fs_buckets_waiting_for_journal_init(c) ?: bch2_fs_btree_write_buffer_init(c) ?: @@ -939,12 +933,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) if (ret) goto err; - for (i = 0; i < c->sb.nr_devices; i++) - if (bch2_dev_exists(c->disk_sb.sb, i) && - bch2_dev_alloc(c, i)) { - ret = -EEXIST; + for (i = 0; i < c->sb.nr_devices; i++) { + if (!bch2_member_exists(c->disk_sb.sb, i)) + continue; + ret = bch2_dev_alloc(c, i); + if (ret) goto err; - } + } bch2_journal_entry_res_resize(&c->journal, &c->btree_root_journal_res, @@ -1101,7 +1096,7 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, if (!uuid_equal(&fs->sb->uuid, &sb->sb->uuid)) return -BCH_ERR_device_not_a_member_of_filesystem; - if (!bch2_dev_exists(fs->sb, sb->sb->dev_idx)) + if (!bch2_member_exists(fs->sb, sb->sb->dev_idx)) return -BCH_ERR_device_has_been_removed; if (fs->sb->block_size != sb->sb->block_size) @@ -1200,11 +1195,12 @@ static void bch2_dev_free(struct bch_dev *ca) if (ca->kobj.state_in_sysfs) kobject_del(&ca->kobj); + kfree(ca->buckets_nouse); bch2_free_super(&ca->disk_sb); + bch2_dev_allocator_background_exit(ca); bch2_dev_journal_exit(ca); free_percpu(ca->io_done); - bioset_exit(&ca->replica_set); bch2_dev_buckets_free(ca); 
free_page((unsigned long) ca->sb_read_scratch); @@ -1212,7 +1208,9 @@ static void bch2_dev_free(struct bch_dev *ca) bch2_time_stats_quantiles_exit(&ca->io_latency[READ]); percpu_ref_exit(&ca->io_ref); +#ifndef CONFIG_BCACHEFS_DEBUG percpu_ref_exit(&ca->ref); +#endif kobject_put(&ca->kobj); } @@ -1239,12 +1237,14 @@ static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) bch2_dev_journal_exit(ca); } +#ifndef CONFIG_BCACHEFS_DEBUG static void bch2_dev_ref_complete(struct percpu_ref *ref) { struct bch_dev *ca = container_of(ref, struct bch_dev, ref); complete(&ca->ref_completion); } +#endif static void bch2_dev_io_ref_complete(struct percpu_ref *ref) { @@ -1313,14 +1313,19 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, ca->mi.bucket_size / btree_sectors(c)); - if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, - 0, GFP_KERNEL) || - percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, +#ifndef CONFIG_BCACHEFS_DEBUG + if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, 0, GFP_KERNEL)) + goto err; +#else + atomic_long_set(&ca->ref, 1); +#endif + + bch2_dev_allocator_background_init(ca); + + if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) || bch2_dev_buckets_alloc(c, ca) || - bioset_init(&ca->replica_set, 4, - offsetof(struct bch_write_bio, bio), 0) || !(ca->io_done = alloc_percpu(*ca->io_done))) goto err; @@ -1411,10 +1416,9 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) le64_to_cpu(c->disk_sb.sb->seq)) bch2_sb_to_fs(c, sb->sb); - BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices || - !c->devs[sb->sb->dev_idx]); + BUG_ON(!bch2_dev_exists(c, sb->sb->dev_idx)); - ca = bch_dev_locked(c, sb->sb->dev_idx); + ca = bch2_dev_locked(c, sb->sb->dev_idx); ret = __bch2_dev_attach_bdev(ca, sb); if (ret) @@ -1506,10 +1510,10 @@ static bool bch2_fs_may_start(struct 
bch_fs *c) mutex_lock(&c->sb_lock); for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { - if (!bch2_dev_exists(c->disk_sb.sb, i)) + if (!bch2_member_exists(c->disk_sb.sb, i)) continue; - ca = bch_dev_locked(c, i); + ca = bch2_dev_locked(c, i); if (!bch2_dev_is_online(ca) && (ca->mi.state == BCH_MEMBER_STATE_rw || @@ -1530,6 +1534,7 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) * The allocator thread itself allocates btree nodes, so stop it first: */ bch2_dev_allocator_remove(c, ca); + bch2_recalc_capacity(c); bch2_dev_journal_stop(&c->journal, ca); } @@ -1541,6 +1546,7 @@ static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); + bch2_dev_do_discards(ca); } int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, @@ -1599,17 +1605,17 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) * with bch2_do_invalidates() and bch2_do_discards() */ ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end, - BTREE_TRIGGER_NORUN, NULL) ?: + BTREE_TRIGGER_norun, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end, - BTREE_TRIGGER_NORUN, NULL) ?: + BTREE_TRIGGER_norun, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_freespace, start, end, - BTREE_TRIGGER_NORUN, NULL) ?: + BTREE_TRIGGER_norun, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end, - BTREE_TRIGGER_NORUN, NULL) ?: + BTREE_TRIGGER_norun, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_alloc, start, end, - BTREE_TRIGGER_NORUN, NULL) ?: + BTREE_TRIGGER_norun, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end, - BTREE_TRIGGER_NORUN, NULL); + BTREE_TRIGGER_norun, NULL); bch_err_msg(c, ret, "removing dev alloc info"); return ret; } @@ -1626,7 +1632,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) * We consume a reference to ca->ref, regardless of whether we succeed * or fail: */ - percpu_ref_put(&ca->ref); + bch2_dev_put(ca); 
if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { bch_err(ca, "Cannot remove without losing data"); @@ -1678,7 +1684,12 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) rcu_assign_pointer(c->devs[ca->dev_idx], NULL); mutex_unlock(&c->sb_lock); +#ifndef CONFIG_BCACHEFS_DEBUG percpu_ref_kill(&ca->ref); +#else + ca->dying = true; + bch2_dev_put(ca); +#endif wait_for_completion(&ca->ref_completion); bch2_dev_free(ca); @@ -1761,7 +1772,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) if (ret) goto err; - ret = bch2_dev_journal_alloc(ca); + ret = bch2_dev_journal_alloc(ca, true); bch_err_msg(c, ret, "allocating journal"); if (ret) goto err; @@ -1777,9 +1788,28 @@ int bch2_dev_add(struct bch_fs *c, const char *path) if (dynamic_fault("bcachefs:add:no_slot")) goto no_slot; - for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) - if (!bch2_dev_exists(c->disk_sb.sb, dev_idx)) - goto have_slot; + if (c->sb.nr_devices < BCH_SB_MEMBERS_MAX) { + dev_idx = c->sb.nr_devices; + goto have_slot; + } + + int best = -1; + u64 best_last_mount = 0; + for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) { + struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx); + if (bch2_member_alive(&m)) + continue; + + u64 last_mount = le64_to_cpu(m.last_mount); + if (best < 0 || last_mount < best_last_mount) { + best = dev_idx; + best_last_mount = last_mount; + } + } + if (best >= 0) { + dev_idx = best; + goto have_slot; + } no_slot: ret = -BCH_ERR_ENOSPC_sb_members; bch_err_msg(c, ret, "setting up new superblock"); @@ -1821,7 +1851,7 @@ have_slot: bch2_dev_usage_journal_reserve(c); - ret = bch2_trans_mark_dev_sb(c, ca); + ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); bch_err_msg(ca, ret, "marking new superblock"); if (ret) goto err_late; @@ -1884,9 +1914,9 @@ int bch2_dev_online(struct bch_fs *c, const char *path) if (ret) goto err; - ca = bch_dev_locked(c, dev_idx); + ca = bch2_dev_locked(c, dev_idx); - 
ret = bch2_trans_mark_dev_sb(c, ca); + ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path); if (ret) goto err; @@ -1902,7 +1932,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path) } if (!ca->journal.nr) { - ret = bch2_dev_journal_alloc(ca); + ret = bch2_dev_journal_alloc(ca, false); bch_err_msg(ca, ret, "allocating journal"); if (ret) goto err; @@ -1979,7 +2009,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) if (ret) goto err; - ret = bch2_trans_mark_dev_sb(c, ca); + ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); if (ret) goto err; diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h index 11bcef170c..368a63d938 100644 --- a/fs/bcachefs/super_types.h +++ b/fs/bcachefs/super_types.h @@ -26,19 +26,4 @@ struct bch_devs_list { u8 data[BCH_BKEY_PTRS_MAX]; }; -struct bch_member_cpu { - u64 nbuckets; /* device size */ - u16 first_bucket; /* index of first bucket used */ - u16 bucket_size; /* sectors */ - u16 group; - u8 state; - u8 discard; - u8 data_allowed; - u8 durability; - u8 freespace_initialized; - u8 valid; - u8 btree_bitmap_shift; - u64 btree_allocated_bitmap; -}; - #endif /* _BCACHEFS_SUPER_TYPES_H */ diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 5be92fe3f4..93ca74d108 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -140,9 +140,8 @@ write_attribute(trigger_gc); write_attribute(trigger_discards); write_attribute(trigger_invalidates); write_attribute(trigger_journal_flush); -write_attribute(prune_cache); -write_attribute(btree_wakeup); -rw_attribute(btree_gc_periodic); +write_attribute(trigger_btree_cache_shrink); +write_attribute(trigger_btree_key_cache_shrink); rw_attribute(gc_gens_pos); read_attribute(uuid); @@ -189,12 +188,8 @@ static void bch2_write_refs_to_text(struct printbuf *out, struct bch_fs *c) { bch2_printbuf_tabstop_push(out, 24); - for (unsigned i = 0; i < 
ARRAY_SIZE(c->writes); i++) { - prt_str(out, bch2_write_refs[i]); - prt_tab(out); - prt_printf(out, "%li", atomic_long_read(&c->writes[i])); - prt_newline(out); - } + for (unsigned i = 0; i < ARRAY_SIZE(c->writes); i++) + prt_printf(out, "%s\t%li\n", bch2_write_refs[i], atomic_long_read(&c->writes[i])); } #endif @@ -278,7 +273,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c continue; ret = for_each_btree_key(trans, iter, id, POS_MIN, - BTREE_ITER_ALL_SNAPSHOTS, k, ({ + BTREE_ITER_all_snapshots, k, ({ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); struct bch_extent_crc_unpacked crc; const union bch_extent_entry *entry; @@ -313,22 +308,11 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c if (ret) return ret; - prt_str(out, "type"); printbuf_tabstop_push(out, 12); - prt_tab(out); - - prt_str(out, "compressed"); printbuf_tabstop_push(out, 16); - prt_tab_rjust(out); - - prt_str(out, "uncompressed"); printbuf_tabstop_push(out, 16); - prt_tab_rjust(out); - - prt_str(out, "average extent size"); printbuf_tabstop_push(out, 24); - prt_tab_rjust(out); - prt_newline(out); + prt_printf(out, "type\tcompressed\runcompressed\raverage extent size\r\n"); for (unsigned i = 0; i < ARRAY_SIZE(s); i++) { bch2_prt_compression_type(out, i); @@ -362,21 +346,6 @@ static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) prt_printf(out, "\n"); } -static void bch2_btree_wakeup_all(struct bch_fs *c) -{ - struct btree_trans *trans; - - seqmutex_lock(&c->btree_trans_lock); - list_for_each_entry(trans, &c->btree_trans_list, list) { - struct btree_bkey_cached_common *b = READ_ONCE(trans->locking); - - if (b) - six_lock_wakeup_all(&b->lock); - - } - seqmutex_unlock(&c->btree_trans_lock); -} - SHOW(bch2_fs) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); @@ -392,8 +361,6 @@ SHOW(bch2_fs) if (attr == &sysfs_btree_write_stats) bch2_btree_write_stats_to_text(out, c); - 
sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); - if (attr == &sysfs_gc_gens_pos) bch2_gc_gens_pos_to_text(out, c); @@ -416,7 +383,7 @@ SHOW(bch2_fs) bch2_journal_debug_to_text(out, &c->journal); if (attr == &sysfs_btree_cache) - bch2_btree_cache_to_text(out, c); + bch2_btree_cache_to_text(out, &c->btree_cache); if (attr == &sysfs_btree_key_cache) bch2_btree_key_cache_to_text(out, &c->btree_key_cache); @@ -459,6 +426,9 @@ SHOW(bch2_fs) if (attr == &sysfs_disk_groups) bch2_disk_groups_to_text(out, c); + if (attr == &sysfs_alloc_debug) + bch2_fs_alloc_debug_to_text(out, c); + return 0; } @@ -466,14 +436,6 @@ STORE(bch2_fs) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); - if (attr == &sysfs_btree_gc_periodic) { - ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic) - ?: (ssize_t) size; - - wake_up_process(c->gc_thread); - return ret; - } - if (attr == &sysfs_copy_gc_enabled) { ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled) ?: (ssize_t) size; @@ -505,7 +467,7 @@ STORE(bch2_fs) if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs)) return -EROFS; - if (attr == &sysfs_prune_cache) { + if (attr == &sysfs_trigger_btree_cache_shrink) { struct shrink_control sc; sc.gfp_mask = GFP_KERNEL; @@ -513,22 +475,17 @@ STORE(bch2_fs) c->btree_cache.shrink->scan_objects(c->btree_cache.shrink, &sc); } - if (attr == &sysfs_btree_wakeup) - bch2_btree_wakeup_all(c); - - if (attr == &sysfs_trigger_gc) { - /* - * Full gc is currently incompatible with btree key cache: - */ -#if 0 - down_read(&c->state_lock); - bch2_gc(c, false, false); - up_read(&c->state_lock); -#else - bch2_gc_gens(c); -#endif + if (attr == &sysfs_trigger_btree_key_cache_shrink) { + struct shrink_control sc; + + sc.gfp_mask = GFP_KERNEL; + sc.nr_to_scan = strtoul_or_return(buf); + c->btree_key_cache.shrink->scan_objects(c->btree_cache.shrink, &sc); } + if (attr == &sysfs_trigger_gc) + bch2_gc_gens(c); + if (attr == &sysfs_trigger_discards) bch2_do_discards(c); @@ -594,13 +551,11 @@ 
SHOW(bch2_fs_counters) if (attr == &sysfs_##t) { \ counter = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\ counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\ - prt_printf(out, "since mount:"); \ - prt_tab(out); \ + prt_printf(out, "since mount:\t"); \ prt_human_readable_u64(out, counter_since_mount); \ prt_newline(out); \ \ - prt_printf(out, "since filesystem creation:"); \ - prt_tab(out); \ + prt_printf(out, "since filesystem creation:\t"); \ prt_human_readable_u64(out, counter); \ prt_newline(out); \ } @@ -660,8 +615,8 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_discards, &sysfs_trigger_invalidates, &sysfs_trigger_journal_flush, - &sysfs_prune_cache, - &sysfs_btree_wakeup, + &sysfs_trigger_btree_cache_shrink, + &sysfs_trigger_btree_key_cache_shrink, &sysfs_gc_gens_pos, @@ -677,6 +632,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_internal_uuid, &sysfs_disk_groups, + &sysfs_alloc_debug, NULL }; @@ -792,88 +748,6 @@ struct attribute *bch2_fs_time_stats_files[] = { NULL }; -static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) -{ - struct bch_fs *c = ca->fs; - struct bch_dev_usage stats = bch2_dev_usage_read(ca); - unsigned i, nr[BCH_DATA_NR]; - - memset(nr, 0, sizeof(nr)); - - for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++) - nr[c->open_buckets[i].data_type]++; - - printbuf_tabstop_push(out, 8); - printbuf_tabstop_push(out, 16); - printbuf_tabstop_push(out, 16); - printbuf_tabstop_push(out, 16); - printbuf_tabstop_push(out, 16); - - bch2_dev_usage_to_text(out, &stats); - - prt_newline(out); - - prt_printf(out, "reserves:"); - prt_newline(out); - for (i = 0; i < BCH_WATERMARK_NR; i++) { - prt_str(out, bch2_watermarks[i]); - prt_tab(out); - prt_u64(out, bch2_dev_buckets_reserved(ca, i)); - prt_tab_rjust(out); - prt_newline(out); - } - - prt_newline(out); - - printbuf_tabstops_reset(out); - printbuf_tabstop_push(out, 24); - - prt_str(out, "freelist_wait"); - prt_tab(out); - prt_str(out, 
c->freelist_wait.list.first ? "waiting" : "empty"); - prt_newline(out); - - prt_str(out, "open buckets allocated"); - prt_tab(out); - prt_u64(out, OPEN_BUCKETS_COUNT - c->open_buckets_nr_free); - prt_newline(out); - - prt_str(out, "open buckets this dev"); - prt_tab(out); - prt_u64(out, ca->nr_open_buckets); - prt_newline(out); - - prt_str(out, "open buckets total"); - prt_tab(out); - prt_u64(out, OPEN_BUCKETS_COUNT); - prt_newline(out); - - prt_str(out, "open_buckets_wait"); - prt_tab(out); - prt_str(out, c->open_buckets_wait.list.first ? "waiting" : "empty"); - prt_newline(out); - - prt_str(out, "open_buckets_btree"); - prt_tab(out); - prt_u64(out, nr[BCH_DATA_btree]); - prt_newline(out); - - prt_str(out, "open_buckets_user"); - prt_tab(out); - prt_u64(out, nr[BCH_DATA_user]); - prt_newline(out); - - prt_str(out, "buckets_to_invalidate"); - prt_tab(out); - prt_u64(out, should_invalidate_buckets(ca, stats)); - prt_newline(out); - - prt_str(out, "btree reserve cache"); - prt_tab(out); - prt_u64(out, c->btree_reserve_cache_nr); - prt_newline(out); -} - static const char * const bch2_rw[] = { "read", "write", @@ -943,7 +817,7 @@ SHOW(bch2_dev) * 100 / CONGESTED_MAX); if (attr == &sysfs_alloc_debug) - dev_alloc_debug_to_text(out, ca); + bch2_dev_alloc_debug_to_text(out, ca); return 0; } diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index bfec656f94..68104b2056 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -40,7 +40,7 @@ static int test_delete(struct bch_fs *c, u64 nr) k.k.p.snapshot = U32_MAX; bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p, - BTREE_ITER_INTENT); + BTREE_ITER_intent); ret = commit_do(trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: @@ -81,7 +81,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr) k.k.p.snapshot = U32_MAX; bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p, - BTREE_ITER_INTENT); + BTREE_ITER_intent); ret = commit_do(trans, NULL, NULL, 0, bch2_btree_iter_traverse(&iter) ?: @@ 
-261,7 +261,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) ret = bch2_trans_run(c, for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - BTREE_ITER_SLOTS, k, ({ + BTREE_ITER_slots, k, ({ if (i >= nr * 2) break; @@ -322,7 +322,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) ret = bch2_trans_run(c, for_each_btree_key_upto(trans, iter, BTREE_ID_extents, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - BTREE_ITER_SLOTS, k, ({ + BTREE_ITER_slots, k, ({ if (i == nr) break; BUG_ON(bkey_deleted(k.k) != !(i % 16)); @@ -452,7 +452,7 @@ static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start, ret = bch2_trans_do(c, NULL, NULL, 0, bch2_btree_insert_nonextent(trans, BTREE_ID_extents, &k.k_i, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); + BTREE_UPDATE_internal_snapshot_node)); bch_err_fn(c, ret); return ret; } @@ -671,7 +671,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos) int ret = 0; bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos, - BTREE_ITER_INTENT); + BTREE_ITER_intent); k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)); ret = bkey_err(k); if (ret) @@ -714,7 +714,7 @@ static int seq_insert(struct bch_fs *c, u64 nr) return bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, + BTREE_ITER_slots|BTREE_ITER_intent, k, NULL, NULL, 0, ({ if (iter.pos.offset >= nr) break; @@ -737,7 +737,7 @@ static int seq_overwrite(struct bch_fs *c, u64 nr) return bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), - BTREE_ITER_INTENT, k, + BTREE_ITER_intent, k, NULL, NULL, 0, ({ struct bkey_i_cookie u; diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 6aa81d1e6d..84fcf26e30 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -43,7 +43,7 @@ DECLARE_EVENT_CLASS(fs_str, TP_fast_assign( __entry->dev = c->dev; - __assign_str(str, str); 
+ __assign_str(str); ), TP_printk("%d,%d\n%s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str)) @@ -64,7 +64,7 @@ DECLARE_EVENT_CLASS(trans_str, __entry->dev = trans->c->dev; strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; - __assign_str(str, str); + __assign_str(str); ), TP_printk("%d,%d %s %pS %s", @@ -85,7 +85,7 @@ DECLARE_EVENT_CLASS(trans_str_nocaller, TP_fast_assign( __entry->dev = trans->c->dev; strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __assign_str(str, str); + __assign_str(str); ), TP_printk("%d,%d %s %s", @@ -638,99 +638,14 @@ DEFINE_EVENT(bch_fs, gc_gens_end, /* Allocator */ -DECLARE_EVENT_CLASS(bucket_alloc, - TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, - u64 bucket, - u64 free, - u64 avail, - u64 copygc_wait_amount, - s64 copygc_waiting_for, - struct bucket_alloc_state *s, - bool nonblocking, - const char *err), - TP_ARGS(ca, alloc_reserve, bucket, free, avail, - copygc_wait_amount, copygc_waiting_for, - s, nonblocking, err), - - TP_STRUCT__entry( - __field(u8, dev ) - __array(char, reserve, 16 ) - __field(u64, bucket ) - __field(u64, free ) - __field(u64, avail ) - __field(u64, copygc_wait_amount ) - __field(s64, copygc_waiting_for ) - __field(u64, seen ) - __field(u64, open ) - __field(u64, need_journal_commit ) - __field(u64, nouse ) - __field(bool, nonblocking ) - __field(u64, nocow ) - __array(char, err, 32 ) - ), - - TP_fast_assign( - __entry->dev = ca->dev_idx; - strscpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); - __entry->bucket = bucket; - __entry->free = free; - __entry->avail = avail; - __entry->copygc_wait_amount = copygc_wait_amount; - __entry->copygc_waiting_for = copygc_waiting_for; - __entry->seen = s->buckets_seen; - __entry->open = s->skipped_open; - __entry->need_journal_commit = s->skipped_need_journal_commit; - __entry->nouse = s->skipped_nouse; - __entry->nonblocking = nonblocking; - __entry->nocow = s->skipped_nocow; 
- strscpy(__entry->err, err, sizeof(__entry->err)); - ), - - TP_printk("reserve %s bucket %u:%llu free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nocow %llu nonblocking %u err %s", - __entry->reserve, - __entry->dev, - __entry->bucket, - __entry->free, - __entry->avail, - __entry->copygc_wait_amount, - __entry->copygc_waiting_for, - __entry->seen, - __entry->open, - __entry->need_journal_commit, - __entry->nouse, - __entry->nocow, - __entry->nonblocking, - __entry->err) +DEFINE_EVENT(fs_str, bucket_alloc, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -DEFINE_EVENT(bucket_alloc, bucket_alloc, - TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, - u64 bucket, - u64 free, - u64 avail, - u64 copygc_wait_amount, - s64 copygc_waiting_for, - struct bucket_alloc_state *s, - bool nonblocking, - const char *err), - TP_ARGS(ca, alloc_reserve, bucket, free, avail, - copygc_wait_amount, copygc_waiting_for, - s, nonblocking, err) -); - -DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, - TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, - u64 bucket, - u64 free, - u64 avail, - u64 copygc_wait_amount, - s64 copygc_waiting_for, - struct bucket_alloc_state *s, - bool nonblocking, - const char *err), - TP_ARGS(ca, alloc_reserve, bucket, free, avail, - copygc_wait_amount, copygc_waiting_for, - s, nonblocking, err) +DEFINE_EVENT(fs_str, bucket_alloc_fail, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); TRACE_EVENT(discard_buckets, diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 92c6ad75e7..4ec7e44d6e 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -252,8 +252,10 @@ void bch2_prt_u64_base2(struct printbuf *out, u64 v) bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1); } -void bch2_print_string_as_lines(const char *prefix, const char *lines) +static void __bch2_print_string_as_lines(const char *prefix, const char *lines, + bool nonblocking) { + bool locked = false; 
const char *p; if (!lines) { @@ -261,7 +263,13 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines) return; } - console_lock(); + if (!nonblocking) { + console_lock(); + locked = true; + } else { + locked = console_trylock(); + } + while (1) { p = strchrnul(lines, '\n'); printk("%s%.*s\n", prefix, (int) (p - lines), lines); @@ -269,7 +277,18 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines) break; lines = p + 1; } - console_unlock(); + if (locked) + console_unlock(); +} + +void bch2_print_string_as_lines(const char *prefix, const char *lines) +{ + return __bch2_print_string_as_lines(prefix, lines, false); +} + +void bch2_print_string_as_lines_nonblocking(const char *prefix, const char *lines) +{ + return __bch2_print_string_as_lines(prefix, lines, true); } int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr, @@ -348,15 +367,12 @@ static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns) { const struct time_unit *u = bch2_pick_time_units(ns); - prt_printf(out, "%llu ", div64_u64(ns, u->nsecs)); - prt_tab_rjust(out); - prt_printf(out, "%s", u->name); + prt_printf(out, "%llu \r%s", div64_u64(ns, u->nsecs), u->name); } static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns) { - prt_str(out, name); - prt_tab(out); + prt_printf(out, "%s\t", name); bch2_pr_time_units_aligned(out, ns); prt_newline(out); } @@ -389,12 +405,8 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats } printbuf_tabstop_push(out, out->indent + TABSTOP_SIZE); - prt_printf(out, "count:"); - prt_tab(out); - prt_printf(out, "%llu ", - stats->duration_stats.n); + prt_printf(out, "count:\t%llu\n", stats->duration_stats.n); printbuf_tabstop_pop(out); - prt_newline(out); printbuf_tabstops_reset(out); @@ -403,13 +415,8 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats printbuf_tabstop_push(out, 0); printbuf_tabstop_push(out, 
TABSTOP_SIZE + 2); - prt_tab(out); - prt_printf(out, "since mount"); - prt_tab_rjust(out); - prt_tab(out); + prt_printf(out, "\tsince mount\r\trecent\r\n"); prt_printf(out, "recent"); - prt_tab_rjust(out); - prt_newline(out); printbuf_tabstops_reset(out); printbuf_tabstop_push(out, out->indent + 20); @@ -417,23 +424,20 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats printbuf_tabstop_push(out, 2); printbuf_tabstop_push(out, TABSTOP_SIZE); - prt_printf(out, "duration of events"); - prt_newline(out); + prt_printf(out, "duration of events\n"); printbuf_indent_add(out, 2); pr_name_and_units(out, "min:", stats->min_duration); pr_name_and_units(out, "max:", stats->max_duration); pr_name_and_units(out, "total:", stats->total_duration); - prt_printf(out, "mean:"); - prt_tab(out); + prt_printf(out, "mean:\t"); bch2_pr_time_units_aligned(out, d_mean); prt_tab(out); bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT)); prt_newline(out); - prt_printf(out, "stddev:"); - prt_tab(out); + prt_printf(out, "stddev:\t"); bch2_pr_time_units_aligned(out, d_stddev); prt_tab(out); bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT)); @@ -441,22 +445,19 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats printbuf_indent_sub(out, 2); prt_newline(out); - prt_printf(out, "time between events"); - prt_newline(out); + prt_printf(out, "time between events\n"); printbuf_indent_add(out, 2); pr_name_and_units(out, "min:", stats->min_freq); pr_name_and_units(out, "max:", stats->max_freq); - prt_printf(out, "mean:"); - prt_tab(out); + prt_printf(out, "mean:\t"); bch2_pr_time_units_aligned(out, f_mean); prt_tab(out); bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT)); prt_newline(out); - prt_printf(out, "stddev:"); - 
prt_tab(out); + prt_printf(out, "stddev:\t"); bch2_pr_time_units_aligned(out, f_stddev); prt_tab(out); bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT)); @@ -589,40 +590,31 @@ void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_contro if (!out->nr_tabstops) printbuf_tabstop_push(out, 20); - prt_printf(out, "rate:"); - prt_tab(out); + prt_printf(out, "rate:\t"); prt_human_readable_s64(out, pd->rate.rate); prt_newline(out); - prt_printf(out, "target:"); - prt_tab(out); + prt_printf(out, "target:\t"); prt_human_readable_u64(out, pd->last_target); prt_newline(out); - prt_printf(out, "actual:"); - prt_tab(out); + prt_printf(out, "actual:\t"); prt_human_readable_u64(out, pd->last_actual); prt_newline(out); - prt_printf(out, "proportional:"); - prt_tab(out); + prt_printf(out, "proportional:\t"); prt_human_readable_s64(out, pd->last_proportional); prt_newline(out); - prt_printf(out, "derivative:"); - prt_tab(out); + prt_printf(out, "derivative:\t"); prt_human_readable_s64(out, pd->last_derivative); prt_newline(out); - prt_printf(out, "change:"); - prt_tab(out); + prt_printf(out, "change:\t"); prt_human_readable_s64(out, pd->last_change); prt_newline(out); - prt_printf(out, "next io:"); - prt_tab(out); - prt_printf(out, "%llims", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC)); - prt_newline(out); + prt_printf(out, "next io:\t%llims\n", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC)); } /* misc: */ diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 5cf885b099..5b0533ec4c 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -315,6 +315,7 @@ void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned); void bch2_prt_u64_base2(struct printbuf *, u64); void bch2_print_string_as_lines(const char *prefix, const char *lines); +void bch2_print_string_as_lines_nonblocking(const char *prefix, const char *lines); typedef DARRAY(unsigned long) bch_stacktrace; 
int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t); @@ -445,11 +446,6 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) void bch2_bio_map(struct bio *bio, void *base, size_t); int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t); -static inline sector_t bdev_sectors(struct block_device *bdev) -{ - return bdev->bd_inode->i_size >> 9; -} - #define closure_bio_submit(bio, cl) \ do { \ closure_get(cl); \ diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 754f17bba6..c11bf6dacc 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -71,7 +71,7 @@ const struct bch_hash_desc bch2_xattr_hash_desc = { }; int bch2_xattr_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, + enum bch_validate_flags flags, struct printbuf *err) { struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); @@ -118,11 +118,17 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, else prt_printf(out, "(unknown type %u)", xattr.v->x_type); + unsigned name_len = xattr.v->x_name_len; + unsigned val_len = le16_to_cpu(xattr.v->x_val_len); + unsigned max_name_val_bytes = bkey_val_bytes(xattr.k) - + offsetof(struct bch_xattr, x_name); + + val_len = min_t(int, val_len, max_name_val_bytes - name_len); + name_len = min(name_len, max_name_val_bytes); + prt_printf(out, "%.*s:%.*s", - xattr.v->x_name_len, - xattr.v->x_name, - le16_to_cpu(xattr.v->x_val_len), - (char *) xattr_val(xattr.v)); + name_len, xattr.v->x_name, + val_len, (char *) xattr_val(xattr.v)); if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS || xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT) { @@ -138,21 +144,13 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode); struct xattr_search_key search = X_SEARCH(type, name, strlen(name)); struct btree_iter iter; - struct bkey_s_c_xattr xattr; - struct bkey_s_c k; - int 
ret; - - ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash, - inode_inum(inode), &search, 0); + struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash, + inode_inum(inode), &search, 0); + int ret = bkey_err(k); if (ret) - goto err1; - - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto err2; + return ret; - xattr = bkey_s_c_to_xattr(k); + struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); ret = le16_to_cpu(xattr.v->x_val_len); if (buffer) { if (ret > size) @@ -160,10 +158,8 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info else memcpy(buffer, xattr_val(xattr.v), ret); } -err2: bch2_trans_iter_exit(trans, &iter); -err1: - return ret < 0 && bch2_err_matches(ret, ENOENT) ? -ENODATA : ret; + return ret; } int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, @@ -177,7 +173,7 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, int ret; ret = bch2_subvol_is_ro_trans(trans, inum.subvol) ?: - bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); + bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_intent); if (ret) return ret; @@ -212,8 +208,8 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, inum, &xattr->k_i, - (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)| - (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0)); + (flags & XATTR_CREATE ? STR_HASH_must_create : 0)| + (flags & XATTR_REPLACE ? 
STR_HASH_must_replace : 0)); } else { struct xattr_search_key search = X_SEARCH(type, name, strlen(name)); @@ -359,6 +355,9 @@ static int bch2_xattr_get_handler(const struct xattr_handler *handler, int ret = bch2_trans_do(c, NULL, NULL, 0, bch2_xattr_get_trans(trans, inode, name, buffer, size, handler->flags)); + if (ret < 0 && bch2_err_matches(ret, ENOENT)) + ret = -ENODATA; + return bch2_err_class(ret); } diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h index 1337f31a5c..1574b9eb4c 100644 --- a/fs/bcachefs/xattr.h +++ b/fs/bcachefs/xattr.h @@ -7,7 +7,7 @@ extern const struct bch_hash_desc bch2_xattr_hash_desc; int bch2_xattr_invalid(struct bch_fs *, struct bkey_s_c, - enum bkey_invalid_flags, struct printbuf *); + enum bch_validate_flags, struct printbuf *); void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_xattr ((struct bkey_ops) { \ |