diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-18 18:50:03 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-18 18:50:03 +0000 |
commit | 01a69402cf9d38ff180345d55c2ee51c7e89fbc7 (patch) | |
tree | b406c5242a088c4f59c6e4b719b783f43aca6ae9 /fs/bcachefs | |
parent | Adding upstream version 6.7.12. (diff) | |
download | linux-upstream/6.8.9.tar.xz linux-upstream/6.8.9.zip |
Adding upstream version 6.8.9.upstream/6.8.9
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
146 files changed, 8616 insertions, 7306 deletions
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index fddc7be580..5cdfef3b55 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -50,14 +50,6 @@ config BCACHEFS_POSIX_ACL depends on BCACHEFS_FS select FS_POSIX_ACL -config BCACHEFS_DEBUG_TRANSACTIONS - bool "bcachefs runtime info" - depends on BCACHEFS_FS - help - This makes the list of running btree transactions available in debugfs. - - This is a highly useful debugging feature but does add a small amount of overhead. - config BCACHEFS_DEBUG bool "bcachefs debugging" depends on BCACHEFS_FS @@ -85,6 +77,16 @@ config BCACHEFS_NO_LATENCY_ACCT help This disables device latency tracking and time stats, only for performance testing +config BCACHEFS_SIX_OPTIMISTIC_SPIN + bool "Optimistic spinning for six locks" + depends on BCACHEFS_FS + depends on SMP + default y + help + Instead of immediately sleeping when attempting to take a six lock that + is held by another thread, spin for a short while, as long as the + thread owning the lock is running. + config MEAN_AND_VARIANCE_UNIT_TEST tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index b812684181..1a05cecda7 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -27,7 +27,6 @@ bcachefs-y := \ checksum.o \ clock.o \ compress.o \ - counters.o \ darray.o \ debug.o \ dirent.o \ @@ -71,6 +70,7 @@ bcachefs-y := \ reflink.o \ replicas.o \ sb-clean.o \ + sb-counters.o \ sb-downgrade.o \ sb-errors.o \ sb-members.o \ @@ -82,6 +82,7 @@ bcachefs-y := \ super-io.o \ sysfs.o \ tests.o \ + thread_with_file.o \ trace.o \ two_state_shared_lock.o \ util.o \ diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 1fec0e6789..fd3e175d83 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -261,10 +261,8 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, case BCH_DATA_free: case BCH_DATA_need_gc_gens: case BCH_DATA_need_discard: - bkey_fsck_err_on(a.v->dirty_sectors || - a.v->cached_sectors || - a.v->stripe, c, err, - alloc_key_empty_but_have_data, + bkey_fsck_err_on(bch2_bucket_sectors(*a.v) || a.v->stripe, + c, err, alloc_key_empty_but_have_data, "empty data type free but have data"); break; case BCH_DATA_sb: @@ -272,22 +270,21 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, case BCH_DATA_btree: case BCH_DATA_user: case BCH_DATA_parity: - bkey_fsck_err_on(!a.v->dirty_sectors, c, err, - alloc_key_dirty_sectors_0, + bkey_fsck_err_on(!bch2_bucket_sectors_dirty(*a.v), + c, err, alloc_key_dirty_sectors_0, "data_type %s but dirty_sectors==0", - bch2_data_types[a.v->data_type]); + bch2_data_type_str(a.v->data_type)); break; case BCH_DATA_cached: bkey_fsck_err_on(!a.v->cached_sectors || - a.v->dirty_sectors || - a.v->stripe, c, err, - alloc_key_cached_inconsistency, + bch2_bucket_sectors_dirty(*a.v) || + a.v->stripe, + c, err, alloc_key_cached_inconsistency, "data type inconsistency"); bkey_fsck_err_on(!a.v->io_time[READ] && c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs, - c, err, - alloc_key_cached_but_read_time_zero, + c, err, alloc_key_cached_but_read_time_zero, "cached bucket with read_time == 0"); break; case BCH_DATA_stripe: @@ -324,16 +321,12 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c { struct bch_alloc_v4 _a; const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); - unsigned i; prt_newline(out); printbuf_indent_add(out, 2); - prt_printf(out, "gen %u oldest_gen %u data_type %s", - a->gen, a->oldest_gen, - a->data_type < BCH_DATA_NR - ? bch2_data_types[a->data_type] - : "(invalid data type)"); + prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen); + bch2_prt_data_type(out, a->data_type); prt_newline(out); prt_printf(out, "journal_seq %llu", a->journal_seq); prt_newline(out); @@ -356,23 +349,6 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c prt_printf(out, "fragmentation %llu", a->fragmentation_lru); prt_newline(out); prt_printf(out, "bp_start %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a)); - prt_newline(out); - - if (BCH_ALLOC_V4_NR_BACKPOINTERS(a)) { - struct bkey_s_c_alloc_v4 a_raw = bkey_s_c_to_alloc_v4(k); - const struct bch_backpointer *bps = alloc_v4_backpointers_c(a_raw.v); - - prt_printf(out, "backpointers: %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v)); - printbuf_indent_add(out, 2); - - for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v); i++) { - prt_newline(out); - bch2_backpointer_to_text(out, &bps[i]); - } - - printbuf_indent_sub(out, 2); - } - printbuf_indent_sub(out, 2); } @@ -537,18 +513,12 @@ void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bke int bch2_bucket_gens_init(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - struct bch_alloc_v4 a; struct bkey_i_bucket_gens g; bool have_bucket_gens_key = false; - unsigned offset; - struct bpos pos; - u8 gen; int ret; - for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { + ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ({ /* * Not a fsck error because this is checked/repaired by * bch2_check_alloc_key() which runs later: @@ -556,13 +526,14 @@ int bch2_bucket_gens_init(struct bch_fs *c) if (!bch2_dev_bucket_exists(c, k.k->p)) continue; - gen = bch2_alloc_to_v4(k, &a)->gen; - pos = alloc_gens_pos(iter.pos, &offset); + struct bch_alloc_v4 a; + u8 gen = bch2_alloc_to_v4(k, &a)->gen; + unsigned offset; + struct bpos pos = alloc_gens_pos(iter.pos, &offset); if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) { ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, + BCH_TRANS_COMMIT_no_enospc, bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0)); if (ret) break; @@ -576,45 +547,37 @@ int bch2_bucket_gens_init(struct bch_fs *c) } g.v.gens[offset] = gen; - } - bch2_trans_iter_exit(trans, &iter); + 0; + })); if (have_bucket_gens_key && !ret) ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, + BCH_TRANS_COMMIT_no_enospc, bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0)); bch2_trans_put(trans); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } int bch2_alloc_read(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - struct bch_dev *ca; int ret; down_read(&c->gc_lock); if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) { - const struct bch_bucket_gens *g; - u64 b; - - for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { + ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN, + BTREE_ITER_PREFETCH, k, ({ u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; if (k.k->type != KEY_TYPE_bucket_gens) continue; - g = bkey_s_c_to_bucket_gens(k).v; + const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v; /* * Not a fsck error because this is checked/repaired by @@ -623,19 +586,17 @@ int bch2_alloc_read(struct bch_fs *c) if (!bch2_dev_exists2(c, k.k->p.inode)) continue; - ca = bch_dev_bkey_exists(c, k.k->p.inode); + struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode); - for (b = max_t(u64, ca->mi.first_bucket, start); + for (u64 b = max_t(u64, ca->mi.first_bucket, start); b < min_t(u64, ca->mi.nbuckets, end); b++) *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK]; - } - bch2_trans_iter_exit(trans, &iter); + 0; + })); } else { - struct bch_alloc_v4 a; - - for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { + ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ({ /* * Not a fsck error because this is checked/repaired by * bch2_check_alloc_key() which runs later: @@ -643,19 +604,18 @@ int bch2_alloc_read(struct bch_fs *c) if (!bch2_dev_bucket_exists(c, k.k->p)) continue; - ca = bch_dev_bkey_exists(c, k.k->p.inode); + struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode); + struct bch_alloc_v4 a; *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen; - } - bch2_trans_iter_exit(trans, &iter); + 0; + })); } bch2_trans_put(trans); up_read(&c->gc_lock); - if (ret) - bch_err_fn(c, ret); - + bch_err_fn(c, ret); return ret; } @@ -768,83 +728,177 @@ static noinline int bch2_bucket_gen_update(struct btree_trans *trans, return ret; } -int bch2_trans_mark_alloc(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_i *new, - unsigned flags) +int bch2_trigger_alloc(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_s_c old, struct bkey_s new, + unsigned flags) { struct bch_fs *c = trans->c; - struct bch_alloc_v4 old_a_convert, *new_a; - const struct bch_alloc_v4 *old_a; - u64 old_lru, new_lru; int ret = 0; - /* - * Deletion only happens in the device removal path, with - * BTREE_TRIGGER_NORUN: - */ - BUG_ON(new->k.type != KEY_TYPE_alloc_v4); + if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans, + "alloc key for invalid device or bucket")) + return -EIO; - old_a = bch2_alloc_to_v4(old, &old_a_convert); - new_a = &bkey_i_to_alloc_v4(new)->v; + struct bch_dev *ca = bch_dev_bkey_exists(c, new.k->p.inode); - new_a->data_type = alloc_data_type(*new_a, new_a->data_type); + struct bch_alloc_v4 old_a_convert; + const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert); - if (new_a->dirty_sectors > old_a->dirty_sectors || - new_a->cached_sectors > old_a->cached_sectors) { - new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); - new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); - SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); - SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true); - } + if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v; - if (data_type_is_empty(new_a->data_type) && - BCH_ALLOC_V4_NEED_INC_GEN(new_a) && - !bch2_bucket_is_open_safe(c, new->k.p.inode, new->k.p.offset)) { - new_a->gen++; - SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); - } + new_a->data_type = alloc_data_type(*new_a, new_a->data_type); - if (old_a->data_type != new_a->data_type || - (new_a->data_type == BCH_DATA_free && - alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) { - ret = bch2_bucket_do_index(trans, old, old_a, false) ?: - bch2_bucket_do_index(trans, bkey_i_to_s_c(new), new_a, true); - if (ret) - return ret; - } + if (bch2_bucket_sectors(*new_a) > bch2_bucket_sectors(*old_a)) { + new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); + new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); + SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); + SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true); + } - if (new_a->data_type == BCH_DATA_cached && - !new_a->io_time[READ]) - new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); + if (data_type_is_empty(new_a->data_type) && + BCH_ALLOC_V4_NEED_INC_GEN(new_a) && + !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) { + new_a->gen++; + SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); + } + + if (old_a->data_type != new_a->data_type || + (new_a->data_type == BCH_DATA_free && + alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) { + ret = bch2_bucket_do_index(trans, old, old_a, false) ?: + bch2_bucket_do_index(trans, new.s_c, new_a, true); + if (ret) + return ret; + } - old_lru = alloc_lru_idx_read(*old_a); - new_lru = alloc_lru_idx_read(*new_a); + if (new_a->data_type == BCH_DATA_cached && + !new_a->io_time[READ]) + new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); - if (old_lru != new_lru) { - ret = bch2_lru_change(trans, new->k.p.inode, - bucket_to_u64(new->k.p), - old_lru, new_lru); - if (ret) - return ret; + u64 old_lru = alloc_lru_idx_read(*old_a); + u64 new_lru = alloc_lru_idx_read(*new_a); + if (old_lru != new_lru) { + ret = bch2_lru_change(trans, new.k->p.inode, + bucket_to_u64(new.k->p), + old_lru, new_lru); + if (ret) + return ret; + } + + new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a, + bch_dev_bkey_exists(c, new.k->p.inode)); + if (old_a->fragmentation_lru != new_a->fragmentation_lru) { + ret = bch2_lru_change(trans, + BCH_LRU_FRAGMENTATION_START, + bucket_to_u64(new.k->p), + old_a->fragmentation_lru, new_a->fragmentation_lru); + if (ret) + return ret; + } + + if (old_a->gen != new_a->gen) { + ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen); + if (ret) + return ret; + } + + /* + * need to know if we're getting called from the invalidate path or + * not: + */ + + if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && + old_a->cached_sectors) { + ret = bch2_update_cached_sectors_list(trans, new.k->p.inode, + -((s64) old_a->cached_sectors)); + if (ret) + return ret; + } } - new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a, - bch_dev_bkey_exists(c, new->k.p.inode)); + if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) { + struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v; + u64 journal_seq = trans->journal_res.seq; + u64 bucket_journal_seq = new_a->journal_seq; - if (old_a->fragmentation_lru != new_a->fragmentation_lru) { - ret = bch2_lru_change(trans, - BCH_LRU_FRAGMENTATION_START, - bucket_to_u64(new->k.p), - old_a->fragmentation_lru, new_a->fragmentation_lru); - if (ret) - return ret; + if ((flags & BTREE_TRIGGER_INSERT) && + data_type_is_empty(old_a->data_type) != + data_type_is_empty(new_a->data_type) && + new.k->type == KEY_TYPE_alloc_v4) { + struct bch_alloc_v4 *v = bkey_s_to_alloc_v4(new).v; + + /* + * If the btree updates referring to a bucket weren't flushed + * before the bucket became empty again, then the we don't have + * to wait on a journal flush before we can reuse the bucket: + */ + v->journal_seq = bucket_journal_seq = + data_type_is_empty(new_a->data_type) && + (journal_seq == v->journal_seq || + bch2_journal_noflush_seq(&c->journal, v->journal_seq)) + ? 0 : journal_seq; + } + + if (!data_type_is_empty(old_a->data_type) && + data_type_is_empty(new_a->data_type) && + bucket_journal_seq) { + ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, + c->journal.flushed_seq_ondisk, + new.k->p.inode, new.k->p.offset, + bucket_journal_seq); + if (ret) { + bch2_fs_fatal_error(c, + "error setting bucket_needs_journal_commit: %i", ret); + return ret; + } + } + + percpu_down_read(&c->mark_lock); + if (new_a->gen != old_a->gen) + *bucket_gen(ca, new.k->p.offset) = new_a->gen; + + bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false); + + if (new_a->data_type == BCH_DATA_free && + (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk)) + closure_wake_up(&c->freelist_wait); + + if (new_a->data_type == BCH_DATA_need_discard && + (!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk)) + bch2_do_discards(c); + + if (old_a->data_type != BCH_DATA_cached && + new_a->data_type == BCH_DATA_cached && + should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) + bch2_do_invalidates(c); + + if (new_a->data_type == BCH_DATA_need_gc_gens) + bch2_do_gc_gens(c); + percpu_up_read(&c->mark_lock); } - if (old_a->gen != new_a->gen) { - ret = bch2_bucket_gen_update(trans, new->k.p, new_a->gen); - if (ret) - return ret; + if ((flags & BTREE_TRIGGER_GC) && + (flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) { + struct bch_alloc_v4 new_a_convert; + const struct bch_alloc_v4 *new_a = bch2_alloc_to_v4(new.s_c, &new_a_convert); + + percpu_down_read(&c->mark_lock); + struct bucket *g = gc_bucket(ca, new.k->p.offset); + + bucket_lock(g); + + g->gen_valid = 1; + g->gen = new_a->gen; + g->data_type = new_a->data_type; + g->stripe = new_a->stripe; + g->stripe_redundancy = new_a->stripe_redundancy; + g->dirty_sectors = new_a->dirty_sectors; + g->cached_sectors = new_a->cached_sectors; + + bucket_unlock(g); + percpu_up_read(&c->mark_lock); } return 0; @@ -869,8 +923,9 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos bch2_trans_copy_iter(&iter2, iter); - if (!bpos_eq(iter->path->l[0].b->key.k.p, SPOS_MAX)) - end = bkey_min(end, bpos_nosnap_successor(iter->path->l[0].b->key.k.p)); + struct btree_path *path = btree_iter_path(iter->trans, iter); + if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX)) + end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p)); end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1)); @@ -898,7 +953,6 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos static bool next_bucket(struct bch_fs *c, struct bpos *bucket) { struct bch_dev *ca; - unsigned iter; if (bch2_dev_bucket_exists(c, *bucket)) return true; @@ -916,8 +970,7 @@ static bool next_bucket(struct bch_fs *c, struct bpos *bucket) } rcu_read_lock(); - iter = bucket->inode; - ca = __bch2_next_dev(c, &iter, NULL); + ca = __bch2_next_dev_idx(c, bucket->inode, NULL); if (ca) *bucket = POS(ca->dev_idx, ca->mi.first_bucket); rcu_read_unlock(); @@ -1158,9 +1211,6 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, unsigned i, gens_offset, gens_end_offset; int ret; - if (c->sb.version < bcachefs_metadata_version_bucket_gens) - return 0; - bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset)); k = bch2_btree_iter_peek_slot(bucket_gens_iter); @@ -1212,7 +1262,7 @@ fsck_err: return ret; } -static noinline_for_stack int __bch2_check_discard_freespace_key(struct btree_trans *trans, +static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter) { struct bch_fs *c = trans->c; @@ -1267,28 +1317,10 @@ delete: ret = bch2_btree_delete_extent_at(trans, iter, iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW); + BCH_TRANS_COMMIT_no_enospc); goto out; } -static int bch2_check_discard_freespace_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos end) -{ - if (!btree_id_is_extents(iter->btree_id)) { - return __bch2_check_discard_freespace_key(trans, iter); - } else { - int ret = 0; - - while (!bkey_eq(iter->pos, end) && - !(ret = btree_trans_too_many_iters(trans) ?: - __bch2_check_discard_freespace_key(trans, iter))) - bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); - - return ret; - } -} - /* * We've already checked that generation numbers in the bucket_gens btree are * valid for buckets that exist; this just checks for keys for nonexistent @@ -1422,8 +1454,7 @@ int bch2_check_alloc_info(struct bch_fs *c) } ret = bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); + BCH_TRANS_COMMIT_no_enospc); if (ret) goto bkey_err; @@ -1442,23 +1473,50 @@ bkey_err: if (ret < 0) goto err; - ret = for_each_btree_key2(trans, iter, + ret = for_each_btree_key(trans, iter, BTREE_ID_need_discard, POS_MIN, BTREE_ITER_PREFETCH, k, - bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?: - for_each_btree_key2(trans, iter, - BTREE_ID_freespace, POS_MIN, - BTREE_ITER_PREFETCH, k, - bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?: - for_each_btree_key_commit(trans, iter, + bch2_check_discard_freespace_key(trans, &iter)); + if (ret) + goto err; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN, + BTREE_ITER_PREFETCH); + while (1) { + bch2_trans_begin(trans); + k = bch2_btree_iter_peek(&iter); + if (!k.k) + break; + + ret = bkey_err(k) ?: + bch2_check_discard_freespace_key(trans, &iter); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + ret = 0; + continue; + } + if (ret) { + struct printbuf buf = PRINTBUF; + bch2_bkey_val_to_text(&buf, c, k); + + bch_err(c, "while checking %s", buf.buf); + printbuf_exit(&buf); + break; + } + + bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos)); + } + bch2_trans_iter_exit(trans, &iter); + if (ret) + goto err; + + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_bucket_gens, POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_check_bucket_gens_key(trans, &iter, k)); err: bch2_trans_put(trans); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -1486,6 +1544,27 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, if (a->data_type != BCH_DATA_cached) return 0; + if (fsck_err_on(!a->io_time[READ], c, + alloc_key_cached_but_read_time_zero, + "cached bucket with read_time 0\n" + " %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { + struct bkey_i_alloc_v4 *a_mut = + bch2_alloc_to_v4_mut(trans, alloc_k); + ret = PTR_ERR_OR_ZERO(a_mut); + if (ret) + goto err; + + a_mut->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); + ret = bch2_trans_update(trans, alloc_iter, + &a_mut->k_i, BTREE_TRIGGER_NORUN); + if (ret) + goto err; + + a = &a_mut->v; + } + lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru, lru_pos(alloc_k.k->p.inode, bucket_to_u64(alloc_k.k->p), @@ -1494,41 +1573,18 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, if (ret) return ret; - if (fsck_err_on(!a->io_time[READ], c, - alloc_key_cached_but_read_time_zero, - "cached bucket with read_time 0\n" - " %s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) || - fsck_err_on(lru_k.k->type != KEY_TYPE_set, c, + if (fsck_err_on(lru_k.k->type != KEY_TYPE_set, c, alloc_key_to_missing_lru_entry, "missing lru entry\n" " %s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { - u64 read_time = a->io_time[READ] ?: - atomic64_read(&c->io_clock[READ].now); - ret = bch2_lru_set(trans, alloc_k.k->p.inode, bucket_to_u64(alloc_k.k->p), - read_time); + a->io_time[READ]); if (ret) goto err; - - if (a->io_time[READ] != read_time) { - struct bkey_i_alloc_v4 *a_mut = - bch2_alloc_to_v4_mut(trans, alloc_k); - ret = PTR_ERR_OR_ZERO(a_mut); - if (ret) - goto err; - - a_mut->v.io_time[READ] = read_time; - ret = bch2_trans_update(trans, alloc_iter, - &a_mut->k_i, BTREE_TRIGGER_NORUN); - if (ret) - goto err; - } } err: fsck_err: @@ -1539,27 +1595,45 @@ fsck_err: int bch2_check_alloc_to_lru_refs(struct bch_fs *c) { - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - ret = bch2_trans_run(c, + int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_check_alloc_to_lru_ref(trans, &iter))); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } +struct discard_buckets_state { + u64 seen; + u64 open; + u64 need_journal_commit; + u64 discarded; + struct bch_dev *ca; + u64 need_journal_commit_this_dev; +}; + +static void discard_buckets_next_dev(struct bch_fs *c, struct discard_buckets_state *s, struct bch_dev *ca) +{ + if (s->ca == ca) + return; + + if (s->ca && s->need_journal_commit_this_dev > + bch2_dev_usage_read(s->ca).d[BCH_DATA_free].buckets) + bch2_journal_flush_async(&c->journal, NULL); + + if (s->ca) + percpu_ref_put(&s->ca->ref); + if (ca) + percpu_ref_get(&ca->ref); + s->ca = ca; + s->need_journal_commit_this_dev = 0; +} + static int bch2_discard_one_bucket(struct btree_trans *trans, struct btree_iter *need_discard_iter, struct bpos *discard_pos_done, - u64 *seen, - u64 *open, - u64 *need_journal_commit, - u64 *discarded) + struct discard_buckets_state *s) { struct bch_fs *c = trans->c; struct bpos pos = need_discard_iter->pos; @@ -1571,20 +1645,24 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, int ret = 0; ca = bch_dev_bkey_exists(c, pos.inode); + if (!percpu_ref_tryget(&ca->io_ref)) { bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0)); return 0; } + discard_buckets_next_dev(c, s, ca); + if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) { - (*open)++; + s->open++; goto out; } if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, c->journal.flushed_seq_ondisk, pos.inode, pos.offset)) { - (*need_journal_commit)++; + s->need_journal_commit++; + s->need_journal_commit_this_dev++; goto out; } @@ -1637,7 +1715,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, * This works without any other locks because this is the only * thread that removes items from the need_discard tree */ - bch2_trans_unlock(trans); + bch2_trans_unlock_long(trans); blkdev_issue_discard(ca->disk_sb.bdev, k.k->p.offset * ca->mi.bucket_size, ca->mi.bucket_size, @@ -1655,14 +1733,14 @@ write: ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: bch2_trans_commit(trans, NULL, NULL, BCH_WATERMARK_btree| - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_enospc); if (ret) goto out; - this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]); - (*discarded)++; + count_event(c, bucket_discard); + s->discarded++; out: - (*seen)++; + s->seen++; bch2_trans_iter_exit(trans, &iter); percpu_ref_put(&ca->io_ref); printbuf_exit(&buf); @@ -1672,9 +1750,7 @@ out: static void bch2_do_discards_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, discard_work); - struct btree_iter iter; - struct bkey_s_c k; - u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0; + struct discard_buckets_state s = {}; struct bpos discard_pos_done = POS_MAX; int ret; @@ -1684,21 +1760,16 @@ static void bch2_do_discards_work(struct work_struct *work) * successful commit: */ ret = bch2_trans_run(c, - for_each_btree_key2(trans, iter, - BTREE_ID_need_discard, POS_MIN, 0, k, - bch2_discard_one_bucket(trans, &iter, &discard_pos_done, - &seen, - &open, - &need_journal_commit, - &discarded))); - - if (need_journal_commit * 2 > seen) - bch2_journal_flush_async(&c->journal, NULL); + for_each_btree_key(trans, iter, + BTREE_ID_need_discard, POS_MIN, 0, k, + bch2_discard_one_bucket(trans, &iter, &discard_pos_done, &s))); - bch2_write_ref_put(c, BCH_WRITE_REF_discard); + discard_buckets_next_dev(c, &s, NULL); - trace_discard_buckets(c, seen, open, need_journal_commit, discarded, + trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); + + bch2_write_ref_put(c, BCH_WRITE_REF_discard); } void bch2_do_discards(struct bch_fs *c) @@ -1760,7 +1831,7 @@ static int invalidate_one_bucket(struct btree_trans *trans, BTREE_TRIGGER_BUCKET_INVALIDATE) ?: bch2_trans_commit(trans, NULL, NULL, BCH_WATERMARK_btree| - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_enospc); if (ret) goto out; @@ -1795,22 +1866,18 @@ err: static void bch2_do_invalidates_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work); - struct bch_dev *ca; struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - unsigned i; int ret = 0; - ret = bch2_btree_write_buffer_flush(trans); + ret = bch2_btree_write_buffer_tryflush(trans); if (ret) goto err; - for_each_member_device(ca, c, i) { + for_each_member_device(c, ca) { s64 nr_to_invalidate = should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); - ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru, + ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru, lru_pos(ca->dev_idx, 0, 0), lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX), BTREE_ITER_INTENT, k, @@ -1884,8 +1951,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, ret = bch2_bucket_do_index(trans, k, a, true) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_enospc); if (ret) goto bkey_err; @@ -1905,8 +1971,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_enospc); if (ret) goto bkey_err; @@ -1937,8 +2002,6 @@ bkey_err: int bch2_fs_freespace_init(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i; int ret = 0; bool doing_init = false; @@ -1947,7 +2010,7 @@ int bch2_fs_freespace_init(struct bch_fs *c) * every mount: */ - for_each_member_device(ca, c, i) { + for_each_member_device(c, ca) { if (ca->mi.freespace_initialized) continue; @@ -2007,15 +2070,13 @@ out: void bch2_recalc_capacity(struct bch_fs *c) { - struct bch_dev *ca; u64 capacity = 0, reserved_sectors = 0, gc_reserve; unsigned bucket_size_max = 0; unsigned long ra_pages = 0; - unsigned i; lockdep_assert_held(&c->state_lock); - for_each_online_member(ca, c, i) { + for_each_online_member(c, ca) { struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi; ra_pages += bdi->ra_pages; @@ -2023,7 +2084,7 @@ void bch2_recalc_capacity(struct bch_fs *c) bch2_set_ra_pages(c, ra_pages); - for_each_rw_member(ca, c, i) { + for_each_rw_member(c, ca) { u64 dev_reserve = 0; /* @@ -2079,11 +2140,9 @@ void bch2_recalc_capacity(struct bch_fs *c) u64 bch2_min_rw_member_capacity(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i; u64 ret = U64_MAX; - for_each_rw_member(ca, c, i) + for_each_rw_member(c, ca) ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size); return ret; } diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 73faf99a22..e7f7e842ee 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -71,6 +71,24 @@ static inline enum bch_data_type bucket_data_type(enum bch_data_type data_type) return data_type == BCH_DATA_stripe ? BCH_DATA_user : data_type; } +static inline unsigned bch2_bucket_sectors(struct bch_alloc_v4 a) +{ + return a.dirty_sectors + a.cached_sectors; +} + +static inline unsigned bch2_bucket_sectors_dirty(struct bch_alloc_v4 a) +{ + return a.dirty_sectors; +} + +static inline unsigned bch2_bucket_sectors_fragmented(struct bch_dev *ca, + struct bch_alloc_v4 a) +{ + int d = bch2_bucket_sectors_dirty(a); + + return d ? max(0, ca->mi.bucket_size - d) : 0; +} + static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a) { return a.data_type == BCH_DATA_cached ? a.io_time[READ] : 0; @@ -90,10 +108,11 @@ static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a, struct bch_dev *ca) { if (!data_type_movable(a.data_type) || - a.dirty_sectors >= ca->mi.bucket_size) + !bch2_bucket_sectors_fragmented(ca, a)) return 0; - return div_u64((u64) a.dirty_sectors * (1ULL << 31), ca->mi.bucket_size); + u64 d = bch2_bucket_sectors_dirty(a); + return div_u64(d * (1ULL << 31), ca->mi.bucket_size); } static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a) @@ -163,24 +182,21 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_alloc ((struct bkey_ops) { \ .key_invalid = bch2_alloc_v1_invalid, \ .val_to_text = bch2_alloc_to_text, \ - .trans_trigger = bch2_trans_mark_alloc, \ - .atomic_trigger = bch2_mark_alloc, \ + .trigger = bch2_trigger_alloc, \ .min_val_size = 8, \ }) #define bch2_bkey_ops_alloc_v2 ((struct bkey_ops) { \ .key_invalid = bch2_alloc_v2_invalid, \ .val_to_text = bch2_alloc_to_text, \ - .trans_trigger = bch2_trans_mark_alloc, \ - .atomic_trigger = bch2_mark_alloc, \ + .trigger = bch2_trigger_alloc, \ .min_val_size = 8, \ }) #define bch2_bkey_ops_alloc_v3 ((struct bkey_ops) { \ .key_invalid = bch2_alloc_v3_invalid, \ .val_to_text = bch2_alloc_to_text, \ - .trans_trigger = bch2_trans_mark_alloc, \ - .atomic_trigger = bch2_mark_alloc, \ + .trigger = bch2_trigger_alloc, \ .min_val_size = 16, \ }) @@ -188,8 +204,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .key_invalid = bch2_alloc_v4_invalid, \ .val_to_text = bch2_alloc_to_text, \ .swab = bch2_alloc_v4_swab, \ - .trans_trigger = bch2_trans_mark_alloc, \ - .atomic_trigger = bch2_mark_alloc, \ + .trigger = bch2_trigger_alloc, \ .min_val_size = 48, \ }) @@ -213,8 +228,8 @@ static inline bool bkey_is_alloc(const struct bkey *k) int bch2_alloc_read(struct bch_fs *); -int bch2_trans_mark_alloc(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s, unsigned); int bch2_check_alloc_info(struct bch_fs *); int bch2_check_alloc_to_lru_refs(struct bch_fs *); void bch2_do_discards(struct bch_fs *); diff --git a/fs/bcachefs/alloc_background_format.h b/fs/bcachefs/alloc_background_format.h new file mode 100644 index 0000000000..b4ec20be93 --- /dev/null +++ b/fs/bcachefs/alloc_background_format.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H +#define _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H + +struct bch_alloc { + struct bch_val v; + __u8 fields; + __u8 gen; + __u8 data[]; +} __packed __aligned(8); + +#define BCH_ALLOC_FIELDS_V1() \ + x(read_time, 16) \ + x(write_time, 16) \ + x(data_type, 8) \ + x(dirty_sectors, 16) \ + x(cached_sectors, 16) \ + x(oldest_gen, 8) \ + x(stripe, 32) \ + x(stripe_redundancy, 8) + +enum { +#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, + BCH_ALLOC_FIELDS_V1() +#undef x +}; + +struct bch_alloc_v2 { + struct bch_val v; + __u8 nr_fields; + __u8 gen; + __u8 oldest_gen; + __u8 data_type; + __u8 data[]; +} __packed __aligned(8); + +#define BCH_ALLOC_FIELDS_V2() \ + x(read_time, 64) \ + x(write_time, 64) \ + x(dirty_sectors, 32) \ + x(cached_sectors, 32) \ + x(stripe, 32) \ + x(stripe_redundancy, 8) + +struct bch_alloc_v3 { + struct bch_val v; + __le64 journal_seq; + __le32 flags; + __u8 nr_fields; + __u8 gen; + __u8 oldest_gen; + __u8 data_type; + __u8 data[]; +} __packed __aligned(8); + +LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1) +LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) + +struct bch_alloc_v4 { + struct bch_val v; + __u64 journal_seq; + __u32 flags; + __u8 gen; + __u8 oldest_gen; + __u8 data_type; + __u8 stripe_redundancy; + __u32 dirty_sectors; + __u32 cached_sectors; + __u64 io_time[2]; + __u32 stripe; + __u32 nr_external_backpointers; + __u64 fragmentation_lru; +} __packed __aligned(8); + +#define BCH_ALLOC_V4_U64s_V0 6 +#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64)) + +BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1) +BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2) +BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8) +BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14) + +#define KEY_TYPE_BUCKET_GENS_BITS 8 +#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS) +#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1) + +struct bch_bucket_gens { + struct bch_val v; + u8 gens[KEY_TYPE_BUCKET_GENS_NR]; +} __packed __aligned(8); + +#endif /* _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H */ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 0e61579826..633d3223b3 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -69,11 +69,8 @@ const char * const bch2_watermarks[] = { void bch2_reset_alloc_cursors(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i; - rcu_read_lock(); - for_each_member_device_rcu(ca, c, i, NULL) + for_each_member_device_rcu(c, ca, NULL) ca->alloc_cursor = 0; rcu_read_unlock(); } @@ -239,9 +236,8 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * if (cl) closure_wait(&c->open_buckets_wait, cl); - if (!c->blocked_allocate_open_bucket) - c->blocked_allocate_open_bucket = local_clock(); - + track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], + &c->blocked_allocate_open_bucket, true); spin_unlock(&c->freelist_lock); return ERR_PTR(-BCH_ERR_open_buckets_empty); } @@ -267,19 +263,11 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * ca->nr_open_buckets++; bch2_open_bucket_hash_add(c, ob); - if (c->blocked_allocate_open_bucket) { - bch2_time_stats_update( - &c->times[BCH_TIME_blocked_allocate_open_bucket], - c->blocked_allocate_open_bucket); - c->blocked_allocate_open_bucket = 0; - } + track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], + &c->blocked_allocate_open_bucket, false); - if (c->blocked_allocate) { - bch2_time_stats_update( - &c->times[BCH_TIME_blocked_allocate], - c->blocked_allocate); - c->blocked_allocate = 0; - } + track_event_change(&c->times[BCH_TIME_blocked_allocate], + &c->blocked_allocate, false); spin_unlock(&c->freelist_lock); return ob; @@ -377,9 +365,9 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl); if (!ob) - iter.path->preserve = false; + set_btree_iter_dontneed(&iter); err: - if (iter.trans && iter.path) + if (iter.path) set_btree_iter_dontneed(&iter); bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); @@ -447,7 +435,7 @@ again: ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl); next: - citer.path->preserve = false; + set_btree_iter_dontneed(&citer); bch2_trans_iter_exit(trans, &citer); if (ob) break; @@ -502,7 +490,7 @@ again: ob = try_alloc_bucket(trans, ca, watermark, alloc_cursor, s, k, cl); if (ob) { - iter.path->preserve = false; + set_btree_iter_dontneed(&iter); break; } } @@ -567,8 +555,8 @@ again: goto again; } - if (!c->blocked_allocate) - c->blocked_allocate = local_clock(); + track_event_change(&c->times[BCH_TIME_blocked_allocate], + &c->blocked_allocate, true); ob = ERR_PTR(-BCH_ERR_freelist_empty); goto err; @@ -697,11 +685,9 @@ static int add_new_bucket(struct bch_fs *c, bch_dev_bkey_exists(c, ob->dev)->mi.durability; BUG_ON(*nr_effective >= nr_replicas); - BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS); __clear_bit(ob->dev, devs_may_alloc->d); - *nr_effective += (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) - ? durability : 1; + *nr_effective += durability; *have_cache |= !durability; ob_push(c, ptrs, ob); @@ -972,8 +958,8 @@ static int __open_bucket_add_buckets(struct btree_trans *trans, devs = target_rw_devs(c, wp->data_type, target); /* Don't allocate from devices we already have pointers to: */ - for (i = 0; i < devs_have->nr; i++) - __clear_bit(devs_have->devs[i], devs.d); + darray_for_each(*devs_have, i) + __clear_bit(*i, devs.d); open_bucket_for_each(c, ptrs, ob, i) __clear_bit(ob->dev, devs.d); @@ -1539,10 +1525,11 @@ static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, str unsigned data_type = ob->data_type; barrier(); /* READ_ONCE() doesn't work on bitfields */ - prt_printf(out, "%zu ref %u %s %u:%llu gen %u allocated %u/%u", + prt_printf(out, "%zu ref %u ", ob - c->open_buckets, - atomic_read(&ob->pin), - data_type < BCH_DATA_NR ? bch2_data_types[data_type] : "invalid data type", + atomic_read(&ob->pin)); + bch2_prt_data_type(out, data_type); + prt_printf(out, " %u:%llu gen %u allocated %u/%u", ob->dev, ob->bucket, ob->gen, ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size); if (ob->ec) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 23c0834a97..569b97904d 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -3,6 +3,7 @@ #include "bbpos.h" #include "alloc_background.h" #include "backpointers.h" +#include "bkey_buf.h" #include "btree_cache.h" #include "btree_update.h" #include "btree_update_interior.h" @@ -67,9 +68,11 @@ void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - prt_str(out, "bucket="); - bch2_bpos_to_text(out, bp_pos_to_bucket(c, k.k->p)); - prt_str(out, " "); + if (bch2_dev_exists2(c, k.k->p.inode)) { + prt_str(out, "bucket="); + bch2_bpos_to_text(out, bp_pos_to_bucket(c, k.k->p)); + prt_str(out, " "); + } bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v); } @@ -136,15 +139,30 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, } int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, - struct bkey_i_backpointer *bp_k, + struct bpos bucket, struct bch_backpointer bp, struct bkey_s_c orig_k, bool insert) { struct btree_iter bp_iter; struct bkey_s_c k; + struct bkey_i_backpointer *bp_k; int ret; + bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer)); + ret = PTR_ERR_OR_ZERO(bp_k); + if (ret) + return ret; + + bkey_backpointer_init(&bp_k->k_i); + bp_k->k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset); + bp_k->v = bp; + + if (!insert) { + bp_k->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&bp_k->k, 0); + } + k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp_k->k.p, BTREE_ITER_INTENT| @@ -375,41 +393,45 @@ fsck_err: /* verify that every backpointer has a corresponding alloc key */ int bch2_check_btree_backpointers(struct bch_fs *c) { - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - ret = bch2_trans_run(c, + int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, POS_MIN, 0, k, - NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_check_btree_backpointer(trans, &iter, k))); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } -struct bpos_level { - unsigned level; - struct bpos pos; +static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r) +{ + return bpos_eq(l.k->p, r.k->p) && + bkey_bytes(l.k) == bkey_bytes(r.k) && + !memcmp(l.v, r.v, bkey_val_bytes(l.k)); +} + +struct extents_to_bp_state { + struct bpos bucket_start; + struct bpos bucket_end; + struct bkey_buf last_flushed; }; static int check_bp_exists(struct btree_trans *trans, + struct extents_to_bp_state *s, struct bpos bucket, struct bch_backpointer bp, - struct bkey_s_c orig_k, - struct bpos bucket_start, - struct bpos bucket_end, - struct bpos_level *last_flushed) + struct bkey_s_c orig_k) { struct bch_fs *c = trans->c; struct btree_iter bp_iter = { NULL }; struct printbuf buf = PRINTBUF; struct bkey_s_c bp_k; + struct bkey_buf tmp; int ret; - if (bpos_lt(bucket, bucket_start) || - bpos_gt(bucket, bucket_end)) + bch2_bkey_buf_init(&tmp); + + if (bpos_lt(bucket, s->bucket_start) || + bpos_gt(bucket, s->bucket_end)) return 0; if (!bch2_dev_bucket_exists(c, bucket)) @@ -424,13 +446,20 @@ static int check_bp_exists(struct btree_trans *trans, if (bp_k.k->type != KEY_TYPE_backpointer || memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) { - if (last_flushed->level != bp.level || - !bpos_eq(last_flushed->pos, orig_k.k->p)) { - last_flushed->level = bp.level; - last_flushed->pos = orig_k.k->p; + bch2_bkey_buf_reassemble(&tmp, c, orig_k); + + if (!bkey_and_val_eq(orig_k, bkey_i_to_s_c(s->last_flushed.k))) { + if (bp.level) { + bch2_trans_unlock(trans); + bch2_btree_interior_updates_flush(c); + } - ret = bch2_btree_write_buffer_flush_sync(trans) ?: - -BCH_ERR_transaction_restart_write_buffer_flush; + ret = bch2_btree_write_buffer_flush_sync(trans); + if (ret) + goto err; + + bch2_bkey_buf_copy(&s->last_flushed, c, tmp.k); + ret = -BCH_ERR_transaction_restart_write_buffer_flush; goto out; } goto missing; @@ -439,6 +468,7 @@ out: err: fsck_err: bch2_trans_iter_exit(trans, &bp_iter); + bch2_bkey_buf_exit(&tmp, c); printbuf_exit(&buf); return ret; missing: @@ -448,8 +478,7 @@ missing: prt_printf(&buf, "\nbp pos "); bch2_bpos_to_text(&buf, bp_iter.pos); - if (c->sb.version_upgrade_complete < bcachefs_metadata_version_backpointers || - c->opts.reconstruct_alloc || + if (c->opts.reconstruct_alloc || fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf)) ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true); @@ -457,25 +486,16 @@ missing: } static int check_extent_to_backpointers(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos bucket_start, - struct bpos bucket_end, - struct bpos_level *last_flushed) + struct extents_to_bp_state *s, + enum btree_id btree, unsigned level, + struct bkey_s_c k) { struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs; const union bch_extent_entry *entry; struct extent_ptr_decoded p; - struct bkey_s_c k; int ret; - k = bch2_btree_iter_peek_all_levels(iter); - ret = bkey_err(k); - if (ret) - return ret; - if (!k.k) - return 0; - ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { struct bpos bucket_pos; @@ -484,12 +504,10 @@ static int check_extent_to_backpointers(struct btree_trans *trans, if (p.ptr.cached) continue; - bch2_extent_ptr_to_bp(c, iter->btree_id, iter->path->level, + bch2_extent_ptr_to_bp(c, btree, level, k, p, &bucket_pos, &bp); - ret = check_bp_exists(trans, bucket_pos, bp, k, - bucket_start, bucket_end, - last_flushed); + ret = check_bp_exists(trans, s, bucket_pos, bp, k); if (ret) return ret; } @@ -498,47 +516,32 @@ static int check_extent_to_backpointers(struct btree_trans *trans, } static int check_btree_root_to_backpointers(struct btree_trans *trans, + struct extents_to_bp_state *s, enum btree_id btree_id, - struct bpos bucket_start, - struct bpos bucket_end, - struct bpos_level *last_flushed) + int *level) { struct bch_fs *c = trans->c; - struct btree_root *r = bch2_btree_id_root(c, btree_id); struct btree_iter iter; struct btree *b; struct bkey_s_c k; - struct bkey_ptrs_c ptrs; - struct extent_ptr_decoded p; - const union bch_extent_entry *entry; int ret; - - bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, r->level, 0); +retry: + bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, + 0, bch2_btree_id_root(c, btree_id)->b->c.level, 0); b = bch2_btree_iter_peek_node(&iter); ret = PTR_ERR_OR_ZERO(b); if (ret) goto err; - BUG_ON(b != btree_node_root(c, b)); - - k = bkey_i_to_s_c(&b->key); - ptrs = bch2_bkey_ptrs_c(k); - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bpos bucket_pos; - struct bch_backpointer bp; - - if (p.ptr.cached) - continue; + if (b != btree_node_root(c, b)) { + bch2_trans_iter_exit(trans, &iter); + goto retry; + } - bch2_extent_ptr_to_bp(c, iter.btree_id, b->c.level + 1, - k, p, &bucket_pos, &bp); + *level = b->c.level; - ret = check_bp_exists(trans, bucket_pos, bp, k, - bucket_start, bucket_end, - last_flushed); - if (ret) - goto err; - } + k = bkey_i_to_s_c(&b->key); + ret = check_extent_to_backpointers(trans, s, btree_id, b->c.level + 1, k); err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -559,7 +562,7 @@ static size_t btree_nodes_fit_in_ram(struct bch_fs *c) si_meminfo(&i); mem_bytes = i.totalram * i.mem_unit; - return div_u64(mem_bytes >> 1, btree_bytes(c)); + return div_u64(mem_bytes >> 1, c->opts.btree_node_size); } static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, @@ -610,49 +613,57 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, } static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, - struct bpos bucket_start, - struct bpos bucket_end) + struct extents_to_bp_state *s) { struct bch_fs *c = trans->c; - struct btree_iter iter; - enum btree_id btree_id; - struct bpos_level last_flushed = { UINT_MAX, POS_MIN }; int ret = 0; - for (btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) { - unsigned depth = btree_type_has_ptrs(btree_id) ? 0 : 1; + for (enum btree_id btree_id = 0; + btree_id < btree_id_nr_alive(c); + btree_id++) { + int level, depth = btree_type_has_ptrs(btree_id) ? 0 : 1; - bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, - depth, - BTREE_ITER_ALL_LEVELS| - BTREE_ITER_PREFETCH); - - do { - ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL, - check_extent_to_backpointers(trans, &iter, - bucket_start, bucket_end, - &last_flushed)); - if (ret) - break; - } while (!bch2_btree_iter_advance(&iter)); + ret = commit_do(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc, + check_btree_root_to_backpointers(trans, s, btree_id, &level)); + if (ret) + return ret; - bch2_trans_iter_exit(trans, &iter); + while (level >= depth) { + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, + level, + BTREE_ITER_PREFETCH); + while (1) { + bch2_trans_begin(trans); + + struct bkey_s_c k = bch2_btree_iter_peek(&iter); + if (!k.k) + break; + ret = bkey_err(k) ?: + check_extent_to_backpointers(trans, s, btree_id, level, k) ?: + bch2_trans_commit(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + ret = 0; + continue; + } + if (ret) + break; + if (bpos_eq(iter.pos, SPOS_MAX)) + break; + bch2_btree_iter_advance(&iter); + } + bch2_trans_iter_exit(trans, &iter); - if (ret) - break; + if (ret) + return ret; - ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL, - check_btree_root_to_backpointers(trans, btree_id, - bucket_start, bucket_end, - &last_flushed)); - if (ret) - break; + --level; + } } - return ret; + + return 0; } static struct bpos bucket_pos_to_bp_safe(const struct bch_fs *c, @@ -714,40 +725,45 @@ static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans, int bch2_check_extents_to_backpointers(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); - struct bpos start = POS_MIN, end; + struct extents_to_bp_state s = { .bucket_start = POS_MIN }; int ret; + bch2_bkey_buf_init(&s.last_flushed); + bkey_init(&s.last_flushed.k->k); + while (1) { - ret = bch2_get_alloc_in_memory_pos(trans, start, &end); + ret = bch2_get_alloc_in_memory_pos(trans, s.bucket_start, &s.bucket_end); if (ret) break; - if (bpos_eq(start, POS_MIN) && !bpos_eq(end, SPOS_MAX)) + if ( bpos_eq(s.bucket_start, POS_MIN) && + !bpos_eq(s.bucket_end, SPOS_MAX)) bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass", __func__, btree_nodes_fit_in_ram(c)); - if (!bpos_eq(start, POS_MIN) || !bpos_eq(end, SPOS_MAX)) { + if (!bpos_eq(s.bucket_start, POS_MIN) || + !bpos_eq(s.bucket_end, SPOS_MAX)) { struct printbuf buf = PRINTBUF; prt_str(&buf, "check_extents_to_backpointers(): "); - bch2_bpos_to_text(&buf, start); + bch2_bpos_to_text(&buf, s.bucket_start); prt_str(&buf, "-"); - bch2_bpos_to_text(&buf, end); + bch2_bpos_to_text(&buf, s.bucket_end); bch_verbose(c, "%s", buf.buf); printbuf_exit(&buf); } - ret = bch2_check_extents_to_backpointers_pass(trans, start, end); - if (ret || bpos_eq(end, SPOS_MAX)) + ret = bch2_check_extents_to_backpointers_pass(trans, &s); + if (ret || bpos_eq(s.bucket_end, SPOS_MAX)) break; - start = bpos_successor(end); + s.bucket_start = bpos_successor(s.bucket_end); } bch2_trans_put(trans); + bch2_bkey_buf_exit(&s.last_flushed, c); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -801,13 +817,11 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, struct bbpos start, struct bbpos end) { - struct btree_iter iter; - struct bkey_s_c k; struct bpos last_flushed_pos = SPOS_MAX; return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_one_backpointer(trans, start, end, bkey_s_c_to_backpointer(k), &last_flushed_pos)); @@ -854,7 +868,6 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c) } bch2_trans_put(trans); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index ab866feeaf..327365a9fe 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -2,6 +2,7 @@ #ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H #define _BCACHEFS_BACKPOINTERS_BACKGROUND_H +#include "btree_cache.h" #include "btree_iter.h" #include "btree_update.h" #include "buckets.h" @@ -63,7 +64,7 @@ static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c, return ret; } -int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bkey_i_backpointer *, +int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bpos bucket, struct bch_backpointer, struct bkey_s_c, bool); static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, @@ -72,28 +73,21 @@ static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, struct bkey_s_c orig_k, bool insert) { - struct bch_fs *c = trans->c; - struct bkey_i_backpointer *bp_k; - int ret; + if (unlikely(bch2_backpointers_no_use_write_buffer)) + return bch2_bucket_backpointer_mod_nowritebuffer(trans, bucket, bp, orig_k, insert); - bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer)); - ret = PTR_ERR_OR_ZERO(bp_k); - if (ret) - return ret; + struct bkey_i_backpointer bp_k; - bkey_backpointer_init(&bp_k->k_i); - bp_k->k.p = bucket_pos_to_bp(c, bucket, bp.bucket_offset); - bp_k->v = bp; + bkey_backpointer_init(&bp_k.k_i); + bp_k.k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset); + bp_k.v = bp; if (!insert) { - bp_k->k.type = KEY_TYPE_deleted; - set_bkey_val_u64s(&bp_k->k, 0); + bp_k.k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&bp_k.k, 0); } - if (unlikely(bch2_backpointers_no_use_write_buffer)) - return bch2_bucket_backpointer_mod_nowritebuffer(trans, bp_k, bp, orig_k, insert); - - return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k->k_i); + return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k.k_i); } static inline enum bch_data_type bkey_ptr_data_type(enum btree_id btree_id, unsigned level, diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index b62737fdf5..69d0d60d50 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -193,6 +193,7 @@ #include <linux/mutex.h> #include <linux/percpu-refcount.h> #include <linux/percpu-rwsem.h> +#include <linux/refcount.h> #include <linux/rhashtable.h> #include <linux/rwsem.h> #include <linux/semaphore.h> @@ -223,9 +224,11 @@ #define race_fault(...) dynamic_fault("bcachefs:race") +#define count_event(_c, _name) this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]) + #define trace_and_count(_c, _name, ...) \ do { \ - this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]); \ + count_event(_c, _name); \ trace_##_name(__VA_ARGS__); \ } while (0) @@ -262,46 +265,76 @@ do { \ #define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n") +__printf(2, 3) +void __bch2_print(struct bch_fs *c, const char *fmt, ...); + +#define maybe_dev_to_fs(_c) _Generic((_c), \ + struct bch_dev *: ((struct bch_dev *) (_c))->fs, \ + struct bch_fs *: (_c)) + +#define bch2_print(_c, ...) __bch2_print(maybe_dev_to_fs(_c), __VA_ARGS__) + +#define bch2_print_ratelimited(_c, ...) \ +do { \ + static DEFINE_RATELIMIT_STATE(_rs, \ + DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + \ + if (__ratelimit(&_rs)) \ + bch2_print(_c, __VA_ARGS__); \ +} while (0) + #define bch_info(c, fmt, ...) \ - printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) + bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_notice(c, fmt, ...) \ - printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__) + bch2_print(c, KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_warn(c, fmt, ...) \ - printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) + bch2_print(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_warn_ratelimited(c, fmt, ...) \ - printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) + bch2_print_ratelimited(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_err(c, fmt, ...) \ - printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) + bch2_print(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_err_dev(ca, fmt, ...) \ - printk(KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__) + bch2_print(c, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__) #define bch_err_dev_offset(ca, _offset, fmt, ...) \ - printk(KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__) + bch2_print(c, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__) #define bch_err_inum(c, _inum, fmt, ...) \ - printk(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) + bch2_print(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) #define bch_err_inum_offset(c, _inum, _offset, fmt, ...) \ - printk(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) + bch2_print(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) #define bch_err_ratelimited(c, fmt, ...) \ - printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) + bch2_print_ratelimited(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_err_dev_ratelimited(ca, fmt, ...) \ - printk_ratelimited(KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__) + bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__) #define bch_err_dev_offset_ratelimited(ca, _offset, fmt, ...) \ - printk_ratelimited(KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__) + bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__) #define bch_err_inum_ratelimited(c, _inum, fmt, ...) \ - printk_ratelimited(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) + bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) #define bch_err_inum_offset_ratelimited(c, _inum, _offset, fmt, ...) \ - printk_ratelimited(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) + bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) + +static inline bool should_print_err(int err) +{ + return err && !bch2_err_matches(err, BCH_ERR_transaction_restart); +} #define bch_err_fn(_c, _ret) \ do { \ - if (_ret && !bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ + if (should_print_err(_ret)) \ bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\ } while (0) +#define bch_err_fn_ratelimited(_c, _ret) \ +do { \ + if (should_print_err(_ret)) \ + bch_err_ratelimited(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\ +} while (0) + #define bch_err_msg(_c, _ret, _msg, ...) \ do { \ - if (_ret && !bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ + if (should_print_err(_ret)) \ bch_err(_c, "%s(): error " _msg " %s", __func__, \ ##__VA_ARGS__, bch2_err_str(_ret)); \ } while (0) @@ -392,6 +425,7 @@ BCH_DEBUG_PARAMS_DEBUG() x(btree_node_merge) \ x(btree_node_sort) \ x(btree_node_read) \ + x(btree_node_read_done) \ x(btree_interior_update_foreground) \ x(btree_interior_update_total) \ x(btree_gc) \ @@ -401,9 +435,12 @@ BCH_DEBUG_PARAMS_DEBUG() x(journal_flush_write) \ x(journal_noflush_write) \ x(journal_flush_seq) \ - x(blocked_journal) \ + x(blocked_journal_low_on_space) \ + x(blocked_journal_low_on_pin) \ + x(blocked_journal_max_in_flight) \ x(blocked_allocate) \ x(blocked_allocate_open_bucket) \ + x(blocked_write_buffer_full) \ x(nocow_lock_contended) enum bch_time_stats { @@ -428,6 +465,7 @@ enum bch_time_stats { #include "replicas_types.h" #include "subvolume_types.h" #include "super_types.h" +#include "thread_with_file_types.h" /* Number of nodes btree coalesce will try to coalesce at once */ #define GC_MERGE_NODES 4U @@ -564,32 +602,35 @@ struct bch_dev { struct io_count __percpu *io_done; }; -enum { - /* startup: */ - BCH_FS_STARTED, - BCH_FS_MAY_GO_RW, - BCH_FS_RW, - BCH_FS_WAS_RW, - - /* shutdown: */ - BCH_FS_STOPPING, - BCH_FS_EMERGENCY_RO, - BCH_FS_GOING_RO, - BCH_FS_WRITE_DISABLE_COMPLETE, - BCH_FS_CLEAN_SHUTDOWN, - - /* fsck passes: */ - BCH_FS_FSCK_DONE, - BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */ - BCH_FS_NEED_ANOTHER_GC, - - BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, - - /* errors: */ - BCH_FS_ERROR, - BCH_FS_TOPOLOGY_ERROR, - BCH_FS_ERRORS_FIXED, - BCH_FS_ERRORS_NOT_FIXED, +/* + * initial_gc_unfixed + * error + * topology error + */ + +#define BCH_FS_FLAGS() \ + x(started) \ + x(may_go_rw) \ + x(rw) \ + x(was_rw) \ + x(stopping) \ + x(emergency_ro) \ + x(going_ro) \ + x(write_disable_complete) \ + x(clean_shutdown) \ + x(fsck_running) \ + x(initial_gc_unfixed) \ + x(need_another_gc) \ + x(need_delete_dead_snapshots) \ + x(error) \ + x(topology_error) \ + x(errors_fixed) \ + x(errors_not_fixed) + +enum bch_fs_flags { +#define x(n) BCH_FS_##n, + BCH_FS_FLAGS() +#undef x }; struct btree_debug { @@ -599,10 +640,11 @@ struct btree_debug { #define BCH_TRANSACTIONS_NR 128 struct btree_transaction_stats { + struct bch2_time_stats duration; struct bch2_time_stats lock_hold_times; struct mutex lock; unsigned nr_max_paths; - unsigned wb_updates_size; + unsigned journal_entries_size; unsigned max_mem; char *max_paths_text; }; @@ -664,7 +706,8 @@ struct btree_trans_buf { x(invalidate) \ x(delete_dead_snapshots) \ x(snapshot_delete_pagecache) \ - x(sysfs) + x(sysfs) \ + x(btree_write_buffer) enum bch_write_ref { #define x(n) BCH_WRITE_REF_##n, @@ -689,6 +732,8 @@ struct bch_fs { struct super_block *vfs_sb; dev_t dev; char name[40]; + struct stdio_redirect *stdio; + struct task_struct *stdio_filter; /* ro/rw, add/remove/resize devices: */ struct rw_semaphore state_lock; @@ -699,6 +744,13 @@ struct bch_fs { #else struct percpu_ref writes; #endif + /* + * Analagous to c->writes, for asynchronous ops that don't necessarily + * need fs to be read-write + */ + refcount_t ro_ref; + wait_queue_head_t ro_ref_wait; + struct work_struct read_only_work; struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; @@ -1002,10 +1054,21 @@ struct bch_fs { /* RECOVERY */ u64 journal_replay_seq_start; u64 journal_replay_seq_end; + /* + * Two different uses: + * "Has this fsck pass?" - i.e. should this type of error be an + * emergency read-only + * And, in certain situations fsck will rewind to an earlier pass: used + * for signaling to the toplevel code which pass we want to run now. + */ enum bch_recovery_pass curr_recovery_pass; /* bitmap of explicitly enabled recovery passes: */ u64 recovery_passes_explicit; + /* bitmask of recovery passes that we actually ran */ u64 recovery_passes_complete; + /* never rewinds version of curr_recovery_pass */ + enum bch_recovery_pass recovery_pass_done; + struct semaphore online_fsck_mutex; /* DEBUG JUNK */ struct dentry *fs_debug_dir; @@ -1065,10 +1128,20 @@ static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref) #endif } +static inline bool __bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref) +{ +#ifdef BCH_WRITE_REF_DEBUG + return !test_bit(BCH_FS_going_ro, &c->flags) && + atomic_long_inc_not_zero(&c->writes[ref]); +#else + return percpu_ref_tryget(&c->writes); +#endif +} + static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref) { #ifdef BCH_WRITE_REF_DEBUG - return !test_bit(BCH_FS_GOING_RO, &c->flags) && + return !test_bit(BCH_FS_going_ro, &c->flags) && atomic_long_inc_not_zero(&c->writes[ref]); #else return percpu_ref_tryget_live(&c->writes); @@ -1087,13 +1160,27 @@ static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref) if (atomic_long_read(&c->writes[i])) return; - set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); + set_bit(BCH_FS_write_disable_complete, &c->flags); wake_up(&bch2_read_only_wait); #else percpu_ref_put(&c->writes); #endif } +static inline bool bch2_ro_ref_tryget(struct bch_fs *c) +{ + if (test_bit(BCH_FS_stopping, &c->flags)) + return false; + + return refcount_inc_not_zero(&c->ro_ref); +} + +static inline void bch2_ro_ref_put(struct bch_fs *c) +{ + if (refcount_dec_and_test(&c->ro_ref)) + wake_up(&c->ro_ref_wait); +} + static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages) { #ifndef NO_BCACHEFS_FS @@ -1117,11 +1204,6 @@ static inline unsigned block_sectors(const struct bch_fs *c) return c->opts.block_size >> 9; } -static inline size_t btree_sectors(const struct bch_fs *c) -{ - return c->opts.btree_node_size >> 9; -} - static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree) { return c->btree_key_cache_btrees & (1U << btree); @@ -1158,6 +1240,27 @@ static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev) return dev < c->sb.nr_devices && c->devs[dev]; } +static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c) +{ + struct stdio_redirect *stdio = c->stdio; + + if (c->stdio_filter && c->stdio_filter != current) + stdio = NULL; + return stdio; +} + +static inline unsigned metadata_replicas_required(struct bch_fs *c) +{ + return min(c->opts.metadata_replicas, + c->opts.metadata_replicas_required); +} + +static inline unsigned data_replicas_required(struct bch_fs *c) +{ + return min(c->opts.data_replicas, + c->opts.data_replicas_required); +} + #define BKEY_PADDED_ONSTACK(key, pad) \ struct { struct bkey_i key; __u64 key ## _pad[pad]; } diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index fe78e87603..0668b682a2 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -307,6 +307,13 @@ struct bkey_i { struct bch_val v; }; +#define POS_KEY(_pos) \ +((struct bkey) { \ + .u64s = BKEY_U64s, \ + .format = KEY_FORMAT_CURRENT, \ + .p = _pos, \ +}) + #define KEY(_inode, _offset, _size) \ ((struct bkey) { \ .u64s = BKEY_U64s, \ @@ -410,600 +417,12 @@ struct bch_set { struct bch_val v; }; -/* Extents */ - -/* - * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally - * preceded by checksum/compression information (bch_extent_crc32 or - * bch_extent_crc64). - * - * One major determining factor in the format of extents is how we handle and - * represent extents that have been partially overwritten and thus trimmed: - * - * If an extent is not checksummed or compressed, when the extent is trimmed we - * don't have to remember the extent we originally allocated and wrote: we can - * merely adjust ptr->offset to point to the start of the data that is currently - * live. The size field in struct bkey records the current (live) size of the - * extent, and is also used to mean "size of region on disk that we point to" in - * this case. - * - * Thus an extent that is not checksummed or compressed will consist only of a - * list of bch_extent_ptrs, with none of the fields in - * bch_extent_crc32/bch_extent_crc64. - * - * When an extent is checksummed or compressed, it's not possible to read only - * the data that is currently live: we have to read the entire extent that was - * originally written, and then return only the part of the extent that is - * currently live. - * - * Thus, in addition to the current size of the extent in struct bkey, we need - * to store the size of the originally allocated space - this is the - * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also, - * when the extent is trimmed, instead of modifying the offset field of the - * pointer, we keep a second smaller offset field - "offset into the original - * extent of the currently live region". - * - * The other major determining factor is replication and data migration: - * - * Each pointer may have its own bch_extent_crc32/64. When doing a replicated - * write, we will initially write all the replicas in the same format, with the - * same checksum type and compression format - however, when copygc runs later (or - * tiering/cache promotion, anything that moves data), it is not in general - * going to rewrite all the pointers at once - one of the replicas may be in a - * bucket on one device that has very little fragmentation while another lives - * in a bucket that has become heavily fragmented, and thus is being rewritten - * sooner than the rest. - * - * Thus it will only move a subset of the pointers (or in the case of - * tiering/cache promotion perhaps add a single pointer without dropping any - * current pointers), and if the extent has been partially overwritten it must - * write only the currently live portion (or copygc would not be able to reduce - * fragmentation!) - which necessitates a different bch_extent_crc format for - * the new pointer. - * - * But in the interests of space efficiency, we don't want to store one - * bch_extent_crc for each pointer if we don't have to. - * - * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and - * bch_extent_ptrs appended arbitrarily one after the other. We determine the - * type of a given entry with a scheme similar to utf8 (except we're encoding a - * type, not a size), encoding the type in the position of the first set bit: - * - * bch_extent_crc32 - 0b1 - * bch_extent_ptr - 0b10 - * bch_extent_crc64 - 0b100 - * - * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and - * bch_extent_crc64 is the least constrained). - * - * Then, each bch_extent_crc32/64 applies to the pointers that follow after it, - * until the next bch_extent_crc32/64. - * - * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer - * is neither checksummed nor compressed. - */ - /* 128 bits, sufficient for cryptographic MACs: */ struct bch_csum { __le64 lo; __le64 hi; } __packed __aligned(8); -#define BCH_EXTENT_ENTRY_TYPES() \ - x(ptr, 0) \ - x(crc32, 1) \ - x(crc64, 2) \ - x(crc128, 3) \ - x(stripe_ptr, 4) \ - x(rebalance, 5) -#define BCH_EXTENT_ENTRY_MAX 6 - -enum bch_extent_entry_type { -#define x(f, n) BCH_EXTENT_ENTRY_##f = n, - BCH_EXTENT_ENTRY_TYPES() -#undef x -}; - -/* Compressed/uncompressed size are stored biased by 1: */ -struct bch_extent_crc32 { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u32 type:2, - _compressed_size:7, - _uncompressed_size:7, - offset:7, - _unused:1, - csum_type:4, - compression_type:4; - __u32 csum; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u32 csum; - __u32 compression_type:4, - csum_type:4, - _unused:1, - offset:7, - _uncompressed_size:7, - _compressed_size:7, - type:2; -#endif -} __packed __aligned(8); - -#define CRC32_SIZE_MAX (1U << 7) -#define CRC32_NONCE_MAX 0 - -struct bch_extent_crc64 { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:3, - _compressed_size:9, - _uncompressed_size:9, - offset:9, - nonce:10, - csum_type:4, - compression_type:4, - csum_hi:16; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 csum_hi:16, - compression_type:4, - csum_type:4, - nonce:10, - offset:9, - _uncompressed_size:9, - _compressed_size:9, - type:3; -#endif - __u64 csum_lo; -} __packed __aligned(8); - -#define CRC64_SIZE_MAX (1U << 9) -#define CRC64_NONCE_MAX ((1U << 10) - 1) - -struct bch_extent_crc128 { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:4, - _compressed_size:13, - _uncompressed_size:13, - offset:13, - nonce:13, - csum_type:4, - compression_type:4; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 compression_type:4, - csum_type:4, - nonce:13, - offset:13, - _uncompressed_size:13, - _compressed_size:13, - type:4; -#endif - struct bch_csum csum; -} __packed __aligned(8); - -#define CRC128_SIZE_MAX (1U << 13) -#define CRC128_NONCE_MAX ((1U << 13) - 1) - -/* - * @reservation - pointer hasn't been written to, just reserved - */ -struct bch_extent_ptr { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:1, - cached:1, - unused:1, - unwritten:1, - offset:44, /* 8 petabytes */ - dev:8, - gen:8; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 gen:8, - dev:8, - offset:44, - unwritten:1, - unused:1, - cached:1, - type:1; -#endif -} __packed __aligned(8); - -struct bch_extent_stripe_ptr { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:5, - block:8, - redundancy:4, - idx:47; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 idx:47, - redundancy:4, - block:8, - type:5; -#endif -}; - -struct bch_extent_rebalance { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:6, - unused:34, - compression:8, /* enum bch_compression_opt */ - target:16; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 target:16, - compression:8, - unused:34, - type:6; -#endif -}; - -union bch_extent_entry { -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 - unsigned long type; -#elif __BITS_PER_LONG == 32 - struct { - unsigned long pad; - unsigned long type; - }; -#else -#error edit for your odd byteorder. -#endif - -#define x(f, n) struct bch_extent_##f f; - BCH_EXTENT_ENTRY_TYPES() -#undef x -}; - -struct bch_btree_ptr { - struct bch_val v; - - __u64 _data[0]; - struct bch_extent_ptr start[]; -} __packed __aligned(8); - -struct bch_btree_ptr_v2 { - struct bch_val v; - - __u64 mem_ptr; - __le64 seq; - __le16 sectors_written; - __le16 flags; - struct bpos min_key; - __u64 _data[0]; - struct bch_extent_ptr start[]; -} __packed __aligned(8); - -LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1); - -struct bch_extent { - struct bch_val v; - - __u64 _data[0]; - union bch_extent_entry start[]; -} __packed __aligned(8); - -struct bch_reservation { - struct bch_val v; - - __le32 generation; - __u8 nr_replicas; - __u8 pad[3]; -} __packed __aligned(8); - -/* Maximum size (in u64s) a single pointer could be: */ -#define BKEY_EXTENT_PTR_U64s_MAX\ - ((sizeof(struct bch_extent_crc128) + \ - sizeof(struct bch_extent_ptr)) / sizeof(__u64)) - -/* Maximum possible size of an entire extent value: */ -#define BKEY_EXTENT_VAL_U64s_MAX \ - (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) - -/* * Maximum possible size of an entire extent, key + value: */ -#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) - -/* Btree pointers don't carry around checksums: */ -#define BKEY_BTREE_PTR_VAL_U64s_MAX \ - ((sizeof(struct bch_btree_ptr_v2) + \ - sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64)) -#define BKEY_BTREE_PTR_U64s_MAX \ - (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) - -/* Inodes */ - -#define BLOCKDEV_INODE_MAX 4096 - -#define BCACHEFS_ROOT_INO 4096 - -struct bch_inode { - struct bch_val v; - - __le64 bi_hash_seed; - __le32 bi_flags; - __le16 bi_mode; - __u8 fields[]; -} __packed __aligned(8); - -struct bch_inode_v2 { - struct bch_val v; - - __le64 bi_journal_seq; - __le64 bi_hash_seed; - __le64 bi_flags; - __le16 bi_mode; - __u8 fields[]; -} __packed __aligned(8); - -struct bch_inode_v3 { - struct bch_val v; - - __le64 bi_journal_seq; - __le64 bi_hash_seed; - __le64 bi_flags; - __le64 bi_sectors; - __le64 bi_size; - __le64 bi_version; - __u8 fields[]; -} __packed __aligned(8); - -#define INODEv3_FIELDS_START_INITIAL 6 -#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64)) - -struct bch_inode_generation { - struct bch_val v; - - __le32 bi_generation; - __le32 pad; -} __packed __aligned(8); - -/* - * bi_subvol and bi_parent_subvol are only set for subvolume roots: - */ - -#define BCH_INODE_FIELDS_v2() \ - x(bi_atime, 96) \ - x(bi_ctime, 96) \ - x(bi_mtime, 96) \ - x(bi_otime, 96) \ - x(bi_size, 64) \ - x(bi_sectors, 64) \ - x(bi_uid, 32) \ - x(bi_gid, 32) \ - x(bi_nlink, 32) \ - x(bi_generation, 32) \ - x(bi_dev, 32) \ - x(bi_data_checksum, 8) \ - x(bi_compression, 8) \ - x(bi_project, 32) \ - x(bi_background_compression, 8) \ - x(bi_data_replicas, 8) \ - x(bi_promote_target, 16) \ - x(bi_foreground_target, 16) \ - x(bi_background_target, 16) \ - x(bi_erasure_code, 16) \ - x(bi_fields_set, 16) \ - x(bi_dir, 64) \ - x(bi_dir_offset, 64) \ - x(bi_subvol, 32) \ - x(bi_parent_subvol, 32) - -#define BCH_INODE_FIELDS_v3() \ - x(bi_atime, 96) \ - x(bi_ctime, 96) \ - x(bi_mtime, 96) \ - x(bi_otime, 96) \ - x(bi_uid, 32) \ - x(bi_gid, 32) \ - x(bi_nlink, 32) \ - x(bi_generation, 32) \ - x(bi_dev, 32) \ - x(bi_data_checksum, 8) \ - x(bi_compression, 8) \ - x(bi_project, 32) \ - x(bi_background_compression, 8) \ - x(bi_data_replicas, 8) \ - x(bi_promote_target, 16) \ - x(bi_foreground_target, 16) \ - x(bi_background_target, 16) \ - x(bi_erasure_code, 16) \ - x(bi_fields_set, 16) \ - x(bi_dir, 64) \ - x(bi_dir_offset, 64) \ - x(bi_subvol, 32) \ - x(bi_parent_subvol, 32) \ - x(bi_nocow, 8) - -/* subset of BCH_INODE_FIELDS */ -#define BCH_INODE_OPTS() \ - x(data_checksum, 8) \ - x(compression, 8) \ - x(project, 32) \ - x(background_compression, 8) \ - x(data_replicas, 8) \ - x(promote_target, 16) \ - x(foreground_target, 16) \ - x(background_target, 16) \ - x(erasure_code, 16) \ - x(nocow, 8) - -enum inode_opt_id { -#define x(name, ...) \ - Inode_opt_##name, - BCH_INODE_OPTS() -#undef x - Inode_opt_nr, -}; - -#define BCH_INODE_FLAGS() \ - x(sync, 0) \ - x(immutable, 1) \ - x(append, 2) \ - x(nodump, 3) \ - x(noatime, 4) \ - x(i_size_dirty, 5) \ - x(i_sectors_dirty, 6) \ - x(unlinked, 7) \ - x(backptr_untrusted, 8) - -/* bits 20+ reserved for packed fields below: */ - -enum bch_inode_flags { -#define x(t, n) BCH_INODE_##t = 1U << n, - BCH_INODE_FLAGS() -#undef x -}; - -enum __bch_inode_flags { -#define x(t, n) __BCH_INODE_##t = n, - BCH_INODE_FLAGS() -#undef x -}; - -LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); -LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); -LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32); - -LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24); -LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31); - -LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24); -LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31); - -LE64_BITMASK(INODEv3_FIELDS_START, - struct bch_inode_v3, bi_flags, 31, 36); -LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52); - -/* Dirents */ - -/* - * Dirents (and xattrs) have to implement string lookups; since our b-tree - * doesn't support arbitrary length strings for the key, we instead index by a - * 64 bit hash (currently truncated sha1) of the string, stored in the offset - * field of the key - using linear probing to resolve hash collisions. This also - * provides us with the readdir cookie posix requires. - * - * Linear probing requires us to use whiteouts for deletions, in the event of a - * collision: - */ - -struct bch_dirent { - struct bch_val v; - - /* Target inode number: */ - union { - __le64 d_inum; - struct { /* DT_SUBVOL */ - __le32 d_child_subvol; - __le32 d_parent_subvol; - }; - }; - - /* - * Copy of mode bits 12-15 from the target inode - so userspace can get - * the filetype without having to do a stat() - */ - __u8 d_type; - - __u8 d_name[]; -} __packed __aligned(8); - -#define DT_SUBVOL 16 -#define BCH_DT_MAX 17 - -#define BCH_NAME_MAX 512 - -/* Xattrs */ - -#define KEY_TYPE_XATTR_INDEX_USER 0 -#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1 -#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2 -#define KEY_TYPE_XATTR_INDEX_TRUSTED 3 -#define KEY_TYPE_XATTR_INDEX_SECURITY 4 - -struct bch_xattr { - struct bch_val v; - __u8 x_type; - __u8 x_name_len; - __le16 x_val_len; - __u8 x_name[]; -} __packed __aligned(8); - -/* Bucket/allocation information: */ - -struct bch_alloc { - struct bch_val v; - __u8 fields; - __u8 gen; - __u8 data[]; -} __packed __aligned(8); - -#define BCH_ALLOC_FIELDS_V1() \ - x(read_time, 16) \ - x(write_time, 16) \ - x(data_type, 8) \ - x(dirty_sectors, 16) \ - x(cached_sectors, 16) \ - x(oldest_gen, 8) \ - x(stripe, 32) \ - x(stripe_redundancy, 8) - -enum { -#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, - BCH_ALLOC_FIELDS_V1() -#undef x -}; - -struct bch_alloc_v2 { - struct bch_val v; - __u8 nr_fields; - __u8 gen; - __u8 oldest_gen; - __u8 data_type; - __u8 data[]; -} __packed __aligned(8); - -#define BCH_ALLOC_FIELDS_V2() \ - x(read_time, 64) \ - x(write_time, 64) \ - x(dirty_sectors, 32) \ - x(cached_sectors, 32) \ - x(stripe, 32) \ - x(stripe_redundancy, 8) - -struct bch_alloc_v3 { - struct bch_val v; - __le64 journal_seq; - __le32 flags; - __u8 nr_fields; - __u8 gen; - __u8 oldest_gen; - __u8 data_type; - __u8 data[]; -} __packed __aligned(8); - -LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1) -LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) - -struct bch_alloc_v4 { - struct bch_val v; - __u64 journal_seq; - __u32 flags; - __u8 gen; - __u8 oldest_gen; - __u8 data_type; - __u8 stripe_redundancy; - __u32 dirty_sectors; - __u32 cached_sectors; - __u64 io_time[2]; - __u32 stripe; - __u32 nr_external_backpointers; - __u64 fragmentation_lru; -} __packed __aligned(8); - -#define BCH_ALLOC_V4_U64s_V0 6 -#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64)) - -BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1) -BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2) -BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8) -BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14) - -#define BCH_ALLOC_V4_NR_BACKPOINTERS_MAX 40 - struct bch_backpointer { struct bch_val v; __u8 btree_id; @@ -1014,154 +433,6 @@ struct bch_backpointer { struct bpos pos; } __packed __aligned(8); -#define KEY_TYPE_BUCKET_GENS_BITS 8 -#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS) -#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1) - -struct bch_bucket_gens { - struct bch_val v; - u8 gens[KEY_TYPE_BUCKET_GENS_NR]; -} __packed __aligned(8); - -/* Quotas: */ - -enum quota_types { - QTYP_USR = 0, - QTYP_GRP = 1, - QTYP_PRJ = 2, - QTYP_NR = 3, -}; - -enum quota_counters { - Q_SPC = 0, - Q_INO = 1, - Q_COUNTERS = 2, -}; - -struct bch_quota_counter { - __le64 hardlimit; - __le64 softlimit; -}; - -struct bch_quota { - struct bch_val v; - struct bch_quota_counter c[Q_COUNTERS]; -} __packed __aligned(8); - -/* Erasure coding */ - -struct bch_stripe { - struct bch_val v; - __le16 sectors; - __u8 algorithm; - __u8 nr_blocks; - __u8 nr_redundant; - - __u8 csum_granularity_bits; - __u8 csum_type; - __u8 pad; - - struct bch_extent_ptr ptrs[]; -} __packed __aligned(8); - -/* Reflink: */ - -struct bch_reflink_p { - struct bch_val v; - __le64 idx; - /* - * A reflink pointer might point to an indirect extent which is then - * later split (by copygc or rebalance). If we only pointed to part of - * the original indirect extent, and then one of the fragments is - * outside the range we point to, we'd leak a refcount: so when creating - * reflink pointers, we need to store pad values to remember the full - * range we were taking a reference on. - */ - __le32 front_pad; - __le32 back_pad; -} __packed __aligned(8); - -struct bch_reflink_v { - struct bch_val v; - __le64 refcount; - union bch_extent_entry start[0]; - __u64 _data[]; -} __packed __aligned(8); - -struct bch_indirect_inline_data { - struct bch_val v; - __le64 refcount; - u8 data[]; -}; - -/* Inline data */ - -struct bch_inline_data { - struct bch_val v; - u8 data[]; -}; - -/* Subvolumes: */ - -#define SUBVOL_POS_MIN POS(0, 1) -#define SUBVOL_POS_MAX POS(0, S32_MAX) -#define BCACHEFS_ROOT_SUBVOL 1 - -struct bch_subvolume { - struct bch_val v; - __le32 flags; - __le32 snapshot; - __le64 inode; - /* - * Snapshot subvolumes form a tree, separate from the snapshot nodes - * tree - if this subvolume is a snapshot, this is the ID of the - * subvolume it was created from: - */ - __le32 parent; - __le32 pad; - bch_le128 otime; -}; - -LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1) -/* - * We need to know whether a subvolume is a snapshot so we can know whether we - * can delete it (or whether it should just be rm -rf'd) - */ -LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2) -LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3) - -/* Snapshots */ - -struct bch_snapshot { - struct bch_val v; - __le32 flags; - __le32 parent; - __le32 children[2]; - __le32 subvol; - /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */ - __le32 tree; - __le32 depth; - __le32 skip[3]; -}; - -LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) - -/* True if a subvolume points to this snapshot node: */ -LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) - -/* - * Snapshot trees: - * - * The snapshot_trees btree gives us persistent indentifier for each tree of - * bch_snapshot nodes, and allow us to record and easily find the root/master - * subvolume that other snapshots were created from: - */ -struct bch_snapshot_tree { - struct bch_val v; - __le32 master_subvol; - __le32 root_snapshot; -}; - /* LRU btree: */ struct bch_lru { @@ -1171,33 +442,6 @@ struct bch_lru { #define LRU_ID_STRIPES (1U << 16) -/* Logged operations btree: */ - -struct bch_logged_op_truncate { - struct bch_val v; - __le32 subvol; - __le32 pad; - __le64 inum; - __le64 new_i_size; -}; - -enum logged_op_finsert_state { - LOGGED_OP_FINSERT_start, - LOGGED_OP_FINSERT_shift_extents, - LOGGED_OP_FINSERT_finish, -}; - -struct bch_logged_op_finsert { - struct bch_val v; - __u8 state; - __u8 pad[3]; - __le32 subvol; - __le64 inum; - __le64 dst_offset; - __le64 src_offset; - __le64 pos; -}; - /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -1223,6 +467,19 @@ struct bch_sb_field { x(ext, 13) \ x(downgrade, 14) +#include "alloc_background_format.h" +#include "extents_format.h" +#include "reflink_format.h" +#include "ec_format.h" +#include "inode_format.h" +#include "dirent_format.h" +#include "xattr_format.h" +#include "quota_format.h" +#include "logged_ops_format.h" +#include "snapshot_format.h" +#include "subvolume_format.h" +#include "sb-counters_format.h" + enum bch_sb_field_type { #define x(f, nr) BCH_SB_FIELD_##f = nr, BCH_SB_FIELDS() @@ -1296,6 +553,7 @@ struct bch_member { __le64 errors[BCH_MEMBER_ERROR_NR]; __le64 errors_at_reset[BCH_MEMBER_ERROR_NR]; __le64 errors_reset_time; + __le64 seq; }; #define BCH_MEMBER_V1_BYTES 56 @@ -1442,7 +700,7 @@ struct bch_sb_field_replicas_v0 { struct bch_replicas_entry_v0 entries[]; } __packed __aligned(8); -struct bch_replicas_entry { +struct bch_replicas_entry_v1 { __u8 data_type; __u8 nr_devs; __u8 nr_required; @@ -1454,24 +712,7 @@ struct bch_replicas_entry { struct bch_sb_field_replicas { struct bch_sb_field field; - struct bch_replicas_entry entries[]; -} __packed __aligned(8); - -/* BCH_SB_FIELD_quota: */ - -struct bch_sb_quota_counter { - __le32 timelimit; - __le32 warnlimit; -}; - -struct bch_sb_quota_type { - __le64 flags; - struct bch_sb_quota_counter c[Q_COUNTERS]; -}; - -struct bch_sb_field_quota { - struct bch_sb_field field; - struct bch_sb_quota_type q[QTYP_NR]; + struct bch_replicas_entry_v1 entries[]; } __packed __aligned(8); /* BCH_SB_FIELD_disk_groups: */ @@ -1492,99 +733,6 @@ struct bch_sb_field_disk_groups { struct bch_disk_group entries[]; } __packed __aligned(8); -/* BCH_SB_FIELD_counters */ - -#define BCH_PERSISTENT_COUNTERS() \ - x(io_read, 0) \ - x(io_write, 1) \ - x(io_move, 2) \ - x(bucket_invalidate, 3) \ - x(bucket_discard, 4) \ - x(bucket_alloc, 5) \ - x(bucket_alloc_fail, 6) \ - x(btree_cache_scan, 7) \ - x(btree_cache_reap, 8) \ - x(btree_cache_cannibalize, 9) \ - x(btree_cache_cannibalize_lock, 10) \ - x(btree_cache_cannibalize_lock_fail, 11) \ - x(btree_cache_cannibalize_unlock, 12) \ - x(btree_node_write, 13) \ - x(btree_node_read, 14) \ - x(btree_node_compact, 15) \ - x(btree_node_merge, 16) \ - x(btree_node_split, 17) \ - x(btree_node_rewrite, 18) \ - x(btree_node_alloc, 19) \ - x(btree_node_free, 20) \ - x(btree_node_set_root, 21) \ - x(btree_path_relock_fail, 22) \ - x(btree_path_upgrade_fail, 23) \ - x(btree_reserve_get_fail, 24) \ - x(journal_entry_full, 25) \ - x(journal_full, 26) \ - x(journal_reclaim_finish, 27) \ - x(journal_reclaim_start, 28) \ - x(journal_write, 29) \ - x(read_promote, 30) \ - x(read_bounce, 31) \ - x(read_split, 33) \ - x(read_retry, 32) \ - x(read_reuse_race, 34) \ - x(move_extent_read, 35) \ - x(move_extent_write, 36) \ - x(move_extent_finish, 37) \ - x(move_extent_fail, 38) \ - x(move_extent_start_fail, 39) \ - x(copygc, 40) \ - x(copygc_wait, 41) \ - x(gc_gens_end, 42) \ - x(gc_gens_start, 43) \ - x(trans_blocked_journal_reclaim, 44) \ - x(trans_restart_btree_node_reused, 45) \ - x(trans_restart_btree_node_split, 46) \ - x(trans_restart_fault_inject, 47) \ - x(trans_restart_iter_upgrade, 48) \ - x(trans_restart_journal_preres_get, 49) \ - x(trans_restart_journal_reclaim, 50) \ - x(trans_restart_journal_res_get, 51) \ - x(trans_restart_key_cache_key_realloced, 52) \ - x(trans_restart_key_cache_raced, 53) \ - x(trans_restart_mark_replicas, 54) \ - x(trans_restart_mem_realloced, 55) \ - x(trans_restart_memory_allocation_failure, 56) \ - x(trans_restart_relock, 57) \ - x(trans_restart_relock_after_fill, 58) \ - x(trans_restart_relock_key_cache_fill, 59) \ - x(trans_restart_relock_next_node, 60) \ - x(trans_restart_relock_parent_for_fill, 61) \ - x(trans_restart_relock_path, 62) \ - x(trans_restart_relock_path_intent, 63) \ - x(trans_restart_too_many_iters, 64) \ - x(trans_restart_traverse, 65) \ - x(trans_restart_upgrade, 66) \ - x(trans_restart_would_deadlock, 67) \ - x(trans_restart_would_deadlock_write, 68) \ - x(trans_restart_injected, 69) \ - x(trans_restart_key_cache_upgrade, 70) \ - x(trans_traverse_all, 71) \ - x(transaction_commit, 72) \ - x(write_super, 73) \ - x(trans_restart_would_deadlock_recursion_limit, 74) \ - x(trans_restart_write_buffer_flush, 75) \ - x(trans_restart_split_race, 76) - -enum bch_persistent_counters { -#define x(t, n, ...) BCH_COUNTER_##t, - BCH_PERSISTENT_COUNTERS() -#undef x - BCH_COUNTER_NR -}; - -struct bch_sb_field_counters { - struct bch_sb_field field; - __le64 d[]; -}; - /* * On clean shutdown, store btree roots and current journal sequence number in * the superblock: @@ -1662,69 +810,41 @@ struct bch_sb_field_downgrade { #define BCH_VERSION_MINOR(_v) ((__u16) ((_v) & ~(~0U << 10))) #define BCH_VERSION(_major, _minor) (((_major) << 10)|(_minor) << 0) -#define RECOVERY_PASS_ALL_FSCK (1ULL << 63) - /* * field 1: version name * field 2: BCH_VERSION(major, minor) * field 3: recovery passess required on upgrade */ #define BCH_METADATA_VERSIONS() \ - x(bkey_renumber, BCH_VERSION(0, 10), \ - RECOVERY_PASS_ALL_FSCK) \ - x(inode_btree_change, BCH_VERSION(0, 11), \ - RECOVERY_PASS_ALL_FSCK) \ - x(snapshot, BCH_VERSION(0, 12), \ - RECOVERY_PASS_ALL_FSCK) \ - x(inode_backpointers, BCH_VERSION(0, 13), \ - RECOVERY_PASS_ALL_FSCK) \ - x(btree_ptr_sectors_written, BCH_VERSION(0, 14), \ - RECOVERY_PASS_ALL_FSCK) \ - x(snapshot_2, BCH_VERSION(0, 15), \ - BIT_ULL(BCH_RECOVERY_PASS_fs_upgrade_for_subvolumes)| \ - BIT_ULL(BCH_RECOVERY_PASS_initialize_subvolumes)| \ - RECOVERY_PASS_ALL_FSCK) \ - x(reflink_p_fix, BCH_VERSION(0, 16), \ - BIT_ULL(BCH_RECOVERY_PASS_fix_reflink_p)) \ - x(subvol_dirent, BCH_VERSION(0, 17), \ - RECOVERY_PASS_ALL_FSCK) \ - x(inode_v2, BCH_VERSION(0, 18), \ - RECOVERY_PASS_ALL_FSCK) \ - x(freespace, BCH_VERSION(0, 19), \ - RECOVERY_PASS_ALL_FSCK) \ - x(alloc_v4, BCH_VERSION(0, 20), \ - RECOVERY_PASS_ALL_FSCK) \ - x(new_data_types, BCH_VERSION(0, 21), \ - RECOVERY_PASS_ALL_FSCK) \ - x(backpointers, BCH_VERSION(0, 22), \ - RECOVERY_PASS_ALL_FSCK) \ - x(inode_v3, BCH_VERSION(0, 23), \ - RECOVERY_PASS_ALL_FSCK) \ - x(unwritten_extents, BCH_VERSION(0, 24), \ - RECOVERY_PASS_ALL_FSCK) \ - x(bucket_gens, BCH_VERSION(0, 25), \ - BIT_ULL(BCH_RECOVERY_PASS_bucket_gens_init)| \ - RECOVERY_PASS_ALL_FSCK) \ - x(lru_v2, BCH_VERSION(0, 26), \ - RECOVERY_PASS_ALL_FSCK) \ - x(fragmentation_lru, BCH_VERSION(0, 27), \ - RECOVERY_PASS_ALL_FSCK) \ - x(no_bps_in_alloc_keys, BCH_VERSION(0, 28), \ - RECOVERY_PASS_ALL_FSCK) \ - x(snapshot_trees, BCH_VERSION(0, 29), \ - RECOVERY_PASS_ALL_FSCK) \ - x(major_minor, BCH_VERSION(1, 0), \ - 0) \ - x(snapshot_skiplists, BCH_VERSION(1, 1), \ - BIT_ULL(BCH_RECOVERY_PASS_check_snapshots)) \ - x(deleted_inodes, BCH_VERSION(1, 2), \ - BIT_ULL(BCH_RECOVERY_PASS_check_inodes)) \ - x(rebalance_work, BCH_VERSION(1, 3), \ - BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) + x(bkey_renumber, BCH_VERSION(0, 10)) \ + x(inode_btree_change, BCH_VERSION(0, 11)) \ + x(snapshot, BCH_VERSION(0, 12)) \ + x(inode_backpointers, BCH_VERSION(0, 13)) \ + x(btree_ptr_sectors_written, BCH_VERSION(0, 14)) \ + x(snapshot_2, BCH_VERSION(0, 15)) \ + x(reflink_p_fix, BCH_VERSION(0, 16)) \ + x(subvol_dirent, BCH_VERSION(0, 17)) \ + x(inode_v2, BCH_VERSION(0, 18)) \ + x(freespace, BCH_VERSION(0, 19)) \ + x(alloc_v4, BCH_VERSION(0, 20)) \ + x(new_data_types, BCH_VERSION(0, 21)) \ + x(backpointers, BCH_VERSION(0, 22)) \ + x(inode_v3, BCH_VERSION(0, 23)) \ + x(unwritten_extents, BCH_VERSION(0, 24)) \ + x(bucket_gens, BCH_VERSION(0, 25)) \ + x(lru_v2, BCH_VERSION(0, 26)) \ + x(fragmentation_lru, BCH_VERSION(0, 27)) \ + x(no_bps_in_alloc_keys, BCH_VERSION(0, 28)) \ + x(snapshot_trees, BCH_VERSION(0, 29)) \ + x(major_minor, BCH_VERSION(1, 0)) \ + x(snapshot_skiplists, BCH_VERSION(1, 1)) \ + x(deleted_inodes, BCH_VERSION(1, 2)) \ + x(rebalance_work, BCH_VERSION(1, 3)) \ + x(member_seq, BCH_VERSION(1, 4)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, -#define x(t, n, upgrade_passes) bcachefs_metadata_version_##t = n, +#define x(t, n) bcachefs_metadata_version_##t = n, BCH_METADATA_VERSIONS() #undef x bcachefs_metadata_version_max @@ -1786,7 +906,8 @@ struct bch_sb { __le32 time_base_hi; __le32 time_precision; - __le64 flags[8]; + __le64 flags[7]; + __le64 write_time; __le64 features[2]; __le64 compat[2]; @@ -2153,7 +1274,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb) x(clock, 7) \ x(dev_usage, 8) \ x(log, 9) \ - x(overwrite, 10) + x(overwrite, 10) \ + x(write_buffer_keys, 11) enum { #define x(f, nr) BCH_JSET_ENTRY_##f = nr, @@ -2162,6 +1284,19 @@ enum { BCH_JSET_ENTRY_NR }; +static inline bool jset_entry_is_key(struct jset_entry *e) +{ + switch (e->type) { + case BCH_JSET_ENTRY_btree_keys: + case BCH_JSET_ENTRY_btree_root: + case BCH_JSET_ENTRY_overwrite: + case BCH_JSET_ENTRY_write_buffer_keys: + return true; + } + + return false; +} + /* * Journal sequence numbers can be blacklisted: bsets record the max sequence * number of all the journal entries they contain updates for, so that on @@ -2203,7 +1338,7 @@ struct jset_entry_usage { struct jset_entry_data_usage { struct jset_entry entry; __le64 v; - struct bch_replicas_entry r; + struct bch_replicas_entry_v1 r; } __packed; struct jset_entry_clock { @@ -2224,8 +1359,8 @@ struct jset_entry_dev_usage { __le32 dev; __u32 pad; - __le64 buckets_ec; - __le64 _buckets_unavailable; /* No longer used */ + __le64 _buckets_ec; /* No longer used */ + __le64 _buckets_unavailable; /* No longer used */ struct jset_entry_dev_usage_type d[]; }; @@ -2239,7 +1374,7 @@ static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage struct jset_entry_log { struct jset_entry entry; u8 d[]; -} __packed; +} __packed __aligned(8); /* * On disk format for a journal entry: diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h index f05881f7e1..4b8fba754b 100644 --- a/fs/bcachefs/bcachefs_ioctl.h +++ b/fs/bcachefs/bcachefs_ioctl.h @@ -81,6 +81,11 @@ struct bch_ioctl_incremental { #define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc, 16, struct bch_ioctl_subvolume) #define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17, struct bch_ioctl_subvolume) +#define BCH_IOCTL_DEV_USAGE_V2 _IOWR(0xbc, 18, struct bch_ioctl_dev_usage_v2) + +#define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline) +#define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online) + /* ioctl below act on a particular file, not the filesystem as a whole: */ #define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *) @@ -173,12 +178,18 @@ struct bch_ioctl_disk_set_state { __u64 dev; }; +#define BCH_DATA_OPS() \ + x(scrub, 0) \ + x(rereplicate, 1) \ + x(migrate, 2) \ + x(rewrite_old_nodes, 3) \ + x(drop_extra_replicas, 4) + enum bch_data_ops { - BCH_DATA_OP_SCRUB = 0, - BCH_DATA_OP_REREPLICATE = 1, - BCH_DATA_OP_MIGRATE = 2, - BCH_DATA_OP_REWRITE_OLD_NODES = 3, - BCH_DATA_OP_NR = 4, +#define x(t, n) BCH_DATA_OP_##t = n, + BCH_DATA_OPS() +#undef x + BCH_DATA_OP_NR }; /* @@ -237,7 +248,7 @@ struct bch_ioctl_data_event { struct bch_replicas_usage { __u64 sectors; - struct bch_replicas_entry r; + struct bch_replicas_entry_v1 r; } __packed; static inline struct bch_replicas_usage * @@ -268,7 +279,7 @@ struct bch_ioctl_fs_usage { __u32 replica_entries_bytes; __u32 pad; - struct bch_replicas_usage replicas[0]; + struct bch_replicas_usage replicas[]; }; /* @@ -292,7 +303,20 @@ struct bch_ioctl_dev_usage { __u64 buckets; __u64 sectors; __u64 fragmented; - } d[BCH_DATA_NR]; + } d[10]; +}; + +struct bch_ioctl_dev_usage_v2 { + __u64 dev; + __u32 flags; + __u8 state; + __u8 nr_data_types; + __u8 pad[6]; + + __u32 bucket_size; + __u64 nr_buckets; + + struct bch_ioctl_dev_usage_type d[]; }; /* @@ -365,4 +389,24 @@ struct bch_ioctl_subvolume { #define BCH_SUBVOL_SNAPSHOT_CREATE (1U << 0) #define BCH_SUBVOL_SNAPSHOT_RO (1U << 1) +/* + * BCH_IOCTL_FSCK_OFFLINE: run fsck from the 'bcachefs fsck' userspace command, + * but with the kernel's implementation of fsck: + */ +struct bch_ioctl_fsck_offline { + __u64 flags; + __u64 opts; /* string */ + __u64 nr_devs; + __u64 devs[] __counted_by(nr_devs); +}; + +/* + * BCH_IOCTL_FSCK_ONLINE: run fsck from the 'bcachefs fsck' userspace command, + * but with the kernel's implementation of fsck: + */ +struct bch_ioctl_fsck_online { + __u64 flags; + __u64 opts; /* string */ +}; + #endif /* _BCACHEFS_IOCTL_H */ diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index abdb05507d..76e79a15ba 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -33,7 +33,7 @@ void bch2_bkey_packed_to_binary_text(struct printbuf *out, next_key_bits -= 64; } - bch2_prt_u64_binary(out, v, min(word_bits, nr_key_bits)); + bch2_prt_u64_base2_nbits(out, v, min(word_bits, nr_key_bits)); if (!next_key_bits) break; diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 761f5e33b1..5e52684764 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -63,8 +63,17 @@ static int key_type_cookie_invalid(struct bch_fs *c, struct bkey_s_c k, return 0; } +static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_cookie ck = bkey_s_c_to_cookie(k); + + prt_printf(out, "%llu", le64_to_cpu(ck.v->cookie)); +} + #define bch2_bkey_ops_cookie ((struct bkey_ops) { \ .key_invalid = key_type_cookie_invalid, \ + .val_to_text = key_type_cookie_to_text, \ .min_val_size = 8, \ }) diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 3a370b7087..03efe8ee56 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -28,10 +28,8 @@ struct bkey_ops { void (*swab)(struct bkey_s); bool (*key_normalize)(struct bch_fs *, struct bkey_s); bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c); - int (*trans_trigger)(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_i *, unsigned); - int (*atomic_trigger)(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s_c, unsigned); + int (*trigger)(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s, unsigned); void (*compat)(enum btree_id id, unsigned version, unsigned big_endian, int write, struct bkey_s); @@ -78,84 +76,88 @@ static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct b bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); -static inline int bch2_mark_key(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) -{ - const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type); - - return ops->atomic_trigger - ? ops->atomic_trigger(trans, btree, level, old, new, flags) - : 0; -} - enum btree_update_flags { __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE = __BTREE_ITER_FLAGS_END, __BTREE_UPDATE_NOJOURNAL, - __BTREE_UPDATE_PREJOURNAL, __BTREE_UPDATE_KEY_CACHE_RECLAIM, - __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ - + __BTREE_TRIGGER_NORUN, + __BTREE_TRIGGER_TRANSACTIONAL, + __BTREE_TRIGGER_ATOMIC, + __BTREE_TRIGGER_GC, __BTREE_TRIGGER_INSERT, __BTREE_TRIGGER_OVERWRITE, - - __BTREE_TRIGGER_GC, __BTREE_TRIGGER_BUCKET_INVALIDATE, - __BTREE_TRIGGER_NOATOMIC, }; #define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) #define BTREE_UPDATE_NOJOURNAL (1U << __BTREE_UPDATE_NOJOURNAL) -#define BTREE_UPDATE_PREJOURNAL (1U << __BTREE_UPDATE_PREJOURNAL) #define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM) +/* Don't run triggers at all */ #define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) +/* + * If set, we're running transactional triggers as part of a transaction commit: + * triggers may generate new updates + * + * If cleared, and either BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE are set, + * we're running atomic triggers during a transaction commit: we have our + * journal reservation, we're holding btree node write locks, and we know the + * transaction is going to commit (returning an error here is a fatal error, + * causing us to go emergency read-only) + */ +#define BTREE_TRIGGER_TRANSACTIONAL (1U << __BTREE_TRIGGER_TRANSACTIONAL) +#define BTREE_TRIGGER_ATOMIC (1U << __BTREE_TRIGGER_ATOMIC) + +/* We're in gc/fsck: running triggers to recalculate e.g. disk usage */ +#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) + +/* @new is entering the btree */ #define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) + +/* @old is leaving the btree */ #define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) -#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) +/* signal from bucket invalidate path to alloc trigger */ #define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) -#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) -static inline int bch2_trans_mark_key(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_i *new, - unsigned flags) +static inline int bch2_key_trigger(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_s_c old, struct bkey_s new, + unsigned flags) { - const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new->k.type); + const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type); - return ops->trans_trigger - ? ops->trans_trigger(trans, btree_id, level, old, new, flags) + return ops->trigger + ? ops->trigger(trans, btree, level, old, new, flags) : 0; } -static inline int bch2_trans_mark_old(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, unsigned flags) +static inline int bch2_key_trigger_old(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, unsigned flags) { struct bkey_i deleted; bkey_init(&deleted.k); deleted.k.p = old.k->p; - return bch2_trans_mark_key(trans, btree_id, level, old, &deleted, - BTREE_TRIGGER_OVERWRITE|flags); + return bch2_key_trigger(trans, btree_id, level, old, bkey_i_to_s(&deleted), + BTREE_TRIGGER_OVERWRITE|flags); } -static inline int bch2_trans_mark_new(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_i *new, unsigned flags) +static inline int bch2_key_trigger_new(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s new, unsigned flags) { struct bkey_i deleted; bkey_init(&deleted.k); - deleted.k.p = new->k.p; + deleted.k.p = new.k->p; - return bch2_trans_mark_key(trans, btree_id, level, bkey_i_to_s_c(&deleted), new, - BTREE_TRIGGER_INSERT|flags); + return bch2_key_trigger(trans, btree_id, level, bkey_i_to_s_c(&deleted), new, + BTREE_TRIGGER_INSERT|flags); } void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index bb73ba9017..3fd1085b6c 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -68,6 +68,12 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b, _k = _n) { _n = bkey_p_next(_k); + if (!_k->u64s) { + printk(KERN_ERR "block %u key %5zu - u64s 0? aieee!\n", set, + _k->_data - i->_data); + break; + } + k = bkey_disassemble(b, _k, &uk); printbuf_reset(&buf); @@ -714,7 +720,7 @@ static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) { struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t); struct bkey_i min_key, max_key; - unsigned j, cacheline = 1; + unsigned cacheline = 1; t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)), bset_ro_tree_capacity(b, t)); @@ -817,13 +823,12 @@ void bch2_bset_init_first(struct btree *b, struct bset *i) set_btree_bset(b, t, i); } -void bch2_bset_init_next(struct bch_fs *c, struct btree *b, - struct btree_node_entry *bne) +void bch2_bset_init_next(struct btree *b, struct btree_node_entry *bne) { struct bset *i = &bne->keys; struct bset_tree *t; - BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c)); + BUG_ON(bset_byte_offset(b, bne) >= btree_buf_bytes(b)); BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b))); BUG_ON(b->nsets >= MAX_BSETS); diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 632c2b8c54..79c77baaa3 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -264,8 +264,7 @@ static inline struct bset *bset_next_set(struct btree *b, void bch2_btree_keys_init(struct btree *); void bch2_bset_init_first(struct btree *, struct bset *); -void bch2_bset_init_next(struct bch_fs *, struct btree *, - struct btree_node_entry *); +void bch2_bset_init_next(struct btree *, struct btree_node_entry *); void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool); void bch2_bset_insert(struct btree *, struct btree_node_iter *, diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 79495cd7a7..d7c81beac1 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -60,7 +60,7 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b) clear_btree_node_just_written(b); - kvpfree(b->data, btree_bytes(c)); + kvpfree(b->data, btree_buf_bytes(b)); b->data = NULL; #ifdef __KERNEL__ kvfree(b->aux_data); @@ -94,7 +94,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) { BUG_ON(b->data || b->aux_data); - b->data = kvpmalloc(btree_bytes(c), gfp); + b->data = kvpmalloc(btree_buf_bytes(b), gfp); if (!b->data) return -BCH_ERR_ENOMEM_btree_node_mem_alloc; #ifdef __KERNEL__ @@ -107,7 +107,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) b->aux_data = NULL; #endif if (!b->aux_data) { - kvpfree(b->data, btree_bytes(c)); + kvpfree(b->data, btree_buf_bytes(b)); b->data = NULL; return -BCH_ERR_ENOMEM_btree_node_mem_alloc; } @@ -126,7 +126,7 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) bkey_btree_ptr_init(&b->key); INIT_LIST_HEAD(&b->list); INIT_LIST_HEAD(&b->write_blocked); - b->byte_order = ilog2(btree_bytes(c)); + b->byte_order = ilog2(c->opts.btree_node_size); return b; } @@ -408,7 +408,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) if (c->verify_data) list_move(&c->verify_data->list, &bc->live); - kvpfree(c->verify_ondisk, btree_bytes(c)); + kvpfree(c->verify_ondisk, c->opts.btree_node_size); for (i = 0; i < btree_id_nr_alive(c); i++) { struct btree_root *r = bch2_btree_id_root(c, i); @@ -500,19 +500,21 @@ void bch2_fs_btree_cache_init_early(struct btree_cache *bc) * cannibalize_bucket() will take. This means every time we unlock the root of * the btree, we need to release this lock if we have it held. */ -void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c) +void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans) { + struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; if (bc->alloc_lock == current) { - trace_and_count(c, btree_cache_cannibalize_unlock, c); + trace_and_count(c, btree_cache_cannibalize_unlock, trans); bc->alloc_lock = NULL; closure_wake_up(&bc->alloc_wait); } } -int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) +int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl) { + struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct task_struct *old; @@ -521,7 +523,7 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) goto success; if (!cl) { - trace_and_count(c, btree_cache_cannibalize_lock_fail, c); + trace_and_count(c, btree_cache_cannibalize_lock_fail, trans); return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock; } @@ -535,11 +537,11 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) goto success; } - trace_and_count(c, btree_cache_cannibalize_lock_fail, c); + trace_and_count(c, btree_cache_cannibalize_lock_fail, trans); return -BCH_ERR_btree_cache_cannibalize_lock_blocked; success: - trace_and_count(c, btree_cache_cannibalize_lock, c); + trace_and_count(c, btree_cache_cannibalize_lock, trans); return 0; } @@ -673,7 +675,7 @@ err: mutex_unlock(&bc->lock); - trace_and_count(c, btree_cache_cannibalize, c); + trace_and_count(c, btree_cache_cannibalize, trans); goto out; } @@ -717,12 +719,6 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, if (IS_ERR(b)) return b; - /* - * Btree nodes read in from disk should not have the accessed bit set - * initially, so that linear scans don't thrash the cache: - */ - clear_btree_node_accessed(b); - bkey_copy(&b->key, k); if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) { /* raced with another fill: */ @@ -749,7 +745,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, if (path && sync) bch2_trans_unlock_noassert(trans); - bch2_btree_node_read(c, b, sync); + bch2_btree_node_read(trans, b, sync); if (!sync) return NULL; @@ -1039,7 +1035,7 @@ retry: goto retry; if (IS_ERR(b) && - !bch2_btree_cache_cannibalize_lock(c, NULL)) + !bch2_btree_cache_cannibalize_lock(trans, NULL)) goto retry; if (IS_ERR(b)) @@ -1087,7 +1083,7 @@ lock_node: EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); btree_check_header(c, b); out: - bch2_btree_cache_cannibalize_unlock(c); + bch2_btree_cache_cannibalize_unlock(trans); return b; } @@ -1196,7 +1192,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struc " failed unpacked %zu\n", b->unpack_fn_len, b->nr.live_u64s * sizeof(u64), - btree_bytes(c) - sizeof(struct btree_node), + btree_buf_bytes(b) - sizeof(struct btree_node), b->nr.live_u64s * 100 / btree_max_u64s(c), b->sib_u64s[0], b->sib_u64s[1], diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index cfb80b201d..6d33885fdb 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -17,8 +17,8 @@ int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *); int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, unsigned, enum btree_id); -void bch2_btree_cache_cannibalize_unlock(struct bch_fs *); -int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *); +void bch2_btree_cache_cannibalize_unlock(struct btree_trans *); +int bch2_btree_cache_cannibalize_lock(struct btree_trans *, struct closure *); struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *); struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool); @@ -74,22 +74,27 @@ static inline bool btree_node_hashed(struct btree *b) _iter = 0; _iter < (_tbl)->size; _iter++) \ rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash) -static inline size_t btree_bytes(struct bch_fs *c) +static inline size_t btree_buf_bytes(const struct btree *b) { - return c->opts.btree_node_size; + return 1UL << b->byte_order; } -static inline size_t btree_max_u64s(struct bch_fs *c) +static inline size_t btree_buf_max_u64s(const struct btree *b) { - return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64); + return (btree_buf_bytes(b) - sizeof(struct btree_node)) / sizeof(u64); } -static inline size_t btree_pages(struct bch_fs *c) +static inline size_t btree_max_u64s(const struct bch_fs *c) { - return btree_bytes(c) / PAGE_SIZE; + return (c->opts.btree_node_size - sizeof(struct btree_node)) / sizeof(u64); } -static inline unsigned btree_blocks(struct bch_fs *c) +static inline size_t btree_sectors(const struct bch_fs *c) +{ + return c->opts.btree_node_size >> SECTOR_SHIFT; +} + +static inline unsigned btree_blocks(const struct bch_fs *c) { return btree_sectors(c) >> c->block_bits; } diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 30ab78a245..1102995643 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -41,6 +41,14 @@ #define DROP_THIS_NODE 10 #define DROP_PREV_NODE 11 +static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k) +{ + return (struct bkey_s) {{{ + (struct bkey *) k.k, + (struct bch_val *) k.v + }}}; +} + static bool should_restart_for_topology_repair(struct bch_fs *c) { return c->opts.fix_errors != FSCK_FIX_no && @@ -108,7 +116,7 @@ static int bch2_gc_check_topology(struct bch_fs *c, ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); goto err; } else { - set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); + set_bit(BCH_FS_initial_gc_unfixed, &c->flags); } } } @@ -134,7 +142,7 @@ static int bch2_gc_check_topology(struct bch_fs *c, ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); goto err; } else { - set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); + set_bit(BCH_FS_initial_gc_unfixed, &c->flags); } } @@ -414,10 +422,9 @@ again: continue; } - if (ret) { - bch_err_msg(c, ret, "getting btree node"); + bch_err_msg(c, ret, "getting btree node"); + if (ret) break; - } ret = btree_repair_node_boundaries(c, b, prev, cur); @@ -482,10 +489,9 @@ again: false); ret = PTR_ERR_OR_ZERO(cur); - if (ret) { - bch_err_msg(c, ret, "getting btree node"); + bch_err_msg(c, ret, "getting btree node"); + if (ret) goto err; - } ret = bch2_btree_repair_topology_recurse(trans, cur); six_unlock_read(&cur->c.lock); @@ -591,7 +597,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[ptr_data_type(k->k, &p.ptr)], + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), p.ptr.gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) { @@ -609,7 +615,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[ptr_data_type(k->k, &p.ptr)], + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), p.ptr.gen, g->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) { @@ -619,7 +625,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id g->data_type = 0; g->dirty_sectors = 0; g->cached_sectors = 0; - set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + set_bit(BCH_FS_need_another_gc, &c->flags); } else { do_update = true; } @@ -631,7 +637,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, - bch2_data_types[ptr_data_type(k->k, &p.ptr)], + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), p.ptr.gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) @@ -643,7 +649,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[ptr_data_type(k->k, &p.ptr)], + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), p.ptr.gen, g->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) @@ -658,13 +664,13 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id "bucket %u:%zu different types of data in same bucket: %s, %s\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[g->data_type], - bch2_data_types[data_type], + bch2_data_type_str(g->data_type), + bch2_data_type_str(data_type), (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { if (data_type == BCH_DATA_btree) { g->data_type = data_type; - set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + set_bit(BCH_FS_need_another_gc, &c->flags); } else { do_update = true; } @@ -707,8 +713,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id new = kmalloc(bkey_bytes(k->k), GFP_KERNEL); if (!new) { - bch_err_msg(c, ret, "allocating new key"); ret = -BCH_ERR_ENOMEM_gc_repair_key; + bch_err_msg(c, ret, "allocating new key"); goto err; } @@ -807,9 +813,6 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, struct bch_fs *c = trans->c; struct bkey deleted = KEY(0, 0, 0); struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; - unsigned flags = - BTREE_TRIGGER_GC| - (initial ? BTREE_TRIGGER_NOATOMIC : 0); int ret = 0; deleted.p = k->k->p; @@ -831,11 +834,10 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, } ret = commit_do(trans, NULL, NULL, 0, - bch2_mark_key(trans, btree_id, level, old, *k, flags)); + bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(*k), BTREE_TRIGGER_GC)); fsck_err: err: - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -996,7 +998,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b /* Continue marking when opted to not * fix the error: */ ret = 0; - set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); + set_bit(BCH_FS_initial_gc_unfixed, &c->flags); continue; } } else if (ret) { @@ -1068,8 +1070,7 @@ static int bch2_gc_btree_init(struct btree_trans *trans, fsck_err: six_unlock_read(&b->c.lock); - if (ret < 0) - bch_err_fn(c, ret); + bch_err_fn(c, ret); printbuf_exit(&buf); return ret; } @@ -1105,10 +1106,8 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) : bch2_gc_btree(trans, i, initial, metadata_only); } - if (ret < 0) - bch_err_fn(c, ret); - bch2_trans_put(trans); + bch_err_fn(c, ret); return ret; } @@ -1159,13 +1158,10 @@ static void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, static void bch2_mark_superblocks(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i; - mutex_lock(&c->sb_lock); gc_pos_set(c, gc_phase(GC_PHASE_SB)); - for_each_online_member(ca, c, i) + for_each_online_member(c, ca) bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC); mutex_unlock(&c->sb_lock); } @@ -1190,13 +1186,10 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) static void bch2_gc_free(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i; - genradix_free(&c->reflink_gc_table); genradix_free(&c->gc_stripes); - for_each_member_device(ca, c, i) { + for_each_member_device(c, ca) { kvpfree(rcu_dereference_protected(ca->buckets_gc, 1), sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket)); @@ -1218,7 +1211,7 @@ static int bch2_gc_done(struct bch_fs *c, bool verify = !metadata_only && !c->opts.reconstruct_alloc && (!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info))); - unsigned i, dev; + unsigned i; int ret = 0; percpu_down_write(&c->mark_lock); @@ -1230,14 +1223,14 @@ static int bch2_gc_done(struct bch_fs *c, , ##__VA_ARGS__, dst->_f, src->_f))) \ dst->_f = src->_f #define copy_dev_field(_err, _f, _msg, ...) \ - copy_field(_err, _f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__) + copy_field(_err, _f, "dev %u has wrong " _msg, ca->dev_idx, ##__VA_ARGS__) #define copy_fs_field(_err, _f, _msg, ...) \ copy_field(_err, _f, "fs has wrong " _msg, ##__VA_ARGS__) for (i = 0; i < ARRAY_SIZE(c->usage); i++) bch2_fs_usage_acc_to_base(c, i); - for_each_member_device(ca, c, dev) { + __for_each_member_device(c, ca) { struct bch_dev_usage *dst = ca->usage_base; struct bch_dev_usage *src = (void *) bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc, @@ -1245,15 +1238,12 @@ static int bch2_gc_done(struct bch_fs *c, for (i = 0; i < BCH_DATA_NR; i++) { copy_dev_field(dev_usage_buckets_wrong, - d[i].buckets, "%s buckets", bch2_data_types[i]); + d[i].buckets, "%s buckets", bch2_data_type_str(i)); copy_dev_field(dev_usage_sectors_wrong, - d[i].sectors, "%s sectors", bch2_data_types[i]); + d[i].sectors, "%s sectors", bch2_data_type_str(i)); copy_dev_field(dev_usage_fragmented_wrong, - d[i].fragmented, "%s fragmented", bch2_data_types[i]); + d[i].fragmented, "%s fragmented", bch2_data_type_str(i)); } - - copy_dev_field(dev_usage_buckets_ec_wrong, - buckets_ec, "buckets_ec"); } { @@ -1263,19 +1253,19 @@ static int bch2_gc_done(struct bch_fs *c, bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr); copy_fs_field(fs_usage_hidden_wrong, - hidden, "hidden"); + b.hidden, "hidden"); copy_fs_field(fs_usage_btree_wrong, - btree, "btree"); + b.btree, "btree"); if (!metadata_only) { copy_fs_field(fs_usage_data_wrong, - data, "data"); + b.data, "data"); copy_fs_field(fs_usage_cached_wrong, - cached, "cached"); + b.cached, "cached"); copy_fs_field(fs_usage_reserved_wrong, - reserved, "reserved"); + b.reserved, "reserved"); copy_fs_field(fs_usage_nr_inodes_wrong, - nr_inodes,"nr_inodes"); + b.nr_inodes,"nr_inodes"); for (i = 0; i < BCH_REPLICAS_MAX; i++) copy_fs_field(fs_usage_persistent_reserved_wrong, @@ -1284,7 +1274,7 @@ static int bch2_gc_done(struct bch_fs *c, } for (i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry *e = + struct bch_replicas_entry_v1 *e = cpu_replicas_entry(&c->replicas, i); if (metadata_only && @@ -1307,8 +1297,7 @@ static int bch2_gc_done(struct bch_fs *c, fsck_err: if (ca) percpu_ref_put(&ca->ref); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); percpu_up_write(&c->mark_lock); printbuf_exit(&buf); @@ -1317,9 +1306,6 @@ fsck_err: static int bch2_gc_start(struct bch_fs *c) { - struct bch_dev *ca = NULL; - unsigned i; - BUG_ON(c->usage_gc); c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64), @@ -1329,7 +1315,7 @@ static int bch2_gc_start(struct bch_fs *c) return -BCH_ERR_ENOMEM_gc_start; } - for_each_member_device(ca, c, i) { + for_each_member_device(c, ca) { BUG_ON(ca->usage_gc); ca->usage_gc = alloc_percpu(struct bch_dev_usage); @@ -1348,10 +1334,7 @@ static int bch2_gc_start(struct bch_fs *c) static int bch2_gc_reset(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i; - - for_each_member_device(ca, c, i) { + for_each_member_device(c, ca) { free_percpu(ca->usage_gc); ca->usage_gc = NULL; } @@ -1389,9 +1372,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans, enum bch_data_type type; int ret; - if (bkey_ge(iter->pos, POS(ca->dev_idx, ca->mi.nbuckets))) - return 1; - old = bch2_alloc_to_v4(k, &old_convert); new = *old; @@ -1437,8 +1417,8 @@ static int bch2_alloc_write_key(struct btree_trans *trans, ": got %s, should be %s", iter->pos.inode, iter->pos.offset, gc.gen, - bch2_data_types[new.data_type], - bch2_data_types[gc.data_type])) + bch2_data_type_str(new.data_type), + bch2_data_type_str(gc.data_type))) new.data_type = gc.data_type; #define copy_bucket_field(_errtype, _f) \ @@ -1448,7 +1428,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, ": got %u, should be %u", \ iter->pos.inode, iter->pos.offset, \ gc.gen, \ - bch2_data_types[gc.data_type], \ + bch2_data_type_str(gc.data_type), \ new._f, gc._f)) \ new._f = gc._f; \ @@ -1488,52 +1468,36 @@ fsck_err: static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - struct bch_dev *ca; - unsigned i; int ret = 0; - for_each_member_device(ca, c, i) { - ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, - POS(ca->dev_idx, ca->mi.first_bucket), - BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_LAZY_RW, - bch2_alloc_write_key(trans, &iter, k, metadata_only)); - - if (ret < 0) { - bch_err_fn(c, ret); + for_each_member_device(c, ca) { + ret = bch2_trans_run(c, + for_each_btree_key_upto_commit(trans, iter, BTREE_ID_alloc, + POS(ca->dev_idx, ca->mi.first_bucket), + POS(ca->dev_idx, ca->mi.nbuckets - 1), + BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k, + NULL, NULL, BCH_TRANS_COMMIT_lazy_rw, + bch2_alloc_write_key(trans, &iter, k, metadata_only))); + if (ret) { percpu_ref_put(&ca->ref); break; } } - bch2_trans_put(trans); - return ret < 0 ? ret : 0; + bch_err_fn(c, ret); + return ret; } static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) { - struct bch_dev *ca; - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - struct bucket *g; - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a; - unsigned i; - int ret; - - for_each_member_device(ca, c, i) { + for_each_member_device(c, ca) { struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket), GFP_KERNEL|__GFP_ZERO); if (!buckets) { percpu_ref_put(&ca->ref); bch_err(c, "error allocating ca->buckets[gc]"); - ret = -BCH_ERR_ENOMEM_gc_alloc_start; - goto err; + return -BCH_ERR_ENOMEM_gc_alloc_start; } buckets->first_bucket = ca->mi.first_bucket; @@ -1541,42 +1505,38 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) rcu_assign_pointer(ca->buckets_gc, buckets); } - ret = for_each_btree_key2(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ({ - ca = bch_dev_bkey_exists(c, k.k->p.inode); - g = gc_bucket(ca, k.k->p.offset); - - a = bch2_alloc_to_v4(k, &a_convert); - - g->gen_valid = 1; - g->gen = a->gen; - - if (metadata_only && - (a->data_type == BCH_DATA_user || - a->data_type == BCH_DATA_cached || - a->data_type == BCH_DATA_parity)) { - g->data_type = a->data_type; - g->dirty_sectors = a->dirty_sectors; - g->cached_sectors = a->cached_sectors; - g->stripe = a->stripe; - g->stripe_redundancy = a->stripe_redundancy; - } + int ret = bch2_trans_run(c, + for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ({ + struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode); + struct bucket *g = gc_bucket(ca, k.k->p.offset); - 0; - })); -err: - bch2_trans_put(trans); - if (ret) - bch_err_fn(c, ret); + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); + + g->gen_valid = 1; + g->gen = a->gen; + + if (metadata_only && + (a->data_type == BCH_DATA_user || + a->data_type == BCH_DATA_cached || + a->data_type == BCH_DATA_parity)) { + g->data_type = a->data_type; + g->dirty_sectors = a->dirty_sectors; + g->cached_sectors = a->cached_sectors; + g->stripe = a->stripe; + g->stripe_redundancy = a->stripe_redundancy; + } + + 0; + }))); + bch_err_fn(c, ret); return ret; } static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only) { - struct bch_dev *ca; - unsigned i; - - for_each_member_device(ca, c, i) { + for_each_member_device(c, ca) { struct bucket_array *buckets = gc_bucket_array(ca); struct bucket *g; @@ -1634,7 +1594,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans, if (!r->refcount) new->k.type = KEY_TYPE_deleted; else - *bkey_refcount(new) = cpu_to_le64(r->refcount); + *bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount); } fsck_err: printbuf_exit(&buf); @@ -1643,64 +1603,52 @@ fsck_err: static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only) { - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; size_t idx = 0; - int ret = 0; if (metadata_only) return 0; - trans = bch2_trans_get(c); - - ret = for_each_btree_key_commit(trans, iter, - BTREE_ID_reflink, POS_MIN, - BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_NOFAIL, - bch2_gc_write_reflink_key(trans, &iter, k, &idx)); - + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, + BTREE_ID_reflink, POS_MIN, + BTREE_ITER_PREFETCH, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + bch2_gc_write_reflink_key(trans, &iter, k, &idx))); c->reflink_gc_nr = 0; - bch2_trans_put(trans); return ret; } static int bch2_gc_reflink_start(struct bch_fs *c, bool metadata_only) { - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; - struct reflink_gc *r; - int ret = 0; if (metadata_only) return 0; - trans = bch2_trans_get(c); c->reflink_gc_nr = 0; - for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { - const __le64 *refcount = bkey_refcount_c(k); + int ret = bch2_trans_run(c, + for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN, + BTREE_ITER_PREFETCH, k, ({ + const __le64 *refcount = bkey_refcount_c(k); - if (!refcount) - continue; + if (!refcount) + continue; - r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++, - GFP_KERNEL); - if (!r) { - ret = -BCH_ERR_ENOMEM_gc_reflink_start; - break; - } + struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table, + c->reflink_gc_nr++, GFP_KERNEL); + if (!r) { + ret = -BCH_ERR_ENOMEM_gc_reflink_start; + break; + } - r->offset = k.k->p.offset; - r->size = k.k->size; - r->refcount = 0; - } - bch2_trans_iter_exit(trans, &iter); + r->offset = k.k->p.offset; + r->size = k.k->size; + r->refcount = 0; + 0; + }))); - bch2_trans_put(trans); + bch_err_fn(c, ret); return ret; } @@ -1768,24 +1716,15 @@ fsck_err: static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only) { - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - if (metadata_only) return 0; - trans = bch2_trans_get(c); - - ret = for_each_btree_key_commit(trans, iter, - BTREE_ID_stripes, POS_MIN, - BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_NOFAIL, - bch2_gc_write_stripes_key(trans, &iter, k)); - - bch2_trans_put(trans); - return ret; + return bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, + BTREE_ID_stripes, POS_MIN, + BTREE_ITER_PREFETCH, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + bch2_gc_write_stripes_key(trans, &iter, k))); } static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only) @@ -1848,7 +1787,7 @@ again: #endif c->gc_count++; - if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) || + if (test_bit(BCH_FS_need_another_gc, &c->flags) || (!iter && bch2_test_restart_gc)) { if (iter++ > 2) { bch_info(c, "Unable to fix bucket gens, looping"); @@ -1860,7 +1799,7 @@ again: * XXX: make sure gens we fixed got saved */ bch_info(c, "Second GC pass needed, restarting:"); - clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + clear_bit(BCH_FS_need_another_gc, &c->flags); __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); bch2_gc_stripes_reset(c, metadata_only); @@ -1900,9 +1839,7 @@ out: * allocator thread - issue wakeup in case they blocked on gc_lock: */ closure_wake_up(&c->freelist_wait); - - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -1912,7 +1849,6 @@ static int gc_btree_gens_key(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; struct bkey_i *u; int ret; @@ -1970,12 +1906,7 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_i int bch2_gc_gens(struct bch_fs *c) { - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; - struct bch_dev *ca; u64 b, start_time = local_clock(); - unsigned i; int ret; /* @@ -1988,9 +1919,8 @@ int bch2_gc_gens(struct bch_fs *c) trace_and_count(c, gc_gens_start, c); down_read(&c->gc_lock); - trans = bch2_trans_get(c); - for_each_member_device(ca, c, i) { + for_each_member_device(c, ca) { struct bucket_gens *gens = bucket_gens(ca); BUG_ON(ca->oldest_gen); @@ -2007,33 +1937,31 @@ int bch2_gc_gens(struct bch_fs *c) ca->oldest_gen[b] = gens->b[b]; } - for (i = 0; i < BTREE_ID_NR; i++) + for (unsigned i = 0; i < BTREE_ID_NR; i++) if (btree_type_has_ptrs(i)) { c->gc_gens_btree = i; c->gc_gens_pos = POS_MIN; - ret = for_each_btree_key_commit(trans, iter, i, - POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, - k, - NULL, NULL, - BTREE_INSERT_NOFAIL, - gc_btree_gens_key(trans, &iter, k)); - if (ret && !bch2_err_matches(ret, EROFS)) - bch_err_fn(c, ret); + ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, i, + POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, + k, + NULL, NULL, + BCH_TRANS_COMMIT_no_enospc, + gc_btree_gens_key(trans, &iter, k))); if (ret) goto err; } - ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, - POS_MIN, - BTREE_ITER_PREFETCH, - k, - NULL, NULL, - BTREE_INSERT_NOFAIL, - bch2_alloc_write_oldest_gen(trans, &iter, k)); - if (ret && !bch2_err_matches(ret, EROFS)) - bch_err_fn(c, ret); + ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, + POS_MIN, + BTREE_ITER_PREFETCH, + k, + NULL, NULL, + BCH_TRANS_COMMIT_no_enospc, + bch2_alloc_write_oldest_gen(trans, &iter, k))); if (ret) goto err; @@ -2045,14 +1973,15 @@ int bch2_gc_gens(struct bch_fs *c) bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); trace_and_count(c, gc_gens_end, c); err: - for_each_member_device(ca, c, i) { + for_each_member_device(c, ca) { kvfree(ca->oldest_gen); ca->oldest_gen = NULL; } - bch2_trans_put(trans); up_read(&c->gc_lock); mutex_unlock(&c->gc_gens_lock); + if (!bch2_err_matches(ret, EROFS)) + bch_err_fn(c, ret); return ret; } @@ -2062,7 +1991,6 @@ static int bch2_gc_thread(void *arg) struct io_clock *clock = &c->io_clock[WRITE]; unsigned long last = atomic64_read(&clock->now); unsigned last_kick = atomic_read(&c->kick_gc); - int ret; set_freezable(); @@ -2102,11 +2030,8 @@ static int bch2_gc_thread(void *arg) #if 0 ret = bch2_gc(c, false, false); #else - ret = bch2_gc_gens(c); + bch2_gc_gens(c); #endif - if (ret < 0) - bch_err_fn(c, ret); - debug_check_no_locks_held(); } diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 5a720f0cd5..aa9b6cbe32 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -112,7 +112,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size, unsigned flags = memalloc_nofs_save(); void *p; - BUG_ON(size > btree_bytes(c)); + BUG_ON(size > c->opts.btree_node_size); *used_mempool = false; p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT); @@ -174,8 +174,8 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) ptrs = ptrs_end = ((void *) new_whiteouts + bytes); - for (k = unwritten_whiteouts_start(c, b); - k != unwritten_whiteouts_end(c, b); + for (k = unwritten_whiteouts_start(b); + k != unwritten_whiteouts_end(b); k = bkey_p_next(k)) *--ptrs = k; @@ -192,7 +192,7 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) verify_no_dups(b, new_whiteouts, (void *) ((u64 *) new_whiteouts + b->whiteout_u64s)); - memcpy_u64s(unwritten_whiteouts_start(c, b), + memcpy_u64s(unwritten_whiteouts_start(b), new_whiteouts, b->whiteout_u64s); btree_bounce_free(c, bytes, used_mempool, new_whiteouts); @@ -313,7 +313,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, } bytes = sorting_entire_node - ? btree_bytes(c) + ? btree_buf_bytes(b) : __vstruct_bytes(struct btree_node, u64s); out = btree_bounce_alloc(c, bytes, &used_mempool); @@ -338,7 +338,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, if (sorting_entire_node) { u64s = le16_to_cpu(out->keys.u64s); - BUG_ON(bytes != btree_bytes(c)); + BUG_ON(bytes != btree_buf_bytes(b)); /* * Our temporary buffer is the same size as the btree node's @@ -502,7 +502,7 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) bne = want_new_bset(c, b); if (bne) - bch2_bset_init_next(c, b, bne); + bch2_bset_init_next(b, bne); bch2_btree_build_aux_trees(b); @@ -524,7 +524,8 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, prt_printf(out, "at btree "); bch2_btree_pos_to_text(out, c, b); - prt_printf(out, "\n node offset %u", b->written); + prt_printf(out, "\n node offset %u/%u", + b->written, btree_ptr_sectors_written(&b->key)); if (i) prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s)); prt_str(out, ": "); @@ -830,6 +831,23 @@ static int bset_key_invalid(struct bch_fs *c, struct btree *b, (rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0); } +static bool __bkey_valid(struct bch_fs *c, struct btree *b, + struct bset *i, struct bkey_packed *k) +{ + if (bkey_p_next(k) > vstruct_last(i)) + return false; + + if (k->format > KEY_FORMAT_CURRENT) + return false; + + struct printbuf buf = PRINTBUF; + struct bkey tmp; + struct bkey_s u = __bkey_disassemble(b, k, &tmp); + bool ret = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b), READ, &buf); + printbuf_exit(&buf); + return ret; +} + static int validate_bset_keys(struct bch_fs *c, struct btree *b, struct bset *i, int write, bool have_retry, bool *saw_error) @@ -845,6 +863,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, k != vstruct_last(i);) { struct bkey_s u; struct bkey tmp; + unsigned next_good_key; if (btree_err_on(bkey_p_next(k) > vstruct_last(i), -BCH_ERR_btree_node_read_err_fixable, @@ -859,12 +878,8 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, btree_node_bkey_bad_format, - "invalid bkey format %u", k->format)) { - i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); - memmove_u64s_down(k, bkey_p_next(k), - (u64 *) vstruct_end(i) - (u64 *) k); - continue; - } + "invalid bkey format %u", k->format)) + goto drop_this_key; /* XXX: validate k->u64s */ if (!write) @@ -885,11 +900,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, c, NULL, b, i, btree_node_bad_bkey, "invalid bkey: %s", buf.buf); - - i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); - memmove_u64s_down(k, bkey_p_next(k), - (u64 *) vstruct_end(i) - (u64 *) k); - continue; + goto drop_this_key; } if (write) @@ -906,21 +917,45 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, prt_printf(&buf, " > "); bch2_bkey_to_text(&buf, u.k); - bch2_dump_bset(c, b, i, 0); - if (btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, btree_node_bkey_out_of_order, - "%s", buf.buf)) { - i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); - memmove_u64s_down(k, bkey_p_next(k), - (u64 *) vstruct_end(i) - (u64 *) k); - continue; - } + "%s", buf.buf)) + goto drop_this_key; } prev = k; k = bkey_p_next(k); + continue; +drop_this_key: + next_good_key = k->u64s; + + if (!next_good_key || + (BSET_BIG_ENDIAN(i) == CPU_BIG_ENDIAN && + version >= bcachefs_metadata_version_snapshot)) { + /* + * only do scanning if bch2_bkey_compat() has nothing to + * do + */ + + if (!__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) { + for (next_good_key = 1; + next_good_key < (u64 *) vstruct_last(i) - (u64 *) k; + next_good_key++) + if (__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) + goto got_good_key; + + } + + /* + * didn't find a good key, have to truncate the rest of + * the bset + */ + next_good_key = (u64 *) vstruct_last(i) - (u64 *) k; + } +got_good_key: + le16_add_cpu(&i->u64s, -next_good_key); + memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k); } fsck_err: printbuf_exit(&buf); @@ -934,7 +969,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, struct sort_iter *iter; struct btree_node *sorted; struct bkey_packed *k; - struct bch_extent_ptr *ptr; struct bset *i; bool used_mempool, blacklisted; bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && @@ -943,6 +977,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, unsigned ptr_written = btree_ptr_sectors_written(&b->key); struct printbuf buf = PRINTBUF; int ret = 0, retry_read = 0, write = READ; + u64 start_time = local_clock(); b->version_ondisk = U16_MAX; /* We might get called multiple times on read retry: */ @@ -968,12 +1003,20 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, struct bch_btree_ptr_v2 *bp = &bkey_i_to_btree_ptr_v2(&b->key)->v; + bch2_bpos_to_text(&buf, b->data->min_key); + prt_str(&buf, "-"); + bch2_bpos_to_text(&buf, b->data->max_key); + btree_err_on(b->data->keys.seq != bp->seq, -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, btree_node_bad_seq, - "got wrong btree node (seq %llx want %llx)", - b->data->keys.seq, bp->seq); + "got wrong btree node (want %llx got %llx)\n" + "got btree %s level %llu pos %s", + bp->seq, b->data->keys.seq, + bch2_btree_id_str(BTREE_NODE_ID(b->data)), + BTREE_NODE_LEVEL(b->data), + buf.buf); } else { btree_err_on(!b->data->keys.seq, -BCH_ERR_btree_node_read_err_must_retry, @@ -999,8 +1042,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, nonce = btree_nonce(i, b->written << 9); - csum_bad = bch2_crc_cmp(b->data->csum, - csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data)); + struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); + csum_bad = bch2_crc_cmp(b->data->csum, csum); if (csum_bad) bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); @@ -1008,7 +1051,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, bset_bad_csum, - "invalid checksum"); + "%s", + (printbuf_reset(&buf), + bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum), + buf.buf)); ret = bset_encrypt(c, i, b->written << 9); if (bch2_fs_fatal_err_on(ret, c, @@ -1037,8 +1083,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, "unknown checksum type %llu", BSET_CSUM_TYPE(i)); nonce = btree_nonce(i, b->written << 9); - csum_bad = bch2_crc_cmp(bne->csum, - csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne)); + struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); + csum_bad = bch2_crc_cmp(bne->csum, csum); if (csum_bad) bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); @@ -1046,7 +1092,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, bset_bad_csum, - "invalid checksum"); + "%s", + (printbuf_reset(&buf), + bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum), + buf.buf)); ret = bset_encrypt(c, i, b->written << 9); if (bch2_fs_fatal_err_on(ret, c, @@ -1111,7 +1160,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, ptr_written, b->written); } else { for (bne = write_block(b); - bset_byte_offset(b, bne) < btree_bytes(c); + bset_byte_offset(b, bne) < btree_buf_bytes(b); bne = (void *) bne + block_bytes(c)) btree_err_on(bne->keys.seq == b->data->keys.seq && !bch2_journal_seq_is_blacklisted(c, @@ -1123,7 +1172,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, "found bset signature after last bset"); } - sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool); + sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool); sorted->keys.u64s = 0; set_btree_bset(b, b->set, &b->data->keys); @@ -1139,7 +1188,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, BUG_ON(b->nr.live_u64s != u64s); - btree_bounce_free(c, btree_bytes(c), used_mempool, sorted); + btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted); if (updated_range) bch2_btree_node_drop_keys_outside_node(b); @@ -1202,6 +1251,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, out: mempool_free(iter, &c->fill_iter); printbuf_exit(&buf); + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time); return retry_read; fsck_err: if (ret == -BCH_ERR_btree_node_read_err_want_retry || @@ -1234,7 +1284,7 @@ static void btree_node_read_work(struct work_struct *work) rb->have_ioref = bch2_dev_get_ioref(ca, READ); bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = rb->pick.ptr.offset; - bio->bi_iter.bi_size = btree_bytes(c); + bio->bi_iter.bi_size = btree_buf_bytes(b); if (rb->have_ioref) { bio_set_dev(bio, ca->disk_sb.bdev); @@ -1462,7 +1512,7 @@ fsck_err: } if (best >= 0) { - memcpy(b->data, ra->buf[best], btree_bytes(c)); + memcpy(b->data, ra->buf[best], btree_buf_bytes(b)); ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error); } else { ret = -1; @@ -1528,7 +1578,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool for (i = 0; i < ra->nr; i++) { ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); ra->bio[i] = bio_alloc_bioset(NULL, - buf_pages(ra->buf[i], btree_bytes(c)), + buf_pages(ra->buf[i], btree_buf_bytes(b)), REQ_OP_READ|REQ_SYNC|REQ_META, GFP_NOFS, &c->btree_bio); @@ -1548,7 +1598,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool rb->pick = pick; rb->bio.bi_iter.bi_sector = pick.ptr.offset; rb->bio.bi_end_io = btree_node_read_all_replicas_endio; - bch2_bio_map(&rb->bio, ra->buf[i], btree_bytes(c)); + bch2_bio_map(&rb->bio, ra->buf[i], btree_buf_bytes(b)); if (rb->have_ioref) { this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], @@ -1575,16 +1625,17 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool return 0; } -void bch2_btree_node_read(struct bch_fs *c, struct btree *b, +void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, bool sync) { + struct bch_fs *c = trans->c; struct extent_ptr_decoded pick; struct btree_read_bio *rb; struct bch_dev *ca; struct bio *bio; int ret; - trace_and_count(c, btree_node_read, c, b); + trace_and_count(c, btree_node_read, trans, b); if (bch2_verify_all_btree_replicas && !btree_node_read_all_replicas(c, b, sync)) @@ -1614,7 +1665,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, ca = bch_dev_bkey_exists(c, pick.ptr.dev); bio = bio_alloc_bioset(NULL, - buf_pages(b->data, btree_bytes(c)), + buf_pages(b->data, btree_buf_bytes(b)), REQ_OP_READ|REQ_SYNC|REQ_META, GFP_NOFS, &c->btree_bio); @@ -1628,7 +1679,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, INIT_WORK(&rb->work, btree_node_read_work); bio->bi_iter.bi_sector = pick.ptr.offset; bio->bi_end_io = btree_node_read_endio; - bch2_bio_map(bio, b->data, btree_bytes(c)); + bch2_bio_map(bio, b->data, btree_buf_bytes(b)); if (rb->have_ioref) { this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], @@ -1637,7 +1688,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, if (sync) { submit_bio_wait(bio); - + bch2_latency_acct(ca, rb->start_time, READ); btree_node_read_work(&rb->work); } else { submit_bio(bio); @@ -1663,12 +1714,12 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id, closure_init_stack(&cl); do { - ret = bch2_btree_cache_cannibalize_lock(c, &cl); + ret = bch2_btree_cache_cannibalize_lock(trans, &cl); closure_sync(&cl); } while (ret); b = bch2_btree_node_mem_alloc(trans, level != 0); - bch2_btree_cache_cannibalize_unlock(c); + bch2_btree_cache_cannibalize_unlock(trans); BUG_ON(IS_ERR(b)); @@ -1677,7 +1728,7 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id, set_btree_node_read_in_flight(b); - bch2_btree_node_read(c, b, true); + bch2_btree_node_read(trans, b, true); if (btree_node_read_error(b)) { bch2_btree_node_hash_remove(&c->btree_cache, b); @@ -1789,8 +1840,10 @@ static void btree_node_write_work(struct work_struct *work) bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr, bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); - if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) + if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) { + ret = -BCH_ERR_btree_write_all_failed; goto err; + } if (wbio->wbio.first_btree_write) { if (wbio->wbio.failed.nr) { @@ -1800,9 +1853,9 @@ static void btree_node_write_work(struct work_struct *work) ret = bch2_trans_do(c, NULL, NULL, 0, bch2_btree_node_update_key_get_iter(trans, b, &wbio->key, BCH_WATERMARK_reclaim| - BTREE_INSERT_JOURNAL_RECLAIM| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_NOCHECK_RW, + BCH_TRANS_COMMIT_journal_reclaim| + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_no_check_rw, !wbio->wbio.failed.nr)); if (ret) goto err; @@ -1885,7 +1938,6 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, static void btree_write_submit(struct work_struct *work) { struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work); - struct bch_extent_ptr *ptr; BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; bkey_copy(&tmp.k, &wbio->key); @@ -2022,8 +2074,8 @@ do_write: i->u64s = 0; sort_iter_add(&sort_iter.iter, - unwritten_whiteouts_start(c, b), - unwritten_whiteouts_end(c, b)); + unwritten_whiteouts_start(b), + unwritten_whiteouts_end(b)); SET_BSET_SEPARATE_WHITEOUTS(i, false); b->whiteout_u64s = 0; @@ -2199,7 +2251,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) bne = want_new_bset(c, b); if (bne) - bch2_bset_init_next(c, b, bne); + bch2_bset_init_next(b, bne); bch2_btree_build_aux_trees(b); diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index e0d7fa5b1d..e251cb6b96 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -130,7 +130,7 @@ void bch2_btree_init_next(struct btree_trans *, struct btree *); int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *, struct btree *, bool, bool *); -void bch2_btree_node_read(struct bch_fs *, struct btree *, bool); +void bch2_btree_node_read(struct btree_trans *, struct btree *, bool); int bch2_btree_root_read(struct bch_fs *, enum btree_id, const struct bkey_i *, unsigned); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 816ecc3375..3ef338df82 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -13,6 +13,7 @@ #include "error.h" #include "extents.h" #include "journal.h" +#include "journal_io.h" #include "replicas.h" #include "snapshot.h" #include "trace.h" @@ -21,8 +22,8 @@ #include <linux/prefetch.h> static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *); -static inline void btree_path_list_add(struct btree_trans *, struct btree_path *, - struct btree_path *); +static inline void btree_path_list_add(struct btree_trans *, + btree_path_idx_t, btree_path_idx_t); static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter) { @@ -33,7 +34,8 @@ static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter) #endif } -static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *); +static btree_path_idx_t btree_path_alloc(struct btree_trans *, btree_path_idx_t); +static void bch2_trans_srcu_lock(struct btree_trans *); static inline int __btree_path_cmp(const struct btree_path *l, enum btree_id r_btree_id, @@ -239,8 +241,9 @@ static void bch2_btree_path_verify(struct btree_trans *trans, void bch2_trans_verify_paths(struct btree_trans *trans) { struct btree_path *path; + unsigned iter; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, iter) bch2_btree_path_verify(trans, path); } @@ -250,7 +253,7 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) BUG_ON(iter->btree_id >= BTREE_ID_NR); - BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached); + BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != btree_iter_path(trans, iter)->cached); BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && (iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); @@ -260,8 +263,8 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) !btree_type_has_snapshot_field(iter->btree_id)); if (iter->update_path) - bch2_btree_path_verify(trans, iter->update_path); - bch2_btree_path_verify(trans, iter->path); + bch2_btree_path_verify(trans, &trans->paths[iter->update_path]); + bch2_btree_path_verify(trans, btree_iter_path(trans, iter)); } static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) @@ -330,12 +333,12 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, struct bpos pos, bool key_cache) { struct btree_path *path; - unsigned idx; + struct trans_for_each_path_inorder_iter iter; struct printbuf buf = PRINTBUF; btree_trans_sort_paths(trans); - trans_for_each_path_inorder(trans, path, idx) { + trans_for_each_path_inorder(trans, path, iter) { int cmp = cmp_int(path->btree_id, id) ?: cmp_int(path->cached, key_cache); @@ -415,8 +418,9 @@ void bch2_btree_path_fix_key_modified(struct btree_trans *trans, struct bkey_packed *where) { struct btree_path *path; + unsigned i; - trans_for_each_path_with_node(trans, b, path) { + trans_for_each_path_with_node(trans, b, path, i) { __bch2_btree_path_fix_key_modified(path, b, where); bch2_btree_path_verify_level(trans, path, b->c.level); } @@ -523,6 +527,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans, { struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where); struct btree_path *linked; + unsigned i; if (node_iter != &path->l[b->c.level].iter) { __bch2_btree_node_iter_fix(path, b, node_iter, t, @@ -532,7 +537,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans, bch2_btree_node_iter_verify(node_iter, b); } - trans_for_each_path_with_node(trans, b, linked) { + trans_for_each_path_with_node(trans, b, linked, i) { __bch2_btree_node_iter_fix(linked, b, &linked->l[b->c.level].iter, t, where, clobber_u64s, new_u64s); @@ -647,7 +652,6 @@ void bch2_btree_path_level_init(struct btree_trans *trans, static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, struct btree *b) { struct bch_fs *c = trans->c; - struct btree_insert_entry *i; trans_for_each_update(trans, i) if (!i->cached && @@ -655,7 +659,7 @@ static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, str i->btree_id == b->c.btree_id && bpos_cmp(i->k->k.p, b->data->min_key) >= 0 && bpos_cmp(i->k->k.p, b->data->max_key) <= 0) { - i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v; + i->old_v = bch2_btree_path_peek_slot(trans->paths + i->path, &i->old_k).v; if (unlikely(trans->journal_replay_not_finished)) { struct bkey_i *j_k = @@ -674,14 +678,22 @@ static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, str * A btree node is being replaced - update the iterator to point to the new * node: */ -void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) +void bch2_trans_node_add(struct btree_trans *trans, + struct btree_path *path, + struct btree *b) { - struct btree_path *path; + struct btree_path *prev; + + BUG_ON(!btree_path_pos_in_node(path, b)); + + while ((prev = prev_btree_path(trans, path)) && + btree_path_pos_in_node(prev, b)) + path = prev; - trans_for_each_path(trans, path) - if (path->uptodate == BTREE_ITER_UPTODATE && - !path->cached && - btree_path_pos_in_node(path, b)) { + for (; + path && btree_path_pos_in_node(path, b); + path = next_btree_path(trans, path)) + if (path->uptodate == BTREE_ITER_UPTODATE && !path->cached) { enum btree_node_locked_type t = btree_lock_want(path, b->c.level); @@ -704,8 +716,9 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b) { struct btree_path *path; + unsigned i; - trans_for_each_path_with_node(trans, b, path) + trans_for_each_path_with_node(trans, b, path, i) __btree_path_level_init(path, b->c.level); bch2_trans_revalidate_updates_in_node(trans, b); @@ -781,7 +794,7 @@ static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *pat struct btree_node_iter node_iter = l->iter; struct bkey_packed *k; struct bkey_buf tmp; - unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) + unsigned nr = test_bit(BCH_FS_started, &c->flags) ? (path->level > 1 ? 0 : 2) : (path->level > 1 ? 1 : 16); bool was_locked = btree_node_locked(path, path->level); @@ -816,7 +829,7 @@ static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *p struct bch_fs *c = trans->c; struct bkey_s_c k; struct bkey_buf tmp; - unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) + unsigned nr = test_bit(BCH_FS_started, &c->flags) ? (path->level > 1 ? 0 : 2) : (path->level > 1 ? 1 : 16); bool was_locked = btree_node_locked(path, path->level); @@ -884,7 +897,8 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, bch2_bkey_buf_reassemble(out, c, k); - if (flags & BTREE_ITER_PREFETCH) + if ((flags & BTREE_ITER_PREFETCH) && + c->opts.btree_node_prefetch) ret = btree_path_prefetch_j(trans, path, &jiter); bch2_btree_and_journal_iter_exit(&jiter); @@ -916,7 +930,8 @@ static __always_inline int btree_path_down(struct btree_trans *trans, bch2_bkey_buf_unpack(&tmp, c, l->b, bch2_btree_node_iter_peek(&l->iter, l->b)); - if (flags & BTREE_ITER_PREFETCH) { + if ((flags & BTREE_ITER_PREFETCH) && + c->opts.btree_node_prefetch) { ret = btree_path_prefetch(trans, path); if (ret) goto err; @@ -953,7 +968,8 @@ static int bch2_btree_path_traverse_all(struct btree_trans *trans) struct bch_fs *c = trans->c; struct btree_path *path; unsigned long trace_ip = _RET_IP_; - int i, ret = 0; + unsigned i; + int ret = 0; if (trans->in_traverse_all) return -BCH_ERR_transaction_restart_in_traverse_all; @@ -963,7 +979,7 @@ retry_all: trans->restarted = 0; trans->last_restarted_ip = 0; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) path->should_be_locked = false; btree_trans_sort_paths(trans); @@ -977,7 +993,7 @@ retry_all: closure_init_stack(&cl); do { - ret = bch2_btree_cache_cannibalize_lock(c, &cl); + ret = bch2_btree_cache_cannibalize_lock(trans, &cl); closure_sync(&cl); } while (ret); } @@ -985,16 +1001,16 @@ retry_all: /* Now, redo traversals in correct order: */ i = 0; while (i < trans->nr_sorted) { - path = trans->paths + trans->sorted[i]; + btree_path_idx_t idx = trans->sorted[i]; /* * Traversing a path can cause another path to be added at about * the same position: */ - if (path->uptodate) { - __btree_path_get(path, false); - ret = bch2_btree_path_traverse_one(trans, path, 0, _THIS_IP_); - __btree_path_put(path, false); + if (trans->paths[idx].uptodate) { + __btree_path_get(&trans->paths[idx], false); + ret = bch2_btree_path_traverse_one(trans, idx, 0, _THIS_IP_); + __btree_path_put(&trans->paths[idx], false); if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || bch2_err_matches(ret, ENOMEM)) @@ -1013,7 +1029,7 @@ retry_all: * then failed to relock a path - that's fine. */ err: - bch2_btree_cache_cannibalize_unlock(c); + bch2_btree_cache_cannibalize_unlock(trans); trans->in_traverse_all = false; @@ -1099,10 +1115,11 @@ static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, * stashed in the iterator and returned from bch2_trans_exit(). */ int bch2_btree_path_traverse_one(struct btree_trans *trans, - struct btree_path *path, + btree_path_idx_t path_idx, unsigned flags, unsigned long trace_ip) { + struct btree_path *path = &trans->paths[path_idx]; unsigned depth_want = path->level; int ret = -((int) trans->restarted); @@ -1126,6 +1143,8 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans, goto out; } + path = &trans->paths[path_idx]; + if (unlikely(path->level >= BTREE_MAX_DEPTH)) goto out; @@ -1188,39 +1207,38 @@ static inline void btree_path_copy(struct btree_trans *trans, struct btree_path } } -static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src, - bool intent) +static btree_path_idx_t btree_path_clone(struct btree_trans *trans, btree_path_idx_t src, + bool intent) { - struct btree_path *new = btree_path_alloc(trans, src); - - btree_path_copy(trans, new, src); - __btree_path_get(new, intent); + btree_path_idx_t new = btree_path_alloc(trans, src); + btree_path_copy(trans, trans->paths + new, trans->paths + src); + __btree_path_get(trans->paths + new, intent); return new; } __flatten -struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans, - struct btree_path *path, bool intent, - unsigned long ip) +btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *trans, + btree_path_idx_t path, bool intent, unsigned long ip) { - __btree_path_put(path, intent); + __btree_path_put(trans->paths + path, intent); path = btree_path_clone(trans, path, intent); - path->preserve = false; + trans->paths[path].preserve = false; return path; } -struct btree_path * __must_check +btree_path_idx_t __must_check __bch2_btree_path_set_pos(struct btree_trans *trans, - struct btree_path *path, struct bpos new_pos, - bool intent, unsigned long ip, int cmp) + btree_path_idx_t path_idx, struct bpos new_pos, + bool intent, unsigned long ip) { - unsigned level = path->level; + int cmp = bpos_cmp(new_pos, trans->paths[path_idx].pos); bch2_trans_verify_not_in_restart(trans); - EBUG_ON(!path->ref); + EBUG_ON(!trans->paths[path_idx].ref); - path = bch2_btree_path_make_mut(trans, path, intent, ip); + path_idx = bch2_btree_path_make_mut(trans, path_idx, intent, ip); + struct btree_path *path = trans->paths + path_idx; path->pos = new_pos; trans->paths_sorted = false; @@ -1231,7 +1249,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, goto out; } - level = btree_path_up_until_good_node(trans, path, cmp); + unsigned level = btree_path_up_until_good_node(trans, path, cmp); if (btree_path_node(path, level)) { struct btree_path_level *l = &path->l[level]; @@ -1261,7 +1279,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, } out: bch2_btree_path_verify(trans, path); - return path; + return path_idx; } /* Btree path: main interface: */ @@ -1296,19 +1314,16 @@ static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btr return NULL; } -static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path) +static inline void __bch2_path_free(struct btree_trans *trans, btree_path_idx_t path) { - __bch2_btree_path_unlock(trans, path); - btree_path_list_remove(trans, path); - trans->paths_allocated &= ~(1ULL << path->idx); + __bch2_btree_path_unlock(trans, trans->paths + path); + btree_path_list_remove(trans, trans->paths + path); + __clear_bit(path, trans->paths_allocated); } -void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent) +void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool intent) { - struct btree_path *dup; - - EBUG_ON(trans->paths + path->idx != path); - EBUG_ON(!path->ref); + struct btree_path *path = trans->paths + path_idx, *dup; if (!__btree_path_put(path, intent)) return; @@ -1322,7 +1337,7 @@ void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool inte if (path->should_be_locked && !trans->restarted && - (!dup || !bch2_btree_path_relock_norestart(trans, dup, _THIS_IP_))) + (!dup || !bch2_btree_path_relock_norestart(trans, dup))) return; if (dup) { @@ -1330,16 +1345,13 @@ void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool inte dup->should_be_locked |= path->should_be_locked; } - __bch2_path_free(trans, path); + __bch2_path_free(trans, path_idx); } -static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *path, +static void bch2_path_put_nokeep(struct btree_trans *trans, btree_path_idx_t path, bool intent) { - EBUG_ON(trans->paths + path->idx != path); - EBUG_ON(!path->ref); - - if (!__btree_path_put(path, intent)) + if (!__btree_path_put(trans->paths + path, intent)) return; __bch2_path_free(trans, path); @@ -1362,9 +1374,6 @@ void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans) noinline __cold void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) { - struct btree_insert_entry *i; - struct btree_write_buffered_key *wb; - prt_printf(buf, "transaction updates for %s journal seq %llu", trans->fn, trans->journal_res.seq); prt_newline(buf); @@ -1388,16 +1397,10 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) prt_newline(buf); } - trans_for_each_wb_update(trans, wb) { - prt_printf(buf, "update: btree=%s wb=1 %pS", - bch2_btree_id_str(wb->btree), - (void *) i->ip_allocated); - prt_newline(buf); - - prt_printf(buf, " new "); - bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(&wb->k)); - prt_newline(buf); - } + for (struct jset_entry *e = trans->journal_entries; + e != btree_trans_journal_entries_top(trans); + e = vstruct_next(e)) + bch2_journal_entry_to_text(buf, trans->c, e); printbuf_indent_sub(buf, 2); } @@ -1412,11 +1415,12 @@ void bch2_dump_trans_updates(struct btree_trans *trans) printbuf_exit(&buf); } -noinline __cold -void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path) +static void bch2_btree_path_to_text(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx) { + struct btree_path *path = trans->paths + path_idx; + prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ", - path->idx, path->ref, path->intent_ref, + path_idx, path->ref, path->intent_ref, path->preserve ? 'P' : ' ', path->should_be_locked ? 'S' : ' ', bch2_btree_id_str(path->btree_id), @@ -1434,14 +1438,13 @@ static noinline __cold void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans, bool nosort) { - struct btree_path *path; - unsigned idx; + struct trans_for_each_path_inorder_iter iter; if (!nosort) btree_trans_sort_paths(trans); - trans_for_each_path_inorder(trans, path, idx) - bch2_btree_path_to_text(out, path); + trans_for_each_path_idx_inorder(trans, iter) + bch2_btree_path_to_text(out, trans, iter.path_idx); } noinline __cold @@ -1473,17 +1476,14 @@ static void bch2_trans_update_max_paths(struct btree_trans *trans) { struct btree_transaction_stats *s = btree_trans_stats(trans); struct printbuf buf = PRINTBUF; - - if (!s) - return; + size_t nr = bitmap_weight(trans->paths_allocated, trans->nr_paths); bch2_trans_paths_to_text(&buf, trans); if (!buf.allocation_failure) { mutex_lock(&s->lock); - if (s->nr_max_paths < hweight64(trans->paths_allocated)) { - s->nr_max_paths = trans->nr_max_paths = - hweight64(trans->paths_allocated); + if (nr > s->nr_max_paths) { + s->nr_max_paths = nr; swap(s->max_paths_text, buf.buf); } mutex_unlock(&s->lock); @@ -1491,64 +1491,121 @@ static void bch2_trans_update_max_paths(struct btree_trans *trans) printbuf_exit(&buf); - trans->nr_max_paths = hweight64(trans->paths_allocated); + trans->nr_paths_max = nr; +} + +noinline __cold +int __bch2_btree_trans_too_many_iters(struct btree_trans *trans) +{ + if (trace_trans_restart_too_many_iters_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_trans_paths_to_text(&buf, trans); + trace_trans_restart_too_many_iters(trans, _THIS_IP_, buf.buf); + printbuf_exit(&buf); + } + + count_event(trans->c, trans_restart_too_many_iters); + + return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters); } static noinline void btree_path_overflow(struct btree_trans *trans) { bch2_dump_trans_paths_updates(trans); - panic("trans path overflow\n"); + bch_err(trans->c, "trans path overflow"); } -static inline struct btree_path *btree_path_alloc(struct btree_trans *trans, - struct btree_path *pos) +static noinline void btree_paths_realloc(struct btree_trans *trans) { - struct btree_path *path; - unsigned idx; + unsigned nr = trans->nr_paths * 2; + + void *p = kzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) + + sizeof(struct btree_trans_paths) + + nr * sizeof(struct btree_path) + + nr * sizeof(btree_path_idx_t) + 8 + + nr * sizeof(struct btree_insert_entry), GFP_KERNEL|__GFP_NOFAIL); + + unsigned long *paths_allocated = p; + memcpy(paths_allocated, trans->paths_allocated, BITS_TO_LONGS(trans->nr_paths) * sizeof(unsigned long)); + p += BITS_TO_LONGS(nr) * sizeof(unsigned long); + + p += sizeof(struct btree_trans_paths); + struct btree_path *paths = p; + *trans_paths_nr(paths) = nr; + memcpy(paths, trans->paths, trans->nr_paths * sizeof(struct btree_path)); + p += nr * sizeof(struct btree_path); + + btree_path_idx_t *sorted = p; + memcpy(sorted, trans->sorted, trans->nr_sorted * sizeof(btree_path_idx_t)); + p += nr * sizeof(btree_path_idx_t) + 8; + + struct btree_insert_entry *updates = p; + memcpy(updates, trans->updates, trans->nr_paths * sizeof(struct btree_insert_entry)); + + unsigned long *old = trans->paths_allocated; - if (unlikely(trans->paths_allocated == - ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) - btree_path_overflow(trans); + rcu_assign_pointer(trans->paths_allocated, paths_allocated); + rcu_assign_pointer(trans->paths, paths); + rcu_assign_pointer(trans->sorted, sorted); + rcu_assign_pointer(trans->updates, updates); - idx = __ffs64(~trans->paths_allocated); + trans->nr_paths = nr; + + if (old != trans->_paths_allocated) + kfree_rcu_mightsleep(old); +} + +static inline btree_path_idx_t btree_path_alloc(struct btree_trans *trans, + btree_path_idx_t pos) +{ + btree_path_idx_t idx = find_first_zero_bit(trans->paths_allocated, trans->nr_paths); + + if (unlikely(idx == trans->nr_paths)) { + if (trans->nr_paths == BTREE_ITER_MAX) { + btree_path_overflow(trans); + return 0; + } + + btree_paths_realloc(trans); + } /* * Do this before marking the new path as allocated, since it won't be * initialized yet: */ - if (unlikely(idx > trans->nr_max_paths)) + if (unlikely(idx > trans->nr_paths_max)) bch2_trans_update_max_paths(trans); - trans->paths_allocated |= 1ULL << idx; + __set_bit(idx, trans->paths_allocated); - path = &trans->paths[idx]; - path->idx = idx; + struct btree_path *path = &trans->paths[idx]; path->ref = 0; path->intent_ref = 0; path->nodes_locked = 0; - path->alloc_seq++; - btree_path_list_add(trans, pos, path); + btree_path_list_add(trans, pos, idx); trans->paths_sorted = false; - return path; + return idx; } -struct btree_path *bch2_path_get(struct btree_trans *trans, - enum btree_id btree_id, struct bpos pos, - unsigned locks_want, unsigned level, - unsigned flags, unsigned long ip) +btree_path_idx_t bch2_path_get(struct btree_trans *trans, + enum btree_id btree_id, struct bpos pos, + unsigned locks_want, unsigned level, + unsigned flags, unsigned long ip) { - struct btree_path *path, *path_pos = NULL; + struct btree_path *path; bool cached = flags & BTREE_ITER_CACHED; bool intent = flags & BTREE_ITER_INTENT; - int i; + struct trans_for_each_path_inorder_iter iter; + btree_path_idx_t path_pos = 0, path_idx; bch2_trans_verify_not_in_restart(trans); bch2_trans_verify_locks(trans); btree_trans_sort_paths(trans); - trans_for_each_path_inorder(trans, path, i) { + trans_for_each_path_inorder(trans, path, iter) { if (__btree_path_cmp(path, btree_id, cached, @@ -1556,18 +1613,19 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, level) > 0) break; - path_pos = path; + path_pos = iter.path_idx; } if (path_pos && - path_pos->cached == cached && - path_pos->btree_id == btree_id && - path_pos->level == level) { - __btree_path_get(path_pos, intent); - path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip); + trans->paths[path_pos].cached == cached && + trans->paths[path_pos].btree_id == btree_id && + trans->paths[path_pos].level == level) { + __btree_path_get(trans->paths + path_pos, intent); + path_idx = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip); + path = trans->paths + path_idx; } else { - path = btree_path_alloc(trans, path_pos); - path_pos = NULL; + path_idx = btree_path_alloc(trans, path_pos); + path = trans->paths + path_idx; __btree_path_get(path, intent); path->pos = pos; @@ -1578,7 +1636,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, path->level = level; path->locks_want = locks_want; path->nodes_locked = 0; - for (i = 0; i < ARRAY_SIZE(path->l); i++) + for (unsigned i = 0; i < ARRAY_SIZE(path->l); i++) path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_init); #ifdef TRACK_PATH_ALLOCATED path->ip_allocated = ip; @@ -1604,7 +1662,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, if (locks_want > path->locks_want) bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL); - return path; + return path_idx; } struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u) @@ -1659,9 +1717,10 @@ __bch2_btree_iter_traverse(struct btree_iter *iter) int __must_check bch2_btree_iter_traverse(struct btree_iter *iter) { + struct btree_trans *trans = iter->trans; int ret; - iter->path = bch2_btree_path_set_pos(iter->trans, iter->path, + iter->path = bch2_btree_path_set_pos(trans, iter->path, btree_iter_search_key(iter), iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); @@ -1670,7 +1729,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter) if (ret) return ret; - btree_path_set_should_be_locked(iter->path); + btree_path_set_should_be_locked(trans->paths + iter->path); return 0; } @@ -1682,14 +1741,15 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) struct btree *b = NULL; int ret; - EBUG_ON(iter->path->cached); + EBUG_ON(trans->paths[iter->path].cached); bch2_btree_iter_verify(iter); ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (ret) goto err; - b = btree_path_node(iter->path, iter->path->level); + struct btree_path *path = btree_iter_path(trans, iter); + b = btree_path_node(path, path->level); if (!b) goto out; @@ -1701,7 +1761,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); - btree_path_set_should_be_locked(iter->path); + btree_path_set_should_be_locked(btree_iter_path(trans, iter)); out: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); @@ -1726,14 +1786,15 @@ struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter) struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) { struct btree_trans *trans = iter->trans; - struct btree_path *path = iter->path; struct btree *b = NULL; int ret; + EBUG_ON(trans->paths[iter->path].cached); bch2_trans_verify_not_in_restart(trans); - EBUG_ON(iter->path->cached); bch2_btree_iter_verify(iter); + struct btree_path *path = btree_iter_path(trans, iter); + /* already at end? */ if (!btree_path_node(path, path->level)) return NULL; @@ -1763,17 +1824,19 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) * Haven't gotten to the end of the parent node: go back down to * the next child node */ - path = iter->path = - bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos), - iter->flags & BTREE_ITER_INTENT, - btree_iter_ip_allocated(iter)); + iter->path = bch2_btree_path_set_pos(trans, iter->path, + bpos_successor(iter->pos), + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + path = btree_iter_path(trans, iter); btree_path_set_level_down(trans, path, iter->min_depth); - ret = bch2_btree_path_traverse(trans, path, iter->flags); + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (ret) goto err; + path = btree_iter_path(trans, iter); b = path->l[path->level].b; } @@ -1783,8 +1846,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); - btree_path_set_should_be_locked(iter->path); - BUG_ON(iter->path->uptodate); + btree_path_set_should_be_locked(btree_iter_path(trans, iter)); + EBUG_ON(btree_iter_path(trans, iter)->uptodate); out: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); @@ -1799,23 +1862,15 @@ err: inline bool bch2_btree_iter_advance(struct btree_iter *iter) { - if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) { - struct bpos pos = iter->k.p; - bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS - ? bpos_eq(pos, SPOS_MAX) - : bkey_eq(pos, SPOS_MAX)); - - if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) - pos = bkey_successor(iter, pos); - bch2_btree_iter_set_pos(iter, pos); - return ret; - } else { - if (!btree_path_node(iter->path, iter->path->level)) - return true; + struct bpos pos = iter->k.p; + bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS + ? bpos_eq(pos, SPOS_MAX) + : bkey_eq(pos, SPOS_MAX)); - iter->advanced = true; - return false; - } + if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + pos = bkey_successor(iter, pos); + bch2_btree_iter_set_pos(iter, pos); + return ret; } inline bool bch2_btree_iter_rewind(struct btree_iter *iter) @@ -1832,58 +1887,70 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter) } static noinline -struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter) +void bch2_btree_trans_peek_prev_updates(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c *k) { - struct btree_insert_entry *i; - struct bkey_i *ret = NULL; + struct bpos end = path_l(btree_iter_path(trans, iter))->b->data->min_key; - trans_for_each_update(iter->trans, i) { - if (i->btree_id < iter->btree_id) - continue; - if (i->btree_id > iter->btree_id) - break; - if (bpos_lt(i->k->k.p, iter->path->pos)) - continue; - if (i->key_cache_already_flushed) - continue; - if (!ret || bpos_lt(i->k->k.p, ret->k.p)) - ret = i->k; - } + trans_for_each_update(trans, i) + if (!i->key_cache_already_flushed && + i->btree_id == iter->btree_id && + bpos_le(i->k->k.p, iter->pos) && + bpos_ge(i->k->k.p, k->k ? k->k->p : end)) { + iter->k = i->k->k; + *k = bkey_i_to_s_c(i->k); + } +} - return ret; +static noinline +void bch2_btree_trans_peek_updates(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c *k) +{ + struct btree_path *path = btree_iter_path(trans, iter); + struct bpos end = path_l(path)->b->key.k.p; + + trans_for_each_update(trans, i) + if (!i->key_cache_already_flushed && + i->btree_id == iter->btree_id && + bpos_ge(i->k->k.p, path->pos) && + bpos_le(i->k->k.p, k->k ? k->k->p : end)) { + iter->k = i->k->k; + *k = bkey_i_to_s_c(i->k); + } } -static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter) +static noinline +void bch2_btree_trans_peek_slot_updates(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c *k) { - return iter->flags & BTREE_ITER_WITH_UPDATES - ? __bch2_btree_trans_peek_updates(iter) - : NULL; + trans_for_each_update(trans, i) + if (!i->key_cache_already_flushed && + i->btree_id == iter->btree_id && + bpos_eq(i->k->k.p, iter->pos)) { + iter->k = i->k->k; + *k = bkey_i_to_s_c(i->k); + } } static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, struct btree_iter *iter, struct bpos end_pos) { - struct bkey_i *k; - - if (bpos_lt(iter->path->pos, iter->journal_pos)) - iter->journal_idx = 0; + struct btree_path *path = btree_iter_path(trans, iter); - k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id, - iter->path->level, - iter->path->pos, - end_pos, - &iter->journal_idx); - - iter->journal_pos = k ? k->k.p : end_pos; - return k; + return bch2_journal_keys_peek_upto(trans->c, iter->btree_id, + path->level, + path->pos, + end_pos, + &iter->journal_idx); } static noinline struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans, struct btree_iter *iter) { - struct bkey_i *k = bch2_btree_journal_peek(trans, iter, iter->path->pos); + struct btree_path *path = btree_iter_path(trans, iter); + struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos); if (k) { iter->k = k->k; @@ -1898,9 +1965,10 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) { + struct btree_path *path = btree_iter_path(trans, iter); struct bkey_i *next_journal = bch2_btree_journal_peek(trans, iter, - k.k ? k.k->p : path_l(iter->path)->b->key.k.p); + k.k ? k.k->p : path_l(path)->b->key.k.p); if (next_journal) { iter->k = next_journal->k; @@ -1943,13 +2011,13 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos ret = bch2_btree_path_traverse(trans, iter->key_cache_path, iter->flags|BTREE_ITER_CACHED) ?: - bch2_btree_path_relock(trans, iter->path, _THIS_IP_); + bch2_btree_path_relock(trans, btree_iter_path(trans, iter), _THIS_IP_); if (unlikely(ret)) return bkey_s_c_err(ret); - btree_path_set_should_be_locked(iter->key_cache_path); + btree_path_set_should_be_locked(trans->paths + iter->key_cache_path); - k = bch2_btree_path_peek_slot(iter->key_cache_path, &u); + k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u); if (k.k && !bkey_err(k)) { iter->k = u; k.k = &iter->k; @@ -1960,11 +2028,10 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key) { struct btree_trans *trans = iter->trans; - struct bkey_i *next_update; struct bkey_s_c k, k2; int ret; - EBUG_ON(iter->path->cached); + EBUG_ON(btree_iter_path(trans, iter)->cached); bch2_btree_iter_verify(iter); while (1) { @@ -1982,7 +2049,8 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp goto out; } - l = path_l(iter->path); + struct btree_path *path = btree_iter_path(trans, iter); + l = path_l(path); if (unlikely(!l->b)) { /* No btree nodes at requested level: */ @@ -1991,7 +2059,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp goto out; } - btree_path_set_should_be_locked(iter->path); + btree_path_set_should_be_locked(path); k = btree_path_level_peek_all(trans->c, l, &iter->k); @@ -2009,14 +2077,9 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL)) k = btree_trans_peek_journal(trans, iter, k); - next_update = btree_trans_peek_updates(iter); - - if (next_update && - bpos_le(next_update->k.p, - k.k ? k.k->p : l->b->key.k.p)) { - iter->k = next_update->k; - k = bkey_i_to_s_c(next_update); - } + if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) && + trans->nr_updates)) + bch2_btree_trans_peek_updates(trans, iter, &k); if (k.k && bkey_deleted(k.k)) { /* @@ -2066,13 +2129,12 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e struct bpos iter_pos; int ret; - EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS); EBUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && bkey_eq(end, POS_MAX)); if (iter->update_path) { bch2_path_put_nokeep(trans, iter->update_path, iter->flags & BTREE_ITER_INTENT); - iter->update_path = NULL; + iter->update_path = 0; } bch2_btree_iter_verify_entry_exit(iter); @@ -2100,10 +2162,10 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e goto end; if (iter->update_path && - !bkey_eq(iter->update_path->pos, k.k->p)) { + !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) { bch2_path_put_nokeep(trans, iter->update_path, iter->flags & BTREE_ITER_INTENT); - iter->update_path = NULL; + iter->update_path = 0; } if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && @@ -2123,7 +2185,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e * advance, same as on exit for iter->path, but only up * to snapshot */ - __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT); + __btree_path_get(trans->paths + iter->path, iter->flags & BTREE_ITER_INTENT); iter->update_path = iter->path; iter->update_path = bch2_btree_path_set_pos(trans, @@ -2179,14 +2241,14 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); - btree_path_set_should_be_locked(iter->path); + btree_path_set_should_be_locked(btree_iter_path(trans, iter)); out_no_locked: if (iter->update_path) { - ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_); + ret = bch2_btree_path_relock(trans, trans->paths + iter->update_path, _THIS_IP_); if (unlikely(ret)) k = bkey_s_c_err(ret); else - btree_path_set_should_be_locked(iter->update_path); + btree_path_set_should_be_locked(trans->paths + iter->update_path); } if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) @@ -2208,103 +2270,6 @@ end: } /** - * bch2_btree_iter_peek_all_levels() - returns the first key greater than or - * equal to iterator's current position, returning keys from every level of the - * btree. For keys at different levels of the btree that compare equal, the key - * from the lower level (leaf) is returned first. - * @iter: iterator to peek from - * - * Returns: key if found, or an error extractable with bkey_err(). - */ -struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter) -{ - struct btree_trans *trans = iter->trans; - struct bkey_s_c k; - int ret; - - EBUG_ON(iter->path->cached); - bch2_btree_iter_verify(iter); - BUG_ON(iter->path->level < iter->min_depth); - BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); - EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS)); - - while (1) { - iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos, - iter->flags & BTREE_ITER_INTENT, - btree_iter_ip_allocated(iter)); - - ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); - if (unlikely(ret)) { - /* ensure that iter->k is consistent with iter->pos: */ - bch2_btree_iter_set_pos(iter, iter->pos); - k = bkey_s_c_err(ret); - goto out_no_locked; - } - - /* Already at end? */ - if (!btree_path_node(iter->path, iter->path->level)) { - k = bkey_s_c_null; - goto out_no_locked; - } - - k = btree_path_level_peek_all(trans->c, - &iter->path->l[iter->path->level], &iter->k); - - /* Check if we should go up to the parent node: */ - if (!k.k || - (iter->advanced && - bpos_eq(path_l(iter->path)->b->key.k.p, iter->pos))) { - iter->pos = path_l(iter->path)->b->key.k.p; - btree_path_set_level_up(trans, iter->path); - iter->advanced = false; - continue; - } - - /* - * Check if we should go back down to a leaf: - * If we're not in a leaf node, we only return the current key - * if it exactly matches iter->pos - otherwise we first have to - * go back to the leaf: - */ - if (iter->path->level != iter->min_depth && - (iter->advanced || - !k.k || - !bpos_eq(iter->pos, k.k->p))) { - btree_path_set_level_down(trans, iter->path, iter->min_depth); - iter->pos = bpos_successor(iter->pos); - iter->advanced = false; - continue; - } - - /* Check if we should go to the next key: */ - if (iter->path->level == iter->min_depth && - iter->advanced && - k.k && - bpos_eq(iter->pos, k.k->p)) { - iter->pos = bpos_successor(iter->pos); - iter->advanced = false; - continue; - } - - if (iter->advanced && - iter->path->level == iter->min_depth && - !bpos_eq(k.k->p, iter->pos)) - iter->advanced = false; - - BUG_ON(iter->advanced); - BUG_ON(!k.k); - break; - } - - iter->pos = k.k->p; - btree_path_set_should_be_locked(iter->path); -out_no_locked: - bch2_btree_iter_verify(iter); - - return k; -} - -/** * bch2_btree_iter_next() - returns first key greater than iterator's current * position * @iter: iterator to peek from @@ -2330,14 +2295,14 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) { struct btree_trans *trans = iter->trans; struct bpos search_key = iter->pos; - struct btree_path *saved_path = NULL; struct bkey_s_c k; struct bkey saved_k; const struct bch_val *saved_v; + btree_path_idx_t saved_path = 0; int ret; - EBUG_ON(iter->path->cached || iter->path->level); - EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES); + EBUG_ON(btree_iter_path(trans, iter)->cached || + btree_iter_path(trans, iter)->level); if (iter->flags & BTREE_ITER_WITH_JOURNAL) return bkey_s_c_err(-EIO); @@ -2361,14 +2326,18 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) goto out_no_locked; } - k = btree_path_level_peek(trans, iter->path, - &iter->path->l[0], &iter->k); + struct btree_path *path = btree_iter_path(trans, iter); + + k = btree_path_level_peek(trans, path, &path->l[0], &iter->k); if (!k.k || ((iter->flags & BTREE_ITER_IS_EXTENTS) ? bpos_ge(bkey_start_pos(k.k), search_key) : bpos_gt(k.k->p, search_key))) - k = btree_path_level_prev(trans, iter->path, - &iter->path->l[0], &iter->k); + k = btree_path_level_prev(trans, path, &path->l[0], &iter->k); + + if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) && + trans->nr_updates)) + bch2_btree_trans_peek_prev_updates(trans, iter, &k); if (likely(k.k)) { if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) { @@ -2384,13 +2353,13 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) bch2_path_put_nokeep(trans, iter->path, iter->flags & BTREE_ITER_INTENT); iter->path = saved_path; - saved_path = NULL; + saved_path = 0; iter->k = saved_k; k.v = saved_v; goto got_key; } - if (bch2_snapshot_is_ancestor(iter->trans->c, + if (bch2_snapshot_is_ancestor(trans->c, iter->snapshot, k.k->p.snapshot)) { if (saved_path) @@ -2398,6 +2367,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) iter->flags & BTREE_ITER_INTENT); saved_path = btree_path_clone(trans, iter->path, iter->flags & BTREE_ITER_INTENT); + path = btree_iter_path(trans, iter); saved_k = *k.k; saved_v = k.v; } @@ -2414,10 +2384,11 @@ got_key: continue; } + btree_path_set_should_be_locked(path); break; - } else if (likely(!bpos_eq(iter->path->l[0].b->data->min_key, POS_MIN))) { + } else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) { /* Advance to previous leaf node: */ - search_key = bpos_predecessor(iter->path->l[0].b->data->min_key); + search_key = bpos_predecessor(path->l[0].b->data->min_key); } else { /* Start of btree: */ bch2_btree_iter_set_pos(iter, POS_MIN); @@ -2434,8 +2405,6 @@ got_key: if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) iter->pos.snapshot = iter->snapshot; - - btree_path_set_should_be_locked(iter->path); out_no_locked: if (saved_path) bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT); @@ -2470,8 +2439,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); - EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS); - EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE)); + EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE)); /* extents can't span inode numbers: */ if ((iter->flags & BTREE_ITER_IS_EXTENTS) && @@ -2495,13 +2463,13 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) if ((iter->flags & BTREE_ITER_CACHED) || !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) { - struct bkey_i *next_update; + k = bkey_s_c_null; - if ((next_update = btree_trans_peek_updates(iter)) && - bpos_eq(next_update->k.p, iter->pos)) { - iter->k = next_update->k; - k = bkey_i_to_s_c(next_update); - goto out; + if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) && + trans->nr_updates)) { + bch2_btree_trans_peek_slot_updates(trans, iter, &k); + if (k.k) + goto out; } if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) && @@ -2516,7 +2484,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) goto out_no_locked; } - k = bch2_btree_path_peek_slot(iter->path, &iter->k); + k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k); if (unlikely(!k.k)) goto out_no_locked; } else { @@ -2526,7 +2494,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) if (iter->flags & BTREE_ITER_IS_EXTENTS) end.offset = U64_MAX; - EBUG_ON(iter->path->level); + EBUG_ON(btree_iter_path(trans, iter)->level); if (iter->flags & BTREE_ITER_INTENT) { struct btree_iter iter2; @@ -2572,7 +2540,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) } } out: - btree_path_set_should_be_locked(iter->path); + btree_path_set_should_be_locked(btree_iter_path(trans, iter)); out_no_locked: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); @@ -2619,17 +2587,17 @@ static void btree_trans_verify_sorted_refs(struct btree_trans *trans) struct btree_path *path; unsigned i; - BUG_ON(trans->nr_sorted != hweight64(trans->paths_allocated)); + BUG_ON(trans->nr_sorted != bitmap_weight(trans->paths_allocated, trans->nr_paths) - 1); - trans_for_each_path(trans, path) { + trans_for_each_path(trans, path, i) { BUG_ON(path->sorted_idx >= trans->nr_sorted); - BUG_ON(trans->sorted[path->sorted_idx] != path->idx); + BUG_ON(trans->sorted[path->sorted_idx] != i); } for (i = 0; i < trans->nr_sorted; i++) { unsigned idx = trans->sorted[i]; - EBUG_ON(!(trans->paths_allocated & (1ULL << idx))); + BUG_ON(!test_bit(idx, trans->paths_allocated)); BUG_ON(trans->paths[idx].sorted_idx != i); } } @@ -2637,12 +2605,12 @@ static void btree_trans_verify_sorted_refs(struct btree_trans *trans) static void btree_trans_verify_sorted(struct btree_trans *trans) { struct btree_path *path, *prev = NULL; - unsigned i; + struct trans_for_each_path_inorder_iter iter; if (!bch2_debug_check_iterators) return; - trans_for_each_path_inorder(trans, path, i) { + trans_for_each_path_inorder(trans, path, iter) { if (prev && btree_path_cmp(prev, path) > 0) { __bch2_dump_trans_paths_updates(trans, true); panic("trans paths out of order!\n"); @@ -2699,42 +2667,40 @@ out: static inline void btree_path_list_remove(struct btree_trans *trans, struct btree_path *path) { - unsigned i; - EBUG_ON(path->sorted_idx >= trans->nr_sorted); #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS trans->nr_sorted--; memmove_u64s_down_small(trans->sorted + path->sorted_idx, trans->sorted + path->sorted_idx + 1, - DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8)); + DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, + sizeof(u64) / sizeof(btree_path_idx_t))); #else array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx); #endif - for (i = path->sorted_idx; i < trans->nr_sorted; i++) + for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++) trans->paths[trans->sorted[i]].sorted_idx = i; - - path->sorted_idx = U8_MAX; } static inline void btree_path_list_add(struct btree_trans *trans, - struct btree_path *pos, - struct btree_path *path) + btree_path_idx_t pos, + btree_path_idx_t path_idx) { - unsigned i; + struct btree_path *path = trans->paths + path_idx; - path->sorted_idx = pos ? pos->sorted_idx + 1 : trans->nr_sorted; + path->sorted_idx = pos ? trans->paths[pos].sorted_idx + 1 : trans->nr_sorted; #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1, trans->sorted + path->sorted_idx, - DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8)); + DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, + sizeof(u64) / sizeof(btree_path_idx_t))); trans->nr_sorted++; - trans->sorted[path->sorted_idx] = path->idx; + trans->sorted[path->sorted_idx] = path_idx; #else - array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx); + array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path_idx); #endif - for (i = path->sorted_idx; i < trans->nr_sorted; i++) + for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++) trans->paths[trans->sorted[i]].sorted_idx = i; btree_trans_verify_sorted_refs(trans); @@ -2751,9 +2717,10 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) if (iter->key_cache_path) bch2_path_put(trans, iter->key_cache_path, iter->flags & BTREE_ITER_INTENT); - iter->path = NULL; - iter->update_path = NULL; - iter->key_cache_path = NULL; + iter->path = 0; + iter->update_path = 0; + iter->key_cache_path = 0; + iter->trans = NULL; } void bch2_trans_iter_init_outlined(struct btree_trans *trans, @@ -2784,41 +2751,46 @@ void bch2_trans_node_iter_init(struct btree_trans *trans, iter->min_depth = depth; - BUG_ON(iter->path->locks_want < min(locks_want, BTREE_MAX_DEPTH)); - BUG_ON(iter->path->level != depth); - BUG_ON(iter->min_depth != depth); + struct btree_path *path = btree_iter_path(trans, iter); + BUG_ON(path->locks_want < min(locks_want, BTREE_MAX_DEPTH)); + BUG_ON(path->level != depth); + BUG_ON(iter->min_depth != depth); } void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) { + struct btree_trans *trans = src->trans; + *dst = *src; if (src->path) - __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT); + __btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_INTENT); if (src->update_path) - __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT); - dst->key_cache_path = NULL; + __btree_path_get(trans->paths + src->update_path, src->flags & BTREE_ITER_INTENT); + dst->key_cache_path = 0; } void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) { + struct bch_fs *c = trans->c; unsigned new_top = trans->mem_top + size; - size_t old_bytes = trans->mem_bytes; - size_t new_bytes = roundup_pow_of_two(new_top); + unsigned old_bytes = trans->mem_bytes; + unsigned new_bytes = roundup_pow_of_two(new_top); int ret; void *new_mem; void *p; - trans->mem_max = max(trans->mem_max, new_top); - WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); + struct btree_transaction_stats *s = btree_trans_stats(trans); + s->max_mem = max(s->max_mem, new_bytes); + new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN); if (unlikely(!new_mem)) { bch2_trans_unlock(trans); new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL); if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { - new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL); + new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); new_bytes = BTREE_TRANS_MEM_MAX; kfree(trans->mem); } @@ -2838,7 +2810,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) trans->mem_bytes = new_bytes; if (old_bytes) { - trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); + trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced)); } @@ -2860,8 +2832,9 @@ void bch2_trans_srcu_unlock(struct btree_trans *trans) if (trans->srcu_held) { struct bch_fs *c = trans->c; struct btree_path *path; + unsigned i; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (path->cached && !btree_node_locked(path, 0)) path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset); @@ -2871,7 +2844,7 @@ void bch2_trans_srcu_unlock(struct btree_trans *trans) } } -void bch2_trans_srcu_lock(struct btree_trans *trans) +static void bch2_trans_srcu_lock(struct btree_trans *trans) { if (!trans->srcu_held) { trans->srcu_idx = srcu_read_lock(&trans->c->btree_trans_barrier); @@ -2893,14 +2866,16 @@ void bch2_trans_srcu_lock(struct btree_trans *trans) u32 bch2_trans_begin(struct btree_trans *trans) { struct btree_path *path; + unsigned i; u64 now; bch2_trans_reset_updates(trans); trans->restart_count++; trans->mem_top = 0; + trans->journal_entries = NULL; - trans_for_each_path(trans, path) { + trans_for_each_path(trans, path, i) { path->should_be_locked = false; /* @@ -2917,15 +2892,21 @@ u32 bch2_trans_begin(struct btree_trans *trans) * iterators if we do that */ if (!path->ref && !path->preserve) - __bch2_path_free(trans, path); + __bch2_path_free(trans, i); else path->preserve = false; } now = local_clock(); + + if (!IS_ENABLED(CONFIG_BCACHEFS_NO_LATENCY_ACCT) && + time_after64(now, trans->last_begin_time + 10)) + __bch2_time_stats_update(&btree_trans_stats(trans)->duration, + trans->last_begin_time, now); + if (!trans->restarted && (need_resched() || - now - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) { + time_after64(now, trans->last_begin_time + BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS))) { drop_locks_do(trans, (cond_resched(), 0)); now = local_clock(); } @@ -2944,32 +2925,11 @@ u32 bch2_trans_begin(struct btree_trans *trans) return trans->restart_count; } -static struct btree_trans *bch2_trans_alloc(struct bch_fs *c) -{ - struct btree_trans *trans; - - if (IS_ENABLED(__KERNEL__)) { - trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL); - if (trans) - return trans; - } - - trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS); - /* - * paths need to be zeroed, bch2_check_for_deadlock looks at - * paths in other threads - */ - memset(&trans->paths, 0, sizeof(trans->paths)); - return trans; -} - -const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR]; +const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR] = { "(unknown)" }; unsigned bch2_trans_get_fn_idx(const char *fn) { - unsigned i; - - for (i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++) + for (unsigned i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++) if (!bch2_btree_transaction_fns[i] || bch2_btree_transaction_fns[i] == fn) { bch2_btree_transaction_fns[i] = fn; @@ -2977,76 +2937,92 @@ unsigned bch2_trans_get_fn_idx(const char *fn) } pr_warn_once("BCH_TRANSACTIONS_NR not big enough!"); - return i; + return 0; } struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) __acquires(&c->btree_trans_barrier) { struct btree_trans *trans; - struct btree_transaction_stats *s; - trans = bch2_trans_alloc(c); - - memset(trans, 0, sizeof(*trans)); - trans->c = c; - trans->fn = fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns) - ? bch2_btree_transaction_fns[fn_idx] : NULL; - trans->last_begin_time = local_clock(); - trans->fn_idx = fn_idx; - trans->locking_wait.task = current; - trans->journal_replay_not_finished = - unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) && - atomic_inc_not_zero(&c->journal_keys.ref); - closure_init_stack(&trans->ref); - - s = btree_trans_stats(trans); - if (s && s->max_mem) { - unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem); - - trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL); - - if (!unlikely(trans->mem)) { - trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); - trans->mem_bytes = BTREE_TRANS_MEM_MAX; - } else { - trans->mem_bytes = expected_mem_bytes; + if (IS_ENABLED(__KERNEL__)) { + trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL); + if (trans) { + memset(trans, 0, offsetof(struct btree_trans, list)); + goto got_trans; } } - if (s) { - trans->nr_max_paths = s->nr_max_paths; - trans->wb_updates_size = s->wb_updates_size; - } - - trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); - trans->srcu_lock_time = jiffies; - trans->srcu_held = true; + trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS); + memset(trans, 0, sizeof(*trans)); + closure_init_stack(&trans->ref); - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { + seqmutex_lock(&c->btree_trans_lock); + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { struct btree_trans *pos; + pid_t pid = current->pid; + + trans->locking_wait.task = current; - seqmutex_lock(&c->btree_trans_lock); list_for_each_entry(pos, &c->btree_trans_list, list) { + struct task_struct *pos_task = READ_ONCE(pos->locking_wait.task); /* * We'd much prefer to be stricter here and completely * disallow multiple btree_trans in the same thread - * but the data move path calls bch2_write when we * already have a btree_trans initialized. */ - BUG_ON(trans->locking_wait.task->pid == pos->locking_wait.task->pid && + BUG_ON(pos_task && + pid == pos_task->pid && bch2_trans_locked(pos)); - if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) { + if (pos_task && pid < pos_task->pid) { list_add_tail(&trans->list, &pos->list); goto list_add_done; } } - list_add_tail(&trans->list, &c->btree_trans_list); + } + list_add_tail(&trans->list, &c->btree_trans_list); list_add_done: - seqmutex_unlock(&c->btree_trans_lock); + seqmutex_unlock(&c->btree_trans_lock); +got_trans: + trans->c = c; + trans->last_begin_time = local_clock(); + trans->fn_idx = fn_idx; + trans->locking_wait.task = current; + trans->journal_replay_not_finished = + unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) && + atomic_inc_not_zero(&c->journal_keys.ref); + trans->nr_paths = ARRAY_SIZE(trans->_paths); + trans->paths_allocated = trans->_paths_allocated; + trans->sorted = trans->_sorted; + trans->paths = trans->_paths; + trans->updates = trans->_updates; + + *trans_paths_nr(trans->paths) = BTREE_ITER_INITIAL; + + trans->paths_allocated[0] = 1; + + if (fn_idx < BCH_TRANSACTIONS_NR) { + trans->fn = bch2_btree_transaction_fns[fn_idx]; + + struct btree_transaction_stats *s = &c->btree_transaction_stats[fn_idx]; + + if (s->max_mem) { + unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem); + + trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL); + if (likely(trans->mem)) + trans->mem_bytes = expected_mem_bytes; + } + + trans->nr_paths_max = s->nr_max_paths; + trans->journal_entries_size = s->journal_entries_size; } + trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + trans->srcu_lock_time = jiffies; + trans->srcu_held = true; return trans; } @@ -3055,14 +3031,15 @@ static void check_btree_paths_leaked(struct btree_trans *trans) #ifdef CONFIG_BCACHEFS_DEBUG struct bch_fs *c = trans->c; struct btree_path *path; + unsigned i; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (path->ref) goto leaked; return; leaked: bch_err(c, "btree paths leaked from %s!", trans->fn); - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (path->ref) printk(KERN_ERR " btree %s %pS\n", bch2_btree_id_str(path->btree_id), @@ -3075,26 +3052,14 @@ leaked: void bch2_trans_put(struct btree_trans *trans) __releases(&c->btree_trans_barrier) { - struct btree_insert_entry *i; struct bch_fs *c = trans->c; - struct btree_transaction_stats *s = btree_trans_stats(trans); bch2_trans_unlock(trans); - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { - seqmutex_lock(&c->btree_trans_lock); - list_del(&trans->list); - seqmutex_unlock(&c->btree_trans_lock); - } - - closure_sync(&trans->ref); - - if (s) - s->max_mem = max(s->max_mem, trans->mem_max); - trans_for_each_update(trans, i) - __btree_path_put(i->path, true); - trans->nr_updates = 0; + __btree_path_put(trans->paths + i->path, true); + trans->nr_updates = 0; + trans->locking_wait.task = NULL; check_btree_paths_leaked(trans); @@ -3103,8 +3068,6 @@ void bch2_trans_put(struct btree_trans *trans) srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); } - kfree(trans->extra_journal_entries.data); - if (trans->fs_usage_deltas) { if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) == REPLICAS_DELTA_LIST_MAX) @@ -3117,6 +3080,13 @@ void bch2_trans_put(struct btree_trans *trans) if (unlikely(trans->journal_replay_not_finished)) bch2_journal_keys_put(c); + unsigned long *paths_allocated = trans->paths_allocated; + trans->paths_allocated = NULL; + trans->paths = NULL; + + if (paths_allocated != trans->_paths_allocated) + kfree_rcu_mightsleep(paths_allocated); + if (trans->mem_bytes == BTREE_TRANS_MEM_MAX) mempool_free(trans->mem, &c->btree_trans_mem_pool); else @@ -3125,8 +3095,16 @@ void bch2_trans_put(struct btree_trans *trans) /* Userspace doesn't have a real percpu implementation: */ if (IS_ENABLED(__KERNEL__)) trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans); - if (trans) + + if (trans) { + closure_sync(&trans->ref); + + seqmutex_lock(&c->btree_trans_lock); + list_del(&trans->list); + seqmutex_unlock(&c->btree_trans_lock); + mempool_free(trans, &c->btree_trans_pool); + } } static void __maybe_unused @@ -3154,24 +3132,38 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out, void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) { - struct btree_path *path; struct btree_bkey_cached_common *b; static char lock_types[] = { 'r', 'i', 'w' }; + struct task_struct *task = READ_ONCE(trans->locking_wait.task); unsigned l, idx; + /* before rcu_read_lock(): */ + bch2_printbuf_make_room(out, 4096); + if (!out->nr_tabstops) { printbuf_tabstop_push(out, 16); printbuf_tabstop_push(out, 32); } - prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn); + prt_printf(out, "%i %s\n", task ? task->pid : 0, trans->fn); + + /* trans->paths is rcu protected vs. freeing */ + rcu_read_lock(); + out->atomic++; + + struct btree_path *paths = rcu_dereference(trans->paths); + if (!paths) + goto out; + + unsigned long *paths_allocated = trans_paths_allocated(paths); - trans_for_each_path_safe(trans, path, idx) { + trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths), idx, 1) { + struct btree_path *path = paths + idx; if (!path->nodes_locked) continue; prt_printf(out, " path %u %c l=%u %s:", - path->idx, + idx, path->cached ? 'c' : 'b', path->level, bch2_btree_id_str(path->btree_id)); @@ -3199,6 +3191,9 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) bch2_btree_bkey_cached_common_to_text(out, b); prt_newline(out); } +out: + --out->atomic; + rcu_read_unlock(); } void bch2_fs_btree_iter_exit(struct bch_fs *c) @@ -3207,15 +3202,26 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) struct btree_trans *trans; int cpu; + if (c->btree_trans_bufs) + for_each_possible_cpu(cpu) { + struct btree_trans *trans = + per_cpu_ptr(c->btree_trans_bufs, cpu)->trans; + + if (trans) { + closure_sync(&trans->ref); + + seqmutex_lock(&c->btree_trans_lock); + list_del(&trans->list); + seqmutex_unlock(&c->btree_trans_lock); + } + kfree(trans); + } + free_percpu(c->btree_trans_bufs); + trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list); if (trans) panic("%s leaked btree_trans\n", trans->fn); - if (c->btree_trans_bufs) - for_each_possible_cpu(cpu) - kfree(per_cpu_ptr(c->btree_trans_bufs, cpu)->trans); - free_percpu(c->btree_trans_bufs); - for (s = c->btree_transaction_stats; s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); s++) { @@ -3236,6 +3242,7 @@ void bch2_fs_btree_iter_init_early(struct bch_fs *c) for (s = c->btree_transaction_stats; s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); s++) { + bch2_time_stats_init(&s->duration); bch2_time_stats_init(&s->lock_hold_times); mutex_init(&s->lock); } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index eaffced4c1..24772538e4 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -63,60 +63,57 @@ static inline void btree_trans_sort_paths(struct btree_trans *trans) __bch2_btree_trans_sort_paths(trans); } -static inline struct btree_path * -__trans_next_path(struct btree_trans *trans, unsigned idx) +static inline unsigned long *trans_paths_nr(struct btree_path *paths) { - u64 l; - - if (idx == BTREE_ITER_MAX) - return NULL; - - l = trans->paths_allocated >> idx; - if (!l) - return NULL; - - idx += __ffs64(l); - EBUG_ON(idx >= BTREE_ITER_MAX); - EBUG_ON(trans->paths[idx].idx != idx); - return &trans->paths[idx]; + return &container_of(paths, struct btree_trans_paths, paths[0])->nr_paths; } -#define trans_for_each_path_from(_trans, _path, _start) \ - for (_path = __trans_next_path((_trans), _start); \ - (_path); \ - _path = __trans_next_path((_trans), (_path)->idx + 1)) - -#define trans_for_each_path(_trans, _path) \ - trans_for_each_path_from(_trans, _path, 0) - -static inline struct btree_path * -__trans_next_path_safe(struct btree_trans *trans, unsigned *idx) +static inline unsigned long *trans_paths_allocated(struct btree_path *paths) { - u64 l; + unsigned long *v = trans_paths_nr(paths); + return v - BITS_TO_LONGS(*v); +} - if (*idx == BTREE_ITER_MAX) - return NULL; +#define trans_for_each_path_idx_from(_paths_allocated, _nr, _idx, _start)\ + for (_idx = _start; \ + (_idx = find_next_bit(_paths_allocated, _nr, _idx)) < _nr; \ + _idx++) - l = trans->paths_allocated >> *idx; - if (!l) - return NULL; +static inline struct btree_path * +__trans_next_path(struct btree_trans *trans, unsigned *idx) +{ + unsigned long *w = trans->paths_allocated + *idx / BITS_PER_LONG; + /* + * Open coded find_next_bit(), because + * - this is fast path, we can't afford the function call + * - and we know that nr_paths is a multiple of BITS_PER_LONG, + */ + while (*idx < trans->nr_paths) { + unsigned long v = *w >> (*idx & (BITS_PER_LONG - 1)); + if (v) { + *idx += __ffs(v); + return trans->paths + *idx; + } + + *idx += BITS_PER_LONG; + *idx &= ~(BITS_PER_LONG - 1); + w++; + } - *idx += __ffs64(l); - EBUG_ON(*idx >= BTREE_ITER_MAX); - return &trans->paths[*idx]; + return NULL; } /* * This version is intended to be safe for use on a btree_trans that is owned by * another thread, for bch2_btree_trans_to_text(); */ -#define trans_for_each_path_safe_from(_trans, _path, _idx, _start) \ +#define trans_for_each_path_from(_trans, _path, _idx, _start) \ for (_idx = _start; \ - (_path = __trans_next_path_safe((_trans), &_idx)); \ + (_path = __trans_next_path((_trans), &_idx)); \ _idx++) -#define trans_for_each_path_safe(_trans, _path, _idx) \ - trans_for_each_path_safe_from(_trans, _path, _idx, 0) +#define trans_for_each_path(_trans, _path, _idx) \ + trans_for_each_path_from(_trans, _path, _idx, 1) static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path) { @@ -138,10 +135,23 @@ static inline struct btree_path *prev_btree_path(struct btree_trans *trans, stru : NULL; } -#define trans_for_each_path_inorder(_trans, _path, _i) \ - for (_i = 0; \ - ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) < (_trans)->nr_sorted;\ - _i++) +#define trans_for_each_path_idx_inorder(_trans, _iter) \ + for (_iter = (struct trans_for_each_path_inorder_iter) { 0 }; \ + (_iter.path_idx = trans->sorted[_iter.sorted_idx], \ + _iter.sorted_idx < (_trans)->nr_sorted); \ + _iter.sorted_idx++) + +struct trans_for_each_path_inorder_iter { + btree_path_idx_t sorted_idx; + btree_path_idx_t path_idx; +}; + +#define trans_for_each_path_inorder(_trans, _path, _iter) \ + for (_iter = (struct trans_for_each_path_inorder_iter) { 0 }; \ + (_iter.path_idx = trans->sorted[_iter.sorted_idx], \ + _path = (_trans)->paths + _iter.path_idx, \ + _iter.sorted_idx < (_trans)->nr_sorted); \ + _iter.sorted_idx++) #define trans_for_each_path_inorder_reverse(_trans, _path, _i) \ for (_i = trans->nr_sorted - 1; \ @@ -157,67 +167,65 @@ static inline bool __path_has_node(const struct btree_path *path, static inline struct btree_path * __trans_next_path_with_node(struct btree_trans *trans, struct btree *b, - unsigned idx) + unsigned *idx) { - struct btree_path *path = __trans_next_path(trans, idx); + struct btree_path *path; - while (path && !__path_has_node(path, b)) - path = __trans_next_path(trans, path->idx + 1); + while ((path = __trans_next_path(trans, idx)) && + !__path_has_node(path, b)) + (*idx)++; return path; } -#define trans_for_each_path_with_node(_trans, _b, _path) \ - for (_path = __trans_next_path_with_node((_trans), (_b), 0); \ - (_path); \ - _path = __trans_next_path_with_node((_trans), (_b), \ - (_path)->idx + 1)) +#define trans_for_each_path_with_node(_trans, _b, _path, _iter) \ + for (_iter = 1; \ + (_path = __trans_next_path_with_node((_trans), (_b), &_iter));\ + _iter++) -struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *, - bool, unsigned long); +btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *, btree_path_idx_t, + bool, unsigned long); -static inline struct btree_path * __must_check +static inline btree_path_idx_t __must_check bch2_btree_path_make_mut(struct btree_trans *trans, - struct btree_path *path, bool intent, + btree_path_idx_t path, bool intent, unsigned long ip) { - if (path->ref > 1 || path->preserve) + if (trans->paths[path].ref > 1 || + trans->paths[path].preserve) path = __bch2_btree_path_make_mut(trans, path, intent, ip); - path->should_be_locked = false; + trans->paths[path].should_be_locked = false; return path; } -struct btree_path * __must_check -__bch2_btree_path_set_pos(struct btree_trans *, struct btree_path *, - struct bpos, bool, unsigned long, int); +btree_path_idx_t __must_check +__bch2_btree_path_set_pos(struct btree_trans *, btree_path_idx_t, + struct bpos, bool, unsigned long); -static inline struct btree_path * __must_check +static inline btree_path_idx_t __must_check bch2_btree_path_set_pos(struct btree_trans *trans, - struct btree_path *path, struct bpos new_pos, - bool intent, unsigned long ip) + btree_path_idx_t path, struct bpos new_pos, + bool intent, unsigned long ip) { - int cmp = bpos_cmp(new_pos, path->pos); - - return cmp - ? __bch2_btree_path_set_pos(trans, path, new_pos, intent, ip, cmp) + return !bpos_eq(new_pos, trans->paths[path].pos) + ? __bch2_btree_path_set_pos(trans, path, new_pos, intent, ip) : path; } -int __must_check bch2_btree_path_traverse_one(struct btree_trans *, struct btree_path *, +int __must_check bch2_btree_path_traverse_one(struct btree_trans *, + btree_path_idx_t, unsigned, unsigned long); static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans, - struct btree_path *path, unsigned flags) + btree_path_idx_t path, unsigned flags) { - if (path->uptodate < BTREE_ITER_NEED_RELOCK) + if (trans->paths[path].uptodate < BTREE_ITER_NEED_RELOCK) return 0; return bch2_btree_path_traverse_one(trans, path, flags, _RET_IP_); } -int __must_check bch2_btree_path_traverse(struct btree_trans *, - struct btree_path *, unsigned); -struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, +btree_path_idx_t bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, unsigned, unsigned, unsigned, unsigned long); struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); @@ -269,7 +277,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_path *, int bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *); -void bch2_path_put(struct btree_trans *, struct btree_path *, bool); +void bch2_path_put(struct btree_trans *, btree_path_idx_t, bool); int bch2_trans_relock(struct btree_trans *); int bch2_trans_relock_notrace(struct btree_trans *); @@ -335,7 +343,7 @@ static inline void bch2_btree_path_downgrade(struct btree_trans *trans, void bch2_trans_downgrade(struct btree_trans *); -void bch2_trans_node_add(struct btree_trans *trans, struct btree *); +void bch2_trans_node_add(struct btree_trans *trans, struct btree_path *, struct btree *); void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *); int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter); @@ -348,8 +356,6 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *); struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos); struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); -struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *); - static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) { return bch2_btree_iter_peek_upto(iter, SPOS_MAX); @@ -376,10 +382,12 @@ static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpo static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) { + struct btree_trans *trans = iter->trans; + if (unlikely(iter->update_path)) - bch2_path_put(iter->trans, iter->update_path, + bch2_path_put(trans, iter->update_path, iter->flags & BTREE_ITER_INTENT); - iter->update_path = NULL; + iter->update_path = 0; if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) new_pos.snapshot = iter->snapshot; @@ -408,9 +416,6 @@ static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans, unsigned btree_id, unsigned flags) { - if (flags & BTREE_ITER_ALL_LEVELS) - flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS; - if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) && btree_id_is_extents(btree_id)) flags |= BTREE_ITER_IS_EXTENTS; @@ -450,14 +455,16 @@ static inline void bch2_trans_iter_init_common(struct btree_trans *trans, unsigned flags, unsigned long ip) { - memset(iter, 0, sizeof(*iter)); - iter->trans = trans; - iter->btree_id = btree_id; - iter->flags = flags; - iter->snapshot = pos.snapshot; - iter->pos = pos; - iter->k.p = pos; - + iter->trans = trans; + iter->update_path = 0; + iter->key_cache_path = 0; + iter->btree_id = btree_id; + iter->min_depth = 0; + iter->flags = flags; + iter->snapshot = pos.snapshot; + iter->pos = pos; + iter->k = POS_KEY(pos); + iter->journal_idx = 0; #ifdef CONFIG_BCACHEFS_DEBUG iter->ip_allocated = ip; #endif @@ -489,8 +496,10 @@ void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *); static inline void set_btree_iter_dontneed(struct btree_iter *iter) { - if (!iter->trans->restarted) - iter->path->preserve = false; + struct btree_trans *trans = iter->trans; + + if (!trans->restarted) + btree_iter_path(trans, iter)->preserve = false; } void *__bch2_trans_kmalloc(struct btree_trans *, size_t); @@ -512,7 +521,7 @@ static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) static inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size) { - size = roundup(size, 8); + size = round_up(size, 8); if (likely(trans->mem_top + size <= trans->mem_bytes)) { void *p = trans->mem + trans->mem_top; @@ -581,7 +590,6 @@ static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans, KEY_TYPE_##_type, sizeof(*_val), _val) void bch2_trans_srcu_unlock(struct btree_trans *); -void bch2_trans_srcu_lock(struct btree_trans *); u32 bch2_trans_begin(struct btree_trans *); @@ -606,8 +614,6 @@ u32 bch2_trans_begin(struct btree_trans *); static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter, unsigned flags) { - BUG_ON(flags & BTREE_ITER_ALL_LEVELS); - return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) : bch2_btree_iter_peek_prev(iter); } @@ -615,8 +621,7 @@ static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter * static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, unsigned flags) { - return flags & BTREE_ITER_ALL_LEVELS ? bch2_btree_iter_peek_all_levels(iter) : - flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) : + return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) : bch2_btree_iter_peek(iter); } @@ -633,61 +638,34 @@ static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter * return bch2_btree_iter_peek_slot(iter); } +int __bch2_btree_trans_too_many_iters(struct btree_trans *); + static inline int btree_trans_too_many_iters(struct btree_trans *trans) { - if (hweight64(trans->paths_allocated) > BTREE_ITER_MAX - 8) { - trace_and_count(trans->c, trans_restart_too_many_iters, trans, _THIS_IP_); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters); - } + if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_INITIAL - 8) + return __bch2_btree_trans_too_many_iters(trans); return 0; } -struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); - -static inline struct bkey_s_c -__bch2_btree_iter_peek_and_restart(struct btree_trans *trans, - struct btree_iter *iter, unsigned flags) -{ - struct bkey_s_c k; - - while (btree_trans_too_many_iters(trans) || - (k = bch2_btree_iter_peek_type(iter, flags), - bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) - bch2_trans_begin(trans); - - return k; -} - -static inline struct bkey_s_c -__bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos end, - unsigned flags) -{ - struct bkey_s_c k; - - while (btree_trans_too_many_iters(trans) || - (k = bch2_btree_iter_peek_upto_type(iter, end, flags), - bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) - bch2_trans_begin(trans); - - return k; -} - +/* + * goto instead of loop, so that when used inside for_each_btree_key2() + * break/continue work correctly + */ #define lockrestart_do(_trans, _do) \ ({ \ + __label__ transaction_restart; \ u32 _restart_count; \ int _ret2; \ +transaction_restart: \ + _restart_count = bch2_trans_begin(_trans); \ + _ret2 = (_do); \ \ - do { \ - _restart_count = bch2_trans_begin(_trans); \ - _ret2 = (_do); \ - } while (bch2_err_matches(_ret2, BCH_ERR_transaction_restart)); \ + if (bch2_err_matches(_ret2, BCH_ERR_transaction_restart)) \ + goto transaction_restart; \ \ if (!_ret2) \ bch2_trans_verify_not_restarted(_trans, _restart_count);\ - \ _ret2; \ }) @@ -716,91 +694,56 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, _ret2 ?: trans_was_restarted(_trans, _restart_count); \ }) -#define for_each_btree_key2(_trans, _iter, _btree_id, \ - _start, _flags, _k, _do) \ +#define for_each_btree_key_upto(_trans, _iter, _btree_id, \ + _start, _end, _flags, _k, _do) \ ({ \ + struct btree_iter _iter; \ + struct bkey_s_c _k; \ int _ret3 = 0; \ \ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ \ - while (1) { \ - u32 _restart_count = bch2_trans_begin(_trans); \ - \ - _ret3 = 0; \ - (_k) = bch2_btree_iter_peek_type(&(_iter), (_flags)); \ - if (!(_k).k) \ - break; \ + do { \ + _ret3 = lockrestart_do(_trans, ({ \ + (_k) = bch2_btree_iter_peek_upto_type(&(_iter), \ + _end, (_flags)); \ + if (!(_k).k) \ + break; \ \ - _ret3 = bkey_err(_k) ?: (_do); \ - if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\ - continue; \ - if (_ret3) \ - break; \ - bch2_trans_verify_not_restarted(_trans, _restart_count);\ - if (!bch2_btree_iter_advance(&(_iter))) \ - break; \ - } \ + bkey_err(_k) ?: (_do); \ + })); \ + } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \ \ bch2_trans_iter_exit((_trans), &(_iter)); \ _ret3; \ }) -#define for_each_btree_key2_upto(_trans, _iter, _btree_id, \ - _start, _end, _flags, _k, _do) \ -({ \ - int _ret3 = 0; \ - \ - bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - \ - while (1) { \ - u32 _restart_count = bch2_trans_begin(_trans); \ - \ - _ret3 = 0; \ - (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, (_flags));\ - if (!(_k).k) \ - break; \ - \ - _ret3 = bkey_err(_k) ?: (_do); \ - if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\ - continue; \ - if (_ret3) \ - break; \ - bch2_trans_verify_not_restarted(_trans, _restart_count);\ - if (!bch2_btree_iter_advance(&(_iter))) \ - break; \ - } \ - \ - bch2_trans_iter_exit((_trans), &(_iter)); \ - _ret3; \ -}) +#define for_each_btree_key(_trans, _iter, _btree_id, \ + _start, _flags, _k, _do) \ + for_each_btree_key_upto(_trans, _iter, _btree_id, _start, \ + SPOS_MAX, _flags, _k, _do) #define for_each_btree_key_reverse(_trans, _iter, _btree_id, \ _start, _flags, _k, _do) \ ({ \ + struct btree_iter _iter; \ + struct bkey_s_c _k; \ int _ret3 = 0; \ \ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ \ - while (1) { \ - u32 _restart_count = bch2_trans_begin(_trans); \ - (_k) = bch2_btree_iter_peek_prev_type(&(_iter), (_flags));\ - if (!(_k).k) { \ - _ret3 = 0; \ - break; \ - } \ + do { \ + _ret3 = lockrestart_do(_trans, ({ \ + (_k) = bch2_btree_iter_peek_prev_type(&(_iter), \ + (_flags)); \ + if (!(_k).k) \ + break; \ \ - _ret3 = bkey_err(_k) ?: (_do); \ - if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\ - continue; \ - if (_ret3) \ - break; \ - bch2_trans_verify_not_restarted(_trans, _restart_count);\ - if (!bch2_btree_iter_rewind(&(_iter))) \ - break; \ - } \ + bkey_err(_k) ?: (_do); \ + })); \ + } while (!_ret3 && bch2_btree_iter_rewind(&(_iter))); \ \ bch2_trans_iter_exit((_trans), &(_iter)); \ _ret3; \ @@ -810,7 +753,7 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, _start, _iter_flags, _k, \ _disk_res, _journal_seq, _commit_flags,\ _do) \ - for_each_btree_key2(_trans, _iter, _btree_id, _start, _iter_flags, _k,\ + for_each_btree_key(_trans, _iter, _btree_id, _start, _iter_flags, _k,\ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ (_journal_seq), (_commit_flags))) @@ -826,32 +769,31 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, _start, _end, _iter_flags, _k, \ _disk_res, _journal_seq, _commit_flags,\ _do) \ - for_each_btree_key2_upto(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\ + for_each_btree_key_upto(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ (_journal_seq), (_commit_flags))) -#define for_each_btree_key(_trans, _iter, _btree_id, \ - _start, _flags, _k, _ret) \ - for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\ - !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_advance(&(_iter))) +struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); -#define for_each_btree_key_upto(_trans, _iter, _btree_id, \ - _start, _end, _flags, _k, _ret) \ - for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - (_k) = __bch2_btree_iter_peek_upto_and_restart((_trans), \ - &(_iter), _end, _flags),\ - !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_advance(&(_iter))) +static inline struct bkey_s_c +__bch2_btree_iter_peek_and_restart(struct btree_trans *trans, + struct btree_iter *iter, unsigned flags) +{ + struct bkey_s_c k; -#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \ + while (btree_trans_too_many_iters(trans) || + (k = bch2_btree_iter_peek_type(iter, flags), + bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) + bch2_trans_begin(trans); + + return k; +} + +#define for_each_btree_key_old(_trans, _iter, _btree_id, \ _start, _flags, _k, _ret) \ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ - (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \ + (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\ !((_ret) = bkey_err(_k)) && (_k).k; \ bch2_btree_iter_advance(&(_iter))) @@ -863,24 +805,25 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, !((_ret) = bkey_err(_k)) && (_k).k; \ bch2_btree_iter_advance(&(_iter))) -#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _ret) \ - for (; \ - (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\ - !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_advance(&(_iter))) - -#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ - for (; \ - (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \ - !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_advance(&(_iter))) - #define for_each_btree_key_upto_continue_norestart(_iter, _end, _flags, _k, _ret)\ for (; \ (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags), \ !((_ret) = bkey_err(_k)) && (_k).k; \ bch2_btree_iter_advance(&(_iter))) +#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \ + _start, _flags, _k, _ret) \ + for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, _start,\ + SPOS_MAX, _flags, _k, _ret) + +#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ + for_each_btree_key_upto_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret) + +/* + * This should not be used in a fastpath, without first trying _do in + * nonblocking mode - it will cause excessive transaction restarts and + * potentially livelocking: + */ #define drop_locks_do(_trans, _do) \ ({ \ bch2_trans_unlock(_trans); \ @@ -912,10 +855,7 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, _p; \ }) -/* new multiple iterator interface: */ - void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *); -void bch2_btree_path_to_text(struct printbuf *, struct btree_path *); void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *); void bch2_dump_trans_updates(struct btree_trans *); void bch2_dump_trans_paths_updates(struct btree_trans *); diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index ec52f50d24..719a94a849 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -73,6 +73,7 @@ static size_t bch2_journal_key_search(struct journal_keys *keys, return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos)); } +/* Returns first non-overwritten key >= search key: */ struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id, unsigned level, struct bpos pos, struct bpos end_pos, size_t *idx) @@ -86,12 +87,26 @@ search: if (!*idx) *idx = __bch2_journal_key_search(keys, btree_id, level, pos); + while (*idx && + __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) { + --(*idx); + iters++; + if (iters == 10) { + *idx = 0; + goto search; + } + } + while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) { if (__journal_key_cmp(btree_id, level, end_pos, k) < 0) return NULL; - if (__journal_key_cmp(btree_id, level, pos, k) <= 0 && - !k->overwritten) + if (k->overwritten) { + (*idx)++; + continue; + } + + if (__journal_key_cmp(btree_id, level, pos, k) <= 0) return k->k; (*idx)++; @@ -162,7 +177,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, struct journal_keys *keys = &c->journal_keys; size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); - BUG_ON(test_bit(BCH_FS_RW, &c->flags)); + BUG_ON(test_bit(BCH_FS_rw, &c->flags)); if (idx < keys->size && journal_key_cmp(&n, &keys->d[idx]) == 0) { @@ -452,9 +467,7 @@ static void __journal_keys_sort(struct journal_keys *keys) src = dst = keys->d; while (src < keys->d + keys->nr) { while (src + 1 < keys->d + keys->nr && - src[0].btree_id == src[1].btree_id && - src[0].level == src[1].level && - bpos_eq(src[0].k->k.p, src[1].k->k.p)) + !journal_key_cmp(src, src + 1)) src++; *dst++ = *src++; diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 1b7a5668df..74e52fd28a 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -630,7 +630,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, if (ret) goto out; - ck = (void *) c_iter.path->l[0].b; + ck = (void *) btree_iter_path(trans, &c_iter)->l[0].b; if (!ck) goto out; @@ -645,22 +645,29 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, if (journal_seq && ck->journal.seq != journal_seq) goto out; + trans->journal_res.seq = ck->journal.seq; + /* - * Since journal reclaim depends on us making progress here, and the - * allocator/copygc depend on journal reclaim making progress, we need - * to be using alloc reserves: + * If we're at the end of the journal, we really want to free up space + * in the journal right away - we don't want to pin that old journal + * sequence number with a new btree node write, we want to re-journal + * the update */ + if (ck->journal.seq == journal_last_seq(j)) + commit_flags |= BCH_WATERMARK_reclaim; + + if (ck->journal.seq != journal_last_seq(j) || + j->watermark == BCH_WATERMARK_stripe) + commit_flags |= BCH_TRANS_COMMIT_no_journal_res; + ret = bch2_btree_iter_traverse(&b_iter) ?: bch2_trans_update(trans, &b_iter, ck->k, BTREE_UPDATE_KEY_CACHE_RECLAIM| BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| BTREE_TRIGGER_NORUN) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL| - (ck->journal.seq == journal_last_seq(j) - ? BCH_WATERMARK_reclaim - : 0)| + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc| commit_flags); bch2_fs_fatal_err_on(ret && @@ -673,7 +680,8 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, bch2_journal_pin_drop(j, &ck->journal); - BUG_ON(!btree_node_locked(c_iter.path, 0)); + struct btree_path *path = btree_iter_path(trans, &c_iter); + BUG_ON(!btree_node_locked(path, 0)); if (!evict) { if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { @@ -682,19 +690,20 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, } } else { struct btree_path *path2; + unsigned i; evict: - trans_for_each_path(trans, path2) - if (path2 != c_iter.path) + trans_for_each_path(trans, path2, i) + if (path2 != path) __bch2_btree_path_unlock(trans, path2); - bch2_btree_node_lock_write_nofail(trans, c_iter.path, &ck->c); + bch2_btree_node_lock_write_nofail(trans, path, &ck->c); if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { clear_bit(BKEY_CACHED_DIRTY, &ck->flags); atomic_long_dec(&c->btree_key_cache.nr_dirty); } - mark_btree_node_locked_noreset(c_iter.path, 0, BTREE_NODE_UNLOCKED); + mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); bkey_cached_evict(&c->btree_key_cache, ck); bkey_cached_free_fast(&c->btree_key_cache, ck); } @@ -732,9 +741,9 @@ int bch2_btree_key_cache_journal_flush(struct journal *j, } six_unlock_read(&ck->c.lock); - ret = commit_do(trans, NULL, NULL, 0, + ret = lockrestart_do(trans, btree_key_cache_flush_pos(trans, key, seq, - BTREE_INSERT_JOURNAL_RECLAIM, false)); + BCH_TRANS_COMMIT_journal_reclaim, false)); unlock: srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); @@ -742,28 +751,12 @@ unlock: return ret; } -/* - * Flush and evict a key from the key cache: - */ -int bch2_btree_key_cache_flush(struct btree_trans *trans, - enum btree_id id, struct bpos pos) -{ - struct bch_fs *c = trans->c; - struct bkey_cached_key key = { id, pos }; - - /* Fastpath - assume it won't be found: */ - if (!bch2_btree_key_cache_find(c, id, pos)) - return 0; - - return btree_key_cache_flush_pos(trans, key, 0, 0, true); -} - bool bch2_btree_insert_key_cached(struct btree_trans *trans, unsigned flags, struct btree_insert_entry *insert_entry) { struct bch_fs *c = trans->c; - struct bkey_cached *ck = (void *) insert_entry->path->l[0].b; + struct bkey_cached *ck = (void *) (trans->paths + insert_entry->path)->l[0].b; struct bkey_i *insert = insert_entry->k; bool kick_reclaim = false; @@ -773,7 +766,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, ck->valid = true; if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); + EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); set_bit(BKEY_CACHED_DIRTY, &ck->flags); atomic_long_inc(&c->btree_key_cache.nr_dirty); @@ -1000,7 +993,7 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) if (atomic_long_read(&bc->nr_dirty) && !bch2_journal_error(&c->journal) && - test_bit(BCH_FS_WAS_RW, &c->flags)) + test_bit(BCH_FS_was_rw, &c->flags)) panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n", atomic_long_read(&bc->nr_dirty)); diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h index be3acde2ca..e6b2cd0dd2 100644 --- a/fs/bcachefs/btree_key_cache.h +++ b/fs/bcachefs/btree_key_cache.h @@ -31,8 +31,6 @@ int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *, bool bch2_btree_insert_key_cached(struct btree_trans *, unsigned, struct btree_insert_entry *); -int bch2_btree_key_cache_flush(struct btree_trans *, - enum btree_id, struct bpos); void bch2_btree_key_cache_drop(struct btree_trans *, struct btree_path *); diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 3d48834d09..6843974423 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -32,13 +32,14 @@ struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans, { struct btree_path *path; struct six_lock_count ret; + unsigned i; memset(&ret, 0, sizeof(ret)); if (IS_ERR_OR_NULL(b)) return ret; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (path != skip && &path->l[level].b->c == b) { int t = btree_node_locked_type(path, level); @@ -85,8 +86,14 @@ static noinline void print_cycle(struct printbuf *out, struct lock_graph *g) prt_printf(out, "Found lock cycle (%u entries):", g->nr); prt_newline(out); - for (i = g->g; i < g->g + g->nr; i++) + for (i = g->g; i < g->g + g->nr; i++) { + struct task_struct *task = READ_ONCE(i->trans->locking_wait.task); + if (!task) + continue; + bch2_btree_trans_to_text(out, i->trans); + bch2_prt_task_backtrace(out, task, i == g->g ? 5 : 1, GFP_NOWAIT); + } } static noinline void print_chain(struct printbuf *out, struct lock_graph *g) @@ -94,9 +101,10 @@ static noinline void print_chain(struct printbuf *out, struct lock_graph *g) struct trans_waiting_for_lock *i; for (i = g->g; i != g->g + g->nr; i++) { + struct task_struct *task = i->trans->locking_wait.task; if (i != g->g) prt_str(out, "<- "); - prt_printf(out, "%u ", i->trans->locking_wait.task->pid); + prt_printf(out, "%u ", task ?task->pid : 0); } prt_newline(out); } @@ -142,10 +150,27 @@ static bool lock_graph_remove_non_waiters(struct lock_graph *g) return false; } +static void trace_would_deadlock(struct lock_graph *g, struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + + count_event(c, trans_restart_would_deadlock); + + if (trace_trans_restart_would_deadlock_enabled()) { + struct printbuf buf = PRINTBUF; + + buf.atomic++; + print_cycle(&buf, g); + + trace_trans_restart_would_deadlock(trans, buf.buf); + printbuf_exit(&buf); + } +} + static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i) { if (i == g->g) { - trace_and_count(i->trans->c, trans_restart_would_deadlock, i->trans, _RET_IP_); + trace_would_deadlock(g, i->trans); return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock); } else { i->trans->lock_must_abort = true; @@ -202,7 +227,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) prt_printf(&buf, "backtrace:"); prt_newline(&buf); printbuf_indent_add(&buf, 2); - bch2_prt_task_backtrace(&buf, trans->locking_wait.task); + bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT); printbuf_indent_sub(&buf, 2); prt_newline(&buf); } @@ -262,27 +287,40 @@ int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle) struct lock_graph g; struct trans_waiting_for_lock *top; struct btree_bkey_cached_common *b; - struct btree_path *path; - unsigned path_idx; - int ret; + btree_path_idx_t path_idx; + int ret = 0; + + g.nr = 0; if (trans->lock_must_abort) { if (cycle) return -1; - trace_and_count(trans->c, trans_restart_would_deadlock, trans, _RET_IP_); + trace_would_deadlock(&g, trans); return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock); } - g.nr = 0; lock_graph_down(&g, trans); + + /* trans->paths is rcu protected vs. freeing */ + rcu_read_lock(); + if (cycle) + cycle->atomic++; next: if (!g.nr) - return 0; + goto out; top = &g.g[g.nr - 1]; - trans_for_each_path_safe_from(top->trans, path, path_idx, top->path_idx) { + struct btree_path *paths = rcu_dereference(top->trans->paths); + if (!paths) + goto up; + + unsigned long *paths_allocated = trans_paths_allocated(paths); + + trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths), + path_idx, top->path_idx) { + struct btree_path *path = paths + path_idx; if (!path->nodes_locked) continue; @@ -348,18 +386,23 @@ next: ret = lock_graph_descend(&g, trans, cycle); if (ret) - return ret; + goto out; goto next; } raw_spin_unlock(&b->lock.wait_lock); } } - +up: if (g.nr > 1 && cycle) print_chain(cycle, &g); lock_graph_up(&g); goto next; +out: + if (cycle) + --cycle->atomic; + rcu_read_unlock(); + return ret; } int bch2_six_check_for_deadlock(struct six_lock *lock, void *p) @@ -398,7 +441,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, struct btree_bkey_cached_common *b) { struct btree_path *linked; - unsigned i; + unsigned i, iter; int ret; /* @@ -412,7 +455,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, * already taken are no longer needed: */ - trans_for_each_path(trans, linked) { + trans_for_each_path(trans, linked, iter) { if (!linked->nodes_locked) continue; @@ -588,8 +631,7 @@ int bch2_btree_path_relock_intent(struct btree_trans *trans, } __flatten -bool bch2_btree_path_relock_norestart(struct btree_trans *trans, - struct btree_path *path, unsigned long trace_ip) +bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_path *path) { struct get_locks_fail f; @@ -599,7 +641,7 @@ bool bch2_btree_path_relock_norestart(struct btree_trans *trans, int __bch2_btree_path_relock(struct btree_trans *trans, struct btree_path *path, unsigned long trace_ip) { - if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) { + if (!bch2_btree_path_relock_norestart(trans, path)) { trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path); return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path); } @@ -624,8 +666,6 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans, unsigned new_locks_want, struct get_locks_fail *f) { - struct btree_path *linked; - if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f)) return true; @@ -648,8 +688,11 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans, * before interior nodes - now that's handled by * bch2_btree_path_traverse_all(). */ - if (!path->cached && !trans->in_traverse_all) - trans_for_each_path(trans, linked) + if (!path->cached && !trans->in_traverse_all) { + struct btree_path *linked; + unsigned i; + + trans_for_each_path(trans, linked, i) if (linked != path && linked->cached == path->cached && linked->btree_id == path->btree_id && @@ -657,6 +700,7 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans, linked->locks_want = new_locks_want; btree_path_get_locks(trans, linked, true, NULL); } + } return false; } @@ -665,7 +709,7 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans, struct btree_path *path, unsigned new_locks_want) { - unsigned l; + unsigned l, old_locks_want = path->locks_want; if (trans->restarted) return; @@ -689,8 +733,7 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans, bch2_btree_path_verify_locks(path); - path->downgrade_seq++; - trace_path_downgrade(trans, _RET_IP_, path); + trace_path_downgrade(trans, _RET_IP_, path, old_locks_want); } /* Btree transaction locking: */ @@ -698,40 +741,70 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans, void bch2_trans_downgrade(struct btree_trans *trans) { struct btree_path *path; + unsigned i; if (trans->restarted) return; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) bch2_btree_path_downgrade(trans, path); } int bch2_trans_relock(struct btree_trans *trans) { struct btree_path *path; + unsigned i; if (unlikely(trans->restarted)) return -((int) trans->restarted); - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) { + struct get_locks_fail f; + if (path->should_be_locked && - !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) { - trace_and_count(trans->c, trans_restart_relock, trans, _RET_IP_, path); + !btree_path_get_locks(trans, path, false, &f)) { + if (trace_trans_restart_relock_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_bpos_to_text(&buf, path->pos); + prt_printf(&buf, " l=%u seq=%u node seq=", + f.l, path->l[f.l].lock_seq); + if (IS_ERR_OR_NULL(f.b)) { + prt_str(&buf, bch2_err_str(PTR_ERR(f.b))); + } else { + prt_printf(&buf, "%u", f.b->c.lock.seq); + + struct six_lock_count c = + bch2_btree_node_lock_counts(trans, NULL, &f.b->c, f.l); + prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); + + c = six_lock_counts(&f.b->c.lock); + prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); + } + + trace_trans_restart_relock(trans, _RET_IP_, buf.buf); + printbuf_exit(&buf); + } + + count_event(trans->c, trans_restart_relock); return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); } + } + return 0; } int bch2_trans_relock_notrace(struct btree_trans *trans) { struct btree_path *path; + unsigned i; if (unlikely(trans->restarted)) return -((int) trans->restarted); - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (path->should_be_locked && - !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) { + !bch2_btree_path_relock_norestart(trans, path)) { return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); } return 0; @@ -740,16 +813,18 @@ int bch2_trans_relock_notrace(struct btree_trans *trans) void bch2_trans_unlock_noassert(struct btree_trans *trans) { struct btree_path *path; + unsigned i; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) __bch2_btree_path_unlock(trans, path); } void bch2_trans_unlock(struct btree_trans *trans) { struct btree_path *path; + unsigned i; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) __bch2_btree_path_unlock(trans, path); } @@ -762,8 +837,9 @@ void bch2_trans_unlock_long(struct btree_trans *trans) bool bch2_trans_locked(struct btree_trans *trans) { struct btree_path *path; + unsigned i; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (path->nodes_locked) return true; return false; @@ -809,8 +885,9 @@ void bch2_btree_path_verify_locks(struct btree_path *path) void bch2_trans_verify_locks(struct btree_trans *trans) { struct btree_path *path; + unsigned i; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) bch2_btree_path_verify_locks(path); } diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 11b0a2c8cd..4bd72c855d 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -122,12 +122,9 @@ static void btree_trans_lock_hold_time_update(struct btree_trans *trans, struct btree_path *path, unsigned level) { #ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS - struct btree_transaction_stats *s = btree_trans_stats(trans); - - if (s) - __bch2_time_stats_update(&s->lock_hold_times, - path->l[level].lock_taken_time, - local_clock()); + __bch2_time_stats_update(&btree_trans_stats(trans)->lock_hold_times, + path->l[level].lock_taken_time, + local_clock()); #endif } @@ -175,6 +172,7 @@ bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_pat struct btree *b) { struct btree_path *linked; + unsigned i; EBUG_ON(path->l[b->c.level].b != b); EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock)); @@ -182,7 +180,7 @@ bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_pat mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); - trans_for_each_path_with_node(trans, b, linked) + trans_for_each_path_with_node(trans, b, linked, i) linked->l[b->c.level].lock_seq++; six_unlock_write(&b->c.lock); @@ -242,8 +240,9 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans, enum btree_node_locked_type want) { struct btree_path *path; + unsigned i; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (&path->l[level].b->c == b && btree_node_locked_type(path, level) >= want) { six_lock_increment(&b->lock, (enum six_lock_type) want); @@ -263,7 +262,6 @@ static inline int btree_node_lock(struct btree_trans *trans, int ret = 0; EBUG_ON(level >= BTREE_MAX_DEPTH); - EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); if (likely(six_trylock_type(&b->lock, type)) || btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) || @@ -314,8 +312,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *, /* relock: */ -bool bch2_btree_path_relock_norestart(struct btree_trans *, - struct btree_path *, unsigned long); +bool bch2_btree_path_relock_norestart(struct btree_trans *, struct btree_path *); int __bch2_btree_path_relock(struct btree_trans *, struct btree_path *, unsigned long); @@ -355,12 +352,6 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans, /* upgrade */ - -struct get_locks_fail { - unsigned l; - struct btree *b; -}; - bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *, struct btree_path *, unsigned, struct get_locks_fail *); diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 12907beda9..30d69a6d13 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -12,6 +12,7 @@ #include "errcode.h" #include "error.h" #include "journal.h" +#include "journal_io.h" #include "journal_reclaim.h" #include "replicas.h" #include "snapshot.h" @@ -23,7 +24,7 @@ static void verify_update_old_key(struct btree_trans *trans, struct btree_insert #ifdef CONFIG_BCACHEFS_DEBUG struct bch_fs *c = trans->c; struct bkey u; - struct bkey_s_c k = bch2_btree_path_peek_slot_exact(i->path, &u); + struct bkey_s_c k = bch2_btree_path_peek_slot_exact(trans->paths + i->path, &u); if (unlikely(trans->journal_replay_not_finished)) { struct bkey_i *j_k = @@ -41,23 +42,23 @@ static void verify_update_old_key(struct btree_trans *trans, struct btree_insert #endif } -static inline struct btree_path_level *insert_l(struct btree_insert_entry *i) +static inline struct btree_path_level *insert_l(struct btree_trans *trans, struct btree_insert_entry *i) { - return i->path->l + i->level; + return (trans->paths + i->path)->l + i->level; } static inline bool same_leaf_as_prev(struct btree_trans *trans, struct btree_insert_entry *i) { return i != trans->updates && - insert_l(&i[0])->b == insert_l(&i[-1])->b; + insert_l(trans, &i[0])->b == insert_l(trans, &i[-1])->b; } static inline bool same_leaf_as_next(struct btree_trans *trans, struct btree_insert_entry *i) { return i + 1 < trans->updates + trans->nr_updates && - insert_l(&i[0])->b == insert_l(&i[1])->b; + insert_l(trans, &i[0])->b == insert_l(trans, &i[1])->b; } inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, @@ -84,7 +85,7 @@ static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btre if (same_leaf_as_prev(trans, i)) continue; - bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b); + bch2_btree_node_unlock_write(trans, trans->paths + i->path, insert_l(trans, i)->b); } trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); @@ -93,19 +94,17 @@ static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btre static inline int bch2_trans_lock_write(struct btree_trans *trans) { - struct btree_insert_entry *i; - EBUG_ON(trans->write_locked); trans_for_each_update(trans, i) { if (same_leaf_as_prev(trans, i)) continue; - if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c)) + if (bch2_btree_node_lock_write(trans, trans->paths + i->path, &insert_l(trans, i)->b->c)) return trans_lock_write_fail(trans, i); if (!i->cached) - bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); + bch2_btree_node_prep_for_write(trans, trans->paths + i->path, insert_l(trans, i)->b); } trans->write_locked = true; @@ -115,12 +114,10 @@ static inline int bch2_trans_lock_write(struct btree_trans *trans) static inline void bch2_trans_unlock_write(struct btree_trans *trans) { if (likely(trans->write_locked)) { - struct btree_insert_entry *i; - trans_for_each_update(trans, i) if (!same_leaf_as_prev(trans, i)) - bch2_btree_node_unlock_write_inlined(trans, i->path, - insert_l(i)->b); + bch2_btree_node_unlock_write_inlined(trans, + trans->paths + i->path, insert_l(trans, i)->b); trans->write_locked = false; } } @@ -142,8 +139,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); EBUG_ON(bpos_lt(insert->k.p, b->data->min_key)); EBUG_ON(bpos_gt(insert->k.p, b->data->max_key)); - EBUG_ON(insert->k.u64s > - bch_btree_keys_u64s_remaining(trans->c, b)); + EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b)); EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos)); k = bch2_btree_node_iter_peek_all(node_iter, b); @@ -163,7 +159,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, k->type = KEY_TYPE_deleted; if (k->needs_whiteout) - push_whiteout(trans->c, b, insert->k.p); + push_whiteout(b, insert->k.p); k->needs_whiteout = false; if (k >= btree_bset_last(b)->start) { @@ -287,7 +283,7 @@ inline void bch2_btree_insert_key_leaf(struct btree_trans *trans, bch2_btree_add_journal_pin(c, b, journal_seq); if (unlikely(!btree_node_dirty(b))) { - EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); + EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); set_btree_node_dirty_acct(c, b); } @@ -311,10 +307,12 @@ inline void bch2_btree_insert_key_leaf(struct btree_trans *trans, static inline void btree_insert_entry_checks(struct btree_trans *trans, struct btree_insert_entry *i) { - BUG_ON(!bpos_eq(i->k->k.p, i->path->pos)); - BUG_ON(i->cached != i->path->cached); - BUG_ON(i->level != i->path->level); - BUG_ON(i->btree_id != i->path->btree_id); + struct btree_path *path = trans->paths + i->path; + + BUG_ON(!bpos_eq(i->k->k.p, path->pos)); + BUG_ON(i->cached != path->cached); + BUG_ON(i->level != path->level); + BUG_ON(i->btree_id != path->btree_id); EBUG_ON(!i->level && btree_type_has_snapshots(i->btree_id) && !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && @@ -349,9 +347,7 @@ static noinline void journal_transaction_name(struct btree_trans *trans) static inline int btree_key_can_insert(struct btree_trans *trans, struct btree *b, unsigned u64s) { - struct bch_fs *c = trans->c; - - if (!bch2_btree_node_insert_fits(c, b, u64s)) + if (!bch2_btree_node_insert_fits(b, u64s)) return -BCH_ERR_btree_insert_btree_node_full; return 0; @@ -361,8 +357,6 @@ noinline static int btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags, struct btree_path *path, unsigned new_u64s) { - struct bch_fs *c = trans->c; - struct btree_insert_entry *i; struct bkey_cached *ck = (void *) path->l[0].b; struct bkey_i *new_k; int ret; @@ -372,7 +366,7 @@ btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags, new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); if (!new_k) { - bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", + bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", bch2_btree_id_str(path->btree_id), new_u64s); return -BCH_ERR_ENOMEM_btree_key_cache_insert; } @@ -401,7 +395,6 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags { struct bch_fs *c = trans->c; struct bkey_cached *ck = (void *) path->l[0].b; - struct btree_insert_entry *i; unsigned new_u64s; struct bkey_i *new_k; @@ -409,7 +402,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && bch2_btree_key_cache_must_wait(c) && - !(flags & BTREE_INSERT_JOURNAL_RECLAIM)) + !(flags & BCH_TRANS_COMMIT_journal_reclaim)) return -BCH_ERR_btree_insert_need_journal_reclaim; /* @@ -422,7 +415,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags return 0; new_u64s = roundup_pow_of_two(u64s); - new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT); + new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN); if (unlikely(!new_k)) return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s); @@ -452,25 +445,15 @@ static int run_one_mem_trigger(struct btree_trans *trans, if (unlikely(flags & BTREE_TRIGGER_NORUN)) return 0; - if (!btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id))) - return 0; - - if (old_ops->atomic_trigger == new_ops->atomic_trigger) { - ret = bch2_mark_key(trans, i->btree_id, i->level, - old, bkey_i_to_s_c(new), + if (old_ops->trigger == new_ops->trigger) { + ret = bch2_key_trigger(trans, i->btree_id, i->level, + old, bkey_i_to_s(new), BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); } else { - struct bkey _deleted = KEY(0, 0, 0); - struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; - - _deleted.p = i->path->pos; - - ret = bch2_mark_key(trans, i->btree_id, i->level, - deleted, bkey_i_to_s_c(new), - BTREE_TRIGGER_INSERT|flags) ?: - bch2_mark_key(trans, i->btree_id, i->level, - old, deleted, - BTREE_TRIGGER_OVERWRITE|flags); + ret = bch2_key_trigger_new(trans, i->btree_id, i->level, + bkey_i_to_s(new), flags) ?: + bch2_key_trigger_old(trans, i->btree_id, i->level, + old, flags); } return ret; @@ -488,6 +471,7 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ struct bkey_s_c old = { &old_k, i->old_v }; const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); + unsigned flags = i->flags|BTREE_TRIGGER_TRANSACTIONAL; verify_update_old_key(trans, i); @@ -497,19 +481,18 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ if (!i->insert_trigger_run && !i->overwrite_trigger_run && - old_ops->trans_trigger == new_ops->trans_trigger) { + old_ops->trigger == new_ops->trigger) { i->overwrite_trigger_run = true; i->insert_trigger_run = true; - return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k, - BTREE_TRIGGER_INSERT| - BTREE_TRIGGER_OVERWRITE| - i->flags) ?: 1; + return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(i->k), + BTREE_TRIGGER_INSERT| + BTREE_TRIGGER_OVERWRITE|flags) ?: 1; } else if (overwrite && !i->overwrite_trigger_run) { i->overwrite_trigger_run = true; - return bch2_trans_mark_old(trans, i->btree_id, i->level, old, i->flags) ?: 1; + return bch2_key_trigger_old(trans, i->btree_id, i->level, old, flags) ?: 1; } else if (!overwrite && !i->insert_trigger_run) { i->insert_trigger_run = true; - return bch2_trans_mark_new(trans, i->btree_id, i->level, i->k, i->flags) ?: 1; + return bch2_key_trigger_new(trans, i->btree_id, i->level, bkey_i_to_s(i->k), flags) ?: 1; } else { return 0; } @@ -551,7 +534,7 @@ static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, static int bch2_trans_commit_run_triggers(struct btree_trans *trans) { - struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; + struct btree_insert_entry *btree_id_start = trans->updates; unsigned btree_id = 0; int ret = 0; @@ -597,10 +580,6 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) { - struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - int ret = 0; - trans_for_each_update(trans, i) { /* * XXX: synchronization of cached update triggers with gc @@ -608,14 +587,15 @@ static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) */ BUG_ON(i->cached || i->level); - if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) { - ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); + if (btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)) && + gc_visited(trans->c, gc_pos_btree_node(insert_l(trans, i)->b))) { + int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); if (ret) - break; + return ret; } } - return ret; + return 0; } static inline int @@ -624,8 +604,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, unsigned long trace_ip) { struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - struct btree_write_buffered_key *wb; struct btree_trans_commit_hook *h; unsigned u64s = 0; int ret; @@ -650,23 +628,21 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, u64s += i->k->k.u64s; ret = !i->cached - ? btree_key_can_insert(trans, insert_l(i)->b, u64s) - : btree_key_can_insert_cached(trans, flags, i->path, u64s); + ? btree_key_can_insert(trans, insert_l(trans, i)->b, u64s) + : btree_key_can_insert_cached(trans, flags, trans->paths + i->path, u64s); if (ret) { *stopped_at = i; return ret; } - } - if (trans->nr_wb_updates && - trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size) - return -BCH_ERR_btree_insert_need_flush_buffer; + i->k->k.needs_whiteout = false; + } /* * Don't get journal reservation until after we know insert will * succeed: */ - if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { + if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) { ret = bch2_trans_journal_res_get(trans, (flags & BCH_WATERMARK_MASK)| JOURNAL_RES_GET_NONBLOCK); @@ -675,8 +651,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, if (unlikely(trans->journal_transaction_names)) journal_transaction_name(trans); - } else { - trans->journal_res.seq = c->journal.replay_journal_seq; } /* @@ -685,7 +659,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, */ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && - !(flags & BTREE_INSERT_JOURNAL_REPLAY)) { + !(flags & BCH_TRANS_COMMIT_no_journal_res)) { if (bch2_journal_seq_verify) trans_for_each_update(trans, i) i->k->k.version.lo = trans->journal_res.seq; @@ -698,13 +672,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) return -BCH_ERR_btree_insert_need_mark_replicas; - if (trans->nr_wb_updates) { - EBUG_ON(flags & BTREE_INSERT_JOURNAL_REPLAY); - - ret = bch2_btree_insert_keys_write_buffer(trans); - if (ret) - goto revert_fs_usage; - } + /* XXX: we only want to run this if deltas are nonzero */ + bch2_trans_account_disk_usage_change(trans); h = trans->hooks; while (h) { @@ -715,8 +684,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, } trans_for_each_update(trans, i) - if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) { - ret = run_one_mem_trigger(trans, i, i->flags); + if (BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS & (1U << i->bkey_type)) { + ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_ATOMIC|i->flags); if (ret) goto fatal_err; } @@ -727,16 +696,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, goto fatal_err; } - if (unlikely(trans->extra_journal_entries.nr)) { - memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), - trans->extra_journal_entries.data, - trans->extra_journal_entries.nr); - - trans->journal_res.offset += trans->extra_journal_entries.nr; - trans->journal_res.u64s -= trans->extra_journal_entries.nr; - } - - if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { + if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) { struct journal *j = &c->journal; struct jset_entry *entry; @@ -765,33 +725,27 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, bkey_copy((struct bkey_i *) entry->start, i->k); } - trans_for_each_wb_update(trans, wb) { - entry = bch2_journal_add_entry(j, &trans->journal_res, - BCH_JSET_ENTRY_btree_keys, - wb->btree, 0, - wb->k.k.u64s); - bkey_copy((struct bkey_i *) entry->start, &wb->k); - } + memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), + trans->journal_entries, + trans->journal_entries_u64s); + + trans->journal_res.offset += trans->journal_entries_u64s; + trans->journal_res.u64s -= trans->journal_entries_u64s; if (trans->journal_seq) *trans->journal_seq = trans->journal_res.seq; } trans_for_each_update(trans, i) { - i->k->k.needs_whiteout = false; + struct btree_path *path = trans->paths + i->path; if (!i->cached) { - u64 seq = trans->journal_res.seq; - - if (i->flags & BTREE_UPDATE_PREJOURNAL) - seq = i->seq; - - bch2_btree_insert_key_leaf(trans, i->path, i->k, seq); + bch2_btree_insert_key_leaf(trans, path, i->k, trans->journal_res.seq); } else if (!i->key_cache_already_flushed) bch2_btree_insert_key_cached(trans, flags, i); else { - bch2_btree_key_cache_drop(trans, i->path); - btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE); + bch2_btree_key_cache_drop(trans, path); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); } } @@ -806,14 +760,8 @@ revert_fs_usage: static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) { - struct btree_insert_entry *i; - struct btree_write_buffered_key *wb; - trans_for_each_update(trans, i) bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); - - trans_for_each_wb_update(trans, wb) - bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p); } static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, @@ -841,6 +789,33 @@ static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, return -EINVAL; } +static noinline int bch2_trans_commit_journal_entry_invalid(struct btree_trans *trans, + struct jset_entry *i) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, "invalid bkey on insert from %s", trans->fn); + prt_newline(&buf); + printbuf_indent_add(&buf, 2); + + bch2_journal_entry_to_text(&buf, c, i); + prt_newline(&buf); + + bch2_print_string_as_lines(KERN_ERR, buf.buf); + + bch2_inconsistent_error(c); + bch2_dump_trans_updates(trans); + + return -EINVAL; +} + +static int bch2_trans_commit_journal_pin_flush(struct journal *j, + struct journal_entry_pin *_pin, u64 seq) +{ + return 0; +} + /* * Get journal reservation, take write locks, and attempt to do btree update(s): */ @@ -849,7 +824,6 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags unsigned long trace_ip) { struct bch_fs *c = trans->c; - struct btree_insert_entry *i; int ret = 0, u64s_delta = 0; trans_for_each_update(trans, i) { @@ -884,13 +858,15 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags if (!ret && trans->journal_pin) bch2_journal_pin_add(&c->journal, trans->journal_res.seq, - trans->journal_pin, NULL); + trans->journal_pin, + bch2_trans_commit_journal_pin_flush); /* * Drop journal reservation after dropping write locks, since dropping * the journal reservation may kick off a journal write: */ - bch2_journal_res_put(&c->journal, &trans->journal_res); + if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) + bch2_journal_res_put(&c->journal, &trans->journal_res); return ret; } @@ -916,7 +892,8 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, case -BCH_ERR_btree_insert_btree_node_full: ret = bch2_btree_split_leaf(trans, i->path, flags); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path); + trace_and_count(c, trans_restart_btree_node_split, trans, + trace_ip, trans->paths + i->path); break; case -BCH_ERR_btree_insert_need_mark_replicas: ret = drop_locks_do(trans, @@ -927,7 +904,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK * flag */ - if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) && + if ((flags & BCH_TRANS_COMMIT_journal_reclaim) && (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) { ret = -BCH_ERR_journal_reclaim_would_deadlock; break; @@ -950,30 +927,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, ret = bch2_trans_relock(trans); break; - case -BCH_ERR_btree_insert_need_flush_buffer: { - struct btree_write_buffer *wb = &c->btree_write_buffer; - - ret = 0; - - if (wb->state.nr > wb->size * 3 / 4) { - bch2_trans_unlock(trans); - mutex_lock(&wb->flush_lock); - - if (wb->state.nr > wb->size * 3 / 4) { - bch2_trans_begin(trans); - ret = __bch2_btree_write_buffer_flush(trans, - flags|BTREE_INSERT_NOCHECK_RW, true); - if (!ret) { - trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); - } - } else { - mutex_unlock(&wb->flush_lock); - ret = bch2_trans_relock(trans); - } - } - break; - } default: BUG_ON(ret >= 0); break; @@ -982,8 +935,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) && - !(flags & BTREE_INSERT_NOWAIT) && - (flags & BTREE_INSERT_NOFAIL), c, + (flags & BCH_TRANS_COMMIT_no_enospc), c, "%s: incorrectly got %s\n", __func__, bch2_err_str(ret)); return ret; @@ -995,8 +947,8 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags) struct bch_fs *c = trans->c; int ret; - if (likely(!(flags & BTREE_INSERT_LAZY_RW)) || - test_bit(BCH_FS_STARTED, &c->flags)) + if (likely(!(flags & BCH_TRANS_COMMIT_lazy_rw)) || + test_bit(BCH_FS_started, &c->flags)) return -BCH_ERR_erofs_trans_commit; ret = drop_locks_do(trans, bch2_fs_read_write_early(c)); @@ -1016,7 +968,6 @@ static noinline int do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) { struct bch_fs *c = trans->c; - struct btree_insert_entry *i; int ret = 0; trans_for_each_update(trans, i) { @@ -1030,18 +981,15 @@ do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) { + struct btree_insert_entry *errored_at = NULL; struct bch_fs *c = trans->c; - struct btree_insert_entry *i = NULL; - struct btree_write_buffered_key *wb; int ret = 0; if (!trans->nr_updates && - !trans->nr_wb_updates && - !trans->extra_journal_entries.nr) + !trans->journal_entries_u64s) goto out_reset; - if (flags & BTREE_INSERT_GC_LOCK_HELD) - lockdep_assert_held(&c->gc_lock); + memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta)); ret = bch2_trans_commit_run_triggers(trans); if (ret) @@ -1051,7 +999,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) struct printbuf buf = PRINTBUF; enum bkey_invalid_flags invalid_flags = 0; - if (!(flags & BTREE_INSERT_JOURNAL_REPLAY)) + if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT; if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), @@ -1064,47 +1012,52 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) return ret; } - if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { + for (struct jset_entry *i = trans->journal_entries; + i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); + i = vstruct_next(i)) { + enum bkey_invalid_flags invalid_flags = 0; + + if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) + invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT; + + if (unlikely(bch2_journal_entry_validate(c, NULL, i, + bcachefs_metadata_version_current, + CPU_BIG_ENDIAN, invalid_flags))) + ret = bch2_trans_commit_journal_entry_invalid(trans, i); + + if (ret) + return ret; + } + + if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) { ret = do_bch2_trans_commit_to_journal_replay(trans); goto out_reset; } - if (!(flags & BTREE_INSERT_NOCHECK_RW) && + if (!(flags & BCH_TRANS_COMMIT_no_check_rw) && unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) { ret = bch2_trans_commit_get_rw_cold(trans, flags); if (ret) goto out_reset; } - if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 && - mutex_trylock(&c->btree_write_buffer.flush_lock)) { - bch2_trans_begin(trans); - bch2_trans_unlock(trans); - - ret = __bch2_btree_write_buffer_flush(trans, - flags|BTREE_INSERT_NOCHECK_RW, true); - if (!ret) { - trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); - } - goto out; - } - - EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); + EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); - trans->journal_u64s = trans->extra_journal_entries.nr; + trans->journal_u64s = trans->journal_entries_u64s; trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); if (trans->journal_transaction_names) trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); trans_for_each_update(trans, i) { - EBUG_ON(!i->path->should_be_locked); + struct btree_path *path = trans->paths + i->path; + + EBUG_ON(!path->should_be_locked); - ret = bch2_btree_path_upgrade(trans, i->path, i->level + 1); + ret = bch2_btree_path_upgrade(trans, path, i->level + 1); if (unlikely(ret)) goto out; - EBUG_ON(!btree_node_intent_locked(i->path, i->level)); + EBUG_ON(!btree_node_intent_locked(path, i->level)); if (i->key_cache_already_flushed) continue; @@ -1120,22 +1073,21 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) trans->journal_u64s += jset_u64s(i->old_k.u64s); } - trans_for_each_wb_update(trans, wb) - trans->journal_u64s += jset_u64s(wb->k.k.u64s); - - if (trans->extra_journal_res) { + if (trans->extra_disk_res) { ret = bch2_disk_reservation_add(c, trans->disk_res, - trans->extra_journal_res, - (flags & BTREE_INSERT_NOFAIL) + trans->extra_disk_res, + (flags & BCH_TRANS_COMMIT_no_enospc) ? BCH_DISK_RESERVATION_NOFAIL : 0); if (ret) goto err; } retry: + errored_at = NULL; bch2_trans_verify_not_in_restart(trans); - memset(&trans->journal_res, 0, sizeof(trans->journal_res)); + if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) + memset(&trans->journal_res, 0, sizeof(trans->journal_res)); - ret = do_bch2_trans_commit(trans, flags, &i, _RET_IP_); + ret = do_bch2_trans_commit(trans, flags, &errored_at, _RET_IP_); /* make sure we didn't drop or screw up locks: */ bch2_trans_verify_locks(trans); @@ -1145,7 +1097,7 @@ retry: trace_and_count(c, transaction_commit, trans, _RET_IP_); out: - if (likely(!(flags & BTREE_INSERT_NOCHECK_RW))) + if (likely(!(flags & BCH_TRANS_COMMIT_no_check_rw))) bch2_write_ref_put(c, BCH_WRITE_REF_trans); out_reset: if (!ret) @@ -1154,9 +1106,21 @@ out_reset: return ret; err: - ret = bch2_trans_commit_error(trans, flags, i, ret, _RET_IP_); + ret = bch2_trans_commit_error(trans, flags, errored_at, ret, _RET_IP_); if (ret) goto out; + /* + * We might have done another transaction commit in the error path - + * i.e. btree write buffer flush - which will have made use of + * trans->journal_res, but with BCH_TRANS_COMMIT_no_journal_res that is + * how the journal sequence number to pin is passed in - so we must + * restart: + */ + if (flags & BCH_TRANS_COMMIT_no_journal_res) { + ret = -BCH_ERR_transaction_restart_nested; + goto out; + } + goto retry; } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 60453ba86c..4a5a64499e 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -185,33 +185,32 @@ struct btree_node_iter { * Iterate over all possible positions, synthesizing deleted keys for holes: */ static const __maybe_unused u16 BTREE_ITER_SLOTS = 1 << 0; -static const __maybe_unused u16 BTREE_ITER_ALL_LEVELS = 1 << 1; /* * Indicates that intent locks should be taken on leaf nodes, because we expect * to be doing updates: */ -static const __maybe_unused u16 BTREE_ITER_INTENT = 1 << 2; +static const __maybe_unused u16 BTREE_ITER_INTENT = 1 << 1; /* * Causes the btree iterator code to prefetch additional btree nodes from disk: */ -static const __maybe_unused u16 BTREE_ITER_PREFETCH = 1 << 3; +static const __maybe_unused u16 BTREE_ITER_PREFETCH = 1 << 2; /* * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for * @pos or the first key strictly greater than @pos */ -static const __maybe_unused u16 BTREE_ITER_IS_EXTENTS = 1 << 4; -static const __maybe_unused u16 BTREE_ITER_NOT_EXTENTS = 1 << 5; -static const __maybe_unused u16 BTREE_ITER_CACHED = 1 << 6; -static const __maybe_unused u16 BTREE_ITER_WITH_KEY_CACHE = 1 << 7; -static const __maybe_unused u16 BTREE_ITER_WITH_UPDATES = 1 << 8; -static const __maybe_unused u16 BTREE_ITER_WITH_JOURNAL = 1 << 9; -static const __maybe_unused u16 __BTREE_ITER_ALL_SNAPSHOTS = 1 << 10; -static const __maybe_unused u16 BTREE_ITER_ALL_SNAPSHOTS = 1 << 11; -static const __maybe_unused u16 BTREE_ITER_FILTER_SNAPSHOTS = 1 << 12; -static const __maybe_unused u16 BTREE_ITER_NOPRESERVE = 1 << 13; -static const __maybe_unused u16 BTREE_ITER_CACHED_NOFILL = 1 << 14; -static const __maybe_unused u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 15; -#define __BTREE_ITER_FLAGS_END 16 +static const __maybe_unused u16 BTREE_ITER_IS_EXTENTS = 1 << 3; +static const __maybe_unused u16 BTREE_ITER_NOT_EXTENTS = 1 << 4; +static const __maybe_unused u16 BTREE_ITER_CACHED = 1 << 5; +static const __maybe_unused u16 BTREE_ITER_WITH_KEY_CACHE = 1 << 6; +static const __maybe_unused u16 BTREE_ITER_WITH_UPDATES = 1 << 7; +static const __maybe_unused u16 BTREE_ITER_WITH_JOURNAL = 1 << 8; +static const __maybe_unused u16 __BTREE_ITER_ALL_SNAPSHOTS = 1 << 9; +static const __maybe_unused u16 BTREE_ITER_ALL_SNAPSHOTS = 1 << 10; +static const __maybe_unused u16 BTREE_ITER_FILTER_SNAPSHOTS = 1 << 11; +static const __maybe_unused u16 BTREE_ITER_NOPRESERVE = 1 << 12; +static const __maybe_unused u16 BTREE_ITER_CACHED_NOFILL = 1 << 13; +static const __maybe_unused u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 14; +#define __BTREE_ITER_FLAGS_END 15 enum btree_path_uptodate { BTREE_ITER_UPTODATE = 0, @@ -223,13 +222,12 @@ enum btree_path_uptodate { #define TRACK_PATH_ALLOCATED #endif +typedef u16 btree_path_idx_t; + struct btree_path { - u8 idx; - u8 sorted_idx; + btree_path_idx_t sorted_idx; u8 ref; u8 intent_ref; - u32 alloc_seq; - u32 downgrade_seq; /* btree_iter_copy starts here: */ struct bpos pos; @@ -283,13 +281,12 @@ static inline unsigned long btree_path_ip_allocated(struct btree_path *path) */ struct btree_iter { struct btree_trans *trans; - struct btree_path *path; - struct btree_path *update_path; - struct btree_path *key_cache_path; + btree_path_idx_t path; + btree_path_idx_t update_path; + btree_path_idx_t key_cache_path; enum btree_id btree_id:8; - unsigned min_depth:3; - unsigned advanced:1; + u8 min_depth; /* btree_iter_copy starts here: */ u16 flags; @@ -306,7 +303,6 @@ struct btree_iter { /* BTREE_ITER_WITH_JOURNAL: */ size_t journal_idx; - struct bpos journal_pos; #ifdef TRACK_PATH_ALLOCATED unsigned long ip_allocated; #endif @@ -354,16 +350,16 @@ struct btree_insert_entry { * to the size of the key being overwritten in the btree: */ u8 old_btree_u64s; + btree_path_idx_t path; struct bkey_i *k; - struct btree_path *path; - u64 seq; /* key being overwritten: */ struct bkey old_k; const struct bch_val *old_v; unsigned long ip_allocated; }; -#define BTREE_ITER_MAX 64 +#define BTREE_ITER_INITIAL 64 +#define BTREE_ITER_MAX (1U << 10) struct btree_trans_commit_hook; typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *); @@ -377,25 +373,30 @@ struct btree_trans_commit_hook { #define BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS 10000 +struct btree_trans_paths { + unsigned long nr_paths; + struct btree_path paths[]; +}; + struct btree_trans { struct bch_fs *c; - const char *fn; - struct closure ref; - struct list_head list; - u64 last_begin_time; - u8 lock_may_not_fail; - u8 lock_must_abort; - struct btree_bkey_cached_common *locking; - struct six_lock_waiter locking_wait; + unsigned long *paths_allocated; + struct btree_path *paths; + btree_path_idx_t *sorted; + struct btree_insert_entry *updates; - int srcu_idx; + void *mem; + unsigned mem_top; + unsigned mem_bytes; + btree_path_idx_t nr_sorted; + btree_path_idx_t nr_paths; + btree_path_idx_t nr_paths_max; u8 fn_idx; - u8 nr_sorted; u8 nr_updates; - u8 nr_wb_updates; - u8 wb_updates_size; + u8 lock_must_abort; + bool lock_may_not_fail:1; bool srcu_held:1; bool used_mempool:1; bool in_traverse_all:1; @@ -407,41 +408,59 @@ struct btree_trans { bool write_locked:1; enum bch_errcode restarted:16; u32 restart_count; + + u64 last_begin_time; unsigned long last_begin_ip; unsigned long last_restarted_ip; unsigned long srcu_lock_time; - /* - * For when bch2_trans_update notices we'll be splitting a compressed - * extent: - */ - unsigned extra_journal_res; - unsigned nr_max_paths; - - u64 paths_allocated; - - unsigned mem_top; - unsigned mem_max; - unsigned mem_bytes; - void *mem; - - u8 sorted[BTREE_ITER_MAX + 8]; - struct btree_path paths[BTREE_ITER_MAX]; - struct btree_insert_entry updates[BTREE_ITER_MAX]; - struct btree_write_buffered_key *wb_updates; + const char *fn; + struct btree_bkey_cached_common *locking; + struct six_lock_waiter locking_wait; + int srcu_idx; /* update path: */ + u16 journal_entries_u64s; + u16 journal_entries_size; + struct jset_entry *journal_entries; + struct btree_trans_commit_hook *hooks; - darray_u64 extra_journal_entries; struct journal_entry_pin *journal_pin; struct journal_res journal_res; u64 *journal_seq; struct disk_reservation *disk_res; + + struct bch_fs_usage_base fs_usage_delta; + unsigned journal_u64s; + unsigned extra_disk_res; /* XXX kill */ struct replicas_delta_list *fs_usage_deltas; + + /* Entries before this are zeroed out on every bch2_trans_get() call */ + + struct list_head list; + struct closure ref; + + unsigned long _paths_allocated[BITS_TO_LONGS(BTREE_ITER_INITIAL)]; + struct btree_trans_paths trans_paths; + struct btree_path _paths[BTREE_ITER_INITIAL]; + btree_path_idx_t _sorted[BTREE_ITER_INITIAL + 4]; + struct btree_insert_entry _updates[BTREE_ITER_INITIAL]; }; +static inline struct btree_path *btree_iter_path(struct btree_trans *trans, struct btree_iter *iter) +{ + return trans->paths + iter->path; +} + +static inline struct btree_path *btree_iter_key_cache_path(struct btree_trans *trans, struct btree_iter *iter) +{ + return iter->key_cache_path + ? trans->paths + iter->key_cache_path + : NULL; +} + #define BCH_BTREE_WRITE_TYPES() \ x(initial, 0) \ x(init_next_bset, 1) \ @@ -637,7 +656,7 @@ const char *bch2_btree_node_type_str(enum btree_node_type); BIT_ULL(BKEY_TYPE_reflink)| \ BIT_ULL(BKEY_TYPE_btree)) -#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \ +#define BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS \ (BIT_ULL(BKEY_TYPE_alloc)| \ BIT_ULL(BKEY_TYPE_inodes)| \ BIT_ULL(BKEY_TYPE_stripes)| \ @@ -645,7 +664,7 @@ const char *bch2_btree_node_type_str(enum btree_node_type); #define BTREE_NODE_TYPE_HAS_TRIGGERS \ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ - BTREE_NODE_TYPE_HAS_MEM_TRIGGERS) + BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS) static inline bool btree_node_type_needs_gc(enum btree_node_type type) { @@ -722,4 +741,9 @@ enum btree_node_sibling { btree_next_sib, }; +struct get_locks_fail { + unsigned l; + struct btree *b; +}; + #endif /* _BCACHEFS_BTREE_TYPES_H */ diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 2fd3c8cc6f..c3ff365acc 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -24,7 +24,7 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, } static int __must_check -bch2_trans_update_by_path(struct btree_trans *, struct btree_path *, +bch2_trans_update_by_path(struct btree_trans *, btree_path_idx_t, struct bkey_i *, enum btree_update_flags, unsigned long ip); @@ -200,7 +200,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans, */ if (nr_splits > 1 && (compressed_sectors = bch2_bkey_sectors_compressed(old))) - trans->extra_journal_res += compressed_sectors * (nr_splits - 1); + trans->extra_disk_res += compressed_sectors * (nr_splits - 1); if (front_split) { update = bch2_bkey_make_mut_noupdate(trans, old); @@ -339,21 +339,22 @@ err: } static noinline int flush_new_cached_update(struct btree_trans *trans, - struct btree_path *path, struct btree_insert_entry *i, enum btree_update_flags flags, unsigned long ip) { - struct btree_path *btree_path; struct bkey k; int ret; - btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0, - BTREE_ITER_INTENT, _THIS_IP_); - ret = bch2_btree_path_traverse(trans, btree_path, 0); + btree_path_idx_t path_idx = + bch2_path_get(trans, i->btree_id, i->old_k.p, 1, 0, + BTREE_ITER_INTENT, _THIS_IP_); + ret = bch2_btree_path_traverse(trans, path_idx, 0); if (ret) goto out; + struct btree_path *btree_path = trans->paths + path_idx; + /* * The old key in the insert entry might actually refer to an existing * key in the btree that has been deleted from cache and not yet @@ -368,43 +369,34 @@ static noinline int flush_new_cached_update(struct btree_trans *trans, i->flags |= BTREE_TRIGGER_NORUN; btree_path_set_should_be_locked(btree_path); - ret = bch2_trans_update_by_path(trans, btree_path, i->k, flags, ip); + ret = bch2_trans_update_by_path(trans, path_idx, i->k, flags, ip); out: - bch2_path_put(trans, btree_path, true); + bch2_path_put(trans, path_idx, true); return ret; } static int __must_check -bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, +bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx, struct bkey_i *k, enum btree_update_flags flags, unsigned long ip) { struct bch_fs *c = trans->c; struct btree_insert_entry *i, n; - u64 seq = 0; int cmp; + struct btree_path *path = trans->paths + path_idx; EBUG_ON(!path->should_be_locked); - EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX); + EBUG_ON(trans->nr_updates >= trans->nr_paths); EBUG_ON(!bpos_eq(k->k.p, path->pos)); - /* - * The transaction journal res hasn't been allocated at this point. - * That occurs at commit time. Reuse the seq field to pass in the seq - * of a prejournaled key. - */ - if (flags & BTREE_UPDATE_PREJOURNAL) - seq = trans->journal_res.seq; - n = (struct btree_insert_entry) { .flags = flags, .bkey_type = __btree_node_type(path->level, path->btree_id), .btree_id = path->btree_id, .level = path->level, .cached = path->cached, - .path = path, + .path = path_idx, .k = k, - .seq = seq, .ip_allocated = ip, }; @@ -418,7 +410,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, * Pending updates are kept sorted: first, find position of new update, * then delete/trim any updates the new update overwrites: */ - trans_for_each_update(trans, i) { + for (i = trans->updates; i < trans->updates + trans->nr_updates; i++) { cmp = btree_insert_entry_cmp(&n, i); if (cmp <= 0) break; @@ -432,7 +424,6 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, i->cached = n.cached; i->k = n.k; i->path = n.path; - i->seq = n.seq; i->ip_allocated = n.ip_allocated; } else { array_insert_item(trans->updates, trans->nr_updates, @@ -452,7 +443,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, } } - __btree_path_get(i->path, true); + __btree_path_get(trans->paths + i->path, true); /* * If a key is present in the key cache, it must also exist in the @@ -462,7 +453,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, * work: */ if (path->cached && bkey_deleted(&i->old_k)) - return flush_new_cached_update(trans, path, i, flags, ip); + return flush_new_cached_update(trans, i, flags, ip); return 0; } @@ -471,9 +462,11 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, struct btree_iter *iter, struct btree_path *path) { - if (!iter->key_cache_path || - !iter->key_cache_path->should_be_locked || - !bpos_eq(iter->key_cache_path->pos, iter->pos)) { + struct btree_path *key_cache_path = btree_iter_key_cache_path(trans, iter); + + if (!key_cache_path || + !key_cache_path->should_be_locked || + !bpos_eq(key_cache_path->pos, iter->pos)) { struct bkey_cached *ck; int ret; @@ -488,19 +481,18 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, iter->flags & BTREE_ITER_INTENT, _THIS_IP_); - ret = bch2_btree_path_traverse(trans, iter->key_cache_path, - BTREE_ITER_CACHED); + ret = bch2_btree_path_traverse(trans, iter->key_cache_path, BTREE_ITER_CACHED); if (unlikely(ret)) return ret; - ck = (void *) iter->key_cache_path->l[0].b; + ck = (void *) trans->paths[iter->key_cache_path].l[0].b; if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_); return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); } - btree_path_set_should_be_locked(iter->key_cache_path); + btree_path_set_should_be_locked(trans->paths + iter->key_cache_path); } return 0; @@ -509,7 +501,7 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *k, enum btree_update_flags flags) { - struct btree_path *path = iter->update_path ?: iter->path; + btree_path_idx_t path_idx = iter->update_path ?: iter->path; int ret; if (iter->flags & BTREE_ITER_IS_EXTENTS) @@ -529,6 +521,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter /* * Ensure that updates to cached btrees go to the key cache: */ + struct btree_path *path = trans->paths + path_idx; if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && !path->cached && !path->level && @@ -537,27 +530,15 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter if (ret) return ret; - path = iter->key_cache_path; + path_idx = iter->key_cache_path; } - return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_); + return bch2_trans_update_by_path(trans, path_idx, k, flags, _RET_IP_); } -/* - * Add a transaction update for a key that has already been journaled. - */ -int __must_check bch2_trans_update_seq(struct btree_trans *trans, u64 seq, - struct btree_iter *iter, struct bkey_i *k, - enum btree_update_flags flags) -{ - trans->journal_res.seq = seq; - return bch2_trans_update(trans, iter, k, flags|BTREE_UPDATE_NOJOURNAL| - BTREE_UPDATE_PREJOURNAL); -} - -static noinline int bch2_btree_insert_clone_trans(struct btree_trans *trans, - enum btree_id btree, - struct bkey_i *k) +int bch2_btree_insert_clone_trans(struct btree_trans *trans, + enum btree_id btree, + struct bkey_i *k) { struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(&k->k)); int ret = PTR_ERR_OR_ZERO(n); @@ -568,60 +549,30 @@ static noinline int bch2_btree_insert_clone_trans(struct btree_trans *trans, return bch2_btree_insert_trans(trans, btree, n, 0); } -int __must_check bch2_trans_update_buffered(struct btree_trans *trans, - enum btree_id btree, - struct bkey_i *k) +struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s) { - struct btree_write_buffered_key *i; - int ret; - - EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size); - EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); - - if (unlikely(trans->journal_replay_not_finished)) - return bch2_btree_insert_clone_trans(trans, btree, k); - - trans_for_each_wb_update(trans, i) { - if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) { - bkey_copy(&i->k, k); - return 0; - } - } + unsigned new_top = trans->journal_entries_u64s + u64s; + unsigned old_size = trans->journal_entries_size; - if (!trans->wb_updates || - trans->nr_wb_updates == trans->wb_updates_size) { - struct btree_write_buffered_key *u; + if (new_top > trans->journal_entries_size) { + trans->journal_entries_size = roundup_pow_of_two(new_top); - if (trans->nr_wb_updates == trans->wb_updates_size) { - struct btree_transaction_stats *s = btree_trans_stats(trans); - - BUG_ON(trans->wb_updates_size > U8_MAX / 2); - trans->wb_updates_size = max(1, trans->wb_updates_size * 2); - if (s) - s->wb_updates_size = trans->wb_updates_size; - } - - u = bch2_trans_kmalloc_nomemzero(trans, - trans->wb_updates_size * - sizeof(struct btree_write_buffered_key)); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - return ret; - - if (trans->nr_wb_updates) - memcpy(u, trans->wb_updates, trans->nr_wb_updates * - sizeof(struct btree_write_buffered_key)); - trans->wb_updates = u; + btree_trans_stats(trans)->journal_entries_size = trans->journal_entries_size; } - trans->wb_updates[trans->nr_wb_updates] = (struct btree_write_buffered_key) { - .btree = btree, - }; + struct jset_entry *n = + bch2_trans_kmalloc_nomemzero(trans, + trans->journal_entries_size * sizeof(u64)); + if (IS_ERR(n)) + return ERR_CAST(n); - bkey_copy(&trans->wb_updates[trans->nr_wb_updates].k, k); - trans->nr_wb_updates++; + if (trans->journal_entries) + memcpy(n, trans->journal_entries, old_size * sizeof(u64)); + trans->journal_entries = n; - return 0; + struct jset_entry *e = btree_trans_journal_entries_top(trans); + trans->journal_entries_u64s = new_top; + return e; } int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, @@ -733,20 +684,6 @@ int bch2_btree_delete_at(struct btree_trans *trans, return bch2_btree_delete_extent_at(trans, iter, 0, update_flags); } -int bch2_btree_delete_at_buffered(struct btree_trans *trans, - enum btree_id btree, struct bpos pos) -{ - struct bkey_i *k; - - k = bch2_trans_kmalloc(trans, sizeof(*k)); - if (IS_ERR(k)) - return PTR_ERR(k); - - bkey_init(&k->k); - k->k.p = pos; - return bch2_trans_update_buffered(trans, btree, k); -} - int bch2_btree_delete(struct btree_trans *trans, enum btree_id btree, struct bpos pos, unsigned update_flags) @@ -809,7 +746,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?: bch2_trans_commit(trans, &disk_res, journal_seq, - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_enospc); bch2_disk_reservation_put(trans->c, &disk_res); err: /* @@ -851,56 +788,26 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, struct bpos pos, bool set) { - struct bkey_i *k; - int ret = 0; + struct bkey_i k; - k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k)); - ret = PTR_ERR_OR_ZERO(k); - if (unlikely(ret)) - return ret; + bkey_init(&k.k); + k.k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted; + k.k.p = pos; - bkey_init(&k->k); - k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted; - k->k.p = pos; - - return bch2_trans_update_buffered(trans, btree, k); + return bch2_trans_update_buffered(trans, btree, &k); } -__printf(2, 0) -static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list args) +static int __bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf, unsigned u64s) { - struct printbuf buf = PRINTBUF; - struct jset_entry_log *l; - unsigned u64s; - int ret; - - prt_vprintf(&buf, fmt, args); - ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; - if (ret) - goto err; - - u64s = DIV_ROUND_UP(buf.pos, sizeof(u64)); - - ret = darray_make_room(entries, jset_u64s(u64s)); + struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s)); + int ret = PTR_ERR_OR_ZERO(e); if (ret) - goto err; + return ret; - l = (void *) &darray_top(*entries); - l->entry.u64s = cpu_to_le16(u64s); - l->entry.btree_id = 0; - l->entry.level = 1; - l->entry.type = BCH_JSET_ENTRY_log; - l->entry.pad[0] = 0; - l->entry.pad[1] = 0; - l->entry.pad[2] = 0; - memcpy(l->d, buf.buf, buf.pos); - while (buf.pos & 7) - l->d[buf.pos++] = '\0'; - - entries->nr += jset_u64s(u64s); -err: - printbuf_exit(&buf); - return ret; + struct jset_entry_log *l = container_of(e, struct jset_entry_log, entry); + journal_entry_init(e, BCH_JSET_ENTRY_log, 0, 1, u64s); + memcpy(l->d, buf->buf, buf->pos); + return 0; } __printf(3, 0) @@ -908,16 +815,32 @@ static int __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, va_list args) { - int ret; + struct printbuf buf = PRINTBUF; + prt_vprintf(&buf, fmt, args); + + unsigned u64s = DIV_ROUND_UP(buf.pos, sizeof(u64)); + prt_chars(&buf, '\0', u64s * sizeof(u64) - buf.pos); + + int ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; + if (ret) + goto err; if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) { - ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args); + ret = darray_make_room(&c->journal.early_journal_entries, jset_u64s(u64s)); + if (ret) + goto err; + + struct jset_entry_log *l = (void *) &darray_top(c->journal.early_journal_entries); + journal_entry_init(&l->entry, BCH_JSET_ENTRY_log, 0, 1, u64s); + memcpy(l->d, buf.buf, buf.pos); + c->journal.early_journal_entries.nr += jset_u64s(u64s); } else { ret = bch2_trans_do(c, NULL, NULL, - BTREE_INSERT_LAZY_RW|commit_flags, - __bch2_trans_log_msg(&trans->extra_journal_entries, fmt, args)); + BCH_TRANS_COMMIT_lazy_rw|commit_flags, + __bch2_trans_log_msg(trans, &buf, u64s)); } - +err: + printbuf_exit(&buf); return ret; } diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 9816d22865..b9382b7b28 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -21,42 +21,32 @@ void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *, struct bkey_i *, u64); -enum btree_insert_flags { +#define BCH_TRANS_COMMIT_FLAGS() \ + x(no_enospc, "don't check for enospc") \ + x(no_check_rw, "don't attempt to take a ref on c->writes") \ + x(lazy_rw, "go read-write if we haven't yet - only for use in recovery") \ + x(no_journal_res, "don't take a journal reservation, instead " \ + "pin journal entry referred to by trans->journal_res.seq") \ + x(journal_reclaim, "operation required for journal reclaim; may return error" \ + "instead of deadlocking if BCH_WATERMARK_reclaim not specified")\ + +enum __bch_trans_commit_flags { /* First bits for bch_watermark: */ - __BTREE_INSERT_NOFAIL = BCH_WATERMARK_BITS, - __BTREE_INSERT_NOCHECK_RW, - __BTREE_INSERT_LAZY_RW, - __BTREE_INSERT_JOURNAL_REPLAY, - __BTREE_INSERT_JOURNAL_RECLAIM, - __BTREE_INSERT_NOWAIT, - __BTREE_INSERT_GC_LOCK_HELD, - __BCH_HASH_SET_MUST_CREATE, - __BCH_HASH_SET_MUST_REPLACE, + __BCH_TRANS_COMMIT_FLAGS_START = BCH_WATERMARK_BITS, +#define x(n, ...) __BCH_TRANS_COMMIT_##n, + BCH_TRANS_COMMIT_FLAGS() +#undef x }; -/* Don't check for -ENOSPC: */ -#define BTREE_INSERT_NOFAIL BIT(__BTREE_INSERT_NOFAIL) - -#define BTREE_INSERT_NOCHECK_RW BIT(__BTREE_INSERT_NOCHECK_RW) -#define BTREE_INSERT_LAZY_RW BIT(__BTREE_INSERT_LAZY_RW) - -/* Insert is for journal replay - don't get journal reservations: */ -#define BTREE_INSERT_JOURNAL_REPLAY BIT(__BTREE_INSERT_JOURNAL_REPLAY) - -/* Insert is being called from journal reclaim path: */ -#define BTREE_INSERT_JOURNAL_RECLAIM BIT(__BTREE_INSERT_JOURNAL_RECLAIM) - -/* Don't block on allocation failure (for new btree nodes: */ -#define BTREE_INSERT_NOWAIT BIT(__BTREE_INSERT_NOWAIT) -#define BTREE_INSERT_GC_LOCK_HELD BIT(__BTREE_INSERT_GC_LOCK_HELD) - -#define BCH_HASH_SET_MUST_CREATE BIT(__BCH_HASH_SET_MUST_CREATE) -#define BCH_HASH_SET_MUST_REPLACE BIT(__BCH_HASH_SET_MUST_REPLACE) +enum bch_trans_commit_flags { +#define x(n, ...) BCH_TRANS_COMMIT_##n = BIT(__BCH_TRANS_COMMIT_##n), + BCH_TRANS_COMMIT_FLAGS() +#undef x +}; int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *, unsigned, unsigned); int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); -int bch2_btree_delete_at_buffered(struct btree_trans *, enum btree_id, struct bpos); int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned); int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id, @@ -74,6 +64,12 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id, int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool); +static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans, + enum btree_id btree, struct bpos pos) +{ + return bch2_btree_bit_mod(trans, btree, pos, false); +} + int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id, struct bpos, struct bpos); @@ -105,10 +101,44 @@ int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *, int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, struct bkey_i *, enum btree_update_flags); -int __must_check bch2_trans_update_seq(struct btree_trans *, u64, struct btree_iter *, - struct bkey_i *, enum btree_update_flags); -int __must_check bch2_trans_update_buffered(struct btree_trans *, - enum btree_id, struct bkey_i *); + +struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *, unsigned); + +static inline struct jset_entry *btree_trans_journal_entries_top(struct btree_trans *trans) +{ + return (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); +} + +static inline struct jset_entry * +bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s) +{ + if (!trans->journal_entries || + trans->journal_entries_u64s + u64s > trans->journal_entries_size) + return __bch2_trans_jset_entry_alloc(trans, u64s); + + struct jset_entry *e = btree_trans_journal_entries_top(trans); + trans->journal_entries_u64s += u64s; + return e; +} + +int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *); + +static inline int __must_check bch2_trans_update_buffered(struct btree_trans *trans, + enum btree_id btree, + struct bkey_i *k) +{ + if (unlikely(trans->journal_replay_not_finished)) + return bch2_btree_insert_clone_trans(trans, btree, k); + + struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s)); + int ret = PTR_ERR_OR_ZERO(e); + if (ret) + return ret; + + journal_entry_init(e, BCH_JSET_ENTRY_write_buffer_keys, btree, 0, k->k.u64s); + bkey_copy(e->start, k); + return 0; +} void bch2_trans_commit_hook(struct btree_trans *, struct btree_trans_commit_hook *); @@ -157,28 +187,19 @@ static inline int bch2_trans_commit(struct btree_trans *trans, bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do)) #define trans_for_each_update(_trans, _i) \ - for ((_i) = (_trans)->updates; \ + for (struct btree_insert_entry *_i = (_trans)->updates; \ (_i) < (_trans)->updates + (_trans)->nr_updates; \ (_i)++) -#define trans_for_each_wb_update(_trans, _i) \ - for ((_i) = (_trans)->wb_updates; \ - (_i) < (_trans)->wb_updates + (_trans)->nr_wb_updates; \ - (_i)++) - static inline void bch2_trans_reset_updates(struct btree_trans *trans) { - struct btree_insert_entry *i; - trans_for_each_update(trans, i) bch2_path_put(trans, i->path, true); - trans->extra_journal_res = 0; trans->nr_updates = 0; - trans->nr_wb_updates = 0; - trans->wb_updates = NULL; + trans->journal_entries_u64s = 0; trans->hooks = NULL; - trans->extra_journal_entries.nr = 0; + trans->extra_disk_res = 0; if (trans->fs_usage_deltas) { trans->fs_usage_deltas->used = 0; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 239fcc3c7c..4530b14ff2 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -25,24 +25,24 @@ #include <linux/random.h> static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, - struct btree_path *, struct btree *, + btree_path_idx_t, struct btree *, struct keylist *, unsigned); static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); -static struct btree_path *get_unlocked_mut_path(struct btree_trans *trans, - enum btree_id btree_id, - unsigned level, - struct bpos pos) +static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans, + enum btree_id btree_id, + unsigned level, + struct bpos pos) { - struct btree_path *path; - - path = bch2_path_get(trans, btree_id, pos, level + 1, level, + btree_path_idx_t path_idx = bch2_path_get(trans, btree_id, pos, level + 1, level, BTREE_ITER_NOPRESERVE| BTREE_ITER_INTENT, _RET_IP_); - path = bch2_btree_path_make_mut(trans, path, true, _RET_IP_); + path_idx = bch2_btree_path_make_mut(trans, path_idx, true, _RET_IP_); + + struct btree_path *path = trans->paths + path_idx; bch2_btree_path_downgrade(trans, path); __bch2_btree_path_unlock(trans, path); - return path; + return path_idx; } /* Debug code: */ @@ -159,14 +159,16 @@ static bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, { size_t u64s = btree_node_u64s_with_format(nr, &b->format, new_f); - return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c); + return __vstruct_bytes(struct btree_node, u64s) < btree_buf_bytes(b); } /* Btree node freeing/allocation: */ -static void __btree_node_free(struct bch_fs *c, struct btree *b) +static void __btree_node_free(struct btree_trans *trans, struct btree *b) { - trace_and_count(c, btree_node_free, c, b); + struct bch_fs *c = trans->c; + + trace_and_count(c, btree_node_free, trans, b); BUG_ON(btree_node_write_blocked(b)); BUG_ON(btree_node_dirty(b)); @@ -188,15 +190,15 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans, struct btree *b) { struct bch_fs *c = trans->c; - unsigned level = b->c.level; + unsigned i, level = b->c.level; bch2_btree_node_lock_write_nofail(trans, path, &b->c); bch2_btree_node_hash_remove(&c->btree_cache, b); - __btree_node_free(c, b); + __btree_node_free(trans, b); six_unlock_write(&b->c.lock); mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED); - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (path->l[level].b == b) { btree_node_unlock(trans, path, level); path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); @@ -210,7 +212,7 @@ static void bch2_btree_node_free_never_used(struct btree_update *as, struct bch_fs *c = as->c; struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL]; struct btree_path *path; - unsigned level = b->c.level; + unsigned i, level = b->c.level; BUG_ON(!list_empty(&b->write_blocked)); BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as)); @@ -233,7 +235,7 @@ static void bch2_btree_node_free_never_used(struct btree_update *as, six_unlock_intent(&b->c.lock); - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (path->l[level].b == b) { btree_node_unlock(trans, path, level); path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); @@ -278,7 +280,8 @@ retry: writepoint_ptr(&c->btree_write_point), &devs_have, res->nr_replicas, - c->opts.metadata_replicas_required, + min(res->nr_replicas, + c->opts.metadata_replicas_required), watermark, 0, cl, &wp); if (unlikely(ret)) return ERR_PTR(ret); @@ -363,7 +366,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id); BUG_ON(ret); - trace_and_count(c, btree_node_alloc, c, b); + trace_and_count(c, btree_node_alloc, trans, b); bch2_increment_clock(c, btree_sectors(c), WRITE); return b; } @@ -453,7 +456,7 @@ static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans * btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); - __btree_node_free(c, b); + __btree_node_free(trans, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); } @@ -466,7 +469,6 @@ static int bch2_btree_reserve_get(struct btree_trans *trans, unsigned flags, struct closure *cl) { - struct bch_fs *c = as->c; struct btree *b; unsigned interior; int ret = 0; @@ -476,11 +478,8 @@ static int bch2_btree_reserve_get(struct btree_trans *trans, /* * Protects reaping from the btree node cache and using the btree node * open bucket reserve: - * - * BTREE_INSERT_NOWAIT only applies to btree node allocation, not - * blocking on this lock: */ - ret = bch2_btree_cache_cannibalize_lock(c, cl); + ret = bch2_btree_cache_cannibalize_lock(trans, cl); if (ret) return ret; @@ -488,9 +487,8 @@ static int bch2_btree_reserve_get(struct btree_trans *trans, struct prealloc_nodes *p = as->prealloc_nodes + interior; while (p->nr < nr_nodes[interior]) { - b = __bch2_btree_node_alloc(trans, &as->disk_res, - flags & BTREE_INSERT_NOWAIT ? NULL : cl, - interior, flags); + b = __bch2_btree_node_alloc(trans, &as->disk_res, cl, + interior, flags); if (IS_ERR(b)) { ret = PTR_ERR(b); goto err; @@ -500,7 +498,7 @@ static int bch2_btree_reserve_get(struct btree_trans *trans, } } err: - bch2_btree_cache_cannibalize_unlock(c); + bch2_btree_cache_cannibalize_unlock(trans); return ret; } @@ -559,24 +557,20 @@ static void btree_update_add_key(struct btree_update *as, static int btree_update_nodes_written_trans(struct btree_trans *trans, struct btree_update *as) { - struct bkey_i *k; - int ret; - - ret = darray_make_room(&trans->extra_journal_entries, as->journal_u64s); + struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, as->journal_u64s); + int ret = PTR_ERR_OR_ZERO(e); if (ret) return ret; - memcpy(&darray_top(trans->extra_journal_entries), - as->journal_entries, - as->journal_u64s * sizeof(u64)); - trans->extra_journal_entries.nr += as->journal_u64s; + memcpy(e, as->journal_entries, as->journal_u64s * sizeof(u64)); trans->journal_pin = &as->journal; for_each_keylist_key(&as->old_keys, k) { unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; - ret = bch2_trans_mark_old(trans, as->btree_id, level, bkey_i_to_s_c(k), 0); + ret = bch2_key_trigger_old(trans, as->btree_id, level, bkey_i_to_s_c(k), + BTREE_TRIGGER_TRANSACTIONAL); if (ret) return ret; } @@ -584,7 +578,8 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, for_each_keylist_key(&as->new_keys, k) { unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; - ret = bch2_trans_mark_new(trans, as->btree_id, level, k, 0); + ret = bch2_key_trigger_new(trans, as->btree_id, level, bkey_i_to_s(k), + BTREE_TRIGGER_TRANSACTIONAL); if (ret) return ret; } @@ -645,9 +640,9 @@ static void btree_update_nodes_written(struct btree_update *as) */ ret = commit_do(trans, &as->disk_res, &journal_seq, BCH_WATERMARK_reclaim| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_JOURNAL_RECLAIM, + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_journal_reclaim, btree_update_nodes_written_trans(trans, as)); bch2_trans_unlock(trans); @@ -655,10 +650,11 @@ static void btree_update_nodes_written(struct btree_update *as) "%s(): error %s", __func__, bch2_err_str(ret)); err: if (as->b) { - struct btree_path *path; b = as->b; - path = get_unlocked_mut_path(trans, as->btree_id, b->c.level, b->key.k.p); + btree_path_idx_t path_idx = get_unlocked_mut_path(trans, + as->btree_id, b->c.level, b->key.k.p); + struct btree_path *path = trans->paths + path_idx; /* * @b is the node we did the final insert into: * @@ -728,7 +724,7 @@ err: btree_node_write_if_need(c, b, SIX_LOCK_intent); btree_node_unlock(trans, path, b->c.level); - bch2_path_put(trans, path, true); + bch2_path_put(trans, path_idx, true); } bch2_journal_pin_drop(&c->journal, &as->journal); @@ -815,6 +811,12 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b) mutex_unlock(&c->btree_interior_update_lock); } +static int bch2_update_reparent_journal_pin_flush(struct journal *j, + struct journal_entry_pin *_pin, u64 seq) +{ + return 0; +} + static void btree_update_reparent(struct btree_update *as, struct btree_update *child) { @@ -825,7 +827,8 @@ static void btree_update_reparent(struct btree_update *as, child->b = NULL; child->mode = BTREE_INTERIOR_UPDATING_AS; - bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL); + bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, + bch2_update_reparent_journal_pin_flush); } static void btree_update_updated_root(struct btree_update *as, struct btree *b) @@ -934,6 +937,12 @@ static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct b b->ob.v[--b->ob.nr]; } +static int bch2_btree_update_will_free_node_journal_pin_flush(struct journal *j, + struct journal_entry_pin *_pin, u64 seq) +{ + return 0; +} + /* * @b is being split/rewritten: it may have pointers to not-yet-written btree * nodes and thus outstanding btree_updates - redirect @b's @@ -985,11 +994,13 @@ static void bch2_btree_interior_update_will_free_node(struct btree_update *as, * when the new nodes are persistent and reachable on disk: */ w = btree_current_write(b); - bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); + bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, + bch2_btree_update_will_free_node_journal_pin_flush); bch2_journal_pin_drop(&c->journal, &w->journal); w = btree_prev_write(b); - bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); + bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, + bch2_btree_update_will_free_node_journal_pin_flush); bch2_journal_pin_drop(&c->journal, &w->journal); mutex_unlock(&c->btree_interior_update_lock); @@ -1039,7 +1050,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, struct bch_fs *c = trans->c; struct btree_update *as; u64 start_time = local_clock(); - int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) + int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc) ? BCH_DISK_RESERVATION_NOFAIL : 0; unsigned nr_nodes[2] = { 0, 0 }; unsigned update_level = level; @@ -1057,7 +1068,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, flags &= ~BCH_WATERMARK_MASK; flags |= watermark; - if (!(flags & BTREE_INSERT_JOURNAL_RECLAIM) && + if (!(flags & BCH_TRANS_COMMIT_journal_reclaim) && watermark < c->journal.watermark) { struct journal_res res = { 0 }; @@ -1087,16 +1098,14 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, * Always check for space for two keys, even if we won't have to * split at prior level - it might have been a merge instead: */ - if (bch2_btree_node_insert_fits(c, path->l[update_level].b, + if (bch2_btree_node_insert_fits(path->l[update_level].b, BKEY_BTREE_PTR_U64s_MAX * 2)) break; split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c); } - if (flags & BTREE_INSERT_GC_LOCK_HELD) - lockdep_assert_held(&c->gc_lock); - else if (!down_read_trylock(&c->gc_lock)) { + if (!down_read_trylock(&c->gc_lock)) { ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0)); if (ret) { up_read(&c->gc_lock); @@ -1110,7 +1119,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, as->c = c; as->start_time = start_time; as->mode = BTREE_INTERIOR_NO_UPDATE; - as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD); + as->took_gc_lock = true; as->btree_id = path->btree_id; as->update_level = update_level; INIT_LIST_HEAD(&as->list); @@ -1153,7 +1162,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, * flag */ if (bch2_err_matches(ret, ENOSPC) && - (flags & BTREE_INSERT_JOURNAL_RECLAIM) && + (flags & BCH_TRANS_COMMIT_journal_reclaim) && watermark != BCH_WATERMARK_reclaim) { ret = -BCH_ERR_journal_reclaim_would_deadlock; goto err; @@ -1183,6 +1192,9 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, return as; err: bch2_btree_update_free(as, trans); + if (!bch2_err_matches(ret, ENOSPC) && + !bch2_err_matches(ret, EROFS)) + bch_err_fn_ratelimited(c, ret); return ERR_PTR(ret); } @@ -1214,7 +1226,7 @@ static void bch2_btree_set_root(struct btree_update *as, struct bch_fs *c = as->c; struct btree *old; - trace_and_count(c, btree_node_set_root, c, b); + trace_and_count(c, btree_node_set_root, trans, b); old = btree_node_root(c, b); @@ -1390,7 +1402,7 @@ static void __btree_split_node(struct btree_update *as, unsigned u64s = nr_keys[i].nr_keys * n[i]->data->format.key_u64s + nr_keys[i].val_u64s; - if (__vstruct_bytes(struct btree_node, u64s) > btree_bytes(as->c)) + if (__vstruct_bytes(struct btree_node, u64s) > btree_buf_bytes(b)) n[i]->data->format = b->format; btree_node_set_format(n[i], n[i]->data->format); @@ -1445,10 +1457,12 @@ static void __btree_split_node(struct btree_update *as, */ static void btree_split_insert_keys(struct btree_update *as, struct btree_trans *trans, - struct btree_path *path, + btree_path_idx_t path_idx, struct btree *b, struct keylist *keys) { + struct btree_path *path = trans->paths + path_idx; + if (!bch2_keylist_empty(keys) && bpos_le(bch2_keylist_front(keys)->k.p, b->data->max_key)) { struct btree_node_iter node_iter; @@ -1462,25 +1476,25 @@ static void btree_split_insert_keys(struct btree_update *as, } static int btree_split(struct btree_update *as, struct btree_trans *trans, - struct btree_path *path, struct btree *b, + btree_path_idx_t path, struct btree *b, struct keylist *keys, unsigned flags) { struct bch_fs *c = as->c; - struct btree *parent = btree_node_parent(path, b); + struct btree *parent = btree_node_parent(trans->paths + path, b); struct btree *n1, *n2 = NULL, *n3 = NULL; - struct btree_path *path1 = NULL, *path2 = NULL; + btree_path_idx_t path1 = 0, path2 = 0; u64 start_time = local_clock(); int ret = 0; BUG_ON(!parent && (b != btree_node_root(c, b))); - BUG_ON(parent && !btree_node_intent_locked(path, b->c.level + 1)); + BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1)); bch2_btree_interior_update_will_free_node(as, b); if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) { struct btree *n[2]; - trace_and_count(c, btree_node_split, c, b); + trace_and_count(c, btree_node_split, trans, b); n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level); n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level); @@ -1501,15 +1515,15 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, six_unlock_write(&n2->c.lock); six_unlock_write(&n1->c.lock); - path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p); + path1 = get_unlocked_mut_path(trans, as->btree_id, n1->c.level, n1->key.k.p); six_lock_increment(&n1->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); - bch2_btree_path_level_init(trans, path1, n1); + mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, trans->paths + path1, n1); - path2 = get_unlocked_mut_path(trans, path->btree_id, n2->c.level, n2->key.k.p); + path2 = get_unlocked_mut_path(trans, as->btree_id, n2->c.level, n2->key.k.p); six_lock_increment(&n2->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, path2, n2->c.level, BTREE_NODE_INTENT_LOCKED); - bch2_btree_path_level_init(trans, path2, n2); + mark_btree_node_locked(trans, trans->paths + path2, n2->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, trans->paths + path2, n2); /* * Note that on recursive parent_keys == keys, so we @@ -1526,11 +1540,11 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, bch2_btree_update_add_new_node(as, n3); six_unlock_write(&n3->c.lock); - path2->locks_want++; - BUG_ON(btree_node_locked(path2, n3->c.level)); + trans->paths[path2].locks_want++; + BUG_ON(btree_node_locked(trans->paths + path2, n3->c.level)); six_lock_increment(&n3->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, path2, n3->c.level, BTREE_NODE_INTENT_LOCKED); - bch2_btree_path_level_init(trans, path2, n3); + mark_btree_node_locked(trans, trans->paths + path2, n3->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, trans->paths + path2, n3); n3->sib_u64s[0] = U16_MAX; n3->sib_u64s[1] = U16_MAX; @@ -1538,7 +1552,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, btree_split_insert_keys(as, trans, path, n3, &as->parent_keys); } } else { - trace_and_count(c, btree_node_compact, c, b); + trace_and_count(c, btree_node_compact, trans, b); n1 = bch2_btree_node_alloc_replacement(as, trans, b); @@ -1551,10 +1565,10 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, bch2_btree_update_add_new_node(as, n1); six_unlock_write(&n1->c.lock); - path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p); + path1 = get_unlocked_mut_path(trans, as->btree_id, n1->c.level, n1->key.k.p); six_lock_increment(&n1->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); - bch2_btree_path_level_init(trans, path1, n1); + mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, trans->paths + path1, n1); if (parent) bch2_keylist_add(&as->parent_keys, &n1->key); @@ -1568,10 +1582,10 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, if (ret) goto err; } else if (n3) { - bch2_btree_set_root(as, trans, path, n3); + bch2_btree_set_root(as, trans, trans->paths + path, n3); } else { /* Root filled up but didn't need to be split */ - bch2_btree_set_root(as, trans, path, n1); + bch2_btree_set_root(as, trans, trans->paths + path, n1); } if (n3) { @@ -1591,13 +1605,13 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, * node after another thread has locked and updated the new node, thus * seeing stale data: */ - bch2_btree_node_free_inmem(trans, path, b); + bch2_btree_node_free_inmem(trans, trans->paths + path, b); if (n3) - bch2_trans_node_add(trans, n3); + bch2_trans_node_add(trans, trans->paths + path, n3); if (n2) - bch2_trans_node_add(trans, n2); - bch2_trans_node_add(trans, n1); + bch2_trans_node_add(trans, trans->paths + path2, n2); + bch2_trans_node_add(trans, trans->paths + path1, n1); if (n3) six_unlock_intent(&n3->c.lock); @@ -1606,11 +1620,11 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, six_unlock_intent(&n1->c.lock); out: if (path2) { - __bch2_btree_path_unlock(trans, path2); + __bch2_btree_path_unlock(trans, trans->paths + path2); bch2_path_put(trans, path2, true); } if (path1) { - __bch2_btree_path_unlock(trans, path1); + __bch2_btree_path_unlock(trans, trans->paths + path1); bch2_path_put(trans, path1, true); } @@ -1638,13 +1652,14 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct keylist *keys) { struct btree_path *linked; + unsigned i; __bch2_btree_insert_keys_interior(as, trans, path, b, path->l[b->c.level].iter, keys); btree_update_updated_node(as, b); - trans_for_each_path_with_node(trans, b, linked) + trans_for_each_path_with_node(trans, b, linked, i) bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); bch2_trans_verify_paths(trans); @@ -1655,7 +1670,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as, * * @as: btree_update object * @trans: btree_trans object - * @path: path that points to current node + * @path_idx: path that points to current node * @b: node to insert keys into * @keys: list of keys to insert * @flags: transaction commit flags @@ -1667,10 +1682,11 @@ bch2_btree_insert_keys_interior(struct btree_update *as, * for leaf nodes -- inserts into interior nodes have to be atomic. */ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans, - struct btree_path *path, struct btree *b, + btree_path_idx_t path_idx, struct btree *b, struct keylist *keys, unsigned flags) { struct bch_fs *c = as->c; + struct btree_path *path = trans->paths + path_idx; int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; @@ -1688,7 +1704,7 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t bch2_btree_node_prep_for_write(trans, path, b); - if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) { + if (!bch2_btree_node_insert_fits(b, bch2_keylist_u64s(keys))) { bch2_btree_node_unlock_write(trans, path, b); goto split; } @@ -1723,19 +1739,22 @@ split: return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); } - return btree_split(as, trans, path, b, keys, flags); + return btree_split(as, trans, path_idx, b, keys, flags); } int bch2_btree_split_leaf(struct btree_trans *trans, - struct btree_path *path, + btree_path_idx_t path, unsigned flags) { - struct btree *b = path_l(path)->b; + /* btree_split & merge may both cause paths array to be reallocated */ + + struct btree *b = path_l(trans->paths + path)->b; struct btree_update *as; unsigned l; int ret = 0; - as = bch2_btree_update_start(trans, path, path->level, + as = bch2_btree_update_start(trans, trans->paths + path, + trans->paths[path].level, true, flags); if (IS_ERR(as)) return PTR_ERR(as); @@ -1748,20 +1767,21 @@ int bch2_btree_split_leaf(struct btree_trans *trans, bch2_btree_update_done(as, trans); - for (l = path->level + 1; btree_node_intent_locked(path, l) && !ret; l++) + for (l = trans->paths[path].level + 1; + btree_node_intent_locked(&trans->paths[path], l) && !ret; + l++) ret = bch2_foreground_maybe_merge(trans, path, l, flags); return ret; } int __bch2_foreground_maybe_merge(struct btree_trans *trans, - struct btree_path *path, + btree_path_idx_t path, unsigned level, unsigned flags, enum btree_node_sibling sib) { struct bch_fs *c = trans->c; - struct btree_path *sib_path = NULL, *new_path = NULL; struct btree_update *as; struct bkey_format_state new_s; struct bkey_format new_f; @@ -1769,13 +1789,15 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, struct btree *b, *m, *n, *prev, *next, *parent; struct bpos sib_pos; size_t sib_u64s; + enum btree_id btree = trans->paths[path].btree_id; + btree_path_idx_t sib_path = 0, new_path = 0; u64 start_time = local_clock(); int ret = 0; - BUG_ON(!path->should_be_locked); - BUG_ON(!btree_node_locked(path, level)); + BUG_ON(!trans->paths[path].should_be_locked); + BUG_ON(!btree_node_locked(&trans->paths[path], level)); - b = path->l[level].b; + b = trans->paths[path].l[level].b; if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) || (sib == btree_next_sib && bpos_eq(b->data->max_key, SPOS_MAX))) { @@ -1787,18 +1809,18 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, ? bpos_predecessor(b->data->min_key) : bpos_successor(b->data->max_key); - sib_path = bch2_path_get(trans, path->btree_id, sib_pos, + sib_path = bch2_path_get(trans, btree, sib_pos, U8_MAX, level, BTREE_ITER_INTENT, _THIS_IP_); ret = bch2_btree_path_traverse(trans, sib_path, false); if (ret) goto err; - btree_path_set_should_be_locked(sib_path); + btree_path_set_should_be_locked(trans->paths + sib_path); - m = sib_path->l[level].b; + m = trans->paths[sib_path].l[level].b; - if (btree_node_parent(path, b) != - btree_node_parent(sib_path, m)) { + if (btree_node_parent(trans->paths + path, b) != + btree_node_parent(trans->paths + sib_path, m)) { b->sib_u64s[sib] = U16_MAX; goto out; } @@ -1851,14 +1873,14 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) goto out; - parent = btree_node_parent(path, b); - as = bch2_btree_update_start(trans, path, level, false, - BTREE_INSERT_NOFAIL|flags); + parent = btree_node_parent(trans->paths + path, b); + as = bch2_btree_update_start(trans, trans->paths + path, level, false, + BCH_TRANS_COMMIT_no_enospc|flags); ret = PTR_ERR_OR_ZERO(as); if (ret) goto err; - trace_and_count(c, btree_node_merge, c, b); + trace_and_count(c, btree_node_merge, trans, b); bch2_btree_interior_update_will_free_node(as, b); bch2_btree_interior_update_will_free_node(as, m); @@ -1882,10 +1904,10 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_btree_update_add_new_node(as, n); six_unlock_write(&n->c.lock); - new_path = get_unlocked_mut_path(trans, path->btree_id, n->c.level, n->key.k.p); + new_path = get_unlocked_mut_path(trans, btree, n->c.level, n->key.k.p); six_lock_increment(&n->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); - bch2_btree_path_level_init(trans, new_path, n); + mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, trans->paths + new_path, n); bkey_init(&delete.k); delete.k.p = prev->key.k.p; @@ -1903,10 +1925,10 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_btree_update_get_open_buckets(as, n); bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); - bch2_btree_node_free_inmem(trans, path, b); - bch2_btree_node_free_inmem(trans, sib_path, m); + bch2_btree_node_free_inmem(trans, trans->paths + path, b); + bch2_btree_node_free_inmem(trans, trans->paths + sib_path, m); - bch2_trans_node_add(trans, n); + bch2_trans_node_add(trans, trans->paths + path, n); bch2_trans_verify_paths(trans); @@ -1934,16 +1956,16 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, unsigned flags) { struct bch_fs *c = trans->c; - struct btree_path *new_path = NULL; struct btree *n, *parent; struct btree_update *as; + btree_path_idx_t new_path = 0; int ret; - flags |= BTREE_INSERT_NOFAIL; + flags |= BCH_TRANS_COMMIT_no_enospc; - parent = btree_node_parent(iter->path, b); - as = bch2_btree_update_start(trans, iter->path, b->c.level, - false, flags); + struct btree_path *path = btree_iter_path(trans, iter); + parent = btree_node_parent(path, b); + as = bch2_btree_update_start(trans, path, b->c.level, false, flags); ret = PTR_ERR_OR_ZERO(as); if (ret) goto out; @@ -1958,27 +1980,27 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p); six_lock_increment(&n->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); - bch2_btree_path_level_init(trans, new_path, n); + mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, trans->paths + new_path, n); - trace_and_count(c, btree_node_rewrite, c, b); + trace_and_count(c, btree_node_rewrite, trans, b); if (parent) { bch2_keylist_add(&as->parent_keys, &n->key); - ret = bch2_btree_insert_node(as, trans, iter->path, parent, - &as->parent_keys, flags); + ret = bch2_btree_insert_node(as, trans, iter->path, + parent, &as->parent_keys, flags); if (ret) goto err; } else { - bch2_btree_set_root(as, trans, iter->path, n); + bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n); } bch2_btree_update_get_open_buckets(as, n); bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); - bch2_btree_node_free_inmem(trans, iter->path, b); + bch2_btree_node_free_inmem(trans, btree_iter_path(trans, iter), b); - bch2_trans_node_add(trans, n); + bch2_trans_node_add(trans, trans->paths + iter->path, n); six_unlock_intent(&n->c.lock); bch2_btree_update_done(as, trans); @@ -2047,8 +2069,7 @@ static void async_btree_node_rewrite_work(struct work_struct *work) ret = bch2_trans_do(c, NULL, NULL, 0, async_btree_node_rewrite_trans(trans, a)); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite); kfree(a); } @@ -2071,7 +2092,7 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) a->seq = b->data->keys.seq; INIT_WORK(&a->work, async_btree_node_rewrite_work); - if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { + if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) { mutex_lock(&c->pending_node_rewrites_lock); list_add(&a->list, &c->pending_node_rewrites); mutex_unlock(&c->pending_node_rewrites_lock); @@ -2079,15 +2100,15 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) } if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) { - if (test_bit(BCH_FS_STARTED, &c->flags)) { + if (test_bit(BCH_FS_started, &c->flags)) { bch_err(c, "%s: error getting c->writes ref", __func__); kfree(a); return; } ret = bch2_fs_read_write_early(c); + bch_err_msg(c, ret, "going read-write"); if (ret) { - bch_err_msg(c, ret, "going read-write"); kfree(a); return; } @@ -2138,13 +2159,12 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, int ret; if (!skip_triggers) { - ret = bch2_trans_mark_old(trans, b->c.btree_id, b->c.level + 1, - bkey_i_to_s_c(&b->key), 0); - if (ret) - return ret; - - ret = bch2_trans_mark_new(trans, b->c.btree_id, b->c.level + 1, - new_key, 0); + ret = bch2_key_trigger_old(trans, b->c.btree_id, b->c.level + 1, + bkey_i_to_s_c(&b->key), + BTREE_TRIGGER_TRANSACTIONAL) ?: + bch2_key_trigger_new(trans, b->c.btree_id, b->c.level + 1, + bkey_i_to_s(new_key), + BTREE_TRIGGER_TRANSACTIONAL); if (ret) return ret; } @@ -2156,7 +2176,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, BUG_ON(ret); } - parent = btree_node_parent(iter->path, b); + parent = btree_node_parent(btree_iter_path(trans, iter), b); if (parent) { bch2_trans_copy_iter(&iter2, iter); @@ -2164,10 +2184,11 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, iter2.flags & BTREE_ITER_INTENT, _THIS_IP_); - BUG_ON(iter2.path->level != b->c.level); - BUG_ON(!bpos_eq(iter2.path->pos, new_key->k.p)); + struct btree_path *path2 = btree_iter_path(trans, &iter2); + BUG_ON(path2->level != b->c.level); + BUG_ON(!bpos_eq(path2->pos, new_key->k.p)); - btree_path_set_level_up(trans, iter2.path); + btree_path_set_level_up(trans, path2); trans->paths_sorted = false; @@ -2178,23 +2199,23 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, } else { BUG_ON(btree_node_root(c, b) != b); - ret = darray_make_room(&trans->extra_journal_entries, + struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(new_key->k.u64s)); + ret = PTR_ERR_OR_ZERO(e); if (ret) return ret; - journal_entry_set((void *) &darray_top(trans->extra_journal_entries), + journal_entry_set(e, BCH_JSET_ENTRY_btree_root, b->c.btree_id, b->c.level, new_key, new_key->k.u64s); - trans->extra_journal_entries.nr += jset_u64s(new_key->k.u64s); } ret = bch2_trans_commit(trans, NULL, NULL, commit_flags); if (ret) goto err; - bch2_btree_node_lock_write_nofail(trans, iter->path, &b->c); + bch2_btree_node_lock_write_nofail(trans, btree_iter_path(trans, iter), &b->c); if (new_hash) { mutex_lock(&c->btree_cache.lock); @@ -2209,7 +2230,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, bkey_copy(&b->key, new_key); } - bch2_btree_node_unlock_write(trans, iter->path, b); + bch2_btree_node_unlock_write(trans, btree_iter_path(trans, iter), b); out: bch2_trans_iter_exit(trans, &iter2); return ret; @@ -2228,7 +2249,7 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite { struct bch_fs *c = trans->c; struct btree *new_hash = NULL; - struct btree_path *path = iter->path; + struct btree_path *path = btree_iter_path(trans, iter); struct closure cl; int ret = 0; @@ -2243,7 +2264,7 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite * btree_iter_traverse(): */ if (btree_ptr_hash_val(new_key) != b->hash_val) { - ret = bch2_btree_cache_cannibalize_lock(c, &cl); + ret = bch2_btree_cache_cannibalize_lock(trans, &cl); if (ret) { ret = drop_locks_do(trans, (closure_sync(&cl), 0)); if (ret) @@ -2267,7 +2288,7 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite six_unlock_intent(&new_hash->c.lock); } closure_sync(&cl); - bch2_btree_cache_cannibalize_unlock(c); + bch2_btree_cache_cannibalize_unlock(trans); return ret; } @@ -2286,7 +2307,7 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, goto out; /* has node been freed? */ - if (iter.path->l[b->c.level].b != b) { + if (btree_iter_path(trans, &iter)->l[b->c.level].b != b) { /* node has been freed: */ BUG_ON(!btree_node_dying(b)); goto out; @@ -2328,12 +2349,12 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) closure_init_stack(&cl); do { - ret = bch2_btree_cache_cannibalize_lock(c, &cl); + ret = bch2_btree_cache_cannibalize_lock(trans, &cl); closure_sync(&cl); } while (ret); b = bch2_btree_node_mem_alloc(trans, false); - bch2_btree_cache_cannibalize_unlock(c); + bch2_btree_cache_cannibalize_unlock(trans); set_btree_node_fake(b); set_btree_node_need_rewrite(b); diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index a6668992a2..c593c925d1 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -117,16 +117,17 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, struct btree *, struct bkey_format); -int bch2_btree_split_leaf(struct btree_trans *, struct btree_path *, unsigned); +int bch2_btree_split_leaf(struct btree_trans *, btree_path_idx_t, unsigned); -int __bch2_foreground_maybe_merge(struct btree_trans *, struct btree_path *, +int __bch2_foreground_maybe_merge(struct btree_trans *, btree_path_idx_t, unsigned, unsigned, enum btree_node_sibling); static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans, - struct btree_path *path, + btree_path_idx_t path_idx, unsigned level, unsigned flags, enum btree_node_sibling sib) { + struct btree_path *path = trans->paths + path_idx; struct btree *b; EBUG_ON(!btree_node_locked(path, level)); @@ -135,11 +136,11 @@ static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans, if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold) return 0; - return __bch2_foreground_maybe_merge(trans, path, level, flags, sib); + return __bch2_foreground_maybe_merge(trans, path_idx, level, flags, sib); } static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, - struct btree_path *path, + btree_path_idx_t path, unsigned level, unsigned flags) { @@ -183,21 +184,19 @@ static inline void btree_node_reset_sib_u64s(struct btree *b) b->sib_u64s[1] = b->nr.live_u64s; } -static inline void *btree_data_end(struct bch_fs *c, struct btree *b) +static inline void *btree_data_end(struct btree *b) { - return (void *) b->data + btree_bytes(c); + return (void *) b->data + btree_buf_bytes(b); } -static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c, - struct btree *b) +static inline struct bkey_packed *unwritten_whiteouts_start(struct btree *b) { - return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s); + return (void *) ((u64 *) btree_data_end(b) - b->whiteout_u64s); } -static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c, - struct btree *b) +static inline struct bkey_packed *unwritten_whiteouts_end(struct btree *b) { - return btree_data_end(c, b); + return btree_data_end(b); } static inline void *write_block(struct btree *b) @@ -220,13 +219,11 @@ static inline bool bkey_written(struct btree *b, struct bkey_packed *k) return __btree_addr_written(b, k); } -static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, - struct btree *b, - void *end) +static inline ssize_t __bch2_btree_u64s_remaining(struct btree *b, void *end) { ssize_t used = bset_byte_offset(b, end) / sizeof(u64) + b->whiteout_u64s; - ssize_t total = c->opts.btree_node_size >> 3; + ssize_t total = btree_buf_bytes(b) >> 3; /* Always leave one extra u64 for bch2_varint_decode: */ used++; @@ -234,10 +231,9 @@ static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, return total - used; } -static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c, - struct btree *b) +static inline size_t bch2_btree_keys_u64s_remaining(struct btree *b) { - ssize_t remaining = __bch_btree_u64s_remaining(c, b, + ssize_t remaining = __bch2_btree_u64s_remaining(b, btree_bkey_last(b, bset_tree_last(b))); BUG_ON(remaining < 0); @@ -259,14 +255,13 @@ static inline unsigned btree_write_set_buffer(struct btree *b) return 8 << BTREE_WRITE_SET_U64s_BITS; } -static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, - struct btree *b) +static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct btree *b) { struct bset_tree *t = bset_tree_last(b); struct btree_node_entry *bne = max(write_block(b), (void *) btree_bkey_last(b, bset_tree_last(b))); ssize_t remaining_space = - __bch_btree_u64s_remaining(c, b, bne->keys.start); + __bch2_btree_u64s_remaining(b, bne->keys.start); if (unlikely(bset_written(b, bset(b, t)))) { if (remaining_space > (ssize_t) (block_bytes(c) >> 3)) @@ -280,12 +275,11 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, return NULL; } -static inline void push_whiteout(struct bch_fs *c, struct btree *b, - struct bpos pos) +static inline void push_whiteout(struct btree *b, struct bpos pos) { struct bkey_packed k; - BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s); + BUG_ON(bch2_btree_keys_u64s_remaining(b) < BKEY_U64s); EBUG_ON(btree_node_just_written(b)); if (!bkey_pack_pos(&k, pos, b)) { @@ -298,20 +292,19 @@ static inline void push_whiteout(struct bch_fs *c, struct btree *b, k.needs_whiteout = true; b->whiteout_u64s += k.u64s; - bkey_p_copy(unwritten_whiteouts_start(c, b), &k); + bkey_p_copy(unwritten_whiteouts_start(b), &k); } /* * write lock must be held on @b (else the dirty bset that we were going to * insert into could be written out from under us) */ -static inline bool bch2_btree_node_insert_fits(struct bch_fs *c, - struct btree *b, unsigned u64s) +static inline bool bch2_btree_node_insert_fits(struct btree *b, unsigned u64s) { if (unlikely(btree_node_need_rewrite(b))) return false; - return u64s <= bch_btree_keys_u64s_remaining(c, b); + return u64s <= bch2_btree_keys_u64s_remaining(b); } void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *); diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 4e6241db51..ac78448619 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -7,45 +7,143 @@ #include "btree_write_buffer.h" #include "error.h" #include "journal.h" +#include "journal_io.h" #include "journal_reclaim.h" -#include <linux/sort.h> +#include <linux/prefetch.h> -static int btree_write_buffered_key_cmp(const void *_l, const void *_r) +static int bch2_btree_write_buffer_journal_flush(struct journal *, + struct journal_entry_pin *, u64); + +static int bch2_journal_keys_to_write_buffer(struct bch_fs *, struct journal_buf *); + +static inline bool __wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r) { - const struct btree_write_buffered_key *l = _l; - const struct btree_write_buffered_key *r = _r; + return (cmp_int(l->hi, r->hi) ?: + cmp_int(l->mi, r->mi) ?: + cmp_int(l->lo, r->lo)) >= 0; +} - return cmp_int(l->btree, r->btree) ?: - bpos_cmp(l->k.k.p, r->k.k.p) ?: - cmp_int(l->journal_seq, r->journal_seq) ?: - cmp_int(l->journal_offset, r->journal_offset); +static inline bool wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r) +{ +#ifdef CONFIG_X86_64 + int cmp; + + asm("mov (%[l]), %%rax;" + "sub (%[r]), %%rax;" + "mov 8(%[l]), %%rax;" + "sbb 8(%[r]), %%rax;" + "mov 16(%[l]), %%rax;" + "sbb 16(%[r]), %%rax;" + : "=@ccae" (cmp) + : [l] "r" (l), [r] "r" (r) + : "rax", "cc"); + + EBUG_ON(cmp != __wb_key_ref_cmp(l, r)); + return cmp; +#else + return __wb_key_ref_cmp(l, r); +#endif } -static int btree_write_buffered_journal_cmp(const void *_l, const void *_r) +/* Compare excluding idx, the low 24 bits: */ +static inline bool wb_key_eq(const void *_l, const void *_r) { - const struct btree_write_buffered_key *l = _l; - const struct btree_write_buffered_key *r = _r; + const struct wb_key_ref *l = _l; + const struct wb_key_ref *r = _r; - return cmp_int(l->journal_seq, r->journal_seq); + return !((l->hi ^ r->hi)| + (l->mi ^ r->mi)| + ((l->lo >> 24) ^ (r->lo >> 24))); } -static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans, - struct btree_iter *iter, - struct btree_write_buffered_key *wb, - unsigned commit_flags, - bool *write_locked, - size_t *fast) +static noinline void wb_sort(struct wb_key_ref *base, size_t num) +{ + size_t n = num, a = num / 2; + + if (!a) /* num < 2 || size == 0 */ + return; + + for (;;) { + size_t b, c, d; + + if (a) /* Building heap: sift down --a */ + --a; + else if (--n) /* Sorting: Extract root to --n */ + swap(base[0], base[n]); + else /* Sort complete */ + break; + + /* + * Sift element at "a" down into heap. This is the + * "bottom-up" variant, which significantly reduces + * calls to cmp_func(): we find the sift-down path all + * the way to the leaves (one compare per level), then + * backtrack to find where to insert the target element. + * + * Because elements tend to sift down close to the leaves, + * this uses fewer compares than doing two per level + * on the way down. (A bit more than half as many on + * average, 3/4 worst-case.) + */ + for (b = a; c = 2*b + 1, (d = c + 1) < n;) + b = wb_key_ref_cmp(base + c, base + d) ? c : d; + if (d == n) /* Special case last leaf with no sibling */ + b = c; + + /* Now backtrack from "b" to the correct location for "a" */ + while (b != a && wb_key_ref_cmp(base + a, base + b)) + b = (b - 1) / 2; + c = b; /* Where "a" belongs */ + while (b != a) { /* Shift it into place */ + b = (b - 1) / 2; + swap(base[b], base[c]); + } + } +} + +static noinline int wb_flush_one_slowpath(struct btree_trans *trans, + struct btree_iter *iter, + struct btree_write_buffered_key *wb) +{ + struct btree_path *path = btree_iter_path(trans, iter); + + bch2_btree_node_unlock_write(trans, path, path->l[0].b); + + trans->journal_res.seq = wb->journal_seq; + + return bch2_trans_update(trans, iter, &wb->k, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_trans_commit(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_journal_res| + BCH_TRANS_COMMIT_journal_reclaim); +} + +static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *iter, + struct btree_write_buffered_key *wb, + bool *write_locked, size_t *fast) { - struct bch_fs *c = trans->c; struct btree_path *path; int ret; + EBUG_ON(!wb->journal_seq); + EBUG_ON(!trans->c->btree_write_buffer.flushing.pin.seq); + EBUG_ON(trans->c->btree_write_buffer.flushing.pin.seq > wb->journal_seq); + ret = bch2_btree_iter_traverse(iter); if (ret) return ret; - path = iter->path; + /* + * We can't clone a path that has write locks: unshare it now, before + * set_pos and traverse(): + */ + if (btree_iter_path(trans, iter)->ref > 1) + iter->path = __bch2_btree_path_make_mut(trans, iter->path, true, _THIS_IP_); + + path = btree_iter_path(trans, iter); if (!*write_locked) { ret = bch2_btree_node_lock_write(trans, path, &path->l[0].b->c); @@ -56,52 +154,14 @@ static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans, *write_locked = true; } - if (!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s)) { - bch2_btree_node_unlock_write(trans, path, path->l[0].b); + if (unlikely(!bch2_btree_node_insert_fits(path->l[0].b, wb->k.k.u64s))) { *write_locked = false; - goto trans_commit; + return wb_flush_one_slowpath(trans, iter, wb); } bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq); (*fast)++; - - if (path->ref > 1) { - /* - * We can't clone a path that has write locks: if the path is - * shared, unlock before set_pos(), traverse(): - */ - bch2_btree_node_unlock_write(trans, path, path->l[0].b); - *write_locked = false; - } return 0; -trans_commit: - return bch2_trans_update_seq(trans, wb->journal_seq, iter, &wb->k, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: - bch2_trans_commit(trans, NULL, NULL, - commit_flags| - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_JOURNAL_RECLAIM); -} - -static union btree_write_buffer_state btree_write_buffer_switch(struct btree_write_buffer *wb) -{ - union btree_write_buffer_state old, new; - u64 v = READ_ONCE(wb->state.v); - - do { - old.v = new.v = v; - - new.nr = 0; - new.idx++; - } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v); - - while (old.idx == 0 ? wb->state.ref0 : wb->state.ref1) - cpu_relax(); - - smp_mb(); - - return old; } /* @@ -124,41 +184,87 @@ btree_write_buffered_insert(struct btree_trans *trans, bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k), BTREE_ITER_CACHED|BTREE_ITER_INTENT); + trans->journal_res.seq = wb->journal_seq; + ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update_seq(trans, wb->journal_seq, &iter, &wb->k, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + bch2_trans_update(trans, &iter, &wb->k, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); bch2_trans_iter_exit(trans, &iter); return ret; } -int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_flags, - bool locked) +static void move_keys_from_inc_to_flushing(struct btree_write_buffer *wb) +{ + struct bch_fs *c = container_of(wb, struct bch_fs, btree_write_buffer); + struct journal *j = &c->journal; + + if (!wb->inc.keys.nr) + return; + + bch2_journal_pin_add(j, wb->inc.keys.data[0].journal_seq, &wb->flushing.pin, + bch2_btree_write_buffer_journal_flush); + + darray_resize(&wb->flushing.keys, min_t(size_t, 1U << 20, wb->flushing.keys.nr + wb->inc.keys.nr)); + darray_resize(&wb->sorted, wb->flushing.keys.size); + + if (!wb->flushing.keys.nr && wb->sorted.size >= wb->inc.keys.nr) { + swap(wb->flushing.keys, wb->inc.keys); + goto out; + } + + size_t nr = min(darray_room(wb->flushing.keys), + wb->sorted.size - wb->flushing.keys.nr); + nr = min(nr, wb->inc.keys.nr); + + memcpy(&darray_top(wb->flushing.keys), + wb->inc.keys.data, + sizeof(wb->inc.keys.data[0]) * nr); + + memmove(wb->inc.keys.data, + wb->inc.keys.data + nr, + sizeof(wb->inc.keys.data[0]) * (wb->inc.keys.nr - nr)); + + wb->flushing.keys.nr += nr; + wb->inc.keys.nr -= nr; +out: + if (!wb->inc.keys.nr) + bch2_journal_pin_drop(j, &wb->inc.pin); + else + bch2_journal_pin_update(j, wb->inc.keys.data[0].journal_seq, &wb->inc.pin, + bch2_btree_write_buffer_journal_flush); + + if (j->watermark) { + spin_lock(&j->lock); + bch2_journal_set_watermark(j); + spin_unlock(&j->lock); + } + + BUG_ON(wb->sorted.size < wb->flushing.keys.nr); +} + +static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) { struct bch_fs *c = trans->c; struct journal *j = &c->journal; struct btree_write_buffer *wb = &c->btree_write_buffer; - struct journal_entry_pin pin; - struct btree_write_buffered_key *i, *keys; struct btree_iter iter = { NULL }; - size_t nr = 0, skipped = 0, fast = 0, slowpath = 0; + size_t skipped = 0, fast = 0, slowpath = 0; bool write_locked = false; - union btree_write_buffer_state s; int ret = 0; - memset(&pin, 0, sizeof(pin)); - - if (!locked && !mutex_trylock(&wb->flush_lock)) - return 0; - - bch2_journal_pin_copy(j, &pin, &wb->journal_pin, NULL); - bch2_journal_pin_drop(j, &wb->journal_pin); + bch2_trans_unlock(trans); + bch2_trans_begin(trans); - s = btree_write_buffer_switch(wb); - keys = wb->keys[s.idx]; - nr = s.nr; + mutex_lock(&wb->inc.lock); + move_keys_from_inc_to_flushing(wb); + mutex_unlock(&wb->inc.lock); - if (race_fault()) - goto slowpath; + for (size_t i = 0; i < wb->flushing.keys.nr; i++) { + wb->sorted.data[i].idx = i; + wb->sorted.data[i].btree = wb->flushing.keys.data[i].btree; + memcpy(&wb->sorted.data[i].pos, &wb->flushing.keys.data[i].k.k.p, sizeof(struct bpos)); + } + wb->sorted.nr = wb->flushing.keys.nr; /* * We first sort so that we can detect and skip redundant updates, and @@ -168,208 +274,373 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f * However, since we're not flushing in the order they appear in the * journal we won't be able to drop our journal pin until everything is * flushed - which means this could deadlock the journal if we weren't - * passing BTREE_INSERT_JOURNAL_RECLAIM. This causes the update to fail + * passing BCH_TRANS_COMMIT_journal_reclaim. This causes the update to fail * if it would block taking a journal reservation. * * If that happens, simply skip the key so we can optimistically insert * as many keys as possible in the fast path. */ - sort(keys, nr, sizeof(keys[0]), - btree_write_buffered_key_cmp, NULL); + wb_sort(wb->sorted.data, wb->sorted.nr); + + darray_for_each(wb->sorted, i) { + struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx]; + + for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++) + prefetch(&wb->flushing.keys.data[n->idx]); + + BUG_ON(!k->journal_seq); + + if (i + 1 < &darray_top(wb->sorted) && + wb_key_eq(i, i + 1)) { + struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx]; - for (i = keys; i < keys + nr; i++) { - if (i + 1 < keys + nr && - i[0].btree == i[1].btree && - bpos_eq(i[0].k.k.p, i[1].k.k.p)) { skipped++; - i->journal_seq = 0; + n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq); + k->journal_seq = 0; continue; } - if (write_locked && - (iter.path->btree_id != i->btree || - bpos_gt(i->k.k.p, iter.path->l[0].b->key.k.p))) { - bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b); - write_locked = false; + if (write_locked) { + struct btree_path *path = btree_iter_path(trans, &iter); + + if (path->btree_id != i->btree || + bpos_gt(k->k.k.p, path->l[0].b->key.k.p)) { + bch2_btree_node_unlock_write(trans, path, path->l[0].b); + write_locked = false; + } } - if (!iter.path || iter.path->btree_id != i->btree) { + if (!iter.path || iter.btree_id != k->btree) { bch2_trans_iter_exit(trans, &iter); - bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p, + bch2_trans_iter_init(trans, &iter, k->btree, k->k.k.p, BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS); } - bch2_btree_iter_set_pos(&iter, i->k.k.p); - iter.path->preserve = false; + bch2_btree_iter_set_pos(&iter, k->k.k.p); + btree_iter_path(trans, &iter)->preserve = false; do { - ret = bch2_btree_write_buffer_flush_one(trans, &iter, i, - commit_flags, &write_locked, &fast); + if (race_fault()) { + ret = -BCH_ERR_journal_reclaim_would_deadlock; + break; + } + + ret = wb_flush_one(trans, &iter, k, &write_locked, &fast); if (!write_locked) bch2_trans_begin(trans); } while (bch2_err_matches(ret, BCH_ERR_transaction_restart)); - if (ret == -BCH_ERR_journal_reclaim_would_deadlock) { + if (!ret) { + k->journal_seq = 0; + } else if (ret == -BCH_ERR_journal_reclaim_would_deadlock) { slowpath++; - continue; - } - if (ret) + ret = 0; + } else break; - - i->journal_seq = 0; } - if (write_locked) - bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b); + if (write_locked) { + struct btree_path *path = btree_iter_path(trans, &iter); + bch2_btree_node_unlock_write(trans, path, path->l[0].b); + } bch2_trans_iter_exit(trans, &iter); - trace_write_buffer_flush(trans, nr, skipped, fast, wb->size); - - if (slowpath) - goto slowpath; + if (ret) + goto err; + if (slowpath) { + /* + * Flush in the order they were present in the journal, so that + * we can release journal pins: + * The fastpath zapped the seq of keys that were successfully flushed so + * we can skip those here. + */ + trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr); + + darray_for_each(wb->flushing.keys, i) { + if (!i->journal_seq) + continue; + + bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin, + bch2_btree_write_buffer_journal_flush); + + bch2_trans_begin(trans); + + ret = commit_do(trans, NULL, NULL, + BCH_WATERMARK_reclaim| + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_no_journal_res| + BCH_TRANS_COMMIT_journal_reclaim, + btree_write_buffered_insert(trans, i)); + if (ret) + goto err; + } + } +err: bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)); -out: - bch2_journal_pin_drop(j, &pin); - mutex_unlock(&wb->flush_lock); + trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0); + bch2_journal_pin_drop(j, &wb->flushing.pin); + wb->flushing.keys.nr = 0; return ret; -slowpath: - trace_write_buffer_flush_slowpath(trans, i - keys, nr); +} - /* - * Now sort the rest by journal seq and bump the journal pin as we go. - * The slowpath zapped the seq of keys that were successfully flushed so - * we can skip those here. - */ - sort(keys, nr, sizeof(keys[0]), - btree_write_buffered_journal_cmp, - NULL); +static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 seq) +{ + struct journal *j = &c->journal; + struct journal_buf *buf; + int ret = 0; - commit_flags &= ~BCH_WATERMARK_MASK; - commit_flags |= BCH_WATERMARK_reclaim; + while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, seq))) { + ret = bch2_journal_keys_to_write_buffer(c, buf); + mutex_unlock(&j->buf_lock); + } - for (i = keys; i < keys + nr; i++) { - if (!i->journal_seq) - continue; + return ret; +} - if (i->journal_seq > pin.seq) { - struct journal_entry_pin pin2; +static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq) +{ + struct bch_fs *c = trans->c; + struct btree_write_buffer *wb = &c->btree_write_buffer; + int ret = 0, fetch_from_journal_err; - memset(&pin2, 0, sizeof(pin2)); + do { + bch2_trans_unlock(trans); - bch2_journal_pin_add(j, i->journal_seq, &pin2, NULL); - bch2_journal_pin_drop(j, &pin); - bch2_journal_pin_copy(j, &pin, &pin2, NULL); - bch2_journal_pin_drop(j, &pin2); - } + fetch_from_journal_err = fetch_wb_keys_from_journal(c, seq); - ret = commit_do(trans, NULL, NULL, - commit_flags| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_JOURNAL_RECLAIM, - btree_write_buffered_insert(trans, i)); - if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret))) - break; - } + /* + * On memory allocation failure, bch2_btree_write_buffer_flush_locked() + * is not guaranteed to empty wb->inc: + */ + mutex_lock(&wb->flushing.lock); + ret = bch2_btree_write_buffer_flush_locked(trans); + mutex_unlock(&wb->flushing.lock); + } while (!ret && + (fetch_from_journal_err || + (wb->inc.pin.seq && wb->inc.pin.seq <= seq) || + (wb->flushing.pin.seq && wb->flushing.pin.seq <= seq))); - goto out; + return ret; } -int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans) +static int bch2_btree_write_buffer_journal_flush(struct journal *j, + struct journal_entry_pin *_pin, u64 seq) { - bch2_trans_unlock(trans); - mutex_lock(&trans->c->btree_write_buffer.flush_lock); - return __bch2_btree_write_buffer_flush(trans, 0, true); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + + return bch2_trans_run(c, btree_write_buffer_flush_seq(trans, seq)); } -int bch2_btree_write_buffer_flush(struct btree_trans *trans) +int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans) { - return __bch2_btree_write_buffer_flush(trans, 0, false); + struct bch_fs *c = trans->c; + + trace_and_count(c, write_buffer_flush_sync, trans, _RET_IP_); + + return btree_write_buffer_flush_seq(trans, journal_cur_seq(&c->journal)); } -static int bch2_btree_write_buffer_journal_flush(struct journal *j, - struct journal_entry_pin *_pin, u64 seq) +int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *trans) { - struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_fs *c = trans->c; struct btree_write_buffer *wb = &c->btree_write_buffer; + int ret = 0; - mutex_lock(&wb->flush_lock); + if (mutex_trylock(&wb->flushing.lock)) { + ret = bch2_btree_write_buffer_flush_locked(trans); + mutex_unlock(&wb->flushing.lock); + } - return bch2_trans_run(c, - __bch2_btree_write_buffer_flush(trans, BTREE_INSERT_NOCHECK_RW, true)); + return ret; } -static inline u64 btree_write_buffer_ref(int idx) +int bch2_btree_write_buffer_tryflush(struct btree_trans *trans) { - return ((union btree_write_buffer_state) { - .ref0 = idx == 0, - .ref1 = idx == 1, - }).v; + struct bch_fs *c = trans->c; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer)) + return -BCH_ERR_erofs_no_writes; + + int ret = bch2_btree_write_buffer_flush_nocheck_rw(trans); + bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); + return ret; } -int bch2_btree_insert_keys_write_buffer(struct btree_trans *trans) +static void bch2_btree_write_buffer_flush_work(struct work_struct *work) { - struct bch_fs *c = trans->c; + struct bch_fs *c = container_of(work, struct bch_fs, btree_write_buffer.flush_work); struct btree_write_buffer *wb = &c->btree_write_buffer; - struct btree_write_buffered_key *i; - union btree_write_buffer_state old, new; - int ret = 0; - u64 v; + int ret; + + mutex_lock(&wb->flushing.lock); + do { + ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans)); + } while (!ret && bch2_btree_write_buffer_should_flush(c)); + mutex_unlock(&wb->flushing.lock); + + bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); +} - trans_for_each_wb_update(trans, i) { - EBUG_ON(i->k.k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); +int bch2_journal_key_to_wb_slowpath(struct bch_fs *c, + struct journal_keys_to_wb *dst, + enum btree_id btree, struct bkey_i *k) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + int ret; +retry: + ret = darray_make_room_gfp(&dst->wb->keys, 1, GFP_KERNEL); + if (!ret && dst->wb == &wb->flushing) + ret = darray_resize(&wb->sorted, wb->flushing.keys.size); + + if (unlikely(ret)) { + if (dst->wb == &c->btree_write_buffer.flushing) { + mutex_unlock(&dst->wb->lock); + dst->wb = &c->btree_write_buffer.inc; + bch2_journal_pin_add(&c->journal, dst->seq, &dst->wb->pin, + bch2_btree_write_buffer_journal_flush); + goto retry; + } - i->journal_seq = trans->journal_res.seq; - i->journal_offset = trans->journal_res.offset; + return ret; } - preempt_disable(); - v = READ_ONCE(wb->state.v); - do { - old.v = new.v = v; + dst->room = darray_room(dst->wb->keys); + if (dst->wb == &wb->flushing) + dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr); + BUG_ON(!dst->room); + BUG_ON(!dst->seq); + + struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys); + wb_k->journal_seq = dst->seq; + wb_k->btree = btree; + bkey_copy(&wb_k->k, k); + dst->wb->keys.nr++; + dst->room--; + return 0; +} + +void bch2_journal_keys_to_write_buffer_start(struct bch_fs *c, struct journal_keys_to_wb *dst, u64 seq) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + + if (mutex_trylock(&wb->flushing.lock)) { + mutex_lock(&wb->inc.lock); + move_keys_from_inc_to_flushing(wb); - new.v += btree_write_buffer_ref(new.idx); - new.nr += trans->nr_wb_updates; - if (new.nr > wb->size) { - ret = -BCH_ERR_btree_insert_need_flush_buffer; - goto out; + /* + * Attempt to skip wb->inc, and add keys directly to + * wb->flushing, saving us a copy later: + */ + + if (!wb->inc.keys.nr) { + dst->wb = &wb->flushing; + } else { + mutex_unlock(&wb->flushing.lock); + dst->wb = &wb->inc; } - } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v); + } else { + mutex_lock(&wb->inc.lock); + dst->wb = &wb->inc; + } - memcpy(wb->keys[new.idx] + old.nr, - trans->wb_updates, - sizeof(trans->wb_updates[0]) * trans->nr_wb_updates); + dst->room = darray_room(dst->wb->keys); + if (dst->wb == &wb->flushing) + dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr); + dst->seq = seq; - bch2_journal_pin_add(&c->journal, trans->journal_res.seq, &wb->journal_pin, + bch2_journal_pin_add(&c->journal, seq, &dst->wb->pin, bch2_btree_write_buffer_journal_flush); +} - atomic64_sub_return_release(btree_write_buffer_ref(new.idx), &wb->state.counter); +void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + + if (!dst->wb->keys.nr) + bch2_journal_pin_drop(&c->journal, &dst->wb->pin); + + if (bch2_btree_write_buffer_should_flush(c) && + __bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer) && + !queue_work(system_unbound_wq, &c->btree_write_buffer.flush_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); + + if (dst->wb == &wb->flushing) + mutex_unlock(&wb->flushing.lock); + mutex_unlock(&wb->inc.lock); +} + +static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf) +{ + struct journal_keys_to_wb dst; + struct jset_entry *entry; + struct bkey_i *k; + int ret = 0; + + bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq)); + + for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) { + jset_entry_for_each_key(entry, k) { + ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k); + if (ret) + goto out; + } + + entry->type = BCH_JSET_ENTRY_btree_keys; + } + + buf->need_flush_to_write_buffer = false; out: - preempt_enable(); + bch2_journal_keys_to_write_buffer_end(c, &dst); + return ret; +} + +static int wb_keys_resize(struct btree_write_buffer_keys *wb, size_t new_size) +{ + if (wb->keys.size >= new_size) + return 0; + + if (!mutex_trylock(&wb->lock)) + return -EINTR; + + int ret = darray_resize(&wb->keys, new_size); + mutex_unlock(&wb->lock); return ret; } +int bch2_btree_write_buffer_resize(struct bch_fs *c, size_t new_size) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + + return wb_keys_resize(&wb->flushing, new_size) ?: + wb_keys_resize(&wb->inc, new_size); +} + void bch2_fs_btree_write_buffer_exit(struct bch_fs *c) { struct btree_write_buffer *wb = &c->btree_write_buffer; - BUG_ON(wb->state.nr && !bch2_journal_error(&c->journal)); + BUG_ON((wb->inc.keys.nr || wb->flushing.keys.nr) && + !bch2_journal_error(&c->journal)); - kvfree(wb->keys[1]); - kvfree(wb->keys[0]); + darray_exit(&wb->sorted); + darray_exit(&wb->flushing.keys); + darray_exit(&wb->inc.keys); } int bch2_fs_btree_write_buffer_init(struct bch_fs *c) { struct btree_write_buffer *wb = &c->btree_write_buffer; - mutex_init(&wb->flush_lock); - wb->size = c->opts.btree_write_buffer_size; + mutex_init(&wb->inc.lock); + mutex_init(&wb->flushing.lock); + INIT_WORK(&wb->flush_work, bch2_btree_write_buffer_flush_work); - wb->keys[0] = kvmalloc_array(wb->size, sizeof(*wb->keys[0]), GFP_KERNEL); - wb->keys[1] = kvmalloc_array(wb->size, sizeof(*wb->keys[1]), GFP_KERNEL); - if (!wb->keys[0] || !wb->keys[1]) - return -BCH_ERR_ENOMEM_fs_btree_write_buffer_init; + /* Will be resized by journal as needed: */ + unsigned initial_size = 1 << 16; - return 0; + return darray_make_room(&wb->inc.keys, initial_size) ?: + darray_make_room(&wb->flushing.keys, initial_size) ?: + darray_make_room(&wb->sorted, initial_size); } diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h index 322df1c830..eebcd2b152 100644 --- a/fs/bcachefs/btree_write_buffer.h +++ b/fs/bcachefs/btree_write_buffer.h @@ -2,12 +2,59 @@ #ifndef _BCACHEFS_BTREE_WRITE_BUFFER_H #define _BCACHEFS_BTREE_WRITE_BUFFER_H -int __bch2_btree_write_buffer_flush(struct btree_trans *, unsigned, bool); +#include "bkey.h" + +static inline bool bch2_btree_write_buffer_should_flush(struct bch_fs *c) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + + return wb->inc.keys.nr + wb->flushing.keys.nr > wb->inc.keys.size / 4; +} + +static inline bool bch2_btree_write_buffer_must_wait(struct bch_fs *c) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + + return wb->inc.keys.nr > wb->inc.keys.size * 3 / 4; +} + +struct btree_trans; int bch2_btree_write_buffer_flush_sync(struct btree_trans *); -int bch2_btree_write_buffer_flush(struct btree_trans *); +int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *); +int bch2_btree_write_buffer_tryflush(struct btree_trans *); + +struct journal_keys_to_wb { + struct btree_write_buffer_keys *wb; + size_t room; + u64 seq; +}; + +int bch2_journal_key_to_wb_slowpath(struct bch_fs *, + struct journal_keys_to_wb *, + enum btree_id, struct bkey_i *); + +static inline int bch2_journal_key_to_wb(struct bch_fs *c, + struct journal_keys_to_wb *dst, + enum btree_id btree, struct bkey_i *k) +{ + EBUG_ON(!dst->seq); + + if (unlikely(!dst->room)) + return bch2_journal_key_to_wb_slowpath(c, dst, btree, k); + + struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys); + wb_k->journal_seq = dst->seq; + wb_k->btree = btree; + bkey_copy(&wb_k->k, k); + dst->wb->keys.nr++; + dst->room--; + return 0; +} -int bch2_btree_insert_keys_write_buffer(struct btree_trans *); +void bch2_journal_keys_to_write_buffer_start(struct bch_fs *, struct journal_keys_to_wb *, u64); +void bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_to_wb *); +int bch2_btree_write_buffer_resize(struct bch_fs *, size_t); void bch2_fs_btree_write_buffer_exit(struct bch_fs *); int bch2_fs_btree_write_buffer_init(struct bch_fs *); diff --git a/fs/bcachefs/btree_write_buffer_types.h b/fs/bcachefs/btree_write_buffer_types.h index 99993ba77a..9b9433de9c 100644 --- a/fs/bcachefs/btree_write_buffer_types.h +++ b/fs/bcachefs/btree_write_buffer_types.h @@ -2,43 +2,56 @@ #ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H #define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H +#include "darray.h" #include "journal_types.h" #define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4 #define BTREE_WRITE_BUFERED_U64s_MAX (BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX) -struct btree_write_buffered_key { - u64 journal_seq; - unsigned journal_offset; - enum btree_id btree; - __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX); -}; - -union btree_write_buffer_state { +struct wb_key_ref { +union { struct { - atomic64_t counter; - }; - - struct { - u64 v; - }; - +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + unsigned idx:24; + u8 pos[sizeof(struct bpos)]; + enum btree_id btree:8; +#else + enum btree_id btree:8; + u8 pos[sizeof(struct bpos)]; + unsigned idx:24; +#endif + } __packed; struct { - u64 nr:23; - u64 idx:1; - u64 ref0:20; - u64 ref1:20; +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + u64 lo; + u64 mi; + u64 hi; +#else + u64 hi; + u64 mi; + u64 lo; +#endif }; }; +}; -struct btree_write_buffer { - struct mutex flush_lock; - struct journal_entry_pin journal_pin; +struct btree_write_buffered_key { + enum btree_id btree:8; + u64 journal_seq:56; + __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX); +}; - union btree_write_buffer_state state; - size_t size; +struct btree_write_buffer_keys { + DARRAY(struct btree_write_buffered_key) keys; + struct journal_entry_pin pin; + struct mutex lock; +}; - struct btree_write_buffered_key *keys[2]; +struct btree_write_buffer { + DARRAY(struct wb_key_ref) sorted; + struct btree_write_buffer_keys inc; + struct btree_write_buffer_keys flushing; + struct work_struct flush_work; }; #endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */ diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 5a91d3189f..54f7826ac4 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -25,7 +25,7 @@ #include <linux/preempt.h> -static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage, +static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage, enum bch_data_type data_type, s64 sectors) { @@ -47,31 +47,27 @@ static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage, void bch2_fs_usage_initialize(struct bch_fs *c) { - struct bch_fs_usage *usage; - struct bch_dev *ca; - unsigned i; - percpu_down_write(&c->mark_lock); - usage = c->usage_base; + struct bch_fs_usage *usage = c->usage_base; - for (i = 0; i < ARRAY_SIZE(c->usage); i++) + for (unsigned i = 0; i < ARRAY_SIZE(c->usage); i++) bch2_fs_usage_acc_to_base(c, i); - for (i = 0; i < BCH_REPLICAS_MAX; i++) - usage->reserved += usage->persistent_reserved[i]; + for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) + usage->b.reserved += usage->persistent_reserved[i]; - for (i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry *e = + for (unsigned i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry_v1 *e = cpu_replicas_entry(&c->replicas, i); - fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]); + fs_usage_data_type_to_base(&usage->b, e->data_type, usage->replicas[i]); } - for_each_member_device(ca, c, i) { + for_each_member_device(c, ca) { struct bch_dev_usage dev = bch2_dev_usage_read(ca); - usage->hidden += (dev.d[BCH_DATA_sb].buckets + - dev.d[BCH_DATA_journal].buckets) * + usage->b.hidden += (dev.d[BCH_DATA_sb].buckets + + dev.d[BCH_DATA_journal].buckets) * ca->mi.bucket_size; } @@ -158,8 +154,7 @@ retry: void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) { - struct bch_dev *ca; - unsigned i, u64s = fs_usage_u64s(c); + unsigned u64s = fs_usage_u64s(c); BUG_ON(idx >= ARRAY_SIZE(c->usage)); @@ -171,7 +166,7 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) percpu_memset(c->usage[idx], 0, u64s * sizeof(u64)); rcu_read_lock(); - for_each_member_device_rcu(ca, c, i, NULL) { + for_each_member_device_rcu(c, ca, NULL) { u64s = dev_usage_u64s(); acc_u64s_percpu((u64 *) ca->usage_base, @@ -193,15 +188,15 @@ void bch2_fs_usage_to_text(struct printbuf *out, prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity); prt_printf(out, "hidden:\t\t\t\t%llu\n", - fs_usage->u.hidden); + fs_usage->u.b.hidden); prt_printf(out, "data:\t\t\t\t%llu\n", - fs_usage->u.data); + fs_usage->u.b.data); prt_printf(out, "cached:\t\t\t\t%llu\n", - fs_usage->u.cached); + fs_usage->u.b.cached); prt_printf(out, "reserved:\t\t\t%llu\n", - fs_usage->u.reserved); + fs_usage->u.b.reserved); prt_printf(out, "nr_inodes:\t\t\t%llu\n", - fs_usage->u.nr_inodes); + fs_usage->u.b.nr_inodes); prt_printf(out, "online reserved:\t\t%llu\n", fs_usage->online_reserved); @@ -214,7 +209,7 @@ void bch2_fs_usage_to_text(struct printbuf *out, } for (i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry *e = + struct bch_replicas_entry_v1 *e = cpu_replicas_entry(&c->replicas, i); prt_printf(out, "\t"); @@ -230,10 +225,10 @@ static u64 reserve_factor(u64 r) u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage) { - return min(fs_usage->u.hidden + - fs_usage->u.btree + - fs_usage->u.data + - reserve_factor(fs_usage->u.reserved + + return min(fs_usage->u.b.hidden + + fs_usage->u.b.btree + + fs_usage->u.b.data + + reserve_factor(fs_usage->u.b.reserved + fs_usage->online_reserved), c->capacity); } @@ -245,17 +240,17 @@ __bch2_fs_usage_read_short(struct bch_fs *c) u64 data, reserved; ret.capacity = c->capacity - - bch2_fs_usage_read_one(c, &c->usage_base->hidden); + bch2_fs_usage_read_one(c, &c->usage_base->b.hidden); - data = bch2_fs_usage_read_one(c, &c->usage_base->data) + - bch2_fs_usage_read_one(c, &c->usage_base->btree); - reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) + + data = bch2_fs_usage_read_one(c, &c->usage_base->b.data) + + bch2_fs_usage_read_one(c, &c->usage_base->b.btree); + reserved = bch2_fs_usage_read_one(c, &c->usage_base->b.reserved) + percpu_u64_get(c->online_reserved); ret.used = min(ret.capacity, data + reserve_factor(reserved)); ret.free = ret.capacity - ret.used; - ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes); + ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->b.nr_inodes); return ret; } @@ -277,18 +272,34 @@ void bch2_dev_usage_init(struct bch_dev *ca) ca->usage_base->d[BCH_DATA_free].buckets = ca->mi.nbuckets - ca->mi.first_bucket; } -static inline int bucket_sectors_fragmented(struct bch_dev *ca, - struct bch_alloc_v4 a) +void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage) { - return a.dirty_sectors - ? max(0, (int) ca->mi.bucket_size - (int) a.dirty_sectors) - : 0; + prt_tab(out); + prt_str(out, "buckets"); + prt_tab_rjust(out); + prt_str(out, "sectors"); + prt_tab_rjust(out); + prt_str(out, "fragmented"); + prt_tab_rjust(out); + prt_newline(out); + + for (unsigned i = 0; i < BCH_DATA_NR; i++) { + bch2_prt_data_type(out, i); + prt_tab(out); + prt_u64(out, usage->d[i].buckets); + prt_tab_rjust(out); + prt_u64(out, usage->d[i].sectors); + prt_tab_rjust(out); + prt_u64(out, usage->d[i].fragmented); + prt_tab_rjust(out); + prt_newline(out); + } } -static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, - struct bch_alloc_v4 old, - struct bch_alloc_v4 new, - u64 journal_seq, bool gc) +void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, + const struct bch_alloc_v4 *old, + const struct bch_alloc_v4 *new, + u64 journal_seq, bool gc) { struct bch_fs_usage *fs_usage; struct bch_dev_usage *u; @@ -296,56 +307,51 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, preempt_disable(); fs_usage = fs_usage_ptr(c, journal_seq, gc); - if (data_type_is_hidden(old.data_type)) - fs_usage->hidden -= ca->mi.bucket_size; - if (data_type_is_hidden(new.data_type)) - fs_usage->hidden += ca->mi.bucket_size; + if (data_type_is_hidden(old->data_type)) + fs_usage->b.hidden -= ca->mi.bucket_size; + if (data_type_is_hidden(new->data_type)) + fs_usage->b.hidden += ca->mi.bucket_size; u = dev_usage_ptr(ca, journal_seq, gc); - u->d[old.data_type].buckets--; - u->d[new.data_type].buckets++; - - u->buckets_ec -= (int) !!old.stripe; - u->buckets_ec += (int) !!new.stripe; + u->d[old->data_type].buckets--; + u->d[new->data_type].buckets++; - u->d[old.data_type].sectors -= old.dirty_sectors; - u->d[new.data_type].sectors += new.dirty_sectors; + u->d[old->data_type].sectors -= bch2_bucket_sectors_dirty(*old); + u->d[new->data_type].sectors += bch2_bucket_sectors_dirty(*new); - u->d[BCH_DATA_cached].sectors += new.cached_sectors; - u->d[BCH_DATA_cached].sectors -= old.cached_sectors; + u->d[BCH_DATA_cached].sectors += new->cached_sectors; + u->d[BCH_DATA_cached].sectors -= old->cached_sectors; - u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old); - u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new); + u->d[old->data_type].fragmented -= bch2_bucket_sectors_fragmented(ca, *old); + u->d[new->data_type].fragmented += bch2_bucket_sectors_fragmented(ca, *new); preempt_enable(); } -static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca, - struct bucket old, struct bucket new, - u64 journal_seq, bool gc) +static inline struct bch_alloc_v4 bucket_m_to_alloc(struct bucket b) { - struct bch_alloc_v4 old_a = { - .gen = old.gen, - .data_type = old.data_type, - .dirty_sectors = old.dirty_sectors, - .cached_sectors = old.cached_sectors, - .stripe = old.stripe, - }; - struct bch_alloc_v4 new_a = { - .gen = new.gen, - .data_type = new.data_type, - .dirty_sectors = new.dirty_sectors, - .cached_sectors = new.cached_sectors, - .stripe = new.stripe, + return (struct bch_alloc_v4) { + .gen = b.gen, + .data_type = b.data_type, + .dirty_sectors = b.dirty_sectors, + .cached_sectors = b.cached_sectors, + .stripe = b.stripe, }; +} - bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc); +void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca, + struct bucket *old, struct bucket *new) +{ + struct bch_alloc_v4 old_a = bucket_m_to_alloc(*old); + struct bch_alloc_v4 new_a = bucket_m_to_alloc(*new); + + bch2_dev_usage_update(c, ca, &old_a, &new_a, 0, true); } static inline int __update_replicas(struct bch_fs *c, struct bch_fs_usage *fs_usage, - struct bch_replicas_entry *r, + struct bch_replicas_entry_v1 *r, s64 sectors) { int idx = bch2_replicas_entry_idx(c, r); @@ -353,14 +359,14 @@ static inline int __update_replicas(struct bch_fs *c, if (idx < 0) return -1; - fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); + fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors); fs_usage->replicas[idx] += sectors; return 0; } -static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k, - struct bch_replicas_entry *r, s64 sectors, - unsigned journal_seq, bool gc) +int bch2_update_replicas(struct bch_fs *c, struct bkey_s_c k, + struct bch_replicas_entry_v1 *r, s64 sectors, + unsigned journal_seq, bool gc) { struct bch_fs_usage *fs_usage; int idx, ret = 0; @@ -388,7 +394,7 @@ static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k, preempt_disable(); fs_usage = fs_usage_ptr(c, journal_seq, gc); - fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); + fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors); fs_usage->replicas[idx] += sectors; preempt_enable(); err: @@ -407,7 +413,7 @@ static inline int update_cached_sectors(struct bch_fs *c, bch2_replicas_entry_cached(&r.e, dev); - return update_replicas(c, k, &r.e, sectors, journal_seq, gc); + return bch2_update_replicas(c, k, &r.e, sectors, journal_seq, gc); } static int __replicas_deltas_realloc(struct btree_trans *trans, unsigned more, @@ -453,9 +459,9 @@ int bch2_replicas_deltas_realloc(struct btree_trans *trans, unsigned more) __replicas_deltas_realloc(trans, more, _gfp)); } -static inline int update_replicas_list(struct btree_trans *trans, - struct bch_replicas_entry *r, - s64 sectors) +int bch2_update_replicas_list(struct btree_trans *trans, + struct bch_replicas_entry_v1 *r, + s64 sectors) { struct replicas_delta_list *d; struct replicas_delta *n; @@ -481,139 +487,13 @@ static inline int update_replicas_list(struct btree_trans *trans, return 0; } -static inline int update_cached_sectors_list(struct btree_trans *trans, - unsigned dev, s64 sectors) +int bch2_update_cached_sectors_list(struct btree_trans *trans, unsigned dev, s64 sectors) { struct bch_replicas_padded r; bch2_replicas_entry_cached(&r.e, dev); - return update_replicas_list(trans, &r.e, sectors); -} - -int bch2_mark_alloc(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) -{ - bool gc = flags & BTREE_TRIGGER_GC; - u64 journal_seq = trans->journal_res.seq; - u64 bucket_journal_seq; - struct bch_fs *c = trans->c; - struct bch_alloc_v4 old_a_convert, new_a_convert; - const struct bch_alloc_v4 *old_a, *new_a; - struct bch_dev *ca; - int ret = 0; - - /* - * alloc btree is read in by bch2_alloc_read, not gc: - */ - if ((flags & BTREE_TRIGGER_GC) && - !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) - return 0; - - if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans, - "alloc key for invalid device or bucket")) - return -EIO; - - ca = bch_dev_bkey_exists(c, new.k->p.inode); - - old_a = bch2_alloc_to_v4(old, &old_a_convert); - new_a = bch2_alloc_to_v4(new, &new_a_convert); - - bucket_journal_seq = new_a->journal_seq; - - if ((flags & BTREE_TRIGGER_INSERT) && - data_type_is_empty(old_a->data_type) != - data_type_is_empty(new_a->data_type) && - new.k->type == KEY_TYPE_alloc_v4) { - struct bch_alloc_v4 *v = (struct bch_alloc_v4 *) new.v; - - EBUG_ON(!journal_seq); - - /* - * If the btree updates referring to a bucket weren't flushed - * before the bucket became empty again, then the we don't have - * to wait on a journal flush before we can reuse the bucket: - */ - v->journal_seq = bucket_journal_seq = - data_type_is_empty(new_a->data_type) && - (journal_seq == v->journal_seq || - bch2_journal_noflush_seq(&c->journal, v->journal_seq)) - ? 0 : journal_seq; - } - - if (!data_type_is_empty(old_a->data_type) && - data_type_is_empty(new_a->data_type) && - bucket_journal_seq) { - ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, - c->journal.flushed_seq_ondisk, - new.k->p.inode, new.k->p.offset, - bucket_journal_seq); - if (ret) { - bch2_fs_fatal_error(c, - "error setting bucket_needs_journal_commit: %i", ret); - return ret; - } - } - - percpu_down_read(&c->mark_lock); - if (!gc && new_a->gen != old_a->gen) - *bucket_gen(ca, new.k->p.offset) = new_a->gen; - - bch2_dev_usage_update(c, ca, *old_a, *new_a, journal_seq, gc); - - if (gc) { - struct bucket *g = gc_bucket(ca, new.k->p.offset); - - bucket_lock(g); - - g->gen_valid = 1; - g->gen = new_a->gen; - g->data_type = new_a->data_type; - g->stripe = new_a->stripe; - g->stripe_redundancy = new_a->stripe_redundancy; - g->dirty_sectors = new_a->dirty_sectors; - g->cached_sectors = new_a->cached_sectors; - - bucket_unlock(g); - } - percpu_up_read(&c->mark_lock); - - /* - * need to know if we're getting called from the invalidate path or - * not: - */ - - if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && - old_a->cached_sectors) { - ret = update_cached_sectors(c, new, ca->dev_idx, - -((s64) old_a->cached_sectors), - journal_seq, gc); - if (ret) { - bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors", - __func__); - return ret; - } - } - - if (new_a->data_type == BCH_DATA_free && - (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk)) - closure_wake_up(&c->freelist_wait); - - if (new_a->data_type == BCH_DATA_need_discard && - (!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk)) - bch2_do_discards(c); - - if (old_a->data_type != BCH_DATA_cached && - new_a->data_type == BCH_DATA_cached && - should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) - bch2_do_invalidates(c); - - if (new_a->data_type == BCH_DATA_need_gc_gens) - bch2_do_gc_gens(c); - - return 0; + return bch2_update_replicas_list(trans, &r.e, sectors); } int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, @@ -643,8 +523,8 @@ int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, if (bch2_fs_inconsistent_on(g->data_type && g->data_type != data_type, c, "different types of data in same bucket: %s, %s", - bch2_data_types[g->data_type], - bch2_data_types[data_type])) { + bch2_data_type_str(g->data_type), + bch2_data_type_str(data_type))) { ret = -EIO; goto err; } @@ -652,37 +532,33 @@ int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c, "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > bucket size", ca->dev_idx, b, g->gen, - bch2_data_types[g->data_type ?: data_type], + bch2_data_type_str(g->data_type ?: data_type), g->dirty_sectors, sectors)) { ret = -EIO; goto err; } - g->data_type = data_type; g->dirty_sectors += sectors; new = *g; err: bucket_unlock(g); if (!ret) - bch2_dev_usage_update_m(c, ca, old, new, 0, true); + bch2_dev_usage_update_m(c, ca, &old, &new); percpu_up_read(&c->mark_lock); return ret; } -static int check_bucket_ref(struct btree_trans *trans, - struct bkey_s_c k, - const struct bch_extent_ptr *ptr, - s64 sectors, enum bch_data_type ptr_data_type, - u8 b_gen, u8 bucket_data_type, - u32 dirty_sectors, u32 cached_sectors) +int bch2_check_bucket_ref(struct btree_trans *trans, + struct bkey_s_c k, + const struct bch_extent_ptr *ptr, + s64 sectors, enum bch_data_type ptr_data_type, + u8 b_gen, u8 bucket_data_type, + u32 bucket_sectors) { struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); size_t bucket_nr = PTR_BUCKET_NR(ca, ptr); - u32 bucket_sectors = !ptr->cached - ? dirty_sectors - : cached_sectors; struct printbuf buf = PRINTBUF; int ret = 0; @@ -699,7 +575,7 @@ static int check_bucket_ref(struct btree_trans *trans, "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" "while marking %s", ptr->dev, bucket_nr, b_gen, - bch2_data_types[bucket_data_type ?: ptr_data_type], + bch2_data_type_str(bucket_data_type ?: ptr_data_type), ptr->gen, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ret = -EIO; @@ -712,7 +588,7 @@ static int check_bucket_ref(struct btree_trans *trans, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" "while marking %s", ptr->dev, bucket_nr, b_gen, - bch2_data_types[bucket_data_type ?: ptr_data_type], + bch2_data_type_str(bucket_data_type ?: ptr_data_type), ptr->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); @@ -727,7 +603,7 @@ static int check_bucket_ref(struct btree_trans *trans, "while marking %s", ptr->dev, bucket_nr, b_gen, *bucket_gen(ca, bucket_nr), - bch2_data_types[bucket_data_type ?: ptr_data_type], + bch2_data_type_str(bucket_data_type ?: ptr_data_type), ptr->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); @@ -748,8 +624,8 @@ static int check_bucket_ref(struct btree_trans *trans, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", ptr->dev, bucket_nr, b_gen, - bch2_data_types[bucket_data_type], - bch2_data_types[ptr_data_type], + bch2_data_type_str(bucket_data_type), + bch2_data_type_str(ptr_data_type), (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ret = -EIO; @@ -762,7 +638,7 @@ static int check_bucket_ref(struct btree_trans *trans, "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n" "while marking %s", ptr->dev, bucket_nr, b_gen, - bch2_data_types[bucket_data_type ?: ptr_data_type], + bch2_data_type_str(bucket_data_type ?: ptr_data_type), bucket_sectors, sectors, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); @@ -777,508 +653,6 @@ err: goto out; } -static int mark_stripe_bucket(struct btree_trans *trans, - struct bkey_s_c k, - unsigned ptr_idx, - unsigned flags) -{ - struct bch_fs *c = trans->c; - u64 journal_seq = trans->journal_res.seq; - const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; - unsigned nr_data = s->nr_blocks - s->nr_redundant; - bool parity = ptr_idx >= nr_data; - enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe; - s64 sectors = parity ? le16_to_cpu(s->sectors) : 0; - const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket old, new, *g; - struct printbuf buf = PRINTBUF; - int ret = 0; - - BUG_ON(!(flags & BTREE_TRIGGER_GC)); - - /* * XXX doesn't handle deletion */ - - percpu_down_read(&c->mark_lock); - g = PTR_GC_BUCKET(ca, ptr); - - if (g->dirty_sectors || - (g->stripe && g->stripe != k.k->p.offset)) { - bch2_fs_inconsistent(c, - "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", - ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - ret = -EINVAL; - goto err; - } - - bucket_lock(g); - old = *g; - - ret = check_bucket_ref(trans, k, ptr, sectors, data_type, - g->gen, g->data_type, - g->dirty_sectors, g->cached_sectors); - if (ret) - goto err; - - g->data_type = data_type; - g->dirty_sectors += sectors; - - g->stripe = k.k->p.offset; - g->stripe_redundancy = s->nr_redundant; - new = *g; -err: - bucket_unlock(g); - if (!ret) - bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); - percpu_up_read(&c->mark_lock); - printbuf_exit(&buf); - return ret; -} - -static int __mark_pointer(struct btree_trans *trans, - struct bkey_s_c k, - const struct bch_extent_ptr *ptr, - s64 sectors, enum bch_data_type ptr_data_type, - u8 bucket_gen, u8 *bucket_data_type, - u32 *dirty_sectors, u32 *cached_sectors) -{ - u32 *dst_sectors = !ptr->cached - ? dirty_sectors - : cached_sectors; - int ret = check_bucket_ref(trans, k, ptr, sectors, ptr_data_type, - bucket_gen, *bucket_data_type, - *dirty_sectors, *cached_sectors); - - if (ret) - return ret; - - *dst_sectors += sectors; - - if (!*dirty_sectors && !*cached_sectors) - *bucket_data_type = 0; - else if (*bucket_data_type != BCH_DATA_stripe) - *bucket_data_type = ptr_data_type; - - return 0; -} - -static int bch2_mark_pointer(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, - struct extent_ptr_decoded p, - s64 sectors, - unsigned flags) -{ - u64 journal_seq = trans->journal_res.seq; - struct bch_fs *c = trans->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct bucket old, new, *g; - enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p); - u8 bucket_data_type; - int ret = 0; - - BUG_ON(!(flags & BTREE_TRIGGER_GC)); - - percpu_down_read(&c->mark_lock); - g = PTR_GC_BUCKET(ca, &p.ptr); - bucket_lock(g); - old = *g; - - bucket_data_type = g->data_type; - ret = __mark_pointer(trans, k, &p.ptr, sectors, - data_type, g->gen, - &bucket_data_type, - &g->dirty_sectors, - &g->cached_sectors); - if (!ret) - g->data_type = bucket_data_type; - - new = *g; - bucket_unlock(g); - if (!ret) - bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); - percpu_up_read(&c->mark_lock); - - return ret; -} - -static int bch2_mark_stripe_ptr(struct btree_trans *trans, - struct bkey_s_c k, - struct bch_extent_stripe_ptr p, - enum bch_data_type data_type, - s64 sectors, - unsigned flags) -{ - struct bch_fs *c = trans->c; - struct bch_replicas_padded r; - struct gc_stripe *m; - - BUG_ON(!(flags & BTREE_TRIGGER_GC)); - - m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL); - if (!m) { - bch_err(c, "error allocating memory for gc_stripes, idx %llu", - (u64) p.idx); - return -BCH_ERR_ENOMEM_mark_stripe_ptr; - } - - mutex_lock(&c->ec_stripes_heap_lock); - - if (!m || !m->alive) { - mutex_unlock(&c->ec_stripes_heap_lock); - bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", - (u64) p.idx); - bch2_inconsistent_error(c); - return -EIO; - } - - m->block_sectors[p.block] += sectors; - - r = m->r; - mutex_unlock(&c->ec_stripes_heap_lock); - - r.e.data_type = data_type; - update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true); - - return 0; -} - -static int __mark_extent(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, unsigned flags) -{ - u64 journal_seq = trans->journal_res.seq; - struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - struct bch_replicas_padded r; - enum bch_data_type data_type = bkey_is_btree_ptr(k.k) - ? BCH_DATA_btree - : BCH_DATA_user; - s64 sectors = bkey_is_btree_ptr(k.k) - ? btree_sectors(c) - : k.k->size; - s64 dirty_sectors = 0; - bool stale; - int ret; - - BUG_ON(!(flags & BTREE_TRIGGER_GC)); - - r.e.data_type = data_type; - r.e.nr_devs = 0; - r.e.nr_required = 1; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - s64 disk_sectors = ptr_disk_sectors(sectors, p); - - if (flags & BTREE_TRIGGER_OVERWRITE) - disk_sectors = -disk_sectors; - - ret = bch2_mark_pointer(trans, btree_id, level, k, p, disk_sectors, flags); - if (ret < 0) - return ret; - - stale = ret > 0; - - if (p.ptr.cached) { - if (!stale) { - ret = update_cached_sectors(c, k, p.ptr.dev, - disk_sectors, journal_seq, true); - if (ret) { - bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors", - __func__); - return ret; - } - } - } else if (!p.has_ec) { - dirty_sectors += disk_sectors; - r.e.devs[r.e.nr_devs++] = p.ptr.dev; - } else { - ret = bch2_mark_stripe_ptr(trans, k, p.ec, data_type, - disk_sectors, flags); - if (ret) - return ret; - - /* - * There may be other dirty pointers in this extent, but - * if so they're not required for mounting if we have an - * erasure coded pointer in this extent: - */ - r.e.nr_required = 0; - } - } - - if (r.e.nr_devs) { - ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, true); - if (ret) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - bch2_fs_fatal_error(c, "%s(): no replicas entry for %s", __func__, buf.buf); - printbuf_exit(&buf); - return ret; - } - } - - return 0; -} - -int bch2_mark_extent(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) -{ - return mem_trigger_run_overwrite_then_insert(__mark_extent, trans, btree_id, level, old, new, flags); -} - -int bch2_mark_stripe(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) -{ - bool gc = flags & BTREE_TRIGGER_GC; - u64 journal_seq = trans->journal_res.seq; - struct bch_fs *c = trans->c; - u64 idx = new.k->p.offset; - const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe - ? bkey_s_c_to_stripe(old).v : NULL; - const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe - ? bkey_s_c_to_stripe(new).v : NULL; - unsigned i; - int ret; - - BUG_ON(gc && old_s); - - if (!gc) { - struct stripe *m = genradix_ptr(&c->stripes, idx); - - if (!m) { - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; - - bch2_bkey_val_to_text(&buf1, c, old); - bch2_bkey_val_to_text(&buf2, c, new); - bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n" - "old %s\n" - "new %s", idx, buf1.buf, buf2.buf); - printbuf_exit(&buf2); - printbuf_exit(&buf1); - bch2_inconsistent_error(c); - return -1; - } - - if (!new_s) { - bch2_stripes_heap_del(c, m, idx); - - memset(m, 0, sizeof(*m)); - } else { - m->sectors = le16_to_cpu(new_s->sectors); - m->algorithm = new_s->algorithm; - m->nr_blocks = new_s->nr_blocks; - m->nr_redundant = new_s->nr_redundant; - m->blocks_nonempty = 0; - - for (i = 0; i < new_s->nr_blocks; i++) - m->blocks_nonempty += !!stripe_blockcount_get(new_s, i); - - if (!old_s) - bch2_stripes_heap_insert(c, m, idx); - else - bch2_stripes_heap_update(c, m, idx); - } - } else { - struct gc_stripe *m = - genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); - - if (!m) { - bch_err(c, "error allocating memory for gc_stripes, idx %llu", - idx); - return -BCH_ERR_ENOMEM_mark_stripe; - } - /* - * This will be wrong when we bring back runtime gc: we should - * be unmarking the old key and then marking the new key - */ - m->alive = true; - m->sectors = le16_to_cpu(new_s->sectors); - m->nr_blocks = new_s->nr_blocks; - m->nr_redundant = new_s->nr_redundant; - - for (i = 0; i < new_s->nr_blocks; i++) - m->ptrs[i] = new_s->ptrs[i]; - - bch2_bkey_to_replicas(&m->r.e, new); - - /* - * gc recalculates this field from stripe ptr - * references: - */ - memset(m->block_sectors, 0, sizeof(m->block_sectors)); - - for (i = 0; i < new_s->nr_blocks; i++) { - ret = mark_stripe_bucket(trans, new, i, flags); - if (ret) - return ret; - } - - ret = update_replicas(c, new, &m->r.e, - ((s64) m->sectors * m->nr_redundant), - journal_seq, gc); - if (ret) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, new); - bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf); - printbuf_exit(&buf); - return ret; - } - } - - return 0; -} - -static int __mark_reservation(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, unsigned flags) -{ - struct bch_fs *c = trans->c; - struct bch_fs_usage *fs_usage; - unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; - s64 sectors = (s64) k.k->size; - - BUG_ON(!(flags & BTREE_TRIGGER_GC)); - - if (flags & BTREE_TRIGGER_OVERWRITE) - sectors = -sectors; - sectors *= replicas; - - percpu_down_read(&c->mark_lock); - preempt_disable(); - - fs_usage = fs_usage_ptr(c, trans->journal_res.seq, flags & BTREE_TRIGGER_GC); - replicas = clamp_t(unsigned, replicas, 1, - ARRAY_SIZE(fs_usage->persistent_reserved)); - - fs_usage->reserved += sectors; - fs_usage->persistent_reserved[replicas - 1] += sectors; - - preempt_enable(); - percpu_up_read(&c->mark_lock); - - return 0; -} - -int bch2_mark_reservation(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) -{ - return mem_trigger_run_overwrite_then_insert(__mark_reservation, trans, btree_id, level, old, new, flags); -} - -static s64 __bch2_mark_reflink_p(struct btree_trans *trans, - struct bkey_s_c_reflink_p p, - u64 start, u64 end, - u64 *idx, unsigned flags, size_t r_idx) -{ - struct bch_fs *c = trans->c; - struct reflink_gc *r; - int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; - u64 next_idx = end; - s64 ret = 0; - struct printbuf buf = PRINTBUF; - - if (r_idx >= c->reflink_gc_nr) - goto not_found; - - r = genradix_ptr(&c->reflink_gc_table, r_idx); - next_idx = min(next_idx, r->offset - r->size); - if (*idx < next_idx) - goto not_found; - - BUG_ON((s64) r->refcount + add < 0); - - r->refcount += add; - *idx = r->offset; - return 0; -not_found: - if (fsck_err(c, reflink_p_to_missing_reflink_v, - "pointer to missing indirect extent\n" - " %s\n" - " missing range %llu-%llu", - (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), - *idx, next_idx)) { - struct bkey_i_error *new; - - new = bch2_trans_kmalloc(trans, sizeof(*new)); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto err; - - bkey_init(&new->k); - new->k.type = KEY_TYPE_error; - new->k.p = bkey_start_pos(p.k); - new->k.p.offset += *idx - start; - bch2_key_resize(&new->k, next_idx - *idx); - ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, - BTREE_TRIGGER_NORUN); - } - - *idx = next_idx; -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -static int __mark_reflink_p(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, unsigned flags) -{ - struct bch_fs *c = trans->c; - struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - struct reflink_gc *ref; - size_t l, r, m; - u64 idx = le64_to_cpu(p.v->idx), start = idx; - u64 end = le64_to_cpu(p.v->idx) + p.k->size; - int ret = 0; - - BUG_ON(!(flags & BTREE_TRIGGER_GC)); - - if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_reflink_p_fix) { - idx -= le32_to_cpu(p.v->front_pad); - end += le32_to_cpu(p.v->back_pad); - } - - l = 0; - r = c->reflink_gc_nr; - while (l < r) { - m = l + (r - l) / 2; - - ref = genradix_ptr(&c->reflink_gc_table, m); - if (ref->offset <= idx) - l = m + 1; - else - r = m; - } - - while (idx < end && !ret) - ret = __bch2_mark_reflink_p(trans, p, start, end, - &idx, flags, l++); - - return ret; -} - -int bch2_mark_reflink_p(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) -{ - return mem_trigger_run_overwrite_then_insert(__mark_reflink_p, trans, btree_id, level, old, new, flags); -} - void bch2_trans_fs_usage_revert(struct btree_trans *trans, struct replicas_delta_list *deltas) { @@ -1303,11 +677,11 @@ void bch2_trans_fs_usage_revert(struct btree_trans *trans, BUG_ON(__update_replicas(c, dst, &d->r, -d->delta)); } - dst->nr_inodes -= deltas->nr_inodes; + dst->b.nr_inodes -= deltas->nr_inodes; for (i = 0; i < BCH_REPLICAS_MAX; i++) { added -= deltas->persistent_reserved[i]; - dst->reserved -= deltas->persistent_reserved[i]; + dst->b.reserved -= deltas->persistent_reserved[i]; dst->persistent_reserved[i] -= deltas->persistent_reserved[i]; } @@ -1320,48 +694,25 @@ void bch2_trans_fs_usage_revert(struct btree_trans *trans, percpu_up_read(&c->mark_lock); } -int bch2_trans_fs_usage_apply(struct btree_trans *trans, - struct replicas_delta_list *deltas) +void bch2_trans_account_disk_usage_change(struct btree_trans *trans) { struct bch_fs *c = trans->c; + u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; static int warned_disk_usage = 0; bool warn = false; - u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; - struct replicas_delta *d, *d2; - struct replicas_delta *top = (void *) deltas->d + deltas->used; - struct bch_fs_usage *dst; - s64 added = 0, should_not_have_added; - unsigned i; percpu_down_read(&c->mark_lock); preempt_disable(); - dst = fs_usage_ptr(c, trans->journal_res.seq, false); + struct bch_fs_usage_base *dst = &fs_usage_ptr(c, trans->journal_res.seq, false)->b; + struct bch_fs_usage_base *src = &trans->fs_usage_delta; - for (d = deltas->d; d != top; d = replicas_delta_next(d)) { - switch (d->r.data_type) { - case BCH_DATA_btree: - case BCH_DATA_user: - case BCH_DATA_parity: - added += d->delta; - } - - if (__update_replicas(c, dst, &d->r, d->delta)) - goto need_mark; - } - - dst->nr_inodes += deltas->nr_inodes; - - for (i = 0; i < BCH_REPLICAS_MAX; i++) { - added += deltas->persistent_reserved[i]; - dst->reserved += deltas->persistent_reserved[i]; - dst->persistent_reserved[i] += deltas->persistent_reserved[i]; - } + s64 added = src->btree + src->data + src->reserved; /* * Not allowed to reduce sectors_available except by getting a * reservation: */ - should_not_have_added = added - (s64) disk_res_sectors; + s64 should_not_have_added = added - (s64) disk_res_sectors; if (unlikely(should_not_have_added > 0)) { u64 old, new, v = atomic64_read(&c->sectors_available); @@ -1380,6 +731,13 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans, this_cpu_sub(*c->online_reserved, added); } + dst->hidden += src->hidden; + dst->btree += src->btree; + dst->data += src->data; + dst->cached += src->cached; + dst->reserved += src->reserved; + dst->nr_inodes += src->nr_inodes; + preempt_enable(); percpu_up_read(&c->mark_lock); @@ -1387,6 +745,34 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans, bch2_trans_inconsistent(trans, "disk usage increased %lli more than %llu sectors reserved)", should_not_have_added, disk_res_sectors); +} + +int bch2_trans_fs_usage_apply(struct btree_trans *trans, + struct replicas_delta_list *deltas) +{ + struct bch_fs *c = trans->c; + struct replicas_delta *d, *d2; + struct replicas_delta *top = (void *) deltas->d + deltas->used; + struct bch_fs_usage *dst; + unsigned i; + + percpu_down_read(&c->mark_lock); + preempt_disable(); + dst = fs_usage_ptr(c, trans->journal_res.seq, false); + + for (d = deltas->d; d != top; d = replicas_delta_next(d)) + if (__update_replicas(c, dst, &d->r, d->delta)) + goto need_mark; + + dst->b.nr_inodes += deltas->nr_inodes; + + for (i = 0; i < BCH_REPLICAS_MAX; i++) { + dst->b.reserved += deltas->persistent_reserved[i]; + dst->persistent_reserved[i] += deltas->persistent_reserved[i]; + } + + preempt_enable(); + percpu_up_read(&c->mark_lock); return 0; need_mark: /* revert changes: */ @@ -1398,92 +784,184 @@ need_mark: return -1; } -/* trans_mark: */ +/* KEY_TYPE_extent: */ + +static int __mark_pointer(struct btree_trans *trans, + struct bkey_s_c k, + const struct bch_extent_ptr *ptr, + s64 sectors, enum bch_data_type ptr_data_type, + u8 bucket_gen, u8 *bucket_data_type, + u32 *dirty_sectors, u32 *cached_sectors) +{ + u32 *dst_sectors = !ptr->cached + ? dirty_sectors + : cached_sectors; + int ret = bch2_check_bucket_ref(trans, k, ptr, sectors, ptr_data_type, + bucket_gen, *bucket_data_type, *dst_sectors); + + if (ret) + return ret; + + *dst_sectors += sectors; + + if (!*dirty_sectors && !*cached_sectors) + *bucket_data_type = 0; + else if (*bucket_data_type != BCH_DATA_stripe) + *bucket_data_type = ptr_data_type; + + return 0; +} -static inline int bch2_trans_mark_pointer(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, struct extent_ptr_decoded p, - unsigned flags) +static int bch2_trigger_pointer(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, struct extent_ptr_decoded p, + s64 *sectors, + unsigned flags) { bool insert = !(flags & BTREE_TRIGGER_OVERWRITE); - struct btree_iter iter; - struct bkey_i_alloc_v4 *a; struct bpos bucket; struct bch_backpointer bp; - s64 sectors; - int ret; bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket, &bp); - sectors = bp.bucket_len; - if (!insert) - sectors = -sectors; + *sectors = insert ? bp.bucket_len : -((s64) bp.bucket_len); - a = bch2_trans_start_alloc_update(trans, &iter, bucket); - if (IS_ERR(a)) - return PTR_ERR(a); + if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + struct btree_iter iter; + struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, &iter, bucket); + int ret = PTR_ERR_OR_ZERO(a); + if (ret) + return ret; - ret = __mark_pointer(trans, k, &p.ptr, sectors, bp.data_type, - a->v.gen, &a->v.data_type, - &a->v.dirty_sectors, &a->v.cached_sectors) ?: - bch2_trans_update(trans, &iter, &a->k_i, 0); - bch2_trans_iter_exit(trans, &iter); + ret = __mark_pointer(trans, k, &p.ptr, *sectors, bp.data_type, + a->v.gen, &a->v.data_type, + &a->v.dirty_sectors, &a->v.cached_sectors) ?: + bch2_trans_update(trans, &iter, &a->k_i, 0); + bch2_trans_iter_exit(trans, &iter); - if (ret) - return ret; - - if (!p.ptr.cached) { - ret = bch2_bucket_backpointer_mod(trans, bucket, bp, k, insert); if (ret) return ret; + + if (!p.ptr.cached) { + ret = bch2_bucket_backpointer_mod(trans, bucket, bp, k, insert); + if (ret) + return ret; + } + } + + if (flags & BTREE_TRIGGER_GC) { + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p); + + percpu_down_read(&c->mark_lock); + struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); + bucket_lock(g); + struct bucket old = *g; + + u8 bucket_data_type = g->data_type; + int ret = __mark_pointer(trans, k, &p.ptr, *sectors, + data_type, g->gen, + &bucket_data_type, + &g->dirty_sectors, + &g->cached_sectors); + if (ret) { + bucket_unlock(g); + percpu_up_read(&c->mark_lock); + return ret; + } + + g->data_type = bucket_data_type; + struct bucket new = *g; + bucket_unlock(g); + bch2_dev_usage_update_m(c, ca, &old, &new); + percpu_up_read(&c->mark_lock); } return 0; } -static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, - struct extent_ptr_decoded p, - s64 sectors, enum bch_data_type data_type) +static int bch2_trigger_stripe_ptr(struct btree_trans *trans, + struct bkey_s_c k, + struct extent_ptr_decoded p, + enum bch_data_type data_type, + s64 sectors, unsigned flags) { - struct btree_iter iter; - struct bkey_i_stripe *s; - struct bch_replicas_padded r; - int ret = 0; + if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + struct btree_iter iter; + struct bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_stripes, POS(0, p.ec.idx), + BTREE_ITER_WITH_UPDATES, stripe); + int ret = PTR_ERR_OR_ZERO(s); + if (unlikely(ret)) { + bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans, + "pointer to nonexistent stripe %llu", + (u64) p.ec.idx); + goto err; + } - s = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_stripes, POS(0, p.ec.idx), - BTREE_ITER_WITH_UPDATES, stripe); - ret = PTR_ERR_OR_ZERO(s); - if (unlikely(ret)) { - bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans, - "pointer to nonexistent stripe %llu", - (u64) p.ec.idx); - goto err; - } + if (!bch2_ptr_matches_stripe(&s->v, p)) { + bch2_trans_inconsistent(trans, + "stripe pointer doesn't match stripe %llu", + (u64) p.ec.idx); + ret = -EIO; + goto err; + } - if (!bch2_ptr_matches_stripe(&s->v, p)) { - bch2_trans_inconsistent(trans, - "stripe pointer doesn't match stripe %llu", - (u64) p.ec.idx); - ret = -EIO; - goto err; + stripe_blockcount_set(&s->v, p.ec.block, + stripe_blockcount_get(&s->v, p.ec.block) + + sectors); + + struct bch_replicas_padded r; + bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i)); + r.e.data_type = data_type; + ret = bch2_update_replicas_list(trans, &r.e, sectors); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; } - stripe_blockcount_set(&s->v, p.ec.block, - stripe_blockcount_get(&s->v, p.ec.block) + - sectors); + if (flags & BTREE_TRIGGER_GC) { + struct bch_fs *c = trans->c; - bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i)); - r.e.data_type = data_type; - ret = update_replicas_list(trans, &r.e, sectors); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + + struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL); + if (!m) { + bch_err(c, "error allocating memory for gc_stripes, idx %llu", + (u64) p.ec.idx); + return -BCH_ERR_ENOMEM_mark_stripe_ptr; + } + + mutex_lock(&c->ec_stripes_heap_lock); + + if (!m || !m->alive) { + mutex_unlock(&c->ec_stripes_heap_lock); + struct printbuf buf = PRINTBUF; + bch2_bkey_val_to_text(&buf, c, k); + bch_err_ratelimited(c, "pointer to nonexistent stripe %llu\n while marking %s", + (u64) p.ec.idx, buf.buf); + printbuf_exit(&buf); + bch2_inconsistent_error(c); + return -EIO; + } + + m->block_sectors[p.ec.block] += sectors; + + struct bch_replicas_padded r = m->r; + mutex_unlock(&c->ec_stripes_heap_lock); + + r.e.data_type = data_type; + bch2_update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true); + } + + return 0; } -static int __trans_mark_extent(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, unsigned flags) +static int __trigger_extent(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, unsigned flags) { + bool gc = flags & BTREE_TRIGGER_GC; struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ -1492,11 +970,7 @@ static int __trans_mark_extent(struct btree_trans *trans, enum bch_data_type data_type = bkey_is_btree_ptr(k.k) ? BCH_DATA_btree : BCH_DATA_user; - s64 sectors = bkey_is_btree_ptr(k.k) - ? btree_sectors(c) - : k.k->size; s64 dirty_sectors = 0; - bool stale; int ret = 0; r.e.data_type = data_type; @@ -1504,21 +978,20 @@ static int __trans_mark_extent(struct btree_trans *trans, r.e.nr_required = 1; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - s64 disk_sectors = ptr_disk_sectors(sectors, p); - - if (flags & BTREE_TRIGGER_OVERWRITE) - disk_sectors = -disk_sectors; - - ret = bch2_trans_mark_pointer(trans, btree_id, level, k, p, flags); + s64 disk_sectors; + ret = bch2_trigger_pointer(trans, btree_id, level, k, p, &disk_sectors, flags); if (ret < 0) return ret; - stale = ret > 0; + bool stale = ret > 0; if (p.ptr.cached) { if (!stale) { - ret = update_cached_sectors_list(trans, p.ptr.dev, - disk_sectors); + ret = !gc + ? bch2_update_cached_sectors_list(trans, p.ptr.dev, disk_sectors) + : update_cached_sectors(c, k, p.ptr.dev, disk_sectors, 0, true); + bch2_fs_fatal_err_on(ret && gc, c, "%s(): no replicas entry while updating cached sectors", + __func__); if (ret) return ret; } @@ -1526,324 +999,122 @@ static int __trans_mark_extent(struct btree_trans *trans, dirty_sectors += disk_sectors; r.e.devs[r.e.nr_devs++] = p.ptr.dev; } else { - ret = bch2_trans_mark_stripe_ptr(trans, p, - disk_sectors, data_type); + ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags); if (ret) return ret; + /* + * There may be other dirty pointers in this extent, but + * if so they're not required for mounting if we have an + * erasure coded pointer in this extent: + */ r.e.nr_required = 0; } } - if (r.e.nr_devs) - ret = update_replicas_list(trans, &r.e, dirty_sectors); - - return ret; -} - -int bch2_trans_mark_extent(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_i *new, - unsigned flags) -{ - struct bch_fs *c = trans->c; - int mod = (int) bch2_bkey_needs_rebalance(c, bkey_i_to_s_c(new)) - - (int) bch2_bkey_needs_rebalance(c, old); + if (r.e.nr_devs) { + ret = !gc + ? bch2_update_replicas_list(trans, &r.e, dirty_sectors) + : bch2_update_replicas(c, k, &r.e, dirty_sectors, 0, true); + if (unlikely(ret && gc)) { + struct printbuf buf = PRINTBUF; - if (mod) { - int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new->k.p, mod > 0); + bch2_bkey_val_to_text(&buf, c, k); + bch2_fs_fatal_error(c, "%s(): no replicas entry for %s", __func__, buf.buf); + printbuf_exit(&buf); + } if (ret) return ret; } - return trigger_run_overwrite_then_insert(__trans_mark_extent, trans, btree_id, level, old, new, flags); -} - -static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, - struct bkey_s_c_stripe s, - unsigned idx, bool deleting) -{ - struct bch_fs *c = trans->c; - const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; - struct btree_iter iter; - struct bkey_i_alloc_v4 *a; - enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant - ? BCH_DATA_parity : 0; - s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0; - int ret = 0; - - if (deleting) - sectors = -sectors; - - a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr)); - if (IS_ERR(a)) - return PTR_ERR(a); - - ret = check_bucket_ref(trans, s.s_c, ptr, sectors, data_type, - a->v.gen, a->v.data_type, - a->v.dirty_sectors, a->v.cached_sectors); - if (ret) - goto err; - - if (!deleting) { - if (bch2_trans_inconsistent_on(a->v.stripe || - a->v.stripe_redundancy, trans, - "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)", - iter.pos.inode, iter.pos.offset, a->v.gen, - bch2_data_types[a->v.data_type], - a->v.dirty_sectors, - a->v.stripe, s.k->p.offset)) { - ret = -EIO; - goto err; - } - - if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans, - "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu", - iter.pos.inode, iter.pos.offset, a->v.gen, - bch2_data_types[a->v.data_type], - a->v.dirty_sectors, - s.k->p.offset)) { - ret = -EIO; - goto err; - } - - a->v.stripe = s.k->p.offset; - a->v.stripe_redundancy = s.v->nr_redundant; - a->v.data_type = BCH_DATA_stripe; - } else { - if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset || - a->v.stripe_redundancy != s.v->nr_redundant, trans, - "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)", - iter.pos.inode, iter.pos.offset, a->v.gen, - s.k->p.offset, a->v.stripe)) { - ret = -EIO; - goto err; - } - - a->v.stripe = 0; - a->v.stripe_redundancy = 0; - a->v.data_type = alloc_data_type(a->v, BCH_DATA_user); - } - - a->v.dirty_sectors += sectors; - if (data_type) - a->v.data_type = !deleting ? data_type : 0; - - ret = bch2_trans_update(trans, &iter, &a->k_i, 0); - if (ret) - goto err; -err: - bch2_trans_iter_exit(trans, &iter); - return ret; + return 0; } -int bch2_trans_mark_stripe(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_i *new, - unsigned flags) +int bch2_trigger_extent(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s new, + unsigned flags) { - const struct bch_stripe *old_s = NULL; - struct bch_stripe *new_s = NULL; - struct bch_replicas_padded r; - unsigned i, nr_blocks; - int ret = 0; - - if (old.k->type == KEY_TYPE_stripe) - old_s = bkey_s_c_to_stripe(old).v; - if (new->k.type == KEY_TYPE_stripe) - new_s = &bkey_i_to_stripe(new)->v; - - /* - * If the pointers aren't changing, we don't need to do anything: - */ - if (new_s && old_s && - new_s->nr_blocks == old_s->nr_blocks && - new_s->nr_redundant == old_s->nr_redundant && - !memcmp(old_s->ptrs, new_s->ptrs, - new_s->nr_blocks * sizeof(struct bch_extent_ptr))) + struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c); + struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old); + unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start; + unsigned old_ptrs_bytes = (void *) old_ptrs.end - (void *) old_ptrs.start; + + /* if pointers aren't changing - nothing to do: */ + if (new_ptrs_bytes == old_ptrs_bytes && + !memcmp(new_ptrs.start, + old_ptrs.start, + new_ptrs_bytes)) return 0; - BUG_ON(new_s && old_s && - (new_s->nr_blocks != old_s->nr_blocks || - new_s->nr_redundant != old_s->nr_redundant)); + if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + struct bch_fs *c = trans->c; + int mod = (int) bch2_bkey_needs_rebalance(c, new.s_c) - + (int) bch2_bkey_needs_rebalance(c, old); - nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks; - - if (new_s) { - s64 sectors = le16_to_cpu(new_s->sectors); - - bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(new)); - ret = update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant); - if (ret) - return ret; - } - - if (old_s) { - s64 sectors = -((s64) le16_to_cpu(old_s->sectors)); - - bch2_bkey_to_replicas(&r.e, old); - ret = update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant); - if (ret) - return ret; - } - - for (i = 0; i < nr_blocks; i++) { - if (new_s && old_s && - !memcmp(&new_s->ptrs[i], - &old_s->ptrs[i], - sizeof(new_s->ptrs[i]))) - continue; - - if (new_s) { - ret = bch2_trans_mark_stripe_bucket(trans, - bkey_i_to_s_c_stripe(new), i, false); - if (ret) - break; - } - - if (old_s) { - ret = bch2_trans_mark_stripe_bucket(trans, - bkey_s_c_to_stripe(old), i, true); + if (mod) { + int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new.k->p, mod > 0); if (ret) - break; + return ret; } } - return ret; + if (flags & (BTREE_TRIGGER_TRANSACTIONAL|BTREE_TRIGGER_GC)) + return trigger_run_overwrite_then_insert(__trigger_extent, trans, btree_id, level, old, new, flags); + + return 0; } -static int __trans_mark_reservation(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, unsigned flags) +/* KEY_TYPE_reservation */ + +static int __trigger_reservation(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, unsigned flags) { + struct bch_fs *c = trans->c; unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; - s64 sectors = (s64) k.k->size; - struct replicas_delta_list *d; - int ret; + s64 sectors = (s64) k.k->size * replicas; if (flags & BTREE_TRIGGER_OVERWRITE) sectors = -sectors; - sectors *= replicas; - - ret = bch2_replicas_deltas_realloc(trans, 0); - if (ret) - return ret; - d = trans->fs_usage_deltas; - replicas = clamp_t(unsigned, replicas, 1, - ARRAY_SIZE(d->persistent_reserved)); - - d->persistent_reserved[replicas - 1] += sectors; - return 0; -} - -int bch2_trans_mark_reservation(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, - struct bkey_i *new, - unsigned flags) -{ - return trigger_run_overwrite_then_insert(__trans_mark_reservation, trans, btree_id, level, old, new, flags); -} - -static int trans_mark_reflink_p_segment(struct btree_trans *trans, - struct bkey_s_c_reflink_p p, - u64 *idx, unsigned flags) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_i *k; - __le64 *refcount; - int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; - struct printbuf buf = PRINTBUF; - int ret; + if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + int ret = bch2_replicas_deltas_realloc(trans, 0); + if (ret) + return ret; - k = bch2_bkey_get_mut_noupdate(trans, &iter, - BTREE_ID_reflink, POS(0, *idx), - BTREE_ITER_WITH_UPDATES); - ret = PTR_ERR_OR_ZERO(k); - if (ret) - goto err; + struct replicas_delta_list *d = trans->fs_usage_deltas; + replicas = min(replicas, ARRAY_SIZE(d->persistent_reserved)); - refcount = bkey_refcount(k); - if (!refcount) { - bch2_bkey_val_to_text(&buf, c, p.s_c); - bch2_trans_inconsistent(trans, - "nonexistent indirect extent at %llu while marking\n %s", - *idx, buf.buf); - ret = -EIO; - goto err; + d->persistent_reserved[replicas - 1] += sectors; } - if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) { - bch2_bkey_val_to_text(&buf, c, p.s_c); - bch2_trans_inconsistent(trans, - "indirect extent refcount underflow at %llu while marking\n %s", - *idx, buf.buf); - ret = -EIO; - goto err; - } + if (flags & BTREE_TRIGGER_GC) { + percpu_down_read(&c->mark_lock); + preempt_disable(); - if (flags & BTREE_TRIGGER_INSERT) { - struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; - u64 pad; + struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage_gc); - pad = max_t(s64, le32_to_cpu(v->front_pad), - le64_to_cpu(v->idx) - bkey_start_offset(&k->k)); - BUG_ON(pad > U32_MAX); - v->front_pad = cpu_to_le32(pad); + replicas = min(replicas, ARRAY_SIZE(fs_usage->persistent_reserved)); + fs_usage->b.reserved += sectors; + fs_usage->persistent_reserved[replicas - 1] += sectors; - pad = max_t(s64, le32_to_cpu(v->back_pad), - k->k.p.offset - p.k->size - le64_to_cpu(v->idx)); - BUG_ON(pad > U32_MAX); - v->back_pad = cpu_to_le32(pad); + preempt_enable(); + percpu_up_read(&c->mark_lock); } - le64_add_cpu(refcount, add); - - bch2_btree_iter_set_pos_to_extent_start(&iter); - ret = bch2_trans_update(trans, &iter, k, 0); - if (ret) - goto err; - - *idx = k->k.p.offset; -err: - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); - return ret; + return 0; } -static int __trans_mark_reflink_p(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, unsigned flags) +int bch2_trigger_reservation(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s new, + unsigned flags) { - struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - u64 idx, end_idx; - int ret = 0; - - idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); - end_idx = le64_to_cpu(p.v->idx) + p.k->size + - le32_to_cpu(p.v->back_pad); - - while (idx < end_idx && !ret) - ret = trans_mark_reflink_p_segment(trans, p, &idx, flags); - return ret; + return trigger_run_overwrite_then_insert(__trigger_reservation, trans, btree_id, level, old, new, flags); } -int bch2_trans_mark_reflink_p(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, - struct bkey_i *new, - unsigned flags) -{ - if (flags & BTREE_TRIGGER_INSERT) { - struct bch_reflink_p *v = &bkey_i_to_reflink_p(new)->v; - - v->front_pad = v->back_pad = 0; - } - - return trigger_run_overwrite_then_insert(__trans_mark_reflink_p, trans, btree_id, level, old, new, flags); -} +/* Mark superblocks: */ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *ca, size_t b, @@ -1871,9 +1142,9 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", iter.pos.inode, iter.pos.offset, a->v.gen, - bch2_data_types[a->v.data_type], - bch2_data_types[type], - bch2_data_types[type]); + bch2_data_type_str(a->v.data_type), + bch2_data_type_str(type), + bch2_data_type_str(type)); ret = -EIO; goto err; } @@ -1974,17 +1245,13 @@ int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca) { int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(trans, ca)); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } int bch2_trans_mark_dev_sbs(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i; - - for_each_online_member(ca, c, i) { + for_each_online_member(c, ca) { int ret = bch2_trans_mark_dev_sb(c, ca); if (ret) { percpu_ref_put(&ca->ref); diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 21f6cb3569..6387e039f7 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -203,6 +203,7 @@ static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) } void bch2_dev_usage_init(struct bch_dev *); +void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev_usage *); static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark) { @@ -301,6 +302,12 @@ u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage_online *); struct bch_fs_usage_short bch2_fs_usage_read_short(struct bch_fs *); +void bch2_dev_usage_update(struct bch_fs *, struct bch_dev *, + const struct bch_alloc_v4 *, + const struct bch_alloc_v4 *, u64, bool); +void bch2_dev_usage_update_m(struct bch_fs *, struct bch_dev *, + struct bucket *, struct bucket *); + /* key/bucket marking: */ static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, @@ -315,43 +322,41 @@ static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, : c->usage[journal_seq & JOURNAL_BUF_MASK]); } +int bch2_update_replicas(struct bch_fs *, struct bkey_s_c, + struct bch_replicas_entry_v1 *, s64, + unsigned, bool); +int bch2_update_replicas_list(struct btree_trans *, + struct bch_replicas_entry_v1 *, s64); +int bch2_update_cached_sectors_list(struct btree_trans *, unsigned, s64); int bch2_replicas_deltas_realloc(struct btree_trans *, unsigned); void bch2_fs_usage_initialize(struct bch_fs *); +int bch2_check_bucket_ref(struct btree_trans *, struct bkey_s_c, + const struct bch_extent_ptr *, + s64, enum bch_data_type, u8, u8, u32); + int bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, size_t, enum bch_data_type, unsigned, struct gc_pos, unsigned); -int bch2_mark_alloc(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s_c, unsigned); -int bch2_mark_extent(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s_c, unsigned); -int bch2_mark_stripe(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s_c, unsigned); -int bch2_mark_reservation(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s_c, unsigned); -int bch2_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s_c, unsigned); - -int bch2_trans_mark_extent(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); -int bch2_trans_mark_stripe(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); -int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); -int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); - -#define mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\ +int bch2_trigger_extent(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s, unsigned); +int bch2_trigger_reservation(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s, unsigned); + +#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\ ({ \ int ret = 0; \ \ if (_old.k->type) \ ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_INSERT); \ if (!ret && _new.k->type) \ - ret = _fn(_trans, _btree_id, _level, _new, _flags & ~BTREE_TRIGGER_OVERWRITE); \ + ret = _fn(_trans, _btree_id, _level, _new.s_c, _flags & ~BTREE_TRIGGER_OVERWRITE);\ ret; \ }) -#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags) \ - mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, bkey_i_to_s_c(_new), _flags) +void bch2_trans_account_disk_usage_change(struct btree_trans *); void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *); int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); @@ -382,6 +387,21 @@ static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) return false; } +static inline const char *bch2_data_type_str(enum bch_data_type type) +{ + return type < BCH_DATA_NR + ? __bch2_data_types[type] + : "(invalid data type)"; +} + +static inline void bch2_prt_data_type(struct printbuf *out, enum bch_data_type type) +{ + if (type < BCH_DATA_NR) + prt_str(out, __bch2_data_types[type]); + else + prt_printf(out, "(invalid data type %u)", type); +} + /* disk reservations: */ static inline void bch2_disk_reservation_put(struct bch_fs *c, diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 2a9dab9006..6a31740222 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -33,8 +33,6 @@ struct bucket_gens { }; struct bch_dev_usage { - u64 buckets_ec; - struct { u64 buckets; u64 sectors; /* _compressed_ sectors: */ @@ -47,23 +45,18 @@ struct bch_dev_usage { } d[BCH_DATA_NR]; }; -struct bch_fs_usage { - /* all fields are in units of 512 byte sectors: */ +struct bch_fs_usage_base { u64 hidden; u64 btree; u64 data; u64 cached; u64 reserved; u64 nr_inodes; +}; - /* XXX: add stats for compression ratio */ -#if 0 - u64 uncompressed; - u64 compressed; -#endif - - /* broken out: */ - +struct bch_fs_usage { + /* all fields are in units of 512 byte sectors: */ + struct bch_fs_usage_base b; u64 persistent_reserved[BCH_REPLICAS_MAX]; u64 replicas[]; }; diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 64000c8da5..226b39c176 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -7,22 +7,27 @@ #include "chardev.h" #include "journal.h" #include "move.h" +#include "recovery.h" #include "replicas.h" #include "super.h" #include "super-io.h" +#include "thread_with_file.h" -#include <linux/anon_inodes.h> #include <linux/cdev.h> #include <linux/device.h> -#include <linux/file.h> #include <linux/fs.h> #include <linux/ioctl.h> -#include <linux/kthread.h> #include <linux/major.h> #include <linux/sched/task.h> #include <linux/slab.h> #include <linux/uaccess.h> +__must_check +static int copy_to_user_errcode(void __user *to, const void *from, unsigned long n) +{ + return copy_to_user(to, from, n) ? -EFAULT : 0; +} + /* returns with ref on ca->ref */ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev, unsigned flags) @@ -132,8 +137,106 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg } #endif +struct fsck_thread { + struct thread_with_stdio thr; + struct bch_fs *c; + char **devs; + size_t nr_devs; + struct bch_opts opts; +}; + +static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr) +{ + struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr); + if (thr->devs) + for (size_t i = 0; i < thr->nr_devs; i++) + kfree(thr->devs[i]); + kfree(thr->devs); + kfree(thr); +} + +static int bch2_fsck_offline_thread_fn(void *arg) +{ + struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr); + struct bch_fs *c = bch2_fs_open(thr->devs, thr->nr_devs, thr->opts); + + thr->thr.thr.ret = PTR_ERR_OR_ZERO(c); + if (!thr->thr.thr.ret) + bch2_fs_stop(c); + + thread_with_stdio_done(&thr->thr); + return 0; +} + +static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) +{ + struct bch_ioctl_fsck_offline arg; + struct fsck_thread *thr = NULL; + u64 *devs = NULL; + long ret = 0; + + if (copy_from_user(&arg, user_arg, sizeof(arg))) + return -EFAULT; + + if (arg.flags) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!(devs = kcalloc(arg.nr_devs, sizeof(*devs), GFP_KERNEL)) || + !(thr = kzalloc(sizeof(*thr), GFP_KERNEL)) || + !(thr->devs = kcalloc(arg.nr_devs, sizeof(*thr->devs), GFP_KERNEL))) { + ret = -ENOMEM; + goto err; + } + + thr->opts = bch2_opts_empty(); + thr->nr_devs = arg.nr_devs; + + if (copy_from_user(devs, &user_arg->devs[0], + array_size(sizeof(user_arg->devs[0]), arg.nr_devs))) { + ret = -EINVAL; + goto err; + } + + for (size_t i = 0; i < arg.nr_devs; i++) { + thr->devs[i] = strndup_user((char __user *)(unsigned long) devs[i], PATH_MAX); + ret = PTR_ERR_OR_ZERO(thr->devs[i]); + if (ret) + goto err; + } + + if (arg.opts) { + char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); + + ret = PTR_ERR_OR_ZERO(optstr) ?: + bch2_parse_mount_opts(NULL, &thr->opts, optstr); + kfree(optstr); + + if (ret) + goto err; + } + + opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio); + + ret = bch2_run_thread_with_stdio(&thr->thr, + bch2_fsck_thread_exit, + bch2_fsck_offline_thread_fn); +err: + if (ret < 0) { + if (thr) + bch2_fsck_thread_exit(&thr->thr); + pr_err("ret %s", bch2_err_str(ret)); + } + kfree(devs); + return ret; +} + static long bch2_global_ioctl(unsigned cmd, void __user *arg) { + long ret; + switch (cmd) { #if 0 case BCH_IOCTL_ASSEMBLE: @@ -141,18 +244,25 @@ static long bch2_global_ioctl(unsigned cmd, void __user *arg) case BCH_IOCTL_INCREMENTAL: return bch2_ioctl_incremental(arg); #endif + case BCH_IOCTL_FSCK_OFFLINE: { + ret = bch2_ioctl_fsck_offline(arg); + break; + } default: - return -ENOTTY; + ret = -ENOTTY; + break; } + + if (ret < 0) + ret = bch2_err_class(ret); + return ret; } static long bch2_ioctl_query_uuid(struct bch_fs *c, struct bch_ioctl_query_uuid __user *user_arg) { - if (copy_to_user(&user_arg->uuid, &c->sb.user_uuid, - sizeof(c->sb.user_uuid))) - return -EFAULT; - return 0; + return copy_to_user_errcode(&user_arg->uuid, &c->sb.user_uuid, + sizeof(c->sb.user_uuid)); } #if 0 @@ -295,31 +405,27 @@ static long bch2_ioctl_disk_set_state(struct bch_fs *c, } struct bch_data_ctx { + struct thread_with_file thr; + struct bch_fs *c; struct bch_ioctl_data arg; struct bch_move_stats stats; - - int ret; - - struct task_struct *thread; }; static int bch2_data_thread(void *arg) { - struct bch_data_ctx *ctx = arg; - - ctx->ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg); + struct bch_data_ctx *ctx = container_of(arg, struct bch_data_ctx, thr); + ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg); ctx->stats.data_type = U8_MAX; return 0; } static int bch2_data_job_release(struct inode *inode, struct file *file) { - struct bch_data_ctx *ctx = file->private_data; + struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr); - kthread_stop(ctx->thread); - put_task_struct(ctx->thread); + bch2_thread_with_file_exit(&ctx->thr); kfree(ctx); return 0; } @@ -327,7 +433,7 @@ static int bch2_data_job_release(struct inode *inode, struct file *file) static ssize_t bch2_data_job_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { - struct bch_data_ctx *ctx = file->private_data; + struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr); struct bch_fs *c = ctx->c; struct bch_ioctl_data_event e = { .type = BCH_DATA_EVENT_PROGRESS, @@ -341,10 +447,7 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf, if (len < sizeof(e)) return -EINVAL; - if (copy_to_user(buf, &e, sizeof(e))) - return -EFAULT; - - return sizeof(e); + return copy_to_user_errcode(buf, &e, sizeof(e)) ?: sizeof(e); } static const struct file_operations bcachefs_data_ops = { @@ -356,10 +459,8 @@ static const struct file_operations bcachefs_data_ops = { static long bch2_ioctl_data(struct bch_fs *c, struct bch_ioctl_data arg) { - struct bch_data_ctx *ctx = NULL; - struct file *file = NULL; - unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK; - int ret, fd = -1; + struct bch_data_ctx *ctx; + int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -374,35 +475,11 @@ static long bch2_ioctl_data(struct bch_fs *c, ctx->c = c; ctx->arg = arg; - ctx->thread = kthread_create(bch2_data_thread, ctx, - "bch-data/%s", c->name); - if (IS_ERR(ctx->thread)) { - ret = PTR_ERR(ctx->thread); - goto err; - } - - ret = get_unused_fd_flags(flags); + ret = bch2_run_thread_with_file(&ctx->thr, + &bcachefs_data_ops, + bch2_data_thread); if (ret < 0) - goto err; - fd = ret; - - file = anon_inode_getfile("[bcachefs]", &bcachefs_data_ops, ctx, flags); - if (IS_ERR(file)) { - ret = PTR_ERR(file); - goto err; - } - - get_task_struct(ctx->thread); - wake_up_process(ctx->thread); - fd_install(fd, file); - - return fd; -err: - if (fd >= 0) - put_unused_fd(fd); - if (!IS_ERR_OR_NULL(ctx->thread)) - kthread_stop(ctx->thread); - kfree(ctx); + kfree(ctx); return ret; } @@ -416,7 +493,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, unsigned i; int ret = 0; - if (!test_bit(BCH_FS_STARTED, &c->flags)) + if (!test_bit(BCH_FS_started, &c->flags)) return -EINVAL; if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes)) @@ -443,7 +520,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, dst_end = (void *) arg->replicas + replica_entries_bytes; for (i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry *src_e = + struct bch_replicas_entry_v1 *src_e = cpu_replicas_entry(&c->replicas, i); /* check that we have enough space for one replicas entry */ @@ -473,14 +550,15 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, if (ret) goto err; - if (copy_to_user(user_arg, arg, - sizeof(*arg) + arg->replica_entries_bytes)) - ret = -EFAULT; + + ret = copy_to_user_errcode(user_arg, arg, + sizeof(*arg) + arg->replica_entries_bytes); err: kfree(arg); return ret; } +/* obsolete, didn't allow for new data types: */ static long bch2_ioctl_dev_usage(struct bch_fs *c, struct bch_ioctl_dev_usage __user *user_arg) { @@ -489,7 +567,7 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, struct bch_dev *ca; unsigned i; - if (!test_bit(BCH_FS_STARTED, &c->flags)) + if (!test_bit(BCH_FS_started, &c->flags)) return -EINVAL; if (copy_from_user(&arg, user_arg, sizeof(arg))) @@ -510,7 +588,6 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, arg.state = ca->mi.state; arg.bucket_size = ca->mi.bucket_size; arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; - arg.buckets_ec = src.buckets_ec; for (i = 0; i < BCH_DATA_NR; i++) { arg.d[i].buckets = src.d[i].buckets; @@ -520,10 +597,58 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, percpu_ref_put(&ca->ref); - if (copy_to_user(user_arg, &arg, sizeof(arg))) + return copy_to_user_errcode(user_arg, &arg, sizeof(arg)); +} + +static long bch2_ioctl_dev_usage_v2(struct bch_fs *c, + struct bch_ioctl_dev_usage_v2 __user *user_arg) +{ + struct bch_ioctl_dev_usage_v2 arg; + struct bch_dev_usage src; + struct bch_dev *ca; + int ret = 0; + + if (!test_bit(BCH_FS_started, &c->flags)) + return -EINVAL; + + if (copy_from_user(&arg, user_arg, sizeof(arg))) return -EFAULT; - return 0; + if ((arg.flags & ~BCH_BY_INDEX) || + arg.pad[0] || + arg.pad[1] || + arg.pad[2]) + return -EINVAL; + + ca = bch2_device_lookup(c, arg.dev, arg.flags); + if (IS_ERR(ca)) + return PTR_ERR(ca); + + src = bch2_dev_usage_read(ca); + + arg.state = ca->mi.state; + arg.bucket_size = ca->mi.bucket_size; + arg.nr_data_types = min(arg.nr_data_types, BCH_DATA_NR); + arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; + + ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg)); + if (ret) + goto err; + + for (unsigned i = 0; i < arg.nr_data_types; i++) { + struct bch_ioctl_dev_usage_type t = { + .buckets = src.d[i].buckets, + .sectors = src.d[i].sectors, + .fragmented = src.d[i].fragmented, + }; + + ret = copy_to_user_errcode(&user_arg->d[i], &t, sizeof(t)); + if (ret) + goto err; + } +err: + percpu_ref_put(&ca->ref); + return ret; } static long bch2_ioctl_read_super(struct bch_fs *c, @@ -560,9 +685,8 @@ static long bch2_ioctl_read_super(struct bch_fs *c, goto err; } - if (copy_to_user((void __user *)(unsigned long)arg.sb, sb, - vstruct_bytes(sb))) - ret = -EFAULT; + ret = copy_to_user_errcode((void __user *)(unsigned long)arg.sb, sb, + vstruct_bytes(sb)); err: if (!IS_ERR_OR_NULL(ca)) percpu_ref_put(&ca->ref); @@ -574,8 +698,6 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c, struct bch_ioctl_disk_get_idx arg) { dev_t dev = huge_decode_dev(arg.dev); - struct bch_dev *ca; - unsigned i; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -583,10 +705,10 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c, if (!dev) return -EINVAL; - for_each_online_member(ca, c, i) + for_each_online_member(c, ca) if (ca->dev == dev) { percpu_ref_put(&ca->io_ref); - return i; + return ca->dev_idx; } return -BCH_ERR_ENOENT_dev_idx_not_found; @@ -641,6 +763,97 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, return ret; } +static int bch2_fsck_online_thread_fn(void *arg) +{ + struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr); + struct bch_fs *c = thr->c; + + c->stdio_filter = current; + c->stdio = &thr->thr.stdio; + + /* + * XXX: can we figure out a way to do this without mucking with c->opts? + */ + unsigned old_fix_errors = c->opts.fix_errors; + if (opt_defined(thr->opts, fix_errors)) + c->opts.fix_errors = thr->opts.fix_errors; + else + c->opts.fix_errors = FSCK_FIX_ask; + + c->opts.fsck = true; + set_bit(BCH_FS_fsck_running, &c->flags); + + c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; + int ret = bch2_run_online_recovery_passes(c); + + clear_bit(BCH_FS_fsck_running, &c->flags); + bch_err_fn(c, ret); + + c->stdio = NULL; + c->stdio_filter = NULL; + c->opts.fix_errors = old_fix_errors; + + thread_with_stdio_done(&thr->thr); + + up(&c->online_fsck_mutex); + bch2_ro_ref_put(c); + return 0; +} + +static long bch2_ioctl_fsck_online(struct bch_fs *c, + struct bch_ioctl_fsck_online arg) +{ + struct fsck_thread *thr = NULL; + long ret = 0; + + if (arg.flags) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!bch2_ro_ref_tryget(c)) + return -EROFS; + + if (down_trylock(&c->online_fsck_mutex)) { + bch2_ro_ref_put(c); + return -EAGAIN; + } + + thr = kzalloc(sizeof(*thr), GFP_KERNEL); + if (!thr) { + ret = -ENOMEM; + goto err; + } + + thr->c = c; + thr->opts = bch2_opts_empty(); + + if (arg.opts) { + char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); + + ret = PTR_ERR_OR_ZERO(optstr) ?: + bch2_parse_mount_opts(c, &thr->opts, optstr); + kfree(optstr); + + if (ret) + goto err; + } + + ret = bch2_run_thread_with_stdio(&thr->thr, + bch2_fsck_thread_exit, + bch2_fsck_online_thread_fn); +err: + if (ret < 0) { + bch_err_fn(c, ret); + if (thr) + bch2_fsck_thread_exit(&thr->thr); + up(&c->online_fsck_mutex); + bch2_ro_ref_put(c); + } + return ret; +} + #define BCH_IOCTL(_name, _argtype) \ do { \ _argtype i; \ @@ -662,6 +875,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) return bch2_ioctl_fs_usage(c, arg); case BCH_IOCTL_DEV_USAGE: return bch2_ioctl_dev_usage(c, arg); + case BCH_IOCTL_DEV_USAGE_V2: + return bch2_ioctl_dev_usage_v2(c, arg); #if 0 case BCH_IOCTL_START: BCH_IOCTL(start, struct bch_ioctl_start); @@ -674,7 +889,7 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx); } - if (!test_bit(BCH_FS_STARTED, &c->flags)) + if (!test_bit(BCH_FS_started, &c->flags)) return -EINVAL; switch (cmd) { @@ -694,7 +909,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize); case BCH_IOCTL_DISK_RESIZE_JOURNAL: BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal); - + case BCH_IOCTL_FSCK_ONLINE: + BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online); default: return -ENOTTY; } diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 13998388c5..1b8c2c1016 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -45,6 +45,29 @@ struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce, bch2_checksum(_c, _type, _nonce, _start, vstruct_end(_i) - _start);\ }) +static inline void bch2_csum_to_text(struct printbuf *out, + enum bch_csum_type type, + struct bch_csum csum) +{ + const u8 *p = (u8 *) &csum; + unsigned bytes = type < BCH_CSUM_NR ? bch_crc_bytes[type] : 16; + + for (unsigned i = 0; i < bytes; i++) + prt_hex_byte(out, p[i]); +} + +static inline void bch2_csum_err_msg(struct printbuf *out, + enum bch_csum_type type, + struct bch_csum expected, + struct bch_csum got) +{ + prt_printf(out, "checksum error: got "); + bch2_csum_to_text(out, type, got); + prt_str(out, " should be "); + bch2_csum_to_text(out, type, expected); + prt_printf(out, " type %s", bch2_csum_types[type]); +} + int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); int bch2_request_key(struct bch_sb *, struct bch_key *); #ifndef __KERNEL__ diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 51af8ea230..33df8cf86b 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -572,10 +572,6 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) ZSTD_parameters params = zstd_get_params(zstd_max_clevel(), c->opts.encoded_extent_max); - /* - * ZSTD is lying: if we allocate the size of the workspace it says it - * requires, it returns memory allocation errors - */ c->zstd_workspace_size = zstd_cctx_workspace_bound(¶ms.cParams); struct { diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h index 607fd5e232..58c2eb4557 100644 --- a/fs/bcachefs/compress.h +++ b/fs/bcachefs/compress.h @@ -47,6 +47,14 @@ static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v) return __bch2_compression_opt_to_type[bch2_compression_decode(v).type]; } +static inline void bch2_prt_compression_type(struct printbuf *out, enum bch_compression_type type) +{ + if (type < BCH_COMPRESSION_TYPE_NR) + prt_str(out, __bch2_compression_types[type]); + else + prt_printf(out, "(invalid compression type %u)", type); +} + int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, struct bch_extent_crc_unpacked *); int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h index e367c625f0..4b340d13ca 100644 --- a/fs/bcachefs/darray.h +++ b/fs/bcachefs/darray.h @@ -20,6 +20,7 @@ struct { \ #define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0) typedef DARRAY(char) darray_char; +typedef DARRAY(char *) darray_str; int __bch2_darray_resize(darray_char *, size_t, size_t, gfp_t); @@ -81,11 +82,14 @@ static inline int __darray_make_room(darray_char *d, size_t t_size, size_t more, #define darray_remove_item(_d, _pos) \ array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data) +#define __darray_for_each(_d, _i) \ + for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++) + #define darray_for_each(_d, _i) \ - for (_i = (_d).data; _i < (_d).data + (_d).nr; _i++) + for (typeof(&(_d).data[0]) _i = (_d).data; _i < (_d).data + (_d).nr; _i++) #define darray_for_each_reverse(_d, _i) \ - for (_i = (_d).data + (_d).nr - 1; _i >= (_d).data; --_i) + for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data; --_i) #define darray_init(_d) \ do { \ diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 37d6ecae8c..4150feca42 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -267,19 +267,31 @@ restart_drop_extra_replicas: goto out; } + if (trace_data_update_enabled()) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "\nold: "); + bch2_bkey_val_to_text(&buf, c, old); + prt_str(&buf, "\nk: "); + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, "\nnew: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + + trace_data_update(c, buf.buf); + printbuf_exit(&buf); + } + ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, bkey_start_pos(&insert->k)) ?: bch2_insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, insert->k.p) ?: - bch2_bkey_set_needs_rebalance(c, insert, - op->opts.background_target, - op->opts.background_compression) ?: + bch2_bkey_set_needs_rebalance(c, insert, &op->opts) ?: bch2_trans_update(trans, &iter, insert, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: bch2_trans_commit(trans, &op->res, NULL, - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL| + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc| m->data_opts.btree_insert_flags); if (!ret) { bch2_btree_iter_set_pos(&iter, next_pos); @@ -300,14 +312,14 @@ next: } continue; nowork: - if (m->stats && m->stats) { + if (m->stats) { BUG_ON(k.k->p.offset <= iter.pos.offset); atomic64_inc(&m->stats->keys_raced); atomic64_add(k.k->p.offset - iter.pos.offset, &m->stats->sectors_raced); } - this_cpu_inc(c->counters[BCH_COUNTER_move_extent_fail]); + count_event(c, move_extent_fail); bch2_btree_iter_advance(&iter); goto next; @@ -342,7 +354,6 @@ void bch2_data_update_exit(struct data_update *update) struct bch_fs *c = update->op.c; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(update->k.k)); - const struct bch_extent_ptr *ptr; bkey_for_each_ptr(ptrs, ptr) { if (c->opts.nocow_enabled) @@ -363,7 +374,6 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, struct bio *bio = &update->op.wbio.bio; struct bkey_i_extent *e; struct write_point *wp; - struct bch_extent_ptr *ptr; struct closure cl; struct btree_iter iter; struct bkey_s_c k; @@ -404,6 +414,8 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, continue; } + bch_err_fn_ratelimited(c, ret); + if (ret) return; @@ -476,7 +488,7 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, return bch2_trans_relock(trans) ?: bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: - bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); } int bch2_data_update_init(struct btree_trans *trans, @@ -493,7 +505,6 @@ int bch2_data_update_init(struct btree_trans *trans, struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; - const struct bch_extent_ptr *ptr; unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas; unsigned ptrs_locked = 0; int ret = 0; @@ -516,7 +527,7 @@ int bch2_data_update_init(struct btree_trans *trans, BCH_WRITE_DATA_ENCODED| BCH_WRITE_MOVE| m->data_opts.write_flags; - m->op.compression_opt = io_opts.background_compression ?: io_opts.compression; + m->op.compression_opt = background_compression(io_opts); m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; bkey_for_each_ptr(ptrs, ptr) @@ -639,7 +650,6 @@ done: void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; unsigned i = 0; bkey_for_each_ptr(ptrs, ptr) { diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 57c5128db1..7bdba8507f 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -44,19 +44,19 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, return false; bio = bio_alloc_bioset(ca->disk_sb.bdev, - buf_pages(n_sorted, btree_bytes(c)), + buf_pages(n_sorted, btree_buf_bytes(b)), REQ_OP_READ|REQ_META, GFP_NOFS, &c->btree_bio); bio->bi_iter.bi_sector = pick.ptr.offset; - bch2_bio_map(bio, n_sorted, btree_bytes(c)); + bch2_bio_map(bio, n_sorted, btree_buf_bytes(b)); submit_bio_wait(bio); bio_put(bio); percpu_ref_put(&ca->io_ref); - memcpy(n_ondisk, n_sorted, btree_bytes(c)); + memcpy(n_ondisk, n_sorted, btree_buf_bytes(b)); v->written = 0; if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error) @@ -137,7 +137,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) mutex_lock(&c->verify_lock); if (!c->verify_ondisk) { - c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL); + c->verify_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL); if (!c->verify_ondisk) goto out; } @@ -199,19 +199,19 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, return; } - n_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL); + n_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL); if (!n_ondisk) { prt_printf(out, "memory allocation failure\n"); goto out; } bio = bio_alloc_bioset(ca->disk_sb.bdev, - buf_pages(n_ondisk, btree_bytes(c)), + buf_pages(n_ondisk, btree_buf_bytes(b)), REQ_OP_READ|REQ_META, GFP_NOFS, &c->btree_bio); bio->bi_iter.bi_sector = pick.ptr.offset; - bch2_bio_map(bio, n_ondisk, btree_bytes(c)); + bch2_bio_map(bio, n_ondisk, btree_buf_bytes(b)); ret = submit_bio_wait(bio); if (ret) { @@ -293,7 +293,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, out: if (bio) bio_put(bio); - kvpfree(n_ondisk, btree_bytes(c)); + kvpfree(n_ondisk, btree_buf_bytes(b)); percpu_ref_put(&ca->io_ref); } @@ -366,35 +366,23 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; - ssize_t ret; i->ubuf = buf; i->size = size; i->ret = 0; - ret = flush_buf(i); - if (ret) - return ret; - - trans = bch2_trans_get(i->c); - ret = for_each_btree_key2(trans, iter, i->id, i->from, - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ({ - bch2_bkey_val_to_text(&i->buf, i->c, k); - prt_newline(&i->buf); - drop_locks_do(trans, flush_buf(i)); - })); - i->from = iter.pos; - - bch2_trans_put(trans); - - if (!ret) - ret = flush_buf(i); - - return ret ?: i->ret; + return flush_buf(i) ?: + bch2_trans_run(i->c, + for_each_btree_key(trans, iter, i->id, i->from, + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ({ + bch2_bkey_val_to_text(&i->buf, i->c, k); + prt_newline(&i->buf); + bch2_trans_unlock(trans); + i->from = bpos_successor(iter.pos); + flush_buf(i); + }))) ?: + i->ret; } static const struct file_operations btree_debug_ops = { @@ -462,44 +450,32 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; - ssize_t ret; i->ubuf = buf; i->size = size; i->ret = 0; - ret = flush_buf(i); - if (ret) - return ret; - - trans = bch2_trans_get(i->c); - - ret = for_each_btree_key2(trans, iter, i->id, i->from, - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ({ - struct btree_path_level *l = &iter.path->l[0]; - struct bkey_packed *_k = - bch2_btree_node_iter_peek(&l->iter, l->b); - - if (bpos_gt(l->b->key.k.p, i->prev_node)) { - bch2_btree_node_to_text(&i->buf, i->c, l->b); - i->prev_node = l->b->key.k.p; - } - - bch2_bfloat_to_text(&i->buf, l->b, _k); - drop_locks_do(trans, flush_buf(i)); - })); - i->from = iter.pos; - - bch2_trans_put(trans); - - if (!ret) - ret = flush_buf(i); - - return ret ?: i->ret; + return flush_buf(i) ?: + bch2_trans_run(i->c, + for_each_btree_key(trans, iter, i->id, i->from, + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ({ + struct btree_path_level *l = + &btree_iter_path(trans, &iter)->l[0]; + struct bkey_packed *_k = + bch2_btree_node_iter_peek(&l->iter, l->b); + + if (bpos_gt(l->b->key.k.p, i->prev_node)) { + bch2_btree_node_to_text(&i->buf, i->c, l->b); + i->prev_node = l->b->key.k.p; + } + + bch2_bfloat_to_text(&i->buf, l->b, _k); + bch2_trans_unlock(trans); + i->from = bpos_successor(iter.pos); + flush_buf(i); + }))) ?: + i->ret; } static const struct file_operations bfloat_failed_debug_ops = { @@ -616,7 +592,6 @@ static const struct file_operations cached_btree_nodes_ops = { .read = bch2_cached_btree_nodes_read, }; -#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { @@ -632,7 +607,9 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, restart: seqmutex_lock(&c->btree_trans_lock); list_for_each_entry(trans, &c->btree_trans_list, list) { - if (trans->locking_wait.task->pid <= i->iter) + struct task_struct *task = READ_ONCE(trans->locking_wait.task); + + if (!task || task->pid <= i->iter) continue; closure_get(&trans->ref); @@ -650,11 +627,11 @@ restart: prt_printf(&i->buf, "backtrace:"); prt_newline(&i->buf); printbuf_indent_add(&i->buf, 2); - bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task); + bch2_prt_task_backtrace(&i->buf, task, 0, GFP_KERNEL); printbuf_indent_sub(&i->buf, 2); prt_newline(&i->buf); - i->iter = trans->locking_wait.task->pid; + i->iter = task->pid; closure_put(&trans->ref); @@ -678,7 +655,6 @@ static const struct file_operations btree_transactions_ops = { .release = bch2_dump_release, .read = bch2_btree_transactions_read, }; -#endif /* CONFIG_BCACHEFS_DEBUG_TRANSACTIONS */ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) @@ -717,7 +693,7 @@ static const struct file_operations journal_pins_ops = { .read = bch2_journal_pins_read, }; -static int lock_held_stats_open(struct inode *inode, struct file *file) +static int btree_transaction_stats_open(struct inode *inode, struct file *file) { struct bch_fs *c = inode->i_private; struct dump_iter *i; @@ -727,7 +703,7 @@ static int lock_held_stats_open(struct inode *inode, struct file *file) if (!i) return -ENOMEM; - i->iter = 0; + i->iter = 1; i->c = c; i->buf = PRINTBUF; file->private_data = i; @@ -735,7 +711,7 @@ static int lock_held_stats_open(struct inode *inode, struct file *file) return 0; } -static int lock_held_stats_release(struct inode *inode, struct file *file) +static int btree_transaction_stats_release(struct inode *inode, struct file *file) { struct dump_iter *i = file->private_data; @@ -745,8 +721,8 @@ static int lock_held_stats_release(struct inode *inode, struct file *file) return 0; } -static ssize_t lock_held_stats_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) +static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; struct bch_fs *c = i->c; @@ -779,6 +755,13 @@ static ssize_t lock_held_stats_read(struct file *file, char __user *buf, prt_printf(&i->buf, "Max mem used: %u", s->max_mem); prt_newline(&i->buf); + prt_printf(&i->buf, "Transaction duration:"); + prt_newline(&i->buf); + + printbuf_indent_add(&i->buf, 2); + bch2_time_stats_to_text(&i->buf, &s->duration); + printbuf_indent_sub(&i->buf, 2); + if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) { prt_printf(&i->buf, "Lock hold times:"); prt_newline(&i->buf); @@ -810,11 +793,11 @@ static ssize_t lock_held_stats_read(struct file *file, char __user *buf, return i->ret; } -static const struct file_operations lock_held_stats_op = { - .owner = THIS_MODULE, - .open = lock_held_stats_open, - .release = lock_held_stats_release, - .read = lock_held_stats_read, +static const struct file_operations btree_transaction_stats_op = { + .owner = THIS_MODULE, + .open = btree_transaction_stats_open, + .release = btree_transaction_stats_release, + .read = btree_transaction_stats_read, }; static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, @@ -835,7 +818,9 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, restart: seqmutex_lock(&c->btree_trans_lock); list_for_each_entry(trans, &c->btree_trans_list, list) { - if (trans->locking_wait.task->pid <= i->iter) + struct task_struct *task = READ_ONCE(trans->locking_wait.task); + + if (!task || task->pid <= i->iter) continue; closure_get(&trans->ref); @@ -850,7 +835,7 @@ restart: bch2_check_for_deadlock(trans, &i->buf); - i->iter = trans->locking_wait.task->pid; + i->iter = task->pid; closure_put(&trans->ref); @@ -897,16 +882,14 @@ void bch2_fs_debug_init(struct bch_fs *c) debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir, c->btree_debug, &cached_btree_nodes_ops); -#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir, c->btree_debug, &btree_transactions_ops); -#endif debugfs_create_file("journal_pins", 0400, c->fs_debug_dir, c->btree_debug, &journal_pins_ops); debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir, - c, &lock_held_stats_op); + c, &btree_transaction_stats_op); debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir, c->btree_debug, &btree_deadlock_ops); @@ -947,8 +930,6 @@ void bch2_debug_exit(void) int __init bch2_debug_init(void) { - int ret = 0; - bch_debug = debugfs_create_dir("bcachefs", NULL); - return ret; + return 0; } diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 2bfff0da70..4ae1e9f002 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -65,7 +65,7 @@ static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r) const struct qstr l_name = bch2_dirent_get_name(l); const struct qstr *r_name = _r; - return l_name.len - r_name->len ?: memcmp(l_name.name, r_name->name, l_name.len); + return !qstr_eq(l_name, *r_name); } static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) @@ -75,7 +75,7 @@ static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) const struct qstr l_name = bch2_dirent_get_name(l); const struct qstr r_name = bch2_dirent_get_name(r); - return l_name.len - r_name.len ?: memcmp(l_name.name, r_name.name, l_name.len); + return !qstr_eq(l_name, r_name); } static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k) @@ -198,10 +198,39 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, return dirent; } +int bch2_dirent_create_snapshot(struct btree_trans *trans, + u64 dir, u32 snapshot, + const struct bch_hash_info *hash_info, + u8 type, const struct qstr *name, u64 dst_inum, + u64 *dir_offset, + bch_str_hash_flags_t str_hash_flags) +{ + subvol_inum zero_inum = { 0 }; + struct bkey_i_dirent *dirent; + int ret; + + dirent = dirent_create_key(trans, zero_inum, type, name, dst_inum); + ret = PTR_ERR_OR_ZERO(dirent); + if (ret) + return ret; + + dirent->k.p.inode = dir; + dirent->k.p.snapshot = snapshot; + + ret = bch2_hash_set_snapshot(trans, bch2_dirent_hash_desc, hash_info, + zero_inum, snapshot, + &dirent->k_i, str_hash_flags, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + *dir_offset = dirent->k.p.offset; + + return ret; +} + int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, const struct bch_hash_info *hash_info, u8 type, const struct qstr *name, u64 dst_inum, - u64 *dir_offset, int flags) + u64 *dir_offset, + bch_str_hash_flags_t str_hash_flags) { struct bkey_i_dirent *dirent; int ret; @@ -212,7 +241,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, return ret; ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, - dir, &dirent->k_i, flags); + dir, &dirent->k_i, str_hash_flags); *dir_offset = dirent->k.p.offset; return ret; @@ -470,17 +499,11 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir, const struct qstr *name, subvol_inum *inum) { struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - int ret; -retry: - bch2_trans_begin(trans); + struct btree_iter iter = { NULL }; - ret = __bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, - name, inum, 0); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - if (!ret) - bch2_trans_iter_exit(trans, &iter); + int ret = lockrestart_do(trans, + __bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0)); + bch2_trans_iter_exit(trans, &iter); bch2_trans_put(trans); return ret; } diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 1e3431990a..21ffeb78f0 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -35,9 +35,14 @@ static inline unsigned dirent_val_u64s(unsigned len) int bch2_dirent_read_target(struct btree_trans *, subvol_inum, struct bkey_s_c_dirent, subvol_inum *); +int bch2_dirent_create_snapshot(struct btree_trans *, u64, u32, + const struct bch_hash_info *, u8, + const struct qstr *, u64, u64 *, + bch_str_hash_flags_t); int bch2_dirent_create(struct btree_trans *, subvol_inum, const struct bch_hash_info *, u8, - const struct qstr *, u64, u64 *, int); + const struct qstr *, u64, u64 *, + bch_str_hash_flags_t); static inline unsigned vfs_d_type(unsigned type) { diff --git a/fs/bcachefs/dirent_format.h b/fs/bcachefs/dirent_format.h new file mode 100644 index 0000000000..5e116b88e8 --- /dev/null +++ b/fs/bcachefs/dirent_format.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DIRENT_FORMAT_H +#define _BCACHEFS_DIRENT_FORMAT_H + +/* + * Dirents (and xattrs) have to implement string lookups; since our b-tree + * doesn't support arbitrary length strings for the key, we instead index by a + * 64 bit hash (currently truncated sha1) of the string, stored in the offset + * field of the key - using linear probing to resolve hash collisions. This also + * provides us with the readdir cookie posix requires. + * + * Linear probing requires us to use whiteouts for deletions, in the event of a + * collision: + */ + +struct bch_dirent { + struct bch_val v; + + /* Target inode number: */ + union { + __le64 d_inum; + struct { /* DT_SUBVOL */ + __le32 d_child_subvol; + __le32 d_parent_subvol; + }; + }; + + /* + * Copy of mode bits 12-15 from the target inode - so userspace can get + * the filetype without having to do a stat() + */ + __u8 d_type; + + __u8 d_name[]; +} __packed __aligned(8); + +#define DT_SUBVOL 16 +#define BCH_DT_MAX 17 + +#define BCH_NAME_MAX 512 + +#endif /* _BCACHEFS_DIRENT_FORMAT_H */ diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index 4d0cb0ccff..06a7df529b 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -89,19 +89,14 @@ err: void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c) { - struct bch_disk_groups_cpu *g; - struct bch_dev *ca; - int i; - unsigned iter; - out->atomic++; rcu_read_lock(); - g = rcu_dereference(c->disk_groups); + struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); if (!g) goto out; - for (i = 0; i < g->nr; i++) { + for (unsigned i = 0; i < g->nr; i++) { if (i) prt_printf(out, " "); @@ -111,7 +106,7 @@ void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c) } prt_printf(out, "[parent %d devs", g->entries[i].parent); - for_each_member_device_rcu(ca, c, iter, &g->entries[i].devs) + for_each_member_device_rcu(c, ca, &g->entries[i].devs) prt_printf(out, " %s", ca->name); prt_printf(out, "]"); } @@ -562,7 +557,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) : NULL; if (ca && percpu_ref_tryget(&ca->io_ref)) { - prt_printf(out, "/dev/%pg", ca->disk_sb.bdev); + prt_printf(out, "/dev/%s", ca->name); percpu_ref_put(&ca->io_ref); } else if (ca) { prt_printf(out, "offline device %u", t.dev); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 2a77de18c0..d503af2700 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -3,6 +3,7 @@ /* erasure coding */ #include "bcachefs.h" +#include "alloc_background.h" #include "alloc_foreground.h" #include "backpointers.h" #include "bkey_buf.h" @@ -156,12 +157,311 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, } } +/* Triggers: */ + +static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, + struct bkey_s_c_stripe s, + unsigned idx, bool deleting) +{ + struct bch_fs *c = trans->c; + const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; + struct btree_iter iter; + struct bkey_i_alloc_v4 *a; + enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant + ? BCH_DATA_parity : 0; + s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0; + int ret = 0; + + if (deleting) + sectors = -sectors; + + a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr)); + if (IS_ERR(a)) + return PTR_ERR(a); + + ret = bch2_check_bucket_ref(trans, s.s_c, ptr, sectors, data_type, + a->v.gen, a->v.data_type, + a->v.dirty_sectors); + if (ret) + goto err; + + if (!deleting) { + if (bch2_trans_inconsistent_on(a->v.stripe || + a->v.stripe_redundancy, trans, + "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)", + iter.pos.inode, iter.pos.offset, a->v.gen, + bch2_data_type_str(a->v.data_type), + a->v.dirty_sectors, + a->v.stripe, s.k->p.offset)) { + ret = -EIO; + goto err; + } + + if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans, + "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu", + iter.pos.inode, iter.pos.offset, a->v.gen, + bch2_data_type_str(a->v.data_type), + a->v.dirty_sectors, + s.k->p.offset)) { + ret = -EIO; + goto err; + } + + a->v.stripe = s.k->p.offset; + a->v.stripe_redundancy = s.v->nr_redundant; + a->v.data_type = BCH_DATA_stripe; + } else { + if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset || + a->v.stripe_redundancy != s.v->nr_redundant, trans, + "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)", + iter.pos.inode, iter.pos.offset, a->v.gen, + s.k->p.offset, a->v.stripe)) { + ret = -EIO; + goto err; + } + + a->v.stripe = 0; + a->v.stripe_redundancy = 0; + a->v.data_type = alloc_data_type(a->v, BCH_DATA_user); + } + + a->v.dirty_sectors += sectors; + if (data_type) + a->v.data_type = !deleting ? data_type : 0; + + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); + if (ret) + goto err; +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int mark_stripe_bucket(struct btree_trans *trans, + struct bkey_s_c k, + unsigned ptr_idx, + unsigned flags) +{ + struct bch_fs *c = trans->c; + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + unsigned nr_data = s->nr_blocks - s->nr_redundant; + bool parity = ptr_idx >= nr_data; + enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe; + s64 sectors = parity ? le16_to_cpu(s->sectors) : 0; + const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket old, new, *g; + struct printbuf buf = PRINTBUF; + int ret = 0; + + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + + /* * XXX doesn't handle deletion */ + + percpu_down_read(&c->mark_lock); + g = PTR_GC_BUCKET(ca, ptr); + + if (g->dirty_sectors || + (g->stripe && g->stripe != k.k->p.offset)) { + bch2_fs_inconsistent(c, + "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", + ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EINVAL; + goto err; + } + + bucket_lock(g); + old = *g; + + ret = bch2_check_bucket_ref(trans, k, ptr, sectors, data_type, + g->gen, g->data_type, + g->dirty_sectors); + if (ret) + goto err; + + g->data_type = data_type; + g->dirty_sectors += sectors; + + g->stripe = k.k->p.offset; + g->stripe_redundancy = s->nr_redundant; + new = *g; +err: + bucket_unlock(g); + if (!ret) + bch2_dev_usage_update_m(c, ca, &old, &new); + percpu_up_read(&c->mark_lock); + printbuf_exit(&buf); + return ret; +} + +int bch2_trigger_stripe(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s _new, + unsigned flags) +{ + struct bkey_s_c new = _new.s_c; + struct bch_fs *c = trans->c; + u64 idx = new.k->p.offset; + const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe + ? bkey_s_c_to_stripe(old).v : NULL; + const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe + ? bkey_s_c_to_stripe(new).v : NULL; + + if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + /* + * If the pointers aren't changing, we don't need to do anything: + */ + if (new_s && old_s && + new_s->nr_blocks == old_s->nr_blocks && + new_s->nr_redundant == old_s->nr_redundant && + !memcmp(old_s->ptrs, new_s->ptrs, + new_s->nr_blocks * sizeof(struct bch_extent_ptr))) + return 0; + + BUG_ON(new_s && old_s && + (new_s->nr_blocks != old_s->nr_blocks || + new_s->nr_redundant != old_s->nr_redundant)); + + if (new_s) { + s64 sectors = le16_to_cpu(new_s->sectors); + + struct bch_replicas_padded r; + bch2_bkey_to_replicas(&r.e, new); + int ret = bch2_update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant); + if (ret) + return ret; + } + + if (old_s) { + s64 sectors = -((s64) le16_to_cpu(old_s->sectors)); + + struct bch_replicas_padded r; + bch2_bkey_to_replicas(&r.e, old); + int ret = bch2_update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant); + if (ret) + return ret; + } + + unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks; + for (unsigned i = 0; i < nr_blocks; i++) { + if (new_s && old_s && + !memcmp(&new_s->ptrs[i], + &old_s->ptrs[i], + sizeof(new_s->ptrs[i]))) + continue; + + if (new_s) { + int ret = bch2_trans_mark_stripe_bucket(trans, + bkey_s_c_to_stripe(new), i, false); + if (ret) + return ret; + } + + if (old_s) { + int ret = bch2_trans_mark_stripe_bucket(trans, + bkey_s_c_to_stripe(old), i, true); + if (ret) + return ret; + } + } + } + + if (flags & BTREE_TRIGGER_ATOMIC) { + struct stripe *m = genradix_ptr(&c->stripes, idx); + + if (!m) { + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + + bch2_bkey_val_to_text(&buf1, c, old); + bch2_bkey_val_to_text(&buf2, c, new); + bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n" + "old %s\n" + "new %s", idx, buf1.buf, buf2.buf); + printbuf_exit(&buf2); + printbuf_exit(&buf1); + bch2_inconsistent_error(c); + return -1; + } + + if (!new_s) { + bch2_stripes_heap_del(c, m, idx); + + memset(m, 0, sizeof(*m)); + } else { + m->sectors = le16_to_cpu(new_s->sectors); + m->algorithm = new_s->algorithm; + m->nr_blocks = new_s->nr_blocks; + m->nr_redundant = new_s->nr_redundant; + m->blocks_nonempty = 0; + + for (unsigned i = 0; i < new_s->nr_blocks; i++) + m->blocks_nonempty += !!stripe_blockcount_get(new_s, i); + + if (!old_s) + bch2_stripes_heap_insert(c, m, idx); + else + bch2_stripes_heap_update(c, m, idx); + } + } + + if (flags & BTREE_TRIGGER_GC) { + struct gc_stripe *m = + genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); + + if (!m) { + bch_err(c, "error allocating memory for gc_stripes, idx %llu", + idx); + return -BCH_ERR_ENOMEM_mark_stripe; + } + /* + * This will be wrong when we bring back runtime gc: we should + * be unmarking the old key and then marking the new key + */ + m->alive = true; + m->sectors = le16_to_cpu(new_s->sectors); + m->nr_blocks = new_s->nr_blocks; + m->nr_redundant = new_s->nr_redundant; + + for (unsigned i = 0; i < new_s->nr_blocks; i++) + m->ptrs[i] = new_s->ptrs[i]; + + bch2_bkey_to_replicas(&m->r.e, new); + + /* + * gc recalculates this field from stripe ptr + * references: + */ + memset(m->block_sectors, 0, sizeof(m->block_sectors)); + + for (unsigned i = 0; i < new_s->nr_blocks; i++) { + int ret = mark_stripe_bucket(trans, new, i, flags); + if (ret) + return ret; + } + + int ret = bch2_update_replicas(c, new, &m->r.e, + ((s64) m->sectors * m->nr_redundant), + 0, true); + if (ret) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, new); + bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf); + printbuf_exit(&buf); + return ret; + } + } + + return 0; +} + /* returns blocknr in stripe that we matched: */ static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s, struct bkey_s_c k, unsigned *block) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; unsigned i, nr_data = s->nr_blocks - s->nr_redundant; bkey_for_each_ptr(ptrs, ptr) @@ -791,28 +1091,22 @@ static void ec_stripe_delete_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, ec_stripe_delete_work); - struct btree_trans *trans = bch2_trans_get(c); - int ret; - u64 idx; while (1) { mutex_lock(&c->ec_stripes_heap_lock); - idx = stripe_idx_to_delete(c); + u64 idx = stripe_idx_to_delete(c); mutex_unlock(&c->ec_stripes_heap_lock); if (!idx) break; - ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, - ec_stripe_delete(trans, idx)); - if (ret) { - bch_err_fn(c, ret); + int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + ec_stripe_delete(trans, idx)); + bch_err_fn(c, ret); + if (ret) break; - } } - bch2_trans_put(trans); - bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); } @@ -983,8 +1277,8 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b while (1) { ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL, + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc, ec_stripe_update_extent(trans, bucket_pos, bucket.gen, s, &bp_pos)); if (ret) @@ -1005,7 +1299,7 @@ static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s) unsigned i, nr_data = v->nr_blocks - v->nr_redundant; int ret = 0; - ret = bch2_btree_write_buffer_flush(trans); + ret = bch2_btree_write_buffer_flush_sync(trans); if (ret) goto err; @@ -1121,21 +1415,20 @@ static void ec_stripe_create(struct ec_stripe_new *s) } ret = bch2_trans_do(c, &s->res, NULL, - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL, + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc, ec_stripe_key_update(trans, bkey_i_to_stripe(&s->new_stripe.key), !s->have_existing_stripe)); + bch_err_msg(c, ret, "creating stripe key"); if (ret) { - bch_err(c, "error creating stripe: error creating stripe key"); goto err; } ret = ec_stripe_update_extents(c, &s->new_stripe); - if (ret) { - bch_err_msg(c, ret, "creating stripe: error updating pointers"); + bch_err_msg(c, ret, "error updating extents"); + if (ret) goto err; - } err: bch2_disk_reservation_put(c, &s->res); @@ -1250,18 +1543,17 @@ static int unsigned_cmp(const void *_l, const void *_r) static unsigned pick_blocksize(struct bch_fs *c, struct bch_devs_mask *devs) { - struct bch_dev *ca; - unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX]; + unsigned nr = 0, sizes[BCH_SB_MEMBERS_MAX]; struct { unsigned nr, size; } cur = { 0, 0 }, best = { 0, 0 }; - for_each_member_device_rcu(ca, c, i, devs) + for_each_member_device_rcu(c, ca, devs) sizes[nr++] = ca->mi.bucket_size; sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL); - for (i = 0; i < nr; i++) { + for (unsigned i = 0; i < nr; i++) { if (sizes[i] != cur.size) { if (cur.nr > best.nr) best = cur; @@ -1344,8 +1636,6 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, enum bch_watermark watermark) { struct ec_stripe_head *h; - struct bch_dev *ca; - unsigned i; h = kzalloc(sizeof(*h), GFP_KERNEL); if (!h) @@ -1362,13 +1652,13 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, rcu_read_lock(); h->devs = target_rw_devs(c, BCH_DATA_user, target); - for_each_member_device_rcu(ca, c, i, &h->devs) + for_each_member_device_rcu(c, ca, &h->devs) if (!ca->mi.durability) - __clear_bit(i, h->devs.d); + __clear_bit(ca->dev_idx, h->devs.d); h->blocksize = pick_blocksize(c, &h->devs); - for_each_member_device_rcu(ca, c, i, &h->devs) + for_each_member_device_rcu(c, ca, &h->devs) if (ca->mi.bucket_size == h->blocksize) h->nr_active_devs++; @@ -1415,7 +1705,7 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans, if (ret) return ERR_PTR(ret); - if (test_bit(BCH_FS_GOING_RO, &c->flags)) { + if (test_bit(BCH_FS_going_ro, &c->flags)) { h = ERR_PTR(-BCH_ERR_erofs_no_writes); goto found; } @@ -1833,44 +2123,32 @@ void bch2_fs_ec_flush(struct bch_fs *c) int bch2_stripes_read(struct bch_fs *c) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - const struct bch_stripe *s; - struct stripe *m; - unsigned i; - int ret; - - for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { - if (k.k->type != KEY_TYPE_stripe) - continue; - - ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); - if (ret) - break; - - s = bkey_s_c_to_stripe(k).v; - - m = genradix_ptr(&c->stripes, k.k->p.offset); - m->sectors = le16_to_cpu(s->sectors); - m->algorithm = s->algorithm; - m->nr_blocks = s->nr_blocks; - m->nr_redundant = s->nr_redundant; - m->blocks_nonempty = 0; + int ret = bch2_trans_run(c, + for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN, + BTREE_ITER_PREFETCH, k, ({ + if (k.k->type != KEY_TYPE_stripe) + continue; - for (i = 0; i < s->nr_blocks; i++) - m->blocks_nonempty += !!stripe_blockcount_get(s, i); + ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); + if (ret) + break; - bch2_stripes_heap_insert(c, m, k.k->p.offset); - } - bch2_trans_iter_exit(trans, &iter); + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; - bch2_trans_put(trans); + struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset); + m->sectors = le16_to_cpu(s->sectors); + m->algorithm = s->algorithm; + m->nr_blocks = s->nr_blocks; + m->nr_redundant = s->nr_redundant; + m->blocks_nonempty = 0; - if (ret) - bch_err_fn(c, ret); + for (unsigned i = 0; i < s->nr_blocks; i++) + m->blocks_nonempty += !!stripe_blockcount_get(s, i); + bch2_stripes_heap_insert(c, m, k.k->p.offset); + 0; + }))); + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 7d0237c981..f4369b02e8 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -12,13 +12,14 @@ int bch2_stripe_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s, unsigned); #define bch2_bkey_ops_stripe ((struct bkey_ops) { \ .key_invalid = bch2_stripe_invalid, \ .val_to_text = bch2_stripe_to_text, \ .swab = bch2_ptr_swab, \ - .trans_trigger = bch2_trans_mark_stripe, \ - .atomic_trigger = bch2_mark_stripe, \ + .trigger = bch2_trigger_stripe, \ .min_val_size = 8, \ }) diff --git a/fs/bcachefs/ec_format.h b/fs/bcachefs/ec_format.h new file mode 100644 index 0000000000..44ce88ba08 --- /dev/null +++ b/fs/bcachefs/ec_format.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EC_FORMAT_H +#define _BCACHEFS_EC_FORMAT_H + +struct bch_stripe { + struct bch_val v; + __le16 sectors; + __u8 algorithm; + __u8 nr_blocks; + __u8 nr_redundant; + + __u8 csum_granularity_bits; + __u8 csum_type; + __u8 pad; + + struct bch_extent_ptr ptrs[]; +} __packed __aligned(8); + +#endif /* _BCACHEFS_EC_FORMAT_H */ diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h index e2b02a82de..976426da3a 100644 --- a/fs/bcachefs/ec_types.h +++ b/fs/bcachefs/ec_types.h @@ -5,7 +5,7 @@ #include "bcachefs_format.h" struct bch_replicas_padded { - struct bch_replicas_entry e; + struct bch_replicas_entry_v1 e; u8 pad[BCH_BKEY_PTRS_MAX]; }; diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index ac90faccdc..8c40c2067a 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -73,7 +73,6 @@ x(ENOMEM, ENOMEM_fsck_add_nlink) \ x(ENOMEM, ENOMEM_journal_key_insert) \ x(ENOMEM, ENOMEM_journal_keys_sort) \ - x(ENOMEM, ENOMEM_journal_replay) \ x(ENOMEM, ENOMEM_read_superblock_clean) \ x(ENOMEM, ENOMEM_fs_alloc) \ x(ENOMEM, ENOMEM_fs_name_alloc) \ @@ -152,7 +151,6 @@ x(BCH_ERR_btree_insert_fail, btree_insert_need_mark_replicas) \ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \ - x(BCH_ERR_btree_insert_fail, btree_insert_need_flush_buffer) \ x(0, backpointer_to_overwritten_btree_node) \ x(0, lock_fail_root_changed) \ x(0, journal_reclaim_would_deadlock) \ @@ -172,10 +170,12 @@ x(EINVAL, device_size_too_small) \ x(EINVAL, device_not_a_member_of_filesystem) \ x(EINVAL, device_has_been_removed) \ + x(EINVAL, device_splitbrain) \ x(EINVAL, device_already_online) \ x(EINVAL, insufficient_devices_to_start) \ x(EINVAL, invalid) \ x(EINVAL, internal_fsck_err) \ + x(EINVAL, opt_parse_error) \ x(EROFS, erofs_trans_commit) \ x(EROFS, erofs_no_writes) \ x(EROFS, erofs_journal_err) \ @@ -225,6 +225,7 @@ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ x(EIO, btree_node_read_err) \ x(EIO, sb_not_downgraded) \ + x(EIO, btree_write_all_failed) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ @@ -236,6 +237,7 @@ x(BCH_ERR_nopromote, nopromote_unwritten) \ x(BCH_ERR_nopromote, nopromote_congested) \ x(BCH_ERR_nopromote, nopromote_in_flight) \ + x(BCH_ERR_nopromote, nopromote_no_writes) \ x(BCH_ERR_nopromote, nopromote_enomem) enum bch_errcode { diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 25cf78a7b9..d32c8bebe4 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -2,12 +2,13 @@ #include "bcachefs.h" #include "error.h" #include "super.h" +#include "thread_with_file.h" #define FSCK_ERR_RATELIMIT_NR 10 bool bch2_inconsistent_error(struct bch_fs *c) { - set_bit(BCH_FS_ERROR, &c->flags); + set_bit(BCH_FS_error, &c->flags); switch (c->opts.errors) { case BCH_ON_ERROR_continue: @@ -26,8 +27,8 @@ bool bch2_inconsistent_error(struct bch_fs *c) void bch2_topology_error(struct bch_fs *c) { - set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags); - if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) + set_bit(BCH_FS_topology_error, &c->flags); + if (!test_bit(BCH_FS_fsck_running, &c->flags)) bch2_inconsistent_error(c); } @@ -69,40 +70,66 @@ enum ask_yn { YN_ALLYES, }; +static enum ask_yn parse_yn_response(char *buf) +{ + buf = strim(buf); + + if (strlen(buf) == 1) + switch (buf[0]) { + case 'n': + return YN_NO; + case 'y': + return YN_YES; + case 'N': + return YN_ALLNO; + case 'Y': + return YN_ALLYES; + } + return -1; +} + #ifdef __KERNEL__ -#define bch2_fsck_ask_yn() YN_NO +static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c) +{ + struct stdio_redirect *stdio = c->stdio; + + if (c->stdio_filter && c->stdio_filter != current) + stdio = NULL; + + if (!stdio) + return YN_NO; + + char buf[100]; + int ret; + + do { + bch2_print(c, " (y,n, or Y,N for all errors of this type) "); + + int r = bch2_stdio_redirect_readline(stdio, buf, sizeof(buf) - 1); + if (r < 0) + return YN_NO; + buf[r] = '\0'; + } while ((ret = parse_yn_response(buf)) < 0); + + return ret; +} #else #include "tools-util.h" -enum ask_yn bch2_fsck_ask_yn(void) +static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c) { char *buf = NULL; size_t buflen = 0; - bool ret; + int ret; - while (true) { + do { fputs(" (y,n, or Y,N for all errors of this type) ", stdout); fflush(stdout); if (getline(&buf, &buflen, stdin) < 0) die("error reading from standard input"); - - strim(buf); - if (strlen(buf) != 1) - continue; - - switch (buf[0]) { - case 'n': - return YN_NO; - case 'y': - return YN_YES; - case 'N': - return YN_ALLNO; - case 'Y': - return YN_ALLYES; - } - } + } while ((ret = parse_yn_response(buf)) < 0); free(buf); return ret; @@ -114,7 +141,7 @@ static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt) { struct fsck_err_state *s; - if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) + if (!test_bit(BCH_FS_fsck_running, &c->flags)) return NULL; list_for_each_entry(s, &c->fsck_error_msgs, list) @@ -152,7 +179,8 @@ int bch2_fsck_err(struct bch_fs *c, struct printbuf buf = PRINTBUF, *out = &buf; int ret = -BCH_ERR_fsck_ignore; - if (test_bit(err, c->sb.errors_silent)) + if ((flags & FSCK_CAN_FIX) && + test_bit(err, c->sb.errors_silent)) return -BCH_ERR_fsck_fix; bch2_sb_error_count(c, err); @@ -196,7 +224,7 @@ int bch2_fsck_err(struct bch_fs *c, prt_printf(out, bch2_log_msg(c, "")); #endif - if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) { + if (!test_bit(BCH_FS_fsck_running, &c->flags)) { if (c->opts.errors != BCH_ON_ERROR_continue || !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { prt_str(out, ", shutting down"); @@ -221,10 +249,13 @@ int bch2_fsck_err(struct bch_fs *c, int ask; prt_str(out, ": fix?"); - bch2_print_string_as_lines(KERN_ERR, out->buf); + if (bch2_fs_stdio_redirect(c)) + bch2_print(c, "%s", out->buf); + else + bch2_print_string_as_lines(KERN_ERR, out->buf); print = false; - ask = bch2_fsck_ask_yn(); + ask = bch2_fsck_ask_yn(c); if (ask >= YN_ALLNO && s) s->fix = ask == YN_ALLNO @@ -253,10 +284,14 @@ int bch2_fsck_err(struct bch_fs *c, !(flags & FSCK_CAN_IGNORE))) ret = -BCH_ERR_fsck_errors_not_fixed; - if (print) - bch2_print_string_as_lines(KERN_ERR, out->buf); + if (print) { + if (bch2_fs_stdio_redirect(c)) + bch2_print(c, "%s\n", out->buf); + else + bch2_print_string_as_lines(KERN_ERR, out->buf); + } - if (!test_bit(BCH_FS_FSCK_DONE, &c->flags) && + if (test_bit(BCH_FS_fsck_running, &c->flags) && (ret != -BCH_ERR_fsck_fix && ret != -BCH_ERR_fsck_ignore)) bch_err(c, "Unable to continue, halting"); @@ -274,10 +309,10 @@ int bch2_fsck_err(struct bch_fs *c, bch2_inconsistent_error(c); if (ret == -BCH_ERR_fsck_fix) { - set_bit(BCH_FS_ERRORS_FIXED, &c->flags); + set_bit(BCH_FS_errors_fixed, &c->flags); } else { - set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags); - set_bit(BCH_FS_ERROR, &c->flags); + set_bit(BCH_FS_errors_not_fixed, &c->flags); + set_bit(BCH_FS_error, &c->flags); } return ret; diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index 21af6fb8ce..b9033bb4f1 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -100,7 +100,7 @@ static int count_iters_for_insert(struct btree_trans *trans, return ret2 ?: ret; } -#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3) +#define EXTENT_ITERS_MAX (BTREE_ITER_INITIAL / 3) int bch2_extent_atomic_end(struct btree_trans *trans, struct btree_iter *iter, diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 9d8afcb597..61395b113d 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -8,6 +8,7 @@ #include "bcachefs.h" #include "bkey_methods.h" +#include "btree_cache.h" #include "btree_gc.h" #include "btree_io.h" #include "btree_iter.h" @@ -843,7 +844,6 @@ void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev) const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; bkey_for_each_ptr(ptrs, ptr) if (ptr->dev == dev) @@ -855,7 +855,6 @@ const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; bkey_for_each_ptr(ptrs, ptr) if (bch2_dev_in_target(c, ptr->dev, target) && @@ -1020,12 +1019,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct bch_extent_crc_unpacked crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s", + prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress ", crc.compressed_size, crc.uncompressed_size, crc.offset, crc.nonce, - bch2_csum_types[crc.csum_type], - bch2_compression_types[crc.compression_type]); + bch2_csum_types[crc.csum_type]); + bch2_prt_compression_type(out, crc.compression_type); break; } case BCH_EXTENT_ENTRY_stripe_ptr: { @@ -1065,7 +1064,6 @@ static int extent_ptr_invalid(struct bch_fs *c, struct printbuf *err) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr2; u64 bucket; u32 bucket_offset; struct bch_dev *ca; @@ -1307,7 +1305,6 @@ unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k, } incompressible: if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) { - const struct bch_extent_ptr *ptr; unsigned i = 0; bkey_for_each_ptr(ptrs, ptr) { @@ -1338,10 +1335,12 @@ bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k) } int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k, - unsigned target, unsigned compression) + struct bch_io_opts *opts) { struct bkey_s k = bkey_i_to_s(_k); struct bch_extent_rebalance *r; + unsigned target = opts->background_target; + unsigned compression = background_compression(*opts); bool needs_rebalance; if (!bkey_extent_is_direct_data(k.k)) diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index a2ce8a3be1..6bf839d69e 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -300,7 +300,7 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) bkey_extent_entry_for_each_from(_p, _entry, _p.start) #define __bkey_for_each_ptr(_start, _end, _ptr) \ - for ((_ptr) = (_start); \ + for (typeof(_start) (_ptr) = (_start); \ ((_ptr) = __bkey_ptr_next(_ptr, _end)); \ (_ptr)++) @@ -415,8 +415,7 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, .key_invalid = bch2_btree_ptr_invalid, \ .val_to_text = bch2_btree_ptr_to_text, \ .swab = bch2_ptr_swab, \ - .trans_trigger = bch2_trans_mark_extent, \ - .atomic_trigger = bch2_mark_extent, \ + .trigger = bch2_trigger_extent, \ }) #define bch2_bkey_ops_btree_ptr_v2 ((struct bkey_ops) { \ @@ -424,8 +423,7 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, .val_to_text = bch2_btree_ptr_v2_to_text, \ .swab = bch2_ptr_swab, \ .compat = bch2_btree_ptr_v2_compat, \ - .trans_trigger = bch2_trans_mark_extent, \ - .atomic_trigger = bch2_mark_extent, \ + .trigger = bch2_trigger_extent, \ .min_val_size = 40, \ }) @@ -439,8 +437,7 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); .swab = bch2_ptr_swab, \ .key_normalize = bch2_extent_normalize, \ .key_merge = bch2_extent_merge, \ - .trans_trigger = bch2_trans_mark_extent, \ - .atomic_trigger = bch2_mark_extent, \ + .trigger = bch2_trigger_extent, \ }) /* KEY_TYPE_reservation: */ @@ -454,8 +451,7 @@ bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); .key_invalid = bch2_reservation_invalid, \ .val_to_text = bch2_reservation_to_text, \ .key_merge = bch2_reservation_merge, \ - .trans_trigger = bch2_trans_mark_reservation, \ - .atomic_trigger = bch2_mark_reservation, \ + .trigger = bch2_trigger_reservation, \ .min_val_size = 8, \ }) @@ -547,7 +543,6 @@ static inline bool bkey_extent_is_allocation(const struct bkey *k) static inline bool bkey_extent_is_unwritten(struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; bkey_for_each_ptr(ptrs, ptr) if (ptr->unwritten) @@ -565,10 +560,9 @@ static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k) { struct bch_devs_list ret = (struct bch_devs_list) { 0 }; struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; bkey_for_each_ptr(p, ptr) - ret.devs[ret.nr++] = ptr->dev; + ret.data[ret.nr++] = ptr->dev; return ret; } @@ -577,11 +571,10 @@ static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k) { struct bch_devs_list ret = (struct bch_devs_list) { 0 }; struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; bkey_for_each_ptr(p, ptr) if (!ptr->cached) - ret.devs[ret.nr++] = ptr->dev; + ret.data[ret.nr++] = ptr->dev; return ret; } @@ -590,11 +583,10 @@ static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) { struct bch_devs_list ret = (struct bch_devs_list) { 0 }; struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; bkey_for_each_ptr(p, ptr) if (ptr->cached) - ret.devs[ret.nr++] = ptr->dev; + ret.data[ret.nr++] = ptr->dev; return ret; } @@ -716,7 +708,7 @@ unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c, bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c); int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *, - unsigned, unsigned); + struct bch_io_opts *); /* Generic extent code: */ diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h new file mode 100644 index 0000000000..3bd2fdbb08 --- /dev/null +++ b/fs/bcachefs/extents_format.h @@ -0,0 +1,295 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EXTENTS_FORMAT_H +#define _BCACHEFS_EXTENTS_FORMAT_H + +/* + * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally + * preceded by checksum/compression information (bch_extent_crc32 or + * bch_extent_crc64). + * + * One major determining factor in the format of extents is how we handle and + * represent extents that have been partially overwritten and thus trimmed: + * + * If an extent is not checksummed or compressed, when the extent is trimmed we + * don't have to remember the extent we originally allocated and wrote: we can + * merely adjust ptr->offset to point to the start of the data that is currently + * live. The size field in struct bkey records the current (live) size of the + * extent, and is also used to mean "size of region on disk that we point to" in + * this case. + * + * Thus an extent that is not checksummed or compressed will consist only of a + * list of bch_extent_ptrs, with none of the fields in + * bch_extent_crc32/bch_extent_crc64. + * + * When an extent is checksummed or compressed, it's not possible to read only + * the data that is currently live: we have to read the entire extent that was + * originally written, and then return only the part of the extent that is + * currently live. + * + * Thus, in addition to the current size of the extent in struct bkey, we need + * to store the size of the originally allocated space - this is the + * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also, + * when the extent is trimmed, instead of modifying the offset field of the + * pointer, we keep a second smaller offset field - "offset into the original + * extent of the currently live region". + * + * The other major determining factor is replication and data migration: + * + * Each pointer may have its own bch_extent_crc32/64. When doing a replicated + * write, we will initially write all the replicas in the same format, with the + * same checksum type and compression format - however, when copygc runs later (or + * tiering/cache promotion, anything that moves data), it is not in general + * going to rewrite all the pointers at once - one of the replicas may be in a + * bucket on one device that has very little fragmentation while another lives + * in a bucket that has become heavily fragmented, and thus is being rewritten + * sooner than the rest. + * + * Thus it will only move a subset of the pointers (or in the case of + * tiering/cache promotion perhaps add a single pointer without dropping any + * current pointers), and if the extent has been partially overwritten it must + * write only the currently live portion (or copygc would not be able to reduce + * fragmentation!) - which necessitates a different bch_extent_crc format for + * the new pointer. + * + * But in the interests of space efficiency, we don't want to store one + * bch_extent_crc for each pointer if we don't have to. + * + * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and + * bch_extent_ptrs appended arbitrarily one after the other. We determine the + * type of a given entry with a scheme similar to utf8 (except we're encoding a + * type, not a size), encoding the type in the position of the first set bit: + * + * bch_extent_crc32 - 0b1 + * bch_extent_ptr - 0b10 + * bch_extent_crc64 - 0b100 + * + * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and + * bch_extent_crc64 is the least constrained). + * + * Then, each bch_extent_crc32/64 applies to the pointers that follow after it, + * until the next bch_extent_crc32/64. + * + * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer + * is neither checksummed nor compressed. + */ + +#define BCH_EXTENT_ENTRY_TYPES() \ + x(ptr, 0) \ + x(crc32, 1) \ + x(crc64, 2) \ + x(crc128, 3) \ + x(stripe_ptr, 4) \ + x(rebalance, 5) +#define BCH_EXTENT_ENTRY_MAX 6 + +enum bch_extent_entry_type { +#define x(f, n) BCH_EXTENT_ENTRY_##f = n, + BCH_EXTENT_ENTRY_TYPES() +#undef x +}; + +/* Compressed/uncompressed size are stored biased by 1: */ +struct bch_extent_crc32 { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u32 type:2, + _compressed_size:7, + _uncompressed_size:7, + offset:7, + _unused:1, + csum_type:4, + compression_type:4; + __u32 csum; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u32 csum; + __u32 compression_type:4, + csum_type:4, + _unused:1, + offset:7, + _uncompressed_size:7, + _compressed_size:7, + type:2; +#endif +} __packed __aligned(8); + +#define CRC32_SIZE_MAX (1U << 7) +#define CRC32_NONCE_MAX 0 + +struct bch_extent_crc64 { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:3, + _compressed_size:9, + _uncompressed_size:9, + offset:9, + nonce:10, + csum_type:4, + compression_type:4, + csum_hi:16; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 csum_hi:16, + compression_type:4, + csum_type:4, + nonce:10, + offset:9, + _uncompressed_size:9, + _compressed_size:9, + type:3; +#endif + __u64 csum_lo; +} __packed __aligned(8); + +#define CRC64_SIZE_MAX (1U << 9) +#define CRC64_NONCE_MAX ((1U << 10) - 1) + +struct bch_extent_crc128 { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:4, + _compressed_size:13, + _uncompressed_size:13, + offset:13, + nonce:13, + csum_type:4, + compression_type:4; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 compression_type:4, + csum_type:4, + nonce:13, + offset:13, + _uncompressed_size:13, + _compressed_size:13, + type:4; +#endif + struct bch_csum csum; +} __packed __aligned(8); + +#define CRC128_SIZE_MAX (1U << 13) +#define CRC128_NONCE_MAX ((1U << 13) - 1) + +/* + * @reservation - pointer hasn't been written to, just reserved + */ +struct bch_extent_ptr { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:1, + cached:1, + unused:1, + unwritten:1, + offset:44, /* 8 petabytes */ + dev:8, + gen:8; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 gen:8, + dev:8, + offset:44, + unwritten:1, + unused:1, + cached:1, + type:1; +#endif +} __packed __aligned(8); + +struct bch_extent_stripe_ptr { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:5, + block:8, + redundancy:4, + idx:47; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 idx:47, + redundancy:4, + block:8, + type:5; +#endif +}; + +struct bch_extent_rebalance { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:6, + unused:34, + compression:8, /* enum bch_compression_opt */ + target:16; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 target:16, + compression:8, + unused:34, + type:6; +#endif +}; + +union bch_extent_entry { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 + unsigned long type; +#elif __BITS_PER_LONG == 32 + struct { + unsigned long pad; + unsigned long type; + }; +#else +#error edit for your odd byteorder. +#endif + +#define x(f, n) struct bch_extent_##f f; + BCH_EXTENT_ENTRY_TYPES() +#undef x +}; + +struct bch_btree_ptr { + struct bch_val v; + + __u64 _data[0]; + struct bch_extent_ptr start[]; +} __packed __aligned(8); + +struct bch_btree_ptr_v2 { + struct bch_val v; + + __u64 mem_ptr; + __le64 seq; + __le16 sectors_written; + __le16 flags; + struct bpos min_key; + __u64 _data[0]; + struct bch_extent_ptr start[]; +} __packed __aligned(8); + +LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1); + +struct bch_extent { + struct bch_val v; + + __u64 _data[0]; + union bch_extent_entry start[]; +} __packed __aligned(8); + +/* Maximum size (in u64s) a single pointer could be: */ +#define BKEY_EXTENT_PTR_U64s_MAX\ + ((sizeof(struct bch_extent_crc128) + \ + sizeof(struct bch_extent_ptr)) / sizeof(__u64)) + +/* Maximum possible size of an entire extent value: */ +#define BKEY_EXTENT_VAL_U64s_MAX \ + (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) + +/* * Maximum possible size of an entire extent, key + value: */ +#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) + +/* Btree pointers don't carry around checksums: */ +#define BKEY_BTREE_PTR_VAL_U64s_MAX \ + ((sizeof(struct bch_btree_ptr_v2) + \ + sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64)) +#define BKEY_BTREE_PTR_U64s_MAX \ + (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) + +struct bch_reservation { + struct bch_val v; + + __le32 generation; + __u8 nr_replicas; + __u8 pad[3]; +} __packed __aligned(8); + +struct bch_inline_data { + struct bch_val v; + u8 data[]; +}; + +#endif /* _BCACHEFS_EXTENTS_FORMAT_H */ diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index 05429c9631..b04750dbf8 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -156,7 +156,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size) } #define eytzinger1_for_each(_i, _size) \ - for ((_i) = eytzinger1_first((_size)); \ + for (unsigned (_i) = eytzinger1_first((_size)); \ (_i) != 0; \ (_i) = eytzinger1_next((_i), (_size))) @@ -227,7 +227,7 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) } #define eytzinger0_for_each(_i, _size) \ - for ((_i) = eytzinger0_first((_size)); \ + for (unsigned (_i) = eytzinger0_first((_size)); \ (_i) != -1; \ (_i) = eytzinger0_next((_i), (_size))) @@ -261,11 +261,11 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, #define eytzinger0_find(base, nr, size, _cmp, search) \ ({ \ - void *_base = (base); \ - void *_search = (search); \ - size_t _nr = (nr); \ - size_t _size = (size); \ - size_t _i = 0; \ + void *_base = (base); \ + const void *_search = (search); \ + size_t _nr = (nr); \ + size_t _size = (size); \ + size_t _i = 0; \ int _res; \ \ while (_i < _nr && \ diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 4496cf91a4..1c1ea0f0c6 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -166,10 +166,8 @@ int bch2_create_trans(struct btree_trans *trans, if (ret) goto err; - if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { - new_inode->bi_dir = dir_u->bi_inum; - new_inode->bi_dir_offset = dir_offset; - } + new_inode->bi_dir = dir_u->bi_inum; + new_inode->bi_dir_offset = dir_offset; } inode_iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS; @@ -228,10 +226,8 @@ int bch2_link_trans(struct btree_trans *trans, if (ret) goto err; - if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { - inode_u->bi_dir = dir.inum; - inode_u->bi_dir_offset = dir_offset; - } + inode_u->bi_dir = dir.inum; + inode_u->bi_dir_offset = dir_offset; ret = bch2_inode_write(trans, &dir_iter, dir_u) ?: bch2_inode_write(trans, &inode_iter, inode_u); @@ -414,21 +410,19 @@ int bch2_rename_trans(struct btree_trans *trans, goto err; } - if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { - src_inode_u->bi_dir = dst_dir_u->bi_inum; - src_inode_u->bi_dir_offset = dst_offset; + src_inode_u->bi_dir = dst_dir_u->bi_inum; + src_inode_u->bi_dir_offset = dst_offset; - if (mode == BCH_RENAME_EXCHANGE) { - dst_inode_u->bi_dir = src_dir_u->bi_inum; - dst_inode_u->bi_dir_offset = src_offset; - } + if (mode == BCH_RENAME_EXCHANGE) { + dst_inode_u->bi_dir = src_dir_u->bi_inum; + dst_inode_u->bi_dir_offset = src_offset; + } - if (mode == BCH_RENAME_OVERWRITE && - dst_inode_u->bi_dir == dst_dir_u->bi_inum && - dst_inode_u->bi_dir_offset == src_offset) { - dst_inode_u->bi_dir = 0; - dst_inode_u->bi_dir_offset = 0; - } + if (mode == BCH_RENAME_OVERWRITE && + dst_inode_u->bi_dir == dst_dir_u->bi_inum && + dst_inode_u->bi_dir_offset == src_offset) { + dst_inode_u->bi_dir = 0; + dst_inode_u->bi_dir_offset = 0; } if (mode == BCH_RENAME_OVERWRITE) { diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 52f0e7acda..27710cdd57 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -52,26 +52,20 @@ struct readpages_iter { static int readpages_iter_init(struct readpages_iter *iter, struct readahead_control *ractl) { - struct folio **fi; - int ret; - - memset(iter, 0, sizeof(*iter)); + struct folio *folio; - iter->mapping = ractl->mapping; + *iter = (struct readpages_iter) { ractl->mapping }; - ret = bch2_filemap_get_contig_folios_d(iter->mapping, - ractl->_index << PAGE_SHIFT, - (ractl->_index + ractl->_nr_pages) << PAGE_SHIFT, - 0, mapping_gfp_mask(iter->mapping), - &iter->folios); - if (ret) - return ret; + while ((folio = __readahead_folio(ractl))) { + if (!bch2_folio_create(folio, GFP_KERNEL) || + darray_push(&iter->folios, folio)) { + bch2_folio_release(folio); + ractl->_nr_pages += folio_nr_pages(folio); + ractl->_index -= folio_nr_pages(folio); + return iter->folios.nr ? 0 : -ENOMEM; + } - darray_for_each(iter->folios, fi) { - ractl->_nr_pages -= 1U << folio_order(*fi); - __bch2_folio_create(*fi, __GFP_NOFAIL|GFP_KERNEL); - folio_put(*fi); - folio_put(*fi); + folio_put(folio); } return 0; @@ -273,12 +267,12 @@ void bch2_readahead(struct readahead_control *ractl) struct btree_trans *trans = bch2_trans_get(c); struct folio *folio; struct readpages_iter readpages_iter; - int ret; bch2_inode_opts_get(&opts, c, &inode->ei_inode); - ret = readpages_iter_init(&readpages_iter, ractl); - BUG_ON(ret); + int ret = readpages_iter_init(&readpages_iter, ractl); + if (ret) + return; bch2_pagecache_add_get(inode); @@ -309,18 +303,6 @@ void bch2_readahead(struct readahead_control *ractl) darray_exit(&readpages_iter.folios); } -static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio, - subvol_inum inum, struct folio *folio) -{ - bch2_folio_create(folio, __GFP_NOFAIL); - - rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; - rbio->bio.bi_iter.bi_sector = folio_sector(folio); - BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); - - bch2_trans_run(c, (bchfs_read(trans, rbio, inum, NULL), 0)); -} - static void bch2_read_single_folio_end_io(struct bio *bio) { complete(bio->bi_private); @@ -335,6 +317,9 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) int ret; DECLARE_COMPLETION_ONSTACK(done); + if (!bch2_folio_create(folio, GFP_KERNEL)) + return -ENOMEM; + bch2_inode_opts_get(&opts, c, &inode->ei_inode); rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read), @@ -342,7 +327,11 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) rbio->bio.bi_private = &done; rbio->bio.bi_end_io = bch2_read_single_folio_end_io; - __bchfs_readfolio(c, rbio, inode_inum(inode), folio); + rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; + rbio->bio.bi_iter.bi_sector = folio_sector(folio); + BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); + + bch2_trans_run(c, (bchfs_read(trans, rbio, inode_inum(inode), NULL), 0)); wait_for_completion(&done); ret = blk_status_to_errno(rbio->bio.bi_status); @@ -638,7 +627,7 @@ do_io: /* Check for writing past i_size: */ WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > round_up(i_size, block_bytes(c)) && - !test_bit(BCH_FS_EMERGENCY_RO, &c->flags), + !test_bit(BCH_FS_emergency_ro, &c->flags), "writing past i_size: %llu > %llu (unrounded %llu)\n", bio_end_sector(&w->io->op.wbio.bio) << 9, round_up(i_size, block_bytes(c)), @@ -826,7 +815,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch2_folio_reservation res; folios fs; - struct folio **fi, *f; + struct folio *f; unsigned copied = 0, f_offset, f_copied; u64 end = pos + len, f_pos, f_len; loff_t last_folio_pos = inode->v.i_size; diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index 84e20c3ada..33cb6da3a5 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -77,7 +77,8 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) bch2_inode_opts_get(&opts, c, &inode->ei_inode); - if ((offset|iter->count) & (block_bytes(c) - 1)) + /* bios must be 512 byte aligned: */ + if ((offset|iter->count) & (SECTOR_SIZE - 1)) return -EINVAL; ret = min_t(loff_t, iter->count, @@ -87,6 +88,8 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) return ret; shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c)); + if (shorten >= iter->count) + shorten = 0; iter->count -= shorten; bio = bio_alloc_bioset(NULL, diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c index ff664fd0d8..d359aa9b33 100644 --- a/fs/bcachefs/fs-io-pagecache.c +++ b/fs/bcachefs/fs-io-pagecache.c @@ -309,39 +309,49 @@ void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode, } } -void bch2_mark_pagecache_reserved(struct bch_inode_info *inode, - u64 start, u64 end) +int bch2_mark_pagecache_reserved(struct bch_inode_info *inode, + u64 *start, u64 end, + bool nonblocking) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - pgoff_t index = start >> PAGE_SECTORS_SHIFT; + pgoff_t index = *start >> PAGE_SECTORS_SHIFT; pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; struct folio_batch fbatch; s64 i_sectors_delta = 0; - unsigned i, j; + int ret = 0; - if (end <= start) - return; + if (end <= *start) + return 0; folio_batch_init(&fbatch); while (filemap_get_folios(inode->v.i_mapping, &index, end_index, &fbatch)) { - for (i = 0; i < folio_batch_count(&fbatch); i++) { + for (unsigned i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; + + if (!nonblocking) + folio_lock(folio); + else if (!folio_trylock(folio)) { + folio_batch_release(&fbatch); + ret = -EAGAIN; + break; + } + u64 folio_start = folio_sector(folio); u64 folio_end = folio_end_sector(folio); - unsigned folio_offset = max(start, folio_start) - folio_start; - unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; - struct bch_folio *s; BUG_ON(end <= folio_start); - folio_lock(folio); - s = bch2_folio(folio); + *start = min(end, folio_end); + struct bch_folio *s = bch2_folio(folio); if (s) { + unsigned folio_offset = max(*start, folio_start) - folio_start; + unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; + spin_lock(&s->lock); - for (j = folio_offset; j < folio_offset + folio_len; j++) { + for (unsigned j = folio_offset; j < folio_offset + folio_len; j++) { i_sectors_delta -= s->s[j].state == SECTOR_dirty; bch2_folio_sector_set(folio, s, j, folio_sector_reserve(s->s[j].state)); @@ -356,6 +366,7 @@ void bch2_mark_pagecache_reserved(struct bch_inode_info *inode, } bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); + return ret; } static inline unsigned sectors_to_reserve(struct bch_folio_sector *s, diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h index 27f712ae37..8cbaba6565 100644 --- a/fs/bcachefs/fs-io-pagecache.h +++ b/fs/bcachefs/fs-io-pagecache.h @@ -143,7 +143,7 @@ int bch2_folio_set(struct bch_fs *, subvol_inum, struct folio **, unsigned); void bch2_bio_page_state_set(struct bio *, struct bkey_s_c); void bch2_mark_pagecache_unallocated(struct bch_inode_info *, u64, u64); -void bch2_mark_pagecache_reserved(struct bch_inode_info *, u64, u64); +int bch2_mark_pagecache_reserved(struct bch_inode_info *, u64 *, u64, bool); int bch2_get_folio_disk_reservation(struct bch_fs *, struct bch_inode_info *, diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index fffa1743df..8c70123b6a 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -192,13 +192,17 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret, ret2, ret3; + int ret; ret = file_write_and_wait_range(file, start, end); - ret2 = sync_inode_metadata(&inode->v, 1); - ret3 = bch2_flush_inode(c, inode); - - return bch2_err_class(ret ?: ret2 ?: ret3); + if (ret) + goto out; + ret = sync_inode_metadata(&inode->v, 1); + if (ret) + goto out; + ret = bch2_flush_inode(c, inode); +out: + return bch2_err_class(ret); } /* truncate: */ @@ -671,8 +675,11 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, bch2_i_sectors_acct(c, inode, "a_res, i_sectors_delta); - drop_locks_do(trans, - (bch2_mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0)); + if (bch2_mark_pagecache_reserved(inode, &hole_start, + iter.pos.offset, true)) + drop_locks_do(trans, + bch2_mark_pagecache_reserved(inode, &hole_start, + iter.pos.offset, false)); bkey_err: bch2_quota_reservation_put(c, inode, "a_res); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -861,7 +868,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, abs(pos_src - pos_dst) < len) return -EINVAL; - bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); + lock_two_nondirectories(&src->v, &dst->v); + bch2_lock_inodes(INODE_PAGECACHE_BLOCK, src, dst); inode_dio_wait(&src->v); inode_dio_wait(&dst->v); @@ -914,7 +922,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, ret = bch2_flush_inode(c, dst); err: bch2_quota_reservation_put(c, dst, "a_res); - bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); + bch2_unlock_inodes(INODE_PAGECACHE_BLOCK, src, dst); + unlock_two_nondirectories(&src->v, &dst->v); return bch2_err_class(ret); } diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index dbc87747ea..3dc8630ff9 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -285,34 +285,26 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) bch_notice(c, "shutdown by ioctl type %u", flags); - down_write(&c->vfs_sb->s_umount); - switch (flags) { case FSOP_GOING_FLAGS_DEFAULT: - ret = freeze_bdev(c->vfs_sb->s_bdev); + ret = bdev_freeze(c->vfs_sb->s_bdev); if (ret) - goto err; - + break; bch2_journal_flush(&c->journal); - c->vfs_sb->s_flags |= SB_RDONLY; bch2_fs_emergency_read_only(c); - thaw_bdev(c->vfs_sb->s_bdev); + bdev_thaw(c->vfs_sb->s_bdev); break; - case FSOP_GOING_FLAGS_LOGFLUSH: bch2_journal_flush(&c->journal); fallthrough; - case FSOP_GOING_FLAGS_NOLOGFLUSH: - c->vfs_sb->s_flags |= SB_RDONLY; bch2_fs_emergency_read_only(c); break; default: ret = -EINVAL; break; } -err: - up_write(&c->vfs_sb->s_umount); + return ret; } diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 49da8db1d9..77ae65542d 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -93,7 +93,7 @@ retry: BTREE_ITER_INTENT) ?: (set ? set(trans, inode, &inode_u, p) : 0) ?: bch2_inode_write(trans, &iter, &inode_u) ?: - bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); /* * the btree node lock protects inode->ei_inode, not ei_update_lock; @@ -435,7 +435,7 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir, bch2_subvol_is_ro(c, inode->ei_subvol) ?: __bch2_link(c, inode, dir, dentry); if (unlikely(ret)) - return ret; + return bch2_err_class(ret); ihold(&inode->v); d_instantiate(dentry, &inode->v); @@ -455,7 +455,7 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL, + BCH_TRANS_COMMIT_no_enospc, bch2_unlink_trans(trans, inode_inum(dir), &dir_u, &inode_u, &dentry->d_name, @@ -487,8 +487,9 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) struct bch_inode_info *dir= to_bch_ei(vdir); struct bch_fs *c = dir->v.i_sb->s_fs_info; - return bch2_subvol_is_ro(c, dir->ei_subvol) ?: + int ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?: __bch2_unlink(vdir, dentry, false); + return bch2_err_class(ret); } static int bch2_symlink(struct mnt_idmap *idmap, @@ -523,7 +524,7 @@ static int bch2_symlink(struct mnt_idmap *idmap, return 0; err: iput(&inode->v); - return ret; + return bch2_err_class(ret); } static int bch2_mkdir(struct mnt_idmap *idmap, @@ -641,7 +642,7 @@ err: src_inode, dst_inode); - return ret; + return bch2_err_class(ret); } static void bch2_setattr_copy(struct mnt_idmap *idmap, @@ -729,7 +730,7 @@ retry: ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_enospc); btree_err: bch2_trans_iter_exit(trans, &inode_iter); @@ -1012,15 +1013,13 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret; if (!dir_emit_dots(file, ctx)) return 0; - ret = bch2_readdir(c, inode_inum(inode), ctx); - if (ret) - bch_err_fn(c, ret); + int ret = bch2_readdir(c, inode_inum(inode), ctx); + bch_err_fn(c, ret); return bch2_err_class(ret); } @@ -1131,7 +1130,7 @@ static const struct address_space_operations bch_address_space_operations = { #ifdef CONFIG_MIGRATION .migrate_folio = filemap_migrate_folio, #endif - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, }; struct bcachefs_fid { @@ -1500,7 +1499,7 @@ static void bch2_evict_inode(struct inode *vinode) void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) { - struct bch_inode_info *inode, **i; + struct bch_inode_info *inode; DARRAY(struct bch_inode_info *) grabbed; bool clean_pass = false, this_pass_clean; @@ -1626,43 +1625,18 @@ static struct bch_fs *bch2_path_to_fs(const char *path) return c ?: ERR_PTR(-ENOENT); } -static char **split_devs(const char *_dev_name, unsigned *nr) -{ - char *dev_name = NULL, **devs = NULL, *s; - size_t i = 0, nr_devs = 0; - - dev_name = kstrdup(_dev_name, GFP_KERNEL); - if (!dev_name) - return NULL; - - for (s = dev_name; s; s = strchr(s + 1, ':')) - nr_devs++; - - devs = kcalloc(nr_devs + 1, sizeof(const char *), GFP_KERNEL); - if (!devs) { - kfree(dev_name); - return NULL; - } - - while ((s = strsep(&dev_name, ":"))) - devs[i++] = s; - - *nr = nr_devs; - return devs; -} - static int bch2_remount(struct super_block *sb, int *flags, char *data) { struct bch_fs *c = sb->s_fs_info; struct bch_opts opts = bch2_opts_empty(); int ret; - opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); - ret = bch2_parse_mount_opts(c, &opts, data); if (ret) goto err; + opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); + if (opts.read_only != c->opts.read_only) { down_write(&c->state_lock); @@ -1696,11 +1670,9 @@ err: static int bch2_show_devname(struct seq_file *seq, struct dentry *root) { struct bch_fs *c = root->d_sb->s_fs_info; - struct bch_dev *ca; - unsigned i; bool first = true; - for_each_online_member(ca, c, i) { + for_each_online_member(c, ca) { if (!first) seq_putc(seq, ':'); first = false; @@ -1770,7 +1742,7 @@ static int bch2_unfreeze(struct super_block *sb) struct bch_fs *c = sb->s_fs_info; int ret; - if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) + if (test_bit(BCH_FS_emergency_ro, &c->flags)) return 0; down_write(&c->state_lock); @@ -1805,17 +1777,18 @@ static int bch2_noset_super(struct super_block *s, void *data) return -EBUSY; } +typedef DARRAY(struct bch_fs *) darray_fs; + static int bch2_test_super(struct super_block *s, void *data) { struct bch_fs *c = s->s_fs_info; - struct bch_fs **devs = data; - unsigned i; + darray_fs *d = data; if (!c) return false; - for (i = 0; devs[i]; i++) - if (c != devs[i]) + darray_for_each(*d, i) + if (c != *i) return false; return true; } @@ -1824,13 +1797,9 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { struct bch_fs *c; - struct bch_dev *ca; struct super_block *sb; struct inode *vinode; struct bch_opts opts = bch2_opts_empty(); - char **devs; - struct bch_fs **devs_to_fs = NULL; - unsigned i, nr_devs; int ret; opt_set(opts, read_only, (flags & SB_RDONLY) != 0); @@ -1842,25 +1811,25 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, if (!dev_name || strlen(dev_name) == 0) return ERR_PTR(-EINVAL); - devs = split_devs(dev_name, &nr_devs); - if (!devs) - return ERR_PTR(-ENOMEM); + darray_str devs; + ret = bch2_split_devs(dev_name, &devs); + if (ret) + return ERR_PTR(ret); - devs_to_fs = kcalloc(nr_devs + 1, sizeof(void *), GFP_KERNEL); - if (!devs_to_fs) { - sb = ERR_PTR(-ENOMEM); - goto got_sb; + darray_fs devs_to_fs = {}; + darray_for_each(devs, i) { + ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i)); + if (ret) { + sb = ERR_PTR(ret); + goto got_sb; + } } - for (i = 0; i < nr_devs; i++) - devs_to_fs[i] = bch2_path_to_fs(devs[i]); - - sb = sget(fs_type, bch2_test_super, bch2_noset_super, - flags|SB_NOSEC, devs_to_fs); + sb = sget(fs_type, bch2_test_super, bch2_noset_super, flags|SB_NOSEC, &devs_to_fs); if (!IS_ERR(sb)) goto got_sb; - c = bch2_fs_open(devs, nr_devs, opts); + c = bch2_fs_open(devs.data, devs.nr, opts); if (IS_ERR(c)) { sb = ERR_CAST(c); goto got_sb; @@ -1880,9 +1849,8 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, if (IS_ERR(sb)) bch2_fs_stop(c); got_sb: - kfree(devs_to_fs); - kfree(devs[0]); - kfree(devs); + darray_exit(&devs_to_fs); + bch2_darray_str_exit(&devs); if (IS_ERR(sb)) { ret = PTR_ERR(sb); @@ -1923,7 +1891,7 @@ got_sb: sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; - for_each_online_member(ca, c, i) { + for_each_online_member(c, ca) { struct block_device *bdev = ca->disk_sb.bdev; /* XXX: create an anonymous device for multi device filesystems */ @@ -1944,10 +1912,9 @@ got_sb: vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); ret = PTR_ERR_OR_ZERO(vinode); - if (ret) { - bch_err_msg(c, ret, "mounting: error getting root inode"); + bch_err_msg(c, ret, "mounting: error getting root inode"); + if (ret) goto err_put_super; - } sb->s_root = d_make_root(vinode); if (!sb->s_root) { diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 5edf1d4b9e..c3af7225ff 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -77,9 +77,8 @@ static inline int ptrcmp(void *l, void *r) } enum bch_inode_lock_op { - INODE_LOCK = (1U << 0), - INODE_PAGECACHE_BLOCK = (1U << 1), - INODE_UPDATE_LOCK = (1U << 2), + INODE_PAGECACHE_BLOCK = (1U << 0), + INODE_UPDATE_LOCK = (1U << 1), }; #define bch2_lock_inodes(_locks, ...) \ @@ -91,8 +90,6 @@ do { \ \ for (i = 1; i < ARRAY_SIZE(a); i++) \ if (a[i] != a[i - 1]) { \ - if ((_locks) & INODE_LOCK) \ - down_write_nested(&a[i]->v.i_rwsem, i); \ if ((_locks) & INODE_PAGECACHE_BLOCK) \ bch2_pagecache_block_get(a[i]);\ if ((_locks) & INODE_UPDATE_LOCK) \ @@ -109,8 +106,6 @@ do { \ \ for (i = 1; i < ARRAY_SIZE(a); i++) \ if (a[i] != a[i - 1]) { \ - if ((_locks) & INODE_LOCK) \ - up_write(&a[i]->v.i_rwsem); \ if ((_locks) & INODE_PAGECACHE_BLOCK) \ bch2_pagecache_block_put(a[i]);\ if ((_locks) & INODE_UPDATE_LOCK) \ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index e0c5cd119a..6a760777ba 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -20,8 +20,6 @@ #include <linux/bsearch.h> #include <linux/dcache.h> /* struct qstr */ -#define QSTR(n) { { { .len = strlen(n) } }, .name = n } - /* * XXX: this is handling transaction restarts without returning * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore: @@ -29,19 +27,16 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum, u32 snapshot) { - struct btree_iter iter; - struct bkey_s_c k; u64 sectors = 0; - int ret; - for_each_btree_key_upto(trans, iter, BTREE_ID_extents, + int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_extents, SPOS(inum, 0, snapshot), POS(inum, U64_MAX), - 0, k, ret) + 0, k, ({ if (bkey_extent_is_allocation(k.k)) sectors += k.k->size; - - bch2_trans_iter_exit(trans, &iter); + 0; + })); return ret ?: sectors; } @@ -49,45 +44,23 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum, static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum, u32 snapshot) { - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_s_c_dirent d; u64 subdirs = 0; - int ret; - - for_each_btree_key_upto(trans, iter, BTREE_ID_dirents, - SPOS(inum, 0, snapshot), - POS(inum, U64_MAX), - 0, k, ret) { - if (k.k->type != KEY_TYPE_dirent) - continue; - d = bkey_s_c_to_dirent(k); - if (d.v->d_type == DT_DIR) + int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_dirents, + SPOS(inum, 0, snapshot), + POS(inum, U64_MAX), + 0, k, ({ + if (k.k->type == KEY_TYPE_dirent && + bkey_s_c_to_dirent(k).v->d_type == DT_DIR) subdirs++; - } - bch2_trans_iter_exit(trans, &iter); + 0; + })); return ret ?: subdirs; } -static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot, - u32 *subvol) -{ - struct bch_snapshot s; - int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, - POS(0, snapshot), 0, - snapshot, &s); - if (!ret) - *subvol = le32_to_cpu(s.subvol); - else if (bch2_err_matches(ret, ENOENT)) - bch_err(trans->c, "snapshot %u not found", snapshot); - return ret; - -} - -static int __subvol_lookup(struct btree_trans *trans, u32 subvol, - u32 *snapshot, u64 *inum) +static int subvol_lookup(struct btree_trans *trans, u32 subvol, + u32 *snapshot, u64 *inum) { struct bch_subvolume s; int ret; @@ -99,12 +72,6 @@ static int __subvol_lookup(struct btree_trans *trans, u32 subvol, return ret; } -static int subvol_lookup(struct btree_trans *trans, u32 subvol, - u32 *snapshot, u64 *inum) -{ - return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, inum)); -} - static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, struct bch_inode_unpacked *inode) { @@ -132,7 +99,7 @@ err: return ret; } -static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, +static int lookup_inode(struct btree_trans *trans, u64 inode_nr, struct bch_inode_unpacked *inode, u32 *snapshot) { @@ -152,29 +119,19 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, if (!ret) *snapshot = iter.pos.snapshot; err: - bch_err_msg(trans->c, ret, "fetching inode %llu:%u", inode_nr, *snapshot); bch2_trans_iter_exit(trans, &iter); return ret; } -static int lookup_inode(struct btree_trans *trans, u64 inode_nr, - struct bch_inode_unpacked *inode, - u32 *snapshot) -{ - return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot)); -} - -static int __lookup_dirent(struct btree_trans *trans, +static int lookup_dirent_in_snapshot(struct btree_trans *trans, struct bch_hash_info hash_info, subvol_inum dir, struct qstr *name, - u64 *target, unsigned *type) + u64 *target, unsigned *type, u32 snapshot) { struct btree_iter iter; struct bkey_s_c_dirent d; - int ret; - - ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc, - &hash_info, dir, name, 0); + int ret = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc, + &hash_info, dir, name, 0, snapshot); if (ret) return ret; @@ -207,12 +164,9 @@ static int fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode, u32 snapshot) { - int ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - __write_inode(trans, inode, snapshot)); - if (ret) - bch_err_fn(trans->c, ret); + int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + __write_inode(trans, inode, snapshot)); + bch_err_fn(trans->c, ret); return ret; } @@ -242,35 +196,44 @@ err: } /* Get lost+found, create if it doesn't exist: */ -static int lookup_lostfound(struct btree_trans *trans, u32 subvol, +static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, struct bch_inode_unpacked *lostfound) { struct bch_fs *c = trans->c; - struct bch_inode_unpacked root; - struct bch_hash_info root_hash_info; struct qstr lostfound_str = QSTR("lost+found"); - subvol_inum root_inum = { .subvol = subvol }; u64 inum = 0; unsigned d_type = 0; - u32 snapshot; int ret; - ret = __subvol_lookup(trans, subvol, &snapshot, &root_inum.inum); + struct bch_snapshot_tree st; + ret = bch2_snapshot_tree_lookup(trans, + bch2_snapshot_tree(c, snapshot), &st); + if (ret) + return ret; + + subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) }; + u32 subvol_snapshot; + + ret = subvol_lookup(trans, le32_to_cpu(st.master_subvol), + &subvol_snapshot, &root_inum.inum); + bch_err_msg(c, ret, "looking up root subvol"); if (ret) return ret; - ret = __lookup_inode(trans, root_inum.inum, &root, &snapshot); + struct bch_inode_unpacked root_inode; + struct bch_hash_info root_hash_info; + u32 root_inode_snapshot = snapshot; + ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot); + bch_err_msg(c, ret, "looking up root inode"); if (ret) return ret; - root_hash_info = bch2_hash_info_init(c, &root); + root_hash_info = bch2_hash_info_init(c, &root_inode); - ret = __lookup_dirent(trans, root_hash_info, root_inum, - &lostfound_str, &inum, &d_type); - if (bch2_err_matches(ret, ENOENT)) { - bch_notice(c, "creating lost+found"); + ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum, + &lostfound_str, &inum, &d_type, snapshot); + if (bch2_err_matches(ret, ENOENT)) goto create_lostfound; - } bch_err_fn(c, ret); if (ret) @@ -285,20 +248,53 @@ static int lookup_lostfound(struct btree_trans *trans, u32 subvol, * The bch2_check_dirents pass has already run, dangling dirents * shouldn't exist here: */ - return __lookup_inode(trans, inum, lostfound, &snapshot); + ret = lookup_inode(trans, inum, lostfound, &snapshot); + bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)", + inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot)); + return ret; create_lostfound: + /* + * XXX: we could have a nicer log message here if we had a nice way to + * walk backpointers to print a path + */ + bch_notice(c, "creating lost+found in snapshot %u", le32_to_cpu(st.root_snapshot)); + + u64 now = bch2_current_time(c); + struct btree_iter lostfound_iter = { NULL }; + u64 cpu = raw_smp_processor_id(); + bch2_inode_init_early(c, lostfound); + bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode); + lostfound->bi_dir = root_inode.bi_inum; + + root_inode.bi_nlink++; + + ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu); + if (ret) + goto err; - ret = bch2_create_trans(trans, root_inum, &root, - lostfound, &lostfound_str, - 0, 0, S_IFDIR|0700, 0, NULL, NULL, - (subvol_inum) { }, 0); + bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot); + ret = bch2_btree_iter_traverse(&lostfound_iter); + if (ret) + goto err; + + ret = bch2_dirent_create_snapshot(trans, + root_inode.bi_inum, snapshot, &root_hash_info, + mode_to_type(lostfound->bi_mode), + &lostfound_str, + lostfound->bi_inum, + &lostfound->bi_dir_offset, + BCH_HASH_SET_MUST_CREATE) ?: + bch2_inode_write_flags(trans, &lostfound_iter, lostfound, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); +err: bch_err_msg(c, ret, "creating lost+found"); + bch2_trans_iter_exit(trans, &lostfound_iter); return ret; } -static int __reattach_inode(struct btree_trans *trans, +static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode, u32 inode_snapshot) { @@ -307,14 +303,9 @@ static int __reattach_inode(struct btree_trans *trans, char name_buf[20]; struct qstr name; u64 dir_offset = 0; - u32 subvol; int ret; - ret = __snapshot_lookup_subvol(trans, inode_snapshot, &subvol); - if (ret) - return ret; - - ret = lookup_lostfound(trans, subvol, &lostfound); + ret = lookup_lostfound(trans, inode_snapshot, &lostfound); if (ret) return ret; @@ -331,15 +322,12 @@ static int __reattach_inode(struct btree_trans *trans, snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); name = (struct qstr) QSTR(name_buf); - ret = bch2_dirent_create(trans, - (subvol_inum) { - .subvol = subvol, - .inum = lostfound.bi_inum, - }, - &dir_hash, - inode_d_type(inode), - &name, inode->bi_inum, &dir_offset, - BCH_HASH_SET_MUST_CREATE); + ret = bch2_dirent_create_snapshot(trans, + lostfound.bi_inum, inode_snapshot, + &dir_hash, + inode_d_type(inode), + &name, inode->bi_inum, &dir_offset, + BCH_HASH_SET_MUST_CREATE); if (ret) return ret; @@ -349,18 +337,6 @@ static int __reattach_inode(struct btree_trans *trans, return __write_inode(trans, inode, inode_snapshot); } -static int reattach_inode(struct btree_trans *trans, - struct bch_inode_unpacked *inode, - u32 inode_snapshot) -{ - int ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL, - __reattach_inode(trans, inode, inode_snapshot)); - bch_err_msg(trans->c, ret, "reattaching inode %llu", inode->bi_inum); - return ret; -} - static int remove_backpointer(struct btree_trans *trans, struct bch_inode_unpacked *inode) { @@ -405,7 +381,7 @@ static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s }; int ret = 0; - darray_for_each(s->ids, i) { + __darray_for_each(s->ids, i) { if (i->id == id) return 0; if (i->id > id) @@ -422,7 +398,7 @@ static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, enum btree_id btree_id, struct bpos pos) { - struct snapshots_seen_entry *i, n = { + struct snapshots_seen_entry n = { .id = pos.snapshot, .equiv = bch2_snapshot_equiv(c, pos.snapshot), }; @@ -448,7 +424,7 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, bch2_btree_id_str(btree_id), pos.inode, pos.offset, i->id, n.id, n.equiv); - set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags); + set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots); } } @@ -593,14 +569,13 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; - u32 restart_count = trans->restart_count; int ret; w->recalculate_sums = false; w->inodes.nr = 0; - for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum), - BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum), + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { if (k.k->p.offset != inum) break; @@ -613,8 +588,7 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, return ret; w->first_this_inode = true; - - return trans_was_restarted(trans, restart_count); + return 0; } static struct inode_walker_entry * @@ -625,7 +599,7 @@ lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, snapshot = bch2_snapshot_equiv(c, snapshot); - darray_for_each(w->inodes, i) + __darray_for_each(w->inodes, i) if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot)) goto found; @@ -667,11 +641,8 @@ static struct inode_walker_entry *walk_inode(struct btree_trans *trans, if (ret) return ERR_PTR(ret); } else if (bkey_cmp(w->last_pos, pos)) { - struct inode_walker_entry *i; - darray_for_each(w->inodes, i) i->seen_this_pos = false; - } w->last_pos = pos; @@ -756,9 +727,7 @@ static int hash_redo_key(struct btree_trans *trans, k.k->p.snapshot, tmp, BCH_HASH_SET_MUST_CREATE, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: - bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); } static int hash_check_key(struct btree_trans *trans, @@ -826,6 +795,18 @@ fsck_err: goto out; } +static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p) +{ + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0); + int ret = bkey_err(k); + if (ret) + return ret; + + bch2_trans_iter_exit(trans, &iter); + return k.k->type == KEY_TYPE_set; +} + static int check_inode(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, @@ -867,7 +848,7 @@ static int check_inode(struct btree_trans *trans, c, inode_snapshot_mismatch, "inodes in different snapshots don't match")) { bch_err(c, "repair not implemented yet"); - return -EINVAL; + return -BCH_ERR_fsck_repair_unimplemented; } if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) && @@ -890,14 +871,22 @@ static int check_inode(struct btree_trans *trans, return 0; } + if (u.bi_flags & BCH_INODE_unlinked) { + ret = check_inode_deleted_list(trans, k.k->p); + if (ret < 0) + return ret; + + fsck_err_on(ret, c, unlinked_inode_not_on_deleted_list, + "inode %llu:%u unlinked, but not on deleted list", + u.bi_inum, k.k->p.snapshot); + ret = 0; + } + if (u.bi_flags & BCH_INODE_unlinked && (!c->sb.clean || fsck_err(c, inode_unlinked_but_clean, "filesystem marked clean, but inode %llu unlinked", u.bi_inum))) { - bch2_trans_unlock(trans); - bch2_fs_lazy_rw(c); - ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); bch_err_msg(c, ret, "in fsck deleting inode"); return ret; @@ -910,9 +899,6 @@ static int check_inode(struct btree_trans *trans, u.bi_inum))) { bch_verbose(c, "truncating inode %llu", u.bi_inum); - bch2_trans_unlock(trans); - bch2_fs_lazy_rw(c); - /* * XXX: need to truncate partial blocks too here - or ideally * just switch units to bytes and that issue goes away @@ -976,27 +962,22 @@ fsck_err: return ret; } -noinline_for_stack int bch2_check_inodes(struct bch_fs *c) { bool full = c->opts.fsck; - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; struct bch_inode_unpacked prev = { 0 }; struct snapshots_seen s; - struct bkey_s_c k; - int ret; snapshots_seen_init(&s); - ret = for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, - POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_inode(trans, &iter, k, &prev, &s, full)); + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, + POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + check_inode(trans, &iter, k, &prev, &s, full))); snapshots_seen_exit(&s); - bch2_trans_put(trans); bch_err_fn(c, ret); return ret; } @@ -1023,29 +1004,9 @@ static bool dirent_points_to_inode(struct bkey_s_c_dirent d, : le64_to_cpu(d.v->d_inum) == inode->bi_inum; } -static int inode_backpointer_exists(struct btree_trans *trans, - struct bch_inode_unpacked *inode, - u32 snapshot) -{ - struct btree_iter iter; - struct bkey_s_c_dirent d; - int ret; - - d = dirent_get_by_pos(trans, &iter, - SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot)); - ret = bkey_err(d); - if (ret) - return bch2_err_matches(ret, ENOENT) ? 0 : ret; - - ret = dirent_points_to_inode(d, inode); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) { struct bch_fs *c = trans->c; - struct inode_walker_entry *i; u32 restart_count = trans->restart_count; int ret = 0; s64 count2; @@ -1094,11 +1055,8 @@ struct extent_ends { static void extent_ends_reset(struct extent_ends *extent_ends) { - struct extent_end *i; - darray_for_each(extent_ends->e, i) snapshots_seen_exit(&i->seen); - extent_ends->e.nr = 0; } @@ -1130,7 +1088,7 @@ static int extent_ends_at(struct bch_fs *c, if (!n.seen.ids.data) return -BCH_ERR_ENOMEM_fsck_extent_ends_at; - darray_for_each(extent_ends->e, i) { + __darray_for_each(extent_ends->e, i) { if (i->snapshot == k.k->p.snapshot) { snapshots_seen_exit(&i->seen); *i = n; @@ -1220,13 +1178,12 @@ static int overlapping_extents_found(struct btree_trans *trans, swap(k1, k2); } - trans->extra_journal_res += bch2_bkey_sectors_compressed(k2); + trans->extra_disk_res += bch2_bkey_sectors_compressed(k2); ret = bch2_trans_update_extent_overwrite(trans, old_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, k1, k2) ?: - bch2_trans_commit(trans, &res, NULL, - BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL); + bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc); bch2_disk_reservation_put(c, &res); if (ret) @@ -1270,7 +1227,6 @@ static int check_overlapping_extents(struct btree_trans *trans, bool *fixed) { struct bch_fs *c = trans->c; - struct extent_end *i; int ret = 0; /* transaction restart, running again */ @@ -1451,32 +1407,28 @@ int bch2_check_extents(struct bch_fs *c) { struct inode_walker w = inode_walker_init(); struct snapshots_seen s; - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; struct extent_ends extent_ends; struct disk_reservation res = { 0 }; - int ret = 0; snapshots_seen_init(&s); extent_ends_init(&extent_ends); - ret = for_each_btree_key_commit(trans, iter, BTREE_ID_extents, - POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - &res, NULL, - BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({ - bch2_disk_reservation_put(c, &res); - check_extent(trans, &iter, k, &w, &s, &extent_ends) ?: - check_extent_overbig(trans, &iter, k); - })) ?: - check_i_sectors(trans, &w); + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_extents, + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + &res, NULL, + BCH_TRANS_COMMIT_no_enospc, ({ + bch2_disk_reservation_put(c, &res); + check_extent(trans, &iter, k, &w, &s, &extent_ends) ?: + check_extent_overbig(trans, &iter, k); + })) ?: + check_i_sectors(trans, &w)); bch2_disk_reservation_put(c, &res); extent_ends_exit(&extent_ends); inode_walker_exit(&w); snapshots_seen_exit(&s); - bch2_trans_put(trans); bch_err_fn(c, ret); return ret; @@ -1484,24 +1436,19 @@ int bch2_check_extents(struct bch_fs *c) int bch2_check_indirect_extents(struct bch_fs *c) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; struct disk_reservation res = { 0 }; - int ret = 0; - ret = for_each_btree_key_commit(trans, iter, BTREE_ID_reflink, - POS_MIN, - BTREE_ITER_PREFETCH, k, - &res, NULL, - BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({ - bch2_disk_reservation_put(c, &res); - check_extent_overbig(trans, &iter, k); - })); + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_reflink, + POS_MIN, + BTREE_ITER_PREFETCH, k, + &res, NULL, + BCH_TRANS_COMMIT_no_enospc, ({ + bch2_disk_reservation_put(c, &res); + check_extent_overbig(trans, &iter, k); + }))); bch2_disk_reservation_put(c, &res); - bch2_trans_put(trans); - bch_err_fn(c, ret); return ret; } @@ -1509,7 +1456,6 @@ int bch2_check_indirect_extents(struct bch_fs *c) static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) { struct bch_fs *c = trans->c; - struct inode_walker_entry *i; u32 restart_count = trans->restart_count; int ret = 0; s64 count2; @@ -1553,8 +1499,8 @@ static int check_dirent_target(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bkey_i_dirent *n; - bool backpointer_exists = true; struct printbuf buf = PRINTBUF; + struct btree_iter bp_iter = { NULL }; int ret = 0; if (!target->bi_dir && @@ -1568,25 +1514,37 @@ static int check_dirent_target(struct btree_trans *trans, } if (!inode_points_to_dirent(target, d)) { - ret = inode_backpointer_exists(trans, target, d.k->p.snapshot); - if (ret < 0) + struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter, + SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot)); + ret = bkey_err(bp_dirent); + if (ret && !bch2_err_matches(ret, ENOENT)) goto err; - backpointer_exists = ret; + bool backpointer_exists = !ret; ret = 0; + bch2_bkey_val_to_text(&buf, c, d.s_c); + prt_newline(&buf); + if (backpointer_exists) + bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); + if (fsck_err_on(S_ISDIR(target->bi_mode) && backpointer_exists, c, inode_dir_multiple_links, - "directory %llu with multiple links", - target->bi_inum)) { + "directory %llu:%u with multiple links\n%s", + target->bi_inum, target_snapshot, buf.buf)) { ret = __remove_dirent(trans, d.k->p); goto out; } + /* + * hardlinked file with nlink 0: + * We're just adjusting nlink here so check_nlinks() will pick + * it up, it ignores inodes with nlink 0 + */ if (fsck_err_on(backpointer_exists && !target->bi_nlink, c, inode_multiple_links_but_nlink_0, - "inode %llu type %s has multiple links but i_nlink 0", - target->bi_inum, bch2_d_types[d.v->d_type])) { + "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", + target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { target->bi_nlink++; target->bi_flags &= ~BCH_INODE_unlinked; @@ -1636,13 +1594,12 @@ static int check_dirent_target(struct btree_trans *trans, d = dirent_i_to_s_c(n); } - if (d.v->d_type == DT_SUBVOL && - target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) && - (c->sb.version < bcachefs_metadata_version_subvol_dirent || - fsck_err(c, dirent_d_parent_subvol_wrong, - "dirent has wrong d_parent_subvol field: got %u, should be %u", - le32_to_cpu(d.v->d_parent_subvol), - target->bi_parent_subvol))) { + if (fsck_err_on(d.v->d_type == DT_SUBVOL && + target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol), + c, dirent_d_parent_subvol_wrong, + "dirent has wrong d_parent_subvol field: got %u, should be %u", + le32_to_cpu(d.v->d_parent_subvol), + target->bi_parent_subvol)) { n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); ret = PTR_ERR_OR_ZERO(n); if (ret) @@ -1660,6 +1617,7 @@ static int check_dirent_target(struct btree_trans *trans, out: err: fsck_err: + bch2_trans_iter_exit(trans, &bp_iter); printbuf_exit(&buf); bch_err_fn(c, ret); return ret; @@ -1701,7 +1659,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, goto err; } - BUG_ON(!iter->path->should_be_locked); + BUG_ON(!btree_iter_path(trans, iter)->should_be_locked); i = walk_inode(trans, dir, equiv, k.k->type == KEY_TYPE_whiteout); ret = PTR_ERR_OR_ZERO(i); @@ -1754,7 +1712,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, u32 target_snapshot; u64 target_inum; - ret = __subvol_lookup(trans, target_subvol, + ret = subvol_lookup(trans, target_subvol, &target_snapshot, &target_inum); if (ret && !bch2_err_matches(ret, ENOENT)) goto err; @@ -1766,7 +1724,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, goto err; } - ret = __lookup_inode(trans, target_inum, + ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot); if (ret && !bch2_err_matches(ret, ENOENT)) goto err; @@ -1842,22 +1800,18 @@ int bch2_check_dirents(struct bch_fs *c) struct inode_walker target = inode_walker_init(); struct snapshots_seen s; struct bch_hash_info hash_info; - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; snapshots_seen_init(&s); - ret = for_each_btree_key_commit(trans, iter, BTREE_ID_dirents, - POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, - k, - NULL, NULL, - BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)); + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_dirents, + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, + k, + NULL, NULL, + BCH_TRANS_COMMIT_no_enospc, + check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s))); - bch2_trans_put(trans); snapshots_seen_exit(&s); inode_walker_exit(&dir); inode_walker_exit(&target); @@ -1908,8 +1862,6 @@ int bch2_check_xattrs(struct bch_fs *c) { struct inode_walker inode = inode_walker_init(); struct bch_hash_info hash_info; - struct btree_iter iter; - struct bkey_s_c k; int ret = 0; ret = bch2_trans_run(c, @@ -1918,7 +1870,7 @@ int bch2_check_xattrs(struct bch_fs *c) BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, NULL, NULL, - BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + BCH_TRANS_COMMIT_no_enospc, check_xattr(trans, &iter, k, &hash_info, &inode))); bch_err_fn(c, ret); return ret; @@ -1932,7 +1884,7 @@ static int check_root_trans(struct btree_trans *trans) u64 inum; int ret; - ret = __subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum); + ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum); if (ret && !bch2_err_matches(ret, ENOENT)) return ret; @@ -1948,18 +1900,13 @@ static int check_root_trans(struct btree_trans *trans) root_subvol.v.flags = 0; root_subvol.v.snapshot = cpu_to_le32(snapshot); root_subvol.v.inode = cpu_to_le64(inum); - ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, - &root_subvol.k_i, 0)); + ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol.k_i, 0); bch_err_msg(c, ret, "writing root subvol"); if (ret) goto err; - } - ret = __lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot); + ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot); if (ret && !bch2_err_matches(ret, ENOENT)) return ret; @@ -1983,11 +1930,7 @@ fsck_err: /* Get root directory, create if it doesn't exist: */ int bch2_check_root(struct bch_fs *c) { - int ret; - - ret = bch2_trans_do(c, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, + int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_root_trans(trans)); bch_err_fn(c, ret); return ret; @@ -2002,13 +1945,10 @@ typedef DARRAY(struct pathbuf_entry) pathbuf; static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) { - struct pathbuf_entry *i; - darray_for_each(*p, i) if (i->inum == inum && i->snapshot == snapshot) return true; - return false; } @@ -2057,10 +1997,10 @@ static int check_path(struct btree_trans *trans, break; } - ret = lockrestart_do(trans, - PTR_ERR_OR_ZERO((d = dirent_get_by_pos(trans, &dirent_iter, - SPOS(inode->bi_dir, inode->bi_dir_offset, - parent_snapshot))).k)); + d = dirent_get_by_pos(trans, &dirent_iter, + SPOS(inode->bi_dir, inode->bi_dir_offset, + parent_snapshot)); + ret = bkey_err(d.s_c); if (ret && !bch2_err_matches(ret, ENOENT)) break; @@ -2097,13 +2037,12 @@ static int check_path(struct btree_trans *trans, ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot); if (ret) { /* Should have been caught in dirents pass */ - bch_err(c, "error looking up parent directory: %i", ret); + if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err(c, "error looking up parent directory: %i", ret); break; } if (path_is_dup(p, inode->bi_inum, snapshot)) { - struct pathbuf_entry *i; - /* XXX print path */ bch_err(c, "directory structure loop"); @@ -2111,20 +2050,19 @@ static int check_path(struct btree_trans *trans, pr_err("%llu:%u", i->inum, i->snapshot); pr_err("%llu:%u", inode->bi_inum, snapshot); - if (!fsck_err(c, dir_loop, - "directory structure loop")) + if (!fsck_err(c, dir_loop, "directory structure loop")) return 0; - ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - remove_backpointer(trans, inode)); - if (ret) { - bch_err(c, "error removing dirent: %i", ret); + ret = remove_backpointer(trans, inode); + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err_msg(c, ret, "removing dirent"); + if (ret) break; - } ret = reattach_inode(trans, inode, snapshot); + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err_msg(c, ret, "reattaching inode %llu", inode->bi_inum); + break; } } fsck_err: @@ -2139,37 +2077,28 @@ fsck_err: */ int bch2_check_directory_structure(struct bch_fs *c) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; struct bch_inode_unpacked u; pathbuf path = { 0, }; int ret; - for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - if (!bkey_is_inode(k.k)) - continue; + ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN, + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + if (!bkey_is_inode(k.k)) + continue; - ret = bch2_inode_unpack(k, &u); - if (ret) { - /* Should have been caught earlier in fsck: */ - bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret); - break; - } + BUG_ON(bch2_inode_unpack(k, &u)); - if (u.bi_flags & BCH_INODE_unlinked) - continue; + if (u.bi_flags & BCH_INODE_unlinked) + continue; - ret = check_path(trans, &path, &u, iter.pos.snapshot); - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); + check_path(trans, &path, &u, iter.pos.snapshot); + }))); darray_exit(&path); + bch_err_fn(c, ret); return ret; } @@ -2255,47 +2184,39 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, struct nlink_table *t, u64 start, u64 *end) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - struct bch_inode_unpacked u; - int ret = 0; - - for_each_btree_key(trans, iter, BTREE_ID_inodes, - POS(0, start), - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - if (!bkey_is_inode(k.k)) - continue; - - /* Should never fail, checked by bch2_inode_invalid: */ - BUG_ON(bch2_inode_unpack(k, &u)); - - /* - * Backpointer and directory structure checks are sufficient for - * directories, since they can't have hardlinks: - */ - if (S_ISDIR(u.bi_mode)) - continue; + int ret = bch2_trans_run(c, + for_each_btree_key(trans, iter, BTREE_ID_inodes, + POS(0, start), + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ({ + if (!bkey_is_inode(k.k)) + continue; - if (!u.bi_nlink) - continue; + /* Should never fail, checked by bch2_inode_invalid: */ + struct bch_inode_unpacked u; + BUG_ON(bch2_inode_unpack(k, &u)); - ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot); - if (ret) { - *end = k.k->p.offset; - ret = 0; - break; - } + /* + * Backpointer and directory structure checks are sufficient for + * directories, since they can't have hardlinks: + */ + if (S_ISDIR(u.bi_mode)) + continue; - } - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); + if (!u.bi_nlink) + continue; - if (ret) - bch_err(c, "error in fsck: btree error %i while walking inodes", ret); + ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot); + if (ret) { + *end = k.k->p.offset; + ret = 0; + break; + } + 0; + }))); + bch_err_fn(c, ret); return ret; } @@ -2303,42 +2224,34 @@ noinline_for_stack static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links, u64 range_start, u64 range_end) { - struct btree_trans *trans = bch2_trans_get(c); struct snapshots_seen s; - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_s_c_dirent d; - int ret; snapshots_seen_init(&s); - for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN, - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p); - if (ret) - break; - - switch (k.k->type) { - case KEY_TYPE_dirent: - d = bkey_s_c_to_dirent(k); + int ret = bch2_trans_run(c, + for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN, + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ({ + ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p); + if (ret) + break; - if (d.v->d_type != DT_DIR && - d.v->d_type != DT_SUBVOL) - inc_link(c, &s, links, range_start, range_end, - le64_to_cpu(d.v->d_inum), - bch2_snapshot_equiv(c, d.k->p.snapshot)); - break; - } - } - bch2_trans_iter_exit(trans, &iter); + if (k.k->type == KEY_TYPE_dirent) { + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - if (ret) - bch_err(c, "error in fsck: btree error %i while walking dirents", ret); + if (d.v->d_type != DT_DIR && + d.v->d_type != DT_SUBVOL) + inc_link(c, &s, links, range_start, range_end, + le64_to_cpu(d.v->d_inum), + bch2_snapshot_equiv(c, d.k->p.snapshot)); + } + 0; + }))); - bch2_trans_put(trans); snapshots_seen_exit(&s); + + bch_err_fn(c, ret); return ret; } @@ -2389,19 +2302,16 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, struct nlink_table *links, u64 range_start, u64 range_end) { - struct btree_iter iter; - struct bkey_s_c k; size_t idx = 0; - int ret = 0; - ret = bch2_trans_run(c, + int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS(0, range_start), BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end))); if (ret < 0) { - bch_err(c, "error in fsck: btree error %i while walking inodes", ret); + bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret)); return ret; } @@ -2447,7 +2357,6 @@ static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter, { struct bkey_s_c_reflink_p p; struct bkey_i_reflink_p *u; - int ret; if (k.k->type != KEY_TYPE_reflink_p) return 0; @@ -2458,7 +2367,7 @@ static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter, return 0; u = bch2_trans_kmalloc(trans, sizeof(*u)); - ret = PTR_ERR_OR_ZERO(u); + int ret = PTR_ERR_OR_ZERO(u); if (ret) return ret; @@ -2471,19 +2380,15 @@ static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter, int bch2_fix_reflink_p(struct bch_fs *c) { - struct btree_iter iter; - struct bkey_s_c k; - int ret; - if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) return 0; - ret = bch2_trans_run(c, + int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_extents, POS_MIN, BTREE_ITER_INTENT|BTREE_ITER_PREFETCH| BTREE_ITER_ALL_SNAPSHOTS, k, - NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, fix_reflink_p_key(trans, &iter, k))); bch_err_fn(c, ret); return ret; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 9309cfeecd..086f0090b0 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -506,22 +506,33 @@ fsck_err: static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) { - prt_printf(out, "mode=%o ", inode->bi_mode); + printbuf_indent_add(out, 2); + prt_printf(out, "mode=%o", inode->bi_mode); + prt_newline(out); prt_str(out, "flags="); prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1)); prt_printf(out, " (%x)", inode->bi_flags); + prt_newline(out); - prt_printf(out, " journal_seq=%llu bi_size=%llu bi_sectors=%llu bi_version=%llu", - inode->bi_journal_seq, - inode->bi_size, - inode->bi_sectors, - inode->bi_version); + prt_printf(out, "journal_seq=%llu", inode->bi_journal_seq); + prt_newline(out); + + prt_printf(out, "bi_size=%llu", inode->bi_size); + prt_newline(out); + + prt_printf(out, "bi_sectors=%llu", inode->bi_sectors); + prt_newline(out); + + prt_newline(out); + prt_printf(out, "bi_version=%llu", inode->bi_version); #define x(_name, _bits) \ - prt_printf(out, " "#_name "=%llu", (u64) inode->_name); + prt_printf(out, #_name "=%llu", (u64) inode->_name); \ + prt_newline(out); BCH_INODE_FIELDS_v3() #undef x + printbuf_indent_sub(out, 2); } void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) @@ -561,64 +572,46 @@ static inline bool bkey_is_deleted_inode(struct bkey_s_c k) return bkey_inode_flags(k) & BCH_INODE_unlinked; } -int bch2_trans_mark_inode(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, - struct bkey_i *new, - unsigned flags) +int bch2_trigger_inode(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, + struct bkey_s new, + unsigned flags) { - int nr = bkey_is_inode(&new->k) - bkey_is_inode(old.k); - bool old_deleted = bkey_is_deleted_inode(old); - bool new_deleted = bkey_is_deleted_inode(bkey_i_to_s_c(new)); + s64 nr = bkey_is_inode(new.k) - bkey_is_inode(old.k); - if (nr) { - int ret = bch2_replicas_deltas_realloc(trans, 0); - struct replicas_delta_list *d = trans->fs_usage_deltas; + if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + if (nr) { + int ret = bch2_replicas_deltas_realloc(trans, 0); + if (ret) + return ret; - if (ret) - return ret; + trans->fs_usage_deltas->nr_inodes += nr; + } - d->nr_inodes += nr; + bool old_deleted = bkey_is_deleted_inode(old); + bool new_deleted = bkey_is_deleted_inode(new.s_c); + if (old_deleted != new_deleted) { + int ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new.k->p, new_deleted); + if (ret) + return ret; + } } - if (old_deleted != new_deleted) { - int ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new->k.p, new_deleted); - if (ret) - return ret; - } + if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) { + BUG_ON(!trans->journal_res.seq); - return 0; -} - -int bch2_mark_inode(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) -{ - struct bch_fs *c = trans->c; - struct bch_fs_usage *fs_usage; - u64 journal_seq = trans->journal_res.seq; - - if (flags & BTREE_TRIGGER_INSERT) { - struct bch_inode_v3 *v = (struct bch_inode_v3 *) new.v; - - BUG_ON(!journal_seq); - BUG_ON(new.k->type != KEY_TYPE_inode_v3); - - v->bi_journal_seq = cpu_to_le64(journal_seq); + bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq); } if (flags & BTREE_TRIGGER_GC) { - percpu_down_read(&c->mark_lock); - preempt_disable(); + struct bch_fs *c = trans->c; - fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC); - fs_usage->nr_inodes += bkey_is_inode(new.k); - fs_usage->nr_inodes -= bkey_is_inode(old.k); - - preempt_enable(); + percpu_down_read(&c->mark_lock); + this_cpu_add(c->usage_gc->b.nr_inodes, nr); percpu_up_read(&c->mark_lock); } + return 0; } @@ -831,7 +824,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, ret = bch2_trans_update(trans, &iter, &delete, 0) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_enospc); err: if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) break; @@ -894,7 +887,7 @@ retry: ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_enospc); err: bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -1058,7 +1051,7 @@ retry: ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_enospc); err: bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -1155,51 +1148,48 @@ delete: int bch2_delete_dead_inodes(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; bool need_another_pass; int ret; again: need_another_pass = false; - ret = bch2_btree_write_buffer_flush_sync(trans); - if (ret) - goto err; - /* * Weird transaction restart handling here because on successful delete, * bch2_inode_rm_snapshot() will return a nested transaction restart, * but we can't retry because the btree write buffer won't have been * flushed and we'd spin: */ - for_each_btree_key(trans, iter, BTREE_ID_deleted_inodes, POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass)); - if (ret < 0) - break; - - if (ret) { - if (!test_bit(BCH_FS_RW, &c->flags)) { - bch2_trans_unlock(trans); - bch2_fs_lazy_rw(c); - } - + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + ret = may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass); + if (ret > 0) { bch_verbose(c, "deleting unlinked inode %llu:%u", k.k->p.offset, k.k->p.snapshot); ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot); - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - break; + /* + * We don't want to loop here: a transaction restart + * error here means we handled a transaction restart and + * we're actually done, but if we loop we'll retry the + * same key because the write buffer hasn't been flushed + * yet + */ + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + ret = 0; + continue; + } } - } - bch2_trans_iter_exit(trans, &iter); - if (!ret && need_another_pass) + ret; + })); + + if (!ret && need_another_pass) { + ret = bch2_btree_write_buffer_flush_sync(trans); + if (ret) + goto err; goto again; + } err: bch2_trans_put(trans); - return ret; } diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 88818a332b..b63f312581 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -17,32 +17,27 @@ int bch2_inode_v3_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_trans_mark_inode(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_i *, unsigned); -int bch2_mark_inode(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s, unsigned); #define bch2_bkey_ops_inode ((struct bkey_ops) { \ .key_invalid = bch2_inode_invalid, \ .val_to_text = bch2_inode_to_text, \ - .trans_trigger = bch2_trans_mark_inode, \ - .atomic_trigger = bch2_mark_inode, \ + .trigger = bch2_trigger_inode, \ .min_val_size = 16, \ }) #define bch2_bkey_ops_inode_v2 ((struct bkey_ops) { \ .key_invalid = bch2_inode_v2_invalid, \ .val_to_text = bch2_inode_to_text, \ - .trans_trigger = bch2_trans_mark_inode, \ - .atomic_trigger = bch2_mark_inode, \ + .trigger = bch2_trigger_inode, \ .min_val_size = 32, \ }) #define bch2_bkey_ops_inode_v3 ((struct bkey_ops) { \ .key_invalid = bch2_inode_v3_invalid, \ .val_to_text = bch2_inode_to_text, \ - .trans_trigger = bch2_trans_mark_inode, \ - .atomic_trigger = bch2_mark_inode, \ + .trigger = bch2_trigger_inode, \ .min_val_size = 48, \ }) diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h new file mode 100644 index 0000000000..83d107331e --- /dev/null +++ b/fs/bcachefs/inode_format.h @@ -0,0 +1,166 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_INODE_FORMAT_H +#define _BCACHEFS_INODE_FORMAT_H + +#define BLOCKDEV_INODE_MAX 4096 +#define BCACHEFS_ROOT_INO 4096 + +struct bch_inode { + struct bch_val v; + + __le64 bi_hash_seed; + __le32 bi_flags; + __le16 bi_mode; + __u8 fields[]; +} __packed __aligned(8); + +struct bch_inode_v2 { + struct bch_val v; + + __le64 bi_journal_seq; + __le64 bi_hash_seed; + __le64 bi_flags; + __le16 bi_mode; + __u8 fields[]; +} __packed __aligned(8); + +struct bch_inode_v3 { + struct bch_val v; + + __le64 bi_journal_seq; + __le64 bi_hash_seed; + __le64 bi_flags; + __le64 bi_sectors; + __le64 bi_size; + __le64 bi_version; + __u8 fields[]; +} __packed __aligned(8); + +#define INODEv3_FIELDS_START_INITIAL 6 +#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64)) + +struct bch_inode_generation { + struct bch_val v; + + __le32 bi_generation; + __le32 pad; +} __packed __aligned(8); + +/* + * bi_subvol and bi_parent_subvol are only set for subvolume roots: + */ + +#define BCH_INODE_FIELDS_v2() \ + x(bi_atime, 96) \ + x(bi_ctime, 96) \ + x(bi_mtime, 96) \ + x(bi_otime, 96) \ + x(bi_size, 64) \ + x(bi_sectors, 64) \ + x(bi_uid, 32) \ + x(bi_gid, 32) \ + x(bi_nlink, 32) \ + x(bi_generation, 32) \ + x(bi_dev, 32) \ + x(bi_data_checksum, 8) \ + x(bi_compression, 8) \ + x(bi_project, 32) \ + x(bi_background_compression, 8) \ + x(bi_data_replicas, 8) \ + x(bi_promote_target, 16) \ + x(bi_foreground_target, 16) \ + x(bi_background_target, 16) \ + x(bi_erasure_code, 16) \ + x(bi_fields_set, 16) \ + x(bi_dir, 64) \ + x(bi_dir_offset, 64) \ + x(bi_subvol, 32) \ + x(bi_parent_subvol, 32) + +#define BCH_INODE_FIELDS_v3() \ + x(bi_atime, 96) \ + x(bi_ctime, 96) \ + x(bi_mtime, 96) \ + x(bi_otime, 96) \ + x(bi_uid, 32) \ + x(bi_gid, 32) \ + x(bi_nlink, 32) \ + x(bi_generation, 32) \ + x(bi_dev, 32) \ + x(bi_data_checksum, 8) \ + x(bi_compression, 8) \ + x(bi_project, 32) \ + x(bi_background_compression, 8) \ + x(bi_data_replicas, 8) \ + x(bi_promote_target, 16) \ + x(bi_foreground_target, 16) \ + x(bi_background_target, 16) \ + x(bi_erasure_code, 16) \ + x(bi_fields_set, 16) \ + x(bi_dir, 64) \ + x(bi_dir_offset, 64) \ + x(bi_subvol, 32) \ + x(bi_parent_subvol, 32) \ + x(bi_nocow, 8) + +/* subset of BCH_INODE_FIELDS */ +#define BCH_INODE_OPTS() \ + x(data_checksum, 8) \ + x(compression, 8) \ + x(project, 32) \ + x(background_compression, 8) \ + x(data_replicas, 8) \ + x(promote_target, 16) \ + x(foreground_target, 16) \ + x(background_target, 16) \ + x(erasure_code, 16) \ + x(nocow, 8) + +enum inode_opt_id { +#define x(name, ...) \ + Inode_opt_##name, + BCH_INODE_OPTS() +#undef x + Inode_opt_nr, +}; + +#define BCH_INODE_FLAGS() \ + x(sync, 0) \ + x(immutable, 1) \ + x(append, 2) \ + x(nodump, 3) \ + x(noatime, 4) \ + x(i_size_dirty, 5) \ + x(i_sectors_dirty, 6) \ + x(unlinked, 7) \ + x(backptr_untrusted, 8) + +/* bits 20+ reserved for packed fields below: */ + +enum bch_inode_flags { +#define x(t, n) BCH_INODE_##t = 1U << n, + BCH_INODE_FLAGS() +#undef x +}; + +enum __bch_inode_flags { +#define x(t, n) __BCH_INODE_##t = n, + BCH_INODE_FLAGS() +#undef x +}; + +LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); +LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); +LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32); + +LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24); +LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31); + +LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24); +LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31); + +LE64_BITMASK(INODEv3_FIELDS_START, + struct bch_inode_v3, bi_flags, 31, 36); +LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52); + +#endif /* _BCACHEFS_INODE_FORMAT_H */ diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index bebc11444e..1baf78594c 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -34,8 +34,7 @@ int bch2_extent_fallocate(struct btree_trans *trans, struct open_buckets open_buckets = { 0 }; struct bkey_s_c k; struct bkey_buf old, new; - unsigned sectors_allocated = 0; - bool have_reservation = false; + unsigned sectors_allocated = 0, new_replicas; bool unwritten = opts.nocow && c->sb.version >= bcachefs_metadata_version_unwritten_extents; int ret; @@ -50,28 +49,20 @@ int bch2_extent_fallocate(struct btree_trans *trans, return ret; sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset); + new_replicas = max(0, (int) opts.data_replicas - + (int) bch2_bkey_nr_ptrs_fully_allocated(k)); - if (!have_reservation) { - unsigned new_replicas = - max(0, (int) opts.data_replicas - - (int) bch2_bkey_nr_ptrs_fully_allocated(k)); - /* - * Get a disk reservation before (in the nocow case) calling - * into the allocator: - */ - ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0); - if (unlikely(ret)) - goto err; - - bch2_bkey_buf_reassemble(&old, c, k); - } + /* + * Get a disk reservation before (in the nocow case) calling + * into the allocator: + */ + ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0); + if (unlikely(ret)) + goto err_noprint; - if (have_reservation) { - if (!bch2_extents_match(k, bkey_i_to_s_c(old.k))) - goto err; + bch2_bkey_buf_reassemble(&old, c, k); - bch2_key_resize(&new.k->k, sectors); - } else if (!unwritten) { + if (!unwritten) { struct bkey_i_reservation *reservation; bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64)); @@ -83,7 +74,6 @@ int bch2_extent_fallocate(struct btree_trans *trans, struct bkey_i_extent *e; struct bch_devs_list devs_have; struct write_point *wp; - struct bch_extent_ptr *ptr; devs_have.nr = 0; @@ -118,14 +108,17 @@ int bch2_extent_fallocate(struct btree_trans *trans, ptr->unwritten = true; } - have_reservation = true; - ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res, 0, i_sectors_delta, true); err: if (!ret && sectors_allocated) bch2_increment_clock(c, sectors_allocated, WRITE); - + if (should_print_err(ret)) + bch_err_inum_offset_ratelimited(c, + inum.inum, + iter->pos.offset << 9, + "%s(): error: %s", __func__, bch2_err_str(ret)); +err_noprint: bch2_open_buckets_put(c, &open_buckets); bch2_disk_reservation_put(c, &disk_res); bch2_bkey_buf_exit(&new, c); @@ -256,7 +249,7 @@ static int __bch2_resume_logged_op_truncate(struct btree_trans *trans, u64 new_i_size = le64_to_cpu(op->v.new_i_size); int ret; - ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, truncate_set_isize(trans, inum, new_i_size)); if (ret) goto err; @@ -378,7 +371,7 @@ case LOGGED_OP_FINSERT_start: op->v.state = LOGGED_OP_FINSERT_shift_extents; if (insert) { - ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, adjust_i_size(trans, inum, src_offset, len) ?: bch2_logged_op_update(trans, &op->k_i)); if (ret) @@ -390,7 +383,7 @@ case LOGGED_OP_FINSERT_start: if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto err; - ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_logged_op_update(trans, &op->k_i)); } @@ -449,13 +442,11 @@ case LOGGED_OP_FINSERT_shift_extents: op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset); - ret = bch2_bkey_set_needs_rebalance(c, copy, - opts.background_target, - opts.background_compression) ?: + ret = bch2_bkey_set_needs_rebalance(c, copy, &opts) ?: bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?: bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?: bch2_logged_op_update(trans, &op->k_i) ?: - bch2_trans_commit(trans, &disk_res, NULL, BTREE_INSERT_NOFAIL); + bch2_trans_commit(trans, &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc); btree_err: bch2_disk_reservation_put(c, &disk_res); @@ -470,12 +461,12 @@ btree_err: op->v.state = LOGGED_OP_FINSERT_finish; if (!insert) { - ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, adjust_i_size(trans, inum, src_offset, shift) ?: bch2_logged_op_update(trans, &op->k_i)); } else { /* We need an inode update to update bi_journal_seq for fsync: */ - ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, adjust_i_size(trans, inum, 0, 0) ?: bch2_logged_op_update(trans, &op->k_i)); } diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 36763865fa..3c574d8873 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -80,7 +80,7 @@ struct promote_op { struct bpos pos; struct data_update write; - struct bio_vec bi_inline_vecs[0]; /* must be last */ + struct bio_vec bi_inline_vecs[]; /* must be last */ }; static const struct rhashtable_params bch_promote_params = { @@ -172,11 +172,13 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, int ret; if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) - return NULL; + return ERR_PTR(-BCH_ERR_nopromote_no_writes); - op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOFS); - if (!op) + op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_KERNEL); + if (!op) { + ret = -BCH_ERR_nopromote_enomem; goto err; + } op->start_time = local_clock(); op->pos = pos; @@ -187,24 +189,29 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, */ *rbio = kzalloc(sizeof(struct bch_read_bio) + sizeof(struct bio_vec) * pages, - GFP_NOFS); - if (!*rbio) + GFP_KERNEL); + if (!*rbio) { + ret = -BCH_ERR_nopromote_enomem; goto err; + } rbio_init(&(*rbio)->bio, opts); bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0); - if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, - GFP_NOFS)) + if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, GFP_KERNEL)) { + ret = -BCH_ERR_nopromote_enomem; goto err; + } (*rbio)->bounce = true; (*rbio)->split = true; (*rbio)->kmalloc = true; if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, - bch_promote_params)) + bch_promote_params)) { + ret = -BCH_ERR_nopromote_in_flight; goto err; + } bio = &op->write.op.wbio.bio; bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); @@ -223,9 +230,8 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, * -BCH_ERR_ENOSPC_disk_reservation: */ if (ret) { - ret = rhashtable_remove_fast(&c->promote_table, &op->hash, - bch_promote_params); - BUG_ON(ret); + BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash, + bch_promote_params)); goto err; } @@ -239,7 +245,7 @@ err: *rbio = NULL; kfree(op); bch2_write_ref_put(c, BCH_WRITE_REF_promote); - return NULL; + return ERR_PTR(ret); } noinline @@ -274,10 +280,9 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, ? BTREE_ID_reflink : BTREE_ID_extents, k, pos, pick, opts, sectors, rbio); - if (!promote) { - ret = -BCH_ERR_nopromote_enomem; + ret = PTR_ERR_OR_ZERO(promote); + if (ret) goto nopromote; - } *bounce = true; *read_full = promote_full; @@ -526,7 +531,7 @@ out: static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) { - bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_trans_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, __bch2_rbio_narrow_crcs(trans, rbio)); } @@ -637,12 +642,17 @@ csum_err: goto out; } + struct printbuf buf = PRINTBUF; + buf.atomic++; + prt_str(&buf, "data "); + bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum); + bch_err_inum_offset_ratelimited(ca, rbio->read_pos.inode, rbio->read_pos.offset << 9, - "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)", - rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, - csum.hi, csum.lo, bch2_csum_types[crc.csum_type]); + "data %s", buf.buf); + printbuf_exit(&buf); + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); goto out; diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 8c8cb1541a..2c098ac017 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -316,8 +316,8 @@ int bch2_extent_update(struct btree_trans *trans, i_sectors_delta) ?: bch2_trans_update(trans, iter, k, 0) ?: bch2_trans_commit(trans, disk_res, NULL, - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc); if (unlikely(ret)) return ret; @@ -362,9 +362,7 @@ static int bch2_write_index_default(struct bch_write_op *op) bkey_start_pos(&sk.k->k), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - ret = bch2_bkey_set_needs_rebalance(c, sk.k, - op->opts.background_target, - op->opts.background_compression) ?: + ret = bch2_bkey_set_needs_rebalance(c, sk.k, &op->opts) ?: bch2_extent_update(trans, inum, &iter, sk.k, &op->res, op->new_i_size, &op->i_sectors_delta, @@ -396,17 +394,14 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, bool nocow) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); - const struct bch_extent_ptr *ptr; struct bch_write_bio *n; - struct bch_dev *ca; BUG_ON(c->opts.nochanges); bkey_for_each_ptr(ptrs, ptr) { - BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX || - !c->devs[ptr->dev]); + BUG_ON(!bch2_dev_exists2(c, ptr->dev)); - ca = bch_dev_bkey_exists(c, ptr->dev); + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); if (to_entry(ptr + 1) < ptrs.end) { n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, @@ -1109,16 +1104,14 @@ static bool bch2_extent_is_writeable(struct bch_write_op *op, static inline void bch2_nocow_write_unlock(struct bch_write_op *op) { struct bch_fs *c = op->c; - const struct bch_extent_ptr *ptr; - struct bkey_i *k; for_each_keylist_key(&op->insert_keys, k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); bkey_for_each_ptr(ptrs, ptr) bch2_bucket_nocow_unlock(&c->nocow_locks, - PTR_BUCKET_POS(c, ptr), - BUCKET_NOCOW_LOCK_UPDATE); + PTR_BUCKET_POS(c, ptr), + BUCKET_NOCOW_LOCK_UPDATE); } } @@ -1128,25 +1121,20 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, struct bkey_s_c k, u64 new_i_size) { - struct bkey_i *new; - struct bkey_ptrs ptrs; - struct bch_extent_ptr *ptr; - int ret; - if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) { /* trace this */ return 0; } - new = bch2_bkey_make_mut_noupdate(trans, k); - ret = PTR_ERR_OR_ZERO(new); + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); + int ret = PTR_ERR_OR_ZERO(new); if (ret) return ret; bch2_cut_front(bkey_start_pos(&orig->k), new); bch2_cut_back(orig->k.p, new); - ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); bkey_for_each_ptr(ptrs, ptr) ptr->unwritten = 0; @@ -1167,16 +1155,12 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) { struct bch_fs *c = op->c; struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_i *orig; - struct bkey_s_c k; - int ret; for_each_keylist_key(&op->insert_keys, orig) { - ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents, + int ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents, bkey_start_pos(&orig->k), orig->k.p, BTREE_ITER_INTENT, k, - NULL, NULL, BTREE_INSERT_NOFAIL, ({ + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size); })); @@ -1228,10 +1212,7 @@ static void bch2_nocow_write(struct bch_write_op *op) struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; - struct bkey_ptrs_c ptrs; - const struct bch_extent_ptr *ptr; DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets; - struct bucket_to_lock *i; u32 snapshot; struct bucket_to_lock *stale_at; int ret; @@ -1273,7 +1254,7 @@ retry: break; /* Get iorefs before dropping btree locks: */ - ptrs = bch2_bkey_ptrs_c(k); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr(ptrs, ptr) { struct bpos b = PTR_BUCKET_POS(c, ptr); struct nocow_lock_bucket *l = @@ -1464,6 +1445,11 @@ err: op->flags |= BCH_WRITE_DONE; if (ret < 0) { + if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) + bch_err_inum_offset_ratelimited(c, + op->pos.inode, + op->pos.offset << 9, + "%s(): error: %s", __func__, bch2_err_str(ret)); op->error = ret; break; } @@ -1578,6 +1564,7 @@ CLOSURE_CALLBACK(bch2_write) BUG_ON(!op->write_point.v); BUG_ON(bkey_eq(op->pos, POS_MAX)); + op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas); op->start_time = local_clock(); bch2_keylist_init(&op->insert_keys, op->inline_keys); wbio_init(bio)->put_bio = false; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 8cf238be62..bc890776eb 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -10,6 +10,7 @@ #include "bkey_methods.h" #include "btree_gc.h" #include "btree_update.h" +#include "btree_write_buffer.h" #include "buckets.h" #include "error.h" #include "journal.h" @@ -26,6 +27,47 @@ static const char * const bch2_journal_errors[] = { NULL }; +static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq) +{ + union journal_res_state s = READ_ONCE(j->reservations); + unsigned i = seq & JOURNAL_BUF_MASK; + struct journal_buf *buf = j->buf + i; + + prt_printf(out, "seq:"); + prt_tab(out); + prt_printf(out, "%llu", seq); + prt_newline(out); + printbuf_indent_add(out, 2); + + prt_printf(out, "refcount:"); + prt_tab(out); + prt_printf(out, "%u", journal_state_count(s, i)); + prt_newline(out); + + prt_printf(out, "size:"); + prt_tab(out); + prt_human_readable_u64(out, vstruct_bytes(buf->data)); + prt_newline(out); + + prt_printf(out, "expires"); + prt_tab(out); + prt_printf(out, "%li jiffies", buf->expires - jiffies); + prt_newline(out); + + printbuf_indent_sub(out, 2); +} + +static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j) +{ + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 24); + + for (u64 seq = journal_last_unwritten_seq(j); + seq <= journal_cur_seq(j); + seq++) + bch2_journal_buf_to_text(out, j, seq); +} + static inline bool journal_seq_unwritten(struct journal *j, u64 seq) { return seq > j->seq_ondisk; @@ -155,7 +197,7 @@ void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write) * We don't close a journal_buf until the next journal_buf is finished writing, * and can be opened again - this also initializes the next journal_buf: */ -static void __journal_entry_close(struct journal *j, unsigned closed_val) +static void __journal_entry_close(struct journal *j, unsigned closed_val, bool trace) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *buf = journal_cur_buf(j); @@ -184,6 +226,18 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val) /* Close out old buffer: */ buf->data->u64s = cpu_to_le32(old.cur_entry_offset); + if (trace_journal_entry_close_enabled() && trace) { + struct printbuf pbuf = PRINTBUF; + pbuf.atomic++; + + prt_str(&pbuf, "entry size: "); + prt_human_readable_u64(&pbuf, vstruct_bytes(buf->data)); + prt_newline(&pbuf); + bch2_prt_task_backtrace(&pbuf, current, 1, GFP_NOWAIT); + trace_journal_entry_close(c, pbuf.buf); + printbuf_exit(&pbuf); + } + sectors = vstruct_blocks_plus(buf->data, c->block_bits, buf->u64s_reserved) << c->block_bits; BUG_ON(sectors > buf->sectors); @@ -222,7 +276,7 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val) void bch2_journal_halt(struct journal *j) { spin_lock(&j->lock); - __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL); + __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true); if (!j->err_seq) j->err_seq = journal_cur_seq(j); journal_wake(j); @@ -236,7 +290,7 @@ static bool journal_entry_want_write(struct journal *j) /* Don't close it yet if we already have a write in flight: */ if (ret) - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); else if (nr_unwritten_journal_entries(j)) { struct journal_buf *buf = journal_cur_buf(j); @@ -330,6 +384,7 @@ static int journal_entry_open(struct journal *j) buf->must_flush = false; buf->separate_flush = false; buf->flush_time = 0; + buf->need_flush_to_write_buffer = true; memset(buf->data, 0, sizeof(*buf->data)); buf->data->seq = cpu_to_le64(journal_cur_seq(j)); @@ -363,11 +418,6 @@ static int journal_entry_open(struct journal *j) } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); - if (j->res_get_blocked_start) - bch2_time_stats_update(j->blocked_time, - j->res_get_blocked_start); - j->res_get_blocked_start = 0; - mod_delayed_work(c->io_complete_wq, &j->write_work, msecs_to_jiffies(c->opts.journal_flush_delay)); @@ -407,7 +457,7 @@ static void journal_write_work(struct work_struct *work) if (delta > 0) mod_delayed_work(c->io_complete_wq, &j->write_work, delta); else - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); unlock: spin_unlock(&j->lock); } @@ -464,18 +514,23 @@ retry: buf->buf_size < JOURNAL_ENTRY_SIZE_MAX) j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false); ret = journal_entry_open(j); - if (ret == JOURNAL_ERR_max_in_flight) - trace_and_count(c, journal_entry_full, c); -unlock: - if ((ret && ret != JOURNAL_ERR_insufficient_devices) && - !j->res_get_blocked_start) { - j->res_get_blocked_start = local_clock() ?: 1; - trace_and_count(c, journal_full, c); - } + if (ret == JOURNAL_ERR_max_in_flight) { + track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], + &j->max_in_flight_start, true); + if (trace_journal_entry_full_enabled()) { + struct printbuf buf = PRINTBUF; + buf.atomic++; + bch2_journal_bufs_to_text(&buf, j); + trace_journal_entry_full(c, buf.buf); + printbuf_exit(&buf); + } + count_event(c, journal_entry_full); + } +unlock: can_discard = j->can_discard; spin_unlock(&j->lock); @@ -553,7 +608,7 @@ void bch2_journal_entry_res_resize(struct journal *j, /* * Not enough room in current journal entry, have to flush it: */ - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); } else { journal_cur_buf(j)->u64s_reserved += d; } @@ -610,7 +665,7 @@ recheck_need_open: struct journal_res res = { 0 }; if (journal_entry_is_open(j)) - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); spin_unlock(&j->lock); @@ -774,6 +829,48 @@ void bch2_journal_block(struct journal *j) journal_quiesce(j); } +static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq) +{ + struct journal_buf *ret = NULL; + + mutex_lock(&j->buf_lock); + spin_lock(&j->lock); + max_seq = min(max_seq, journal_cur_seq(j)); + + for (u64 seq = journal_last_unwritten_seq(j); + seq <= max_seq; + seq++) { + unsigned idx = seq & JOURNAL_BUF_MASK; + struct journal_buf *buf = j->buf + idx; + + if (buf->need_flush_to_write_buffer) { + if (seq == journal_cur_seq(j)) + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); + + union journal_res_state s; + s.v = atomic64_read_acquire(&j->reservations.counter); + + ret = journal_state_count(s, idx) + ? ERR_PTR(-EAGAIN) + : buf; + break; + } + } + + spin_unlock(&j->lock); + if (IS_ERR_OR_NULL(ret)) + mutex_unlock(&j->buf_lock); + return ret; +} + +struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq) +{ + struct journal_buf *ret; + + wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, max_seq)) != ERR_PTR(-EAGAIN)); + return ret; +} + /* allocate journal on a device: */ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, @@ -955,8 +1052,7 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, break; } - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); unlock: up_write(&c->state_lock); return ret; @@ -986,17 +1082,13 @@ int bch2_dev_journal_alloc(struct bch_dev *ca) ret = __bch2_set_nr_journal_buckets(ca, nr, true, NULL); err: - if (ret) - bch_err_fn(ca, ret); + bch_err_fn(ca, ret); return ret; } int bch2_fs_journal_alloc(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i; - - for_each_online_member(ca, c, i) { + for_each_online_member(c, ca) { if (ca->journal.nr) continue; @@ -1225,6 +1317,7 @@ int bch2_fs_journal_init(struct journal *j) static struct lock_class_key res_key; unsigned i; + mutex_init(&j->buf_lock); spin_lock_init(&j->lock); spin_lock_init(&j->err_lock); init_waitqueue_head(&j->wait); @@ -1260,10 +1353,8 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); union journal_res_state s; - struct bch_dev *ca; unsigned long now = jiffies; - u64 seq; - unsigned i; + u64 nr_writes = j->nr_flush_writes + j->nr_noflush_writes; if (!out->nr_tabstops) printbuf_tabstop_push(out, 24); @@ -1275,20 +1366,23 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size); prt_printf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j)); prt_printf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk); - prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); + prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); - prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); - prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]); - prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); + prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); + prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]); + prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); - prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); - prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim); + prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); + prt_printf(out, "average write size:\t"); + prt_human_readable_u64(out, nr_writes ? div64_u64(j->entry_bytes_written, nr_writes) : 0); + prt_newline(out); + prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim); prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim); prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked); - prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) + prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) ? jiffies_to_msecs(j->next_reclaim - jiffies) : 0); - prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); - prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); + prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); + prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); prt_printf(out, "current entry:\t\t"); switch (s.cur_entry_offset) { @@ -1304,35 +1398,9 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) } prt_newline(out); - - for (seq = journal_cur_seq(j); - seq >= journal_last_unwritten_seq(j); - --seq) { - i = seq & JOURNAL_BUF_MASK; - - prt_printf(out, "unwritten entry:"); - prt_tab(out); - prt_printf(out, "%llu", seq); - prt_newline(out); - printbuf_indent_add(out, 2); - - prt_printf(out, "refcount:"); - prt_tab(out); - prt_printf(out, "%u", journal_state_count(s, i)); - prt_newline(out); - - prt_printf(out, "sectors:"); - prt_tab(out); - prt_printf(out, "%u", j->buf[i].sectors); - prt_newline(out); - - prt_printf(out, "expires"); - prt_tab(out); - prt_printf(out, "%li jiffies", j->buf[i].expires - jiffies); - prt_newline(out); - - printbuf_indent_sub(out, 2); - } + prt_printf(out, "unwritten entries:"); + prt_newline(out); + bch2_journal_bufs_to_text(out, j); prt_printf(out, "replay done:\t\t%i\n", @@ -1352,8 +1420,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) j->space[journal_space_total].next_entry, j->space[journal_space_total].total); - for_each_member_device_rcu(ca, c, i, - &c->rw_devs[BCH_DATA_journal]) { + for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { struct journal_device *ja = &ca->journal; if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d)) @@ -1362,7 +1429,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) if (!ja->nr) continue; - prt_printf(out, "dev %u:\n", i); + prt_printf(out, "dev %u:\n", ca->dev_idx); prt_printf(out, "\tnr\t\t%u\n", ja->nr); prt_printf(out, "\tbucket size\t%u\n", ca->mi.bucket_size); prt_printf(out, "\tavailable\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 2f768e11ae..4544ce24bb 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -119,7 +119,6 @@ static inline void journal_wake(struct journal *j) { wake_up(&j->wait); closure_wake_up(&j->async_wait); - closure_wake_up(&j->preres_wait); } static inline struct journal_buf *journal_cur_buf(struct journal *j) @@ -239,8 +238,6 @@ bch2_journal_add_entry(struct journal *j, struct journal_res *res, static inline bool journal_entry_empty(struct jset *j) { - struct jset_entry *i; - if (j->seq != j->last_seq) return false; @@ -426,6 +423,7 @@ static inline void bch2_journal_set_replay_done(struct journal *j) void bch2_journal_unblock(struct journal *); void bch2_journal_block(struct journal *); +struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq); void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); void bch2_journal_debug_to_text(struct printbuf *, struct journal *); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 6ab756a485..47805193f1 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -4,6 +4,7 @@ #include "alloc_foreground.h" #include "btree_io.h" #include "btree_update_interior.h" +#include "btree_write_buffer.h" #include "buckets.h" #include "checksum.h" #include "disk_groups.h" @@ -26,11 +27,15 @@ static struct nonce journal_nonce(const struct jset *jset) }}; } -static bool jset_csum_good(struct bch_fs *c, struct jset *j) +static bool jset_csum_good(struct bch_fs *c, struct jset *j, struct bch_csum *csum) { - return bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)) && - !bch2_crc_cmp(j->csum, - csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j)); + if (!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j))) { + *csum = (struct bch_csum) {}; + return false; + } + + *csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j); + return !bch2_crc_cmp(j->csum, *csum); } static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq) @@ -678,17 +683,12 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs prt_printf(out, "dev=%u", le32_to_cpu(u->dev)); for (i = 0; i < nr_types; i++) { - if (i < BCH_DATA_NR) - prt_printf(out, " %s", bch2_data_types[i]); - else - prt_printf(out, " (unknown data type %u)", i); + bch2_prt_data_type(out, i); prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu", le64_to_cpu(u->d[i].buckets), le64_to_cpu(u->d[i].sectors), le64_to_cpu(u->d[i].fragmented)); } - - prt_printf(out, " buckets_ec: %llu", le64_to_cpu(u->buckets_ec)); } static int journal_entry_log_validate(struct bch_fs *c, @@ -725,6 +725,22 @@ static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs journal_entry_btree_keys_to_text(out, c, entry); } +static int journal_entry_write_buffer_keys_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) +{ + return journal_entry_btree_keys_validate(c, jset, entry, + version, big_endian, READ); +} + +static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + journal_entry_btree_keys_to_text(out, c, entry); +} + struct jset_entry_ops { int (*validate)(struct bch_fs *, struct jset *, struct jset_entry *, unsigned, int, @@ -768,7 +784,6 @@ void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, static int jset_validate_entries(struct bch_fs *c, struct jset *jset, enum bkey_invalid_flags flags) { - struct jset_entry *entry; unsigned version = le32_to_cpu(jset->version); int ret = 0; @@ -920,6 +935,7 @@ static int journal_read_bucket(struct bch_dev *ca, u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), end = offset + ca->mi.bucket_size; bool saw_bad = false, csum_good; + struct printbuf err = PRINTBUF; int ret = 0; pr_debug("reading %u", bucket); @@ -952,7 +968,7 @@ reread: * found on a different device, and missing or * no journal entries will be handled later */ - return 0; + goto out; } j = buf->data; @@ -969,12 +985,12 @@ reread: ret = journal_read_buf_realloc(buf, vstruct_bytes(j)); if (ret) - return ret; + goto err; } goto reread; case JOURNAL_ENTRY_NONE: if (!saw_bad) - return 0; + goto out; /* * On checksum error we don't really trust the size * field of the journal entry we read, so try reading @@ -983,7 +999,7 @@ reread: sectors = block_sectors(c); goto next_block; default: - return ret; + goto err; } /* @@ -993,20 +1009,28 @@ reread: * bucket: */ if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) - return 0; + goto out; ja->bucket_seq[bucket] = le64_to_cpu(j->seq); - csum_good = jset_csum_good(c, j); + enum bch_csum_type csum_type = JSET_CSUM_TYPE(j); + struct bch_csum csum; + csum_good = jset_csum_good(c, j, &csum); + if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum, - "journal checksum error")) + "%s", + (printbuf_reset(&err), + prt_str(&err, "journal "), + bch2_csum_err_msg(&err, csum_type, j->csum, csum), + err.buf))) saw_bad = true; ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), j->encrypted_start, vstruct_end(j) - (void *) j->encrypted_start); bch2_fs_fatal_err_on(ret, c, - "error decrypting journal entry: %i", ret); + "error decrypting journal entry: %s", + bch2_err_str(ret)); mutex_lock(&jlist->lock); ret = journal_entry_add(c, ca, (struct journal_ptr) { @@ -1025,7 +1049,7 @@ reread: case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: break; default: - return ret; + goto err; } next_block: pr_debug("next"); @@ -1034,7 +1058,11 @@ next_block: j = ((void *) j) + (sectors << 9); } - return 0; +out: + ret = 0; +err: + printbuf_exit(&err); + return ret; } static CLOSURE_CALLBACK(bch2_journal_read_device) @@ -1156,8 +1184,6 @@ int bch2_journal_read(struct bch_fs *c, struct journal_list jlist; struct journal_replay *i, **_i, *prev = NULL; struct genradix_iter radix_iter; - struct bch_dev *ca; - unsigned iter; struct printbuf buf = PRINTBUF; bool degraded = false, last_write_torn = false; u64 seq; @@ -1168,7 +1194,7 @@ int bch2_journal_read(struct bch_fs *c, jlist.last_seq = 0; jlist.ret = 0; - for_each_member_device(ca, c, iter) { + for_each_member_device(c, ca) { if (!c->opts.fsck && !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) continue; @@ -1334,7 +1360,7 @@ int bch2_journal_read(struct bch_fs *c, continue; for (ptr = 0; ptr < i->nr_ptrs; ptr++) { - ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev); + struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev); if (!i->ptrs[ptr].csum_good) bch_err_dev_offset(ca, i->ptrs[ptr].sector, @@ -1452,6 +1478,8 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) c->opts.foreground_target; unsigned i, replicas = 0, replicas_want = READ_ONCE(c->opts.metadata_replicas); + unsigned replicas_need = min_t(unsigned, replicas_want, + READ_ONCE(c->opts.metadata_replicas_required)); rcu_read_lock(); retry: @@ -1500,11 +1528,13 @@ done: BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); - return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS; + return replicas >= replicas_need ? 0 : -EROFS; } static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) { + struct bch_fs *c = container_of(j, struct bch_fs, journal); + /* we aren't holding j->lock: */ unsigned new_size = READ_ONCE(j->buf_size_want); void *new_buf; @@ -1512,6 +1542,11 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) if (buf->buf_size >= new_size) return; + size_t btree_write_buffer_size = new_size / 64; + + if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size)) + return; + new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN); if (!new_buf) return; @@ -1604,6 +1639,9 @@ static CLOSURE_CALLBACK(journal_write_done) bch2_journal_reclaim_fast(j); bch2_journal_space_available(j); + track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], + &j->max_in_flight_start, false); + closure_wake_up(&w->wait); journal_wake(j); @@ -1656,7 +1694,6 @@ static CLOSURE_CALLBACK(do_journal_write) struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; struct journal_buf *w = journal_last_unwritten_buf(j); - struct bch_extent_ptr *ptr; struct bio *bio; unsigned sectors = vstruct_sectors(w->data, c->block_bits); @@ -1700,11 +1737,13 @@ static CLOSURE_CALLBACK(do_journal_write) static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct jset_entry *start, *end, *i, *next, *prev = NULL; + struct jset_entry *start, *end; struct jset *jset = w->data; + struct journal_keys_to_wb wb = { NULL }; unsigned sectors, bytes, u64s; - bool validate_before_checksum = false; unsigned long btree_roots_have = 0; + bool validate_before_checksum = false; + u64 seq = le64_to_cpu(jset->seq); int ret; /* @@ -1715,7 +1754,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) * If we wanted to be really fancy here, we could sort all the keys in * the jset and drop keys that were overwritten - probably not worth it: */ - vstruct_for_each_safe(jset, i, next) { + vstruct_for_each(jset, i) { unsigned u64s = le16_to_cpu(i->u64s); /* Empty entry: */ @@ -1732,40 +1771,40 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) * to c->btree_roots we have to get any missing btree roots and * add them to this journal entry: */ - if (i->type == BCH_JSET_ENTRY_btree_root) { + switch (i->type) { + case BCH_JSET_ENTRY_btree_root: bch2_journal_entry_to_btree_root(c, i); __set_bit(i->btree_id, &btree_roots_have); + break; + case BCH_JSET_ENTRY_write_buffer_keys: + EBUG_ON(!w->need_flush_to_write_buffer); + + if (!wb.wb) + bch2_journal_keys_to_write_buffer_start(c, &wb, seq); + + struct bkey_i *k; + jset_entry_for_each_key(i, k) { + ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k); + if (ret) { + bch2_fs_fatal_error(c, "-ENOMEM flushing journal keys to btree write buffer"); + bch2_journal_keys_to_write_buffer_end(c, &wb); + return ret; + } + } + i->type = BCH_JSET_ENTRY_btree_keys; + break; } - - /* Can we merge with previous entry? */ - if (prev && - i->btree_id == prev->btree_id && - i->level == prev->level && - i->type == prev->type && - i->type == BCH_JSET_ENTRY_btree_keys && - le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { - memmove_u64s_down(vstruct_next(prev), - i->_data, - u64s); - le16_add_cpu(&prev->u64s, u64s); - continue; - } - - /* Couldn't merge, move i into new position (after prev): */ - prev = prev ? vstruct_next(prev) : jset->start; - if (i != prev) - memmove_u64s_down(prev, i, jset_u64s(u64s)); } - prev = prev ? vstruct_next(prev) : jset->start; - jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); + if (wb.wb) + bch2_journal_keys_to_write_buffer_end(c, &wb); + w->need_flush_to_write_buffer = false; start = end = vstruct_last(jset); end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); - bch2_journal_super_entries_add_common(c, &end, - le64_to_cpu(jset->seq)); + bch2_journal_super_entries_add_common(c, &end, seq); u64s = (u64 *) end - (u64 *) start; BUG_ON(u64s > j->entry_u64s_reserved); @@ -1788,7 +1827,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) - j->last_empty_seq = le64_to_cpu(jset->seq); + j->last_empty_seq = seq; if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) validate_before_checksum = true; @@ -1847,7 +1886,7 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf * (!w->must_flush && (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) { - w->noflush = true; + w->noflush = true; SET_JSET_NO_FLUSH(w->data, true); w->data->last_seq = 0; w->last_seq = 0; @@ -1866,12 +1905,11 @@ CLOSURE_CALLBACK(bch2_journal_write) { closure_type(j, struct journal, io); struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_dev *ca; struct journal_buf *w = journal_last_unwritten_buf(j); struct bch_replicas_padded replicas; struct bio *bio; struct printbuf journal_debug_buf = PRINTBUF; - unsigned i, nr_rw_members = 0; + unsigned nr_rw_members = 0; int ret; BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); @@ -1884,12 +1922,16 @@ CLOSURE_CALLBACK(bch2_journal_write) if (ret) goto err; + mutex_lock(&j->buf_lock); journal_buf_realloc(j, w); ret = bch2_journal_write_prep(j, w); + mutex_unlock(&j->buf_lock); if (ret) goto err; + j->entry_bytes_written += vstruct_bytes(w->data); + while (1) { spin_lock(&j->lock); ret = journal_write_alloc(j, w); @@ -1927,7 +1969,7 @@ CLOSURE_CALLBACK(bch2_journal_write) if (c->opts.nochanges) goto no_io; - for_each_rw_member(ca, c, i) + for_each_rw_member(c, ca) nr_rw_members++; if (nr_rw_members > 1) @@ -1944,7 +1986,7 @@ CLOSURE_CALLBACK(bch2_journal_write) goto err; if (!JSET_NO_FLUSH(w->data) && w->separate_flush) { - for_each_rw_member(ca, c, i) { + for_each_rw_member(c, ca) { percpu_ref_get(&ca->io_ref); bio = ca->journal.bio; diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index ec712104ad..c33dca6415 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "btree_key_cache.h" #include "btree_update.h" +#include "btree_write_buffer.h" #include "buckets.h" #include "errcode.h" #include "error.h" @@ -50,17 +51,24 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j, return available; } -static inline void journal_set_watermark(struct journal *j, bool low_on_space) +void bch2_journal_set_watermark(struct journal *j) { - unsigned watermark = BCH_WATERMARK_stripe; - - if (low_on_space) - watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); - if (fifo_free(&j->pin) < j->pin.size / 4) - watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); - - if (watermark == j->watermark) - return; + struct bch_fs *c = container_of(j, struct bch_fs, journal); + bool low_on_space = j->space[journal_space_clean].total * 4 <= + j->space[journal_space_total].total; + bool low_on_pin = fifo_free(&j->pin) < j->pin.size / 4; + bool low_on_wb = bch2_btree_write_buffer_must_wait(c); + unsigned watermark = low_on_space || low_on_pin || low_on_wb + ? BCH_WATERMARK_reclaim + : BCH_WATERMARK_stripe; + + if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], + &j->low_on_space_start, low_on_space) || + track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], + &j->low_on_pin_start, low_on_pin) || + track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], + &j->write_buffer_full_start, low_on_wb)) + trace_and_count(c, journal_full, c); swap(watermark, j->watermark); if (watermark > j->watermark) @@ -128,15 +136,13 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne enum journal_space_from from) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_dev *ca; - unsigned i, pos, nr_devs = 0; + unsigned pos, nr_devs = 0; struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX]; BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space)); rcu_read_lock(); - for_each_member_device_rcu(ca, c, i, - &c->rw_devs[BCH_DATA_journal]) { + for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { if (!ca->journal.nr) continue; @@ -165,19 +171,17 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne void bch2_journal_space_available(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_dev *ca; unsigned clean, clean_ondisk, total; unsigned max_entry_size = min(j->buf[0].buf_size >> 9, j->buf[1].buf_size >> 9); - unsigned i, nr_online = 0, nr_devs_want; + unsigned nr_online = 0, nr_devs_want; bool can_discard = false; int ret = 0; lockdep_assert_held(&j->lock); rcu_read_lock(); - for_each_member_device_rcu(ca, c, i, - &c->rw_devs[BCH_DATA_journal]) { + for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { struct journal_device *ja = &ca->journal; if (!ja->nr) @@ -201,14 +205,14 @@ void bch2_journal_space_available(struct journal *j) j->can_discard = can_discard; - if (nr_online < c->opts.metadata_replicas_required) { + if (nr_online < metadata_replicas_required(c)) { ret = JOURNAL_ERR_insufficient_devices; goto out; } nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas); - for (i = 0; i < journal_space_nr; i++) + for (unsigned i = 0; i < journal_space_nr; i++) j->space[i] = __journal_space_available(j, nr_devs_want, i); clean_ondisk = j->space[journal_space_clean_ondisk].total; @@ -226,7 +230,7 @@ void bch2_journal_space_available(struct journal *j) else clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); - journal_set_watermark(j, clean * 4 <= total); + bch2_journal_set_watermark(j); out: j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0; j->cur_entry_error = ret; @@ -255,12 +259,10 @@ static bool should_discard_bucket(struct journal *j, struct journal_device *ja) void bch2_journal_do_discards(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_dev *ca; - unsigned iter; mutex_lock(&j->discard_lock); - for_each_rw_member(ca, c, iter) { + for_each_rw_member(c, ca) { struct journal_device *ja = &ca->journal; while (should_discard_bucket(j, ja)) { @@ -299,6 +301,7 @@ void bch2_journal_reclaim_fast(struct journal *j) * all btree nodes got written out */ while (!fifo_empty(&j->pin) && + j->pin.front <= j->seq_ondisk && !atomic_read(&fifo_peek_front(&j->pin).count)) { j->pin.front++; popped = true; @@ -367,15 +370,36 @@ static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn) return JOURNAL_PIN_other; } -void bch2_journal_pin_set(struct journal *j, u64 seq, +static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq, struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) + journal_pin_flush_fn flush_fn, + enum journal_pin_type type) +{ + struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); + + /* + * flush_fn is how we identify journal pins in debugfs, so must always + * exist, even if it doesn't do anything: + */ + BUG_ON(!flush_fn); + + atomic_inc(&pin_list->count); + pin->seq = seq; + pin->flush = flush_fn; + list_add(&pin->list, &pin_list->list[type]); +} + +void bch2_journal_pin_copy(struct journal *j, + struct journal_entry_pin *dst, + struct journal_entry_pin *src, + journal_pin_flush_fn flush_fn) { - struct journal_entry_pin_list *pin_list; bool reclaim; spin_lock(&j->lock); + u64 seq = READ_ONCE(src->seq); + if (seq < journal_last_seq(j)) { /* * bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on @@ -387,18 +411,34 @@ void bch2_journal_pin_set(struct journal *j, u64 seq, return; } - pin_list = journal_seq_pin(j, seq); + reclaim = __journal_pin_drop(j, dst); - reclaim = __journal_pin_drop(j, pin); + bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn)); - atomic_inc(&pin_list->count); - pin->seq = seq; - pin->flush = flush_fn; + if (reclaim) + bch2_journal_reclaim_fast(j); + spin_unlock(&j->lock); - if (flush_fn) - list_add(&pin->list, &pin_list->list[journal_pin_type(flush_fn)]); - else - list_add(&pin->list, &pin_list->flushed); + /* + * If the journal is currently full, we might want to call flush_fn + * immediately: + */ + journal_wake(j); +} + +void bch2_journal_pin_set(struct journal *j, u64 seq, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) +{ + bool reclaim; + + spin_lock(&j->lock); + + BUG_ON(seq < journal_last_seq(j)); + + reclaim = __journal_pin_drop(j, pin); + + bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn)); if (reclaim) bch2_journal_reclaim_fast(j); @@ -537,13 +577,11 @@ static size_t journal_flush_pins(struct journal *j, static u64 journal_seq_to_flush(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_dev *ca; u64 seq_to_flush = 0; - unsigned iter; spin_lock(&j->lock); - for_each_rw_member(ca, c, iter) { + for_each_rw_member(c, ca) { struct journal_device *ja = &ca->journal; unsigned nr_buckets, bucket_to_flush; @@ -747,10 +785,9 @@ int bch2_journal_reclaim_start(struct journal *j) p = kthread_create(bch2_journal_reclaim_thread, j, "bch-reclaim/%s", c->name); ret = PTR_ERR_OR_ZERO(p); - if (ret) { - bch_err_msg(c, ret, "creating journal reclaim thread"); + bch_err_msg(c, ret, "creating journal reclaim thread"); + if (ret) return ret; - } get_task_struct(p); j->reclaim_thread = p; @@ -796,6 +833,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) { + /* time_stats this */ bool did_work = false; if (!test_bit(JOURNAL_STARTED, &j->flags)) @@ -854,9 +892,11 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) journal_seq_pin(j, seq)->devs); seq++; - spin_unlock(&j->lock); - ret = bch2_mark_replicas(c, &replicas.e); - spin_lock(&j->lock); + if (replicas.e.nr_devs) { + spin_unlock(&j->lock); + ret = bch2_mark_replicas(c, &replicas.e); + spin_lock(&j->lock); + } } spin_unlock(&j->lock); err: diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h index 494d1a6edd..ec84c33452 100644 --- a/fs/bcachefs/journal_reclaim.h +++ b/fs/bcachefs/journal_reclaim.h @@ -16,6 +16,7 @@ static inline void journal_reclaim_kick(struct journal *j) unsigned bch2_journal_dev_buckets_available(struct journal *, struct journal_device *, enum journal_space_from); +void bch2_journal_set_watermark(struct journal *); void bch2_journal_space_available(struct journal *); static inline bool journal_pin_active(struct journal_entry_pin *pin) @@ -47,17 +48,10 @@ static inline void bch2_journal_pin_add(struct journal *j, u64 seq, bch2_journal_pin_set(j, seq, pin, flush_fn); } -static inline void bch2_journal_pin_copy(struct journal *j, - struct journal_entry_pin *dst, - struct journal_entry_pin *src, - journal_pin_flush_fn flush_fn) -{ - /* Guard against racing with journal_pin_drop(src): */ - u64 seq = READ_ONCE(src->seq); - - if (seq) - bch2_journal_pin_add(j, seq, dst, flush_fn); -} +void bch2_journal_pin_copy(struct journal *, + struct journal_entry_pin *, + struct journal_entry_pin *, + journal_pin_flush_fn); static inline void bch2_journal_pin_update(struct journal *j, u64 seq, struct journal_entry_pin *pin, diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index f9d9aa95bf..0200e299cf 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -267,7 +267,7 @@ retry: while (!(ret = PTR_ERR_OR_ZERO(b)) && b && - !test_bit(BCH_FS_STOPPING, &c->flags)) + !test_bit(BCH_FS_stopping, &c->flags)) b = bch2_btree_iter_next_node(&iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index a756b69582..38817c7a08 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -36,6 +36,7 @@ struct journal_buf { bool noflush; /* write has already been kicked off, and was noflush */ bool must_flush; /* something wants a flush */ bool separate_flush; + bool need_flush_to_write_buffer; }; /* @@ -182,6 +183,12 @@ struct journal { darray_u64 early_journal_entries; /* + * Protects journal_buf->data, when accessing without a jorunal + * reservation: for synchronization between the btree write buffer code + * and the journal write path: + */ + struct mutex buf_lock; + /* * Two journal entries -- one is currently open for new entries, the * other is possibly being written out. */ @@ -195,7 +202,6 @@ struct journal { /* Used when waiting because the journal was full */ wait_queue_head_t wait; struct closure_waitlist async_wait; - struct closure_waitlist preres_wait; struct closure io; struct delayed_work write_work; @@ -262,15 +268,19 @@ struct journal { unsigned long last_flush_write; - u64 res_get_blocked_start; u64 write_start_time; u64 nr_flush_writes; u64 nr_noflush_writes; + u64 entry_bytes_written; + + u64 low_on_space_start; + u64 low_on_pin_start; + u64 max_in_flight_start; + u64 write_buffer_full_start; struct bch2_time_stats *flush_write_time; struct bch2_time_stats *noflush_write_time; - struct bch2_time_stats *blocked_time; struct bch2_time_stats *flush_seq_time; #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c index 5699cd4873..1b828bddd1 100644 --- a/fs/bcachefs/keylist.c +++ b/fs/bcachefs/keylist.c @@ -43,8 +43,6 @@ void bch2_keylist_pop_front(struct keylist *l) #ifdef CONFIG_BCACHEFS_DEBUG void bch2_verify_keylist_sorted(struct keylist *l) { - struct bkey_i *k; - for_each_keylist_key(l, k) BUG_ON(bkey_next(k) != l->top && bpos_ge(k->k.p, bkey_next(k)->k.p)); diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h index fe759c7031..e687e0e9ae 100644 --- a/fs/bcachefs/keylist.h +++ b/fs/bcachefs/keylist.h @@ -50,18 +50,16 @@ static inline struct bkey_i *bch2_keylist_front(struct keylist *l) } #define for_each_keylist_key(_keylist, _k) \ - for (_k = (_keylist)->keys; \ + for (struct bkey_i *_k = (_keylist)->keys; \ _k != (_keylist)->top; \ _k = bkey_next(_k)) static inline u64 keylist_sectors(struct keylist *keys) { - struct bkey_i *k; u64 ret = 0; for_each_keylist_key(keys, k) ret += k->k.size; - return ret; } diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c index 8640f7dee0..ad598105c5 100644 --- a/fs/bcachefs/logged_ops.c +++ b/fs/bcachefs/logged_ops.c @@ -54,16 +54,12 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter, int bch2_resume_logged_ops(struct bch_fs *c) { - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - ret = bch2_trans_run(c, - for_each_btree_key2(trans, iter, - BTREE_ID_logged_ops, POS_MIN, BTREE_ITER_PREFETCH, k, + int ret = bch2_trans_run(c, + for_each_btree_key(trans, iter, + BTREE_ID_logged_ops, POS_MIN, + BTREE_ITER_PREFETCH, k, resume_logged_op(trans, &iter, k))); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -85,13 +81,13 @@ static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k) int bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k) { - return commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + return commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, __bch2_logged_op_start(trans, k)); } void bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k) { - int ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_btree_delete(trans, BTREE_ID_logged_ops, k->k.p, 0)); /* * This needs to be a fatal error because we've left an unfinished diff --git a/fs/bcachefs/logged_ops_format.h b/fs/bcachefs/logged_ops_format.h new file mode 100644 index 0000000000..6a4bf7129d --- /dev/null +++ b/fs/bcachefs/logged_ops_format.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_LOGGED_OPS_FORMAT_H +#define _BCACHEFS_LOGGED_OPS_FORMAT_H + +struct bch_logged_op_truncate { + struct bch_val v; + __le32 subvol; + __le32 pad; + __le64 inum; + __le64 new_i_size; +}; + +enum logged_op_finsert_state { + LOGGED_OP_FINSERT_start, + LOGGED_OP_FINSERT_shift_extents, + LOGGED_OP_FINSERT_finish, +}; + +struct bch_logged_op_finsert { + struct bch_val v; + __u8 state; + __u8 pad[3]; + __le32 subvol; + __le64 inum; + __le64 dst_offset; + __le64 src_offset; + __le64 pos; +}; + +#endif /* _BCACHEFS_LOGGED_OPS_FORMAT_H */ diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index a5cc0ed195..7a4ca5a28b 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -147,18 +147,13 @@ fsck_err: int bch2_check_lrus(struct bch_fs *c) { - struct btree_iter iter; - struct bkey_s_c k; struct bpos last_flushed_pos = POS_MIN; - int ret = 0; - - ret = bch2_trans_run(c, + int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw, bch2_check_lru_key(trans, &iter, k, &last_flushed_pos))); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/mean_and_variance.c b/fs/bcachefs/mean_and_variance.c index 1f0801e2e5..bf0ef668fd 100644 --- a/fs/bcachefs/mean_and_variance.c +++ b/fs/bcachefs/mean_and_variance.c @@ -62,6 +62,7 @@ EXPORT_SYMBOL_GPL(u128_div); /** * mean_and_variance_get_mean() - get mean from @s + * @s: mean and variance number of samples and their sums */ s64 mean_and_variance_get_mean(struct mean_and_variance s) { @@ -71,6 +72,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_mean); /** * mean_and_variance_get_variance() - get variance from @s1 + * @s1: mean and variance number of samples and sums * * see linked pdf equation 12. */ @@ -89,6 +91,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_variance); /** * mean_and_variance_get_stddev() - get standard deviation from @s + * @s: mean and variance number of samples and their sums */ u32 mean_and_variance_get_stddev(struct mean_and_variance s) { @@ -98,8 +101,8 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev); /** * mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update() - * @s1: .. - * @s2: .. + * @s: mean and variance number of samples and their sums + * @x: new value to include in the &mean_and_variance_weighted * * see linked pdf: function derived from equations 140-143 where alpha = 2^w. * values are stored bitshifted for performance and added precision. @@ -129,6 +132,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update); /** * mean_and_variance_weighted_get_mean() - get mean from @s + * @s: mean and variance number of samples and their sums */ s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s) { @@ -138,6 +142,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean); /** * mean_and_variance_weighted_get_variance() -- get variance from @s + * @s: mean and variance number of samples and their sums */ u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s) { @@ -148,6 +153,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance); /** * mean_and_variance_weighted_get_stddev() - get standard deviation from @s + * @s: mean and variance number of samples and their sums */ u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s) { diff --git a/fs/bcachefs/mean_and_variance.h b/fs/bcachefs/mean_and_variance.h index 056e797383..64df11ab42 100644 --- a/fs/bcachefs/mean_and_variance.h +++ b/fs/bcachefs/mean_and_variance.h @@ -12,6 +12,9 @@ /* * u128_u: u128 user mode, because not all architectures support a real int128 * type + * + * We don't use this version in userspace, because in userspace we link with + * Rust and rustc has issues with u128. */ #if defined(__SIZEOF_INT128__) && defined(__KERNEL__) && !defined(CONFIG_PARISC) diff --git a/fs/bcachefs/mean_and_variance_test.c b/fs/bcachefs/mean_and_variance_test.c index 019583c3ca..51093fa848 100644 --- a/fs/bcachefs/mean_and_variance_test.c +++ b/fs/bcachefs/mean_and_variance_test.c @@ -130,20 +130,8 @@ static void mean_and_variance_test_1(struct kunit *test) d, mean, stddev, weighted_mean, weighted_stddev); } -static void mean_and_variance_test_2(struct kunit *test) -{ - s64 d[] = { 100, 10, 10, 10, 10, 10, 10 }; - s64 mean[] = { 10, 10, 10, 10, 10, 10, 10 }; - s64 stddev[] = { 9, 9, 9, 9, 9, 9, 9 }; - s64 weighted_mean[] = { 32, 27, 22, 19, 17, 15, 14 }; - s64 weighted_stddev[] = { 38, 35, 31, 27, 24, 21, 18 }; - - do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2, - d, mean, stddev, weighted_mean, weighted_stddev); -} - /* Test behaviour where we switch from one steady state to another: */ -static void mean_and_variance_test_3(struct kunit *test) +static void mean_and_variance_test_2(struct kunit *test) { s64 d[] = { 100, 100, 100, 100, 100 }; s64 mean[] = { 22, 32, 40, 46, 50 }; @@ -155,18 +143,6 @@ static void mean_and_variance_test_3(struct kunit *test) d, mean, stddev, weighted_mean, weighted_stddev); } -static void mean_and_variance_test_4(struct kunit *test) -{ - s64 d[] = { 100, 100, 100, 100, 100 }; - s64 mean[] = { 10, 11, 12, 13, 14 }; - s64 stddev[] = { 9, 13, 15, 17, 19 }; - s64 weighted_mean[] = { 32, 49, 61, 71, 78 }; - s64 weighted_stddev[] = { 38, 44, 44, 41, 38 }; - - do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2, - d, mean, stddev, weighted_mean, weighted_stddev); -} - static void mean_and_variance_fast_divpow2(struct kunit *test) { s64 i; @@ -224,8 +200,6 @@ static struct kunit_case mean_and_variance_test_cases[] = { KUNIT_CASE(mean_and_variance_weighted_advanced_test), KUNIT_CASE(mean_and_variance_test_1), KUNIT_CASE(mean_and_variance_test_2), - KUNIT_CASE(mean_and_variance_test_3), - KUNIT_CASE(mean_and_variance_test_4), {} }; diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index e3a51f6d6c..5623cee3ef 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -79,8 +79,6 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) { struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; enum btree_id id; int ret = 0; @@ -90,7 +88,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) ret = for_each_btree_key_commit(trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - NULL, NULL, BTREE_INSERT_NOFAIL, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags)); if (ret) break; @@ -145,10 +143,9 @@ retry: continue; } - if (ret) { - bch_err_msg(c, ret, "updating btree node key"); + bch_err_msg(c, ret, "updating btree node key"); + if (ret) break; - } next: bch2_btree_iter_next_node(&iter); } diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index f3dac4511a..bf68ea4944 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -6,9 +6,11 @@ #include "backpointers.h" #include "bkey_buf.h" #include "btree_gc.h" +#include "btree_io.h" #include "btree_update.h" #include "btree_update_interior.h" #include "btree_write_buffer.h" +#include "compress.h" #include "disk_groups.h" #include "ec.h" #include "errcode.h" @@ -27,12 +29,53 @@ #include <linux/ioprio.h> #include <linux/kthread.h> -static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k) +const char * const bch2_data_ops_strs[] = { +#define x(t, n, ...) [n] = #t, + BCH_DATA_OPS() +#undef x + NULL +}; + +static void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + printbuf_tabstop_push(out, 20); + prt_str(out, "rewrite ptrs:"); + prt_tab(out); + bch2_prt_u64_base2(out, data_opts->rewrite_ptrs); + prt_newline(out); + + prt_str(out, "kill ptrs: "); + prt_tab(out); + bch2_prt_u64_base2(out, data_opts->kill_ptrs); + prt_newline(out); + + prt_str(out, "target: "); + prt_tab(out); + bch2_target_to_text(out, c, data_opts->target); + prt_newline(out); + + prt_str(out, "compression: "); + prt_tab(out); + bch2_compression_opt_to_text(out, background_compression(*io_opts)); + prt_newline(out); + + prt_str(out, "extra replicas: "); + prt_tab(out); + prt_u64(out, data_opts->extra_replicas); +} + +static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) { if (trace_move_extent_enabled()) { struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); + bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts); trace_move_extent(c, buf.buf); printbuf_exit(&buf); } @@ -63,7 +106,7 @@ struct moving_io { struct data_update write; /* Must be last since it is variable size */ - struct bio_vec bi_inline_vecs[0]; + struct bio_vec bi_inline_vecs[]; }; static void move_free(struct moving_io *io) @@ -104,6 +147,15 @@ static void move_write(struct moving_io *io) return; } + if (trace_move_extent_write_enabled()) { + struct bch_fs *c = io->write.op.c; + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k)); + trace_move_extent_write(c, buf.buf); + printbuf_exit(&buf); + } + closure_get(&io->write.ctxt->cl); atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); atomic_inc(&io->write.ctxt->write_ios); @@ -211,7 +263,7 @@ void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c) trace_move_data(c, stats); } -void bch2_move_stats_init(struct bch_move_stats *stats, char *name) +void bch2_move_stats_init(struct bch_move_stats *stats, const char *name) { memset(stats, 0, sizeof(*stats)); stats->data_type = BCH_DATA_user; @@ -234,9 +286,10 @@ int bch2_move_extent(struct moving_context *ctxt, unsigned sectors = k.k->size, pages; int ret = -ENOMEM; + trace_move_extent2(c, k, &io_opts, &data_opts); + if (ctxt->stats) ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos); - trace_move_extent2(c, k); bch2_data_update_opts_normalize(k, &data_opts); @@ -342,7 +395,8 @@ err: bch2_err_matches(ret, BCH_ERR_transaction_restart)) return ret; - this_cpu_inc(c->counters[BCH_COUNTER_move_extent_start_fail]); + count_event(c, move_extent_start_fail); + if (trace_move_extent_start_fail_enabled()) { struct printbuf buf = PRINTBUF; @@ -364,13 +418,10 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, int ret = 0; if (io_opts->cur_inum != extent_k.k->p.inode) { - struct btree_iter iter; - struct bkey_s_c k; - io_opts->d.nr = 0; - for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode), - BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode), + BTREE_ITER_ALL_SNAPSHOTS, k, ({ if (k.k->p.offset != extent_k.k->p.inode) break; @@ -383,11 +434,8 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot }; bch2_inode_opts_get(&e.io_opts, trans->c, &inode); - ret = darray_push(&io_opts->d, e); - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); + darray_push(&io_opts->d, e); + })); io_opts->cur_inum = extent_k.k->p.inode; } @@ -395,12 +443,10 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, if (ret) return ERR_PTR(ret); - if (extent_k.k->p.snapshot) { - struct snapshot_io_opts_entry *i; + if (extent_k.k->p.snapshot) darray_for_each(io_opts->d, i) if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) return &i->io_opts; - } return &io_opts->fs_io_opts; } @@ -628,7 +674,7 @@ int bch2_move_data(struct bch_fs *c, return ret; } -int __bch2_evacuate_bucket(struct moving_context *ctxt, +int bch2_evacuate_bucket(struct moving_context *ctxt, struct move_bucket_in_flight *bucket_in_flight, struct bpos bucket, int gen, struct data_update_opts _data_opts) @@ -664,21 +710,19 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); bch2_trans_iter_exit(trans, &iter); - if (ret) { - bch_err_msg(c, ret, "looking up alloc key"); + bch_err_msg(c, ret, "looking up alloc key"); + if (ret) goto err; - } a = bch2_alloc_to_v4(k, &a_convert); - dirty_sectors = a->dirty_sectors; + dirty_sectors = bch2_bucket_sectors_dirty(*a); bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size; fragmentation = a->fragmentation_lru; - ret = bch2_btree_write_buffer_flush(trans); - if (ret) { - bch_err_msg(c, ret, "flushing btree write buffer"); + ret = bch2_btree_write_buffer_tryflush(trans); + bch_err_msg(c, ret, "flushing btree write buffer"); + if (ret) goto err; - } while (!(ret = bch2_move_ratelimit(ctxt))) { if (is_kthread && kthread_should_stop()) @@ -697,9 +741,6 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt, break; if (!bp.level) { - const struct bch_extent_ptr *ptr; - unsigned i = 0; - k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0); ret = bkey_err(k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -722,6 +763,7 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt, data_opts.target = io_opts.background_target; data_opts.rewrite_ptrs = 0; + unsigned i = 0; bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { if (ptr->dev == bucket.inode) { data_opts.rewrite_ptrs |= 1U << i; @@ -763,6 +805,8 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt, if (!b) goto next; + unsigned sectors = btree_ptr_sectors_written(&b->key); + ret = bch2_btree_node_rewrite(trans, &iter, b, 0); bch2_trans_iter_exit(trans, &iter); @@ -772,11 +816,10 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt, goto err; if (ctxt->rate) - bch2_ratelimit_increment(ctxt->rate, - c->opts.btree_node_size >> 9); + bch2_ratelimit_increment(ctxt->rate, sectors); if (ctxt->stats) { - atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen); - atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved); + atomic64_add(sectors, &ctxt->stats->sectors_seen); + atomic64_add(sectors, &ctxt->stats->sectors_moved); } } next: @@ -789,31 +832,13 @@ err: return ret; } -int bch2_evacuate_bucket(struct bch_fs *c, - struct bpos bucket, int gen, - struct data_update_opts data_opts, - struct bch_ratelimit *rate, - struct bch_move_stats *stats, - struct write_point_specifier wp, - bool wait_on_copygc) -{ - struct moving_context ctxt; - int ret; - - bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); - ret = __bch2_evacuate_bucket(&ctxt, NULL, bucket, gen, data_opts); - bch2_moving_ctxt_exit(&ctxt); - - return ret; -} - typedef bool (*move_btree_pred)(struct bch_fs *, void *, struct btree *, struct bch_io_opts *, struct data_update_opts *); static int bch2_move_btree(struct bch_fs *c, - enum btree_id start_btree_id, struct bpos start_pos, - enum btree_id end_btree_id, struct bpos end_pos, + struct bbpos start, + struct bbpos end, move_btree_pred pred, void *arg, struct bch_move_stats *stats) { @@ -823,7 +848,7 @@ static int bch2_move_btree(struct bch_fs *c, struct btree_trans *trans; struct btree_iter iter; struct btree *b; - enum btree_id id; + enum btree_id btree; struct data_update_opts data_opts; int ret = 0; @@ -834,15 +859,15 @@ static int bch2_move_btree(struct bch_fs *c, stats->data_type = BCH_DATA_btree; - for (id = start_btree_id; - id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1); - id++) { - stats->pos = BBPOS(id, POS_MIN); + for (btree = start.btree; + btree <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1); + btree ++) { + stats->pos = BBPOS(btree, POS_MIN); - if (!bch2_btree_id_root(c, id)->b) + if (!bch2_btree_id_root(c, btree)->b) continue; - bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0, + bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0, BTREE_ITER_PREFETCH); retry: ret = 0; @@ -852,8 +877,8 @@ retry: if (kthread && kthread_should_stop()) break; - if ((cmp_int(id, end_btree_id) ?: - bpos_cmp(b->key.k.p, end_pos)) > 0) + if ((cmp_int(btree, end.btree) ?: + bpos_cmp(b->key.k.p, end.pos)) > 0) break; stats->pos = BBPOS(iter.btree_id, iter.pos); @@ -910,7 +935,6 @@ static bool migrate_pred(struct bch_fs *c, void *arg, struct data_update_opts *data_opts) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; struct bch_ioctl_data *op = arg; unsigned i = 0; @@ -990,8 +1014,8 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) int ret; ret = bch2_move_btree(c, - 0, POS_MIN, - BTREE_ID_NR, SPOS_MAX, + BBPOS_MIN, + BBPOS_MAX, rewrite_old_nodes_pred, c, stats); if (!ret) { mutex_lock(&c->sb_lock); @@ -1006,79 +1030,109 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) return ret; } +static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, + struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + unsigned durability = bch2_bkey_durability(c, k); + unsigned replicas = bkey_is_btree_ptr(k.k) + ? c->opts.metadata_replicas + : io_opts->data_replicas; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned i = 0; + + bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) { + unsigned d = bch2_extent_ptr_durability(c, &p); + + if (d && durability - d >= replicas) { + data_opts->kill_ptrs |= BIT(i); + durability -= d; + } + + i++; + } + + return data_opts->kill_ptrs != 0; +} + +static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg, + struct btree *b, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); +} + int bch2_data_job(struct bch_fs *c, struct bch_move_stats *stats, struct bch_ioctl_data op) { + struct bbpos start = BBPOS(op.start_btree, op.start_pos); + struct bbpos end = BBPOS(op.end_btree, op.end_pos); int ret = 0; + if (op.op >= BCH_DATA_OP_NR) + return -EINVAL; + + bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]); + switch (op.op) { - case BCH_DATA_OP_REREPLICATE: - bch2_move_stats_init(stats, "rereplicate"); + case BCH_DATA_OP_rereplicate: stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, -1); - - ret = bch2_move_btree(c, - op.start_btree, op.start_pos, - op.end_btree, op.end_pos, + ret = bch2_move_btree(c, start, end, rereplicate_btree_pred, c, stats) ?: ret; - ret = bch2_replicas_gc2(c) ?: ret; - - ret = bch2_move_data(c, - (struct bbpos) { op.start_btree, op.start_pos }, - (struct bbpos) { op.end_btree, op.end_pos }, + ret = bch2_move_data(c, start, end, NULL, stats, writepoint_hashed((unsigned long) current), true, rereplicate_pred, c) ?: ret; ret = bch2_replicas_gc2(c) ?: ret; - - bch2_move_stats_exit(stats, c); break; - case BCH_DATA_OP_MIGRATE: + case BCH_DATA_OP_migrate: if (op.migrate.dev >= c->sb.nr_devices) return -EINVAL; - bch2_move_stats_init(stats, "migrate"); stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); - - ret = bch2_move_btree(c, - op.start_btree, op.start_pos, - op.end_btree, op.end_pos, + ret = bch2_move_btree(c, start, end, migrate_btree_pred, &op, stats) ?: ret; - ret = bch2_replicas_gc2(c) ?: ret; - - ret = bch2_move_data(c, - (struct bbpos) { op.start_btree, op.start_pos }, - (struct bbpos) { op.end_btree, op.end_pos }, + ret = bch2_move_data(c, start, end, NULL, stats, writepoint_hashed((unsigned long) current), true, migrate_pred, &op) ?: ret; ret = bch2_replicas_gc2(c) ?: ret; - - bch2_move_stats_exit(stats, c); break; - case BCH_DATA_OP_REWRITE_OLD_NODES: - bch2_move_stats_init(stats, "rewrite_old_nodes"); + case BCH_DATA_OP_rewrite_old_nodes: ret = bch2_scan_old_btree_nodes(c, stats); - bch2_move_stats_exit(stats, c); + break; + case BCH_DATA_OP_drop_extra_replicas: + ret = bch2_move_btree(c, start, end, + drop_extra_replicas_btree_pred, c, stats) ?: ret; + ret = bch2_move_data(c, start, end, NULL, stats, + writepoint_hashed((unsigned long) current), + true, + drop_extra_replicas_pred, c) ?: ret; + ret = bch2_replicas_gc2(c) ?: ret; break; default: ret = -EINVAL; } + bch2_move_stats_exit(stats, c); return ret; } void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) { - prt_printf(out, "%s: data type=%s pos=", - stats->name, - bch2_data_types[stats->data_type]); + prt_printf(out, "%s: data type==", stats->name); + bch2_prt_data_type(out, stats->data_type); + prt_str(out, " pos="); bch2_bbpos_to_text(out, stats->pos); prt_newline(out); printbuf_indent_add(out, 2); diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index c5a7aed2e1..9baf3093a6 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -75,6 +75,8 @@ do { \ typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c, struct bch_io_opts *, struct data_update_opts *); +extern const char * const bch2_data_ops_strs[]; + void bch2_moving_ctxt_exit(struct moving_context *); void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *, struct bch_ratelimit *, struct bch_move_stats *, @@ -134,23 +136,17 @@ int bch2_move_data(struct bch_fs *, bool, move_pred_fn, void *); -int __bch2_evacuate_bucket(struct moving_context *, +int bch2_evacuate_bucket(struct moving_context *, struct move_bucket_in_flight *, struct bpos, int, struct data_update_opts); -int bch2_evacuate_bucket(struct bch_fs *, struct bpos, int, - struct data_update_opts, - struct bch_ratelimit *, - struct bch_move_stats *, - struct write_point_specifier, - bool); int bch2_data_job(struct bch_fs *, struct bch_move_stats *, struct bch_ioctl_data); void bch2_move_stats_to_text(struct printbuf *, struct bch_move_stats *); void bch2_move_stats_exit(struct bch_move_stats *, struct bch_fs *); -void bch2_move_stats_init(struct bch_move_stats *, char *); +void bch2_move_stats_init(struct bch_move_stats *, const char *); void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *); diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index a84e79f79e..69e06a84da 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -91,7 +91,7 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, a = bch2_alloc_to_v4(k, &_a); b->k.gen = a->gen; - b->sectors = a->dirty_sectors; + b->sectors = bch2_bucket_sectors_dirty(*a); ret = data_type_movable(a->data_type) && a->fragmentation_lru && @@ -145,20 +145,21 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, { struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4); size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0; int ret; move_buckets_wait(ctxt, buckets_in_flight, false); - ret = bch2_btree_write_buffer_flush(trans); - if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()", + ret = bch2_btree_write_buffer_tryflush(trans); + if (bch2_err_matches(ret, EROFS)) + return ret; + + if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_tryflush()", __func__, bch2_err_str(ret))) return ret; - ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru, + ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru, lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0), lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX), 0, k, ({ @@ -167,15 +168,23 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, saw++; - if (!bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p))) + ret2 = bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p)); + if (ret2 < 0) + goto err; + + if (!ret2) not_movable++; else if (bucket_in_flight(buckets_in_flight, b.k)) in_flight++; else { - ret2 = darray_push(buckets, b) ?: buckets->nr >= nr_to_get; - if (ret2 >= 0) - sectors += b.sectors; + ret2 = darray_push(buckets, b); + if (ret2) + goto err; + sectors += b.sectors; } + + ret2 = buckets->nr >= nr_to_get; +err: ret2; })); @@ -198,7 +207,6 @@ static int bch2_copygc(struct moving_context *ctxt, }; move_buckets buckets = { 0 }; struct move_bucket_in_flight *f; - struct move_bucket *i; u64 moved = atomic64_read(&ctxt->stats->sectors_moved); int ret = 0; @@ -221,7 +229,7 @@ static int bch2_copygc(struct moving_context *ctxt, break; } - ret = __bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket, + ret = bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket, f->bucket.k.gen, data_opts); if (ret) goto err; @@ -259,19 +267,16 @@ err: */ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) { - struct bch_dev *ca; - unsigned dev_idx; s64 wait = S64_MAX, fragmented_allowed, fragmented; - unsigned i; - for_each_rw_member(ca, c, dev_idx) { + for_each_rw_member(c, ca) { struct bch_dev_usage usage = bch2_dev_usage_read(ca); fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) * ca->mi.bucket_size) >> 1); fragmented = 0; - for (i = 0; i < BCH_DATA_NR; i++) + for (unsigned i = 0; i < BCH_DATA_NR; i++) if (data_type_movable(i)) fragmented += usage.d[i].fragmented; @@ -313,9 +318,9 @@ static int bch2_copygc_thread(void *arg) if (!buckets) return -ENOMEM; ret = rhashtable_init(&buckets->table, &bch_move_bucket_params); + bch_err_msg(c, ret, "allocating copygc buckets in flight"); if (ret) { kfree(buckets); - bch_err_msg(c, ret, "allocating copygc buckets in flight"); return ret; } @@ -334,7 +339,8 @@ static int bch2_copygc_thread(void *arg) if (!c->copy_gc_enabled) { move_buckets_wait(&ctxt, buckets, true); - kthread_wait_freezable(c->copy_gc_enabled); + kthread_wait_freezable(c->copy_gc_enabled || + kthread_should_stop()); } if (unlikely(freezing(current))) { @@ -411,10 +417,9 @@ int bch2_copygc_start(struct bch_fs *c) t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name); ret = PTR_ERR_OR_ZERO(t); - if (ret) { - bch_err_msg(c, ret, "creating copygc thread"); + bch_err_msg(c, ret, "creating copygc thread"); + if (ret) return ret; - } get_task_struct(t); diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 8dd4046cca..b1ed0b9a20 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -52,7 +52,7 @@ const char * const bch2_csum_opts[] = { NULL }; -const char * const bch2_compression_types[] = { +const char * const __bch2_compression_types[] = { BCH_COMPRESSION_TYPES() NULL }; @@ -72,7 +72,7 @@ const char * const bch2_str_hash_opts[] = { NULL }; -const char * const bch2_data_types[] = { +const char * const __bch2_data_types[] = { BCH_DATA_TYPES() NULL }; @@ -279,14 +279,14 @@ int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err) if (err) prt_printf(err, "%s: not a multiple of 512", opt->attr.name); - return -EINVAL; + return -BCH_ERR_opt_parse_error; } if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) { if (err) prt_printf(err, "%s: must be a power of two", opt->attr.name); - return -EINVAL; + return -BCH_ERR_opt_parse_error; } if (opt->fn.validate) diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 8526f17745..9a4b7faa37 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -18,11 +18,11 @@ extern const char * const bch2_sb_compat[]; extern const char * const __bch2_btree_ids[]; extern const char * const bch2_csum_types[]; extern const char * const bch2_csum_opts[]; -extern const char * const bch2_compression_types[]; +extern const char * const __bch2_compression_types[]; extern const char * const bch2_compression_opts[]; extern const char * const bch2_str_hash_types[]; extern const char * const bch2_str_hash_opts[]; -extern const char * const bch2_data_types[]; +extern const char * const __bch2_data_types[]; extern const char * const bch2_member_states[]; extern const char * const bch2_jset_entry_types[]; extern const char * const bch2_fs_usage_types[]; @@ -233,11 +233,6 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ NULL, "Stash pointer to in memory btree node in btree ptr")\ - x(btree_write_buffer_size, u32, \ - OPT_FS|OPT_MOUNT, \ - OPT_UINT(16, (1U << 20) - 1), \ - BCH2_NO_SB_OPT, 1U << 13, \ - NULL, "Number of btree write buffer entries") \ x(gc_reserve_percent, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(5, 21), \ @@ -394,7 +389,7 @@ enum fsck_err_opts { BCH2_NO_SB_OPT, BCH_SB_SECTOR, \ "offset", "Sector offset of superblock") \ x(read_only, u8, \ - OPT_FS, \ + OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, NULL) \ @@ -419,6 +414,11 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Allocate the buckets_nouse bitmap") \ + x(stdio, u64, \ + 0, \ + OPT_UINT(0, S64_MAX), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Pointer to a struct stdio_redirect") \ x(project, u8, \ OPT_INODE, \ OPT_BOOL(), \ @@ -458,7 +458,13 @@ enum fsck_err_opts { OPT_UINT(0, BCH_REPLICAS_MAX), \ BCH2_NO_SB_OPT, 1, \ "n", "Data written to this device will be considered\n"\ - "to have already been replicated n times") + "to have already been replicated n times") \ + x(btree_node_prefetch, u8, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, true, \ + NULL, "BTREE_ITER_PREFETCH casuse btree nodes to be\n"\ + " prefetched sequentially") struct bch_opts { #define x(_name, _bits, ...) unsigned _name##_defined:1; @@ -558,6 +564,11 @@ struct bch_io_opts { #undef x }; +static inline unsigned background_compression(struct bch_io_opts opts) +{ + return opts.background_compression ?: opts.compression; +} + struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); bool bch2_opt_is_inode_opt(enum bch_opt_id); diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c index accf246c32..b27d229259 100644 --- a/fs/bcachefs/printbuf.c +++ b/fs/bcachefs/printbuf.c @@ -56,6 +56,7 @@ void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args) va_copy(args2, args); len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args2); + va_end(args2); } while (len + 1 >= printbuf_remaining(out) && !bch2_printbuf_make_room(out, len + 1)); diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index a54647c36b..e68b34eab9 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -599,14 +599,9 @@ advance: int bch2_fs_quota_read(struct bch_fs *c) { - struct bch_sb_field_quota *sb_quota; - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; - int ret; mutex_lock(&c->sb_lock); - sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); + struct bch_sb_field_quota *sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); if (!sb_quota) { mutex_unlock(&c->sb_lock); return -BCH_ERR_ENOSPC_sb_quota; @@ -615,19 +610,14 @@ int bch2_fs_quota_read(struct bch_fs *c) bch2_sb_quota_read(c); mutex_unlock(&c->sb_lock); - trans = bch2_trans_get(c); - - ret = for_each_btree_key2(trans, iter, BTREE_ID_quotas, - POS_MIN, BTREE_ITER_PREFETCH, k, - __bch2_quota_set(c, k, NULL)) ?: - for_each_btree_key2(trans, iter, BTREE_ID_inodes, - POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - bch2_fs_quota_read_inode(trans, &iter, k)); - - bch2_trans_put(trans); - - if (ret) - bch_err_fn(c, ret); + int ret = bch2_trans_run(c, + for_each_btree_key(trans, iter, BTREE_ID_quotas, POS_MIN, + BTREE_ITER_PREFETCH, k, + __bch2_quota_set(c, k, NULL)) ?: + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + bch2_fs_quota_read_inode(trans, &iter, k))); + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/quota_format.h b/fs/bcachefs/quota_format.h new file mode 100644 index 0000000000..dc34347ef6 --- /dev/null +++ b/fs/bcachefs/quota_format.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_QUOTA_FORMAT_H +#define _BCACHEFS_QUOTA_FORMAT_H + +/* KEY_TYPE_quota: */ + +enum quota_types { + QTYP_USR = 0, + QTYP_GRP = 1, + QTYP_PRJ = 2, + QTYP_NR = 3, +}; + +enum quota_counters { + Q_SPC = 0, + Q_INO = 1, + Q_COUNTERS = 2, +}; + +struct bch_quota_counter { + __le64 hardlimit; + __le64 softlimit; +}; + +struct bch_quota { + struct bch_val v; + struct bch_quota_counter c[Q_COUNTERS]; +} __packed __aligned(8); + +/* BCH_SB_FIELD_quota: */ + +struct bch_sb_quota_counter { + __le32 timelimit; + __le32 warnlimit; +}; + +struct bch_sb_quota_type { + __le64 flags; + struct bch_sb_quota_counter c[Q_COUNTERS]; +}; + +struct bch_sb_field_quota { + struct bch_sb_field field; + struct bch_sb_quota_type q[QTYP_NR]; +} __packed __aligned(8); + +#endif /* _BCACHEFS_QUOTA_FORMAT_H */ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index dd6fed2581..22d1017aa4 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -69,7 +69,7 @@ err: int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum) { - int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw, __bch2_set_rebalance_needs_scan(trans, inum)); rebalance_wakeup(c); return ret; @@ -125,7 +125,7 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, extent_entry_drop(bkey_i_to_s(n), (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n))); - return bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); + return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); } static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, @@ -171,6 +171,20 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, return bkey_s_c_null; } + if (trace_rebalance_extent_enabled()) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "target="); + bch2_target_to_text(&buf, c, r->target); + prt_str(&buf, " compression="); + bch2_compression_opt_to_text(&buf, r->compression); + prt_str(&buf, " "); + bch2_bkey_val_to_text(&buf, c, k); + + trace_rebalance_extent(c, buf.buf); + printbuf_exit(&buf); + } + return k; } @@ -239,13 +253,12 @@ static bool rebalance_pred(struct bch_fs *c, void *arg, if (k.k->p.inode) { target = io_opts->background_target; - compression = io_opts->background_compression ?: io_opts->compression; + compression = background_compression(*io_opts); } else { const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); target = r ? r->target : io_opts->background_target; - compression = r ? r->compression : - (io_opts->background_compression ?: io_opts->compression); + compression = r ? r->compression : background_compression(*io_opts); } data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression); @@ -273,7 +286,7 @@ static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie) r->state = BCH_REBALANCE_scanning; ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?: - commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_clear_rebalance_needs_scan(trans, inum, cookie)); bch2_move_stats_exit(&r->scan_stats, trans->c); @@ -371,7 +384,6 @@ static int bch2_rebalance_thread(void *arg) struct bch_fs *c = arg; struct bch_fs_rebalance *r = &c->rebalance; struct moving_context ctxt; - int ret; set_freezable(); @@ -379,8 +391,7 @@ static int bch2_rebalance_thread(void *arg) writepoint_ptr(&c->rebalance_write_point), true); - while (!kthread_should_stop() && - !(ret = do_rebalance(&ctxt))) + while (!kthread_should_stop() && !do_rebalance(&ctxt)) ; bch2_moving_ctxt_exit(&ctxt); @@ -456,10 +467,9 @@ int bch2_rebalance_start(struct bch_fs *c) p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name); ret = PTR_ERR_OR_ZERO(p); - if (ret) { - bch_err_msg(c, ret, "creating rebalance thread"); + bch_err_msg(c, ret, "creating rebalance thread"); + if (ret) return ret; - } get_task_struct(p); rcu_assign_pointer(c->rebalance.thread, p); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 5cf7d05320..21e13bb433 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -99,6 +99,11 @@ static int bch2_journal_replay_key(struct btree_trans *trans, unsigned update_flags = BTREE_TRIGGER_NORUN; int ret; + if (k->overwritten) + return 0; + + trans->journal_res.seq = k->journal_seq; + /* * BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to * keep the key cache coherent with the underlying btree. Nothing @@ -140,27 +145,13 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r) static int bch2_journal_replay(struct bch_fs *c) { struct journal_keys *keys = &c->journal_keys; - struct journal_key **keys_sorted, *k; + DARRAY(struct journal_key *) keys_sorted = { 0 }; struct journal *j = &c->journal; u64 start_seq = c->journal_replay_seq_start; u64 end_seq = c->journal_replay_seq_start; - size_t i; + struct btree_trans *trans = bch2_trans_get(c); int ret = 0; - move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); - keys->gap = keys->nr; - - keys_sorted = kvmalloc_array(keys->nr, sizeof(*keys_sorted), GFP_KERNEL); - if (!keys_sorted) - return -BCH_ERR_ENOMEM_journal_replay; - - for (i = 0; i < keys->nr; i++) - keys_sorted[i] = &keys->d[i]; - - sort(keys_sorted, keys->nr, - sizeof(keys_sorted[0]), - journal_sort_seq_cmp, NULL); - if (keys->nr) { ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)", keys->nr, start_seq, end_seq); @@ -170,27 +161,67 @@ static int bch2_journal_replay(struct bch_fs *c) BUG_ON(!atomic_read(&keys->ref)); - for (i = 0; i < keys->nr; i++) { - k = keys_sorted[i]; + /* + * First, attempt to replay keys in sorted order. This is more + * efficient - better locality of btree access - but some might fail if + * that would cause a journal deadlock. + */ + for (size_t i = 0; i < keys->nr; i++) { + cond_resched(); + + struct journal_key *k = keys->d + i; + + /* Skip fastpath if we're low on space in the journal */ + ret = c->journal.watermark ? -1 : + commit_do(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_journal_reclaim| + (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0), + bch2_journal_replay_key(trans, k)); + BUG_ON(!ret && !k->overwritten); + if (ret) { + ret = darray_push(&keys_sorted, k); + if (ret) + goto err; + } + } + /* + * Now, replay any remaining keys in the order in which they appear in + * the journal, unpinning those journal entries as we go: + */ + sort(keys_sorted.data, keys_sorted.nr, + sizeof(keys_sorted.data[0]), + journal_sort_seq_cmp, NULL); + + darray_for_each(keys_sorted, kp) { cond_resched(); + struct journal_key *k = *kp; + replay_now_at(j, k->journal_seq); - ret = bch2_trans_do(c, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL| - (!k->allocated - ? BTREE_INSERT_JOURNAL_REPLAY|BCH_WATERMARK_reclaim - : 0), + ret = commit_do(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc| + (!k->allocated + ? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim + : 0), bch2_journal_replay_key(trans, k)); - if (ret) { - bch_err(c, "journal replay: error while replaying key at btree %s level %u: %s", - bch2_btree_id_str(k->btree_id), k->level, bch2_err_str(ret)); + bch_err_msg(c, ret, "while replaying key at btree %s level %u:", + bch2_btree_id_str(k->btree_id), k->level); + if (ret) goto err; - } + + BUG_ON(!k->overwritten); } + /* + * We need to put our btree_trans before calling flush_all_pins(), since + * that will use a btree_trans internally + */ + bch2_trans_put(trans); + trans = NULL; + if (!c->opts.keep_journal) bch2_journal_keys_put_initial(c); @@ -198,16 +229,14 @@ static int bch2_journal_replay(struct bch_fs *c) j->replay_journal_seq = 0; bch2_journal_set_replay_done(j); - bch2_journal_flush_all_pins(j); - ret = bch2_journal_error(j); - if (keys->nr && !ret) + if (keys->nr) bch2_journal_log_msg(c, "journal replay finished"); err: - kvfree(keys_sorted); - - if (ret) - bch_err_fn(c, ret); + if (trans) + bch2_trans_put(trans); + darray_exit(&keys_sorted); + bch_err_fn(c, ret); return ret; } @@ -251,7 +280,7 @@ static int journal_replay_entry_early(struct bch_fs *c, le64_to_cpu(u->v); break; case BCH_FS_USAGE_inodes: - c->usage_base->nr_inodes = le64_to_cpu(u->v); + c->usage_base->b.nr_inodes = le64_to_cpu(u->v); break; case BCH_FS_USAGE_key_version: atomic64_set(&c->key_version, @@ -275,8 +304,6 @@ static int journal_replay_entry_early(struct bch_fs *c, struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev)); unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); - ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec); - for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) { ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets); ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors); @@ -317,14 +344,11 @@ static int journal_replay_entry_early(struct bch_fs *c, static int journal_replay_early(struct bch_fs *c, struct bch_sb_field_clean *clean) { - struct jset_entry *entry; - int ret; - if (clean) { - for (entry = clean->start; + for (struct jset_entry *entry = clean->start; entry != vstruct_end(&clean->field); entry = vstruct_next(entry)) { - ret = journal_replay_entry_early(c, entry); + int ret = journal_replay_entry_early(c, entry); if (ret) return ret; } @@ -339,7 +363,7 @@ static int journal_replay_early(struct bch_fs *c, continue; vstruct_for_each(&i->j, entry) { - ret = journal_replay_entry_early(c, entry); + int ret = journal_replay_entry_early(c, entry); if (ret) return ret; } @@ -435,8 +459,7 @@ static int bch2_initialize_subvolumes(struct bch_fs *c) ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?: bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?: bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -474,10 +497,9 @@ err: noinline_for_stack static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) { - int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW, + int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw, __bch2_fs_upgrade_for_subvolumes(trans)); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -495,7 +517,20 @@ static int bch2_check_allocations(struct bch_fs *c) static int bch2_set_may_go_rw(struct bch_fs *c) { - set_bit(BCH_FS_MAY_GO_RW, &c->flags); + struct journal_keys *keys = &c->journal_keys; + + /* + * After we go RW, the journal keys buffer can't be modified (except for + * setting journal_key->overwritten: it will be accessed by multiple + * threads + */ + move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); + keys->gap = keys->nr; + + set_bit(BCH_FS_may_go_rw, &c->flags); + + if (keys->nr || c->opts.fsck || !c->sb.clean) + return bch2_fs_read_write_early(c); return 0; } @@ -542,8 +577,9 @@ u64 bch2_recovery_passes_from_stable(u64 v) static bool check_version_upgrade(struct bch_fs *c) { - unsigned latest_compatible = bch2_latest_compatible_version(c->sb.version); unsigned latest_version = bcachefs_metadata_version_current; + unsigned latest_compatible = min(latest_version, + bch2_latest_compatible_version(c->sb.version)); unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version; unsigned new_version = 0; @@ -562,7 +598,7 @@ static bool check_version_upgrade(struct bch_fs *c) new_version = latest_version; break; case BCH_VERSION_UPGRADE_none: - new_version = old_version; + new_version = min(old_version, latest_version); break; } } @@ -589,17 +625,15 @@ static bool check_version_upgrade(struct bch_fs *c) bch2_version_to_text(&buf, new_version); prt_newline(&buf); - u64 recovery_passes = bch2_upgrade_recovery_passes(c, old_version, new_version); - if (recovery_passes) { - if ((recovery_passes & RECOVERY_PASS_ALL_FSCK) == RECOVERY_PASS_ALL_FSCK) - prt_str(&buf, "fsck required"); - else { - prt_str(&buf, "running recovery passes: "); - prt_bitflags(&buf, bch2_recovery_passes, recovery_passes); - } - - c->recovery_passes_explicit |= recovery_passes; - c->opts.fix_errors = FSCK_FIX_yes; + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + __le64 passes = ext->recovery_passes_required[0]; + bch2_sb_set_upgrade(c, old_version, new_version); + passes = ext->recovery_passes_required[0] & ~passes; + + if (passes) { + prt_str(&buf, " running recovery passes: "); + prt_bitflags(&buf, bch2_recovery_passes, + bch2_recovery_passes_from_stable(le64_to_cpu(passes))); } bch_info(c, "%s", buf.buf); @@ -625,7 +659,7 @@ u64 bch2_fsck_recovery_passes(void) static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { - struct recovery_pass_fn *p = recovery_pass_fns + c->curr_recovery_pass; + struct recovery_pass_fn *p = recovery_pass_fns + pass; if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read) return false; @@ -642,39 +676,62 @@ static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pa static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { + struct recovery_pass_fn *p = recovery_pass_fns + pass; int ret; - c->curr_recovery_pass = pass; + if (!(p->when & PASS_SILENT)) + bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."), + bch2_recovery_passes[pass]); + ret = p->fn(c); + if (ret) + return ret; + if (!(p->when & PASS_SILENT)) + bch2_print(c, KERN_CONT " done\n"); - if (should_run_recovery_pass(c, pass)) { - struct recovery_pass_fn *p = recovery_pass_fns + pass; + return 0; +} - if (!(p->when & PASS_SILENT)) - printk(KERN_INFO bch2_log_msg(c, "%s..."), - bch2_recovery_passes[pass]); - ret = p->fn(c); - if (ret) - return ret; - if (!(p->when & PASS_SILENT)) - printk(KERN_CONT " done\n"); +static int bch2_run_recovery_passes(struct bch_fs *c) +{ + int ret = 0; - c->recovery_passes_complete |= BIT_ULL(pass); + while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { + if (should_run_recovery_pass(c, c->curr_recovery_pass)) { + unsigned pass = c->curr_recovery_pass; + + ret = bch2_run_recovery_pass(c, c->curr_recovery_pass); + if (bch2_err_matches(ret, BCH_ERR_restart_recovery) || + (ret && c->curr_recovery_pass < pass)) + continue; + if (ret) + break; + + c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass); + } + c->curr_recovery_pass++; + c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass); } - return 0; + return ret; } -static int bch2_run_recovery_passes(struct bch_fs *c) +int bch2_run_online_recovery_passes(struct bch_fs *c) { int ret = 0; - while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { - ret = bch2_run_recovery_pass(c, c->curr_recovery_pass); - if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) + for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) { + struct recovery_pass_fn *p = recovery_pass_fns + i; + + if (!(p->when & PASS_ONLINE)) + continue; + + ret = bch2_run_recovery_pass(c, i); + if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) { + i = c->curr_recovery_pass; continue; + } if (ret) break; - c->curr_recovery_pass++; } return ret; @@ -718,7 +775,7 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } - if (!(c->opts.nochanges && c->opts.norecovery)) { + if (!c->opts.nochanges) { mutex_lock(&c->sb_lock); bool write_sb = false; @@ -748,7 +805,7 @@ int bch2_fs_recovery(struct bch_fs *c) if (bch2_check_version_downgrade(c)) { struct printbuf buf = PRINTBUF; - prt_str(&buf, "Version downgrade required:\n"); + prt_str(&buf, "Version downgrade required:"); __le64 passes = ext->recovery_passes_required[0]; bch2_sb_set_downgrade(c, @@ -756,7 +813,7 @@ int bch2_fs_recovery(struct bch_fs *c) BCH_VERSION_MINOR(c->sb.version)); passes = ext->recovery_passes_required[0] & ~passes; if (passes) { - prt_str(&buf, " running recovery passes: "); + prt_str(&buf, "\n running recovery passes: "); prt_bitflags(&buf, bch2_recovery_passes, bch2_recovery_passes_from_stable(le64_to_cpu(passes))); } @@ -779,6 +836,9 @@ int bch2_fs_recovery(struct bch_fs *c) if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); + if (c->opts.fsck) + set_bit(BCH_FS_fsck_running, &c->flags); + ret = bch2_blacklist_table_initialize(c); if (ret) { bch_err(c, "error initializing blacklist table"); @@ -919,13 +979,17 @@ use_clean: if (ret) goto err; + clear_bit(BCH_FS_fsck_running, &c->flags); + /* If we fixed errors, verify that fs is actually clean now: */ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && - test_bit(BCH_FS_ERRORS_FIXED, &c->flags) && - !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags) && - !test_bit(BCH_FS_ERROR, &c->flags)) { + test_bit(BCH_FS_errors_fixed, &c->flags) && + !test_bit(BCH_FS_errors_not_fixed, &c->flags) && + !test_bit(BCH_FS_error, &c->flags)) { + bch2_flush_fsck_errs(c); + bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean"); - clear_bit(BCH_FS_ERRORS_FIXED, &c->flags); + clear_bit(BCH_FS_errors_fixed, &c->flags); c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; @@ -933,13 +997,13 @@ use_clean: if (ret) goto err; - if (test_bit(BCH_FS_ERRORS_FIXED, &c->flags) || - test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) { + if (test_bit(BCH_FS_errors_fixed, &c->flags) || + test_bit(BCH_FS_errors_not_fixed, &c->flags)) { bch_err(c, "Second fsck run was not clean"); - set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags); + set_bit(BCH_FS_errors_not_fixed, &c->flags); } - set_bit(BCH_FS_ERRORS_FIXED, &c->flags); + set_bit(BCH_FS_errors_fixed, &c->flags); } if (enabled_qtypes(c)) { @@ -958,13 +1022,13 @@ use_clean: write_sb = true; } - if (!test_bit(BCH_FS_ERROR, &c->flags) && + if (!test_bit(BCH_FS_error, &c->flags) && !(c->disk_sb.sb->compat[0] & cpu_to_le64(1ULL << BCH_COMPAT_alloc_info))) { c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); write_sb = true; } - if (!test_bit(BCH_FS_ERROR, &c->flags)) { + if (!test_bit(BCH_FS_error, &c->flags)) { struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); if (ext && (!bch2_is_zero(ext->recovery_passes_required, sizeof(ext->recovery_passes_required)) || @@ -976,8 +1040,8 @@ use_clean: } if (c->opts.fsck && - !test_bit(BCH_FS_ERROR, &c->flags) && - !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) { + !test_bit(BCH_FS_error, &c->flags) && + !test_bit(BCH_FS_errors_not_fixed, &c->flags)) { SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0); SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 0); write_sb = true; @@ -993,8 +1057,12 @@ use_clean: bch2_move_stats_init(&stats, "recovery"); - bch_info(c, "scanning for old btree nodes"); - ret = bch2_fs_read_write(c) ?: + struct printbuf buf = PRINTBUF; + bch2_version_to_text(&buf, c->sb.version_min); + bch_info(c, "scanning for old btree nodes: min_version %s", buf.buf); + printbuf_exit(&buf); + + ret = bch2_fs_read_write_early(c) ?: bch2_scan_old_btree_nodes(c, &stats); if (ret) goto err; @@ -1007,7 +1075,6 @@ use_clean: ret = 0; out: - set_bit(BCH_FS_FSCK_DONE, &c->flags); bch2_flush_fsck_errs(c); if (!c->opts.keep_journal && @@ -1015,13 +1082,14 @@ out: bch2_journal_keys_put_initial(c); kfree(clean); - if (!ret && test_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags)) { + if (!ret && + test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) && + !c->opts.nochanges) { bch2_fs_read_write_early(c); bch2_delete_dead_snapshots_async(c); } - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; err: fsck_err: @@ -1034,8 +1102,6 @@ int bch2_fs_initialize(struct bch_fs *c) struct bch_inode_unpacked root_inode, lostfound_inode; struct bkey_inode_buf packed_inode; struct qstr lostfound = QSTR("lost+found"); - struct bch_dev *ca; - unsigned i; int ret; bch_notice(c, "initializing new filesystem"); @@ -1054,13 +1120,12 @@ int bch2_fs_initialize(struct bch_fs *c) mutex_unlock(&c->sb_lock); c->curr_recovery_pass = ARRAY_SIZE(recovery_pass_fns); - set_bit(BCH_FS_MAY_GO_RW, &c->flags); - set_bit(BCH_FS_FSCK_DONE, &c->flags); + set_bit(BCH_FS_may_go_rw, &c->flags); - for (i = 0; i < BTREE_ID_NR; i++) + for (unsigned i = 0; i < BTREE_ID_NR; i++) bch2_btree_root_alloc(c, i); - for_each_member_device(ca, c, i) + for_each_member_device(c, ca) bch2_dev_usage_init(ca); ret = bch2_fs_journal_alloc(c); @@ -1088,7 +1153,7 @@ int bch2_fs_initialize(struct bch_fs *c) if (ret) goto err; - for_each_online_member(ca, c, i) + for_each_online_member(c, ca) ca->new_fs_bucket_idx = 0; ret = bch2_fs_freespace_init(c); @@ -1112,10 +1177,9 @@ int bch2_fs_initialize(struct bch_fs *c) packed_inode.inode.k.p.snapshot = U32_MAX; ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0); - if (ret) { - bch_err_msg(c, ret, "creating root directory"); + bch_err_msg(c, ret, "creating root directory"); + if (ret) goto err; - } bch2_inode_init_early(c, &lostfound_inode); @@ -1126,10 +1190,11 @@ int bch2_fs_initialize(struct bch_fs *c) &lostfound, 0, 0, S_IFDIR|0700, 0, NULL, NULL, (subvol_inum) { 0 }, 0)); - if (ret) { - bch_err_msg(c, ret, "creating lost+found"); + bch_err_msg(c, ret, "creating lost+found"); + if (ret) goto err; - } + + c->recovery_pass_done = ARRAY_SIZE(recovery_pass_fns) - 1; if (enabled_qtypes(c)) { ret = bch2_fs_quota_read(c); @@ -1138,10 +1203,9 @@ int bch2_fs_initialize(struct bch_fs *c) } ret = bch2_journal_flush(&c->journal); - if (ret) { - bch_err_msg(c, ret, "writing first journal entry"); + bch_err_msg(c, ret, "writing first journal entry"); + if (ret) goto err; - } mutex_lock(&c->sb_lock); SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); @@ -1152,6 +1216,6 @@ int bch2_fs_initialize(struct bch_fs *c) return 0; err: - bch_err_fn(ca, ret); + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index 3a554b0751..4e9d24719b 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -31,6 +31,7 @@ static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c, } } +int bch2_run_online_recovery_passes(struct bch_fs *); u64 bch2_fsck_recovery_passes(void); int bch2_fs_recovery(struct bch_fs *); diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_types.h index d37c6fd30e..fa0c8efd2a 100644 --- a/fs/bcachefs/recovery_types.h +++ b/fs/bcachefs/recovery_types.h @@ -6,6 +6,7 @@ #define PASS_FSCK BIT(1) #define PASS_UNCLEAN BIT(2) #define PASS_ALWAYS BIT(3) +#define PASS_ONLINE BIT(4) /* * Passes may be reordered, but the second field is a persistent identifier and @@ -22,18 +23,18 @@ x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \ x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \ x(journal_replay, 9, PASS_ALWAYS) \ - x(check_alloc_info, 10, PASS_FSCK) \ - x(check_lrus, 11, PASS_FSCK) \ - x(check_btree_backpointers, 12, PASS_FSCK) \ - x(check_backpointers_to_extents, 13, PASS_FSCK) \ - x(check_extents_to_backpointers, 14, PASS_FSCK) \ - x(check_alloc_to_lru_refs, 15, PASS_FSCK) \ + x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK) \ + x(check_lrus, 11, PASS_ONLINE|PASS_FSCK) \ + x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK) \ + x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK) \ + x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK) \ + x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \ x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ x(bucket_gens_init, 17, 0) \ - x(check_snapshot_trees, 18, PASS_FSCK) \ - x(check_snapshots, 19, PASS_FSCK) \ - x(check_subvols, 20, PASS_FSCK) \ - x(delete_dead_snapshots, 21, PASS_FSCK) \ + x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ + x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ + x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ + x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ x(fs_upgrade_for_subvolumes, 22, 0) \ x(resume_logged_ops, 23, PASS_ALWAYS) \ x(check_inodes, 24, PASS_FSCK) \ @@ -41,8 +42,8 @@ x(check_indirect_extents, 26, PASS_FSCK) \ x(check_dirents, 27, PASS_FSCK) \ x(check_xattrs, 28, PASS_FSCK) \ - x(check_root, 29, PASS_FSCK) \ - x(check_directory_structure, 30, PASS_FSCK) \ + x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ + x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ x(check_nlinks, 31, PASS_FSCK) \ x(delete_dead_inodes, 32, PASS_FSCK|PASS_UNCLEAN) \ x(fix_reflink_p, 33, 0) \ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 37d16e04e6..c47c66c2b3 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -3,6 +3,7 @@ #include "bkey_buf.h" #include "btree_update.h" #include "buckets.h" +#include "error.h" #include "extents.h" #include "inode.h" #include "io_misc.h" @@ -33,15 +34,14 @@ int bch2_reflink_p_invalid(struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + int ret = 0; - if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix && - le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad)) { - prt_printf(err, "idx < front_pad (%llu < %u)", - le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad)); - return -EINVAL; - } - - return 0; + bkey_fsck_err_on(le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad), + c, err, reflink_p_front_pad_bad, + "idx < front_pad (%llu < %u)", + le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad)); +fsck_err: + return ret; } void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, @@ -73,6 +73,184 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r return true; } +static int trans_trigger_reflink_p_segment(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, + u64 *idx, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_i *k; + __le64 *refcount; + int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; + struct printbuf buf = PRINTBUF; + int ret; + + k = bch2_bkey_get_mut_noupdate(trans, &iter, + BTREE_ID_reflink, POS(0, *idx), + BTREE_ITER_WITH_UPDATES); + ret = PTR_ERR_OR_ZERO(k); + if (ret) + goto err; + + refcount = bkey_refcount(bkey_i_to_s(k)); + if (!refcount) { + bch2_bkey_val_to_text(&buf, c, p.s_c); + bch2_trans_inconsistent(trans, + "nonexistent indirect extent at %llu while marking\n %s", + *idx, buf.buf); + ret = -EIO; + goto err; + } + + if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) { + bch2_bkey_val_to_text(&buf, c, p.s_c); + bch2_trans_inconsistent(trans, + "indirect extent refcount underflow at %llu while marking\n %s", + *idx, buf.buf); + ret = -EIO; + goto err; + } + + if (flags & BTREE_TRIGGER_INSERT) { + struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; + u64 pad; + + pad = max_t(s64, le32_to_cpu(v->front_pad), + le64_to_cpu(v->idx) - bkey_start_offset(&k->k)); + BUG_ON(pad > U32_MAX); + v->front_pad = cpu_to_le32(pad); + + pad = max_t(s64, le32_to_cpu(v->back_pad), + k->k.p.offset - p.k->size - le64_to_cpu(v->idx)); + BUG_ON(pad > U32_MAX); + v->back_pad = cpu_to_le32(pad); + } + + le64_add_cpu(refcount, add); + + bch2_btree_iter_set_pos_to_extent_start(&iter); + ret = bch2_trans_update(trans, &iter, k, 0); + if (ret) + goto err; + + *idx = k->k.p.offset; +err: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ret; +} + +static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, + u64 *idx, unsigned flags, size_t r_idx) +{ + struct bch_fs *c = trans->c; + struct reflink_gc *r; + int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; + u64 start = le64_to_cpu(p.v->idx); + u64 end = le64_to_cpu(p.v->idx) + p.k->size; + u64 next_idx = end + le32_to_cpu(p.v->back_pad); + s64 ret = 0; + struct printbuf buf = PRINTBUF; + + if (r_idx >= c->reflink_gc_nr) + goto not_found; + + r = genradix_ptr(&c->reflink_gc_table, r_idx); + next_idx = min(next_idx, r->offset - r->size); + if (*idx < next_idx) + goto not_found; + + BUG_ON((s64) r->refcount + add < 0); + + r->refcount += add; + *idx = r->offset; + return 0; +not_found: + if (fsck_err(c, reflink_p_to_missing_reflink_v, + "pointer to missing indirect extent\n" + " %s\n" + " missing range %llu-%llu", + (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), + *idx, next_idx)) { + struct bkey_i *update = bch2_bkey_make_mut_noupdate(trans, p.s_c); + ret = PTR_ERR_OR_ZERO(update); + if (ret) + goto err; + + if (next_idx <= start) { + bkey_i_to_reflink_p(update)->v.front_pad = cpu_to_le32(start - next_idx); + } else if (*idx >= end) { + bkey_i_to_reflink_p(update)->v.back_pad = cpu_to_le32(*idx - end); + } else { + bkey_error_init(update); + update->k.p = p.k->p; + update->k.p.offset = next_idx; + update->k.size = next_idx - *idx; + set_bkey_val_u64s(&update->k, 0); + } + + ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, update, BTREE_TRIGGER_NORUN); + } + + *idx = next_idx; +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +static int __trigger_reflink_p(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + int ret = 0; + + u64 idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); + u64 end = le64_to_cpu(p.v->idx) + p.k->size + le32_to_cpu(p.v->back_pad); + + if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + while (idx < end && !ret) + ret = trans_trigger_reflink_p_segment(trans, p, &idx, flags); + } + + if (flags & BTREE_TRIGGER_GC) { + size_t l = 0, r = c->reflink_gc_nr; + + while (l < r) { + size_t m = l + (r - l) / 2; + struct reflink_gc *ref = genradix_ptr(&c->reflink_gc_table, m); + if (ref->offset <= idx) + l = m + 1; + else + r = m; + } + + while (idx < end && !ret) + ret = gc_trigger_reflink_p_segment(trans, p, &idx, flags, l++); + } + + return ret; +} + +int bch2_trigger_reflink_p(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, + struct bkey_s new, + unsigned flags) +{ + if ((flags & BTREE_TRIGGER_TRANSACTIONAL) && + (flags & BTREE_TRIGGER_INSERT)) { + struct bch_reflink_p *v = bkey_s_to_reflink_p(new).v; + + v->front_pad = v->back_pad = 0; + } + + return trigger_run_overwrite_then_insert(__trigger_reflink_p, trans, btree_id, level, old, new, flags); +} + /* indirect extents */ int bch2_reflink_v_invalid(struct bch_fs *c, struct bkey_s_c k, @@ -104,32 +282,26 @@ bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r } #endif -static inline void check_indirect_extent_deleting(struct bkey_i *new, unsigned *flags) +static inline void check_indirect_extent_deleting(struct bkey_s new, unsigned *flags) { if ((*flags & BTREE_TRIGGER_INSERT) && !*bkey_refcount(new)) { - new->k.type = KEY_TYPE_deleted; - new->k.size = 0; - set_bkey_val_u64s(&new->k, 0);; + new.k->type = KEY_TYPE_deleted; + new.k->size = 0; + set_bkey_val_u64s(new.k, 0); *flags &= ~BTREE_TRIGGER_INSERT; } } -int bch2_trans_mark_reflink_v(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_i *new, - unsigned flags) +int bch2_trigger_reflink_v(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s new, + unsigned flags) { - check_indirect_extent_deleting(new, &flags); - - if (old.k->type == KEY_TYPE_reflink_v && - new->k.type == KEY_TYPE_reflink_v && - old.k->u64s == new->k.u64s && - !memcmp(bkey_s_c_to_reflink_v(old).v->start, - bkey_i_to_reflink_v(new)->v.start, - bkey_val_bytes(&new->k) - 8)) - return 0; + if ((flags & BTREE_TRIGGER_TRANSACTIONAL) && + (flags & BTREE_TRIGGER_INSERT)) + check_indirect_extent_deleting(new, &flags); - return bch2_trans_mark_extent(trans, btree_id, level, old, new, flags); + return bch2_trigger_extent(trans, btree_id, level, old, new, flags); } /* indirect inline data */ @@ -152,9 +324,9 @@ void bch2_indirect_inline_data_to_text(struct printbuf *out, min(datalen, 32U), d.v->data); } -int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans, +int bch2_trigger_indirect_inline_data(struct btree_trans *trans, enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_i *new, + struct bkey_s_c old, struct bkey_s new, unsigned flags) { check_indirect_extent_deleting(new, &flags); @@ -197,7 +369,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k)); - refcount = bkey_refcount(r_v); + refcount = bkey_refcount(bkey_i_to_s(r_v)); *refcount = 0; memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k)); @@ -314,6 +486,13 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot); + if (dst_inum.inum < src_inum.inum) { + /* Avoid some lock cycle transaction restarts */ + ret = bch2_btree_iter_traverse(&dst_iter); + if (ret) + continue; + } + dst_done = dst_iter.pos.offset - dst_start.offset; src_want = POS(src_start.inode, src_start.offset + dst_done); bch2_btree_iter_set_pos(&src_iter, src_want); @@ -366,9 +545,7 @@ s64 bch2_remap_range(struct bch_fs *c, min(src_k.k->p.offset - src_want.offset, dst_end.offset - dst_iter.pos.offset)); - ret = bch2_bkey_set_needs_rebalance(c, new_dst.k, - opts.background_target, - opts.background_compression) ?: + ret = bch2_bkey_set_needs_rebalance(c, new_dst.k, &opts) ?: bch2_extent_update(trans, dst_inum, &dst_iter, new_dst.k, &disk_res, new_i_size, i_sectors_delta, @@ -398,7 +575,7 @@ s64 bch2_remap_range(struct bch_fs *c, inode_u.bi_size = new_i_size; ret2 = bch2_inode_write(trans, &inode_iter, &inode_u) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_enospc); } bch2_trans_iter_exit(trans, &inode_iter); diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index 8ccf3f9c49..4d88672897 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -9,13 +9,14 @@ int bch2_reflink_p_invalid(struct bch_fs *, struct bkey_s_c, void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); +int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s, unsigned); #define bch2_bkey_ops_reflink_p ((struct bkey_ops) { \ .key_invalid = bch2_reflink_p_invalid, \ .val_to_text = bch2_reflink_p_to_text, \ .key_merge = bch2_reflink_p_merge, \ - .trans_trigger = bch2_trans_mark_reflink_p, \ - .atomic_trigger = bch2_mark_reflink_p, \ + .trigger = bch2_trigger_reflink_p, \ .min_val_size = 16, \ }) @@ -23,15 +24,14 @@ int bch2_reflink_v_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s, unsigned); #define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \ .key_invalid = bch2_reflink_v_invalid, \ .val_to_text = bch2_reflink_v_to_text, \ .swab = bch2_ptr_swab, \ - .trans_trigger = bch2_trans_mark_reflink_v, \ - .atomic_trigger = bch2_mark_extent, \ + .trigger = bch2_trigger_reflink_v, \ .min_val_size = 8, \ }) @@ -39,15 +39,15 @@ int bch2_indirect_inline_data_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_indirect_inline_data_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_trans_mark_indirect_inline_data(struct btree_trans *, +int bch2_trigger_indirect_inline_data(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_i *, + struct bkey_s_c, struct bkey_s, unsigned); #define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \ .key_invalid = bch2_indirect_inline_data_invalid, \ .val_to_text = bch2_indirect_inline_data_to_text, \ - .trans_trigger = bch2_trans_mark_indirect_inline_data, \ + .trigger = bch2_trigger_indirect_inline_data, \ .min_val_size = 8, \ }) @@ -63,13 +63,13 @@ static inline const __le64 *bkey_refcount_c(struct bkey_s_c k) } } -static inline __le64 *bkey_refcount(struct bkey_i *k) +static inline __le64 *bkey_refcount(struct bkey_s k) { - switch (k->k.type) { + switch (k.k->type) { case KEY_TYPE_reflink_v: - return &bkey_i_to_reflink_v(k)->v.refcount; + return &bkey_s_to_reflink_v(k).v->refcount; case KEY_TYPE_indirect_inline_data: - return &bkey_i_to_indirect_inline_data(k)->v.refcount; + return &bkey_s_to_indirect_inline_data(k).v->refcount; default: return NULL; } diff --git a/fs/bcachefs/reflink_format.h b/fs/bcachefs/reflink_format.h new file mode 100644 index 0000000000..6772eebb1f --- /dev/null +++ b/fs/bcachefs/reflink_format.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REFLINK_FORMAT_H +#define _BCACHEFS_REFLINK_FORMAT_H + +struct bch_reflink_p { + struct bch_val v; + __le64 idx; + /* + * A reflink pointer might point to an indirect extent which is then + * later split (by copygc or rebalance). If we only pointed to part of + * the original indirect extent, and then one of the fragments is + * outside the range we point to, we'd leak a refcount: so when creating + * reflink pointers, we need to store pad values to remember the full + * range we were taking a reference on. + */ + __le32 front_pad; + __le32 back_pad; +} __packed __aligned(8); + +struct bch_reflink_v { + struct bch_val v; + __le64 refcount; + union bch_extent_entry start[0]; + __u64 _data[]; +} __packed __aligned(8); + +struct bch_indirect_inline_data { + struct bch_val v; + __le64 refcount; + u8 data[]; +}; + +#endif /* _BCACHEFS_REFLINK_FORMAT_H */ diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 1c4a8f5c92..cc2672c120 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -17,7 +17,7 @@ static int bch2_memcmp(const void *l, const void *r, size_t size) /* Replicas tracking - in memory: */ -static void verify_replicas_entry(struct bch_replicas_entry *e) +static void verify_replicas_entry(struct bch_replicas_entry_v1 *e) { #ifdef CONFIG_BCACHEFS_DEBUG unsigned i; @@ -32,7 +32,7 @@ static void verify_replicas_entry(struct bch_replicas_entry *e) #endif } -void bch2_replicas_entry_sort(struct bch_replicas_entry *e) +void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e) { bubble_sort(e->devs, e->nr_devs, u8_cmp); } @@ -45,36 +45,26 @@ static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) static void bch2_replicas_entry_v0_to_text(struct printbuf *out, struct bch_replicas_entry_v0 *e) { - unsigned i; - - if (e->data_type < BCH_DATA_NR) - prt_printf(out, "%s", bch2_data_types[e->data_type]); - else - prt_printf(out, "(invalid data type %u)", e->data_type); + bch2_prt_data_type(out, e->data_type); prt_printf(out, ": %u [", e->nr_devs); - for (i = 0; i < e->nr_devs; i++) + for (unsigned i = 0; i < e->nr_devs; i++) prt_printf(out, i ? " %u" : "%u", e->devs[i]); prt_printf(out, "]"); } void bch2_replicas_entry_to_text(struct printbuf *out, - struct bch_replicas_entry *e) + struct bch_replicas_entry_v1 *e) { - unsigned i; - - if (e->data_type < BCH_DATA_NR) - prt_printf(out, "%s", bch2_data_types[e->data_type]); - else - prt_printf(out, "(invalid data type %u)", e->data_type); + bch2_prt_data_type(out, e->data_type); prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs); - for (i = 0; i < e->nr_devs; i++) + for (unsigned i = 0; i < e->nr_devs; i++) prt_printf(out, i ? " %u" : "%u", e->devs[i]); prt_printf(out, "]"); } -int bch2_replicas_entry_validate(struct bch_replicas_entry *r, +int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r, struct bch_sb *sb, struct printbuf *err) { @@ -104,7 +94,7 @@ bad: void bch2_cpu_replicas_to_text(struct printbuf *out, struct bch_replicas_cpu *r) { - struct bch_replicas_entry *e; + struct bch_replicas_entry_v1 *e; bool first = true; for_each_cpu_replicas_entry(r, e) { @@ -117,7 +107,7 @@ void bch2_cpu_replicas_to_text(struct printbuf *out, } static void extent_to_replicas(struct bkey_s_c k, - struct bch_replicas_entry *r) + struct bch_replicas_entry_v1 *r) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ -137,7 +127,7 @@ static void extent_to_replicas(struct bkey_s_c k, } static void stripe_to_replicas(struct bkey_s_c k, - struct bch_replicas_entry *r) + struct bch_replicas_entry_v1 *r) { struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); const struct bch_extent_ptr *ptr; @@ -150,7 +140,7 @@ static void stripe_to_replicas(struct bkey_s_c k, r->devs[r->nr_devs++] = ptr->dev; } -void bch2_bkey_to_replicas(struct bch_replicas_entry *e, +void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e, struct bkey_s_c k) { e->nr_devs = 0; @@ -175,12 +165,10 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e, bch2_replicas_entry_sort(e); } -void bch2_devlist_to_replicas(struct bch_replicas_entry *e, +void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e, enum bch_data_type data_type, struct bch_devs_list devs) { - unsigned i; - BUG_ON(!data_type || data_type == BCH_DATA_sb || data_type >= BCH_DATA_NR); @@ -189,8 +177,8 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry *e, e->nr_devs = 0; e->nr_required = 1; - for (i = 0; i < devs.nr; i++) - e->devs[e->nr_devs++] = devs.devs[i]; + darray_for_each(devs, i) + e->devs[e->nr_devs++] = *i; bch2_replicas_entry_sort(e); } @@ -198,7 +186,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry *e, static struct bch_replicas_cpu cpu_replicas_add_entry(struct bch_fs *c, struct bch_replicas_cpu *old, - struct bch_replicas_entry *new_entry) + struct bch_replicas_entry_v1 *new_entry) { unsigned i; struct bch_replicas_cpu new = { @@ -231,7 +219,7 @@ cpu_replicas_add_entry(struct bch_fs *c, } static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, - struct bch_replicas_entry *search) + struct bch_replicas_entry_v1 *search) { int idx, entry_size = replicas_entry_bytes(search); @@ -249,7 +237,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, } int bch2_replicas_entry_idx(struct bch_fs *c, - struct bch_replicas_entry *search) + struct bch_replicas_entry_v1 *search) { bch2_replicas_entry_sort(search); @@ -257,13 +245,13 @@ int bch2_replicas_entry_idx(struct bch_fs *c, } static bool __replicas_has_entry(struct bch_replicas_cpu *r, - struct bch_replicas_entry *search) + struct bch_replicas_entry_v1 *search) { return __replicas_entry_idx(r, search) >= 0; } bool bch2_replicas_marked(struct bch_fs *c, - struct bch_replicas_entry *search) + struct bch_replicas_entry_v1 *search) { bool marked; @@ -380,7 +368,7 @@ err: static unsigned reserve_journal_replicas(struct bch_fs *c, struct bch_replicas_cpu *r) { - struct bch_replicas_entry *e; + struct bch_replicas_entry_v1 *e; unsigned journal_res_u64s = 0; /* nr_inodes: */ @@ -405,7 +393,7 @@ static unsigned reserve_journal_replicas(struct bch_fs *c, noinline static int bch2_mark_replicas_slowpath(struct bch_fs *c, - struct bch_replicas_entry *new_entry) + struct bch_replicas_entry_v1 *new_entry) { struct bch_replicas_cpu new_r, new_gc; int ret = 0; @@ -470,7 +458,7 @@ err: goto out; } -int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r) +int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r) { return likely(bch2_replicas_marked(c, r)) ? 0 : bch2_mark_replicas_slowpath(c, r); @@ -521,7 +509,7 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret) int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) { - struct bch_replicas_entry *e; + struct bch_replicas_entry_v1 *e; unsigned i = 0; lockdep_assert_held(&c->replicas_gc_lock); @@ -596,7 +584,7 @@ retry: } for (i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry *e = + struct bch_replicas_entry_v1 *e = cpu_replicas_entry(&c->replicas, i); if (e->data_type == BCH_DATA_journal || @@ -627,7 +615,7 @@ retry: } int bch2_replicas_set_usage(struct bch_fs *c, - struct bch_replicas_entry *r, + struct bch_replicas_entry_v1 *r, u64 sectors) { int ret, idx = bch2_replicas_entry_idx(c, r); @@ -660,7 +648,7 @@ static int __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, struct bch_replicas_cpu *cpu_r) { - struct bch_replicas_entry *e, *dst; + struct bch_replicas_entry_v1 *e, *dst; unsigned nr = 0, entry_size = 0, idx = 0; for_each_replicas_entry(sb_r, e) { @@ -698,7 +686,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, nr++; } - entry_size += sizeof(struct bch_replicas_entry) - + entry_size += sizeof(struct bch_replicas_entry_v1) - sizeof(struct bch_replicas_entry_v0); cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); @@ -709,7 +697,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, cpu_r->entry_size = entry_size; for_each_replicas_entry(sb_r, e) { - struct bch_replicas_entry *dst = + struct bch_replicas_entry_v1 *dst = cpu_replicas_entry(cpu_r, idx++); dst->data_type = e->data_type; @@ -753,7 +741,7 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, { struct bch_sb_field_replicas_v0 *sb_r; struct bch_replicas_entry_v0 *dst; - struct bch_replicas_entry *src; + struct bch_replicas_entry_v1 *src; size_t bytes; bytes = sizeof(struct bch_sb_field_replicas); @@ -791,7 +779,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, struct bch_replicas_cpu *r) { struct bch_sb_field_replicas *sb_r; - struct bch_replicas_entry *dst, *src; + struct bch_replicas_entry_v1 *dst, *src; bool need_v1 = false; size_t bytes; @@ -842,7 +830,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, bch2_memcmp, NULL); for (i = 0; i < cpu_r->nr; i++) { - struct bch_replicas_entry *e = + struct bch_replicas_entry_v1 *e = cpu_replicas_entry(cpu_r, i); int ret = bch2_replicas_entry_validate(e, sb, err); @@ -850,7 +838,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, return ret; if (i + 1 < cpu_r->nr) { - struct bch_replicas_entry *n = + struct bch_replicas_entry_v1 *n = cpu_replicas_entry(cpu_r, i + 1); BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0); @@ -887,7 +875,7 @@ static void bch2_sb_replicas_to_text(struct printbuf *out, struct bch_sb_field *f) { struct bch_sb_field_replicas *r = field_to_type(f, replicas); - struct bch_replicas_entry *e; + struct bch_replicas_entry_v1 *e; bool first = true; for_each_replicas_entry(r, e) { @@ -949,7 +937,7 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, unsigned flags, bool print) { - struct bch_replicas_entry *e; + struct bch_replicas_entry_v1 *e; bool ret = true; percpu_down_read(&c->mark_lock); @@ -1009,7 +997,7 @@ unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) replicas_v0 = bch2_sb_field_get(sb, replicas_v0); if (replicas) { - struct bch_replicas_entry *r; + struct bch_replicas_entry_v1 *r; for_each_replicas_entry(replicas, r) for (i = 0; i < r->nr_devs; i++) diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index f70a642775..654a4b26d3 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -6,28 +6,28 @@ #include "eytzinger.h" #include "replicas_types.h" -void bch2_replicas_entry_sort(struct bch_replicas_entry *); +void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *); void bch2_replicas_entry_to_text(struct printbuf *, - struct bch_replicas_entry *); -int bch2_replicas_entry_validate(struct bch_replicas_entry *, + struct bch_replicas_entry_v1 *); +int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *, struct bch_sb *, struct printbuf *); void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); -static inline struct bch_replicas_entry * +static inline struct bch_replicas_entry_v1 * cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) { return (void *) r->entries + r->entry_size * i; } int bch2_replicas_entry_idx(struct bch_fs *, - struct bch_replicas_entry *); + struct bch_replicas_entry_v1 *); -void bch2_devlist_to_replicas(struct bch_replicas_entry *, +void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *, enum bch_data_type, struct bch_devs_list); -bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *); +bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry_v1 *); int bch2_mark_replicas(struct bch_fs *, - struct bch_replicas_entry *); + struct bch_replicas_entry_v1 *); static inline struct replicas_delta * replicas_delta_next(struct replicas_delta *d) @@ -37,9 +37,9 @@ replicas_delta_next(struct replicas_delta *d) int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *); -void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c); +void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *, struct bkey_s_c); -static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e, +static inline void bch2_replicas_entry_cached(struct bch_replicas_entry_v1 *e, unsigned dev) { e->data_type = BCH_DATA_cached; @@ -59,7 +59,7 @@ int bch2_replicas_gc_start(struct bch_fs *, unsigned); int bch2_replicas_gc2(struct bch_fs *); int bch2_replicas_set_usage(struct bch_fs *, - struct bch_replicas_entry *, + struct bch_replicas_entry_v1 *, u64); #define for_each_cpu_replicas_entry(_r, _i) \ diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h index 5cfff489bb..ac90d142c4 100644 --- a/fs/bcachefs/replicas_types.h +++ b/fs/bcachefs/replicas_types.h @@ -5,12 +5,12 @@ struct bch_replicas_cpu { unsigned nr; unsigned entry_size; - struct bch_replicas_entry *entries; + struct bch_replicas_entry_v1 *entries; }; struct replicas_delta { s64 delta; - struct bch_replicas_entry r; + struct bch_replicas_entry_v1 r; } __packed; struct replicas_delta_list { @@ -21,7 +21,7 @@ struct replicas_delta_list { u64 nr_inodes; u64 persistent_reserved[BCH_REPLICAS_MAX]; struct {} memset_end; - struct replicas_delta d[0]; + struct replicas_delta d[]; }; #endif /* _BCACHEFS_REPLICAS_TYPES_H */ diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c index c76ad8ea5e..b6bf0ebe7e 100644 --- a/fs/bcachefs/sb-clean.c +++ b/fs/bcachefs/sb-clean.c @@ -191,13 +191,10 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry **end, u64 journal_seq) { - struct bch_dev *ca; - unsigned i, dev; - percpu_down_read(&c->mark_lock); if (!journal_seq) { - for (i = 0; i < ARRAY_SIZE(c->usage); i++) + for (unsigned i = 0; i < ARRAY_SIZE(c->usage); i++) bch2_fs_usage_acc_to_base(c, i); } else { bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK); @@ -210,7 +207,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, u->entry.type = BCH_JSET_ENTRY_usage; u->entry.btree_id = BCH_FS_USAGE_inodes; - u->v = cpu_to_le64(c->usage_base->nr_inodes); + u->v = cpu_to_le64(c->usage_base->b.nr_inodes); } { @@ -223,7 +220,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, u->v = cpu_to_le64(atomic64_read(&c->key_version)); } - for (i = 0; i < BCH_REPLICAS_MAX; i++) { + for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) { struct jset_entry_usage *u = container_of(jset_entry_init(end, sizeof(*u)), struct jset_entry_usage, entry); @@ -234,8 +231,8 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); } - for (i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry *e = + for (unsigned i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry_v1 *e = cpu_replicas_entry(&c->replicas, i); struct jset_entry_data_usage *u = container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs), @@ -247,7 +244,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, "embedded variable length struct"); } - for_each_member_device(ca, c, dev) { + for_each_member_device(c, ca) { unsigned b = sizeof(struct jset_entry_dev_usage) + sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR; struct jset_entry_dev_usage *u = @@ -255,10 +252,9 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry_dev_usage, entry); u->entry.type = BCH_JSET_ENTRY_dev_usage; - u->dev = cpu_to_le32(dev); - u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec); + u->dev = cpu_to_le32(ca->dev_idx); - for (i = 0; i < BCH_DATA_NR; i++) { + for (unsigned i = 0; i < BCH_DATA_NR; i++) { u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets); u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors); u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented); @@ -267,7 +263,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, percpu_up_read(&c->mark_lock); - for (i = 0; i < 2; i++) { + for (unsigned i = 0; i < 2; i++) { struct jset_entry_clock *clock = container_of(jset_entry_init(end, sizeof(*clock)), struct jset_entry_clock, entry); diff --git a/fs/bcachefs/counters.c b/fs/bcachefs/sb-counters.c index 02a996e06a..7dc898761b 100644 --- a/fs/bcachefs/counters.c +++ b/fs/bcachefs/sb-counters.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "super-io.h" -#include "counters.h" +#include "sb-counters.h" /* BCH_SB_FIELD_counters */ diff --git a/fs/bcachefs/counters.h b/fs/bcachefs/sb-counters.h index 4778aa19bf..81f8aec9fc 100644 --- a/fs/bcachefs/counters.h +++ b/fs/bcachefs/sb-counters.h @@ -1,11 +1,10 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_COUNTERS_H -#define _BCACHEFS_COUNTERS_H +#ifndef _BCACHEFS_SB_COUNTERS_H +#define _BCACHEFS_SB_COUNTERS_H #include "bcachefs.h" #include "super-io.h" - int bch2_sb_counters_to_cpu(struct bch_fs *); int bch2_sb_counters_from_cpu(struct bch_fs *); @@ -14,4 +13,4 @@ int bch2_fs_counters_init(struct bch_fs *); extern const struct bch_sb_field_ops bch_sb_field_ops_counters; -#endif // _BCACHEFS_COUNTERS_H +#endif // _BCACHEFS_SB_COUNTERS_H diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h new file mode 100644 index 0000000000..62ea478215 --- /dev/null +++ b/fs/bcachefs/sb-counters_format.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_COUNTERS_FORMAT_H +#define _BCACHEFS_SB_COUNTERS_FORMAT_H + +#define BCH_PERSISTENT_COUNTERS() \ + x(io_read, 0) \ + x(io_write, 1) \ + x(io_move, 2) \ + x(bucket_invalidate, 3) \ + x(bucket_discard, 4) \ + x(bucket_alloc, 5) \ + x(bucket_alloc_fail, 6) \ + x(btree_cache_scan, 7) \ + x(btree_cache_reap, 8) \ + x(btree_cache_cannibalize, 9) \ + x(btree_cache_cannibalize_lock, 10) \ + x(btree_cache_cannibalize_lock_fail, 11) \ + x(btree_cache_cannibalize_unlock, 12) \ + x(btree_node_write, 13) \ + x(btree_node_read, 14) \ + x(btree_node_compact, 15) \ + x(btree_node_merge, 16) \ + x(btree_node_split, 17) \ + x(btree_node_rewrite, 18) \ + x(btree_node_alloc, 19) \ + x(btree_node_free, 20) \ + x(btree_node_set_root, 21) \ + x(btree_path_relock_fail, 22) \ + x(btree_path_upgrade_fail, 23) \ + x(btree_reserve_get_fail, 24) \ + x(journal_entry_full, 25) \ + x(journal_full, 26) \ + x(journal_reclaim_finish, 27) \ + x(journal_reclaim_start, 28) \ + x(journal_write, 29) \ + x(read_promote, 30) \ + x(read_bounce, 31) \ + x(read_split, 33) \ + x(read_retry, 32) \ + x(read_reuse_race, 34) \ + x(move_extent_read, 35) \ + x(move_extent_write, 36) \ + x(move_extent_finish, 37) \ + x(move_extent_fail, 38) \ + x(move_extent_start_fail, 39) \ + x(copygc, 40) \ + x(copygc_wait, 41) \ + x(gc_gens_end, 42) \ + x(gc_gens_start, 43) \ + x(trans_blocked_journal_reclaim, 44) \ + x(trans_restart_btree_node_reused, 45) \ + x(trans_restart_btree_node_split, 46) \ + x(trans_restart_fault_inject, 47) \ + x(trans_restart_iter_upgrade, 48) \ + x(trans_restart_journal_preres_get, 49) \ + x(trans_restart_journal_reclaim, 50) \ + x(trans_restart_journal_res_get, 51) \ + x(trans_restart_key_cache_key_realloced, 52) \ + x(trans_restart_key_cache_raced, 53) \ + x(trans_restart_mark_replicas, 54) \ + x(trans_restart_mem_realloced, 55) \ + x(trans_restart_memory_allocation_failure, 56) \ + x(trans_restart_relock, 57) \ + x(trans_restart_relock_after_fill, 58) \ + x(trans_restart_relock_key_cache_fill, 59) \ + x(trans_restart_relock_next_node, 60) \ + x(trans_restart_relock_parent_for_fill, 61) \ + x(trans_restart_relock_path, 62) \ + x(trans_restart_relock_path_intent, 63) \ + x(trans_restart_too_many_iters, 64) \ + x(trans_restart_traverse, 65) \ + x(trans_restart_upgrade, 66) \ + x(trans_restart_would_deadlock, 67) \ + x(trans_restart_would_deadlock_write, 68) \ + x(trans_restart_injected, 69) \ + x(trans_restart_key_cache_upgrade, 70) \ + x(trans_traverse_all, 71) \ + x(transaction_commit, 72) \ + x(write_super, 73) \ + x(trans_restart_would_deadlock_recursion_limit, 74) \ + x(trans_restart_write_buffer_flush, 75) \ + x(trans_restart_split_race, 76) \ + x(write_buffer_flush_slowpath, 77) \ + x(write_buffer_flush_sync, 78) + +enum bch_persistent_counters { +#define x(t, n, ...) BCH_COUNTER_##t, + BCH_PERSISTENT_COUNTERS() +#undef x + BCH_COUNTER_NR +}; + +struct bch_sb_field_counters { + struct bch_sb_field field; + __le64 d[]; +}; + +#endif /* _BCACHEFS_SB_COUNTERS_FORMAT_H */ diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 4919237bbe..441dcb1bf1 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -12,33 +12,105 @@ #include "sb-errors.h" #include "super-io.h" +#define RECOVERY_PASS_ALL_FSCK BIT_ULL(63) + /* - * Downgrade table: - * When dowgrading past certain versions, we need to run certain recovery passes - * and fix certain errors: + * Upgrade, downgrade tables - run certain recovery passes, fix certain errors * * x(version, recovery_passes, errors...) */ +#define UPGRADE_TABLE() \ + x(backpointers, \ + RECOVERY_PASS_ALL_FSCK) \ + x(inode_v3, \ + RECOVERY_PASS_ALL_FSCK) \ + x(unwritten_extents, \ + RECOVERY_PASS_ALL_FSCK) \ + x(bucket_gens, \ + BIT_ULL(BCH_RECOVERY_PASS_bucket_gens_init)| \ + RECOVERY_PASS_ALL_FSCK) \ + x(lru_v2, \ + RECOVERY_PASS_ALL_FSCK) \ + x(fragmentation_lru, \ + RECOVERY_PASS_ALL_FSCK) \ + x(no_bps_in_alloc_keys, \ + RECOVERY_PASS_ALL_FSCK) \ + x(snapshot_trees, \ + RECOVERY_PASS_ALL_FSCK) \ + x(snapshot_skiplists, \ + BIT_ULL(BCH_RECOVERY_PASS_check_snapshots), \ + BCH_FSCK_ERR_snapshot_bad_depth, \ + BCH_FSCK_ERR_snapshot_bad_skiplist) \ + x(deleted_inodes, \ + BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ + BCH_FSCK_ERR_unlinked_inode_not_on_deleted_list) \ + x(rebalance_work, \ + BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) #define DOWNGRADE_TABLE() -struct downgrade_entry { +struct upgrade_downgrade_entry { u64 recovery_passes; u16 version; u16 nr_errors; const u16 *errors; }; -#define x(ver, passes, ...) static const u16 ver_##errors[] = { __VA_ARGS__ }; +#define x(ver, passes, ...) static const u16 upgrade_##ver##_errors[] = { __VA_ARGS__ }; +UPGRADE_TABLE() +#undef x + +static const struct upgrade_downgrade_entry upgrade_table[] = { +#define x(ver, passes, ...) { \ + .recovery_passes = passes, \ + .version = bcachefs_metadata_version_##ver,\ + .nr_errors = ARRAY_SIZE(upgrade_##ver##_errors), \ + .errors = upgrade_##ver##_errors, \ +}, +UPGRADE_TABLE() +#undef x +}; + +void bch2_sb_set_upgrade(struct bch_fs *c, + unsigned old_version, + unsigned new_version) +{ + lockdep_assert_held(&c->sb_lock); + + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + + for (const struct upgrade_downgrade_entry *i = upgrade_table; + i < upgrade_table + ARRAY_SIZE(upgrade_table); + i++) + if (i->version > old_version && i->version <= new_version) { + u64 passes = i->recovery_passes; + + if (passes & RECOVERY_PASS_ALL_FSCK) + passes |= bch2_fsck_recovery_passes(); + passes &= ~RECOVERY_PASS_ALL_FSCK; + + ext->recovery_passes_required[0] |= + cpu_to_le64(bch2_recovery_passes_to_stable(passes)); + + for (const u16 *e = i->errors; + e < i->errors + i->nr_errors; + e++) { + __set_bit(*e, c->sb.errors_silent); + ext->errors_silent[*e / 64] |= cpu_to_le64(BIT_ULL(*e % 64)); + } + } +} + +#define x(ver, passes, ...) static const u16 downgrade_ver_##errors[] = { __VA_ARGS__ }; DOWNGRADE_TABLE() #undef x -static const struct downgrade_entry downgrade_table[] = { +static const struct upgrade_downgrade_entry downgrade_table[] = { #define x(ver, passes, ...) { \ .recovery_passes = passes, \ .version = bcachefs_metadata_version_##ver,\ - .nr_errors = ARRAY_SIZE(ver_##errors), \ - .errors = ver_##errors, \ + .nr_errors = ARRAY_SIZE(downgrade_##ver##_errors), \ + .errors = downgrade_##ver##_errors, \ }, DOWNGRADE_TABLE() #undef x @@ -118,7 +190,7 @@ int bch2_sb_downgrade_update(struct bch_fs *c) darray_char table = {}; int ret = 0; - for (const struct downgrade_entry *src = downgrade_table; + for (const struct upgrade_downgrade_entry *src = downgrade_table; src < downgrade_table + ARRAY_SIZE(downgrade_table); src++) { if (BCH_VERSION_MAJOR(src->version) != BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version))) diff --git a/fs/bcachefs/sb-downgrade.h b/fs/bcachefs/sb-downgrade.h index bc48fd2ca7..57e6c916fc 100644 --- a/fs/bcachefs/sb-downgrade.h +++ b/fs/bcachefs/sb-downgrade.h @@ -5,6 +5,7 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_downgrade; int bch2_sb_downgrade_update(struct bch_fs *); +void bch2_sb_set_upgrade(struct bch_fs *, unsigned, unsigned); void bch2_sb_set_downgrade(struct bch_fs *, unsigned, unsigned); #endif /* _BCACHEFS_SB_DOWNGRADE_H */ diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index 3504c2d09c..c08aacdfd0 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -248,7 +248,9 @@ x(root_inode_not_dir, 240) \ x(dir_loop, 241) \ x(hash_table_key_duplicate, 242) \ - x(hash_table_key_wrong_offset, 243) + x(hash_table_key_wrong_offset, 243) \ + x(unlinked_inode_not_on_deleted_list, 244) \ + x(reflink_p_front_pad_bad, 245) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index bed0f857fe..eff5ce18c6 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -235,6 +235,11 @@ static void member_to_text(struct printbuf *out, prt_printf(out, "(never)"); prt_newline(out); + prt_printf(out, "Last superblock write:"); + prt_tab(out); + prt_u64(out, le64_to_cpu(m.seq)); + prt_newline(out); + prt_printf(out, "State:"); prt_tab(out); prt_printf(out, "%s", @@ -246,7 +251,7 @@ static void member_to_text(struct printbuf *out, prt_printf(out, "Data allowed:"); prt_tab(out); if (BCH_MEMBER_DATA_ALLOWED(&m)) - prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m)); + prt_bitflags(out, __bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m)); else prt_printf(out, "(none)"); prt_newline(out); @@ -254,11 +259,16 @@ static void member_to_text(struct printbuf *out, prt_printf(out, "Has data:"); prt_tab(out); if (data_have) - prt_bitflags(out, bch2_data_types, data_have); + prt_bitflags(out, __bch2_data_types, data_have); else prt_printf(out, "(none)"); prt_newline(out); + prt_str(out, "Durability:"); + prt_tab(out); + prt_printf(out, "%llu", BCH_MEMBER_DURABILITY(&m) ? BCH_MEMBER_DURABILITY(&m) - 1 : 1); + prt_newline(out); + prt_printf(out, "Discard:"); prt_tab(out); prt_printf(out, "%llu", BCH_MEMBER_DISCARD(&m)); @@ -353,14 +363,12 @@ const struct bch_sb_field_ops bch_sb_field_ops_members_v2 = { void bch2_sb_members_from_cpu(struct bch_fs *c) { struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); - struct bch_dev *ca; - unsigned i, e; rcu_read_lock(); - for_each_member_device_rcu(ca, c, i, NULL) { - struct bch_member *m = __bch2_members_v2_get_mut(mi, i); + for_each_member_device_rcu(c, ca, NULL) { + struct bch_member *m = __bch2_members_v2_get_mut(mi, ca->dev_idx); - for (e = 0; e < BCH_MEMBER_ERROR_NR; e++) + for (unsigned e = 0; e < BCH_MEMBER_ERROR_NR; e++) m->errors[e] = cpu_to_le64(atomic64_read(&ca->errors[e])); } rcu_read_unlock(); @@ -413,7 +421,7 @@ void bch2_dev_errors_reset(struct bch_dev *ca) m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); for (unsigned i = 0; i < ARRAY_SIZE(m->errors_at_reset); i++) m->errors_at_reset[i] = cpu_to_le64(atomic64_read(&ca->errors[i])); - m->errors_reset_time = ktime_get_real_seconds(); + m->errors_reset_time = cpu_to_le64(ktime_get_real_seconds()); bch2_write_super(c); mutex_unlock(&c->sb_lock); diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index 03613e3eb8..be0a941832 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -2,6 +2,8 @@ #ifndef _BCACHEFS_SB_MEMBERS_H #define _BCACHEFS_SB_MEMBERS_H +#include "darray.h" + extern char * const bch2_member_error_strs[]; static inline struct bch_member * @@ -47,23 +49,18 @@ static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs, unsigned dev) { - unsigned i; - - for (i = 0; i < devs.nr; i++) - if (devs.devs[i] == dev) + darray_for_each(devs, i) + if (*i == dev) return true; - return false; } static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs, unsigned dev) { - unsigned i; - - for (i = 0; i < devs->nr; i++) - if (devs->devs[i] == dev) { - array_remove_item(devs->devs, devs->nr, i); + darray_for_each(*devs, i) + if (*i == dev) { + darray_remove_item(devs, i); return; } } @@ -72,40 +69,48 @@ static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs, unsigned dev) { if (!bch2_dev_list_has_dev(*devs, dev)) { - BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs)); - devs->devs[devs->nr++] = dev; + BUG_ON(devs->nr >= ARRAY_SIZE(devs->data)); + devs->data[devs->nr++] = dev; } } static inline struct bch_devs_list bch2_dev_list_single(unsigned dev) { - return (struct bch_devs_list) { .nr = 1, .devs[0] = dev }; + return (struct bch_devs_list) { .nr = 1, .data[0] = dev }; } -static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter, - const struct bch_devs_mask *mask) +static inline struct bch_dev *__bch2_next_dev_idx(struct bch_fs *c, unsigned idx, + const struct bch_devs_mask *mask) { struct bch_dev *ca = NULL; - while ((*iter = mask - ? find_next_bit(mask->d, c->sb.nr_devices, *iter) - : *iter) < c->sb.nr_devices && - !(ca = rcu_dereference_check(c->devs[*iter], + while ((idx = mask + ? find_next_bit(mask->d, c->sb.nr_devices, idx) + : idx) < c->sb.nr_devices && + !(ca = rcu_dereference_check(c->devs[idx], lockdep_is_held(&c->state_lock)))) - (*iter)++; + idx++; return ca; } -#define for_each_member_device_rcu(ca, c, iter, mask) \ - for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++) +static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, struct bch_dev *ca, + const struct bch_devs_mask *mask) +{ + return __bch2_next_dev_idx(c, ca ? ca->dev_idx + 1 : 0, mask); +} -static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter) +#define for_each_member_device_rcu(_c, _ca, _mask) \ + for (struct bch_dev *_ca = NULL; \ + (_ca = __bch2_next_dev((_c), _ca, (_mask)));) + +static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca) { - struct bch_dev *ca; + if (ca) + percpu_ref_put(&ca->ref); rcu_read_lock(); - if ((ca = __bch2_next_dev(c, iter, NULL))) + if ((ca = __bch2_next_dev(c, ca, NULL))) percpu_ref_get(&ca->ref); rcu_read_unlock(); @@ -115,41 +120,42 @@ static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter /* * If you break early, you must drop your ref on the current device */ -#define for_each_member_device(ca, c, iter) \ - for ((iter) = 0; \ - (ca = bch2_get_next_dev(c, &(iter))); \ - percpu_ref_put(&ca->ref), (iter)++) +#define __for_each_member_device(_c, _ca) \ + for (; (_ca = bch2_get_next_dev(_c, _ca));) + +#define for_each_member_device(_c, _ca) \ + for (struct bch_dev *_ca = NULL; \ + (_ca = bch2_get_next_dev(_c, _ca));) static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, - unsigned *iter, - int state_mask) + struct bch_dev *ca, + unsigned state_mask) { - struct bch_dev *ca; + if (ca) + percpu_ref_put(&ca->io_ref); rcu_read_lock(); - while ((ca = __bch2_next_dev(c, iter, NULL)) && + while ((ca = __bch2_next_dev(c, ca, NULL)) && (!((1 << ca->mi.state) & state_mask) || !percpu_ref_tryget(&ca->io_ref))) - (*iter)++; + ; rcu_read_unlock(); return ca; } -#define __for_each_online_member(ca, c, iter, state_mask) \ - for ((iter) = 0; \ - (ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \ - percpu_ref_put(&ca->io_ref), (iter)++) +#define __for_each_online_member(_c, _ca, state_mask) \ + for (struct bch_dev *_ca = NULL; \ + (_ca = bch2_get_next_online_dev(_c, _ca, state_mask));) -#define for_each_online_member(ca, c, iter) \ - __for_each_online_member(ca, c, iter, ~0) +#define for_each_online_member(c, ca) \ + __for_each_online_member(c, ca, ~0) -#define for_each_rw_member(ca, c, iter) \ - __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw) +#define for_each_rw_member(c, ca) \ + __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw)) -#define for_each_readable_member(ca, c, iter) \ - __for_each_online_member(ca, c, iter, \ - (1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro)) +#define for_each_readable_member(c, ca) \ + __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro)) /* * If a key exists that references a device, the device won't be going away and @@ -175,11 +181,9 @@ static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx) static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) { struct bch_devs_mask devs; - struct bch_dev *ca; - unsigned i; memset(&devs, 0, sizeof(devs)); - for_each_online_member(ca, c, i) + for_each_online_member(c, ca) __set_bit(ca->dev_idx, devs.d); return devs; } diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 97790445e6..3a494c5d12 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -324,101 +324,57 @@ bool six_relock_ip(struct six_lock *lock, enum six_lock_type type, } EXPORT_SYMBOL_GPL(six_relock_ip); -#ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER +#ifdef CONFIG_BCACHEFS_SIX_OPTIMISTIC_SPIN -static inline bool six_can_spin_on_owner(struct six_lock *lock) +static inline bool six_owner_running(struct six_lock *lock) { - struct task_struct *owner; - bool ret; - - if (need_resched()) - return false; - + /* + * When there's no owner, we might have preempted between the owner + * acquiring the lock and setting the owner field. If we're an RT task + * that will live-lock because we won't let the owner complete. + */ rcu_read_lock(); - owner = READ_ONCE(lock->owner); - ret = !owner || owner_on_cpu(owner); + struct task_struct *owner = READ_ONCE(lock->owner); + bool ret = owner ? owner_on_cpu(owner) : !rt_task(current); rcu_read_unlock(); return ret; } -static inline bool six_spin_on_owner(struct six_lock *lock, - struct task_struct *owner, - u64 end_time) +static inline bool six_optimistic_spin(struct six_lock *lock, + struct six_lock_waiter *wait, + enum six_lock_type type) { - bool ret = true; unsigned loop = 0; - - rcu_read_lock(); - while (lock->owner == owner) { - /* - * Ensure we emit the owner->on_cpu, dereference _after_ - * checking lock->owner still matches owner. If that fails, - * owner might point to freed memory. If it still matches, - * the rcu_read_lock() ensures the memory stays valid. - */ - barrier(); - - if (!owner_on_cpu(owner) || need_resched()) { - ret = false; - break; - } - - if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) { - six_set_bitmask(lock, SIX_LOCK_NOSPIN); - ret = false; - break; - } - - cpu_relax(); - } - rcu_read_unlock(); - - return ret; -} - -static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) -{ - struct task_struct *task = current; u64 end_time; if (type == SIX_LOCK_write) return false; - preempt_disable(); - if (!six_can_spin_on_owner(lock)) - goto fail; + if (lock->wait_list.next != &wait->list) + return false; - if (!osq_lock(&lock->osq)) - goto fail; + if (atomic_read(&lock->state) & SIX_LOCK_NOSPIN) + return false; + preempt_disable(); end_time = sched_clock() + 10 * NSEC_PER_USEC; - while (1) { - struct task_struct *owner; - + while (!need_resched() && six_owner_running(lock)) { /* - * If there's an owner, wait for it to either - * release the lock or go to sleep. + * Ensures that writes to the waitlist entry happen after we see + * wait->lock_acquired: pairs with the smp_store_release in + * __six_lock_wakeup */ - owner = READ_ONCE(lock->owner); - if (owner && !six_spin_on_owner(lock, owner, end_time)) - break; - - if (do_six_trylock(lock, type, false)) { - osq_unlock(&lock->osq); + if (smp_load_acquire(&wait->lock_acquired)) { preempt_enable(); return true; } - /* - * When there's no owner, we might have preempted between the - * owner acquiring the lock and setting the owner field. If - * we're an RT task that will live-lock because we won't let - * the owner complete. - */ - if (!owner && (need_resched() || rt_task(task))) + if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) { + six_set_bitmask(lock, SIX_LOCK_NOSPIN); break; + } /* * The cpu_relax() call is a compiler barrier which forces @@ -429,24 +385,15 @@ static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type cpu_relax(); } - osq_unlock(&lock->osq); -fail: preempt_enable(); - - /* - * If we fell out of the spin path because of need_resched(), - * reschedule now, before we try-lock again. This avoids getting - * scheduled out right after we obtained the lock. - */ - if (need_resched()) - schedule(); - return false; } -#else /* CONFIG_SIX_LOCK_SPIN_ON_OWNER */ +#else /* CONFIG_LOCK_SPIN_ON_OWNER */ -static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) +static inline bool six_optimistic_spin(struct six_lock *lock, + struct six_lock_waiter *wait, + enum six_lock_type type) { return false; } @@ -470,9 +417,6 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, trace_contention_begin(lock, 0); lock_contended(&lock->dep_map, ip); - if (six_optimistic_spin(lock, type)) - goto out; - wait->task = current; wait->lock_want = type; wait->lock_acquired = false; @@ -510,6 +454,9 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, ret = 0; } + if (six_optimistic_spin(lock, wait, type)) + goto out; + while (1) { set_current_state(TASK_UNINTERRUPTIBLE); diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h index 4c268b0b83..68d46fd7f3 100644 --- a/fs/bcachefs/six.h +++ b/fs/bcachefs/six.h @@ -15,7 +15,7 @@ * will have to take write locks for the full duration of the operation. * * But by adding an intent state, which is exclusive with other intent locks but - * not with readers, we can take intent locks at thte start of the operation, + * not with readers, we can take intent locks at the start of the operation, * and then take write locks only for the actual update to each individual * nodes, without deadlocking. * @@ -65,8 +65,8 @@ * * Reentrancy: * - * Six locks are not by themselves reentrent, but have counters for both the - * read and intent states that can be used to provide reentrency by an upper + * Six locks are not by themselves reentrant, but have counters for both the + * read and intent states that can be used to provide reentrancy by an upper * layer that tracks held locks. If a lock is known to already be held in the * read or intent state, six_lock_increment() can be used to bump the "lock * held in this state" counter, increasing the number of unlock calls that @@ -127,10 +127,6 @@ #include <linux/sched.h> #include <linux/types.h> -#ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER -#include <linux/osq_lock.h> -#endif - enum six_lock_type { SIX_LOCK_read, SIX_LOCK_intent, @@ -143,9 +139,6 @@ struct six_lock { unsigned intent_lock_recurse; struct task_struct *owner; unsigned __percpu *readers; -#ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER - struct optimistic_spin_queue osq; -#endif raw_spinlock_t wait_lock; struct list_head wait_list; #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index bf5d6f4e9a..ac6ba04d55 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -123,7 +123,7 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) struct snapshot_table *t; bool ret; - EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots); + EBUG_ON(c->recovery_pass_done <= BCH_RECOVERY_PASS_check_snapshots); rcu_read_lock(); t = rcu_dereference(c->snapshots); @@ -276,7 +276,7 @@ static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id) mutex_unlock(&c->snapshot_table_lock); } -int bch2_mark_snapshot(struct btree_trans *trans, +static int __bch2_mark_snapshot(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s_c new, unsigned flags) @@ -318,7 +318,7 @@ int bch2_mark_snapshot(struct btree_trans *trans, __set_is_ancestor_bitmap(c, id); if (BCH_SNAPSHOT_DELETED(s.v)) { - set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags); + set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots) bch2_delete_dead_snapshots_async(c); } @@ -330,6 +330,14 @@ err: return ret; } +int bch2_mark_snapshot(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_s_c old, struct bkey_s new, + unsigned flags) +{ + return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags); +} + int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, struct bch_snapshot *s) { @@ -459,7 +467,6 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; - struct bkey_s_c_subvolume s; bool found = false; int ret; @@ -468,7 +475,7 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, if (k.k->type != KEY_TYPE_subvolume) continue; - s = bkey_s_c_to_subvolume(k); + struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root)) continue; if (!BCH_SUBVOLUME_SNAP(s.v)) { @@ -582,19 +589,13 @@ fsck_err: */ int bch2_check_snapshot_trees(struct bch_fs *c) { - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - ret = bch2_trans_run(c, + int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_snapshot_trees, POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_snapshot_tree(trans, &iter, k))); - - if (ret) - bch_err(c, "error %i checking snapshot trees", ret); + bch_err_fn(c, ret); return ret; } @@ -727,7 +728,7 @@ static int check_snapshot(struct btree_trans *trans, return 0; memset(&s, 0, sizeof(s)); - memcpy(&s, k.v, bkey_val_bytes(k.k)); + memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k))); id = le32_to_cpu(s.parent); if (id) { @@ -813,11 +814,10 @@ static int check_snapshot(struct btree_trans *trans, real_depth = bch2_snapshot_depth(c, parent_id); - if (le32_to_cpu(s.depth) != real_depth && - (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || - fsck_err(c, snapshot_bad_depth, - "snapshot with incorrect depth field, should be %u:\n %s", - real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { + if (fsck_err_on(le32_to_cpu(s.depth) != real_depth, + c, snapshot_bad_depth, + "snapshot with incorrect depth field, should be %u:\n %s", + real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); ret = PTR_ERR_OR_ZERO(u); if (ret) @@ -831,11 +831,9 @@ static int check_snapshot(struct btree_trans *trans, if (ret < 0) goto err; - if (!ret && - (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || - fsck_err(c, snapshot_bad_skiplist, - "snapshot with bad skiplist field:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { + if (fsck_err_on(!ret, c, snapshot_bad_skiplist, + "snapshot with bad skiplist field:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); ret = PTR_ERR_OR_ZERO(u); if (ret) @@ -856,22 +854,17 @@ fsck_err: int bch2_check_snapshots(struct bch_fs *c) { - struct btree_iter iter; - struct bkey_s_c k; - int ret; - /* * We iterate backwards as checking/fixing the depth field requires that * the parent's depth already be correct: */ - ret = bch2_trans_run(c, + int ret = bch2_trans_run(c, for_each_btree_key_reverse_commit(trans, iter, - BTREE_ID_snapshots, POS_MAX, - BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_snapshot(trans, &iter, k))); - if (ret) - bch_err_fn(c, ret); + BTREE_ID_snapshots, POS_MAX, + BTREE_ITER_PREFETCH, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + check_snapshot(trans, &iter, k))); + bch_err_fn(c, ret); return ret; } @@ -1060,6 +1053,8 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, n->v.subvol = cpu_to_le32(snapshot_subvols[i]); n->v.tree = cpu_to_le32(tree); n->v.depth = cpu_to_le32(depth); + n->v.btime.lo = cpu_to_le64(bch2_current_time(c)); + n->v.btime.hi = 0; for (j = 0; j < ARRAY_SIZE(n->v.skip); j++) n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent)); @@ -1067,7 +1062,7 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32); SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); - ret = bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, + ret = __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); if (ret) goto err; @@ -1315,7 +1310,6 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, struct bch_fs *c = trans->c; u32 nr_deleted_ancestors = 0; struct bkey_i_snapshot *s; - u32 *i; int ret; if (k.k->type != KEY_TYPE_snapshot) @@ -1368,23 +1362,19 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, int bch2_delete_dead_snapshots(struct bch_fs *c) { struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_s_c_snapshot snap; snapshot_id_list deleted = { 0 }; snapshot_id_list deleted_interior = { 0 }; - u32 *i, id; + u32 id; int ret = 0; - if (!test_and_clear_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags)) + if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) return 0; - if (!test_bit(BCH_FS_STARTED, &c->flags)) { + if (!test_bit(BCH_FS_started, &c->flags)) { ret = bch2_fs_read_write_early(c); - if (ret) { - bch_err_msg(c, ret, "deleting dead snapshots: error going rw"); + bch_err_msg(c, ret, "deleting dead snapshots: error going rw"); + if (ret) return ret; - } } trans = bch2_trans_get(c); @@ -1397,37 +1387,29 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) POS_MIN, 0, k, NULL, NULL, 0, bch2_delete_redundant_snapshot(trans, k)); - if (ret) { - bch_err_msg(c, ret, "deleting redundant snapshots"); + bch_err_msg(c, ret, "deleting redundant snapshots"); + if (ret) goto err; - } - ret = for_each_btree_key2(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, + ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, bch2_snapshot_set_equiv(trans, k)); - if (ret) { - bch_err_msg(c, ret, "in bch2_snapshots_set_equiv"); + bch_err_msg(c, ret, "in bch2_snapshots_set_equiv"); + if (ret) goto err; - } - for_each_btree_key(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, ret) { + ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ({ if (k.k->type != KEY_TYPE_snapshot) continue; - snap = bkey_s_c_to_snapshot(k); - if (BCH_SNAPSHOT_DELETED(snap.v)) { - ret = snapshot_list_add(c, &deleted, k.k->p.offset); - if (ret) - break; - } - } - bch2_trans_iter_exit(trans, &iter); - - if (ret) { - bch_err_msg(c, ret, "walking snapshots"); + BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v) + ? snapshot_list_add(c, &deleted, k.k->p.offset) + : 0; + })); + bch_err_msg(c, ret, "walking snapshots"); + if (ret) goto err; - } for (id = 0; id < BTREE_ID_NR; id++) { struct bpos last_pos = POS_MIN; @@ -1449,36 +1431,36 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) ret = for_each_btree_key_commit(trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - &res, NULL, BTREE_INSERT_NOFAIL, + &res, NULL, BCH_TRANS_COMMIT_no_enospc, snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?: for_each_btree_key_commit(trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - &res, NULL, BTREE_INSERT_NOFAIL, + &res, NULL, BCH_TRANS_COMMIT_no_enospc, move_key_to_correct_snapshot(trans, &iter, k)); bch2_disk_reservation_put(c, &res); darray_exit(&equiv_seen); - if (ret) { - bch_err_msg(c, ret, "deleting keys from dying snapshots"); + bch_err_msg(c, ret, "deleting keys from dying snapshots"); + if (ret) goto err; - } } bch2_trans_unlock(trans); down_write(&c->snapshot_create_lock); - for_each_btree_key(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, ret) { + ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ({ u32 snapshot = k.k->p.offset; u32 equiv = bch2_snapshot_equiv(c, snapshot); - if (equiv != snapshot) - snapshot_list_add(c, &deleted_interior, snapshot); - } - bch2_trans_iter_exit(trans, &iter); + equiv != snapshot + ? snapshot_list_add(c, &deleted_interior, snapshot) + : 0; + })); + bch_err_msg(c, ret, "walking snapshots"); if (ret) goto err_create_lock; @@ -1489,7 +1471,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) */ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN, BTREE_ITER_INTENT, k, - NULL, NULL, BTREE_INSERT_NOFAIL, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior)); if (ret) goto err_create_lock; @@ -1497,19 +1479,17 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) darray_for_each(deleted, i) { ret = commit_do(trans, NULL, NULL, 0, bch2_snapshot_node_delete(trans, *i)); - if (ret) { - bch_err_msg(c, ret, "deleting snapshot %u", *i); + bch_err_msg(c, ret, "deleting snapshot %u", *i); + if (ret) goto err_create_lock; - } } darray_for_each(deleted_interior, i) { ret = commit_do(trans, NULL, NULL, 0, bch2_snapshot_node_delete(trans, *i)); - if (ret) { - bch_err_msg(c, ret, "deleting snapshot %u", *i); + bch_err_msg(c, ret, "deleting snapshot %u", *i); + if (ret) goto err_create_lock; - } } err_create_lock: up_write(&c->snapshot_create_lock); @@ -1517,8 +1497,7 @@ err: darray_exit(&deleted_interior); darray_exit(&deleted); bch2_trans_put(trans); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -1680,7 +1659,7 @@ static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct if (BCH_SNAPSHOT_DELETED(snap.v) || bch2_snapshot_equiv(c, k.k->p.offset) != k.k->p.offset || (ret = bch2_snapshot_needs_delete(trans, k)) > 0) { - set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags); + set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); return 0; } @@ -1689,21 +1668,16 @@ static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct int bch2_snapshots_read(struct bch_fs *c) { - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - ret = bch2_trans_run(c, - for_each_btree_key2(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, - bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: + int ret = bch2_trans_run(c, + for_each_btree_key(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, + __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: bch2_snapshot_set_equiv(trans, k) ?: bch2_check_snapshot_needs_deletion(trans, k)) ?: - for_each_btree_key2(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, + for_each_btree_key(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, (set_is_ancestor_bitmap(c, k.k->p.offset), 0))); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index f09a22f442..7c66ffc063 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -22,12 +22,12 @@ void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_snapshot_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s_c, unsigned); + struct bkey_s_c, struct bkey_s, unsigned); #define bch2_bkey_ops_snapshot ((struct bkey_ops) { \ .key_invalid = bch2_snapshot_invalid, \ .val_to_text = bch2_snapshot_to_text, \ - .atomic_trigger = bch2_mark_snapshot, \ + .trigger = bch2_mark_snapshot, \ .min_val_size = 24, \ }) @@ -202,8 +202,6 @@ static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id) static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) { - u32 *i; - darray_for_each(*s, i) if (*i == id) return true; @@ -212,8 +210,6 @@ static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id) { - u32 *i; - darray_for_each(*s, i) if (bch2_snapshot_is_ancestor(c, id, *i)) return true; diff --git a/fs/bcachefs/snapshot_format.h b/fs/bcachefs/snapshot_format.h new file mode 100644 index 0000000000..aabcd3a74c --- /dev/null +++ b/fs/bcachefs/snapshot_format.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SNAPSHOT_FORMAT_H +#define _BCACHEFS_SNAPSHOT_FORMAT_H + +struct bch_snapshot { + struct bch_val v; + __le32 flags; + __le32 parent; + __le32 children[2]; + __le32 subvol; + /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */ + __le32 tree; + __le32 depth; + __le32 skip[3]; + bch_le128 btime; +}; + +LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) + +/* True if a subvolume points to this snapshot node: */ +LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) + +/* + * Snapshot trees: + * + * The snapshot_trees btree gives us persistent indentifier for each tree of + * bch_snapshot nodes, and allow us to record and easily find the root/master + * subvolume that other snapshots were created from: + */ +struct bch_snapshot_tree { + struct bch_val v; + __le32 master_subvol; + __le32 root_snapshot; +}; + +#endif /* _BCACHEFS_SNAPSHOT_FORMAT_H */ diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index ae21a8cca1..fcaa5a8887 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -15,6 +15,16 @@ #include <crypto/hash.h> #include <crypto/sha2.h> +typedef unsigned __bitwise bch_str_hash_flags_t; + +enum bch_str_hash_flags { + __BCH_HASH_SET_MUST_CREATE, + __BCH_HASH_SET_MUST_REPLACE, +}; + +#define BCH_HASH_SET_MUST_CREATE (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_CREATE) +#define BCH_HASH_SET_MUST_REPLACE (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_REPLACE) + static inline enum bch_str_hash_type bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) { @@ -150,21 +160,16 @@ static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, s } static __always_inline int -bch2_hash_lookup(struct btree_trans *trans, +bch2_hash_lookup_in_snapshot(struct btree_trans *trans, struct btree_iter *iter, const struct bch_hash_desc desc, const struct bch_hash_info *info, subvol_inum inum, const void *key, - unsigned flags) + unsigned flags, u32 snapshot) { struct bkey_s_c k; - u32 snapshot; int ret; - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - return ret; - for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, SPOS(inum.inum, desc.hash_key(info, key), snapshot), POS(inum.inum, U64_MAX), @@ -185,6 +190,19 @@ bch2_hash_lookup(struct btree_trans *trans, } static __always_inline int +bch2_hash_lookup(struct btree_trans *trans, + struct btree_iter *iter, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + subvol_inum inum, const void *key, + unsigned flags) +{ + u32 snapshot; + return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?: + bch2_hash_lookup_in_snapshot(trans, iter, desc, info, inum, key, flags, snapshot); +} + +static __always_inline int bch2_hash_hole(struct btree_trans *trans, struct btree_iter *iter, const struct bch_hash_desc desc, @@ -246,7 +264,7 @@ int bch2_hash_set_snapshot(struct btree_trans *trans, const struct bch_hash_info *info, subvol_inum inum, u32 snapshot, struct bkey_i *insert, - int flags, + bch_str_hash_flags_t str_hash_flags, int update_flags) { struct btree_iter iter, slot = { NULL }; @@ -269,7 +287,7 @@ int bch2_hash_set_snapshot(struct btree_trans *trans, } if (!slot.path && - !(flags & BCH_HASH_SET_MUST_REPLACE)) + !(str_hash_flags & BCH_HASH_SET_MUST_REPLACE)) bch2_trans_copy_iter(&slot, &iter); if (k.k->type != KEY_TYPE_hash_whiteout) @@ -287,16 +305,16 @@ found: found = true; not_found: - if (!found && (flags & BCH_HASH_SET_MUST_REPLACE)) { + if (!found && (str_hash_flags & BCH_HASH_SET_MUST_REPLACE)) { ret = -BCH_ERR_ENOENT_str_hash_set_must_replace; - } else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) { + } else if (found && (str_hash_flags & BCH_HASH_SET_MUST_CREATE)) { ret = -EEXIST; } else { if (!found && slot.path) swap(iter, slot); insert->k.p = iter.pos; - ret = bch2_trans_update(trans, &iter, insert, 0); + ret = bch2_trans_update(trans, &iter, insert, update_flags); } goto out; @@ -307,7 +325,8 @@ int bch2_hash_set(struct btree_trans *trans, const struct bch_hash_desc desc, const struct bch_hash_info *info, subvol_inum inum, - struct bkey_i *insert, int flags) + struct bkey_i *insert, + bch_str_hash_flags_t str_hash_flags) { u32 snapshot; int ret; @@ -319,7 +338,7 @@ int bch2_hash_set(struct btree_trans *trans, insert->k.p.inode = inum.inum; return bch2_hash_set_snapshot(trans, desc, info, inum, - snapshot, insert, flags, 0); + snapshot, insert, str_hash_flags, 0); } static __always_inline diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 22b34a8e4d..7c67c28d3e 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -37,11 +37,8 @@ static int check_subvol(struct btree_trans *trans, return ret; if (BCH_SUBVOLUME_UNLINKED(subvol.v)) { - bch2_fs_lazy_rw(c); - ret = bch2_subvolume_delete(trans, iter->pos.offset); - if (ret) - bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset); + bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset); return ret ?: -BCH_ERR_transaction_restart_nested; } @@ -82,17 +79,12 @@ fsck_err: int bch2_check_subvols(struct bch_fs *c) { - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - ret = bch2_trans_run(c, + int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, - BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_subvol(trans, &iter, k))); - if (ret) - bch_err_fn(c, ret); + BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + check_subvol(trans, &iter, k))); + bch_err_fn(c, ret); return ret; } @@ -228,8 +220,6 @@ static int bch2_subvolume_reparent(struct btree_trans *trans, */ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_delete) { - struct btree_iter iter; - struct bkey_s_c k; struct bch_subvolume s; return lockrestart_do(trans, @@ -237,7 +227,7 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d BTREE_ITER_CACHED, &s)) ?: for_each_btree_key_commit(trans, iter, BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_NOFAIL, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_subvolume_reparent(trans, &iter, k, subvolid_to_delete, le32_to_cpu(s.parent))); } @@ -274,7 +264,7 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) { return bch2_subvolumes_reparent(trans, subvolid) ?: - commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, __bch2_subvolume_delete(trans, subvolid)); } @@ -299,10 +289,9 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor for (id = s.data; id < s.data + s.nr; id++) { ret = bch2_trans_run(c, bch2_subvolume_delete(trans, *id)); - if (ret) { - bch_err_msg(c, ret, "deleting subvolume %u", *id); + bch_err_msg(c, ret, "deleting subvolume %u", *id); + if (ret) break; - } } darray_exit(&s); diff --git a/fs/bcachefs/subvolume_format.h b/fs/bcachefs/subvolume_format.h new file mode 100644 index 0000000000..af79134b07 --- /dev/null +++ b/fs/bcachefs/subvolume_format.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUBVOLUME_FORMAT_H +#define _BCACHEFS_SUBVOLUME_FORMAT_H + +#define SUBVOL_POS_MIN POS(0, 1) +#define SUBVOL_POS_MAX POS(0, S32_MAX) +#define BCACHEFS_ROOT_SUBVOL 1 + +struct bch_subvolume { + struct bch_val v; + __le32 flags; + __le32 snapshot; + __le64 inode; + /* + * Snapshot subvolumes form a tree, separate from the snapshot nodes + * tree - if this subvolume is a snapshot, this is the ID of the + * subvolume it was created from: + * + * This is _not_ necessarily the subvolume of the directory containing + * this subvolume: + */ + __le32 parent; + __le32 pad; + bch_le128 otime; +}; + +LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1) +/* + * We need to know whether a subvolume is a snapshot so we can know whether we + * can delete it (or whether it should just be rm -rf'd) + */ +LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2) +LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3) + +#endif /* _BCACHEFS_SUBVOLUME_FORMAT_H */ diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h index 2d2e66a4e4..ae644adfc3 100644 --- a/fs/bcachefs/subvolume_types.h +++ b/fs/bcachefs/subvolume_types.h @@ -20,7 +20,11 @@ struct snapshot_t { }; struct snapshot_table { +#ifndef RUST_BINDGEN DECLARE_FLEX_ARRAY(struct snapshot_t, s); +#else + struct snapshot_t s[0]; +#endif }; typedef struct { diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index c925dc5742..36988add58 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -2,7 +2,6 @@ #include "bcachefs.h" #include "checksum.h" -#include "counters.h" #include "disk_groups.h" #include "ec.h" #include "error.h" @@ -13,6 +12,7 @@ #include "replicas.h" #include "quota.h" #include "sb-clean.h" +#include "sb-counters.h" #include "sb-downgrade.h" #include "sb-errors.h" #include "sb-members.h" @@ -30,14 +30,12 @@ static const struct blk_holder_ops bch2_sb_handle_bdev_ops = { struct bch2_metadata_version { u16 version; const char *name; - u64 recovery_passes; }; static const struct bch2_metadata_version bch2_metadata_versions[] = { -#define x(n, v, _recovery_passes) { \ +#define x(n, v) { \ .version = v, \ .name = #n, \ - .recovery_passes = _recovery_passes, \ }, BCH_METADATA_VERSIONS() #undef x @@ -70,24 +68,6 @@ unsigned bch2_latest_compatible_version(unsigned v) return v; } -u64 bch2_upgrade_recovery_passes(struct bch_fs *c, - unsigned old_version, - unsigned new_version) -{ - u64 ret = 0; - - for (const struct bch2_metadata_version *i = bch2_metadata_versions; - i < bch2_metadata_versions + ARRAY_SIZE(bch2_metadata_versions); - i++) - if (i->version > old_version && i->version <= new_version) { - if (i->recovery_passes & RECOVERY_PASS_ALL_FSCK) - ret |= bch2_fsck_recovery_passes(); - ret |= i->recovery_passes; - } - - return ret &= ~RECOVERY_PASS_ALL_FSCK; -} - const char * const bch2_sb_fields[] = { #define x(name, nr) #name, BCH_SB_FIELDS() @@ -101,8 +81,6 @@ static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *, struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *sb, enum bch_sb_field_type type) { - struct bch_sb_field *f; - /* XXX: need locking around superblock to access optional fields */ vstruct_for_each(sb, f) @@ -164,8 +142,8 @@ void bch2_sb_field_delete(struct bch_sb_handle *sb, void bch2_free_super(struct bch_sb_handle *sb) { kfree(sb->bio); - if (!IS_ERR_OR_NULL(sb->bdev)) - blkdev_put(sb->bdev, sb->holder); + if (!IS_ERR_OR_NULL(sb->bdev_handle)) + bdev_release(sb->bdev_handle); kfree(sb->holder); kfree(sb->sb_name); @@ -192,8 +170,12 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; if (new_bytes > max_bytes) { - pr_err("%pg: superblock too big: want %zu but have %llu", - sb->bdev, new_bytes, max_bytes); + struct printbuf buf = PRINTBUF; + + prt_bdevname(&buf, sb->bdev); + prt_printf(&buf, ": superblock too big: want %zu but have %llu", new_bytes, max_bytes); + pr_err("%s", buf.buf); + printbuf_exit(&buf); return -BCH_ERR_ENOSPC_sb; } } @@ -241,14 +223,12 @@ struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb, if (sb->fs_sb) { struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb); - struct bch_dev *ca; - unsigned i; lockdep_assert_held(&c->sb_lock); /* XXX: we're not checking that offline device have enough space */ - for_each_online_member(ca, c, i) { + for_each_online_member(c, ca) { struct bch_sb_handle *dev_sb = &ca->disk_sb; if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) { @@ -368,7 +348,6 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, int rw) { struct bch_sb *sb = disk_sb->sb; - struct bch_sb_field *f; struct bch_sb_field_members_v1 *mi; enum bch_opt_id opt_id; u16 block_size; @@ -514,8 +493,6 @@ static void le_bitvector_to_cpu(unsigned long *dst, unsigned long *src, unsigned static void bch2_sb_update(struct bch_fs *c) { struct bch_sb *src = c->disk_sb.sb; - struct bch_dev *ca; - unsigned i; lockdep_assert_held(&c->sb_lock); @@ -546,7 +523,7 @@ static void bch2_sb_update(struct bch_fs *c) le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent, sizeof(c->sb.errors_silent) * 8); - for_each_member_device(ca, c, i) { + for_each_member_device(c, ca) { struct bch_member m = bch2_sb_member_get(src, ca->dev_idx); ca->mi = bch2_mi_to_cpu(&m); } @@ -571,6 +548,7 @@ static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) dst->time_base_lo = src->time_base_lo; dst->time_base_hi = src->time_base_hi; dst->time_precision = src->time_precision; + dst->write_time = src->write_time; memcpy(dst->flags, src->flags, sizeof(dst->flags)); memcpy(dst->features, src->features, sizeof(dst->features)); @@ -634,7 +612,6 @@ int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca) static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err) { - struct bch_csum csum; size_t bytes; int ret; reread: @@ -650,7 +627,9 @@ reread: if (!uuid_equal(&sb->sb->magic, &BCACHE_MAGIC) && !uuid_equal(&sb->sb->magic, &BCHFS_MAGIC)) { - prt_printf(err, "Not a bcachefs superblock"); + prt_str(err, "Not a bcachefs superblock (got magic "); + pr_uuid(err, sb->sb->magic.b); + prt_str(err, ")"); return -BCH_ERR_invalid_sb_magic; } @@ -673,17 +652,16 @@ reread: goto reread; } - if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) { + enum bch_csum_type csum_type = BCH_SB_CSUM_TYPE(sb->sb); + if (csum_type >= BCH_CSUM_NR) { prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb)); return -BCH_ERR_invalid_sb_csum_type; } /* XXX: verify MACs */ - csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb), - null_nonce(), sb->sb); - + struct bch_csum csum = csum_vstruct(NULL, csum_type, null_nonce(), sb->sb); if (bch2_crc_cmp(csum, sb->sb->csum)) { - prt_printf(err, "bad checksum"); + bch2_csum_err_msg(err, csum_type, sb->sb->csum, csum); return -BCH_ERR_invalid_sb_csum; } @@ -692,12 +670,13 @@ reread: return 0; } -int bch2_read_super(const char *path, struct bch_opts *opts, - struct bch_sb_handle *sb) +static int __bch2_read_super(const char *path, struct bch_opts *opts, + struct bch_sb_handle *sb, bool ignore_notbchfs_msg) { u64 offset = opt_get(*opts, sb); struct bch_sb_layout layout; struct printbuf err = PRINTBUF; + struct printbuf err2 = PRINTBUF; __le64 *i; int ret; #ifndef __KERNEL__ @@ -725,21 +704,22 @@ retry: if (!opt_get(*opts, nochanges)) sb->mode |= BLK_OPEN_WRITE; - sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); - if (IS_ERR(sb->bdev) && - PTR_ERR(sb->bdev) == -EACCES && + sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); + if (IS_ERR(sb->bdev_handle) && + PTR_ERR(sb->bdev_handle) == -EACCES && opt_get(*opts, read_only)) { sb->mode &= ~BLK_OPEN_WRITE; - sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); - if (!IS_ERR(sb->bdev)) + sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); + if (!IS_ERR(sb->bdev_handle)) opt_set(*opts, nochanges, true); } - if (IS_ERR(sb->bdev)) { - ret = PTR_ERR(sb->bdev); - goto out; + if (IS_ERR(sb->bdev_handle)) { + ret = PTR_ERR(sb->bdev_handle); + goto err; } + sb->bdev = sb->bdev_handle->bdev; ret = bch2_sb_realloc(sb, 0); if (ret) { @@ -760,8 +740,14 @@ retry: if (opt_defined(*opts, sb)) goto err; - printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s\n", + prt_printf(&err2, "bcachefs (%s): error reading default superblock: %s\n", path, err.buf); + if (ret == -BCH_ERR_invalid_sb_magic && ignore_notbchfs_msg) + printk(KERN_INFO "%s", err2.buf); + else + printk(KERN_ERR "%s", err2.buf); + + printbuf_exit(&err2); printbuf_reset(&err); /* @@ -837,6 +823,20 @@ err_no_print: goto out; } +int bch2_read_super(const char *path, struct bch_opts *opts, + struct bch_sb_handle *sb) +{ + return __bch2_read_super(path, opts, sb, false); +} + +/* provide a silenced version for mount.bcachefs */ + +int bch2_read_super_silent(const char *path, struct bch_opts *opts, + struct bch_sb_handle *sb) +{ + return __bch2_read_super(path, opts, sb, true); +} + /* write superblock: */ static void write_super_endio(struct bio *bio) @@ -905,9 +905,8 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) int bch2_write_super(struct bch_fs *c) { struct closure *cl = &c->sb_write; - struct bch_dev *ca; struct printbuf err = PRINTBUF; - unsigned i, sb = 0, nr_wrote; + unsigned sb = 0, nr_wrote; struct bch_devs_mask sb_written; bool wrote, can_mount_without_written, can_mount_with_written; unsigned degraded_flags = BCH_FORCE_IF_DEGRADED; @@ -929,9 +928,14 @@ int bch2_write_super(struct bch_fs *c) le64_add_cpu(&c->disk_sb.sb->seq, 1); - if (test_bit(BCH_FS_ERROR, &c->flags)) + struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); + for_each_online_member(c, ca) + __bch2_members_v2_get_mut(mi, ca->dev_idx)->seq = c->disk_sb.sb->seq; + c->disk_sb.sb->write_time = cpu_to_le64(ktime_get_real_seconds()); + + if (test_bit(BCH_FS_error, &c->flags)) SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1); - if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags)) + if (test_bit(BCH_FS_topology_error, &c->flags)) SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 1); SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN); @@ -942,10 +946,10 @@ int bch2_write_super(struct bch_fs *c) bch2_sb_errors_from_cpu(c); bch2_sb_downgrade_update(c); - for_each_online_member(ca, c, i) + for_each_online_member(c, ca) bch2_sb_from_fs(c, ca); - for_each_online_member(ca, c, i) { + for_each_online_member(c, ca) { printbuf_reset(&err); ret = bch2_sb_validate(&ca->disk_sb, &err, WRITE); @@ -978,16 +982,16 @@ int bch2_write_super(struct bch_fs *c) return -BCH_ERR_sb_not_downgraded; } - for_each_online_member(ca, c, i) { + for_each_online_member(c, ca) { __set_bit(ca->dev_idx, sb_written.d); ca->sb_write_error = 0; } - for_each_online_member(ca, c, i) + for_each_online_member(c, ca) read_back_super(c, ca); closure_sync(cl); - for_each_online_member(ca, c, i) { + for_each_online_member(c, ca) { if (ca->sb_write_error) continue; @@ -1014,7 +1018,7 @@ int bch2_write_super(struct bch_fs *c) do { wrote = false; - for_each_online_member(ca, c, i) + for_each_online_member(c, ca) if (!ca->sb_write_error && sb < ca->disk_sb.sb->layout.nr_superblocks) { write_one_super(c, ca, sb); @@ -1024,7 +1028,7 @@ int bch2_write_super(struct bch_fs *c) sb++; } while (wrote); - for_each_online_member(ca, c, i) { + for_each_online_member(c, ca) { if (ca->sb_write_error) __clear_bit(ca->dev_idx, sb_written.d); else @@ -1036,7 +1040,7 @@ int bch2_write_super(struct bch_fs *c) can_mount_with_written = bch2_have_enough_devs(c, sb_written, degraded_flags, false); - for (i = 0; i < ARRAY_SIZE(sb_written.d); i++) + for (unsigned i = 0; i < ARRAY_SIZE(sb_written.d); i++) sb_written.d[i] = ~sb_written.d[i]; can_mount_without_written = @@ -1193,8 +1197,8 @@ static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f, return ret; } -void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) +void __bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) { unsigned type = le32_to_cpu(f->type); const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type); @@ -1202,6 +1206,15 @@ void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, if (!out->nr_tabstops) printbuf_tabstop_push(out, 32); + if (ops->to_text) + ops->to_text(out, sb, f); +} + +void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + unsigned type = le32_to_cpu(f->type); + if (type < BCH_SB_FIELD_NR) prt_printf(out, "%s", bch2_sb_fields[type]); else @@ -1210,11 +1223,7 @@ void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, prt_printf(out, " (size %zu):", vstruct_bytes(f)); prt_newline(out); - if (ops->to_text) { - printbuf_indent_add(out, 2); - ops->to_text(out, sb, f); - printbuf_indent_sub(out, 2); - } + __bch2_sb_field_to_text(out, sb, f); } void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l) @@ -1243,7 +1252,6 @@ void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l) void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, bool print_layout, unsigned fields) { - struct bch_sb_field *f; u64 fields_have = 0; unsigned nr_devices = 0; @@ -1263,6 +1271,11 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, pr_uuid(out, sb->uuid.b); prt_newline(out); + prt_printf(out, "Magic number:"); + prt_tab(out); + pr_uuid(out, sb->magic.b); + prt_newline(out); + prt_str(out, "Device index:"); prt_tab(out); prt_printf(out, "%u", sb->dev_idx); @@ -1301,9 +1314,16 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, prt_printf(out, "%llu", le64_to_cpu(sb->seq)); prt_newline(out); + prt_printf(out, "Time of last write:"); + prt_tab(out); + bch2_prt_datetime(out, le64_to_cpu(sb->write_time)); + prt_newline(out); + prt_printf(out, "Superblock size:"); prt_tab(out); - prt_printf(out, "%zu", vstruct_bytes(sb)); + prt_units_u64(out, vstruct_bytes(sb)); + prt_str(out, "/"); + prt_units_u64(out, 512ULL << sb->layout.sb_max_size_bits); prt_newline(out); prt_printf(out, "Clean:"); diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index e41e5de531..95e80e0631 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -19,10 +19,6 @@ static inline bool bch2_version_compatible(u16 version) void bch2_version_to_text(struct printbuf *, unsigned); unsigned bch2_latest_compatible_version(unsigned); -u64 bch2_upgrade_recovery_passes(struct bch_fs *c, - unsigned, - unsigned); - static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f) { return le32_to_cpu(f->u64s) * sizeof(u64); @@ -84,6 +80,7 @@ void bch2_free_super(struct bch_sb_handle *); int bch2_sb_realloc(struct bch_sb_handle *, unsigned); int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); +int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *); int bch2_write_super(struct bch_fs *); void __bch2_check_set_feature(struct bch_fs *, unsigned); @@ -96,6 +93,8 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) bool bch2_check_version_downgrade(struct bch_fs *); void bch2_sb_upgrade(struct bch_fs *, unsigned); +void __bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, + struct bch_sb_field *); void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, struct bch_sb_field *); void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 818ec467a0..6b23e11825 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -23,7 +23,6 @@ #include "checksum.h" #include "clock.h" #include "compress.h" -#include "counters.h" #include "debug.h" #include "disk_groups.h" #include "ec.h" @@ -49,6 +48,7 @@ #include "recovery.h" #include "replicas.h" #include "sb-clean.h" +#include "sb-counters.h" #include "sb-errors.h" #include "sb-members.h" #include "snapshot.h" @@ -79,6 +79,36 @@ MODULE_SOFTDEP("pre: chacha20"); MODULE_SOFTDEP("pre: poly1305"); MODULE_SOFTDEP("pre: xxhash"); +const char * const bch2_fs_flag_strs[] = { +#define x(n) #n, + BCH_FS_FLAGS() +#undef x + NULL +}; + +void __bch2_print(struct bch_fs *c, const char *fmt, ...) +{ + struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c); + + va_list args; + va_start(args, fmt); + if (likely(!stdio)) { + vprintk(fmt, args); + } else { + unsigned long flags; + + if (fmt[0] == KERN_SOH[0]) + fmt += 2; + + spin_lock_irqsave(&stdio->output_lock, flags); + prt_vprintf(&stdio->output_buf, fmt, args); + spin_unlock_irqrestore(&stdio->output_lock, flags); + + wake_up(&stdio->output_wait); + } + va_end(args); +} + #define KTYPE(type) \ static const struct attribute_group type ## _group = { \ .attrs = type ## _files \ @@ -134,14 +164,12 @@ static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *); struct bch_fs *bch2_dev_to_fs(dev_t dev) { struct bch_fs *c; - struct bch_dev *ca; - unsigned i; mutex_lock(&bch_fs_list_lock); rcu_read_lock(); list_for_each_entry(c, &bch_fs_list, list) - for_each_member_device_rcu(ca, c, i, NULL) + for_each_member_device_rcu(c, ca, NULL) if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) { closure_get(&c->cl); goto found; @@ -182,14 +210,13 @@ struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid) static void bch2_dev_usage_journal_reserve(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i, nr = 0, u64s = + unsigned nr = 0, u64s = ((sizeof(struct jset_entry_dev_usage) + sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR)) / sizeof(u64); rcu_read_lock(); - for_each_member_device_rcu(ca, c, i, NULL) + for_each_member_device_rcu(c, ca, NULL) nr++; rcu_read_unlock(); @@ -216,8 +243,7 @@ static void bch2_dev_usage_journal_reserve(struct bch_fs *c) static void __bch2_fs_read_only(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i, clean_passes = 0; + unsigned clean_passes = 0; u64 seq = 0; bch2_fs_ec_stop(c); @@ -246,14 +272,14 @@ static void __bch2_fs_read_only(struct bch_fs *c) journal_cur_seq(&c->journal)); if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) && - !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) - set_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags); + !test_bit(BCH_FS_emergency_ro, &c->flags)) + set_bit(BCH_FS_clean_shutdown, &c->flags); bch2_fs_journal_stop(&c->journal); /* * After stopping journal: */ - for_each_member_device(ca, c, i) + for_each_member_device(c, ca) bch2_dev_allocator_remove(c, ca); } @@ -262,25 +288,27 @@ static void bch2_writes_disabled(struct percpu_ref *writes) { struct bch_fs *c = container_of(writes, struct bch_fs, writes); - set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); + set_bit(BCH_FS_write_disable_complete, &c->flags); wake_up(&bch2_read_only_wait); } #endif void bch2_fs_read_only(struct bch_fs *c) { - if (!test_bit(BCH_FS_RW, &c->flags)) { + if (!test_bit(BCH_FS_rw, &c->flags)) { bch2_journal_reclaim_stop(&c->journal); return; } - BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); + BUG_ON(test_bit(BCH_FS_write_disable_complete, &c->flags)); + + bch_verbose(c, "going read-only"); /* * Block new foreground-end write operations from starting - any new * writes will return -EROFS: */ - set_bit(BCH_FS_GOING_RO, &c->flags); + set_bit(BCH_FS_going_ro, &c->flags); #ifndef BCH_WRITE_REF_DEBUG percpu_ref_kill(&c->writes); #else @@ -300,33 +328,42 @@ void bch2_fs_read_only(struct bch_fs *c) * that going RO is complete: */ wait_event(bch2_read_only_wait, - test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) || - test_bit(BCH_FS_EMERGENCY_RO, &c->flags)); + test_bit(BCH_FS_write_disable_complete, &c->flags) || + test_bit(BCH_FS_emergency_ro, &c->flags)); + + bool writes_disabled = test_bit(BCH_FS_write_disable_complete, &c->flags); + if (writes_disabled) + bch_verbose(c, "finished waiting for writes to stop"); __bch2_fs_read_only(c); wait_event(bch2_read_only_wait, - test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); + test_bit(BCH_FS_write_disable_complete, &c->flags)); - clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); - clear_bit(BCH_FS_GOING_RO, &c->flags); + if (!writes_disabled) + bch_verbose(c, "finished waiting for writes to stop"); + + clear_bit(BCH_FS_write_disable_complete, &c->flags); + clear_bit(BCH_FS_going_ro, &c->flags); + clear_bit(BCH_FS_rw, &c->flags); if (!bch2_journal_error(&c->journal) && - !test_bit(BCH_FS_ERROR, &c->flags) && - !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) && - test_bit(BCH_FS_STARTED, &c->flags) && - test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags) && + !test_bit(BCH_FS_error, &c->flags) && + !test_bit(BCH_FS_emergency_ro, &c->flags) && + test_bit(BCH_FS_started, &c->flags) && + test_bit(BCH_FS_clean_shutdown, &c->flags) && !c->opts.norecovery) { BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal)); BUG_ON(atomic_read(&c->btree_cache.dirty)); BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); - BUG_ON(c->btree_write_buffer.state.nr); + BUG_ON(c->btree_write_buffer.inc.keys.nr); + BUG_ON(c->btree_write_buffer.flushing.keys.nr); bch_verbose(c, "marking filesystem clean"); bch2_fs_mark_clean(c); + } else { + bch_verbose(c, "done going read-only, filesystem not clean"); } - - clear_bit(BCH_FS_RW, &c->flags); } static void bch2_fs_read_only_work(struct work_struct *work) @@ -346,7 +383,7 @@ static void bch2_fs_read_only_async(struct bch_fs *c) bool bch2_fs_emergency_read_only(struct bch_fs *c) { - bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags); + bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags); bch2_journal_halt(&c->journal); bch2_fs_read_only_async(c); @@ -383,28 +420,16 @@ static int bch2_fs_read_write_late(struct bch_fs *c) static int __bch2_fs_read_write(struct bch_fs *c, bool early) { - struct bch_dev *ca; - unsigned i; int ret; - if (test_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags)) { + if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) { bch_err(c, "cannot go rw, unfixed btree errors"); return -BCH_ERR_erofs_unfixed_errors; } - if (test_bit(BCH_FS_RW, &c->flags)) + if (test_bit(BCH_FS_rw, &c->flags)) return 0; - if (c->opts.norecovery) - return -BCH_ERR_erofs_norecovery; - - /* - * nochanges is used for fsck -n mode - we have to allow going rw - * during recovery for that to work: - */ - if (c->opts.nochanges && (!early || c->opts.read_only)) - return -BCH_ERR_erofs_nochanges; - bch_info(c, "going read-write"); ret = bch2_sb_members_v2_init(c); @@ -415,7 +440,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) if (ret) goto err; - clear_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags); + clear_bit(BCH_FS_clean_shutdown, &c->flags); /* * First journal write must be a flush write: after a clean shutdown we @@ -425,17 +450,17 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) */ set_bit(JOURNAL_NEED_FLUSH_WRITE, &c->journal.flags); - for_each_rw_member(ca, c, i) + for_each_rw_member(c, ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); - set_bit(BCH_FS_RW, &c->flags); - set_bit(BCH_FS_WAS_RW, &c->flags); + set_bit(BCH_FS_rw, &c->flags); + set_bit(BCH_FS_was_rw, &c->flags); #ifndef BCH_WRITE_REF_DEBUG percpu_ref_reinit(&c->writes); #else - for (i = 0; i < BCH_WRITE_REF_NR; i++) { + for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) { BUG_ON(atomic_long_read(&c->writes[i])); atomic_long_inc(&c->writes[i]); } @@ -463,7 +488,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) bch2_do_pending_node_rewrites(c); return 0; err: - if (test_bit(BCH_FS_RW, &c->flags)) + if (test_bit(BCH_FS_rw, &c->flags)) bch2_fs_read_only(c); else __bch2_fs_read_only(c); @@ -472,6 +497,12 @@ err: int bch2_fs_read_write(struct bch_fs *c) { + if (c->opts.norecovery) + return -BCH_ERR_erofs_norecovery; + + if (c->opts.nochanges) + return -BCH_ERR_erofs_nochanges; + return __bch2_fs_read_write(c, false); } @@ -558,12 +589,9 @@ static void bch2_fs_release(struct kobject *kobj) void __bch2_fs_stop(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i; - bch_verbose(c, "shutting down"); - set_bit(BCH_FS_STOPPING, &c->flags); + set_bit(BCH_FS_stopping, &c->flags); cancel_work_sync(&c->journal_seq_blacklist_gc_work); @@ -571,7 +599,7 @@ void __bch2_fs_stop(struct bch_fs *c) bch2_fs_read_only(c); up_write(&c->state_lock); - for_each_member_device(ca, c, i) + for_each_member_device(c, ca) if (ca->kobj.state_in_sysfs && ca->disk_sb.bdev) sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); @@ -582,6 +610,9 @@ void __bch2_fs_stop(struct bch_fs *c) bch2_fs_debug_exit(c); bch2_fs_chardev_exit(c); + bch2_ro_ref_put(c); + wait_event(c->ro_ref_wait, !refcount_read(&c->ro_ref)); + kobject_put(&c->counters_kobj); kobject_put(&c->time_stats); kobject_put(&c->opts_dir); @@ -590,7 +621,7 @@ void __bch2_fs_stop(struct bch_fs *c) /* btree prefetch might have kicked off reads in the background: */ bch2_btree_flush_all_reads(c); - for_each_member_device(ca, c, i) + for_each_member_device(c, ca) cancel_work_sync(&ca->io_error_work); cancel_work_sync(&c->read_only_work); @@ -629,8 +660,6 @@ void bch2_fs_stop(struct bch_fs *c) static int bch2_fs_online(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i; int ret = 0; lockdep_assert_held(&bch_fs_list_lock); @@ -651,7 +680,9 @@ static int bch2_fs_online(struct bch_fs *c) ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?: kobject_add(&c->internal, &c->kobj, "internal") ?: kobject_add(&c->opts_dir, &c->kobj, "options") ?: +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT kobject_add(&c->time_stats, &c->kobj, "time_stats") ?: +#endif kobject_add(&c->counters_kobj, &c->kobj, "counters") ?: bch2_opts_create_sysfs_files(&c->opts_dir); if (ret) { @@ -661,7 +692,7 @@ static int bch2_fs_online(struct bch_fs *c) down_write(&c->state_lock); - for_each_member_device(ca, c, i) { + for_each_member_device(c, ca) { ret = bch2_dev_sysfs_online(c, ca); if (ret) { bch_err(c, "error creating sysfs objects"); @@ -690,6 +721,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) goto out; } + c->stdio = (void *)(unsigned long) opts.stdio; + __module_get(THIS_MODULE); closure_init(&c->cl, NULL); @@ -710,6 +743,10 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) mutex_init(&c->btree_root_lock); INIT_WORK(&c->read_only_work, bch2_fs_read_only_work); + refcount_set(&c->ro_ref, 1); + init_waitqueue_head(&c->ro_ref_wait); + sema_init(&c->online_fsck_mutex, 1); + init_rwsem(&c->gc_lock); mutex_init(&c->gc_gens_lock); atomic_set(&c->journal_keys.ref, 1); @@ -763,7 +800,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write]; c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; - c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal]; c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; bch2_fs_btree_cache_init_early(&c->btree_cache); @@ -832,7 +868,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || !(c->io_complete_wq = alloc_workqueue("bcachefs_io", - WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 1)) || + WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 512)) || !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", WQ_FREEZABLE, 0)) || #ifndef BCH_WRITE_REF_DEBUG @@ -847,7 +883,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || !(c->online_reserved = alloc_percpu(u64)) || mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, - btree_bytes(c)) || + c->opts.btree_node_size) || mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, sizeof(u64), GFP_KERNEL))) { @@ -946,16 +982,14 @@ static void print_mount_opts(struct bch_fs *c) int bch2_fs_start(struct bch_fs *c) { - struct bch_dev *ca; time64_t now = ktime_get_real_seconds(); - unsigned i; int ret; print_mount_opts(c); down_write(&c->state_lock); - BUG_ON(test_bit(BCH_FS_STARTED, &c->flags)); + BUG_ON(test_bit(BCH_FS_started, &c->flags)); mutex_lock(&c->sb_lock); @@ -965,12 +999,12 @@ int bch2_fs_start(struct bch_fs *c) goto err; } - for_each_online_member(ca, c, i) - bch2_members_v2_get_mut(c->disk_sb.sb, i)->last_mount = cpu_to_le64(now); + for_each_online_member(c, ca) + bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now); mutex_unlock(&c->sb_lock); - for_each_rw_member(ca, c, i) + for_each_rw_member(c, ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); @@ -990,12 +1024,12 @@ int bch2_fs_start(struct bch_fs *c) goto err; } - set_bit(BCH_FS_STARTED, &c->flags); + set_bit(BCH_FS_started, &c->flags); - if (c->opts.read_only || c->opts.nochanges) { + if (c->opts.read_only) { bch2_fs_read_only(c); } else { - ret = !test_bit(BCH_FS_RW, &c->flags) + ret = !test_bit(BCH_FS_rw, &c->flags) ? bch2_fs_read_write(c) : bch2_fs_read_write_late(c); if (ret) @@ -1003,12 +1037,13 @@ int bch2_fs_start(struct bch_fs *c) } ret = 0; -out: +err: + if (ret) + bch_err_msg(c, ret, "starting filesystem"); + else + bch_verbose(c, "done starting filesystem"); up_write(&c->state_lock); return ret; -err: - bch_err_msg(c, ret, "starting filesystem"); - goto out; } static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) @@ -1025,20 +1060,83 @@ static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) return 0; } -static int bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb) +static int bch2_dev_in_fs(struct bch_sb_handle *fs, + struct bch_sb_handle *sb) { - struct bch_sb *newest = - le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? fs : sb; + if (fs == sb) + return 0; - if (!uuid_equal(&fs->uuid, &sb->uuid)) + if (!uuid_equal(&fs->sb->uuid, &sb->sb->uuid)) return -BCH_ERR_device_not_a_member_of_filesystem; - if (!bch2_dev_exists(newest, sb->dev_idx)) + if (!bch2_dev_exists(fs->sb, sb->sb->dev_idx)) return -BCH_ERR_device_has_been_removed; - if (fs->block_size != sb->block_size) + if (fs->sb->block_size != sb->sb->block_size) return -BCH_ERR_mismatched_block_size; + if (le16_to_cpu(fs->sb->version) < bcachefs_metadata_version_member_seq || + le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_member_seq) + return 0; + + if (fs->sb->seq == sb->sb->seq && + fs->sb->write_time != sb->sb->write_time) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "Split brain detected between "); + prt_bdevname(&buf, sb->bdev); + prt_str(&buf, " and "); + prt_bdevname(&buf, fs->bdev); + prt_char(&buf, ':'); + prt_newline(&buf); + prt_printf(&buf, "seq=%llu but write_time different, got", le64_to_cpu(sb->sb->seq)); + prt_newline(&buf); + + prt_bdevname(&buf, fs->bdev); + prt_char(&buf, ' '); + bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time));; + prt_newline(&buf); + + prt_bdevname(&buf, sb->bdev); + prt_char(&buf, ' '); + bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));; + prt_newline(&buf); + + prt_printf(&buf, "Not using older sb"); + + pr_err("%s", buf.buf); + printbuf_exit(&buf); + return -BCH_ERR_device_splitbrain; + } + + struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx); + u64 seq_from_fs = le64_to_cpu(m.seq); + u64 seq_from_member = le64_to_cpu(sb->sb->seq); + + if (seq_from_fs && seq_from_fs < seq_from_member) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "Split brain detected between "); + prt_bdevname(&buf, sb->bdev); + prt_str(&buf, " and "); + prt_bdevname(&buf, fs->bdev); + prt_char(&buf, ':'); + prt_newline(&buf); + + prt_bdevname(&buf, fs->bdev); + prt_str(&buf, "believes seq of "); + prt_bdevname(&buf, sb->bdev); + prt_printf(&buf, " to be %llu, but ", seq_from_fs); + prt_bdevname(&buf, sb->bdev); + prt_printf(&buf, " has %llu\n", seq_from_member); + prt_str(&buf, "Not using "); + prt_bdevname(&buf, sb->bdev); + + pr_err("%s", buf.buf); + printbuf_exit(&buf); + return -BCH_ERR_device_splitbrain; + } + return 0; } @@ -1284,9 +1382,14 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) bch2_dev_sysfs_online(c, ca); + struct printbuf name = PRINTBUF; + prt_bdevname(&name, ca->disk_sb.bdev); + if (c->sb.nr_devices == 1) - snprintf(c->name, sizeof(c->name), "%pg", ca->disk_sb.bdev); - snprintf(ca->name, sizeof(ca->name), "%pg", ca->disk_sb.bdev); + strscpy(c->name, name.buf, sizeof(c->name)); + strscpy(ca->name, name.buf, sizeof(ca->name)); + + printbuf_exit(&name); rebalance_wakeup(c); return 0; @@ -1307,8 +1410,7 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, enum bch_member_state new_state, int flags) { struct bch_devs_mask new_online_devs; - struct bch_dev *ca2; - int i, nr_rw = 0, required; + int nr_rw = 0, required; lockdep_assert_held(&c->state_lock); @@ -1320,16 +1422,16 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, return true; /* do we have enough devices to write to? */ - for_each_member_device(ca2, c, i) + for_each_member_device(c, ca2) if (ca2 != ca) nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw; required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED) ? c->opts.metadata_replicas - : c->opts.metadata_replicas_required, + : metadata_replicas_required(c), !(flags & BCH_FORCE_IF_DATA_DEGRADED) ? c->opts.data_replicas - : c->opts.data_replicas_required); + : data_replicas_required(c)); return nr_rw >= required; case BCH_MEMBER_STATE_failed: @@ -1468,9 +1570,7 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) BTREE_TRIGGER_NORUN, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end, BTREE_TRIGGER_NORUN, NULL); - if (ret) - bch_err_msg(c, ret, "removing dev alloc info"); - + bch_err_msg(c, ret, "removing dev alloc info"); return ret; } @@ -1497,40 +1597,35 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) __bch2_dev_read_only(c, ca); ret = bch2_dev_data_drop(c, ca->dev_idx, flags); - if (ret) { - bch_err_msg(ca, ret, "dropping data"); + bch_err_msg(ca, ret, "dropping data"); + if (ret) goto err; - } ret = bch2_dev_remove_alloc(c, ca); - if (ret) { - bch_err_msg(ca, ret, "deleting alloc info"); + bch_err_msg(ca, ret, "deleting alloc info"); + if (ret) goto err; - } ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); - if (ret) { - bch_err_msg(ca, ret, "flushing journal"); + bch_err_msg(ca, ret, "flushing journal"); + if (ret) goto err; - } ret = bch2_journal_flush(&c->journal); - if (ret) { - bch_err(ca, "journal error"); + bch_err(ca, "journal error"); + if (ret) goto err; - } ret = bch2_replicas_gc2(c); - if (ret) { - bch_err_msg(ca, ret, "in replicas_gc2()"); + bch_err_msg(ca, ret, "in replicas_gc2()"); + if (ret) goto err; - } data = bch2_dev_has_data(c, ca); if (data) { struct printbuf data_has = PRINTBUF; - prt_bitflags(&data_has, bch2_data_types, data); + prt_bitflags(&data_has, __bch2_data_types, data); bch_err(ca, "Remove failed, still has data (%s)", data_has.buf); printbuf_exit(&data_has); ret = -EBUSY; @@ -1596,10 +1691,9 @@ int bch2_dev_add(struct bch_fs *c, const char *path) int ret; ret = bch2_read_super(path, &opts, &sb); - if (ret) { - bch_err_msg(c, ret, "reading super"); + bch_err_msg(c, ret, "reading super"); + if (ret) goto err; - } dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx); @@ -1612,10 +1706,8 @@ int bch2_dev_add(struct bch_fs *c, const char *path) } ret = bch2_dev_may_add(sb.sb, c); - if (ret) { - bch_err_fn(c, ret); + if (ret) goto err; - } ca = __bch2_dev_alloc(c, &dev_mi); if (!ca) { @@ -1630,19 +1722,17 @@ int bch2_dev_add(struct bch_fs *c, const char *path) goto err; ret = bch2_dev_journal_alloc(ca); - if (ret) { - bch_err_msg(c, ret, "allocating journal"); + bch_err_msg(c, ret, "allocating journal"); + if (ret) goto err; - } down_write(&c->state_lock); mutex_lock(&c->sb_lock); ret = bch2_sb_from_fs(c, ca); - if (ret) { - bch_err_msg(c, ret, "setting up new superblock"); + bch_err_msg(c, ret, "setting up new superblock"); + if (ret) goto err_unlock; - } if (dynamic_fault("bcachefs:add:no_slot")) goto no_slot; @@ -1681,10 +1771,9 @@ have_slot: if (BCH_MEMBER_GROUP(&dev_mi)) { ret = __bch2_dev_group_set(c, ca, label.buf); - if (ret) { - bch_err_msg(c, ret, "creating new label"); + bch_err_msg(c, ret, "creating new label"); + if (ret) goto err_unlock; - } } bch2_write_super(c); @@ -1693,16 +1782,14 @@ have_slot: bch2_dev_usage_journal_reserve(c); ret = bch2_trans_mark_dev_sb(c, ca); - if (ret) { - bch_err_msg(ca, ret, "marking new superblock"); + bch_err_msg(ca, ret, "marking new superblock"); + if (ret) goto err_late; - } ret = bch2_fs_freespace_init(c); - if (ret) { - bch_err_msg(ca, ret, "initializing free space"); + bch_err_msg(ca, ret, "initializing free space"); + if (ret) goto err_late; - } ca->new_fs_bucket_idx = 0; @@ -1721,6 +1808,7 @@ err: bch2_free_super(&sb); printbuf_exit(&label); printbuf_exit(&errbuf); + bch_err_fn(c, ret); return ret; err_late: up_write(&c->state_lock); @@ -1747,11 +1835,10 @@ int bch2_dev_online(struct bch_fs *c, const char *path) dev_idx = sb.sb->dev_idx; - ret = bch2_dev_in_fs(c->disk_sb.sb, sb.sb); - if (ret) { - bch_err_msg(c, ret, "bringing %s online", path); + ret = bch2_dev_in_fs(&c->disk_sb, &sb); + bch_err_msg(c, ret, "bringing %s online", path); + if (ret) goto err; - } ret = bch2_dev_attach_bdev(c, &sb); if (ret) @@ -1760,10 +1847,9 @@ int bch2_dev_online(struct bch_fs *c, const char *path) ca = bch_dev_locked(c, dev_idx); ret = bch2_trans_mark_dev_sb(c, ca); - if (ret) { - bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path); + bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path); + if (ret) goto err; - } if (ca->mi.state == BCH_MEMBER_STATE_rw) __bch2_dev_read_write(c, ca); @@ -1842,10 +1928,9 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) } ret = bch2_dev_buckets_resize(c, ca, nbuckets); - if (ret) { - bch_err_msg(ca, ret, "resizing buckets"); + bch_err_msg(ca, ret, "resizing buckets"); + if (ret) goto err; - } ret = bch2_trans_mark_dev_sb(c, ca); if (ret) @@ -1879,28 +1964,30 @@ err: /* return with ref on ca->ref: */ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) { - struct bch_dev *ca; - unsigned i; - rcu_read_lock(); - for_each_member_device_rcu(ca, c, i, NULL) - if (!strcmp(name, ca->name)) - goto found; - ca = ERR_PTR(-BCH_ERR_ENOENT_dev_not_found); -found: + for_each_member_device_rcu(c, ca, NULL) + if (!strcmp(name, ca->name)) { + rcu_read_unlock(); + return ca; + } rcu_read_unlock(); - - return ca; + return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found); } /* Filesystem open: */ +static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r) +{ + return cmp_int(le64_to_cpu(l->seq), le64_to_cpu(r->seq)) ?: + cmp_int(le64_to_cpu(l->write_time), le64_to_cpu(r->write_time)); +} + struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, struct bch_opts opts) { DARRAY(struct bch_sb_handle) sbs = { 0 }; struct bch_fs *c = NULL; - struct bch_sb_handle *sb, *best = NULL; + struct bch_sb_handle *best = NULL; struct printbuf errbuf = PRINTBUF; int ret = 0; @@ -1926,20 +2013,27 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, BUG_ON(darray_push(&sbs, sb)); } + if (opts.nochanges && !opts.read_only) { + ret = -BCH_ERR_erofs_nochanges; + goto err_print; + } + darray_for_each(sbs, sb) - if (!best || le64_to_cpu(sb->sb->seq) > le64_to_cpu(best->sb->seq)) + if (!best || sb_cmp(sb->sb, best->sb) > 0) best = sb; darray_for_each_reverse(sbs, sb) { - if (sb != best && !bch2_dev_exists(best->sb, sb->sb->dev_idx)) { - pr_info("%pg has been removed, skipping", sb->bdev); + ret = bch2_dev_in_fs(best, sb); + + if (ret == -BCH_ERR_device_has_been_removed || + ret == -BCH_ERR_device_splitbrain) { bch2_free_super(sb); darray_remove_item(&sbs, sb); best -= best > sb; + ret = 0; continue; } - ret = bch2_dev_in_fs(best->sb, sb->sb); if (ret) goto err_print; } diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index bf762df180..dada09331d 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -8,6 +8,8 @@ #include <linux/math64.h> +extern const char * const bch2_fs_flag_strs[]; + struct bch_fs *bch2_dev_to_fs(dev_t); struct bch_fs *bch2_uuid_to_fs(__uuid_t); @@ -37,8 +39,8 @@ int bch2_fs_read_write_early(struct bch_fs *); */ static inline void bch2_fs_lazy_rw(struct bch_fs *c) { - if (!test_bit(BCH_FS_RW, &c->flags) && - !test_bit(BCH_FS_WAS_RW, &c->flags)) + if (!test_bit(BCH_FS_rw, &c->flags) && + !test_bit(BCH_FS_was_rw, &c->flags)) bch2_fs_read_write_early(c); } diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h index 9c1fd4ca2b..0e5a14fc8e 100644 --- a/fs/bcachefs/super_types.h +++ b/fs/bcachefs/super_types.h @@ -4,6 +4,7 @@ struct bch_sb_handle { struct bch_sb *sb; + struct bdev_handle *bdev_handle; struct block_device *bdev; char *sb_name; struct bio *bio; @@ -22,7 +23,7 @@ struct bch_devs_mask { struct bch_devs_list { u8 nr; - u8 devs[BCH_BKEY_PTRS_MAX]; + u8 data[BCH_BKEY_PTRS_MAX]; }; struct bch_member_cpu { diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index f3cb7115b5..cee80c47fe 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -21,6 +21,7 @@ #include "btree_gc.h" #include "buckets.h" #include "clock.h" +#include "compress.h" #include "disk_groups.h" #include "ec.h" #include "inode.h" @@ -145,6 +146,7 @@ rw_attribute(gc_gens_pos); read_attribute(uuid); read_attribute(minor); +read_attribute(flags); read_attribute(bucket_size); read_attribute(first_bucket); read_attribute(nbuckets); @@ -246,7 +248,7 @@ static size_t bch2_btree_cache_size(struct bch_fs *c) mutex_lock(&c->btree_cache.lock); list_for_each_entry(b, &c->btree_cache.live, list) - ret += btree_bytes(c); + ret += btree_buf_bytes(b); mutex_unlock(&c->btree_cache.lock); return ret; @@ -255,19 +257,18 @@ static size_t bch2_btree_cache_size(struct bch_fs *c) static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c) { struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; enum btree_id id; - u64 nr_uncompressed_extents = 0, - nr_compressed_extents = 0, - nr_incompressible_extents = 0, - uncompressed_sectors = 0, - incompressible_sectors = 0, - compressed_sectors_compressed = 0, - compressed_sectors_uncompressed = 0; + struct compression_type_stats { + u64 nr_extents; + u64 sectors_compressed; + u64 sectors_uncompressed; + } s[BCH_COMPRESSION_TYPE_NR]; + u64 compressed_incompressible = 0; int ret = 0; - if (!test_bit(BCH_FS_STARTED, &c->flags)) + memset(s, 0, sizeof(s)); + + if (!test_bit(BCH_FS_started, &c->flags)) return -EPERM; trans = bch2_trans_get(c); @@ -276,39 +277,33 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c if (!btree_type_has_ptrs(id)) continue; - ret = for_each_btree_key2(trans, iter, id, POS_MIN, - BTREE_ITER_ALL_SNAPSHOTS, k, ({ + ret = for_each_btree_key(trans, iter, id, POS_MIN, + BTREE_ITER_ALL_SNAPSHOTS, k, ({ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + struct bch_extent_crc_unpacked crc; const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - bool compressed = false, uncompressed = false, incompressible = false; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - switch (p.crc.compression_type) { - case BCH_COMPRESSION_TYPE_none: - uncompressed = true; - uncompressed_sectors += k.k->size; - break; - case BCH_COMPRESSION_TYPE_incompressible: - incompressible = true; - incompressible_sectors += k.k->size; - break; - default: - compressed_sectors_compressed += - p.crc.compressed_size; - compressed_sectors_uncompressed += - p.crc.uncompressed_size; - compressed = true; - break; + bool compressed = false, incompressible = false; + + bkey_for_each_crc(k.k, ptrs, crc, entry) { + incompressible |= crc.compression_type == BCH_COMPRESSION_TYPE_incompressible; + compressed |= crc_is_compressed(crc); + + if (crc_is_compressed(crc)) { + s[crc.compression_type].nr_extents++; + s[crc.compression_type].sectors_compressed += crc.compressed_size; + s[crc.compression_type].sectors_uncompressed += crc.uncompressed_size; } } - if (incompressible) - nr_incompressible_extents++; - else if (uncompressed) - nr_uncompressed_extents++; - else if (compressed) - nr_compressed_extents++; + compressed_incompressible += compressed && incompressible; + + if (!compressed) { + unsigned t = incompressible ? BCH_COMPRESSION_TYPE_incompressible : 0; + + s[t].nr_extents++; + s[t].sectors_compressed += k.k->size; + s[t].sectors_uncompressed += k.k->size; + } 0; })); } @@ -318,26 +313,45 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c if (ret) return ret; - prt_printf(out, "uncompressed:\n"); - prt_printf(out, " nr extents: %llu\n", nr_uncompressed_extents); - prt_printf(out, " size: "); - prt_human_readable_u64(out, uncompressed_sectors << 9); - prt_printf(out, "\n"); + prt_str(out, "type"); + printbuf_tabstop_push(out, 12); + prt_tab(out); - prt_printf(out, "compressed:\n"); - prt_printf(out, " nr extents: %llu\n", nr_compressed_extents); - prt_printf(out, " compressed size: "); - prt_human_readable_u64(out, compressed_sectors_compressed << 9); - prt_printf(out, "\n"); - prt_printf(out, " uncompressed size: "); - prt_human_readable_u64(out, compressed_sectors_uncompressed << 9); - prt_printf(out, "\n"); + prt_str(out, "compressed"); + printbuf_tabstop_push(out, 16); + prt_tab_rjust(out); + + prt_str(out, "uncompressed"); + printbuf_tabstop_push(out, 16); + prt_tab_rjust(out); + + prt_str(out, "average extent size"); + printbuf_tabstop_push(out, 24); + prt_tab_rjust(out); + prt_newline(out); + + for (unsigned i = 0; i < ARRAY_SIZE(s); i++) { + bch2_prt_compression_type(out, i); + prt_tab(out); + + prt_human_readable_u64(out, s[i].sectors_compressed << 9); + prt_tab_rjust(out); + + prt_human_readable_u64(out, s[i].sectors_uncompressed << 9); + prt_tab_rjust(out); + + prt_human_readable_u64(out, s[i].nr_extents + ? div_u64(s[i].sectors_uncompressed << 9, s[i].nr_extents) + : 0); + prt_tab_rjust(out); + prt_newline(out); + } + + if (compressed_incompressible) { + prt_printf(out, "%llu compressed & incompressible extents", compressed_incompressible); + prt_newline(out); + } - prt_printf(out, "incompressible:\n"); - prt_printf(out, " nr extents: %llu\n", nr_incompressible_extents); - prt_printf(out, " size: "); - prt_human_readable_u64(out, incompressible_sectors << 9); - prt_printf(out, "\n"); return 0; } @@ -370,6 +384,9 @@ SHOW(bch2_fs) sysfs_print(minor, c->minor); sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); + if (attr == &sysfs_flags) + prt_bitflags(out, bch2_fs_flag_strs, c->flags); + sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); if (attr == &sysfs_btree_write_stats) @@ -483,12 +500,12 @@ STORE(bch2_fs) /* Debugging: */ - if (!test_bit(BCH_FS_STARTED, &c->flags)) + if (!test_bit(BCH_FS_started, &c->flags)) return -EPERM; /* Debugging: */ - if (!test_bit(BCH_FS_RW, &c->flags)) + if (!test_bit(BCH_FS_rw, &c->flags)) return -EROFS; if (attr == &sysfs_prune_cache) { @@ -620,6 +637,7 @@ STORE(bch2_fs_internal) SYSFS_OPS(bch2_fs_internal); struct attribute *bch2_fs_internal_files[] = { + &sysfs_flags, &sysfs_journal_debug, &sysfs_btree_updates, &sysfs_btree_cache, @@ -708,8 +726,10 @@ STORE(bch2_fs_opts_dir) bch2_opt_set_sb(c, opt, v); bch2_opt_set_by_id(&c->opts, id, v); - if ((id == Opt_background_target || - id == Opt_background_compression) && v) + if (v && + (id == Opt_background_target || + id == Opt_background_compression || + (id == Opt_compression && !c->opts.background_compression))) bch2_set_rebalance_needs_scan(c, 0); ret = size; @@ -786,32 +806,7 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) printbuf_tabstop_push(out, 16); printbuf_tabstop_push(out, 16); - prt_tab(out); - prt_str(out, "buckets"); - prt_tab_rjust(out); - prt_str(out, "sectors"); - prt_tab_rjust(out); - prt_str(out, "fragmented"); - prt_tab_rjust(out); - prt_newline(out); - - for (i = 0; i < BCH_DATA_NR; i++) { - prt_str(out, bch2_data_types[i]); - prt_tab(out); - prt_u64(out, stats.d[i].buckets); - prt_tab_rjust(out); - prt_u64(out, stats.d[i].sectors); - prt_tab_rjust(out); - prt_u64(out, stats.d[i].fragmented); - prt_tab_rjust(out); - prt_newline(out); - } - - prt_str(out, "ec"); - prt_tab(out); - prt_u64(out, stats.buckets_ec); - prt_tab_rjust(out); - prt_newline(out); + bch2_dev_usage_to_text(out, &stats); prt_newline(out); @@ -891,7 +886,7 @@ static void dev_io_done_to_text(struct printbuf *out, struct bch_dev *ca) for (i = 1; i < BCH_DATA_NR; i++) prt_printf(out, "%-12s:%12llu\n", - bch2_data_types[i], + bch2_data_type_str(i), percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9); } } @@ -916,7 +911,7 @@ SHOW(bch2_dev) } if (attr == &sysfs_has_data) { - prt_bitflags(out, bch2_data_types, bch2_dev_has_data(c, ca)); + prt_bitflags(out, __bch2_data_types, bch2_dev_has_data(c, ca)); prt_char(out, '\n'); } diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 2fc9e60c75..b3fe9fc577 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -107,9 +107,6 @@ err: static int test_iterate(struct bch_fs *c, u64 nr) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter = { NULL }; - struct bkey_s_c k; u64 i; int ret = 0; @@ -127,49 +124,43 @@ static int test_iterate(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0); bch_err_msg(c, ret, "insert error"); if (ret) - goto err; + return ret; } pr_info("iterating forwards"); - i = 0; - ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - 0, k, ({ - BUG_ON(k.k->p.offset != i++); - 0; - })); + ret = bch2_trans_run(c, + for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, ({ + BUG_ON(k.k->p.offset != i++); + 0; + }))); bch_err_msg(c, ret, "error iterating forwards"); if (ret) - goto err; + return ret; BUG_ON(i != nr); pr_info("iterating backwards"); - ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_xattrs, - SPOS(0, U64_MAX, U32_MAX), 0, k, - ({ + ret = bch2_trans_run(c, + for_each_btree_key_reverse(trans, iter, BTREE_ID_xattrs, + SPOS(0, U64_MAX, U32_MAX), 0, k, ({ BUG_ON(k.k->p.offset != --i); 0; - })); + }))); bch_err_msg(c, ret, "error iterating backwards"); if (ret) - goto err; + return ret; BUG_ON(i); -err: - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return ret; + return 0; } static int test_iterate_extents(struct bch_fs *c, u64 nr) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter = { NULL }; - struct bkey_s_c k; u64 i; int ret = 0; @@ -188,51 +179,45 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0); bch_err_msg(c, ret, "insert error"); if (ret) - goto err; + return ret; } pr_info("iterating forwards"); - i = 0; - ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - 0, k, ({ - BUG_ON(bkey_start_offset(k.k) != i); - i = k.k->p.offset; - 0; - })); + ret = bch2_trans_run(c, + for_each_btree_key_upto(trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, ({ + BUG_ON(bkey_start_offset(k.k) != i); + i = k.k->p.offset; + 0; + }))); bch_err_msg(c, ret, "error iterating forwards"); if (ret) - goto err; + return ret; BUG_ON(i != nr); pr_info("iterating backwards"); - ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_extents, - SPOS(0, U64_MAX, U32_MAX), 0, k, - ({ + ret = bch2_trans_run(c, + for_each_btree_key_reverse(trans, iter, BTREE_ID_extents, + SPOS(0, U64_MAX, U32_MAX), 0, k, ({ BUG_ON(k.k->p.offset != i); i = bkey_start_offset(k.k); 0; - })); + }))); bch_err_msg(c, ret, "error iterating backwards"); if (ret) - goto err; + return ret; BUG_ON(i); -err: - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return ret; + return 0; } static int test_iterate_slots(struct bch_fs *c, u64 nr) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter = { NULL }; - struct bkey_s_c k; u64 i; int ret = 0; @@ -250,57 +235,48 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0); bch_err_msg(c, ret, "insert error"); if (ret) - goto err; + return ret; } pr_info("iterating forwards"); - i = 0; - ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - 0, k, ({ - BUG_ON(k.k->p.offset != i); - i += 2; - 0; - })); + ret = bch2_trans_run(c, + for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, ({ + BUG_ON(k.k->p.offset != i); + i += 2; + 0; + }))); bch_err_msg(c, ret, "error iterating forwards"); if (ret) - goto err; + return ret; BUG_ON(i != nr * 2); pr_info("iterating forwards by slots"); - i = 0; - ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - BTREE_ITER_SLOTS, k, ({ - if (i >= nr * 2) - break; + ret = bch2_trans_run(c, + for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + BTREE_ITER_SLOTS, k, ({ + if (i >= nr * 2) + break; - BUG_ON(k.k->p.offset != i); - BUG_ON(bkey_deleted(k.k) != (i & 1)); + BUG_ON(k.k->p.offset != i); + BUG_ON(bkey_deleted(k.k) != (i & 1)); - i++; - 0; - })); - if (ret < 0) { - bch_err_msg(c, ret, "error iterating forwards by slots"); - goto err; - } - ret = 0; -err: - bch2_trans_put(trans); + i++; + 0; + }))); + bch_err_msg(c, ret, "error iterating forwards by slots"); return ret; } static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter = { NULL }; - struct bkey_s_c k; u64 i; int ret = 0; @@ -319,50 +295,45 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0); bch_err_msg(c, ret, "insert error"); if (ret) - goto err; + return ret; } pr_info("iterating forwards"); - i = 0; - ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - 0, k, ({ - BUG_ON(bkey_start_offset(k.k) != i + 8); - BUG_ON(k.k->size != 8); - i += 16; - 0; - })); + ret = bch2_trans_run(c, + for_each_btree_key_upto(trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, ({ + BUG_ON(bkey_start_offset(k.k) != i + 8); + BUG_ON(k.k->size != 8); + i += 16; + 0; + }))); bch_err_msg(c, ret, "error iterating forwards"); if (ret) - goto err; + return ret; BUG_ON(i != nr); pr_info("iterating forwards by slots"); - i = 0; - ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - BTREE_ITER_SLOTS, k, ({ - if (i == nr) - break; - BUG_ON(bkey_deleted(k.k) != !(i % 16)); + ret = bch2_trans_run(c, + for_each_btree_key_upto(trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + BTREE_ITER_SLOTS, k, ({ + if (i == nr) + break; + BUG_ON(bkey_deleted(k.k) != !(i % 16)); - BUG_ON(bkey_start_offset(k.k) != i); - BUG_ON(k.k->size != 8); - i = k.k->p.offset; - 0; - })); + BUG_ON(bkey_start_offset(k.k) != i); + BUG_ON(k.k->size != 8); + i = k.k->p.offset; + 0; + }))); bch_err_msg(c, ret, "error iterating forwards by slots"); - if (ret) - goto err; - ret = 0; -err: - bch2_trans_put(trans); - return 0; + return ret; } /* @@ -736,8 +707,6 @@ static int rand_delete(struct bch_fs *c, u64 nr) static int seq_insert(struct bch_fs *c, u64 nr) { - struct btree_iter iter; - struct bkey_s_c k; struct bkey_i_cookie insert; bkey_cookie_init(&insert.k_i); @@ -756,11 +725,8 @@ static int seq_insert(struct bch_fs *c, u64 nr) static int seq_lookup(struct bch_fs *c, u64 nr) { - struct btree_iter iter; - struct bkey_s_c k; - return bch2_trans_run(c, - for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs, + for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, 0)); @@ -768,9 +734,6 @@ static int seq_lookup(struct bch_fs *c, u64 nr) static int seq_overwrite(struct bch_fs *c, u64 nr) { - struct btree_iter iter; - struct bkey_s_c k; - return bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c new file mode 100644 index 0000000000..9220d7de10 --- /dev/null +++ b/fs/bcachefs/thread_with_file.c @@ -0,0 +1,299 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + +#include "bcachefs.h" +#include "printbuf.h" +#include "thread_with_file.h" + +#include <linux/anon_inodes.h> +#include <linux/file.h> +#include <linux/kthread.h> +#include <linux/pagemap.h> +#include <linux/poll.h> + +void bch2_thread_with_file_exit(struct thread_with_file *thr) +{ + if (thr->task) { + kthread_stop(thr->task); + put_task_struct(thr->task); + } +} + +int bch2_run_thread_with_file(struct thread_with_file *thr, + const struct file_operations *fops, + int (*fn)(void *)) +{ + struct file *file = NULL; + int ret, fd = -1; + unsigned fd_flags = O_CLOEXEC; + + if (fops->read && fops->write) + fd_flags |= O_RDWR; + else if (fops->read) + fd_flags |= O_RDONLY; + else if (fops->write) + fd_flags |= O_WRONLY; + + char name[TASK_COMM_LEN]; + get_task_comm(name, current); + + thr->ret = 0; + thr->task = kthread_create(fn, thr, "%s", name); + ret = PTR_ERR_OR_ZERO(thr->task); + if (ret) + return ret; + + ret = get_unused_fd_flags(fd_flags); + if (ret < 0) + goto err; + fd = ret; + + file = anon_inode_getfile(name, fops, thr, fd_flags); + ret = PTR_ERR_OR_ZERO(file); + if (ret) + goto err; + + get_task_struct(thr->task); + wake_up_process(thr->task); + fd_install(fd, file); + return fd; +err: + if (fd >= 0) + put_unused_fd(fd); + if (thr->task) + kthread_stop(thr->task); + return ret; +} + +static inline bool thread_with_stdio_has_output(struct thread_with_stdio *thr) +{ + return thr->stdio.output_buf.pos || + thr->output2.nr || + thr->thr.done; +} + +static ssize_t thread_with_stdio_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + size_t copied = 0, b; + int ret = 0; + + if ((file->f_flags & O_NONBLOCK) && + !thread_with_stdio_has_output(thr)) + return -EAGAIN; + + ret = wait_event_interruptible(thr->stdio.output_wait, + thread_with_stdio_has_output(thr)); + if (ret) + return ret; + + if (thr->thr.done) + return 0; + + while (len) { + ret = darray_make_room(&thr->output2, thr->stdio.output_buf.pos); + if (ret) + break; + + spin_lock_irq(&thr->stdio.output_lock); + b = min_t(size_t, darray_room(thr->output2), thr->stdio.output_buf.pos); + + memcpy(&darray_top(thr->output2), thr->stdio.output_buf.buf, b); + memmove(thr->stdio.output_buf.buf, + thr->stdio.output_buf.buf + b, + thr->stdio.output_buf.pos - b); + + thr->output2.nr += b; + thr->stdio.output_buf.pos -= b; + spin_unlock_irq(&thr->stdio.output_lock); + + b = min(len, thr->output2.nr); + if (!b) + break; + + b -= copy_to_user(buf, thr->output2.data, b); + if (!b) { + ret = -EFAULT; + break; + } + + copied += b; + buf += b; + len -= b; + + memmove(thr->output2.data, + thr->output2.data + b, + thr->output2.nr - b); + thr->output2.nr -= b; + } + + return copied ?: ret; +} + +static int thread_with_stdio_release(struct inode *inode, struct file *file) +{ + struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + + bch2_thread_with_file_exit(&thr->thr); + printbuf_exit(&thr->stdio.input_buf); + printbuf_exit(&thr->stdio.output_buf); + darray_exit(&thr->output2); + thr->exit(thr); + return 0; +} + +#define WRITE_BUFFER 4096 + +static inline bool thread_with_stdio_has_input_space(struct thread_with_stdio *thr) +{ + return thr->stdio.input_buf.pos < WRITE_BUFFER || thr->thr.done; +} + +static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubuf, + size_t len, loff_t *ppos) +{ + struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + struct printbuf *buf = &thr->stdio.input_buf; + size_t copied = 0; + ssize_t ret = 0; + + while (len) { + if (thr->thr.done) { + ret = -EPIPE; + break; + } + + size_t b = len - fault_in_readable(ubuf, len); + if (!b) { + ret = -EFAULT; + break; + } + + spin_lock(&thr->stdio.input_lock); + if (buf->pos < WRITE_BUFFER) + bch2_printbuf_make_room(buf, min(b, WRITE_BUFFER - buf->pos)); + b = min(len, printbuf_remaining_size(buf)); + + if (b && !copy_from_user_nofault(&buf->buf[buf->pos], ubuf, b)) { + ubuf += b; + len -= b; + copied += b; + buf->pos += b; + } + spin_unlock(&thr->stdio.input_lock); + + if (b) { + wake_up(&thr->stdio.input_wait); + } else { + if ((file->f_flags & O_NONBLOCK)) { + ret = -EAGAIN; + break; + } + + ret = wait_event_interruptible(thr->stdio.input_wait, + thread_with_stdio_has_input_space(thr)); + if (ret) + break; + } + } + + return copied ?: ret; +} + +static __poll_t thread_with_stdio_poll(struct file *file, struct poll_table_struct *wait) +{ + struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + + poll_wait(file, &thr->stdio.output_wait, wait); + poll_wait(file, &thr->stdio.input_wait, wait); + + __poll_t mask = 0; + + if (thread_with_stdio_has_output(thr)) + mask |= EPOLLIN; + if (thread_with_stdio_has_input_space(thr)) + mask |= EPOLLOUT; + if (thr->thr.done) + mask |= EPOLLHUP|EPOLLERR; + return mask; +} + +static const struct file_operations thread_with_stdio_fops = { + .release = thread_with_stdio_release, + .read = thread_with_stdio_read, + .write = thread_with_stdio_write, + .poll = thread_with_stdio_poll, + .llseek = no_llseek, +}; + +int bch2_run_thread_with_stdio(struct thread_with_stdio *thr, + void (*exit)(struct thread_with_stdio *), + int (*fn)(void *)) +{ + thr->stdio.input_buf = PRINTBUF; + thr->stdio.input_buf.atomic++; + spin_lock_init(&thr->stdio.input_lock); + init_waitqueue_head(&thr->stdio.input_wait); + + thr->stdio.output_buf = PRINTBUF; + thr->stdio.output_buf.atomic++; + spin_lock_init(&thr->stdio.output_lock); + init_waitqueue_head(&thr->stdio.output_wait); + + darray_init(&thr->output2); + thr->exit = exit; + + return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, fn); +} + +int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *buf, size_t len) +{ + wait_event(stdio->input_wait, + stdio->input_buf.pos || stdio->done); + + if (stdio->done) + return -1; + + spin_lock(&stdio->input_lock); + int ret = min(len, stdio->input_buf.pos); + stdio->input_buf.pos -= ret; + memcpy(buf, stdio->input_buf.buf, ret); + memmove(stdio->input_buf.buf, + stdio->input_buf.buf + ret, + stdio->input_buf.pos); + spin_unlock(&stdio->input_lock); + + wake_up(&stdio->input_wait); + return ret; +} + +int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *buf, size_t len) +{ + wait_event(stdio->input_wait, + stdio->input_buf.pos || stdio->done); + + if (stdio->done) + return -1; + + spin_lock(&stdio->input_lock); + int ret = min(len, stdio->input_buf.pos); + char *n = memchr(stdio->input_buf.buf, '\n', ret); + if (n) + ret = min(ret, n + 1 - stdio->input_buf.buf); + stdio->input_buf.pos -= ret; + memcpy(buf, stdio->input_buf.buf, ret); + memmove(stdio->input_buf.buf, + stdio->input_buf.buf + ret, + stdio->input_buf.pos); + spin_unlock(&stdio->input_lock); + + wake_up(&stdio->input_wait); + return ret; +} + +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h new file mode 100644 index 0000000000..05879c5048 --- /dev/null +++ b/fs/bcachefs/thread_with_file.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_THREAD_WITH_FILE_H +#define _BCACHEFS_THREAD_WITH_FILE_H + +#include "thread_with_file_types.h" + +struct task_struct; + +struct thread_with_file { + struct task_struct *task; + int ret; + bool done; +}; + +void bch2_thread_with_file_exit(struct thread_with_file *); +int bch2_run_thread_with_file(struct thread_with_file *, + const struct file_operations *, + int (*fn)(void *)); + +struct thread_with_stdio { + struct thread_with_file thr; + struct stdio_redirect stdio; + DARRAY(char) output2; + void (*exit)(struct thread_with_stdio *); +}; + +static inline void thread_with_stdio_done(struct thread_with_stdio *thr) +{ + thr->thr.done = true; + thr->stdio.done = true; + wake_up(&thr->stdio.input_wait); + wake_up(&thr->stdio.output_wait); +} + +int bch2_run_thread_with_stdio(struct thread_with_stdio *, + void (*exit)(struct thread_with_stdio *), + int (*fn)(void *)); +int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t); +int bch2_stdio_redirect_readline(struct stdio_redirect *, char *, size_t); + +#endif /* _BCACHEFS_THREAD_WITH_FILE_H */ diff --git a/fs/bcachefs/thread_with_file_types.h b/fs/bcachefs/thread_with_file_types.h new file mode 100644 index 0000000000..90b5e645e9 --- /dev/null +++ b/fs/bcachefs/thread_with_file_types.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H +#define _BCACHEFS_THREAD_WITH_FILE_TYPES_H + +struct stdio_redirect { + spinlock_t output_lock; + wait_queue_head_t output_wait; + struct printbuf output_buf; + + spinlock_t input_lock; + wait_queue_head_t input_wait; + struct printbuf input_buf; + bool done; +}; + +#endif /* _BCACHEFS_THREAD_WITH_FILE_TYPES_H */ diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index fd49b63562..293b90d704 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -32,22 +32,68 @@ DECLARE_EVENT_CLASS(bpos, TP_printk("%llu:%llu:%u", __entry->p_inode, __entry->p_offset, __entry->p_snapshot) ); -DECLARE_EVENT_CLASS(bkey, - TP_PROTO(struct bch_fs *c, const char *k), - TP_ARGS(c, k), +DECLARE_EVENT_CLASS(fs_str, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str), TP_STRUCT__entry( - __string(k, k ) + __field(dev_t, dev ) + __string(str, str ) ), TP_fast_assign( - __assign_str(k, k); + __entry->dev = c->dev; + __assign_str(str, str); ), - TP_printk("%s", __get_str(k)) + TP_printk("%d,%d\n%s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str)) ); -DECLARE_EVENT_CLASS(btree_node, +DECLARE_EVENT_CLASS(trans_str, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *str), + TP_ARGS(trans, caller_ip, str), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __string(str, str ) + ), + + TP_fast_assign( + __entry->dev = trans->c->dev; + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __assign_str(str, str); + ), + + TP_printk("%d,%d %s %pS %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->trans_fn, (void *) __entry->caller_ip, __get_str(str)) +); + +DECLARE_EVENT_CLASS(trans_str_nocaller, + TP_PROTO(struct btree_trans *trans, const char *str), + TP_ARGS(trans, str), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __array(char, trans_fn, 32 ) + __string(str, str ) + ), + + TP_fast_assign( + __entry->dev = trans->c->dev; + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __assign_str(str, str); + ), + + TP_printk("%d,%d %s %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->trans_fn, __get_str(str)) +); + +DECLARE_EVENT_CLASS(btree_node_nofs, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b), @@ -72,6 +118,33 @@ DECLARE_EVENT_CLASS(btree_node, __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot) ); +DECLARE_EVENT_CLASS(btree_node, + TP_PROTO(struct btree_trans *trans, struct btree *b), + TP_ARGS(trans, b), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __array(char, trans_fn, 32 ) + __field(u8, level ) + __field(u8, btree_id ) + TRACE_BPOS_entries(pos) + ), + + TP_fast_assign( + __entry->dev = trans->c->dev; + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->level = b->c.level; + __entry->btree_id = b->c.btree_id; + TRACE_BPOS_assign(pos, b->key.k.p); + ), + + TP_printk("%d,%d %s %u %s %llu:%llu:%u", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn, + __entry->level, + bch2_btree_id_str(__entry->btree_id), + __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot) +); + DECLARE_EVENT_CLASS(bch_fs, TP_PROTO(struct bch_fs *c), TP_ARGS(c), @@ -87,6 +160,23 @@ DECLARE_EVENT_CLASS(bch_fs, TP_printk("%d,%d", MAJOR(__entry->dev), MINOR(__entry->dev)) ); +DECLARE_EVENT_CLASS(btree_trans, + TP_PROTO(struct btree_trans *trans), + TP_ARGS(trans), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __array(char, trans_fn, 32 ) + ), + + TP_fast_assign( + __entry->dev = trans->c->dev; + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + ), + + TP_printk("%d,%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn) +); + DECLARE_EVENT_CLASS(bio, TP_PROTO(struct bio *bio), TP_ARGS(bio), @@ -183,9 +273,14 @@ DEFINE_EVENT(bch_fs, journal_full, TP_ARGS(c) ); -DEFINE_EVENT(bch_fs, journal_entry_full, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) +DEFINE_EVENT(fs_str, journal_entry_full, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +); + +DEFINE_EVENT(fs_str, journal_entry_close, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); DEFINE_EVENT(bio, journal_write, @@ -286,36 +381,36 @@ TRACE_EVENT(btree_cache_scan, __entry->nr_to_scan, __entry->can_free, __entry->ret) ); -DEFINE_EVENT(btree_node, btree_cache_reap, +DEFINE_EVENT(btree_node_nofs, btree_cache_reap, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) ); -DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock_fail, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) +DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock_fail, + TP_PROTO(struct btree_trans *trans), + TP_ARGS(trans) ); -DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) +DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock, + TP_PROTO(struct btree_trans *trans), + TP_ARGS(trans) ); -DEFINE_EVENT(bch_fs, btree_cache_cannibalize, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) +DEFINE_EVENT(btree_trans, btree_cache_cannibalize, + TP_PROTO(struct btree_trans *trans), + TP_ARGS(trans) ); -DEFINE_EVENT(bch_fs, btree_cache_cannibalize_unlock, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) +DEFINE_EVENT(btree_trans, btree_cache_cannibalize_unlock, + TP_PROTO(struct btree_trans *trans), + TP_ARGS(trans) ); /* Btree */ DEFINE_EVENT(btree_node, btree_node_read, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) + TP_PROTO(struct btree_trans *trans, struct btree *b), + TP_ARGS(trans, b) ); TRACE_EVENT(btree_node_write, @@ -339,13 +434,13 @@ TRACE_EVENT(btree_node_write, ); DEFINE_EVENT(btree_node, btree_node_alloc, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) + TP_PROTO(struct btree_trans *trans, struct btree *b), + TP_ARGS(trans, b) ); DEFINE_EVENT(btree_node, btree_node_free, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) + TP_PROTO(struct btree_trans *trans, struct btree *b), + TP_ARGS(trans, b) ); TRACE_EVENT(btree_reserve_get_fail, @@ -377,28 +472,28 @@ TRACE_EVENT(btree_reserve_get_fail, ); DEFINE_EVENT(btree_node, btree_node_compact, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) + TP_PROTO(struct btree_trans *trans, struct btree *b), + TP_ARGS(trans, b) ); DEFINE_EVENT(btree_node, btree_node_merge, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) + TP_PROTO(struct btree_trans *trans, struct btree *b), + TP_ARGS(trans, b) ); DEFINE_EVENT(btree_node, btree_node_split, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) + TP_PROTO(struct btree_trans *trans, struct btree *b), + TP_ARGS(trans, b) ); DEFINE_EVENT(btree_node, btree_node_rewrite, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) + TP_PROTO(struct btree_trans *trans, struct btree *b), + TP_ARGS(trans, b) ); DEFINE_EVENT(btree_node, btree_node_set_root, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) + TP_PROTO(struct btree_trans *trans, struct btree *b), + TP_ARGS(trans, b) ); TRACE_EVENT(btree_path_relock_fail, @@ -433,7 +528,7 @@ TRACE_EVENT(btree_path_relock_fail, __entry->level = path->level; TRACE_BPOS_assign(pos, path->pos); - c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level), + c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level); __entry->self_read_count = c.n[SIX_LOCK_read]; __entry->self_intent_count = c.n[SIX_LOCK_intent]; @@ -717,44 +812,32 @@ TRACE_EVENT(bucket_evacuate, __entry->dev_idx, __entry->bucket) ); -DEFINE_EVENT(bkey, move_extent, - TP_PROTO(struct bch_fs *c, const char *k), - TP_ARGS(c, k) +DEFINE_EVENT(fs_str, move_extent, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -DEFINE_EVENT(bkey, move_extent_read, - TP_PROTO(struct bch_fs *c, const char *k), - TP_ARGS(c, k) +DEFINE_EVENT(fs_str, move_extent_read, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -DEFINE_EVENT(bkey, move_extent_write, - TP_PROTO(struct bch_fs *c, const char *k), - TP_ARGS(c, k) +DEFINE_EVENT(fs_str, move_extent_write, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -DEFINE_EVENT(bkey, move_extent_finish, - TP_PROTO(struct bch_fs *c, const char *k), - TP_ARGS(c, k) +DEFINE_EVENT(fs_str, move_extent_finish, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -TRACE_EVENT(move_extent_fail, - TP_PROTO(struct bch_fs *c, const char *msg), - TP_ARGS(c, msg), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __string(msg, msg ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __assign_str(msg, msg); - ), - - TP_printk("%d:%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(msg)) +DEFINE_EVENT(fs_str, move_extent_fail, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -DEFINE_EVENT(bkey, move_extent_start_fail, +DEFINE_EVENT(fs_str, move_extent_start_fail, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); @@ -930,7 +1013,7 @@ TRACE_EVENT(trans_restart_split_race, __entry->level = b->c.level; __entry->written = b->written; __entry->blocks = btree_blocks(trans->c); - __entry->u64s_remaining = bch_btree_keys_u64s_remaining(trans->c, b); + __entry->u64s_remaining = bch2_btree_keys_u64s_remaining(b); ), TP_printk("%s %pS l=%u written %u/%u u64s remaining %u", @@ -987,10 +1070,11 @@ DEFINE_EVENT(transaction_event, trans_restart_key_cache_raced, TP_ARGS(trans, caller_ip) ); -DEFINE_EVENT(transaction_event, trans_restart_too_many_iters, +DEFINE_EVENT(trans_str, trans_restart_too_many_iters, TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) + unsigned long caller_ip, + const char *paths), + TP_ARGS(trans, caller_ip, paths) ); DECLARE_EVENT_CLASS(transaction_restart_iter, @@ -1036,8 +1120,6 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split, TP_ARGS(trans, caller_ip, path) ); -struct get_locks_fail; - TRACE_EVENT(trans_restart_upgrade, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, @@ -1056,8 +1138,6 @@ TRACE_EVENT(trans_restart_upgrade, __field(u8, level ) __field(u32, path_seq ) __field(u32, node_seq ) - __field(u32, path_alloc_seq ) - __field(u32, downgrade_seq) TRACE_BPOS_entries(pos) ), @@ -1070,12 +1150,10 @@ TRACE_EVENT(trans_restart_upgrade, __entry->level = f->l; __entry->path_seq = path->l[f->l].lock_seq; __entry->node_seq = IS_ERR_OR_NULL(f->b) ? 0 : f->b->c.lock.seq; - __entry->path_alloc_seq = path->alloc_seq; - __entry->downgrade_seq = path->downgrade_seq; TRACE_BPOS_assign(pos, path->pos) ), - TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u alloc_seq %u downgrade_seq %u", + TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u", __entry->trans_fn, (void *) __entry->caller_ip, bch2_btree_id_str(__entry->btree_id), @@ -1086,16 +1164,12 @@ TRACE_EVENT(trans_restart_upgrade, __entry->new_locks_want, __entry->level, __entry->path_seq, - __entry->node_seq, - __entry->path_alloc_seq, - __entry->downgrade_seq) + __entry->node_seq) ); -DEFINE_EVENT(transaction_restart_iter, trans_restart_relock, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path), - TP_ARGS(trans, caller_ip, path) +DEFINE_EVENT(trans_str, trans_restart_relock, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *str), + TP_ARGS(trans, caller_ip, str) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node, @@ -1160,10 +1234,10 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure, TP_ARGS(trans, caller_ip, path) ); -DEFINE_EVENT(transaction_event, trans_restart_would_deadlock, +DEFINE_EVENT(trans_str_nocaller, trans_restart_would_deadlock, TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) + const char *cycle), + TP_ARGS(trans, cycle) ); DEFINE_EVENT(transaction_event, trans_restart_would_deadlock_recursion_limit, @@ -1252,22 +1326,37 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced, TRACE_EVENT(path_downgrade, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, - struct btree_path *path), - TP_ARGS(trans, caller_ip, path), + struct btree_path *path, + unsigned old_locks_want), + TP_ARGS(trans, caller_ip, path, old_locks_want), TP_STRUCT__entry( __array(char, trans_fn, 32 ) __field(unsigned long, caller_ip ) + __field(unsigned, old_locks_want ) + __field(unsigned, new_locks_want ) + __field(unsigned, btree ) + TRACE_BPOS_entries(pos) ), TP_fast_assign( strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; + __entry->old_locks_want = old_locks_want; + __entry->new_locks_want = path->locks_want; + __entry->btree = path->btree_id; + TRACE_BPOS_assign(pos, path->pos); ), - TP_printk("%s %pS", + TP_printk("%s %pS locks_want %u -> %u %s %llu:%llu:%u", __entry->trans_fn, - (void *) __entry->caller_ip) + (void *) __entry->caller_ip, + __entry->old_locks_want, + __entry->new_locks_want, + bch2_btree_id_str(__entry->btree), + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot) ); DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush, @@ -1298,21 +1387,48 @@ TRACE_EVENT(write_buffer_flush, __entry->nr, __entry->size, __entry->skipped, __entry->fast) ); +TRACE_EVENT(write_buffer_flush_sync, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), + TP_ARGS(trans, caller_ip), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + ), + + TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip) +); + TRACE_EVENT(write_buffer_flush_slowpath, - TP_PROTO(struct btree_trans *trans, size_t nr, size_t size), - TP_ARGS(trans, nr, size), + TP_PROTO(struct btree_trans *trans, size_t slowpath, size_t total), + TP_ARGS(trans, slowpath, total), TP_STRUCT__entry( - __field(size_t, nr ) - __field(size_t, size ) + __field(size_t, slowpath ) + __field(size_t, total ) ), TP_fast_assign( - __entry->nr = nr; - __entry->size = size; + __entry->slowpath = slowpath; + __entry->total = total; ), - TP_printk("%zu/%zu", __entry->nr, __entry->size) + TP_printk("%zu/%zu", __entry->slowpath, __entry->total) +); + +DEFINE_EVENT(fs_str, rebalance_extent, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +); + +DEFINE_EVENT(fs_str, data_update, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); #endif /* _TRACE_BCACHEFS_H */ diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 3b7c349f26..3a32faa86b 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -241,12 +241,17 @@ bool bch2_is_zero(const void *_p, size_t n) return true; } -void bch2_prt_u64_binary(struct printbuf *out, u64 v, unsigned nr_bits) +void bch2_prt_u64_base2_nbits(struct printbuf *out, u64 v, unsigned nr_bits) { while (nr_bits) prt_char(out, '0' + ((v >> --nr_bits) & 1)); } +void bch2_prt_u64_base2(struct printbuf *out, u64 v) +{ + bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1); +} + void bch2_print_string_as_lines(const char *prefix, const char *lines) { const char *p; @@ -267,14 +272,14 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines) console_unlock(); } -int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task) +int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr, + gfp_t gfp) { #ifdef CONFIG_STACKTRACE unsigned nr_entries = 0; - int ret = 0; stack->nr = 0; - ret = darray_make_room(stack, 32); + int ret = darray_make_room_gfp(stack, 32, gfp); if (ret) return ret; @@ -282,9 +287,9 @@ int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task) return -1; do { - nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, 0); + nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, skipnr + 1); } while (nr_entries == stack->size && - !(ret = darray_make_room(stack, stack->size * 2))); + !(ret = darray_make_room_gfp(stack, stack->size * 2, gfp))); stack->nr = nr_entries; up_read(&task->signal->exec_update_lock); @@ -297,24 +302,74 @@ int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task) void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack) { - unsigned long *i; - darray_for_each(*stack, i) { prt_printf(out, "[<0>] %pB", (void *) *i); prt_newline(out); } } -int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task) +int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task, unsigned skipnr, gfp_t gfp) { bch_stacktrace stack = { 0 }; - int ret = bch2_save_backtrace(&stack, task); + int ret = bch2_save_backtrace(&stack, task, skipnr + 1, gfp); bch2_prt_backtrace(out, &stack); darray_exit(&stack); return ret; } +#ifndef __KERNEL__ +#include <time.h> +void bch2_prt_datetime(struct printbuf *out, time64_t sec) +{ + time_t t = sec; + char buf[64]; + ctime_r(&t, buf); + strim(buf); + prt_str(out, buf); +} +#else +void bch2_prt_datetime(struct printbuf *out, time64_t sec) +{ + char buf[64]; + snprintf(buf, sizeof(buf), "%ptT", &sec); + prt_u64(out, sec); +} +#endif + +static const struct time_unit { + const char *name; + u64 nsecs; +} time_units[] = { + { "ns", 1 }, + { "us", NSEC_PER_USEC }, + { "ms", NSEC_PER_MSEC }, + { "s", NSEC_PER_SEC }, + { "m", (u64) NSEC_PER_SEC * 60}, + { "h", (u64) NSEC_PER_SEC * 3600}, + { "eon", U64_MAX }, +}; + +static const struct time_unit *pick_time_units(u64 ns) +{ + const struct time_unit *u; + + for (u = time_units; + u + 1 < time_units + ARRAY_SIZE(time_units) && + ns >= u[1].nsecs << 1; + u++) + ; + + return u; +} + +void bch2_pr_time_units(struct printbuf *out, u64 ns) +{ + const struct time_unit *u = pick_time_units(ns); + + prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); +} + /* time stats: */ #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT @@ -359,6 +414,7 @@ static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats, mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration); stats->max_duration = max(stats->max_duration, duration); stats->min_duration = min(stats->min_duration, duration); + stats->total_duration += duration; bch2_quantiles_update(&stats->quantiles, duration); } @@ -373,29 +429,33 @@ static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats, stats->last_event = end; } +static void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, + struct bch2_time_stat_buffer *b) +{ + for (struct bch2_time_stat_buffer_entry *i = b->entries; + i < b->entries + ARRAY_SIZE(b->entries); + i++) + bch2_time_stats_update_one(stats, i->start, i->end); + b->nr = 0; +} + static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, struct bch2_time_stat_buffer *b) { - struct bch2_time_stat_buffer_entry *i; unsigned long flags; spin_lock_irqsave(&stats->lock, flags); - for (i = b->entries; - i < b->entries + ARRAY_SIZE(b->entries); - i++) - bch2_time_stats_update_one(stats, i->start, i->end); + __bch2_time_stats_clear_buffer(stats, b); spin_unlock_irqrestore(&stats->lock, flags); - - b->nr = 0; } void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) { unsigned long flags; - WARN_RATELIMIT(!stats->min_duration || !stats->min_freq, - "time_stats: min_duration = %llu, min_freq = %llu", - stats->min_duration, stats->min_freq); + WARN_ONCE(!stats->duration_stats_weighted.weight || + !stats->freq_stats_weighted.weight, + "uninitialized time_stats"); if (!stats->buffer) { spin_lock_irqsave(&stats->lock, flags); @@ -424,40 +484,6 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) preempt_enable(); } } -#endif - -static const struct time_unit { - const char *name; - u64 nsecs; -} time_units[] = { - { "ns", 1 }, - { "us", NSEC_PER_USEC }, - { "ms", NSEC_PER_MSEC }, - { "s", NSEC_PER_SEC }, - { "m", (u64) NSEC_PER_SEC * 60}, - { "h", (u64) NSEC_PER_SEC * 3600}, - { "eon", U64_MAX }, -}; - -static const struct time_unit *pick_time_units(u64 ns) -{ - const struct time_unit *u; - - for (u = time_units; - u + 1 < time_units + ARRAY_SIZE(time_units) && - ns >= u[1].nsecs << 1; - u++) - ; - - return u; -} - -void bch2_pr_time_units(struct printbuf *out, u64 ns) -{ - const struct time_unit *u = pick_time_units(ns); - - prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); -} static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns) { @@ -468,26 +494,6 @@ static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns) prt_printf(out, "%s", u->name); } -#ifndef __KERNEL__ -#include <time.h> -void bch2_prt_datetime(struct printbuf *out, time64_t sec) -{ - time_t t = sec; - char buf[64]; - ctime_r(&t, buf); - prt_str(out, buf); -} -#else -void bch2_prt_datetime(struct printbuf *out, time64_t sec) -{ - char buf[64]; - snprintf(buf, sizeof(buf), "%ptT", &sec); - prt_u64(out, sec); -} -#endif - -#define TABSTOP_SIZE 12 - static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns) { prt_str(out, name); @@ -496,12 +502,24 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 prt_newline(out); } +#define TABSTOP_SIZE 12 + void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) { const struct time_unit *u; s64 f_mean = 0, d_mean = 0; u64 q, last_q = 0, f_stddev = 0, d_stddev = 0; int i; + + if (stats->buffer) { + int cpu; + + spin_lock_irq(&stats->lock); + for_each_possible_cpu(cpu) + __bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu)); + spin_unlock_irq(&stats->lock); + } + /* * avoid divide by zero */ @@ -547,6 +565,7 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats pr_name_and_units(out, "min:", stats->min_duration); pr_name_and_units(out, "max:", stats->max_duration); + pr_name_and_units(out, "total:", stats->total_duration); prt_printf(out, "mean:"); prt_tab(out); @@ -604,6 +623,9 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats last_q = q; } } +#else +void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) {} +#endif void bch2_time_stats_exit(struct bch2_time_stats *stats) { @@ -1158,3 +1180,39 @@ u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr) return ret; } + +void bch2_darray_str_exit(darray_str *d) +{ + darray_for_each(*d, i) + kfree(*i); + darray_exit(d); +} + +int bch2_split_devs(const char *_dev_name, darray_str *ret) +{ + darray_init(ret); + + char *dev_name, *s, *orig; + + dev_name = orig = kstrdup(_dev_name, GFP_KERNEL); + if (!dev_name) + return -ENOMEM; + + while ((s = strsep(&dev_name, ":"))) { + char *p = kstrdup(s, GFP_KERNEL); + if (!p) + goto err; + + if (darray_push(ret, p)) { + kfree(p); + goto err; + } + } + + kfree(orig); + return 0; +err: + bch2_darray_str_exit(ret); + kfree(orig); + return -ENOMEM; +} diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index b701f7fe07..b414736d59 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -342,14 +342,24 @@ bool bch2_is_zero(const void *, size_t); u64 bch2_read_flag_list(char *, const char * const[]); -void bch2_prt_u64_binary(struct printbuf *, u64, unsigned); +void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned); +void bch2_prt_u64_base2(struct printbuf *, u64); void bch2_print_string_as_lines(const char *prefix, const char *lines); typedef DARRAY(unsigned long) bch_stacktrace; -int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *); +int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t); void bch2_prt_backtrace(struct printbuf *, bch_stacktrace *); -int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *); +int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *, unsigned, gfp_t); + +static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev) +{ +#ifdef __KERNEL__ + prt_printf(out, "%pg", bdev); +#else + prt_str(out, bdev->name); +#endif +} #define NR_QUANTILES 15 #define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) @@ -374,8 +384,9 @@ struct bch2_time_stat_buffer { struct bch2_time_stats { spinlock_t lock; /* all fields are in nanoseconds */ - u64 max_duration; u64 min_duration; + u64 max_duration; + u64 total_duration; u64 max_freq; u64 min_freq; u64 last_event; @@ -390,15 +401,39 @@ struct bch2_time_stats { #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64); -#else -static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {} -#endif static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) { __bch2_time_stats_update(stats, start, local_clock()); } +static inline bool track_event_change(struct bch2_time_stats *stats, + u64 *start, bool v) +{ + if (v != !!*start) { + if (!v) { + bch2_time_stats_update(stats, *start); + *start = 0; + } else { + *start = local_clock() ?: 1; + return true; + } + } + + return false; +} +#else +static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {} +static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) {} +static inline bool track_event_change(struct bch2_time_stats *stats, + u64 *start, bool v) +{ + bool ret = v && !*start; + *start = v; + return ret; +} +#endif + void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *); void bch2_time_stats_exit(struct bch2_time_stats *); @@ -831,4 +866,14 @@ static inline int cmp_le32(__le32 l, __le32 r) #include <linux/uuid.h> +#define QSTR(n) { { { .len = strlen(n) } }, .name = n } + +static inline bool qstr_eq(const struct qstr l, const struct qstr r) +{ + return l.len == r.len && !memcmp(l.name, r.name, l.len); +} + +void bch2_darray_str_exit(darray_str *); +int bch2_split_devs(const char *, darray_str *); + #endif /* _BCACHEFS_UTIL_H */ diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h index a6561b4b36..2ad338e282 100644 --- a/fs/bcachefs/vstructs.h +++ b/fs/bcachefs/vstructs.h @@ -48,14 +48,14 @@ ((void *) ((u64 *) (_s)->_data + __vstruct_u64s(_s))) #define vstruct_for_each(_s, _i) \ - for (_i = (_s)->start; \ + for (typeof(&(_s)->start[0]) _i = (_s)->start; \ _i < vstruct_last(_s); \ _i = vstruct_next(_i)) -#define vstruct_for_each_safe(_s, _i, _t) \ - for (_i = (_s)->start; \ - _i < vstruct_last(_s) && (_t = vstruct_next(_i), true); \ - _i = _t) +#define vstruct_for_each_safe(_s, _i) \ + for (typeof(&(_s)->start[0]) _next, _i = (_s)->start; \ + _i < vstruct_last(_s) && (_next = vstruct_next(_i), true); \ + _i = _next) #define vstruct_idx(_s, _idx) \ ((typeof(&(_s)->start[0])) ((_s)->_data + (_idx))) diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 5a1858fb98..9c0d231603 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -590,8 +590,9 @@ err: mutex_unlock(&inode->ei_update_lock); if (value && - (opt_id == Opt_background_compression || - opt_id == Opt_background_target)) + (opt_id == Opt_background_target || + opt_id == Opt_background_compression || + (opt_id == Opt_compression && !inode_opt_get(c, &inode->ei_inode, background_compression)))) bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum); return bch2_err_class(ret); diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h new file mode 100644 index 0000000000..e9f8105395 --- /dev/null +++ b/fs/bcachefs/xattr_format.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_XATTR_FORMAT_H +#define _BCACHEFS_XATTR_FORMAT_H + +#define KEY_TYPE_XATTR_INDEX_USER 0 +#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1 +#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2 +#define KEY_TYPE_XATTR_INDEX_TRUSTED 3 +#define KEY_TYPE_XATTR_INDEX_SECURITY 4 + +struct bch_xattr { + struct bch_val v; + __u8 x_type; + __u8 x_name_len; + __le16 x_val_len; + __u8 x_name[]; +} __packed __aligned(8); + +#endif /* _BCACHEFS_XATTR_FORMAT_H */ |