diff options
Diffstat (limited to '')
-rw-r--r-- | fs/bcachefs/recovery.c | 304 |
1 files changed, 184 insertions, 120 deletions
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 5cf7d05320..21e13bb433 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -99,6 +99,11 @@ static int bch2_journal_replay_key(struct btree_trans *trans, unsigned update_flags = BTREE_TRIGGER_NORUN; int ret; + if (k->overwritten) + return 0; + + trans->journal_res.seq = k->journal_seq; + /* * BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to * keep the key cache coherent with the underlying btree. Nothing @@ -140,27 +145,13 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r) static int bch2_journal_replay(struct bch_fs *c) { struct journal_keys *keys = &c->journal_keys; - struct journal_key **keys_sorted, *k; + DARRAY(struct journal_key *) keys_sorted = { 0 }; struct journal *j = &c->journal; u64 start_seq = c->journal_replay_seq_start; u64 end_seq = c->journal_replay_seq_start; - size_t i; + struct btree_trans *trans = bch2_trans_get(c); int ret = 0; - move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); - keys->gap = keys->nr; - - keys_sorted = kvmalloc_array(keys->nr, sizeof(*keys_sorted), GFP_KERNEL); - if (!keys_sorted) - return -BCH_ERR_ENOMEM_journal_replay; - - for (i = 0; i < keys->nr; i++) - keys_sorted[i] = &keys->d[i]; - - sort(keys_sorted, keys->nr, - sizeof(keys_sorted[0]), - journal_sort_seq_cmp, NULL); - if (keys->nr) { ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)", keys->nr, start_seq, end_seq); @@ -170,27 +161,67 @@ static int bch2_journal_replay(struct bch_fs *c) BUG_ON(!atomic_read(&keys->ref)); - for (i = 0; i < keys->nr; i++) { - k = keys_sorted[i]; + /* + * First, attempt to replay keys in sorted order. This is more + * efficient - better locality of btree access - but some might fail if + * that would cause a journal deadlock. + */ + for (size_t i = 0; i < keys->nr; i++) { + cond_resched(); + + struct journal_key *k = keys->d + i; + + /* Skip fastpath if we're low on space in the journal */ + ret = c->journal.watermark ? -1 : + commit_do(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_journal_reclaim| + (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0), + bch2_journal_replay_key(trans, k)); + BUG_ON(!ret && !k->overwritten); + if (ret) { + ret = darray_push(&keys_sorted, k); + if (ret) + goto err; + } + } + /* + * Now, replay any remaining keys in the order in which they appear in + * the journal, unpinning those journal entries as we go: + */ + sort(keys_sorted.data, keys_sorted.nr, + sizeof(keys_sorted.data[0]), + journal_sort_seq_cmp, NULL); + + darray_for_each(keys_sorted, kp) { cond_resched(); + struct journal_key *k = *kp; + replay_now_at(j, k->journal_seq); - ret = bch2_trans_do(c, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL| - (!k->allocated - ? BTREE_INSERT_JOURNAL_REPLAY|BCH_WATERMARK_reclaim - : 0), + ret = commit_do(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc| + (!k->allocated + ? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim + : 0), bch2_journal_replay_key(trans, k)); - if (ret) { - bch_err(c, "journal replay: error while replaying key at btree %s level %u: %s", - bch2_btree_id_str(k->btree_id), k->level, bch2_err_str(ret)); + bch_err_msg(c, ret, "while replaying key at btree %s level %u:", + bch2_btree_id_str(k->btree_id), k->level); + if (ret) goto err; - } + + BUG_ON(!k->overwritten); } + /* + * We need to put our btree_trans before calling flush_all_pins(), since + * that will use a btree_trans internally + */ + bch2_trans_put(trans); + trans = NULL; + if (!c->opts.keep_journal) bch2_journal_keys_put_initial(c); @@ -198,16 +229,14 @@ static int bch2_journal_replay(struct bch_fs *c) j->replay_journal_seq = 0; bch2_journal_set_replay_done(j); - bch2_journal_flush_all_pins(j); - ret = bch2_journal_error(j); - if (keys->nr && !ret) + if (keys->nr) bch2_journal_log_msg(c, "journal replay finished"); err: - kvfree(keys_sorted); - - if (ret) - bch_err_fn(c, ret); + if (trans) + bch2_trans_put(trans); + darray_exit(&keys_sorted); + bch_err_fn(c, ret); return ret; } @@ -251,7 +280,7 @@ static int journal_replay_entry_early(struct bch_fs *c, le64_to_cpu(u->v); break; case BCH_FS_USAGE_inodes: - c->usage_base->nr_inodes = le64_to_cpu(u->v); + c->usage_base->b.nr_inodes = le64_to_cpu(u->v); break; case BCH_FS_USAGE_key_version: atomic64_set(&c->key_version, @@ -275,8 +304,6 @@ static int journal_replay_entry_early(struct bch_fs *c, struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev)); unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); - ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec); - for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) { ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets); ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors); @@ -317,14 +344,11 @@ static int journal_replay_entry_early(struct bch_fs *c, static int journal_replay_early(struct bch_fs *c, struct bch_sb_field_clean *clean) { - struct jset_entry *entry; - int ret; - if (clean) { - for (entry = clean->start; + for (struct jset_entry *entry = clean->start; entry != vstruct_end(&clean->field); entry = vstruct_next(entry)) { - ret = journal_replay_entry_early(c, entry); + int ret = journal_replay_entry_early(c, entry); if (ret) return ret; } @@ -339,7 +363,7 @@ static int journal_replay_early(struct bch_fs *c, continue; vstruct_for_each(&i->j, entry) { - ret = journal_replay_entry_early(c, entry); + int ret = journal_replay_entry_early(c, entry); if (ret) return ret; } @@ -435,8 +459,7 @@ static int bch2_initialize_subvolumes(struct bch_fs *c) ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?: bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?: bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -474,10 +497,9 @@ err: noinline_for_stack static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) { - int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW, + int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw, __bch2_fs_upgrade_for_subvolumes(trans)); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -495,7 +517,20 @@ static int bch2_check_allocations(struct bch_fs *c) static int bch2_set_may_go_rw(struct bch_fs *c) { - set_bit(BCH_FS_MAY_GO_RW, &c->flags); + struct journal_keys *keys = &c->journal_keys; + + /* + * After we go RW, the journal keys buffer can't be modified (except for + * setting journal_key->overwritten: it will be accessed by multiple + * threads + */ + move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); + keys->gap = keys->nr; + + set_bit(BCH_FS_may_go_rw, &c->flags); + + if (keys->nr || c->opts.fsck || !c->sb.clean) + return bch2_fs_read_write_early(c); return 0; } @@ -542,8 +577,9 @@ u64 bch2_recovery_passes_from_stable(u64 v) static bool check_version_upgrade(struct bch_fs *c) { - unsigned latest_compatible = bch2_latest_compatible_version(c->sb.version); unsigned latest_version = bcachefs_metadata_version_current; + unsigned latest_compatible = min(latest_version, + bch2_latest_compatible_version(c->sb.version)); unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version; unsigned new_version = 0; @@ -562,7 +598,7 @@ static bool check_version_upgrade(struct bch_fs *c) new_version = latest_version; break; case BCH_VERSION_UPGRADE_none: - new_version = old_version; + new_version = min(old_version, latest_version); break; } } @@ -589,17 +625,15 @@ static bool check_version_upgrade(struct bch_fs *c) bch2_version_to_text(&buf, new_version); prt_newline(&buf); - u64 recovery_passes = bch2_upgrade_recovery_passes(c, old_version, new_version); - if (recovery_passes) { - if ((recovery_passes & RECOVERY_PASS_ALL_FSCK) == RECOVERY_PASS_ALL_FSCK) - prt_str(&buf, "fsck required"); - else { - prt_str(&buf, "running recovery passes: "); - prt_bitflags(&buf, bch2_recovery_passes, recovery_passes); - } - - c->recovery_passes_explicit |= recovery_passes; - c->opts.fix_errors = FSCK_FIX_yes; + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + __le64 passes = ext->recovery_passes_required[0]; + bch2_sb_set_upgrade(c, old_version, new_version); + passes = ext->recovery_passes_required[0] & ~passes; + + if (passes) { + prt_str(&buf, " running recovery passes: "); + prt_bitflags(&buf, bch2_recovery_passes, + bch2_recovery_passes_from_stable(le64_to_cpu(passes))); } bch_info(c, "%s", buf.buf); @@ -625,7 +659,7 @@ u64 bch2_fsck_recovery_passes(void) static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { - struct recovery_pass_fn *p = recovery_pass_fns + c->curr_recovery_pass; + struct recovery_pass_fn *p = recovery_pass_fns + pass; if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read) return false; @@ -642,39 +676,62 @@ static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pa static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { + struct recovery_pass_fn *p = recovery_pass_fns + pass; int ret; - c->curr_recovery_pass = pass; + if (!(p->when & PASS_SILENT)) + bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."), + bch2_recovery_passes[pass]); + ret = p->fn(c); + if (ret) + return ret; + if (!(p->when & PASS_SILENT)) + bch2_print(c, KERN_CONT " done\n"); - if (should_run_recovery_pass(c, pass)) { - struct recovery_pass_fn *p = recovery_pass_fns + pass; + return 0; +} - if (!(p->when & PASS_SILENT)) - printk(KERN_INFO bch2_log_msg(c, "%s..."), - bch2_recovery_passes[pass]); - ret = p->fn(c); - if (ret) - return ret; - if (!(p->when & PASS_SILENT)) - printk(KERN_CONT " done\n"); +static int bch2_run_recovery_passes(struct bch_fs *c) +{ + int ret = 0; - c->recovery_passes_complete |= BIT_ULL(pass); + while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { + if (should_run_recovery_pass(c, c->curr_recovery_pass)) { + unsigned pass = c->curr_recovery_pass; + + ret = bch2_run_recovery_pass(c, c->curr_recovery_pass); + if (bch2_err_matches(ret, BCH_ERR_restart_recovery) || + (ret && c->curr_recovery_pass < pass)) + continue; + if (ret) + break; + + c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass); + } + c->curr_recovery_pass++; + c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass); } - return 0; + return ret; } -static int bch2_run_recovery_passes(struct bch_fs *c) +int bch2_run_online_recovery_passes(struct bch_fs *c) { int ret = 0; - while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { - ret = bch2_run_recovery_pass(c, c->curr_recovery_pass); - if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) + for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) { + struct recovery_pass_fn *p = recovery_pass_fns + i; + + if (!(p->when & PASS_ONLINE)) + continue; + + ret = bch2_run_recovery_pass(c, i); + if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) { + i = c->curr_recovery_pass; continue; + } if (ret) break; - c->curr_recovery_pass++; } return ret; @@ -718,7 +775,7 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } - if (!(c->opts.nochanges && c->opts.norecovery)) { + if (!c->opts.nochanges) { mutex_lock(&c->sb_lock); bool write_sb = false; @@ -748,7 +805,7 @@ int bch2_fs_recovery(struct bch_fs *c) if (bch2_check_version_downgrade(c)) { struct printbuf buf = PRINTBUF; - prt_str(&buf, "Version downgrade required:\n"); + prt_str(&buf, "Version downgrade required:"); __le64 passes = ext->recovery_passes_required[0]; bch2_sb_set_downgrade(c, @@ -756,7 +813,7 @@ int bch2_fs_recovery(struct bch_fs *c) BCH_VERSION_MINOR(c->sb.version)); passes = ext->recovery_passes_required[0] & ~passes; if (passes) { - prt_str(&buf, " running recovery passes: "); + prt_str(&buf, "\n running recovery passes: "); prt_bitflags(&buf, bch2_recovery_passes, bch2_recovery_passes_from_stable(le64_to_cpu(passes))); } @@ -779,6 +836,9 @@ int bch2_fs_recovery(struct bch_fs *c) if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); + if (c->opts.fsck) + set_bit(BCH_FS_fsck_running, &c->flags); + ret = bch2_blacklist_table_initialize(c); if (ret) { bch_err(c, "error initializing blacklist table"); @@ -919,13 +979,17 @@ use_clean: if (ret) goto err; + clear_bit(BCH_FS_fsck_running, &c->flags); + /* If we fixed errors, verify that fs is actually clean now: */ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && - test_bit(BCH_FS_ERRORS_FIXED, &c->flags) && - !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags) && - !test_bit(BCH_FS_ERROR, &c->flags)) { + test_bit(BCH_FS_errors_fixed, &c->flags) && + !test_bit(BCH_FS_errors_not_fixed, &c->flags) && + !test_bit(BCH_FS_error, &c->flags)) { + bch2_flush_fsck_errs(c); + bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean"); - clear_bit(BCH_FS_ERRORS_FIXED, &c->flags); + clear_bit(BCH_FS_errors_fixed, &c->flags); c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; @@ -933,13 +997,13 @@ use_clean: if (ret) goto err; - if (test_bit(BCH_FS_ERRORS_FIXED, &c->flags) || - test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) { + if (test_bit(BCH_FS_errors_fixed, &c->flags) || + test_bit(BCH_FS_errors_not_fixed, &c->flags)) { bch_err(c, "Second fsck run was not clean"); - set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags); + set_bit(BCH_FS_errors_not_fixed, &c->flags); } - set_bit(BCH_FS_ERRORS_FIXED, &c->flags); + set_bit(BCH_FS_errors_fixed, &c->flags); } if (enabled_qtypes(c)) { @@ -958,13 +1022,13 @@ use_clean: write_sb = true; } - if (!test_bit(BCH_FS_ERROR, &c->flags) && + if (!test_bit(BCH_FS_error, &c->flags) && !(c->disk_sb.sb->compat[0] & cpu_to_le64(1ULL << BCH_COMPAT_alloc_info))) { c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); write_sb = true; } - if (!test_bit(BCH_FS_ERROR, &c->flags)) { + if (!test_bit(BCH_FS_error, &c->flags)) { struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); if (ext && (!bch2_is_zero(ext->recovery_passes_required, sizeof(ext->recovery_passes_required)) || @@ -976,8 +1040,8 @@ use_clean: } if (c->opts.fsck && - !test_bit(BCH_FS_ERROR, &c->flags) && - !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) { + !test_bit(BCH_FS_error, &c->flags) && + !test_bit(BCH_FS_errors_not_fixed, &c->flags)) { SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0); SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 0); write_sb = true; @@ -993,8 +1057,12 @@ use_clean: bch2_move_stats_init(&stats, "recovery"); - bch_info(c, "scanning for old btree nodes"); - ret = bch2_fs_read_write(c) ?: + struct printbuf buf = PRINTBUF; + bch2_version_to_text(&buf, c->sb.version_min); + bch_info(c, "scanning for old btree nodes: min_version %s", buf.buf); + printbuf_exit(&buf); + + ret = bch2_fs_read_write_early(c) ?: bch2_scan_old_btree_nodes(c, &stats); if (ret) goto err; @@ -1007,7 +1075,6 @@ use_clean: ret = 0; out: - set_bit(BCH_FS_FSCK_DONE, &c->flags); bch2_flush_fsck_errs(c); if (!c->opts.keep_journal && @@ -1015,13 +1082,14 @@ out: bch2_journal_keys_put_initial(c); kfree(clean); - if (!ret && test_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags)) { + if (!ret && + test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) && + !c->opts.nochanges) { bch2_fs_read_write_early(c); bch2_delete_dead_snapshots_async(c); } - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; err: fsck_err: @@ -1034,8 +1102,6 @@ int bch2_fs_initialize(struct bch_fs *c) struct bch_inode_unpacked root_inode, lostfound_inode; struct bkey_inode_buf packed_inode; struct qstr lostfound = QSTR("lost+found"); - struct bch_dev *ca; - unsigned i; int ret; bch_notice(c, "initializing new filesystem"); @@ -1054,13 +1120,12 @@ int bch2_fs_initialize(struct bch_fs *c) mutex_unlock(&c->sb_lock); c->curr_recovery_pass = ARRAY_SIZE(recovery_pass_fns); - set_bit(BCH_FS_MAY_GO_RW, &c->flags); - set_bit(BCH_FS_FSCK_DONE, &c->flags); + set_bit(BCH_FS_may_go_rw, &c->flags); - for (i = 0; i < BTREE_ID_NR; i++) + for (unsigned i = 0; i < BTREE_ID_NR; i++) bch2_btree_root_alloc(c, i); - for_each_member_device(ca, c, i) + for_each_member_device(c, ca) bch2_dev_usage_init(ca); ret = bch2_fs_journal_alloc(c); @@ -1088,7 +1153,7 @@ int bch2_fs_initialize(struct bch_fs *c) if (ret) goto err; - for_each_online_member(ca, c, i) + for_each_online_member(c, ca) ca->new_fs_bucket_idx = 0; ret = bch2_fs_freespace_init(c); @@ -1112,10 +1177,9 @@ int bch2_fs_initialize(struct bch_fs *c) packed_inode.inode.k.p.snapshot = U32_MAX; ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0); - if (ret) { - bch_err_msg(c, ret, "creating root directory"); + bch_err_msg(c, ret, "creating root directory"); + if (ret) goto err; - } bch2_inode_init_early(c, &lostfound_inode); @@ -1126,10 +1190,11 @@ int bch2_fs_initialize(struct bch_fs *c) &lostfound, 0, 0, S_IFDIR|0700, 0, NULL, NULL, (subvol_inum) { 0 }, 0)); - if (ret) { - bch_err_msg(c, ret, "creating lost+found"); + bch_err_msg(c, ret, "creating lost+found"); + if (ret) goto err; - } + + c->recovery_pass_done = ARRAY_SIZE(recovery_pass_fns) - 1; if (enabled_qtypes(c)) { ret = bch2_fs_quota_read(c); @@ -1138,10 +1203,9 @@ int bch2_fs_initialize(struct bch_fs *c) } ret = bch2_journal_flush(&c->journal); - if (ret) { - bch_err_msg(c, ret, "writing first journal entry"); + bch_err_msg(c, ret, "writing first journal entry"); + if (ret) goto err; - } mutex_lock(&c->sb_lock); SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); @@ -1152,6 +1216,6 @@ int bch2_fs_initialize(struct bch_fs *c) return 0; err: - bch_err_fn(ca, ret); + bch_err_fn(c, ret); return ret; } |