summaryrefslogtreecommitdiffstats
path: root/fs/bcachefs
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-08-07 13:11:27 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-08-07 13:11:27 +0000
commit34996e42f82bfd60bc2c191e5cae3c6ab233ec6c (patch)
tree62db60558cbf089714b48daeabca82bf2b20b20e /fs/bcachefs
parentAdding debian version 6.8.12-1. (diff)
downloadlinux-34996e42f82bfd60bc2c191e5cae3c6ab233ec6c.tar.xz
linux-34996e42f82bfd60bc2c191e5cae3c6ab233ec6c.zip
Merging upstream version 6.9.7.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'fs/bcachefs')
-rw-r--r--fs/bcachefs/Makefile7
-rw-r--r--fs/bcachefs/acl.c30
-rw-r--r--fs/bcachefs/alloc_background.c285
-rw-r--r--fs/bcachefs/alloc_background.h9
-rw-r--r--fs/bcachefs/alloc_foreground.c27
-rw-r--r--fs/bcachefs/alloc_types.h3
-rw-r--r--fs/bcachefs/backpointers.c337
-rw-r--r--fs/bcachefs/backpointers.h49
-rw-r--r--fs/bcachefs/bbpos_types.h2
-rw-r--r--fs/bcachefs/bcachefs.h33
-rw-r--r--fs/bcachefs/bcachefs_format.h88
-rw-r--r--fs/bcachefs/bkey.h211
-rw-r--r--fs/bcachefs/bkey_methods.c8
-rw-r--r--fs/bcachefs/bkey_types.h213
-rw-r--r--fs/bcachefs/bset.c14
-rw-r--r--fs/bcachefs/bset.h2
-rw-r--r--fs/bcachefs/btree_cache.c109
-rw-r--r--fs/bcachefs/btree_gc.c684
-rw-r--r--fs/bcachefs/btree_io.c65
-rw-r--r--fs/bcachefs/btree_iter.c72
-rw-r--r--fs/bcachefs/btree_iter.h11
-rw-r--r--fs/bcachefs/btree_journal_iter.c279
-rw-r--r--fs/bcachefs/btree_journal_iter.h22
-rw-r--r--fs/bcachefs/btree_key_cache.c49
-rw-r--r--fs/bcachefs/btree_locking.c31
-rw-r--r--fs/bcachefs/btree_node_scan.c524
-rw-r--r--fs/bcachefs/btree_node_scan.h11
-rw-r--r--fs/bcachefs/btree_node_scan_types.h31
-rw-r--r--fs/bcachefs/btree_trans_commit.c39
-rw-r--r--fs/bcachefs/btree_types.h25
-rw-r--r--fs/bcachefs/btree_update.c29
-rw-r--r--fs/bcachefs/btree_update.h3
-rw-r--r--fs/bcachefs/btree_update_interior.c498
-rw-r--r--fs/bcachefs/btree_update_interior.h34
-rw-r--r--fs/bcachefs/btree_write_buffer.c34
-rw-r--r--fs/bcachefs/buckets.c48
-rw-r--r--fs/bcachefs/buckets.h9
-rw-r--r--fs/bcachefs/chardev.c135
-rw-r--r--fs/bcachefs/checksum.c26
-rw-r--r--fs/bcachefs/checksum.h5
-rw-r--r--fs/bcachefs/compress.c14
-rw-r--r--fs/bcachefs/compress.h8
-rw-r--r--fs/bcachefs/data_update.c29
-rw-r--r--fs/bcachefs/debug.c83
-rw-r--r--fs/bcachefs/dirent.c143
-rw-r--r--fs/bcachefs/dirent.h6
-rw-r--r--fs/bcachefs/ec.c64
-rw-r--r--fs/bcachefs/ec.h2
-rw-r--r--fs/bcachefs/errcode.c15
-rw-r--r--fs/bcachefs/errcode.h20
-rw-r--r--fs/bcachefs/error.c14
-rw-r--r--fs/bcachefs/error.h12
-rw-r--r--fs/bcachefs/extents.c70
-rw-r--r--fs/bcachefs/extents.h42
-rw-r--r--fs/bcachefs/eytzinger.c234
-rw-r--r--fs/bcachefs/eytzinger.h81
-rw-r--r--fs/bcachefs/fifo.h4
-rw-r--r--fs/bcachefs/fs-common.c74
-rw-r--r--fs/bcachefs/fs-io-buffered.c149
-rw-r--r--fs/bcachefs/fs-io-direct.c23
-rw-r--r--fs/bcachefs/fs-io-pagecache.h9
-rw-r--r--fs/bcachefs/fs-io.c16
-rw-r--r--fs/bcachefs/fs.c231
-rw-r--r--fs/bcachefs/fsck.c1124
-rw-r--r--fs/bcachefs/fsck.h1
-rw-r--r--fs/bcachefs/inode.c59
-rw-r--r--fs/bcachefs/inode.h19
-rw-r--r--fs/bcachefs/io_misc.c2
-rw-r--r--fs/bcachefs/io_read.c2
-rw-r--r--fs/bcachefs/io_write.c48
-rw-r--r--fs/bcachefs/journal.c286
-rw-r--r--fs/bcachefs/journal.h7
-rw-r--r--fs/bcachefs/journal_io.c483
-rw-r--r--fs/bcachefs/journal_io.h47
-rw-r--r--fs/bcachefs/journal_reclaim.c31
-rw-r--r--fs/bcachefs/journal_seq_blacklist.c72
-rw-r--r--fs/bcachefs/journal_types.h31
-rw-r--r--fs/bcachefs/logged_ops.c11
-rw-r--r--fs/bcachefs/lru.c7
-rw-r--r--fs/bcachefs/mean_and_variance.c28
-rw-r--r--fs/bcachefs/mean_and_variance.h14
-rw-r--r--fs/bcachefs/mean_and_variance_test.c80
-rw-r--r--fs/bcachefs/migrate.c8
-rw-r--r--fs/bcachefs/move.c22
-rw-r--r--fs/bcachefs/movinggc.c3
-rw-r--r--fs/bcachefs/opts.c41
-rw-r--r--fs/bcachefs/opts.h31
-rw-r--r--fs/bcachefs/quota.c8
-rw-r--r--fs/bcachefs/rebalance.c4
-rw-r--r--fs/bcachefs/recovery.c490
-rw-r--r--fs/bcachefs/recovery.h32
-rw-r--r--fs/bcachefs/recovery_passes.c249
-rw-r--r--fs/bcachefs/recovery_passes.h17
-rw-r--r--fs/bcachefs/recovery_passes_types.h (renamed from fs/bcachefs/recovery_types.h)15
-rw-r--r--fs/bcachefs/reflink.c3
-rw-r--r--fs/bcachefs/replicas.c19
-rw-r--r--fs/bcachefs/sb-clean.c38
-rw-r--r--fs/bcachefs/sb-downgrade.c15
-rw-r--r--fs/bcachefs/sb-errors_types.h28
-rw-r--r--fs/bcachefs/sb-members.c59
-rw-r--r--fs/bcachefs/sb-members.h25
-rw-r--r--fs/bcachefs/snapshot.c251
-rw-r--r--fs/bcachefs/snapshot.h89
-rw-r--r--fs/bcachefs/str_hash.h15
-rw-r--r--fs/bcachefs/subvolume.c259
-rw-r--r--fs/bcachefs/subvolume.h11
-rw-r--r--fs/bcachefs/subvolume_format.h4
-rw-r--r--fs/bcachefs/subvolume_types.h2
-rw-r--r--fs/bcachefs/super-io.c119
-rw-r--r--fs/bcachefs/super.c145
-rw-r--r--fs/bcachefs/super_types.h4
-rw-r--r--fs/bcachefs/sysfs.c21
-rw-r--r--fs/bcachefs/tests.c2
-rw-r--r--fs/bcachefs/thread_with_file.c402
-rw-r--r--fs/bcachefs/thread_with_file.h62
-rw-r--r--fs/bcachefs/thread_with_file_types.h15
-rw-r--r--fs/bcachefs/time_stats.c165
-rw-r--r--fs/bcachefs/time_stats.h159
-rw-r--r--fs/bcachefs/trace.h19
-rw-r--r--fs/bcachefs/util.c370
-rw-r--r--fs/bcachefs/util.h167
-rw-r--r--fs/bcachefs/xattr.c5
122 files changed, 7373 insertions, 3806 deletions
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index 1a05cecda7..66ca0bbee6 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -17,6 +17,7 @@ bcachefs-y := \
btree_journal_iter.o \
btree_key_cache.o \
btree_locking.o \
+ btree_node_scan.o \
btree_trans_commit.o \
btree_update.o \
btree_update_interior.o \
@@ -37,6 +38,7 @@ bcachefs-y := \
error.o \
extents.o \
extent_update.o \
+ eytzinger.o \
fs.o \
fs-common.o \
fs-ioctl.o \
@@ -67,6 +69,7 @@ bcachefs-y := \
quota.o \
rebalance.o \
recovery.o \
+ recovery_passes.o \
reflink.o \
replicas.o \
sb-clean.o \
@@ -82,6 +85,7 @@ bcachefs-y := \
super-io.o \
sysfs.o \
tests.o \
+ time_stats.o \
thread_with_file.o \
trace.o \
two_state_shared_lock.o \
@@ -90,3 +94,6 @@ bcachefs-y := \
xattr.o
obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o
+
+# Silence "note: xyz changed in GCC X.X" messages
+subdir-ccflags-y += $(call cc-disable-warning, psabi)
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
index 3640f417cc..5c180fdc3e 100644
--- a/fs/bcachefs/acl.c
+++ b/fs/bcachefs/acl.c
@@ -281,7 +281,6 @@ struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap,
struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0);
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter = { NULL };
- struct bkey_s_c_xattr xattr;
struct posix_acl *acl = NULL;
struct bkey_s_c k;
int ret;
@@ -290,28 +289,27 @@ retry:
ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
&hash, inode_inum(inode), &search, 0);
- if (ret) {
- if (!bch2_err_matches(ret, ENOENT))
- acl = ERR_PTR(ret);
- goto out;
- }
+ if (ret)
+ goto err;
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
- if (ret) {
- acl = ERR_PTR(ret);
- goto out;
- }
+ if (ret)
+ goto err;
- xattr = bkey_s_c_to_xattr(k);
+ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
- le16_to_cpu(xattr.v->x_val_len));
+ le16_to_cpu(xattr.v->x_val_len));
+ ret = PTR_ERR_OR_ZERO(acl);
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
- if (!IS_ERR(acl))
+ if (ret)
+ acl = !bch2_err_matches(ret, ENOENT) ? ERR_PTR(ret) : NULL;
+
+ if (!IS_ERR_OR_NULL(acl))
set_cached_acl(&inode->v, type, acl);
-out:
- if (bch2_err_matches(PTR_ERR_OR_ZERO(acl), BCH_ERR_transaction_restart))
- goto retry;
bch2_trans_iter_exit(trans, &iter);
bch2_trans_put(trans);
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index fd3e175d83..534ba2b02b 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -29,6 +29,8 @@
#include <linux/sched/task.h>
#include <linux/sort.h>
+static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket);
+
/* Persistent alloc info: */
static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
@@ -242,10 +244,10 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
int ret = 0;
- bkey_fsck_err_on(alloc_v4_u64s(a.v) > bkey_val_u64s(k.k), c, err,
+ bkey_fsck_err_on(alloc_v4_u64s_noerror(a.v) > bkey_val_u64s(k.k), c, err,
alloc_v4_val_size_bad,
"bad val size (%u > %zu)",
- alloc_v4_u64s(a.v), bkey_val_u64s(k.k));
+ alloc_v4_u64s_noerror(a.v), bkey_val_u64s(k.k));
bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) &&
BCH_ALLOC_V4_NR_BACKPOINTERS(a.v), c, err,
@@ -530,13 +532,13 @@ int bch2_bucket_gens_init(struct bch_fs *c)
u8 gen = bch2_alloc_to_v4(k, &a)->gen;
unsigned offset;
struct bpos pos = alloc_gens_pos(iter.pos, &offset);
+ int ret2 = 0;
if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) {
- ret = commit_do(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc,
- bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
- if (ret)
- break;
+ ret2 = bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ if (ret2)
+ goto iter_err;
have_bucket_gens_key = false;
}
@@ -547,7 +549,8 @@ int bch2_bucket_gens_init(struct bch_fs *c)
}
g.v.gens[offset] = gen;
- 0;
+iter_err:
+ ret2;
}));
if (have_bucket_gens_key && !ret)
@@ -850,7 +853,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
bucket_journal_seq);
if (ret) {
bch2_fs_fatal_error(c,
- "error setting bucket_needs_journal_commit: %i", ret);
+ "setting bucket_needs_journal_commit: %s", bch2_err_str(ret));
return ret;
}
}
@@ -860,23 +863,28 @@ int bch2_trigger_alloc(struct btree_trans *trans,
*bucket_gen(ca, new.k->p.offset) = new_a->gen;
bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false);
+ percpu_up_read(&c->mark_lock);
+
+#define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; })
+#define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr)
+#define bucket_flushed(a) (!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk)
- if (new_a->data_type == BCH_DATA_free &&
- (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk))
+ if (statechange(a->data_type == BCH_DATA_free) &&
+ bucket_flushed(new_a))
closure_wake_up(&c->freelist_wait);
- if (new_a->data_type == BCH_DATA_need_discard &&
- (!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk))
- bch2_do_discards(c);
+ if (statechange(a->data_type == BCH_DATA_need_discard) &&
+ !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
+ bucket_flushed(new_a))
+ bch2_discard_one_bucket_fast(c, new.k->p);
- if (old_a->data_type != BCH_DATA_cached &&
- new_a->data_type == BCH_DATA_cached &&
+ if (statechange(a->data_type == BCH_DATA_cached) &&
+ !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
bch2_do_invalidates(c);
- if (new_a->data_type == BCH_DATA_need_gc_gens)
+ if (statechange(a->data_type == BCH_DATA_need_gc_gens))
bch2_do_gc_gens(c);
- percpu_up_read(&c->mark_lock);
}
if ((flags & BTREE_TRIGGER_GC) &&
@@ -1045,14 +1053,13 @@ int bch2_check_alloc_key(struct btree_trans *trans,
if (ret)
goto err;
- if (k.k->type != discard_key_type &&
- (c->opts.reconstruct_alloc ||
- fsck_err(c, need_discard_key_wrong,
- "incorrect key in need_discard btree (got %s should be %s)\n"
- " %s",
- bch2_bkey_types[k.k->type],
- bch2_bkey_types[discard_key_type],
- (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
+ if (fsck_err_on(k.k->type != discard_key_type,
+ c, need_discard_key_wrong,
+ "incorrect key in need_discard btree (got %s should be %s)\n"
+ " %s",
+ bch2_bkey_types[k.k->type],
+ bch2_bkey_types[discard_key_type],
+ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
struct bkey_i *update =
bch2_trans_kmalloc(trans, sizeof(*update));
@@ -1076,15 +1083,14 @@ int bch2_check_alloc_key(struct btree_trans *trans,
if (ret)
goto err;
- if (k.k->type != freespace_key_type &&
- (c->opts.reconstruct_alloc ||
- fsck_err(c, freespace_key_wrong,
- "incorrect key in freespace btree (got %s should be %s)\n"
- " %s",
- bch2_bkey_types[k.k->type],
- bch2_bkey_types[freespace_key_type],
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
+ if (fsck_err_on(k.k->type != freespace_key_type,
+ c, freespace_key_wrong,
+ "incorrect key in freespace btree (got %s should be %s)\n"
+ " %s",
+ bch2_bkey_types[k.k->type],
+ bch2_bkey_types[freespace_key_type],
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
struct bkey_i *update =
bch2_trans_kmalloc(trans, sizeof(*update));
@@ -1108,14 +1114,13 @@ int bch2_check_alloc_key(struct btree_trans *trans,
if (ret)
goto err;
- if (a->gen != alloc_gen(k, gens_offset) &&
- (c->opts.reconstruct_alloc ||
- fsck_err(c, bucket_gens_key_wrong,
- "incorrect gen in bucket_gens btree (got %u should be %u)\n"
- " %s",
- alloc_gen(k, gens_offset), a->gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
+ if (fsck_err_on(a->gen != alloc_gen(k, gens_offset),
+ c, bucket_gens_key_wrong,
+ "incorrect gen in bucket_gens btree (got %u should be %u)\n"
+ " %s",
+ alloc_gen(k, gens_offset), a->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
struct bkey_i_bucket_gens *g =
bch2_trans_kmalloc(trans, sizeof(*g));
@@ -1167,14 +1172,13 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
*end = bkey_min(k.k->p, *end);
- if (k.k->type != KEY_TYPE_set &&
- (c->opts.reconstruct_alloc ||
- fsck_err(c, freespace_hole_missing,
- "hole in alloc btree missing in freespace btree\n"
- " device %llu buckets %llu-%llu",
- freespace_iter->pos.inode,
- freespace_iter->pos.offset,
- end->offset))) {
+ if (fsck_err_on(k.k->type != KEY_TYPE_set,
+ c, freespace_hole_missing,
+ "hole in alloc btree missing in freespace btree\n"
+ " device %llu buckets %llu-%llu",
+ freespace_iter->pos.inode,
+ freespace_iter->pos.offset,
+ end->offset)) {
struct bkey_i *update =
bch2_trans_kmalloc(trans, sizeof(*update));
@@ -1604,6 +1608,36 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
return ret;
}
+static int discard_in_flight_add(struct bch_fs *c, struct bpos bucket)
+{
+ int ret;
+
+ mutex_lock(&c->discard_buckets_in_flight_lock);
+ darray_for_each(c->discard_buckets_in_flight, i)
+ if (bkey_eq(*i, bucket)) {
+ ret = -EEXIST;
+ goto out;
+ }
+
+ ret = darray_push(&c->discard_buckets_in_flight, bucket);
+out:
+ mutex_unlock(&c->discard_buckets_in_flight_lock);
+ return ret;
+}
+
+static void discard_in_flight_remove(struct bch_fs *c, struct bpos bucket)
+{
+ mutex_lock(&c->discard_buckets_in_flight_lock);
+ darray_for_each(c->discard_buckets_in_flight, i)
+ if (bkey_eq(*i, bucket)) {
+ darray_remove_item(&c->discard_buckets_in_flight, i);
+ goto found;
+ }
+ BUG();
+found:
+ mutex_unlock(&c->discard_buckets_in_flight_lock);
+}
+
struct discard_buckets_state {
u64 seen;
u64 open;
@@ -1642,6 +1676,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
struct bch_dev *ca;
struct bkey_i_alloc_v4 *a;
struct printbuf buf = PRINTBUF;
+ bool discard_locked = false;
int ret = 0;
ca = bch_dev_bkey_exists(c, pos.inode);
@@ -1678,37 +1713,45 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
if (ret)
goto out;
- if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
- a->v.gen++;
- SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
- goto write;
- }
-
- if (a->v.journal_seq > c->journal.flushed_seq_ondisk) {
- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
- bch2_trans_inconsistent(trans,
- "clearing need_discard but journal_seq %llu > flushed_seq %llu\n"
- "%s",
- a->v.journal_seq,
- c->journal.flushed_seq_ondisk,
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ if (a->v.dirty_sectors) {
+ if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
+ trans, "attempting to discard bucket with dirty data\n%s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
ret = -EIO;
- }
goto out;
}
if (a->v.data_type != BCH_DATA_need_discard) {
- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
- bch2_trans_inconsistent(trans,
- "bucket incorrectly set in need_discard btree\n"
- "%s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- ret = -EIO;
+ if (data_type_is_empty(a->v.data_type) &&
+ BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
+ a->v.gen++;
+ SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
+ goto write;
}
+ if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
+ trans, "bucket incorrectly set in need_discard btree\n"
+ "%s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ ret = -EIO;
goto out;
}
+ if (a->v.journal_seq > c->journal.flushed_seq_ondisk) {
+ if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
+ trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n%s",
+ a->v.journal_seq,
+ c->journal.flushed_seq_ondisk,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ ret = -EIO;
+ goto out;
+ }
+
+ if (discard_in_flight_add(c, SPOS(iter.pos.inode, iter.pos.offset, true)))
+ goto out;
+
+ discard_locked = true;
+
if (!bkey_eq(*discard_pos_done, iter.pos) &&
ca->mi.discard && !c->opts.nochanges) {
/*
@@ -1740,6 +1783,8 @@ write:
count_event(c, bucket_discard);
s->discarded++;
out:
+ if (discard_locked)
+ discard_in_flight_remove(c, iter.pos);
s->seen++;
bch2_trans_iter_exit(trans, &iter);
percpu_ref_put(&ca->io_ref);
@@ -1779,6 +1824,94 @@ void bch2_do_discards(struct bch_fs *c)
bch2_write_ref_put(c, BCH_WRITE_REF_discard);
}
+static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket)
+{
+ struct btree_iter iter;
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_INTENT);
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
+ int ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, k);
+ ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ goto err;
+
+ BUG_ON(a->v.dirty_sectors);
+ SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
+ a->v.data_type = alloc_data_type(a->v, a->v.data_type);
+
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static void bch2_do_discards_fast_work(struct work_struct *work)
+{
+ struct bch_fs *c = container_of(work, struct bch_fs, discard_fast_work);
+
+ while (1) {
+ bool got_bucket = false;
+ struct bpos bucket;
+ struct bch_dev *ca;
+
+ mutex_lock(&c->discard_buckets_in_flight_lock);
+ darray_for_each(c->discard_buckets_in_flight, i) {
+ if (i->snapshot)
+ continue;
+
+ ca = bch_dev_bkey_exists(c, i->inode);
+
+ if (!percpu_ref_tryget(&ca->io_ref)) {
+ darray_remove_item(&c->discard_buckets_in_flight, i);
+ continue;
+ }
+
+ got_bucket = true;
+ bucket = *i;
+ i->snapshot = true;
+ break;
+ }
+ mutex_unlock(&c->discard_buckets_in_flight_lock);
+
+ if (!got_bucket)
+ break;
+
+ if (ca->mi.discard && !c->opts.nochanges)
+ blkdev_issue_discard(ca->disk_sb.bdev,
+ bucket.offset * ca->mi.bucket_size,
+ ca->mi.bucket_size,
+ GFP_KERNEL);
+
+ int ret = bch2_trans_do(c, NULL, NULL,
+ BCH_WATERMARK_btree|
+ BCH_TRANS_COMMIT_no_enospc,
+ bch2_clear_bucket_needs_discard(trans, bucket));
+ bch_err_fn(c, ret);
+
+ percpu_ref_put(&ca->io_ref);
+ discard_in_flight_remove(c, bucket);
+
+ if (ret)
+ break;
+ }
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
+}
+
+static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode);
+
+ if (!percpu_ref_is_dying(&ca->io_ref) &&
+ !discard_in_flight_add(c, bucket) &&
+ bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast) &&
+ !queue_work(c->write_ref_wq, &c->discard_fast_work))
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
+}
+
static int invalidate_one_bucket(struct btree_trans *trans,
struct btree_iter *lru_iter,
struct bkey_s_c lru_k,
@@ -1813,6 +1946,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
goto out;
BUG_ON(a->v.data_type != BCH_DATA_cached);
+ BUG_ON(a->v.dirty_sectors);
if (!a->v.cached_sectors)
bch_err(c, "invalidating empty bucket, confused");
@@ -2210,9 +2344,16 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
set_bit(ca->dev_idx, c->rw_devs[i].d);
}
+void bch2_fs_allocator_background_exit(struct bch_fs *c)
+{
+ darray_exit(&c->discard_buckets_in_flight);
+}
+
void bch2_fs_allocator_background_init(struct bch_fs *c)
{
spin_lock_init(&c->freelist_lock);
+ mutex_init(&c->discard_buckets_in_flight_lock);
INIT_WORK(&c->discard_work, bch2_do_discards_work);
+ INIT_WORK(&c->discard_fast_work, bch2_do_discards_fast_work);
INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work);
}
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index e7f7e842ee..2790e51638 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -126,13 +126,17 @@ static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_
return pos;
}
-static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a)
+static inline unsigned alloc_v4_u64s_noerror(const struct bch_alloc_v4 *a)
{
- unsigned ret = (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?:
+ return (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?:
BCH_ALLOC_V4_U64s_V0) +
BCH_ALLOC_V4_NR_BACKPOINTERS(a) *
(sizeof(struct bch_backpointer) / sizeof(u64));
+}
+static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a)
+{
+ unsigned ret = alloc_v4_u64s_noerror(a);
BUG_ON(ret > U8_MAX - BKEY_U64s);
return ret;
}
@@ -269,6 +273,7 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *);
void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
+void bch2_fs_allocator_background_exit(struct bch_fs *);
void bch2_fs_allocator_background_init(struct bch_fs *);
#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 633d3223b3..a1fc30adf9 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -188,8 +188,10 @@ long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
{
switch (watermark) {
- case BCH_WATERMARK_reclaim:
+ case BCH_WATERMARK_interior_updates:
return 0;
+ case BCH_WATERMARK_reclaim:
+ return OPEN_BUCKETS_COUNT / 6;
case BCH_WATERMARK_btree:
case BCH_WATERMARK_btree_copygc:
return OPEN_BUCKETS_COUNT / 4;
@@ -236,8 +238,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
if (cl)
closure_wait(&c->open_buckets_wait, cl);
- track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket],
- &c->blocked_allocate_open_bucket, true);
+ track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true);
spin_unlock(&c->freelist_lock);
return ERR_PTR(-BCH_ERR_open_buckets_empty);
}
@@ -263,11 +264,8 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
ca->nr_open_buckets++;
bch2_open_bucket_hash_add(c, ob);
- track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket],
- &c->blocked_allocate_open_bucket, false);
-
- track_event_change(&c->times[BCH_TIME_blocked_allocate],
- &c->blocked_allocate, false);
+ track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false);
+ track_event_change(&c->times[BCH_TIME_blocked_allocate], false);
spin_unlock(&c->freelist_lock);
return ob;
@@ -555,8 +553,7 @@ again:
goto again;
}
- track_event_change(&c->times[BCH_TIME_blocked_allocate],
- &c->blocked_allocate, true);
+ track_event_change(&c->times[BCH_TIME_blocked_allocate], true);
ob = ERR_PTR(-BCH_ERR_freelist_empty);
goto err;
@@ -1361,15 +1358,17 @@ retry:
/* Don't retry from all devices if we're out of open buckets: */
if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) {
- int ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
+ int ret2 = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
target, erasure_code,
nr_replicas, &nr_effective,
&have_cache, watermark,
flags, cl);
- if (!ret ||
- bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
- bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
+ if (!ret2 ||
+ bch2_err_matches(ret2, BCH_ERR_transaction_restart) ||
+ bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) {
+ ret = ret2;
goto alloc_done;
+ }
}
/*
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
index b91b7a4610..c2226e947c 100644
--- a/fs/bcachefs/alloc_types.h
+++ b/fs/bcachefs/alloc_types.h
@@ -22,7 +22,8 @@ struct bucket_alloc_state {
x(copygc) \
x(btree) \
x(btree_copygc) \
- x(reclaim)
+ x(reclaim) \
+ x(interior_updates)
enum bch_watermark {
#define x(name) BCH_WATERMARK_##name,
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index 569b97904d..af7a71de1b 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -8,6 +8,7 @@
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
+#include "checksum.h"
#include "error.h"
#include <linux/mm.h>
@@ -29,8 +30,7 @@ static bool extent_matches_bp(struct bch_fs *c,
if (p.ptr.cached)
continue;
- bch2_extent_ptr_to_bp(c, btree_id, level, k, p,
- &bucket2, &bp2);
+ bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bucket2, &bp2);
if (bpos_eq(bucket, bucket2) &&
!memcmp(&bp, &bp2, sizeof(bp)))
return true;
@@ -44,13 +44,20 @@ int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k,
struct printbuf *err)
{
struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
+
+ /* these will be caught by fsck */
+ if (!bch2_dev_exists2(c, bp.k->p.inode))
+ return 0;
+
+ struct bch_dev *ca = bch_dev_bkey_exists(c, bp.k->p.inode);
struct bpos bucket = bp_pos_to_bucket(c, bp.k->p);
int ret = 0;
- bkey_fsck_err_on(!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset)),
+ bkey_fsck_err_on((bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) >= ca->mi.bucket_size ||
+ !bpos_eq(bp.k->p, bucket_pos_to_bp_noerror(ca, bucket, bp.v->bucket_offset)),
c, err,
- backpointer_pos_wrong,
- "backpointer at wrong pos");
+ backpointer_bucket_offset_wrong,
+ "backpointer bucket_offset wrong");
fsck_err:
return ret;
}
@@ -131,8 +138,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans,
printbuf_exit(&buf);
if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) {
- bch2_inconsistent_error(c);
- return -EIO;
+ return bch2_inconsistent_error(c) ? BCH_ERR_erofs_unfixed_errors : 0;
} else {
return 0;
}
@@ -379,7 +385,7 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_
backpointer_to_missing_alloc,
"backpointer for nonexistent alloc key: %llu:%llu:0\n%s",
alloc_iter.pos.inode, alloc_iter.pos.offset,
- (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
ret = bch2_btree_delete_at(trans, bp_iter, 0);
goto out;
}
@@ -415,6 +421,84 @@ struct extents_to_bp_state {
struct bkey_buf last_flushed;
};
+static int drop_dev_and_update(struct btree_trans *trans, enum btree_id btree,
+ struct bkey_s_c extent, unsigned dev)
+{
+ struct bkey_i *n = bch2_bkey_make_mut_noupdate(trans, extent);
+ int ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ bch2_bkey_drop_device(bkey_i_to_s(n), dev);
+ return bch2_btree_insert_trans(trans, btree, n, 0);
+}
+
+static int check_extent_checksum(struct btree_trans *trans,
+ enum btree_id btree, struct bkey_s_c extent,
+ enum btree_id o_btree, struct bkey_s_c extent2, unsigned dev)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(extent);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ struct printbuf buf = PRINTBUF;
+ void *data_buf = NULL;
+ struct bio *bio = NULL;
+ size_t bytes;
+ int ret = 0;
+
+ if (bkey_is_btree_ptr(extent.k))
+ return false;
+
+ bkey_for_each_ptr_decode(extent.k, ptrs, p, entry)
+ if (p.ptr.dev == dev)
+ goto found;
+ BUG();
+found:
+ if (!p.crc.csum_type)
+ return false;
+
+ bytes = p.crc.compressed_size << 9;
+
+ struct bch_dev *ca = bch_dev_bkey_exists(c, dev);
+ if (!bch2_dev_get_ioref(ca, READ))
+ return false;
+
+ data_buf = kvmalloc(bytes, GFP_KERNEL);
+ if (!data_buf) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ bio = bio_alloc(ca->disk_sb.bdev, buf_pages(data_buf, bytes), REQ_OP_READ, GFP_KERNEL);
+ bio->bi_iter.bi_sector = p.ptr.offset;
+ bch2_bio_map(bio, data_buf, bytes);
+ ret = submit_bio_wait(bio);
+ if (ret)
+ goto err;
+
+ prt_str(&buf, "extents pointing to same space, but first extent checksum bad:");
+ prt_printf(&buf, "\n %s ", bch2_btree_id_str(btree));
+ bch2_bkey_val_to_text(&buf, c, extent);
+ prt_printf(&buf, "\n %s ", bch2_btree_id_str(o_btree));
+ bch2_bkey_val_to_text(&buf, c, extent2);
+
+ struct nonce nonce = extent_nonce(extent.k->version, p.crc);
+ struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes);
+ if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum),
+ c, dup_backpointer_to_bad_csum_extent,
+ "%s", buf.buf))
+ ret = drop_dev_and_update(trans, btree, extent, dev) ?: 1;
+fsck_err:
+err:
+ if (bio)
+ bio_put(bio);
+ kvfree(data_buf);
+ percpu_ref_put(&ca->io_ref);
+ printbuf_exit(&buf);
+ return ret;
+}
+
static int check_bp_exists(struct btree_trans *trans,
struct extents_to_bp_state *s,
struct bpos bucket,
@@ -422,7 +506,8 @@ static int check_bp_exists(struct btree_trans *trans,
struct bkey_s_c orig_k)
{
struct bch_fs *c = trans->c;
- struct btree_iter bp_iter = { NULL };
+ struct btree_iter bp_iter = {};
+ struct btree_iter other_extent_iter = {};
struct printbuf buf = PRINTBUF;
struct bkey_s_c bp_k;
struct bkey_buf tmp;
@@ -430,13 +515,19 @@ static int check_bp_exists(struct btree_trans *trans,
bch2_bkey_buf_init(&tmp);
+ if (!bch2_dev_bucket_exists(c, bucket)) {
+ prt_str(&buf, "extent for nonexistent device:bucket ");
+ bch2_bpos_to_text(&buf, bucket);
+ prt_str(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+ bch_err(c, "%s", buf.buf);
+ return -BCH_ERR_fsck_repair_unimplemented;
+ }
+
if (bpos_lt(bucket, s->bucket_start) ||
bpos_gt(bucket, s->bucket_end))
return 0;
- if (!bch2_dev_bucket_exists(c, bucket))
- goto missing;
-
bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers,
bucket_pos_to_bp(c, bucket, bp.bucket_offset),
0);
@@ -462,24 +553,96 @@ static int check_bp_exists(struct btree_trans *trans,
ret = -BCH_ERR_transaction_restart_write_buffer_flush;
goto out;
}
- goto missing;
+
+ goto check_existing_bp;
}
out:
err:
fsck_err:
+ bch2_trans_iter_exit(trans, &other_extent_iter);
bch2_trans_iter_exit(trans, &bp_iter);
bch2_bkey_buf_exit(&tmp, c);
printbuf_exit(&buf);
return ret;
+check_existing_bp:
+ /* Do we have a backpointer for a different extent? */
+ if (bp_k.k->type != KEY_TYPE_backpointer)
+ goto missing;
+
+ struct bch_backpointer other_bp = *bkey_s_c_to_backpointer(bp_k).v;
+
+ struct bkey_s_c other_extent =
+ bch2_backpointer_get_key(trans, &other_extent_iter, bp_k.k->p, other_bp, 0);
+ ret = bkey_err(other_extent);
+ if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
+ ret = 0;
+ if (ret)
+ goto err;
+
+ if (!other_extent.k)
+ goto missing;
+
+ if (bch2_extents_match(orig_k, other_extent)) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n ");
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+ prt_str(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, other_extent);
+ bch_err(c, "%s", buf.buf);
+
+ if (other_extent.k->size <= orig_k.k->size) {
+ ret = drop_dev_and_update(trans, other_bp.btree_id, other_extent, bucket.inode);
+ if (ret)
+ goto err;
+ goto out;
+ } else {
+ ret = drop_dev_and_update(trans, bp.btree_id, orig_k, bucket.inode);
+ if (ret)
+ goto err;
+ goto missing;
+ }
+ }
+
+ ret = check_extent_checksum(trans, other_bp.btree_id, other_extent, bp.btree_id, orig_k, bucket.inode);
+ if (ret < 0)
+ goto err;
+ if (ret) {
+ ret = 0;
+ goto missing;
+ }
+
+ ret = check_extent_checksum(trans, bp.btree_id, orig_k, other_bp.btree_id, other_extent, bucket.inode);
+ if (ret < 0)
+ goto err;
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+
+ printbuf_reset(&buf);
+ prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n ", bucket.inode);
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+ prt_str(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, other_extent);
+ bch_err(c, "%s", buf.buf);
+ ret = -BCH_ERR_fsck_repair_unimplemented;
+ goto err;
missing:
+ printbuf_reset(&buf);
prt_printf(&buf, "missing backpointer for btree=%s l=%u ",
bch2_btree_id_str(bp.btree_id), bp.level);
bch2_bkey_val_to_text(&buf, c, orig_k);
- prt_printf(&buf, "\nbp pos ");
- bch2_bpos_to_text(&buf, bp_iter.pos);
+ prt_printf(&buf, "\n got: ");
+ bch2_bkey_val_to_text(&buf, c, bp_k);
+
+ struct bkey_i_backpointer n_bp_k;
+ bkey_backpointer_init(&n_bp_k.k_i);
+ n_bp_k.k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset);
+ n_bp_k.v = bp;
+ prt_printf(&buf, "\n want: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&n_bp_k.k_i));
- if (c->opts.reconstruct_alloc ||
- fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf))
+ if (fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf))
ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true);
goto out;
@@ -504,8 +667,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
if (p.ptr.cached)
continue;
- bch2_extent_ptr_to_bp(c, btree, level,
- k, p, &bucket_pos, &bp);
+ bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bucket_pos, &bp);
ret = check_bp_exists(trans, s, bucket_pos, bp, k);
if (ret)
@@ -555,60 +717,61 @@ static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp)
};
}
-static size_t btree_nodes_fit_in_ram(struct bch_fs *c)
+static u64 mem_may_pin_bytes(struct bch_fs *c)
{
struct sysinfo i;
- u64 mem_bytes;
-
si_meminfo(&i);
- mem_bytes = i.totalram * i.mem_unit;
- return div_u64(mem_bytes >> 1, c->opts.btree_node_size);
+
+ u64 mem_bytes = i.totalram * i.mem_unit;
+ return div_u64(mem_bytes * c->opts.fsck_memory_usage_percent, 100);
+}
+
+static size_t btree_nodes_fit_in_ram(struct bch_fs *c)
+{
+ return div_u64(mem_may_pin_bytes(c), c->opts.btree_node_size);
}
static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
- unsigned btree_leaf_mask,
- unsigned btree_interior_mask,
+ u64 btree_leaf_mask,
+ u64 btree_interior_mask,
struct bbpos start, struct bbpos *end)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- size_t btree_nodes = btree_nodes_fit_in_ram(trans->c);
- enum btree_id btree;
+ struct bch_fs *c = trans->c;
+ s64 mem_may_pin = mem_may_pin_bytes(c);
int ret = 0;
- for (btree = start.btree; btree < BTREE_ID_NR && !ret; btree++) {
- unsigned depth = ((1U << btree) & btree_leaf_mask) ? 1 : 2;
+ btree_interior_mask |= btree_leaf_mask;
+
+ c->btree_cache.pinned_nodes_leaf_mask = btree_leaf_mask;
+ c->btree_cache.pinned_nodes_interior_mask = btree_interior_mask;
+ c->btree_cache.pinned_nodes_start = start;
+ c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX;
+
+ for (enum btree_id btree = start.btree;
+ btree < BTREE_ID_NR && !ret;
+ btree++) {
+ unsigned depth = ((1U << btree) & btree_leaf_mask) ? 0 : 1;
+ struct btree_iter iter;
+ struct btree *b;
if (!((1U << btree) & btree_leaf_mask) &&
!((1U << btree) & btree_interior_mask))
continue;
- bch2_trans_node_iter_init(trans, &iter, btree,
- btree == start.btree ? start.pos : POS_MIN,
- 0, depth, 0);
- /*
- * for_each_btree_key_contineu() doesn't check the return value
- * from bch2_btree_iter_advance(), which is needed when
- * iterating over interior nodes where we'll see keys at
- * SPOS_MAX:
- */
- do {
- k = __bch2_btree_iter_peek_and_restart(trans, &iter, 0);
- ret = bkey_err(k);
- if (!k.k || ret)
- break;
-
- --btree_nodes;
- if (!btree_nodes) {
- *end = BBPOS(btree, k.k->p);
+ __for_each_btree_node(trans, iter, btree,
+ btree == start.btree ? start.pos : POS_MIN,
+ 0, depth, BTREE_ITER_PREFETCH, b, ret) {
+ mem_may_pin -= btree_buf_bytes(b);
+ if (mem_may_pin <= 0) {
+ c->btree_cache.pinned_nodes_end = *end =
+ BBPOS(btree, b->key.k.p);
bch2_trans_iter_exit(trans, &iter);
return 0;
}
- } while (bch2_btree_iter_advance(&iter));
+ }
bch2_trans_iter_exit(trans, &iter);
}
- *end = BBPOS_MAX;
return ret;
}
@@ -666,62 +829,6 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
return 0;
}
-static struct bpos bucket_pos_to_bp_safe(const struct bch_fs *c,
- struct bpos bucket)
-{
- return bch2_dev_exists2(c, bucket.inode)
- ? bucket_pos_to_bp(c, bucket, 0)
- : bucket;
-}
-
-static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans,
- struct bpos start, struct bpos *end)
-{
- struct btree_iter alloc_iter;
- struct btree_iter bp_iter;
- struct bkey_s_c alloc_k, bp_k;
- size_t btree_nodes = btree_nodes_fit_in_ram(trans->c);
- bool alloc_end = false, bp_end = false;
- int ret = 0;
-
- bch2_trans_node_iter_init(trans, &alloc_iter, BTREE_ID_alloc,
- start, 0, 1, 0);
- bch2_trans_node_iter_init(trans, &bp_iter, BTREE_ID_backpointers,
- bucket_pos_to_bp_safe(trans->c, start), 0, 1, 0);
- while (1) {
- alloc_k = !alloc_end
- ? __bch2_btree_iter_peek_and_restart(trans, &alloc_iter, 0)
- : bkey_s_c_null;
- bp_k = !bp_end
- ? __bch2_btree_iter_peek_and_restart(trans, &bp_iter, 0)
- : bkey_s_c_null;
-
- ret = bkey_err(alloc_k) ?: bkey_err(bp_k);
- if ((!alloc_k.k && !bp_k.k) || ret) {
- *end = SPOS_MAX;
- break;
- }
-
- --btree_nodes;
- if (!btree_nodes) {
- *end = alloc_k.k ? alloc_k.k->p : SPOS_MAX;
- break;
- }
-
- if (bpos_lt(alloc_iter.pos, SPOS_MAX) &&
- bpos_lt(bucket_pos_to_bp_safe(trans->c, alloc_iter.pos), bp_iter.pos)) {
- if (!bch2_btree_iter_advance(&alloc_iter))
- alloc_end = true;
- } else {
- if (!bch2_btree_iter_advance(&bp_iter))
- bp_end = true;
- }
- }
- bch2_trans_iter_exit(trans, &bp_iter);
- bch2_trans_iter_exit(trans, &alloc_iter);
- return ret;
-}
-
int bch2_check_extents_to_backpointers(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
@@ -732,10 +839,16 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
bkey_init(&s.last_flushed.k->k);
while (1) {
- ret = bch2_get_alloc_in_memory_pos(trans, s.bucket_start, &s.bucket_end);
+ struct bbpos end;
+ ret = bch2_get_btree_in_memory_pos(trans,
+ BIT_ULL(BTREE_ID_backpointers),
+ BIT_ULL(BTREE_ID_backpointers),
+ BBPOS(BTREE_ID_backpointers, s.bucket_start), &end);
if (ret)
break;
+ s.bucket_end = end.pos;
+
if ( bpos_eq(s.bucket_start, POS_MIN) &&
!bpos_eq(s.bucket_end, SPOS_MAX))
bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass",
@@ -763,6 +876,9 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
bch2_trans_put(trans);
bch2_bkey_buf_exit(&s.last_flushed, c);
+ c->btree_cache.pinned_nodes_leaf_mask = 0;
+ c->btree_cache.pinned_nodes_interior_mask = 0;
+
bch_err_fn(c, ret);
return ret;
}
@@ -868,6 +984,9 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c)
}
bch2_trans_put(trans);
+ c->btree_cache.pinned_nodes_leaf_mask = 0;
+ c->btree_cache.pinned_nodes_interior_mask = 0;
+
bch_err_fn(c, ret);
return ret;
}
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
index 327365a9fe..c1b274eadd 100644
--- a/fs/bcachefs/backpointers.h
+++ b/fs/bcachefs/backpointers.h
@@ -45,6 +45,15 @@ static inline struct bpos bp_pos_to_bucket(const struct bch_fs *c,
return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector));
}
+static inline struct bpos bucket_pos_to_bp_noerror(const struct bch_dev *ca,
+ struct bpos bucket,
+ u64 bucket_offset)
+{
+ return POS(bucket.inode,
+ (bucket_to_sector(ca, bucket.offset) <<
+ MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset);
+}
+
/*
* Convert from pos in alloc btree + bucket offset to pos in backpointer btree:
*/
@@ -53,14 +62,8 @@ static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c,
u64 bucket_offset)
{
struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode);
- struct bpos ret;
-
- ret = POS(bucket.inode,
- (bucket_to_sector(ca, bucket.offset) <<
- MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset);
-
+ struct bpos ret = bucket_pos_to_bp_noerror(ca, bucket, bucket_offset);
EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(c, ret)));
-
return ret;
}
@@ -90,20 +93,40 @@ static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans,
return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k.k_i);
}
-static inline enum bch_data_type bkey_ptr_data_type(enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, struct extent_ptr_decoded p)
+static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k,
+ struct extent_ptr_decoded p,
+ const union bch_extent_entry *entry)
{
- return level ? BCH_DATA_btree :
- p.has_ec ? BCH_DATA_stripe :
- BCH_DATA_user;
+ switch (k.k->type) {
+ case KEY_TYPE_btree_ptr:
+ case KEY_TYPE_btree_ptr_v2:
+ return BCH_DATA_btree;
+ case KEY_TYPE_extent:
+ case KEY_TYPE_reflink_v:
+ return p.has_ec ? BCH_DATA_stripe : BCH_DATA_user;
+ case KEY_TYPE_stripe: {
+ const struct bch_extent_ptr *ptr = &entry->ptr;
+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+
+ BUG_ON(ptr < s.v->ptrs ||
+ ptr >= s.v->ptrs + s.v->nr_blocks);
+
+ return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant
+ ? BCH_DATA_parity
+ : BCH_DATA_user;
+ }
+ default:
+ BUG();
+ }
}
static inline void bch2_extent_ptr_to_bp(struct bch_fs *c,
enum btree_id btree_id, unsigned level,
struct bkey_s_c k, struct extent_ptr_decoded p,
+ const union bch_extent_entry *entry,
struct bpos *bucket_pos, struct bch_backpointer *bp)
{
- enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p);
+ enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry);
s64 sectors = level ? btree_sectors(c) : k.k->size;
u32 bucket_offset;
diff --git a/fs/bcachefs/bbpos_types.h b/fs/bcachefs/bbpos_types.h
index 5198e94cf3..f63893344f 100644
--- a/fs/bcachefs/bbpos_types.h
+++ b/fs/bcachefs/bbpos_types.h
@@ -13,6 +13,6 @@ static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos)
}
#define BBPOS_MIN BBPOS(0, POS_MIN)
-#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX)
+#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, SPOS_MAX)
#endif /* _BCACHEFS_BBPOS_TYPES_H */
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 69d0d60d50..91c3c1fef2 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -209,9 +209,10 @@
#include "fifo.h"
#include "nocow_locking_types.h"
#include "opts.h"
-#include "recovery_types.h"
+#include "recovery_passes_types.h"
#include "sb-errors_types.h"
#include "seqmutex.h"
+#include "time_stats.h"
#include "util.h"
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -266,6 +267,9 @@ do { \
#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n")
__printf(2, 3)
+void bch2_print_opts(struct bch_opts *, const char *, ...);
+
+__printf(2, 3)
void __bch2_print(struct bch_fs *c, const char *fmt, ...);
#define maybe_dev_to_fs(_c) _Generic((_c), \
@@ -452,6 +456,7 @@ enum bch_time_stats {
#include "alloc_types.h"
#include "btree_types.h"
+#include "btree_node_scan_types.h"
#include "btree_write_buffer_types.h"
#include "buckets_types.h"
#include "buckets_waiting_for_journal_types.h"
@@ -504,6 +509,7 @@ enum gc_phase {
GC_PHASE_BTREE_deleted_inodes,
GC_PHASE_BTREE_logged_ops,
GC_PHASE_BTREE_rebalance_work,
+ GC_PHASE_BTREE_subvolume_children,
GC_PHASE_PENDING_DELETE,
};
@@ -593,7 +599,7 @@ struct bch_dev {
/* The rest of this all shows up in sysfs */
atomic64_t cur_latency[2];
- struct bch2_time_stats io_latency[2];
+ struct bch2_time_stats_quantiles io_latency[2];
#define CONGESTED_MAX 1024
atomic_t congested;
@@ -609,6 +615,7 @@ struct bch_dev {
*/
#define BCH_FS_FLAGS() \
+ x(new_fs) \
x(started) \
x(may_go_rw) \
x(rw) \
@@ -663,6 +670,8 @@ struct journal_seq_blacklist_table {
};
struct journal_keys {
+ /* must match layout in darray_types.h */
+ size_t nr, size;
struct journal_key {
u64 journal_seq;
u32 journal_offset;
@@ -671,15 +680,13 @@ struct journal_keys {
bool allocated;
bool overwritten;
struct bkey_i *k;
- } *d;
+ } *data;
/*
* Gap buffer: instead of all the empty space in the array being at the
* end of the buffer - from @nr to @size - the empty space is at @gap.
* This means that sequential insertions are O(n) instead of O(n^2).
*/
size_t gap;
- size_t nr;
- size_t size;
atomic_t ref;
bool initial_ref_held;
};
@@ -702,7 +709,10 @@ struct btree_trans_buf {
x(stripe_delete) \
x(reflink) \
x(fallocate) \
+ x(fsync) \
+ x(dio_write) \
x(discard) \
+ x(discard_fast) \
x(invalidate) \
x(delete_dead_snapshots) \
x(snapshot_delete_pagecache) \
@@ -790,6 +800,7 @@ struct bch_fs {
u64 features;
u64 compat;
unsigned long errors_silent[BITS_TO_LONGS(BCH_SB_ERR_MAX)];
+ u64 btrees_lost_data;
} sb;
@@ -804,7 +815,6 @@ struct bch_fs {
/* snapshot.c: */
struct snapshot_table __rcu *snapshots;
- size_t snapshot_table_size;
struct mutex snapshot_table_lock;
struct rw_semaphore snapshot_create_lock;
@@ -843,6 +853,8 @@ struct bch_fs {
struct workqueue_struct *btree_interior_update_worker;
struct work_struct btree_interior_update_work;
+ struct workqueue_struct *btree_node_rewrite_worker;
+
struct list_head pending_node_rewrites;
struct mutex pending_node_rewrites_lock;
@@ -919,8 +931,6 @@ struct bch_fs {
/* ALLOCATOR */
spinlock_t freelist_lock;
struct closure_waitlist freelist_wait;
- u64 blocked_allocate;
- u64 blocked_allocate_open_bucket;
open_bucket_idx_t open_buckets_freelist;
open_bucket_idx_t open_buckets_nr_free;
@@ -940,8 +950,11 @@ struct bch_fs {
unsigned write_points_nr;
struct buckets_waiting_for_journal buckets_waiting_for_journal;
- struct work_struct discard_work;
struct work_struct invalidate_work;
+ struct work_struct discard_work;
+ struct mutex discard_buckets_in_flight_lock;
+ DARRAY(struct bpos) discard_buckets_in_flight;
+ struct work_struct discard_fast_work;
/* GARBAGE COLLECTION */
struct task_struct *gc_thread;
@@ -1095,6 +1108,8 @@ struct bch_fs {
struct journal_keys journal_keys;
struct list_head journal_iters;
+ struct find_btree_nodes found_btree_nodes;
+
u64 last_bucket_seq_cleanup;
u64 counters_on_mount[BCH_COUNTER_NR];
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 0668b682a2..2e8b1a489c 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -189,7 +189,11 @@ struct bversion {
__u32 hi;
__u64 lo;
#endif
-} __packed __aligned(4);
+} __packed
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+__aligned(4)
+#endif
+;
struct bkey {
/* Size of combined key and value, in u64s */
@@ -222,7 +226,36 @@ struct bkey {
__u8 pad[1];
#endif
-} __packed __aligned(8);
+} __packed
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+/*
+ * The big-endian version of bkey can't be compiled by rustc with the "aligned"
+ * attr since it doesn't allow types to have both "packed" and "aligned" attrs.
+ * So for Rust compatibility, don't include this. It can be included in the LE
+ * version because the "packed" attr is redundant in that case.
+ *
+ * History: (quoting Kent)
+ *
+ * Specifically, when i was designing bkey, I wanted the header to be no
+ * bigger than necessary so that bkey_packed could use the rest. That means that
+ * decently offten extent keys will fit into only 8 bytes, instead of spilling over
+ * to 16.
+ *
+ * But packed_bkey treats the part after the header - the packed section -
+ * as a single multi word, variable length integer. And bkey, the unpacked
+ * version, is just a special case version of a bkey_packed; all the packed
+ * bkey code will work on keys in any packed format, the in-memory
+ * representation of an unpacked key also is just one type of packed key...
+ *
+ * So that constrains the key part of a bkig endian bkey to start right
+ * after the header.
+ *
+ * If we ever do a bkey_v2 and need to expand the hedaer by another byte for
+ * some reason - that will clean up this wart.
+ */
+__aligned(8)
+#endif
+;
struct bkey_packed {
__u64 _data[0];
@@ -545,7 +578,8 @@ struct bch_member {
__le64 nbuckets; /* device size */
__le16 first_bucket; /* index of first bucket used */
__le16 bucket_size; /* sectors */
- __le32 pad;
+ __u8 btree_bitmap_shift;
+ __u8 pad[3];
__le64 last_mount; /* time_t */
__le64 flags;
@@ -554,8 +588,15 @@ struct bch_member {
__le64 errors_at_reset[BCH_MEMBER_ERROR_NR];
__le64 errors_reset_time;
__le64 seq;
+ __le64 btree_allocated_bitmap;
};
+/*
+ * This limit comes from the bucket_gens array - it's a single allocation, and
+ * kernel allocation are limited to INT_MAX
+ */
+#define BCH_MEMBER_NBUCKETS_MAX (INT_MAX - 64)
+
#define BCH_MEMBER_V1_BYTES 56
LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4)
@@ -785,6 +826,7 @@ struct bch_sb_field_ext {
struct bch_sb_field field;
__le64 recovery_passes_required[2];
__le64 errors_silent[8];
+ __le64 btrees_lost_data;
};
struct bch_sb_field_downgrade_entry {
@@ -840,7 +882,10 @@ struct bch_sb_field_downgrade {
x(snapshot_skiplists, BCH_VERSION(1, 1)) \
x(deleted_inodes, BCH_VERSION(1, 2)) \
x(rebalance_work, BCH_VERSION(1, 3)) \
- x(member_seq, BCH_VERSION(1, 4))
+ x(member_seq, BCH_VERSION(1, 4)) \
+ x(subvolume_fs_parent, BCH_VERSION(1, 5)) \
+ x(btree_subvolume_children, BCH_VERSION(1, 6)) \
+ x(mi_btree_bitmap, BCH_VERSION(1, 7))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
@@ -858,6 +903,8 @@ unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_re
#define BCH_SB_SECTOR 8
#define BCH_SB_MEMBERS_MAX 64 /* XXX kill */
+#define BCH_SB_LAYOUT_SIZE_BITS_MAX 16 /* 32 MB */
+
struct bch_sb_layout {
__uuid_t magic; /* bcachefs superblock UUID */
__u8 layout_type;
@@ -1275,9 +1322,10 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
x(dev_usage, 8) \
x(log, 9) \
x(overwrite, 10) \
- x(write_buffer_keys, 11)
+ x(write_buffer_keys, 11) \
+ x(datetime, 12)
-enum {
+enum bch_jset_entry_type {
#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
BCH_JSET_ENTRY_TYPES()
#undef x
@@ -1323,7 +1371,7 @@ struct jset_entry_blacklist_v2 {
x(inodes, 1) \
x(key_version, 2)
-enum {
+enum bch_fs_usage_type {
#define x(f, nr) BCH_FS_USAGE_##f = nr,
BCH_FS_USAGE_TYPES()
#undef x
@@ -1376,6 +1424,11 @@ struct jset_entry_log {
u8 d[];
} __packed __aligned(8);
+struct jset_entry_datetime {
+ struct jset_entry entry;
+ __le64 seconds;
+} __packed __aligned(8);
+
/*
* On disk format for a journal entry:
* seq is monotonically increasing; every journal entry has its own unique
@@ -1459,7 +1512,8 @@ enum btree_id_flags {
BIT_ULL(KEY_TYPE_stripe)) \
x(reflink, 7, BTREE_ID_EXTENTS|BTREE_ID_DATA, \
BIT_ULL(KEY_TYPE_reflink_v)| \
- BIT_ULL(KEY_TYPE_indirect_inline_data)) \
+ BIT_ULL(KEY_TYPE_indirect_inline_data)| \
+ BIT_ULL(KEY_TYPE_error)) \
x(subvolumes, 8, 0, \
BIT_ULL(KEY_TYPE_subvolume)) \
x(snapshots, 9, 0, \
@@ -1482,7 +1536,9 @@ enum btree_id_flags {
BIT_ULL(KEY_TYPE_logged_op_truncate)| \
BIT_ULL(KEY_TYPE_logged_op_finsert)) \
x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \
- BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie))
+ BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \
+ x(subvolume_children, 19, 0, \
+ BIT_ULL(KEY_TYPE_set))
enum btree_id {
#define x(name, nr, ...) BTREE_ID_##name = nr,
@@ -1491,6 +1547,20 @@ enum btree_id {
BTREE_ID_NR
};
+static inline bool btree_id_is_alloc(enum btree_id id)
+{
+ switch (id) {
+ case BTREE_ID_alloc:
+ case BTREE_ID_backpointers:
+ case BTREE_ID_need_discard:
+ case BTREE_ID_freespace:
+ case BTREE_ID_bucket_gens:
+ return true;
+ default:
+ return false;
+ }
+}
+
#define BTREE_MAX_DEPTH 4U
/* Btree nodes */
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
index 831be01809..3a45d128f6 100644
--- a/fs/bcachefs/bkey.h
+++ b/fs/bcachefs/bkey.h
@@ -4,7 +4,7 @@
#include <linux/bug.h>
#include "bcachefs_format.h"
-
+#include "bkey_types.h"
#include "btree_types.h"
#include "util.h"
#include "vstructs.h"
@@ -31,57 +31,6 @@ void bch2_bkey_packed_to_binary_text(struct printbuf *,
const struct bkey_format *,
const struct bkey_packed *);
-/* bkey with split value, const */
-struct bkey_s_c {
- const struct bkey *k;
- const struct bch_val *v;
-};
-
-/* bkey with split value */
-struct bkey_s {
- union {
- struct {
- struct bkey *k;
- struct bch_val *v;
- };
- struct bkey_s_c s_c;
- };
-};
-
-#define bkey_p_next(_k) vstruct_next(_k)
-
-static inline struct bkey_i *bkey_next(struct bkey_i *k)
-{
- return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s);
-}
-
-#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s)
-
-static inline size_t bkey_val_bytes(const struct bkey *k)
-{
- return bkey_val_u64s(k) * sizeof(u64);
-}
-
-static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
-{
- unsigned u64s = BKEY_U64s + val_u64s;
-
- BUG_ON(u64s > U8_MAX);
- k->u64s = u64s;
-}
-
-static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
-{
- set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64)));
-}
-
-#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k)))
-
-#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted)
-
-#define bkey_whiteout(_k) \
- ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)
-
enum bkey_lr_packed {
BKEY_PACKED_BOTH,
BKEY_PACKED_RIGHT,
@@ -362,10 +311,13 @@ static inline struct bpos bkey_start_pos(const struct bkey *k)
static inline unsigned bkeyp_key_u64s(const struct bkey_format *format,
const struct bkey_packed *k)
{
- unsigned ret = bkey_packed(k) ? format->key_u64s : BKEY_U64s;
+ return bkey_packed(k) ? format->key_u64s : BKEY_U64s;
+}
- EBUG_ON(k->u64s < ret);
- return ret;
+static inline bool bkeyp_u64s_valid(const struct bkey_format *f,
+ const struct bkey_packed *k)
+{
+ return ((unsigned) k->u64s - bkeyp_key_u64s(f, k) <= U8_MAX - BKEY_U64s);
}
static inline unsigned bkeyp_key_bytes(const struct bkey_format *format,
@@ -553,155 +505,6 @@ static inline void bkey_reassemble(struct bkey_i *dst,
memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k));
}
-#define bkey_s_null ((struct bkey_s) { .k = NULL })
-#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL })
-
-#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) })
-#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) })
-
-static inline struct bkey_s bkey_to_s(struct bkey *k)
-{
- return (struct bkey_s) { .k = k, .v = NULL };
-}
-
-static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k)
-{
- return (struct bkey_s_c) { .k = k, .v = NULL };
-}
-
-static inline struct bkey_s bkey_i_to_s(struct bkey_i *k)
-{
- return (struct bkey_s) { .k = &k->k, .v = &k->v };
-}
-
-static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
-{
- return (struct bkey_s_c) { .k = &k->k, .v = &k->v };
-}
-
-/*
- * For a given type of value (e.g. struct bch_extent), generates the types for
- * bkey + bch_extent - inline, split, split const - and also all the conversion
- * functions, which also check that the value is of the correct type.
- *
- * We use anonymous unions for upcasting - e.g. converting from e.g. a
- * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion
- * functions.
- */
-#define x(name, ...) \
-struct bkey_i_##name { \
- union { \
- struct bkey k; \
- struct bkey_i k_i; \
- }; \
- struct bch_##name v; \
-}; \
- \
-struct bkey_s_c_##name { \
- union { \
- struct { \
- const struct bkey *k; \
- const struct bch_##name *v; \
- }; \
- struct bkey_s_c s_c; \
- }; \
-}; \
- \
-struct bkey_s_##name { \
- union { \
- struct { \
- struct bkey *k; \
- struct bch_##name *v; \
- }; \
- struct bkey_s_c_##name c; \
- struct bkey_s s; \
- struct bkey_s_c s_c; \
- }; \
-}; \
- \
-static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
- return container_of(&k->k, struct bkey_i_##name, k); \
-} \
- \
-static inline const struct bkey_i_##name * \
-bkey_i_to_##name##_c(const struct bkey_i *k) \
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
- return container_of(&k->k, struct bkey_i_##name, k); \
-} \
- \
-static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \
- return (struct bkey_s_##name) { \
- .k = k.k, \
- .v = container_of(k.v, struct bch_##name, v), \
- }; \
-} \
- \
-static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \
- return (struct bkey_s_c_##name) { \
- .k = k.k, \
- .v = container_of(k.v, struct bch_##name, v), \
- }; \
-} \
- \
-static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\
-{ \
- return (struct bkey_s_##name) { \
- .k = &k->k, \
- .v = &k->v, \
- }; \
-} \
- \
-static inline struct bkey_s_c_##name \
-name##_i_to_s_c(const struct bkey_i_##name *k) \
-{ \
- return (struct bkey_s_c_##name) { \
- .k = &k->k, \
- .v = &k->v, \
- }; \
-} \
- \
-static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
- return (struct bkey_s_##name) { \
- .k = &k->k, \
- .v = container_of(&k->v, struct bch_##name, v), \
- }; \
-} \
- \
-static inline struct bkey_s_c_##name \
-bkey_i_to_s_c_##name(const struct bkey_i *k) \
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
- return (struct bkey_s_c_##name) { \
- .k = &k->k, \
- .v = container_of(&k->v, struct bch_##name, v), \
- }; \
-} \
- \
-static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
-{ \
- struct bkey_i_##name *k = \
- container_of(&_k->k, struct bkey_i_##name, k); \
- \
- bkey_init(&k->k); \
- memset(&k->v, 0, sizeof(k->v)); \
- k->k.type = KEY_TYPE_##name; \
- set_bkey_val_bytes(&k->k, sizeof(k->v)); \
- \
- return k; \
-}
-
-BCH_BKEY_TYPES();
-#undef x
-
/* byte order helpers */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index 5e52684764..a275a9e8e3 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -171,11 +171,15 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
if (type >= BKEY_TYPE_NR)
return 0;
- bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) &&
+ bkey_fsck_err_on(k.k->type < KEY_TYPE_MAX &&
+ (type == BKEY_TYPE_btree || (flags & BKEY_INVALID_COMMIT)) &&
!(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, err,
bkey_invalid_type_for_btree,
"invalid key type for btree %s (%s)",
- bch2_btree_node_type_str(type), bch2_bkey_types[k.k->type]);
+ bch2_btree_node_type_str(type),
+ k.k->type < KEY_TYPE_MAX
+ ? bch2_bkey_types[k.k->type]
+ : "(unknown)");
if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
bkey_fsck_err_on(k.k->size == 0, c, err,
diff --git a/fs/bcachefs/bkey_types.h b/fs/bcachefs/bkey_types.h
new file mode 100644
index 0000000000..c9ae9e42b3
--- /dev/null
+++ b/fs/bcachefs/bkey_types.h
@@ -0,0 +1,213 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_TYPES_H
+#define _BCACHEFS_BKEY_TYPES_H
+
+#include "bcachefs_format.h"
+
+/*
+ * bkey_i - bkey with inline value
+ * bkey_s - bkey with split value
+ * bkey_s_c - bkey with split value, const
+ */
+
+#define bkey_p_next(_k) vstruct_next(_k)
+
+static inline struct bkey_i *bkey_next(struct bkey_i *k)
+{
+ return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s);
+}
+
+#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s)
+
+static inline size_t bkey_val_bytes(const struct bkey *k)
+{
+ return bkey_val_u64s(k) * sizeof(u64);
+}
+
+static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
+{
+ unsigned u64s = BKEY_U64s + val_u64s;
+
+ BUG_ON(u64s > U8_MAX);
+ k->u64s = u64s;
+}
+
+static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
+{
+ set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64)));
+}
+
+#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k)))
+
+#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted)
+
+#define bkey_whiteout(_k) \
+ ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)
+
+/* bkey with split value, const */
+struct bkey_s_c {
+ const struct bkey *k;
+ const struct bch_val *v;
+};
+
+/* bkey with split value */
+struct bkey_s {
+ union {
+ struct {
+ struct bkey *k;
+ struct bch_val *v;
+ };
+ struct bkey_s_c s_c;
+ };
+};
+
+#define bkey_s_null ((struct bkey_s) { .k = NULL })
+#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL })
+
+#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) })
+#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) })
+
+static inline struct bkey_s bkey_to_s(struct bkey *k)
+{
+ return (struct bkey_s) { .k = k, .v = NULL };
+}
+
+static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k)
+{
+ return (struct bkey_s_c) { .k = k, .v = NULL };
+}
+
+static inline struct bkey_s bkey_i_to_s(struct bkey_i *k)
+{
+ return (struct bkey_s) { .k = &k->k, .v = &k->v };
+}
+
+static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
+{
+ return (struct bkey_s_c) { .k = &k->k, .v = &k->v };
+}
+
+/*
+ * For a given type of value (e.g. struct bch_extent), generates the types for
+ * bkey + bch_extent - inline, split, split const - and also all the conversion
+ * functions, which also check that the value is of the correct type.
+ *
+ * We use anonymous unions for upcasting - e.g. converting from e.g. a
+ * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion
+ * functions.
+ */
+#define x(name, ...) \
+struct bkey_i_##name { \
+ union { \
+ struct bkey k; \
+ struct bkey_i k_i; \
+ }; \
+ struct bch_##name v; \
+}; \
+ \
+struct bkey_s_c_##name { \
+ union { \
+ struct { \
+ const struct bkey *k; \
+ const struct bch_##name *v; \
+ }; \
+ struct bkey_s_c s_c; \
+ }; \
+}; \
+ \
+struct bkey_s_##name { \
+ union { \
+ struct { \
+ struct bkey *k; \
+ struct bch_##name *v; \
+ }; \
+ struct bkey_s_c_##name c; \
+ struct bkey_s s; \
+ struct bkey_s_c s_c; \
+ }; \
+}; \
+ \
+static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
+ return container_of(&k->k, struct bkey_i_##name, k); \
+} \
+ \
+static inline const struct bkey_i_##name * \
+bkey_i_to_##name##_c(const struct bkey_i *k) \
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
+ return container_of(&k->k, struct bkey_i_##name, k); \
+} \
+ \
+static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \
+ return (struct bkey_s_##name) { \
+ .k = k.k, \
+ .v = container_of(k.v, struct bch_##name, v), \
+ }; \
+} \
+ \
+static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \
+ return (struct bkey_s_c_##name) { \
+ .k = k.k, \
+ .v = container_of(k.v, struct bch_##name, v), \
+ }; \
+} \
+ \
+static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\
+{ \
+ return (struct bkey_s_##name) { \
+ .k = &k->k, \
+ .v = &k->v, \
+ }; \
+} \
+ \
+static inline struct bkey_s_c_##name \
+name##_i_to_s_c(const struct bkey_i_##name *k) \
+{ \
+ return (struct bkey_s_c_##name) { \
+ .k = &k->k, \
+ .v = &k->v, \
+ }; \
+} \
+ \
+static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
+ return (struct bkey_s_##name) { \
+ .k = &k->k, \
+ .v = container_of(&k->v, struct bch_##name, v), \
+ }; \
+} \
+ \
+static inline struct bkey_s_c_##name \
+bkey_i_to_s_c_##name(const struct bkey_i *k) \
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
+ return (struct bkey_s_c_##name) { \
+ .k = &k->k, \
+ .v = container_of(&k->v, struct bch_##name, v), \
+ }; \
+} \
+ \
+static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
+{ \
+ struct bkey_i_##name *k = \
+ container_of(&_k->k, struct bkey_i_##name, k); \
+ \
+ bkey_init(&k->k); \
+ memset(&k->v, 0, sizeof(k->v)); \
+ k->k.type = KEY_TYPE_##name; \
+ set_bkey_val_bytes(&k->k, sizeof(k->v)); \
+ \
+ return k; \
+}
+
+BCH_BKEY_TYPES();
+#undef x
+
+#endif /* _BCACHEFS_BKEY_TYPES_H */
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
index 3fd1085b6c..3bb477840e 100644
--- a/fs/bcachefs/bset.c
+++ b/fs/bcachefs/bset.c
@@ -134,18 +134,24 @@ void bch2_dump_btree_node_iter(struct btree *b,
printbuf_exit(&buf);
}
-#ifdef CONFIG_BCACHEFS_DEBUG
-
-void __bch2_verify_btree_nr_keys(struct btree *b)
+struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b)
{
struct bset_tree *t;
struct bkey_packed *k;
- struct btree_nr_keys nr = { 0 };
+ struct btree_nr_keys nr = {};
for_each_bset(b, t)
bset_tree_for_each_key(b, t, k)
if (!bkey_deleted(k))
btree_keys_account_key_add(&nr, t - b->set, k);
+ return nr;
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+void __bch2_verify_btree_nr_keys(struct btree *b)
+{
+ struct btree_nr_keys nr = bch2_btree_node_count_keys(b);
BUG_ON(memcmp(&nr, &b->nr, sizeof(nr)));
}
diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h
index 79c77baaa3..120a79fd45 100644
--- a/fs/bcachefs/bset.h
+++ b/fs/bcachefs/bset.h
@@ -458,6 +458,8 @@ struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *,
/* Accounting: */
+struct btree_nr_keys bch2_btree_node_count_keys(struct btree *);
+
static inline void btree_keys_account_key(struct btree_nr_keys *n,
unsigned bset,
struct bkey_packed *k,
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index d7c81beac1..02c70e813f 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "bbpos.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
@@ -60,7 +61,7 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
clear_btree_node_just_written(b);
- kvpfree(b->data, btree_buf_bytes(b));
+ kvfree(b->data);
b->data = NULL;
#ifdef __KERNEL__
kvfree(b->aux_data);
@@ -94,7 +95,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
BUG_ON(b->data || b->aux_data);
- b->data = kvpmalloc(btree_buf_bytes(b), gfp);
+ b->data = kvmalloc(btree_buf_bytes(b), gfp);
if (!b->data)
return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
#ifdef __KERNEL__
@@ -107,7 +108,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
b->aux_data = NULL;
#endif
if (!b->aux_data) {
- kvpfree(b->data, btree_buf_bytes(b));
+ kvfree(b->data);
b->data = NULL;
return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
}
@@ -208,6 +209,18 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
int ret = 0;
lockdep_assert_held(&bc->lock);
+
+ struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
+
+ u64 mask = b->c.level
+ ? bc->pinned_nodes_interior_mask
+ : bc->pinned_nodes_leaf_mask;
+
+ if ((mask & BIT_ULL(b->c.btree_id)) &&
+ bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
+ bbpos_cmp(bc->pinned_nodes_end, pos) >= 0)
+ return -BCH_ERR_ENOMEM_btree_node_reclaim;
+
wait_on_io:
if (b->flags & ((1U << BTREE_NODE_dirty)|
(1U << BTREE_NODE_read_in_flight)|
@@ -408,7 +421,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
if (c->verify_data)
list_move(&c->verify_data->list, &bc->live);
- kvpfree(c->verify_ondisk, c->opts.btree_node_size);
+ kvfree(c->verify_ondisk);
for (i = 0; i < btree_id_nr_alive(c); i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
@@ -696,9 +709,31 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
- u32 seq;
- BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
+ if (unlikely(level >= BTREE_MAX_DEPTH)) {
+ int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u",
+ level, BTREE_MAX_DEPTH);
+ return ERR_PTR(ret);
+ }
+
+ if (unlikely(!bkey_is_btree_ptr(&k->k))) {
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
+
+ int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf);
+ printbuf_exit(&buf);
+ return ERR_PTR(ret);
+ }
+
+ if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) {
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
+
+ int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf);
+ printbuf_exit(&buf);
+ return ERR_PTR(ret);
+ }
+
/*
* Parent node must be locked, else we could read in a btree node that's
* been freed:
@@ -711,6 +746,9 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
b = bch2_btree_node_mem_alloc(trans, level != 0);
if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
+ if (!path)
+ return b;
+
trans->memory_allocation_failure = true;
trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
@@ -736,33 +774,26 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
}
set_btree_node_read_in_flight(b);
-
six_unlock_write(&b->c.lock);
- seq = six_lock_seq(&b->c.lock);
- six_unlock_intent(&b->c.lock);
- /* Unlock before doing IO: */
- if (path && sync)
- bch2_trans_unlock_noassert(trans);
+ if (path) {
+ u32 seq = six_lock_seq(&b->c.lock);
- bch2_btree_node_read(trans, b, sync);
+ /* Unlock before doing IO: */
+ six_unlock_intent(&b->c.lock);
+ bch2_trans_unlock_noassert(trans);
- if (!sync)
- return NULL;
+ bch2_btree_node_read(trans, b, sync);
- if (path) {
- int ret = bch2_trans_relock(trans) ?:
- bch2_btree_path_relock_intent(trans, path);
- if (ret) {
- BUG_ON(!trans->restarted);
- return ERR_PTR(ret);
- }
- }
+ if (!sync)
+ return NULL;
- if (!six_relock_type(&b->c.lock, lock_type, seq)) {
- if (path)
- trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
- return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
+ if (!six_relock_type(&b->c.lock, lock_type, seq))
+ b = NULL;
+ } else {
+ bch2_btree_node_read(trans, b, sync);
+ if (lock_type == SIX_LOCK_read)
+ six_lock_downgrade(&b->c.lock);
}
return b;
@@ -791,7 +822,8 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
prt_printf(&buf, "\nmax ");
bch2_bpos_to_text(&buf, b->data->max_key);
- bch2_fs_inconsistent(c, "%s", buf.buf);
+ bch2_fs_topology_error(c, "%s", buf.buf);
+
printbuf_exit(&buf);
}
@@ -901,7 +933,7 @@ retry:
if (unlikely(btree_node_read_error(b))) {
six_unlock_type(&b->c.lock, lock_type);
- return ERR_PTR(-EIO);
+ return ERR_PTR(-BCH_ERR_btree_node_read_error);
}
EBUG_ON(b->c.btree_id != path->btree_id);
@@ -992,7 +1024,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *
if (unlikely(btree_node_read_error(b))) {
six_unlock_type(&b->c.lock, lock_type);
- return ERR_PTR(-EIO);
+ return ERR_PTR(-BCH_ERR_btree_node_read_error);
}
EBUG_ON(b->c.btree_id != path->btree_id);
@@ -1075,7 +1107,7 @@ lock_node:
if (unlikely(btree_node_read_error(b))) {
six_unlock_read(&b->c.lock);
- b = ERR_PTR(-EIO);
+ b = ERR_PTR(-BCH_ERR_btree_node_read_error);
goto out;
}
@@ -1094,18 +1126,19 @@ int bch2_btree_node_prefetch(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
- struct btree *b;
- BUG_ON(trans && !btree_node_locked(path, level + 1));
+ BUG_ON(path && !btree_node_locked(path, level + 1));
BUG_ON(level >= BTREE_MAX_DEPTH);
- b = btree_cache_find(bc, k);
+ struct btree *b = btree_cache_find(bc, k);
if (b)
return 0;
b = bch2_btree_node_fill(trans, path, k, btree_id,
level, SIX_LOCK_read, false);
- return PTR_ERR_OR_ZERO(b);
+ if (!IS_ERR_OR_NULL(b))
+ six_unlock_read(&b->c.lock);
+ return bch2_trans_relock(trans) ?: PTR_ERR_OR_ZERO(b);
}
void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
@@ -1117,6 +1150,8 @@ void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
b = btree_cache_find(bc, k);
if (!b)
return;
+
+ BUG_ON(b == btree_node_root(trans->c, b));
wait_on_io:
/* not allowed to wait on io with btree locks held: */
@@ -1128,6 +1163,8 @@ wait_on_io:
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
+ if (unlikely(b->hash_val != btree_ptr_hash_val(k)))
+ goto out;
if (btree_node_dirty(b)) {
__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
@@ -1142,7 +1179,7 @@ wait_on_io:
btree_node_data_free(c, b);
bch2_btree_node_hash_remove(bc, b);
mutex_unlock(&bc->lock);
-
+out:
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
}
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 1102995643..791470b0c6 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -7,11 +7,13 @@
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
+#include "backpointers.h"
#include "bkey_methods.h"
#include "bkey_buf.h"
#include "btree_journal_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
+#include "btree_node_scan.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_gc.h"
@@ -24,7 +26,7 @@
#include "journal.h"
#include "keylist.h"
#include "move.h"
-#include "recovery.h"
+#include "recovery_passes.h"
#include "reflink.h"
#include "replicas.h"
#include "super-io.h"
@@ -40,6 +42,7 @@
#define DROP_THIS_NODE 10
#define DROP_PREV_NODE 11
+#define DID_FILL_FROM_SCAN 12
static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k)
{
@@ -70,90 +73,6 @@ static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
__gc_pos_set(c, new_pos);
}
-/*
- * Missing: if an interior btree node is empty, we need to do something -
- * perhaps just kill it
- */
-static int bch2_gc_check_topology(struct bch_fs *c,
- struct btree *b,
- struct bkey_buf *prev,
- struct bkey_buf cur,
- bool is_last)
-{
- struct bpos node_start = b->data->min_key;
- struct bpos node_end = b->data->max_key;
- struct bpos expected_start = bkey_deleted(&prev->k->k)
- ? node_start
- : bpos_successor(prev->k->k.p);
- struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
- int ret = 0;
-
- if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) {
- struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k);
-
- if (!bpos_eq(expected_start, bp->v.min_key)) {
- bch2_topology_error(c);
-
- if (bkey_deleted(&prev->k->k)) {
- prt_printf(&buf1, "start of node: ");
- bch2_bpos_to_text(&buf1, node_start);
- } else {
- bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k));
- }
- bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k));
-
- if (__fsck_err(c,
- FSCK_CAN_FIX|
- FSCK_CAN_IGNORE|
- FSCK_NO_RATELIMIT,
- btree_node_topology_bad_min_key,
- "btree node with incorrect min_key at btree %s level %u:\n"
- " prev %s\n"
- " cur %s",
- bch2_btree_id_str(b->c.btree_id), b->c.level,
- buf1.buf, buf2.buf) && should_restart_for_topology_repair(c)) {
- bch_info(c, "Halting mark and sweep to start topology repair pass");
- ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
- goto err;
- } else {
- set_bit(BCH_FS_initial_gc_unfixed, &c->flags);
- }
- }
- }
-
- if (is_last && !bpos_eq(cur.k->k.p, node_end)) {
- bch2_topology_error(c);
-
- printbuf_reset(&buf1);
- printbuf_reset(&buf2);
-
- bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k));
- bch2_bpos_to_text(&buf2, node_end);
-
- if (__fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE|FSCK_NO_RATELIMIT,
- btree_node_topology_bad_max_key,
- "btree node with incorrect max_key at btree %s level %u:\n"
- " %s\n"
- " expected %s",
- bch2_btree_id_str(b->c.btree_id), b->c.level,
- buf1.buf, buf2.buf) &&
- should_restart_for_topology_repair(c)) {
- bch_info(c, "Halting mark and sweep to start topology repair pass");
- ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
- goto err;
- } else {
- set_bit(BCH_FS_initial_gc_unfixed, &c->flags);
- }
- }
-
- bch2_bkey_buf_copy(prev, c, cur.k);
-err:
-fsck_err:
- printbuf_exit(&buf2);
- printbuf_exit(&buf1);
- return ret;
-}
-
static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
{
switch (b->key.k.type) {
@@ -212,6 +131,17 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
struct bkey_i_btree_ptr_v2 *new;
int ret;
+ if (c->opts.verbose) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ prt_str(&buf, " -> ");
+ bch2_bpos_to_text(&buf, new_min);
+
+ bch_info(c, "%s(): %s", __func__, buf.buf);
+ printbuf_exit(&buf);
+ }
+
new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
if (!new)
return -BCH_ERR_ENOMEM_gc_repair_key;
@@ -237,6 +167,17 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
struct bkey_i_btree_ptr_v2 *new;
int ret;
+ if (c->opts.verbose) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ prt_str(&buf, " -> ");
+ bch2_bpos_to_text(&buf, new_max);
+
+ bch_info(c, "%s(): %s", __func__, buf.buf);
+ printbuf_exit(&buf);
+ }
+
ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p);
if (ret)
return ret;
@@ -268,128 +209,140 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
return 0;
}
-static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
- struct btree *prev, struct btree *cur)
+static int btree_check_node_boundaries(struct bch_fs *c, struct btree *b,
+ struct btree *prev, struct btree *cur,
+ struct bpos *pulled_from_scan)
{
struct bpos expected_start = !prev
? b->data->min_key
: bpos_successor(prev->key.k.p);
- struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+ struct printbuf buf = PRINTBUF;
int ret = 0;
- if (!prev) {
- prt_printf(&buf1, "start of node: ");
- bch2_bpos_to_text(&buf1, b->data->min_key);
- } else {
- bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key));
+ BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+ !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
+ b->data->min_key));
+
+ if (bpos_eq(expected_start, cur->data->min_key))
+ return 0;
+
+ prt_printf(&buf, " at btree %s level %u:\n parent: ",
+ bch2_btree_id_str(b->c.btree_id), b->c.level);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+
+ if (prev) {
+ prt_printf(&buf, "\n prev: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&prev->key));
}
- bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key));
-
- if (prev &&
- bpos_gt(expected_start, cur->data->min_key) &&
- BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) {
- /* cur overwrites prev: */
-
- if (mustfix_fsck_err_on(bpos_ge(prev->data->min_key,
- cur->data->min_key), c,
- btree_node_topology_overwritten_by_next_node,
- "btree node overwritten by next node at btree %s level %u:\n"
- " node %s\n"
- " next %s",
- bch2_btree_id_str(b->c.btree_id), b->c.level,
- buf1.buf, buf2.buf)) {
- ret = DROP_PREV_NODE;
- goto out;
- }
+ prt_str(&buf, "\n next: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&cur->key));
- if (mustfix_fsck_err_on(!bpos_eq(prev->key.k.p,
- bpos_predecessor(cur->data->min_key)), c,
- btree_node_topology_bad_max_key,
- "btree node with incorrect max_key at btree %s level %u:\n"
- " node %s\n"
- " next %s",
- bch2_btree_id_str(b->c.btree_id), b->c.level,
- buf1.buf, buf2.buf))
- ret = set_node_max(c, prev,
- bpos_predecessor(cur->data->min_key));
- } else {
- /* prev overwrites cur: */
-
- if (mustfix_fsck_err_on(bpos_ge(expected_start,
- cur->data->max_key), c,
- btree_node_topology_overwritten_by_prev_node,
- "btree node overwritten by prev node at btree %s level %u:\n"
- " prev %s\n"
- " node %s",
- bch2_btree_id_str(b->c.btree_id), b->c.level,
- buf1.buf, buf2.buf)) {
- ret = DROP_THIS_NODE;
- goto out;
- }
+ if (bpos_lt(expected_start, cur->data->min_key)) { /* gap */
+ if (b->c.level == 1 &&
+ bpos_lt(*pulled_from_scan, cur->data->min_key)) {
+ ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
+ expected_start,
+ bpos_predecessor(cur->data->min_key));
+ if (ret)
+ goto err;
- if (mustfix_fsck_err_on(!bpos_eq(expected_start, cur->data->min_key), c,
- btree_node_topology_bad_min_key,
- "btree node with incorrect min_key at btree %s level %u:\n"
- " prev %s\n"
- " node %s",
- bch2_btree_id_str(b->c.btree_id), b->c.level,
- buf1.buf, buf2.buf))
- ret = set_node_min(c, cur, expected_start);
+ *pulled_from_scan = cur->data->min_key;
+ ret = DID_FILL_FROM_SCAN;
+ } else {
+ if (mustfix_fsck_err(c, btree_node_topology_bad_min_key,
+ "btree node with incorrect min_key%s", buf.buf))
+ ret = set_node_min(c, cur, expected_start);
+ }
+ } else { /* overlap */
+ if (prev && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { /* cur overwrites prev */
+ if (bpos_ge(prev->data->min_key, cur->data->min_key)) { /* fully? */
+ if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_next_node,
+ "btree node overwritten by next node%s", buf.buf))
+ ret = DROP_PREV_NODE;
+ } else {
+ if (mustfix_fsck_err(c, btree_node_topology_bad_max_key,
+ "btree node with incorrect max_key%s", buf.buf))
+ ret = set_node_max(c, prev,
+ bpos_predecessor(cur->data->min_key));
+ }
+ } else {
+ if (bpos_ge(expected_start, cur->data->max_key)) { /* fully? */
+ if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_prev_node,
+ "btree node overwritten by prev node%s", buf.buf))
+ ret = DROP_THIS_NODE;
+ } else {
+ if (mustfix_fsck_err(c, btree_node_topology_bad_min_key,
+ "btree node with incorrect min_key%s", buf.buf))
+ ret = set_node_min(c, cur, expected_start);
+ }
+ }
}
-out:
+err:
fsck_err:
- printbuf_exit(&buf2);
- printbuf_exit(&buf1);
+ printbuf_exit(&buf);
return ret;
}
static int btree_repair_node_end(struct bch_fs *c, struct btree *b,
- struct btree *child)
+ struct btree *child, struct bpos *pulled_from_scan)
{
- struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+ struct printbuf buf = PRINTBUF;
int ret = 0;
- bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key));
- bch2_bpos_to_text(&buf2, b->key.k.p);
+ if (bpos_eq(child->key.k.p, b->key.k.p))
+ return 0;
- if (mustfix_fsck_err_on(!bpos_eq(child->key.k.p, b->key.k.p), c,
- btree_node_topology_bad_max_key,
- "btree node with incorrect max_key at btree %s level %u:\n"
- " %s\n"
- " expected %s",
- bch2_btree_id_str(b->c.btree_id), b->c.level,
- buf1.buf, buf2.buf)) {
- ret = set_node_max(c, child, b->key.k.p);
- if (ret)
- goto err;
+ prt_printf(&buf, "at btree %s level %u:\n parent: ",
+ bch2_btree_id_str(b->c.btree_id), b->c.level);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+
+ prt_str(&buf, "\n child: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key));
+
+ if (mustfix_fsck_err(c, btree_node_topology_bad_max_key,
+ "btree node with incorrect max_key%s", buf.buf)) {
+ if (b->c.level == 1 &&
+ bpos_lt(*pulled_from_scan, b->key.k.p)) {
+ ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
+ bpos_successor(child->key.k.p), b->key.k.p);
+ if (ret)
+ goto err;
+
+ *pulled_from_scan = b->key.k.p;
+ ret = DID_FILL_FROM_SCAN;
+ } else {
+ ret = set_node_max(c, child, b->key.k.p);
+ }
}
err:
fsck_err:
- printbuf_exit(&buf2);
- printbuf_exit(&buf1);
+ printbuf_exit(&buf);
return ret;
}
-static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b)
+static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b,
+ struct bpos *pulled_from_scan)
{
struct bch_fs *c = trans->c;
struct btree_and_journal_iter iter;
struct bkey_s_c k;
struct bkey_buf prev_k, cur_k;
struct btree *prev = NULL, *cur = NULL;
- bool have_child, dropped_children = false;
+ bool have_child, new_pass = false;
struct printbuf buf = PRINTBUF;
int ret = 0;
if (!b->c.level)
return 0;
-again:
- prev = NULL;
- have_child = dropped_children = false;
+
bch2_bkey_buf_init(&prev_k);
bch2_bkey_buf_init(&cur_k);
- bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+again:
+ cur = prev = NULL;
+ have_child = new_pass = false;
+ bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
+ iter.prefetch = true;
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
BUG_ON(bpos_lt(k.k->p, b->data->min_key));
@@ -406,7 +359,7 @@ again:
printbuf_reset(&buf);
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
- if (mustfix_fsck_err_on(ret == -EIO, c,
+ if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), c,
btree_node_unreadable,
"Topology repair: unreadable btree node at btree %s level %u:\n"
" %s",
@@ -414,11 +367,17 @@ again:
b->c.level - 1,
buf.buf)) {
bch2_btree_node_evict(trans, cur_k.k);
+ cur = NULL;
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
- cur = NULL;
if (ret)
break;
+
+ if (!btree_id_is_alloc(b->c.btree_id)) {
+ ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
+ if (ret)
+ break;
+ }
continue;
}
@@ -426,7 +385,23 @@ again:
if (ret)
break;
- ret = btree_repair_node_boundaries(c, b, prev, cur);
+ if (bch2_btree_node_is_stale(c, cur)) {
+ bch_info(c, "btree node %s older than nodes found by scanning", buf.buf);
+ six_unlock_read(&cur->c.lock);
+ bch2_btree_node_evict(trans, cur_k.k);
+ ret = bch2_journal_key_delete(c, b->c.btree_id,
+ b->c.level, cur_k.k->k.p);
+ cur = NULL;
+ if (ret)
+ break;
+ continue;
+ }
+
+ ret = btree_check_node_boundaries(c, b, prev, cur, pulled_from_scan);
+ if (ret == DID_FILL_FROM_SCAN) {
+ new_pass = true;
+ ret = 0;
+ }
if (ret == DROP_THIS_NODE) {
six_unlock_read(&cur->c.lock);
@@ -444,6 +419,7 @@ again:
prev = NULL;
if (ret == DROP_PREV_NODE) {
+ bch_info(c, "dropped prev node");
bch2_btree_node_evict(trans, prev_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, prev_k.k->k.p);
@@ -451,8 +427,6 @@ again:
break;
bch2_btree_and_journal_iter_exit(&iter);
- bch2_bkey_buf_exit(&prev_k, c);
- bch2_bkey_buf_exit(&cur_k, c);
goto again;
} else if (ret)
break;
@@ -464,7 +438,11 @@ again:
if (!ret && !IS_ERR_OR_NULL(prev)) {
BUG_ON(cur);
- ret = btree_repair_node_end(c, b, prev);
+ ret = btree_repair_node_end(c, b, prev, pulled_from_scan);
+ if (ret == DID_FILL_FROM_SCAN) {
+ new_pass = true;
+ ret = 0;
+ }
}
if (!IS_ERR_OR_NULL(prev))
@@ -478,7 +456,12 @@ again:
goto err;
bch2_btree_and_journal_iter_exit(&iter);
- bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+
+ if (new_pass)
+ goto again;
+
+ bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
+ iter.prefetch = true;
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
bch2_bkey_buf_reassemble(&cur_k, c, k);
@@ -493,7 +476,7 @@ again:
if (ret)
goto err;
- ret = bch2_btree_repair_topology_recurse(trans, cur);
+ ret = bch2_btree_repair_topology_recurse(trans, cur, pulled_from_scan);
six_unlock_read(&cur->c.lock);
cur = NULL;
@@ -501,7 +484,7 @@ again:
bch2_btree_node_evict(trans, cur_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
- dropped_children = true;
+ new_pass = true;
}
if (ret)
@@ -528,12 +511,14 @@ fsck_err:
six_unlock_read(&cur->c.lock);
bch2_btree_and_journal_iter_exit(&iter);
- bch2_bkey_buf_exit(&prev_k, c);
- bch2_bkey_buf_exit(&cur_k, c);
- if (!ret && dropped_children)
+ if (!ret && new_pass)
goto again;
+ BUG_ON(!ret && bch2_btree_node_check_topology(trans, b));
+
+ bch2_bkey_buf_exit(&prev_k, c);
+ bch2_bkey_buf_exit(&cur_k, c);
printbuf_exit(&buf);
return ret;
}
@@ -541,32 +526,63 @@ fsck_err:
int bch2_check_topology(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
- struct btree *b;
- unsigned i;
+ struct bpos pulled_from_scan = POS_MIN;
int ret = 0;
- for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
+ for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
+ bool reconstructed_root = false;
- if (!r->alive)
- continue;
+ if (r->error) {
+ ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
+ if (ret)
+ break;
+reconstruct_root:
+ bch_info(c, "btree root %s unreadable, must recover from scan", bch2_btree_id_str(i));
- b = r->b;
- if (btree_node_fake(b))
- continue;
+ r->alive = false;
+ r->error = 0;
+
+ if (!bch2_btree_has_scanned_nodes(c, i)) {
+ mustfix_fsck_err(c, btree_root_unreadable_and_scan_found_nothing,
+ "no nodes found for btree %s, continue?", bch2_btree_id_str(i));
+ bch2_btree_root_alloc_fake(c, i, 0);
+ } else {
+ bch2_btree_root_alloc_fake(c, i, 1);
+ bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+ ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX);
+ if (ret)
+ break;
+ }
+
+ reconstructed_root = true;
+ }
+
+ struct btree *b = r->b;
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
- ret = bch2_btree_repair_topology_recurse(trans, b);
+ ret = bch2_btree_repair_topology_recurse(trans, b, &pulled_from_scan);
six_unlock_read(&b->c.lock);
if (ret == DROP_THIS_NODE) {
- bch_err(c, "empty btree root - repair unimplemented");
- ret = -BCH_ERR_fsck_repair_unimplemented;
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+ mutex_lock(&c->btree_cache.lock);
+ list_move(&b->list, &c->btree_cache.freeable);
+ mutex_unlock(&c->btree_cache.lock);
+
+ r->b = NULL;
+
+ if (!reconstructed_root)
+ goto reconstruct_root;
+
+ bch_err(c, "empty btree root %s", bch2_btree_id_str(i));
+ bch2_btree_root_alloc_fake(c, i, 0);
+ r->alive = false;
+ ret = 0;
}
}
-
+fsck_err:
bch2_trans_put(trans);
-
return ret;
}
@@ -589,18 +605,17 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
bkey_for_each_ptr_decode(k->k, ptrs_c, p, entry_c) {
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
- enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry_c->ptr);
-
- if (!g->gen_valid &&
- (c->opts.reconstruct_alloc ||
- fsck_err(c, ptr_to_missing_alloc_key,
- "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
- p.ptr.gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
+ enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, p, entry_c);
+
+ if (fsck_err_on(!g->gen_valid,
+ c, ptr_to_missing_alloc_key,
+ "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
+ p.ptr.gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
if (!p.ptr.cached) {
g->gen_valid = true;
g->gen = p.ptr.gen;
@@ -609,16 +624,15 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
}
}
- if (gen_cmp(p.ptr.gen, g->gen) > 0 &&
- (c->opts.reconstruct_alloc ||
- fsck_err(c, ptr_gen_newer_than_bucket_gen,
- "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
- p.ptr.gen, g->gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
+ if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0,
+ c, ptr_gen_newer_than_bucket_gen,
+ "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
+ p.ptr.gen, g->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
if (!p.ptr.cached) {
g->gen_valid = true;
g->gen = p.ptr.gen;
@@ -631,35 +645,34 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
}
}
- if (gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX &&
- (c->opts.reconstruct_alloc ||
- fsck_err(c, ptr_gen_newer_than_bucket_gen,
- "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
- bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
- p.ptr.gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
+ if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX,
+ c, ptr_gen_newer_than_bucket_gen,
+ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
+ bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
+ p.ptr.gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
do_update = true;
- if (!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0 &&
- (c->opts.reconstruct_alloc ||
- fsck_err(c, stale_dirty_ptr,
- "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
- p.ptr.gen, g->gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
+ if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0,
+ c, stale_dirty_ptr,
+ "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
+ p.ptr.gen, g->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
do_update = true;
if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
continue;
if (fsck_err_on(bucket_data_type(g->data_type) &&
- bucket_data_type(g->data_type) != data_type, c,
+ bucket_data_type(g->data_type) !=
+ bucket_data_type(data_type), c,
ptr_bucket_data_type_mismatch,
"bucket %u:%zu different types of data in same bucket: %s, %s\n"
"while marking %s",
@@ -700,18 +713,13 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
}
if (do_update) {
- struct bkey_ptrs ptrs;
- union bch_extent_entry *entry;
- struct bch_extent_ptr *ptr;
- struct bkey_i *new;
-
if (is_root) {
bch_err(c, "cannot update btree roots yet");
ret = -EINVAL;
goto err;
}
- new = kmalloc(bkey_bytes(k->k), GFP_KERNEL);
+ struct bkey_i *new = kmalloc(bkey_bytes(k->k), GFP_KERNEL);
if (!new) {
ret = -BCH_ERR_ENOMEM_gc_repair_key;
bch_err_msg(c, ret, "allocating new key");
@@ -726,7 +734,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
* btree node isn't there anymore, the read path will
* sort it out:
*/
- ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
bkey_for_each_ptr(ptrs, ptr) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
struct bucket *g = PTR_GC_BUCKET(ca, ptr);
@@ -734,19 +742,26 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
ptr->gen = g->gen;
}
} else {
- bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g = PTR_GC_BUCKET(ca, ptr);
- enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr);
-
- (ptr->cached &&
- (!g->gen_valid || gen_cmp(ptr->gen, g->gen) > 0)) ||
- (!ptr->cached &&
- gen_cmp(ptr->gen, g->gen) < 0) ||
- gen_cmp(g->gen, ptr->gen) > BUCKET_GC_GEN_MAX ||
- (g->data_type &&
- g->data_type != data_type);
- }));
+ struct bkey_ptrs ptrs;
+ union bch_extent_entry *entry;
+restart_drop_ptrs:
+ ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+ bkey_for_each_ptr_decode(bkey_i_to_s(new).k, ptrs, p, entry) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
+ struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
+ enum bch_data_type data_type = bch2_bkey_ptr_data_type(bkey_i_to_s_c(new), p, entry);
+
+ if ((p.ptr.cached &&
+ (!g->gen_valid || gen_cmp(p.ptr.gen, g->gen) > 0)) ||
+ (!p.ptr.cached &&
+ gen_cmp(p.ptr.gen, g->gen) < 0) ||
+ gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX ||
+ (g->data_type &&
+ g->data_type != data_type)) {
+ bch2_bkey_drop_ptr(bkey_i_to_s(new), &entry->ptr);
+ goto restart_drop_ptrs;
+ }
+ }
again:
ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
bkey_extent_entry_for_each(ptrs, entry) {
@@ -776,12 +791,6 @@ found:
}
}
- ret = bch2_journal_key_insert_take(c, btree_id, level, new);
- if (ret) {
- kfree(new);
- goto err;
- }
-
if (level)
bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new);
@@ -795,6 +804,12 @@ found:
bch_info(c, "new key %s", buf.buf);
}
+ ret = bch2_journal_key_insert_take(c, btree_id, level, new);
+ if (ret) {
+ kfree(new);
+ goto err;
+ }
+
*k = bkey_i_to_s_c(new);
}
err:
@@ -813,6 +828,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
struct bch_fs *c = trans->c;
struct bkey deleted = KEY(0, 0, 0);
struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
+ struct printbuf buf = PRINTBUF;
int ret = 0;
deleted.p = k->k->p;
@@ -821,10 +837,6 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
BUG_ON(bch2_journal_seq_verify &&
k->k->version.lo > atomic64_read(&c->journal.seq));
- ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k);
- if (ret)
- goto err;
-
if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c,
bkey_version_in_future,
"key version number higher than recorded: %llu > %llu",
@@ -833,52 +845,57 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
atomic64_set(&c->key_version, k->k->version.lo);
}
+ ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k);
+ if (ret)
+ goto err;
+
+ if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, *k),
+ c, btree_bitmap_not_marked,
+ "btree ptr not marked in member info btree allocated bitmap\n %s",
+ (bch2_bkey_val_to_text(&buf, c, *k),
+ buf.buf))) {
+ mutex_lock(&c->sb_lock);
+ bch2_dev_btree_bitmap_mark(c, *k);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+ }
+
ret = commit_do(trans, NULL, NULL, 0,
- bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(*k), BTREE_TRIGGER_GC));
+ bch2_key_trigger(trans, btree_id, level, old,
+ unsafe_bkey_s_c_to_s(*k), BTREE_TRIGGER_GC));
fsck_err:
err:
+ printbuf_exit(&buf);
bch_err_fn(c, ret);
return ret;
}
static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial)
{
- struct bch_fs *c = trans->c;
struct btree_node_iter iter;
struct bkey unpacked;
struct bkey_s_c k;
- struct bkey_buf prev, cur;
int ret = 0;
+ ret = bch2_btree_node_check_topology(trans, b);
+ if (ret)
+ return ret;
+
if (!btree_node_type_needs_gc(btree_node_type(b)))
return 0;
bch2_btree_node_iter_init_from_start(&iter, b);
- bch2_bkey_buf_init(&prev);
- bch2_bkey_buf_init(&cur);
- bkey_init(&prev.k->k);
while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) {
ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false,
&k, initial);
if (ret)
- break;
+ return ret;
bch2_btree_node_iter_advance(&iter, b);
-
- if (b->c.level) {
- bch2_bkey_buf_reassemble(&cur, c, k);
-
- ret = bch2_gc_check_topology(c, b, &prev, cur,
- bch2_btree_node_iter_end(&iter));
- if (ret)
- break;
- }
}
- bch2_bkey_buf_exit(&cur, c);
- bch2_bkey_buf_exit(&prev, c);
- return ret;
+ return 0;
}
static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id,
@@ -927,14 +944,16 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
struct bch_fs *c = trans->c;
struct btree_and_journal_iter iter;
struct bkey_s_c k;
- struct bkey_buf cur, prev;
+ struct bkey_buf cur;
struct printbuf buf = PRINTBUF;
int ret = 0;
- bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
- bch2_bkey_buf_init(&prev);
+ ret = bch2_btree_node_check_topology(trans, b);
+ if (ret)
+ return ret;
+
+ bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
bch2_bkey_buf_init(&cur);
- bkey_init(&prev.k->k);
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
BUG_ON(bpos_lt(k.k->p, b->data->min_key));
@@ -945,25 +964,13 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
if (ret)
goto fsck_err;
- if (b->c.level) {
- bch2_bkey_buf_reassemble(&cur, c, k);
- k = bkey_i_to_s_c(cur.k);
-
- bch2_btree_and_journal_iter_advance(&iter);
-
- ret = bch2_gc_check_topology(c, b,
- &prev, cur,
- !bch2_btree_and_journal_iter_peek(&iter).k);
- if (ret)
- goto fsck_err;
- } else {
- bch2_btree_and_journal_iter_advance(&iter);
- }
+ bch2_btree_and_journal_iter_advance(&iter);
}
if (b->c.level > target_depth) {
bch2_btree_and_journal_iter_exit(&iter);
- bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+ bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
+ iter.prefetch = true;
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
struct btree *child;
@@ -976,7 +983,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
false);
ret = PTR_ERR_OR_ZERO(child);
- if (ret == -EIO) {
+ if (bch2_err_matches(ret, EIO)) {
bch2_topology_error(c);
if (__fsck_err(c,
@@ -1016,7 +1023,6 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
}
fsck_err:
bch2_bkey_buf_exit(&cur, c);
- bch2_bkey_buf_exit(&prev, c);
bch2_btree_and_journal_iter_exit(&iter);
printbuf_exit(&buf);
return ret;
@@ -1034,9 +1040,6 @@ static int bch2_gc_btree_init(struct btree_trans *trans,
b = bch2_btree_id_root(c, btree_id)->b;
- if (btree_node_fake(b))
- return 0;
-
six_lock_read(&b->c.lock, NULL, NULL);
printbuf_reset(&buf);
bch2_bpos_to_text(&buf, b->data->min_key);
@@ -1190,9 +1193,7 @@ static void bch2_gc_free(struct bch_fs *c)
genradix_free(&c->gc_stripes);
for_each_member_device(c, ca) {
- kvpfree(rcu_dereference_protected(ca->buckets_gc, 1),
- sizeof(struct bucket_array) +
- ca->mi.nbuckets * sizeof(struct bucket));
+ kvfree(rcu_dereference_protected(ca->buckets_gc, 1));
ca->buckets_gc = NULL;
free_percpu(ca->usage_gc);
@@ -1365,11 +1366,10 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode);
- struct bucket gc, *b;
+ struct bucket old_gc, gc, *b;
struct bkey_i_alloc_v4 *a;
struct bch_alloc_v4 old_convert, new;
const struct bch_alloc_v4 *old;
- enum bch_data_type type;
int ret;
old = bch2_alloc_to_v4(k, &old_convert);
@@ -1377,28 +1377,29 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
percpu_down_read(&c->mark_lock);
b = gc_bucket(ca, iter->pos.offset);
+ old_gc = *b;
+
+ if ((old->data_type == BCH_DATA_sb ||
+ old->data_type == BCH_DATA_journal) &&
+ !bch2_dev_is_online(ca)) {
+ b->data_type = old->data_type;
+ b->dirty_sectors = old->dirty_sectors;
+ }
/*
* b->data_type doesn't yet include need_discard & need_gc_gen states -
* fix that here:
*/
- type = __alloc_data_type(b->dirty_sectors,
- b->cached_sectors,
- b->stripe,
- *old,
- b->data_type);
- if (b->data_type != type) {
- struct bch_dev_usage *u;
-
- preempt_disable();
- u = this_cpu_ptr(ca->usage_gc);
- u->d[b->data_type].buckets--;
- b->data_type = type;
- u->d[b->data_type].buckets++;
- preempt_enable();
- }
-
+ b->data_type = __alloc_data_type(b->dirty_sectors,
+ b->cached_sectors,
+ b->stripe,
+ *old,
+ b->data_type);
gc = *b;
+
+ if (gc.data_type != old_gc.data_type ||
+ gc.dirty_sectors != old_gc.dirty_sectors)
+ bch2_dev_usage_update_m(c, ca, &old_gc, &gc);
percpu_up_read(&c->mark_lock);
if (metadata_only &&
@@ -1410,8 +1411,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
if (gen_after(old->gen, gc.gen))
return 0;
- if (c->opts.reconstruct_alloc ||
- fsck_err_on(new.data_type != gc.data_type, c,
+ if (fsck_err_on(new.data_type != gc.data_type, c,
alloc_key_data_type_wrong,
"bucket %llu:%llu gen %u has wrong data_type"
": got %s, should be %s",
@@ -1422,8 +1422,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
new.data_type = gc.data_type;
#define copy_bucket_field(_errtype, _f) \
- if (c->opts.reconstruct_alloc || \
- fsck_err_on(new._f != gc._f, c, _errtype, \
+ if (fsck_err_on(new._f != gc._f, c, _errtype, \
"bucket %llu:%llu gen %u data type %s has wrong " #_f \
": got %u, should be %u", \
iter->pos.inode, iter->pos.offset, \
@@ -1491,7 +1490,7 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
{
for_each_member_device(c, ca) {
- struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) +
+ struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket),
GFP_KERNEL|__GFP_ZERO);
if (!buckets) {
@@ -1585,17 +1584,18 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans,
" should be %u",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf),
r->refcount)) {
- struct bkey_i *new = bch2_bkey_make_mut(trans, iter, &k, 0);
-
+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
ret = PTR_ERR_OR_ZERO(new);
if (ret)
- return ret;
+ goto out;
if (!r->refcount)
new->k.type = KEY_TYPE_deleted;
else
*bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount);
+ ret = bch2_trans_update(trans, iter, new, 0);
}
+out:
fsck_err:
printbuf_exit(&buf);
return ret;
@@ -1817,10 +1817,10 @@ out:
if (!ret) {
bch2_journal_block(&c->journal);
- ret = bch2_gc_stripes_done(c, metadata_only) ?:
- bch2_gc_reflink_done(c, metadata_only) ?:
- bch2_gc_alloc_done(c, metadata_only) ?:
- bch2_gc_done(c, initial, metadata_only);
+ ret = bch2_gc_alloc_done(c, metadata_only) ?:
+ bch2_gc_done(c, initial, metadata_only) ?:
+ bch2_gc_stripes_done(c, metadata_only) ?:
+ bch2_gc_reflink_done(c, metadata_only);
bch2_journal_unblock(&c->journal);
}
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index aa9b6cbe32..debb0edc34 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -103,7 +103,7 @@ static void btree_bounce_free(struct bch_fs *c, size_t size,
if (used_mempool)
mempool_free(p, &c->btree_bounce_pool);
else
- vpfree(p, size);
+ kvfree(p);
}
static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
@@ -115,7 +115,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
BUG_ON(size > c->opts.btree_node_size);
*used_mempool = false;
- p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
+ p = kvmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
if (!p) {
*used_mempool = true;
p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
@@ -581,8 +581,7 @@ static int __btree_err(int ret,
break;
case -BCH_ERR_btree_node_read_err_bad_node:
bch2_print_string_as_lines(KERN_ERR, out.buf);
- bch2_topology_error(c);
- ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: -EIO;
+ ret = bch2_topology_error(c);
break;
case -BCH_ERR_btree_node_read_err_incompatible:
bch2_print_string_as_lines(KERN_ERR, out.buf);
@@ -655,6 +654,7 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b)
*/
bch2_bset_set_no_aux_tree(b, b->set);
bch2_btree_build_aux_trees(b);
+ b->nr = bch2_btree_node_count_keys(b);
struct bkey_s_c k;
struct bkey unpacked;
@@ -831,7 +831,7 @@ static int bset_key_invalid(struct bch_fs *c, struct btree *b,
(rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0);
}
-static bool __bkey_valid(struct bch_fs *c, struct btree *b,
+static bool bkey_packed_valid(struct bch_fs *c, struct btree *b,
struct bset *i, struct bkey_packed *k)
{
if (bkey_p_next(k) > vstruct_last(i))
@@ -840,6 +840,9 @@ static bool __bkey_valid(struct bch_fs *c, struct btree *b,
if (k->format > KEY_FORMAT_CURRENT)
return false;
+ if (!bkeyp_u64s_valid(&b->format, k))
+ return false;
+
struct printbuf buf = PRINTBUF;
struct bkey tmp;
struct bkey_s u = __bkey_disassemble(b, k, &tmp);
@@ -881,7 +884,15 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
"invalid bkey format %u", k->format))
goto drop_this_key;
- /* XXX: validate k->u64s */
+ if (btree_err_on(!bkeyp_u64s_valid(&b->format, k),
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i,
+ btree_node_bkey_bad_u64s,
+ "bad k->u64s %u (min %u max %zu)", k->u64s,
+ bkeyp_key_u64s(&b->format, k),
+ U8_MAX - BKEY_U64s + bkeyp_key_u64s(&b->format, k)))
+ goto drop_this_key;
+
if (!write)
bch2_bkey_compat(b->c.level, b->c.btree_id, version,
BSET_BIG_ENDIAN(i), write,
@@ -938,13 +949,12 @@ drop_this_key:
* do
*/
- if (!__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) {
+ if (!bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) {
for (next_good_key = 1;
next_good_key < (u64 *) vstruct_last(i) - (u64 *) k;
next_good_key++)
- if (__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key)))
+ if (bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key)))
goto got_good_key;
-
}
/*
@@ -1058,7 +1068,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
ret = bset_encrypt(c, i, b->written << 9);
if (bch2_fs_fatal_err_on(ret, c,
- "error decrypting btree node: %i", ret))
+ "decrypting btree node: %s", bch2_err_str(ret)))
goto fsck_err;
btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
@@ -1099,7 +1109,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
ret = bset_encrypt(c, i, b->written << 9);
if (bch2_fs_fatal_err_on(ret, c,
- "error decrypting btree node: %i\n", ret))
+ "decrypting btree node: %s", bch2_err_str(ret)))
goto fsck_err;
sectors = vstruct_sectors(bne, c->block_bits);
@@ -1255,10 +1265,12 @@ out:
return retry_read;
fsck_err:
if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
- ret == -BCH_ERR_btree_node_read_err_must_retry)
+ ret == -BCH_ERR_btree_node_read_err_must_retry) {
retry_read = 1;
- else
+ } else {
set_btree_node_read_error(b);
+ bch2_btree_lost_data(c, b->c.btree_id);
+ }
goto out;
}
@@ -1319,6 +1331,7 @@ start:
if (!can_retry) {
set_btree_node_read_error(b);
+ bch2_btree_lost_data(c, b->c.btree_id);
break;
}
}
@@ -1327,10 +1340,12 @@ start:
rb->start_time);
bio_put(&rb->bio);
- if (saw_error && !btree_node_read_error(b)) {
+ if (saw_error &&
+ !btree_node_read_error(b) &&
+ c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) {
printbuf_reset(&buf);
bch2_bpos_to_text(&buf, b->key.k.p);
- bch_info(c, "%s: rewriting btree node at btree=%s level=%u %s due to error",
+ bch_err_ratelimited(c, "%s: rewriting btree node at btree=%s level=%u %s due to error",
__func__, bch2_btree_id_str(b->c.btree_id), b->c.level, buf.buf);
bch2_btree_node_rewrite_async(c, b);
@@ -1518,9 +1533,10 @@ fsck_err:
ret = -1;
}
- if (ret)
+ if (ret) {
set_btree_node_read_error(b);
- else if (*saw_error)
+ bch2_btree_lost_data(c, b->c.btree_id);
+ } else if (*saw_error)
bch2_btree_node_rewrite_async(c, b);
for (i = 0; i < ra->nr; i++) {
@@ -1649,13 +1665,14 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
prt_str(&buf, "btree node read error: no device to read from\n at ");
bch2_btree_pos_to_text(&buf, c, b);
- bch_err(c, "%s", buf.buf);
+ bch_err_ratelimited(c, "%s", buf.buf);
if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology)
bch2_fatal_error(c);
set_btree_node_read_error(b);
+ bch2_btree_lost_data(c, b->c.btree_id);
clear_btree_node_read_in_flight(b);
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
printbuf_exit(&buf);
@@ -1737,7 +1754,7 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
list_move(&b->list, &c->btree_cache.freeable);
mutex_unlock(&c->btree_cache.lock);
- ret = -EIO;
+ ret = -BCH_ERR_btree_node_read_error;
goto err;
}
@@ -1841,7 +1858,7 @@ static void btree_node_write_work(struct work_struct *work)
bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) {
- ret = -BCH_ERR_btree_write_all_failed;
+ ret = -BCH_ERR_btree_node_write_all_failed;
goto err;
}
@@ -1852,7 +1869,7 @@ static void btree_node_write_work(struct work_struct *work)
} else {
ret = bch2_trans_do(c, NULL, NULL, 0,
bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
- BCH_WATERMARK_reclaim|
+ BCH_WATERMARK_interior_updates|
BCH_TRANS_COMMIT_journal_reclaim|
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_check_rw,
@@ -1866,8 +1883,8 @@ out:
return;
err:
set_btree_node_noevict(b);
- if (!bch2_err_matches(ret, EROFS))
- bch2_fs_fatal_error(c, "fatal error writing btree node: %s", bch2_err_str(ret));
+ bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
+ "writing btree node: %s", bch2_err_str(ret));
goto out;
}
@@ -2123,7 +2140,7 @@ do_write:
ret = bset_encrypt(c, i, b->written << 9);
if (bch2_fs_fatal_err_on(ret, c,
- "error encrypting btree node: %i\n", ret))
+ "encrypting btree node: %s", bch2_err_str(ret)))
goto err;
nonce = btree_nonce(i, b->written << 9);
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 3ef338df82..2a211a4beb 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -891,7 +891,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
struct bkey_s_c k;
int ret = 0;
- __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos);
+ __bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos);
k = bch2_btree_and_journal_iter_peek(&jiter);
@@ -927,8 +927,22 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
if (ret)
goto err;
} else {
- bch2_bkey_buf_unpack(&tmp, c, l->b,
- bch2_btree_node_iter_peek(&l->iter, l->b));
+ struct bkey_packed *k = bch2_btree_node_iter_peek(&l->iter, l->b);
+ if (!k) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "node not found at pos ");
+ bch2_bpos_to_text(&buf, path->pos);
+ prt_str(&buf, " within parent node ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&l->b->key));
+
+ bch2_fs_fatal_error(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ ret = -BCH_ERR_btree_need_topology_repair;
+ goto err;
+ }
+
+ bch2_bkey_buf_unpack(&tmp, c, l->b, k);
if ((flags & BTREE_ITER_PREFETCH) &&
c->opts.btree_node_prefetch) {
@@ -962,7 +976,6 @@ err:
return ret;
}
-
static int bch2_btree_path_traverse_all(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
@@ -1146,7 +1159,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
path = &trans->paths[path_idx];
if (unlikely(path->level >= BTREE_MAX_DEPTH))
- goto out;
+ goto out_uptodate;
path->level = btree_path_up_until_good_node(trans, path, 0);
@@ -1179,7 +1192,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
goto out;
}
}
-
+out_uptodate:
path->uptodate = BTREE_ITER_UPTODATE;
out:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted)
@@ -1520,7 +1533,7 @@ static noinline void btree_paths_realloc(struct btree_trans *trans)
{
unsigned nr = trans->nr_paths * 2;
- void *p = kzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) +
+ void *p = kvzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) +
sizeof(struct btree_trans_paths) +
nr * sizeof(struct btree_path) +
nr * sizeof(btree_path_idx_t) + 8 +
@@ -1729,7 +1742,9 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
if (ret)
return ret;
- btree_path_set_should_be_locked(trans->paths + iter->path);
+ struct btree_path *path = btree_iter_path(trans, iter);
+ if (btree_path_node(path, path->level))
+ btree_path_set_should_be_locked(path);
return 0;
}
@@ -2305,7 +2320,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
btree_iter_path(trans, iter)->level);
if (iter->flags & BTREE_ITER_WITH_JOURNAL)
- return bkey_s_c_err(-EIO);
+ return bkey_s_c_err(-BCH_ERR_btree_iter_with_journal_not_supported);
bch2_btree_iter_verify(iter);
bch2_btree_iter_verify_entry_exit(iter);
@@ -2503,6 +2518,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
k = bch2_btree_iter_peek_upto(&iter2, end);
if (k.k && !bkey_err(k)) {
+ swap(iter->key_cache_path, iter2.key_cache_path);
iter->k = iter2.k;
k.k = &iter->k;
}
@@ -2762,6 +2778,9 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
struct btree_trans *trans = src->trans;
*dst = *src;
+#ifdef TRACK_PATH_ALLOCATED
+ dst->ip_allocated = _RET_IP_;
+#endif
if (src->path)
__btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_INTENT);
if (src->update_path)
@@ -2784,6 +2803,31 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
struct btree_transaction_stats *s = btree_trans_stats(trans);
s->max_mem = max(s->max_mem, new_bytes);
+ if (trans->used_mempool) {
+ if (trans->mem_bytes >= new_bytes)
+ goto out_change_top;
+
+ /* No more space from mempool item, need malloc new one */
+ new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN);
+ if (unlikely(!new_mem)) {
+ bch2_trans_unlock(trans);
+
+ new_mem = kmalloc(new_bytes, GFP_KERNEL);
+ if (!new_mem)
+ return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
+
+ ret = bch2_trans_relock(trans);
+ if (ret) {
+ kfree(new_mem);
+ return ERR_PTR(ret);
+ }
+ }
+ memcpy(new_mem, trans->mem, trans->mem_top);
+ trans->used_mempool = false;
+ mempool_free(trans->mem, &c->btree_trans_mem_pool);
+ goto out_new_mem;
+ }
+
new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
if (unlikely(!new_mem)) {
bch2_trans_unlock(trans);
@@ -2792,6 +2836,8 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
new_bytes = BTREE_TRANS_MEM_MAX;
+ memcpy(new_mem, trans->mem, trans->mem_top);
+ trans->used_mempool = true;
kfree(trans->mem);
}
@@ -2805,7 +2851,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
if (ret)
return ERR_PTR(ret);
}
-
+out_new_mem:
trans->mem = new_mem;
trans->mem_bytes = new_bytes;
@@ -2813,7 +2859,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
}
-
+out_change_top:
p = trans->mem + trans->mem_top;
trans->mem_top += size;
memset(p, 0, size);
@@ -3085,9 +3131,9 @@ void bch2_trans_put(struct btree_trans *trans)
trans->paths = NULL;
if (paths_allocated != trans->_paths_allocated)
- kfree_rcu_mightsleep(paths_allocated);
+ kvfree_rcu_mightsleep(paths_allocated);
- if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
+ if (trans->used_mempool)
mempool_free(trans->mem, &c->btree_trans_mem_pool);
else
kfree(trans->mem);
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index 24772538e4..1c70836dd7 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -498,8 +498,13 @@ static inline void set_btree_iter_dontneed(struct btree_iter *iter)
{
struct btree_trans *trans = iter->trans;
- if (!trans->restarted)
- btree_iter_path(trans, iter)->preserve = false;
+ if (!iter->path || trans->restarted)
+ return;
+
+ struct btree_path *path = btree_iter_path(trans, iter);
+ path->preserve = false;
+ if (path->ref == 1)
+ path->should_be_locked = false;
}
void *__bch2_trans_kmalloc(struct btree_trans *, size_t);
@@ -642,7 +647,7 @@ int __bch2_btree_trans_too_many_iters(struct btree_trans *);
static inline int btree_trans_too_many_iters(struct btree_trans *trans)
{
- if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_INITIAL - 8)
+ if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_NORMAL_LIMIT - 8)
return __bch2_btree_trans_too_many_iters(trans);
return 0;
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
index 719a94a849..1e8cf49a69 100644
--- a/fs/bcachefs/btree_journal_iter.c
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -1,7 +1,9 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "bkey_buf.h"
#include "bset.h"
+#include "btree_cache.h"
#include "btree_journal_iter.h"
#include "journal_io.h"
@@ -40,7 +42,7 @@ static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
{
- return keys->d + idx_to_pos(keys, idx);
+ return keys->data + idx_to_pos(keys, idx);
}
static size_t __bch2_journal_key_search(struct journal_keys *keys,
@@ -128,12 +130,30 @@ struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree
return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx);
}
+static void journal_iter_verify(struct journal_iter *iter)
+{
+ struct journal_keys *keys = iter->keys;
+ size_t gap_size = keys->size - keys->nr;
+
+ BUG_ON(iter->idx >= keys->gap &&
+ iter->idx < keys->gap + gap_size);
+
+ if (iter->idx < keys->size) {
+ struct journal_key *k = keys->data + iter->idx;
+
+ int cmp = cmp_int(k->btree_id, iter->btree_id) ?:
+ cmp_int(k->level, iter->level);
+ BUG_ON(cmp < 0);
+ }
+}
+
static void journal_iters_fix(struct bch_fs *c)
{
struct journal_keys *keys = &c->journal_keys;
/* The key we just inserted is immediately before the gap: */
size_t gap_end = keys->gap + (keys->size - keys->nr);
- struct btree_and_journal_iter *iter;
+ struct journal_key *new_key = &keys->data[keys->gap - 1];
+ struct journal_iter *iter;
/*
* If an iterator points one after the key we just inserted, decrement
@@ -141,9 +161,14 @@ static void journal_iters_fix(struct bch_fs *c)
* decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
* handle that:
*/
- list_for_each_entry(iter, &c->journal_iters, journal.list)
- if (iter->journal.idx == gap_end)
- iter->journal.idx = keys->gap - 1;
+ list_for_each_entry(iter, &c->journal_iters, list) {
+ journal_iter_verify(iter);
+ if (iter->idx == gap_end &&
+ new_key->btree_id == iter->btree_id &&
+ new_key->level == iter->level)
+ iter->idx = keys->gap - 1;
+ journal_iter_verify(iter);
+ }
}
static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
@@ -180,33 +205,38 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
BUG_ON(test_bit(BCH_FS_rw, &c->flags));
if (idx < keys->size &&
- journal_key_cmp(&n, &keys->d[idx]) == 0) {
- if (keys->d[idx].allocated)
- kfree(keys->d[idx].k);
- keys->d[idx] = n;
+ journal_key_cmp(&n, &keys->data[idx]) == 0) {
+ if (keys->data[idx].allocated)
+ kfree(keys->data[idx].k);
+ keys->data[idx] = n;
return 0;
}
if (idx > keys->gap)
idx -= keys->size - keys->nr;
+ size_t old_gap = keys->gap;
+
if (keys->nr == keys->size) {
+ journal_iters_move_gap(c, old_gap, keys->size);
+ old_gap = keys->size;
+
struct journal_keys new_keys = {
.nr = keys->nr,
.size = max_t(size_t, keys->size, 8) * 2,
};
- new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL);
- if (!new_keys.d) {
+ new_keys.data = kvmalloc_array(new_keys.size, sizeof(new_keys.data[0]), GFP_KERNEL);
+ if (!new_keys.data) {
bch_err(c, "%s: error allocating new key array (size %zu)",
__func__, new_keys.size);
return -BCH_ERR_ENOMEM_journal_key_insert;
}
/* Since @keys was full, there was no gap: */
- memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
- kvfree(keys->d);
- keys->d = new_keys.d;
+ memcpy(new_keys.data, keys->data, sizeof(keys->data[0]) * keys->nr);
+ kvfree(keys->data);
+ keys->data = new_keys.data;
keys->nr = new_keys.nr;
keys->size = new_keys.size;
@@ -214,13 +244,12 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
keys->gap = keys->nr;
}
- journal_iters_move_gap(c, keys->gap, idx);
+ journal_iters_move_gap(c, old_gap, idx);
- move_gap(keys->d, keys->nr, keys->size, keys->gap, idx);
- keys->gap = idx;
+ move_gap(keys, idx);
keys->nr++;
- keys->d[keys->gap++] = n;
+ keys->data[keys->gap++] = n;
journal_iters_fix(c);
@@ -260,6 +289,22 @@ int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
return bch2_journal_key_insert(c, id, level, &whiteout);
}
+bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree,
+ unsigned level, struct bpos pos)
+{
+ struct journal_keys *keys = &trans->c->journal_keys;
+ size_t idx = bch2_journal_key_search(keys, btree, level, pos);
+
+ if (!trans->journal_replay_not_finished)
+ return false;
+
+ return (idx < keys->size &&
+ keys->data[idx].btree_id == btree &&
+ keys->data[idx].level == level &&
+ bpos_eq(keys->data[idx].k->k.p, pos) &&
+ bkey_deleted(&keys->data[idx].k->k));
+}
+
void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
unsigned level, struct bpos pos)
{
@@ -267,10 +312,10 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
size_t idx = bch2_journal_key_search(keys, btree, level, pos);
if (idx < keys->size &&
- keys->d[idx].btree_id == btree &&
- keys->d[idx].level == level &&
- bpos_eq(keys->d[idx].k->k.p, pos))
- keys->d[idx].overwritten = true;
+ keys->data[idx].btree_id == btree &&
+ keys->data[idx].level == level &&
+ bpos_eq(keys->data[idx].k->k.p, pos))
+ keys->data[idx].overwritten = true;
}
static void bch2_journal_iter_advance(struct journal_iter *iter)
@@ -284,16 +329,21 @@ static void bch2_journal_iter_advance(struct journal_iter *iter)
static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
{
- struct journal_key *k = iter->keys->d + iter->idx;
+ journal_iter_verify(iter);
+
+ while (iter->idx < iter->keys->size) {
+ struct journal_key *k = iter->keys->data + iter->idx;
+
+ int cmp = cmp_int(k->btree_id, iter->btree_id) ?:
+ cmp_int(k->level, iter->level);
+ if (cmp > 0)
+ break;
+ BUG_ON(cmp);
- while (k < iter->keys->d + iter->keys->size &&
- k->btree_id == iter->btree_id &&
- k->level == iter->level) {
if (!k->overwritten)
return bkey_i_to_s_c(k->k);
bch2_journal_iter_advance(iter);
- k = iter->keys->d + iter->idx;
}
return bkey_s_c_null;
@@ -313,6 +363,8 @@ static void bch2_journal_iter_init(struct bch_fs *c,
iter->level = level;
iter->keys = &c->journal_keys;
iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos);
+
+ journal_iter_verify(iter);
}
static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
@@ -334,9 +386,38 @@ void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
iter->pos = bpos_successor(iter->pos);
}
+static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter)
+{
+ struct btree_and_journal_iter iter = *_iter;
+ struct bch_fs *c = iter.trans->c;
+ unsigned level = iter.journal.level;
+ struct bkey_buf tmp;
+ unsigned nr = test_bit(BCH_FS_started, &c->flags)
+ ? (level > 1 ? 0 : 2)
+ : (level > 1 ? 1 : 16);
+
+ iter.prefetch = false;
+ bch2_bkey_buf_init(&tmp);
+
+ while (nr--) {
+ bch2_btree_and_journal_iter_advance(&iter);
+ struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter);
+ if (!k.k)
+ break;
+
+ bch2_bkey_buf_reassemble(&tmp, c, k);
+ bch2_btree_node_prefetch(iter.trans, NULL, tmp.k, iter.journal.btree_id, level - 1);
+ }
+
+ bch2_bkey_buf_exit(&tmp, c);
+}
+
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
{
- struct bkey_s_c btree_k, journal_k, ret;
+ struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret;
+
+ if (iter->prefetch && iter->journal.level)
+ btree_and_journal_iter_prefetch(iter);
again:
if (iter->at_end)
return bkey_s_c_null;
@@ -345,9 +426,10 @@ again:
bpos_lt(btree_k.k->p, iter->pos))
bch2_journal_iter_advance_btree(iter);
- while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
- bpos_lt(journal_k.k->p, iter->pos))
- bch2_journal_iter_advance(&iter->journal);
+ if (iter->trans->journal_replay_not_finished)
+ while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
+ bpos_lt(journal_k.k->p, iter->pos))
+ bch2_journal_iter_advance(&iter->journal);
ret = journal_k.k &&
(!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p))
@@ -376,35 +458,40 @@ void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
bch2_journal_iter_exit(&iter->journal);
}
-void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
- struct bch_fs *c,
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
+ struct btree_and_journal_iter *iter,
struct btree *b,
struct btree_node_iter node_iter,
struct bpos pos)
{
memset(iter, 0, sizeof(*iter));
+ iter->trans = trans;
iter->b = b;
iter->node_iter = node_iter;
- bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos);
- INIT_LIST_HEAD(&iter->journal.list);
iter->pos = b->data->min_key;
iter->at_end = false;
+ INIT_LIST_HEAD(&iter->journal.list);
+
+ if (trans->journal_replay_not_finished) {
+ bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos);
+ if (!test_bit(BCH_FS_may_go_rw, &trans->c->flags))
+ list_add(&iter->journal.list, &trans->c->journal_iters);
+ }
}
/*
* this version is used by btree_gc before filesystem has gone RW and
* multithreaded, so uses the journal_iters list:
*/
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
- struct bch_fs *c,
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
+ struct btree_and_journal_iter *iter,
struct btree *b)
{
struct btree_node_iter node_iter;
bch2_btree_node_iter_init_from_start(&node_iter, b);
- __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key);
- list_add(&iter->journal.list, &c->journal_iters);
+ __bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key);
}
/* sort and dedup all keys in the journal: */
@@ -415,9 +502,7 @@ void bch2_journal_entries_free(struct bch_fs *c)
struct genradix_iter iter;
genradix_for_each(&c->journal_entries, iter, i)
- if (*i)
- kvpfree(*i, offsetof(struct journal_replay, j) +
- vstruct_bytes(&(*i)->j));
+ kvfree(*i);
genradix_free(&c->journal_entries);
}
@@ -437,22 +522,20 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
void bch2_journal_keys_put(struct bch_fs *c)
{
struct journal_keys *keys = &c->journal_keys;
- struct journal_key *i;
BUG_ON(atomic_read(&keys->ref) <= 0);
if (!atomic_dec_and_test(&keys->ref))
return;
- move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
- keys->gap = keys->nr;
+ move_gap(keys, keys->nr);
- for (i = keys->d; i < keys->d + keys->nr; i++)
+ darray_for_each(*keys, i)
if (i->allocated)
kfree(i->k);
- kvfree(keys->d);
- keys->d = NULL;
+ kvfree(keys->data);
+ keys->data = NULL;
keys->nr = keys->gap = keys->size = 0;
bch2_journal_entries_free(c);
@@ -460,83 +543,38 @@ void bch2_journal_keys_put(struct bch_fs *c)
static void __journal_keys_sort(struct journal_keys *keys)
{
- struct journal_key *src, *dst;
+ sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL);
- sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL);
+ struct journal_key *dst = keys->data;
- src = dst = keys->d;
- while (src < keys->d + keys->nr) {
- while (src + 1 < keys->d + keys->nr &&
- !journal_key_cmp(src, src + 1))
- src++;
+ darray_for_each(*keys, src) {
+ if (src + 1 < &darray_top(*keys) &&
+ !journal_key_cmp(src, src + 1))
+ continue;
- *dst++ = *src++;
+ *dst++ = *src;
}
- keys->nr = dst - keys->d;
+ keys->nr = dst - keys->data;
}
int bch2_journal_keys_sort(struct bch_fs *c)
{
struct genradix_iter iter;
struct journal_replay *i, **_i;
- struct jset_entry *entry;
- struct bkey_i *k;
struct journal_keys *keys = &c->journal_keys;
- size_t nr_keys = 0, nr_read = 0;
+ size_t nr_read = 0;
genradix_for_each(&c->journal_entries, iter, _i) {
i = *_i;
- if (!i || i->ignore)
- continue;
-
- for_each_jset_key(k, entry, &i->j)
- nr_keys++;
- }
-
- if (!nr_keys)
- return 0;
-
- keys->size = roundup_pow_of_two(nr_keys);
-
- keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
- if (!keys->d) {
- bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath",
- nr_keys);
-
- do {
- keys->size >>= 1;
- keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
- } while (!keys->d && keys->size > nr_keys / 8);
-
- if (!keys->d) {
- bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting",
- keys->size);
- return -BCH_ERR_ENOMEM_journal_keys_sort;
- }
- }
-
- genradix_for_each(&c->journal_entries, iter, _i) {
- i = *_i;
-
- if (!i || i->ignore)
+ if (journal_replay_ignore(i))
continue;
cond_resched();
for_each_jset_key(k, entry, &i->j) {
- if (keys->nr == keys->size) {
- __journal_keys_sort(keys);
-
- if (keys->nr > keys->size * 7 / 8) {
- bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu",
- keys->nr, keys->size, nr_read, nr_keys);
- return -BCH_ERR_ENOMEM_journal_keys_sort;
- }
- }
-
- keys->d[keys->nr++] = (struct journal_key) {
+ struct journal_key n = (struct journal_key) {
.btree_id = entry->btree_id,
.level = entry->level,
.k = k,
@@ -544,6 +582,18 @@ int bch2_journal_keys_sort(struct bch_fs *c)
.journal_offset = k->_data - i->j._data,
};
+ if (darray_push(keys, n)) {
+ __journal_keys_sort(keys);
+
+ if (keys->nr * 8 > keys->size * 7) {
+ bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu",
+ keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq));
+ return -BCH_ERR_ENOMEM_journal_keys_sort;
+ }
+
+ BUG_ON(darray_push(keys, n));
+ }
+
nr_read++;
}
}
@@ -551,6 +601,25 @@ int bch2_journal_keys_sort(struct bch_fs *c)
__journal_keys_sort(keys);
keys->gap = keys->nr;
- bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr);
+ bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr);
return 0;
}
+
+void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree,
+ unsigned level_min, unsigned level_max,
+ struct bpos start, struct bpos end)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ size_t dst = 0;
+
+ move_gap(keys, keys->nr);
+
+ darray_for_each(*keys, i)
+ if (!(i->btree_id == btree &&
+ i->level >= level_min &&
+ i->level <= level_max &&
+ bpos_ge(i->k->k.p, start) &&
+ bpos_le(i->k->k.p, end)))
+ keys->data[dst++] = *i;
+ keys->nr = keys->gap = dst;
+}
diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h
index 8ca4c100b2..af25046ebc 100644
--- a/fs/bcachefs/btree_journal_iter.h
+++ b/fs/bcachefs/btree_journal_iter.h
@@ -15,6 +15,7 @@ struct journal_iter {
*/
struct btree_and_journal_iter {
+ struct btree_trans *trans;
struct btree *b;
struct btree_node_iter node_iter;
struct bkey unpacked;
@@ -22,6 +23,7 @@ struct btree_and_journal_iter {
struct journal_iter journal;
struct bpos pos;
bool at_end;
+ bool prefetch;
};
struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id,
@@ -29,25 +31,27 @@ struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id,
struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
unsigned, struct bpos);
+int bch2_btree_and_journal_iter_prefetch(struct btree_trans *, struct btree_path *,
+ struct btree_and_journal_iter *);
+
int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
unsigned, struct bkey_i *);
int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
unsigned, struct bkey_i *);
int bch2_journal_key_delete(struct bch_fs *, enum btree_id,
unsigned, struct bpos);
-void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id,
- unsigned, struct bpos);
+bool bch2_key_deleted_in_journal(struct btree_trans *, enum btree_id, unsigned, struct bpos);
+void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, unsigned, struct bpos);
void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
-void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
- struct bch_fs *, struct btree *,
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *,
+ struct btree_and_journal_iter *, struct btree *,
struct btree_node_iter, struct bpos);
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
- struct bch_fs *,
- struct btree *);
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *,
+ struct btree_and_journal_iter *, struct btree *);
void bch2_journal_keys_put(struct bch_fs *);
@@ -62,4 +66,8 @@ void bch2_journal_entries_free(struct bch_fs *);
int bch2_journal_keys_sort(struct bch_fs *);
+void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id,
+ unsigned, unsigned,
+ struct bpos, struct bpos);
+
#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 74e52fd28a..7dafa1acce 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -169,6 +169,7 @@ static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
} else {
mutex_lock(&bc->lock);
list_move_tail(&ck->list, &bc->freed_pcpu);
+ bc->nr_freed_pcpu++;
mutex_unlock(&bc->lock);
}
}
@@ -245,6 +246,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
if (!list_empty(&bc->freed_pcpu)) {
ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
list_del_init(&ck->list);
+ bc->nr_freed_pcpu--;
}
mutex_unlock(&bc->lock);
}
@@ -380,9 +382,11 @@ static int btree_key_cache_fill(struct btree_trans *trans,
struct bkey_i *new_k = NULL;
int ret;
- k = bch2_bkey_get_iter(trans, &iter, ck->key.btree_id, ck->key.pos,
- BTREE_ITER_KEY_CACHE_FILL|
- BTREE_ITER_CACHED_NOFILL);
+ bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ck->key.pos,
+ BTREE_ITER_KEY_CACHE_FILL|
+ BTREE_ITER_CACHED_NOFILL);
+ iter.flags &= ~BTREE_ITER_WITH_JOURNAL;
+ k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -657,7 +661,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
commit_flags |= BCH_WATERMARK_reclaim;
if (ck->journal.seq != journal_last_seq(j) ||
- j->watermark == BCH_WATERMARK_stripe)
+ !test_bit(JOURNAL_SPACE_LOW, &c->journal.flags))
commit_flags |= BCH_TRANS_COMMIT_no_journal_res;
ret = bch2_btree_iter_traverse(&b_iter) ?:
@@ -674,7 +678,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
!bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
!bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
!bch2_journal_error(j), c,
- "error flushing key cache: %s", bch2_err_str(ret));
+ "flushing key cache: %s", bch2_err_str(ret));
if (ret)
goto out;
@@ -838,8 +842,6 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
* Newest freed entries are at the end of the list - once we hit one
* that's too new to be freed, we can bail out:
*/
- scanned += bc->nr_freed_nonpcpu;
-
list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
ck->btree_trans_barrier_seq))
@@ -853,11 +855,6 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
bc->nr_freed_nonpcpu--;
}
- if (scanned >= nr)
- goto out;
-
- scanned += bc->nr_freed_pcpu;
-
list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
ck->btree_trans_barrier_seq))
@@ -871,9 +868,6 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
bc->nr_freed_pcpu--;
}
- if (scanned >= nr)
- goto out;
-
rcu_read_lock();
tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
if (bc->shrink_iter >= tbl->size)
@@ -889,12 +883,12 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
ck = container_of(pos, struct bkey_cached, hash);
- if (test_bit(BKEY_CACHED_DIRTY, &ck->flags))
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
goto next;
-
- if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
+ } else if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) {
clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
- else if (bkey_cached_lock_for_evict(ck)) {
+ goto next;
+ } else if (bkey_cached_lock_for_evict(ck)) {
bkey_cached_evict(bc, ck);
bkey_cached_free(bc, ck);
}
@@ -912,7 +906,6 @@ next:
} while (scanned < nr && bc->shrink_iter != start);
rcu_read_unlock();
-out:
memalloc_nofs_restore(flags);
srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
mutex_unlock(&bc->lock);
@@ -963,13 +956,15 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
}
#ifdef __KERNEL__
- for_each_possible_cpu(cpu) {
- struct btree_key_cache_freelist *f =
- per_cpu_ptr(bc->pcpu_freed, cpu);
-
- for (i = 0; i < f->nr; i++) {
- ck = f->objs[i];
- list_add(&ck->list, &items);
+ if (bc->pcpu_freed) {
+ for_each_possible_cpu(cpu) {
+ struct btree_key_cache_freelist *f =
+ per_cpu_ptr(bc->pcpu_freed, cpu);
+
+ for (i = 0; i < f->nr; i++) {
+ ck = f->objs[i];
+ list_add(&ck->list, &items);
+ }
}
}
#endif
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
index 6843974423..f2caf49195 100644
--- a/fs/bcachefs/btree_locking.c
+++ b/fs/bcachefs/btree_locking.c
@@ -440,33 +440,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
struct btree_path *path,
struct btree_bkey_cached_common *b)
{
- struct btree_path *linked;
- unsigned i, iter;
- int ret;
-
- /*
- * XXX BIG FAT NOTICE
- *
- * Drop all read locks before taking a write lock:
- *
- * This is a hack, because bch2_btree_node_lock_write_nofail() is a
- * hack - but by dropping read locks first, this should never fail, and
- * we only use this in code paths where whatever read locks we've
- * already taken are no longer needed:
- */
-
- trans_for_each_path(trans, linked, iter) {
- if (!linked->nodes_locked)
- continue;
-
- for (i = 0; i < BTREE_MAX_DEPTH; i++)
- if (btree_node_read_locked(linked, i)) {
- btree_node_unlock(trans, linked, i);
- btree_path_set_dirty(linked, BTREE_ITER_NEED_RELOCK);
- }
- }
-
- ret = __btree_node_lock_write(trans, path, b, true);
+ int ret = __btree_node_lock_write(trans, path, b, true);
BUG_ON(ret);
}
@@ -747,7 +721,8 @@ void bch2_trans_downgrade(struct btree_trans *trans)
return;
trans_for_each_path(trans, path, i)
- bch2_btree_path_downgrade(trans, path);
+ if (path->ref)
+ bch2_btree_path_downgrade(trans, path);
}
int bch2_trans_relock(struct btree_trans *trans)
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
new file mode 100644
index 0000000000..45cb8149d3
--- /dev/null
+++ b/fs/bcachefs/btree_node_scan.c
@@ -0,0 +1,524 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_journal_iter.h"
+#include "btree_node_scan.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
+#include "error.h"
+#include "journal_io.h"
+#include "recovery_passes.h"
+
+#include <linux/kthread.h>
+#include <linux/sort.h>
+
+struct find_btree_nodes_worker {
+ struct closure *cl;
+ struct find_btree_nodes *f;
+ struct bch_dev *ca;
+};
+
+static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n)
+{
+ prt_printf(out, "%s l=%u seq=%u cookie=%llx ", bch2_btree_id_str(n->btree_id), n->level, n->seq, n->cookie);
+ bch2_bpos_to_text(out, n->min_key);
+ prt_str(out, "-");
+ bch2_bpos_to_text(out, n->max_key);
+
+ if (n->range_updated)
+ prt_str(out, " range updated");
+ if (n->overwritten)
+ prt_str(out, " overwritten");
+
+ for (unsigned i = 0; i < n->nr_ptrs; i++) {
+ prt_char(out, ' ');
+ bch2_extent_ptr_to_text(out, c, n->ptrs + i);
+ }
+}
+
+static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes)
+{
+ printbuf_indent_add(out, 2);
+ darray_for_each(nodes, i) {
+ found_btree_node_to_text(out, c, i);
+ prt_newline(out);
+ }
+ printbuf_indent_sub(out, 2);
+}
+
+static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f)
+{
+ struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k);
+
+ set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs);
+ bp->k.p = f->max_key;
+ bp->v.seq = cpu_to_le64(f->cookie);
+ bp->v.sectors_written = 0;
+ bp->v.flags = 0;
+ bp->v.sectors_written = cpu_to_le16(f->sectors_written);
+ bp->v.min_key = f->min_key;
+ SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated);
+ memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs);
+}
+
+static bool found_btree_node_is_readable(struct btree_trans *trans,
+ struct found_btree_node *f)
+{
+ struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } k;
+
+ found_btree_node_to_key(&k.k, f);
+
+ struct btree *b = bch2_btree_node_get_noiter(trans, &k.k, f->btree_id, f->level, false);
+ bool ret = !IS_ERR_OR_NULL(b);
+ if (ret) {
+ f->sectors_written = b->written;
+ six_unlock_read(&b->c.lock);
+ }
+
+ /*
+ * We might update this node's range; if that happens, we need the node
+ * to be re-read so the read path can trim keys that are no longer in
+ * this node
+ */
+ if (b != btree_node_root(trans->c, b))
+ bch2_btree_node_evict(trans, &k.k);
+ return ret;
+}
+
+static int found_btree_node_cmp_cookie(const void *_l, const void *_r)
+{
+ const struct found_btree_node *l = _l;
+ const struct found_btree_node *r = _r;
+
+ return cmp_int(l->btree_id, r->btree_id) ?:
+ cmp_int(l->level, r->level) ?:
+ cmp_int(l->cookie, r->cookie);
+}
+
+/*
+ * Given two found btree nodes, if their sequence numbers are equal, take the
+ * one that's readable:
+ */
+static int found_btree_node_cmp_time(const struct found_btree_node *l,
+ const struct found_btree_node *r)
+{
+ return cmp_int(l->seq, r->seq);
+}
+
+static int found_btree_node_cmp_pos(const void *_l, const void *_r)
+{
+ const struct found_btree_node *l = _l;
+ const struct found_btree_node *r = _r;
+
+ return cmp_int(l->btree_id, r->btree_id) ?:
+ -cmp_int(l->level, r->level) ?:
+ bpos_cmp(l->min_key, r->min_key) ?:
+ -found_btree_node_cmp_time(l, r);
+}
+
+static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
+ struct bio *bio, struct btree_node *bn, u64 offset)
+{
+ struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
+
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
+ bio->bi_iter.bi_sector = offset;
+ bch2_bio_map(bio, bn, PAGE_SIZE);
+
+ submit_bio_wait(bio);
+ if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
+ "IO error in try_read_btree_node() at %llu: %s",
+ offset, bch2_blk_status_to_str(bio->bi_status)))
+ return;
+
+ if (le64_to_cpu(bn->magic) != bset_magic(c))
+ return;
+
+ if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) {
+ struct nonce nonce = btree_nonce(&bn->keys, 0);
+ unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
+
+ bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, bytes);
+ }
+
+ if (btree_id_is_alloc(BTREE_NODE_ID(bn)))
+ return;
+
+ if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH)
+ return;
+
+ rcu_read_lock();
+ struct found_btree_node n = {
+ .btree_id = BTREE_NODE_ID(bn),
+ .level = BTREE_NODE_LEVEL(bn),
+ .seq = BTREE_NODE_SEQ(bn),
+ .cookie = le64_to_cpu(bn->keys.seq),
+ .min_key = bn->min_key,
+ .max_key = bn->max_key,
+ .nr_ptrs = 1,
+ .ptrs[0].type = 1 << BCH_EXTENT_ENTRY_ptr,
+ .ptrs[0].offset = offset,
+ .ptrs[0].dev = ca->dev_idx,
+ .ptrs[0].gen = *bucket_gen(ca, sector_to_bucket(ca, offset)),
+ };
+ rcu_read_unlock();
+
+ if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) {
+ mutex_lock(&f->lock);
+ if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) {
+ bch_err(c, "try_read_btree_node() can't handle endian conversion");
+ f->ret = -EINVAL;
+ goto unlock;
+ }
+
+ if (darray_push(&f->nodes, n))
+ f->ret = -ENOMEM;
+unlock:
+ mutex_unlock(&f->lock);
+ }
+}
+
+static int read_btree_nodes_worker(void *p)
+{
+ struct find_btree_nodes_worker *w = p;
+ struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes);
+ struct bch_dev *ca = w->ca;
+ void *buf = (void *) __get_free_page(GFP_KERNEL);
+ struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL);
+ unsigned long last_print = jiffies;
+
+ if (!buf || !bio) {
+ bch_err(c, "read_btree_nodes_worker: error allocating bio/buf");
+ w->f->ret = -ENOMEM;
+ goto err;
+ }
+
+ for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++)
+ for (unsigned bucket_offset = 0;
+ bucket_offset + btree_sectors(c) <= ca->mi.bucket_size;
+ bucket_offset += btree_sectors(c)) {
+ if (time_after(jiffies, last_print + HZ * 30)) {
+ u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset;
+ u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size;
+
+ bch_info(ca, "%s: %2u%% done", __func__,
+ (unsigned) div64_u64(cur_sector * 100, end_sector));
+ last_print = jiffies;
+ }
+
+ u64 sector = bucket * ca->mi.bucket_size + bucket_offset;
+
+ if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap &&
+ !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c)))
+ continue;
+
+ try_read_btree_node(w->f, ca, bio, buf, sector);
+ }
+err:
+ bio_put(bio);
+ free_page((unsigned long) buf);
+ percpu_ref_get(&ca->io_ref);
+ closure_put(w->cl);
+ kfree(w);
+ return 0;
+}
+
+static int read_btree_nodes(struct find_btree_nodes *f)
+{
+ struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
+ struct closure cl;
+ int ret = 0;
+
+ closure_init_stack(&cl);
+
+ for_each_online_member(c, ca) {
+ if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree)))
+ continue;
+
+ struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
+ struct task_struct *t;
+
+ if (!w) {
+ percpu_ref_put(&ca->io_ref);
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ percpu_ref_get(&ca->io_ref);
+ closure_get(&cl);
+ w->cl = &cl;
+ w->f = f;
+ w->ca = ca;
+
+ t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
+ ret = IS_ERR_OR_NULL(t);
+ if (ret) {
+ percpu_ref_put(&ca->io_ref);
+ closure_put(&cl);
+ f->ret = ret;
+ bch_err(c, "error starting kthread: %i", ret);
+ break;
+ }
+ }
+err:
+ closure_sync(&cl);
+ return f->ret ?: ret;
+}
+
+static void bubble_up(struct found_btree_node *n, struct found_btree_node *end)
+{
+ while (n + 1 < end &&
+ found_btree_node_cmp_pos(n, n + 1) > 0) {
+ swap(n[0], n[1]);
+ n++;
+ }
+}
+
+static int handle_overwrites(struct bch_fs *c,
+ struct found_btree_node *start,
+ struct found_btree_node *end)
+{
+ struct found_btree_node *n;
+again:
+ for (n = start + 1;
+ n < end &&
+ n->btree_id == start->btree_id &&
+ n->level == start->level &&
+ bpos_lt(n->min_key, start->max_key);
+ n++) {
+ int cmp = found_btree_node_cmp_time(start, n);
+
+ if (cmp > 0) {
+ if (bpos_cmp(start->max_key, n->max_key) >= 0)
+ n->overwritten = true;
+ else {
+ n->range_updated = true;
+ n->min_key = bpos_successor(start->max_key);
+ n->range_updated = true;
+ bubble_up(n, end);
+ goto again;
+ }
+ } else if (cmp < 0) {
+ BUG_ON(bpos_cmp(n->min_key, start->min_key) <= 0);
+
+ start->max_key = bpos_predecessor(n->min_key);
+ start->range_updated = true;
+ } else if (n->level) {
+ n->overwritten = true;
+ } else {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "overlapping btree nodes with same seq! halting\n ");
+ found_btree_node_to_text(&buf, c, start);
+ prt_str(&buf, "\n ");
+ found_btree_node_to_text(&buf, c, n);
+ bch_err(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ return -BCH_ERR_fsck_repair_unimplemented;
+ }
+ }
+
+ return 0;
+}
+
+int bch2_scan_for_btree_nodes(struct bch_fs *c)
+{
+ struct find_btree_nodes *f = &c->found_btree_nodes;
+ struct printbuf buf = PRINTBUF;
+ size_t dst;
+ int ret = 0;
+
+ if (f->nodes.nr)
+ return 0;
+
+ mutex_init(&f->lock);
+
+ ret = read_btree_nodes(f);
+ if (ret)
+ return ret;
+
+ if (!f->nodes.nr) {
+ bch_err(c, "%s: no btree nodes found", __func__);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (0 && c->opts.verbose) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "%s: nodes found:\n", __func__);
+ found_btree_nodes_to_text(&buf, c, f->nodes);
+ bch2_print_string_as_lines(KERN_INFO, buf.buf);
+ }
+
+ sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);
+
+ dst = 0;
+ darray_for_each(f->nodes, i) {
+ struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL;
+
+ if (prev &&
+ prev->cookie == i->cookie) {
+ if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) {
+ bch_err(c, "%s: found too many replicas for btree node", __func__);
+ ret = -EINVAL;
+ goto err;
+ }
+ prev->ptrs[prev->nr_ptrs++] = i->ptrs[0];
+ } else {
+ f->nodes.data[dst++] = *i;
+ }
+ }
+ f->nodes.nr = dst;
+
+ sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
+
+ if (0 && c->opts.verbose) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__);
+ found_btree_nodes_to_text(&buf, c, f->nodes);
+ bch2_print_string_as_lines(KERN_INFO, buf.buf);
+ }
+
+ dst = 0;
+ darray_for_each(f->nodes, i) {
+ if (i->overwritten)
+ continue;
+
+ ret = handle_overwrites(c, i, &darray_top(f->nodes));
+ if (ret)
+ goto err;
+
+ BUG_ON(i->overwritten);
+ f->nodes.data[dst++] = *i;
+ }
+ f->nodes.nr = dst;
+
+ if (c->opts.verbose) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__);
+ found_btree_nodes_to_text(&buf, c, f->nodes);
+ bch2_print_string_as_lines(KERN_INFO, buf.buf);
+ }
+
+ eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
+err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int found_btree_node_range_start_cmp(const void *_l, const void *_r)
+{
+ const struct found_btree_node *l = _l;
+ const struct found_btree_node *r = _r;
+
+ return cmp_int(l->btree_id, r->btree_id) ?:
+ -cmp_int(l->level, r->level) ?:
+ bpos_cmp(l->max_key, r->min_key);
+}
+
+#define for_each_found_btree_node_in_range(_f, _search, _idx) \
+ for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr, \
+ sizeof((_f)->nodes.data[0]), \
+ found_btree_node_range_start_cmp, &search); \
+ _idx < (_f)->nodes.nr && \
+ (_f)->nodes.data[_idx].btree_id == _search.btree_id && \
+ (_f)->nodes.data[_idx].level == _search.level && \
+ bpos_lt((_f)->nodes.data[_idx].min_key, _search.max_key); \
+ _idx = eytzinger0_next(_idx, (_f)->nodes.nr))
+
+bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b)
+{
+ struct find_btree_nodes *f = &c->found_btree_nodes;
+
+ struct found_btree_node search = {
+ .btree_id = b->c.btree_id,
+ .level = b->c.level,
+ .min_key = b->data->min_key,
+ .max_key = b->key.k.p,
+ };
+
+ for_each_found_btree_node_in_range(f, search, idx)
+ if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data))
+ return true;
+ return false;
+}
+
+bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
+{
+ struct found_btree_node search = {
+ .btree_id = btree,
+ .level = 0,
+ .min_key = POS_MIN,
+ .max_key = SPOS_MAX,
+ };
+
+ for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx)
+ return true;
+ return false;
+}
+
+int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
+ unsigned level, struct bpos node_min, struct bpos node_max)
+{
+ if (btree_id_is_alloc(btree))
+ return 0;
+
+ struct find_btree_nodes *f = &c->found_btree_nodes;
+
+ int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
+ if (ret)
+ return ret;
+
+ if (c->opts.verbose) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_printf(&buf, "recovering %s l=%u ", bch2_btree_id_str(btree), level);
+ bch2_bpos_to_text(&buf, node_min);
+ prt_str(&buf, " - ");
+ bch2_bpos_to_text(&buf, node_max);
+
+ bch_info(c, "%s(): %s", __func__, buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ struct found_btree_node search = {
+ .btree_id = btree,
+ .level = level,
+ .min_key = node_min,
+ .max_key = node_max,
+ };
+
+ for_each_found_btree_node_in_range(f, search, idx) {
+ struct found_btree_node n = f->nodes.data[idx];
+
+ n.range_updated |= bpos_lt(n.min_key, node_min);
+ n.min_key = bpos_max(n.min_key, node_min);
+
+ n.range_updated |= bpos_gt(n.max_key, node_max);
+ n.max_key = bpos_min(n.max_key, node_max);
+
+ struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
+
+ found_btree_node_to_key(&tmp.k, &n);
+
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k));
+ bch_verbose(c, "%s(): recovering %s", __func__, buf.buf);
+ printbuf_exit(&buf);
+
+ BUG_ON(bch2_bkey_invalid(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0, NULL));
+
+ ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+void bch2_find_btree_nodes_exit(struct find_btree_nodes *f)
+{
+ darray_exit(&f->nodes);
+}
diff --git a/fs/bcachefs/btree_node_scan.h b/fs/bcachefs/btree_node_scan.h
new file mode 100644
index 0000000000..08687b2097
--- /dev/null
+++ b/fs/bcachefs/btree_node_scan.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_NODE_SCAN_H
+#define _BCACHEFS_BTREE_NODE_SCAN_H
+
+int bch2_scan_for_btree_nodes(struct bch_fs *);
+bool bch2_btree_node_is_stale(struct bch_fs *, struct btree *);
+bool bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id);
+int bch2_get_scanned_nodes(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos);
+void bch2_find_btree_nodes_exit(struct find_btree_nodes *);
+
+#endif /* _BCACHEFS_BTREE_NODE_SCAN_H */
diff --git a/fs/bcachefs/btree_node_scan_types.h b/fs/bcachefs/btree_node_scan_types.h
new file mode 100644
index 0000000000..5cfaeb5ac8
--- /dev/null
+++ b/fs/bcachefs/btree_node_scan_types.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_NODE_SCAN_TYPES_H
+#define _BCACHEFS_BTREE_NODE_SCAN_TYPES_H
+
+#include "darray.h"
+
+struct found_btree_node {
+ bool range_updated:1;
+ bool overwritten:1;
+ u8 btree_id;
+ u8 level;
+ unsigned sectors_written;
+ u32 seq;
+ u64 cookie;
+
+ struct bpos min_key;
+ struct bpos max_key;
+
+ unsigned nr_ptrs;
+ struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX];
+};
+
+typedef DARRAY(struct found_btree_node) found_btree_nodes;
+
+struct find_btree_nodes {
+ int ret;
+ struct mutex lock;
+ found_btree_nodes nodes;
+};
+
+#endif /* _BCACHEFS_BTREE_NODE_SCAN_TYPES_H */
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index 30d69a6d13..bbec91e8e6 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -318,7 +318,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
!(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) &&
test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) &&
i->k->k.p.snapshot &&
- bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot));
+ bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot) > 0);
}
static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
@@ -397,12 +397,13 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
struct bkey_cached *ck = (void *) path->l[0].b;
unsigned new_u64s;
struct bkey_i *new_k;
+ unsigned watermark = flags & BCH_WATERMARK_MASK;
EBUG_ON(path->level);
- if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
- bch2_btree_key_cache_must_wait(c) &&
- !(flags & BCH_TRANS_COMMIT_journal_reclaim))
+ if (watermark < BCH_WATERMARK_reclaim &&
+ !test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
+ bch2_btree_key_cache_must_wait(c))
return -BCH_ERR_btree_insert_need_journal_reclaim;
/*
@@ -499,9 +500,8 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
}
static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
- struct btree_insert_entry *btree_id_start)
+ unsigned btree_id_start)
{
- struct btree_insert_entry *i;
bool trans_trigger_run;
int ret, overwrite;
@@ -514,13 +514,13 @@ static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
do {
trans_trigger_run = false;
- for (i = btree_id_start;
- i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
+ for (unsigned i = btree_id_start;
+ i < trans->nr_updates && trans->updates[i].btree_id <= btree_id;
i++) {
- if (i->btree_id != btree_id)
+ if (trans->updates[i].btree_id != btree_id)
continue;
- ret = run_one_trans_trigger(trans, i, overwrite);
+ ret = run_one_trans_trigger(trans, trans->updates + i, overwrite);
if (ret < 0)
return ret;
if (ret)
@@ -534,8 +534,7 @@ static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
{
- struct btree_insert_entry *btree_id_start = trans->updates;
- unsigned btree_id = 0;
+ unsigned btree_id = 0, btree_id_start = 0;
int ret = 0;
/*
@@ -549,8 +548,8 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
if (btree_id == BTREE_ID_alloc)
continue;
- while (btree_id_start < trans->updates + trans->nr_updates &&
- btree_id_start->btree_id < btree_id)
+ while (btree_id_start < trans->nr_updates &&
+ trans->updates[btree_id_start].btree_id < btree_id)
btree_id_start++;
ret = run_btree_triggers(trans, btree_id, btree_id_start);
@@ -558,11 +557,13 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
return ret;
}
- trans_for_each_update(trans, i) {
+ for (unsigned idx = 0; idx < trans->nr_updates; idx++) {
+ struct btree_insert_entry *i = trans->updates + idx;
+
if (i->btree_id > BTREE_ID_alloc)
break;
if (i->btree_id == BTREE_ID_alloc) {
- ret = run_btree_triggers(trans, BTREE_ID_alloc, i);
+ ret = run_btree_triggers(trans, BTREE_ID_alloc, idx);
if (ret)
return ret;
break;
@@ -826,7 +827,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
struct bch_fs *c = trans->c;
int ret = 0, u64s_delta = 0;
- trans_for_each_update(trans, i) {
+ for (unsigned idx = 0; idx < trans->nr_updates; idx++) {
+ struct btree_insert_entry *i = trans->updates + idx;
if (i->cached)
continue;
@@ -887,6 +889,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
int ret, unsigned long trace_ip)
{
struct bch_fs *c = trans->c;
+ enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
switch (ret) {
case -BCH_ERR_btree_insert_btree_node_full:
@@ -905,7 +908,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
* flag
*/
if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
- (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) {
+ watermark < BCH_WATERMARK_reclaim) {
ret = -BCH_ERR_journal_reclaim_would_deadlock;
break;
}
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 4a5a64499e..c69b233c41 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -5,6 +5,7 @@
#include <linux/list.h>
#include <linux/rhashtable.h>
+#include "bbpos_types.h"
#include "btree_key_cache_types.h"
#include "buckets_types.h"
#include "darray.h"
@@ -173,6 +174,11 @@ struct btree_cache {
*/
struct task_struct *alloc_lock;
struct closure_waitlist alloc_wait;
+
+ struct bbpos pinned_nodes_start;
+ struct bbpos pinned_nodes_end;
+ u64 pinned_nodes_leaf_mask;
+ u64 pinned_nodes_interior_mask;
};
struct btree_node_iter {
@@ -315,9 +321,9 @@ struct bkey_cached {
struct btree_bkey_cached_common c;
unsigned long flags;
+ unsigned long btree_trans_barrier_seq;
u16 u64s;
bool valid;
- u32 btree_trans_barrier_seq;
struct bkey_cached_key key;
struct rhash_head hash;
@@ -358,7 +364,21 @@ struct btree_insert_entry {
unsigned long ip_allocated;
};
+/* Number of btree paths we preallocate, usually enough */
#define BTREE_ITER_INITIAL 64
+/*
+ * Lmiit for btree_trans_too_many_iters(); this is enough that almost all code
+ * paths should run inside this limit, and if they don't it usually indicates a
+ * bug (leaking/duplicated btree paths).
+ *
+ * exception: some fsck paths
+ *
+ * bugs with excessive path usage seem to have possibly been eliminated now, so
+ * we might consider eliminating this (and btree_trans_too_many_iter()) at some
+ * point.
+ */
+#define BTREE_ITER_NORMAL_LIMIT 256
+/* never exceed limit */
#define BTREE_ITER_MAX (1U << 10)
struct btree_trans_commit_hook;
@@ -654,6 +674,7 @@ const char *bch2_btree_node_type_str(enum btree_node_type);
BIT_ULL(BKEY_TYPE_inodes)| \
BIT_ULL(BKEY_TYPE_stripes)| \
BIT_ULL(BKEY_TYPE_reflink)| \
+ BIT_ULL(BKEY_TYPE_subvolumes)| \
BIT_ULL(BKEY_TYPE_btree))
#define BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS \
@@ -727,7 +748,7 @@ struct btree_root {
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
u8 level;
u8 alive;
- s8 error;
+ s16 error;
};
enum btree_gc_coalesce_fail_reason {
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
index c3ff365acc..8e47e260eb 100644
--- a/fs/bcachefs/btree_update.c
+++ b/fs/bcachefs/btree_update.c
@@ -38,6 +38,9 @@ static noinline int extent_front_merge(struct btree_trans *trans,
struct bkey_i *update;
int ret;
+ if (unlikely(trans->journal_replay_not_finished))
+ return 0;
+
update = bch2_bkey_make_mut_noupdate(trans, k);
ret = PTR_ERR_OR_ZERO(update);
if (ret)
@@ -69,6 +72,9 @@ static noinline int extent_back_merge(struct btree_trans *trans,
struct bch_fs *c = trans->c;
int ret;
+ if (unlikely(trans->journal_replay_not_finished))
+ return 0;
+
ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, insert->k.p) ?:
bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p);
if (ret < 0)
@@ -452,7 +458,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
* the key cache - but the key has to exist in the btree for that to
* work:
*/
- if (path->cached && bkey_deleted(&i->old_k))
+ if (path->cached && !i->old_btree_u64s)
return flush_new_cached_update(trans, i, flags, ip);
return 0;
@@ -788,6 +794,27 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
struct bpos pos, bool set)
{
+ struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
+ int ret = PTR_ERR_OR_ZERO(k);
+ if (ret)
+ return ret;
+
+ bkey_init(&k->k);
+ k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
+ k->k.p = pos;
+
+ struct btree_iter iter;
+ bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT);
+
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(trans, &iter, k, 0);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree,
+ struct bpos pos, bool set)
+{
struct bkey_i k;
bkey_init(&k.k);
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index b9382b7b28..cc7c53e83f 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -63,11 +63,12 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
struct bpos, struct bpos, unsigned, u64 *);
int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool);
+int bch2_btree_bit_mod_buffered(struct btree_trans *, enum btree_id, struct bpos, bool);
static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans,
enum btree_id btree, struct bpos pos)
{
- return bch2_btree_bit_mod(trans, btree, pos, false);
+ return bch2_btree_bit_mod_buffered(trans, btree, pos, false);
}
int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id,
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 4530b14ff2..b4efd8cc4d 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "alloc_foreground.h"
+#include "bkey_buf.h"
#include "bkey_methods.h"
#include "btree_cache.h"
#include "btree_gc.h"
@@ -18,15 +19,23 @@
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"
+#include "recovery_passes.h"
#include "replicas.h"
+#include "sb-members.h"
#include "super-io.h"
#include "trace.h"
#include <linux/random.h>
+static const char * const bch2_btree_update_modes[] = {
+#define x(t) #t,
+ BTREE_UPDATE_MODES()
+#undef x
+ NULL
+};
+
static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
- btree_path_idx_t, struct btree *,
- struct keylist *, unsigned);
+ btree_path_idx_t, struct btree *, struct keylist *);
static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans,
@@ -45,56 +54,103 @@ static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans,
return path_idx;
}
-/* Debug code: */
-
/*
* Verify that child nodes correctly span parent node's range:
*/
-static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
+int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
{
-#ifdef CONFIG_BCACHEFS_DEBUG
- struct bpos next_node = b->data->min_key;
- struct btree_node_iter iter;
+ struct bch_fs *c = trans->c;
+ struct bpos node_min = b->key.k.type == KEY_TYPE_btree_ptr_v2
+ ? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key
+ : b->data->min_key;
+ struct btree_and_journal_iter iter;
struct bkey_s_c k;
- struct bkey_s_c_btree_ptr_v2 bp;
- struct bkey unpacked;
- struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+ struct printbuf buf = PRINTBUF;
+ struct bkey_buf prev;
+ int ret = 0;
- BUG_ON(!b->c.level);
+ BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+ !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
+ b->data->min_key));
- if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
- return;
+ if (!b->c.level)
+ return 0;
- bch2_btree_node_iter_init_from_start(&iter, b);
+ bch2_bkey_buf_init(&prev);
+ bkey_init(&prev.k->k);
+ bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
- while (1) {
- k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked);
+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
if (k.k->type != KEY_TYPE_btree_ptr_v2)
- break;
- bp = bkey_s_c_to_btree_ptr_v2(k);
+ goto out;
- if (!bpos_eq(next_node, bp.v->min_key)) {
- bch2_dump_btree_node(c, b);
- bch2_bpos_to_text(&buf1, next_node);
- bch2_bpos_to_text(&buf2, bp.v->min_key);
- panic("expected next min_key %s got %s\n", buf1.buf, buf2.buf);
- }
+ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
- bch2_btree_node_iter_advance(&iter, b);
+ struct bpos expected_min = bkey_deleted(&prev.k->k)
+ ? node_min
+ : bpos_successor(prev.k->k.p);
- if (bch2_btree_node_iter_end(&iter)) {
- if (!bpos_eq(k.k->p, b->key.k.p)) {
- bch2_dump_btree_node(c, b);
- bch2_bpos_to_text(&buf1, b->key.k.p);
- bch2_bpos_to_text(&buf2, k.k->p);
- panic("expected end %s got %s\n", buf1.buf, buf2.buf);
- }
- break;
+ if (!bpos_eq(expected_min, bp.v->min_key)) {
+ bch2_topology_error(c);
+
+ printbuf_reset(&buf);
+ prt_str(&buf, "end of prev node doesn't match start of next node\n"),
+ prt_printf(&buf, " in btree %s level %u node ",
+ bch2_btree_id_str(b->c.btree_id), b->c.level);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ prt_str(&buf, "\n prev ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
+ prt_str(&buf, "\n next ");
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ need_fsck_err(c, btree_node_topology_bad_min_key, "%s", buf.buf);
+ goto topology_repair;
}
- next_node = bpos_successor(k.k->p);
+ bch2_bkey_buf_reassemble(&prev, c, k);
+ bch2_btree_and_journal_iter_advance(&iter);
+ }
+
+ if (bkey_deleted(&prev.k->k)) {
+ bch2_topology_error(c);
+
+ printbuf_reset(&buf);
+ prt_str(&buf, "empty interior node\n");
+ prt_printf(&buf, " in btree %s level %u node ",
+ bch2_btree_id_str(b->c.btree_id), b->c.level);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+
+ need_fsck_err(c, btree_node_topology_empty_interior_node, "%s", buf.buf);
+ goto topology_repair;
+ } else if (!bpos_eq(prev.k->k.p, b->key.k.p)) {
+ bch2_topology_error(c);
+
+ printbuf_reset(&buf);
+ prt_str(&buf, "last child node doesn't end at end of parent node\n");
+ prt_printf(&buf, " in btree %s level %u node ",
+ bch2_btree_id_str(b->c.btree_id), b->c.level);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ prt_str(&buf, "\n last key ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
+
+ need_fsck_err(c, btree_node_topology_bad_max_key, "%s", buf.buf);
+ goto topology_repair;
}
-#endif
+out:
+fsck_err:
+ bch2_btree_and_journal_iter_exit(&iter);
+ bch2_bkey_buf_exit(&prev, c);
+ printbuf_exit(&buf);
+ return ret;
+topology_repair:
+ if ((c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) &&
+ c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) {
+ bch2_inconsistent_error(c);
+ ret = -BCH_ERR_btree_need_topology_repair;
+ } else {
+ ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
+ }
+ goto out;
}
/* Calculate ideal packed bkey format for new btree nodes: */
@@ -255,7 +311,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
struct open_buckets obs = { .nr = 0 };
struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
- unsigned nr_reserve = watermark > BCH_WATERMARK_reclaim
+ unsigned nr_reserve = watermark < BCH_WATERMARK_reclaim
? BTREE_NODE_RESERVE
: 0;
int ret;
@@ -550,6 +606,26 @@ static void btree_update_add_key(struct btree_update *as,
bch2_keylist_push(keys);
}
+static bool btree_update_new_nodes_marked_sb(struct btree_update *as)
+{
+ for_each_keylist_key(&as->new_keys, k)
+ if (!bch2_dev_btree_bitmap_marked(as->c, bkey_i_to_s_c(k)))
+ return false;
+ return true;
+}
+
+static void btree_update_new_nodes_mark_sb(struct btree_update *as)
+{
+ struct bch_fs *c = as->c;
+
+ mutex_lock(&c->sb_lock);
+ for_each_keylist_key(&as->new_keys, k)
+ bch2_dev_btree_bitmap_mark(c, bkey_i_to_s_c(k));
+
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+}
+
/*
* The transactional part of an interior btree node update, where we journal the
* update we did to the interior node and update alloc info:
@@ -607,6 +683,9 @@ static void btree_update_nodes_written(struct btree_update *as)
if (ret)
goto err;
+ if (!btree_update_new_nodes_marked_sb(as))
+ btree_update_new_nodes_mark_sb(as);
+
/*
* Wait for any in flight writes to finish before we free the old nodes
* on disk:
@@ -639,7 +718,7 @@ static void btree_update_nodes_written(struct btree_update *as)
* which may require allocations as well.
*/
ret = commit_do(trans, &as->disk_res, &journal_seq,
- BCH_WATERMARK_reclaim|
+ BCH_WATERMARK_interior_updates|
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_journal_reclaim,
@@ -647,11 +726,15 @@ static void btree_update_nodes_written(struct btree_update *as)
bch2_trans_unlock(trans);
bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
- "%s(): error %s", __func__, bch2_err_str(ret));
+ "%s", bch2_err_str(ret));
err:
- if (as->b) {
-
- b = as->b;
+ /*
+ * We have to be careful because another thread might be getting ready
+ * to free as->b and calling btree_update_reparent() on us - we'll
+ * recheck under btree_update_lock below:
+ */
+ b = READ_ONCE(as->b);
+ if (b) {
btree_path_idx_t path_idx = get_unlocked_mut_path(trans,
as->btree_id, b->c.level, b->key.k.p);
struct btree_path *path = trans->paths + path_idx;
@@ -795,15 +878,17 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b)
{
struct bch_fs *c = as->c;
- mutex_lock(&c->btree_interior_update_lock);
- list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
-
- BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
+ BUG_ON(as->mode != BTREE_UPDATE_none);
+ BUG_ON(as->update_level_end < b->c.level);
BUG_ON(!btree_node_dirty(b));
BUG_ON(!b->c.level);
- as->mode = BTREE_INTERIOR_UPDATING_NODE;
+ mutex_lock(&c->btree_interior_update_lock);
+ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
+
+ as->mode = BTREE_UPDATE_node;
as->b = b;
+ as->update_level_end = b->c.level;
set_btree_node_write_blocked(b);
list_add(&as->write_blocked_list, &b->write_blocked);
@@ -825,7 +910,7 @@ static void btree_update_reparent(struct btree_update *as,
lockdep_assert_held(&c->btree_interior_update_lock);
child->b = NULL;
- child->mode = BTREE_INTERIOR_UPDATING_AS;
+ child->mode = BTREE_UPDATE_update;
bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal,
bch2_update_reparent_journal_pin_flush);
@@ -836,7 +921,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b)
struct bkey_i *insert = &b->key;
struct bch_fs *c = as->c;
- BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
+ BUG_ON(as->mode != BTREE_UPDATE_none);
BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
ARRAY_SIZE(as->journal_entries));
@@ -850,7 +935,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b)
mutex_lock(&c->btree_interior_update_lock);
list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
- as->mode = BTREE_INTERIOR_UPDATING_ROOT;
+ as->mode = BTREE_UPDATE_root;
mutex_unlock(&c->btree_interior_update_lock);
}
@@ -1028,7 +1113,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *
struct bch_fs *c = as->c;
u64 start_time = as->start_time;
- BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE);
+ BUG_ON(as->mode == BTREE_UPDATE_none);
if (as->took_gc_lock)
up_read(&as->c->gc_lock);
@@ -1045,7 +1130,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *
static struct btree_update *
bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
- unsigned level, bool split, unsigned flags)
+ unsigned level_start, bool split, unsigned flags)
{
struct bch_fs *c = trans->c;
struct btree_update *as;
@@ -1053,7 +1138,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc)
? BCH_DISK_RESERVATION_NOFAIL : 0;
unsigned nr_nodes[2] = { 0, 0 };
- unsigned update_level = level;
+ unsigned level_end = level_start;
enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
int ret = 0;
u32 restart_count = trans->restart_count;
@@ -1068,29 +1153,30 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
flags &= ~BCH_WATERMARK_MASK;
flags |= watermark;
- if (!(flags & BCH_TRANS_COMMIT_journal_reclaim) &&
- watermark < c->journal.watermark) {
- struct journal_res res = { 0 };
+ if (watermark < BCH_WATERMARK_reclaim &&
+ test_bit(JOURNAL_SPACE_LOW, &c->journal.flags)) {
+ if (flags & BCH_TRANS_COMMIT_journal_reclaim)
+ return ERR_PTR(-BCH_ERR_journal_reclaim_would_deadlock);
- ret = drop_locks_do(trans,
- bch2_journal_res_get(&c->journal, &res, 1,
- watermark|JOURNAL_RES_GET_CHECK));
+ bch2_trans_unlock(trans);
+ wait_event(c->journal.wait, !test_bit(JOURNAL_SPACE_LOW, &c->journal.flags));
+ ret = bch2_trans_relock(trans);
if (ret)
return ERR_PTR(ret);
}
while (1) {
- nr_nodes[!!update_level] += 1 + split;
- update_level++;
+ nr_nodes[!!level_end] += 1 + split;
+ level_end++;
- ret = bch2_btree_path_upgrade(trans, path, update_level + 1);
+ ret = bch2_btree_path_upgrade(trans, path, level_end + 1);
if (ret)
return ERR_PTR(ret);
- if (!btree_path_node(path, update_level)) {
+ if (!btree_path_node(path, level_end)) {
/* Allocating new root? */
nr_nodes[1] += split;
- update_level = BTREE_MAX_DEPTH;
+ level_end = BTREE_MAX_DEPTH;
break;
}
@@ -1098,11 +1184,11 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
* Always check for space for two keys, even if we won't have to
* split at prior level - it might have been a merge instead:
*/
- if (bch2_btree_node_insert_fits(path->l[update_level].b,
+ if (bch2_btree_node_insert_fits(path->l[level_end].b,
BKEY_BTREE_PTR_U64s_MAX * 2))
break;
- split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
+ split = path->l[level_end].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
}
if (!down_read_trylock(&c->gc_lock)) {
@@ -1116,12 +1202,15 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS);
memset(as, 0, sizeof(*as));
closure_init(&as->cl, NULL);
- as->c = c;
- as->start_time = start_time;
- as->mode = BTREE_INTERIOR_NO_UPDATE;
- as->took_gc_lock = true;
- as->btree_id = path->btree_id;
- as->update_level = update_level;
+ as->c = c;
+ as->start_time = start_time;
+ as->ip_started = _RET_IP_;
+ as->mode = BTREE_UPDATE_none;
+ as->watermark = watermark;
+ as->took_gc_lock = true;
+ as->btree_id = path->btree_id;
+ as->update_level_start = level_start;
+ as->update_level_end = level_end;
INIT_LIST_HEAD(&as->list);
INIT_LIST_HEAD(&as->unwritten_list);
INIT_LIST_HEAD(&as->write_blocked_list);
@@ -1163,7 +1252,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
*/
if (bch2_err_matches(ret, ENOSPC) &&
(flags & BCH_TRANS_COMMIT_journal_reclaim) &&
- watermark != BCH_WATERMARK_reclaim) {
+ watermark < BCH_WATERMARK_reclaim) {
ret = -BCH_ERR_journal_reclaim_would_deadlock;
goto err;
}
@@ -1193,7 +1282,8 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
err:
bch2_btree_update_free(as, trans);
if (!bch2_err_matches(ret, ENOSPC) &&
- !bch2_err_matches(ret, EROFS))
+ !bch2_err_matches(ret, EROFS) &&
+ ret != -BCH_ERR_journal_reclaim_would_deadlock)
bch_err_fn_ratelimited(c, ret);
return ERR_PTR(ret);
}
@@ -1208,33 +1298,35 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
mutex_unlock(&c->btree_cache.lock);
mutex_lock(&c->btree_root_lock);
- BUG_ON(btree_node_root(c, b) &&
- (b->c.level < btree_node_root(c, b)->c.level ||
- !btree_node_dying(btree_node_root(c, b))));
-
bch2_btree_id_root(c, b->c.btree_id)->b = b;
mutex_unlock(&c->btree_root_lock);
bch2_recalc_btree_reserve(c);
}
-static void bch2_btree_set_root(struct btree_update *as,
- struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b)
+static int bch2_btree_set_root(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ bool nofail)
{
struct bch_fs *c = as->c;
- struct btree *old;
trace_and_count(c, btree_node_set_root, trans, b);
- old = btree_node_root(c, b);
+ struct btree *old = btree_node_root(c, b);
/*
* Ensure no one is using the old root while we switch to the
* new root:
*/
- bch2_btree_node_lock_write_nofail(trans, path, &old->c);
+ if (nofail) {
+ bch2_btree_node_lock_write_nofail(trans, path, &old->c);
+ } else {
+ int ret = bch2_btree_node_lock_write(trans, path, &old->c);
+ if (ret)
+ return ret;
+ }
bch2_btree_set_root_inmem(c, b);
@@ -1248,6 +1340,7 @@ static void bch2_btree_set_root(struct btree_update *as,
* depend on the new root would have to update the new root.
*/
bch2_btree_node_unlock_write(trans, path, old);
+ return 0;
}
/* Interior node updates: */
@@ -1314,12 +1407,12 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
}
static void
-__bch2_btree_insert_keys_interior(struct btree_update *as,
- struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b,
- struct btree_node_iter node_iter,
- struct keylist *keys)
+bch2_btree_insert_keys_interior(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ struct btree_node_iter node_iter,
+ struct keylist *keys)
{
struct bkey_i *insert = bch2_keylist_front(keys);
struct bkey_packed *k;
@@ -1378,9 +1471,16 @@ static void __btree_split_node(struct btree_update *as,
if (bkey_deleted(k))
continue;
+ uk = bkey_unpack_key(b, k);
+
+ if (b->c.level &&
+ u64s < n1_u64s &&
+ u64s + k->u64s >= n1_u64s &&
+ bch2_key_deleted_in_journal(trans, b->c.btree_id, b->c.level, uk.p))
+ n1_u64s += k->u64s;
+
i = u64s >= n1_u64s;
u64s += k->u64s;
- uk = bkey_unpack_key(b, k);
if (!i)
n1_pos = uk.p;
bch2_bkey_format_add_key(&format[i], &uk);
@@ -1439,8 +1539,7 @@ static void __btree_split_node(struct btree_update *as,
bch2_verify_btree_nr_keys(n[i]);
- if (b->c.level)
- btree_node_interior_verify(as->c, n[i]);
+ BUG_ON(bch2_btree_node_check_topology(trans, n[i]));
}
}
@@ -1469,15 +1568,15 @@ static void btree_split_insert_keys(struct btree_update *as,
bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p);
- __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
+ bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
- btree_node_interior_verify(as->c, b);
+ BUG_ON(bch2_btree_node_check_topology(trans, b));
}
}
static int btree_split(struct btree_update *as, struct btree_trans *trans,
btree_path_idx_t path, struct btree *b,
- struct keylist *keys, unsigned flags)
+ struct keylist *keys)
{
struct bch_fs *c = as->c;
struct btree *parent = btree_node_parent(trans->paths + path, b);
@@ -1486,9 +1585,14 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
u64 start_time = local_clock();
int ret = 0;
+ bch2_verify_btree_nr_keys(b);
BUG_ON(!parent && (b != btree_node_root(c, b)));
BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1));
+ ret = bch2_btree_node_check_topology(trans, b);
+ if (ret)
+ return ret;
+
bch2_btree_interior_update_will_free_node(as, b);
if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) {
@@ -1578,16 +1682,17 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
if (parent) {
/* Split a non root node */
- ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
- if (ret)
- goto err;
+ ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
} else if (n3) {
- bch2_btree_set_root(as, trans, trans->paths + path, n3);
+ ret = bch2_btree_set_root(as, trans, trans->paths + path, n3, false);
} else {
/* Root filled up but didn't need to be split */
- bch2_btree_set_root(as, trans, trans->paths + path, n1);
+ ret = bch2_btree_set_root(as, trans, trans->paths + path, n1, false);
}
+ if (ret)
+ goto err;
+
if (n3) {
bch2_btree_update_get_open_buckets(as, n3);
bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0);
@@ -1644,27 +1749,6 @@ err:
goto out;
}
-static void
-bch2_btree_insert_keys_interior(struct btree_update *as,
- struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b,
- struct keylist *keys)
-{
- struct btree_path *linked;
- unsigned i;
-
- __bch2_btree_insert_keys_interior(as, trans, path, b,
- path->l[b->c.level].iter, keys);
-
- btree_update_updated_node(as, b);
-
- trans_for_each_path_with_node(trans, b, linked, i)
- bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
-
- bch2_trans_verify_paths(trans);
-}
-
/**
* bch2_btree_insert_node - insert bkeys into a given btree node
*
@@ -1673,7 +1757,6 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
* @path_idx: path that points to current node
* @b: node to insert keys into
* @keys: list of keys to insert
- * @flags: transaction commit flags
*
* Returns: 0 on success, typically transaction restart error on failure
*
@@ -1683,10 +1766,11 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
*/
static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
btree_path_idx_t path_idx, struct btree *b,
- struct keylist *keys, unsigned flags)
+ struct keylist *keys)
{
struct bch_fs *c = as->c;
- struct btree_path *path = trans->paths + path_idx;
+ struct btree_path *path = trans->paths + path_idx, *linked;
+ unsigned i;
int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
@@ -1709,9 +1793,19 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
goto split;
}
- btree_node_interior_verify(c, b);
+ ret = bch2_btree_node_check_topology(trans, b);
+ if (ret) {
+ bch2_btree_node_unlock_write(trans, path, b);
+ return ret;
+ }
+
+ bch2_btree_insert_keys_interior(as, trans, path, b,
+ path->l[b->c.level].iter, keys);
+
+ trans_for_each_path_with_node(trans, b, linked, i)
+ bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
- bch2_btree_insert_keys_interior(as, trans, path, b, keys);
+ bch2_trans_verify_paths(trans);
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
@@ -1725,21 +1819,22 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
bch2_maybe_compact_whiteouts(c, b))
bch2_trans_node_reinit_iter(trans, b);
+ btree_update_updated_node(as, b);
bch2_btree_node_unlock_write(trans, path, b);
- btree_node_interior_verify(c, b);
+ BUG_ON(bch2_btree_node_check_topology(trans, b));
return 0;
split:
/*
* We could attempt to avoid the transaction restart, by calling
* bch2_btree_path_upgrade() and allocating more nodes:
*/
- if (b->c.level >= as->update_level) {
+ if (b->c.level >= as->update_level_end) {
trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
}
- return btree_split(as, trans, path_idx, b, keys, flags);
+ return btree_split(as, trans, path_idx, b, keys);
}
int bch2_btree_split_leaf(struct btree_trans *trans,
@@ -1747,7 +1842,6 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
unsigned flags)
{
/* btree_split & merge may both cause paths array to be reallocated */
-
struct btree *b = path_l(trans->paths + path)->b;
struct btree_update *as;
unsigned l;
@@ -1759,7 +1853,7 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
if (IS_ERR(as))
return PTR_ERR(as);
- ret = btree_split(as, trans, path, b, NULL, flags);
+ ret = btree_split(as, trans, path, b, NULL);
if (ret) {
bch2_btree_update_free(as, trans);
return ret;
@@ -1775,6 +1869,65 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
return ret;
}
+static void __btree_increase_depth(struct btree_update *as, struct btree_trans *trans,
+ btree_path_idx_t path_idx)
+{
+ struct bch_fs *c = as->c;
+ struct btree_path *path = trans->paths + path_idx;
+ struct btree *n, *b = bch2_btree_id_root(c, path->btree_id)->b;
+
+ BUG_ON(!btree_node_locked(path, b->c.level));
+
+ n = __btree_root_alloc(as, trans, b->c.level + 1);
+
+ bch2_btree_update_add_new_node(as, n);
+ six_unlock_write(&n->c.lock);
+
+ path->locks_want++;
+ BUG_ON(btree_node_locked(path, n->c.level));
+ six_lock_increment(&n->c.lock, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, path, n->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, path, n);
+
+ n->sib_u64s[0] = U16_MAX;
+ n->sib_u64s[1] = U16_MAX;
+
+ bch2_keylist_add(&as->parent_keys, &b->key);
+ btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys);
+
+ int ret = bch2_btree_set_root(as, trans, path, n, true);
+ BUG_ON(ret);
+
+ bch2_btree_update_get_open_buckets(as, n);
+ bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
+ bch2_trans_node_add(trans, path, n);
+ six_unlock_intent(&n->c.lock);
+
+ mutex_lock(&c->btree_cache.lock);
+ list_add_tail(&b->list, &c->btree_cache.live);
+ mutex_unlock(&c->btree_cache.lock);
+
+ bch2_trans_verify_locks(trans);
+}
+
+int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b;
+
+ if (btree_node_fake(b))
+ return bch2_btree_split_leaf(trans, path, flags);
+
+ struct btree_update *as =
+ bch2_btree_update_start(trans, trans->paths + path, b->c.level, true, flags);
+ if (IS_ERR(as))
+ return PTR_ERR(as);
+
+ __btree_increase_depth(as, trans, path);
+ bch2_btree_update_done(as, trans);
+ return 0;
+}
+
int __bch2_foreground_maybe_merge(struct btree_trans *trans,
btree_path_idx_t path,
unsigned level,
@@ -1797,6 +1950,22 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
BUG_ON(!trans->paths[path].should_be_locked);
BUG_ON(!btree_node_locked(&trans->paths[path], level));
+ /*
+ * Work around a deadlock caused by the btree write buffer not doing
+ * merges and leaving tons of merges for us to do - we really don't need
+ * to be doing merges at all from the interior update path, and if the
+ * interior update path is generating too many new interior updates we
+ * deadlock:
+ */
+ if ((flags & BCH_WATERMARK_MASK) == BCH_WATERMARK_interior_updates)
+ return 0;
+
+ if ((flags & BCH_WATERMARK_MASK) <= BCH_WATERMARK_reclaim) {
+ flags &= ~BCH_WATERMARK_MASK;
+ flags |= BCH_WATERMARK_btree;
+ flags |= BCH_TRANS_COMMIT_journal_reclaim;
+ }
+
b = trans->paths[path].l[level].b;
if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) ||
@@ -1845,8 +2014,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
__func__, buf1.buf, buf2.buf);
printbuf_exit(&buf1);
printbuf_exit(&buf2);
- bch2_topology_error(c);
- ret = -EIO;
+ ret = bch2_topology_error(c);
goto err;
}
@@ -1916,7 +2084,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
bch2_trans_verify_paths(trans);
- ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
+ ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
if (ret)
goto err_free_update;
@@ -1943,6 +2111,10 @@ err:
bch2_path_put(trans, new_path, true);
bch2_path_put(trans, sib_path, true);
bch2_trans_verify_locks(trans);
+ if (ret == -BCH_ERR_journal_reclaim_would_deadlock)
+ ret = 0;
+ if (!ret)
+ ret = bch2_trans_relock(trans);
return ret;
err_free_update:
bch2_btree_node_free_never_used(as, trans, n);
@@ -1987,14 +2159,14 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
if (parent) {
bch2_keylist_add(&as->parent_keys, &n->key);
- ret = bch2_btree_insert_node(as, trans, iter->path,
- parent, &as->parent_keys, flags);
- if (ret)
- goto err;
+ ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys);
} else {
- bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n);
+ ret = bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n, false);
}
+ if (ret)
+ goto err;
+
bch2_btree_update_get_open_buckets(as, n);
bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
@@ -2069,7 +2241,7 @@ static void async_btree_node_rewrite_work(struct work_struct *work)
ret = bch2_trans_do(c, NULL, NULL, 0,
async_btree_node_rewrite_trans(trans, a));
- bch_err_fn(c, ret);
+ bch_err_fn_ratelimited(c, ret);
bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
kfree(a);
}
@@ -2116,7 +2288,7 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
}
- queue_work(c->btree_interior_update_worker, &a->work);
+ queue_work(c->btree_node_rewrite_worker, &a->work);
}
void bch2_do_pending_node_rewrites(struct bch_fs *c)
@@ -2128,7 +2300,7 @@ void bch2_do_pending_node_rewrites(struct bch_fs *c)
list_del(&a->list);
bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
- queue_work(c->btree_interior_update_worker, &a->work);
+ queue_work(c->btree_node_rewrite_worker, &a->work);
}
mutex_unlock(&c->pending_node_rewrites_lock);
}
@@ -2339,7 +2511,7 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
bch2_btree_set_root_inmem(c, b);
}
-static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id)
+static int __bch2_btree_root_alloc_fake(struct btree_trans *trans, enum btree_id id, unsigned level)
{
struct bch_fs *c = trans->c;
struct closure cl;
@@ -2358,7 +2530,7 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id)
set_btree_node_fake(b);
set_btree_node_need_rewrite(b);
- b->c.level = 0;
+ b->c.level = level;
b->c.btree_id = id;
bkey_btree_ptr_init(&b->key);
@@ -2385,9 +2557,23 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id)
return 0;
}
-void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
+void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level)
{
- bch2_trans_run(c, __bch2_btree_root_alloc(trans, id));
+ bch2_trans_run(c, __bch2_btree_root_alloc_fake(trans, id, level));
+}
+
+static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as)
+{
+ prt_printf(out, "%ps: btree=%s l=%u-%u watermark=%s mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
+ (void *) as->ip_started,
+ bch2_btree_id_str(as->btree_id),
+ as->update_level_start,
+ as->update_level_end,
+ bch2_watermarks[as->watermark],
+ bch2_btree_update_modes[as->mode],
+ as->nodes_written,
+ closure_nr_remaining(&as->cl),
+ as->journal.seq);
}
void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
@@ -2396,12 +2582,7 @@ void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
mutex_lock(&c->btree_interior_update_lock);
list_for_each_entry(as, &c->btree_interior_update_list, list)
- prt_printf(out, "%p m %u w %u r %u j %llu\n",
- as,
- as->mode,
- as->nodes_written,
- closure_nr_remaining(&as->cl),
- as->journal.seq);
+ bch2_btree_update_to_text(out, as);
mutex_unlock(&c->btree_interior_update_lock);
}
@@ -2465,6 +2646,8 @@ bch2_btree_roots_to_journal_entries(struct bch_fs *c,
void bch2_fs_btree_interior_update_exit(struct bch_fs *c)
{
+ if (c->btree_node_rewrite_worker)
+ destroy_workqueue(c->btree_node_rewrite_worker);
if (c->btree_interior_update_worker)
destroy_workqueue(c->btree_interior_update_worker);
mempool_exit(&c->btree_interior_update_pool);
@@ -2485,10 +2668,15 @@ void bch2_fs_btree_interior_update_init_early(struct bch_fs *c)
int bch2_fs_btree_interior_update_init(struct bch_fs *c)
{
c->btree_interior_update_worker =
- alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
+ alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8);
if (!c->btree_interior_update_worker)
return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
+ c->btree_node_rewrite_worker =
+ alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND);
+ if (!c->btree_node_rewrite_worker)
+ return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
+
if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
sizeof(struct btree_update)))
return -BCH_ERR_ENOMEM_btree_interior_update_pool_init;
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index c593c925d1..c1a479ebaa 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -10,6 +10,20 @@
#define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1))
+int bch2_btree_node_check_topology(struct btree_trans *, struct btree *);
+
+#define BTREE_UPDATE_MODES() \
+ x(none) \
+ x(node) \
+ x(root) \
+ x(update)
+
+enum btree_update_mode {
+#define x(n) BTREE_UPDATE_##n,
+ BTREE_UPDATE_MODES()
+#undef x
+};
+
/*
* Tracks an in progress split/rewrite of a btree node and the update to the
* parent node:
@@ -32,28 +46,24 @@ struct btree_update {
struct closure cl;
struct bch_fs *c;
u64 start_time;
+ unsigned long ip_started;
struct list_head list;
struct list_head unwritten_list;
- /* What kind of update are we doing? */
- enum {
- BTREE_INTERIOR_NO_UPDATE,
- BTREE_INTERIOR_UPDATING_NODE,
- BTREE_INTERIOR_UPDATING_ROOT,
- BTREE_INTERIOR_UPDATING_AS,
- } mode;
-
+ enum btree_update_mode mode;
+ enum bch_watermark watermark;
unsigned nodes_written:1;
unsigned took_gc_lock:1;
enum btree_id btree_id;
- unsigned update_level;
+ unsigned update_level_start;
+ unsigned update_level_end;
struct disk_reservation disk_res;
/*
- * BTREE_INTERIOR_UPDATING_NODE:
+ * BTREE_UPDATE_node:
* The update that made the new nodes visible was a regular update to an
* existing interior node - @b. We can't write out the update to @b
* until the new nodes we created are finished writing, so we block @b
@@ -119,6 +129,8 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
int bch2_btree_split_leaf(struct btree_trans *, btree_path_idx_t, unsigned);
+int bch2_btree_increase_depth(struct btree_trans *, btree_path_idx_t, unsigned);
+
int __bch2_foreground_maybe_merge(struct btree_trans *, btree_path_idx_t,
unsigned, unsigned, enum btree_node_sibling);
@@ -160,7 +172,7 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *,
struct bkey_i *, unsigned, bool);
void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);
-void bch2_btree_root_alloc(struct bch_fs *, enum btree_id);
+void bch2_btree_root_alloc_fake(struct bch_fs *, enum btree_id, unsigned);
static inline unsigned btree_update_reserve_required(struct bch_fs *c,
struct btree *b)
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
index ac78448619..36a6f42aba 100644
--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -11,6 +11,7 @@
#include "journal_reclaim.h"
#include <linux/prefetch.h>
+#include <linux/sort.h>
static int bch2_btree_write_buffer_journal_flush(struct journal *,
struct journal_entry_pin *, u64);
@@ -46,6 +47,14 @@ static inline bool wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_ke
#endif
}
+static int wb_key_seq_cmp(const void *_l, const void *_r)
+{
+ const struct btree_write_buffered_key *l = _l;
+ const struct btree_write_buffered_key *r = _r;
+
+ return cmp_int(l->journal_seq, r->journal_seq);
+}
+
/* Compare excluding idx, the low 24 bits: */
static inline bool wb_key_eq(const void *_l, const void *_r)
{
@@ -307,6 +316,16 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
bpos_gt(k->k.k.p, path->l[0].b->key.k.p)) {
bch2_btree_node_unlock_write(trans, path, path->l[0].b);
write_locked = false;
+
+ ret = lockrestart_do(trans,
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_foreground_maybe_merge(trans, iter.path, 0,
+ BCH_WATERMARK_reclaim|
+ BCH_TRANS_COMMIT_journal_reclaim|
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc));
+ if (ret)
+ goto err;
}
}
@@ -357,6 +376,11 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
*/
trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr);
+ sort(wb->flushing.keys.data,
+ wb->flushing.keys.nr,
+ sizeof(wb->flushing.keys.data[0]),
+ wb_key_seq_cmp, NULL);
+
darray_for_each(wb->flushing.keys, i) {
if (!i->journal_seq)
continue;
@@ -368,17 +392,17 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
ret = commit_do(trans, NULL, NULL,
BCH_WATERMARK_reclaim|
+ BCH_TRANS_COMMIT_journal_reclaim|
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_no_enospc|
- BCH_TRANS_COMMIT_no_journal_res|
- BCH_TRANS_COMMIT_journal_reclaim,
+ BCH_TRANS_COMMIT_no_journal_res ,
btree_write_buffered_insert(trans, i));
if (ret)
goto err;
}
}
err:
- bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
+ bch2_fs_fatal_err_on(ret, c, "%s", bch2_err_str(ret));
trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0);
bch2_journal_pin_drop(j, &wb->flushing.pin);
wb->flushing.keys.nr = 0;
@@ -574,8 +598,6 @@ void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys
static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf)
{
struct journal_keys_to_wb dst;
- struct jset_entry *entry;
- struct bkey_i *k;
int ret = 0;
bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq));
@@ -590,7 +612,9 @@ static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_bu
entry->type = BCH_JSET_ENTRY_btree_keys;
}
+ spin_lock(&c->journal.lock);
buf->need_flush_to_write_buffer = false;
+ spin_unlock(&c->journal.lock);
out:
bch2_journal_keys_to_write_buffer_end(c, &dst);
return ret;
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 54f7826ac4..82f1792588 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -815,14 +815,14 @@ static int __mark_pointer(struct btree_trans *trans,
static int bch2_trigger_pointer(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c k, struct extent_ptr_decoded p,
- s64 *sectors,
- unsigned flags)
+ const union bch_extent_entry *entry,
+ s64 *sectors, unsigned flags)
{
bool insert = !(flags & BTREE_TRIGGER_OVERWRITE);
struct bpos bucket;
struct bch_backpointer bp;
- bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket, &bp);
+ bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, entry, &bucket, &bp);
*sectors = insert ? bp.bucket_len : -((s64) bp.bucket_len);
if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
@@ -851,7 +851,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
if (flags & BTREE_TRIGGER_GC) {
struct bch_fs *c = trans->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
- enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p);
+ enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry);
percpu_down_read(&c->mark_lock);
struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
@@ -979,7 +979,7 @@ static int __trigger_extent(struct btree_trans *trans,
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
s64 disk_sectors;
- ret = bch2_trigger_pointer(trans, btree_id, level, k, p, &disk_sectors, flags);
+ ret = bch2_trigger_pointer(trans, btree_id, level, k, p, entry, &disk_sectors, flags);
if (ret < 0)
return ret;
@@ -990,8 +990,8 @@ static int __trigger_extent(struct btree_trans *trans,
ret = !gc
? bch2_update_cached_sectors_list(trans, p.ptr.dev, disk_sectors)
: update_cached_sectors(c, k, p.ptr.dev, disk_sectors, 0, true);
- bch2_fs_fatal_err_on(ret && gc, c, "%s(): no replicas entry while updating cached sectors",
- __func__);
+ bch2_fs_fatal_err_on(ret && gc, c, "%s: no replicas entry while updating cached sectors",
+ bch2_err_str(ret));
if (ret)
return ret;
}
@@ -1020,7 +1020,7 @@ static int __trigger_extent(struct btree_trans *trans,
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, k);
- bch2_fs_fatal_error(c, "%s(): no replicas entry for %s", __func__, buf.buf);
+ bch2_fs_fatal_error(c, ": no replicas entry for %s", buf.buf);
printbuf_exit(&buf);
}
if (ret)
@@ -1053,7 +1053,8 @@ int bch2_trigger_extent(struct btree_trans *trans,
(int) bch2_bkey_needs_rebalance(c, old);
if (mod) {
- int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new.k->p, mod > 0);
+ int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
+ new.k->p, mod > 0);
if (ret)
return ret;
}
@@ -1335,7 +1336,7 @@ static void bucket_gens_free_rcu(struct rcu_head *rcu)
struct bucket_gens *buckets =
container_of(rcu, struct bucket_gens, rcu);
- kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets);
+ kvfree(buckets);
}
int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
@@ -1345,16 +1346,16 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
bool resize = ca->bucket_gens != NULL;
int ret;
- if (!(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets,
- GFP_KERNEL|__GFP_ZERO))) {
+ if (!(bucket_gens = kvmalloc(sizeof(struct bucket_gens) + nbuckets,
+ GFP_KERNEL|__GFP_ZERO))) {
ret = -BCH_ERR_ENOMEM_bucket_gens;
goto err;
}
if ((c->opts.buckets_nouse &&
- !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) *
- sizeof(unsigned long),
- GFP_KERNEL|__GFP_ZERO)))) {
+ !(buckets_nouse = kvmalloc(BITS_TO_LONGS(nbuckets) *
+ sizeof(unsigned long),
+ GFP_KERNEL|__GFP_ZERO)))) {
ret = -BCH_ERR_ENOMEM_buckets_nouse;
goto err;
}
@@ -1397,8 +1398,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
ret = 0;
err:
- kvpfree(buckets_nouse,
- BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
+ kvfree(buckets_nouse);
if (bucket_gens)
call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu);
@@ -1407,27 +1407,21 @@ err:
void bch2_dev_buckets_free(struct bch_dev *ca)
{
- unsigned i;
-
- kvpfree(ca->buckets_nouse,
- BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
- kvpfree(rcu_dereference_protected(ca->bucket_gens, 1),
- sizeof(struct bucket_gens) + ca->mi.nbuckets);
+ kvfree(ca->buckets_nouse);
+ kvfree(rcu_dereference_protected(ca->bucket_gens, 1));
- for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
+ for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++)
free_percpu(ca->usage[i]);
kfree(ca->usage_base);
}
int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
{
- unsigned i;
-
ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL);
if (!ca->usage_base)
return -BCH_ERR_ENOMEM_usage_init;
- for (i = 0; i < ARRAY_SIZE(ca->usage); i++) {
+ for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++) {
ca->usage[i] = alloc_percpu(struct bch_dev_usage);
if (!ca->usage[i])
return -BCH_ERR_ENOMEM_usage_init;
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 6387e039f7..f9af5adabe 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -226,6 +226,7 @@ static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_waterma
fallthrough;
case BCH_WATERMARK_btree_copygc:
case BCH_WATERMARK_reclaim:
+ case BCH_WATERMARK_interior_updates:
break;
}
@@ -394,14 +395,6 @@ static inline const char *bch2_data_type_str(enum bch_data_type type)
: "(invalid data type)";
}
-static inline void bch2_prt_data_type(struct printbuf *out, enum bch_data_type type)
-{
- if (type < BCH_DATA_NR)
- prt_str(out, __bch2_data_types[type]);
- else
- prt_printf(out, "(invalid data type %u)", type);
-}
-
/* disk reservations: */
static inline void bch2_disk_reservation_put(struct bch_fs *c,
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index 226b39c176..4d14f19f51 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -7,7 +7,7 @@
#include "chardev.h"
#include "journal.h"
#include "move.h"
-#include "recovery.h"
+#include "recovery_passes.h"
#include "replicas.h"
#include "super.h"
#include "super-io.h"
@@ -22,12 +22,6 @@
#include <linux/slab.h>
#include <linux/uaccess.h>
-__must_check
-static int copy_to_user_errcode(void __user *to, const void *from, unsigned long n)
-{
- return copy_to_user(to, from, n) ? -EFAULT : 0;
-}
-
/* returns with ref on ca->ref */
static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
unsigned flags)
@@ -140,39 +134,51 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg
struct fsck_thread {
struct thread_with_stdio thr;
struct bch_fs *c;
- char **devs;
- size_t nr_devs;
struct bch_opts opts;
};
static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr)
{
struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr);
- if (thr->devs)
- for (size_t i = 0; i < thr->nr_devs; i++)
- kfree(thr->devs[i]);
- kfree(thr->devs);
kfree(thr);
}
-static int bch2_fsck_offline_thread_fn(void *arg)
+static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio)
{
- struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr);
- struct bch_fs *c = bch2_fs_open(thr->devs, thr->nr_devs, thr->opts);
+ struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
+ struct bch_fs *c = thr->c;
- thr->thr.thr.ret = PTR_ERR_OR_ZERO(c);
- if (!thr->thr.thr.ret)
- bch2_fs_stop(c);
+ int ret = PTR_ERR_OR_ZERO(c);
+ if (ret)
+ return ret;
- thread_with_stdio_done(&thr->thr);
- return 0;
+ ret = bch2_fs_start(thr->c);
+ if (ret)
+ goto err;
+
+ if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
+ bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name);
+ ret |= 1;
+ }
+ if (test_bit(BCH_FS_error, &c->flags)) {
+ bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name);
+ ret |= 4;
+ }
+err:
+ bch2_fs_stop(c);
+ return ret;
}
+static const struct thread_with_stdio_ops bch2_offline_fsck_ops = {
+ .exit = bch2_fsck_thread_exit,
+ .fn = bch2_fsck_offline_thread_fn,
+};
+
static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
{
struct bch_ioctl_fsck_offline arg;
struct fsck_thread *thr = NULL;
- u64 *devs = NULL;
+ darray_str(devs) = {};
long ret = 0;
if (copy_from_user(&arg, user_arg, sizeof(arg)))
@@ -184,29 +190,32 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!(devs = kcalloc(arg.nr_devs, sizeof(*devs), GFP_KERNEL)) ||
- !(thr = kzalloc(sizeof(*thr), GFP_KERNEL)) ||
- !(thr->devs = kcalloc(arg.nr_devs, sizeof(*thr->devs), GFP_KERNEL))) {
- ret = -ENOMEM;
- goto err;
- }
+ for (size_t i = 0; i < arg.nr_devs; i++) {
+ u64 dev_u64;
+ ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64));
+ if (ret)
+ goto err;
- thr->opts = bch2_opts_empty();
- thr->nr_devs = arg.nr_devs;
+ char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX);
+ ret = PTR_ERR_OR_ZERO(dev_str);
+ if (ret)
+ goto err;
- if (copy_from_user(devs, &user_arg->devs[0],
- array_size(sizeof(user_arg->devs[0]), arg.nr_devs))) {
- ret = -EINVAL;
- goto err;
+ ret = darray_push(&devs, dev_str);
+ if (ret) {
+ kfree(dev_str);
+ goto err;
+ }
}
- for (size_t i = 0; i < arg.nr_devs; i++) {
- thr->devs[i] = strndup_user((char __user *)(unsigned long) devs[i], PATH_MAX);
- ret = PTR_ERR_OR_ZERO(thr->devs[i]);
- if (ret)
- goto err;
+ thr = kzalloc(sizeof(*thr), GFP_KERNEL);
+ if (!thr) {
+ ret = -ENOMEM;
+ goto err;
}
+ thr->opts = bch2_opts_empty();
+
if (arg.opts) {
char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
@@ -220,17 +229,28 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a
opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
- ret = bch2_run_thread_with_stdio(&thr->thr,
- bch2_fsck_thread_exit,
- bch2_fsck_offline_thread_fn);
-err:
- if (ret < 0) {
- if (thr)
- bch2_fsck_thread_exit(&thr->thr);
- pr_err("ret %s", bch2_err_str(ret));
- }
- kfree(devs);
+ /* We need request_key() to be called before we punt to kthread: */
+ opt_set(thr->opts, nostart, true);
+
+ bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops);
+
+ thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts);
+
+ if (!IS_ERR(thr->c) &&
+ thr->c->opts.errors == BCH_ON_ERROR_panic)
+ thr->c->opts.errors = BCH_ON_ERROR_ro;
+
+ ret = __bch2_run_thread_with_stdio(&thr->thr);
+out:
+ darray_for_each(devs, i)
+ kfree(*i);
+ darray_exit(&devs);
return ret;
+err:
+ if (thr)
+ bch2_fsck_thread_exit(&thr->thr);
+ pr_err("ret %s", bch2_err_str(ret));
+ goto out;
}
static long bch2_global_ioctl(unsigned cmd, void __user *arg)
@@ -763,9 +783,9 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
return ret;
}
-static int bch2_fsck_online_thread_fn(void *arg)
+static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio)
{
- struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr);
+ struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
struct bch_fs *c = thr->c;
c->stdio_filter = current;
@@ -793,13 +813,16 @@ static int bch2_fsck_online_thread_fn(void *arg)
c->stdio_filter = NULL;
c->opts.fix_errors = old_fix_errors;
- thread_with_stdio_done(&thr->thr);
-
up(&c->online_fsck_mutex);
bch2_ro_ref_put(c);
- return 0;
+ return ret;
}
+static const struct thread_with_stdio_ops bch2_online_fsck_ops = {
+ .exit = bch2_fsck_thread_exit,
+ .fn = bch2_fsck_online_thread_fn,
+};
+
static long bch2_ioctl_fsck_online(struct bch_fs *c,
struct bch_ioctl_fsck_online arg)
{
@@ -840,9 +863,7 @@ static long bch2_ioctl_fsck_online(struct bch_fs *c,
goto err;
}
- ret = bch2_run_thread_with_stdio(&thr->thr,
- bch2_fsck_thread_exit,
- bch2_fsck_online_thread_fn);
+ ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops);
err:
if (ret < 0) {
bch_err_fn(c, ret);
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
index 3c761ad6b1..088fd2e7bd 100644
--- a/fs/bcachefs/checksum.c
+++ b/fs/bcachefs/checksum.c
@@ -102,6 +102,7 @@ static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm,
int ret;
skcipher_request_set_sync_tfm(req, tfm);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
ret = crypto_skcipher_encrypt(req);
@@ -429,15 +430,20 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
extent_nonce(version, crc_old), bio);
if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) {
- bch_err(c, "checksum error in %s() (memory corruption or bug?)\n"
- "expected %0llx:%0llx got %0llx:%0llx (old type %s new type %s)",
- __func__,
- crc_old.csum.hi,
- crc_old.csum.lo,
- merged.hi,
- merged.lo,
- bch2_csum_types[crc_old.csum_type],
- bch2_csum_types[new_csum_type]);
+ struct printbuf buf = PRINTBUF;
+ prt_printf(&buf, "checksum error in %s() (memory corruption or bug?)\n"
+ "expected %0llx:%0llx got %0llx:%0llx (old type ",
+ __func__,
+ crc_old.csum.hi,
+ crc_old.csum.lo,
+ merged.hi,
+ merged.lo);
+ bch2_prt_csum_type(&buf, crc_old.csum_type);
+ prt_str(&buf, " new type ");
+ bch2_prt_csum_type(&buf, new_csum_type);
+ prt_str(&buf, ")");
+ bch_err(c, "%s", buf.buf);
+ printbuf_exit(&buf);
return -EIO;
}
@@ -558,7 +564,7 @@ got_key:
return 0;
}
-#include "../crypto.h"
+#include "crypto.h"
#endif
int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
index 1b8c2c1016..e40499fde9 100644
--- a/fs/bcachefs/checksum.h
+++ b/fs/bcachefs/checksum.h
@@ -61,11 +61,12 @@ static inline void bch2_csum_err_msg(struct printbuf *out,
struct bch_csum expected,
struct bch_csum got)
{
- prt_printf(out, "checksum error: got ");
+ prt_str(out, "checksum error, type ");
+ bch2_prt_csum_type(out, type);
+ prt_str(out, ": got ");
bch2_csum_to_text(out, type, got);
prt_str(out, " should be ");
bch2_csum_to_text(out, type, expected);
- prt_printf(out, " type %s", bch2_csum_types[type]);
}
int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
index 33df8cf86b..1410365a88 100644
--- a/fs/bcachefs/compress.c
+++ b/fs/bcachefs/compress.c
@@ -601,13 +601,13 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
return 0;
if (!mempool_initialized(&c->compression_bounce[READ]) &&
- mempool_init_kvpmalloc_pool(&c->compression_bounce[READ],
- 1, c->opts.encoded_extent_max))
+ mempool_init_kvmalloc_pool(&c->compression_bounce[READ],
+ 1, c->opts.encoded_extent_max))
return -BCH_ERR_ENOMEM_compression_bounce_read_init;
if (!mempool_initialized(&c->compression_bounce[WRITE]) &&
- mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE],
- 1, c->opts.encoded_extent_max))
+ mempool_init_kvmalloc_pool(&c->compression_bounce[WRITE],
+ 1, c->opts.encoded_extent_max))
return -BCH_ERR_ENOMEM_compression_bounce_write_init;
for (i = compression_types;
@@ -622,15 +622,15 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
if (mempool_initialized(&c->compress_workspace[i->type]))
continue;
- if (mempool_init_kvpmalloc_pool(
+ if (mempool_init_kvmalloc_pool(
&c->compress_workspace[i->type],
1, i->compress_workspace))
return -BCH_ERR_ENOMEM_compression_workspace_init;
}
if (!mempool_initialized(&c->decompress_workspace) &&
- mempool_init_kvpmalloc_pool(&c->decompress_workspace,
- 1, decompress_workspace_size))
+ mempool_init_kvmalloc_pool(&c->decompress_workspace,
+ 1, decompress_workspace_size))
return -BCH_ERR_ENOMEM_decompression_workspace_init;
return 0;
diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h
index 58c2eb4557..607fd5e232 100644
--- a/fs/bcachefs/compress.h
+++ b/fs/bcachefs/compress.h
@@ -47,14 +47,6 @@ static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v)
return __bch2_compression_opt_to_type[bch2_compression_decode(v).type];
}
-static inline void bch2_prt_compression_type(struct printbuf *out, enum bch_compression_type type)
-{
- if (type < BCH_COMPRESSION_TYPE_NR)
- prt_str(out, __bch2_compression_types[type]);
- else
- prt_printf(out, "(invalid compression type %u)", type);
-}
-
int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *,
struct bch_extent_crc_unpacked *);
int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 4150feca42..0022b51ce3 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -14,6 +14,7 @@
#include "move.h"
#include "nocow_locking.h"
#include "rebalance.h"
+#include "snapshot.h"
#include "subvolume.h"
#include "trace.h"
@@ -509,6 +510,14 @@ int bch2_data_update_init(struct btree_trans *trans,
unsigned ptrs_locked = 0;
int ret = 0;
+ /*
+ * fs is corrupt we have a key for a snapshot node that doesn't exist,
+ * and we have to check for this because we go rw before repairing the
+ * snapshots table - just skip it, we can move it later.
+ */
+ if (unlikely(k.k->p.snapshot && !bch2_snapshot_equiv(c, k.k->p.snapshot)))
+ return -BCH_ERR_data_update_done;
+
bch2_bkey_buf_init(&m->k);
bch2_bkey_buf_reassemble(&m->k, c, k);
m->btree_id = btree_id;
@@ -571,8 +580,7 @@ int bch2_data_update_init(struct btree_trans *trans,
move_ctxt_wait_event(ctxt,
(locked = bch2_bucket_nocow_trylock(&c->nocow_locks,
PTR_BUCKET_POS(c, &p.ptr), 0)) ||
- (!atomic_read(&ctxt->read_sectors) &&
- !atomic_read(&ctxt->write_sectors)));
+ list_empty(&ctxt->ios));
if (!locked)
bch2_bucket_nocow_lock(&c->nocow_locks,
@@ -590,6 +598,8 @@ int bch2_data_update_init(struct btree_trans *trans,
i++;
}
+ unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have));
+
/*
* If current extent durability is less than io_opts.data_replicas,
* we're not trying to rereplicate the extent up to data_replicas here -
@@ -599,7 +609,7 @@ int bch2_data_update_init(struct btree_trans *trans,
* rereplicate, currently, so that users don't get an unexpected -ENOSPC
*/
if (!(m->data_opts.write_flags & BCH_WRITE_CACHED) &&
- durability_have >= io_opts.data_replicas) {
+ !durability_required) {
m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs;
m->data_opts.rewrite_ptrs = 0;
/* if iter == NULL, it's just a promote */
@@ -608,11 +618,18 @@ int bch2_data_update_init(struct btree_trans *trans,
goto done;
}
- m->op.nr_replicas = min(durability_removing, io_opts.data_replicas - durability_have) +
+ m->op.nr_replicas = min(durability_removing, durability_required) +
m->data_opts.extra_replicas;
- m->op.nr_replicas_required = m->op.nr_replicas;
- BUG_ON(!m->op.nr_replicas);
+ /*
+ * If device(s) were set to durability=0 after data was written to them
+ * we can end up with a duribilty=0 extent, and the normal algorithm
+ * that tries not to increase durability doesn't work:
+ */
+ if (!(durability_have + durability_removing))
+ m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1);
+
+ m->op.nr_replicas_required = m->op.nr_replicas;
if (reserve_sectors) {
ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 7bdba8507f..cd99b73994 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -13,6 +13,7 @@
#include "btree_iter.h"
#include "btree_locking.h"
#include "btree_update.h"
+#include "btree_update_interior.h"
#include "buckets.h"
#include "debug.h"
#include "error.h"
@@ -137,7 +138,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
mutex_lock(&c->verify_lock);
if (!c->verify_ondisk) {
- c->verify_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL);
+ c->verify_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL);
if (!c->verify_ondisk)
goto out;
}
@@ -170,7 +171,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf.buf);
+ bch2_fs_fatal_error(c, ": btree node verify failed for: %s\n", buf.buf);
printbuf_exit(&buf);
}
out:
@@ -199,7 +200,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
return;
}
- n_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL);
+ n_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL);
if (!n_ondisk) {
prt_printf(out, "memory allocation failure\n");
goto out;
@@ -293,7 +294,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
out:
if (bio)
bio_put(bio);
- kvpfree(n_ondisk, btree_buf_bytes(b));
+ kvfree(n_ondisk);
percpu_ref_put(&ca->io_ref);
}
@@ -668,7 +669,7 @@ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
i->size = size;
i->ret = 0;
- do {
+ while (1) {
err = flush_buf(i);
if (err)
return err;
@@ -676,9 +677,12 @@ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
if (!i->size)
break;
+ if (done)
+ break;
+
done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter);
i->iter++;
- } while (!done);
+ }
if (i->buf.allocation_failure)
return -ENOMEM;
@@ -693,13 +697,45 @@ static const struct file_operations journal_pins_ops = {
.read = bch2_journal_pins_read,
};
+static ssize_t bch2_btree_updates_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ int err;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ if (!i->iter) {
+ bch2_btree_updates_to_text(&i->buf, c);
+ i->iter++;
+ }
+
+ err = flush_buf(i);
+ if (err)
+ return err;
+
+ if (i->buf.allocation_failure)
+ return -ENOMEM;
+
+ return i->ret;
+}
+
+static const struct file_operations btree_updates_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_btree_updates_read,
+};
+
static int btree_transaction_stats_open(struct inode *inode, struct file *file)
{
struct bch_fs *c = inode->i_private;
struct dump_iter *i;
i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
-
if (!i)
return -ENOMEM;
@@ -866,6 +902,20 @@ void bch2_fs_debug_exit(struct bch_fs *c)
debugfs_remove_recursive(c->fs_debug_dir);
}
+static void bch2_fs_debug_btree_init(struct bch_fs *c, struct btree_debug *bd)
+{
+ struct dentry *d;
+
+ d = debugfs_create_dir(bch2_btree_id_str(bd->id), c->btree_debug_dir);
+
+ debugfs_create_file("keys", 0400, d, bd, &btree_debug_ops);
+
+ debugfs_create_file("formats", 0400, d, bd, &btree_format_debug_ops);
+
+ debugfs_create_file("bfloat-failed", 0400, d, bd,
+ &bfloat_failed_debug_ops);
+}
+
void bch2_fs_debug_init(struct bch_fs *c)
{
struct btree_debug *bd;
@@ -888,6 +938,9 @@ void bch2_fs_debug_init(struct bch_fs *c)
debugfs_create_file("journal_pins", 0400, c->fs_debug_dir,
c->btree_debug, &journal_pins_ops);
+ debugfs_create_file("btree_updates", 0400, c->fs_debug_dir,
+ c->btree_debug, &btree_updates_ops);
+
debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir,
c, &btree_transaction_stats_op);
@@ -902,21 +955,7 @@ void bch2_fs_debug_init(struct bch_fs *c)
bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
bd++) {
bd->id = bd - c->btree_debug;
- debugfs_create_file(bch2_btree_id_str(bd->id),
- 0400, c->btree_debug_dir, bd,
- &btree_debug_ops);
-
- snprintf(name, sizeof(name), "%s-formats",
- bch2_btree_id_str(bd->id));
-
- debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
- &btree_format_debug_ops);
-
- snprintf(name, sizeof(name), "%s-bfloat-failed",
- bch2_btree_id_str(bd->id));
-
- debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
- &bfloat_failed_debug_ops);
+ bch2_fs_debug_btree_init(c, bd);
}
}
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index 4ae1e9f002..d37bd07afb 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -144,19 +144,21 @@ fsck_err:
return ret;
}
-void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
+void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
struct qstr d_name = bch2_dirent_get_name(d);
- prt_printf(out, "%.*s -> %llu type %s",
- d_name.len,
- d_name.name,
- d.v->d_type != DT_SUBVOL
- ? le64_to_cpu(d.v->d_inum)
- : le32_to_cpu(d.v->d_child_subvol),
- bch2_d_type_str(d.v->d_type));
+ prt_printf(out, "%.*s -> ", d_name.len, d_name.name);
+
+ if (d.v->d_type != DT_SUBVOL)
+ prt_printf(out, "%llu", le64_to_cpu(d.v->d_inum));
+ else
+ prt_printf(out, "%u -> %u",
+ le32_to_cpu(d.v->d_parent_subvol),
+ le32_to_cpu(d.v->d_child_subvol));
+
+ prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type));
}
static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
@@ -199,17 +201,17 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
}
int bch2_dirent_create_snapshot(struct btree_trans *trans,
- u64 dir, u32 snapshot,
+ u32 dir_subvol, u64 dir, u32 snapshot,
const struct bch_hash_info *hash_info,
u8 type, const struct qstr *name, u64 dst_inum,
u64 *dir_offset,
bch_str_hash_flags_t str_hash_flags)
{
- subvol_inum zero_inum = { 0 };
+ subvol_inum dir_inum = { .subvol = dir_subvol, .inum = dir };
struct bkey_i_dirent *dirent;
int ret;
- dirent = dirent_create_key(trans, zero_inum, type, name, dst_inum);
+ dirent = dirent_create_key(trans, dir_inum, type, name, dst_inum);
ret = PTR_ERR_OR_ZERO(dirent);
if (ret)
return ret;
@@ -217,10 +219,10 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans,
dirent->k.p.inode = dir;
dirent->k.p.snapshot = snapshot;
- ret = bch2_hash_set_snapshot(trans, bch2_dirent_hash_desc, hash_info,
- zero_inum, snapshot,
- &dirent->k_i, str_hash_flags,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info,
+ dir_inum, snapshot,
+ &dirent->k_i, str_hash_flags,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
*dir_offset = dirent->k.p.offset;
return ret;
@@ -291,12 +293,10 @@ int bch2_dirent_rename(struct btree_trans *trans,
struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
struct bpos dst_pos =
POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name));
- unsigned src_type = 0, dst_type = 0, src_update_flags = 0;
+ unsigned src_update_flags = 0;
+ bool delete_src, delete_dst;
int ret = 0;
- if (src_dir.subvol != dst_dir.subvol)
- return -EXDEV;
-
memset(src_inum, 0, sizeof(*src_inum));
memset(dst_inum, 0, sizeof(*dst_inum));
@@ -317,12 +317,6 @@ int bch2_dirent_rename(struct btree_trans *trans,
if (ret)
goto out;
- src_type = bkey_s_c_to_dirent(old_src).v->d_type;
-
- if (src_type == DT_SUBVOL && mode == BCH_RENAME_EXCHANGE)
- return -EOPNOTSUPP;
-
-
/* Lookup dst: */
if (mode == BCH_RENAME) {
/*
@@ -350,11 +344,6 @@ int bch2_dirent_rename(struct btree_trans *trans,
bkey_s_c_to_dirent(old_dst), dst_inum);
if (ret)
goto out;
-
- dst_type = bkey_s_c_to_dirent(old_dst).v->d_type;
-
- if (dst_type == DT_SUBVOL)
- return -EOPNOTSUPP;
}
if (mode != BCH_RENAME_EXCHANGE)
@@ -424,28 +413,55 @@ int bch2_dirent_rename(struct btree_trans *trans,
}
}
+ if (new_dst->v.d_type == DT_SUBVOL)
+ new_dst->v.d_parent_subvol = cpu_to_le32(dst_dir.subvol);
+
+ if ((mode == BCH_RENAME_EXCHANGE) &&
+ new_src->v.d_type == DT_SUBVOL)
+ new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol);
+
ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0);
if (ret)
goto out;
out_set_src:
-
/*
- * If we're deleting a subvolume, we need to really delete the dirent,
- * not just emit a whiteout in the current snapshot:
+ * If we're deleting a subvolume we need to really delete the dirent,
+ * not just emit a whiteout in the current snapshot - there can only be
+ * single dirent that points to a given subvolume.
+ *
+ * IOW, we don't maintain multiple versions in different snapshots of
+ * dirents that point to subvolumes - dirents that point to subvolumes
+ * are only visible in one particular subvolume so it's not necessary,
+ * and it would be particularly confusing for fsck to have to deal with.
*/
- if (src_type == DT_SUBVOL) {
- bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot);
- ret = bch2_btree_iter_traverse(&src_iter);
+ delete_src = bkey_s_c_to_dirent(old_src).v->d_type == DT_SUBVOL &&
+ new_src->k.p.snapshot != old_src.k->p.snapshot;
+
+ delete_dst = old_dst.k &&
+ bkey_s_c_to_dirent(old_dst).v->d_type == DT_SUBVOL &&
+ new_dst->k.p.snapshot != old_dst.k->p.snapshot;
+
+ if (!delete_src || !bkey_deleted(&new_src->k)) {
+ ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags);
if (ret)
goto out;
+ }
- new_src->k.p = src_iter.pos;
- src_update_flags |= BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE;
+ if (delete_src) {
+ bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot);
+ ret = bch2_btree_iter_traverse(&src_iter) ?:
+ bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ if (ret)
+ goto out;
}
- ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags);
- if (ret)
- goto out;
+ if (delete_dst) {
+ bch2_btree_iter_set_snapshot(&dst_iter, old_dst.k->p.snapshot);
+ ret = bch2_btree_iter_traverse(&dst_iter) ?:
+ bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ if (ret)
+ goto out;
+ }
if (mode == BCH_RENAME_EXCHANGE)
*src_offset = new_src->k.p.offset;
@@ -456,41 +472,29 @@ out:
return ret;
}
-int __bch2_dirent_lookup_trans(struct btree_trans *trans,
- struct btree_iter *iter,
- subvol_inum dir,
- const struct bch_hash_info *hash_info,
- const struct qstr *name, subvol_inum *inum,
- unsigned flags)
+int bch2_dirent_lookup_trans(struct btree_trans *trans,
+ struct btree_iter *iter,
+ subvol_inum dir,
+ const struct bch_hash_info *hash_info,
+ const struct qstr *name, subvol_inum *inum,
+ unsigned flags)
{
- struct bkey_s_c k;
- struct bkey_s_c_dirent d;
- u32 snapshot;
- int ret;
-
- ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
- if (ret)
- return ret;
-
- ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
- hash_info, dir, name, flags);
+ int ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
+ hash_info, dir, name, flags);
if (ret)
return ret;
- k = bch2_btree_iter_peek_slot(iter);
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
if (ret)
goto err;
- d = bkey_s_c_to_dirent(k);
-
- ret = bch2_dirent_read_target(trans, dir, d, inum);
+ ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), inum);
if (ret > 0)
ret = -ENOENT;
err:
if (ret)
bch2_trans_iter_exit(trans, iter);
-
return ret;
}
@@ -502,13 +506,13 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,
struct btree_iter iter = { NULL };
int ret = lockrestart_do(trans,
- __bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0));
+ bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0));
bch2_trans_iter_exit(trans, &iter);
bch2_trans_put(trans);
return ret;
}
-int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 snapshot)
+int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32 snapshot)
{
struct btree_iter iter;
struct bkey_s_c k;
@@ -518,7 +522,10 @@ int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 snapshot)
SPOS(dir, 0, snapshot),
POS(dir, U64_MAX), 0, k, ret)
if (k.k->type == KEY_TYPE_dirent) {
- ret = -ENOTEMPTY;
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+ if (d.v->d_type == DT_SUBVOL && le32_to_cpu(d.v->d_parent_subvol) != subvol)
+ continue;
+ ret = -BCH_ERR_ENOTEMPTY_dir_not_empty;
break;
}
bch2_trans_iter_exit(trans, &iter);
@@ -531,7 +538,7 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir)
u32 snapshot;
return bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot) ?:
- bch2_empty_dir_snapshot(trans, dir.inum, snapshot);
+ bch2_empty_dir_snapshot(trans, dir.inum, dir.subvol, snapshot);
}
int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
index 21ffeb78f0..bee55cca2a 100644
--- a/fs/bcachefs/dirent.h
+++ b/fs/bcachefs/dirent.h
@@ -35,7 +35,7 @@ static inline unsigned dirent_val_u64s(unsigned len)
int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
struct bkey_s_c_dirent, subvol_inum *);
-int bch2_dirent_create_snapshot(struct btree_trans *, u64, u32,
+int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32,
const struct bch_hash_info *, u8,
const struct qstr *, u64, u64 *,
bch_str_hash_flags_t);
@@ -62,14 +62,14 @@ int bch2_dirent_rename(struct btree_trans *,
const struct qstr *, subvol_inum *, u64 *,
enum bch_rename_mode);
-int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *,
+int bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *,
subvol_inum, const struct bch_hash_info *,
const struct qstr *, subvol_inum *, unsigned);
u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum,
const struct bch_hash_info *,
const struct qstr *, subvol_inum *);
-int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32);
+int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32);
int bch2_empty_dir_trans(struct btree_trans *, subvol_inum);
int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *);
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index d503af2700..556a217108 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -131,29 +131,33 @@ fsck_err:
void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
- const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
- unsigned i, nr_data = s->nr_blocks - s->nr_redundant;
+ const struct bch_stripe *sp = bkey_s_c_to_stripe(k).v;
+ struct bch_stripe s = {};
+
+ memcpy(&s, sp, min(sizeof(s), bkey_val_bytes(k.k)));
+
+ unsigned nr_data = s.nr_blocks - s.nr_redundant;
+
+ prt_printf(out, "algo %u sectors %u blocks %u:%u csum ",
+ s.algorithm,
+ le16_to_cpu(s.sectors),
+ nr_data,
+ s.nr_redundant);
+ bch2_prt_csum_type(out, s.csum_type);
+ prt_printf(out, " gran %u", 1U << s.csum_granularity_bits);
+
+ for (unsigned i = 0; i < s.nr_blocks; i++) {
+ const struct bch_extent_ptr *ptr = sp->ptrs + i;
+
+ if ((void *) ptr >= bkey_val_end(k))
+ break;
+
+ bch2_extent_ptr_to_text(out, c, ptr);
- prt_printf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u",
- s->algorithm,
- le16_to_cpu(s->sectors),
- nr_data,
- s->nr_redundant,
- s->csum_type,
- 1U << s->csum_granularity_bits);
-
- for (i = 0; i < s->nr_blocks; i++) {
- const struct bch_extent_ptr *ptr = s->ptrs + i;
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- u32 offset;
- u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
-
- prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset);
- if (i < nr_data)
- prt_printf(out, "#%u", stripe_blockcount_get(s, i));
- prt_printf(out, " gen %u", ptr->gen);
- if (ptr_stale(ca, ptr))
- prt_printf(out, " stale");
+ if (s.csum_type < BCH_CSUM_NR &&
+ i < nr_data &&
+ stripe_blockcount_offset(&s, i) < bkey_val_bytes(k.k))
+ prt_printf(out, "#%u", stripe_blockcount_get(sp, i));
}
}
@@ -448,7 +452,7 @@ int bch2_trigger_stripe(struct btree_trans *trans,
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, new);
- bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf);
+ bch2_fs_fatal_error(c, ": no replicas entry for %s", buf.buf);
printbuf_exit(&buf);
return ret;
}
@@ -504,7 +508,7 @@ static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
unsigned i;
for (i = 0; i < s->v.nr_blocks; i++) {
- kvpfree(buf->data[i], buf->size << 9);
+ kvfree(buf->data[i]);
buf->data[i] = NULL;
}
}
@@ -531,7 +535,7 @@ static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
memset(buf->valid, 0xFF, sizeof(buf->valid));
for (i = 0; i < v->nr_blocks; i++) {
- buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL);
+ buf->data[i] = kvmalloc(buf->size << 9, GFP_KERNEL);
if (!buf->data[i])
goto err;
}
@@ -607,10 +611,8 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
struct printbuf err = PRINTBUF;
struct bch_dev *ca = bch_dev_bkey_exists(c, v->ptrs[i].dev);
- prt_printf(&err, "stripe checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)\n",
- want.hi, want.lo,
- got.hi, got.lo,
- bch2_csum_types[v->csum_type]);
+ prt_str(&err, "stripe ");
+ bch2_csum_err_msg(&err, v->csum_type, want, got);
prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i);
bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
bch_err_ratelimited(ca, "%s", err.buf);
@@ -1868,10 +1870,10 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri
return -BCH_ERR_stripe_alloc_blocked;
ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe);
+ bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c,
+ "reading stripe key: %s", bch2_err_str(ret));
if (ret) {
bch2_stripe_close(c, h->s);
- if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
- bch2_fs_fatal_error(c, "error reading stripe key: %s", bch2_err_str(ret));
return ret;
}
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
index f4369b02e8..f042616888 100644
--- a/fs/bcachefs/ec.h
+++ b/fs/bcachefs/ec.h
@@ -32,6 +32,8 @@ static inline unsigned stripe_csums_per_device(const struct bch_stripe *s)
static inline unsigned stripe_csum_offset(const struct bch_stripe *s,
unsigned dev, unsigned csum_idx)
{
+ EBUG_ON(s->csum_type >= BCH_CSUM_NR);
+
unsigned csum_bytes = bch_crc_bytes[s->csum_type];
return sizeof(struct bch_stripe) +
diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c
index d260ff9bbf..43557bebd0 100644
--- a/fs/bcachefs/errcode.c
+++ b/fs/bcachefs/errcode.c
@@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "errcode.h"
+#include "trace.h"
#include <linux/errname.h>
@@ -49,15 +50,17 @@ bool __bch2_err_matches(int err, int class)
return err == class;
}
-int __bch2_err_class(int err)
+int __bch2_err_class(int bch_err)
{
- err = -err;
- BUG_ON((unsigned) err >= BCH_ERR_MAX);
+ int std_err = -bch_err;
+ BUG_ON((unsigned) std_err >= BCH_ERR_MAX);
- while (err >= BCH_ERR_START && bch2_errcode_parents[err - BCH_ERR_START])
- err = bch2_errcode_parents[err - BCH_ERR_START];
+ while (std_err >= BCH_ERR_START && bch2_errcode_parents[std_err - BCH_ERR_START])
+ std_err = bch2_errcode_parents[std_err - BCH_ERR_START];
+
+ trace_error_downcast(bch_err, std_err, _RET_IP_);
- return -err;
+ return -std_err;
}
const char *bch2_blk_status_to_str(blk_status_t status)
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index 8c40c2067a..dbe35b80bc 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -5,6 +5,10 @@
#define BCH_ERRCODES() \
x(ERANGE, ERANGE_option_too_small) \
x(ERANGE, ERANGE_option_too_big) \
+ x(EINVAL, mount_option) \
+ x(BCH_ERR_mount_option, option_name) \
+ x(BCH_ERR_mount_option, option_value) \
+ x(BCH_ERR_mount_option, option_not_bool) \
x(ENOMEM, ENOMEM_stripe_buf) \
x(ENOMEM, ENOMEM_replicas_table) \
x(ENOMEM, ENOMEM_cpu_replicas) \
@@ -78,6 +82,7 @@
x(ENOMEM, ENOMEM_fs_name_alloc) \
x(ENOMEM, ENOMEM_fs_other_alloc) \
x(ENOMEM, ENOMEM_dev_alloc) \
+ x(ENOMEM, ENOMEM_disk_accounting) \
x(ENOSPC, ENOSPC_disk_reservation) \
x(ENOSPC, ENOSPC_bucket_alloc) \
x(ENOSPC, ENOSPC_disk_label_add) \
@@ -109,6 +114,8 @@
x(ENOENT, ENOENT_dirent_doesnt_match_inode) \
x(ENOENT, ENOENT_dev_not_found) \
x(ENOENT, ENOENT_dev_idx_not_found) \
+ x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \
+ x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \
x(0, open_buckets_empty) \
x(0, freelist_empty) \
x(BCH_ERR_freelist_empty, no_buckets_found) \
@@ -168,6 +175,7 @@
x(EINVAL, block_size_too_small) \
x(EINVAL, bucket_size_too_small) \
x(EINVAL, device_size_too_small) \
+ x(EINVAL, device_size_too_big) \
x(EINVAL, device_not_a_member_of_filesystem) \
x(EINVAL, device_has_been_removed) \
x(EINVAL, device_splitbrain) \
@@ -176,6 +184,9 @@
x(EINVAL, invalid) \
x(EINVAL, internal_fsck_err) \
x(EINVAL, opt_parse_error) \
+ x(EINVAL, remove_with_metadata_missing_unimplemented)\
+ x(EINVAL, remove_would_lose_data) \
+ x(EINVAL, btree_iter_with_journal_not_supported) \
x(EROFS, erofs_trans_commit) \
x(EROFS, erofs_no_writes) \
x(EROFS, erofs_journal_err) \
@@ -225,7 +236,10 @@
x(BCH_ERR_operation_blocked, nocow_lock_blocked) \
x(EIO, btree_node_read_err) \
x(EIO, sb_not_downgraded) \
- x(EIO, btree_write_all_failed) \
+ x(EIO, btree_node_write_all_failed) \
+ x(EIO, btree_node_read_error) \
+ x(EIO, btree_node_read_validate_error) \
+ x(EIO, btree_need_topology_repair) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \
@@ -238,7 +252,9 @@
x(BCH_ERR_nopromote, nopromote_congested) \
x(BCH_ERR_nopromote, nopromote_in_flight) \
x(BCH_ERR_nopromote, nopromote_no_writes) \
- x(BCH_ERR_nopromote, nopromote_enomem)
+ x(BCH_ERR_nopromote, nopromote_enomem) \
+ x(0, need_inode_lock) \
+ x(0, invalid_snapshot_node)
enum bch_errcode {
BCH_ERR_START = 2048,
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index d32c8bebe4..82a6656c94 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -1,6 +1,8 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "error.h"
+#include "journal.h"
+#include "recovery_passes.h"
#include "super.h"
#include "thread_with_file.h"
@@ -15,7 +17,8 @@ bool bch2_inconsistent_error(struct bch_fs *c)
return false;
case BCH_ON_ERROR_ro:
if (bch2_fs_emergency_read_only(c))
- bch_err(c, "inconsistency detected - emergency read only");
+ bch_err(c, "inconsistency detected - emergency read only at journal seq %llu",
+ journal_cur_seq(&c->journal));
return true;
case BCH_ON_ERROR_panic:
panic(bch2_fmt(c, "panic after error"));
@@ -25,11 +28,16 @@ bool bch2_inconsistent_error(struct bch_fs *c)
}
}
-void bch2_topology_error(struct bch_fs *c)
+int bch2_topology_error(struct bch_fs *c)
{
set_bit(BCH_FS_topology_error, &c->flags);
- if (!test_bit(BCH_FS_fsck_running, &c->flags))
+ if (!test_bit(BCH_FS_fsck_running, &c->flags)) {
bch2_inconsistent_error(c);
+ return -BCH_ERR_btree_need_topology_repair;
+ } else {
+ return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?:
+ -BCH_ERR_btree_node_read_validate_error;
+ }
}
void bch2_fatal_error(struct bch_fs *c)
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
index fec17d1353..36caedf72d 100644
--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
@@ -30,7 +30,13 @@ struct work_struct;
bool bch2_inconsistent_error(struct bch_fs *);
-void bch2_topology_error(struct bch_fs *);
+int bch2_topology_error(struct bch_fs *);
+
+#define bch2_fs_topology_error(c, ...) \
+({ \
+ bch_err(c, "btree topology error: " __VA_ARGS__); \
+ bch2_topology_error(c); \
+})
#define bch2_fs_inconsistent(c, ...) \
({ \
@@ -191,9 +197,9 @@ do { \
void bch2_fatal_error(struct bch_fs *);
-#define bch2_fs_fatal_error(c, ...) \
+#define bch2_fs_fatal_error(c, _msg, ...) \
do { \
- bch_err(c, __VA_ARGS__); \
+ bch_err(c, "%s(): fatal error " _msg, __func__, ##__VA_ARGS__); \
bch2_fatal_error(c); \
} while (0)
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 61395b113d..1a331e5392 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -189,13 +189,18 @@ int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
+ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
int ret = 0;
- bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, c, err,
- btree_ptr_v2_val_too_big,
+ bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX,
+ c, err, btree_ptr_v2_val_too_big,
"value too big (%zu > %zu)",
bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
+ bkey_fsck_err_on(bpos_ge(bp.v->min_key, bp.k->p),
+ c, err, btree_ptr_v2_min_key_bad,
+ "min_key > key");
+
ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
fsck_err:
return ret;
@@ -973,6 +978,33 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
return bkey_deleted(k.k);
}
+void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr)
+{
+ struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
+ ? bch_dev_bkey_exists(c, ptr->dev)
+ : NULL;
+
+ if (!ca) {
+ prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
+ (u64) ptr->offset, ptr->gen,
+ ptr->cached ? " cached" : "");
+ } else {
+ u32 offset;
+ u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
+
+ prt_printf(out, "ptr: %u:%llu:%u gen %u",
+ ptr->dev, b, offset, ptr->gen);
+ if (ptr->cached)
+ prt_str(out, " cached");
+ if (ptr->unwritten)
+ prt_str(out, " unwritten");
+ if (b >= ca->mi.first_bucket &&
+ b < ca->mi.nbuckets &&
+ ptr_stale(ca, ptr))
+ prt_printf(out, " stale");
+ }
+}
+
void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
@@ -988,42 +1020,22 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
prt_printf(out, " ");
switch (__extent_entry_type(entry)) {
- case BCH_EXTENT_ENTRY_ptr: {
- const struct bch_extent_ptr *ptr = entry_to_ptr(entry);
- struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
- ? bch_dev_bkey_exists(c, ptr->dev)
- : NULL;
-
- if (!ca) {
- prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
- (u64) ptr->offset, ptr->gen,
- ptr->cached ? " cached" : "");
- } else {
- u32 offset;
- u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
-
- prt_printf(out, "ptr: %u:%llu:%u gen %u",
- ptr->dev, b, offset, ptr->gen);
- if (ptr->cached)
- prt_str(out, " cached");
- if (ptr->unwritten)
- prt_str(out, " unwritten");
- if (ca && ptr_stale(ca, ptr))
- prt_printf(out, " stale");
- }
+ case BCH_EXTENT_ENTRY_ptr:
+ bch2_extent_ptr_to_text(out, c, entry_to_ptr(entry));
break;
- }
+
case BCH_EXTENT_ENTRY_crc32:
case BCH_EXTENT_ENTRY_crc64:
case BCH_EXTENT_ENTRY_crc128: {
struct bch_extent_crc_unpacked crc =
bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
- prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress ",
+ prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ",
crc.compressed_size,
crc.uncompressed_size,
- crc.offset, crc.nonce,
- bch2_csum_types[crc.csum_type]);
+ crc.offset, crc.nonce);
+ bch2_prt_csum_type(out, crc.csum_type);
+ prt_str(out, " compress ");
bch2_prt_compression_type(out, crc.compression_type);
break;
}
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index 6bf839d69e..528e817eac 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -43,6 +43,11 @@ enum bkey_invalid_flags;
#define extent_entry_next(_entry) \
((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
+#define extent_entry_next_safe(_entry, _end) \
+ (likely(__extent_entry_type(_entry) < BCH_EXTENT_ENTRY_MAX) \
+ ? extent_entry_next(_entry) \
+ : _end)
+
static inline unsigned
__extent_entry_type(const union bch_extent_entry *e)
{
@@ -103,17 +108,17 @@ static inline void extent_entry_drop(struct bkey_s k, union bch_extent_entry *en
static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
{
- return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
+ return __extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
}
static inline bool extent_entry_is_stripe_ptr(const union bch_extent_entry *e)
{
- return extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr;
+ return __extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr;
}
static inline bool extent_entry_is_crc(const union bch_extent_entry *e)
{
- switch (extent_entry_type(e)) {
+ switch (__extent_entry_type(e)) {
case BCH_EXTENT_ENTRY_crc32:
case BCH_EXTENT_ENTRY_crc64:
case BCH_EXTENT_ENTRY_crc128:
@@ -280,7 +285,7 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
#define __bkey_extent_entry_for_each_from(_start, _end, _entry) \
for ((_entry) = (_start); \
(_entry) < (_end); \
- (_entry) = extent_entry_next(_entry))
+ (_entry) = extent_entry_next_safe(_entry, _end))
#define __bkey_ptr_next(_ptr, _end) \
({ \
@@ -318,7 +323,7 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
(_ptr).has_ec = false; \
\
__bkey_extent_entry_for_each_from(_entry, _end, _entry) \
- switch (extent_entry_type(_entry)) { \
+ switch (__extent_entry_type(_entry)) { \
case BCH_EXTENT_ENTRY_ptr: \
(_ptr).ptr = _entry->ptr; \
goto out; \
@@ -344,7 +349,7 @@ out: \
for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \
(_entry) = _start; \
__bkey_ptr_next_decode(_k, _end, _ptr, _entry); \
- (_entry) = extent_entry_next(_entry))
+ (_entry) = extent_entry_next_safe(_entry, _end))
#define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \
__bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \
@@ -591,30 +596,6 @@ static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
return ret;
}
-static inline unsigned bch2_bkey_ptr_data_type(struct bkey_s_c k, const struct bch_extent_ptr *ptr)
-{
- switch (k.k->type) {
- case KEY_TYPE_btree_ptr:
- case KEY_TYPE_btree_ptr_v2:
- return BCH_DATA_btree;
- case KEY_TYPE_extent:
- case KEY_TYPE_reflink_v:
- return BCH_DATA_user;
- case KEY_TYPE_stripe: {
- struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
-
- BUG_ON(ptr < s.v->ptrs ||
- ptr >= s.v->ptrs + s.v->nr_blocks);
-
- return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant
- ? BCH_DATA_parity
- : BCH_DATA_user;
- }
- default:
- BUG();
- }
-}
-
unsigned bch2_bkey_nr_ptrs(struct bkey_s_c);
unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c);
@@ -695,6 +676,7 @@ bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s);
void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *);
bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
+void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *);
void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
int bch2_bkey_ptrs_invalid(struct bch_fs *, struct bkey_s_c,
diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c
new file mode 100644
index 0000000000..0f955c3c76
--- /dev/null
+++ b/fs/bcachefs/eytzinger.c
@@ -0,0 +1,234 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "eytzinger.h"
+
+/**
+ * is_aligned - is this pointer & size okay for word-wide copying?
+ * @base: pointer to data
+ * @size: size of each element
+ * @align: required alignment (typically 4 or 8)
+ *
+ * Returns true if elements can be copied using word loads and stores.
+ * The size must be a multiple of the alignment, and the base address must
+ * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
+ *
+ * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
+ * to "if ((a | b) & mask)", so we do that by hand.
+ */
+__attribute_const__ __always_inline
+static bool is_aligned(const void *base, size_t size, unsigned char align)
+{
+ unsigned char lsbits = (unsigned char)size;
+
+ (void)base;
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ lsbits |= (unsigned char)(uintptr_t)base;
+#endif
+ return (lsbits & (align - 1)) == 0;
+}
+
+/**
+ * swap_words_32 - swap two elements in 32-bit chunks
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size (must be a multiple of 4)
+ *
+ * Exchange the two objects in memory. This exploits base+index addressing,
+ * which basically all CPUs have, to minimize loop overhead computations.
+ *
+ * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
+ * bottom of the loop, even though the zero flag is still valid from the
+ * subtract (since the intervening mov instructions don't alter the flags).
+ * Gcc 8.1.0 doesn't have that problem.
+ */
+static void swap_words_32(void *a, void *b, size_t n)
+{
+ do {
+ u32 t = *(u32 *)(a + (n -= 4));
+ *(u32 *)(a + n) = *(u32 *)(b + n);
+ *(u32 *)(b + n) = t;
+ } while (n);
+}
+
+/**
+ * swap_words_64 - swap two elements in 64-bit chunks
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size (must be a multiple of 8)
+ *
+ * Exchange the two objects in memory. This exploits base+index
+ * addressing, which basically all CPUs have, to minimize loop overhead
+ * computations.
+ *
+ * We'd like to use 64-bit loads if possible. If they're not, emulating
+ * one requires base+index+4 addressing which x86 has but most other
+ * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads,
+ * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
+ * x32 ABI). Are there any cases the kernel needs to worry about?
+ */
+static void swap_words_64(void *a, void *b, size_t n)
+{
+ do {
+#ifdef CONFIG_64BIT
+ u64 t = *(u64 *)(a + (n -= 8));
+ *(u64 *)(a + n) = *(u64 *)(b + n);
+ *(u64 *)(b + n) = t;
+#else
+ /* Use two 32-bit transfers to avoid base+index+4 addressing */
+ u32 t = *(u32 *)(a + (n -= 4));
+ *(u32 *)(a + n) = *(u32 *)(b + n);
+ *(u32 *)(b + n) = t;
+
+ t = *(u32 *)(a + (n -= 4));
+ *(u32 *)(a + n) = *(u32 *)(b + n);
+ *(u32 *)(b + n) = t;
+#endif
+ } while (n);
+}
+
+/**
+ * swap_bytes - swap two elements a byte at a time
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size
+ *
+ * This is the fallback if alignment doesn't allow using larger chunks.
+ */
+static void swap_bytes(void *a, void *b, size_t n)
+{
+ do {
+ char t = ((char *)a)[--n];
+ ((char *)a)[n] = ((char *)b)[n];
+ ((char *)b)[n] = t;
+ } while (n);
+}
+
+/*
+ * The values are arbitrary as long as they can't be confused with
+ * a pointer, but small integers make for the smallest compare
+ * instructions.
+ */
+#define SWAP_WORDS_64 (swap_r_func_t)0
+#define SWAP_WORDS_32 (swap_r_func_t)1
+#define SWAP_BYTES (swap_r_func_t)2
+#define SWAP_WRAPPER (swap_r_func_t)3
+
+struct wrapper {
+ cmp_func_t cmp;
+ swap_func_t swap_func;
+};
+
+/*
+ * The function pointer is last to make tail calls most efficient if the
+ * compiler decides not to inline this function.
+ */
+static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv)
+{
+ if (swap_func == SWAP_WRAPPER) {
+ ((const struct wrapper *)priv)->swap_func(a, b, (int)size);
+ return;
+ }
+
+ if (swap_func == SWAP_WORDS_64)
+ swap_words_64(a, b, size);
+ else if (swap_func == SWAP_WORDS_32)
+ swap_words_32(a, b, size);
+ else if (swap_func == SWAP_BYTES)
+ swap_bytes(a, b, size);
+ else
+ swap_func(a, b, (int)size, priv);
+}
+
+#define _CMP_WRAPPER ((cmp_r_func_t)0L)
+
+static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv)
+{
+ if (cmp == _CMP_WRAPPER)
+ return ((const struct wrapper *)priv)->cmp(a, b);
+ return cmp(a, b, priv);
+}
+
+static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size,
+ cmp_r_func_t cmp_func, const void *priv,
+ size_t l, size_t r)
+{
+ return do_cmp(base + inorder_to_eytzinger0(l, n) * size,
+ base + inorder_to_eytzinger0(r, n) * size,
+ cmp_func, priv);
+}
+
+static inline void eytzinger0_do_swap(void *base, size_t n, size_t size,
+ swap_r_func_t swap_func, const void *priv,
+ size_t l, size_t r)
+{
+ do_swap(base + inorder_to_eytzinger0(l, n) * size,
+ base + inorder_to_eytzinger0(r, n) * size,
+ size, swap_func, priv);
+}
+
+void eytzinger0_sort_r(void *base, size_t n, size_t size,
+ cmp_r_func_t cmp_func,
+ swap_r_func_t swap_func,
+ const void *priv)
+{
+ int i, c, r;
+
+ /* called from 'sort' without swap function, let's pick the default */
+ if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func)
+ swap_func = NULL;
+
+ if (!swap_func) {
+ if (is_aligned(base, size, 8))
+ swap_func = SWAP_WORDS_64;
+ else if (is_aligned(base, size, 4))
+ swap_func = SWAP_WORDS_32;
+ else
+ swap_func = SWAP_BYTES;
+ }
+
+ /* heapify */
+ for (i = n / 2 - 1; i >= 0; --i) {
+ for (r = i; r * 2 + 1 < n; r = c) {
+ c = r * 2 + 1;
+
+ if (c + 1 < n &&
+ eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0)
+ c++;
+
+ if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0)
+ break;
+
+ eytzinger0_do_swap(base, n, size, swap_func, priv, r, c);
+ }
+ }
+
+ /* sort */
+ for (i = n - 1; i > 0; --i) {
+ eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i);
+
+ for (r = 0; r * 2 + 1 < i; r = c) {
+ c = r * 2 + 1;
+
+ if (c + 1 < i &&
+ eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0)
+ c++;
+
+ if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0)
+ break;
+
+ eytzinger0_do_swap(base, n, size, swap_func, priv, r, c);
+ }
+ }
+}
+
+void eytzinger0_sort(void *base, size_t n, size_t size,
+ cmp_func_t cmp_func,
+ swap_func_t swap_func)
+{
+ struct wrapper w = {
+ .cmp = cmp_func,
+ .swap_func = swap_func,
+ };
+
+ return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w);
+}
diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h
index b04750dbf8..24840aee33 100644
--- a/fs/bcachefs/eytzinger.h
+++ b/fs/bcachefs/eytzinger.h
@@ -5,23 +5,33 @@
#include <linux/bitops.h>
#include <linux/log2.h>
-#include "util.h"
+#ifdef EYTZINGER_DEBUG
+#define EYTZINGER_BUG_ON(cond) BUG_ON(cond)
+#else
+#define EYTZINGER_BUG_ON(cond)
+#endif
/*
* Traversal for trees in eytzinger layout - a full binary tree layed out in an
- * array
- */
-
-/*
- * One based indexing version:
+ * array.
+ *
+ * Consider using an eytzinger tree any time you would otherwise be doing binary
+ * search over an array. Binary search is a worst case scenario for branch
+ * prediction and prefetching, but in an eytzinger tree every node's children
+ * are adjacent in memory, thus we can prefetch children before knowing the
+ * result of the comparison, assuming multiple nodes fit on a cacheline.
*
- * With one based indexing each level of the tree starts at a power of two -
- * good for cacheline alignment:
+ * Two variants are provided, for one based indexing and zero based indexing.
+ *
+ * Zero based indexing is more convenient, but one based indexing has better
+ * alignment and thus better performance because each new level of the tree
+ * starts at a power of two, and thus if element 0 was cacheline aligned, each
+ * new level will be as well.
*/
static inline unsigned eytzinger1_child(unsigned i, unsigned child)
{
- EBUG_ON(child > 1);
+ EYTZINGER_BUG_ON(child > 1);
return (i << 1) + child;
}
@@ -58,7 +68,7 @@ static inline unsigned eytzinger1_last(unsigned size)
static inline unsigned eytzinger1_next(unsigned i, unsigned size)
{
- EBUG_ON(i > size);
+ EYTZINGER_BUG_ON(i > size);
if (eytzinger1_right_child(i) <= size) {
i = eytzinger1_right_child(i);
@@ -74,7 +84,7 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size)
static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
{
- EBUG_ON(i > size);
+ EYTZINGER_BUG_ON(i > size);
if (eytzinger1_left_child(i) <= size) {
i = eytzinger1_left_child(i) + 1;
@@ -101,7 +111,7 @@ static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
unsigned shift = __fls(size) - b;
int s;
- EBUG_ON(!i || i > size);
+ EYTZINGER_BUG_ON(!i || i > size);
i ^= 1U << b;
i <<= 1;
@@ -126,7 +136,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size,
unsigned shift;
int s;
- EBUG_ON(!i || i > size);
+ EYTZINGER_BUG_ON(!i || i > size);
/*
* sign bit trick:
@@ -164,7 +174,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
static inline unsigned eytzinger0_child(unsigned i, unsigned child)
{
- EBUG_ON(child > 1);
+ EYTZINGER_BUG_ON(child > 1);
return (i << 1) + 1 + child;
}
@@ -231,11 +241,9 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
(_i) != -1; \
(_i) = eytzinger0_next((_i), (_size)))
-typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size);
-
/* return greatest node <= @search, or -1 if not found */
-static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
- eytzinger_cmp_fn cmp, const void *search)
+static inline int eytzinger0_find_le(void *base, size_t nr, size_t size,
+ cmp_func_t cmp, const void *search)
{
unsigned i, n = 0;
@@ -244,21 +252,38 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
do {
i = n;
- n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0);
+ n = eytzinger0_child(i, cmp(base + i * size, search) <= 0);
} while (n < nr);
if (n & 1) {
- /* @i was greater than @search, return previous node: */
-
- if (i == eytzinger0_first(nr))
- return -1;
-
+ /*
+ * @i was greater than @search, return previous node:
+ *
+ * if @i was leftmost/smallest element,
+ * eytzinger0_prev(eytzinger0_first())) returns -1, as expected
+ */
return eytzinger0_prev(i, nr);
} else {
return i;
}
}
+static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size,
+ cmp_func_t cmp, const void *search)
+{
+ ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search);
+
+ /*
+ * if eytitzinger0_find_le() returned -1 - no element was <= search - we
+ * want to return the first element; next/prev identities mean this work
+ * as expected
+ *
+ * similarly if find_le() returns last element, we should return -1;
+ * identities mean this all works out:
+ */
+ return eytzinger0_next(idx, nr);
+}
+
#define eytzinger0_find(base, nr, size, _cmp, search) \
({ \
void *_base = (base); \
@@ -269,13 +294,13 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
int _res; \
\
while (_i < _nr && \
- (_res = _cmp(_search, _base + _i * _size, _size))) \
+ (_res = _cmp(_search, _base + _i * _size))) \
_i = eytzinger0_child(_i, _res > 0); \
_i; \
})
-void eytzinger0_sort(void *, size_t, size_t,
- int (*cmp_func)(const void *, const void *, size_t),
- void (*swap_func)(void *, void *, size_t));
+void eytzinger0_sort_r(void *, size_t, size_t,
+ cmp_r_func_t, swap_r_func_t, const void *);
+void eytzinger0_sort(void *, size_t, size_t, cmp_func_t, swap_func_t);
#endif /* _EYTZINGER_H */
diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h
index 66b945be10..d8153fe270 100644
--- a/fs/bcachefs/fifo.h
+++ b/fs/bcachefs/fifo.h
@@ -24,12 +24,12 @@ struct { \
(fifo)->mask = (fifo)->size \
? roundup_pow_of_two((fifo)->size) - 1 \
: 0; \
- (fifo)->data = kvpmalloc(fifo_buf_size(fifo), (_gfp)); \
+ (fifo)->data = kvmalloc(fifo_buf_size(fifo), (_gfp)); \
})
#define free_fifo(fifo) \
do { \
- kvpfree((fifo)->data, fifo_buf_size(fifo)); \
+ kvfree((fifo)->data); \
(fifo)->data = NULL; \
} while (0)
diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c
index 1c1ea0f0c6..624e6f9632 100644
--- a/fs/bcachefs/fs-common.c
+++ b/fs/bcachefs/fs-common.c
@@ -107,6 +107,7 @@ int bch2_create_trans(struct btree_trans *trans,
u32 new_subvol, dir_snapshot;
ret = bch2_subvolume_create(trans, new_inode->bi_inum,
+ dir.subvol,
snapshot_src.subvol,
&new_subvol, &snapshot,
(flags & BCH_CREATE_SNAPSHOT_RO) != 0);
@@ -242,7 +243,7 @@ int bch2_unlink_trans(struct btree_trans *trans,
struct bch_inode_unpacked *dir_u,
struct bch_inode_unpacked *inode_u,
const struct qstr *name,
- bool deleting_snapshot)
+ bool deleting_subvol)
{
struct bch_fs *c = trans->c;
struct btree_iter dir_iter = { NULL };
@@ -260,8 +261,8 @@ int bch2_unlink_trans(struct btree_trans *trans,
dir_hash = bch2_hash_info_init(c, dir_u);
- ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
- name, &inum, BTREE_ITER_INTENT);
+ ret = bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
+ name, &inum, BTREE_ITER_INTENT);
if (ret)
goto err;
@@ -270,18 +271,25 @@ int bch2_unlink_trans(struct btree_trans *trans,
if (ret)
goto err;
- if (!deleting_snapshot && S_ISDIR(inode_u->bi_mode)) {
+ if (!deleting_subvol && S_ISDIR(inode_u->bi_mode)) {
ret = bch2_empty_dir_trans(trans, inum);
if (ret)
goto err;
}
- if (deleting_snapshot && !inode_u->bi_subvol) {
+ if (deleting_subvol && !inode_u->bi_subvol) {
ret = -BCH_ERR_ENOENT_not_subvol;
goto err;
}
- if (deleting_snapshot || inode_u->bi_subvol) {
+ if (inode_u->bi_subvol) {
+ /* Recursive subvolume destroy not allowed (yet?) */
+ ret = bch2_subvol_has_children(trans, inode_u->bi_subvol);
+ if (ret)
+ goto err;
+ }
+
+ if (deleting_subvol || inode_u->bi_subvol) {
ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol);
if (ret)
goto err;
@@ -349,6 +357,22 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u,
return ret;
}
+static int subvol_update_parent(struct btree_trans *trans, u32 subvol, u32 new_parent)
+{
+ struct btree_iter iter;
+ struct bkey_i_subvolume *s =
+ bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_subvolumes, POS(0, subvol),
+ BTREE_ITER_CACHED, subvolume);
+ int ret = PTR_ERR_OR_ZERO(s);
+ if (ret)
+ return ret;
+
+ s->v.fs_path_parent = cpu_to_le32(new_parent);
+ bch2_trans_iter_exit(trans, &iter);
+ return 0;
+}
+
int bch2_rename_trans(struct btree_trans *trans,
subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u,
subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u,
@@ -410,6 +434,36 @@ int bch2_rename_trans(struct btree_trans *trans,
goto err;
}
+ if (src_inode_u->bi_subvol &&
+ dst_dir.subvol != src_inode_u->bi_parent_subvol) {
+ ret = subvol_update_parent(trans, src_inode_u->bi_subvol, dst_dir.subvol);
+ if (ret)
+ goto err;
+ }
+
+ if (mode == BCH_RENAME_EXCHANGE &&
+ dst_inode_u->bi_subvol &&
+ src_dir.subvol != dst_inode_u->bi_parent_subvol) {
+ ret = subvol_update_parent(trans, dst_inode_u->bi_subvol, src_dir.subvol);
+ if (ret)
+ goto err;
+ }
+
+ /* Can't move across subvolumes, unless it's a subvolume root: */
+ if (src_dir.subvol != dst_dir.subvol &&
+ (!src_inode_u->bi_subvol ||
+ (dst_inum.inum && !dst_inode_u->bi_subvol))) {
+ ret = -EXDEV;
+ goto err;
+ }
+
+ if (src_inode_u->bi_parent_subvol)
+ src_inode_u->bi_parent_subvol = dst_dir.subvol;
+
+ if ((mode == BCH_RENAME_EXCHANGE) &&
+ dst_inode_u->bi_parent_subvol)
+ dst_inode_u->bi_parent_subvol = src_dir.subvol;
+
src_inode_u->bi_dir = dst_dir_u->bi_inum;
src_inode_u->bi_dir_offset = dst_offset;
@@ -432,10 +486,10 @@ int bch2_rename_trans(struct btree_trans *trans,
goto err;
}
- if (S_ISDIR(dst_inode_u->bi_mode) &&
- bch2_empty_dir_trans(trans, dst_inum)) {
- ret = -ENOTEMPTY;
- goto err;
+ if (S_ISDIR(dst_inode_u->bi_mode)) {
+ ret = bch2_empty_dir_trans(trans, dst_inum);
+ if (ret)
+ goto err;
}
}
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index 27710cdd57..39292e7ef3 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -810,7 +810,8 @@ static noinline void folios_trunc(folios *fs, struct folio **fi)
static int __bch2_buffered_write(struct bch_inode_info *inode,
struct address_space *mapping,
struct iov_iter *iter,
- loff_t pos, unsigned len)
+ loff_t pos, unsigned len,
+ bool inode_locked)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch2_folio_reservation res;
@@ -835,6 +836,15 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
BUG_ON(!fs.nr);
+ /*
+ * If we're not using the inode lock, we need to lock all the folios for
+ * atomiticity of writes vs. other writes:
+ */
+ if (!inode_locked && folio_end_pos(darray_last(fs)) < end) {
+ ret = -BCH_ERR_need_inode_lock;
+ goto out;
+ }
+
f = darray_first(fs);
if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
ret = bch2_read_single_folio(f, mapping);
@@ -929,8 +939,10 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
end = pos + copied;
spin_lock(&inode->v.i_lock);
- if (end > inode->v.i_size)
+ if (end > inode->v.i_size) {
+ BUG_ON(!inode_locked);
i_size_write(&inode->v, end);
+ }
spin_unlock(&inode->v.i_lock);
f_pos = pos;
@@ -974,12 +986,68 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct bch_inode_info *inode = file_bch_inode(file);
- loff_t pos = iocb->ki_pos;
- ssize_t written = 0;
- int ret = 0;
+ loff_t pos;
+ bool inode_locked = false;
+ ssize_t written = 0, written2 = 0, ret = 0;
+
+ /*
+ * We don't take the inode lock unless i_size will be changing. Folio
+ * locks provide exclusion with other writes, and the pagecache add lock
+ * provides exclusion with truncate and hole punching.
+ *
+ * There is one nasty corner case where atomicity would be broken
+ * without great care: when copying data from userspace to the page
+ * cache, we do that with faults disable - a page fault would recurse
+ * back into the filesystem, taking filesystem locks again, and
+ * deadlock; so it's done with faults disabled, and we fault in the user
+ * buffer when we aren't holding locks.
+ *
+ * If we do part of the write, but we then race and in the userspace
+ * buffer have been evicted and are no longer resident, then we have to
+ * drop our folio locks to re-fault them in, breaking write atomicity.
+ *
+ * To fix this, we restart the write from the start, if we weren't
+ * holding the inode lock.
+ *
+ * There is another wrinkle after that; if we restart the write from the
+ * start, and then get an unrecoverable error, we _cannot_ claim to
+ * userspace that we did not write data we actually did - so we must
+ * track (written2) the most we ever wrote.
+ */
+
+ if ((iocb->ki_flags & IOCB_APPEND) ||
+ (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) {
+ inode_lock(&inode->v);
+ inode_locked = true;
+ }
+
+ ret = generic_write_checks(iocb, iter);
+ if (ret <= 0)
+ goto unlock;
+
+ ret = file_remove_privs_flags(file, !inode_locked ? IOCB_NOWAIT : 0);
+ if (ret) {
+ if (!inode_locked) {
+ inode_lock(&inode->v);
+ inode_locked = true;
+ ret = file_remove_privs_flags(file, 0);
+ }
+ if (ret)
+ goto unlock;
+ }
+
+ ret = file_update_time(file);
+ if (ret)
+ goto unlock;
+
+ pos = iocb->ki_pos;
bch2_pagecache_add_get(inode);
+ if (!inode_locked &&
+ (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v)))
+ goto get_inode_lock;
+
do {
unsigned offset = pos & (PAGE_SIZE - 1);
unsigned bytes = iov_iter_count(iter);
@@ -1004,12 +1072,17 @@ again:
}
}
+ if (unlikely(bytes != iov_iter_count(iter) && !inode_locked))
+ goto get_inode_lock;
+
if (unlikely(fatal_signal_pending(current))) {
ret = -EINTR;
break;
}
- ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
+ ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, inode_locked);
+ if (ret == -BCH_ERR_need_inode_lock)
+ goto get_inode_lock;
if (unlikely(ret < 0))
break;
@@ -1030,50 +1103,46 @@ again:
}
pos += ret;
written += ret;
+ written2 = max(written, written2);
+
+ if (ret != bytes && !inode_locked)
+ goto get_inode_lock;
ret = 0;
balance_dirty_pages_ratelimited(mapping);
- } while (iov_iter_count(iter));
+ if (0) {
+get_inode_lock:
+ bch2_pagecache_add_put(inode);
+ inode_lock(&inode->v);
+ inode_locked = true;
+ bch2_pagecache_add_get(inode);
+
+ iov_iter_revert(iter, written);
+ pos -= written;
+ written = 0;
+ ret = 0;
+ }
+ } while (iov_iter_count(iter));
bch2_pagecache_add_put(inode);
+unlock:
+ if (inode_locked)
+ inode_unlock(&inode->v);
+
+ iocb->ki_pos += written;
- return written ? written : ret;
+ ret = max(written, written2) ?: ret;
+ if (ret > 0)
+ ret = generic_write_sync(iocb, ret);
+ return ret;
}
-ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
+ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter)
{
- struct file *file = iocb->ki_filp;
- struct bch_inode_info *inode = file_bch_inode(file);
- ssize_t ret;
-
- if (iocb->ki_flags & IOCB_DIRECT) {
- ret = bch2_direct_write(iocb, from);
- goto out;
- }
-
- inode_lock(&inode->v);
-
- ret = generic_write_checks(iocb, from);
- if (ret <= 0)
- goto unlock;
-
- ret = file_remove_privs(file);
- if (ret)
- goto unlock;
-
- ret = file_update_time(file);
- if (ret)
- goto unlock;
-
- ret = bch2_buffered_write(iocb, from);
- if (likely(ret > 0))
- iocb->ki_pos += ret;
-unlock:
- inode_unlock(&inode->v);
+ ssize_t ret = iocb->ki_flags & IOCB_DIRECT
+ ? bch2_direct_write(iocb, iter)
+ : bch2_buffered_write(iocb, iter);
- if (ret > 0)
- ret = generic_write_sync(iocb, ret);
-out:
return bch2_err_class(ret);
}
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
index 33cb6da3a5..b889370a50 100644
--- a/fs/bcachefs/fs-io-direct.c
+++ b/fs/bcachefs/fs-io-direct.c
@@ -387,6 +387,8 @@ static __always_inline long bch2_dio_write_done(struct dio_write *dio)
ret = dio->op.error ?: ((long) dio->written << 9);
bio_put(&dio->op.wbio.bio);
+ bch2_write_ref_put(dio->op.c, BCH_WRITE_REF_dio_write);
+
/* inode->i_dio_count is our ref on inode and thus bch_fs */
inode_dio_end(&inode->v);
@@ -536,7 +538,7 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
if (likely(!dio->iter.count) || dio->op.error)
break;
- bio_reset(bio, NULL, REQ_OP_WRITE);
+ bio_reset(bio, NULL, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
}
out:
return bch2_dio_write_done(dio);
@@ -590,22 +592,25 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
prefetch(&inode->ei_inode);
prefetch((void *) &inode->ei_inode + 64);
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_dio_write))
+ return -EROFS;
+
inode_lock(&inode->v);
ret = generic_write_checks(req, iter);
if (unlikely(ret <= 0))
- goto err;
+ goto err_put_write_ref;
ret = file_remove_privs(file);
if (unlikely(ret))
- goto err;
+ goto err_put_write_ref;
ret = file_update_time(file);
if (unlikely(ret))
- goto err;
+ goto err_put_write_ref;
if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1)))
- goto err;
+ goto err_put_write_ref;
inode_dio_begin(&inode->v);
bch2_pagecache_block_get(inode);
@@ -618,7 +623,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
bio = bio_alloc_bioset(NULL,
bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
- REQ_OP_WRITE,
+ REQ_OP_WRITE | REQ_SYNC | REQ_IDLE,
GFP_KERNEL,
&c->dio_write_bioset);
dio = container_of(bio, struct dio_write, op.wbio.bio);
@@ -645,7 +650,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
}
ret = bch2_dio_write_loop(dio);
-err:
+out:
if (locked)
inode_unlock(&inode->v);
return ret;
@@ -653,7 +658,9 @@ err_put_bio:
bch2_pagecache_block_put(inode);
bio_put(bio);
inode_dio_end(&inode->v);
- goto err;
+err_put_write_ref:
+ bch2_write_ref_put(c, BCH_WRITE_REF_dio_write);
+ goto out;
}
void bch2_fs_fs_io_direct_exit(struct bch_fs *c)
diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h
index 8cbaba6565..828c3d7c8f 100644
--- a/fs/bcachefs/fs-io-pagecache.h
+++ b/fs/bcachefs/fs-io-pagecache.h
@@ -51,13 +51,10 @@ enum bch_folio_sector_state {
struct bch_folio_sector {
/* Uncompressed, fully allocated replicas (or on disk reservation): */
- unsigned nr_replicas:4;
-
+ u8 nr_replicas:4,
/* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */
- unsigned replicas_reserved:4;
-
- /* i_sectors: */
- enum bch_folio_sector_state state:8;
+ replicas_reserved:4;
+ u8 state;
};
struct bch_folio {
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index 8c70123b6a..20b4047742 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -174,18 +174,18 @@ void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
static int bch2_flush_inode(struct bch_fs *c,
struct bch_inode_info *inode)
{
- struct bch_inode_unpacked u;
- int ret;
-
if (c->opts.journal_flush_disabled)
return 0;
- ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u);
- if (ret)
- return ret;
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync))
+ return -EROFS;
- return bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?:
- bch2_inode_flush_nocow_writes(c, inode);
+ struct bch_inode_unpacked u;
+ int ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u) ?:
+ bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?:
+ bch2_inode_flush_nocow_writes(c, inode);
+ bch2_write_ref_put(c, BCH_WRITE_REF_fsync);
+ return ret;
}
int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 77ae65542d..6f114803c6 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -108,7 +108,8 @@ retry:
goto retry;
bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
- "inode %u:%llu not found when updating",
+ "%s: inode %u:%llu not found when updating",
+ bch2_err_str(ret),
inode_inum(inode).subvol,
inode_inum(inode).inum);
@@ -176,45 +177,91 @@ static unsigned bch2_inode_hash(subvol_inum inum)
return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
}
-struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
+static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode)
{
- struct bch_inode_unpacked inode_u;
- struct bch_inode_info *inode;
- struct btree_trans *trans;
- struct bch_subvolume subvol;
- int ret;
+ subvol_inum inum = inode_inum(inode);
+ struct bch_inode_info *old = to_bch_ei(inode_insert5(&inode->v,
+ bch2_inode_hash(inum),
+ bch2_iget5_test,
+ bch2_iget5_set,
+ &inum));
+ BUG_ON(!old);
- inode = to_bch_ei(iget5_locked(c->vfs_sb,
- bch2_inode_hash(inum),
- bch2_iget5_test,
- bch2_iget5_set,
- &inum));
- if (unlikely(!inode))
- return ERR_PTR(-ENOMEM);
- if (!(inode->v.i_state & I_NEW))
- return &inode->v;
+ if (unlikely(old != inode)) {
+ __destroy_inode(&inode->v);
+ kmem_cache_free(bch2_inode_cache, inode);
+ inode = old;
+ } else {
+ mutex_lock(&c->vfs_inodes_lock);
+ list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
+ mutex_unlock(&c->vfs_inodes_lock);
+ /*
+ * we really don't want insert_inode_locked2() to be setting
+ * I_NEW...
+ */
+ unlock_new_inode(&inode->v);
+ }
- trans = bch2_trans_get(c);
- ret = lockrestart_do(trans,
- bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
- bch2_inode_find_by_inum_trans(trans, inum, &inode_u));
+ return inode;
+}
- if (!ret)
- bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
- bch2_trans_put(trans);
+#define memalloc_flags_do(_flags, _do) \
+({ \
+ unsigned _saved_flags = memalloc_flags_save(_flags); \
+ typeof(_do) _ret = _do; \
+ memalloc_noreclaim_restore(_saved_flags); \
+ _ret; \
+})
- if (ret) {
- iget_failed(&inode->v);
- return ERR_PTR(bch2_err_class(ret));
+/*
+ * Allocate a new inode, dropping/retaking btree locks if necessary:
+ */
+static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+
+ struct bch_inode_info *inode =
+ memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN,
+ to_bch_ei(new_inode(c->vfs_sb)));
+
+ if (unlikely(!inode)) {
+ int ret = drop_locks_do(trans, (inode = to_bch_ei(new_inode(c->vfs_sb))) ? 0 : -ENOMEM);
+ if (ret && inode) {
+ __destroy_inode(&inode->v);
+ kmem_cache_free(bch2_inode_cache, inode);
+ }
+ if (ret)
+ return ERR_PTR(ret);
}
- mutex_lock(&c->vfs_inodes_lock);
- list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
- mutex_unlock(&c->vfs_inodes_lock);
+ return inode;
+}
+
+struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
+{
+ struct bch_inode_info *inode =
+ to_bch_ei(ilookup5_nowait(c->vfs_sb,
+ bch2_inode_hash(inum),
+ bch2_iget5_test,
+ &inum));
+ if (inode)
+ return &inode->v;
- unlock_new_inode(&inode->v);
+ struct btree_trans *trans = bch2_trans_get(c);
- return &inode->v;
+ struct bch_inode_unpacked inode_u;
+ struct bch_subvolume subvol;
+ int ret = lockrestart_do(trans,
+ bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
+ bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?:
+ PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
+ if (!ret) {
+ bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
+ inode = bch2_inode_insert(c, inode);
+ }
+ bch2_trans_put(trans);
+
+ return ret ? ERR_PTR(ret) : &inode->v;
}
struct bch_inode_info *
@@ -226,7 +273,7 @@ __bch2_create(struct mnt_idmap *idmap,
struct bch_fs *c = dir->v.i_sb->s_fs_info;
struct btree_trans *trans;
struct bch_inode_unpacked dir_u;
- struct bch_inode_info *inode, *old;
+ struct bch_inode_info *inode;
struct bch_inode_unpacked inode_u;
struct posix_acl *default_acl = NULL, *acl = NULL;
subvol_inum inum;
@@ -293,7 +340,6 @@ err_before_quota:
mutex_unlock(&dir->ei_update_lock);
}
- bch2_iget5_set(&inode->v, &inum);
bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
@@ -304,36 +350,7 @@ err_before_quota:
* bch2_trans_exit() and dropping locks, else we could race with another
* thread pulling the inode in and modifying it:
*/
-
- inode->v.i_state |= I_CREATING;
-
- old = to_bch_ei(inode_insert5(&inode->v,
- bch2_inode_hash(inum),
- bch2_iget5_test,
- bch2_iget5_set,
- &inum));
- BUG_ON(!old);
-
- if (unlikely(old != inode)) {
- /*
- * We raced, another process pulled the new inode into cache
- * before us:
- */
- make_bad_inode(&inode->v);
- iput(&inode->v);
-
- inode = old;
- } else {
- mutex_lock(&c->vfs_inodes_lock);
- list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
- mutex_unlock(&c->vfs_inodes_lock);
- /*
- * we really don't want insert_inode_locked2() to be setting
- * I_NEW...
- */
- unlock_new_inode(&inode->v);
- }
-
+ inode = bch2_inode_insert(c, inode);
bch2_trans_put(trans);
err:
posix_acl_release(default_acl);
@@ -352,23 +369,78 @@ err_trans:
/* methods */
+static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
+ subvol_inum dir, struct bch_hash_info *dir_hash_info,
+ const struct qstr *name)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter dirent_iter = {};
+ subvol_inum inum = {};
+
+ int ret = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
+ dir_hash_info, dir, name, 0);
+ if (ret)
+ return ERR_PTR(ret);
+
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(&dirent_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum);
+ if (ret > 0)
+ ret = -ENOENT;
+ if (ret)
+ goto err;
+
+ struct bch_inode_info *inode =
+ to_bch_ei(ilookup5_nowait(c->vfs_sb,
+ bch2_inode_hash(inum),
+ bch2_iget5_test,
+ &inum));
+ if (inode)
+ goto out;
+
+ struct bch_subvolume subvol;
+ struct bch_inode_unpacked inode_u;
+ ret = bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
+ bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
+ PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
+ if (bch2_err_matches(ret, ENOENT)) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch_err(c, "%s points to missing inode", buf.buf);
+ printbuf_exit(&buf);
+ }
+ if (ret)
+ goto err;
+
+ bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
+ inode = bch2_inode_insert(c, inode);
+out:
+ bch2_trans_iter_exit(trans, &dirent_iter);
+ return inode;
+err:
+ inode = ERR_PTR(ret);
+ goto out;
+}
+
static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
unsigned int flags)
{
struct bch_fs *c = vdir->i_sb->s_fs_info;
struct bch_inode_info *dir = to_bch_ei(vdir);
struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
- struct inode *vinode = NULL;
- subvol_inum inum = { .subvol = 1 };
- int ret;
- ret = bch2_dirent_lookup(c, inode_inum(dir), &hash,
- &dentry->d_name, &inum);
-
- if (!ret)
- vinode = bch2_vfs_inode_get(c, inum);
+ struct bch_inode_info *inode;
+ bch2_trans_do(c, NULL, NULL, 0,
+ PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir),
+ &hash, &dentry->d_name)));
+ if (IS_ERR(inode))
+ inode = NULL;
- return d_splice_alias(vinode, dentry);
+ return d_splice_alias(&inode->v, dentry);
}
static int bch2_mknod(struct mnt_idmap *idmap,
@@ -892,7 +964,6 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
struct btree_iter iter;
struct bkey_s_c k;
struct bkey_buf cur, prev;
- struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
unsigned offset_into_extent, sectors;
bool have_extent = false;
u32 snapshot;
@@ -902,6 +973,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
if (ret)
return ret;
+ struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
if (start + len < start)
return -EINVAL;
@@ -1372,6 +1444,7 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
struct bch_inode_unpacked *bi,
struct bch_subvolume *subvol)
{
+ bch2_iget5_set(&inode->v, &inum);
bch2_inode_update_after_write(trans, inode, bi, ~0);
if (BCH_SUBVOLUME_SNAP(subvol))
@@ -1572,7 +1645,6 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
* number:
*/
u64 avail_inodes = ((usage.capacity - usage.used) << 3);
- u64 fsid;
buf->f_type = BCACHEFS_STATFS_MAGIC;
buf->f_bsize = sb->s_blocksize;
@@ -1583,10 +1655,7 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_files = usage.nr_inodes + avail_inodes;
buf->f_ffree = avail_inodes;
- fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
- le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
- buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
- buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
+ buf->f_fsid = uuid_to_fsid(c->sb.user_uuid.b);
buf->f_namelen = BCH_NAME_MAX;
return 0;
@@ -1805,8 +1874,10 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
opt_set(opts, read_only, (flags & SB_RDONLY) != 0);
ret = bch2_parse_mount_opts(NULL, &opts, data);
- if (ret)
+ if (ret) {
+ ret = bch2_err_class(ret);
return ERR_PTR(ret);
+ }
if (!dev_name || strlen(dev_name) == 0)
return ERR_PTR(-EINVAL);
@@ -1882,6 +1953,7 @@ got_sb:
sb->s_time_gran = c->sb.nsec_per_time_unit;
sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec);
+ sb->s_uuid = c->sb.user_uuid;
c->vfs_sb = sb;
strscpy(sb->s_id, c->name, sizeof(sb->s_id));
@@ -1928,6 +2000,7 @@ out:
return dget(sb->s_root);
err_put_super:
+ __bch2_fs_stop(c);
deactivate_locked_super(sb);
return ERR_PTR(bch2_err_class(ret));
}
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 6a760777ba..8e2010212c 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -12,7 +12,7 @@
#include "fsck.h"
#include "inode.h"
#include "keylist.h"
-#include "recovery.h"
+#include "recovery_passes.h"
#include "snapshot.h"
#include "super.h"
#include "xattr.h"
@@ -63,9 +63,7 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol,
u32 *snapshot, u64 *inum)
{
struct bch_subvolume s;
- int ret;
-
- ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
+ int ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
*snapshot = le32_to_cpu(s.snapshot);
*inum = le64_to_cpu(s.inode);
@@ -100,8 +98,8 @@ err:
}
static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
- struct bch_inode_unpacked *inode,
- u32 *snapshot)
+ struct bch_inode_unpacked *inode,
+ u32 *snapshot)
{
struct btree_iter iter;
struct bkey_s_c k;
@@ -142,34 +140,6 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans,
return 0;
}
-static int __write_inode(struct btree_trans *trans,
- struct bch_inode_unpacked *inode,
- u32 snapshot)
-{
- struct bkey_inode_buf *inode_p =
- bch2_trans_kmalloc(trans, sizeof(*inode_p));
-
- if (IS_ERR(inode_p))
- return PTR_ERR(inode_p);
-
- bch2_inode_pack(inode_p, inode);
- inode_p->inode.k.p.snapshot = snapshot;
-
- return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
- &inode_p->inode.k_i,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
-}
-
-static int fsck_write_inode(struct btree_trans *trans,
- struct bch_inode_unpacked *inode,
- u32 snapshot)
-{
- int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- __write_inode(trans, inode, snapshot));
- bch_err_fn(trans->c, ret);
- return ret;
-}
-
static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
{
struct bch_fs *c = trans->c;
@@ -186,9 +156,10 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
- ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
- &dir_hash_info, &iter,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
+ &dir_hash_info, &iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
bch2_trans_iter_exit(trans, &iter);
err:
bch_err_fn(c, ret);
@@ -197,7 +168,8 @@ err:
/* Get lost+found, create if it doesn't exist: */
static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
- struct bch_inode_unpacked *lostfound)
+ struct bch_inode_unpacked *lostfound,
+ u64 reattaching_inum)
{
struct bch_fs *c = trans->c;
struct qstr lostfound_str = QSTR("lost+found");
@@ -212,19 +184,36 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
return ret;
subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) };
- u32 subvol_snapshot;
- ret = subvol_lookup(trans, le32_to_cpu(st.master_subvol),
- &subvol_snapshot, &root_inum.inum);
- bch_err_msg(c, ret, "looking up root subvol");
+ struct bch_subvolume subvol;
+ ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol),
+ false, 0, &subvol);
+ bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u",
+ le32_to_cpu(st.master_subvol), snapshot);
if (ret)
return ret;
+ if (!subvol.inode) {
+ struct btree_iter iter;
+ struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_subvolumes, POS(0, le32_to_cpu(st.master_subvol)),
+ 0, subvolume);
+ ret = PTR_ERR_OR_ZERO(subvol);
+ if (ret)
+ return ret;
+
+ subvol->v.inode = cpu_to_le64(reattaching_inum);
+ bch2_trans_iter_exit(trans, &iter);
+ }
+
+ root_inum.inum = le64_to_cpu(subvol.inode);
+
struct bch_inode_unpacked root_inode;
struct bch_hash_info root_hash_info;
u32 root_inode_snapshot = snapshot;
ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot);
- bch_err_msg(c, ret, "looking up root inode");
+ bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
+ root_inum.inum, le32_to_cpu(st.master_subvol));
if (ret)
return ret;
@@ -280,7 +269,7 @@ create_lostfound:
goto err;
ret = bch2_dirent_create_snapshot(trans,
- root_inode.bi_inum, snapshot, &root_hash_info,
+ 0, root_inode.bi_inum, snapshot, &root_hash_info,
mode_to_type(lostfound->bi_mode),
&lostfound_str,
lostfound->bi_inum,
@@ -303,30 +292,47 @@ static int reattach_inode(struct btree_trans *trans,
char name_buf[20];
struct qstr name;
u64 dir_offset = 0;
+ u32 dirent_snapshot = inode_snapshot;
int ret;
- ret = lookup_lostfound(trans, inode_snapshot, &lostfound);
+ if (inode->bi_subvol) {
+ inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;
+
+ u64 root_inum;
+ ret = subvol_lookup(trans, inode->bi_parent_subvol,
+ &dirent_snapshot, &root_inum);
+ if (ret)
+ return ret;
+
+ snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol);
+ } else {
+ snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
+ }
+
+ ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
if (ret)
return ret;
if (S_ISDIR(inode->bi_mode)) {
lostfound.bi_nlink++;
- ret = __write_inode(trans, &lostfound, U32_MAX);
+ ret = __bch2_fsck_write_inode(trans, &lostfound, U32_MAX);
if (ret)
return ret;
}
dir_hash = bch2_hash_info_init(trans->c, &lostfound);
- snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
name = (struct qstr) QSTR(name_buf);
ret = bch2_dirent_create_snapshot(trans,
- lostfound.bi_inum, inode_snapshot,
+ inode->bi_parent_subvol, lostfound.bi_inum,
+ dirent_snapshot,
&dir_hash,
inode_d_type(inode),
- &name, inode->bi_inum, &dir_offset,
+ &name,
+ inode->bi_subvol ?: inode->bi_inum,
+ &dir_offset,
BCH_HASH_SET_MUST_CREATE);
if (ret)
return ret;
@@ -334,7 +340,7 @@ static int reattach_inode(struct btree_trans *trans,
inode->bi_dir = lostfound.bi_inum;
inode->bi_dir_offset = dir_offset;
- return __write_inode(trans, inode, inode_snapshot);
+ return __bch2_fsck_write_inode(trans, inode, inode_snapshot);
}
static int remove_backpointer(struct btree_trans *trans,
@@ -353,6 +359,133 @@ static int remove_backpointer(struct btree_trans *trans,
return ret;
}
+static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s)
+{
+ struct bch_fs *c = trans->c;
+
+ struct bch_inode_unpacked inode;
+ int ret = bch2_inode_find_by_inum_trans(trans,
+ (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
+ &inode);
+ if (ret)
+ return ret;
+
+ ret = remove_backpointer(trans, &inode);
+ bch_err_msg(c, ret, "removing dirent");
+ if (ret)
+ return ret;
+
+ ret = reattach_inode(trans, &inode, le32_to_cpu(s.v->snapshot));
+ bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
+ return ret;
+}
+
+static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum)
+{
+ struct bch_fs *c = trans->c;
+
+ if (!bch2_snapshot_is_leaf(c, snapshotid)) {
+ bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
+ return -BCH_ERR_fsck_repair_unimplemented;
+ }
+
+ /*
+ * If inum isn't set, that means we're being called from check_dirents,
+ * not check_inodes - the root of this subvolume doesn't exist or we
+ * would have found it there:
+ */
+ if (!inum) {
+ struct btree_iter inode_iter = {};
+ struct bch_inode_unpacked new_inode;
+ u64 cpu = raw_smp_processor_id();
+
+ bch2_inode_init_early(c, &new_inode);
+ bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);
+
+ new_inode.bi_subvol = subvolid;
+
+ int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
+ bch2_btree_iter_traverse(&inode_iter) ?:
+ bch2_inode_write(trans, &inode_iter, &new_inode);
+ bch2_trans_iter_exit(trans, &inode_iter);
+ if (ret)
+ return ret;
+
+ inum = new_inode.bi_inum;
+ }
+
+ bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum);
+
+ struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
+ int ret = PTR_ERR_OR_ZERO(new_subvol);
+ if (ret)
+ return ret;
+
+ bkey_subvolume_init(&new_subvol->k_i);
+ new_subvol->k.p.offset = subvolid;
+ new_subvol->v.snapshot = cpu_to_le32(snapshotid);
+ new_subvol->v.inode = cpu_to_le64(inum);
+ ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0);
+ if (ret)
+ return ret;
+
+ struct btree_iter iter;
+ struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_snapshots, POS(0, snapshotid),
+ 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(s);
+ bch_err_msg(c, ret, "getting snapshot %u", snapshotid);
+ if (ret)
+ return ret;
+
+ u32 snapshot_tree = le32_to_cpu(s->v.tree);
+
+ s->v.subvol = cpu_to_le32(subvolid);
+ SET_BCH_SNAPSHOT_SUBVOL(&s->v, true);
+ bch2_trans_iter_exit(trans, &iter);
+
+ struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_snapshot_trees, POS(0, snapshot_tree),
+ 0, snapshot_tree);
+ ret = PTR_ERR_OR_ZERO(st);
+ bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree);
+ if (ret)
+ return ret;
+
+ if (!st->v.master_subvol)
+ st->v.master_subvol = cpu_to_le32(subvolid);
+
+ bch2_trans_iter_exit(trans, &iter);
+ return 0;
+}
+
+static int reconstruct_inode(struct btree_trans *trans, u32 snapshot, u64 inum, u64 size, unsigned mode)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_inode_unpacked new_inode;
+
+ bch2_inode_init_early(c, &new_inode);
+ bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, mode|0755, 0, NULL);
+ new_inode.bi_size = size;
+ new_inode.bi_inum = inum;
+
+ return __bch2_fsck_write_inode(trans, &new_inode, snapshot);
+}
+
+static int reconstruct_reg_inode(struct btree_trans *trans, u32 snapshot, u64 inum)
+{
+ struct btree_iter iter = {};
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
+ struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter);
+ bch2_trans_iter_exit(trans, &iter);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ return reconstruct_inode(trans, snapshot, inum, k.k->p.offset << 9, S_IFREG);
+}
+
struct snapshots_seen_entry {
u32 id;
u32 equiv;
@@ -592,13 +725,12 @@ static int get_inodes_all_snapshots(struct btree_trans *trans,
}
static struct inode_walker_entry *
-lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w,
- u32 snapshot, bool is_whiteout)
+lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k)
{
- struct inode_walker_entry *i;
-
- snapshot = bch2_snapshot_equiv(c, snapshot);
+ bool is_whiteout = k.k->type == KEY_TYPE_whiteout;
+ u32 snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
+ struct inode_walker_entry *i;
__darray_for_each(w->inodes, i)
if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot))
goto found;
@@ -609,20 +741,24 @@ found:
if (snapshot != i->snapshot && !is_whiteout) {
struct inode_walker_entry new = *i;
- size_t pos;
- int ret;
new.snapshot = snapshot;
new.count = 0;
- bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u",
- w->last_pos.inode, snapshot, i->snapshot);
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
+ "unexpected because we should always update the inode when we update a key in that inode\n"
+ "%s",
+ w->last_pos.inode, snapshot, i->snapshot, buf.buf);
+ printbuf_exit(&buf);
while (i > w->inodes.data && i[-1].snapshot > snapshot)
--i;
- pos = i - w->inodes.data;
- ret = darray_insert_item(&w->inodes, pos, new);
+ size_t pos = i - w->inodes.data;
+ int ret = darray_insert_item(&w->inodes, pos, new);
if (ret)
return ERR_PTR(ret);
@@ -633,21 +769,21 @@ found:
}
static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
- struct inode_walker *w, struct bpos pos,
- bool is_whiteout)
+ struct inode_walker *w,
+ struct bkey_s_c k)
{
- if (w->last_pos.inode != pos.inode) {
- int ret = get_inodes_all_snapshots(trans, w, pos.inode);
+ if (w->last_pos.inode != k.k->p.inode) {
+ int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
if (ret)
return ERR_PTR(ret);
- } else if (bkey_cmp(w->last_pos, pos)) {
+ } else if (bkey_cmp(w->last_pos, k.k->p)) {
darray_for_each(w->inodes, i)
i->seen_this_pos = false;
}
- w->last_pos = pos;
+ w->last_pos = k.k->p;
- return lookup_inode_for_snapshot(trans->c, w, pos.snapshot, is_whiteout);
+ return lookup_inode_for_snapshot(trans->c, w, k);
}
static int __get_visible_inodes(struct btree_trans *trans,
@@ -722,7 +858,7 @@ static int hash_redo_key(struct btree_trans *trans,
delete->k.p = k_iter->pos;
return bch2_btree_iter_traverse(k_iter) ?:
bch2_trans_update(trans, k_iter, delete, 0) ?:
- bch2_hash_set_snapshot(trans, desc, hash_info,
+ bch2_hash_set_in_snapshot(trans, desc, hash_info,
(subvol_inum) { 0, k.k->p.inode },
k.k->p.snapshot, tmp,
BCH_HASH_SET_MUST_CREATE,
@@ -795,16 +931,93 @@ fsck_err:
goto out;
}
+static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos pos)
+{
+ return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
+}
+
+static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_inode_unpacked *inode,
+ u32 *snapshot)
+{
+ if (inode->bi_subvol) {
+ u64 inum;
+ int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum);
+ if (ret)
+ return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) });
+ }
+
+ return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
+}
+
+static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
+ struct bkey_s_c_dirent d)
+{
+ return inode->bi_dir == d.k->p.inode &&
+ inode->bi_dir_offset == d.k->p.offset;
+}
+
+static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *inode)
+{
+ return d.v->d_type == DT_SUBVOL
+ ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol
+ : le64_to_cpu(d.v->d_inum) == inode->bi_inum;
+}
+
static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
{
struct btree_iter iter;
struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
- int ret = bkey_err(k);
- if (ret)
+ int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c inode_k,
+ struct bch_inode_unpacked *inode,
+ u32 inode_snapshot, bool *write_inode)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+
+ struct btree_iter dirent_iter = {};
+ struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
+ int ret = bkey_err(d);
+ if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
- bch2_trans_iter_exit(trans, &iter);
- return k.k->type == KEY_TYPE_set;
+ if (fsck_err_on(ret,
+ c, inode_points_to_missing_dirent,
+ "inode points to missing dirent\n%s",
+ (bch2_bkey_val_to_text(&buf, c, inode_k), buf.buf)) ||
+ fsck_err_on(!ret && !dirent_points_to_inode(d, inode),
+ c, inode_points_to_wrong_dirent,
+ "inode points to dirent that does not point back:\n%s",
+ (bch2_bkey_val_to_text(&buf, c, inode_k),
+ prt_newline(&buf),
+ bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
+ /*
+ * We just clear the backpointer fields for now. If we find a
+ * dirent that points to this inode in check_dirents(), we'll
+ * update it then; then when we get to check_path() if the
+ * backpointer is still 0 we'll reattach it.
+ */
+ inode->bi_dir = 0;
+ inode->bi_dir_offset = 0;
+ inode->bi_flags &= ~BCH_INODE_backptr_untrusted;
+ *write_inode = true;
+ }
+
+ ret = 0;
+fsck_err:
+ bch2_trans_iter_exit(trans, &dirent_iter);
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
}
static int check_inode(struct btree_trans *trans,
@@ -861,7 +1074,8 @@ static int check_inode(struct btree_trans *trans,
u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked;
- ret = __write_inode(trans, &u, iter->pos.snapshot);
+ ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
+
bch_err_msg(c, ret, "in fsck updating inode");
if (ret)
return ret;
@@ -876,7 +1090,7 @@ static int check_inode(struct btree_trans *trans,
if (ret < 0)
return ret;
- fsck_err_on(ret, c, unlinked_inode_not_on_deleted_list,
+ fsck_err_on(!ret, c, unlinked_inode_not_on_deleted_list,
"inode %llu:%u unlinked, but not on deleted list",
u.bi_inum, k.k->p.snapshot);
ret = 0;
@@ -950,8 +1164,54 @@ static int check_inode(struct btree_trans *trans,
do_update = true;
}
+ if (u.bi_dir || u.bi_dir_offset) {
+ ret = check_inode_dirent_inode(trans, k, &u, k.k->p.snapshot, &do_update);
+ if (ret)
+ goto err;
+ }
+
+ if (fsck_err_on(u.bi_parent_subvol &&
+ (u.bi_subvol == 0 ||
+ u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
+ c, inode_bi_parent_nonzero,
+ "inode %llu:%u has subvol %u but nonzero parent subvol %u",
+ u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) {
+ u.bi_parent_subvol = 0;
+ do_update = true;
+ }
+
+ if (u.bi_subvol) {
+ struct bch_subvolume s;
+
+ ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ goto err;
+
+ if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
+ ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum);
+ goto do_update;
+ }
+
+ if (fsck_err_on(ret,
+ c, inode_bi_subvol_missing,
+ "inode %llu:%u bi_subvol points to missing subvolume %u",
+ u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
+ fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
+ !bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
+ k.k->p.snapshot),
+ c, inode_bi_subvol_wrong,
+ "inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
+ u.bi_inum, k.k->p.snapshot, u.bi_subvol,
+ le64_to_cpu(s.inode),
+ le32_to_cpu(s.snapshot))) {
+ u.bi_subvol = 0;
+ u.bi_parent_subvol = 0;
+ do_update = true;
+ }
+ }
+do_update:
if (do_update) {
- ret = __write_inode(trans, &u, iter->pos.snapshot);
+ ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
bch_err_msg(c, ret, "in fsck updating inode");
if (ret)
return ret;
@@ -982,32 +1242,9 @@ int bch2_check_inodes(struct bch_fs *c)
return ret;
}
-static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos pos)
-{
- return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
-}
-
-static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
- struct bkey_s_c_dirent d)
-{
- return inode->bi_dir == d.k->p.inode &&
- inode->bi_dir_offset == d.k->p.offset;
-}
-
-static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
- struct bch_inode_unpacked *inode)
-{
- return d.v->d_type == DT_SUBVOL
- ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol
- : le64_to_cpu(d.v->d_inum) == inode->bi_inum;
-}
-
-static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
+static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
{
struct bch_fs *c = trans->c;
- u32 restart_count = trans->restart_count;
int ret = 0;
s64 count2;
@@ -1021,8 +1258,8 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
i->count = count2;
if (i->count != count2) {
- bch_err(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
- w->last_pos.inode, i->snapshot, i->count, count2);
+ bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
+ w->last_pos.inode, i->snapshot, i->count, count2);
return -BCH_ERR_internal_fsck_err;
}
@@ -1032,14 +1269,21 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
w->last_pos.inode, i->snapshot,
i->inode.bi_sectors, i->count)) {
i->inode.bi_sectors = i->count;
- ret = fsck_write_inode(trans, &i->inode, i->snapshot);
+ ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
if (ret)
break;
}
}
fsck_err:
bch_err_fn(c, ret);
- return ret ?: trans_was_restarted(trans, restart_count);
+ return ret;
+}
+
+static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
+{
+ u32 restart_count = trans->restart_count;
+ return check_i_sectors_notnested(trans, w) ?:
+ trans_was_restarted(trans, restart_count);
}
struct extent_end {
@@ -1255,10 +1499,6 @@ static int check_overlapping_extents(struct btree_trans *trans,
goto err;
}
- ret = extent_ends_at(c, extent_ends, seen, k);
- if (ret)
- goto err;
-
extent_ends->last_pos = k.k->p;
err:
return ret;
@@ -1312,7 +1552,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
goto err;
}
- i = walk_inode(trans, inode, equiv, k.k->type == KEY_TYPE_whiteout);
+ i = walk_inode(trans, inode, k);
ret = PTR_ERR_OR_ZERO(i);
if (ret)
goto err;
@@ -1322,6 +1562,17 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
goto err;
if (k.k->type != KEY_TYPE_whiteout) {
+ if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
+ ret = reconstruct_reg_inode(trans, k.k->p.snapshot, k.k->p.inode) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ if (ret)
+ goto err;
+
+ inode->last_pos.inode--;
+ ret = -BCH_ERR_transaction_restart_nested;
+ goto err;
+ }
+
if (fsck_err_on(!i, c, extent_in_missing_inode,
"extent in missing inode:\n %s",
(printbuf_reset(&buf),
@@ -1388,6 +1639,12 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
i->seen_this_pos = true;
}
+
+ if (k.k->type != KEY_TYPE_whiteout) {
+ ret = extent_ends_at(c, extent_ends, s, k);
+ if (ret)
+ goto err;
+ }
out:
err:
fsck_err:
@@ -1423,7 +1680,7 @@ int bch2_check_extents(struct bch_fs *c)
check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
check_extent_overbig(trans, &iter, k);
})) ?:
- check_i_sectors(trans, &w));
+ check_i_sectors_notnested(trans, &w));
bch2_disk_reservation_put(c, &res);
extent_ends_exit(&extent_ends);
@@ -1453,10 +1710,9 @@ int bch2_check_indirect_extents(struct bch_fs *c)
return ret;
}
-static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
+static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w)
{
struct bch_fs *c = trans->c;
- u32 restart_count = trans->restart_count;
int ret = 0;
s64 count2;
@@ -1469,8 +1725,8 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
return count2;
if (i->count != count2) {
- bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu",
- i->count, count2);
+ bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
+ w->last_pos.inode, i->snapshot, i->count, count2);
i->count = count2;
if (i->inode.bi_nlink == i->count)
continue;
@@ -1481,96 +1737,123 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
"directory %llu:%u with wrong i_nlink: got %u, should be %llu",
w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
i->inode.bi_nlink = i->count;
- ret = fsck_write_inode(trans, &i->inode, i->snapshot);
+ ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
if (ret)
break;
}
}
fsck_err:
bch_err_fn(c, ret);
- return ret ?: trans_was_restarted(trans, restart_count);
+ return ret;
}
-static int check_dirent_target(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c_dirent d,
- struct bch_inode_unpacked *target,
- u32 target_snapshot)
+static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
+{
+ u32 restart_count = trans->restart_count;
+ return check_subdir_count_notnested(trans, w) ?:
+ trans_was_restarted(trans, restart_count);
+}
+
+static int check_dirent_inode_dirent(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *target,
+ u32 target_snapshot)
{
struct bch_fs *c = trans->c;
- struct bkey_i_dirent *n;
struct printbuf buf = PRINTBUF;
- struct btree_iter bp_iter = { NULL };
int ret = 0;
+ if (inode_points_to_dirent(target, d))
+ return 0;
+
if (!target->bi_dir &&
!target->bi_dir_offset) {
target->bi_dir = d.k->p.inode;
target->bi_dir_offset = d.k->p.offset;
-
- ret = __write_inode(trans, target, target_snapshot);
- if (ret)
- goto err;
+ return __bch2_fsck_write_inode(trans, target, target_snapshot);
}
- if (!inode_points_to_dirent(target, d)) {
- struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
- SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot));
- ret = bkey_err(bp_dirent);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto err;
+ struct btree_iter bp_iter = { NULL };
+ struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
+ SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot));
+ ret = bkey_err(bp_dirent);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ goto err;
- bool backpointer_exists = !ret;
- ret = 0;
+ bool backpointer_exists = !ret;
+ ret = 0;
+
+ if (fsck_err_on(!backpointer_exists,
+ c, inode_wrong_backpointer,
+ "inode %llu:%u has wrong backpointer:\n"
+ "got %llu:%llu\n"
+ "should be %llu:%llu",
+ target->bi_inum, target_snapshot,
+ target->bi_dir,
+ target->bi_dir_offset,
+ d.k->p.inode,
+ d.k->p.offset)) {
+ target->bi_dir = d.k->p.inode;
+ target->bi_dir_offset = d.k->p.offset;
+ ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
+ goto out;
+ }
- bch2_bkey_val_to_text(&buf, c, d.s_c);
- prt_newline(&buf);
- if (backpointer_exists)
- bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
+ bch2_bkey_val_to_text(&buf, c, d.s_c);
+ prt_newline(&buf);
+ if (backpointer_exists)
+ bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
+
+ if (fsck_err_on(backpointer_exists &&
+ (S_ISDIR(target->bi_mode) ||
+ target->bi_subvol),
+ c, inode_dir_multiple_links,
+ "%s %llu:%u with multiple links\n%s",
+ S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
+ target->bi_inum, target_snapshot, buf.buf)) {
+ ret = __remove_dirent(trans, d.k->p);
+ goto out;
+ }
- if (fsck_err_on(S_ISDIR(target->bi_mode) && backpointer_exists,
- c, inode_dir_multiple_links,
- "directory %llu:%u with multiple links\n%s",
- target->bi_inum, target_snapshot, buf.buf)) {
- ret = __remove_dirent(trans, d.k->p);
- goto out;
- }
+ /*
+ * hardlinked file with nlink 0:
+ * We're just adjusting nlink here so check_nlinks() will pick
+ * it up, it ignores inodes with nlink 0
+ */
+ if (fsck_err_on(backpointer_exists && !target->bi_nlink,
+ c, inode_multiple_links_but_nlink_0,
+ "inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
+ target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
+ target->bi_nlink++;
+ target->bi_flags &= ~BCH_INODE_unlinked;
+ ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
+ if (ret)
+ goto err;
+ }
+out:
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &bp_iter);
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
+}
- /*
- * hardlinked file with nlink 0:
- * We're just adjusting nlink here so check_nlinks() will pick
- * it up, it ignores inodes with nlink 0
- */
- if (fsck_err_on(backpointer_exists && !target->bi_nlink,
- c, inode_multiple_links_but_nlink_0,
- "inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
- target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
- target->bi_nlink++;
- target->bi_flags &= ~BCH_INODE_unlinked;
-
- ret = __write_inode(trans, target, target_snapshot);
- if (ret)
- goto err;
- }
+static int check_dirent_target(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *target,
+ u32 target_snapshot)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_i_dirent *n;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
- if (fsck_err_on(!backpointer_exists,
- c, inode_wrong_backpointer,
- "inode %llu:%u has wrong backpointer:\n"
- "got %llu:%llu\n"
- "should be %llu:%llu",
- target->bi_inum, target_snapshot,
- target->bi_dir,
- target->bi_dir_offset,
- d.k->p.inode,
- d.k->p.offset)) {
- target->bi_dir = d.k->p.inode;
- target->bi_dir_offset = d.k->p.offset;
-
- ret = __write_inode(trans, target, target_snapshot);
- if (ret)
- goto err;
- }
- }
+ ret = check_dirent_inode_dirent(trans, iter, d, target, target_snapshot);
+ if (ret)
+ goto err;
if (fsck_err_on(d.v->d_type != inode_d_type(target),
c, dirent_d_type_wrong,
@@ -1586,6 +1869,12 @@ static int check_dirent_target(struct btree_trans *trans,
bkey_reassemble(&n->k_i, d.s_c);
n->v.d_type = inode_d_type(target);
+ if (n->v.d_type == DT_SUBVOL) {
+ n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
+ n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
+ } else {
+ n->v.d_inum = cpu_to_le64(target->bi_inum);
+ }
ret = bch2_trans_update(trans, iter, &n->k_i, 0);
if (ret)
@@ -1593,33 +1882,163 @@ static int check_dirent_target(struct btree_trans *trans,
d = dirent_i_to_s_c(n);
}
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
+}
- if (fsck_err_on(d.v->d_type == DT_SUBVOL &&
- target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol),
- c, dirent_d_parent_subvol_wrong,
- "dirent has wrong d_parent_subvol field: got %u, should be %u",
- le32_to_cpu(d.v->d_parent_subvol),
- target->bi_parent_subvol)) {
- n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
- ret = PTR_ERR_OR_ZERO(n);
+/* find a subvolume that's a descendent of @snapshot: */
+static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) {
+ if (k.k->type != KEY_TYPE_subvolume)
+ continue;
+
+ struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
+ if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) {
+ bch2_trans_iter_exit(trans, &iter);
+ *subvolid = k.k->p.offset;
+ goto found;
+ }
+ }
+ if (!ret)
+ ret = -ENOENT;
+found:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c_dirent d)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter subvol_iter = {};
+ struct bch_inode_unpacked subvol_root;
+ u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
+ u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
+ u32 parent_snapshot;
+ u32 new_parent_subvol = 0;
+ u64 parent_inum;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ return ret;
+
+ if (ret ||
+ (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) {
+ int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
+ if (ret2 && !bch2_err_matches(ret, ENOENT))
+ return ret2;
+ }
+
+ if (ret &&
+ !new_parent_subvol &&
+ (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
+ /*
+ * Couldn't find a subvol for dirent's snapshot - but we lost
+ * subvols, so we need to reconstruct:
+ */
+ ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0);
+ if (ret)
+ return ret;
+
+ parent_snapshot = d.k->p.snapshot;
+ }
+
+ if (fsck_err_on(ret, c, dirent_to_missing_parent_subvol,
+ "dirent parent_subvol points to missing subvolume\n%s",
+ (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
+ fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot),
+ c, dirent_not_visible_in_parent_subvol,
+ "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
+ parent_snapshot,
+ (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
+ if (!new_parent_subvol) {
+ bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
+ return -BCH_ERR_fsck_repair_unimplemented;
+ }
+
+ struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
+ ret = PTR_ERR_OR_ZERO(new_dirent);
if (ret)
goto err;
- bkey_reassemble(&n->k_i, d.s_c);
- n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
+ new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol);
+ }
- ret = bch2_trans_update(trans, iter, &n->k_i, 0);
+ struct bkey_s_c_subvolume s =
+ bch2_bkey_get_iter_typed(trans, &subvol_iter,
+ BTREE_ID_subvolumes, POS(0, target_subvol),
+ 0, subvolume);
+ ret = bkey_err(s.s_c);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ return ret;
+
+ if (ret) {
+ if (fsck_err(c, dirent_to_missing_subvol,
+ "dirent points to missing subvolume\n%s",
+ (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
+ return __remove_dirent(trans, d.k->p);
+ ret = 0;
+ goto out;
+ }
+
+ if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol,
+ c, subvol_fs_path_parent_wrong,
+ "subvol with wrong fs_path_parent, should be be %u\n%s",
+ parent_subvol,
+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
+ struct bkey_i_subvolume *n =
+ bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
+ ret = PTR_ERR_OR_ZERO(n);
if (ret)
goto err;
- d = dirent_i_to_s_c(n);
+ n->v.fs_path_parent = cpu_to_le32(parent_subvol);
+ }
+
+ u64 target_inum = le64_to_cpu(s.v->inode);
+ u32 target_snapshot = le32_to_cpu(s.v->snapshot);
+
+ ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ goto err;
+
+ if (ret) {
+ bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
+ ret = -BCH_ERR_fsck_repair_unimplemented;
+ ret = 0;
+ goto err;
+ }
+
+ if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol,
+ c, inode_bi_parent_wrong,
+ "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
+ target_inum,
+ subvol_root.bi_parent_subvol, parent_subvol)) {
+ subvol_root.bi_parent_subvol = parent_subvol;
+ ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot);
+ if (ret)
+ goto err;
}
+
+ ret = check_dirent_target(trans, iter, d, &subvol_root,
+ target_snapshot);
+ if (ret)
+ goto err;
out:
err:
fsck_err:
- bch2_trans_iter_exit(trans, &bp_iter);
+ bch2_trans_iter_exit(trans, &subvol_iter);
printbuf_exit(&buf);
- bch_err_fn(c, ret);
return ret;
}
@@ -1631,7 +2050,6 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
struct snapshots_seen *s)
{
struct bch_fs *c = trans->c;
- struct bkey_s_c_dirent d;
struct inode_walker_entry *i;
struct printbuf buf = PRINTBUF;
struct bpos equiv;
@@ -1661,7 +2079,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
BUG_ON(!btree_iter_path(trans, iter)->should_be_locked);
- i = walk_inode(trans, dir, equiv, k.k->type == KEY_TYPE_whiteout);
+ i = walk_inode(trans, dir, k);
ret = PTR_ERR_OR_ZERO(i);
if (ret < 0)
goto err;
@@ -1670,6 +2088,17 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
*hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode);
dir->first_this_inode = false;
+ if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
+ ret = reconstruct_inode(trans, k.k->p.snapshot, k.k->p.inode, 0, S_IFDIR) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ if (ret)
+ goto err;
+
+ dir->last_pos.inode--;
+ ret = -BCH_ERR_transaction_restart_nested;
+ goto err;
+ }
+
if (fsck_err_on(!i, c, dirent_in_missing_dir_inode,
"dirent in nonexisting directory:\n%s",
(printbuf_reset(&buf),
@@ -1704,53 +2133,10 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (k.k->type != KEY_TYPE_dirent)
goto out;
- d = bkey_s_c_to_dirent(k);
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
if (d.v->d_type == DT_SUBVOL) {
- struct bch_inode_unpacked subvol_root;
- u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
- u32 target_snapshot;
- u64 target_inum;
-
- ret = subvol_lookup(trans, target_subvol,
- &target_snapshot, &target_inum);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto err;
-
- if (fsck_err_on(ret, c, dirent_to_missing_subvol,
- "dirent points to missing subvolume %u",
- le32_to_cpu(d.v->d_child_subvol))) {
- ret = __remove_dirent(trans, d.k->p);
- goto err;
- }
-
- ret = lookup_inode(trans, target_inum,
- &subvol_root, &target_snapshot);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto err;
-
- if (fsck_err_on(ret, c, subvol_to_missing_root,
- "subvolume %u points to missing subvolume root %llu",
- target_subvol,
- target_inum)) {
- bch_err(c, "repair not implemented yet");
- ret = -EINVAL;
- goto err;
- }
-
- if (fsck_err_on(subvol_root.bi_subvol != target_subvol,
- c, subvol_root_wrong_bi_subvol,
- "subvol root %llu has wrong bi_subvol field: got %u, should be %u",
- target_inum,
- subvol_root.bi_subvol, target_subvol)) {
- subvol_root.bi_subvol = target_subvol;
- ret = __write_inode(trans, &subvol_root, target_snapshot);
- if (ret)
- goto err;
- }
-
- ret = check_dirent_target(trans, iter, d, &subvol_root,
- target_snapshot);
+ ret = check_dirent_to_subvol(trans, iter, d);
if (ret)
goto err;
} else {
@@ -1776,12 +2162,11 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (ret)
goto err;
}
- }
-
- if (d.v->d_type == DT_DIR)
- for_each_visible_inode(c, s, dir, equiv.snapshot, i)
- i->count++;
+ if (d.v->d_type == DT_DIR)
+ for_each_visible_inode(c, s, dir, equiv.snapshot, i)
+ i->count++;
+ }
out:
err:
fsck_err:
@@ -1810,7 +2195,8 @@ int bch2_check_dirents(struct bch_fs *c)
k,
NULL, NULL,
BCH_TRANS_COMMIT_no_enospc,
- check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)));
+ check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
+ check_subdir_count_notnested(trans, &dir));
snapshots_seen_exit(&s);
inode_walker_exit(&dir);
@@ -1829,10 +2215,12 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
int ret;
ret = check_key_has_snapshot(trans, iter, k);
- if (ret)
+ if (ret < 0)
return ret;
+ if (ret)
+ return 0;
- i = walk_inode(trans, inode, k.k->p, k.k->type == KEY_TYPE_whiteout);
+ i = walk_inode(trans, inode, k);
ret = PTR_ERR_OR_ZERO(i);
if (ret)
return ret;
@@ -1890,17 +2278,21 @@ static int check_root_trans(struct btree_trans *trans)
if (mustfix_fsck_err_on(ret, c, root_subvol_missing,
"root subvol missing")) {
- struct bkey_i_subvolume root_subvol;
+ struct bkey_i_subvolume *root_subvol =
+ bch2_trans_kmalloc(trans, sizeof(*root_subvol));
+ ret = PTR_ERR_OR_ZERO(root_subvol);
+ if (ret)
+ goto err;
snapshot = U32_MAX;
inum = BCACHEFS_ROOT_INO;
- bkey_subvolume_init(&root_subvol.k_i);
- root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL;
- root_subvol.v.flags = 0;
- root_subvol.v.snapshot = cpu_to_le32(snapshot);
- root_subvol.v.inode = cpu_to_le64(inum);
- ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol.k_i, 0);
+ bkey_subvolume_init(&root_subvol->k_i);
+ root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL;
+ root_subvol->v.flags = 0;
+ root_subvol->v.snapshot = cpu_to_le32(snapshot);
+ root_subvol->v.inode = cpu_to_le64(inum);
+ ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0);
bch_err_msg(c, ret, "writing root subvol");
if (ret)
goto err;
@@ -1919,7 +2311,7 @@ static int check_root_trans(struct btree_trans *trans)
0, NULL);
root_inode.bi_inum = inum;
- ret = __write_inode(trans, &root_inode, snapshot);
+ ret = __bch2_fsck_write_inode(trans, &root_inode, snapshot);
bch_err_msg(c, ret, "writing root inode");
}
err:
@@ -1936,6 +2328,107 @@ int bch2_check_root(struct bch_fs *c)
return ret;
}
+typedef DARRAY(u32) darray_u32;
+
+static bool darray_u32_has(darray_u32 *d, u32 v)
+{
+ darray_for_each(*d, i)
+ if (*i == v)
+ return true;
+ return false;
+}
+
+/*
+ * We've checked that inode backpointers point to valid dirents; here, it's
+ * sufficient to check that the subvolume root has a dirent:
+ */
+static int subvol_has_dirent(struct btree_trans *trans, struct bkey_s_c_subvolume s)
+{
+ struct bch_inode_unpacked inode;
+ int ret = bch2_inode_find_by_inum_trans(trans,
+ (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
+ &inode);
+ if (ret)
+ return ret;
+
+ return inode.bi_dir != 0;
+}
+
+static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter parent_iter = {};
+ darray_u32 subvol_path = {};
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ if (k.k->type != KEY_TYPE_subvolume)
+ return 0;
+
+ while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) {
+ ret = darray_push(&subvol_path, k.k->p.offset);
+ if (ret)
+ goto err;
+
+ struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
+
+ ret = subvol_has_dirent(trans, s);
+ if (ret < 0)
+ break;
+
+ if (fsck_err_on(!ret,
+ c, subvol_unreachable,
+ "unreachable subvolume %s",
+ (bch2_bkey_val_to_text(&buf, c, s.s_c),
+ buf.buf))) {
+ ret = reattach_subvol(trans, s);
+ break;
+ }
+
+ u32 parent = le32_to_cpu(s.v->fs_path_parent);
+
+ if (darray_u32_has(&subvol_path, parent)) {
+ if (fsck_err(c, subvol_loop, "subvolume loop"))
+ ret = reattach_subvol(trans, s);
+ break;
+ }
+
+ bch2_trans_iter_exit(trans, &parent_iter);
+ bch2_trans_iter_init(trans, &parent_iter,
+ BTREE_ID_subvolumes, POS(0, parent), 0);
+ k = bch2_btree_iter_peek_slot(&parent_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
+ c, subvol_unreachable,
+ "unreachable subvolume %s",
+ (bch2_bkey_val_to_text(&buf, c, s.s_c),
+ buf.buf))) {
+ ret = reattach_subvol(trans, s);
+ break;
+ }
+ }
+fsck_err:
+err:
+ printbuf_exit(&buf);
+ darray_exit(&subvol_path);
+ bch2_trans_iter_exit(trans, &parent_iter);
+ return ret;
+}
+
+int bch2_check_subvolume_structure(struct bch_fs *c)
+{
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_subvol_path(trans, &iter, k)));
+ bch_err_fn(c, ret);
+ return ret;
+}
+
struct pathbuf_entry {
u64 inum;
u32 snapshot;
@@ -1952,89 +2445,71 @@ static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
return false;
}
-static int path_down(struct bch_fs *c, pathbuf *p,
- u64 inum, u32 snapshot)
-{
- int ret = darray_push(p, ((struct pathbuf_entry) {
- .inum = inum,
- .snapshot = snapshot,
- }));
-
- if (ret)
- bch_err(c, "fsck: error allocating memory for pathbuf, size %zu",
- p->size);
- return ret;
-}
-
/*
- * Check that a given inode is reachable from the root:
+ * Check that a given inode is reachable from its subvolume root - we already
+ * verified subvolume connectivity:
*
* XXX: we should also be verifying that inodes are in the right subvolumes
*/
-static int check_path(struct btree_trans *trans,
- pathbuf *p,
- struct bch_inode_unpacked *inode,
- u32 snapshot)
+static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k)
{
struct bch_fs *c = trans->c;
+ struct btree_iter inode_iter = {};
+ struct bch_inode_unpacked inode;
+ struct printbuf buf = PRINTBUF;
+ u32 snapshot = bch2_snapshot_equiv(c, inode_k.k->p.snapshot);
int ret = 0;
- snapshot = bch2_snapshot_equiv(c, snapshot);
p->nr = 0;
- while (!(inode->bi_inum == BCACHEFS_ROOT_INO &&
- inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) {
+ BUG_ON(bch2_inode_unpack(inode_k, &inode));
+
+ while (!inode.bi_subvol) {
struct btree_iter dirent_iter;
struct bkey_s_c_dirent d;
u32 parent_snapshot = snapshot;
- if (inode->bi_subvol) {
- u64 inum;
-
- ret = subvol_lookup(trans, inode->bi_parent_subvol,
- &parent_snapshot, &inum);
- if (ret)
- break;
- }
-
- d = dirent_get_by_pos(trans, &dirent_iter,
- SPOS(inode->bi_dir, inode->bi_dir_offset,
- parent_snapshot));
+ d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot);
ret = bkey_err(d.s_c);
if (ret && !bch2_err_matches(ret, ENOENT))
break;
- if (!ret && !dirent_points_to_inode(d, inode)) {
+ if (!ret && !dirent_points_to_inode(d, &inode)) {
bch2_trans_iter_exit(trans, &dirent_iter);
ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
}
if (bch2_err_matches(ret, ENOENT)) {
- if (fsck_err(c, inode_unreachable,
- "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu",
- inode->bi_inum, snapshot,
- bch2_d_type_str(inode_d_type(inode)),
- inode->bi_nlink,
- inode->bi_dir,
- inode->bi_dir_offset))
- ret = reattach_inode(trans, inode, snapshot);
- break;
+ ret = 0;
+ if (fsck_err(c, inode_unreachable,
+ "unreachable inode\n%s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, inode_k),
+ buf.buf)))
+ ret = reattach_inode(trans, &inode, snapshot);
+ goto out;
}
bch2_trans_iter_exit(trans, &dirent_iter);
- if (!S_ISDIR(inode->bi_mode))
+ if (!S_ISDIR(inode.bi_mode))
break;
- ret = path_down(c, p, inode->bi_inum, snapshot);
- if (ret) {
- bch_err(c, "memory allocation failure");
+ ret = darray_push(p, ((struct pathbuf_entry) {
+ .inum = inode.bi_inum,
+ .snapshot = snapshot,
+ }));
+ if (ret)
return ret;
- }
snapshot = parent_snapshot;
- ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot);
+ bch2_trans_iter_exit(trans, &inode_iter);
+ inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
+ SPOS(0, inode.bi_dir, snapshot), 0);
+ ret = bkey_err(inode_k) ?:
+ !bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode
+ : bch2_inode_unpack(inode_k, &inode);
if (ret) {
/* Should have been caught in dirents pass */
if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -2042,30 +2517,32 @@ static int check_path(struct btree_trans *trans,
break;
}
- if (path_is_dup(p, inode->bi_inum, snapshot)) {
+ snapshot = inode_k.k->p.snapshot;
+
+ if (path_is_dup(p, inode.bi_inum, snapshot)) {
/* XXX print path */
bch_err(c, "directory structure loop");
darray_for_each(*p, i)
pr_err("%llu:%u", i->inum, i->snapshot);
- pr_err("%llu:%u", inode->bi_inum, snapshot);
-
- if (!fsck_err(c, dir_loop, "directory structure loop"))
- return 0;
+ pr_err("%llu:%u", inode.bi_inum, snapshot);
- ret = remove_backpointer(trans, inode);
- if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ if (fsck_err(c, dir_loop, "directory structure loop")) {
+ ret = remove_backpointer(trans, &inode);
bch_err_msg(c, ret, "removing dirent");
- if (ret)
- break;
+ if (ret)
+ break;
- ret = reattach_inode(trans, inode, snapshot);
- if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
- bch_err_msg(c, ret, "reattaching inode %llu", inode->bi_inum);
+ ret = reattach_inode(trans, &inode, snapshot);
+ bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
+ }
break;
}
}
+out:
fsck_err:
+ bch2_trans_iter_exit(trans, &inode_iter);
+ printbuf_exit(&buf);
bch_err_fn(c, ret);
return ret;
}
@@ -2077,7 +2554,6 @@ fsck_err:
*/
int bch2_check_directory_structure(struct bch_fs *c)
{
- struct bch_inode_unpacked u;
pathbuf path = { 0, };
int ret;
@@ -2090,12 +2566,10 @@ int bch2_check_directory_structure(struct bch_fs *c)
if (!bkey_is_inode(k.k))
continue;
- BUG_ON(bch2_inode_unpack(k, &u));
-
- if (u.bi_flags & BCH_INODE_unlinked)
+ if (bch2_inode_flags(k) & BCH_INODE_unlinked)
continue;
- check_path(trans, &path, &u, iter.pos.snapshot);
+ check_path(trans, &path, k);
})));
darray_exit(&path);
@@ -2291,7 +2765,7 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite
u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
bch2_inode_nlink_get(&u), link->count)) {
bch2_inode_nlink_set(&u, link->count);
- ret = __write_inode(trans, &u, k.k->p.snapshot);
+ ret = __bch2_fsck_write_inode(trans, &u, k.k->p.snapshot);
}
fsck_err:
return ret;
diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h
index da991e8cf2..a4ef942717 100644
--- a/fs/bcachefs/fsck.h
+++ b/fs/bcachefs/fsck.h
@@ -8,6 +8,7 @@ int bch2_check_indirect_extents(struct bch_fs *);
int bch2_check_dirents(struct bch_fs *);
int bch2_check_xattrs(struct bch_fs *);
int bch2_check_root(struct bch_fs *);
+int bch2_check_subvolume_structure(struct bch_fs *);
int bch2_check_directory_structure(struct bch_fs *);
int bch2_check_nlinks(struct bch_fs *);
int bch2_fix_reflink_p(struct bch_fs *);
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 086f0090b0..0f95d7fb5e 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -324,7 +324,7 @@ int bch2_inode_unpack(struct bkey_s_c k,
return bch2_inode_unpack_slowpath(k, unpacked);
}
-static int bch2_inode_peek_nowarn(struct btree_trans *trans,
+int bch2_inode_peek_nowarn(struct btree_trans *trans,
struct btree_iter *iter,
struct bch_inode_unpacked *inode,
subvol_inum inum, unsigned flags)
@@ -384,6 +384,34 @@ int bch2_inode_write_flags(struct btree_trans *trans,
return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags);
}
+int __bch2_fsck_write_inode(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode,
+ u32 snapshot)
+{
+ struct bkey_inode_buf *inode_p =
+ bch2_trans_kmalloc(trans, sizeof(*inode_p));
+
+ if (IS_ERR(inode_p))
+ return PTR_ERR(inode_p);
+
+ bch2_inode_pack(inode_p, inode);
+ inode_p->inode.k.p.snapshot = snapshot;
+
+ return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
+ &inode_p->inode.k_i,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+}
+
+int bch2_fsck_write_inode(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode,
+ u32 snapshot)
+{
+ int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ __bch2_fsck_write_inode(trans, inode, snapshot));
+ bch_err_fn(trans->c, ret);
+ return ret;
+}
+
struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k)
{
struct bch_inode_unpacked u;
@@ -524,8 +552,8 @@ static void __bch2_inode_unpacked_to_text(struct printbuf *out,
prt_printf(out, "bi_sectors=%llu", inode->bi_sectors);
prt_newline(out);
- prt_newline(out);
prt_printf(out, "bi_version=%llu", inode->bi_version);
+ prt_newline(out);
#define x(_name, _bits) \
prt_printf(out, #_name "=%llu", (u64) inode->_name); \
@@ -578,7 +606,7 @@ int bch2_trigger_inode(struct btree_trans *trans,
struct bkey_s new,
unsigned flags)
{
- s64 nr = bkey_is_inode(new.k) - bkey_is_inode(old.k);
+ s64 nr = (s64) bkey_is_inode(new.k) - (s64) bkey_is_inode(old.k);
if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
if (nr) {
@@ -592,7 +620,8 @@ int bch2_trigger_inode(struct btree_trans *trans,
bool old_deleted = bkey_is_deleted_inode(old);
bool new_deleted = bkey_is_deleted_inode(new.s_c);
if (old_deleted != new_deleted) {
- int ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new.k->p, new_deleted);
+ int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes,
+ new.k->p, new_deleted);
if (ret)
return ret;
}
@@ -1088,8 +1117,9 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
goto out;
if (S_ISDIR(inode.bi_mode)) {
- ret = bch2_empty_dir_snapshot(trans, pos.offset, pos.snapshot);
- if (fsck_err_on(ret == -ENOTEMPTY, c, deleted_inode_is_dir,
+ ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot);
+ if (fsck_err_on(bch2_err_matches(ret, ENOTEMPTY),
+ c, deleted_inode_is_dir,
"non empty directory %llu:%u in deleted_inodes btree",
pos.offset, pos.snapshot))
goto delete;
@@ -1141,7 +1171,7 @@ fsck_err:
bch2_trans_iter_exit(trans, &inode_iter);
return ret;
delete:
- ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false);
+ ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false);
goto out;
}
@@ -1151,6 +1181,15 @@ int bch2_delete_dead_inodes(struct bch_fs *c)
bool need_another_pass;
int ret;
again:
+ /*
+ * if we ran check_inodes() unlinked inodes will have already been
+ * cleaned up but the write buffer will be out of sync; therefore we
+ * alway need a write buffer flush
+ */
+ ret = bch2_btree_write_buffer_flush_sync(trans);
+ if (ret)
+ goto err;
+
need_another_pass = false;
/*
@@ -1183,12 +1222,8 @@ again:
ret;
}));
- if (!ret && need_another_pass) {
- ret = bch2_btree_write_buffer_flush_sync(trans);
- if (ret)
- goto err;
+ if (!ret && need_another_pass)
goto again;
- }
err:
bch2_trans_put(trans);
return ret;
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index b63f312581..0562980505 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -95,6 +95,8 @@ struct bkey_i *bch2_inode_to_v3(struct btree_trans *, struct bkey_i *);
void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *);
+int bch2_inode_peek_nowarn(struct btree_trans *, struct btree_iter *,
+ struct bch_inode_unpacked *, subvol_inum, unsigned);
int bch2_inode_peek(struct btree_trans *, struct btree_iter *,
struct bch_inode_unpacked *, subvol_inum, unsigned);
@@ -108,6 +110,9 @@ static inline int bch2_inode_write(struct btree_trans *trans,
return bch2_inode_write_flags(trans, iter, inode, 0);
}
+int __bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *, u32);
+int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *, u32);
+
void bch2_inode_init_early(struct bch_fs *,
struct bch_inode_unpacked *);
void bch2_inode_init_late(struct bch_inode_unpacked *, u64,
@@ -172,6 +177,20 @@ static inline u8 inode_d_type(struct bch_inode_unpacked *inode)
return inode->bi_subvol ? DT_SUBVOL : mode_to_type(inode->bi_mode);
}
+static inline u32 bch2_inode_flags(struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case KEY_TYPE_inode:
+ return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags);
+ case KEY_TYPE_inode_v2:
+ return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags);
+ case KEY_TYPE_inode_v3:
+ return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags);
+ default:
+ return 0;
+ }
+}
+
/* i_nlink: */
static inline unsigned nlink_bias(umode_t mode)
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index 1baf78594c..82f9170dab 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -264,6 +264,7 @@ static int __bch2_resume_logged_op_truncate(struct btree_trans *trans,
ret = 0;
err:
bch2_logged_op_finish(trans, op_k);
+ bch_err_fn(c, ret);
return ret;
}
@@ -476,6 +477,7 @@ case LOGGED_OP_FINSERT_finish:
break;
}
err:
+ bch_err_fn(c, ret);
bch2_logged_op_finish(trans, op_k);
bch2_trans_iter_exit(trans, &iter);
return ret;
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index 3c574d8873..8a556e6d1a 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -174,7 +174,7 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
return ERR_PTR(-BCH_ERR_nopromote_no_writes);
- op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_KERNEL);
+ op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL);
if (!op) {
ret = -BCH_ERR_nopromote_enomem;
goto err;
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index 2c098ac017..40d7df7607 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -88,7 +88,7 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
bch2_congested_acct(ca, io_latency, now, rw);
- __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
+ __bch2_time_stats_update(&ca->io_latency[rw].stats, submit_time, now);
}
#endif
@@ -199,9 +199,6 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
u64 new_i_size,
s64 i_sectors_delta)
{
- struct btree_iter iter;
- struct bkey_i *k;
- struct bkey_i_inode_v3 *inode;
/*
* Crazy performance optimization:
* Every extent update needs to also update the inode: the inode trigger
@@ -214,25 +211,36 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
* lost, but that's fine.
*/
unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL;
- int ret;
- k = bch2_bkey_get_mut_noupdate(trans, &iter, BTREE_ID_inodes,
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
SPOS(0,
extent_iter->pos.inode,
extent_iter->snapshot),
BTREE_ITER_CACHED);
- ret = PTR_ERR_OR_ZERO(k);
+ int ret = bkey_err(k);
if (unlikely(ret))
return ret;
- if (unlikely(k->k.type != KEY_TYPE_inode_v3)) {
- k = bch2_inode_to_v3(trans, k);
- ret = PTR_ERR_OR_ZERO(k);
+ /*
+ * varint_decode_fast(), in the inode .invalid method, reads up to 7
+ * bytes past the end of the buffer:
+ */
+ struct bkey_i *k_mut = bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k) + 8);
+ ret = PTR_ERR_OR_ZERO(k_mut);
+ if (unlikely(ret))
+ goto err;
+
+ bkey_reassemble(k_mut, k);
+
+ if (unlikely(k_mut->k.type != KEY_TYPE_inode_v3)) {
+ k_mut = bch2_inode_to_v3(trans, k_mut);
+ ret = PTR_ERR_OR_ZERO(k_mut);
if (unlikely(ret))
goto err;
}
- inode = bkey_i_to_inode_v3(k);
+ struct bkey_i_inode_v3 *inode = bkey_i_to_inode_v3(k_mut);
if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_i_size_dirty) &&
new_i_size > le64_to_cpu(inode->v.bi_size)) {
@@ -530,7 +538,8 @@ static void __bch2_write_index(struct bch_write_op *op)
bch_err_inum_offset_ratelimited(c,
insert->k.p.inode, insert->k.p.offset << 9,
- "write error while doing btree update: %s",
+ "%s write error while doing btree update: %s",
+ op->flags & BCH_WRITE_MOVE ? "move" : "user",
bch2_err_str(ret));
}
@@ -1067,7 +1076,8 @@ do_write:
*_dst = dst;
return more;
csum_err:
- bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)");
+ bch_err(c, "%s writ error: error verifying existing checksum while rewriting existing data (memory corruption?)",
+ op->flags & BCH_WRITE_MOVE ? "move" : "user");
ret = -EIO;
err:
if (to_wbio(dst)->bounce)
@@ -1169,7 +1179,8 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
bch_err_inum_offset_ratelimited(c,
insert->k.p.inode, insert->k.p.offset << 9,
- "write error while doing btree update: %s",
+ "%s write error while doing btree update: %s",
+ op->flags & BCH_WRITE_MOVE ? "move" : "user",
bch2_err_str(ret));
}
@@ -1449,7 +1460,9 @@ err:
bch_err_inum_offset_ratelimited(c,
op->pos.inode,
op->pos.offset << 9,
- "%s(): error: %s", __func__, bch2_err_str(ret));
+ "%s(): %s error: %s", __func__,
+ op->flags & BCH_WRITE_MOVE ? "move" : "user",
+ bch2_err_str(ret));
op->error = ret;
break;
}
@@ -1500,6 +1513,8 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
unsigned sectors;
int ret;
+ memset(&op->failed, 0, sizeof(op->failed));
+
op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
op->flags |= BCH_WRITE_DONE;
@@ -1573,7 +1588,8 @@ CLOSURE_CALLBACK(bch2_write)
bch_err_inum_offset_ratelimited(c,
op->pos.inode,
op->pos.offset << 9,
- "misaligned write");
+ "%s write error: misaligned write",
+ op->flags & BCH_WRITE_MOVE ? "move" : "user");
op->error = -EIO;
goto err;
}
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index bc890776eb..a8b08e76d0 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -27,33 +27,71 @@ static const char * const bch2_journal_errors[] = {
NULL
};
+static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
+{
+ return seq > j->seq_ondisk;
+}
+
+static bool __journal_entry_is_open(union journal_res_state state)
+{
+ return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
+}
+
+static inline unsigned nr_unwritten_journal_entries(struct journal *j)
+{
+ return atomic64_read(&j->seq) - j->seq_ondisk;
+}
+
+static bool journal_entry_is_open(struct journal *j)
+{
+ return __journal_entry_is_open(j->reservations);
+}
+
static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq)
{
union journal_res_state s = READ_ONCE(j->reservations);
unsigned i = seq & JOURNAL_BUF_MASK;
struct journal_buf *buf = j->buf + i;
- prt_printf(out, "seq:");
+ prt_str(out, "seq:");
prt_tab(out);
prt_printf(out, "%llu", seq);
prt_newline(out);
printbuf_indent_add(out, 2);
- prt_printf(out, "refcount:");
+ prt_str(out, "refcount:");
prt_tab(out);
prt_printf(out, "%u", journal_state_count(s, i));
prt_newline(out);
- prt_printf(out, "size:");
+ prt_str(out, "size:");
prt_tab(out);
prt_human_readable_u64(out, vstruct_bytes(buf->data));
prt_newline(out);
- prt_printf(out, "expires");
+ prt_str(out, "expires:");
prt_tab(out);
prt_printf(out, "%li jiffies", buf->expires - jiffies);
prt_newline(out);
+ prt_str(out, "flags:");
+ prt_tab(out);
+ if (buf->noflush)
+ prt_str(out, "noflush ");
+ if (buf->must_flush)
+ prt_str(out, "must_flush ");
+ if (buf->separate_flush)
+ prt_str(out, "separate_flush ");
+ if (buf->need_flush_to_write_buffer)
+ prt_str(out, "need_flush_to_write_buffer ");
+ if (buf->write_started)
+ prt_str(out, "write_started ");
+ if (buf->write_allocated)
+ prt_str(out, "write allocated ");
+ if (buf->write_done)
+ prt_str(out, "write done");
+ prt_newline(out);
+
printbuf_indent_sub(out, 2);
}
@@ -66,26 +104,7 @@ static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j)
seq <= journal_cur_seq(j);
seq++)
bch2_journal_buf_to_text(out, j, seq);
-}
-
-static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
-{
- return seq > j->seq_ondisk;
-}
-
-static bool __journal_entry_is_open(union journal_res_state state)
-{
- return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
-}
-
-static inline unsigned nr_unwritten_journal_entries(struct journal *j)
-{
- return atomic64_read(&j->seq) - j->seq_ondisk;
-}
-
-static bool journal_entry_is_open(struct journal *j)
-{
- return __journal_entry_is_open(j->reservations);
+ prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? "open" : "closed");
}
static inline struct journal_buf *
@@ -174,21 +193,40 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags)
return stuck;
}
+void bch2_journal_do_writes(struct journal *j)
+{
+ for (u64 seq = journal_last_unwritten_seq(j);
+ seq <= journal_cur_seq(j);
+ seq++) {
+ unsigned idx = seq & JOURNAL_BUF_MASK;
+ struct journal_buf *w = j->buf + idx;
+
+ if (w->write_started && !w->write_allocated)
+ break;
+ if (w->write_started)
+ continue;
+
+ if (!journal_state_count(j->reservations, idx)) {
+ w->write_started = true;
+ closure_call(&w->io, bch2_journal_write, j->wq, NULL);
+ }
+
+ break;
+ }
+}
+
/*
* Final processing when the last reference of a journal buffer has been
* dropped. Drop the pin list reference acquired at journal entry open and write
* the buffer, if requested.
*/
-void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write)
+void bch2_journal_buf_put_final(struct journal *j, u64 seq)
{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
lockdep_assert_held(&j->lock);
if (__bch2_journal_pin_put(j, seq))
bch2_journal_reclaim_fast(j);
- if (write)
- closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
+ bch2_journal_do_writes(j);
}
/*
@@ -380,11 +418,14 @@ static int journal_entry_open(struct journal *j)
BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf);
bkey_extent_init(&buf->key);
- buf->noflush = false;
- buf->must_flush = false;
- buf->separate_flush = false;
- buf->flush_time = 0;
+ buf->noflush = false;
+ buf->must_flush = false;
+ buf->separate_flush = false;
+ buf->flush_time = 0;
buf->need_flush_to_write_buffer = true;
+ buf->write_started = false;
+ buf->write_allocated = false;
+ buf->write_done = false;
memset(buf->data, 0, sizeof(*buf->data));
buf->data->seq = cpu_to_le64(journal_cur_seq(j));
@@ -418,9 +459,10 @@ static int journal_entry_open(struct journal *j)
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
- mod_delayed_work(c->io_complete_wq,
- &j->write_work,
- msecs_to_jiffies(c->opts.journal_flush_delay));
+ if (nr_unwritten_journal_entries(j) == 1)
+ mod_delayed_work(j->wq,
+ &j->write_work,
+ msecs_to_jiffies(c->opts.journal_flush_delay));
journal_wake(j);
if (j->early_journal_entries.nr)
@@ -445,20 +487,16 @@ static void journal_quiesce(struct journal *j)
static void journal_write_work(struct work_struct *work)
{
struct journal *j = container_of(work, struct journal, write_work.work);
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- long delta;
spin_lock(&j->lock);
- if (!__journal_entry_is_open(j->reservations))
- goto unlock;
-
- delta = journal_cur_buf(j)->expires - jiffies;
+ if (__journal_entry_is_open(j->reservations)) {
+ long delta = journal_cur_buf(j)->expires - jiffies;
- if (delta > 0)
- mod_delayed_work(c->io_complete_wq, &j->write_work, delta);
- else
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
-unlock:
+ if (delta > 0)
+ mod_delayed_work(j->wq, &j->write_work, delta);
+ else
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
+ }
spin_unlock(&j->lock);
}
@@ -476,30 +514,29 @@ retry:
if (bch2_journal_error(j))
return -BCH_ERR_erofs_journal_err;
- spin_lock(&j->lock);
+ if (j->blocked)
+ return -BCH_ERR_journal_res_get_blocked;
- /* check once more in case somebody else shut things down... */
- if (bch2_journal_error(j)) {
- spin_unlock(&j->lock);
- return -BCH_ERR_erofs_journal_err;
+ if ((flags & BCH_WATERMARK_MASK) < j->watermark) {
+ ret = JOURNAL_ERR_journal_full;
+ can_discard = j->can_discard;
+ goto out;
+ }
+
+ if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) {
+ ret = JOURNAL_ERR_max_in_flight;
+ goto out;
}
+ spin_lock(&j->lock);
+
/*
* Recheck after taking the lock, so we don't race with another thread
* that just did journal_entry_open() and call bch2_journal_entry_close()
* unnecessarily
*/
if (journal_res_get_fast(j, res, flags)) {
- spin_unlock(&j->lock);
- return 0;
- }
-
- if ((flags & BCH_WATERMARK_MASK) < j->watermark) {
- /*
- * Don't want to close current journal entry, just need to
- * invoke reclaim:
- */
- ret = JOURNAL_ERR_journal_full;
+ ret = 0;
goto unlock;
}
@@ -515,30 +552,30 @@ retry:
j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false);
- ret = journal_entry_open(j);
-
- if (ret == JOURNAL_ERR_max_in_flight) {
- track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
- &j->max_in_flight_start, true);
- if (trace_journal_entry_full_enabled()) {
- struct printbuf buf = PRINTBUF;
- buf.atomic++;
-
- bch2_journal_bufs_to_text(&buf, j);
- trace_journal_entry_full(c, buf.buf);
- printbuf_exit(&buf);
- }
- count_event(c, journal_entry_full);
- }
+ ret = journal_entry_open(j) ?: JOURNAL_ERR_retry;
unlock:
can_discard = j->can_discard;
spin_unlock(&j->lock);
-
- if (!ret)
+out:
+ if (ret == JOURNAL_ERR_retry)
goto retry;
+ if (!ret)
+ return 0;
+
if (journal_error_check_stuck(j, ret, flags))
ret = -BCH_ERR_journal_res_get_blocked;
+ if (ret == JOURNAL_ERR_max_in_flight &&
+ track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true)) {
+
+ struct printbuf buf = PRINTBUF;
+ prt_printf(&buf, "seq %llu\n", journal_cur_seq(j));
+ bch2_journal_bufs_to_text(&buf, j);
+ trace_journal_entry_full(c, buf.buf);
+ printbuf_exit(&buf);
+ count_event(c, journal_entry_full);
+ }
+
/*
* Journal is full - can't rely on reclaim from work item due to
* freezing:
@@ -669,12 +706,18 @@ recheck_need_open:
spin_unlock(&j->lock);
+ /*
+ * We're called from bch2_journal_flush_seq() -> wait_event();
+ * but this might block. We won't usually block, so we won't
+ * livelock:
+ */
+ sched_annotate_sleep();
ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
if (ret)
return ret;
seq = res.seq;
- buf = j->buf + (seq & JOURNAL_BUF_MASK);
+ buf = journal_seq_to_buf(j, seq);
buf->must_flush = true;
if (!buf->flush_time) {
@@ -692,8 +735,8 @@ recheck_need_open:
}
/*
- * if write was kicked off without a flush, flush the next sequence
- * number instead
+ * if write was kicked off without a flush, or if we promised it
+ * wouldn't be a flush, flush the next sequence number instead
*/
buf = journal_seq_to_buf(j, seq);
if (buf->noflush) {
@@ -771,8 +814,8 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 seq)
unwritten_seq++) {
struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq);
- /* journal write is already in flight, and was a flush write: */
- if (unwritten_seq == journal_last_unwritten_seq(j) && !buf->noflush)
+ /* journal flush already in flight, or flush requseted */
+ if (buf->must_flush)
goto out;
buf->noflush = true;
@@ -833,6 +876,8 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou
{
struct journal_buf *ret = NULL;
+ /* We're inside wait_event(), but using mutex_lock(: */
+ sched_annotate_sleep();
mutex_lock(&j->buf_lock);
spin_lock(&j->lock);
max_seq = min(max_seq, journal_cur_seq(j));
@@ -1157,13 +1202,12 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
struct journal_replay *i, **_i;
struct genradix_iter iter;
bool had_entries = false;
- unsigned ptr;
u64 last_seq = cur_seq, nr, seq;
genradix_for_each_reverse(&c->journal_entries, iter, _i) {
i = *_i;
- if (!i || i->ignore)
+ if (journal_replay_ignore(i))
continue;
last_seq = le64_to_cpu(i->j.last_seq);
@@ -1196,7 +1240,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
genradix_for_each(&c->journal_entries, iter, _i) {
i = *_i;
- if (!i || i->ignore)
+ if (journal_replay_ignore(i))
continue;
seq = le64_to_cpu(i->j.seq);
@@ -1211,8 +1255,8 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
p = journal_seq_pin(j, seq);
p->devs.nr = 0;
- for (ptr = 0; ptr < i->nr_ptrs; ptr++)
- bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev);
+ darray_for_each(i->ptrs, ptr)
+ bch2_dev_list_add_dev(&p->devs, ptr->dev);
had_entries = true;
}
@@ -1240,13 +1284,17 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
void bch2_dev_journal_exit(struct bch_dev *ca)
{
- kfree(ca->journal.bio);
- kfree(ca->journal.buckets);
- kfree(ca->journal.bucket_seq);
+ struct journal_device *ja = &ca->journal;
+
+ for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
+ kfree(ja->bio[i]);
+ ja->bio[i] = NULL;
+ }
- ca->journal.bio = NULL;
- ca->journal.buckets = NULL;
- ca->journal.bucket_seq = NULL;
+ kfree(ja->buckets);
+ kfree(ja->bucket_seq);
+ ja->buckets = NULL;
+ ja->bucket_seq = NULL;
}
int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
@@ -1256,14 +1304,13 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
bch2_sb_field_get(sb, journal);
struct bch_sb_field_journal_v2 *journal_buckets_v2 =
bch2_sb_field_get(sb, journal_v2);
- unsigned i, nr_bvecs;
ja->nr = 0;
if (journal_buckets_v2) {
unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
- for (i = 0; i < nr; i++)
+ for (unsigned i = 0; i < nr; i++)
ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr);
} else if (journal_buckets) {
ja->nr = bch2_nr_journal_buckets(journal_buckets);
@@ -1273,13 +1320,18 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
if (!ja->bucket_seq)
return -BCH_ERR_ENOMEM_dev_journal_init;
- nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
+ unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
- ca->journal.bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
- if (!ca->journal.bio)
- return -BCH_ERR_ENOMEM_dev_journal_init;
+ for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
+ ja->bio[i] = kmalloc(struct_size(ja->bio[i], bio.bi_inline_vecs,
+ nr_bvecs), GFP_KERNEL);
+ if (!ja->bio[i])
+ return -BCH_ERR_ENOMEM_dev_journal_init;
- bio_init(ca->journal.bio, NULL, ca->journal.bio->bi_inline_vecs, nr_bvecs, 0);
+ ja->bio[i]->ca = ca;
+ ja->bio[i]->buf_idx = i;
+ bio_init(&ja->bio[i]->bio, NULL, ja->bio[i]->bio.bi_inline_vecs, nr_bvecs, 0);
+ }
ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
if (!ja->buckets)
@@ -1287,14 +1339,14 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
if (journal_buckets_v2) {
unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
- unsigned j, dst = 0;
+ unsigned dst = 0;
- for (i = 0; i < nr; i++)
- for (j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++)
+ for (unsigned i = 0; i < nr; i++)
+ for (unsigned j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++)
ja->buckets[dst++] =
le64_to_cpu(journal_buckets_v2->d[i].start) + j;
} else if (journal_buckets) {
- for (i = 0; i < ja->nr; i++)
+ for (unsigned i = 0; i < ja->nr; i++)
ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
}
@@ -1303,19 +1355,19 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
void bch2_fs_journal_exit(struct journal *j)
{
- unsigned i;
+ if (j->wq)
+ destroy_workqueue(j->wq);
darray_exit(&j->early_journal_entries);
- for (i = 0; i < ARRAY_SIZE(j->buf); i++)
- kvpfree(j->buf[i].data, j->buf[i].buf_size);
+ for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++)
+ kvfree(j->buf[i].data);
free_fifo(&j->pin);
}
int bch2_fs_journal_init(struct journal *j)
{
static struct lock_class_key res_key;
- unsigned i;
mutex_init(&j->buf_lock);
spin_lock_init(&j->lock);
@@ -1336,14 +1388,20 @@ int bch2_fs_journal_init(struct journal *j)
if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)))
return -BCH_ERR_ENOMEM_journal_pin_fifo;
- for (i = 0; i < ARRAY_SIZE(j->buf); i++) {
+ for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) {
j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN;
- j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL);
+ j->buf[i].data = kvmalloc(j->buf[i].buf_size, GFP_KERNEL);
if (!j->buf[i].data)
return -BCH_ERR_ENOMEM_journal_buf;
+ j->buf[i].idx = i;
}
j->pin.front = j->pin.back = 1;
+
+ j->wq = alloc_workqueue("bcachefs_journal",
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512);
+ if (!j->wq)
+ return -BCH_ERR_ENOMEM_fs_other_alloc;
return 0;
}
@@ -1381,6 +1439,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked);
prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now)
? jiffies_to_msecs(j->next_reclaim - jiffies) : 0);
+ prt_printf(out, "blocked:\t\t%u\n", j->blocked);
prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]);
prt_printf(out, "current entry:\t\t");
@@ -1455,7 +1514,6 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64
{
struct journal_entry_pin_list *pin_list;
struct journal_entry_pin *pin;
- unsigned i;
spin_lock(&j->lock);
*seq = max(*seq, j->pin.front);
@@ -1473,7 +1531,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64
prt_newline(out);
printbuf_indent_add(out, 2);
- for (i = 0; i < ARRAY_SIZE(pin_list->list); i++)
+ for (unsigned i = 0; i < ARRAY_SIZE(pin_list->list); i++)
list_for_each_entry(pin, &pin_list->list[i], list) {
prt_printf(out, "\t%px %ps", pin, pin->flush);
prt_newline(out);
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 4544ce24bb..7c7528f839 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -264,7 +264,8 @@ static inline union journal_res_state journal_state_buf_put(struct journal *j, u
}
bool bch2_journal_entry_close(struct journal *);
-void bch2_journal_buf_put_final(struct journal *, u64, bool);
+void bch2_journal_do_writes(struct journal *);
+void bch2_journal_buf_put_final(struct journal *, u64);
static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
{
@@ -272,7 +273,7 @@ static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 s
s = journal_state_buf_put(j, idx);
if (!journal_state_count(s, idx))
- bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx);
+ bch2_journal_buf_put_final(j, seq);
}
static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
@@ -282,7 +283,7 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq
s = journal_state_buf_put(j, idx);
if (!journal_state_count(s, idx)) {
spin_lock(&j->lock);
- bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx);
+ bch2_journal_buf_put_final(j, seq);
spin_unlock(&j->lock);
}
}
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 47805193f1..eb1f9d6f5a 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -17,6 +17,37 @@
#include "sb-clean.h"
#include "trace.h"
+void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
+ struct journal_replay *j)
+{
+ darray_for_each(j->ptrs, i) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev);
+ u64 offset;
+
+ div64_u64_rem(i->sector, ca->mi.bucket_size, &offset);
+
+ if (i != j->ptrs.data)
+ prt_printf(out, " ");
+ prt_printf(out, "%u:%u:%u (sector %llu)",
+ i->dev, i->bucket, i->bucket_offset, i->sector);
+ }
+}
+
+static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c,
+ struct journal_replay *j)
+{
+ prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq));
+
+ bch2_journal_ptrs_to_text(out, c, j);
+
+ for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) {
+ struct jset_entry_datetime *datetime =
+ container_of(entry, struct jset_entry_datetime, entry);
+ bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
+ break;
+ }
+}
+
static struct nonce journal_nonce(const struct jset *jset)
{
return (struct nonce) {{
@@ -52,13 +83,15 @@ static void __journal_replay_free(struct bch_fs *c,
BUG_ON(*p != i);
*p = NULL;
- kvpfree(i, offsetof(struct journal_replay, j) +
- vstruct_bytes(&i->j));
+ kvfree(i);
}
-static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
+static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted)
{
- i->ignore = true;
+ if (blacklisted)
+ i->ignore_blacklisted = true;
+ else
+ i->ignore_not_dirty = true;
if (!c->opts.read_entire_journal)
__journal_replay_free(c, i);
@@ -84,9 +117,9 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
{
struct genradix_iter iter;
struct journal_replay **_i, *i, *dup;
- struct journal_ptr *ptr;
size_t bytes = vstruct_bytes(j);
u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
+ struct printbuf buf = PRINTBUF;
int ret = JOURNAL_ENTRY_ADD_OK;
/* Is this entry older than the range we need? */
@@ -108,12 +141,13 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
journal_entry_radix_idx(c, jlist->last_seq)) {
i = *_i;
- if (!i || i->ignore)
+ if (journal_replay_ignore(i))
continue;
if (le64_to_cpu(i->j.seq) >= last_seq)
break;
- journal_replay_free(c, i);
+
+ journal_replay_free(c, i, false);
}
}
@@ -131,72 +165,62 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
*/
dup = *_i;
if (dup) {
- if (bytes == vstruct_bytes(&dup->j) &&
- !memcmp(j, &dup->j, bytes)) {
- i = dup;
- goto found;
- }
+ bool identical = bytes == vstruct_bytes(&dup->j) &&
+ !memcmp(j, &dup->j, bytes);
+ bool not_identical = !identical &&
+ entry_ptr.csum_good &&
+ dup->csum_good;
+
+ bool same_device = false;
+ darray_for_each(dup->ptrs, ptr)
+ if (ptr->dev == ca->dev_idx)
+ same_device = true;
+
+ ret = darray_push(&dup->ptrs, entry_ptr);
+ if (ret)
+ goto out;
- if (!entry_ptr.csum_good) {
- i = dup;
- goto found;
- }
+ bch2_journal_replay_to_text(&buf, c, dup);
+
+ fsck_err_on(same_device,
+ c, journal_entry_dup_same_device,
+ "duplicate journal entry on same device\n %s",
+ buf.buf);
- if (!dup->csum_good)
+ fsck_err_on(not_identical,
+ c, journal_entry_replicas_data_mismatch,
+ "found duplicate but non identical journal entries\n %s",
+ buf.buf);
+
+ if (entry_ptr.csum_good && !identical)
goto replace;
- fsck_err(c, journal_entry_replicas_data_mismatch,
- "found duplicate but non identical journal entries (seq %llu)",
- le64_to_cpu(j->seq));
- i = dup;
- goto found;
+ goto out;
}
replace:
- i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
+ i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
if (!i)
return -BCH_ERR_ENOMEM_journal_entry_add;
- i->nr_ptrs = 0;
- i->csum_good = entry_ptr.csum_good;
- i->ignore = false;
+ darray_init(&i->ptrs);
+ i->csum_good = entry_ptr.csum_good;
+ i->ignore_blacklisted = false;
+ i->ignore_not_dirty = false;
unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
- i->ptrs[i->nr_ptrs++] = entry_ptr;
if (dup) {
- if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) {
- bch_err(c, "found too many copies of journal entry %llu",
- le64_to_cpu(i->j.seq));
- dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1;
- }
-
/* The first ptr should represent the jset we kept: */
- memcpy(i->ptrs + i->nr_ptrs,
- dup->ptrs,
- sizeof(dup->ptrs[0]) * dup->nr_ptrs);
- i->nr_ptrs += dup->nr_ptrs;
+ darray_for_each(dup->ptrs, ptr)
+ darray_push(&i->ptrs, *ptr);
__journal_replay_free(c, dup);
+ } else {
+ darray_push(&i->ptrs, entry_ptr);
}
*_i = i;
- return 0;
-found:
- for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) {
- if (ptr->dev == ca->dev_idx) {
- bch_err(c, "duplicate journal entry %llu on same device",
- le64_to_cpu(i->j.seq));
- goto out;
- }
- }
-
- if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) {
- bch_err(c, "found too many copies of journal entry %llu",
- le64_to_cpu(i->j.seq));
- goto out;
- }
-
- i->ptrs[i->nr_ptrs++] = entry_ptr;
out:
fsck_err:
+ printbuf_exit(&buf);
return ret;
}
@@ -223,7 +247,7 @@ static void journal_entry_err_msg(struct printbuf *out,
if (entry) {
prt_str(out, " type=");
- prt_str(out, bch2_jset_entry_types[entry->type]);
+ bch2_prt_jset_entry_type(out, entry->type);
}
if (!jset) {
@@ -374,13 +398,13 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c,
static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c,
struct jset_entry *entry)
{
- struct bkey_i *k;
bool first = true;
jset_entry_for_each_key(entry, k) {
if (!first) {
prt_newline(out);
- prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
+ bch2_prt_jset_entry_type(out, entry->type);
+ prt_str(out, ": ");
}
prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level);
bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
@@ -540,9 +564,9 @@ static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
struct jset_entry_usage *u =
container_of(entry, struct jset_entry_usage, entry);
- prt_printf(out, "type=%s v=%llu",
- bch2_fs_usage_types[u->entry.btree_id],
- le64_to_cpu(u->v));
+ prt_str(out, "type=");
+ bch2_prt_fs_usage_type(out, u->entry.btree_id);
+ prt_printf(out, " v=%llu", le64_to_cpu(u->v));
}
static int journal_entry_data_usage_validate(struct bch_fs *c,
@@ -741,6 +765,37 @@ static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct
journal_entry_btree_keys_to_text(out, c, entry);
}
+static int journal_entry_datetime_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
+{
+ unsigned bytes = vstruct_bytes(entry);
+ unsigned expected = 16;
+ int ret = 0;
+
+ if (journal_entry_err_on(vstruct_bytes(entry) < expected,
+ c, version, jset, entry,
+ journal_entry_dev_usage_bad_size,
+ "bad size (%u < %u)",
+ bytes, expected)) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+fsck_err:
+ return ret;
+}
+
+static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_datetime *datetime =
+ container_of(entry, struct jset_entry_datetime, entry);
+
+ bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
+}
+
struct jset_entry_ops {
int (*validate)(struct bch_fs *, struct jset *,
struct jset_entry *, unsigned, int,
@@ -773,11 +828,11 @@ int bch2_journal_entry_validate(struct bch_fs *c,
void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
struct jset_entry *entry)
{
+ bch2_prt_jset_entry_type(out, entry->type);
+
if (entry->type < BCH_JSET_ENTRY_NR) {
- prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
+ prt_str(out, ": ");
bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
- } else {
- prt_printf(out, "(unknown type %u)", entry->type);
}
}
@@ -913,11 +968,11 @@ static int journal_read_buf_realloc(struct journal_read_buf *b,
return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
new_size = roundup_pow_of_two(new_size);
- n = kvpmalloc(new_size, GFP_KERNEL);
+ n = kvmalloc(new_size, GFP_KERNEL);
if (!n)
return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
- kvpfree(b->data, b->size);
+ kvfree(b->data);
b->data = n;
b->size = new_size;
return 0;
@@ -1028,9 +1083,7 @@ reread:
ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
j->encrypted_start,
vstruct_end(j) - (void *) j->encrypted_start);
- bch2_fs_fatal_err_on(ret, c,
- "error decrypting journal entry: %s",
- bch2_err_str(ret));
+ bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret));
mutex_lock(&jlist->lock);
ret = journal_entry_add(c, ca, (struct journal_ptr) {
@@ -1102,16 +1155,15 @@ static CLOSURE_CALLBACK(bch2_journal_read_device)
if (!r)
continue;
- for (i = 0; i < r->nr_ptrs; i++) {
- if (r->ptrs[i].dev == ca->dev_idx) {
- unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) +
+ darray_for_each(r->ptrs, i)
+ if (i->dev == ca->dev_idx) {
+ unsigned wrote = bucket_remainder(ca, i->sector) +
vstruct_sectors(&r->j, c->block_bits);
- ja->cur_idx = r->ptrs[i].bucket;
+ ja->cur_idx = i->bucket;
ja->sectors_free = ca->mi.bucket_size - wrote;
goto found;
}
- }
}
found:
mutex_unlock(&jlist->lock);
@@ -1144,7 +1196,7 @@ found:
ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
out:
bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
- kvpfree(buf.data, buf.size);
+ kvfree(buf.data);
percpu_ref_put(&ca->io_ref);
closure_return(cl);
return;
@@ -1155,27 +1207,6 @@ err:
goto out;
}
-void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
- struct journal_replay *j)
-{
- unsigned i;
-
- for (i = 0; i < j->nr_ptrs; i++) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev);
- u64 offset;
-
- div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset);
-
- if (i)
- prt_printf(out, " ");
- prt_printf(out, "%u:%u:%u (sector %llu)",
- j->ptrs[i].dev,
- j->ptrs[i].bucket,
- j->ptrs[i].bucket_offset,
- j->ptrs[i].sector);
- }
-}
-
int bch2_journal_read(struct bch_fs *c,
u64 *last_seq,
u64 *blacklist_seq,
@@ -1228,20 +1259,20 @@ int bch2_journal_read(struct bch_fs *c,
i = *_i;
- if (!i || i->ignore)
+ if (journal_replay_ignore(i))
continue;
if (!*start_seq)
*blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1;
if (JSET_NO_FLUSH(&i->j)) {
- i->ignore = true;
+ i->ignore_blacklisted = true;
continue;
}
if (!last_write_torn && !i->csum_good) {
last_write_torn = true;
- i->ignore = true;
+ i->ignore_blacklisted = true;
continue;
}
@@ -1280,12 +1311,12 @@ int bch2_journal_read(struct bch_fs *c,
genradix_for_each(&c->journal_entries, radix_iter, _i) {
i = *_i;
- if (!i || i->ignore)
+ if (journal_replay_ignore(i))
continue;
seq = le64_to_cpu(i->j.seq);
if (seq < *last_seq) {
- journal_replay_free(c, i);
+ journal_replay_free(c, i, false);
continue;
}
@@ -1293,7 +1324,7 @@ int bch2_journal_read(struct bch_fs *c,
fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
jset_seq_blacklisted,
"found blacklisted journal entry %llu", seq);
- i->ignore = true;
+ i->ignore_blacklisted = true;
}
}
@@ -1302,7 +1333,7 @@ int bch2_journal_read(struct bch_fs *c,
genradix_for_each(&c->journal_entries, radix_iter, _i) {
i = *_i;
- if (!i || i->ignore)
+ if (journal_replay_ignore(i))
continue;
BUG_ON(seq > le64_to_cpu(i->j.seq));
@@ -1353,32 +1384,31 @@ int bch2_journal_read(struct bch_fs *c,
.e.data_type = BCH_DATA_journal,
.e.nr_required = 1,
};
- unsigned ptr;
i = *_i;
- if (!i || i->ignore)
+ if (journal_replay_ignore(i))
continue;
- for (ptr = 0; ptr < i->nr_ptrs; ptr++) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev);
+ darray_for_each(i->ptrs, ptr) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- if (!i->ptrs[ptr].csum_good)
- bch_err_dev_offset(ca, i->ptrs[ptr].sector,
+ if (!ptr->csum_good)
+ bch_err_dev_offset(ca, ptr->sector,
"invalid journal checksum, seq %llu%s",
le64_to_cpu(i->j.seq),
i->csum_good ? " (had good copy on another device)" : "");
}
ret = jset_validate(c,
- bch_dev_bkey_exists(c, i->ptrs[0].dev),
+ bch_dev_bkey_exists(c, i->ptrs.data[0].dev),
&i->j,
- i->ptrs[0].sector,
+ i->ptrs.data[0].sector,
READ);
if (ret)
goto err;
- for (ptr = 0; ptr < i->nr_ptrs; ptr++)
- replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev;
+ darray_for_each(i->ptrs, ptr)
+ replicas.e.devs[replicas.e.nr_devs++] = ptr->dev;
bch2_replicas_entry_sort(&replicas.e);
@@ -1547,7 +1577,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size))
return;
- new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
+ new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
if (!new_buf)
return;
@@ -1558,7 +1588,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
swap(buf->buf_size, new_size);
spin_unlock(&j->lock);
- kvpfree(new_buf, new_size);
+ kvfree(new_buf);
}
static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
@@ -1568,12 +1598,12 @@ static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
static CLOSURE_CALLBACK(journal_write_done)
{
- closure_type(j, struct journal, io);
+ closure_type(w, struct journal_buf, io);
+ struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_buf *w = journal_last_unwritten_buf(j);
struct bch_replicas_padded replicas;
union journal_res_state old, new;
- u64 v, seq;
+ u64 v, seq = le64_to_cpu(w->data->seq);
int err = 0;
bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
@@ -1593,63 +1623,68 @@ static CLOSURE_CALLBACK(journal_write_done)
if (err)
bch2_fatal_error(c);
- spin_lock(&j->lock);
- seq = le64_to_cpu(w->data->seq);
+ closure_debug_destroy(cl);
+ spin_lock(&j->lock);
if (seq >= j->pin.front)
journal_seq_pin(j, seq)->devs = w->devs_written;
+ if (err && (!j->err_seq || seq < j->err_seq))
+ j->err_seq = seq;
+ w->write_done = true;
+
+ bool completed = false;
- if (!err) {
- if (!JSET_NO_FLUSH(w->data)) {
+ for (seq = journal_last_unwritten_seq(j);
+ seq <= journal_cur_seq(j);
+ seq++) {
+ w = j->buf + (seq & JOURNAL_BUF_MASK);
+ if (!w->write_done)
+ break;
+
+ if (!j->err_seq && !JSET_NO_FLUSH(w->data)) {
j->flushed_seq_ondisk = seq;
j->last_seq_ondisk = w->last_seq;
bch2_do_discards(c);
closure_wake_up(&c->freelist_wait);
-
bch2_reset_alloc_cursors(c);
}
- } else if (!j->err_seq || seq < j->err_seq)
- j->err_seq = seq;
- j->seq_ondisk = seq;
+ j->seq_ondisk = seq;
- /*
- * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
- * more buckets:
- *
- * Must come before signaling write completion, for
- * bch2_fs_journal_stop():
- */
- if (j->watermark != BCH_WATERMARK_stripe)
- journal_reclaim_kick(&c->journal);
+ /*
+ * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
+ * more buckets:
+ *
+ * Must come before signaling write completion, for
+ * bch2_fs_journal_stop():
+ */
+ if (j->watermark != BCH_WATERMARK_stripe)
+ journal_reclaim_kick(&c->journal);
- /* also must come before signalling write completion: */
- closure_debug_destroy(cl);
+ v = atomic64_read(&j->reservations.counter);
+ do {
+ old.v = new.v = v;
+ BUG_ON(journal_state_count(new, new.unwritten_idx));
+ BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK));
- v = atomic64_read(&j->reservations.counter);
- do {
- old.v = new.v = v;
- BUG_ON(journal_state_count(new, new.unwritten_idx));
+ new.unwritten_idx++;
+ } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v);
- new.unwritten_idx++;
- } while ((v = atomic64_cmpxchg(&j->reservations.counter,
- old.v, new.v)) != old.v);
+ closure_wake_up(&w->wait);
+ completed = true;
+ }
- bch2_journal_reclaim_fast(j);
- bch2_journal_space_available(j);
+ if (completed) {
+ bch2_journal_reclaim_fast(j);
+ bch2_journal_space_available(j);
- track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
- &j->max_in_flight_start, false);
+ track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false);
- closure_wake_up(&w->wait);
- journal_wake(j);
+ journal_wake(j);
+ }
- if (!journal_state_count(new, new.unwritten_idx) &&
- journal_last_unwritten_seq(j) <= journal_cur_seq(j)) {
- spin_unlock(&j->lock);
- closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
- } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
+ if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
struct journal_buf *buf = journal_cur_buf(j);
long delta = buf->expires - jiffies;
@@ -1659,46 +1694,46 @@ static CLOSURE_CALLBACK(journal_write_done)
* previous entries still in flight - the current journal entry
* might want to be written now:
*/
-
- spin_unlock(&j->lock);
- mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta));
- } else {
- spin_unlock(&j->lock);
+ mod_delayed_work(j->wq, &j->write_work, max(0L, delta));
}
+
+ spin_unlock(&j->lock);
}
static void journal_write_endio(struct bio *bio)
{
- struct bch_dev *ca = bio->bi_private;
+ struct journal_bio *jbio = container_of(bio, struct journal_bio, bio);
+ struct bch_dev *ca = jbio->ca;
struct journal *j = &ca->fs->journal;
- struct journal_buf *w = journal_last_unwritten_buf(j);
- unsigned long flags;
+ struct journal_buf *w = j->buf + jbio->buf_idx;
if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
"error writing journal entry %llu: %s",
le64_to_cpu(w->data->seq),
bch2_blk_status_to_str(bio->bi_status)) ||
bch2_meta_write_fault("journal")) {
+ unsigned long flags;
+
spin_lock_irqsave(&j->err_lock, flags);
bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
spin_unlock_irqrestore(&j->err_lock, flags);
}
- closure_put(&j->io);
+ closure_put(&w->io);
percpu_ref_put(&ca->io_ref);
}
-static CLOSURE_CALLBACK(do_journal_write)
+static CLOSURE_CALLBACK(journal_write_submit)
{
- closure_type(j, struct journal, io);
+ closure_type(w, struct journal_buf, io);
+ struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_dev *ca;
- struct journal_buf *w = journal_last_unwritten_buf(j);
- struct bio *bio;
unsigned sectors = vstruct_sectors(w->data, c->block_bits);
extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
- ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct journal_device *ja = &ca->journal;
+
if (!percpu_ref_tryget(&ca->io_ref)) {
/* XXX: fix this */
bch_err(c, "missing device for journal write\n");
@@ -1708,7 +1743,7 @@ static CLOSURE_CALLBACK(do_journal_write)
this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
sectors);
- bio = ca->journal.bio;
+ struct bio *bio = &ja->bio[w->idx]->bio;
bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = ptr->offset;
bio->bi_end_io = journal_write_endio;
@@ -1727,11 +1762,48 @@ static CLOSURE_CALLBACK(do_journal_write)
trace_and_count(c, journal_write, bio);
closure_bio_submit(bio, cl);
- ca->journal.bucket_seq[ca->journal.cur_idx] =
- le64_to_cpu(w->data->seq);
+ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
}
- continue_at(cl, journal_write_done, c->io_complete_wq);
+ continue_at(cl, journal_write_done, j->wq);
+}
+
+static CLOSURE_CALLBACK(journal_write_preflush)
+{
+ closure_type(w, struct journal_buf, io);
+ struct journal *j = container_of(w, struct journal, buf[w->idx]);
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+
+ if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
+ spin_lock(&j->lock);
+ closure_wait(&j->async_wait, cl);
+ spin_unlock(&j->lock);
+
+ continue_at(cl, journal_write_preflush, j->wq);
+ return;
+ }
+
+ if (w->separate_flush) {
+ for_each_rw_member(c, ca) {
+ percpu_ref_get(&ca->io_ref);
+
+ struct journal_device *ja = &ca->journal;
+ struct bio *bio = &ja->bio[w->idx]->bio;
+ bio_reset(bio, ca->disk_sb.bdev,
+ REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH);
+ bio->bi_end_io = journal_write_endio;
+ bio->bi_private = ca;
+ closure_bio_submit(bio, cl);
+ }
+
+ continue_at(cl, journal_write_submit, j->wq);
+ } else {
+ /*
+ * no need to punt to another work item if we're not waiting on
+ * preflushes
+ */
+ journal_write_submit(&cl->work);
+ }
}
static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
@@ -1782,11 +1854,11 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
if (!wb.wb)
bch2_journal_keys_to_write_buffer_start(c, &wb, seq);
- struct bkey_i *k;
jset_entry_for_each_key(i, k) {
ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k);
if (ret) {
- bch2_fs_fatal_error(c, "-ENOMEM flushing journal keys to btree write buffer");
+ bch2_fs_fatal_error(c, "flushing journal keys to btree write buffer: %s",
+ bch2_err_str(ret));
bch2_journal_keys_to_write_buffer_end(c, &wb);
return ret;
}
@@ -1798,15 +1870,24 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
if (wb.wb)
bch2_journal_keys_to_write_buffer_end(c, &wb);
+
+ spin_lock(&c->journal.lock);
w->need_flush_to_write_buffer = false;
+ spin_unlock(&c->journal.lock);
start = end = vstruct_last(jset);
end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
+ struct jset_entry_datetime *d =
+ container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry);
+ d->entry.type = BCH_JSET_ENTRY_datetime;
+ d->seconds = cpu_to_le64(ktime_get_real_seconds());
+
bch2_journal_super_entries_add_common(c, &end, seq);
u64s = (u64 *) end - (u64 *) start;
- BUG_ON(u64s > j->entry_u64s_reserved);
+
+ WARN_ON(u64s > j->entry_u64s_reserved);
le32_add_cpu(&jset->u64s, u64s);
@@ -1814,7 +1895,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
bytes = vstruct_bytes(jset);
if (sectors > w->sectors) {
- bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
+ bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
vstruct_bytes(jset), w->sectors << 9,
u64s, w->u64s_reserved, j->entry_u64s_reserved);
return -EINVAL;
@@ -1842,8 +1923,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
jset->encrypted_start,
vstruct_end(jset) - (void *) jset->encrypted_start);
- if (bch2_fs_fatal_err_on(ret, c,
- "error decrypting journal entry: %i", ret))
+ if (bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret)))
return ret;
jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
@@ -1893,6 +1973,7 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
j->nr_noflush_writes++;
} else {
+ w->must_flush = true;
j->last_flush_write = jiffies;
j->nr_flush_writes++;
clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);
@@ -1903,20 +1984,28 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
CLOSURE_CALLBACK(bch2_journal_write)
{
- closure_type(j, struct journal, io);
+ closure_type(w, struct journal_buf, io);
+ struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_buf *w = journal_last_unwritten_buf(j);
struct bch_replicas_padded replicas;
- struct bio *bio;
struct printbuf journal_debug_buf = PRINTBUF;
unsigned nr_rw_members = 0;
int ret;
+ for_each_rw_member(c, ca)
+ nr_rw_members++;
+
BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
+ BUG_ON(!w->write_started);
+ BUG_ON(w->write_allocated);
+ BUG_ON(w->write_done);
j->write_start_time = local_clock();
spin_lock(&j->lock);
+ if (nr_rw_members > 1)
+ w->separate_flush = true;
+
ret = bch2_journal_write_pick_flush(j, w);
spin_unlock(&j->lock);
if (ret)
@@ -1956,12 +2045,14 @@ CLOSURE_CALLBACK(bch2_journal_write)
* bch2_journal_space_available():
*/
w->sectors = 0;
+ w->write_allocated = true;
/*
* journal entry has been compacted and allocated, recalculate space
* available:
*/
bch2_journal_space_available(j);
+ bch2_journal_do_writes(j);
spin_unlock(&j->lock);
w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
@@ -1969,12 +2060,6 @@ CLOSURE_CALLBACK(bch2_journal_write)
if (c->opts.nochanges)
goto no_io;
- for_each_rw_member(c, ca)
- nr_rw_members++;
-
- if (nr_rw_members > 1)
- w->separate_flush = true;
-
/*
* Mark journal replicas before we submit the write to guarantee
* recovery will find the journal entries after a crash.
@@ -1985,25 +2070,15 @@ CLOSURE_CALLBACK(bch2_journal_write)
if (ret)
goto err;
- if (!JSET_NO_FLUSH(w->data) && w->separate_flush) {
- for_each_rw_member(c, ca) {
- percpu_ref_get(&ca->io_ref);
-
- bio = ca->journal.bio;
- bio_reset(bio, ca->disk_sb.bdev,
- REQ_OP_WRITE|REQ_PREFLUSH);
- bio->bi_end_io = journal_write_endio;
- bio->bi_private = ca;
- closure_bio_submit(bio, cl);
- }
- }
-
- continue_at(cl, do_journal_write, c->io_complete_wq);
+ if (!JSET_NO_FLUSH(w->data))
+ continue_at(cl, journal_write_preflush, j->wq);
+ else
+ continue_at(cl, journal_write_submit, j->wq);
return;
no_io:
- continue_at(cl, journal_write_done, c->io_complete_wq);
+ continue_at(cl, journal_write_done, j->wq);
return;
err:
bch2_fatal_error(c);
- continue_at(cl, journal_write_done, c->io_complete_wq);
+ continue_at(cl, journal_write_done, j->wq);
}
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
index c035e7c108..4f1e763ab5 100644
--- a/fs/bcachefs/journal_io.h
+++ b/fs/bcachefs/journal_io.h
@@ -2,26 +2,35 @@
#ifndef _BCACHEFS_JOURNAL_IO_H
#define _BCACHEFS_JOURNAL_IO_H
+#include "darray.h"
+
+struct journal_ptr {
+ bool csum_good;
+ u8 dev;
+ u32 bucket;
+ u32 bucket_offset;
+ u64 sector;
+};
+
/*
* Only used for holding the journal entries we read in btree_journal_read()
* during cache_registration
*/
struct journal_replay {
- struct journal_ptr {
- bool csum_good;
- u8 dev;
- u32 bucket;
- u32 bucket_offset;
- u64 sector;
- } ptrs[BCH_REPLICAS_MAX];
- unsigned nr_ptrs;
+ DARRAY_PREALLOCATED(struct journal_ptr, 8) ptrs;
bool csum_good;
- bool ignore;
+ bool ignore_blacklisted;
+ bool ignore_not_dirty;
/* must be last: */
struct jset j;
};
+static inline bool journal_replay_ignore(struct journal_replay *i)
+{
+ return !i || i->ignore_blacklisted || i->ignore_not_dirty;
+}
+
static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
struct jset_entry *entry, unsigned type)
{
@@ -36,12 +45,12 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
}
#define for_each_jset_entry_type(entry, jset, type) \
- for (entry = (jset)->start; \
+ for (struct jset_entry *entry = (jset)->start; \
(entry = __jset_entry_type_next(jset, entry, type)); \
entry = vstruct_next(entry))
#define jset_entry_for_each_key(_e, _k) \
- for (_k = (_e)->start; \
+ for (struct bkey_i *_k = (_e)->start; \
_k < vstruct_last(_e); \
_k = bkey_next(_k))
@@ -62,4 +71,20 @@ int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *);
CLOSURE_CALLBACK(bch2_journal_write);
+static inline struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
+{
+ struct jset_entry *entry = *end;
+ unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
+
+ memset(entry, 0, u64s * sizeof(u64));
+ /*
+ * The u64s field counts from the start of data, ignoring the shared
+ * fields.
+ */
+ entry->u64s = cpu_to_le16(u64s - 1);
+
+ *end = vstruct_next(*end);
+ return entry;
+}
+
#endif /* _BCACHEFS_JOURNAL_IO_H */
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index c33dca6415..04a577848b 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -62,14 +62,13 @@ void bch2_journal_set_watermark(struct journal *j)
? BCH_WATERMARK_reclaim
: BCH_WATERMARK_stripe;
- if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space],
- &j->low_on_space_start, low_on_space) ||
- track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin],
- &j->low_on_pin_start, low_on_pin) ||
- track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full],
- &j->write_buffer_full_start, low_on_wb))
+ if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], low_on_space) ||
+ track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], low_on_pin) ||
+ track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb))
trace_and_count(c, journal_full, c);
+ mod_bit(JOURNAL_SPACE_LOW, &j->flags, low_on_space || low_on_pin);
+
swap(watermark, j->watermark);
if (watermark > j->watermark)
journal_wake(j);
@@ -394,8 +393,6 @@ void bch2_journal_pin_copy(struct journal *j,
struct journal_entry_pin *src,
journal_pin_flush_fn flush_fn)
{
- bool reclaim;
-
spin_lock(&j->lock);
u64 seq = READ_ONCE(src->seq);
@@ -411,44 +408,44 @@ void bch2_journal_pin_copy(struct journal *j,
return;
}
- reclaim = __journal_pin_drop(j, dst);
+ bool reclaim = __journal_pin_drop(j, dst);
bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn));
if (reclaim)
bch2_journal_reclaim_fast(j);
- spin_unlock(&j->lock);
/*
* If the journal is currently full, we might want to call flush_fn
* immediately:
*/
- journal_wake(j);
+ if (seq == journal_last_seq(j))
+ journal_wake(j);
+ spin_unlock(&j->lock);
}
void bch2_journal_pin_set(struct journal *j, u64 seq,
struct journal_entry_pin *pin,
journal_pin_flush_fn flush_fn)
{
- bool reclaim;
-
spin_lock(&j->lock);
BUG_ON(seq < journal_last_seq(j));
- reclaim = __journal_pin_drop(j, pin);
+ bool reclaim = __journal_pin_drop(j, pin);
bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn));
if (reclaim)
bch2_journal_reclaim_fast(j);
- spin_unlock(&j->lock);
-
/*
* If the journal is currently full, we might want to call flush_fn
* immediately:
*/
- journal_wake(j);
+ if (seq == journal_last_seq(j))
+ journal_wake(j);
+
+ spin_unlock(&j->lock);
}
/**
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
index 0200e299cf..37a024e034 100644
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -43,61 +43,36 @@ static unsigned sb_blacklist_u64s(unsigned nr)
return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64);
}
-static struct bch_sb_field_journal_seq_blacklist *
-blacklist_entry_try_merge(struct bch_fs *c,
- struct bch_sb_field_journal_seq_blacklist *bl,
- unsigned i)
-{
- unsigned nr = blacklist_nr_entries(bl);
-
- if (le64_to_cpu(bl->start[i].end) >=
- le64_to_cpu(bl->start[i + 1].start)) {
- bl->start[i].end = bl->start[i + 1].end;
- --nr;
- memmove(&bl->start[i],
- &bl->start[i + 1],
- sizeof(bl->start[0]) * (nr - i));
-
- bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
- sb_blacklist_u64s(nr));
- BUG_ON(!bl);
- }
-
- return bl;
-}
-
-static bool bl_entry_contig_or_overlaps(struct journal_seq_blacklist_entry *e,
- u64 start, u64 end)
-{
- return !(end < le64_to_cpu(e->start) || le64_to_cpu(e->end) < start);
-}
-
int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
{
struct bch_sb_field_journal_seq_blacklist *bl;
- unsigned i, nr;
+ unsigned i = 0, nr;
int ret = 0;
mutex_lock(&c->sb_lock);
bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
nr = blacklist_nr_entries(bl);
- for (i = 0; i < nr; i++) {
+ while (i < nr) {
struct journal_seq_blacklist_entry *e =
bl->start + i;
- if (bl_entry_contig_or_overlaps(e, start, end)) {
- e->start = cpu_to_le64(min(start, le64_to_cpu(e->start)));
- e->end = cpu_to_le64(max(end, le64_to_cpu(e->end)));
-
- if (i + 1 < nr)
- bl = blacklist_entry_try_merge(c,
- bl, i);
- if (i)
- bl = blacklist_entry_try_merge(c,
- bl, i - 1);
- goto out_write_sb;
+ if (end < le64_to_cpu(e->start))
+ break;
+
+ if (start > le64_to_cpu(e->end)) {
+ i++;
+ continue;
}
+
+ /*
+ * Entry is contiguous or overlapping with new entry: merge it
+ * with new entry, and delete:
+ */
+
+ start = min(start, le64_to_cpu(e->start));
+ end = max(end, le64_to_cpu(e->end));
+ array_remove_item(bl->start, nr, i);
}
bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
@@ -107,9 +82,10 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
goto out;
}
- bl->start[nr].start = cpu_to_le64(start);
- bl->start[nr].end = cpu_to_le64(end);
-out_write_sb:
+ array_insert_item(bl->start, nr, i, ((struct journal_seq_blacklist_entry) {
+ .start = cpu_to_le64(start),
+ .end = cpu_to_le64(end),
+ }));
c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3);
ret = bch2_write_super(c);
@@ -119,8 +95,7 @@ out:
return ret ?: bch2_blacklist_table_initialize(c);
}
-static int journal_seq_blacklist_table_cmp(const void *_l,
- const void *_r, size_t size)
+static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r)
{
const struct journal_seq_blacklist_table_entry *l = _l;
const struct journal_seq_blacklist_table_entry *r = _r;
@@ -165,8 +140,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
if (!bl)
return 0;
- t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr,
- GFP_KERNEL);
+ t = kzalloc(struct_size(t, entries, nr), GFP_KERNEL);
if (!t)
return -BCH_ERR_ENOMEM_blacklist_table_init;
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index 38817c7a08..b5161b5d76 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -18,6 +18,7 @@
* the journal that are being staged or in flight.
*/
struct journal_buf {
+ struct closure io;
struct jset *data;
__BKEY_PADDED(key, BCH_REPLICAS_MAX);
@@ -33,10 +34,14 @@ struct journal_buf {
unsigned disk_sectors; /* maximum size entry could have been, if
buf_size was bigger */
unsigned u64s_reserved;
- bool noflush; /* write has already been kicked off, and was noflush */
- bool must_flush; /* something wants a flush */
- bool separate_flush;
- bool need_flush_to_write_buffer;
+ bool noflush:1; /* write has already been kicked off, and was noflush */
+ bool must_flush:1; /* something wants a flush */
+ bool separate_flush:1;
+ bool need_flush_to_write_buffer:1;
+ bool write_started:1;
+ bool write_allocated:1;
+ bool write_done:1;
+ u8 idx;
};
/*
@@ -129,11 +134,13 @@ enum journal_flags {
JOURNAL_STARTED,
JOURNAL_MAY_SKIP_FLUSH,
JOURNAL_NEED_FLUSH_WRITE,
+ JOURNAL_SPACE_LOW,
};
/* Reasons we may fail to get a journal reservation: */
#define JOURNAL_ERRORS() \
x(ok) \
+ x(retry) \
x(blocked) \
x(max_in_flight) \
x(journal_full) \
@@ -149,6 +156,13 @@ enum journal_errors {
typedef DARRAY(u64) darray_u64;
+struct journal_bio {
+ struct bch_dev *ca;
+ unsigned buf_idx;
+
+ struct bio bio;
+};
+
/* Embedded in struct bch_fs */
struct journal {
/* Fastpath stuff up front: */
@@ -203,8 +217,8 @@ struct journal {
wait_queue_head_t wait;
struct closure_waitlist async_wait;
- struct closure io;
struct delayed_work write_work;
+ struct workqueue_struct *wq;
/* Sequence number of most recent journal entry (last entry in @pin) */
atomic64_t seq;
@@ -274,11 +288,6 @@ struct journal {
u64 nr_noflush_writes;
u64 entry_bytes_written;
- u64 low_on_space_start;
- u64 low_on_pin_start;
- u64 max_in_flight_start;
- u64 write_buffer_full_start;
-
struct bch2_time_stats *flush_write_time;
struct bch2_time_stats *noflush_write_time;
struct bch2_time_stats *flush_seq_time;
@@ -313,7 +322,7 @@ struct journal_device {
u64 *buckets;
/* Bio for journal reads/writes to this device */
- struct bio *bio;
+ struct journal_bio *bio[JOURNAL_BUF_NR];
/* for bch_journal_read_device */
struct closure read;
diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c
index ad598105c5..b82f820904 100644
--- a/fs/bcachefs/logged_ops.c
+++ b/fs/bcachefs/logged_ops.c
@@ -37,7 +37,6 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
const struct bch_logged_op_fn *fn = logged_op_fn(k.k->type);
struct bkey_buf sk;
u32 restart_count = trans->restart_count;
- int ret;
if (!fn)
return 0;
@@ -45,11 +44,11 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
bch2_bkey_buf_init(&sk);
bch2_bkey_buf_reassemble(&sk, c, k);
- ret = drop_locks_do(trans, (bch2_fs_lazy_rw(c), 0)) ?:
- fn->resume(trans, sk.k) ?: trans_was_restarted(trans, restart_count);
+ fn->resume(trans, sk.k);
bch2_bkey_buf_exit(&sk, c);
- return ret;
+
+ return trans_was_restarted(trans, restart_count);
}
int bch2_resume_logged_ops(struct bch_fs *c)
@@ -101,8 +100,8 @@ void bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k)
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
- bch2_fs_fatal_error(c, "%s: error deleting logged operation %s: %s",
- __func__, buf.buf, bch2_err_str(ret));
+ bch2_fs_fatal_error(c, "deleting logged operation %s: %s",
+ buf.buf, bch2_err_str(ret));
printbuf_exit(&buf);
}
}
diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c
index 7a4ca5a28b..26569043e3 100644
--- a/fs/bcachefs/lru.c
+++ b/fs/bcachefs/lru.c
@@ -44,8 +44,8 @@ static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id,
u64 dev_bucket, u64 time, bool set)
{
return time
- ? bch2_btree_bit_mod(trans, BTREE_ID_lru,
- lru_pos(lru_id, dev_bucket, time), set)
+ ? bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru,
+ lru_pos(lru_id, dev_bucket, time), set)
: 0;
}
@@ -125,8 +125,7 @@ static int bch2_check_lru_key(struct btree_trans *trans,
goto out;
}
- if (c->opts.reconstruct_alloc ||
- fsck_err(c, lru_entry_bad,
+ if (fsck_err(c, lru_entry_bad,
"incorrect lru entry: lru %s time %llu\n"
" %s\n"
" for %s",
diff --git a/fs/bcachefs/mean_and_variance.c b/fs/bcachefs/mean_and_variance.c
index bf0ef668fd..0ea9f30803 100644
--- a/fs/bcachefs/mean_and_variance.c
+++ b/fs/bcachefs/mean_and_variance.c
@@ -103,14 +103,17 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev);
* mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update()
* @s: mean and variance number of samples and their sums
* @x: new value to include in the &mean_and_variance_weighted
+ * @initted: caller must track whether this is the first use or not
+ * @weight: ewma weight
*
* see linked pdf: function derived from equations 140-143 where alpha = 2^w.
* values are stored bitshifted for performance and added precision.
*/
-void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 x)
+void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s,
+ s64 x, bool initted, u8 weight)
{
// previous weighted variance.
- u8 w = s->weight;
+ u8 w = weight;
u64 var_w0 = s->variance;
// new value weighted.
s64 x_w = x << w;
@@ -119,45 +122,50 @@ void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64
// new mean weighted.
s64 u_w1 = s->mean + diff;
- if (!s->init) {
+ if (!initted) {
s->mean = x_w;
s->variance = 0;
} else {
s->mean = u_w1;
s->variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w;
}
- s->init = true;
}
EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update);
/**
* mean_and_variance_weighted_get_mean() - get mean from @s
* @s: mean and variance number of samples and their sums
+ * @weight: ewma weight
*/
-s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s)
+s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s,
+ u8 weight)
{
- return fast_divpow2(s.mean, s.weight);
+ return fast_divpow2(s.mean, weight);
}
EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean);
/**
* mean_and_variance_weighted_get_variance() -- get variance from @s
* @s: mean and variance number of samples and their sums
+ * @weight: ewma weight
*/
-u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s)
+u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s,
+ u8 weight)
{
// always positive don't need fast divpow2
- return s.variance >> s.weight;
+ return s.variance >> weight;
}
EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance);
/**
* mean_and_variance_weighted_get_stddev() - get standard deviation from @s
* @s: mean and variance number of samples and their sums
+ * @weight: ewma weight
*/
-u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s)
+u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s,
+ u8 weight)
{
- return int_sqrt64(mean_and_variance_weighted_get_variance(s));
+ return int_sqrt64(mean_and_variance_weighted_get_variance(s, weight));
}
EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev);
diff --git a/fs/bcachefs/mean_and_variance.h b/fs/bcachefs/mean_and_variance.h
index 64df11ab42..4fcf062dd2 100644
--- a/fs/bcachefs/mean_and_variance.h
+++ b/fs/bcachefs/mean_and_variance.h
@@ -154,8 +154,6 @@ struct mean_and_variance {
/* expontentially weighted variant */
struct mean_and_variance_weighted {
- bool init;
- u8 weight; /* base 2 logarithim */
s64 mean;
u64 variance;
};
@@ -192,10 +190,14 @@ s64 mean_and_variance_get_mean(struct mean_and_variance s);
u64 mean_and_variance_get_variance(struct mean_and_variance s1);
u32 mean_and_variance_get_stddev(struct mean_and_variance s);
-void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 v);
+void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s,
+ s64 v, bool initted, u8 weight);
-s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s);
-u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s);
-u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s);
+s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s,
+ u8 weight);
+u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s,
+ u8 weight);
+u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s,
+ u8 weight);
#endif // MEAN_AND_VAIRANCE_H_
diff --git a/fs/bcachefs/mean_and_variance_test.c b/fs/bcachefs/mean_and_variance_test.c
index 51093fa848..4c298e7472 100644
--- a/fs/bcachefs/mean_and_variance_test.c
+++ b/fs/bcachefs/mean_and_variance_test.c
@@ -31,53 +31,59 @@ static void mean_and_variance_basic_test(struct kunit *test)
static void mean_and_variance_weighted_test(struct kunit *test)
{
- struct mean_and_variance_weighted s = { .weight = 2 };
+ struct mean_and_variance_weighted s = { };
- mean_and_variance_weighted_update(&s, 10);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 10);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0);
+ mean_and_variance_weighted_update(&s, 10, false, 2);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 10);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0);
- mean_and_variance_weighted_update(&s, 20);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 12);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18);
+ mean_and_variance_weighted_update(&s, 20, true, 2);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 12);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18);
- mean_and_variance_weighted_update(&s, 30);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 16);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72);
+ mean_and_variance_weighted_update(&s, 30, true, 2);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 16);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72);
- s = (struct mean_and_variance_weighted) { .weight = 2 };
+ s = (struct mean_and_variance_weighted) { };
- mean_and_variance_weighted_update(&s, -10);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -10);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0);
+ mean_and_variance_weighted_update(&s, -10, false, 2);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -10);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0);
- mean_and_variance_weighted_update(&s, -20);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -12);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18);
+ mean_and_variance_weighted_update(&s, -20, true, 2);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -12);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18);
- mean_and_variance_weighted_update(&s, -30);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -16);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72);
+ mean_and_variance_weighted_update(&s, -30, true, 2);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -16);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72);
}
static void mean_and_variance_weighted_advanced_test(struct kunit *test)
{
- struct mean_and_variance_weighted s = { .weight = 8 };
+ struct mean_and_variance_weighted s = { };
+ bool initted = false;
s64 i;
- for (i = 10; i <= 100; i += 10)
- mean_and_variance_weighted_update(&s, i);
+ for (i = 10; i <= 100; i += 10) {
+ mean_and_variance_weighted_update(&s, i, initted, 8);
+ initted = true;
+ }
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 11);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), 11);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107);
- s = (struct mean_and_variance_weighted) { .weight = 8 };
+ s = (struct mean_and_variance_weighted) { };
+ initted = false;
- for (i = -10; i >= -100; i -= 10)
- mean_and_variance_weighted_update(&s, i);
+ for (i = -10; i >= -100; i -= 10) {
+ mean_and_variance_weighted_update(&s, i, initted, 8);
+ initted = true;
+ }
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -11);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), -11);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107);
}
static void do_mean_and_variance_test(struct kunit *test,
@@ -92,26 +98,26 @@ static void do_mean_and_variance_test(struct kunit *test,
s64 *weighted_stddev)
{
struct mean_and_variance mv = {};
- struct mean_and_variance_weighted vw = { .weight = weight };
+ struct mean_and_variance_weighted vw = { };
for (unsigned i = 0; i < initial_n; i++) {
mean_and_variance_update(&mv, initial_value);
- mean_and_variance_weighted_update(&vw, initial_value);
+ mean_and_variance_weighted_update(&vw, initial_value, false, weight);
KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), initial_value);
KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), 0);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw), initial_value);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),0);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight), initial_value);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),0);
}
for (unsigned i = 0; i < n; i++) {
mean_and_variance_update(&mv, data[i]);
- mean_and_variance_weighted_update(&vw, data[i]);
+ mean_and_variance_weighted_update(&vw, data[i], true, weight);
KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), mean[i]);
KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), stddev[i]);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw), weighted_mean[i]);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),weighted_stddev[i]);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight), weighted_mean[i]);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),weighted_stddev[i]);
}
KUNIT_EXPECT_EQ(test, mv.n, initial_n + n);
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index 5623cee3ef..69098eeb5d 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -31,7 +31,7 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
nr_good = bch2_bkey_durability(c, k.s_c);
if ((!nr_good && !(flags & lost)) ||
(nr_good < replicas && !(flags & degraded)))
- return -EINVAL;
+ return -BCH_ERR_remove_would_lose_data;
return 0;
}
@@ -111,7 +111,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
/* don't handle this yet: */
if (flags & BCH_FORCE_IF_METADATA_LOST)
- return -EINVAL;
+ return -BCH_ERR_remove_with_metadata_missing_unimplemented;
trans = bch2_trans_get(c);
bch2_bkey_buf_init(&k);
@@ -132,10 +132,8 @@ retry:
ret = drop_dev_ptrs(c, bkey_i_to_s(k.k),
dev_idx, flags, true);
- if (ret) {
- bch_err(c, "Cannot drop device without losing data");
+ if (ret)
break;
- }
ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index bf68ea4944..4d94b7742d 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -968,24 +968,30 @@ static bool migrate_btree_pred(struct bch_fs *c, void *arg,
return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}
+/*
+ * Ancient versions of bcachefs produced packed formats which could represent
+ * keys that the in memory format cannot represent; this checks for those
+ * formats so we can get rid of them.
+ */
static bool bformat_needs_redo(struct bkey_format *f)
{
- unsigned i;
-
- for (i = 0; i < f->nr_fields; i++) {
+ for (unsigned i = 0; i < f->nr_fields; i++) {
+ unsigned f_bits = f->bits_per_field[i];
unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
u64 field_offset = le64_to_cpu(f->field_offset[i]);
- if (f->bits_per_field[i] > unpacked_bits)
+ if (f_bits > unpacked_bits)
return true;
- if ((f->bits_per_field[i] == unpacked_bits) && field_offset)
+ if ((f_bits == unpacked_bits) && field_offset)
return true;
- if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) &
- unpacked_mask) <
- field_offset)
+ u64 f_mask = f_bits
+ ? ~((~0ULL << (f_bits - 1)) << 1)
+ : 0;
+
+ if (((field_offset + f_mask) & unpacked_mask) < field_offset)
return true;
}
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index 69e06a84da..0d2b82d8d1 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -155,8 +155,7 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt,
if (bch2_err_matches(ret, EROFS))
return ret;
- if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_tryflush()",
- __func__, bch2_err_str(ret)))
+ if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret)))
return ret;
ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru,
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index b1ed0b9a20..bb068fd724 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -7,6 +7,7 @@
#include "disk_groups.h"
#include "error.h"
#include "opts.h"
+#include "recovery_passes.h"
#include "super-io.h"
#include "util.h"
@@ -42,7 +43,7 @@ const char * const __bch2_btree_ids[] = {
NULL
};
-const char * const bch2_csum_types[] = {
+static const char * const __bch2_csum_types[] = {
BCH_CSUM_TYPES()
NULL
};
@@ -52,7 +53,7 @@ const char * const bch2_csum_opts[] = {
NULL
};
-const char * const __bch2_compression_types[] = {
+static const char * const __bch2_compression_types[] = {
BCH_COMPRESSION_TYPES()
NULL
};
@@ -82,18 +83,39 @@ const char * const bch2_member_states[] = {
NULL
};
-const char * const bch2_jset_entry_types[] = {
+static const char * const __bch2_jset_entry_types[] = {
BCH_JSET_ENTRY_TYPES()
NULL
};
-const char * const bch2_fs_usage_types[] = {
+static const char * const __bch2_fs_usage_types[] = {
BCH_FS_USAGE_TYPES()
NULL
};
#undef x
+static void prt_str_opt_boundscheck(struct printbuf *out, const char * const opts[],
+ unsigned nr, const char *type, unsigned idx)
+{
+ if (idx < nr)
+ prt_str(out, opts[idx]);
+ else
+ prt_printf(out, "(unknown %s %u)", type, idx);
+}
+
+#define PRT_STR_OPT_BOUNDSCHECKED(name, type) \
+void bch2_prt_##name(struct printbuf *out, type t) \
+{ \
+ prt_str_opt_boundscheck(out, __bch2_##name##s, ARRAY_SIZE(__bch2_##name##s) - 1, #name, t);\
+}
+
+PRT_STR_OPT_BOUNDSCHECKED(jset_entry_type, enum bch_jset_entry_type);
+PRT_STR_OPT_BOUNDSCHECKED(fs_usage_type, enum bch_fs_usage_type);
+PRT_STR_OPT_BOUNDSCHECKED(data_type, enum bch_data_type);
+PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type);
+PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type);
+
static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res,
struct printbuf *err)
{
@@ -205,6 +227,9 @@ const struct bch_option bch2_opt_table[] = {
#define OPT_STR(_choices) .type = BCH_OPT_STR, \
.min = 0, .max = ARRAY_SIZE(_choices), \
.choices = _choices
+#define OPT_STR_NOLIMIT(_choices) .type = BCH_OPT_STR, \
+ .min = 0, .max = U64_MAX, \
+ .choices = _choices
#define OPT_FN(_fn) .type = BCH_OPT_FN, .fn = _fn
#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \
@@ -314,7 +339,7 @@ int bch2_opt_parse(struct bch_fs *c,
if (ret < 0 || (*res != 0 && *res != 1)) {
if (err)
prt_printf(err, "%s: must be bool", opt->attr.name);
- return ret;
+ return ret < 0 ? ret : -BCH_ERR_option_not_bool;
}
break;
case BCH_OPT_UINT:
@@ -456,7 +481,7 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
copied_opts = kstrdup(options, GFP_KERNEL);
if (!copied_opts)
- return -1;
+ return -ENOMEM;
copied_opts_start = copied_opts;
while ((opt = strsep(&copied_opts, ",")) != NULL) {
@@ -501,11 +526,11 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
bad_opt:
pr_err("Bad mount option %s", name);
- ret = -1;
+ ret = -BCH_ERR_option_name;
goto out;
bad_val:
pr_err("Invalid mount option %s", err.buf);
- ret = -1;
+ ret = -BCH_ERR_option_value;
goto out;
out:
kfree(copied_opts_start);
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 9a4b7faa37..84e452835a 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -16,18 +16,20 @@ extern const char * const bch2_version_upgrade_opts[];
extern const char * const bch2_sb_features[];
extern const char * const bch2_sb_compat[];
extern const char * const __bch2_btree_ids[];
-extern const char * const bch2_csum_types[];
extern const char * const bch2_csum_opts[];
-extern const char * const __bch2_compression_types[];
extern const char * const bch2_compression_opts[];
extern const char * const bch2_str_hash_types[];
extern const char * const bch2_str_hash_opts[];
extern const char * const __bch2_data_types[];
extern const char * const bch2_member_states[];
-extern const char * const bch2_jset_entry_types[];
-extern const char * const bch2_fs_usage_types[];
extern const char * const bch2_d_types[];
+void bch2_prt_jset_entry_type(struct printbuf *, enum bch_jset_entry_type);
+void bch2_prt_fs_usage_type(struct printbuf *, enum bch_fs_usage_type);
+void bch2_prt_data_type(struct printbuf *, enum bch_data_type);
+void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type);
+void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type);
+
static inline const char *bch2_d_type_str(unsigned d_type)
{
return (d_type < BCH_DT_MAX ? bch2_d_types[d_type] : NULL) ?: "(bad d_type)";
@@ -290,6 +292,11 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, "Allow mounting in when data will be missing") \
+ x(no_splitbrain_check, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Don't kick drives out when splitbrain detected")\
x(discard, u8, \
OPT_FS|OPT_MOUNT|OPT_DEVICE, \
OPT_BOOL(), \
@@ -332,6 +339,11 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, "Run fsck on mount") \
+ x(fsck_memory_usage_percent, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_UINT(20, 70), \
+ BCH2_NO_SB_OPT, 50, \
+ NULL, "Maximum percentage of system ram fsck is allowed to pin")\
x(fix_errors, u8, \
OPT_FS|OPT_MOUNT, \
OPT_FN(bch2_opt_fix_errors), \
@@ -352,12 +364,17 @@ enum fsck_err_opts {
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
- NULL, "Don't replay the journal") \
- x(keep_journal, u8, \
+ NULL, "Exit recovery immediately prior to journal replay")\
+ x(recovery_pass_last, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_STR_NOLIMIT(bch2_recovery_passes), \
+ BCH2_NO_SB_OPT, 0, \
+ NULL, "Exit recovery after specified pass") \
+ x(retain_recovery_info, u8, \
0, \
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
- NULL, "Don't free journal entries/keys after startup")\
+ NULL, "Don't free journal entries/keys, scanned btree nodes after startup")\
x(read_entire_journal, u8, \
0, \
OPT_BOOL(), \
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c
index e68b34eab9..556da07381 100644
--- a/fs/bcachefs/quota.c
+++ b/fs/bcachefs/quota.c
@@ -560,13 +560,11 @@ static int bch2_fs_quota_read_inode(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct bch_inode_unpacked u;
struct bch_snapshot_tree s_t;
- int ret;
+ u32 tree = bch2_snapshot_tree(c, k.k->p.snapshot);
- ret = bch2_snapshot_tree_lookup(trans,
- bch2_snapshot_tree(c, k.k->p.snapshot), &s_t);
+ int ret = bch2_snapshot_tree_lookup(trans, tree, &s_t);
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
- "%s: snapshot tree %u not found", __func__,
- snapshot_t(c, k.k->p.snapshot)->tree);
+ "%s: snapshot tree %u not found", __func__, tree);
if (ret)
return ret;
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 22d1017aa4..56336f3dd1 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -412,11 +412,11 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
u64 now = atomic64_read(&c->io_clock[WRITE].now);
prt_str(out, "io wait duration: ");
- bch2_prt_human_readable_s64(out, r->wait_iotime_end - r->wait_iotime_start);
+ bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9);
prt_newline(out);
prt_str(out, "io wait remaining: ");
- bch2_prt_human_readable_s64(out, r->wait_iotime_end - now);
+ bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9);
prt_newline(out);
prt_str(out, "duration waited: ");
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 21e13bb433..8091d06860 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -1,35 +1,31 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
-#include "backpointers.h"
-#include "bkey_buf.h"
#include "alloc_background.h"
-#include "btree_gc.h"
+#include "bkey_buf.h"
#include "btree_journal_iter.h"
+#include "btree_node_scan.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "buckets.h"
#include "dirent.h"
-#include "ec.h"
#include "errcode.h"
#include "error.h"
#include "fs-common.h"
-#include "fsck.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
-#include "lru.h"
#include "logged_ops.h"
#include "move.h"
#include "quota.h"
#include "rebalance.h"
#include "recovery.h"
+#include "recovery_passes.h"
#include "replicas.h"
#include "sb-clean.h"
#include "sb-downgrade.h"
#include "snapshot.h"
-#include "subvolume.h"
#include "super-io.h"
#include <linux/sort.h>
@@ -37,30 +33,67 @@
#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
-static bool btree_id_is_alloc(enum btree_id id)
+void bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree)
{
- switch (id) {
- case BTREE_ID_alloc:
- case BTREE_ID_backpointers:
- case BTREE_ID_need_discard:
- case BTREE_ID_freespace:
- case BTREE_ID_bucket_gens:
- return true;
- default:
- return false;
+ u64 b = BIT_ULL(btree);
+
+ if (!(c->sb.btrees_lost_data & b)) {
+ bch_err(c, "flagging btree %s lost data", bch2_btree_id_str(btree));
+
+ mutex_lock(&c->sb_lock);
+ bch2_sb_field_get(c->disk_sb.sb, ext)->btrees_lost_data |= cpu_to_le64(b);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
}
}
/* for -o reconstruct_alloc: */
-static void drop_alloc_keys(struct journal_keys *keys)
+static void bch2_reconstruct_alloc(struct bch_fs *c)
{
- size_t src, dst;
+ bch2_journal_log_msg(c, "dropping alloc info");
+ bch_info(c, "dropping and reconstructing all alloc info");
+
+ mutex_lock(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+
+ __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required);
+ __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_info, ext->recovery_passes_required);
+ __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_lrus, ext->recovery_passes_required);
+ __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_extents_to_backpointers, ext->recovery_passes_required);
+ __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_to_lru_refs, ext->recovery_passes_required);
+
+ __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_alloc_key, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_stale_dirty_ptr, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_need_discard_key_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_freespace_key_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_bucket_gens_key_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_freespace_hole_missing, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_backpointer, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent);
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
- for (src = 0, dst = 0; src < keys->nr; src++)
- if (!btree_id_is_alloc(keys->d[src].btree_id))
- keys->d[dst++] = keys->d[src];
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
- keys->nr = dst;
+
+ bch2_shoot_down_journal_keys(c, BTREE_ID_alloc,
+ 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+ bch2_shoot_down_journal_keys(c, BTREE_ID_backpointers,
+ 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+ bch2_shoot_down_journal_keys(c, BTREE_ID_need_discard,
+ 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+ bch2_shoot_down_journal_keys(c, BTREE_ID_freespace,
+ 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+ bch2_shoot_down_journal_keys(c, BTREE_ID_bucket_gens,
+ 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
}
/*
@@ -70,9 +103,7 @@ static void drop_alloc_keys(struct journal_keys *keys)
*/
static void zero_out_btree_mem_ptr(struct journal_keys *keys)
{
- struct journal_key *i;
-
- for (i = keys->d; i < keys->d + keys->nr; i++)
+ darray_for_each(*keys, i)
if (i->k->k.type == KEY_TYPE_btree_ptr_v2)
bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0;
}
@@ -124,6 +155,17 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
if (ret)
goto out;
+ struct btree_path *path = btree_iter_path(trans, &iter);
+ if (unlikely(!btree_path_node(path, k->level))) {
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
+ BTREE_MAX_DEPTH, 0, iter_flags);
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_increase_depth(trans, iter.path, 0) ?:
+ -BCH_ERR_transaction_restart_nested;
+ goto out;
+ }
+
/* Must be checked with btree locked: */
if (k->overwritten)
goto out;
@@ -142,7 +184,7 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
return cmp_int(l->journal_seq, r->journal_seq);
}
-static int bch2_journal_replay(struct bch_fs *c)
+int bch2_journal_replay(struct bch_fs *c)
{
struct journal_keys *keys = &c->journal_keys;
DARRAY(struct journal_key *) keys_sorted = { 0 };
@@ -150,6 +192,7 @@ static int bch2_journal_replay(struct bch_fs *c)
u64 start_seq = c->journal_replay_seq_start;
u64 end_seq = c->journal_replay_seq_start;
struct btree_trans *trans = bch2_trans_get(c);
+ bool immediate_flush = false;
int ret = 0;
if (keys->nr) {
@@ -161,15 +204,22 @@ static int bch2_journal_replay(struct bch_fs *c)
BUG_ON(!atomic_read(&keys->ref));
+ move_gap(keys, keys->nr);
+
/*
* First, attempt to replay keys in sorted order. This is more
* efficient - better locality of btree access - but some might fail if
* that would cause a journal deadlock.
*/
- for (size_t i = 0; i < keys->nr; i++) {
+ darray_for_each(*keys, k) {
cond_resched();
- struct journal_key *k = keys->d + i;
+ /*
+ * k->allocated means the key wasn't read in from the journal,
+ * rather it was from early repair code
+ */
+ if (k->allocated)
+ immediate_flush = true;
/* Skip fastpath if we're low on space in the journal */
ret = c->journal.watermark ? -1 :
@@ -199,7 +249,10 @@ static int bch2_journal_replay(struct bch_fs *c)
struct journal_key *k = *kp;
- replay_now_at(j, k->journal_seq);
+ if (k->journal_seq)
+ replay_now_at(j, k->journal_seq);
+ else
+ replay_now_at(j, j->replay_journal_seq_end);
ret = commit_do(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc|
@@ -222,7 +275,8 @@ static int bch2_journal_replay(struct bch_fs *c)
bch2_trans_put(trans);
trans = NULL;
- if (!c->opts.keep_journal)
+ if (!c->opts.retain_recovery_info &&
+ c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay)
bch2_journal_keys_put_initial(c);
replay_now_at(j, j->replay_journal_seq_end);
@@ -230,6 +284,12 @@ static int bch2_journal_replay(struct bch_fs *c)
bch2_journal_set_replay_done(j);
+ /* if we did any repair, flush it immediately */
+ if (immediate_flush) {
+ bch2_journal_flush_all_pins(&c->journal);
+ ret = bch2_journal_meta(&c->journal);
+ }
+
if (keys->nr)
bch2_journal_log_msg(c, "journal replay finished");
err:
@@ -264,7 +324,7 @@ static int journal_replay_entry_early(struct bch_fs *c,
bkey_copy(&r->key, (struct bkey_i *) entry->start);
r->error = 0;
} else {
- r->error = -EIO;
+ r->error = -BCH_ERR_btree_node_read_error;
}
r->alive = true;
break;
@@ -359,7 +419,7 @@ static int journal_replay_early(struct bch_fs *c,
genradix_for_each(&c->journal_entries, iter, _i) {
i = *_i;
- if (!i || i->ignore)
+ if (journal_replay_ignore(i))
continue;
vstruct_for_each(&i->j, entry) {
@@ -379,202 +439,57 @@ static int journal_replay_early(struct bch_fs *c,
static int read_btree_roots(struct bch_fs *c)
{
- unsigned i;
int ret = 0;
- for (i = 0; i < btree_id_nr_alive(c); i++) {
+ for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
if (!r->alive)
continue;
- if (btree_id_is_alloc(i) &&
- c->opts.reconstruct_alloc) {
- c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
+ if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc)
continue;
- }
- if (r->error) {
- __fsck_err(c,
- btree_id_is_alloc(i)
- ? FSCK_CAN_IGNORE : 0,
- btree_root_bkey_invalid,
- "invalid btree root %s",
- bch2_btree_id_str(i));
- if (i == BTREE_ID_alloc)
+ if (mustfix_fsck_err_on((ret = r->error),
+ c, btree_root_bkey_invalid,
+ "invalid btree root %s",
+ bch2_btree_id_str(i)) ||
+ mustfix_fsck_err_on((ret = r->error = bch2_btree_root_read(c, i, &r->key, r->level)),
+ c, btree_root_read_error,
+ "error reading btree root %s l=%u: %s",
+ bch2_btree_id_str(i), r->level, bch2_err_str(ret))) {
+ if (btree_id_is_alloc(i)) {
+ c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations);
+ c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info);
+ c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus);
+ c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers);
+ c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs);
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
- }
+ r->error = 0;
+ } else if (!(c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) {
+ bch_info(c, "will run btree node scan");
+ c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes);
+ c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
+ }
- ret = bch2_btree_root_read(c, i, &r->key, r->level);
- if (ret) {
- fsck_err(c,
- btree_root_read_error,
- "error reading btree root %s",
- bch2_btree_id_str(i));
- if (btree_id_is_alloc(i))
- c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
ret = 0;
+ bch2_btree_lost_data(c, i);
}
}
- for (i = 0; i < BTREE_ID_NR; i++) {
+ for (unsigned i = 0; i < BTREE_ID_NR; i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
- if (!r->b) {
+ if (!r->b && !r->error) {
r->alive = false;
r->level = 0;
- bch2_btree_root_alloc(c, i);
+ bch2_btree_root_alloc_fake(c, i, 0);
}
}
fsck_err:
return ret;
}
-static int bch2_initialize_subvolumes(struct bch_fs *c)
-{
- struct bkey_i_snapshot_tree root_tree;
- struct bkey_i_snapshot root_snapshot;
- struct bkey_i_subvolume root_volume;
- int ret;
-
- bkey_snapshot_tree_init(&root_tree.k_i);
- root_tree.k.p.offset = 1;
- root_tree.v.master_subvol = cpu_to_le32(1);
- root_tree.v.root_snapshot = cpu_to_le32(U32_MAX);
-
- bkey_snapshot_init(&root_snapshot.k_i);
- root_snapshot.k.p.offset = U32_MAX;
- root_snapshot.v.flags = 0;
- root_snapshot.v.parent = 0;
- root_snapshot.v.subvol = cpu_to_le32(BCACHEFS_ROOT_SUBVOL);
- root_snapshot.v.tree = cpu_to_le32(1);
- SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true);
-
- bkey_subvolume_init(&root_volume.k_i);
- root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL;
- root_volume.v.flags = 0;
- root_volume.v.snapshot = cpu_to_le32(U32_MAX);
- root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO);
-
- ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?:
- bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?:
- bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_inode_unpacked inode;
- int ret;
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
- SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0);
- ret = bkey_err(k);
- if (ret)
- return ret;
-
- if (!bkey_is_inode(k.k)) {
- bch_err(trans->c, "root inode not found");
- ret = -BCH_ERR_ENOENT_inode;
- goto err;
- }
-
- ret = bch2_inode_unpack(k, &inode);
- BUG_ON(ret);
-
- inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
-
- ret = bch2_inode_write(trans, &iter, &inode);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-/* set bi_subvol on root inode */
-noinline_for_stack
-static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
-{
- int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
- __bch2_fs_upgrade_for_subvolumes(trans));
- bch_err_fn(c, ret);
- return ret;
-}
-
-const char * const bch2_recovery_passes[] = {
-#define x(_fn, ...) #_fn,
- BCH_RECOVERY_PASSES()
-#undef x
- NULL
-};
-
-static int bch2_check_allocations(struct bch_fs *c)
-{
- return bch2_gc(c, true, c->opts.norecovery);
-}
-
-static int bch2_set_may_go_rw(struct bch_fs *c)
-{
- struct journal_keys *keys = &c->journal_keys;
-
- /*
- * After we go RW, the journal keys buffer can't be modified (except for
- * setting journal_key->overwritten: it will be accessed by multiple
- * threads
- */
- move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
- keys->gap = keys->nr;
-
- set_bit(BCH_FS_may_go_rw, &c->flags);
-
- if (keys->nr || c->opts.fsck || !c->sb.clean)
- return bch2_fs_read_write_early(c);
- return 0;
-}
-
-struct recovery_pass_fn {
- int (*fn)(struct bch_fs *);
- unsigned when;
-};
-
-static struct recovery_pass_fn recovery_pass_fns[] = {
-#define x(_fn, _id, _when) { .fn = bch2_##_fn, .when = _when },
- BCH_RECOVERY_PASSES()
-#undef x
-};
-
-u64 bch2_recovery_passes_to_stable(u64 v)
-{
- static const u8 map[] = {
-#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n,
- BCH_RECOVERY_PASSES()
-#undef x
- };
-
- u64 ret = 0;
- for (unsigned i = 0; i < ARRAY_SIZE(map); i++)
- if (v & BIT_ULL(i))
- ret |= BIT_ULL(map[i]);
- return ret;
-}
-
-u64 bch2_recovery_passes_from_stable(u64 v)
-{
- static const u8 map[] = {
-#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n,
- BCH_RECOVERY_PASSES()
-#undef x
- };
-
- u64 ret = 0;
- for (unsigned i = 0; i < ARRAY_SIZE(map); i++)
- if (v & BIT_ULL(i))
- ret |= BIT_ULL(map[i]);
- return ret;
-}
-
static bool check_version_upgrade(struct bch_fs *c)
{
unsigned latest_version = bcachefs_metadata_version_current;
@@ -647,96 +562,6 @@ static bool check_version_upgrade(struct bch_fs *c)
return false;
}
-u64 bch2_fsck_recovery_passes(void)
-{
- u64 ret = 0;
-
- for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++)
- if (recovery_pass_fns[i].when & PASS_FSCK)
- ret |= BIT_ULL(i);
- return ret;
-}
-
-static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
-{
- struct recovery_pass_fn *p = recovery_pass_fns + pass;
-
- if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read)
- return false;
- if (c->recovery_passes_explicit & BIT_ULL(pass))
- return true;
- if ((p->when & PASS_FSCK) && c->opts.fsck)
- return true;
- if ((p->when & PASS_UNCLEAN) && !c->sb.clean)
- return true;
- if (p->when & PASS_ALWAYS)
- return true;
- return false;
-}
-
-static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
-{
- struct recovery_pass_fn *p = recovery_pass_fns + pass;
- int ret;
-
- if (!(p->when & PASS_SILENT))
- bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."),
- bch2_recovery_passes[pass]);
- ret = p->fn(c);
- if (ret)
- return ret;
- if (!(p->when & PASS_SILENT))
- bch2_print(c, KERN_CONT " done\n");
-
- return 0;
-}
-
-static int bch2_run_recovery_passes(struct bch_fs *c)
-{
- int ret = 0;
-
- while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) {
- if (should_run_recovery_pass(c, c->curr_recovery_pass)) {
- unsigned pass = c->curr_recovery_pass;
-
- ret = bch2_run_recovery_pass(c, c->curr_recovery_pass);
- if (bch2_err_matches(ret, BCH_ERR_restart_recovery) ||
- (ret && c->curr_recovery_pass < pass))
- continue;
- if (ret)
- break;
-
- c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass);
- }
- c->curr_recovery_pass++;
- c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass);
- }
-
- return ret;
-}
-
-int bch2_run_online_recovery_passes(struct bch_fs *c)
-{
- int ret = 0;
-
- for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) {
- struct recovery_pass_fn *p = recovery_pass_fns + i;
-
- if (!(p->when & PASS_ONLINE))
- continue;
-
- ret = bch2_run_recovery_pass(c, i);
- if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) {
- i = c->curr_recovery_pass;
- continue;
- }
- if (ret)
- break;
- }
-
- return ret;
-}
-
int bch2_fs_recovery(struct bch_fs *c)
{
struct bch_sb_field_clean *clean = NULL;
@@ -769,24 +594,14 @@ int bch2_fs_recovery(struct bch_fs *c)
goto err;
}
- if (c->opts.fsck && c->opts.norecovery) {
- bch_err(c, "cannot select both norecovery and fsck");
- ret = -EINVAL;
- goto err;
- }
+ if (c->opts.norecovery)
+ c->opts.recovery_pass_last = BCH_RECOVERY_PASS_journal_replay - 1;
if (!c->opts.nochanges) {
mutex_lock(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
bool write_sb = false;
- struct bch_sb_field_ext *ext =
- bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64));
- if (!ext) {
- ret = -BCH_ERR_ENOSPC_sb;
- mutex_unlock(&c->sb_lock);
- goto err;
- }
-
if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) {
ext->recovery_passes_required[0] |=
cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology)));
@@ -845,7 +660,7 @@ int bch2_fs_recovery(struct bch_fs *c)
goto err;
}
- if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
+ if (!c->sb.clean || c->opts.fsck || c->opts.retain_recovery_info) {
struct genradix_iter iter;
struct journal_replay **i;
@@ -862,7 +677,7 @@ int bch2_fs_recovery(struct bch_fs *c)
goto out;
genradix_for_each_reverse(&c->journal_entries, iter, i)
- if (*i && !(*i)->ignore) {
+ if (!journal_replay_ignore(*i)) {
last_journal_entry = &(*i)->j;
break;
}
@@ -887,7 +702,8 @@ int bch2_fs_recovery(struct bch_fs *c)
genradix_for_each_reverse(&c->journal_entries, iter, i)
if (*i) {
last_journal_entry = &(*i)->j;
- (*i)->ignore = false;
+ (*i)->ignore_blacklisted = false;
+ (*i)->ignore_not_dirty= false;
/*
* This was probably a NO_FLUSH entry,
* so last_seq was garbage - but we know
@@ -923,10 +739,8 @@ use_clean:
c->journal_replay_seq_start = last_seq;
c->journal_replay_seq_end = blacklist_seq - 1;
- if (c->opts.reconstruct_alloc) {
- c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
- drop_alloc_keys(&c->journal_keys);
- }
+ if (c->opts.reconstruct_alloc)
+ bch2_reconstruct_alloc(c);
zero_out_btree_mem_ptr(&c->journal_keys);
@@ -950,7 +764,7 @@ use_clean:
bch2_journal_seq_blacklist_add(c,
blacklist_seq, journal_seq);
if (ret) {
- bch_err(c, "error creating new journal seq blacklist entry");
+ bch_err_msg(c, ret, "error creating new journal seq blacklist entry");
goto err;
}
}
@@ -961,9 +775,6 @@ use_clean:
if (ret)
goto err;
- if (c->opts.reconstruct_alloc)
- bch2_journal_log_msg(c, "dropping alloc info");
-
/*
* Skip past versions that might have possibly been used (as nonces),
* but hadn't had their pointers written:
@@ -981,6 +792,12 @@ use_clean:
clear_bit(BCH_FS_fsck_running, &c->flags);
+ /* fsync if we fixed errors */
+ if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
+ bch2_journal_flush_all_pins(&c->journal);
+ bch2_journal_meta(&c->journal);
+ }
+
/* If we fixed errors, verify that fs is actually clean now: */
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
test_bit(BCH_FS_errors_fixed, &c->flags) &&
@@ -1015,6 +832,7 @@ use_clean:
}
mutex_lock(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
bool write_sb = false;
if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) {
@@ -1028,15 +846,18 @@ use_clean:
write_sb = true;
}
- if (!test_bit(BCH_FS_error, &c->flags)) {
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
- if (ext &&
- (!bch2_is_zero(ext->recovery_passes_required, sizeof(ext->recovery_passes_required)) ||
- !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent)))) {
- memset(ext->recovery_passes_required, 0, sizeof(ext->recovery_passes_required));
- memset(ext->errors_silent, 0, sizeof(ext->errors_silent));
- write_sb = true;
- }
+ if (!test_bit(BCH_FS_error, &c->flags) &&
+ !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent))) {
+ memset(ext->errors_silent, 0, sizeof(ext->errors_silent));
+ write_sb = true;
+ }
+
+ if (c->opts.fsck &&
+ !test_bit(BCH_FS_error, &c->flags) &&
+ c->recovery_pass_done == BCH_RECOVERY_PASS_NR - 1 &&
+ ext->btrees_lost_data) {
+ ext->btrees_lost_data = 0;
+ write_sb = true;
}
if (c->opts.fsck &&
@@ -1077,10 +898,12 @@ use_clean:
out:
bch2_flush_fsck_errs(c);
- if (!c->opts.keep_journal &&
- test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
+ if (!c->opts.retain_recovery_info) {
bch2_journal_keys_put_initial(c);
- kfree(clean);
+ bch2_find_btree_nodes_exit(&c->found_btree_nodes);
+ }
+ if (!IS_ERR(clean))
+ kfree(clean);
if (!ret &&
test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) &&
@@ -1105,6 +928,7 @@ int bch2_fs_initialize(struct bch_fs *c)
int ret;
bch_notice(c, "initializing new filesystem");
+ set_bit(BCH_FS_new_fs, &c->flags);
mutex_lock(&c->sb_lock);
c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
@@ -1119,11 +943,11 @@ int bch2_fs_initialize(struct bch_fs *c)
}
mutex_unlock(&c->sb_lock);
- c->curr_recovery_pass = ARRAY_SIZE(recovery_pass_fns);
+ c->curr_recovery_pass = BCH_RECOVERY_PASS_NR;
set_bit(BCH_FS_may_go_rw, &c->flags);
for (unsigned i = 0; i < BTREE_ID_NR; i++)
- bch2_btree_root_alloc(c, i);
+ bch2_btree_root_alloc_fake(c, i, 0);
for_each_member_device(c, ca)
bch2_dev_usage_init(ca);
@@ -1194,7 +1018,7 @@ int bch2_fs_initialize(struct bch_fs *c)
if (ret)
goto err;
- c->recovery_pass_done = ARRAY_SIZE(recovery_pass_fns) - 1;
+ c->recovery_pass_done = BCH_RECOVERY_PASS_NR - 1;
if (enabled_qtypes(c)) {
ret = bch2_fs_quota_read(c);
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
index 4e9d24719b..4bf818de1f 100644
--- a/fs/bcachefs/recovery.h
+++ b/fs/bcachefs/recovery.h
@@ -2,37 +2,9 @@
#ifndef _BCACHEFS_RECOVERY_H
#define _BCACHEFS_RECOVERY_H
-extern const char * const bch2_recovery_passes[];
+void bch2_btree_lost_data(struct bch_fs *, enum btree_id);
-u64 bch2_recovery_passes_to_stable(u64 v);
-u64 bch2_recovery_passes_from_stable(u64 v);
-
-/*
- * For when we need to rewind recovery passes and run a pass we skipped:
- */
-static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c,
- enum bch_recovery_pass pass)
-{
- if (c->recovery_passes_explicit & BIT_ULL(pass))
- return 0;
-
- bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)",
- bch2_recovery_passes[pass], pass,
- bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass);
-
- c->recovery_passes_explicit |= BIT_ULL(pass);
-
- if (c->curr_recovery_pass >= pass) {
- c->curr_recovery_pass = pass;
- c->recovery_passes_complete &= (1ULL << pass) >> 1;
- return -BCH_ERR_restart_recovery;
- } else {
- return 0;
- }
-}
-
-int bch2_run_online_recovery_passes(struct bch_fs *);
-u64 bch2_fsck_recovery_passes(void);
+int bch2_journal_replay(struct bch_fs *);
int bch2_fs_recovery(struct bch_fs *);
int bch2_fs_initialize(struct bch_fs *);
diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c
new file mode 100644
index 0000000000..0cec0f7d97
--- /dev/null
+++ b/fs/bcachefs/recovery_passes.c
@@ -0,0 +1,249 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "backpointers.h"
+#include "btree_gc.h"
+#include "btree_node_scan.h"
+#include "ec.h"
+#include "fsck.h"
+#include "inode.h"
+#include "journal.h"
+#include "lru.h"
+#include "logged_ops.h"
+#include "rebalance.h"
+#include "recovery.h"
+#include "recovery_passes.h"
+#include "snapshot.h"
+#include "subvolume.h"
+#include "super.h"
+#include "super-io.h"
+
+const char * const bch2_recovery_passes[] = {
+#define x(_fn, ...) #_fn,
+ BCH_RECOVERY_PASSES()
+#undef x
+ NULL
+};
+
+static int bch2_check_allocations(struct bch_fs *c)
+{
+ return bch2_gc(c, true, false);
+}
+
+static int bch2_set_may_go_rw(struct bch_fs *c)
+{
+ struct journal_keys *keys = &c->journal_keys;
+
+ /*
+ * After we go RW, the journal keys buffer can't be modified (except for
+ * setting journal_key->overwritten: it will be accessed by multiple
+ * threads
+ */
+ move_gap(keys, keys->nr);
+
+ set_bit(BCH_FS_may_go_rw, &c->flags);
+
+ if (keys->nr || c->opts.fsck || !c->sb.clean || c->recovery_passes_explicit)
+ return bch2_fs_read_write_early(c);
+ return 0;
+}
+
+struct recovery_pass_fn {
+ int (*fn)(struct bch_fs *);
+ unsigned when;
+};
+
+static struct recovery_pass_fn recovery_pass_fns[] = {
+#define x(_fn, _id, _when) { .fn = bch2_##_fn, .when = _when },
+ BCH_RECOVERY_PASSES()
+#undef x
+};
+
+static const u8 passes_to_stable_map[] = {
+#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n,
+ BCH_RECOVERY_PASSES()
+#undef x
+};
+
+static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass)
+{
+ return passes_to_stable_map[pass];
+}
+
+u64 bch2_recovery_passes_to_stable(u64 v)
+{
+ u64 ret = 0;
+ for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++)
+ if (v & BIT_ULL(i))
+ ret |= BIT_ULL(passes_to_stable_map[i]);
+ return ret;
+}
+
+u64 bch2_recovery_passes_from_stable(u64 v)
+{
+ static const u8 map[] = {
+#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n,
+ BCH_RECOVERY_PASSES()
+#undef x
+ };
+
+ u64 ret = 0;
+ for (unsigned i = 0; i < ARRAY_SIZE(map); i++)
+ if (v & BIT_ULL(i))
+ ret |= BIT_ULL(map[i]);
+ return ret;
+}
+
+/*
+ * For when we need to rewind recovery passes and run a pass we skipped:
+ */
+int bch2_run_explicit_recovery_pass(struct bch_fs *c,
+ enum bch_recovery_pass pass)
+{
+ if (c->recovery_passes_explicit & BIT_ULL(pass))
+ return 0;
+
+ bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)",
+ bch2_recovery_passes[pass], pass,
+ bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass);
+
+ c->recovery_passes_explicit |= BIT_ULL(pass);
+
+ if (c->curr_recovery_pass >= pass) {
+ c->curr_recovery_pass = pass;
+ c->recovery_passes_complete &= (1ULL << pass) >> 1;
+ return -BCH_ERR_restart_recovery;
+ } else {
+ return 0;
+ }
+}
+
+int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c,
+ enum bch_recovery_pass pass)
+{
+ enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass);
+
+ mutex_lock(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+
+ if (!test_bit_le64(s, ext->recovery_passes_required)) {
+ __set_bit_le64(s, ext->recovery_passes_required);
+ bch2_write_super(c);
+ }
+ mutex_unlock(&c->sb_lock);
+
+ return bch2_run_explicit_recovery_pass(c, pass);
+}
+
+static void bch2_clear_recovery_pass_required(struct bch_fs *c,
+ enum bch_recovery_pass pass)
+{
+ enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass);
+
+ mutex_lock(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+
+ if (test_bit_le64(s, ext->recovery_passes_required)) {
+ __clear_bit_le64(s, ext->recovery_passes_required);
+ bch2_write_super(c);
+ }
+ mutex_unlock(&c->sb_lock);
+}
+
+u64 bch2_fsck_recovery_passes(void)
+{
+ u64 ret = 0;
+
+ for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++)
+ if (recovery_pass_fns[i].when & PASS_FSCK)
+ ret |= BIT_ULL(i);
+ return ret;
+}
+
+static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
+{
+ struct recovery_pass_fn *p = recovery_pass_fns + pass;
+
+ if (c->recovery_passes_explicit & BIT_ULL(pass))
+ return true;
+ if ((p->when & PASS_FSCK) && c->opts.fsck)
+ return true;
+ if ((p->when & PASS_UNCLEAN) && !c->sb.clean)
+ return true;
+ if (p->when & PASS_ALWAYS)
+ return true;
+ return false;
+}
+
+static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
+{
+ struct recovery_pass_fn *p = recovery_pass_fns + pass;
+ int ret;
+
+ if (!(p->when & PASS_SILENT))
+ bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."),
+ bch2_recovery_passes[pass]);
+ ret = p->fn(c);
+ if (ret)
+ return ret;
+ if (!(p->when & PASS_SILENT))
+ bch2_print(c, KERN_CONT " done\n");
+
+ return 0;
+}
+
+int bch2_run_online_recovery_passes(struct bch_fs *c)
+{
+ int ret = 0;
+
+ for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) {
+ struct recovery_pass_fn *p = recovery_pass_fns + i;
+
+ if (!(p->when & PASS_ONLINE))
+ continue;
+
+ ret = bch2_run_recovery_pass(c, i);
+ if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) {
+ i = c->curr_recovery_pass;
+ continue;
+ }
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+int bch2_run_recovery_passes(struct bch_fs *c)
+{
+ int ret = 0;
+
+ while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) {
+ if (c->opts.recovery_pass_last &&
+ c->curr_recovery_pass > c->opts.recovery_pass_last)
+ break;
+
+ if (should_run_recovery_pass(c, c->curr_recovery_pass)) {
+ unsigned pass = c->curr_recovery_pass;
+
+ ret = bch2_run_recovery_pass(c, c->curr_recovery_pass);
+ if (bch2_err_matches(ret, BCH_ERR_restart_recovery) ||
+ (ret && c->curr_recovery_pass < pass))
+ continue;
+ if (ret)
+ break;
+
+ c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass);
+ }
+
+ c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass);
+
+ if (!test_bit(BCH_FS_error, &c->flags))
+ bch2_clear_recovery_pass_required(c, c->curr_recovery_pass);
+
+ c->curr_recovery_pass++;
+ }
+
+ return ret;
+}
diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h
new file mode 100644
index 0000000000..99b464e127
--- /dev/null
+++ b/fs/bcachefs/recovery_passes.h
@@ -0,0 +1,17 @@
+#ifndef _BCACHEFS_RECOVERY_PASSES_H
+#define _BCACHEFS_RECOVERY_PASSES_H
+
+extern const char * const bch2_recovery_passes[];
+
+u64 bch2_recovery_passes_to_stable(u64 v);
+u64 bch2_recovery_passes_from_stable(u64 v);
+
+u64 bch2_fsck_recovery_passes(void);
+
+int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass);
+int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, enum bch_recovery_pass);
+
+int bch2_run_online_recovery_passes(struct bch_fs *);
+int bch2_run_recovery_passes(struct bch_fs *);
+
+#endif /* _BCACHEFS_RECOVERY_PASSES_H */
diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_passes_types.h
index fa0c8efd2a..773aea9a00 100644
--- a/fs/bcachefs/recovery_types.h
+++ b/fs/bcachefs/recovery_passes_types.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_RECOVERY_TYPES_H
-#define _BCACHEFS_RECOVERY_TYPES_H
+#ifndef _BCACHEFS_RECOVERY_PASSES_TYPES_H
+#define _BCACHEFS_RECOVERY_PASSES_TYPES_H
#define PASS_SILENT BIT(0)
#define PASS_FSCK BIT(1)
@@ -13,11 +13,12 @@
* must never change:
*/
#define BCH_RECOVERY_PASSES() \
+ x(scan_for_btree_nodes, 37, 0) \
+ x(check_topology, 4, 0) \
x(alloc_read, 0, PASS_ALWAYS) \
x(stripes_read, 1, PASS_ALWAYS) \
x(initialize_subvolumes, 2, 0) \
x(snapshots_read, 3, PASS_ALWAYS) \
- x(check_topology, 4, 0) \
x(check_allocations, 5, PASS_FSCK) \
x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT) \
x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \
@@ -31,20 +32,23 @@
x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \
x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \
x(bucket_gens_init, 17, 0) \
+ x(reconstruct_snapshots, 38, 0) \
x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \
x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \
x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \
+ x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \
x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \
x(fs_upgrade_for_subvolumes, 22, 0) \
- x(resume_logged_ops, 23, PASS_ALWAYS) \
x(check_inodes, 24, PASS_FSCK) \
x(check_extents, 25, PASS_FSCK) \
x(check_indirect_extents, 26, PASS_FSCK) \
x(check_dirents, 27, PASS_FSCK) \
x(check_xattrs, 28, PASS_FSCK) \
x(check_root, 29, PASS_ONLINE|PASS_FSCK) \
+ x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \
x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \
x(check_nlinks, 31, PASS_FSCK) \
+ x(resume_logged_ops, 23, PASS_ALWAYS) \
x(delete_dead_inodes, 32, PASS_FSCK|PASS_UNCLEAN) \
x(fix_reflink_p, 33, 0) \
x(set_fs_needs_rebalance, 34, 0) \
@@ -54,6 +58,7 @@ enum bch_recovery_pass {
#define x(n, id, when) BCH_RECOVERY_PASS_##n,
BCH_RECOVERY_PASSES()
#undef x
+ BCH_RECOVERY_PASS_NR
};
/* But we also need stable identifiers that can be used in the superblock */
@@ -63,4 +68,4 @@ enum bch_recovery_pass_stable {
#undef x
};
-#endif /* _BCACHEFS_RECOVERY_TYPES_H */
+#endif /* _BCACHEFS_RECOVERY_PASSES_TYPES_H */
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index c47c66c2b3..ff7864731a 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -185,8 +185,7 @@ not_found:
} else {
bkey_error_init(update);
update->k.p = p.k->p;
- update->k.p.offset = next_idx;
- update->k.size = next_idx - *idx;
+ update->k.size = p.k->size;
set_bkey_val_u64s(&update->k, 0);
}
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index cc2672c120..678b9c20e2 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -6,12 +6,15 @@
#include "replicas.h"
#include "super-io.h"
+#include <linux/sort.h>
+
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
struct bch_replicas_cpu *);
/* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */
-static int bch2_memcmp(const void *l, const void *r, size_t size)
+static int bch2_memcmp(const void *l, const void *r, const void *priv)
{
+ size_t size = (size_t) priv;
return memcmp(l, r, size);
}
@@ -39,7 +42,8 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
- eytzinger0_sort(r->entries, r->nr, r->entry_size, bch2_memcmp, NULL);
+ eytzinger0_sort_r(r->entries, r->nr, r->entry_size,
+ bch2_memcmp, NULL, (void *)(size_t)r->entry_size);
}
static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
@@ -228,7 +232,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
verify_replicas_entry(search);
-#define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size)
+#define entry_cmp(_l, _r) memcmp(_l, _r, entry_size)
idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
entry_cmp, search);
#undef entry_cmp
@@ -824,10 +828,11 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
{
unsigned i;
- sort_cmp_size(cpu_r->entries,
- cpu_r->nr,
- cpu_r->entry_size,
- bch2_memcmp, NULL);
+ sort_r(cpu_r->entries,
+ cpu_r->nr,
+ cpu_r->entry_size,
+ bch2_memcmp, NULL,
+ (void *)(size_t)cpu_r->entry_size);
for (i = 0; i < cpu_r->nr; i++) {
struct bch_replicas_entry_v1 *e =
diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c
index b6bf0ebe7e..194e55b111 100644
--- a/fs/bcachefs/sb-clean.c
+++ b/fs/bcachefs/sb-clean.c
@@ -29,6 +29,14 @@ int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *cle
for (entry = clean->start;
entry < (struct jset_entry *) vstruct_end(&clean->field);
entry = vstruct_next(entry)) {
+ if (vstruct_end(entry) > vstruct_end(&clean->field)) {
+ bch_err(c, "journal entry (u64s %u) overran end of superblock clean section (u64s %u) by %zu",
+ le16_to_cpu(entry->u64s), le32_to_cpu(clean->field.u64s),
+ (u64 *) vstruct_end(entry) - (u64 *) vstruct_end(&clean->field));
+ bch2_sb_error_count(c, BCH_FSCK_ERR_sb_clean_entry_overrun);
+ return -BCH_ERR_fsck_repair_unimplemented;
+ }
+
ret = bch2_journal_entry_validate(c, NULL, entry,
le16_to_cpu(c->disk_sb.sb->version),
BCH_SB_BIG_ENDIAN(c->disk_sb.sb),
@@ -171,22 +179,6 @@ fsck_err:
return ERR_PTR(ret);
}
-static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
-{
- struct jset_entry *entry = *end;
- unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
-
- memset(entry, 0, u64s * sizeof(u64));
- /*
- * The u64s field counts from the start of data, ignoring the shared
- * fields.
- */
- entry->u64s = cpu_to_le16(u64s - 1);
-
- *end = vstruct_next(*end);
- return entry;
-}
-
void bch2_journal_super_entries_add_common(struct bch_fs *c,
struct jset_entry **end,
u64 journal_seq)
@@ -286,6 +278,17 @@ static int bch2_sb_clean_validate(struct bch_sb *sb,
return -BCH_ERR_invalid_sb_clean;
}
+ for (struct jset_entry *entry = clean->start;
+ entry != vstruct_end(&clean->field);
+ entry = vstruct_next(entry)) {
+ if ((void *) vstruct_next(entry) > vstruct_end(&clean->field)) {
+ prt_str(err, "entry type ");
+ bch2_prt_jset_entry_type(err, le16_to_cpu(entry->type));
+ prt_str(err, " overruns end of section");
+ return -BCH_ERR_invalid_sb_clean;
+ }
+ }
+
return 0;
}
@@ -303,6 +306,9 @@ static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb,
for (entry = clean->start;
entry != vstruct_end(&clean->field);
entry = vstruct_next(entry)) {
+ if ((void *) vstruct_next(entry) > vstruct_end(&clean->field))
+ break;
+
if (entry->type == BCH_JSET_ENTRY_btree_keys &&
!entry->u64s)
continue;
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index 441dcb1bf1..a98ef940b7 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -7,7 +7,7 @@
#include "bcachefs.h"
#include "darray.h"
-#include "recovery.h"
+#include "recovery_passes.h"
#include "sb-downgrade.h"
#include "sb-errors.h"
#include "super-io.h"
@@ -45,7 +45,16 @@
BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \
BCH_FSCK_ERR_unlinked_inode_not_on_deleted_list) \
x(rebalance_work, \
- BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance))
+ BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) \
+ x(subvolume_fs_parent, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_dirents), \
+ BCH_FSCK_ERR_subvol_fs_path_parent_wrong) \
+ x(btree_subvolume_children, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_subvols), \
+ BCH_FSCK_ERR_subvol_children_not_set) \
+ x(mi_btree_bitmap, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
+ BCH_FSCK_ERR_btree_bitmap_not_marked)
#define DOWNGRADE_TABLE()
@@ -253,7 +262,7 @@ void bch2_sb_set_downgrade(struct bch_fs *c, unsigned new_minor, unsigned old_mi
if (e < BCH_SB_ERR_MAX)
__set_bit(e, c->sb.errors_silent);
if (e < sizeof(ext->errors_silent) * 8)
- ext->errors_silent[e / 64] |= cpu_to_le64(BIT_ULL(e % 64));
+ __set_bit_le64(e, ext->errors_silent);
}
}
}
diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h
index c08aacdfd0..06c7a644f4 100644
--- a/fs/bcachefs/sb-errors_types.h
+++ b/fs/bcachefs/sb-errors_types.h
@@ -130,7 +130,7 @@
x(bucket_gens_nonzero_for_invalid_buckets, 122) \
x(need_discard_freespace_key_to_invalid_dev_bucket, 123) \
x(need_discard_freespace_key_bad, 124) \
- x(backpointer_pos_wrong, 125) \
+ x(backpointer_bucket_offset_wrong, 125) \
x(backpointer_to_missing_device, 126) \
x(backpointer_to_missing_alloc, 127) \
x(backpointer_to_missing_ptr, 128) \
@@ -231,7 +231,7 @@
x(dirent_name_dot_or_dotdot, 223) \
x(dirent_name_has_slash, 224) \
x(dirent_d_type_wrong, 225) \
- x(dirent_d_parent_subvol_wrong, 226) \
+ x(inode_bi_parent_wrong, 226) \
x(dirent_in_missing_dir_inode, 227) \
x(dirent_in_non_dir_inode, 228) \
x(dirent_to_missing_inode, 229) \
@@ -250,7 +250,29 @@
x(hash_table_key_duplicate, 242) \
x(hash_table_key_wrong_offset, 243) \
x(unlinked_inode_not_on_deleted_list, 244) \
- x(reflink_p_front_pad_bad, 245)
+ x(reflink_p_front_pad_bad, 245) \
+ x(journal_entry_dup_same_device, 246) \
+ x(inode_bi_subvol_missing, 247) \
+ x(inode_bi_subvol_wrong, 248) \
+ x(inode_points_to_missing_dirent, 249) \
+ x(inode_points_to_wrong_dirent, 250) \
+ x(inode_bi_parent_nonzero, 251) \
+ x(dirent_to_missing_parent_subvol, 252) \
+ x(dirent_not_visible_in_parent_subvol, 253) \
+ x(subvol_fs_path_parent_wrong, 254) \
+ x(subvol_root_fs_path_parent_nonzero, 255) \
+ x(subvol_children_not_set, 256) \
+ x(subvol_children_bad, 257) \
+ x(subvol_loop, 258) \
+ x(subvol_unreachable, 259) \
+ x(btree_node_bkey_bad_u64s, 260) \
+ x(btree_node_topology_empty_interior_node, 261) \
+ x(btree_ptr_v2_min_key_bad, 262) \
+ x(btree_root_unreadable_and_scan_found_nothing, 263) \
+ x(snapshot_node_missing, 264) \
+ x(dup_backpointer_to_bad_csum_extent, 265) \
+ x(btree_bitmap_not_marked, 266) \
+ x(sb_clean_entry_overrun, 267)
enum bch_sb_error_id {
#define x(t, n) BCH_FSCK_ERR_##t = n,
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
index eff5ce18c6..44b3f0cb7b 100644
--- a/fs/bcachefs/sb-members.c
+++ b/fs/bcachefs/sb-members.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "btree_cache.h"
#include "disk_groups.h"
#include "opts.h"
#include "replicas.h"
@@ -123,9 +124,9 @@ static int validate_member(struct printbuf *err,
struct bch_sb *sb,
int i)
{
- if (le64_to_cpu(m.nbuckets) > LONG_MAX) {
- prt_printf(err, "device %u: too many buckets (got %llu, max %lu)",
- i, le64_to_cpu(m.nbuckets), LONG_MAX);
+ if (le64_to_cpu(m.nbuckets) > BCH_MEMBER_NBUCKETS_MAX) {
+ prt_printf(err, "device %u: too many buckets (got %llu, max %u)",
+ i, le64_to_cpu(m.nbuckets), BCH_MEMBER_NBUCKETS_MAX);
return -BCH_ERR_invalid_sb_members;
}
@@ -426,3 +427,55 @@ void bch2_dev_errors_reset(struct bch_dev *ca)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
}
+
+/*
+ * Per member "range has btree nodes" bitmap:
+ *
+ * This is so that if we ever have to run the btree node scan to repair we don't
+ * have to scan full devices:
+ */
+
+bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k)
+{
+ bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr)
+ if (!bch2_dev_btree_bitmap_marked_sectors(bch_dev_bkey_exists(c, ptr->dev),
+ ptr->offset, btree_sectors(c)))
+ return false;
+ return true;
+}
+
+static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev,
+ u64 start, unsigned sectors)
+{
+ struct bch_member *m = __bch2_members_v2_get_mut(mi, dev);
+ u64 bitmap = le64_to_cpu(m->btree_allocated_bitmap);
+
+ u64 end = start + sectors;
+
+ int resize = ilog2(roundup_pow_of_two(end)) - (m->btree_bitmap_shift + 6);
+ if (resize > 0) {
+ u64 new_bitmap = 0;
+
+ for (unsigned i = 0; i < 64; i++)
+ if (bitmap & BIT_ULL(i))
+ new_bitmap |= BIT_ULL(i >> resize);
+ bitmap = new_bitmap;
+ m->btree_bitmap_shift += resize;
+ }
+
+ for (unsigned bit = start >> m->btree_bitmap_shift;
+ (u64) bit << m->btree_bitmap_shift < end;
+ bit++)
+ bitmap |= BIT_ULL(bit);
+
+ m->btree_allocated_bitmap = cpu_to_le64(bitmap);
+}
+
+void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k)
+{
+ lockdep_assert_held(&c->sb_lock);
+
+ struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+ bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr)
+ __bch2_dev_btree_bitmap_mark(mi, ptr->dev, ptr->offset, btree_sectors(c));
+}
diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h
index be0a941832..5bf27d30ca 100644
--- a/fs/bcachefs/sb-members.h
+++ b/fs/bcachefs/sb-members.h
@@ -3,6 +3,7 @@
#define _BCACHEFS_SB_MEMBERS_H
#include "darray.h"
+#include "bkey_types.h"
extern char * const bch2_member_error_strs[];
@@ -106,10 +107,10 @@ static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, struct bch_dev *
static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca)
{
+ rcu_read_lock();
if (ca)
percpu_ref_put(&ca->ref);
- rcu_read_lock();
if ((ca = __bch2_next_dev(c, ca, NULL)))
percpu_ref_get(&ca->ref);
rcu_read_unlock();
@@ -131,10 +132,10 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
struct bch_dev *ca,
unsigned state_mask)
{
+ rcu_read_lock();
if (ca)
percpu_ref_put(&ca->io_ref);
- rcu_read_lock();
while ((ca = __bch2_next_dev(c, ca, NULL)) &&
(!((1 << ca->mi.state) & state_mask) ||
!percpu_ref_tryget(&ca->io_ref)))
@@ -220,6 +221,8 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
: 1,
.freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi),
.valid = bch2_member_exists(mi),
+ .btree_bitmap_shift = mi->btree_bitmap_shift,
+ .btree_allocated_bitmap = le64_to_cpu(mi->btree_allocated_bitmap),
};
}
@@ -228,4 +231,22 @@ void bch2_sb_members_from_cpu(struct bch_fs *);
void bch2_dev_io_errors_to_text(struct printbuf *, struct bch_dev *);
void bch2_dev_errors_reset(struct bch_dev *);
+static inline bool bch2_dev_btree_bitmap_marked_sectors(struct bch_dev *ca, u64 start, unsigned sectors)
+{
+ u64 end = start + sectors;
+
+ if (end > 64ULL << ca->mi.btree_bitmap_shift)
+ return false;
+
+ for (unsigned bit = start >> ca->mi.btree_bitmap_shift;
+ (u64) bit << ca->mi.btree_bitmap_shift < end;
+ bit++)
+ if (!(ca->mi.btree_allocated_bitmap & BIT_ULL(bit)))
+ return false;
+ return true;
+}
+
+bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c);
+void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c);
+
#endif /* _BCACHEFS_SB_MEMBERS_H */
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index ac6ba04d55..544322d5c2 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -8,6 +8,7 @@
#include "errcode.h"
#include "error.h"
#include "fs.h"
+#include "recovery_passes.h"
#include "snapshot.h"
#include <linux/random.h>
@@ -91,23 +92,29 @@ static int bch2_snapshot_tree_create(struct btree_trans *trans,
/* Snapshot nodes: */
-static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
+static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor)
{
- struct snapshot_table *t;
+ while (id && id < ancestor) {
+ const struct snapshot_t *s = __snapshot_t(t, id);
+ id = s ? s->parent : 0;
+ }
+ return id == ancestor;
+}
+static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
+{
rcu_read_lock();
- t = rcu_dereference(c->snapshots);
-
- while (id && id < ancestor)
- id = __snapshot_t(t, id)->parent;
+ bool ret = __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
rcu_read_unlock();
- return id == ancestor;
+ return ret;
}
static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
{
const struct snapshot_t *s = __snapshot_t(t, id);
+ if (!s)
+ return 0;
if (s->skip[2] <= ancestor)
return s->skip[2];
@@ -118,27 +125,36 @@ static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ances
return s->parent;
}
+static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor)
+{
+ const struct snapshot_t *s = __snapshot_t(t, id);
+ if (!s)
+ return false;
+
+ return test_bit(ancestor - id - 1, s->is_ancestor);
+}
+
bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
{
- struct snapshot_table *t;
bool ret;
- EBUG_ON(c->recovery_pass_done <= BCH_RECOVERY_PASS_check_snapshots);
-
rcu_read_lock();
- t = rcu_dereference(c->snapshots);
+ struct snapshot_table *t = rcu_dereference(c->snapshots);
+
+ if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) {
+ ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor);
+ goto out;
+ }
while (id && id < ancestor - IS_ANCESTOR_BITMAP)
id = get_ancestor_below(t, id, ancestor);
- if (id && id < ancestor) {
- ret = test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor);
-
- EBUG_ON(ret != bch2_snapshot_is_ancestor_early(c, id, ancestor));
- } else {
- ret = id == ancestor;
- }
+ ret = id && id < ancestor
+ ? test_ancestor_bitmap(t, id, ancestor)
+ : id == ancestor;
+ EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor));
+out:
rcu_read_unlock();
return ret;
@@ -147,36 +163,39 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
{
size_t idx = U32_MAX - id;
- size_t new_size;
struct snapshot_table *new, *old;
- new_size = max(16UL, roundup_pow_of_two(idx + 1));
+ size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1));
+ size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]);
- new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL);
+ new = kvzalloc(new_bytes, GFP_KERNEL);
if (!new)
return NULL;
+ new->nr = new_size;
+
old = rcu_dereference_protected(c->snapshots, true);
if (old)
- memcpy(new->s,
- rcu_dereference_protected(c->snapshots, true)->s,
- sizeof(new->s[0]) * c->snapshot_table_size);
+ memcpy(new->s, old->s, sizeof(old->s[0]) * old->nr);
rcu_assign_pointer(c->snapshots, new);
- c->snapshot_table_size = new_size;
- kvfree_rcu_mightsleep(old);
+ kvfree_rcu(old, rcu);
- return &rcu_dereference_protected(c->snapshots, true)->s[idx];
+ return &rcu_dereference_protected(c->snapshots,
+ lockdep_is_held(&c->snapshot_table_lock))->s[idx];
}
static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
{
size_t idx = U32_MAX - id;
+ struct snapshot_table *table =
+ rcu_dereference_protected(c->snapshots,
+ lockdep_is_held(&c->snapshot_table_lock));
lockdep_assert_held(&c->snapshot_table_lock);
- if (likely(idx < c->snapshot_table_size))
- return &rcu_dereference_protected(c->snapshots, true)->s[idx];
+ if (likely(table && idx < table->nr))
+ return &table->s[idx];
return __snapshot_t_mut(c, id);
}
@@ -547,7 +566,7 @@ static int check_snapshot_tree(struct btree_trans *trans,
"snapshot tree points to missing subvolume:\n %s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
- fsck_err_on(!bch2_snapshot_is_ancestor_early(c,
+ fsck_err_on(!bch2_snapshot_is_ancestor(c,
le32_to_cpu(subvol.snapshot),
root_id),
c, snapshot_tree_to_wrong_subvol,
@@ -563,6 +582,13 @@ static int check_snapshot_tree(struct btree_trans *trans,
u32 subvol_id;
ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
+ bch_err_fn(c, ret);
+
+ if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */
+ ret = 0;
+ goto err;
+ }
+
if (ret)
goto err;
@@ -720,7 +746,6 @@ static int check_snapshot(struct btree_trans *trans,
u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
u32 real_depth;
struct printbuf buf = PRINTBUF;
- bool should_have_subvol;
u32 i, id;
int ret = 0;
@@ -766,7 +791,7 @@ static int check_snapshot(struct btree_trans *trans,
}
}
- should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
+ bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
!BCH_SNAPSHOT_DELETED(&s);
if (should_have_subvol) {
@@ -868,6 +893,154 @@ int bch2_check_snapshots(struct bch_fs *c)
return ret;
}
+static int check_snapshot_exists(struct btree_trans *trans, u32 id)
+{
+ struct bch_fs *c = trans->c;
+
+ if (bch2_snapshot_equiv(c, id))
+ return 0;
+
+ u32 tree_id;
+ int ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
+ if (ret)
+ return ret;
+
+ struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot));
+ ret = PTR_ERR_OR_ZERO(snapshot);
+ if (ret)
+ return ret;
+
+ bkey_snapshot_init(&snapshot->k_i);
+ snapshot->k.p = POS(0, id);
+ snapshot->v.tree = cpu_to_le32(tree_id);
+ snapshot->v.btime.lo = cpu_to_le64(bch2_current_time(c));
+
+ return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?:
+ bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
+ bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0) ?:
+ bch2_snapshot_set_equiv(trans, bkey_i_to_s_c(&snapshot->k_i));
+}
+
+/* Figure out which snapshot nodes belong in the same tree: */
+struct snapshot_tree_reconstruct {
+ enum btree_id btree;
+ struct bpos cur_pos;
+ snapshot_id_list cur_ids;
+ DARRAY(snapshot_id_list) trees;
+};
+
+static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r)
+{
+ darray_for_each(r->trees, i)
+ darray_exit(i);
+ darray_exit(&r->trees);
+ darray_exit(&r->cur_ids);
+}
+
+static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos)
+{
+ return r->btree == BTREE_ID_inodes
+ ? r->cur_pos.offset == pos.offset
+ : r->cur_pos.inode == pos.inode;
+}
+
+static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r)
+{
+ darray_for_each(*l, i)
+ if (snapshot_list_has_id(r, *i))
+ return true;
+ return false;
+}
+
+static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s)
+{
+ bool first = true;
+ darray_for_each(*s, i) {
+ if (!first)
+ prt_char(out, ' ');
+ first = false;
+ prt_printf(out, "%u", *i);
+ }
+}
+
+static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r)
+{
+ if (r->cur_ids.nr) {
+ darray_for_each(r->trees, i)
+ if (snapshot_id_lists_have_common(i, &r->cur_ids)) {
+ int ret = snapshot_list_merge(c, i, &r->cur_ids);
+ if (ret)
+ return ret;
+ goto out;
+ }
+ darray_push(&r->trees, r->cur_ids);
+ darray_init(&r->cur_ids);
+ }
+out:
+ r->cur_ids.nr = 0;
+ return 0;
+}
+
+static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos)
+{
+ if (!same_snapshot(r, pos))
+ snapshot_tree_reconstruct_next(c, r);
+ r->cur_pos = pos;
+ return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot);
+}
+
+int bch2_reconstruct_snapshots(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct printbuf buf = PRINTBUF;
+ struct snapshot_tree_reconstruct r = {};
+ int ret = 0;
+
+ for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
+ if (btree_type_has_snapshots(btree)) {
+ r.btree = btree;
+
+ ret = for_each_btree_key(trans, iter, btree, POS_MIN,
+ BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_PREFETCH, k, ({
+ get_snapshot_trees(c, &r, k.k->p);
+ }));
+ if (ret)
+ goto err;
+
+ snapshot_tree_reconstruct_next(c, &r);
+ }
+ }
+
+ darray_for_each(r.trees, t) {
+ printbuf_reset(&buf);
+ snapshot_id_list_to_text(&buf, t);
+
+ darray_for_each(*t, id) {
+ if (fsck_err_on(!bch2_snapshot_equiv(c, *id),
+ c, snapshot_node_missing,
+ "snapshot node %u from tree %s missing", *id, buf.buf)) {
+ if (t->nr > 1) {
+ bch_err(c, "cannot reconstruct snapshot trees with multiple nodes");
+ ret = -BCH_ERR_fsck_repair_unimplemented;
+ goto err;
+ }
+
+ ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_snapshot_exists(trans, *id));
+ if (ret)
+ goto err;
+ }
+ }
+ }
+fsck_err:
+err:
+ bch2_trans_put(trans);
+ snapshot_tree_reconstruct_exit(&r);
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
/*
* Mark a snapshot as deleted, for future cleanup:
*/
@@ -1678,6 +1851,20 @@ int bch2_snapshots_read(struct bch_fs *c)
POS_MIN, 0, k,
(set_is_ancestor_bitmap(c, k.k->p.offset), 0)));
bch_err_fn(c, ret);
+
+ /*
+ * It's important that we check if we need to reconstruct snapshots
+ * before going RW, so we mark that pass as required in the superblock -
+ * otherwise, we could end up deleting keys with missing snapshot nodes
+ * instead
+ */
+ BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) &&
+ test_bit(BCH_FS_may_go_rw, &c->flags));
+
+ if (bch2_err_matches(ret, EIO) ||
+ (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots)))
+ ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots);
+
return ret;
}
diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h
index 7c66ffc063..b7d2fed37c 100644
--- a/fs/bcachefs/snapshot.h
+++ b/fs/bcachefs/snapshot.h
@@ -33,7 +33,11 @@ int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned,
static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id)
{
- return &t->s[U32_MAX - id];
+ u32 idx = U32_MAX - id;
+
+ return likely(t && idx < t->nr)
+ ? &t->s[idx]
+ : NULL;
}
static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
@@ -44,7 +48,8 @@ static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id)
{
rcu_read_lock();
- id = snapshot_t(c, id)->tree;
+ const struct snapshot_t *s = snapshot_t(c, id);
+ id = s ? s->tree : 0;
rcu_read_unlock();
return id;
@@ -52,7 +57,8 @@ static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id)
static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
{
- return snapshot_t(c, id)->parent;
+ const struct snapshot_t *s = snapshot_t(c, id);
+ return s ? s->parent : 0;
}
static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
@@ -66,19 +72,19 @@ static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id)
{
-#ifdef CONFIG_BCACHEFS_DEBUG
- u32 parent = snapshot_t(c, id)->parent;
+ const struct snapshot_t *s = snapshot_t(c, id);
+ if (!s)
+ return 0;
- if (parent &&
- snapshot_t(c, id)->depth != snapshot_t(c, parent)->depth + 1)
+ u32 parent = s->parent;
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBU) &&
+ parent &&
+ s->depth != snapshot_t(c, parent)->depth + 1)
panic("id %u depth=%u parent %u depth=%u\n",
id, snapshot_t(c, id)->depth,
parent, snapshot_t(c, parent)->depth);
return parent;
-#else
- return snapshot_t(c, id)->parent;
-#endif
}
static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id)
@@ -116,7 +122,8 @@ static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id)
static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id)
{
- return snapshot_t(c, id)->equiv;
+ const struct snapshot_t *s = snapshot_t(c, id);
+ return s ? s->equiv : 0;
}
static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id)
@@ -133,38 +140,22 @@ static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id)
return id == bch2_snapshot_equiv(c, id);
}
-static inline bool bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id)
+static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id)
{
- const struct snapshot_t *s;
- bool ret;
-
rcu_read_lock();
- s = snapshot_t(c, id);
- ret = s->children[0];
+ const struct snapshot_t *s = snapshot_t(c, id);
+ int ret = s ? s->children[0] : -BCH_ERR_invalid_snapshot_node;
rcu_read_unlock();
return ret;
}
-static inline u32 bch2_snapshot_is_leaf(struct bch_fs *c, u32 id)
-{
- return !bch2_snapshot_is_internal_node(c, id);
-}
-
-static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id)
+static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id)
{
- const struct snapshot_t *s;
- u32 parent = __bch2_snapshot_parent(c, id);
-
- if (!parent)
- return 0;
-
- s = snapshot_t(c, __bch2_snapshot_parent(c, id));
- if (id == s->children[0])
- return s->children[1];
- if (id == s->children[1])
- return s->children[0];
- return 0;
+ int ret = bch2_snapshot_is_internal_node(c, id);
+ if (ret < 0)
+ return ret;
+ return !ret;
}
static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent)
@@ -218,15 +209,34 @@ static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list
static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id)
{
- int ret;
-
BUG_ON(snapshot_list_has_id(s, id));
- ret = darray_push(s, id);
+ int ret = darray_push(s, id);
if (ret)
bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size);
return ret;
}
+static inline int snapshot_list_add_nodup(struct bch_fs *c, snapshot_id_list *s, u32 id)
+{
+ int ret = snapshot_list_has_id(s, id)
+ ? 0
+ : darray_push(s, id);
+ if (ret)
+ bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size);
+ return ret;
+}
+
+static inline int snapshot_list_merge(struct bch_fs *c, snapshot_id_list *dst, snapshot_id_list *src)
+{
+ darray_for_each(*src, i) {
+ int ret = snapshot_list_add_nodup(c, dst, *i);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
struct bch_snapshot *s);
int bch2_snapshot_get_subvol(struct btree_trans *, u32,
@@ -238,6 +248,7 @@ int bch2_snapshot_node_create(struct btree_trans *, u32,
int bch2_check_snapshot_trees(struct bch_fs *);
int bch2_check_snapshots(struct bch_fs *);
+int bch2_reconstruct_snapshots(struct bch_fs *);
int bch2_snapshot_node_set_deleted(struct btree_trans *, u32);
void bch2_delete_dead_snapshots_work(struct work_struct *);
@@ -249,7 +260,7 @@ static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
struct bpos pos)
{
if (!btree_type_has_snapshots(id) ||
- bch2_snapshot_is_leaf(trans->c, pos.snapshot))
+ bch2_snapshot_is_leaf(trans->c, pos.snapshot) > 0)
return 0;
return __bch2_key_has_snapshot_overwrites(trans, id, pos);
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index fcaa5a8887..3976f80721 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -259,7 +259,7 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans,
}
static __always_inline
-int bch2_hash_set_snapshot(struct btree_trans *trans,
+int bch2_hash_set_in_snapshot(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
subvol_inum inum, u32 snapshot,
@@ -328,17 +328,12 @@ int bch2_hash_set(struct btree_trans *trans,
struct bkey_i *insert,
bch_str_hash_flags_t str_hash_flags)
{
- u32 snapshot;
- int ret;
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- return ret;
-
insert->k.p.inode = inum.inum;
- return bch2_hash_set_snapshot(trans, desc, info, inum,
- snapshot, insert, str_hash_flags, 0);
+ u32 snapshot;
+ return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
+ bch2_hash_set_in_snapshot(trans, desc, info, inum,
+ snapshot, insert, str_hash_flags, 0);
}
static __always_inline
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
index 7c67c28d3e..88a79c8232 100644
--- a/fs/bcachefs/subvolume.c
+++ b/fs/bcachefs/subvolume.c
@@ -13,13 +13,26 @@
static int bch2_subvolume_delete(struct btree_trans *, u32);
+static struct bpos subvolume_children_pos(struct bkey_s_c k)
+{
+ if (k.k->type != KEY_TYPE_subvolume)
+ return POS_MIN;
+
+ struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
+ if (!s.v->fs_path_parent)
+ return POS_MIN;
+ return POS(le32_to_cpu(s.v->fs_path_parent), s.k->p.offset);
+}
+
static int check_subvol(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct bkey_s_c_subvolume subvol;
+ struct btree_iter subvol_children_iter = {};
struct bch_snapshot snapshot;
+ struct printbuf buf = PRINTBUF;
unsigned snapid;
int ret = 0;
@@ -42,6 +55,72 @@ static int check_subvol(struct btree_trans *trans,
return ret ?: -BCH_ERR_transaction_restart_nested;
}
+ if (fsck_err_on(subvol.k->p.offset == BCACHEFS_ROOT_SUBVOL &&
+ subvol.v->fs_path_parent,
+ c, subvol_root_fs_path_parent_nonzero,
+ "root subvolume has nonzero fs_path_parent\n%s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ struct bkey_i_subvolume *n =
+ bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume);
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ goto err;
+
+ n->v.fs_path_parent = 0;
+ }
+
+ if (subvol.v->fs_path_parent) {
+ struct bpos pos = subvolume_children_pos(k);
+
+ struct bkey_s_c subvol_children_k =
+ bch2_bkey_get_iter(trans, &subvol_children_iter,
+ BTREE_ID_subvolume_children, pos, 0);
+ ret = bkey_err(subvol_children_k);
+ if (ret)
+ goto err;
+
+ if (fsck_err_on(subvol_children_k.k->type != KEY_TYPE_set,
+ c, subvol_children_not_set,
+ "subvolume not set in subvolume_children btree at %llu:%llu\n%s",
+ pos.inode, pos.offset,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, true);
+ if (ret)
+ goto err;
+ }
+ }
+
+ struct bch_inode_unpacked inode;
+ struct btree_iter inode_iter = {};
+ ret = bch2_inode_peek_nowarn(trans, &inode_iter, &inode,
+ (subvol_inum) { k.k->p.offset, le64_to_cpu(subvol.v->inode) },
+ 0);
+ bch2_trans_iter_exit(trans, &inode_iter);
+
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ return ret;
+
+ if (fsck_err_on(ret, c, subvol_to_missing_root,
+ "subvolume %llu points to missing subvolume root %llu:%u",
+ k.k->p.offset, le64_to_cpu(subvol.v->inode),
+ le32_to_cpu(subvol.v->snapshot))) {
+ ret = bch2_subvolume_delete(trans, iter->pos.offset);
+ bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
+ return ret ?: -BCH_ERR_transaction_restart_nested;
+ }
+
+ if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset,
+ c, subvol_root_wrong_bi_subvol,
+ "subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu",
+ inode.bi_inum, inode_iter.k.p.snapshot,
+ inode.bi_subvol, subvol.k->p.offset)) {
+ inode.bi_subvol = subvol.k->p.offset;
+ ret = __bch2_fsck_write_inode(trans, &inode, le32_to_cpu(subvol.v->snapshot));
+ if (ret)
+ goto err;
+ }
+
if (!BCH_SUBVOLUME_SNAP(subvol.v)) {
u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot));
u32 snapshot_tree;
@@ -72,8 +151,10 @@ static int check_subvol(struct btree_trans *trans,
SET_BCH_SUBVOLUME_SNAP(&s->v, true);
}
}
-
+err:
fsck_err:
+ bch2_trans_iter_exit(trans, &subvol_children_iter);
+ printbuf_exit(&buf);
return ret;
}
@@ -88,6 +169,42 @@ int bch2_check_subvols(struct bch_fs *c)
return ret;
}
+static int check_subvol_child(struct btree_trans *trans,
+ struct btree_iter *child_iter,
+ struct bkey_s_c child_k)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_subvolume s;
+ int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, child_k.k->p.offset),
+ 0, subvolume, &s);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ return ret;
+
+ if (fsck_err_on(ret ||
+ le32_to_cpu(s.fs_path_parent) != child_k.k->p.inode,
+ c, subvol_children_bad,
+ "incorrect entry in subvolume_children btree %llu:%llu",
+ child_k.k->p.inode, child_k.k->p.offset)) {
+ ret = bch2_btree_delete_at(trans, child_iter, 0);
+ if (ret)
+ goto err;
+ }
+err:
+fsck_err:
+ return ret;
+}
+
+int bch2_check_subvol_children(struct bch_fs *c)
+{
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_subvolume_children, POS_MIN, BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_subvol_child(trans, &iter, k)));
+ bch_err_fn(c, ret);
+ return 0;
+}
+
/* Subvolumes: */
int bch2_subvolume_invalid(struct bch_fs *c, struct bkey_s_c k,
@@ -112,8 +229,50 @@ void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
le64_to_cpu(s.v->inode),
le32_to_cpu(s.v->snapshot));
- if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, parent))
- prt_printf(out, " parent %u", le32_to_cpu(s.v->parent));
+ if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, creation_parent)) {
+ prt_printf(out, " creation_parent %u", le32_to_cpu(s.v->creation_parent));
+ prt_printf(out, " fs_parent %u", le32_to_cpu(s.v->fs_path_parent));
+ }
+}
+
+static int subvolume_children_mod(struct btree_trans *trans, struct bpos pos, bool set)
+{
+ return !bpos_eq(pos, POS_MIN)
+ ? bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, set)
+ : 0;
+}
+
+int bch2_subvolume_trigger(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s new,
+ unsigned flags)
+{
+ if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ struct bpos children_pos_old = subvolume_children_pos(old);
+ struct bpos children_pos_new = subvolume_children_pos(new.s_c);
+
+ if (!bpos_eq(children_pos_old, children_pos_new)) {
+ int ret = subvolume_children_mod(trans, children_pos_old, false) ?:
+ subvolume_children_mod(trans, children_pos_new, true);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol)
+{
+ struct btree_iter iter;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolume_children, POS(subvol, 0), 0);
+ struct bkey_s_c k = bch2_btree_iter_peek(&iter);
+ bch2_trans_iter_exit(trans, &iter);
+
+ return bkey_err(k) ?: k.k && k.k->p.inode == subvol
+ ? -BCH_ERR_ENOTEMPTY_subvol_not_empty
+ : 0;
}
static __always_inline int
@@ -197,8 +356,8 @@ static int bch2_subvolume_reparent(struct btree_trans *trans,
if (k.k->type != KEY_TYPE_subvolume)
return 0;
- if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, parent) &&
- le32_to_cpu(bkey_s_c_to_subvolume(k).v->parent) != old_parent)
+ if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, creation_parent) &&
+ le32_to_cpu(bkey_s_c_to_subvolume(k).v->creation_parent) != old_parent)
return 0;
s = bch2_bkey_make_mut_typed(trans, iter, &k, 0, subvolume);
@@ -206,7 +365,7 @@ static int bch2_subvolume_reparent(struct btree_trans *trans,
if (ret)
return ret;
- s->v.parent = cpu_to_le32(new_parent);
+ s->v.creation_parent = cpu_to_le32(new_parent);
return 0;
}
@@ -229,7 +388,7 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d
BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_subvolume_reparent(trans, &iter, k,
- subvolid_to_delete, le32_to_cpu(s.parent)));
+ subvolid_to_delete, le32_to_cpu(s.creation_parent)));
}
/*
@@ -360,6 +519,7 @@ int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid)
}
int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
+ u32 parent_subvolid,
u32 src_subvolid,
u32 *new_subvolid,
u32 *new_snapshotid,
@@ -416,12 +576,13 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
if (ret)
goto err;
- new_subvol->v.flags = 0;
- new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]);
- new_subvol->v.inode = cpu_to_le64(inode);
- new_subvol->v.parent = cpu_to_le32(src_subvolid);
- new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c));
- new_subvol->v.otime.hi = 0;
+ new_subvol->v.flags = 0;
+ new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]);
+ new_subvol->v.inode = cpu_to_le64(inode);
+ new_subvol->v.creation_parent = cpu_to_le32(src_subvolid);
+ new_subvol->v.fs_path_parent = cpu_to_le32(parent_subvolid);
+ new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c));
+ new_subvol->v.otime.hi = 0;
SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro);
SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0);
@@ -434,6 +595,78 @@ err:
return ret;
}
+int bch2_initialize_subvolumes(struct bch_fs *c)
+{
+ struct bkey_i_snapshot_tree root_tree;
+ struct bkey_i_snapshot root_snapshot;
+ struct bkey_i_subvolume root_volume;
+ int ret;
+
+ bkey_snapshot_tree_init(&root_tree.k_i);
+ root_tree.k.p.offset = 1;
+ root_tree.v.master_subvol = cpu_to_le32(1);
+ root_tree.v.root_snapshot = cpu_to_le32(U32_MAX);
+
+ bkey_snapshot_init(&root_snapshot.k_i);
+ root_snapshot.k.p.offset = U32_MAX;
+ root_snapshot.v.flags = 0;
+ root_snapshot.v.parent = 0;
+ root_snapshot.v.subvol = cpu_to_le32(BCACHEFS_ROOT_SUBVOL);
+ root_snapshot.v.tree = cpu_to_le32(1);
+ SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true);
+
+ bkey_subvolume_init(&root_volume.k_i);
+ root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL;
+ root_volume.v.flags = 0;
+ root_volume.v.snapshot = cpu_to_le32(U32_MAX);
+ root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO);
+
+ ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?:
+ bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?:
+ bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_inode_unpacked inode;
+ int ret;
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ if (!bkey_is_inode(k.k)) {
+ bch_err(trans->c, "root inode not found");
+ ret = -BCH_ERR_ENOENT_inode;
+ goto err;
+ }
+
+ ret = bch2_inode_unpack(k, &inode);
+ BUG_ON(ret);
+
+ inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
+
+ ret = bch2_inode_write(trans, &iter, &inode);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+/* set bi_subvol on root inode */
+int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
+{
+ int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
+ __bch2_fs_upgrade_for_subvolumes(trans));
+ bch_err_fn(c, ret);
+ return ret;
+}
+
int bch2_fs_subvolumes_init(struct bch_fs *c)
{
INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work);
diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h
index a6f56f66e2..d2015d549b 100644
--- a/fs/bcachefs/subvolume.h
+++ b/fs/bcachefs/subvolume.h
@@ -8,17 +8,22 @@
enum bkey_invalid_flags;
int bch2_check_subvols(struct bch_fs *);
+int bch2_check_subvol_children(struct bch_fs *);
int bch2_subvolume_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s, unsigned);
#define bch2_bkey_ops_subvolume ((struct bkey_ops) { \
.key_invalid = bch2_subvolume_invalid, \
.val_to_text = bch2_subvolume_to_text, \
+ .trigger = bch2_subvolume_trigger, \
.min_val_size = 16, \
})
+int bch2_subvol_has_children(struct btree_trans *, u32);
int bch2_subvolume_get(struct btree_trans *, unsigned,
bool, int, struct bch_subvolume *);
int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
@@ -30,8 +35,10 @@ int bch2_delete_dead_snapshots(struct bch_fs *);
void bch2_delete_dead_snapshots_async(struct bch_fs *);
int bch2_subvolume_unlink(struct btree_trans *, u32);
-int bch2_subvolume_create(struct btree_trans *, u64, u32,
- u32 *, u32 *, bool);
+int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool);
+
+int bch2_initialize_subvolumes(struct bch_fs *);
+int bch2_fs_upgrade_for_subvolumes(struct bch_fs *);
int bch2_fs_subvolumes_init(struct bch_fs *);
diff --git a/fs/bcachefs/subvolume_format.h b/fs/bcachefs/subvolume_format.h
index af79134b07..e029df7ba8 100644
--- a/fs/bcachefs/subvolume_format.h
+++ b/fs/bcachefs/subvolume_format.h
@@ -19,8 +19,8 @@ struct bch_subvolume {
* This is _not_ necessarily the subvolume of the directory containing
* this subvolume:
*/
- __le32 parent;
- __le32 pad;
+ __le32 creation_parent;
+ __le32 fs_path_parent;
bch_le128 otime;
};
diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h
index ae644adfc3..9b10c89478 100644
--- a/fs/bcachefs/subvolume_types.h
+++ b/fs/bcachefs/subvolume_types.h
@@ -20,6 +20,8 @@ struct snapshot_t {
};
struct snapshot_table {
+ struct rcu_head rcu;
+ size_t nr;
#ifndef RUST_BINDGEN
DECLARE_FLEX_ARRAY(struct snapshot_t, s);
#else
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 36988add58..bfdb15e7d7 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -8,7 +8,7 @@
#include "journal.h"
#include "journal_sb.h"
#include "journal_seq_blacklist.h"
-#include "recovery.h"
+#include "recovery_passes.h"
#include "replicas.h"
#include "quota.h"
#include "sb-clean.h"
@@ -142,8 +142,8 @@ void bch2_sb_field_delete(struct bch_sb_handle *sb,
void bch2_free_super(struct bch_sb_handle *sb)
{
kfree(sb->bio);
- if (!IS_ERR_OR_NULL(sb->bdev_handle))
- bdev_release(sb->bdev_handle);
+ if (!IS_ERR_OR_NULL(sb->s_bdev_file))
+ bdev_fput(sb->s_bdev_file);
kfree(sb->holder);
kfree(sb->sb_name);
@@ -232,7 +232,7 @@ struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb,
struct bch_sb_handle *dev_sb = &ca->disk_sb;
if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) {
- percpu_ref_put(&ca->ref);
+ percpu_ref_put(&ca->io_ref);
return NULL;
}
}
@@ -470,6 +470,14 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
return ret;
}
+ if (rw == WRITE &&
+ bch2_sb_member_get(sb, sb->dev_idx).seq != sb->seq) {
+ prt_printf(out, "Invalid superblock: member seq %llu != sb seq %llu",
+ le64_to_cpu(bch2_sb_member_get(sb, sb->dev_idx).seq),
+ le64_to_cpu(sb->seq));
+ return -BCH_ERR_invalid_sb_members_missing;
+ }
+
return 0;
}
@@ -519,9 +527,11 @@ static void bch2_sb_update(struct bch_fs *c)
memset(c->sb.errors_silent, 0, sizeof(c->sb.errors_silent));
struct bch_sb_field_ext *ext = bch2_sb_field_get(src, ext);
- if (ext)
+ if (ext) {
le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent,
sizeof(c->sb.errors_silent) * 8);
+ c->sb.btrees_lost_data = le64_to_cpu(ext->btrees_lost_data);
+ }
for_each_member_device(c, ca) {
struct bch_member m = bch2_sb_member_get(src, ca->dev_idx);
@@ -639,7 +649,7 @@ reread:
bytes = vstruct_bytes(sb->sb);
- if (bytes > 512 << sb->sb->layout.sb_max_size_bits) {
+ if (bytes > 512ULL << min(BCH_SB_LAYOUT_SIZE_BITS_MAX, sb->sb->layout.sb_max_size_bits)) {
prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)",
bytes, 512UL << sb->sb->layout.sb_max_size_bits);
return -BCH_ERR_invalid_sb_too_big;
@@ -690,8 +700,11 @@ retry:
return -ENOMEM;
sb->sb_name = kstrdup(path, GFP_KERNEL);
- if (!sb->sb_name)
- return -ENOMEM;
+ if (!sb->sb_name) {
+ ret = -ENOMEM;
+ prt_printf(&err, "error allocating memory for sb_name");
+ goto err;
+ }
#ifndef __KERNEL__
if (opt_get(*opts, direct_io) == false)
@@ -704,22 +717,23 @@ retry:
if (!opt_get(*opts, nochanges))
sb->mode |= BLK_OPEN_WRITE;
- sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
- if (IS_ERR(sb->bdev_handle) &&
- PTR_ERR(sb->bdev_handle) == -EACCES &&
+ sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
+ if (IS_ERR(sb->s_bdev_file) &&
+ PTR_ERR(sb->s_bdev_file) == -EACCES &&
opt_get(*opts, read_only)) {
sb->mode &= ~BLK_OPEN_WRITE;
- sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
- if (!IS_ERR(sb->bdev_handle))
+ sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
+ if (!IS_ERR(sb->s_bdev_file))
opt_set(*opts, nochanges, true);
}
- if (IS_ERR(sb->bdev_handle)) {
- ret = PTR_ERR(sb->bdev_handle);
+ if (IS_ERR(sb->s_bdev_file)) {
+ ret = PTR_ERR(sb->s_bdev_file);
+ prt_printf(&err, "error opening %s: %s", path, bch2_err_str(ret));
goto err;
}
- sb->bdev = sb->bdev_handle->bdev;
+ sb->bdev = file_bdev(sb->s_bdev_file);
ret = bch2_sb_realloc(sb, 0);
if (ret) {
@@ -743,9 +757,9 @@ retry:
prt_printf(&err2, "bcachefs (%s): error reading default superblock: %s\n",
path, err.buf);
if (ret == -BCH_ERR_invalid_sb_magic && ignore_notbchfs_msg)
- printk(KERN_INFO "%s", err2.buf);
+ bch2_print_opts(opts, KERN_INFO "%s", err2.buf);
else
- printk(KERN_ERR "%s", err2.buf);
+ bch2_print_opts(opts, KERN_ERR "%s", err2.buf);
printbuf_exit(&err2);
printbuf_reset(&err);
@@ -803,21 +817,20 @@ got_super:
goto err;
}
- ret = 0;
sb->have_layout = true;
ret = bch2_sb_validate(sb, &err, READ);
if (ret) {
- printk(KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
- path, err.buf);
+ bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
+ path, err.buf);
goto err_no_print;
}
out:
printbuf_exit(&err);
return ret;
err:
- printk(KERN_ERR "bcachefs (%s): error reading superblock: %s\n",
- path, err.buf);
+ bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error reading superblock: %s\n",
+ path, err.buf);
err_no_print:
bch2_free_super(sb);
goto out;
@@ -910,6 +923,7 @@ int bch2_write_super(struct bch_fs *c)
struct bch_devs_mask sb_written;
bool wrote, can_mount_without_written, can_mount_with_written;
unsigned degraded_flags = BCH_FORCE_IF_DEGRADED;
+ DARRAY(struct bch_dev *) online_devices = {};
int ret = 0;
trace_and_count(c, write_super, c, _RET_IP_);
@@ -922,6 +936,15 @@ int bch2_write_super(struct bch_fs *c)
closure_init_stack(cl);
memset(&sb_written, 0, sizeof(sb_written));
+ for_each_online_member(c, ca) {
+ ret = darray_push(&online_devices, ca);
+ if (bch2_fs_fatal_err_on(ret, c, "%s: error allocating online devices", __func__)) {
+ percpu_ref_put(&ca->io_ref);
+ goto out;
+ }
+ percpu_ref_get(&ca->io_ref);
+ }
+
/* Make sure we're using the new magic numbers: */
c->disk_sb.sb->magic = BCHFS_MAGIC;
c->disk_sb.sb->layout.magic = BCHFS_MAGIC;
@@ -929,8 +952,8 @@ int bch2_write_super(struct bch_fs *c)
le64_add_cpu(&c->disk_sb.sb->seq, 1);
struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
- for_each_online_member(c, ca)
- __bch2_members_v2_get_mut(mi, ca->dev_idx)->seq = c->disk_sb.sb->seq;
+ darray_for_each(online_devices, ca)
+ __bch2_members_v2_get_mut(mi, (*ca)->dev_idx)->seq = c->disk_sb.sb->seq;
c->disk_sb.sb->write_time = cpu_to_le64(ktime_get_real_seconds());
if (test_bit(BCH_FS_error, &c->flags))
@@ -946,16 +969,15 @@ int bch2_write_super(struct bch_fs *c)
bch2_sb_errors_from_cpu(c);
bch2_sb_downgrade_update(c);
- for_each_online_member(c, ca)
- bch2_sb_from_fs(c, ca);
+ darray_for_each(online_devices, ca)
+ bch2_sb_from_fs(c, (*ca));
- for_each_online_member(c, ca) {
+ darray_for_each(online_devices, ca) {
printbuf_reset(&err);
- ret = bch2_sb_validate(&ca->disk_sb, &err, WRITE);
+ ret = bch2_sb_validate(&(*ca)->disk_sb, &err, WRITE);
if (ret) {
bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf);
- percpu_ref_put(&ca->io_ref);
goto out;
}
}
@@ -977,27 +999,29 @@ int bch2_write_super(struct bch_fs *c)
prt_str(&buf, " > ");
bch2_version_to_text(&buf, bcachefs_metadata_version_current);
prt_str(&buf, ")");
- bch2_fs_fatal_error(c, "%s", buf.buf);
+ bch2_fs_fatal_error(c, ": %s", buf.buf);
printbuf_exit(&buf);
return -BCH_ERR_sb_not_downgraded;
}
- for_each_online_member(c, ca) {
- __set_bit(ca->dev_idx, sb_written.d);
- ca->sb_write_error = 0;
+ darray_for_each(online_devices, ca) {
+ __set_bit((*ca)->dev_idx, sb_written.d);
+ (*ca)->sb_write_error = 0;
}
- for_each_online_member(c, ca)
- read_back_super(c, ca);
+ darray_for_each(online_devices, ca)
+ read_back_super(c, *ca);
closure_sync(cl);
- for_each_online_member(c, ca) {
+ darray_for_each(online_devices, cap) {
+ struct bch_dev *ca = *cap;
+
if (ca->sb_write_error)
continue;
if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) {
bch2_fs_fatal_error(c,
- "Superblock write was silently dropped! (seq %llu expected %llu)",
+ ": Superblock write was silently dropped! (seq %llu expected %llu)",
le64_to_cpu(ca->sb_read_scratch->seq),
ca->disk_sb.seq);
percpu_ref_put(&ca->io_ref);
@@ -1007,7 +1031,7 @@ int bch2_write_super(struct bch_fs *c)
if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) {
bch2_fs_fatal_error(c,
- "Superblock modified by another process (seq %llu expected %llu)",
+ ": Superblock modified by another process (seq %llu expected %llu)",
le64_to_cpu(ca->sb_read_scratch->seq),
ca->disk_sb.seq);
percpu_ref_put(&ca->io_ref);
@@ -1018,17 +1042,20 @@ int bch2_write_super(struct bch_fs *c)
do {
wrote = false;
- for_each_online_member(c, ca)
+ darray_for_each(online_devices, cap) {
+ struct bch_dev *ca = *cap;
if (!ca->sb_write_error &&
sb < ca->disk_sb.sb->layout.nr_superblocks) {
write_one_super(c, ca, sb);
wrote = true;
}
+ }
closure_sync(cl);
sb++;
} while (wrote);
- for_each_online_member(c, ca) {
+ darray_for_each(online_devices, cap) {
+ struct bch_dev *ca = *cap;
if (ca->sb_write_error)
__clear_bit(ca->dev_idx, sb_written.d);
else
@@ -1058,12 +1085,15 @@ int bch2_write_super(struct bch_fs *c)
!can_mount_with_written ||
(can_mount_without_written &&
!can_mount_with_written), c,
- "Unable to write superblock to sufficient devices (from %ps)",
+ ": Unable to write superblock to sufficient devices (from %ps)",
(void *) _RET_IP_))
ret = -1;
out:
/* Make new options visible after they're persistent: */
bch2_sb_update(c);
+ darray_for_each(online_devices, ca)
+ percpu_ref_put(&(*ca)->io_ref);
+ darray_exit(&online_devices);
printbuf_exit(&err);
return ret;
}
@@ -1154,6 +1184,11 @@ static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb,
kfree(errors_silent);
}
+
+ prt_printf(out, "Btrees with missing data:");
+ prt_tab(out);
+ prt_bitflags(out, __bch2_btree_ids, le64_to_cpu(e->btrees_lost_data));
+ prt_newline(out);
}
static const struct bch_sb_field_ops bch_sb_field_ops_ext = {
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 6b23e11825..dddf57ec45 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -15,6 +15,7 @@
#include "btree_gc.h"
#include "btree_journal_iter.h"
#include "btree_key_cache.h"
+#include "btree_node_scan.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_write_buffer.h"
@@ -56,6 +57,7 @@
#include "super.h"
#include "super-io.h"
#include "sysfs.h"
+#include "thread_with_file.h"
#include "trace.h"
#include <linux/backing-dev.h>
@@ -86,26 +88,38 @@ const char * const bch2_fs_flag_strs[] = {
NULL
};
-void __bch2_print(struct bch_fs *c, const char *fmt, ...)
+__printf(2, 0)
+static void bch2_print_maybe_redirect(struct stdio_redirect *stdio, const char *fmt, va_list args)
{
- struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);
+#ifdef __KERNEL__
+ if (unlikely(stdio)) {
+ if (fmt[0] == KERN_SOH[0])
+ fmt += 2;
+
+ bch2_stdio_redirect_vprintf(stdio, true, fmt, args);
+ return;
+ }
+#endif
+ vprintk(fmt, args);
+}
+
+void bch2_print_opts(struct bch_opts *opts, const char *fmt, ...)
+{
+ struct stdio_redirect *stdio = (void *)(unsigned long)opts->stdio;
va_list args;
va_start(args, fmt);
- if (likely(!stdio)) {
- vprintk(fmt, args);
- } else {
- unsigned long flags;
-
- if (fmt[0] == KERN_SOH[0])
- fmt += 2;
+ bch2_print_maybe_redirect(stdio, fmt, args);
+ va_end(args);
+}
- spin_lock_irqsave(&stdio->output_lock, flags);
- prt_vprintf(&stdio->output_buf, fmt, args);
- spin_unlock_irqrestore(&stdio->output_lock, flags);
+void __bch2_print(struct bch_fs *c, const char *fmt, ...)
+{
+ struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);
- wake_up(&stdio->output_wait);
- }
+ va_list args;
+ va_start(args, fmt);
+ bch2_print_maybe_redirect(stdio, fmt, args);
va_end(args);
}
@@ -274,8 +288,13 @@ static void __bch2_fs_read_only(struct bch_fs *c)
if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) &&
!test_bit(BCH_FS_emergency_ro, &c->flags))
set_bit(BCH_FS_clean_shutdown, &c->flags);
+
bch2_fs_journal_stop(&c->journal);
+ bch_info(c, "%sshutdown complete, journal seq %llu",
+ test_bit(BCH_FS_clean_shutdown, &c->flags) ? "" : "un",
+ c->journal.seq_ondisk);
+
/*
* After stopping journal:
*/
@@ -352,7 +371,7 @@ void bch2_fs_read_only(struct bch_fs *c)
!test_bit(BCH_FS_emergency_ro, &c->flags) &&
test_bit(BCH_FS_started, &c->flags) &&
test_bit(BCH_FS_clean_shutdown, &c->flags) &&
- !c->opts.norecovery) {
+ c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) {
BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));
BUG_ON(atomic_read(&c->btree_cache.dirty));
BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
@@ -497,7 +516,8 @@ err:
int bch2_fs_read_write(struct bch_fs *c)
{
- if (c->opts.norecovery)
+ if (c->opts.recovery_pass_last &&
+ c->opts.recovery_pass_last < BCH_RECOVERY_PASS_journal_replay)
return -BCH_ERR_erofs_norecovery;
if (c->opts.nochanges)
@@ -522,7 +542,9 @@ static void __bch2_fs_free(struct bch_fs *c)
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_exit(&c->times[i]);
+ bch2_find_btree_nodes_exit(&c->found_btree_nodes);
bch2_free_pending_node_rewrites(c);
+ bch2_fs_allocator_background_exit(c);
bch2_fs_sb_errors_exit(c);
bch2_fs_counters_exit(c);
bch2_fs_snapshots_exit(c);
@@ -546,6 +568,7 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_io_clock_exit(&c->io_clock[READ]);
bch2_fs_compress_exit(c);
bch2_journal_keys_put_initial(c);
+ bch2_find_btree_nodes_exit(&c->found_btree_nodes);
BUG_ON(atomic_read(&c->journal_keys.ref));
bch2_fs_btree_write_buffer_exit(c);
percpu_free_rwsem(&c->mark_lock);
@@ -576,7 +599,7 @@ static void __bch2_fs_free(struct bch_fs *c)
destroy_workqueue(c->btree_update_wq);
bch2_free_super(&c->disk_sb);
- kvpfree(c, sizeof(*c));
+ kvfree(c);
module_put(THIS_MODULE);
}
@@ -715,7 +738,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
unsigned i, iter_size;
int ret = 0;
- c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
+ c = kvmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
if (!c) {
c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc);
goto out;
@@ -818,13 +841,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
goto err;
pr_uuid(&name, c->sb.user_uuid.b);
- strscpy(c->name, name.buf, sizeof(c->name));
- printbuf_exit(&name);
-
ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0;
if (ret)
goto err;
+ strscpy(c->name, name.buf, sizeof(c->name));
+ printbuf_exit(&name);
+
/* Compat: */
if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
!BCH_SB_JOURNAL_FLUSH_DELAY(sb))
@@ -862,13 +885,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus()));
if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
- WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512)) ||
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) ||
!(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io",
- WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
!(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
- WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
!(c->io_complete_wq = alloc_workqueue("bcachefs_io",
- WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 512)) ||
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) ||
!(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
WQ_FREEZABLE, 0)) ||
#ifndef BCH_WRITE_REF_DEBUG
@@ -882,8 +905,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
BIOSET_NEED_BVECS) ||
!(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
!(c->online_reserved = alloc_percpu(u64)) ||
- mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
- c->opts.btree_node_size) ||
+ mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1,
+ c->opts.btree_node_size) ||
mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
!(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
sizeof(u64), GFP_KERNEL))) {
@@ -1002,8 +1025,16 @@ int bch2_fs_start(struct bch_fs *c)
for_each_online_member(c, ca)
bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now);
+ struct bch_sb_field_ext *ext =
+ bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64));
mutex_unlock(&c->sb_lock);
+ if (!ext) {
+ bch_err(c, "insufficient space in superblock for sb_field_ext");
+ ret = -BCH_ERR_ENOSPC_sb;
+ goto err;
+ }
+
for_each_rw_member(c, ca)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
@@ -1061,7 +1092,8 @@ static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
}
static int bch2_dev_in_fs(struct bch_sb_handle *fs,
- struct bch_sb_handle *sb)
+ struct bch_sb_handle *sb,
+ struct bch_opts *opts)
{
if (fs == sb)
return 0;
@@ -1102,11 +1134,14 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs,
bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));;
prt_newline(&buf);
- prt_printf(&buf, "Not using older sb");
+ if (!opts->no_splitbrain_check)
+ prt_printf(&buf, "Not using older sb");
pr_err("%s", buf.buf);
printbuf_exit(&buf);
- return -BCH_ERR_device_splitbrain;
+
+ if (!opts->no_splitbrain_check)
+ return -BCH_ERR_device_splitbrain;
}
struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx);
@@ -1124,17 +1159,22 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs,
prt_newline(&buf);
prt_bdevname(&buf, fs->bdev);
- prt_str(&buf, "believes seq of ");
+ prt_str(&buf, " believes seq of ");
prt_bdevname(&buf, sb->bdev);
prt_printf(&buf, " to be %llu, but ", seq_from_fs);
prt_bdevname(&buf, sb->bdev);
prt_printf(&buf, " has %llu\n", seq_from_member);
- prt_str(&buf, "Not using ");
- prt_bdevname(&buf, sb->bdev);
+
+ if (!opts->no_splitbrain_check) {
+ prt_str(&buf, "Not using ");
+ prt_bdevname(&buf, sb->bdev);
+ }
pr_err("%s", buf.buf);
printbuf_exit(&buf);
- return -BCH_ERR_device_splitbrain;
+
+ if (!opts->no_splitbrain_check)
+ return -BCH_ERR_device_splitbrain;
}
return 0;
@@ -1168,8 +1208,8 @@ static void bch2_dev_free(struct bch_dev *ca)
bch2_dev_buckets_free(ca);
free_page((unsigned long) ca->sb_read_scratch);
- bch2_time_stats_exit(&ca->io_latency[WRITE]);
- bch2_time_stats_exit(&ca->io_latency[READ]);
+ bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]);
+ bch2_time_stats_quantiles_exit(&ca->io_latency[READ]);
percpu_ref_exit(&ca->io_ref);
percpu_ref_exit(&ca->ref);
@@ -1260,8 +1300,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
INIT_WORK(&ca->io_error_work, bch2_io_error_work);
- bch2_time_stats_init(&ca->io_latency[READ]);
- bch2_time_stats_init(&ca->io_latency[WRITE]);
+ bch2_time_stats_quantiles_init(&ca->io_latency[READ]);
+ bch2_time_stats_quantiles_init(&ca->io_latency[WRITE]);
ca->mi = bch2_mi_to_cpu(member);
@@ -1597,27 +1637,27 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
__bch2_dev_read_only(c, ca);
ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
- bch_err_msg(ca, ret, "dropping data");
+ bch_err_msg(ca, ret, "bch2_dev_data_drop()");
if (ret)
goto err;
ret = bch2_dev_remove_alloc(c, ca);
- bch_err_msg(ca, ret, "deleting alloc info");
+ bch_err_msg(ca, ret, "bch2_dev_remove_alloc()");
if (ret)
goto err;
ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
- bch_err_msg(ca, ret, "flushing journal");
+ bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()");
if (ret)
goto err;
ret = bch2_journal_flush(&c->journal);
- bch_err(ca, "journal error");
+ bch_err_msg(ca, ret, "bch2_journal_flush()");
if (ret)
goto err;
ret = bch2_replicas_gc2(c);
- bch_err_msg(ca, ret, "in replicas_gc2()");
+ bch_err_msg(ca, ret, "bch2_replicas_gc2()");
if (ret)
goto err;
@@ -1835,7 +1875,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
dev_idx = sb.sb->dev_idx;
- ret = bch2_dev_in_fs(&c->disk_sb, &sb);
+ ret = bch2_dev_in_fs(&c->disk_sb, &sb, &c->opts);
bch_err_msg(c, ret, "bringing %s online", path);
if (ret)
goto err;
@@ -1919,6 +1959,13 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
goto err;
}
+ if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) {
+ bch_err(ca, "New device size too big (%llu greater than max %u)",
+ nbuckets, BCH_MEMBER_NBUCKETS_MAX);
+ ret = -BCH_ERR_device_size_too_big;
+ goto err;
+ }
+
if (bch2_dev_is_online(ca) &&
get_capacity(ca->disk_sb.bdev->bd_disk) <
ca->mi.bucket_size * nbuckets) {
@@ -1964,13 +2011,9 @@ err:
/* return with ref on ca->ref: */
struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
{
- rcu_read_lock();
- for_each_member_device_rcu(c, ca, NULL)
- if (!strcmp(name, ca->name)) {
- rcu_read_unlock();
+ for_each_member_device(c, ca)
+ if (!strcmp(name, ca->name))
return ca;
- }
- rcu_read_unlock();
return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found);
}
@@ -2023,7 +2066,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
best = sb;
darray_for_each_reverse(sbs, sb) {
- ret = bch2_dev_in_fs(best, sb);
+ ret = bch2_dev_in_fs(best, sb, &opts);
if (ret == -BCH_ERR_device_has_been_removed ||
ret == -BCH_ERR_device_splitbrain) {
diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h
index 0e5a14fc8e..11bcef170c 100644
--- a/fs/bcachefs/super_types.h
+++ b/fs/bcachefs/super_types.h
@@ -4,7 +4,7 @@
struct bch_sb_handle {
struct bch_sb *sb;
- struct bdev_handle *bdev_handle;
+ struct file *s_bdev_file;
struct block_device *bdev;
char *sb_name;
struct bio *bio;
@@ -37,6 +37,8 @@ struct bch_member_cpu {
u8 durability;
u8 freespace_initialized;
u8 valid;
+ u8 btree_bitmap_shift;
+ u64 btree_allocated_bitmap;
};
#endif /* _BCACHEFS_SUPER_TYPES_H */
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index cee80c47fe..5be92fe3f4 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -17,7 +17,6 @@
#include "btree_iter.h"
#include "btree_key_cache.h"
#include "btree_update.h"
-#include "btree_update_interior.h"
#include "btree_gc.h"
#include "buckets.h"
#include "clock.h"
@@ -26,6 +25,7 @@
#include "ec.h"
#include "inode.h"
#include "journal.h"
+#include "journal_reclaim.h"
#include "keylist.h"
#include "move.h"
#include "movinggc.h"
@@ -139,6 +139,7 @@ do { \
write_attribute(trigger_gc);
write_attribute(trigger_discards);
write_attribute(trigger_invalidates);
+write_attribute(trigger_journal_flush);
write_attribute(prune_cache);
write_attribute(btree_wakeup);
rw_attribute(btree_gc_periodic);
@@ -166,7 +167,6 @@ read_attribute(btree_write_stats);
read_attribute(btree_cache_size);
read_attribute(compression_stats);
read_attribute(journal_debug);
-read_attribute(btree_updates);
read_attribute(btree_cache);
read_attribute(btree_key_cache);
read_attribute(stripes_heap);
@@ -415,9 +415,6 @@ SHOW(bch2_fs)
if (attr == &sysfs_journal_debug)
bch2_journal_debug_to_text(out, &c->journal);
- if (attr == &sysfs_btree_updates)
- bch2_btree_updates_to_text(out, c);
-
if (attr == &sysfs_btree_cache)
bch2_btree_cache_to_text(out, c);
@@ -505,7 +502,7 @@ STORE(bch2_fs)
/* Debugging: */
- if (!test_bit(BCH_FS_rw, &c->flags))
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs))
return -EROFS;
if (attr == &sysfs_prune_cache) {
@@ -538,6 +535,11 @@ STORE(bch2_fs)
if (attr == &sysfs_trigger_invalidates)
bch2_do_invalidates(c);
+ if (attr == &sysfs_trigger_journal_flush) {
+ bch2_journal_flush_all_pins(&c->journal);
+ bch2_journal_meta(&c->journal);
+ }
+
#ifdef CONFIG_BCACHEFS_TESTS
if (attr == &sysfs_perf_test) {
char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
@@ -558,6 +560,7 @@ STORE(bch2_fs)
size = ret;
}
#endif
+ bch2_write_ref_put(c, BCH_WRITE_REF_sysfs);
return size;
}
SYSFS_OPS(bch2_fs);
@@ -639,7 +642,6 @@ SYSFS_OPS(bch2_fs_internal);
struct attribute *bch2_fs_internal_files[] = {
&sysfs_flags,
&sysfs_journal_debug,
- &sysfs_btree_updates,
&sysfs_btree_cache,
&sysfs_btree_key_cache,
&sysfs_new_stripes,
@@ -657,6 +659,7 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_trigger_gc,
&sysfs_trigger_discards,
&sysfs_trigger_invalidates,
+ &sysfs_trigger_journal_flush,
&sysfs_prune_cache,
&sysfs_btree_wakeup,
@@ -930,10 +933,10 @@ SHOW(bch2_dev)
sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE]));
if (attr == &sysfs_io_latency_stats_read)
- bch2_time_stats_to_text(out, &ca->io_latency[READ]);
+ bch2_time_stats_to_text(out, &ca->io_latency[READ].stats);
if (attr == &sysfs_io_latency_stats_write)
- bch2_time_stats_to_text(out, &ca->io_latency[WRITE]);
+ bch2_time_stats_to_text(out, &ca->io_latency[WRITE].stats);
sysfs_printf(congested, "%u%%",
clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
index b3fe9fc577..bfec656f94 100644
--- a/fs/bcachefs/tests.c
+++ b/fs/bcachefs/tests.c
@@ -672,7 +672,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos)
bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos,
BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek(&iter);
+ k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX));
ret = bkey_err(k);
if (ret)
goto err;
diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c
index 9220d7de10..b1af7ac430 100644
--- a/fs/bcachefs/thread_with_file.c
+++ b/fs/bcachefs/thread_with_file.c
@@ -2,7 +2,6 @@
#ifndef NO_BCACHEFS_FS
#include "bcachefs.h"
-#include "printbuf.h"
#include "thread_with_file.h"
#include <linux/anon_inodes.h>
@@ -10,6 +9,7 @@
#include <linux/kthread.h>
#include <linux/pagemap.h>
#include <linux/poll.h>
+#include <linux/sched/sysctl.h>
void bch2_thread_with_file_exit(struct thread_with_file *thr)
{
@@ -65,68 +65,82 @@ err:
return ret;
}
-static inline bool thread_with_stdio_has_output(struct thread_with_stdio *thr)
+/* stdio_redirect */
+
+static bool stdio_redirect_has_input(struct stdio_redirect *stdio)
{
- return thr->stdio.output_buf.pos ||
- thr->output2.nr ||
- thr->thr.done;
+ return stdio->input.buf.nr || stdio->done;
}
-static ssize_t thread_with_stdio_read(struct file *file, char __user *buf,
- size_t len, loff_t *ppos)
+static bool stdio_redirect_has_output(struct stdio_redirect *stdio)
{
- struct thread_with_stdio *thr =
- container_of(file->private_data, struct thread_with_stdio, thr);
- size_t copied = 0, b;
- int ret = 0;
+ return stdio->output.buf.nr || stdio->done;
+}
- if ((file->f_flags & O_NONBLOCK) &&
- !thread_with_stdio_has_output(thr))
- return -EAGAIN;
+#define STDIO_REDIRECT_BUFSIZE 4096
- ret = wait_event_interruptible(thr->stdio.output_wait,
- thread_with_stdio_has_output(thr));
- if (ret)
- return ret;
+static bool stdio_redirect_has_input_space(struct stdio_redirect *stdio)
+{
+ return stdio->input.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done;
+}
- if (thr->thr.done)
- return 0;
+static bool stdio_redirect_has_output_space(struct stdio_redirect *stdio)
+{
+ return stdio->output.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done;
+}
- while (len) {
- ret = darray_make_room(&thr->output2, thr->stdio.output_buf.pos);
- if (ret)
- break;
+static void stdio_buf_init(struct stdio_buf *buf)
+{
+ spin_lock_init(&buf->lock);
+ init_waitqueue_head(&buf->wait);
+ darray_init(&buf->buf);
+}
- spin_lock_irq(&thr->stdio.output_lock);
- b = min_t(size_t, darray_room(thr->output2), thr->stdio.output_buf.pos);
+/* thread_with_stdio */
- memcpy(&darray_top(thr->output2), thr->stdio.output_buf.buf, b);
- memmove(thr->stdio.output_buf.buf,
- thr->stdio.output_buf.buf + b,
- thr->stdio.output_buf.pos - b);
+static void thread_with_stdio_done(struct thread_with_stdio *thr)
+{
+ thr->thr.done = true;
+ thr->stdio.done = true;
+ wake_up(&thr->stdio.input.wait);
+ wake_up(&thr->stdio.output.wait);
+}
- thr->output2.nr += b;
- thr->stdio.output_buf.pos -= b;
- spin_unlock_irq(&thr->stdio.output_lock);
+static ssize_t thread_with_stdio_read(struct file *file, char __user *ubuf,
+ size_t len, loff_t *ppos)
+{
+ struct thread_with_stdio *thr =
+ container_of(file->private_data, struct thread_with_stdio, thr);
+ struct stdio_buf *buf = &thr->stdio.output;
+ size_t copied = 0, b;
+ int ret = 0;
- b = min(len, thr->output2.nr);
- if (!b)
- break;
+ if (!(file->f_flags & O_NONBLOCK)) {
+ ret = wait_event_interruptible(buf->wait, stdio_redirect_has_output(&thr->stdio));
+ if (ret)
+ return ret;
+ } else if (!stdio_redirect_has_output(&thr->stdio))
+ return -EAGAIN;
- b -= copy_to_user(buf, thr->output2.data, b);
- if (!b) {
+ while (len && buf->buf.nr) {
+ if (fault_in_writeable(ubuf, len) == len) {
ret = -EFAULT;
break;
}
- copied += b;
- buf += b;
- len -= b;
-
- memmove(thr->output2.data,
- thr->output2.data + b,
- thr->output2.nr - b);
- thr->output2.nr -= b;
+ spin_lock_irq(&buf->lock);
+ b = min_t(size_t, len, buf->buf.nr);
+
+ if (b && !copy_to_user_nofault(ubuf, buf->buf.data, b)) {
+ ubuf += b;
+ len -= b;
+ copied += b;
+ buf->buf.nr -= b;
+ memmove(buf->buf.data,
+ buf->buf.data + b,
+ buf->buf.nr);
+ }
+ spin_unlock_irq(&buf->lock);
}
return copied ?: ret;
@@ -137,27 +151,20 @@ static int thread_with_stdio_release(struct inode *inode, struct file *file)
struct thread_with_stdio *thr =
container_of(file->private_data, struct thread_with_stdio, thr);
+ thread_with_stdio_done(thr);
bch2_thread_with_file_exit(&thr->thr);
- printbuf_exit(&thr->stdio.input_buf);
- printbuf_exit(&thr->stdio.output_buf);
- darray_exit(&thr->output2);
- thr->exit(thr);
+ darray_exit(&thr->stdio.input.buf);
+ darray_exit(&thr->stdio.output.buf);
+ thr->ops->exit(thr);
return 0;
}
-#define WRITE_BUFFER 4096
-
-static inline bool thread_with_stdio_has_input_space(struct thread_with_stdio *thr)
-{
- return thr->stdio.input_buf.pos < WRITE_BUFFER || thr->thr.done;
-}
-
static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubuf,
size_t len, loff_t *ppos)
{
struct thread_with_stdio *thr =
container_of(file->private_data, struct thread_with_stdio, thr);
- struct printbuf *buf = &thr->stdio.input_buf;
+ struct stdio_buf *buf = &thr->stdio.input;
size_t copied = 0;
ssize_t ret = 0;
@@ -173,29 +180,30 @@ static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubu
break;
}
- spin_lock(&thr->stdio.input_lock);
- if (buf->pos < WRITE_BUFFER)
- bch2_printbuf_make_room(buf, min(b, WRITE_BUFFER - buf->pos));
- b = min(len, printbuf_remaining_size(buf));
-
- if (b && !copy_from_user_nofault(&buf->buf[buf->pos], ubuf, b)) {
- ubuf += b;
- len -= b;
- copied += b;
- buf->pos += b;
+ spin_lock(&buf->lock);
+ if (buf->buf.nr < STDIO_REDIRECT_BUFSIZE)
+ darray_make_room_gfp(&buf->buf,
+ min(b, STDIO_REDIRECT_BUFSIZE - buf->buf.nr), GFP_NOWAIT);
+ b = min(len, darray_room(buf->buf));
+
+ if (b && !copy_from_user_nofault(&darray_top(buf->buf), ubuf, b)) {
+ buf->buf.nr += b;
+ ubuf += b;
+ len -= b;
+ copied += b;
}
- spin_unlock(&thr->stdio.input_lock);
+ spin_unlock(&buf->lock);
if (b) {
- wake_up(&thr->stdio.input_wait);
+ wake_up(&buf->wait);
} else {
if ((file->f_flags & O_NONBLOCK)) {
ret = -EAGAIN;
break;
}
- ret = wait_event_interruptible(thr->stdio.input_wait,
- thread_with_stdio_has_input_space(thr));
+ ret = wait_event_interruptible(buf->wait,
+ stdio_redirect_has_input_space(&thr->stdio));
if (ret)
break;
}
@@ -209,90 +217,244 @@ static __poll_t thread_with_stdio_poll(struct file *file, struct poll_table_stru
struct thread_with_stdio *thr =
container_of(file->private_data, struct thread_with_stdio, thr);
- poll_wait(file, &thr->stdio.output_wait, wait);
- poll_wait(file, &thr->stdio.input_wait, wait);
+ poll_wait(file, &thr->stdio.output.wait, wait);
+ poll_wait(file, &thr->stdio.input.wait, wait);
__poll_t mask = 0;
- if (thread_with_stdio_has_output(thr))
+ if (stdio_redirect_has_output(&thr->stdio))
mask |= EPOLLIN;
- if (thread_with_stdio_has_input_space(thr))
+ if (stdio_redirect_has_input_space(&thr->stdio))
mask |= EPOLLOUT;
if (thr->thr.done)
mask |= EPOLLHUP|EPOLLERR;
return mask;
}
+static __poll_t thread_with_stdout_poll(struct file *file, struct poll_table_struct *wait)
+{
+ struct thread_with_stdio *thr =
+ container_of(file->private_data, struct thread_with_stdio, thr);
+
+ poll_wait(file, &thr->stdio.output.wait, wait);
+
+ __poll_t mask = 0;
+
+ if (stdio_redirect_has_output(&thr->stdio))
+ mask |= EPOLLIN;
+ if (thr->thr.done)
+ mask |= EPOLLHUP|EPOLLERR;
+ return mask;
+}
+
+static int thread_with_stdio_flush(struct file *file, fl_owner_t id)
+{
+ struct thread_with_stdio *thr =
+ container_of(file->private_data, struct thread_with_stdio, thr);
+
+ return thr->thr.ret;
+}
+
+static long thread_with_stdio_ioctl(struct file *file, unsigned int cmd, unsigned long p)
+{
+ struct thread_with_stdio *thr =
+ container_of(file->private_data, struct thread_with_stdio, thr);
+
+ if (thr->ops->unlocked_ioctl)
+ return thr->ops->unlocked_ioctl(thr, cmd, p);
+ return -ENOTTY;
+}
+
static const struct file_operations thread_with_stdio_fops = {
- .release = thread_with_stdio_release,
+ .llseek = no_llseek,
.read = thread_with_stdio_read,
.write = thread_with_stdio_write,
.poll = thread_with_stdio_poll,
+ .flush = thread_with_stdio_flush,
+ .release = thread_with_stdio_release,
+ .unlocked_ioctl = thread_with_stdio_ioctl,
+};
+
+static const struct file_operations thread_with_stdout_fops = {
.llseek = no_llseek,
+ .read = thread_with_stdio_read,
+ .poll = thread_with_stdout_poll,
+ .flush = thread_with_stdio_flush,
+ .release = thread_with_stdio_release,
+ .unlocked_ioctl = thread_with_stdio_ioctl,
};
+static int thread_with_stdio_fn(void *arg)
+{
+ struct thread_with_stdio *thr = arg;
+
+ thr->thr.ret = thr->ops->fn(thr);
+
+ thread_with_stdio_done(thr);
+ return 0;
+}
+
+void bch2_thread_with_stdio_init(struct thread_with_stdio *thr,
+ const struct thread_with_stdio_ops *ops)
+{
+ stdio_buf_init(&thr->stdio.input);
+ stdio_buf_init(&thr->stdio.output);
+ thr->ops = ops;
+}
+
+int __bch2_run_thread_with_stdio(struct thread_with_stdio *thr)
+{
+ return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, thread_with_stdio_fn);
+}
+
int bch2_run_thread_with_stdio(struct thread_with_stdio *thr,
- void (*exit)(struct thread_with_stdio *),
- int (*fn)(void *))
+ const struct thread_with_stdio_ops *ops)
{
- thr->stdio.input_buf = PRINTBUF;
- thr->stdio.input_buf.atomic++;
- spin_lock_init(&thr->stdio.input_lock);
- init_waitqueue_head(&thr->stdio.input_wait);
+ bch2_thread_with_stdio_init(thr, ops);
- thr->stdio.output_buf = PRINTBUF;
- thr->stdio.output_buf.atomic++;
- spin_lock_init(&thr->stdio.output_lock);
- init_waitqueue_head(&thr->stdio.output_wait);
+ return __bch2_run_thread_with_stdio(thr);
+}
- darray_init(&thr->output2);
- thr->exit = exit;
+int bch2_run_thread_with_stdout(struct thread_with_stdio *thr,
+ const struct thread_with_stdio_ops *ops)
+{
+ stdio_buf_init(&thr->stdio.input);
+ stdio_buf_init(&thr->stdio.output);
+ thr->ops = ops;
- return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, fn);
+ return bch2_run_thread_with_file(&thr->thr, &thread_with_stdout_fops, thread_with_stdio_fn);
}
+EXPORT_SYMBOL_GPL(bch2_run_thread_with_stdout);
-int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *buf, size_t len)
+int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t len)
{
- wait_event(stdio->input_wait,
- stdio->input_buf.pos || stdio->done);
+ struct stdio_buf *buf = &stdio->input;
+
+ /*
+ * we're waiting on user input (or for the file descriptor to be
+ * closed), don't want a hung task warning:
+ */
+ do {
+ wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio),
+ sysctl_hung_task_timeout_secs * HZ / 2);
+ } while (!stdio_redirect_has_input(stdio));
if (stdio->done)
return -1;
- spin_lock(&stdio->input_lock);
- int ret = min(len, stdio->input_buf.pos);
- stdio->input_buf.pos -= ret;
- memcpy(buf, stdio->input_buf.buf, ret);
- memmove(stdio->input_buf.buf,
- stdio->input_buf.buf + ret,
- stdio->input_buf.pos);
- spin_unlock(&stdio->input_lock);
+ spin_lock(&buf->lock);
+ int ret = min(len, buf->buf.nr);
+ buf->buf.nr -= ret;
+ memcpy(ubuf, buf->buf.data, ret);
+ memmove(buf->buf.data,
+ buf->buf.data + ret,
+ buf->buf.nr);
+ spin_unlock(&buf->lock);
- wake_up(&stdio->input_wait);
+ wake_up(&buf->wait);
return ret;
}
-int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *buf, size_t len)
+int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *ubuf, size_t len)
{
- wait_event(stdio->input_wait,
- stdio->input_buf.pos || stdio->done);
-
- if (stdio->done)
- return -1;
+ struct stdio_buf *buf = &stdio->input;
+ size_t copied = 0;
+ ssize_t ret = 0;
+again:
+ do {
+ wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio),
+ sysctl_hung_task_timeout_secs * HZ / 2);
+ } while (!stdio_redirect_has_input(stdio));
+
+ if (stdio->done) {
+ ret = -1;
+ goto out;
+ }
- spin_lock(&stdio->input_lock);
- int ret = min(len, stdio->input_buf.pos);
- char *n = memchr(stdio->input_buf.buf, '\n', ret);
+ spin_lock(&buf->lock);
+ size_t b = min(len, buf->buf.nr);
+ char *n = memchr(buf->buf.data, '\n', b);
if (n)
- ret = min(ret, n + 1 - stdio->input_buf.buf);
- stdio->input_buf.pos -= ret;
- memcpy(buf, stdio->input_buf.buf, ret);
- memmove(stdio->input_buf.buf,
- stdio->input_buf.buf + ret,
- stdio->input_buf.pos);
- spin_unlock(&stdio->input_lock);
-
- wake_up(&stdio->input_wait);
+ b = min_t(size_t, b, n + 1 - buf->buf.data);
+ buf->buf.nr -= b;
+ memcpy(ubuf, buf->buf.data, b);
+ memmove(buf->buf.data,
+ buf->buf.data + b,
+ buf->buf.nr);
+ ubuf += b;
+ len -= b;
+ copied += b;
+ spin_unlock(&buf->lock);
+
+ wake_up(&buf->wait);
+
+ if (!n && len)
+ goto again;
+out:
+ return copied ?: ret;
+}
+
+__printf(3, 0)
+static ssize_t bch2_darray_vprintf(darray_char *out, gfp_t gfp, const char *fmt, va_list args)
+{
+ ssize_t ret;
+
+ do {
+ va_list args2;
+ size_t len;
+
+ va_copy(args2, args);
+ len = vsnprintf(out->data + out->nr, darray_room(*out), fmt, args2);
+ va_end(args2);
+
+ if (len + 1 <= darray_room(*out)) {
+ out->nr += len;
+ return len;
+ }
+
+ ret = darray_make_room_gfp(out, len + 1, gfp);
+ } while (ret == 0);
+
+ return ret;
+}
+
+ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocking,
+ const char *fmt, va_list args)
+{
+ struct stdio_buf *buf = &stdio->output;
+ unsigned long flags;
+ ssize_t ret;
+
+again:
+ spin_lock_irqsave(&buf->lock, flags);
+ ret = bch2_darray_vprintf(&buf->buf, GFP_NOWAIT, fmt, args);
+ spin_unlock_irqrestore(&buf->lock, flags);
+
+ if (ret < 0) {
+ if (nonblocking)
+ return -EAGAIN;
+
+ ret = wait_event_interruptible(buf->wait,
+ stdio_redirect_has_output_space(stdio));
+ if (ret)
+ return ret;
+ goto again;
+ }
+
+ wake_up(&buf->wait);
+ return ret;
+}
+
+ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *stdio, bool nonblocking,
+ const char *fmt, ...)
+{
+ va_list args;
+ ssize_t ret;
+
+ va_start(args, fmt);
+ ret = bch2_stdio_redirect_vprintf(stdio, nonblocking, fmt, args);
+ va_end(args);
+
return ret;
}
diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h
index 05879c5048..1d63d14d7d 100644
--- a/fs/bcachefs/thread_with_file.h
+++ b/fs/bcachefs/thread_with_file.h
@@ -4,6 +4,38 @@
#include "thread_with_file_types.h"
+/*
+ * Thread with file: Run a kthread and connect it to a file descriptor, so that
+ * it can be interacted with via fd read/write methods and closing the file
+ * descriptor stops the kthread.
+ *
+ * We have two different APIs:
+ *
+ * thread_with_file, the low level version.
+ * You get to define the full file_operations, including your release function,
+ * which means that you must call bch2_thread_with_file_exit() from your
+ * .release method
+ *
+ * thread_with_stdio, the higher level version
+ * This implements full piping of input and output, including .poll.
+ *
+ * Notes on behaviour:
+ * - kthread shutdown behaves like writing or reading from a pipe that has been
+ * closed
+ * - Input and output buffers are 4096 bytes, although buffers may in some
+ * situations slightly exceed that limit so as to avoid chopping off a
+ * message in the middle in nonblocking mode.
+ * - Input/output buffers are lazily allocated, with GFP_NOWAIT allocations -
+ * should be fine but might change in future revisions.
+ * - Output buffer may grow past 4096 bytes to deal with messages that are
+ * bigger than 4096 bytes
+ * - Writing may be done blocking or nonblocking; in nonblocking mode, we only
+ * drop entire messages.
+ *
+ * To write, use stdio_redirect_printf()
+ * To read, use stdio_redirect_read() or stdio_redirect_readline()
+ */
+
struct task_struct;
struct thread_with_file {
@@ -17,25 +49,31 @@ int bch2_run_thread_with_file(struct thread_with_file *,
const struct file_operations *,
int (*fn)(void *));
+struct thread_with_stdio;
+
+struct thread_with_stdio_ops {
+ void (*exit)(struct thread_with_stdio *);
+ int (*fn)(struct thread_with_stdio *);
+ long (*unlocked_ioctl)(struct thread_with_stdio *, unsigned int, unsigned long);
+};
+
struct thread_with_stdio {
struct thread_with_file thr;
struct stdio_redirect stdio;
- DARRAY(char) output2;
- void (*exit)(struct thread_with_stdio *);
+ const struct thread_with_stdio_ops *ops;
};
-static inline void thread_with_stdio_done(struct thread_with_stdio *thr)
-{
- thr->thr.done = true;
- thr->stdio.done = true;
- wake_up(&thr->stdio.input_wait);
- wake_up(&thr->stdio.output_wait);
-}
-
+void bch2_thread_with_stdio_init(struct thread_with_stdio *,
+ const struct thread_with_stdio_ops *);
+int __bch2_run_thread_with_stdio(struct thread_with_stdio *);
int bch2_run_thread_with_stdio(struct thread_with_stdio *,
- void (*exit)(struct thread_with_stdio *),
- int (*fn)(void *));
+ const struct thread_with_stdio_ops *);
+int bch2_run_thread_with_stdout(struct thread_with_stdio *,
+ const struct thread_with_stdio_ops *);
int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t);
int bch2_stdio_redirect_readline(struct stdio_redirect *, char *, size_t);
+__printf(3, 0) ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *, bool, const char *, va_list);
+__printf(3, 4) ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *, bool, const char *, ...);
+
#endif /* _BCACHEFS_THREAD_WITH_FILE_H */
diff --git a/fs/bcachefs/thread_with_file_types.h b/fs/bcachefs/thread_with_file_types.h
index 90b5e645e9..e0daf4eec3 100644
--- a/fs/bcachefs/thread_with_file_types.h
+++ b/fs/bcachefs/thread_with_file_types.h
@@ -2,14 +2,21 @@
#ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H
#define _BCACHEFS_THREAD_WITH_FILE_TYPES_H
+#include "darray.h"
+
+struct stdio_buf {
+ spinlock_t lock;
+ wait_queue_head_t wait;
+ darray_char buf;
+};
+
struct stdio_redirect {
- spinlock_t output_lock;
- wait_queue_head_t output_wait;
- struct printbuf output_buf;
+ struct stdio_buf input;
+ struct stdio_buf output;
spinlock_t input_lock;
wait_queue_head_t input_wait;
- struct printbuf input_buf;
+ darray_char input_buf;
bool done;
};
diff --git a/fs/bcachefs/time_stats.c b/fs/bcachefs/time_stats.c
new file mode 100644
index 0000000000..4508e9dcbe
--- /dev/null
+++ b/fs/bcachefs/time_stats.c
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/time.h>
+#include <linux/spinlock.h>
+
+#include "eytzinger.h"
+#include "time_stats.h"
+
+static const struct time_unit time_units[] = {
+ { "ns", 1 },
+ { "us", NSEC_PER_USEC },
+ { "ms", NSEC_PER_MSEC },
+ { "s", NSEC_PER_SEC },
+ { "m", (u64) NSEC_PER_SEC * 60},
+ { "h", (u64) NSEC_PER_SEC * 3600},
+ { "d", (u64) NSEC_PER_SEC * 3600 * 24},
+ { "w", (u64) NSEC_PER_SEC * 3600 * 24 * 7},
+ { "y", (u64) NSEC_PER_SEC * ((3600 * 24 * 7 * 365) + (3600 * (24 / 4) * 7))}, /* 365.25d */
+ { "eon", U64_MAX },
+};
+
+const struct time_unit *bch2_pick_time_units(u64 ns)
+{
+ const struct time_unit *u;
+
+ for (u = time_units;
+ u + 1 < time_units + ARRAY_SIZE(time_units) &&
+ ns >= u[1].nsecs << 1;
+ u++)
+ ;
+
+ return u;
+}
+
+static void quantiles_update(struct quantiles *q, u64 v)
+{
+ unsigned i = 0;
+
+ while (i < ARRAY_SIZE(q->entries)) {
+ struct quantile_entry *e = q->entries + i;
+
+ if (unlikely(!e->step)) {
+ e->m = v;
+ e->step = max_t(unsigned, v / 2, 1024);
+ } else if (e->m > v) {
+ e->m = e->m >= e->step
+ ? e->m - e->step
+ : 0;
+ } else if (e->m < v) {
+ e->m = e->m + e->step > e->m
+ ? e->m + e->step
+ : U32_MAX;
+ }
+
+ if ((e->m > v ? e->m - v : v - e->m) < e->step)
+ e->step = max_t(unsigned, e->step / 2, 1);
+
+ if (v >= e->m)
+ break;
+
+ i = eytzinger0_child(i, v > e->m);
+ }
+}
+
+static inline void time_stats_update_one(struct bch2_time_stats *stats,
+ u64 start, u64 end)
+{
+ u64 duration, freq;
+ bool initted = stats->last_event != 0;
+
+ if (time_after64(end, start)) {
+ struct quantiles *quantiles = time_stats_to_quantiles(stats);
+
+ duration = end - start;
+ mean_and_variance_update(&stats->duration_stats, duration);
+ mean_and_variance_weighted_update(&stats->duration_stats_weighted,
+ duration, initted, TIME_STATS_MV_WEIGHT);
+ stats->max_duration = max(stats->max_duration, duration);
+ stats->min_duration = min(stats->min_duration, duration);
+ stats->total_duration += duration;
+
+ if (quantiles)
+ quantiles_update(quantiles, duration);
+ }
+
+ if (stats->last_event && time_after64(end, stats->last_event)) {
+ freq = end - stats->last_event;
+ mean_and_variance_update(&stats->freq_stats, freq);
+ mean_and_variance_weighted_update(&stats->freq_stats_weighted,
+ freq, initted, TIME_STATS_MV_WEIGHT);
+ stats->max_freq = max(stats->max_freq, freq);
+ stats->min_freq = min(stats->min_freq, freq);
+ }
+
+ stats->last_event = end;
+}
+
+void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
+ struct time_stat_buffer *b)
+{
+ for (struct time_stat_buffer_entry *i = b->entries;
+ i < b->entries + ARRAY_SIZE(b->entries);
+ i++)
+ time_stats_update_one(stats, i->start, i->end);
+ b->nr = 0;
+}
+
+static noinline void time_stats_clear_buffer(struct bch2_time_stats *stats,
+ struct time_stat_buffer *b)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&stats->lock, flags);
+ __bch2_time_stats_clear_buffer(stats, b);
+ spin_unlock_irqrestore(&stats->lock, flags);
+}
+
+void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
+{
+ unsigned long flags;
+
+ if (!stats->buffer) {
+ spin_lock_irqsave(&stats->lock, flags);
+ time_stats_update_one(stats, start, end);
+
+ if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 &&
+ stats->duration_stats.n > 1024)
+ stats->buffer =
+ alloc_percpu_gfp(struct time_stat_buffer,
+ GFP_ATOMIC);
+ spin_unlock_irqrestore(&stats->lock, flags);
+ } else {
+ struct time_stat_buffer *b;
+
+ preempt_disable();
+ b = this_cpu_ptr(stats->buffer);
+
+ BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
+ b->entries[b->nr++] = (struct time_stat_buffer_entry) {
+ .start = start,
+ .end = end
+ };
+
+ if (unlikely(b->nr == ARRAY_SIZE(b->entries)))
+ time_stats_clear_buffer(stats, b);
+ preempt_enable();
+ }
+}
+
+void bch2_time_stats_exit(struct bch2_time_stats *stats)
+{
+ free_percpu(stats->buffer);
+}
+
+void bch2_time_stats_init(struct bch2_time_stats *stats)
+{
+ memset(stats, 0, sizeof(*stats));
+ stats->min_duration = U64_MAX;
+ stats->min_freq = U64_MAX;
+ spin_lock_init(&stats->lock);
+}
diff --git a/fs/bcachefs/time_stats.h b/fs/bcachefs/time_stats.h
new file mode 100644
index 0000000000..5df6140374
--- /dev/null
+++ b/fs/bcachefs/time_stats.h
@@ -0,0 +1,159 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * bch2_time_stats - collect statistics on events that have a duration, with nicely
+ * formatted textual output on demand
+ *
+ * - percpu buffering of event collection: cheap enough to shotgun
+ * everywhere without worrying about overhead
+ *
+ * tracks:
+ * - number of events
+ * - maximum event duration ever seen
+ * - sum of all event durations
+ * - average event duration, standard and weighted
+ * - standard deviation of event durations, standard and weighted
+ * and analagous statistics for the frequency of events
+ *
+ * We provide both mean and weighted mean (exponentially weighted), and standard
+ * deviation and weighted standard deviation, to give an efficient-to-compute
+ * view of current behaviour versus. average behaviour - "did this event source
+ * just become wonky, or is this typical?".
+ *
+ * Particularly useful for tracking down latency issues.
+ */
+#ifndef _BCACHEFS_TIME_STATS_H
+#define _BCACHEFS_TIME_STATS_H
+
+#include <linux/sched/clock.h>
+#include <linux/spinlock_types.h>
+#include <linux/string.h>
+
+#include "mean_and_variance.h"
+
+struct time_unit {
+ const char *name;
+ u64 nsecs;
+};
+
+/*
+ * given a nanosecond value, pick the preferred time units for printing:
+ */
+const struct time_unit *bch2_pick_time_units(u64 ns);
+
+/*
+ * quantiles - do not use:
+ *
+ * Only enabled if bch2_time_stats->quantiles_enabled has been manually set - don't
+ * use in new code.
+ */
+
+#define NR_QUANTILES 15
+#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES)
+#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES)
+#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES)
+
+struct quantiles {
+ struct quantile_entry {
+ u64 m;
+ u64 step;
+ } entries[NR_QUANTILES];
+};
+
+struct time_stat_buffer {
+ unsigned nr;
+ struct time_stat_buffer_entry {
+ u64 start;
+ u64 end;
+ } entries[31];
+};
+
+struct bch2_time_stats {
+ spinlock_t lock;
+ bool have_quantiles;
+ /* all fields are in nanoseconds */
+ u64 min_duration;
+ u64 max_duration;
+ u64 total_duration;
+ u64 max_freq;
+ u64 min_freq;
+ u64 last_event;
+ u64 last_event_start;
+
+ struct mean_and_variance duration_stats;
+ struct mean_and_variance freq_stats;
+
+/* default weight for weighted mean and variance calculations */
+#define TIME_STATS_MV_WEIGHT 8
+
+ struct mean_and_variance_weighted duration_stats_weighted;
+ struct mean_and_variance_weighted freq_stats_weighted;
+ struct time_stat_buffer __percpu *buffer;
+};
+
+struct bch2_time_stats_quantiles {
+ struct bch2_time_stats stats;
+ struct quantiles quantiles;
+};
+
+static inline struct quantiles *time_stats_to_quantiles(struct bch2_time_stats *stats)
+{
+ return stats->have_quantiles
+ ? &container_of(stats, struct bch2_time_stats_quantiles, stats)->quantiles
+ : NULL;
+}
+
+void __bch2_time_stats_clear_buffer(struct bch2_time_stats *, struct time_stat_buffer *);
+void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
+
+/**
+ * time_stats_update - collect a new event being tracked
+ *
+ * @stats - bch2_time_stats to update
+ * @start - start time of event, recorded with local_clock()
+ *
+ * The end duration of the event will be the current time
+ */
+static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
+{
+ __bch2_time_stats_update(stats, start, local_clock());
+}
+
+/**
+ * track_event_change - track state change events
+ *
+ * @stats - bch2_time_stats to update
+ * @v - new state, true or false
+ *
+ * Use this when tracking time stats for state changes, i.e. resource X becoming
+ * blocked/unblocked.
+ */
+static inline bool track_event_change(struct bch2_time_stats *stats, bool v)
+{
+ if (v != !!stats->last_event_start) {
+ if (!v) {
+ bch2_time_stats_update(stats, stats->last_event_start);
+ stats->last_event_start = 0;
+ } else {
+ stats->last_event_start = local_clock() ?: 1;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+void bch2_time_stats_exit(struct bch2_time_stats *);
+void bch2_time_stats_init(struct bch2_time_stats *);
+
+static inline void bch2_time_stats_quantiles_exit(struct bch2_time_stats_quantiles *statq)
+{
+ bch2_time_stats_exit(&statq->stats);
+}
+static inline void bch2_time_stats_quantiles_init(struct bch2_time_stats_quantiles *statq)
+{
+ bch2_time_stats_init(&statq->stats);
+ statq->stats.have_quantiles = true;
+ memset(&statq->quantiles, 0, sizeof(statq->quantiles));
+}
+
+#endif /* _BCACHEFS_TIME_STATS_H */
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index 293b90d704..6aa81d1e6d 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -1431,6 +1431,25 @@ DEFINE_EVENT(fs_str, data_update,
TP_ARGS(c, str)
);
+TRACE_EVENT(error_downcast,
+ TP_PROTO(int bch_err, int std_err, unsigned long ip),
+ TP_ARGS(bch_err, std_err, ip),
+
+ TP_STRUCT__entry(
+ __array(char, bch_err, 32 )
+ __array(char, std_err, 32 )
+ __array(char, ip, 32 )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err));
+ strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err));
+ snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip);
+ ),
+
+ TP_printk("%s -> %s %s", __entry->bch_err, __entry->std_err, __entry->ip)
+);
+
#endif /* _TRACE_BCACHEFS_H */
/* This part must be outside protection */
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index 3a32faa86b..92c6ad75e7 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -337,157 +337,16 @@ void bch2_prt_datetime(struct printbuf *out, time64_t sec)
}
#endif
-static const struct time_unit {
- const char *name;
- u64 nsecs;
-} time_units[] = {
- { "ns", 1 },
- { "us", NSEC_PER_USEC },
- { "ms", NSEC_PER_MSEC },
- { "s", NSEC_PER_SEC },
- { "m", (u64) NSEC_PER_SEC * 60},
- { "h", (u64) NSEC_PER_SEC * 3600},
- { "eon", U64_MAX },
-};
-
-static const struct time_unit *pick_time_units(u64 ns)
-{
- const struct time_unit *u;
-
- for (u = time_units;
- u + 1 < time_units + ARRAY_SIZE(time_units) &&
- ns >= u[1].nsecs << 1;
- u++)
- ;
-
- return u;
-}
-
void bch2_pr_time_units(struct printbuf *out, u64 ns)
{
- const struct time_unit *u = pick_time_units(ns);
+ const struct time_unit *u = bch2_pick_time_units(ns);
prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
}
-/* time stats: */
-
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-static void bch2_quantiles_update(struct bch2_quantiles *q, u64 v)
-{
- unsigned i = 0;
-
- while (i < ARRAY_SIZE(q->entries)) {
- struct bch2_quantile_entry *e = q->entries + i;
-
- if (unlikely(!e->step)) {
- e->m = v;
- e->step = max_t(unsigned, v / 2, 1024);
- } else if (e->m > v) {
- e->m = e->m >= e->step
- ? e->m - e->step
- : 0;
- } else if (e->m < v) {
- e->m = e->m + e->step > e->m
- ? e->m + e->step
- : U32_MAX;
- }
-
- if ((e->m > v ? e->m - v : v - e->m) < e->step)
- e->step = max_t(unsigned, e->step / 2, 1);
-
- if (v >= e->m)
- break;
-
- i = eytzinger0_child(i, v > e->m);
- }
-}
-
-static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
- u64 start, u64 end)
-{
- u64 duration, freq;
-
- if (time_after64(end, start)) {
- duration = end - start;
- mean_and_variance_update(&stats->duration_stats, duration);
- mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration);
- stats->max_duration = max(stats->max_duration, duration);
- stats->min_duration = min(stats->min_duration, duration);
- stats->total_duration += duration;
- bch2_quantiles_update(&stats->quantiles, duration);
- }
-
- if (stats->last_event && time_after64(end, stats->last_event)) {
- freq = end - stats->last_event;
- mean_and_variance_update(&stats->freq_stats, freq);
- mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq);
- stats->max_freq = max(stats->max_freq, freq);
- stats->min_freq = min(stats->min_freq, freq);
- }
-
- stats->last_event = end;
-}
-
-static void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
- struct bch2_time_stat_buffer *b)
-{
- for (struct bch2_time_stat_buffer_entry *i = b->entries;
- i < b->entries + ARRAY_SIZE(b->entries);
- i++)
- bch2_time_stats_update_one(stats, i->start, i->end);
- b->nr = 0;
-}
-
-static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
- struct bch2_time_stat_buffer *b)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&stats->lock, flags);
- __bch2_time_stats_clear_buffer(stats, b);
- spin_unlock_irqrestore(&stats->lock, flags);
-}
-
-void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
-{
- unsigned long flags;
-
- WARN_ONCE(!stats->duration_stats_weighted.weight ||
- !stats->freq_stats_weighted.weight,
- "uninitialized time_stats");
-
- if (!stats->buffer) {
- spin_lock_irqsave(&stats->lock, flags);
- bch2_time_stats_update_one(stats, start, end);
-
- if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 &&
- stats->duration_stats.n > 1024)
- stats->buffer =
- alloc_percpu_gfp(struct bch2_time_stat_buffer,
- GFP_ATOMIC);
- spin_unlock_irqrestore(&stats->lock, flags);
- } else {
- struct bch2_time_stat_buffer *b;
-
- preempt_disable();
- b = this_cpu_ptr(stats->buffer);
-
- BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
- b->entries[b->nr++] = (struct bch2_time_stat_buffer_entry) {
- .start = start,
- .end = end
- };
-
- if (unlikely(b->nr == ARRAY_SIZE(b->entries)))
- bch2_time_stats_clear_buffer(stats, b);
- preempt_enable();
- }
-}
-
static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
{
- const struct time_unit *u = pick_time_units(ns);
+ const struct time_unit *u = bch2_pick_time_units(ns);
prt_printf(out, "%llu ", div64_u64(ns, u->nsecs));
prt_tab_rjust(out);
@@ -506,10 +365,9 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64
void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats)
{
- const struct time_unit *u;
+ struct quantiles *quantiles = time_stats_to_quantiles(stats);
s64 f_mean = 0, d_mean = 0;
- u64 q, last_q = 0, f_stddev = 0, d_stddev = 0;
- int i;
+ u64 f_stddev = 0, d_stddev = 0;
if (stats->buffer) {
int cpu;
@@ -571,14 +429,14 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
prt_tab(out);
bch2_pr_time_units_aligned(out, d_mean);
prt_tab(out);
- bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted));
+ bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
prt_newline(out);
prt_printf(out, "stddev:");
prt_tab(out);
bch2_pr_time_units_aligned(out, d_stddev);
prt_tab(out);
- bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted));
+ bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
printbuf_indent_sub(out, 2);
prt_newline(out);
@@ -594,53 +452,38 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
prt_tab(out);
bch2_pr_time_units_aligned(out, f_mean);
prt_tab(out);
- bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted));
+ bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
prt_newline(out);
prt_printf(out, "stddev:");
prt_tab(out);
bch2_pr_time_units_aligned(out, f_stddev);
prt_tab(out);
- bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted));
+ bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
printbuf_indent_sub(out, 2);
prt_newline(out);
printbuf_tabstops_reset(out);
- i = eytzinger0_first(NR_QUANTILES);
- u = pick_time_units(stats->quantiles.entries[i].m);
-
- prt_printf(out, "quantiles (%s):\t", u->name);
- eytzinger0_for_each(i, NR_QUANTILES) {
- bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
-
- q = max(stats->quantiles.entries[i].m, last_q);
- prt_printf(out, "%llu ",
- div_u64(q, u->nsecs));
- if (is_last)
- prt_newline(out);
- last_q = q;
+ if (quantiles) {
+ int i = eytzinger0_first(NR_QUANTILES);
+ const struct time_unit *u =
+ bch2_pick_time_units(quantiles->entries[i].m);
+ u64 last_q = 0;
+
+ prt_printf(out, "quantiles (%s):\t", u->name);
+ eytzinger0_for_each(i, NR_QUANTILES) {
+ bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
+
+ u64 q = max(quantiles->entries[i].m, last_q);
+ prt_printf(out, "%llu ", div_u64(q, u->nsecs));
+ if (is_last)
+ prt_newline(out);
+ last_q = q;
+ }
}
}
-#else
-void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) {}
-#endif
-
-void bch2_time_stats_exit(struct bch2_time_stats *stats)
-{
- free_percpu(stats->buffer);
-}
-
-void bch2_time_stats_init(struct bch2_time_stats *stats)
-{
- memset(stats, 0, sizeof(*stats));
- stats->duration_stats_weighted.weight = 8;
- stats->freq_stats_weighted.weight = 8;
- stats->min_duration = U64_MAX;
- stats->min_freq = U64_MAX;
- spin_lock_init(&stats->lock);
-}
/* ratelimit: */
@@ -864,171 +707,6 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
}
}
-static int alignment_ok(const void *base, size_t align)
-{
- return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
- ((unsigned long)base & (align - 1)) == 0;
-}
-
-static void u32_swap(void *a, void *b, size_t size)
-{
- u32 t = *(u32 *)a;
- *(u32 *)a = *(u32 *)b;
- *(u32 *)b = t;
-}
-
-static void u64_swap(void *a, void *b, size_t size)
-{
- u64 t = *(u64 *)a;
- *(u64 *)a = *(u64 *)b;
- *(u64 *)b = t;
-}
-
-static void generic_swap(void *a, void *b, size_t size)
-{
- char t;
-
- do {
- t = *(char *)a;
- *(char *)a++ = *(char *)b;
- *(char *)b++ = t;
- } while (--size > 0);
-}
-
-static inline int do_cmp(void *base, size_t n, size_t size,
- int (*cmp_func)(const void *, const void *, size_t),
- size_t l, size_t r)
-{
- return cmp_func(base + inorder_to_eytzinger0(l, n) * size,
- base + inorder_to_eytzinger0(r, n) * size,
- size);
-}
-
-static inline void do_swap(void *base, size_t n, size_t size,
- void (*swap_func)(void *, void *, size_t),
- size_t l, size_t r)
-{
- swap_func(base + inorder_to_eytzinger0(l, n) * size,
- base + inorder_to_eytzinger0(r, n) * size,
- size);
-}
-
-void eytzinger0_sort(void *base, size_t n, size_t size,
- int (*cmp_func)(const void *, const void *, size_t),
- void (*swap_func)(void *, void *, size_t))
-{
- int i, c, r;
-
- if (!swap_func) {
- if (size == 4 && alignment_ok(base, 4))
- swap_func = u32_swap;
- else if (size == 8 && alignment_ok(base, 8))
- swap_func = u64_swap;
- else
- swap_func = generic_swap;
- }
-
- /* heapify */
- for (i = n / 2 - 1; i >= 0; --i) {
- for (r = i; r * 2 + 1 < n; r = c) {
- c = r * 2 + 1;
-
- if (c + 1 < n &&
- do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
- c++;
-
- if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
- break;
-
- do_swap(base, n, size, swap_func, r, c);
- }
- }
-
- /* sort */
- for (i = n - 1; i > 0; --i) {
- do_swap(base, n, size, swap_func, 0, i);
-
- for (r = 0; r * 2 + 1 < i; r = c) {
- c = r * 2 + 1;
-
- if (c + 1 < i &&
- do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
- c++;
-
- if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
- break;
-
- do_swap(base, n, size, swap_func, r, c);
- }
- }
-}
-
-void sort_cmp_size(void *base, size_t num, size_t size,
- int (*cmp_func)(const void *, const void *, size_t),
- void (*swap_func)(void *, void *, size_t size))
-{
- /* pre-scale counters for performance */
- int i = (num/2 - 1) * size, n = num * size, c, r;
-
- if (!swap_func) {
- if (size == 4 && alignment_ok(base, 4))
- swap_func = u32_swap;
- else if (size == 8 && alignment_ok(base, 8))
- swap_func = u64_swap;
- else
- swap_func = generic_swap;
- }
-
- /* heapify */
- for ( ; i >= 0; i -= size) {
- for (r = i; r * 2 + size < n; r = c) {
- c = r * 2 + size;
- if (c < n - size &&
- cmp_func(base + c, base + c + size, size) < 0)
- c += size;
- if (cmp_func(base + r, base + c, size) >= 0)
- break;
- swap_func(base + r, base + c, size);
- }
- }
-
- /* sort */
- for (i = n - size; i > 0; i -= size) {
- swap_func(base, base + i, size);
- for (r = 0; r * 2 + size < i; r = c) {
- c = r * 2 + size;
- if (c < i - size &&
- cmp_func(base + c, base + c + size, size) < 0)
- c += size;
- if (cmp_func(base + r, base + c, size) >= 0)
- break;
- swap_func(base + r, base + c, size);
- }
- }
-}
-
-static void mempool_free_vp(void *element, void *pool_data)
-{
- size_t size = (size_t) pool_data;
-
- vpfree(element, size);
-}
-
-static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data)
-{
- size_t size = (size_t) pool_data;
-
- return vpmalloc(size, gfp_mask);
-}
-
-int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size)
-{
- return size < PAGE_SIZE
- ? mempool_init_kmalloc_pool(pool, min_nr, size)
- : mempool_init(pool, min_nr, mempool_alloc_vp,
- mempool_free_vp, (void *) size);
-}
-
#if 0
void eytzinger1_test(void)
{
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index b414736d59..5cf885b099 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -21,6 +21,7 @@
#include "mean_and_variance.h"
#include "darray.h"
+#include "time_stats.h"
struct closure;
@@ -53,38 +54,6 @@ static inline size_t buf_pages(void *p, size_t len)
PAGE_SIZE);
}
-static inline void vpfree(void *p, size_t size)
-{
- if (is_vmalloc_addr(p))
- vfree(p);
- else
- free_pages((unsigned long) p, get_order(size));
-}
-
-static inline void *vpmalloc(size_t size, gfp_t gfp_mask)
-{
- return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN,
- get_order(size)) ?:
- __vmalloc(size, gfp_mask);
-}
-
-static inline void kvpfree(void *p, size_t size)
-{
- if (size < PAGE_SIZE)
- kfree(p);
- else
- vpfree(p, size);
-}
-
-static inline void *kvpmalloc(size_t size, gfp_t gfp_mask)
-{
- return size < PAGE_SIZE
- ? kmalloc(size, gfp_mask)
- : vpmalloc(size, gfp_mask);
-}
-
-int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t);
-
#define HEAP(type) \
struct { \
size_t size, used; \
@@ -97,13 +66,13 @@ struct { \
({ \
(heap)->used = 0; \
(heap)->size = (_size); \
- (heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\
+ (heap)->data = kvmalloc((heap)->size * sizeof((heap)->data[0]),\
(gfp)); \
})
#define free_heap(heap) \
do { \
- kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0])); \
+ kvfree((heap)->data); \
(heap)->data = NULL; \
} while (0)
@@ -361,84 +330,8 @@ static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev)
#endif
}
-#define NR_QUANTILES 15
-#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES)
-#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES)
-#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES)
-
-struct bch2_quantiles {
- struct bch2_quantile_entry {
- u64 m;
- u64 step;
- } entries[NR_QUANTILES];
-};
-
-struct bch2_time_stat_buffer {
- unsigned nr;
- struct bch2_time_stat_buffer_entry {
- u64 start;
- u64 end;
- } entries[32];
-};
-
-struct bch2_time_stats {
- spinlock_t lock;
- /* all fields are in nanoseconds */
- u64 min_duration;
- u64 max_duration;
- u64 total_duration;
- u64 max_freq;
- u64 min_freq;
- u64 last_event;
- struct bch2_quantiles quantiles;
-
- struct mean_and_variance duration_stats;
- struct mean_and_variance_weighted duration_stats_weighted;
- struct mean_and_variance freq_stats;
- struct mean_and_variance_weighted freq_stats_weighted;
- struct bch2_time_stat_buffer __percpu *buffer;
-};
-
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
-
-static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
-{
- __bch2_time_stats_update(stats, start, local_clock());
-}
-
-static inline bool track_event_change(struct bch2_time_stats *stats,
- u64 *start, bool v)
-{
- if (v != !!*start) {
- if (!v) {
- bch2_time_stats_update(stats, *start);
- *start = 0;
- } else {
- *start = local_clock() ?: 1;
- return true;
- }
- }
-
- return false;
-}
-#else
-static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {}
-static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) {}
-static inline bool track_event_change(struct bch2_time_stats *stats,
- u64 *start, bool v)
-{
- bool ret = v && !*start;
- *start = v;
- return ret;
-}
-#endif
-
void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *);
-void bch2_time_stats_exit(struct bch2_time_stats *);
-void bch2_time_stats_init(struct bch2_time_stats *);
-
#define ewma_add(ewma, val, weight) \
({ \
typeof(ewma) _ewma = (ewma); \
@@ -738,10 +631,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
memset(s + bytes, c, rem);
}
-void sort_cmp_size(void *base, size_t num, size_t size,
- int (*cmp_func)(const void *, const void *, size_t),
- void (*swap_func)(void *, void *, size_t));
-
/* just the memmove, doesn't update @_nr */
#define __array_insert_item(_array, _nr, _pos) \
memmove(&(_array)[(_pos) + 1], \
@@ -788,8 +677,15 @@ static inline void __move_gap(void *array, size_t element_size,
}
/* Move the gap in a gap buffer: */
-#define move_gap(_array, _nr, _size, _old_gap, _new_gap) \
- __move_gap(_array, sizeof(_array[0]), _nr, _size, _old_gap, _new_gap)
+#define move_gap(_d, _new_gap) \
+do { \
+ BUG_ON(_new_gap > (_d)->nr); \
+ BUG_ON((_d)->gap > (_d)->nr); \
+ \
+ __move_gap((_d)->data, sizeof((_d)->data[0]), \
+ (_d)->nr, (_d)->size, (_d)->gap, _new_gap); \
+ (_d)->gap = _new_gap; \
+} while (0)
#define bubble_sort(_base, _nr, _cmp) \
do { \
@@ -876,4 +772,43 @@ static inline bool qstr_eq(const struct qstr l, const struct qstr r)
void bch2_darray_str_exit(darray_str *);
int bch2_split_devs(const char *, darray_str *);
+#ifdef __KERNEL__
+
+__must_check
+static inline int copy_to_user_errcode(void __user *to, const void *from, unsigned long n)
+{
+ return copy_to_user(to, from, n) ? -EFAULT : 0;
+}
+
+__must_check
+static inline int copy_from_user_errcode(void *to, const void __user *from, unsigned long n)
+{
+ return copy_from_user(to, from, n) ? -EFAULT : 0;
+}
+
+#endif
+
+static inline void mod_bit(long nr, volatile unsigned long *addr, bool v)
+{
+ if (v)
+ set_bit(nr, addr);
+ else
+ clear_bit(nr, addr);
+}
+
+static inline void __set_bit_le64(size_t bit, __le64 *addr)
+{
+ addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64));
+}
+
+static inline void __clear_bit_le64(size_t bit, __le64 *addr)
+{
+ addr[bit / 64] &= ~cpu_to_le64(BIT_ULL(bit % 64));
+}
+
+static inline bool test_bit_le64(size_t bit, __le64 *addr)
+{
+ return (addr[bit / 64] & cpu_to_le64(BIT_ULL(bit % 64))) != 0;
+}
+
#endif /* _BCACHEFS_UTIL_H */
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index 9c0d231603..754f17bba6 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -544,11 +544,11 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
kfree(buf);
if (ret < 0)
- return ret;
+ goto err_class_exit;
ret = bch2_opt_check_may_set(c, opt_id, v);
if (ret < 0)
- return ret;
+ goto err_class_exit;
s.v = v + 1;
s.defined = true;
@@ -595,6 +595,7 @@ err:
(opt_id == Opt_compression && !inode_opt_get(c, &inode->ei_inode, background_compression))))
bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum);
+err_class_exit:
return bch2_err_class(ret);
}