summaryrefslogtreecommitdiffstats
path: root/fs/bcachefs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/bcachefs')
-rw-r--r--fs/bcachefs/Kconfig18
-rw-r--r--fs/bcachefs/Makefile3
-rw-r--r--fs/bcachefs/alloc_background.c573
-rw-r--r--fs/bcachefs/alloc_background.h39
-rw-r--r--fs/bcachefs/alloc_background_format.h92
-rw-r--r--fs/bcachefs/alloc_foreground.c53
-rw-r--r--fs/bcachefs/backpointers.c259
-rw-r--r--fs/bcachefs/backpointers.h28
-rw-r--r--fs/bcachefs/bcachefs.h209
-rw-r--r--fs/bcachefs/bcachefs_format.h1005
-rw-r--r--fs/bcachefs/bcachefs_ioctl.h60
-rw-r--r--fs/bcachefs/bkey.c2
-rw-r--r--fs/bcachefs/bkey_methods.c9
-rw-r--r--fs/bcachefs/bkey_methods.h88
-rw-r--r--fs/bcachefs/bset.c13
-rw-r--r--fs/bcachefs/bset.h3
-rw-r--r--fs/bcachefs/btree_cache.c40
-rw-r--r--fs/bcachefs/btree_cache.h23
-rw-r--r--fs/bcachefs/btree_gc.c363
-rw-r--r--fs/bcachefs/btree_io.c170
-rw-r--r--fs/bcachefs/btree_io.h2
-rw-r--r--fs/bcachefs/btree_iter.c947
-rw-r--r--fs/bcachefs/btree_iter.h412
-rw-r--r--fs/bcachefs/btree_journal_iter.c25
-rw-r--r--fs/bcachefs/btree_key_cache.c63
-rw-r--r--fs/bcachefs/btree_key_cache.h2
-rw-r--r--fs/bcachefs/btree_locking.c149
-rw-r--r--fs/bcachefs/btree_locking.h25
-rw-r--r--fs/bcachefs/btree_trans_commit.c342
-rw-r--r--fs/bcachefs/btree_types.h148
-rw-r--r--fs/bcachefs/btree_update.c245
-rw-r--r--fs/bcachefs/btree_update.h111
-rw-r--r--fs/bcachefs/btree_update_interior.c333
-rw-r--r--fs/bcachefs/btree_update_interior.h53
-rw-r--r--fs/bcachefs/btree_write_buffer.c669
-rw-r--r--fs/bcachefs/btree_write_buffer.h53
-rw-r--r--fs/bcachefs/btree_write_buffer_types.h63
-rw-r--r--fs/bcachefs/buckets.c1509
-rw-r--r--fs/bcachefs/buckets.h60
-rw-r--r--fs/bcachefs/buckets_types.h17
-rw-r--r--fs/bcachefs/chardev.c362
-rw-r--r--fs/bcachefs/checksum.h23
-rw-r--r--fs/bcachefs/compress.c4
-rw-r--r--fs/bcachefs/compress.h8
-rw-r--r--fs/bcachefs/darray.h8
-rw-r--r--fs/bcachefs/data_update.c36
-rw-r--r--fs/bcachefs/debug.c157
-rw-r--r--fs/bcachefs/dirent.c51
-rw-r--r--fs/bcachefs/dirent.h7
-rw-r--r--fs/bcachefs/dirent_format.h42
-rw-r--r--fs/bcachefs/disk_groups.c13
-rw-r--r--fs/bcachefs/ec.c406
-rw-r--r--fs/bcachefs/ec.h5
-rw-r--r--fs/bcachefs/ec_format.h19
-rw-r--r--fs/bcachefs/ec_types.h2
-rw-r--r--fs/bcachefs/errcode.h6
-rw-r--r--fs/bcachefs/error.c103
-rw-r--r--fs/bcachefs/extent_update.c2
-rw-r--r--fs/bcachefs/extents.c15
-rw-r--r--fs/bcachefs/extents.h26
-rw-r--r--fs/bcachefs/extents_format.h295
-rw-r--r--fs/bcachefs/eytzinger.h14
-rw-r--r--fs/bcachefs/fs-common.c36
-rw-r--r--fs/bcachefs/fs-io-buffered.c59
-rw-r--r--fs/bcachefs/fs-io-direct.c5
-rw-r--r--fs/bcachefs/fs-io-pagecache.c37
-rw-r--r--fs/bcachefs/fs-io-pagecache.h2
-rw-r--r--fs/bcachefs/fs-io.c27
-rw-r--r--fs/bcachefs/fs-ioctl.c16
-rw-r--r--fs/bcachefs/fs.c111
-rw-r--r--fs/bcachefs/fs.h9
-rw-r--r--fs/bcachefs/fsck.c647
-rw-r--r--fs/bcachefs/inode.c154
-rw-r--r--fs/bcachefs/inode.h15
-rw-r--r--fs/bcachefs/inode_format.h166
-rw-r--r--fs/bcachefs/io_misc.c59
-rw-r--r--fs/bcachefs/io_read.c50
-rw-r--r--fs/bcachefs/io_write.c51
-rw-r--r--fs/bcachefs/journal.c211
-rw-r--r--fs/bcachefs/journal.h4
-rw-r--r--fs/bcachefs/journal_io.c162
-rw-r--r--fs/bcachefs/journal_reclaim.c130
-rw-r--r--fs/bcachefs/journal_reclaim.h16
-rw-r--r--fs/bcachefs/journal_seq_blacklist.c2
-rw-r--r--fs/bcachefs/journal_types.h16
-rw-r--r--fs/bcachefs/keylist.c2
-rw-r--r--fs/bcachefs/keylist.h4
-rw-r--r--fs/bcachefs/logged_ops.c18
-rw-r--r--fs/bcachefs/logged_ops_format.h30
-rw-r--r--fs/bcachefs/lru.c11
-rw-r--r--fs/bcachefs/mean_and_variance.c10
-rw-r--r--fs/bcachefs/mean_and_variance.h3
-rw-r--r--fs/bcachefs/mean_and_variance_test.c28
-rw-r--r--fs/bcachefs/migrate.c9
-rw-r--r--fs/bcachefs/move.c250
-rw-r--r--fs/bcachefs/move.h12
-rw-r--r--fs/bcachefs/movinggc.c49
-rw-r--r--fs/bcachefs/opts.c8
-rw-r--r--fs/bcachefs/opts.h29
-rw-r--r--fs/bcachefs/printbuf.c1
-rw-r--r--fs/bcachefs/quota.c28
-rw-r--r--fs/bcachefs/quota_format.h47
-rw-r--r--fs/bcachefs/rebalance.c34
-rw-r--r--fs/bcachefs/recovery.c304
-rw-r--r--fs/bcachefs/recovery.h1
-rw-r--r--fs/bcachefs/recovery_types.h25
-rw-r--r--fs/bcachefs/reflink.c243
-rw-r--r--fs/bcachefs/reflink.h26
-rw-r--r--fs/bcachefs/reflink_format.h33
-rw-r--r--fs/bcachefs/replicas.c84
-rw-r--r--fs/bcachefs/replicas.h22
-rw-r--r--fs/bcachefs/replicas_types.h6
-rw-r--r--fs/bcachefs/sb-clean.c22
-rw-r--r--fs/bcachefs/sb-counters.c (renamed from fs/bcachefs/counters.c)2
-rw-r--r--fs/bcachefs/sb-counters.h (renamed from fs/bcachefs/counters.h)7
-rw-r--r--fs/bcachefs/sb-counters_format.h98
-rw-r--r--fs/bcachefs/sb-downgrade.c90
-rw-r--r--fs/bcachefs/sb-downgrade.h1
-rw-r--r--fs/bcachefs/sb-errors_types.h4
-rw-r--r--fs/bcachefs/sb-members.c24
-rw-r--r--fs/bcachefs/sb-members.h100
-rw-r--r--fs/bcachefs/six.c117
-rw-r--r--fs/bcachefs/six.h13
-rw-r--r--fs/bcachefs/snapshot.c178
-rw-r--r--fs/bcachefs/snapshot.h8
-rw-r--r--fs/bcachefs/snapshot_format.h36
-rw-r--r--fs/bcachefs/str_hash.h47
-rw-r--r--fs/bcachefs/subvolume.c31
-rw-r--r--fs/bcachefs/subvolume_format.h35
-rw-r--r--fs/bcachefs/subvolume_types.h4
-rw-r--r--fs/bcachefs/super-io.c168
-rw-r--r--fs/bcachefs/super-io.h7
-rw-r--r--fs/bcachefs/super.c398
-rw-r--r--fs/bcachefs/super.h6
-rw-r--r--fs/bcachefs/super_types.h3
-rw-r--r--fs/bcachefs/sysfs.c173
-rw-r--r--fs/bcachefs/tests.c193
-rw-r--r--fs/bcachefs/thread_with_file.c299
-rw-r--r--fs/bcachefs/thread_with_file.h41
-rw-r--r--fs/bcachefs/thread_with_file_types.h16
-rw-r--r--fs/bcachefs/trace.h318
-rw-r--r--fs/bcachefs/util.c206
-rw-r--r--fs/bcachefs/util.h59
-rw-r--r--fs/bcachefs/vstructs.h10
-rw-r--r--fs/bcachefs/xattr.c5
-rw-r--r--fs/bcachefs/xattr_format.h19
146 files changed, 8616 insertions, 7306 deletions
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index fddc7be580..5cdfef3b55 100644
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -50,14 +50,6 @@ config BCACHEFS_POSIX_ACL
depends on BCACHEFS_FS
select FS_POSIX_ACL
-config BCACHEFS_DEBUG_TRANSACTIONS
- bool "bcachefs runtime info"
- depends on BCACHEFS_FS
- help
- This makes the list of running btree transactions available in debugfs.
-
- This is a highly useful debugging feature but does add a small amount of overhead.
-
config BCACHEFS_DEBUG
bool "bcachefs debugging"
depends on BCACHEFS_FS
@@ -85,6 +77,16 @@ config BCACHEFS_NO_LATENCY_ACCT
help
This disables device latency tracking and time stats, only for performance testing
+config BCACHEFS_SIX_OPTIMISTIC_SPIN
+ bool "Optimistic spinning for six locks"
+ depends on BCACHEFS_FS
+ depends on SMP
+ default y
+ help
+ Instead of immediately sleeping when attempting to take a six lock that
+ is held by another thread, spin for a short while, as long as the
+ thread owning the lock is running.
+
config MEAN_AND_VARIANCE_UNIT_TEST
tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS
depends on KUNIT
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index b812684181..1a05cecda7 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -27,7 +27,6 @@ bcachefs-y := \
checksum.o \
clock.o \
compress.o \
- counters.o \
darray.o \
debug.o \
dirent.o \
@@ -71,6 +70,7 @@ bcachefs-y := \
reflink.o \
replicas.o \
sb-clean.o \
+ sb-counters.o \
sb-downgrade.o \
sb-errors.o \
sb-members.o \
@@ -82,6 +82,7 @@ bcachefs-y := \
super-io.o \
sysfs.o \
tests.o \
+ thread_with_file.o \
trace.o \
two_state_shared_lock.o \
util.o \
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 1fec0e6789..fd3e175d83 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -261,10 +261,8 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
case BCH_DATA_free:
case BCH_DATA_need_gc_gens:
case BCH_DATA_need_discard:
- bkey_fsck_err_on(a.v->dirty_sectors ||
- a.v->cached_sectors ||
- a.v->stripe, c, err,
- alloc_key_empty_but_have_data,
+ bkey_fsck_err_on(bch2_bucket_sectors(*a.v) || a.v->stripe,
+ c, err, alloc_key_empty_but_have_data,
"empty data type free but have data");
break;
case BCH_DATA_sb:
@@ -272,22 +270,21 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
case BCH_DATA_btree:
case BCH_DATA_user:
case BCH_DATA_parity:
- bkey_fsck_err_on(!a.v->dirty_sectors, c, err,
- alloc_key_dirty_sectors_0,
+ bkey_fsck_err_on(!bch2_bucket_sectors_dirty(*a.v),
+ c, err, alloc_key_dirty_sectors_0,
"data_type %s but dirty_sectors==0",
- bch2_data_types[a.v->data_type]);
+ bch2_data_type_str(a.v->data_type));
break;
case BCH_DATA_cached:
bkey_fsck_err_on(!a.v->cached_sectors ||
- a.v->dirty_sectors ||
- a.v->stripe, c, err,
- alloc_key_cached_inconsistency,
+ bch2_bucket_sectors_dirty(*a.v) ||
+ a.v->stripe,
+ c, err, alloc_key_cached_inconsistency,
"data type inconsistency");
bkey_fsck_err_on(!a.v->io_time[READ] &&
c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs,
- c, err,
- alloc_key_cached_but_read_time_zero,
+ c, err, alloc_key_cached_but_read_time_zero,
"cached bucket with read_time == 0");
break;
case BCH_DATA_stripe:
@@ -324,16 +321,12 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
{
struct bch_alloc_v4 _a;
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
- unsigned i;
prt_newline(out);
printbuf_indent_add(out, 2);
- prt_printf(out, "gen %u oldest_gen %u data_type %s",
- a->gen, a->oldest_gen,
- a->data_type < BCH_DATA_NR
- ? bch2_data_types[a->data_type]
- : "(invalid data type)");
+ prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen);
+ bch2_prt_data_type(out, a->data_type);
prt_newline(out);
prt_printf(out, "journal_seq %llu", a->journal_seq);
prt_newline(out);
@@ -356,23 +349,6 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
prt_printf(out, "fragmentation %llu", a->fragmentation_lru);
prt_newline(out);
prt_printf(out, "bp_start %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a));
- prt_newline(out);
-
- if (BCH_ALLOC_V4_NR_BACKPOINTERS(a)) {
- struct bkey_s_c_alloc_v4 a_raw = bkey_s_c_to_alloc_v4(k);
- const struct bch_backpointer *bps = alloc_v4_backpointers_c(a_raw.v);
-
- prt_printf(out, "backpointers: %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v));
- printbuf_indent_add(out, 2);
-
- for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v); i++) {
- prt_newline(out);
- bch2_backpointer_to_text(out, &bps[i]);
- }
-
- printbuf_indent_sub(out, 2);
- }
-
printbuf_indent_sub(out, 2);
}
@@ -537,18 +513,12 @@ void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bke
int bch2_bucket_gens_init(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_alloc_v4 a;
struct bkey_i_bucket_gens g;
bool have_bucket_gens_key = false;
- unsigned offset;
- struct bpos pos;
- u8 gen;
int ret;
- for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_PREFETCH, k, ret) {
+ ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ({
/*
* Not a fsck error because this is checked/repaired by
* bch2_check_alloc_key() which runs later:
@@ -556,13 +526,14 @@ int bch2_bucket_gens_init(struct bch_fs *c)
if (!bch2_dev_bucket_exists(c, k.k->p))
continue;
- gen = bch2_alloc_to_v4(k, &a)->gen;
- pos = alloc_gens_pos(iter.pos, &offset);
+ struct bch_alloc_v4 a;
+ u8 gen = bch2_alloc_to_v4(k, &a)->gen;
+ unsigned offset;
+ struct bpos pos = alloc_gens_pos(iter.pos, &offset);
if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) {
ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
+ BCH_TRANS_COMMIT_no_enospc,
bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
if (ret)
break;
@@ -576,45 +547,37 @@ int bch2_bucket_gens_init(struct bch_fs *c)
}
g.v.gens[offset] = gen;
- }
- bch2_trans_iter_exit(trans, &iter);
+ 0;
+ }));
if (have_bucket_gens_key && !ret)
ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
+ BCH_TRANS_COMMIT_no_enospc,
bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
bch2_trans_put(trans);
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
int bch2_alloc_read(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_dev *ca;
int ret;
down_read(&c->gc_lock);
if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) {
- const struct bch_bucket_gens *g;
- u64 b;
-
- for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN,
- BTREE_ITER_PREFETCH, k, ret) {
+ ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ({
u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
if (k.k->type != KEY_TYPE_bucket_gens)
continue;
- g = bkey_s_c_to_bucket_gens(k).v;
+ const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v;
/*
* Not a fsck error because this is checked/repaired by
@@ -623,19 +586,17 @@ int bch2_alloc_read(struct bch_fs *c)
if (!bch2_dev_exists2(c, k.k->p.inode))
continue;
- ca = bch_dev_bkey_exists(c, k.k->p.inode);
+ struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode);
- for (b = max_t(u64, ca->mi.first_bucket, start);
+ for (u64 b = max_t(u64, ca->mi.first_bucket, start);
b < min_t(u64, ca->mi.nbuckets, end);
b++)
*bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK];
- }
- bch2_trans_iter_exit(trans, &iter);
+ 0;
+ }));
} else {
- struct bch_alloc_v4 a;
-
- for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_PREFETCH, k, ret) {
+ ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ({
/*
* Not a fsck error because this is checked/repaired by
* bch2_check_alloc_key() which runs later:
@@ -643,19 +604,18 @@ int bch2_alloc_read(struct bch_fs *c)
if (!bch2_dev_bucket_exists(c, k.k->p))
continue;
- ca = bch_dev_bkey_exists(c, k.k->p.inode);
+ struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode);
+ struct bch_alloc_v4 a;
*bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
- }
- bch2_trans_iter_exit(trans, &iter);
+ 0;
+ }));
}
bch2_trans_put(trans);
up_read(&c->gc_lock);
- if (ret)
- bch_err_fn(c, ret);
-
+ bch_err_fn(c, ret);
return ret;
}
@@ -768,83 +728,177 @@ static noinline int bch2_bucket_gen_update(struct btree_trans *trans,
return ret;
}
-int bch2_trans_mark_alloc(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_i *new,
- unsigned flags)
+int bch2_trigger_alloc(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c old, struct bkey_s new,
+ unsigned flags)
{
struct bch_fs *c = trans->c;
- struct bch_alloc_v4 old_a_convert, *new_a;
- const struct bch_alloc_v4 *old_a;
- u64 old_lru, new_lru;
int ret = 0;
- /*
- * Deletion only happens in the device removal path, with
- * BTREE_TRIGGER_NORUN:
- */
- BUG_ON(new->k.type != KEY_TYPE_alloc_v4);
+ if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans,
+ "alloc key for invalid device or bucket"))
+ return -EIO;
- old_a = bch2_alloc_to_v4(old, &old_a_convert);
- new_a = &bkey_i_to_alloc_v4(new)->v;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, new.k->p.inode);
- new_a->data_type = alloc_data_type(*new_a, new_a->data_type);
+ struct bch_alloc_v4 old_a_convert;
+ const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert);
- if (new_a->dirty_sectors > old_a->dirty_sectors ||
- new_a->cached_sectors > old_a->cached_sectors) {
- new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
- new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now));
- SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
- SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
- }
+ if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v;
- if (data_type_is_empty(new_a->data_type) &&
- BCH_ALLOC_V4_NEED_INC_GEN(new_a) &&
- !bch2_bucket_is_open_safe(c, new->k.p.inode, new->k.p.offset)) {
- new_a->gen++;
- SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
- }
+ new_a->data_type = alloc_data_type(*new_a, new_a->data_type);
- if (old_a->data_type != new_a->data_type ||
- (new_a->data_type == BCH_DATA_free &&
- alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) {
- ret = bch2_bucket_do_index(trans, old, old_a, false) ?:
- bch2_bucket_do_index(trans, bkey_i_to_s_c(new), new_a, true);
- if (ret)
- return ret;
- }
+ if (bch2_bucket_sectors(*new_a) > bch2_bucket_sectors(*old_a)) {
+ new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
+ new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now));
+ SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
+ SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
+ }
- if (new_a->data_type == BCH_DATA_cached &&
- !new_a->io_time[READ])
- new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
+ if (data_type_is_empty(new_a->data_type) &&
+ BCH_ALLOC_V4_NEED_INC_GEN(new_a) &&
+ !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) {
+ new_a->gen++;
+ SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
+ }
+
+ if (old_a->data_type != new_a->data_type ||
+ (new_a->data_type == BCH_DATA_free &&
+ alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) {
+ ret = bch2_bucket_do_index(trans, old, old_a, false) ?:
+ bch2_bucket_do_index(trans, new.s_c, new_a, true);
+ if (ret)
+ return ret;
+ }
- old_lru = alloc_lru_idx_read(*old_a);
- new_lru = alloc_lru_idx_read(*new_a);
+ if (new_a->data_type == BCH_DATA_cached &&
+ !new_a->io_time[READ])
+ new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
- if (old_lru != new_lru) {
- ret = bch2_lru_change(trans, new->k.p.inode,
- bucket_to_u64(new->k.p),
- old_lru, new_lru);
- if (ret)
- return ret;
+ u64 old_lru = alloc_lru_idx_read(*old_a);
+ u64 new_lru = alloc_lru_idx_read(*new_a);
+ if (old_lru != new_lru) {
+ ret = bch2_lru_change(trans, new.k->p.inode,
+ bucket_to_u64(new.k->p),
+ old_lru, new_lru);
+ if (ret)
+ return ret;
+ }
+
+ new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a,
+ bch_dev_bkey_exists(c, new.k->p.inode));
+ if (old_a->fragmentation_lru != new_a->fragmentation_lru) {
+ ret = bch2_lru_change(trans,
+ BCH_LRU_FRAGMENTATION_START,
+ bucket_to_u64(new.k->p),
+ old_a->fragmentation_lru, new_a->fragmentation_lru);
+ if (ret)
+ return ret;
+ }
+
+ if (old_a->gen != new_a->gen) {
+ ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * need to know if we're getting called from the invalidate path or
+ * not:
+ */
+
+ if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
+ old_a->cached_sectors) {
+ ret = bch2_update_cached_sectors_list(trans, new.k->p.inode,
+ -((s64) old_a->cached_sectors));
+ if (ret)
+ return ret;
+ }
}
- new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a,
- bch_dev_bkey_exists(c, new->k.p.inode));
+ if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) {
+ struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v;
+ u64 journal_seq = trans->journal_res.seq;
+ u64 bucket_journal_seq = new_a->journal_seq;
- if (old_a->fragmentation_lru != new_a->fragmentation_lru) {
- ret = bch2_lru_change(trans,
- BCH_LRU_FRAGMENTATION_START,
- bucket_to_u64(new->k.p),
- old_a->fragmentation_lru, new_a->fragmentation_lru);
- if (ret)
- return ret;
+ if ((flags & BTREE_TRIGGER_INSERT) &&
+ data_type_is_empty(old_a->data_type) !=
+ data_type_is_empty(new_a->data_type) &&
+ new.k->type == KEY_TYPE_alloc_v4) {
+ struct bch_alloc_v4 *v = bkey_s_to_alloc_v4(new).v;
+
+ /*
+ * If the btree updates referring to a bucket weren't flushed
+ * before the bucket became empty again, then the we don't have
+ * to wait on a journal flush before we can reuse the bucket:
+ */
+ v->journal_seq = bucket_journal_seq =
+ data_type_is_empty(new_a->data_type) &&
+ (journal_seq == v->journal_seq ||
+ bch2_journal_noflush_seq(&c->journal, v->journal_seq))
+ ? 0 : journal_seq;
+ }
+
+ if (!data_type_is_empty(old_a->data_type) &&
+ data_type_is_empty(new_a->data_type) &&
+ bucket_journal_seq) {
+ ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+ c->journal.flushed_seq_ondisk,
+ new.k->p.inode, new.k->p.offset,
+ bucket_journal_seq);
+ if (ret) {
+ bch2_fs_fatal_error(c,
+ "error setting bucket_needs_journal_commit: %i", ret);
+ return ret;
+ }
+ }
+
+ percpu_down_read(&c->mark_lock);
+ if (new_a->gen != old_a->gen)
+ *bucket_gen(ca, new.k->p.offset) = new_a->gen;
+
+ bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false);
+
+ if (new_a->data_type == BCH_DATA_free &&
+ (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk))
+ closure_wake_up(&c->freelist_wait);
+
+ if (new_a->data_type == BCH_DATA_need_discard &&
+ (!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk))
+ bch2_do_discards(c);
+
+ if (old_a->data_type != BCH_DATA_cached &&
+ new_a->data_type == BCH_DATA_cached &&
+ should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
+ bch2_do_invalidates(c);
+
+ if (new_a->data_type == BCH_DATA_need_gc_gens)
+ bch2_do_gc_gens(c);
+ percpu_up_read(&c->mark_lock);
}
- if (old_a->gen != new_a->gen) {
- ret = bch2_bucket_gen_update(trans, new->k.p, new_a->gen);
- if (ret)
- return ret;
+ if ((flags & BTREE_TRIGGER_GC) &&
+ (flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) {
+ struct bch_alloc_v4 new_a_convert;
+ const struct bch_alloc_v4 *new_a = bch2_alloc_to_v4(new.s_c, &new_a_convert);
+
+ percpu_down_read(&c->mark_lock);
+ struct bucket *g = gc_bucket(ca, new.k->p.offset);
+
+ bucket_lock(g);
+
+ g->gen_valid = 1;
+ g->gen = new_a->gen;
+ g->data_type = new_a->data_type;
+ g->stripe = new_a->stripe;
+ g->stripe_redundancy = new_a->stripe_redundancy;
+ g->dirty_sectors = new_a->dirty_sectors;
+ g->cached_sectors = new_a->cached_sectors;
+
+ bucket_unlock(g);
+ percpu_up_read(&c->mark_lock);
}
return 0;
@@ -869,8 +923,9 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos
bch2_trans_copy_iter(&iter2, iter);
- if (!bpos_eq(iter->path->l[0].b->key.k.p, SPOS_MAX))
- end = bkey_min(end, bpos_nosnap_successor(iter->path->l[0].b->key.k.p));
+ struct btree_path *path = btree_iter_path(iter->trans, iter);
+ if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX))
+ end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p));
end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1));
@@ -898,7 +953,6 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos
static bool next_bucket(struct bch_fs *c, struct bpos *bucket)
{
struct bch_dev *ca;
- unsigned iter;
if (bch2_dev_bucket_exists(c, *bucket))
return true;
@@ -916,8 +970,7 @@ static bool next_bucket(struct bch_fs *c, struct bpos *bucket)
}
rcu_read_lock();
- iter = bucket->inode;
- ca = __bch2_next_dev(c, &iter, NULL);
+ ca = __bch2_next_dev_idx(c, bucket->inode, NULL);
if (ca)
*bucket = POS(ca->dev_idx, ca->mi.first_bucket);
rcu_read_unlock();
@@ -1158,9 +1211,6 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
unsigned i, gens_offset, gens_end_offset;
int ret;
- if (c->sb.version < bcachefs_metadata_version_bucket_gens)
- return 0;
-
bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset));
k = bch2_btree_iter_peek_slot(bucket_gens_iter);
@@ -1212,7 +1262,7 @@ fsck_err:
return ret;
}
-static noinline_for_stack int __bch2_check_discard_freespace_key(struct btree_trans *trans,
+static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_trans *trans,
struct btree_iter *iter)
{
struct bch_fs *c = trans->c;
@@ -1267,28 +1317,10 @@ delete:
ret = bch2_btree_delete_extent_at(trans, iter,
iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW);
+ BCH_TRANS_COMMIT_no_enospc);
goto out;
}
-static int bch2_check_discard_freespace_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos end)
-{
- if (!btree_id_is_extents(iter->btree_id)) {
- return __bch2_check_discard_freespace_key(trans, iter);
- } else {
- int ret = 0;
-
- while (!bkey_eq(iter->pos, end) &&
- !(ret = btree_trans_too_many_iters(trans) ?:
- __bch2_check_discard_freespace_key(trans, iter)))
- bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
-
- return ret;
- }
-}
-
/*
* We've already checked that generation numbers in the bucket_gens btree are
* valid for buckets that exist; this just checks for keys for nonexistent
@@ -1422,8 +1454,7 @@ int bch2_check_alloc_info(struct bch_fs *c)
}
ret = bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW);
+ BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto bkey_err;
@@ -1442,23 +1473,50 @@ bkey_err:
if (ret < 0)
goto err;
- ret = for_each_btree_key2(trans, iter,
+ ret = for_each_btree_key(trans, iter,
BTREE_ID_need_discard, POS_MIN,
BTREE_ITER_PREFETCH, k,
- bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?:
- for_each_btree_key2(trans, iter,
- BTREE_ID_freespace, POS_MIN,
- BTREE_ITER_PREFETCH, k,
- bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?:
- for_each_btree_key_commit(trans, iter,
+ bch2_check_discard_freespace_key(trans, &iter));
+ if (ret)
+ goto err;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN,
+ BTREE_ITER_PREFETCH);
+ while (1) {
+ bch2_trans_begin(trans);
+ k = bch2_btree_iter_peek(&iter);
+ if (!k.k)
+ break;
+
+ ret = bkey_err(k) ?:
+ bch2_check_discard_freespace_key(trans, &iter);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+ ret = 0;
+ continue;
+ }
+ if (ret) {
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ bch_err(c, "while checking %s", buf.buf);
+ printbuf_exit(&buf);
+ break;
+ }
+
+ bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos));
+ }
+ bch2_trans_iter_exit(trans, &iter);
+ if (ret)
+ goto err;
+
+ ret = for_each_btree_key_commit(trans, iter,
BTREE_ID_bucket_gens, POS_MIN,
BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_check_bucket_gens_key(trans, &iter, k));
err:
bch2_trans_put(trans);
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
@@ -1486,6 +1544,27 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
if (a->data_type != BCH_DATA_cached)
return 0;
+ if (fsck_err_on(!a->io_time[READ], c,
+ alloc_key_cached_but_read_time_zero,
+ "cached bucket with read_time 0\n"
+ " %s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+ struct bkey_i_alloc_v4 *a_mut =
+ bch2_alloc_to_v4_mut(trans, alloc_k);
+ ret = PTR_ERR_OR_ZERO(a_mut);
+ if (ret)
+ goto err;
+
+ a_mut->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now);
+ ret = bch2_trans_update(trans, alloc_iter,
+ &a_mut->k_i, BTREE_TRIGGER_NORUN);
+ if (ret)
+ goto err;
+
+ a = &a_mut->v;
+ }
+
lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru,
lru_pos(alloc_k.k->p.inode,
bucket_to_u64(alloc_k.k->p),
@@ -1494,41 +1573,18 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
if (ret)
return ret;
- if (fsck_err_on(!a->io_time[READ], c,
- alloc_key_cached_but_read_time_zero,
- "cached bucket with read_time 0\n"
- " %s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) ||
- fsck_err_on(lru_k.k->type != KEY_TYPE_set, c,
+ if (fsck_err_on(lru_k.k->type != KEY_TYPE_set, c,
alloc_key_to_missing_lru_entry,
"missing lru entry\n"
" %s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
- u64 read_time = a->io_time[READ] ?:
- atomic64_read(&c->io_clock[READ].now);
-
ret = bch2_lru_set(trans,
alloc_k.k->p.inode,
bucket_to_u64(alloc_k.k->p),
- read_time);
+ a->io_time[READ]);
if (ret)
goto err;
-
- if (a->io_time[READ] != read_time) {
- struct bkey_i_alloc_v4 *a_mut =
- bch2_alloc_to_v4_mut(trans, alloc_k);
- ret = PTR_ERR_OR_ZERO(a_mut);
- if (ret)
- goto err;
-
- a_mut->v.io_time[READ] = read_time;
- ret = bch2_trans_update(trans, alloc_iter,
- &a_mut->k_i, BTREE_TRIGGER_NORUN);
- if (ret)
- goto err;
- }
}
err:
fsck_err:
@@ -1539,27 +1595,45 @@ fsck_err:
int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- ret = bch2_trans_run(c,
+ int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
POS_MIN, BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_check_alloc_to_lru_ref(trans, &iter)));
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
+struct discard_buckets_state {
+ u64 seen;
+ u64 open;
+ u64 need_journal_commit;
+ u64 discarded;
+ struct bch_dev *ca;
+ u64 need_journal_commit_this_dev;
+};
+
+static void discard_buckets_next_dev(struct bch_fs *c, struct discard_buckets_state *s, struct bch_dev *ca)
+{
+ if (s->ca == ca)
+ return;
+
+ if (s->ca && s->need_journal_commit_this_dev >
+ bch2_dev_usage_read(s->ca).d[BCH_DATA_free].buckets)
+ bch2_journal_flush_async(&c->journal, NULL);
+
+ if (s->ca)
+ percpu_ref_put(&s->ca->ref);
+ if (ca)
+ percpu_ref_get(&ca->ref);
+ s->ca = ca;
+ s->need_journal_commit_this_dev = 0;
+}
+
static int bch2_discard_one_bucket(struct btree_trans *trans,
struct btree_iter *need_discard_iter,
struct bpos *discard_pos_done,
- u64 *seen,
- u64 *open,
- u64 *need_journal_commit,
- u64 *discarded)
+ struct discard_buckets_state *s)
{
struct bch_fs *c = trans->c;
struct bpos pos = need_discard_iter->pos;
@@ -1571,20 +1645,24 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
int ret = 0;
ca = bch_dev_bkey_exists(c, pos.inode);
+
if (!percpu_ref_tryget(&ca->io_ref)) {
bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0));
return 0;
}
+ discard_buckets_next_dev(c, s, ca);
+
if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
- (*open)++;
+ s->open++;
goto out;
}
if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
c->journal.flushed_seq_ondisk,
pos.inode, pos.offset)) {
- (*need_journal_commit)++;
+ s->need_journal_commit++;
+ s->need_journal_commit_this_dev++;
goto out;
}
@@ -1637,7 +1715,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
* This works without any other locks because this is the only
* thread that removes items from the need_discard tree
*/
- bch2_trans_unlock(trans);
+ bch2_trans_unlock_long(trans);
blkdev_issue_discard(ca->disk_sb.bdev,
k.k->p.offset * ca->mi.bucket_size,
ca->mi.bucket_size,
@@ -1655,14 +1733,14 @@ write:
ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_WATERMARK_btree|
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto out;
- this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]);
- (*discarded)++;
+ count_event(c, bucket_discard);
+ s->discarded++;
out:
- (*seen)++;
+ s->seen++;
bch2_trans_iter_exit(trans, &iter);
percpu_ref_put(&ca->io_ref);
printbuf_exit(&buf);
@@ -1672,9 +1750,7 @@ out:
static void bch2_do_discards_work(struct work_struct *work)
{
struct bch_fs *c = container_of(work, struct bch_fs, discard_work);
- struct btree_iter iter;
- struct bkey_s_c k;
- u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0;
+ struct discard_buckets_state s = {};
struct bpos discard_pos_done = POS_MAX;
int ret;
@@ -1684,21 +1760,16 @@ static void bch2_do_discards_work(struct work_struct *work)
* successful commit:
*/
ret = bch2_trans_run(c,
- for_each_btree_key2(trans, iter,
- BTREE_ID_need_discard, POS_MIN, 0, k,
- bch2_discard_one_bucket(trans, &iter, &discard_pos_done,
- &seen,
- &open,
- &need_journal_commit,
- &discarded)));
-
- if (need_journal_commit * 2 > seen)
- bch2_journal_flush_async(&c->journal, NULL);
+ for_each_btree_key(trans, iter,
+ BTREE_ID_need_discard, POS_MIN, 0, k,
+ bch2_discard_one_bucket(trans, &iter, &discard_pos_done, &s)));
- bch2_write_ref_put(c, BCH_WRITE_REF_discard);
+ discard_buckets_next_dev(c, &s, NULL);
- trace_discard_buckets(c, seen, open, need_journal_commit, discarded,
+ trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
bch2_err_str(ret));
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard);
}
void bch2_do_discards(struct bch_fs *c)
@@ -1760,7 +1831,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
BTREE_TRIGGER_BUCKET_INVALIDATE) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_WATERMARK_btree|
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto out;
@@ -1795,22 +1866,18 @@ err:
static void bch2_do_invalidates_work(struct work_struct *work)
{
struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work);
- struct bch_dev *ca;
struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- unsigned i;
int ret = 0;
- ret = bch2_btree_write_buffer_flush(trans);
+ ret = bch2_btree_write_buffer_tryflush(trans);
if (ret)
goto err;
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
s64 nr_to_invalidate =
should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
- ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru,
+ ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru,
lru_pos(ca->dev_idx, 0, 0),
lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX),
BTREE_ITER_INTENT, k,
@@ -1884,8 +1951,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
ret = bch2_bucket_do_index(trans, k, a, true) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto bkey_err;
@@ -1905,8 +1971,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto bkey_err;
@@ -1937,8 +2002,6 @@ bkey_err:
int bch2_fs_freespace_init(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i;
int ret = 0;
bool doing_init = false;
@@ -1947,7 +2010,7 @@ int bch2_fs_freespace_init(struct bch_fs *c)
* every mount:
*/
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
if (ca->mi.freespace_initialized)
continue;
@@ -2007,15 +2070,13 @@ out:
void bch2_recalc_capacity(struct bch_fs *c)
{
- struct bch_dev *ca;
u64 capacity = 0, reserved_sectors = 0, gc_reserve;
unsigned bucket_size_max = 0;
unsigned long ra_pages = 0;
- unsigned i;
lockdep_assert_held(&c->state_lock);
- for_each_online_member(ca, c, i) {
+ for_each_online_member(c, ca) {
struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi;
ra_pages += bdi->ra_pages;
@@ -2023,7 +2084,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
bch2_set_ra_pages(c, ra_pages);
- for_each_rw_member(ca, c, i) {
+ for_each_rw_member(c, ca) {
u64 dev_reserve = 0;
/*
@@ -2079,11 +2140,9 @@ void bch2_recalc_capacity(struct bch_fs *c)
u64 bch2_min_rw_member_capacity(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i;
u64 ret = U64_MAX;
- for_each_rw_member(ca, c, i)
+ for_each_rw_member(c, ca)
ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size);
return ret;
}
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index 73faf99a22..e7f7e842ee 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -71,6 +71,24 @@ static inline enum bch_data_type bucket_data_type(enum bch_data_type data_type)
return data_type == BCH_DATA_stripe ? BCH_DATA_user : data_type;
}
+static inline unsigned bch2_bucket_sectors(struct bch_alloc_v4 a)
+{
+ return a.dirty_sectors + a.cached_sectors;
+}
+
+static inline unsigned bch2_bucket_sectors_dirty(struct bch_alloc_v4 a)
+{
+ return a.dirty_sectors;
+}
+
+static inline unsigned bch2_bucket_sectors_fragmented(struct bch_dev *ca,
+ struct bch_alloc_v4 a)
+{
+ int d = bch2_bucket_sectors_dirty(a);
+
+ return d ? max(0, ca->mi.bucket_size - d) : 0;
+}
+
static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a)
{
return a.data_type == BCH_DATA_cached ? a.io_time[READ] : 0;
@@ -90,10 +108,11 @@ static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a,
struct bch_dev *ca)
{
if (!data_type_movable(a.data_type) ||
- a.dirty_sectors >= ca->mi.bucket_size)
+ !bch2_bucket_sectors_fragmented(ca, a))
return 0;
- return div_u64((u64) a.dirty_sectors * (1ULL << 31), ca->mi.bucket_size);
+ u64 d = bch2_bucket_sectors_dirty(a);
+ return div_u64(d * (1ULL << 31), ca->mi.bucket_size);
}
static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a)
@@ -163,24 +182,21 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_alloc ((struct bkey_ops) { \
.key_invalid = bch2_alloc_v1_invalid, \
.val_to_text = bch2_alloc_to_text, \
- .trans_trigger = bch2_trans_mark_alloc, \
- .atomic_trigger = bch2_mark_alloc, \
+ .trigger = bch2_trigger_alloc, \
.min_val_size = 8, \
})
#define bch2_bkey_ops_alloc_v2 ((struct bkey_ops) { \
.key_invalid = bch2_alloc_v2_invalid, \
.val_to_text = bch2_alloc_to_text, \
- .trans_trigger = bch2_trans_mark_alloc, \
- .atomic_trigger = bch2_mark_alloc, \
+ .trigger = bch2_trigger_alloc, \
.min_val_size = 8, \
})
#define bch2_bkey_ops_alloc_v3 ((struct bkey_ops) { \
.key_invalid = bch2_alloc_v3_invalid, \
.val_to_text = bch2_alloc_to_text, \
- .trans_trigger = bch2_trans_mark_alloc, \
- .atomic_trigger = bch2_mark_alloc, \
+ .trigger = bch2_trigger_alloc, \
.min_val_size = 16, \
})
@@ -188,8 +204,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
.key_invalid = bch2_alloc_v4_invalid, \
.val_to_text = bch2_alloc_to_text, \
.swab = bch2_alloc_v4_swab, \
- .trans_trigger = bch2_trans_mark_alloc, \
- .atomic_trigger = bch2_mark_alloc, \
+ .trigger = bch2_trigger_alloc, \
.min_val_size = 48, \
})
@@ -213,8 +228,8 @@ static inline bool bkey_is_alloc(const struct bkey *k)
int bch2_alloc_read(struct bch_fs *);
-int bch2_trans_mark_alloc(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s, unsigned);
int bch2_check_alloc_info(struct bch_fs *);
int bch2_check_alloc_to_lru_refs(struct bch_fs *);
void bch2_do_discards(struct bch_fs *);
diff --git a/fs/bcachefs/alloc_background_format.h b/fs/bcachefs/alloc_background_format.h
new file mode 100644
index 0000000000..b4ec20be93
--- /dev/null
+++ b/fs/bcachefs/alloc_background_format.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H
+#define _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H
+
+struct bch_alloc {
+ struct bch_val v;
+ __u8 fields;
+ __u8 gen;
+ __u8 data[];
+} __packed __aligned(8);
+
+#define BCH_ALLOC_FIELDS_V1() \
+ x(read_time, 16) \
+ x(write_time, 16) \
+ x(data_type, 8) \
+ x(dirty_sectors, 16) \
+ x(cached_sectors, 16) \
+ x(oldest_gen, 8) \
+ x(stripe, 32) \
+ x(stripe_redundancy, 8)
+
+enum {
+#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
+ BCH_ALLOC_FIELDS_V1()
+#undef x
+};
+
+struct bch_alloc_v2 {
+ struct bch_val v;
+ __u8 nr_fields;
+ __u8 gen;
+ __u8 oldest_gen;
+ __u8 data_type;
+ __u8 data[];
+} __packed __aligned(8);
+
+#define BCH_ALLOC_FIELDS_V2() \
+ x(read_time, 64) \
+ x(write_time, 64) \
+ x(dirty_sectors, 32) \
+ x(cached_sectors, 32) \
+ x(stripe, 32) \
+ x(stripe_redundancy, 8)
+
+struct bch_alloc_v3 {
+ struct bch_val v;
+ __le64 journal_seq;
+ __le32 flags;
+ __u8 nr_fields;
+ __u8 gen;
+ __u8 oldest_gen;
+ __u8 data_type;
+ __u8 data[];
+} __packed __aligned(8);
+
+LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1)
+LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2)
+
+struct bch_alloc_v4 {
+ struct bch_val v;
+ __u64 journal_seq;
+ __u32 flags;
+ __u8 gen;
+ __u8 oldest_gen;
+ __u8 data_type;
+ __u8 stripe_redundancy;
+ __u32 dirty_sectors;
+ __u32 cached_sectors;
+ __u64 io_time[2];
+ __u32 stripe;
+ __u32 nr_external_backpointers;
+ __u64 fragmentation_lru;
+} __packed __aligned(8);
+
+#define BCH_ALLOC_V4_U64s_V0 6
+#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64))
+
+BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1)
+BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2)
+BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8)
+BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14)
+
+#define KEY_TYPE_BUCKET_GENS_BITS 8
+#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS)
+#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1)
+
+struct bch_bucket_gens {
+ struct bch_val v;
+ u8 gens[KEY_TYPE_BUCKET_GENS_NR];
+} __packed __aligned(8);
+
+#endif /* _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H */
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 0e61579826..633d3223b3 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -69,11 +69,8 @@ const char * const bch2_watermarks[] = {
void bch2_reset_alloc_cursors(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i;
-
rcu_read_lock();
- for_each_member_device_rcu(ca, c, i, NULL)
+ for_each_member_device_rcu(c, ca, NULL)
ca->alloc_cursor = 0;
rcu_read_unlock();
}
@@ -239,9 +236,8 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
if (cl)
closure_wait(&c->open_buckets_wait, cl);
- if (!c->blocked_allocate_open_bucket)
- c->blocked_allocate_open_bucket = local_clock();
-
+ track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket],
+ &c->blocked_allocate_open_bucket, true);
spin_unlock(&c->freelist_lock);
return ERR_PTR(-BCH_ERR_open_buckets_empty);
}
@@ -267,19 +263,11 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
ca->nr_open_buckets++;
bch2_open_bucket_hash_add(c, ob);
- if (c->blocked_allocate_open_bucket) {
- bch2_time_stats_update(
- &c->times[BCH_TIME_blocked_allocate_open_bucket],
- c->blocked_allocate_open_bucket);
- c->blocked_allocate_open_bucket = 0;
- }
+ track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket],
+ &c->blocked_allocate_open_bucket, false);
- if (c->blocked_allocate) {
- bch2_time_stats_update(
- &c->times[BCH_TIME_blocked_allocate],
- c->blocked_allocate);
- c->blocked_allocate = 0;
- }
+ track_event_change(&c->times[BCH_TIME_blocked_allocate],
+ &c->blocked_allocate, false);
spin_unlock(&c->freelist_lock);
return ob;
@@ -377,9 +365,9 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc
ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl);
if (!ob)
- iter.path->preserve = false;
+ set_btree_iter_dontneed(&iter);
err:
- if (iter.trans && iter.path)
+ if (iter.path)
set_btree_iter_dontneed(&iter);
bch2_trans_iter_exit(trans, &iter);
printbuf_exit(&buf);
@@ -447,7 +435,7 @@ again:
ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl);
next:
- citer.path->preserve = false;
+ set_btree_iter_dontneed(&citer);
bch2_trans_iter_exit(trans, &citer);
if (ob)
break;
@@ -502,7 +490,7 @@ again:
ob = try_alloc_bucket(trans, ca, watermark,
alloc_cursor, s, k, cl);
if (ob) {
- iter.path->preserve = false;
+ set_btree_iter_dontneed(&iter);
break;
}
}
@@ -567,8 +555,8 @@ again:
goto again;
}
- if (!c->blocked_allocate)
- c->blocked_allocate = local_clock();
+ track_event_change(&c->times[BCH_TIME_blocked_allocate],
+ &c->blocked_allocate, true);
ob = ERR_PTR(-BCH_ERR_freelist_empty);
goto err;
@@ -697,11 +685,9 @@ static int add_new_bucket(struct bch_fs *c,
bch_dev_bkey_exists(c, ob->dev)->mi.durability;
BUG_ON(*nr_effective >= nr_replicas);
- BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);
__clear_bit(ob->dev, devs_may_alloc->d);
- *nr_effective += (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)
- ? durability : 1;
+ *nr_effective += durability;
*have_cache |= !durability;
ob_push(c, ptrs, ob);
@@ -972,8 +958,8 @@ static int __open_bucket_add_buckets(struct btree_trans *trans,
devs = target_rw_devs(c, wp->data_type, target);
/* Don't allocate from devices we already have pointers to: */
- for (i = 0; i < devs_have->nr; i++)
- __clear_bit(devs_have->devs[i], devs.d);
+ darray_for_each(*devs_have, i)
+ __clear_bit(*i, devs.d);
open_bucket_for_each(c, ptrs, ob, i)
__clear_bit(ob->dev, devs.d);
@@ -1539,10 +1525,11 @@ static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, str
unsigned data_type = ob->data_type;
barrier(); /* READ_ONCE() doesn't work on bitfields */
- prt_printf(out, "%zu ref %u %s %u:%llu gen %u allocated %u/%u",
+ prt_printf(out, "%zu ref %u ",
ob - c->open_buckets,
- atomic_read(&ob->pin),
- data_type < BCH_DATA_NR ? bch2_data_types[data_type] : "invalid data type",
+ atomic_read(&ob->pin));
+ bch2_prt_data_type(out, data_type);
+ prt_printf(out, " %u:%llu gen %u allocated %u/%u",
ob->dev, ob->bucket, ob->gen,
ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size);
if (ob->ec)
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index 23c0834a97..569b97904d 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -3,6 +3,7 @@
#include "bbpos.h"
#include "alloc_background.h"
#include "backpointers.h"
+#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_update.h"
#include "btree_update_interior.h"
@@ -67,9 +68,11 @@ void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer
void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
- prt_str(out, "bucket=");
- bch2_bpos_to_text(out, bp_pos_to_bucket(c, k.k->p));
- prt_str(out, " ");
+ if (bch2_dev_exists2(c, k.k->p.inode)) {
+ prt_str(out, "bucket=");
+ bch2_bpos_to_text(out, bp_pos_to_bucket(c, k.k->p));
+ prt_str(out, " ");
+ }
bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v);
}
@@ -136,15 +139,30 @@ static noinline int backpointer_mod_err(struct btree_trans *trans,
}
int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans,
- struct bkey_i_backpointer *bp_k,
+ struct bpos bucket,
struct bch_backpointer bp,
struct bkey_s_c orig_k,
bool insert)
{
struct btree_iter bp_iter;
struct bkey_s_c k;
+ struct bkey_i_backpointer *bp_k;
int ret;
+ bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer));
+ ret = PTR_ERR_OR_ZERO(bp_k);
+ if (ret)
+ return ret;
+
+ bkey_backpointer_init(&bp_k->k_i);
+ bp_k->k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset);
+ bp_k->v = bp;
+
+ if (!insert) {
+ bp_k->k.type = KEY_TYPE_deleted;
+ set_bkey_val_u64s(&bp_k->k, 0);
+ }
+
k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers,
bp_k->k.p,
BTREE_ITER_INTENT|
@@ -375,41 +393,45 @@ fsck_err:
/* verify that every backpointer has a corresponding alloc key */
int bch2_check_btree_backpointers(struct bch_fs *c)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- ret = bch2_trans_run(c,
+ int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
BTREE_ID_backpointers, POS_MIN, 0, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_check_btree_backpointer(trans, &iter, k)));
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
-struct bpos_level {
- unsigned level;
- struct bpos pos;
+static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r)
+{
+ return bpos_eq(l.k->p, r.k->p) &&
+ bkey_bytes(l.k) == bkey_bytes(r.k) &&
+ !memcmp(l.v, r.v, bkey_val_bytes(l.k));
+}
+
+struct extents_to_bp_state {
+ struct bpos bucket_start;
+ struct bpos bucket_end;
+ struct bkey_buf last_flushed;
};
static int check_bp_exists(struct btree_trans *trans,
+ struct extents_to_bp_state *s,
struct bpos bucket,
struct bch_backpointer bp,
- struct bkey_s_c orig_k,
- struct bpos bucket_start,
- struct bpos bucket_end,
- struct bpos_level *last_flushed)
+ struct bkey_s_c orig_k)
{
struct bch_fs *c = trans->c;
struct btree_iter bp_iter = { NULL };
struct printbuf buf = PRINTBUF;
struct bkey_s_c bp_k;
+ struct bkey_buf tmp;
int ret;
- if (bpos_lt(bucket, bucket_start) ||
- bpos_gt(bucket, bucket_end))
+ bch2_bkey_buf_init(&tmp);
+
+ if (bpos_lt(bucket, s->bucket_start) ||
+ bpos_gt(bucket, s->bucket_end))
return 0;
if (!bch2_dev_bucket_exists(c, bucket))
@@ -424,13 +446,20 @@ static int check_bp_exists(struct btree_trans *trans,
if (bp_k.k->type != KEY_TYPE_backpointer ||
memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) {
- if (last_flushed->level != bp.level ||
- !bpos_eq(last_flushed->pos, orig_k.k->p)) {
- last_flushed->level = bp.level;
- last_flushed->pos = orig_k.k->p;
+ bch2_bkey_buf_reassemble(&tmp, c, orig_k);
+
+ if (!bkey_and_val_eq(orig_k, bkey_i_to_s_c(s->last_flushed.k))) {
+ if (bp.level) {
+ bch2_trans_unlock(trans);
+ bch2_btree_interior_updates_flush(c);
+ }
- ret = bch2_btree_write_buffer_flush_sync(trans) ?:
- -BCH_ERR_transaction_restart_write_buffer_flush;
+ ret = bch2_btree_write_buffer_flush_sync(trans);
+ if (ret)
+ goto err;
+
+ bch2_bkey_buf_copy(&s->last_flushed, c, tmp.k);
+ ret = -BCH_ERR_transaction_restart_write_buffer_flush;
goto out;
}
goto missing;
@@ -439,6 +468,7 @@ out:
err:
fsck_err:
bch2_trans_iter_exit(trans, &bp_iter);
+ bch2_bkey_buf_exit(&tmp, c);
printbuf_exit(&buf);
return ret;
missing:
@@ -448,8 +478,7 @@ missing:
prt_printf(&buf, "\nbp pos ");
bch2_bpos_to_text(&buf, bp_iter.pos);
- if (c->sb.version_upgrade_complete < bcachefs_metadata_version_backpointers ||
- c->opts.reconstruct_alloc ||
+ if (c->opts.reconstruct_alloc ||
fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf))
ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true);
@@ -457,25 +486,16 @@ missing:
}
static int check_extent_to_backpointers(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos bucket_start,
- struct bpos bucket_end,
- struct bpos_level *last_flushed)
+ struct extents_to_bp_state *s,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs;
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
- struct bkey_s_c k;
int ret;
- k = bch2_btree_iter_peek_all_levels(iter);
- ret = bkey_err(k);
- if (ret)
- return ret;
- if (!k.k)
- return 0;
-
ptrs = bch2_bkey_ptrs_c(k);
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
struct bpos bucket_pos;
@@ -484,12 +504,10 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
if (p.ptr.cached)
continue;
- bch2_extent_ptr_to_bp(c, iter->btree_id, iter->path->level,
+ bch2_extent_ptr_to_bp(c, btree, level,
k, p, &bucket_pos, &bp);
- ret = check_bp_exists(trans, bucket_pos, bp, k,
- bucket_start, bucket_end,
- last_flushed);
+ ret = check_bp_exists(trans, s, bucket_pos, bp, k);
if (ret)
return ret;
}
@@ -498,47 +516,32 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
}
static int check_btree_root_to_backpointers(struct btree_trans *trans,
+ struct extents_to_bp_state *s,
enum btree_id btree_id,
- struct bpos bucket_start,
- struct bpos bucket_end,
- struct bpos_level *last_flushed)
+ int *level)
{
struct bch_fs *c = trans->c;
- struct btree_root *r = bch2_btree_id_root(c, btree_id);
struct btree_iter iter;
struct btree *b;
struct bkey_s_c k;
- struct bkey_ptrs_c ptrs;
- struct extent_ptr_decoded p;
- const union bch_extent_entry *entry;
int ret;
-
- bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, r->level, 0);
+retry:
+ bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN,
+ 0, bch2_btree_id_root(c, btree_id)->b->c.level, 0);
b = bch2_btree_iter_peek_node(&iter);
ret = PTR_ERR_OR_ZERO(b);
if (ret)
goto err;
- BUG_ON(b != btree_node_root(c, b));
-
- k = bkey_i_to_s_c(&b->key);
- ptrs = bch2_bkey_ptrs_c(k);
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- struct bpos bucket_pos;
- struct bch_backpointer bp;
-
- if (p.ptr.cached)
- continue;
+ if (b != btree_node_root(c, b)) {
+ bch2_trans_iter_exit(trans, &iter);
+ goto retry;
+ }
- bch2_extent_ptr_to_bp(c, iter.btree_id, b->c.level + 1,
- k, p, &bucket_pos, &bp);
+ *level = b->c.level;
- ret = check_bp_exists(trans, bucket_pos, bp, k,
- bucket_start, bucket_end,
- last_flushed);
- if (ret)
- goto err;
- }
+ k = bkey_i_to_s_c(&b->key);
+ ret = check_extent_to_backpointers(trans, s, btree_id, b->c.level + 1, k);
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -559,7 +562,7 @@ static size_t btree_nodes_fit_in_ram(struct bch_fs *c)
si_meminfo(&i);
mem_bytes = i.totalram * i.mem_unit;
- return div_u64(mem_bytes >> 1, btree_bytes(c));
+ return div_u64(mem_bytes >> 1, c->opts.btree_node_size);
}
static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
@@ -610,49 +613,57 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
}
static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
- struct bpos bucket_start,
- struct bpos bucket_end)
+ struct extents_to_bp_state *s)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter;
- enum btree_id btree_id;
- struct bpos_level last_flushed = { UINT_MAX, POS_MIN };
int ret = 0;
- for (btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) {
- unsigned depth = btree_type_has_ptrs(btree_id) ? 0 : 1;
+ for (enum btree_id btree_id = 0;
+ btree_id < btree_id_nr_alive(c);
+ btree_id++) {
+ int level, depth = btree_type_has_ptrs(btree_id) ? 0 : 1;
- bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0,
- depth,
- BTREE_ITER_ALL_LEVELS|
- BTREE_ITER_PREFETCH);
-
- do {
- ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL,
- check_extent_to_backpointers(trans, &iter,
- bucket_start, bucket_end,
- &last_flushed));
- if (ret)
- break;
- } while (!bch2_btree_iter_advance(&iter));
+ ret = commit_do(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc,
+ check_btree_root_to_backpointers(trans, s, btree_id, &level));
+ if (ret)
+ return ret;
- bch2_trans_iter_exit(trans, &iter);
+ while (level >= depth) {
+ struct btree_iter iter;
+ bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0,
+ level,
+ BTREE_ITER_PREFETCH);
+ while (1) {
+ bch2_trans_begin(trans);
+
+ struct bkey_s_c k = bch2_btree_iter_peek(&iter);
+ if (!k.k)
+ break;
+ ret = bkey_err(k) ?:
+ check_extent_to_backpointers(trans, s, btree_id, level, k) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+ ret = 0;
+ continue;
+ }
+ if (ret)
+ break;
+ if (bpos_eq(iter.pos, SPOS_MAX))
+ break;
+ bch2_btree_iter_advance(&iter);
+ }
+ bch2_trans_iter_exit(trans, &iter);
- if (ret)
- break;
+ if (ret)
+ return ret;
- ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL,
- check_btree_root_to_backpointers(trans, btree_id,
- bucket_start, bucket_end,
- &last_flushed));
- if (ret)
- break;
+ --level;
+ }
}
- return ret;
+
+ return 0;
}
static struct bpos bucket_pos_to_bp_safe(const struct bch_fs *c,
@@ -714,40 +725,45 @@ static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans,
int bch2_check_extents_to_backpointers(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
- struct bpos start = POS_MIN, end;
+ struct extents_to_bp_state s = { .bucket_start = POS_MIN };
int ret;
+ bch2_bkey_buf_init(&s.last_flushed);
+ bkey_init(&s.last_flushed.k->k);
+
while (1) {
- ret = bch2_get_alloc_in_memory_pos(trans, start, &end);
+ ret = bch2_get_alloc_in_memory_pos(trans, s.bucket_start, &s.bucket_end);
if (ret)
break;
- if (bpos_eq(start, POS_MIN) && !bpos_eq(end, SPOS_MAX))
+ if ( bpos_eq(s.bucket_start, POS_MIN) &&
+ !bpos_eq(s.bucket_end, SPOS_MAX))
bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass",
__func__, btree_nodes_fit_in_ram(c));
- if (!bpos_eq(start, POS_MIN) || !bpos_eq(end, SPOS_MAX)) {
+ if (!bpos_eq(s.bucket_start, POS_MIN) ||
+ !bpos_eq(s.bucket_end, SPOS_MAX)) {
struct printbuf buf = PRINTBUF;
prt_str(&buf, "check_extents_to_backpointers(): ");
- bch2_bpos_to_text(&buf, start);
+ bch2_bpos_to_text(&buf, s.bucket_start);
prt_str(&buf, "-");
- bch2_bpos_to_text(&buf, end);
+ bch2_bpos_to_text(&buf, s.bucket_end);
bch_verbose(c, "%s", buf.buf);
printbuf_exit(&buf);
}
- ret = bch2_check_extents_to_backpointers_pass(trans, start, end);
- if (ret || bpos_eq(end, SPOS_MAX))
+ ret = bch2_check_extents_to_backpointers_pass(trans, &s);
+ if (ret || bpos_eq(s.bucket_end, SPOS_MAX))
break;
- start = bpos_successor(end);
+ s.bucket_start = bpos_successor(s.bucket_end);
}
bch2_trans_put(trans);
+ bch2_bkey_buf_exit(&s.last_flushed, c);
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
@@ -801,13 +817,11 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
struct bbpos start,
struct bbpos end)
{
- struct btree_iter iter;
- struct bkey_s_c k;
struct bpos last_flushed_pos = SPOS_MAX;
return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers,
POS_MIN, BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_one_backpointer(trans, start, end,
bkey_s_c_to_backpointer(k),
&last_flushed_pos));
@@ -854,7 +868,6 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c)
}
bch2_trans_put(trans);
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
index ab866feeaf..327365a9fe 100644
--- a/fs/bcachefs/backpointers.h
+++ b/fs/bcachefs/backpointers.h
@@ -2,6 +2,7 @@
#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H
#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H
+#include "btree_cache.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "buckets.h"
@@ -63,7 +64,7 @@ static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c,
return ret;
}
-int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bkey_i_backpointer *,
+int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bpos bucket,
struct bch_backpointer, struct bkey_s_c, bool);
static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans,
@@ -72,28 +73,21 @@ static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans,
struct bkey_s_c orig_k,
bool insert)
{
- struct bch_fs *c = trans->c;
- struct bkey_i_backpointer *bp_k;
- int ret;
+ if (unlikely(bch2_backpointers_no_use_write_buffer))
+ return bch2_bucket_backpointer_mod_nowritebuffer(trans, bucket, bp, orig_k, insert);
- bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer));
- ret = PTR_ERR_OR_ZERO(bp_k);
- if (ret)
- return ret;
+ struct bkey_i_backpointer bp_k;
- bkey_backpointer_init(&bp_k->k_i);
- bp_k->k.p = bucket_pos_to_bp(c, bucket, bp.bucket_offset);
- bp_k->v = bp;
+ bkey_backpointer_init(&bp_k.k_i);
+ bp_k.k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset);
+ bp_k.v = bp;
if (!insert) {
- bp_k->k.type = KEY_TYPE_deleted;
- set_bkey_val_u64s(&bp_k->k, 0);
+ bp_k.k.type = KEY_TYPE_deleted;
+ set_bkey_val_u64s(&bp_k.k, 0);
}
- if (unlikely(bch2_backpointers_no_use_write_buffer))
- return bch2_bucket_backpointer_mod_nowritebuffer(trans, bp_k, bp, orig_k, insert);
-
- return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k->k_i);
+ return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k.k_i);
}
static inline enum bch_data_type bkey_ptr_data_type(enum btree_id btree_id, unsigned level,
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index b62737fdf5..69d0d60d50 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -193,6 +193,7 @@
#include <linux/mutex.h>
#include <linux/percpu-refcount.h>
#include <linux/percpu-rwsem.h>
+#include <linux/refcount.h>
#include <linux/rhashtable.h>
#include <linux/rwsem.h>
#include <linux/semaphore.h>
@@ -223,9 +224,11 @@
#define race_fault(...) dynamic_fault("bcachefs:race")
+#define count_event(_c, _name) this_cpu_inc((_c)->counters[BCH_COUNTER_##_name])
+
#define trace_and_count(_c, _name, ...) \
do { \
- this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]); \
+ count_event(_c, _name); \
trace_##_name(__VA_ARGS__); \
} while (0)
@@ -262,46 +265,76 @@ do { \
#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n")
+__printf(2, 3)
+void __bch2_print(struct bch_fs *c, const char *fmt, ...);
+
+#define maybe_dev_to_fs(_c) _Generic((_c), \
+ struct bch_dev *: ((struct bch_dev *) (_c))->fs, \
+ struct bch_fs *: (_c))
+
+#define bch2_print(_c, ...) __bch2_print(maybe_dev_to_fs(_c), __VA_ARGS__)
+
+#define bch2_print_ratelimited(_c, ...) \
+do { \
+ static DEFINE_RATELIMIT_STATE(_rs, \
+ DEFAULT_RATELIMIT_INTERVAL, \
+ DEFAULT_RATELIMIT_BURST); \
+ \
+ if (__ratelimit(&_rs)) \
+ bch2_print(_c, __VA_ARGS__); \
+} while (0)
+
#define bch_info(c, fmt, ...) \
- printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
+ bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_notice(c, fmt, ...) \
- printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__)
+ bch2_print(c, KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_warn(c, fmt, ...) \
- printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
+ bch2_print(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_warn_ratelimited(c, fmt, ...) \
- printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
+ bch2_print_ratelimited(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_err(c, fmt, ...) \
- printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+ bch2_print(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_err_dev(ca, fmt, ...) \
- printk(KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
+ bch2_print(ca, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
#define bch_err_dev_offset(ca, _offset, fmt, ...) \
- printk(KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
+ bch2_print(ca, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
#define bch_err_inum(c, _inum, fmt, ...) \
- printk(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
+ bch2_print(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
#define bch_err_inum_offset(c, _inum, _offset, fmt, ...) \
- printk(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
+ bch2_print(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
#define bch_err_ratelimited(c, fmt, ...) \
- printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+ bch2_print_ratelimited(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_err_dev_ratelimited(ca, fmt, ...) \
- printk_ratelimited(KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
+ bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
#define bch_err_dev_offset_ratelimited(ca, _offset, fmt, ...) \
- printk_ratelimited(KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
+ bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
#define bch_err_inum_ratelimited(c, _inum, fmt, ...) \
- printk_ratelimited(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
+ bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
#define bch_err_inum_offset_ratelimited(c, _inum, _offset, fmt, ...) \
- printk_ratelimited(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
+ bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
+
+static inline bool should_print_err(int err)
+{
+ return err && !bch2_err_matches(err, BCH_ERR_transaction_restart);
+}
#define bch_err_fn(_c, _ret) \
do { \
- if (_ret && !bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+ if (should_print_err(_ret)) \
bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\
} while (0)
+#define bch_err_fn_ratelimited(_c, _ret) \
+do { \
+ if (should_print_err(_ret)) \
+ bch_err_ratelimited(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\
+} while (0)
+
#define bch_err_msg(_c, _ret, _msg, ...) \
do { \
- if (_ret && !bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+ if (should_print_err(_ret)) \
bch_err(_c, "%s(): error " _msg " %s", __func__, \
##__VA_ARGS__, bch2_err_str(_ret)); \
} while (0)
@@ -392,6 +425,7 @@ BCH_DEBUG_PARAMS_DEBUG()
x(btree_node_merge) \
x(btree_node_sort) \
x(btree_node_read) \
+ x(btree_node_read_done) \
x(btree_interior_update_foreground) \
x(btree_interior_update_total) \
x(btree_gc) \
@@ -401,9 +435,12 @@ BCH_DEBUG_PARAMS_DEBUG()
x(journal_flush_write) \
x(journal_noflush_write) \
x(journal_flush_seq) \
- x(blocked_journal) \
+ x(blocked_journal_low_on_space) \
+ x(blocked_journal_low_on_pin) \
+ x(blocked_journal_max_in_flight) \
x(blocked_allocate) \
x(blocked_allocate_open_bucket) \
+ x(blocked_write_buffer_full) \
x(nocow_lock_contended)
enum bch_time_stats {
@@ -428,6 +465,7 @@ enum bch_time_stats {
#include "replicas_types.h"
#include "subvolume_types.h"
#include "super_types.h"
+#include "thread_with_file_types.h"
/* Number of nodes btree coalesce will try to coalesce at once */
#define GC_MERGE_NODES 4U
@@ -564,32 +602,35 @@ struct bch_dev {
struct io_count __percpu *io_done;
};
-enum {
- /* startup: */
- BCH_FS_STARTED,
- BCH_FS_MAY_GO_RW,
- BCH_FS_RW,
- BCH_FS_WAS_RW,
-
- /* shutdown: */
- BCH_FS_STOPPING,
- BCH_FS_EMERGENCY_RO,
- BCH_FS_GOING_RO,
- BCH_FS_WRITE_DISABLE_COMPLETE,
- BCH_FS_CLEAN_SHUTDOWN,
-
- /* fsck passes: */
- BCH_FS_FSCK_DONE,
- BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */
- BCH_FS_NEED_ANOTHER_GC,
-
- BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS,
-
- /* errors: */
- BCH_FS_ERROR,
- BCH_FS_TOPOLOGY_ERROR,
- BCH_FS_ERRORS_FIXED,
- BCH_FS_ERRORS_NOT_FIXED,
+/*
+ * initial_gc_unfixed
+ * error
+ * topology error
+ */
+
+#define BCH_FS_FLAGS() \
+ x(started) \
+ x(may_go_rw) \
+ x(rw) \
+ x(was_rw) \
+ x(stopping) \
+ x(emergency_ro) \
+ x(going_ro) \
+ x(write_disable_complete) \
+ x(clean_shutdown) \
+ x(fsck_running) \
+ x(initial_gc_unfixed) \
+ x(need_another_gc) \
+ x(need_delete_dead_snapshots) \
+ x(error) \
+ x(topology_error) \
+ x(errors_fixed) \
+ x(errors_not_fixed)
+
+enum bch_fs_flags {
+#define x(n) BCH_FS_##n,
+ BCH_FS_FLAGS()
+#undef x
};
struct btree_debug {
@@ -599,10 +640,11 @@ struct btree_debug {
#define BCH_TRANSACTIONS_NR 128
struct btree_transaction_stats {
+ struct bch2_time_stats duration;
struct bch2_time_stats lock_hold_times;
struct mutex lock;
unsigned nr_max_paths;
- unsigned wb_updates_size;
+ unsigned journal_entries_size;
unsigned max_mem;
char *max_paths_text;
};
@@ -664,7 +706,8 @@ struct btree_trans_buf {
x(invalidate) \
x(delete_dead_snapshots) \
x(snapshot_delete_pagecache) \
- x(sysfs)
+ x(sysfs) \
+ x(btree_write_buffer)
enum bch_write_ref {
#define x(n) BCH_WRITE_REF_##n,
@@ -689,6 +732,8 @@ struct bch_fs {
struct super_block *vfs_sb;
dev_t dev;
char name[40];
+ struct stdio_redirect *stdio;
+ struct task_struct *stdio_filter;
/* ro/rw, add/remove/resize devices: */
struct rw_semaphore state_lock;
@@ -699,6 +744,13 @@ struct bch_fs {
#else
struct percpu_ref writes;
#endif
+ /*
+ * Analogous to c->writes, for asynchronous ops that don't necessarily
+ * need fs to be read-write
+ */
+ refcount_t ro_ref;
+ wait_queue_head_t ro_ref_wait;
+
struct work_struct read_only_work;
struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX];
@@ -1002,10 +1054,21 @@ struct bch_fs {
/* RECOVERY */
u64 journal_replay_seq_start;
u64 journal_replay_seq_end;
+ /*
+ * Two different uses:
+ * "Has this fsck pass run?" - i.e. should this type of error be an
+ * emergency read-only
+ * And, in certain situations fsck will rewind to an earlier pass: used
+ * for signaling to the toplevel code which pass we want to run now.
+ */
enum bch_recovery_pass curr_recovery_pass;
/* bitmap of explicitly enabled recovery passes: */
u64 recovery_passes_explicit;
+ /* bitmask of recovery passes that we actually ran */
u64 recovery_passes_complete;
+ /* version of curr_recovery_pass that never rewinds */
+ enum bch_recovery_pass recovery_pass_done;
+ struct semaphore online_fsck_mutex;
/* DEBUG JUNK */
struct dentry *fs_debug_dir;
@@ -1065,10 +1128,20 @@ static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref)
#endif
}
+static inline bool __bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
+{
+#ifdef BCH_WRITE_REF_DEBUG
+ return !test_bit(BCH_FS_going_ro, &c->flags) &&
+ atomic_long_inc_not_zero(&c->writes[ref]);
+#else
+ return percpu_ref_tryget(&c->writes);
+#endif
+}
+
static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
{
#ifdef BCH_WRITE_REF_DEBUG
- return !test_bit(BCH_FS_GOING_RO, &c->flags) &&
+ return !test_bit(BCH_FS_going_ro, &c->flags) &&
atomic_long_inc_not_zero(&c->writes[ref]);
#else
return percpu_ref_tryget_live(&c->writes);
@@ -1087,13 +1160,27 @@ static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref)
if (atomic_long_read(&c->writes[i]))
return;
- set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
+ set_bit(BCH_FS_write_disable_complete, &c->flags);
wake_up(&bch2_read_only_wait);
#else
percpu_ref_put(&c->writes);
#endif
}
+static inline bool bch2_ro_ref_tryget(struct bch_fs *c)
+{
+ if (test_bit(BCH_FS_stopping, &c->flags))
+ return false;
+
+ return refcount_inc_not_zero(&c->ro_ref);
+}
+
+static inline void bch2_ro_ref_put(struct bch_fs *c)
+{
+ if (refcount_dec_and_test(&c->ro_ref))
+ wake_up(&c->ro_ref_wait);
+}
+
static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
{
#ifndef NO_BCACHEFS_FS
@@ -1117,11 +1204,6 @@ static inline unsigned block_sectors(const struct bch_fs *c)
return c->opts.block_size >> 9;
}
-static inline size_t btree_sectors(const struct bch_fs *c)
-{
- return c->opts.btree_node_size >> 9;
-}
-
static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree)
{
return c->btree_key_cache_btrees & (1U << btree);
@@ -1158,6 +1240,27 @@ static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev)
return dev < c->sb.nr_devices && c->devs[dev];
}
+static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c)
+{
+ struct stdio_redirect *stdio = c->stdio;
+
+ if (c->stdio_filter && c->stdio_filter != current)
+ stdio = NULL;
+ return stdio;
+}
+
+static inline unsigned metadata_replicas_required(struct bch_fs *c)
+{
+ return min(c->opts.metadata_replicas,
+ c->opts.metadata_replicas_required);
+}
+
+static inline unsigned data_replicas_required(struct bch_fs *c)
+{
+ return min(c->opts.data_replicas,
+ c->opts.data_replicas_required);
+}
+
#define BKEY_PADDED_ONSTACK(key, pad) \
struct { struct bkey_i key; __u64 key ## _pad[pad]; }
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index fe78e87603..0668b682a2 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -307,6 +307,13 @@ struct bkey_i {
struct bch_val v;
};
+#define POS_KEY(_pos) \
+((struct bkey) { \
+ .u64s = BKEY_U64s, \
+ .format = KEY_FORMAT_CURRENT, \
+ .p = _pos, \
+})
+
#define KEY(_inode, _offset, _size) \
((struct bkey) { \
.u64s = BKEY_U64s, \
@@ -410,600 +417,12 @@ struct bch_set {
struct bch_val v;
};
-/* Extents */
-
-/*
- * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
- * preceded by checksum/compression information (bch_extent_crc32 or
- * bch_extent_crc64).
- *
- * One major determining factor in the format of extents is how we handle and
- * represent extents that have been partially overwritten and thus trimmed:
- *
- * If an extent is not checksummed or compressed, when the extent is trimmed we
- * don't have to remember the extent we originally allocated and wrote: we can
- * merely adjust ptr->offset to point to the start of the data that is currently
- * live. The size field in struct bkey records the current (live) size of the
- * extent, and is also used to mean "size of region on disk that we point to" in
- * this case.
- *
- * Thus an extent that is not checksummed or compressed will consist only of a
- * list of bch_extent_ptrs, with none of the fields in
- * bch_extent_crc32/bch_extent_crc64.
- *
- * When an extent is checksummed or compressed, it's not possible to read only
- * the data that is currently live: we have to read the entire extent that was
- * originally written, and then return only the part of the extent that is
- * currently live.
- *
- * Thus, in addition to the current size of the extent in struct bkey, we need
- * to store the size of the originally allocated space - this is the
- * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
- * when the extent is trimmed, instead of modifying the offset field of the
- * pointer, we keep a second smaller offset field - "offset into the original
- * extent of the currently live region".
- *
- * The other major determining factor is replication and data migration:
- *
- * Each pointer may have its own bch_extent_crc32/64. When doing a replicated
- * write, we will initially write all the replicas in the same format, with the
- * same checksum type and compression format - however, when copygc runs later (or
- * tiering/cache promotion, anything that moves data), it is not in general
- * going to rewrite all the pointers at once - one of the replicas may be in a
- * bucket on one device that has very little fragmentation while another lives
- * in a bucket that has become heavily fragmented, and thus is being rewritten
- * sooner than the rest.
- *
- * Thus it will only move a subset of the pointers (or in the case of
- * tiering/cache promotion perhaps add a single pointer without dropping any
- * current pointers), and if the extent has been partially overwritten it must
- * write only the currently live portion (or copygc would not be able to reduce
- * fragmentation!) - which necessitates a different bch_extent_crc format for
- * the new pointer.
- *
- * But in the interests of space efficiency, we don't want to store one
- * bch_extent_crc for each pointer if we don't have to.
- *
- * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
- * bch_extent_ptrs appended arbitrarily one after the other. We determine the
- * type of a given entry with a scheme similar to utf8 (except we're encoding a
- * type, not a size), encoding the type in the position of the first set bit:
- *
- * bch_extent_crc32 - 0b1
- * bch_extent_ptr - 0b10
- * bch_extent_crc64 - 0b100
- *
- * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and
- * bch_extent_crc64 is the least constrained).
- *
- * Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
- * until the next bch_extent_crc32/64.
- *
- * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
- * is neither checksummed nor compressed.
- */
-
/* 128 bits, sufficient for cryptographic MACs: */
struct bch_csum {
__le64 lo;
__le64 hi;
} __packed __aligned(8);
-#define BCH_EXTENT_ENTRY_TYPES() \
- x(ptr, 0) \
- x(crc32, 1) \
- x(crc64, 2) \
- x(crc128, 3) \
- x(stripe_ptr, 4) \
- x(rebalance, 5)
-#define BCH_EXTENT_ENTRY_MAX 6
-
-enum bch_extent_entry_type {
-#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
- BCH_EXTENT_ENTRY_TYPES()
-#undef x
-};
-
-/* Compressed/uncompressed size are stored biased by 1: */
-struct bch_extent_crc32 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u32 type:2,
- _compressed_size:7,
- _uncompressed_size:7,
- offset:7,
- _unused:1,
- csum_type:4,
- compression_type:4;
- __u32 csum;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u32 csum;
- __u32 compression_type:4,
- csum_type:4,
- _unused:1,
- offset:7,
- _uncompressed_size:7,
- _compressed_size:7,
- type:2;
-#endif
-} __packed __aligned(8);
-
-#define CRC32_SIZE_MAX (1U << 7)
-#define CRC32_NONCE_MAX 0
-
-struct bch_extent_crc64 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:3,
- _compressed_size:9,
- _uncompressed_size:9,
- offset:9,
- nonce:10,
- csum_type:4,
- compression_type:4,
- csum_hi:16;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 csum_hi:16,
- compression_type:4,
- csum_type:4,
- nonce:10,
- offset:9,
- _uncompressed_size:9,
- _compressed_size:9,
- type:3;
-#endif
- __u64 csum_lo;
-} __packed __aligned(8);
-
-#define CRC64_SIZE_MAX (1U << 9)
-#define CRC64_NONCE_MAX ((1U << 10) - 1)
-
-struct bch_extent_crc128 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:4,
- _compressed_size:13,
- _uncompressed_size:13,
- offset:13,
- nonce:13,
- csum_type:4,
- compression_type:4;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 compression_type:4,
- csum_type:4,
- nonce:13,
- offset:13,
- _uncompressed_size:13,
- _compressed_size:13,
- type:4;
-#endif
- struct bch_csum csum;
-} __packed __aligned(8);
-
-#define CRC128_SIZE_MAX (1U << 13)
-#define CRC128_NONCE_MAX ((1U << 13) - 1)
-
-/*
- * @reservation - pointer hasn't been written to, just reserved
- */
-struct bch_extent_ptr {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:1,
- cached:1,
- unused:1,
- unwritten:1,
- offset:44, /* 8 petabytes */
- dev:8,
- gen:8;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 gen:8,
- dev:8,
- offset:44,
- unwritten:1,
- unused:1,
- cached:1,
- type:1;
-#endif
-} __packed __aligned(8);
-
-struct bch_extent_stripe_ptr {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:5,
- block:8,
- redundancy:4,
- idx:47;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 idx:47,
- redundancy:4,
- block:8,
- type:5;
-#endif
-};
-
-struct bch_extent_rebalance {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:6,
- unused:34,
- compression:8, /* enum bch_compression_opt */
- target:16;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 target:16,
- compression:8,
- unused:34,
- type:6;
-#endif
-};
-
-union bch_extent_entry {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64
- unsigned long type;
-#elif __BITS_PER_LONG == 32
- struct {
- unsigned long pad;
- unsigned long type;
- };
-#else
-#error edit for your odd byteorder.
-#endif
-
-#define x(f, n) struct bch_extent_##f f;
- BCH_EXTENT_ENTRY_TYPES()
-#undef x
-};
-
-struct bch_btree_ptr {
- struct bch_val v;
-
- __u64 _data[0];
- struct bch_extent_ptr start[];
-} __packed __aligned(8);
-
-struct bch_btree_ptr_v2 {
- struct bch_val v;
-
- __u64 mem_ptr;
- __le64 seq;
- __le16 sectors_written;
- __le16 flags;
- struct bpos min_key;
- __u64 _data[0];
- struct bch_extent_ptr start[];
-} __packed __aligned(8);
-
-LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1);
-
-struct bch_extent {
- struct bch_val v;
-
- __u64 _data[0];
- union bch_extent_entry start[];
-} __packed __aligned(8);
-
-struct bch_reservation {
- struct bch_val v;
-
- __le32 generation;
- __u8 nr_replicas;
- __u8 pad[3];
-} __packed __aligned(8);
-
-/* Maximum size (in u64s) a single pointer could be: */
-#define BKEY_EXTENT_PTR_U64s_MAX\
- ((sizeof(struct bch_extent_crc128) + \
- sizeof(struct bch_extent_ptr)) / sizeof(__u64))
-
-/* Maximum possible size of an entire extent value: */
-#define BKEY_EXTENT_VAL_U64s_MAX \
- (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
-
-/* * Maximum possible size of an entire extent, key + value: */
-#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
-
-/* Btree pointers don't carry around checksums: */
-#define BKEY_BTREE_PTR_VAL_U64s_MAX \
- ((sizeof(struct bch_btree_ptr_v2) + \
- sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64))
-#define BKEY_BTREE_PTR_U64s_MAX \
- (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX)
-
-/* Inodes */
-
-#define BLOCKDEV_INODE_MAX 4096
-
-#define BCACHEFS_ROOT_INO 4096
-
-struct bch_inode {
- struct bch_val v;
-
- __le64 bi_hash_seed;
- __le32 bi_flags;
- __le16 bi_mode;
- __u8 fields[];
-} __packed __aligned(8);
-
-struct bch_inode_v2 {
- struct bch_val v;
-
- __le64 bi_journal_seq;
- __le64 bi_hash_seed;
- __le64 bi_flags;
- __le16 bi_mode;
- __u8 fields[];
-} __packed __aligned(8);
-
-struct bch_inode_v3 {
- struct bch_val v;
-
- __le64 bi_journal_seq;
- __le64 bi_hash_seed;
- __le64 bi_flags;
- __le64 bi_sectors;
- __le64 bi_size;
- __le64 bi_version;
- __u8 fields[];
-} __packed __aligned(8);
-
-#define INODEv3_FIELDS_START_INITIAL 6
-#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64))
-
-struct bch_inode_generation {
- struct bch_val v;
-
- __le32 bi_generation;
- __le32 pad;
-} __packed __aligned(8);
-
-/*
- * bi_subvol and bi_parent_subvol are only set for subvolume roots:
- */
-
-#define BCH_INODE_FIELDS_v2() \
- x(bi_atime, 96) \
- x(bi_ctime, 96) \
- x(bi_mtime, 96) \
- x(bi_otime, 96) \
- x(bi_size, 64) \
- x(bi_sectors, 64) \
- x(bi_uid, 32) \
- x(bi_gid, 32) \
- x(bi_nlink, 32) \
- x(bi_generation, 32) \
- x(bi_dev, 32) \
- x(bi_data_checksum, 8) \
- x(bi_compression, 8) \
- x(bi_project, 32) \
- x(bi_background_compression, 8) \
- x(bi_data_replicas, 8) \
- x(bi_promote_target, 16) \
- x(bi_foreground_target, 16) \
- x(bi_background_target, 16) \
- x(bi_erasure_code, 16) \
- x(bi_fields_set, 16) \
- x(bi_dir, 64) \
- x(bi_dir_offset, 64) \
- x(bi_subvol, 32) \
- x(bi_parent_subvol, 32)
-
-#define BCH_INODE_FIELDS_v3() \
- x(bi_atime, 96) \
- x(bi_ctime, 96) \
- x(bi_mtime, 96) \
- x(bi_otime, 96) \
- x(bi_uid, 32) \
- x(bi_gid, 32) \
- x(bi_nlink, 32) \
- x(bi_generation, 32) \
- x(bi_dev, 32) \
- x(bi_data_checksum, 8) \
- x(bi_compression, 8) \
- x(bi_project, 32) \
- x(bi_background_compression, 8) \
- x(bi_data_replicas, 8) \
- x(bi_promote_target, 16) \
- x(bi_foreground_target, 16) \
- x(bi_background_target, 16) \
- x(bi_erasure_code, 16) \
- x(bi_fields_set, 16) \
- x(bi_dir, 64) \
- x(bi_dir_offset, 64) \
- x(bi_subvol, 32) \
- x(bi_parent_subvol, 32) \
- x(bi_nocow, 8)
-
-/* subset of BCH_INODE_FIELDS */
-#define BCH_INODE_OPTS() \
- x(data_checksum, 8) \
- x(compression, 8) \
- x(project, 32) \
- x(background_compression, 8) \
- x(data_replicas, 8) \
- x(promote_target, 16) \
- x(foreground_target, 16) \
- x(background_target, 16) \
- x(erasure_code, 16) \
- x(nocow, 8)
-
-enum inode_opt_id {
-#define x(name, ...) \
- Inode_opt_##name,
- BCH_INODE_OPTS()
-#undef x
- Inode_opt_nr,
-};
-
-#define BCH_INODE_FLAGS() \
- x(sync, 0) \
- x(immutable, 1) \
- x(append, 2) \
- x(nodump, 3) \
- x(noatime, 4) \
- x(i_size_dirty, 5) \
- x(i_sectors_dirty, 6) \
- x(unlinked, 7) \
- x(backptr_untrusted, 8)
-
-/* bits 20+ reserved for packed fields below: */
-
-enum bch_inode_flags {
-#define x(t, n) BCH_INODE_##t = 1U << n,
- BCH_INODE_FLAGS()
-#undef x
-};
-
-enum __bch_inode_flags {
-#define x(t, n) __BCH_INODE_##t = n,
- BCH_INODE_FLAGS()
-#undef x
-};
-
-LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24);
-LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
-LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32);
-
-LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24);
-LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31);
-
-LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24);
-LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31);
-
-LE64_BITMASK(INODEv3_FIELDS_START,
- struct bch_inode_v3, bi_flags, 31, 36);
-LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52);
-
-/* Dirents */
-
-/*
- * Dirents (and xattrs) have to implement string lookups; since our b-tree
- * doesn't support arbitrary length strings for the key, we instead index by a
- * 64 bit hash (currently truncated sha1) of the string, stored in the offset
- * field of the key - using linear probing to resolve hash collisions. This also
- * provides us with the readdir cookie posix requires.
- *
- * Linear probing requires us to use whiteouts for deletions, in the event of a
- * collision:
- */
-
-struct bch_dirent {
- struct bch_val v;
-
- /* Target inode number: */
- union {
- __le64 d_inum;
- struct { /* DT_SUBVOL */
- __le32 d_child_subvol;
- __le32 d_parent_subvol;
- };
- };
-
- /*
- * Copy of mode bits 12-15 from the target inode - so userspace can get
- * the filetype without having to do a stat()
- */
- __u8 d_type;
-
- __u8 d_name[];
-} __packed __aligned(8);
-
-#define DT_SUBVOL 16
-#define BCH_DT_MAX 17
-
-#define BCH_NAME_MAX 512
-
-/* Xattrs */
-
-#define KEY_TYPE_XATTR_INDEX_USER 0
-#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1
-#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2
-#define KEY_TYPE_XATTR_INDEX_TRUSTED 3
-#define KEY_TYPE_XATTR_INDEX_SECURITY 4
-
-struct bch_xattr {
- struct bch_val v;
- __u8 x_type;
- __u8 x_name_len;
- __le16 x_val_len;
- __u8 x_name[];
-} __packed __aligned(8);
-
-/* Bucket/allocation information: */
-
-struct bch_alloc {
- struct bch_val v;
- __u8 fields;
- __u8 gen;
- __u8 data[];
-} __packed __aligned(8);
-
-#define BCH_ALLOC_FIELDS_V1() \
- x(read_time, 16) \
- x(write_time, 16) \
- x(data_type, 8) \
- x(dirty_sectors, 16) \
- x(cached_sectors, 16) \
- x(oldest_gen, 8) \
- x(stripe, 32) \
- x(stripe_redundancy, 8)
-
-enum {
-#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
- BCH_ALLOC_FIELDS_V1()
-#undef x
-};
-
-struct bch_alloc_v2 {
- struct bch_val v;
- __u8 nr_fields;
- __u8 gen;
- __u8 oldest_gen;
- __u8 data_type;
- __u8 data[];
-} __packed __aligned(8);
-
-#define BCH_ALLOC_FIELDS_V2() \
- x(read_time, 64) \
- x(write_time, 64) \
- x(dirty_sectors, 32) \
- x(cached_sectors, 32) \
- x(stripe, 32) \
- x(stripe_redundancy, 8)
-
-struct bch_alloc_v3 {
- struct bch_val v;
- __le64 journal_seq;
- __le32 flags;
- __u8 nr_fields;
- __u8 gen;
- __u8 oldest_gen;
- __u8 data_type;
- __u8 data[];
-} __packed __aligned(8);
-
-LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1)
-LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2)
-
-struct bch_alloc_v4 {
- struct bch_val v;
- __u64 journal_seq;
- __u32 flags;
- __u8 gen;
- __u8 oldest_gen;
- __u8 data_type;
- __u8 stripe_redundancy;
- __u32 dirty_sectors;
- __u32 cached_sectors;
- __u64 io_time[2];
- __u32 stripe;
- __u32 nr_external_backpointers;
- __u64 fragmentation_lru;
-} __packed __aligned(8);
-
-#define BCH_ALLOC_V4_U64s_V0 6
-#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64))
-
-BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1)
-BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2)
-BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8)
-BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14)
-
-#define BCH_ALLOC_V4_NR_BACKPOINTERS_MAX 40
-
struct bch_backpointer {
struct bch_val v;
__u8 btree_id;
@@ -1014,154 +433,6 @@ struct bch_backpointer {
struct bpos pos;
} __packed __aligned(8);
-#define KEY_TYPE_BUCKET_GENS_BITS 8
-#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS)
-#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1)
-
-struct bch_bucket_gens {
- struct bch_val v;
- u8 gens[KEY_TYPE_BUCKET_GENS_NR];
-} __packed __aligned(8);
-
-/* Quotas: */
-
-enum quota_types {
- QTYP_USR = 0,
- QTYP_GRP = 1,
- QTYP_PRJ = 2,
- QTYP_NR = 3,
-};
-
-enum quota_counters {
- Q_SPC = 0,
- Q_INO = 1,
- Q_COUNTERS = 2,
-};
-
-struct bch_quota_counter {
- __le64 hardlimit;
- __le64 softlimit;
-};
-
-struct bch_quota {
- struct bch_val v;
- struct bch_quota_counter c[Q_COUNTERS];
-} __packed __aligned(8);
-
-/* Erasure coding */
-
-struct bch_stripe {
- struct bch_val v;
- __le16 sectors;
- __u8 algorithm;
- __u8 nr_blocks;
- __u8 nr_redundant;
-
- __u8 csum_granularity_bits;
- __u8 csum_type;
- __u8 pad;
-
- struct bch_extent_ptr ptrs[];
-} __packed __aligned(8);
-
-/* Reflink: */
-
-struct bch_reflink_p {
- struct bch_val v;
- __le64 idx;
- /*
- * A reflink pointer might point to an indirect extent which is then
- * later split (by copygc or rebalance). If we only pointed to part of
- * the original indirect extent, and then one of the fragments is
- * outside the range we point to, we'd leak a refcount: so when creating
- * reflink pointers, we need to store pad values to remember the full
- * range we were taking a reference on.
- */
- __le32 front_pad;
- __le32 back_pad;
-} __packed __aligned(8);
-
-struct bch_reflink_v {
- struct bch_val v;
- __le64 refcount;
- union bch_extent_entry start[0];
- __u64 _data[];
-} __packed __aligned(8);
-
-struct bch_indirect_inline_data {
- struct bch_val v;
- __le64 refcount;
- u8 data[];
-};
-
-/* Inline data */
-
-struct bch_inline_data {
- struct bch_val v;
- u8 data[];
-};
-
-/* Subvolumes: */
-
-#define SUBVOL_POS_MIN POS(0, 1)
-#define SUBVOL_POS_MAX POS(0, S32_MAX)
-#define BCACHEFS_ROOT_SUBVOL 1
-
-struct bch_subvolume {
- struct bch_val v;
- __le32 flags;
- __le32 snapshot;
- __le64 inode;
- /*
- * Snapshot subvolumes form a tree, separate from the snapshot nodes
- * tree - if this subvolume is a snapshot, this is the ID of the
- * subvolume it was created from:
- */
- __le32 parent;
- __le32 pad;
- bch_le128 otime;
-};
-
-LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1)
-/*
- * We need to know whether a subvolume is a snapshot so we can know whether we
- * can delete it (or whether it should just be rm -rf'd)
- */
-LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2)
-LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3)
-
-/* Snapshots */
-
-struct bch_snapshot {
- struct bch_val v;
- __le32 flags;
- __le32 parent;
- __le32 children[2];
- __le32 subvol;
- /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */
- __le32 tree;
- __le32 depth;
- __le32 skip[3];
-};
-
-LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1)
-
-/* True if a subvolume points to this snapshot node: */
-LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2)
-
-/*
- * Snapshot trees:
- *
- * The snapshot_trees btree gives us persistent indentifier for each tree of
- * bch_snapshot nodes, and allow us to record and easily find the root/master
- * subvolume that other snapshots were created from:
- */
-struct bch_snapshot_tree {
- struct bch_val v;
- __le32 master_subvol;
- __le32 root_snapshot;
-};
-
/* LRU btree: */
struct bch_lru {
@@ -1171,33 +442,6 @@ struct bch_lru {
#define LRU_ID_STRIPES (1U << 16)
-/* Logged operations btree: */
-
-struct bch_logged_op_truncate {
- struct bch_val v;
- __le32 subvol;
- __le32 pad;
- __le64 inum;
- __le64 new_i_size;
-};
-
-enum logged_op_finsert_state {
- LOGGED_OP_FINSERT_start,
- LOGGED_OP_FINSERT_shift_extents,
- LOGGED_OP_FINSERT_finish,
-};
-
-struct bch_logged_op_finsert {
- struct bch_val v;
- __u8 state;
- __u8 pad[3];
- __le32 subvol;
- __le64 inum;
- __le64 dst_offset;
- __le64 src_offset;
- __le64 pos;
-};
-
/* Optional/variable size superblock sections: */
struct bch_sb_field {
@@ -1223,6 +467,19 @@ struct bch_sb_field {
x(ext, 13) \
x(downgrade, 14)
+#include "alloc_background_format.h"
+#include "extents_format.h"
+#include "reflink_format.h"
+#include "ec_format.h"
+#include "inode_format.h"
+#include "dirent_format.h"
+#include "xattr_format.h"
+#include "quota_format.h"
+#include "logged_ops_format.h"
+#include "snapshot_format.h"
+#include "subvolume_format.h"
+#include "sb-counters_format.h"
+
enum bch_sb_field_type {
#define x(f, nr) BCH_SB_FIELD_##f = nr,
BCH_SB_FIELDS()
@@ -1296,6 +553,7 @@ struct bch_member {
__le64 errors[BCH_MEMBER_ERROR_NR];
__le64 errors_at_reset[BCH_MEMBER_ERROR_NR];
__le64 errors_reset_time;
+ __le64 seq;
};
#define BCH_MEMBER_V1_BYTES 56
@@ -1442,7 +700,7 @@ struct bch_sb_field_replicas_v0 {
struct bch_replicas_entry_v0 entries[];
} __packed __aligned(8);
-struct bch_replicas_entry {
+struct bch_replicas_entry_v1 {
__u8 data_type;
__u8 nr_devs;
__u8 nr_required;
@@ -1454,24 +712,7 @@ struct bch_replicas_entry {
struct bch_sb_field_replicas {
struct bch_sb_field field;
- struct bch_replicas_entry entries[];
-} __packed __aligned(8);
-
-/* BCH_SB_FIELD_quota: */
-
-struct bch_sb_quota_counter {
- __le32 timelimit;
- __le32 warnlimit;
-};
-
-struct bch_sb_quota_type {
- __le64 flags;
- struct bch_sb_quota_counter c[Q_COUNTERS];
-};
-
-struct bch_sb_field_quota {
- struct bch_sb_field field;
- struct bch_sb_quota_type q[QTYP_NR];
+ struct bch_replicas_entry_v1 entries[];
} __packed __aligned(8);
/* BCH_SB_FIELD_disk_groups: */
@@ -1492,99 +733,6 @@ struct bch_sb_field_disk_groups {
struct bch_disk_group entries[];
} __packed __aligned(8);
-/* BCH_SB_FIELD_counters */
-
-#define BCH_PERSISTENT_COUNTERS() \
- x(io_read, 0) \
- x(io_write, 1) \
- x(io_move, 2) \
- x(bucket_invalidate, 3) \
- x(bucket_discard, 4) \
- x(bucket_alloc, 5) \
- x(bucket_alloc_fail, 6) \
- x(btree_cache_scan, 7) \
- x(btree_cache_reap, 8) \
- x(btree_cache_cannibalize, 9) \
- x(btree_cache_cannibalize_lock, 10) \
- x(btree_cache_cannibalize_lock_fail, 11) \
- x(btree_cache_cannibalize_unlock, 12) \
- x(btree_node_write, 13) \
- x(btree_node_read, 14) \
- x(btree_node_compact, 15) \
- x(btree_node_merge, 16) \
- x(btree_node_split, 17) \
- x(btree_node_rewrite, 18) \
- x(btree_node_alloc, 19) \
- x(btree_node_free, 20) \
- x(btree_node_set_root, 21) \
- x(btree_path_relock_fail, 22) \
- x(btree_path_upgrade_fail, 23) \
- x(btree_reserve_get_fail, 24) \
- x(journal_entry_full, 25) \
- x(journal_full, 26) \
- x(journal_reclaim_finish, 27) \
- x(journal_reclaim_start, 28) \
- x(journal_write, 29) \
- x(read_promote, 30) \
- x(read_bounce, 31) \
- x(read_split, 33) \
- x(read_retry, 32) \
- x(read_reuse_race, 34) \
- x(move_extent_read, 35) \
- x(move_extent_write, 36) \
- x(move_extent_finish, 37) \
- x(move_extent_fail, 38) \
- x(move_extent_start_fail, 39) \
- x(copygc, 40) \
- x(copygc_wait, 41) \
- x(gc_gens_end, 42) \
- x(gc_gens_start, 43) \
- x(trans_blocked_journal_reclaim, 44) \
- x(trans_restart_btree_node_reused, 45) \
- x(trans_restart_btree_node_split, 46) \
- x(trans_restart_fault_inject, 47) \
- x(trans_restart_iter_upgrade, 48) \
- x(trans_restart_journal_preres_get, 49) \
- x(trans_restart_journal_reclaim, 50) \
- x(trans_restart_journal_res_get, 51) \
- x(trans_restart_key_cache_key_realloced, 52) \
- x(trans_restart_key_cache_raced, 53) \
- x(trans_restart_mark_replicas, 54) \
- x(trans_restart_mem_realloced, 55) \
- x(trans_restart_memory_allocation_failure, 56) \
- x(trans_restart_relock, 57) \
- x(trans_restart_relock_after_fill, 58) \
- x(trans_restart_relock_key_cache_fill, 59) \
- x(trans_restart_relock_next_node, 60) \
- x(trans_restart_relock_parent_for_fill, 61) \
- x(trans_restart_relock_path, 62) \
- x(trans_restart_relock_path_intent, 63) \
- x(trans_restart_too_many_iters, 64) \
- x(trans_restart_traverse, 65) \
- x(trans_restart_upgrade, 66) \
- x(trans_restart_would_deadlock, 67) \
- x(trans_restart_would_deadlock_write, 68) \
- x(trans_restart_injected, 69) \
- x(trans_restart_key_cache_upgrade, 70) \
- x(trans_traverse_all, 71) \
- x(transaction_commit, 72) \
- x(write_super, 73) \
- x(trans_restart_would_deadlock_recursion_limit, 74) \
- x(trans_restart_write_buffer_flush, 75) \
- x(trans_restart_split_race, 76)
-
-enum bch_persistent_counters {
-#define x(t, n, ...) BCH_COUNTER_##t,
- BCH_PERSISTENT_COUNTERS()
-#undef x
- BCH_COUNTER_NR
-};
-
-struct bch_sb_field_counters {
- struct bch_sb_field field;
- __le64 d[];
-};
-
/*
* On clean shutdown, store btree roots and current journal sequence number in
* the superblock:
@@ -1662,69 +810,41 @@ struct bch_sb_field_downgrade {
#define BCH_VERSION_MINOR(_v) ((__u16) ((_v) & ~(~0U << 10)))
#define BCH_VERSION(_major, _minor) (((_major) << 10)|(_minor) << 0)
-#define RECOVERY_PASS_ALL_FSCK (1ULL << 63)
-
/*
* field 1: version name
* field 2: BCH_VERSION(major, minor)
* field 3: recovery passess required on upgrade
*/
#define BCH_METADATA_VERSIONS() \
- x(bkey_renumber, BCH_VERSION(0, 10), \
- RECOVERY_PASS_ALL_FSCK) \
- x(inode_btree_change, BCH_VERSION(0, 11), \
- RECOVERY_PASS_ALL_FSCK) \
- x(snapshot, BCH_VERSION(0, 12), \
- RECOVERY_PASS_ALL_FSCK) \
- x(inode_backpointers, BCH_VERSION(0, 13), \
- RECOVERY_PASS_ALL_FSCK) \
- x(btree_ptr_sectors_written, BCH_VERSION(0, 14), \
- RECOVERY_PASS_ALL_FSCK) \
- x(snapshot_2, BCH_VERSION(0, 15), \
- BIT_ULL(BCH_RECOVERY_PASS_fs_upgrade_for_subvolumes)| \
- BIT_ULL(BCH_RECOVERY_PASS_initialize_subvolumes)| \
- RECOVERY_PASS_ALL_FSCK) \
- x(reflink_p_fix, BCH_VERSION(0, 16), \
- BIT_ULL(BCH_RECOVERY_PASS_fix_reflink_p)) \
- x(subvol_dirent, BCH_VERSION(0, 17), \
- RECOVERY_PASS_ALL_FSCK) \
- x(inode_v2, BCH_VERSION(0, 18), \
- RECOVERY_PASS_ALL_FSCK) \
- x(freespace, BCH_VERSION(0, 19), \
- RECOVERY_PASS_ALL_FSCK) \
- x(alloc_v4, BCH_VERSION(0, 20), \
- RECOVERY_PASS_ALL_FSCK) \
- x(new_data_types, BCH_VERSION(0, 21), \
- RECOVERY_PASS_ALL_FSCK) \
- x(backpointers, BCH_VERSION(0, 22), \
- RECOVERY_PASS_ALL_FSCK) \
- x(inode_v3, BCH_VERSION(0, 23), \
- RECOVERY_PASS_ALL_FSCK) \
- x(unwritten_extents, BCH_VERSION(0, 24), \
- RECOVERY_PASS_ALL_FSCK) \
- x(bucket_gens, BCH_VERSION(0, 25), \
- BIT_ULL(BCH_RECOVERY_PASS_bucket_gens_init)| \
- RECOVERY_PASS_ALL_FSCK) \
- x(lru_v2, BCH_VERSION(0, 26), \
- RECOVERY_PASS_ALL_FSCK) \
- x(fragmentation_lru, BCH_VERSION(0, 27), \
- RECOVERY_PASS_ALL_FSCK) \
- x(no_bps_in_alloc_keys, BCH_VERSION(0, 28), \
- RECOVERY_PASS_ALL_FSCK) \
- x(snapshot_trees, BCH_VERSION(0, 29), \
- RECOVERY_PASS_ALL_FSCK) \
- x(major_minor, BCH_VERSION(1, 0), \
- 0) \
- x(snapshot_skiplists, BCH_VERSION(1, 1), \
- BIT_ULL(BCH_RECOVERY_PASS_check_snapshots)) \
- x(deleted_inodes, BCH_VERSION(1, 2), \
- BIT_ULL(BCH_RECOVERY_PASS_check_inodes)) \
- x(rebalance_work, BCH_VERSION(1, 3), \
- BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance))
+ x(bkey_renumber, BCH_VERSION(0, 10)) \
+ x(inode_btree_change, BCH_VERSION(0, 11)) \
+ x(snapshot, BCH_VERSION(0, 12)) \
+ x(inode_backpointers, BCH_VERSION(0, 13)) \
+ x(btree_ptr_sectors_written, BCH_VERSION(0, 14)) \
+ x(snapshot_2, BCH_VERSION(0, 15)) \
+ x(reflink_p_fix, BCH_VERSION(0, 16)) \
+ x(subvol_dirent, BCH_VERSION(0, 17)) \
+ x(inode_v2, BCH_VERSION(0, 18)) \
+ x(freespace, BCH_VERSION(0, 19)) \
+ x(alloc_v4, BCH_VERSION(0, 20)) \
+ x(new_data_types, BCH_VERSION(0, 21)) \
+ x(backpointers, BCH_VERSION(0, 22)) \
+ x(inode_v3, BCH_VERSION(0, 23)) \
+ x(unwritten_extents, BCH_VERSION(0, 24)) \
+ x(bucket_gens, BCH_VERSION(0, 25)) \
+ x(lru_v2, BCH_VERSION(0, 26)) \
+ x(fragmentation_lru, BCH_VERSION(0, 27)) \
+ x(no_bps_in_alloc_keys, BCH_VERSION(0, 28)) \
+ x(snapshot_trees, BCH_VERSION(0, 29)) \
+ x(major_minor, BCH_VERSION(1, 0)) \
+ x(snapshot_skiplists, BCH_VERSION(1, 1)) \
+ x(deleted_inodes, BCH_VERSION(1, 2)) \
+ x(rebalance_work, BCH_VERSION(1, 3)) \
+ x(member_seq, BCH_VERSION(1, 4))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
-#define x(t, n, upgrade_passes) bcachefs_metadata_version_##t = n,
+#define x(t, n) bcachefs_metadata_version_##t = n,
BCH_METADATA_VERSIONS()
#undef x
bcachefs_metadata_version_max
@@ -1786,7 +906,8 @@ struct bch_sb {
__le32 time_base_hi;
__le32 time_precision;
- __le64 flags[8];
+ __le64 flags[7];
+ __le64 write_time;
__le64 features[2];
__le64 compat[2];
@@ -2153,7 +1274,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
x(clock, 7) \
x(dev_usage, 8) \
x(log, 9) \
- x(overwrite, 10)
+ x(overwrite, 10) \
+ x(write_buffer_keys, 11)
enum {
#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
@@ -2162,6 +1284,19 @@ enum {
BCH_JSET_ENTRY_NR
};
+static inline bool jset_entry_is_key(struct jset_entry *e) /* true iff e->type is one of the four entry types listed in the switch */
+{
+ switch (e->type) {
+ case BCH_JSET_ENTRY_btree_keys:
+ case BCH_JSET_ENTRY_btree_root:
+ case BCH_JSET_ENTRY_overwrite:
+ case BCH_JSET_ENTRY_write_buffer_keys: /* all four cases share the return below */
+ return true;
+ }
+
+ return false; /* every other value of e->type */
+}
+
/*
* Journal sequence numbers can be blacklisted: bsets record the max sequence
* number of all the journal entries they contain updates for, so that on
@@ -2203,7 +1338,7 @@ struct jset_entry_usage {
struct jset_entry_data_usage {
struct jset_entry entry;
__le64 v;
- struct bch_replicas_entry r;
+ struct bch_replicas_entry_v1 r;
} __packed;
struct jset_entry_clock {
@@ -2224,8 +1359,8 @@ struct jset_entry_dev_usage {
__le32 dev;
__u32 pad;
- __le64 buckets_ec;
- __le64 _buckets_unavailable; /* No longer used */
+ __le64 _buckets_ec; /* No longer used */
+ __le64 _buckets_unavailable; /* No longer used */
struct jset_entry_dev_usage_type d[];
};
@@ -2239,7 +1374,7 @@ static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage
struct jset_entry_log {
struct jset_entry entry;
u8 d[];
-} __packed;
+} __packed __aligned(8);
/*
* On disk format for a journal entry:
diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h
index f05881f7e1..4b8fba754b 100644
--- a/fs/bcachefs/bcachefs_ioctl.h
+++ b/fs/bcachefs/bcachefs_ioctl.h
@@ -81,6 +81,11 @@ struct bch_ioctl_incremental {
#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc, 16, struct bch_ioctl_subvolume)
#define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17, struct bch_ioctl_subvolume)
+#define BCH_IOCTL_DEV_USAGE_V2 _IOWR(0xbc, 18, struct bch_ioctl_dev_usage_v2)
+
+#define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline)
+#define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online)
+
/* ioctls below act on a particular file, not the filesystem as a whole: */
#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *)
@@ -173,12 +178,18 @@ struct bch_ioctl_disk_set_state {
__u64 dev;
};
+#define BCH_DATA_OPS() /* x-macro list: x(name, number) for each data op */ \
+ x(scrub, 0) \
+ x(rereplicate, 1) \
+ x(migrate, 2) \
+ x(rewrite_old_nodes, 3) \
+ x(drop_extra_replicas, 4)
+
enum bch_data_ops {
- BCH_DATA_OP_SCRUB = 0,
- BCH_DATA_OP_REREPLICATE = 1,
- BCH_DATA_OP_MIGRATE = 2,
- BCH_DATA_OP_REWRITE_OLD_NODES = 3,
- BCH_DATA_OP_NR = 4,
+#define x(t, n) BCH_DATA_OP_##t = n, /* enumerators become BCH_DATA_OP_<lowercase name>; values 0-3 match the removed upper-case constants */
+ BCH_DATA_OPS()
+#undef x
+ BCH_DATA_OP_NR /* implicit value: one past the last x() entry, i.e. 5 now (was an explicit 4) */
};
/*
@@ -237,7 +248,7 @@ struct bch_ioctl_data_event {
struct bch_replicas_usage {
__u64 sectors;
- struct bch_replicas_entry r;
+ struct bch_replicas_entry_v1 r;
} __packed;
static inline struct bch_replicas_usage *
@@ -268,7 +279,7 @@ struct bch_ioctl_fs_usage {
__u32 replica_entries_bytes;
__u32 pad;
- struct bch_replicas_usage replicas[0];
+ struct bch_replicas_usage replicas[];
};
/*
@@ -292,7 +303,20 @@ struct bch_ioctl_dev_usage {
__u64 buckets;
__u64 sectors;
__u64 fragmented;
- } d[BCH_DATA_NR];
+ } d[10];
+};
+
+struct bch_ioctl_dev_usage_v2 { /* argument struct named by the BCH_IOCTL_DEV_USAGE_V2 definition */
+ __u64 dev;
+ __u32 flags;
+ __u8 state;
+ __u8 nr_data_types; /* presumably the number of d[] entries -- TODO confirm against the ioctl handler */
+ __u8 pad[6];
+
+ __u32 bucket_size;
+ __u64 nr_buckets;
+
+ struct bch_ioctl_dev_usage_type d[]; /* flexible array member, where struct bch_ioctl_dev_usage has a fixed d[10] */
};
/*
@@ -365,4 +389,24 @@ struct bch_ioctl_subvolume {
#define BCH_SUBVOL_SNAPSHOT_CREATE (1U << 0)
#define BCH_SUBVOL_SNAPSHOT_RO (1U << 1)
+/*
+ * BCH_IOCTL_FSCK_OFFLINE: run fsck from the 'bcachefs fsck' userspace command,
+ * but with the kernel's implementation of fsck:
+ */
+struct bch_ioctl_fsck_offline { /* argument struct named by the BCH_IOCTL_FSCK_OFFLINE definition */
+ __u64 flags;
+ __u64 opts; /* string */
+ __u64 nr_devs; /* element count of devs[], per the __counted_by() annotation */
+ __u64 devs[] __counted_by(nr_devs); /* flexible array; what each u64 encodes is not shown here -- presumably a user pointer to a device path, TODO confirm */
+};
+
+/*
+ * BCH_IOCTL_FSCK_ONLINE: run fsck from the 'bcachefs fsck' userspace command,
+ * but with the kernel's implementation of fsck; unlike BCH_IOCTL_FSCK_OFFLINE, no device list is passed:
+ */
+struct bch_ioctl_fsck_online { /* argument struct named by the BCH_IOCTL_FSCK_ONLINE definition */
+ __u64 flags;
+ __u64 opts; /* string */
+}; /* same two leading fields as struct bch_ioctl_fsck_offline, without nr_devs/devs[] */
+
#endif /* _BCACHEFS_IOCTL_H */
diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c
index abdb05507d..76e79a15ba 100644
--- a/fs/bcachefs/bkey.c
+++ b/fs/bcachefs/bkey.c
@@ -33,7 +33,7 @@ void bch2_bkey_packed_to_binary_text(struct printbuf *out,
next_key_bits -= 64;
}
- bch2_prt_u64_binary(out, v, min(word_bits, nr_key_bits));
+ bch2_prt_u64_base2_nbits(out, v, min(word_bits, nr_key_bits));
if (!next_key_bits)
break;
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index 761f5e33b1..5e52684764 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -63,8 +63,17 @@ static int key_type_cookie_invalid(struct bch_fs *c, struct bkey_s_c k,
return 0;
}
+static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k) /* @c is not referenced in the body */
+{
+ struct bkey_s_c_cookie ck = bkey_s_c_to_cookie(k); /* cookie-typed view of @k */
+
+ prt_printf(out, "%llu", le64_to_cpu(ck.v->cookie)); /* print the cookie, converted from little-endian, as unsigned decimal into @out */
+}
+
#define bch2_bkey_ops_cookie ((struct bkey_ops) { \
.key_invalid = key_type_cookie_invalid, \
+ .val_to_text = key_type_cookie_to_text, \
.min_val_size = 8, \
})
diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h
index 3a370b7087..03efe8ee56 100644
--- a/fs/bcachefs/bkey_methods.h
+++ b/fs/bcachefs/bkey_methods.h
@@ -28,10 +28,8 @@ struct bkey_ops {
void (*swab)(struct bkey_s);
bool (*key_normalize)(struct bch_fs *, struct bkey_s);
bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c);
- int (*trans_trigger)(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_i *, unsigned);
- int (*atomic_trigger)(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s_c, unsigned);
+ int (*trigger)(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s, unsigned);
void (*compat)(enum btree_id id, unsigned version,
unsigned big_endian, int write,
struct bkey_s);
@@ -78,84 +76,88 @@ static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct b
bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
-static inline int bch2_mark_key(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
-{
- const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type);
-
- return ops->atomic_trigger
- ? ops->atomic_trigger(trans, btree, level, old, new, flags)
- : 0;
-}
-
enum btree_update_flags {
__BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE = __BTREE_ITER_FLAGS_END,
__BTREE_UPDATE_NOJOURNAL,
- __BTREE_UPDATE_PREJOURNAL,
__BTREE_UPDATE_KEY_CACHE_RECLAIM,
- __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */
-
+ __BTREE_TRIGGER_NORUN,
+ __BTREE_TRIGGER_TRANSACTIONAL,
+ __BTREE_TRIGGER_ATOMIC,
+ __BTREE_TRIGGER_GC,
__BTREE_TRIGGER_INSERT,
__BTREE_TRIGGER_OVERWRITE,
-
- __BTREE_TRIGGER_GC,
__BTREE_TRIGGER_BUCKET_INVALIDATE,
- __BTREE_TRIGGER_NOATOMIC,
};
#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)
#define BTREE_UPDATE_NOJOURNAL (1U << __BTREE_UPDATE_NOJOURNAL)
-#define BTREE_UPDATE_PREJOURNAL (1U << __BTREE_UPDATE_PREJOURNAL)
#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM)
+/* Don't run triggers at all */
#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN)
+/*
+ * If set, we're running transactional triggers as part of a transaction commit:
+ * triggers may generate new updates
+ *
+ * If cleared, and either of BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE is set,
+ * we're running atomic triggers during a transaction commit: we have our
+ * journal reservation, we're holding btree node write locks, and we know the
+ * transaction is going to commit (returning an error here is a fatal error,
+ * causing us to go emergency read-only)
+ */
+#define BTREE_TRIGGER_TRANSACTIONAL (1U << __BTREE_TRIGGER_TRANSACTIONAL)
+#define BTREE_TRIGGER_ATOMIC (1U << __BTREE_TRIGGER_ATOMIC)
+
+/* We're in gc/fsck: running triggers to recalculate e.g. disk usage */
+#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC)
+
+/* @new is entering the btree */
#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT)
+
+/* @old is leaving the btree */
#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE)
-#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC)
+/* signal from bucket invalidate path to alloc trigger */
#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE)
-#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC)
-static inline int bch2_trans_mark_key(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_i *new,
- unsigned flags)
+static inline int bch2_key_trigger(struct btree_trans *trans, /* call the key type's ->trigger hook, if it has one */
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c old, struct bkey_s new,
+ unsigned flags)
{
- const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new->k.type);
+ const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type); /* ops looked up by @old's type, or by @new's type when @old's is zero */
- return ops->trans_trigger
- ? ops->trans_trigger(trans, btree_id, level, old, new, flags)
+ return ops->trigger
+ ? ops->trigger(trans, btree, level, old, new, flags) /* arguments are forwarded unchanged */
: 0; /* no hook set: return 0 */
}
-static inline int bch2_trans_mark_old(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, unsigned flags)
+static inline int bch2_key_trigger_old(struct btree_trans *trans, /* run the trigger for @old against a freshly initialised key at the same position */
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, unsigned flags)
{
struct bkey_i deleted;
bkey_init(&deleted.k);
deleted.k.p = old.k->p; /* stand-in "new" key takes @old's position */
- return bch2_trans_mark_key(trans, btree_id, level, old, &deleted,
- BTREE_TRIGGER_OVERWRITE|flags);
+ return bch2_key_trigger(trans, btree_id, level, old, bkey_i_to_s(&deleted),
+ BTREE_TRIGGER_OVERWRITE|flags); /* OVERWRITE is always OR'd into the caller's flags */
}
-static inline int bch2_trans_mark_new(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_i *new, unsigned flags)
+static inline int bch2_key_trigger_new(struct btree_trans *trans, /* run the trigger for @new against a freshly initialised key at the same position */
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s new, unsigned flags)
{
struct bkey_i deleted;
bkey_init(&deleted.k);
- deleted.k.p = new->k.p;
+ deleted.k.p = new.k->p; /* stand-in "old" key takes @new's position */
- return bch2_trans_mark_key(trans, btree_id, level, bkey_i_to_s_c(&deleted), new,
- BTREE_TRIGGER_INSERT|flags);
+ return bch2_key_trigger(trans, btree_id, level, bkey_i_to_s_c(&deleted), new,
+ BTREE_TRIGGER_INSERT|flags); /* INSERT is always OR'd into the caller's flags */
}
void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int);
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
index bb73ba9017..3fd1085b6c 100644
--- a/fs/bcachefs/bset.c
+++ b/fs/bcachefs/bset.c
@@ -68,6 +68,12 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b,
_k = _n) {
_n = bkey_p_next(_k);
+ if (!_k->u64s) {
+ printk(KERN_ERR "block %u key %5zu - u64s 0? aieee!\n", set,
+ _k->_data - i->_data);
+ break;
+ }
+
k = bkey_disassemble(b, _k, &uk);
printbuf_reset(&buf);
@@ -714,7 +720,7 @@ static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
{
struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
struct bkey_i min_key, max_key;
- unsigned j, cacheline = 1;
+ unsigned cacheline = 1;
t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
bset_ro_tree_capacity(b, t));
@@ -817,13 +823,12 @@ void bch2_bset_init_first(struct btree *b, struct bset *i)
set_btree_bset(b, t, i);
}
-void bch2_bset_init_next(struct bch_fs *c, struct btree *b,
- struct btree_node_entry *bne)
+void bch2_bset_init_next(struct btree *b, struct btree_node_entry *bne)
{
struct bset *i = &bne->keys;
struct bset_tree *t;
- BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c));
+ BUG_ON(bset_byte_offset(b, bne) >= btree_buf_bytes(b));
BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b)));
BUG_ON(b->nsets >= MAX_BSETS);
diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h
index 632c2b8c54..79c77baaa3 100644
--- a/fs/bcachefs/bset.h
+++ b/fs/bcachefs/bset.h
@@ -264,8 +264,7 @@ static inline struct bset *bset_next_set(struct btree *b,
void bch2_btree_keys_init(struct btree *);
void bch2_bset_init_first(struct btree *, struct bset *);
-void bch2_bset_init_next(struct bch_fs *, struct btree *,
- struct btree_node_entry *);
+void bch2_bset_init_next(struct btree *, struct btree_node_entry *);
void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
void bch2_bset_insert(struct btree *, struct btree_node_iter *,
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 79495cd7a7..d7c81beac1 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -60,7 +60,7 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
clear_btree_node_just_written(b);
- kvpfree(b->data, btree_bytes(c));
+ kvpfree(b->data, btree_buf_bytes(b));
b->data = NULL;
#ifdef __KERNEL__
kvfree(b->aux_data);
@@ -94,7 +94,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
BUG_ON(b->data || b->aux_data);
- b->data = kvpmalloc(btree_bytes(c), gfp);
+ b->data = kvpmalloc(btree_buf_bytes(b), gfp);
if (!b->data)
return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
#ifdef __KERNEL__
@@ -107,7 +107,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
b->aux_data = NULL;
#endif
if (!b->aux_data) {
- kvpfree(b->data, btree_bytes(c));
+ kvpfree(b->data, btree_buf_bytes(b));
b->data = NULL;
return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
}
@@ -126,7 +126,7 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
bkey_btree_ptr_init(&b->key);
INIT_LIST_HEAD(&b->list);
INIT_LIST_HEAD(&b->write_blocked);
- b->byte_order = ilog2(btree_bytes(c));
+ b->byte_order = ilog2(c->opts.btree_node_size);
return b;
}
@@ -408,7 +408,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
if (c->verify_data)
list_move(&c->verify_data->list, &bc->live);
- kvpfree(c->verify_ondisk, btree_bytes(c));
+ kvpfree(c->verify_ondisk, c->opts.btree_node_size);
for (i = 0; i < btree_id_nr_alive(c); i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
@@ -500,19 +500,21 @@ void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
* cannibalize_bucket() will take. This means every time we unlock the root of
* the btree, we need to release this lock if we have it held.
*/
-void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
+void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans) /* now takes the transaction; the filesystem is read from trans->c */
{
+ struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
if (bc->alloc_lock == current) { /* nothing is done unless the current task is the recorded holder */
- trace_and_count(c, btree_cache_cannibalize_unlock, c);
+ trace_and_count(c, btree_cache_cannibalize_unlock, trans); /* last argument is now the transaction instead of the filesystem */
bc->alloc_lock = NULL;
closure_wake_up(&bc->alloc_wait); /* wake anything waiting on alloc_wait */
}
}
-int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
+int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl)
{
+ struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
struct task_struct *old;
@@ -521,7 +523,7 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
goto success;
if (!cl) {
- trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
+ trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
}
@@ -535,11 +537,11 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
goto success;
}
- trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
+ trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
return -BCH_ERR_btree_cache_cannibalize_lock_blocked;
success:
- trace_and_count(c, btree_cache_cannibalize_lock, c);
+ trace_and_count(c, btree_cache_cannibalize_lock, trans);
return 0;
}
@@ -673,7 +675,7 @@ err:
mutex_unlock(&bc->lock);
- trace_and_count(c, btree_cache_cannibalize, c);
+ trace_and_count(c, btree_cache_cannibalize, trans);
goto out;
}
@@ -717,12 +719,6 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
if (IS_ERR(b))
return b;
- /*
- * Btree nodes read in from disk should not have the accessed bit set
- * initially, so that linear scans don't thrash the cache:
- */
- clear_btree_node_accessed(b);
-
bkey_copy(&b->key, k);
if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
/* raced with another fill: */
@@ -749,7 +745,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
if (path && sync)
bch2_trans_unlock_noassert(trans);
- bch2_btree_node_read(c, b, sync);
+ bch2_btree_node_read(trans, b, sync);
if (!sync)
return NULL;
@@ -1039,7 +1035,7 @@ retry:
goto retry;
if (IS_ERR(b) &&
- !bch2_btree_cache_cannibalize_lock(c, NULL))
+ !bch2_btree_cache_cannibalize_lock(trans, NULL))
goto retry;
if (IS_ERR(b))
@@ -1087,7 +1083,7 @@ lock_node:
EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
btree_check_header(c, b);
out:
- bch2_btree_cache_cannibalize_unlock(c);
+ bch2_btree_cache_cannibalize_unlock(trans);
return b;
}
@@ -1196,7 +1192,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struc
" failed unpacked %zu\n",
b->unpack_fn_len,
b->nr.live_u64s * sizeof(u64),
- btree_bytes(c) - sizeof(struct btree_node),
+ btree_buf_bytes(b) - sizeof(struct btree_node),
b->nr.live_u64s * 100 / btree_max_u64s(c),
b->sib_u64s[0],
b->sib_u64s[1],
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index cfb80b201d..6d33885fdb 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -17,8 +17,8 @@ int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
unsigned, enum btree_id);
-void bch2_btree_cache_cannibalize_unlock(struct bch_fs *);
-int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *);
+void bch2_btree_cache_cannibalize_unlock(struct btree_trans *);
+int bch2_btree_cache_cannibalize_lock(struct btree_trans *, struct closure *);
struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *);
struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool);
@@ -74,22 +74,27 @@ static inline bool btree_node_hashed(struct btree *b)
_iter = 0; _iter < (_tbl)->size; _iter++) \
rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash)
-static inline size_t btree_bytes(struct bch_fs *c)
+static inline size_t btree_buf_bytes(const struct btree *b) /* size of this node's buffer, derived from the node instead of the filesystem option */
{
- return c->opts.btree_node_size;
+ return 1UL << b->byte_order; /* byte_order is set with ilog2(c->opts.btree_node_size) in __btree_node_mem_alloc() */
}
-static inline size_t btree_max_u64s(struct bch_fs *c)
+static inline size_t btree_buf_max_u64s(const struct btree *b) /* per-node variant: based on this node's buffer size */
{
- return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64);
+ return (btree_buf_bytes(b) - sizeof(struct btree_node)) / sizeof(u64); /* buffer bytes minus sizeof(struct btree_node), in u64 units */
}
-static inline size_t btree_pages(struct bch_fs *c)
+static inline size_t btree_max_u64s(const struct bch_fs *c) /* filesystem-wide variant: based on the btree_node_size option */
{
- return btree_bytes(c) / PAGE_SIZE;
+ return (c->opts.btree_node_size - sizeof(struct btree_node)) / sizeof(u64); /* same formula as before, with the removed btree_bytes(c) written out */
}
-static inline unsigned btree_blocks(struct bch_fs *c)
+static inline size_t btree_sectors(const struct bch_fs *c) /* btree_node_size option expressed in sectors */
+{
+ return c->opts.btree_node_size >> SECTOR_SHIFT;
+}
+
+static inline unsigned btree_blocks(const struct bch_fs *c) /* only change: @c is now const-qualified */
{
return btree_sectors(c) >> c->block_bits; /* sector count shifted down by block_bits */
}
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 30ab78a245..1102995643 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -41,6 +41,14 @@
#define DROP_THIS_NODE 10
#define DROP_PREV_NODE 11
+static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k) /* repackage @k's two pointers as a struct bkey_s */
+{
+ return (struct bkey_s) {{{
+ (struct bkey *) k.k, /* explicit cast -- presumably dropping const, hence "unsafe"; confirm against struct bkey_s_c */
+ (struct bch_val *) k.v
+ }}};
+}
+
static bool should_restart_for_topology_repair(struct bch_fs *c)
{
return c->opts.fix_errors != FSCK_FIX_no &&
@@ -108,7 +116,7 @@ static int bch2_gc_check_topology(struct bch_fs *c,
ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
goto err;
} else {
- set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
+ set_bit(BCH_FS_initial_gc_unfixed, &c->flags);
}
}
}
@@ -134,7 +142,7 @@ static int bch2_gc_check_topology(struct bch_fs *c,
ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
goto err;
} else {
- set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
+ set_bit(BCH_FS_initial_gc_unfixed, &c->flags);
}
}
@@ -414,10 +422,9 @@ again:
continue;
}
- if (ret) {
- bch_err_msg(c, ret, "getting btree node");
+ bch_err_msg(c, ret, "getting btree node");
+ if (ret)
break;
- }
ret = btree_repair_node_boundaries(c, b, prev, cur);
@@ -482,10 +489,9 @@ again:
false);
ret = PTR_ERR_OR_ZERO(cur);
- if (ret) {
- bch_err_msg(c, ret, "getting btree node");
+ bch_err_msg(c, ret, "getting btree node");
+ if (ret)
goto err;
- }
ret = bch2_btree_repair_topology_recurse(trans, cur);
six_unlock_read(&cur->c.lock);
@@ -591,7 +597,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
"bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
@@ -609,7 +615,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
"bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen, g->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
@@ -619,7 +625,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
g->data_type = 0;
g->dirty_sectors = 0;
g->cached_sectors = 0;
- set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+ set_bit(BCH_FS_need_another_gc, &c->flags);
} else {
do_update = true;
}
@@ -631,7 +637,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
- bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
@@ -643,7 +649,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
"bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen, g->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
@@ -658,13 +664,13 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
"bucket %u:%zu different types of data in same bucket: %s, %s\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_types[g->data_type],
- bch2_data_types[data_type],
+ bch2_data_type_str(g->data_type),
+ bch2_data_type_str(data_type),
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
if (data_type == BCH_DATA_btree) {
g->data_type = data_type;
- set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+ set_bit(BCH_FS_need_another_gc, &c->flags);
} else {
do_update = true;
}
@@ -707,8 +713,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
new = kmalloc(bkey_bytes(k->k), GFP_KERNEL);
if (!new) {
- bch_err_msg(c, ret, "allocating new key");
ret = -BCH_ERR_ENOMEM_gc_repair_key;
+ bch_err_msg(c, ret, "allocating new key");
goto err;
}
@@ -807,9 +813,6 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
struct bch_fs *c = trans->c;
struct bkey deleted = KEY(0, 0, 0);
struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
- unsigned flags =
- BTREE_TRIGGER_GC|
- (initial ? BTREE_TRIGGER_NOATOMIC : 0);
int ret = 0;
deleted.p = k->k->p;
@@ -831,11 +834,10 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
}
ret = commit_do(trans, NULL, NULL, 0,
- bch2_mark_key(trans, btree_id, level, old, *k, flags));
+ bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(*k), BTREE_TRIGGER_GC));
fsck_err:
err:
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
@@ -996,7 +998,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
/* Continue marking when opted to not
* fix the error: */
ret = 0;
- set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
+ set_bit(BCH_FS_initial_gc_unfixed, &c->flags);
continue;
}
} else if (ret) {
@@ -1068,8 +1070,7 @@ static int bch2_gc_btree_init(struct btree_trans *trans,
fsck_err:
six_unlock_read(&b->c.lock);
- if (ret < 0)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
printbuf_exit(&buf);
return ret;
}
@@ -1105,10 +1106,8 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
: bch2_gc_btree(trans, i, initial, metadata_only);
}
- if (ret < 0)
- bch_err_fn(c, ret);
-
bch2_trans_put(trans);
+ bch_err_fn(c, ret);
return ret;
}
@@ -1159,13 +1158,10 @@ static void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
static void bch2_mark_superblocks(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i;
-
mutex_lock(&c->sb_lock); /* sb_lock is held across the whole loop */
gc_pos_set(c, gc_phase(GC_PHASE_SB));
- for_each_online_member(ca, c, i)
+ for_each_online_member(c, ca) /* @ca no longer has a local declaration -- presumably the macro declares it now, confirm */
bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC);
mutex_unlock(&c->sb_lock);
}
@@ -1190,13 +1186,10 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
static void bch2_gc_free(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i;
-
genradix_free(&c->reflink_gc_table);
genradix_free(&c->gc_stripes);
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
kvpfree(rcu_dereference_protected(ca->buckets_gc, 1),
sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket));
@@ -1218,7 +1211,7 @@ static int bch2_gc_done(struct bch_fs *c,
bool verify = !metadata_only &&
!c->opts.reconstruct_alloc &&
(!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)));
- unsigned i, dev;
+ unsigned i;
int ret = 0;
percpu_down_write(&c->mark_lock);
@@ -1230,14 +1223,14 @@ static int bch2_gc_done(struct bch_fs *c,
, ##__VA_ARGS__, dst->_f, src->_f))) \
dst->_f = src->_f
#define copy_dev_field(_err, _f, _msg, ...) \
- copy_field(_err, _f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__)
+ copy_field(_err, _f, "dev %u has wrong " _msg, ca->dev_idx, ##__VA_ARGS__)
#define copy_fs_field(_err, _f, _msg, ...) \
copy_field(_err, _f, "fs has wrong " _msg, ##__VA_ARGS__)
for (i = 0; i < ARRAY_SIZE(c->usage); i++)
bch2_fs_usage_acc_to_base(c, i);
- for_each_member_device(ca, c, dev) {
+ __for_each_member_device(c, ca) {
struct bch_dev_usage *dst = ca->usage_base;
struct bch_dev_usage *src = (void *)
bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc,
@@ -1245,15 +1238,12 @@ static int bch2_gc_done(struct bch_fs *c,
for (i = 0; i < BCH_DATA_NR; i++) {
copy_dev_field(dev_usage_buckets_wrong,
- d[i].buckets, "%s buckets", bch2_data_types[i]);
+ d[i].buckets, "%s buckets", bch2_data_type_str(i));
copy_dev_field(dev_usage_sectors_wrong,
- d[i].sectors, "%s sectors", bch2_data_types[i]);
+ d[i].sectors, "%s sectors", bch2_data_type_str(i));
copy_dev_field(dev_usage_fragmented_wrong,
- d[i].fragmented, "%s fragmented", bch2_data_types[i]);
+ d[i].fragmented, "%s fragmented", bch2_data_type_str(i));
}
-
- copy_dev_field(dev_usage_buckets_ec_wrong,
- buckets_ec, "buckets_ec");
}
{
@@ -1263,19 +1253,19 @@ static int bch2_gc_done(struct bch_fs *c,
bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr);
copy_fs_field(fs_usage_hidden_wrong,
- hidden, "hidden");
+ b.hidden, "hidden");
copy_fs_field(fs_usage_btree_wrong,
- btree, "btree");
+ b.btree, "btree");
if (!metadata_only) {
copy_fs_field(fs_usage_data_wrong,
- data, "data");
+ b.data, "data");
copy_fs_field(fs_usage_cached_wrong,
- cached, "cached");
+ b.cached, "cached");
copy_fs_field(fs_usage_reserved_wrong,
- reserved, "reserved");
+ b.reserved, "reserved");
copy_fs_field(fs_usage_nr_inodes_wrong,
- nr_inodes,"nr_inodes");
+ b.nr_inodes,"nr_inodes");
for (i = 0; i < BCH_REPLICAS_MAX; i++)
copy_fs_field(fs_usage_persistent_reserved_wrong,
@@ -1284,7 +1274,7 @@ static int bch2_gc_done(struct bch_fs *c,
}
for (i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry *e =
+ struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
if (metadata_only &&
@@ -1307,8 +1297,7 @@ static int bch2_gc_done(struct bch_fs *c,
fsck_err:
if (ca)
percpu_ref_put(&ca->ref);
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
percpu_up_write(&c->mark_lock);
printbuf_exit(&buf);
@@ -1317,9 +1306,6 @@ fsck_err:
static int bch2_gc_start(struct bch_fs *c)
{
- struct bch_dev *ca = NULL;
- unsigned i;
-
BUG_ON(c->usage_gc);
c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64),
@@ -1329,7 +1315,7 @@ static int bch2_gc_start(struct bch_fs *c)
return -BCH_ERR_ENOMEM_gc_start;
}
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
BUG_ON(ca->usage_gc);
ca->usage_gc = alloc_percpu(struct bch_dev_usage);
@@ -1348,10 +1334,7 @@ static int bch2_gc_start(struct bch_fs *c)
static int bch2_gc_reset(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i;
-
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
free_percpu(ca->usage_gc);
ca->usage_gc = NULL;
}
@@ -1389,9 +1372,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
enum bch_data_type type;
int ret;
- if (bkey_ge(iter->pos, POS(ca->dev_idx, ca->mi.nbuckets)))
- return 1;
-
old = bch2_alloc_to_v4(k, &old_convert);
new = *old;
@@ -1437,8 +1417,8 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
": got %s, should be %s",
iter->pos.inode, iter->pos.offset,
gc.gen,
- bch2_data_types[new.data_type],
- bch2_data_types[gc.data_type]))
+ bch2_data_type_str(new.data_type),
+ bch2_data_type_str(gc.data_type)))
new.data_type = gc.data_type;
#define copy_bucket_field(_errtype, _f) \
@@ -1448,7 +1428,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
": got %u, should be %u", \
iter->pos.inode, iter->pos.offset, \
gc.gen, \
- bch2_data_types[gc.data_type], \
+ bch2_data_type_str(gc.data_type), \
new._f, gc._f)) \
new._f = gc._f; \
@@ -1488,52 +1468,36 @@ fsck_err:
static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_dev *ca;
- unsigned i;
int ret = 0;
- for_each_member_device(ca, c, i) {
- ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
- POS(ca->dev_idx, ca->mi.first_bucket),
- BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW,
- bch2_alloc_write_key(trans, &iter, k, metadata_only));
-
- if (ret < 0) {
- bch_err_fn(c, ret);
+ for_each_member_device(c, ca) {
+ ret = bch2_trans_run(c,
+ for_each_btree_key_upto_commit(trans, iter, BTREE_ID_alloc,
+ POS(ca->dev_idx, ca->mi.first_bucket),
+ POS(ca->dev_idx, ca->mi.nbuckets - 1),
+ BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
+ bch2_alloc_write_key(trans, &iter, k, metadata_only)));
+ if (ret) {
percpu_ref_put(&ca->ref);
break;
}
}
- bch2_trans_put(trans);
- return ret < 0 ? ret : 0;
+ bch_err_fn(c, ret);
+ return ret;
}
static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
{
- struct bch_dev *ca;
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bucket *g;
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a;
- unsigned i;
- int ret;
-
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket),
GFP_KERNEL|__GFP_ZERO);
if (!buckets) {
percpu_ref_put(&ca->ref);
bch_err(c, "error allocating ca->buckets[gc]");
- ret = -BCH_ERR_ENOMEM_gc_alloc_start;
- goto err;
+ return -BCH_ERR_ENOMEM_gc_alloc_start;
}
buckets->first_bucket = ca->mi.first_bucket;
@@ -1541,42 +1505,38 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
rcu_assign_pointer(ca->buckets_gc, buckets);
}
- ret = for_each_btree_key2(trans, iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_PREFETCH, k, ({
- ca = bch_dev_bkey_exists(c, k.k->p.inode);
- g = gc_bucket(ca, k.k->p.offset);
-
- a = bch2_alloc_to_v4(k, &a_convert);
-
- g->gen_valid = 1;
- g->gen = a->gen;
-
- if (metadata_only &&
- (a->data_type == BCH_DATA_user ||
- a->data_type == BCH_DATA_cached ||
- a->data_type == BCH_DATA_parity)) {
- g->data_type = a->data_type;
- g->dirty_sectors = a->dirty_sectors;
- g->cached_sectors = a->cached_sectors;
- g->stripe = a->stripe;
- g->stripe_redundancy = a->stripe_redundancy;
- }
+ int ret = bch2_trans_run(c,
+ for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ({
+ struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode);
+ struct bucket *g = gc_bucket(ca, k.k->p.offset);
- 0;
- }));
-err:
- bch2_trans_put(trans);
- if (ret)
- bch_err_fn(c, ret);
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
+
+ g->gen_valid = 1;
+ g->gen = a->gen;
+
+ if (metadata_only &&
+ (a->data_type == BCH_DATA_user ||
+ a->data_type == BCH_DATA_cached ||
+ a->data_type == BCH_DATA_parity)) {
+ g->data_type = a->data_type;
+ g->dirty_sectors = a->dirty_sectors;
+ g->cached_sectors = a->cached_sectors;
+ g->stripe = a->stripe;
+ g->stripe_redundancy = a->stripe_redundancy;
+ }
+
+ 0;
+ })));
+ bch_err_fn(c, ret);
return ret;
}
static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only)
{
- struct bch_dev *ca;
- unsigned i;
-
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
struct bucket_array *buckets = gc_bucket_array(ca);
struct bucket *g;
@@ -1634,7 +1594,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans,
if (!r->refcount)
new->k.type = KEY_TYPE_deleted;
else
- *bkey_refcount(new) = cpu_to_le64(r->refcount);
+ *bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount);
}
fsck_err:
printbuf_exit(&buf);
@@ -1643,64 +1603,52 @@ fsck_err:
static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only)
{
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
size_t idx = 0;
- int ret = 0;
if (metadata_only)
return 0;
- trans = bch2_trans_get(c);
-
- ret = for_each_btree_key_commit(trans, iter,
- BTREE_ID_reflink, POS_MIN,
- BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_NOFAIL,
- bch2_gc_write_reflink_key(trans, &iter, k, &idx));
-
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_reflink, POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ bch2_gc_write_reflink_key(trans, &iter, k, &idx)));
c->reflink_gc_nr = 0;
- bch2_trans_put(trans);
return ret;
}
static int bch2_gc_reflink_start(struct bch_fs *c,
bool metadata_only)
{
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct reflink_gc *r;
- int ret = 0;
if (metadata_only)
return 0;
- trans = bch2_trans_get(c);
c->reflink_gc_nr = 0;
- for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN,
- BTREE_ITER_PREFETCH, k, ret) {
- const __le64 *refcount = bkey_refcount_c(k);
+ int ret = bch2_trans_run(c,
+ for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ({
+ const __le64 *refcount = bkey_refcount_c(k);
- if (!refcount)
- continue;
+ if (!refcount)
+ continue;
- r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
- GFP_KERNEL);
- if (!r) {
- ret = -BCH_ERR_ENOMEM_gc_reflink_start;
- break;
- }
+ struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table,
+ c->reflink_gc_nr++, GFP_KERNEL);
+ if (!r) {
+ ret = -BCH_ERR_ENOMEM_gc_reflink_start;
+ break;
+ }
- r->offset = k.k->p.offset;
- r->size = k.k->size;
- r->refcount = 0;
- }
- bch2_trans_iter_exit(trans, &iter);
+ r->offset = k.k->p.offset;
+ r->size = k.k->size;
+ r->refcount = 0;
+ 0;
+ })));
- bch2_trans_put(trans);
+ bch_err_fn(c, ret);
return ret;
}
@@ -1768,24 +1716,15 @@ fsck_err:
static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only)
{
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
if (metadata_only)
return 0;
- trans = bch2_trans_get(c);
-
- ret = for_each_btree_key_commit(trans, iter,
- BTREE_ID_stripes, POS_MIN,
- BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_NOFAIL,
- bch2_gc_write_stripes_key(trans, &iter, k));
-
- bch2_trans_put(trans);
- return ret;
+ return bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_stripes, POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ bch2_gc_write_stripes_key(trans, &iter, k)));
}
static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only)
@@ -1848,7 +1787,7 @@ again:
#endif
c->gc_count++;
- if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) ||
+ if (test_bit(BCH_FS_need_another_gc, &c->flags) ||
(!iter && bch2_test_restart_gc)) {
if (iter++ > 2) {
bch_info(c, "Unable to fix bucket gens, looping");
@@ -1860,7 +1799,7 @@ again:
* XXX: make sure gens we fixed got saved
*/
bch_info(c, "Second GC pass needed, restarting:");
- clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+ clear_bit(BCH_FS_need_another_gc, &c->flags);
__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
bch2_gc_stripes_reset(c, metadata_only);
@@ -1900,9 +1839,7 @@ out:
* allocator thread - issue wakeup in case they blocked on gc_lock:
*/
closure_wake_up(&c->freelist_wait);
-
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
@@ -1912,7 +1849,6 @@ static int gc_btree_gens_key(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
struct bkey_i *u;
int ret;
@@ -1970,12 +1906,7 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_i
int bch2_gc_gens(struct bch_fs *c)
{
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_dev *ca;
u64 b, start_time = local_clock();
- unsigned i;
int ret;
/*
@@ -1988,9 +1919,8 @@ int bch2_gc_gens(struct bch_fs *c)
trace_and_count(c, gc_gens_start, c);
down_read(&c->gc_lock);
- trans = bch2_trans_get(c);
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
struct bucket_gens *gens = bucket_gens(ca);
BUG_ON(ca->oldest_gen);
@@ -2007,33 +1937,31 @@ int bch2_gc_gens(struct bch_fs *c)
ca->oldest_gen[b] = gens->b[b];
}
- for (i = 0; i < BTREE_ID_NR; i++)
+ for (unsigned i = 0; i < BTREE_ID_NR; i++)
if (btree_type_has_ptrs(i)) {
c->gc_gens_btree = i;
c->gc_gens_pos = POS_MIN;
- ret = for_each_btree_key_commit(trans, iter, i,
- POS_MIN,
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
- k,
- NULL, NULL,
- BTREE_INSERT_NOFAIL,
- gc_btree_gens_key(trans, &iter, k));
- if (ret && !bch2_err_matches(ret, EROFS))
- bch_err_fn(c, ret);
+ ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, i,
+ POS_MIN,
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
+ k,
+ NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc,
+ gc_btree_gens_key(trans, &iter, k)));
if (ret)
goto err;
}
- ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
- POS_MIN,
- BTREE_ITER_PREFETCH,
- k,
- NULL, NULL,
- BTREE_INSERT_NOFAIL,
- bch2_alloc_write_oldest_gen(trans, &iter, k));
- if (ret && !bch2_err_matches(ret, EROFS))
- bch_err_fn(c, ret);
+ ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
+ POS_MIN,
+ BTREE_ITER_PREFETCH,
+ k,
+ NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc,
+ bch2_alloc_write_oldest_gen(trans, &iter, k)));
if (ret)
goto err;
@@ -2045,14 +1973,15 @@ int bch2_gc_gens(struct bch_fs *c)
bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
trace_and_count(c, gc_gens_end, c);
err:
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
kvfree(ca->oldest_gen);
ca->oldest_gen = NULL;
}
- bch2_trans_put(trans);
up_read(&c->gc_lock);
mutex_unlock(&c->gc_gens_lock);
+ if (!bch2_err_matches(ret, EROFS))
+ bch_err_fn(c, ret);
return ret;
}
@@ -2062,7 +1991,6 @@ static int bch2_gc_thread(void *arg)
struct io_clock *clock = &c->io_clock[WRITE];
unsigned long last = atomic64_read(&clock->now);
unsigned last_kick = atomic_read(&c->kick_gc);
- int ret;
set_freezable();
@@ -2102,11 +2030,8 @@ static int bch2_gc_thread(void *arg)
#if 0
ret = bch2_gc(c, false, false);
#else
- ret = bch2_gc_gens(c);
+ bch2_gc_gens(c);
#endif
- if (ret < 0)
- bch_err_fn(c, ret);
-
debug_check_no_locks_held();
}
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 5a720f0cd5..aa9b6cbe32 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -112,7 +112,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
unsigned flags = memalloc_nofs_save();
void *p;
- BUG_ON(size > btree_bytes(c));
+ BUG_ON(size > c->opts.btree_node_size);
*used_mempool = false;
p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
@@ -174,8 +174,8 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
ptrs = ptrs_end = ((void *) new_whiteouts + bytes);
- for (k = unwritten_whiteouts_start(c, b);
- k != unwritten_whiteouts_end(c, b);
+ for (k = unwritten_whiteouts_start(b);
+ k != unwritten_whiteouts_end(b);
k = bkey_p_next(k))
*--ptrs = k;
@@ -192,7 +192,7 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
verify_no_dups(b, new_whiteouts,
(void *) ((u64 *) new_whiteouts + b->whiteout_u64s));
- memcpy_u64s(unwritten_whiteouts_start(c, b),
+ memcpy_u64s(unwritten_whiteouts_start(b),
new_whiteouts, b->whiteout_u64s);
btree_bounce_free(c, bytes, used_mempool, new_whiteouts);
@@ -313,7 +313,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
}
bytes = sorting_entire_node
- ? btree_bytes(c)
+ ? btree_buf_bytes(b)
: __vstruct_bytes(struct btree_node, u64s);
out = btree_bounce_alloc(c, bytes, &used_mempool);
@@ -338,7 +338,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
if (sorting_entire_node) {
u64s = le16_to_cpu(out->keys.u64s);
- BUG_ON(bytes != btree_bytes(c));
+ BUG_ON(bytes != btree_buf_bytes(b));
/*
* Our temporary buffer is the same size as the btree node's
@@ -502,7 +502,7 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
bne = want_new_bset(c, b);
if (bne)
- bch2_bset_init_next(c, b, bne);
+ bch2_bset_init_next(b, bne);
bch2_btree_build_aux_trees(b);
@@ -524,7 +524,8 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
prt_printf(out, "at btree ");
bch2_btree_pos_to_text(out, c, b);
- prt_printf(out, "\n node offset %u", b->written);
+ prt_printf(out, "\n node offset %u/%u",
+ b->written, btree_ptr_sectors_written(&b->key));
if (i)
prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
prt_str(out, ": ");
@@ -830,6 +831,23 @@ static int bset_key_invalid(struct bch_fs *c, struct btree *b,
(rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0);
}
+/*
+ * Report whether the packed key @k, located in bset @i of btree node @b,
+ * looks usable.
+ *
+ * validate_bset_keys() calls this while scanning forward from a key it has
+ * decided to drop, looking for the next key worth keeping.
+ *
+ * A key is rejected when:
+ *  - its end (bkey_p_next(k)) lies past vstruct_last(i),
+ *  - its format field is greater than KEY_FORMAT_CURRENT, or
+ *  - __bch2_bkey_invalid() flags the unpacked key.
+ *
+ * Returns true if the key passes every check, false otherwise.
+ */
+static bool __bkey_valid(struct bch_fs *c, struct btree *b,
+			 struct bset *i, struct bkey_packed *k)
+{
+	if (bkey_p_next(k) > vstruct_last(i))
+		return false;
+
+	if (k->format > KEY_FORMAT_CURRENT)
+		return false;
+
+	struct printbuf buf = PRINTBUF;
+	struct bkey tmp;
+	struct bkey_s u = __bkey_disassemble(b, k, &tmp);
+	/*
+	 * NOTE(review): __bch2_bkey_invalid() is presumed to return nonzero
+	 * for a bad key and 0 for a good one (confirm against bkey_methods.c).
+	 * This predicate answers "is it valid", so the result must be negated;
+	 * returning it unchanged made the caller's scan treat bad keys as good.
+	 */
+	bool ret = !__bch2_bkey_invalid(c, u.s_c, btree_node_type(b), READ, &buf);
+
+	/* buf is only passed along for the callee's use; its text is not reported here */
+	printbuf_exit(&buf);
+	return ret;
+}
+
static int validate_bset_keys(struct bch_fs *c, struct btree *b,
struct bset *i, int write,
bool have_retry, bool *saw_error)
@@ -845,6 +863,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
k != vstruct_last(i);) {
struct bkey_s u;
struct bkey tmp;
+ unsigned next_good_key;
if (btree_err_on(bkey_p_next(k) > vstruct_last(i),
-BCH_ERR_btree_node_read_err_fixable,
@@ -859,12 +878,8 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
-BCH_ERR_btree_node_read_err_fixable,
c, NULL, b, i,
btree_node_bkey_bad_format,
- "invalid bkey format %u", k->format)) {
- i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
- memmove_u64s_down(k, bkey_p_next(k),
- (u64 *) vstruct_end(i) - (u64 *) k);
- continue;
- }
+ "invalid bkey format %u", k->format))
+ goto drop_this_key;
/* XXX: validate k->u64s */
if (!write)
@@ -885,11 +900,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
c, NULL, b, i,
btree_node_bad_bkey,
"invalid bkey: %s", buf.buf);
-
- i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
- memmove_u64s_down(k, bkey_p_next(k),
- (u64 *) vstruct_end(i) - (u64 *) k);
- continue;
+ goto drop_this_key;
}
if (write)
@@ -906,21 +917,45 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
prt_printf(&buf, " > ");
bch2_bkey_to_text(&buf, u.k);
- bch2_dump_bset(c, b, i, 0);
-
if (btree_err(-BCH_ERR_btree_node_read_err_fixable,
c, NULL, b, i,
btree_node_bkey_out_of_order,
- "%s", buf.buf)) {
- i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
- memmove_u64s_down(k, bkey_p_next(k),
- (u64 *) vstruct_end(i) - (u64 *) k);
- continue;
- }
+ "%s", buf.buf))
+ goto drop_this_key;
}
prev = k;
k = bkey_p_next(k);
+ continue;
+drop_this_key:
+ next_good_key = k->u64s;
+
+ if (!next_good_key ||
+ (BSET_BIG_ENDIAN(i) == CPU_BIG_ENDIAN &&
+ version >= bcachefs_metadata_version_snapshot)) {
+ /*
+ * only do scanning if bch2_bkey_compat() has nothing to
+ * do
+ */
+
+ if (!__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) {
+ for (next_good_key = 1;
+ next_good_key < (u64 *) vstruct_last(i) - (u64 *) k;
+ next_good_key++)
+ if (__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key)))
+ goto got_good_key;
+
+ }
+
+ /*
+ * didn't find a good key, have to truncate the rest of
+ * the bset
+ */
+ next_good_key = (u64 *) vstruct_last(i) - (u64 *) k;
+ }
+got_good_key:
+ le16_add_cpu(&i->u64s, -next_good_key);
+ memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k);
}
fsck_err:
printbuf_exit(&buf);
@@ -934,7 +969,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
struct sort_iter *iter;
struct btree_node *sorted;
struct bkey_packed *k;
- struct bch_extent_ptr *ptr;
struct bset *i;
bool used_mempool, blacklisted;
bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
@@ -943,6 +977,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
unsigned ptr_written = btree_ptr_sectors_written(&b->key);
struct printbuf buf = PRINTBUF;
int ret = 0, retry_read = 0, write = READ;
+ u64 start_time = local_clock();
b->version_ondisk = U16_MAX;
/* We might get called multiple times on read retry: */
@@ -968,12 +1003,20 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
struct bch_btree_ptr_v2 *bp =
&bkey_i_to_btree_ptr_v2(&b->key)->v;
+ bch2_bpos_to_text(&buf, b->data->min_key);
+ prt_str(&buf, "-");
+ bch2_bpos_to_text(&buf, b->data->max_key);
+
btree_err_on(b->data->keys.seq != bp->seq,
-BCH_ERR_btree_node_read_err_must_retry,
c, ca, b, NULL,
btree_node_bad_seq,
- "got wrong btree node (seq %llx want %llx)",
- b->data->keys.seq, bp->seq);
+ "got wrong btree node (want %llx got %llx)\n"
+ "got btree %s level %llu pos %s",
+ bp->seq, b->data->keys.seq,
+ bch2_btree_id_str(BTREE_NODE_ID(b->data)),
+ BTREE_NODE_LEVEL(b->data),
+ buf.buf);
} else {
btree_err_on(!b->data->keys.seq,
-BCH_ERR_btree_node_read_err_must_retry,
@@ -999,8 +1042,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
nonce = btree_nonce(i, b->written << 9);
- csum_bad = bch2_crc_cmp(b->data->csum,
- csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data));
+ struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
+ csum_bad = bch2_crc_cmp(b->data->csum, csum);
if (csum_bad)
bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
@@ -1008,7 +1051,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
-BCH_ERR_btree_node_read_err_want_retry,
c, ca, b, i,
bset_bad_csum,
- "invalid checksum");
+ "%s",
+ (printbuf_reset(&buf),
+ bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum),
+ buf.buf));
ret = bset_encrypt(c, i, b->written << 9);
if (bch2_fs_fatal_err_on(ret, c,
@@ -1037,8 +1083,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
"unknown checksum type %llu", BSET_CSUM_TYPE(i));
nonce = btree_nonce(i, b->written << 9);
- csum_bad = bch2_crc_cmp(bne->csum,
- csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne));
+ struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
+ csum_bad = bch2_crc_cmp(bne->csum, csum);
if (csum_bad)
bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
@@ -1046,7 +1092,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
-BCH_ERR_btree_node_read_err_want_retry,
c, ca, b, i,
bset_bad_csum,
- "invalid checksum");
+ "%s",
+ (printbuf_reset(&buf),
+ bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum),
+ buf.buf));
ret = bset_encrypt(c, i, b->written << 9);
if (bch2_fs_fatal_err_on(ret, c,
@@ -1111,7 +1160,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
ptr_written, b->written);
} else {
for (bne = write_block(b);
- bset_byte_offset(b, bne) < btree_bytes(c);
+ bset_byte_offset(b, bne) < btree_buf_bytes(b);
bne = (void *) bne + block_bytes(c))
btree_err_on(bne->keys.seq == b->data->keys.seq &&
!bch2_journal_seq_is_blacklisted(c,
@@ -1123,7 +1172,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
"found bset signature after last bset");
}
- sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool);
+ sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool);
sorted->keys.u64s = 0;
set_btree_bset(b, b->set, &b->data->keys);
@@ -1139,7 +1188,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
BUG_ON(b->nr.live_u64s != u64s);
- btree_bounce_free(c, btree_bytes(c), used_mempool, sorted);
+ btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted);
if (updated_range)
bch2_btree_node_drop_keys_outside_node(b);
@@ -1202,6 +1251,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
out:
mempool_free(iter, &c->fill_iter);
printbuf_exit(&buf);
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
return retry_read;
fsck_err:
if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
@@ -1234,7 +1284,7 @@ static void btree_node_read_work(struct work_struct *work)
rb->have_ioref = bch2_dev_get_ioref(ca, READ);
bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = rb->pick.ptr.offset;
- bio->bi_iter.bi_size = btree_bytes(c);
+ bio->bi_iter.bi_size = btree_buf_bytes(b);
if (rb->have_ioref) {
bio_set_dev(bio, ca->disk_sb.bdev);
@@ -1462,7 +1512,7 @@ fsck_err:
}
if (best >= 0) {
- memcpy(b->data, ra->buf[best], btree_bytes(c));
+ memcpy(b->data, ra->buf[best], btree_buf_bytes(b));
ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error);
} else {
ret = -1;
@@ -1528,7 +1578,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
for (i = 0; i < ra->nr; i++) {
ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
ra->bio[i] = bio_alloc_bioset(NULL,
- buf_pages(ra->buf[i], btree_bytes(c)),
+ buf_pages(ra->buf[i], btree_buf_bytes(b)),
REQ_OP_READ|REQ_SYNC|REQ_META,
GFP_NOFS,
&c->btree_bio);
@@ -1548,7 +1598,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
rb->pick = pick;
rb->bio.bi_iter.bi_sector = pick.ptr.offset;
rb->bio.bi_end_io = btree_node_read_all_replicas_endio;
- bch2_bio_map(&rb->bio, ra->buf[i], btree_bytes(c));
+ bch2_bio_map(&rb->bio, ra->buf[i], btree_buf_bytes(b));
if (rb->have_ioref) {
this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
@@ -1575,16 +1625,17 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
return 0;
}
-void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
+void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
bool sync)
{
+ struct bch_fs *c = trans->c;
struct extent_ptr_decoded pick;
struct btree_read_bio *rb;
struct bch_dev *ca;
struct bio *bio;
int ret;
- trace_and_count(c, btree_node_read, c, b);
+ trace_and_count(c, btree_node_read, trans, b);
if (bch2_verify_all_btree_replicas &&
!btree_node_read_all_replicas(c, b, sync))
@@ -1614,7 +1665,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
ca = bch_dev_bkey_exists(c, pick.ptr.dev);
bio = bio_alloc_bioset(NULL,
- buf_pages(b->data, btree_bytes(c)),
+ buf_pages(b->data, btree_buf_bytes(b)),
REQ_OP_READ|REQ_SYNC|REQ_META,
GFP_NOFS,
&c->btree_bio);
@@ -1628,7 +1679,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
INIT_WORK(&rb->work, btree_node_read_work);
bio->bi_iter.bi_sector = pick.ptr.offset;
bio->bi_end_io = btree_node_read_endio;
- bch2_bio_map(bio, b->data, btree_bytes(c));
+ bch2_bio_map(bio, b->data, btree_buf_bytes(b));
if (rb->have_ioref) {
this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
@@ -1637,7 +1688,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
if (sync) {
submit_bio_wait(bio);
-
+ bch2_latency_acct(ca, rb->start_time, READ);
btree_node_read_work(&rb->work);
} else {
submit_bio(bio);
@@ -1663,12 +1714,12 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
closure_init_stack(&cl);
do {
- ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+ ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
closure_sync(&cl);
} while (ret);
b = bch2_btree_node_mem_alloc(trans, level != 0);
- bch2_btree_cache_cannibalize_unlock(c);
+ bch2_btree_cache_cannibalize_unlock(trans);
BUG_ON(IS_ERR(b));
@@ -1677,7 +1728,7 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
set_btree_node_read_in_flight(b);
- bch2_btree_node_read(c, b, true);
+ bch2_btree_node_read(trans, b, true);
if (btree_node_read_error(b)) {
bch2_btree_node_hash_remove(&c->btree_cache, b);
@@ -1789,8 +1840,10 @@ static void btree_node_write_work(struct work_struct *work)
bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr,
bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
- if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key)))
+ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) {
+ ret = -BCH_ERR_btree_write_all_failed;
goto err;
+ }
if (wbio->wbio.first_btree_write) {
if (wbio->wbio.failed.nr) {
@@ -1800,9 +1853,9 @@ static void btree_node_write_work(struct work_struct *work)
ret = bch2_trans_do(c, NULL, NULL, 0,
bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
BCH_WATERMARK_reclaim|
- BTREE_INSERT_JOURNAL_RECLAIM|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_NOCHECK_RW,
+ BCH_TRANS_COMMIT_journal_reclaim|
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_no_check_rw,
!wbio->wbio.failed.nr));
if (ret)
goto err;
@@ -1885,7 +1938,6 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
static void btree_write_submit(struct work_struct *work)
{
struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work);
- struct bch_extent_ptr *ptr;
BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
bkey_copy(&tmp.k, &wbio->key);
@@ -2022,8 +2074,8 @@ do_write:
i->u64s = 0;
sort_iter_add(&sort_iter.iter,
- unwritten_whiteouts_start(c, b),
- unwritten_whiteouts_end(c, b));
+ unwritten_whiteouts_start(b),
+ unwritten_whiteouts_end(b));
SET_BSET_SEPARATE_WHITEOUTS(i, false);
b->whiteout_u64s = 0;
@@ -2199,7 +2251,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
bne = want_new_bset(c, b);
if (bne)
- bch2_bset_init_next(c, b, bne);
+ bch2_bset_init_next(b, bne);
bch2_btree_build_aux_trees(b);
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
index e0d7fa5b1d..e251cb6b96 100644
--- a/fs/bcachefs/btree_io.h
+++ b/fs/bcachefs/btree_io.h
@@ -130,7 +130,7 @@ void bch2_btree_init_next(struct btree_trans *, struct btree *);
int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *,
struct btree *, bool, bool *);
-void bch2_btree_node_read(struct bch_fs *, struct btree *, bool);
+void bch2_btree_node_read(struct btree_trans *, struct btree *, bool);
int bch2_btree_root_read(struct bch_fs *, enum btree_id,
const struct bkey_i *, unsigned);
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 816ecc3375..3ef338df82 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -13,6 +13,7 @@
#include "error.h"
#include "extents.h"
#include "journal.h"
+#include "journal_io.h"
#include "replicas.h"
#include "snapshot.h"
#include "trace.h"
@@ -21,8 +22,8 @@
#include <linux/prefetch.h>
static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *);
-static inline void btree_path_list_add(struct btree_trans *, struct btree_path *,
- struct btree_path *);
+static inline void btree_path_list_add(struct btree_trans *,
+ btree_path_idx_t, btree_path_idx_t);
static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter)
{
@@ -33,7 +34,8 @@ static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter)
#endif
}
-static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *);
+static btree_path_idx_t btree_path_alloc(struct btree_trans *, btree_path_idx_t);
+static void bch2_trans_srcu_lock(struct btree_trans *);
static inline int __btree_path_cmp(const struct btree_path *l,
enum btree_id r_btree_id,
@@ -239,8 +241,9 @@ static void bch2_btree_path_verify(struct btree_trans *trans,
void bch2_trans_verify_paths(struct btree_trans *trans)
{
struct btree_path *path;
+ unsigned iter;
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, iter)
bch2_btree_path_verify(trans, path);
}
@@ -250,7 +253,7 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
BUG_ON(iter->btree_id >= BTREE_ID_NR);
- BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached);
+ BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != btree_iter_path(trans, iter)->cached);
BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
@@ -260,8 +263,8 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
!btree_type_has_snapshot_field(iter->btree_id));
if (iter->update_path)
- bch2_btree_path_verify(trans, iter->update_path);
- bch2_btree_path_verify(trans, iter->path);
+ bch2_btree_path_verify(trans, &trans->paths[iter->update_path]);
+ bch2_btree_path_verify(trans, btree_iter_path(trans, iter));
}
static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
@@ -330,12 +333,12 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
struct bpos pos, bool key_cache)
{
struct btree_path *path;
- unsigned idx;
+ struct trans_for_each_path_inorder_iter iter;
struct printbuf buf = PRINTBUF;
btree_trans_sort_paths(trans);
- trans_for_each_path_inorder(trans, path, idx) {
+ trans_for_each_path_inorder(trans, path, iter) {
int cmp = cmp_int(path->btree_id, id) ?:
cmp_int(path->cached, key_cache);
@@ -415,8 +418,9 @@ void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
struct bkey_packed *where)
{
struct btree_path *path;
+ unsigned i;
- trans_for_each_path_with_node(trans, b, path) {
+ trans_for_each_path_with_node(trans, b, path, i) {
__bch2_btree_path_fix_key_modified(path, b, where);
bch2_btree_path_verify_level(trans, path, b->c.level);
}
@@ -523,6 +527,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans,
{
struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where);
struct btree_path *linked;
+ unsigned i;
if (node_iter != &path->l[b->c.level].iter) {
__bch2_btree_node_iter_fix(path, b, node_iter, t,
@@ -532,7 +537,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans,
bch2_btree_node_iter_verify(node_iter, b);
}
- trans_for_each_path_with_node(trans, b, linked) {
+ trans_for_each_path_with_node(trans, b, linked, i) {
__bch2_btree_node_iter_fix(linked, b,
&linked->l[b->c.level].iter, t,
where, clobber_u64s, new_u64s);
@@ -647,7 +652,6 @@ void bch2_btree_path_level_init(struct btree_trans *trans,
static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, struct btree *b)
{
struct bch_fs *c = trans->c;
- struct btree_insert_entry *i;
trans_for_each_update(trans, i)
if (!i->cached &&
@@ -655,7 +659,7 @@ static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, str
i->btree_id == b->c.btree_id &&
bpos_cmp(i->k->k.p, b->data->min_key) >= 0 &&
bpos_cmp(i->k->k.p, b->data->max_key) <= 0) {
- i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v;
+ i->old_v = bch2_btree_path_peek_slot(trans->paths + i->path, &i->old_k).v;
if (unlikely(trans->journal_replay_not_finished)) {
struct bkey_i *j_k =
@@ -674,14 +678,22 @@ static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, str
* A btree node is being replaced - update the iterator to point to the new
* node:
*/
-void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
+void bch2_trans_node_add(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b)
{
- struct btree_path *path;
+ struct btree_path *prev;
+
+ BUG_ON(!btree_path_pos_in_node(path, b));
+
+ while ((prev = prev_btree_path(trans, path)) &&
+ btree_path_pos_in_node(prev, b))
+ path = prev;
- trans_for_each_path(trans, path)
- if (path->uptodate == BTREE_ITER_UPTODATE &&
- !path->cached &&
- btree_path_pos_in_node(path, b)) {
+ for (;
+ path && btree_path_pos_in_node(path, b);
+ path = next_btree_path(trans, path))
+ if (path->uptodate == BTREE_ITER_UPTODATE && !path->cached) {
enum btree_node_locked_type t =
btree_lock_want(path, b->c.level);
@@ -704,8 +716,9 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b)
{
struct btree_path *path;
+ unsigned i;
- trans_for_each_path_with_node(trans, b, path)
+ trans_for_each_path_with_node(trans, b, path, i)
__btree_path_level_init(path, b->c.level);
bch2_trans_revalidate_updates_in_node(trans, b);
@@ -781,7 +794,7 @@ static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *pat
struct btree_node_iter node_iter = l->iter;
struct bkey_packed *k;
struct bkey_buf tmp;
- unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
+ unsigned nr = test_bit(BCH_FS_started, &c->flags)
? (path->level > 1 ? 0 : 2)
: (path->level > 1 ? 1 : 16);
bool was_locked = btree_node_locked(path, path->level);
@@ -816,7 +829,7 @@ static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *p
struct bch_fs *c = trans->c;
struct bkey_s_c k;
struct bkey_buf tmp;
- unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
+ unsigned nr = test_bit(BCH_FS_started, &c->flags)
? (path->level > 1 ? 0 : 2)
: (path->level > 1 ? 1 : 16);
bool was_locked = btree_node_locked(path, path->level);
@@ -884,7 +897,8 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
bch2_bkey_buf_reassemble(out, c, k);
- if (flags & BTREE_ITER_PREFETCH)
+ if ((flags & BTREE_ITER_PREFETCH) &&
+ c->opts.btree_node_prefetch)
ret = btree_path_prefetch_j(trans, path, &jiter);
bch2_btree_and_journal_iter_exit(&jiter);
@@ -916,7 +930,8 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
bch2_bkey_buf_unpack(&tmp, c, l->b,
bch2_btree_node_iter_peek(&l->iter, l->b));
- if (flags & BTREE_ITER_PREFETCH) {
+ if ((flags & BTREE_ITER_PREFETCH) &&
+ c->opts.btree_node_prefetch) {
ret = btree_path_prefetch(trans, path);
if (ret)
goto err;
@@ -953,7 +968,8 @@ static int bch2_btree_path_traverse_all(struct btree_trans *trans)
struct bch_fs *c = trans->c;
struct btree_path *path;
unsigned long trace_ip = _RET_IP_;
- int i, ret = 0;
+ unsigned i;
+ int ret = 0;
if (trans->in_traverse_all)
return -BCH_ERR_transaction_restart_in_traverse_all;
@@ -963,7 +979,7 @@ retry_all:
trans->restarted = 0;
trans->last_restarted_ip = 0;
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
path->should_be_locked = false;
btree_trans_sort_paths(trans);
@@ -977,7 +993,7 @@ retry_all:
closure_init_stack(&cl);
do {
- ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+ ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
closure_sync(&cl);
} while (ret);
}
@@ -985,16 +1001,16 @@ retry_all:
/* Now, redo traversals in correct order: */
i = 0;
while (i < trans->nr_sorted) {
- path = trans->paths + trans->sorted[i];
+ btree_path_idx_t idx = trans->sorted[i];
/*
* Traversing a path can cause another path to be added at about
* the same position:
*/
- if (path->uptodate) {
- __btree_path_get(path, false);
- ret = bch2_btree_path_traverse_one(trans, path, 0, _THIS_IP_);
- __btree_path_put(path, false);
+ if (trans->paths[idx].uptodate) {
+ __btree_path_get(&trans->paths[idx], false);
+ ret = bch2_btree_path_traverse_one(trans, idx, 0, _THIS_IP_);
+ __btree_path_put(&trans->paths[idx], false);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
bch2_err_matches(ret, ENOMEM))
@@ -1013,7 +1029,7 @@ retry_all:
* then failed to relock a path - that's fine.
*/
err:
- bch2_btree_cache_cannibalize_unlock(c);
+ bch2_btree_cache_cannibalize_unlock(trans);
trans->in_traverse_all = false;
@@ -1099,10 +1115,11 @@ static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
* stashed in the iterator and returned from bch2_trans_exit().
*/
int bch2_btree_path_traverse_one(struct btree_trans *trans,
- struct btree_path *path,
+ btree_path_idx_t path_idx,
unsigned flags,
unsigned long trace_ip)
{
+ struct btree_path *path = &trans->paths[path_idx];
unsigned depth_want = path->level;
int ret = -((int) trans->restarted);
@@ -1126,6 +1143,8 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
goto out;
}
+ path = &trans->paths[path_idx];
+
if (unlikely(path->level >= BTREE_MAX_DEPTH))
goto out;
@@ -1188,39 +1207,38 @@ static inline void btree_path_copy(struct btree_trans *trans, struct btree_path
}
}
-static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src,
- bool intent)
+static btree_path_idx_t btree_path_clone(struct btree_trans *trans, btree_path_idx_t src,
+ bool intent)
{
- struct btree_path *new = btree_path_alloc(trans, src);
-
- btree_path_copy(trans, new, src);
- __btree_path_get(new, intent);
+ btree_path_idx_t new = btree_path_alloc(trans, src);
+ btree_path_copy(trans, trans->paths + new, trans->paths + src);
+ __btree_path_get(trans->paths + new, intent);
return new;
}
__flatten
-struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans,
- struct btree_path *path, bool intent,
- unsigned long ip)
+btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *trans,
+ btree_path_idx_t path, bool intent, unsigned long ip)
{
- __btree_path_put(path, intent);
+ __btree_path_put(trans->paths + path, intent);
path = btree_path_clone(trans, path, intent);
- path->preserve = false;
+ trans->paths[path].preserve = false;
return path;
}
-struct btree_path * __must_check
+btree_path_idx_t __must_check
__bch2_btree_path_set_pos(struct btree_trans *trans,
- struct btree_path *path, struct bpos new_pos,
- bool intent, unsigned long ip, int cmp)
+ btree_path_idx_t path_idx, struct bpos new_pos,
+ bool intent, unsigned long ip)
{
- unsigned level = path->level;
+ int cmp = bpos_cmp(new_pos, trans->paths[path_idx].pos);
bch2_trans_verify_not_in_restart(trans);
- EBUG_ON(!path->ref);
+ EBUG_ON(!trans->paths[path_idx].ref);
- path = bch2_btree_path_make_mut(trans, path, intent, ip);
+ path_idx = bch2_btree_path_make_mut(trans, path_idx, intent, ip);
+ struct btree_path *path = trans->paths + path_idx;
path->pos = new_pos;
trans->paths_sorted = false;
@@ -1231,7 +1249,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
goto out;
}
- level = btree_path_up_until_good_node(trans, path, cmp);
+ unsigned level = btree_path_up_until_good_node(trans, path, cmp);
if (btree_path_node(path, level)) {
struct btree_path_level *l = &path->l[level];
@@ -1261,7 +1279,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
}
out:
bch2_btree_path_verify(trans, path);
- return path;
+ return path_idx;
}
/* Btree path: main interface: */
@@ -1296,19 +1314,16 @@ static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btr
return NULL;
}
-static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path)
+static inline void __bch2_path_free(struct btree_trans *trans, btree_path_idx_t path)
{
- __bch2_btree_path_unlock(trans, path);
- btree_path_list_remove(trans, path);
- trans->paths_allocated &= ~(1ULL << path->idx);
+ __bch2_btree_path_unlock(trans, trans->paths + path);
+ btree_path_list_remove(trans, trans->paths + path);
+ __clear_bit(path, trans->paths_allocated);
}
-void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent)
+void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool intent)
{
- struct btree_path *dup;
-
- EBUG_ON(trans->paths + path->idx != path);
- EBUG_ON(!path->ref);
+ struct btree_path *path = trans->paths + path_idx, *dup;
if (!__btree_path_put(path, intent))
return;
@@ -1322,7 +1337,7 @@ void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool inte
if (path->should_be_locked &&
!trans->restarted &&
- (!dup || !bch2_btree_path_relock_norestart(trans, dup, _THIS_IP_)))
+ (!dup || !bch2_btree_path_relock_norestart(trans, dup)))
return;
if (dup) {
@@ -1330,16 +1345,13 @@ void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool inte
dup->should_be_locked |= path->should_be_locked;
}
- __bch2_path_free(trans, path);
+ __bch2_path_free(trans, path_idx);
}
-static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *path,
+static void bch2_path_put_nokeep(struct btree_trans *trans, btree_path_idx_t path,
bool intent)
{
- EBUG_ON(trans->paths + path->idx != path);
- EBUG_ON(!path->ref);
-
- if (!__btree_path_put(path, intent))
+ if (!__btree_path_put(trans->paths + path, intent))
return;
__bch2_path_free(trans, path);
@@ -1362,9 +1374,6 @@ void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
noinline __cold
void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
{
- struct btree_insert_entry *i;
- struct btree_write_buffered_key *wb;
-
prt_printf(buf, "transaction updates for %s journal seq %llu",
trans->fn, trans->journal_res.seq);
prt_newline(buf);
@@ -1388,16 +1397,10 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
prt_newline(buf);
}
- trans_for_each_wb_update(trans, wb) {
- prt_printf(buf, "update: btree=%s wb=1 %pS",
- bch2_btree_id_str(wb->btree),
- (void *) i->ip_allocated);
- prt_newline(buf);
-
- prt_printf(buf, " new ");
- bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(&wb->k));
- prt_newline(buf);
- }
+ for (struct jset_entry *e = trans->journal_entries;
+ e != btree_trans_journal_entries_top(trans);
+ e = vstruct_next(e))
+ bch2_journal_entry_to_text(buf, trans->c, e);
printbuf_indent_sub(buf, 2);
}
@@ -1412,11 +1415,12 @@ void bch2_dump_trans_updates(struct btree_trans *trans)
printbuf_exit(&buf);
}
-noinline __cold
-void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path)
+static void bch2_btree_path_to_text(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx)
{
+ struct btree_path *path = trans->paths + path_idx;
+
prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ",
- path->idx, path->ref, path->intent_ref,
+ path_idx, path->ref, path->intent_ref,
path->preserve ? 'P' : ' ',
path->should_be_locked ? 'S' : ' ',
bch2_btree_id_str(path->btree_id),
@@ -1434,14 +1438,13 @@ static noinline __cold
void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans,
bool nosort)
{
- struct btree_path *path;
- unsigned idx;
+ struct trans_for_each_path_inorder_iter iter;
if (!nosort)
btree_trans_sort_paths(trans);
- trans_for_each_path_inorder(trans, path, idx)
- bch2_btree_path_to_text(out, path);
+ trans_for_each_path_idx_inorder(trans, iter)
+ bch2_btree_path_to_text(out, trans, iter.path_idx);
}
noinline __cold
@@ -1473,17 +1476,14 @@ static void bch2_trans_update_max_paths(struct btree_trans *trans)
{
struct btree_transaction_stats *s = btree_trans_stats(trans);
struct printbuf buf = PRINTBUF;
-
- if (!s)
- return;
+ size_t nr = bitmap_weight(trans->paths_allocated, trans->nr_paths);
bch2_trans_paths_to_text(&buf, trans);
if (!buf.allocation_failure) {
mutex_lock(&s->lock);
- if (s->nr_max_paths < hweight64(trans->paths_allocated)) {
- s->nr_max_paths = trans->nr_max_paths =
- hweight64(trans->paths_allocated);
+ if (nr > s->nr_max_paths) {
+ s->nr_max_paths = nr;
swap(s->max_paths_text, buf.buf);
}
mutex_unlock(&s->lock);
@@ -1491,64 +1491,121 @@ static void bch2_trans_update_max_paths(struct btree_trans *trans)
printbuf_exit(&buf);
- trans->nr_max_paths = hweight64(trans->paths_allocated);
+ trans->nr_paths_max = nr;
+}
+
+noinline __cold
+int __bch2_btree_trans_too_many_iters(struct btree_trans *trans)
+{
+ if (trace_trans_restart_too_many_iters_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_trans_paths_to_text(&buf, trans);
+ trace_trans_restart_too_many_iters(trans, _THIS_IP_, buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ count_event(trans->c, trans_restart_too_many_iters);
+
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters);
}
static noinline void btree_path_overflow(struct btree_trans *trans)
{
bch2_dump_trans_paths_updates(trans);
- panic("trans path overflow\n");
+ bch_err(trans->c, "trans path overflow");
}
-static inline struct btree_path *btree_path_alloc(struct btree_trans *trans,
- struct btree_path *pos)
+static noinline void btree_paths_realloc(struct btree_trans *trans)
{
- struct btree_path *path;
- unsigned idx;
+ unsigned nr = trans->nr_paths * 2;
+
+ void *p = kzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) +
+ sizeof(struct btree_trans_paths) +
+ nr * sizeof(struct btree_path) +
+ nr * sizeof(btree_path_idx_t) + 8 +
+ nr * sizeof(struct btree_insert_entry), GFP_KERNEL|__GFP_NOFAIL);
+
+ unsigned long *paths_allocated = p;
+ memcpy(paths_allocated, trans->paths_allocated, BITS_TO_LONGS(trans->nr_paths) * sizeof(unsigned long));
+ p += BITS_TO_LONGS(nr) * sizeof(unsigned long);
+
+ p += sizeof(struct btree_trans_paths);
+ struct btree_path *paths = p;
+ *trans_paths_nr(paths) = nr;
+ memcpy(paths, trans->paths, trans->nr_paths * sizeof(struct btree_path));
+ p += nr * sizeof(struct btree_path);
+
+ btree_path_idx_t *sorted = p;
+ memcpy(sorted, trans->sorted, trans->nr_sorted * sizeof(btree_path_idx_t));
+ p += nr * sizeof(btree_path_idx_t) + 8;
+
+ struct btree_insert_entry *updates = p;
+ memcpy(updates, trans->updates, trans->nr_paths * sizeof(struct btree_insert_entry));
+
+ unsigned long *old = trans->paths_allocated;
- if (unlikely(trans->paths_allocated ==
- ~((~0ULL << 1) << (BTREE_ITER_MAX - 1))))
- btree_path_overflow(trans);
+ rcu_assign_pointer(trans->paths_allocated, paths_allocated);
+ rcu_assign_pointer(trans->paths, paths);
+ rcu_assign_pointer(trans->sorted, sorted);
+ rcu_assign_pointer(trans->updates, updates);
- idx = __ffs64(~trans->paths_allocated);
+ trans->nr_paths = nr;
+
+ if (old != trans->_paths_allocated)
+ kfree_rcu_mightsleep(old);
+}
+
+static inline btree_path_idx_t btree_path_alloc(struct btree_trans *trans,
+ btree_path_idx_t pos)
+{
+ btree_path_idx_t idx = find_first_zero_bit(trans->paths_allocated, trans->nr_paths);
+
+ if (unlikely(idx == trans->nr_paths)) {
+ if (trans->nr_paths == BTREE_ITER_MAX) {
+ btree_path_overflow(trans);
+ return 0;
+ }
+
+ btree_paths_realloc(trans);
+ }
/*
* Do this before marking the new path as allocated, since it won't be
* initialized yet:
*/
- if (unlikely(idx > trans->nr_max_paths))
+ if (unlikely(idx > trans->nr_paths_max))
bch2_trans_update_max_paths(trans);
- trans->paths_allocated |= 1ULL << idx;
+ __set_bit(idx, trans->paths_allocated);
- path = &trans->paths[idx];
- path->idx = idx;
+ struct btree_path *path = &trans->paths[idx];
path->ref = 0;
path->intent_ref = 0;
path->nodes_locked = 0;
- path->alloc_seq++;
- btree_path_list_add(trans, pos, path);
+ btree_path_list_add(trans, pos, idx);
trans->paths_sorted = false;
- return path;
+ return idx;
}
-struct btree_path *bch2_path_get(struct btree_trans *trans,
- enum btree_id btree_id, struct bpos pos,
- unsigned locks_want, unsigned level,
- unsigned flags, unsigned long ip)
+btree_path_idx_t bch2_path_get(struct btree_trans *trans,
+ enum btree_id btree_id, struct bpos pos,
+ unsigned locks_want, unsigned level,
+ unsigned flags, unsigned long ip)
{
- struct btree_path *path, *path_pos = NULL;
+ struct btree_path *path;
bool cached = flags & BTREE_ITER_CACHED;
bool intent = flags & BTREE_ITER_INTENT;
- int i;
+ struct trans_for_each_path_inorder_iter iter;
+ btree_path_idx_t path_pos = 0, path_idx;
bch2_trans_verify_not_in_restart(trans);
bch2_trans_verify_locks(trans);
btree_trans_sort_paths(trans);
- trans_for_each_path_inorder(trans, path, i) {
+ trans_for_each_path_inorder(trans, path, iter) {
if (__btree_path_cmp(path,
btree_id,
cached,
@@ -1556,18 +1613,19 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
level) > 0)
break;
- path_pos = path;
+ path_pos = iter.path_idx;
}
if (path_pos &&
- path_pos->cached == cached &&
- path_pos->btree_id == btree_id &&
- path_pos->level == level) {
- __btree_path_get(path_pos, intent);
- path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
+ trans->paths[path_pos].cached == cached &&
+ trans->paths[path_pos].btree_id == btree_id &&
+ trans->paths[path_pos].level == level) {
+ __btree_path_get(trans->paths + path_pos, intent);
+ path_idx = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
+ path = trans->paths + path_idx;
} else {
- path = btree_path_alloc(trans, path_pos);
- path_pos = NULL;
+ path_idx = btree_path_alloc(trans, path_pos);
+ path = trans->paths + path_idx;
__btree_path_get(path, intent);
path->pos = pos;
@@ -1578,7 +1636,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
path->level = level;
path->locks_want = locks_want;
path->nodes_locked = 0;
- for (i = 0; i < ARRAY_SIZE(path->l); i++)
+ for (unsigned i = 0; i < ARRAY_SIZE(path->l); i++)
path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
#ifdef TRACK_PATH_ALLOCATED
path->ip_allocated = ip;
@@ -1604,7 +1662,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
if (locks_want > path->locks_want)
bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL);
- return path;
+ return path_idx;
}
struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
@@ -1659,9 +1717,10 @@ __bch2_btree_iter_traverse(struct btree_iter *iter)
int __must_check
bch2_btree_iter_traverse(struct btree_iter *iter)
{
+ struct btree_trans *trans = iter->trans;
int ret;
- iter->path = bch2_btree_path_set_pos(iter->trans, iter->path,
+ iter->path = bch2_btree_path_set_pos(trans, iter->path,
btree_iter_search_key(iter),
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
@@ -1670,7 +1729,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
if (ret)
return ret;
- btree_path_set_should_be_locked(iter->path);
+ btree_path_set_should_be_locked(trans->paths + iter->path);
return 0;
}
@@ -1682,14 +1741,15 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
struct btree *b = NULL;
int ret;
- EBUG_ON(iter->path->cached);
+ EBUG_ON(trans->paths[iter->path].cached);
bch2_btree_iter_verify(iter);
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (ret)
goto err;
- b = btree_path_node(iter->path, iter->path->level);
+ struct btree_path *path = btree_iter_path(trans, iter);
+ b = btree_path_node(path, path->level);
if (!b)
goto out;
@@ -1701,7 +1761,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
- btree_path_set_should_be_locked(iter->path);
+ btree_path_set_should_be_locked(btree_iter_path(trans, iter));
out:
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
@@ -1726,14 +1786,15 @@ struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter)
struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
{
struct btree_trans *trans = iter->trans;
- struct btree_path *path = iter->path;
struct btree *b = NULL;
int ret;
+ EBUG_ON(trans->paths[iter->path].cached);
bch2_trans_verify_not_in_restart(trans);
- EBUG_ON(iter->path->cached);
bch2_btree_iter_verify(iter);
+ struct btree_path *path = btree_iter_path(trans, iter);
+
/* already at end? */
if (!btree_path_node(path, path->level))
return NULL;
@@ -1763,17 +1824,19 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
* Haven't gotten to the end of the parent node: go back down to
* the next child node
*/
- path = iter->path =
- bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos),
- iter->flags & BTREE_ITER_INTENT,
- btree_iter_ip_allocated(iter));
+ iter->path = bch2_btree_path_set_pos(trans, iter->path,
+ bpos_successor(iter->pos),
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+ path = btree_iter_path(trans, iter);
btree_path_set_level_down(trans, path, iter->min_depth);
- ret = bch2_btree_path_traverse(trans, path, iter->flags);
+ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (ret)
goto err;
+ path = btree_iter_path(trans, iter);
b = path->l[path->level].b;
}
@@ -1783,8 +1846,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
- btree_path_set_should_be_locked(iter->path);
- BUG_ON(iter->path->uptodate);
+ btree_path_set_should_be_locked(btree_iter_path(trans, iter));
+ EBUG_ON(btree_iter_path(trans, iter)->uptodate);
out:
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
@@ -1799,23 +1862,15 @@ err:
inline bool bch2_btree_iter_advance(struct btree_iter *iter)
{
- if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) {
- struct bpos pos = iter->k.p;
- bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
- ? bpos_eq(pos, SPOS_MAX)
- : bkey_eq(pos, SPOS_MAX));
-
- if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
- pos = bkey_successor(iter, pos);
- bch2_btree_iter_set_pos(iter, pos);
- return ret;
- } else {
- if (!btree_path_node(iter->path, iter->path->level))
- return true;
+ struct bpos pos = iter->k.p;
+ bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
+ ? bpos_eq(pos, SPOS_MAX)
+ : bkey_eq(pos, SPOS_MAX));
- iter->advanced = true;
- return false;
- }
+ if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+ pos = bkey_successor(iter, pos);
+ bch2_btree_iter_set_pos(iter, pos);
+ return ret;
}
inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
@@ -1832,58 +1887,70 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
}
static noinline
-struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter)
+void bch2_btree_trans_peek_prev_updates(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c *k)
{
- struct btree_insert_entry *i;
- struct bkey_i *ret = NULL;
+ struct bpos end = path_l(btree_iter_path(trans, iter))->b->data->min_key;
- trans_for_each_update(iter->trans, i) {
- if (i->btree_id < iter->btree_id)
- continue;
- if (i->btree_id > iter->btree_id)
- break;
- if (bpos_lt(i->k->k.p, iter->path->pos))
- continue;
- if (i->key_cache_already_flushed)
- continue;
- if (!ret || bpos_lt(i->k->k.p, ret->k.p))
- ret = i->k;
- }
+ trans_for_each_update(trans, i)
+ if (!i->key_cache_already_flushed &&
+ i->btree_id == iter->btree_id &&
+ bpos_le(i->k->k.p, iter->pos) &&
+ bpos_ge(i->k->k.p, k->k ? k->k->p : end)) {
+ iter->k = i->k->k;
+ *k = bkey_i_to_s_c(i->k);
+ }
+}
- return ret;
+static noinline
+void bch2_btree_trans_peek_updates(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c *k)
+{
+ struct btree_path *path = btree_iter_path(trans, iter);
+ struct bpos end = path_l(path)->b->key.k.p;
+
+ trans_for_each_update(trans, i)
+ if (!i->key_cache_already_flushed &&
+ i->btree_id == iter->btree_id &&
+ bpos_ge(i->k->k.p, path->pos) &&
+ bpos_le(i->k->k.p, k->k ? k->k->p : end)) {
+ iter->k = i->k->k;
+ *k = bkey_i_to_s_c(i->k);
+ }
}
-static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter)
+static noinline
+void bch2_btree_trans_peek_slot_updates(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c *k)
{
- return iter->flags & BTREE_ITER_WITH_UPDATES
- ? __bch2_btree_trans_peek_updates(iter)
- : NULL;
+ trans_for_each_update(trans, i)
+ if (!i->key_cache_already_flushed &&
+ i->btree_id == iter->btree_id &&
+ bpos_eq(i->k->k.p, iter->pos)) {
+ iter->k = i->k->k;
+ *k = bkey_i_to_s_c(i->k);
+ }
}
static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
struct btree_iter *iter,
struct bpos end_pos)
{
- struct bkey_i *k;
-
- if (bpos_lt(iter->path->pos, iter->journal_pos))
- iter->journal_idx = 0;
+ struct btree_path *path = btree_iter_path(trans, iter);
- k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id,
- iter->path->level,
- iter->path->pos,
- end_pos,
- &iter->journal_idx);
-
- iter->journal_pos = k ? k->k.p : end_pos;
- return k;
+ return bch2_journal_keys_peek_upto(trans->c, iter->btree_id,
+ path->level,
+ path->pos,
+ end_pos,
+ &iter->journal_idx);
}
static noinline
struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
struct btree_iter *iter)
{
- struct bkey_i *k = bch2_btree_journal_peek(trans, iter, iter->path->pos);
+ struct btree_path *path = btree_iter_path(trans, iter);
+ struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos);
if (k) {
iter->k = k->k;
@@ -1898,9 +1965,10 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k)
{
+ struct btree_path *path = btree_iter_path(trans, iter);
struct bkey_i *next_journal =
bch2_btree_journal_peek(trans, iter,
- k.k ? k.k->p : path_l(iter->path)->b->key.k.p);
+ k.k ? k.k->p : path_l(path)->b->key.k.p);
if (next_journal) {
iter->k = next_journal->k;
@@ -1943,13 +2011,13 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos
ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
iter->flags|BTREE_ITER_CACHED) ?:
- bch2_btree_path_relock(trans, iter->path, _THIS_IP_);
+ bch2_btree_path_relock(trans, btree_iter_path(trans, iter), _THIS_IP_);
if (unlikely(ret))
return bkey_s_c_err(ret);
- btree_path_set_should_be_locked(iter->key_cache_path);
+ btree_path_set_should_be_locked(trans->paths + iter->key_cache_path);
- k = bch2_btree_path_peek_slot(iter->key_cache_path, &u);
+ k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u);
if (k.k && !bkey_err(k)) {
iter->k = u;
k.k = &iter->k;
@@ -1960,11 +2028,10 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos
static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
{
struct btree_trans *trans = iter->trans;
- struct bkey_i *next_update;
struct bkey_s_c k, k2;
int ret;
- EBUG_ON(iter->path->cached);
+ EBUG_ON(btree_iter_path(trans, iter)->cached);
bch2_btree_iter_verify(iter);
while (1) {
@@ -1982,7 +2049,8 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
goto out;
}
- l = path_l(iter->path);
+ struct btree_path *path = btree_iter_path(trans, iter);
+ l = path_l(path);
if (unlikely(!l->b)) {
/* No btree nodes at requested level: */
@@ -1991,7 +2059,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
goto out;
}
- btree_path_set_should_be_locked(iter->path);
+ btree_path_set_should_be_locked(path);
k = btree_path_level_peek_all(trans->c, l, &iter->k);
@@ -2009,14 +2077,9 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL))
k = btree_trans_peek_journal(trans, iter, k);
- next_update = btree_trans_peek_updates(iter);
-
- if (next_update &&
- bpos_le(next_update->k.p,
- k.k ? k.k->p : l->b->key.k.p)) {
- iter->k = next_update->k;
- k = bkey_i_to_s_c(next_update);
- }
+ if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) &&
+ trans->nr_updates))
+ bch2_btree_trans_peek_updates(trans, iter, &k);
if (k.k && bkey_deleted(k.k)) {
/*
@@ -2066,13 +2129,12 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
struct bpos iter_pos;
int ret;
- EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
EBUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && bkey_eq(end, POS_MAX));
if (iter->update_path) {
bch2_path_put_nokeep(trans, iter->update_path,
iter->flags & BTREE_ITER_INTENT);
- iter->update_path = NULL;
+ iter->update_path = 0;
}
bch2_btree_iter_verify_entry_exit(iter);
@@ -2100,10 +2162,10 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
goto end;
if (iter->update_path &&
- !bkey_eq(iter->update_path->pos, k.k->p)) {
+ !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) {
bch2_path_put_nokeep(trans, iter->update_path,
iter->flags & BTREE_ITER_INTENT);
- iter->update_path = NULL;
+ iter->update_path = 0;
}
if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
@@ -2123,7 +2185,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
* advance, same as on exit for iter->path, but only up
* to snapshot
*/
- __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT);
+ __btree_path_get(trans->paths + iter->path, iter->flags & BTREE_ITER_INTENT);
iter->update_path = iter->path;
iter->update_path = bch2_btree_path_set_pos(trans,
@@ -2179,14 +2241,14 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
- btree_path_set_should_be_locked(iter->path);
+ btree_path_set_should_be_locked(btree_iter_path(trans, iter));
out_no_locked:
if (iter->update_path) {
- ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_);
+ ret = bch2_btree_path_relock(trans, trans->paths + iter->update_path, _THIS_IP_);
if (unlikely(ret))
k = bkey_s_c_err(ret);
else
- btree_path_set_should_be_locked(iter->update_path);
+ btree_path_set_should_be_locked(trans->paths + iter->update_path);
}
if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
@@ -2208,103 +2270,6 @@ end:
}
/**
- * bch2_btree_iter_peek_all_levels() - returns the first key greater than or
- * equal to iterator's current position, returning keys from every level of the
- * btree. For keys at different levels of the btree that compare equal, the key
- * from the lower level (leaf) is returned first.
- * @iter: iterator to peek from
- *
- * Returns: key if found, or an error extractable with bkey_err().
- */
-struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter)
-{
- struct btree_trans *trans = iter->trans;
- struct bkey_s_c k;
- int ret;
-
- EBUG_ON(iter->path->cached);
- bch2_btree_iter_verify(iter);
- BUG_ON(iter->path->level < iter->min_depth);
- BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
- EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS));
-
- while (1) {
- iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos,
- iter->flags & BTREE_ITER_INTENT,
- btree_iter_ip_allocated(iter));
-
- ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
- if (unlikely(ret)) {
- /* ensure that iter->k is consistent with iter->pos: */
- bch2_btree_iter_set_pos(iter, iter->pos);
- k = bkey_s_c_err(ret);
- goto out_no_locked;
- }
-
- /* Already at end? */
- if (!btree_path_node(iter->path, iter->path->level)) {
- k = bkey_s_c_null;
- goto out_no_locked;
- }
-
- k = btree_path_level_peek_all(trans->c,
- &iter->path->l[iter->path->level], &iter->k);
-
- /* Check if we should go up to the parent node: */
- if (!k.k ||
- (iter->advanced &&
- bpos_eq(path_l(iter->path)->b->key.k.p, iter->pos))) {
- iter->pos = path_l(iter->path)->b->key.k.p;
- btree_path_set_level_up(trans, iter->path);
- iter->advanced = false;
- continue;
- }
-
- /*
- * Check if we should go back down to a leaf:
- * If we're not in a leaf node, we only return the current key
- * if it exactly matches iter->pos - otherwise we first have to
- * go back to the leaf:
- */
- if (iter->path->level != iter->min_depth &&
- (iter->advanced ||
- !k.k ||
- !bpos_eq(iter->pos, k.k->p))) {
- btree_path_set_level_down(trans, iter->path, iter->min_depth);
- iter->pos = bpos_successor(iter->pos);
- iter->advanced = false;
- continue;
- }
-
- /* Check if we should go to the next key: */
- if (iter->path->level == iter->min_depth &&
- iter->advanced &&
- k.k &&
- bpos_eq(iter->pos, k.k->p)) {
- iter->pos = bpos_successor(iter->pos);
- iter->advanced = false;
- continue;
- }
-
- if (iter->advanced &&
- iter->path->level == iter->min_depth &&
- !bpos_eq(k.k->p, iter->pos))
- iter->advanced = false;
-
- BUG_ON(iter->advanced);
- BUG_ON(!k.k);
- break;
- }
-
- iter->pos = k.k->p;
- btree_path_set_should_be_locked(iter->path);
-out_no_locked:
- bch2_btree_iter_verify(iter);
-
- return k;
-}
-
-/**
* bch2_btree_iter_next() - returns first key greater than iterator's current
* position
* @iter: iterator to peek from
@@ -2330,14 +2295,14 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
{
struct btree_trans *trans = iter->trans;
struct bpos search_key = iter->pos;
- struct btree_path *saved_path = NULL;
struct bkey_s_c k;
struct bkey saved_k;
const struct bch_val *saved_v;
+ btree_path_idx_t saved_path = 0;
int ret;
- EBUG_ON(iter->path->cached || iter->path->level);
- EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES);
+ EBUG_ON(btree_iter_path(trans, iter)->cached ||
+ btree_iter_path(trans, iter)->level);
if (iter->flags & BTREE_ITER_WITH_JOURNAL)
return bkey_s_c_err(-EIO);
@@ -2361,14 +2326,18 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
goto out_no_locked;
}
- k = btree_path_level_peek(trans, iter->path,
- &iter->path->l[0], &iter->k);
+ struct btree_path *path = btree_iter_path(trans, iter);
+
+ k = btree_path_level_peek(trans, path, &path->l[0], &iter->k);
if (!k.k ||
((iter->flags & BTREE_ITER_IS_EXTENTS)
? bpos_ge(bkey_start_pos(k.k), search_key)
: bpos_gt(k.k->p, search_key)))
- k = btree_path_level_prev(trans, iter->path,
- &iter->path->l[0], &iter->k);
+ k = btree_path_level_prev(trans, path, &path->l[0], &iter->k);
+
+ if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) &&
+ trans->nr_updates))
+ bch2_btree_trans_peek_prev_updates(trans, iter, &k);
if (likely(k.k)) {
if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) {
@@ -2384,13 +2353,13 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
bch2_path_put_nokeep(trans, iter->path,
iter->flags & BTREE_ITER_INTENT);
iter->path = saved_path;
- saved_path = NULL;
+ saved_path = 0;
iter->k = saved_k;
k.v = saved_v;
goto got_key;
}
- if (bch2_snapshot_is_ancestor(iter->trans->c,
+ if (bch2_snapshot_is_ancestor(trans->c,
iter->snapshot,
k.k->p.snapshot)) {
if (saved_path)
@@ -2398,6 +2367,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
iter->flags & BTREE_ITER_INTENT);
saved_path = btree_path_clone(trans, iter->path,
iter->flags & BTREE_ITER_INTENT);
+ path = btree_iter_path(trans, iter);
saved_k = *k.k;
saved_v = k.v;
}
@@ -2414,10 +2384,11 @@ got_key:
continue;
}
+ btree_path_set_should_be_locked(path);
break;
- } else if (likely(!bpos_eq(iter->path->l[0].b->data->min_key, POS_MIN))) {
+ } else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) {
/* Advance to previous leaf node: */
- search_key = bpos_predecessor(iter->path->l[0].b->data->min_key);
+ search_key = bpos_predecessor(path->l[0].b->data->min_key);
} else {
/* Start of btree: */
bch2_btree_iter_set_pos(iter, POS_MIN);
@@ -2434,8 +2405,6 @@ got_key:
if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
iter->pos.snapshot = iter->snapshot;
-
- btree_path_set_should_be_locked(iter->path);
out_no_locked:
if (saved_path)
bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT);
@@ -2470,8 +2439,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
bch2_btree_iter_verify(iter);
bch2_btree_iter_verify_entry_exit(iter);
- EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
- EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE));
+ EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE));
/* extents can't span inode numbers: */
if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
@@ -2495,13 +2463,13 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
if ((iter->flags & BTREE_ITER_CACHED) ||
!(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) {
- struct bkey_i *next_update;
+ k = bkey_s_c_null;
- if ((next_update = btree_trans_peek_updates(iter)) &&
- bpos_eq(next_update->k.p, iter->pos)) {
- iter->k = next_update->k;
- k = bkey_i_to_s_c(next_update);
- goto out;
+ if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) &&
+ trans->nr_updates)) {
+ bch2_btree_trans_peek_slot_updates(trans, iter, &k);
+ if (k.k)
+ goto out;
}
if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) &&
@@ -2516,7 +2484,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
goto out_no_locked;
}
- k = bch2_btree_path_peek_slot(iter->path, &iter->k);
+ k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k);
if (unlikely(!k.k))
goto out_no_locked;
} else {
@@ -2526,7 +2494,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
if (iter->flags & BTREE_ITER_IS_EXTENTS)
end.offset = U64_MAX;
- EBUG_ON(iter->path->level);
+ EBUG_ON(btree_iter_path(trans, iter)->level);
if (iter->flags & BTREE_ITER_INTENT) {
struct btree_iter iter2;
@@ -2572,7 +2540,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
}
}
out:
- btree_path_set_should_be_locked(iter->path);
+ btree_path_set_should_be_locked(btree_iter_path(trans, iter));
out_no_locked:
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
@@ -2619,17 +2587,17 @@ static void btree_trans_verify_sorted_refs(struct btree_trans *trans)
struct btree_path *path;
unsigned i;
- BUG_ON(trans->nr_sorted != hweight64(trans->paths_allocated));
+ BUG_ON(trans->nr_sorted != bitmap_weight(trans->paths_allocated, trans->nr_paths) - 1);
- trans_for_each_path(trans, path) {
+ trans_for_each_path(trans, path, i) {
BUG_ON(path->sorted_idx >= trans->nr_sorted);
- BUG_ON(trans->sorted[path->sorted_idx] != path->idx);
+ BUG_ON(trans->sorted[path->sorted_idx] != i);
}
for (i = 0; i < trans->nr_sorted; i++) {
unsigned idx = trans->sorted[i];
- EBUG_ON(!(trans->paths_allocated & (1ULL << idx)));
+ BUG_ON(!test_bit(idx, trans->paths_allocated));
BUG_ON(trans->paths[idx].sorted_idx != i);
}
}
@@ -2637,12 +2605,12 @@ static void btree_trans_verify_sorted_refs(struct btree_trans *trans)
static void btree_trans_verify_sorted(struct btree_trans *trans)
{
struct btree_path *path, *prev = NULL;
- unsigned i;
+ struct trans_for_each_path_inorder_iter iter;
if (!bch2_debug_check_iterators)
return;
- trans_for_each_path_inorder(trans, path, i) {
+ trans_for_each_path_inorder(trans, path, iter) {
if (prev && btree_path_cmp(prev, path) > 0) {
__bch2_dump_trans_paths_updates(trans, true);
panic("trans paths out of order!\n");
@@ -2699,42 +2667,40 @@ out:
static inline void btree_path_list_remove(struct btree_trans *trans,
struct btree_path *path)
{
- unsigned i;
-
EBUG_ON(path->sorted_idx >= trans->nr_sorted);
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
trans->nr_sorted--;
memmove_u64s_down_small(trans->sorted + path->sorted_idx,
trans->sorted + path->sorted_idx + 1,
- DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8));
+ DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx,
+ sizeof(u64) / sizeof(btree_path_idx_t)));
#else
array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx);
#endif
- for (i = path->sorted_idx; i < trans->nr_sorted; i++)
+ for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++)
trans->paths[trans->sorted[i]].sorted_idx = i;
-
- path->sorted_idx = U8_MAX;
}
static inline void btree_path_list_add(struct btree_trans *trans,
- struct btree_path *pos,
- struct btree_path *path)
+ btree_path_idx_t pos,
+ btree_path_idx_t path_idx)
{
- unsigned i;
+ struct btree_path *path = trans->paths + path_idx;
- path->sorted_idx = pos ? pos->sorted_idx + 1 : trans->nr_sorted;
+ path->sorted_idx = pos ? trans->paths[pos].sorted_idx + 1 : trans->nr_sorted;
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1,
trans->sorted + path->sorted_idx,
- DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8));
+ DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx,
+ sizeof(u64) / sizeof(btree_path_idx_t)));
trans->nr_sorted++;
- trans->sorted[path->sorted_idx] = path->idx;
+ trans->sorted[path->sorted_idx] = path_idx;
#else
- array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx);
+ array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path_idx);
#endif
- for (i = path->sorted_idx; i < trans->nr_sorted; i++)
+ for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++)
trans->paths[trans->sorted[i]].sorted_idx = i;
btree_trans_verify_sorted_refs(trans);
@@ -2751,9 +2717,10 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
if (iter->key_cache_path)
bch2_path_put(trans, iter->key_cache_path,
iter->flags & BTREE_ITER_INTENT);
- iter->path = NULL;
- iter->update_path = NULL;
- iter->key_cache_path = NULL;
+ iter->path = 0;
+ iter->update_path = 0;
+ iter->key_cache_path = 0;
+ iter->trans = NULL;
}
void bch2_trans_iter_init_outlined(struct btree_trans *trans,
@@ -2784,41 +2751,46 @@ void bch2_trans_node_iter_init(struct btree_trans *trans,
iter->min_depth = depth;
- BUG_ON(iter->path->locks_want < min(locks_want, BTREE_MAX_DEPTH));
- BUG_ON(iter->path->level != depth);
- BUG_ON(iter->min_depth != depth);
+ struct btree_path *path = btree_iter_path(trans, iter);
+ BUG_ON(path->locks_want < min(locks_want, BTREE_MAX_DEPTH));
+ BUG_ON(path->level != depth);
+ BUG_ON(iter->min_depth != depth);
}
void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
{
+ struct btree_trans *trans = src->trans;
+
*dst = *src;
if (src->path)
- __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT);
+ __btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_INTENT);
if (src->update_path)
- __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT);
- dst->key_cache_path = NULL;
+ __btree_path_get(trans->paths + src->update_path, src->flags & BTREE_ITER_INTENT);
+ dst->key_cache_path = 0;
}
void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
{
+ struct bch_fs *c = trans->c;
unsigned new_top = trans->mem_top + size;
- size_t old_bytes = trans->mem_bytes;
- size_t new_bytes = roundup_pow_of_two(new_top);
+ unsigned old_bytes = trans->mem_bytes;
+ unsigned new_bytes = roundup_pow_of_two(new_top);
int ret;
void *new_mem;
void *p;
- trans->mem_max = max(trans->mem_max, new_top);
-
WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
+ struct btree_transaction_stats *s = btree_trans_stats(trans);
+ s->max_mem = max(s->max_mem, new_bytes);
+
new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
if (unlikely(!new_mem)) {
bch2_trans_unlock(trans);
new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL);
if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
- new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
+ new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
new_bytes = BTREE_TRANS_MEM_MAX;
kfree(trans->mem);
}
@@ -2838,7 +2810,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
trans->mem_bytes = new_bytes;
if (old_bytes) {
- trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
+ trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
}
@@ -2860,8 +2832,9 @@ void bch2_trans_srcu_unlock(struct btree_trans *trans)
if (trans->srcu_held) {
struct bch_fs *c = trans->c;
struct btree_path *path;
+ unsigned i;
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
if (path->cached && !btree_node_locked(path, 0))
path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset);
@@ -2871,7 +2844,7 @@ void bch2_trans_srcu_unlock(struct btree_trans *trans)
}
}
-void bch2_trans_srcu_lock(struct btree_trans *trans)
+static void bch2_trans_srcu_lock(struct btree_trans *trans)
{
if (!trans->srcu_held) {
trans->srcu_idx = srcu_read_lock(&trans->c->btree_trans_barrier);
@@ -2893,14 +2866,16 @@ void bch2_trans_srcu_lock(struct btree_trans *trans)
u32 bch2_trans_begin(struct btree_trans *trans)
{
struct btree_path *path;
+ unsigned i;
u64 now;
bch2_trans_reset_updates(trans);
trans->restart_count++;
trans->mem_top = 0;
+ trans->journal_entries = NULL;
- trans_for_each_path(trans, path) {
+ trans_for_each_path(trans, path, i) {
path->should_be_locked = false;
/*
@@ -2917,15 +2892,21 @@ u32 bch2_trans_begin(struct btree_trans *trans)
* iterators if we do that
*/
if (!path->ref && !path->preserve)
- __bch2_path_free(trans, path);
+ __bch2_path_free(trans, i);
else
path->preserve = false;
}
now = local_clock();
+
+ if (!IS_ENABLED(CONFIG_BCACHEFS_NO_LATENCY_ACCT) &&
+ time_after64(now, trans->last_begin_time + 10))
+ __bch2_time_stats_update(&btree_trans_stats(trans)->duration,
+ trans->last_begin_time, now);
+
if (!trans->restarted &&
(need_resched() ||
- now - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) {
+ time_after64(now, trans->last_begin_time + BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS))) {
drop_locks_do(trans, (cond_resched(), 0));
now = local_clock();
}
@@ -2944,32 +2925,11 @@ u32 bch2_trans_begin(struct btree_trans *trans)
return trans->restart_count;
}
-static struct btree_trans *bch2_trans_alloc(struct bch_fs *c)
-{
- struct btree_trans *trans;
-
- if (IS_ENABLED(__KERNEL__)) {
- trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL);
- if (trans)
- return trans;
- }
-
- trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
- /*
- * paths need to be zeroed, bch2_check_for_deadlock looks at
- * paths in other threads
- */
- memset(&trans->paths, 0, sizeof(trans->paths));
- return trans;
-}
-
-const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
+const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR] = { "(unknown)" };
unsigned bch2_trans_get_fn_idx(const char *fn)
{
- unsigned i;
-
- for (i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++)
+ for (unsigned i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++)
if (!bch2_btree_transaction_fns[i] ||
bch2_btree_transaction_fns[i] == fn) {
bch2_btree_transaction_fns[i] = fn;
@@ -2977,76 +2937,92 @@ unsigned bch2_trans_get_fn_idx(const char *fn)
}
pr_warn_once("BCH_TRANSACTIONS_NR not big enough!");
- return i;
+ return 0;
}
struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
__acquires(&c->btree_trans_barrier)
{
struct btree_trans *trans;
- struct btree_transaction_stats *s;
- trans = bch2_trans_alloc(c);
-
- memset(trans, 0, sizeof(*trans));
- trans->c = c;
- trans->fn = fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns)
- ? bch2_btree_transaction_fns[fn_idx] : NULL;
- trans->last_begin_time = local_clock();
- trans->fn_idx = fn_idx;
- trans->locking_wait.task = current;
- trans->journal_replay_not_finished =
- unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) &&
- atomic_inc_not_zero(&c->journal_keys.ref);
- closure_init_stack(&trans->ref);
-
- s = btree_trans_stats(trans);
- if (s && s->max_mem) {
- unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
-
- trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
-
- if (!unlikely(trans->mem)) {
- trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
- trans->mem_bytes = BTREE_TRANS_MEM_MAX;
- } else {
- trans->mem_bytes = expected_mem_bytes;
+ if (IS_ENABLED(__KERNEL__)) {
+ trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL);
+ if (trans) {
+ memset(trans, 0, offsetof(struct btree_trans, list));
+ goto got_trans;
}
}
- if (s) {
- trans->nr_max_paths = s->nr_max_paths;
- trans->wb_updates_size = s->wb_updates_size;
- }
-
- trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
- trans->srcu_lock_time = jiffies;
- trans->srcu_held = true;
+ trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
+ memset(trans, 0, sizeof(*trans));
+ closure_init_stack(&trans->ref);
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
+ seqmutex_lock(&c->btree_trans_lock);
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
struct btree_trans *pos;
+ pid_t pid = current->pid;
+
+ trans->locking_wait.task = current;
- seqmutex_lock(&c->btree_trans_lock);
list_for_each_entry(pos, &c->btree_trans_list, list) {
+ struct task_struct *pos_task = READ_ONCE(pos->locking_wait.task);
/*
* We'd much prefer to be stricter here and completely
* disallow multiple btree_trans in the same thread -
* but the data move path calls bch2_write when we
* already have a btree_trans initialized.
*/
- BUG_ON(trans->locking_wait.task->pid == pos->locking_wait.task->pid &&
+ BUG_ON(pos_task &&
+ pid == pos_task->pid &&
bch2_trans_locked(pos));
- if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) {
+ if (pos_task && pid < pos_task->pid) {
list_add_tail(&trans->list, &pos->list);
goto list_add_done;
}
}
- list_add_tail(&trans->list, &c->btree_trans_list);
+ }
+ list_add_tail(&trans->list, &c->btree_trans_list);
list_add_done:
- seqmutex_unlock(&c->btree_trans_lock);
+ seqmutex_unlock(&c->btree_trans_lock);
+got_trans:
+ trans->c = c;
+ trans->last_begin_time = local_clock();
+ trans->fn_idx = fn_idx;
+ trans->locking_wait.task = current;
+ trans->journal_replay_not_finished =
+ unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) &&
+ atomic_inc_not_zero(&c->journal_keys.ref);
+ trans->nr_paths = ARRAY_SIZE(trans->_paths);
+ trans->paths_allocated = trans->_paths_allocated;
+ trans->sorted = trans->_sorted;
+ trans->paths = trans->_paths;
+ trans->updates = trans->_updates;
+
+ *trans_paths_nr(trans->paths) = BTREE_ITER_INITIAL;
+
+ trans->paths_allocated[0] = 1;
+
+ if (fn_idx < BCH_TRANSACTIONS_NR) {
+ trans->fn = bch2_btree_transaction_fns[fn_idx];
+
+ struct btree_transaction_stats *s = &c->btree_transaction_stats[fn_idx];
+
+ if (s->max_mem) {
+ unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
+
+ trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
+ if (likely(trans->mem))
+ trans->mem_bytes = expected_mem_bytes;
+ }
+
+ trans->nr_paths_max = s->nr_max_paths;
+ trans->journal_entries_size = s->journal_entries_size;
}
+ trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
+ trans->srcu_lock_time = jiffies;
+ trans->srcu_held = true;
return trans;
}
@@ -3055,14 +3031,15 @@ static void check_btree_paths_leaked(struct btree_trans *trans)
#ifdef CONFIG_BCACHEFS_DEBUG
struct bch_fs *c = trans->c;
struct btree_path *path;
+ unsigned i;
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
if (path->ref)
goto leaked;
return;
leaked:
bch_err(c, "btree paths leaked from %s!", trans->fn);
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
if (path->ref)
printk(KERN_ERR " btree %s %pS\n",
bch2_btree_id_str(path->btree_id),
@@ -3075,26 +3052,14 @@ leaked:
void bch2_trans_put(struct btree_trans *trans)
__releases(&c->btree_trans_barrier)
{
- struct btree_insert_entry *i;
struct bch_fs *c = trans->c;
- struct btree_transaction_stats *s = btree_trans_stats(trans);
bch2_trans_unlock(trans);
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
- seqmutex_lock(&c->btree_trans_lock);
- list_del(&trans->list);
- seqmutex_unlock(&c->btree_trans_lock);
- }
-
- closure_sync(&trans->ref);
-
- if (s)
- s->max_mem = max(s->max_mem, trans->mem_max);
-
trans_for_each_update(trans, i)
- __btree_path_put(i->path, true);
- trans->nr_updates = 0;
+ __btree_path_put(trans->paths + i->path, true);
+ trans->nr_updates = 0;
+ trans->locking_wait.task = NULL;
check_btree_paths_leaked(trans);
@@ -3103,8 +3068,6 @@ void bch2_trans_put(struct btree_trans *trans)
srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
}
- kfree(trans->extra_journal_entries.data);
-
if (trans->fs_usage_deltas) {
if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) ==
REPLICAS_DELTA_LIST_MAX)
@@ -3117,6 +3080,13 @@ void bch2_trans_put(struct btree_trans *trans)
if (unlikely(trans->journal_replay_not_finished))
bch2_journal_keys_put(c);
+ unsigned long *paths_allocated = trans->paths_allocated;
+ trans->paths_allocated = NULL;
+ trans->paths = NULL;
+
+ if (paths_allocated != trans->_paths_allocated)
+ kfree_rcu_mightsleep(paths_allocated);
+
if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
mempool_free(trans->mem, &c->btree_trans_mem_pool);
else
@@ -3125,8 +3095,16 @@ void bch2_trans_put(struct btree_trans *trans)
/* Userspace doesn't have a real percpu implementation: */
if (IS_ENABLED(__KERNEL__))
trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans);
- if (trans)
+
+ if (trans) {
+ closure_sync(&trans->ref);
+
+ seqmutex_lock(&c->btree_trans_lock);
+ list_del(&trans->list);
+ seqmutex_unlock(&c->btree_trans_lock);
+
mempool_free(trans, &c->btree_trans_pool);
+ }
}
static void __maybe_unused
@@ -3154,24 +3132,38 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
{
- struct btree_path *path;
struct btree_bkey_cached_common *b;
static char lock_types[] = { 'r', 'i', 'w' };
+ struct task_struct *task = READ_ONCE(trans->locking_wait.task);
unsigned l, idx;
+ /* before rcu_read_lock(): */
+ bch2_printbuf_make_room(out, 4096);
+
if (!out->nr_tabstops) {
printbuf_tabstop_push(out, 16);
printbuf_tabstop_push(out, 32);
}
- prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn);
+ prt_printf(out, "%i %s\n", task ? task->pid : 0, trans->fn);
+
+ /* trans->paths is rcu protected vs. freeing */
+ rcu_read_lock();
+ out->atomic++;
+
+ struct btree_path *paths = rcu_dereference(trans->paths);
+ if (!paths)
+ goto out;
+
+ unsigned long *paths_allocated = trans_paths_allocated(paths);
- trans_for_each_path_safe(trans, path, idx) {
+ trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths), idx, 1) {
+ struct btree_path *path = paths + idx;
if (!path->nodes_locked)
continue;
prt_printf(out, " path %u %c l=%u %s:",
- path->idx,
+ idx,
path->cached ? 'c' : 'b',
path->level,
bch2_btree_id_str(path->btree_id));
@@ -3199,6 +3191,9 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
bch2_btree_bkey_cached_common_to_text(out, b);
prt_newline(out);
}
+out:
+ --out->atomic;
+ rcu_read_unlock();
}
void bch2_fs_btree_iter_exit(struct bch_fs *c)
@@ -3207,15 +3202,26 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
struct btree_trans *trans;
int cpu;
+ if (c->btree_trans_bufs)
+ for_each_possible_cpu(cpu) {
+ struct btree_trans *trans =
+ per_cpu_ptr(c->btree_trans_bufs, cpu)->trans;
+
+ if (trans) {
+ closure_sync(&trans->ref);
+
+ seqmutex_lock(&c->btree_trans_lock);
+ list_del(&trans->list);
+ seqmutex_unlock(&c->btree_trans_lock);
+ }
+ kfree(trans);
+ }
+ free_percpu(c->btree_trans_bufs);
+
trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list);
if (trans)
panic("%s leaked btree_trans\n", trans->fn);
- if (c->btree_trans_bufs)
- for_each_possible_cpu(cpu)
- kfree(per_cpu_ptr(c->btree_trans_bufs, cpu)->trans);
- free_percpu(c->btree_trans_bufs);
-
for (s = c->btree_transaction_stats;
s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
s++) {
@@ -3236,6 +3242,7 @@ void bch2_fs_btree_iter_init_early(struct bch_fs *c)
for (s = c->btree_transaction_stats;
s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
s++) {
+ bch2_time_stats_init(&s->duration);
bch2_time_stats_init(&s->lock_hold_times);
mutex_init(&s->lock);
}
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index eaffced4c1..24772538e4 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -63,60 +63,57 @@ static inline void btree_trans_sort_paths(struct btree_trans *trans)
__bch2_btree_trans_sort_paths(trans);
}
-static inline struct btree_path *
-__trans_next_path(struct btree_trans *trans, unsigned idx)
+static inline unsigned long *trans_paths_nr(struct btree_path *paths)
{
- u64 l;
-
- if (idx == BTREE_ITER_MAX)
- return NULL;
-
- l = trans->paths_allocated >> idx;
- if (!l)
- return NULL;
-
- idx += __ffs64(l);
- EBUG_ON(idx >= BTREE_ITER_MAX);
- EBUG_ON(trans->paths[idx].idx != idx);
- return &trans->paths[idx];
+ return &container_of(paths, struct btree_trans_paths, paths[0])->nr_paths;
}
-#define trans_for_each_path_from(_trans, _path, _start) \
- for (_path = __trans_next_path((_trans), _start); \
- (_path); \
- _path = __trans_next_path((_trans), (_path)->idx + 1))
-
-#define trans_for_each_path(_trans, _path) \
- trans_for_each_path_from(_trans, _path, 0)
-
-static inline struct btree_path *
-__trans_next_path_safe(struct btree_trans *trans, unsigned *idx)
+static inline unsigned long *trans_paths_allocated(struct btree_path *paths)
{
- u64 l;
+ unsigned long *v = trans_paths_nr(paths);
+ return v - BITS_TO_LONGS(*v);
+}
- if (*idx == BTREE_ITER_MAX)
- return NULL;
+#define trans_for_each_path_idx_from(_paths_allocated, _nr, _idx, _start)\
+ for (_idx = _start; \
+ (_idx = find_next_bit(_paths_allocated, _nr, _idx)) < _nr; \
+ _idx++)
- l = trans->paths_allocated >> *idx;
- if (!l)
- return NULL;
+static inline struct btree_path *
+__trans_next_path(struct btree_trans *trans, unsigned *idx)
+{
+ unsigned long *w = trans->paths_allocated + *idx / BITS_PER_LONG;
+ /*
+ * Open coded find_next_bit(), because
+ * - this is fast path, we can't afford the function call
+ * - and we know that nr_paths is a multiple of BITS_PER_LONG,
+ */
+ while (*idx < trans->nr_paths) {
+ unsigned long v = *w >> (*idx & (BITS_PER_LONG - 1));
+ if (v) {
+ *idx += __ffs(v);
+ return trans->paths + *idx;
+ }
+
+ *idx += BITS_PER_LONG;
+ *idx &= ~(BITS_PER_LONG - 1);
+ w++;
+ }
- *idx += __ffs64(l);
- EBUG_ON(*idx >= BTREE_ITER_MAX);
- return &trans->paths[*idx];
+ return NULL;
}
/*
* This version is intended to be safe for use on a btree_trans that is owned by
* another thread, for bch2_btree_trans_to_text();
*/
-#define trans_for_each_path_safe_from(_trans, _path, _idx, _start) \
+#define trans_for_each_path_from(_trans, _path, _idx, _start) \
for (_idx = _start; \
- (_path = __trans_next_path_safe((_trans), &_idx)); \
+ (_path = __trans_next_path((_trans), &_idx)); \
_idx++)
-#define trans_for_each_path_safe(_trans, _path, _idx) \
- trans_for_each_path_safe_from(_trans, _path, _idx, 0)
+#define trans_for_each_path(_trans, _path, _idx) \
+ trans_for_each_path_from(_trans, _path, _idx, 1)
static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path)
{
@@ -138,10 +135,23 @@ static inline struct btree_path *prev_btree_path(struct btree_trans *trans, stru
: NULL;
}
-#define trans_for_each_path_inorder(_trans, _path, _i) \
- for (_i = 0; \
- ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) < (_trans)->nr_sorted;\
- _i++)
+#define trans_for_each_path_idx_inorder(_trans, _iter) \
+ for (_iter = (struct trans_for_each_path_inorder_iter) { 0 }; \
+ (_iter.path_idx = trans->sorted[_iter.sorted_idx], \
+ _iter.sorted_idx < (_trans)->nr_sorted); \
+ _iter.sorted_idx++)
+
+struct trans_for_each_path_inorder_iter {
+ btree_path_idx_t sorted_idx;
+ btree_path_idx_t path_idx;
+};
+
+#define trans_for_each_path_inorder(_trans, _path, _iter) \
+ for (_iter = (struct trans_for_each_path_inorder_iter) { 0 }; \
+ (_iter.path_idx = trans->sorted[_iter.sorted_idx], \
+ _path = (_trans)->paths + _iter.path_idx, \
+ _iter.sorted_idx < (_trans)->nr_sorted); \
+ _iter.sorted_idx++)
#define trans_for_each_path_inorder_reverse(_trans, _path, _i) \
for (_i = trans->nr_sorted - 1; \
@@ -157,67 +167,65 @@ static inline bool __path_has_node(const struct btree_path *path,
static inline struct btree_path *
__trans_next_path_with_node(struct btree_trans *trans, struct btree *b,
- unsigned idx)
+ unsigned *idx)
{
- struct btree_path *path = __trans_next_path(trans, idx);
+ struct btree_path *path;
- while (path && !__path_has_node(path, b))
- path = __trans_next_path(trans, path->idx + 1);
+ while ((path = __trans_next_path(trans, idx)) &&
+ !__path_has_node(path, b))
+ (*idx)++;
return path;
}
-#define trans_for_each_path_with_node(_trans, _b, _path) \
- for (_path = __trans_next_path_with_node((_trans), (_b), 0); \
- (_path); \
- _path = __trans_next_path_with_node((_trans), (_b), \
- (_path)->idx + 1))
+#define trans_for_each_path_with_node(_trans, _b, _path, _iter) \
+ for (_iter = 1; \
+ (_path = __trans_next_path_with_node((_trans), (_b), &_iter));\
+ _iter++)
-struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *,
- bool, unsigned long);
+btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *, btree_path_idx_t,
+ bool, unsigned long);
-static inline struct btree_path * __must_check
+static inline btree_path_idx_t __must_check
bch2_btree_path_make_mut(struct btree_trans *trans,
- struct btree_path *path, bool intent,
+ btree_path_idx_t path, bool intent,
unsigned long ip)
{
- if (path->ref > 1 || path->preserve)
+ if (trans->paths[path].ref > 1 ||
+ trans->paths[path].preserve)
path = __bch2_btree_path_make_mut(trans, path, intent, ip);
- path->should_be_locked = false;
+ trans->paths[path].should_be_locked = false;
return path;
}
-struct btree_path * __must_check
-__bch2_btree_path_set_pos(struct btree_trans *, struct btree_path *,
- struct bpos, bool, unsigned long, int);
+btree_path_idx_t __must_check
+__bch2_btree_path_set_pos(struct btree_trans *, btree_path_idx_t,
+ struct bpos, bool, unsigned long);
-static inline struct btree_path * __must_check
+static inline btree_path_idx_t __must_check
bch2_btree_path_set_pos(struct btree_trans *trans,
- struct btree_path *path, struct bpos new_pos,
- bool intent, unsigned long ip)
+ btree_path_idx_t path, struct bpos new_pos,
+ bool intent, unsigned long ip)
{
- int cmp = bpos_cmp(new_pos, path->pos);
-
- return cmp
- ? __bch2_btree_path_set_pos(trans, path, new_pos, intent, ip, cmp)
+ return !bpos_eq(new_pos, trans->paths[path].pos)
+ ? __bch2_btree_path_set_pos(trans, path, new_pos, intent, ip)
: path;
}
-int __must_check bch2_btree_path_traverse_one(struct btree_trans *, struct btree_path *,
+int __must_check bch2_btree_path_traverse_one(struct btree_trans *,
+ btree_path_idx_t,
unsigned, unsigned long);
static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
- struct btree_path *path, unsigned flags)
+ btree_path_idx_t path, unsigned flags)
{
- if (path->uptodate < BTREE_ITER_NEED_RELOCK)
+ if (trans->paths[path].uptodate < BTREE_ITER_NEED_RELOCK)
return 0;
return bch2_btree_path_traverse_one(trans, path, flags, _RET_IP_);
}
-int __must_check bch2_btree_path_traverse(struct btree_trans *,
- struct btree_path *, unsigned);
-struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos,
+btree_path_idx_t bch2_path_get(struct btree_trans *, enum btree_id, struct bpos,
unsigned, unsigned, unsigned, unsigned long);
struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *);
@@ -269,7 +277,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_path *,
int bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *);
-void bch2_path_put(struct btree_trans *, struct btree_path *, bool);
+void bch2_path_put(struct btree_trans *, btree_path_idx_t, bool);
int bch2_trans_relock(struct btree_trans *);
int bch2_trans_relock_notrace(struct btree_trans *);
@@ -335,7 +343,7 @@ static inline void bch2_btree_path_downgrade(struct btree_trans *trans,
void bch2_trans_downgrade(struct btree_trans *);
-void bch2_trans_node_add(struct btree_trans *trans, struct btree *);
+void bch2_trans_node_add(struct btree_trans *trans, struct btree_path *, struct btree *);
void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *);
int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter);
@@ -348,8 +356,6 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos);
struct bkey_s_c bch2_btree_iter_next(struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *);
-
static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
{
return bch2_btree_iter_peek_upto(iter, SPOS_MAX);
@@ -376,10 +382,12 @@ static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpo
static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
{
+ struct btree_trans *trans = iter->trans;
+
if (unlikely(iter->update_path))
- bch2_path_put(iter->trans, iter->update_path,
+ bch2_path_put(trans, iter->update_path,
iter->flags & BTREE_ITER_INTENT);
- iter->update_path = NULL;
+ iter->update_path = 0;
if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
new_pos.snapshot = iter->snapshot;
@@ -408,9 +416,6 @@ static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans,
unsigned btree_id,
unsigned flags)
{
- if (flags & BTREE_ITER_ALL_LEVELS)
- flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS;
-
if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) &&
btree_id_is_extents(btree_id))
flags |= BTREE_ITER_IS_EXTENTS;
@@ -450,14 +455,16 @@ static inline void bch2_trans_iter_init_common(struct btree_trans *trans,
unsigned flags,
unsigned long ip)
{
- memset(iter, 0, sizeof(*iter));
- iter->trans = trans;
- iter->btree_id = btree_id;
- iter->flags = flags;
- iter->snapshot = pos.snapshot;
- iter->pos = pos;
- iter->k.p = pos;
-
+ iter->trans = trans;
+ iter->update_path = 0;
+ iter->key_cache_path = 0;
+ iter->btree_id = btree_id;
+ iter->min_depth = 0;
+ iter->flags = flags;
+ iter->snapshot = pos.snapshot;
+ iter->pos = pos;
+ iter->k = POS_KEY(pos);
+ iter->journal_idx = 0;
#ifdef CONFIG_BCACHEFS_DEBUG
iter->ip_allocated = ip;
#endif
@@ -489,8 +496,10 @@ void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *);
static inline void set_btree_iter_dontneed(struct btree_iter *iter)
{
- if (!iter->trans->restarted)
- iter->path->preserve = false;
+ struct btree_trans *trans = iter->trans;
+
+ if (!trans->restarted)
+ btree_iter_path(trans, iter)->preserve = false;
}
void *__bch2_trans_kmalloc(struct btree_trans *, size_t);
@@ -512,7 +521,7 @@ static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
static inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size)
{
- size = roundup(size, 8);
+ size = round_up(size, 8);
if (likely(trans->mem_top + size <= trans->mem_bytes)) {
void *p = trans->mem + trans->mem_top;
@@ -581,7 +590,6 @@ static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans,
KEY_TYPE_##_type, sizeof(*_val), _val)
void bch2_trans_srcu_unlock(struct btree_trans *);
-void bch2_trans_srcu_lock(struct btree_trans *);
u32 bch2_trans_begin(struct btree_trans *);
@@ -606,8 +614,6 @@ u32 bch2_trans_begin(struct btree_trans *);
static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter,
unsigned flags)
{
- BUG_ON(flags & BTREE_ITER_ALL_LEVELS);
-
return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) :
bch2_btree_iter_peek_prev(iter);
}
@@ -615,8 +621,7 @@ static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *
static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter,
unsigned flags)
{
- return flags & BTREE_ITER_ALL_LEVELS ? bch2_btree_iter_peek_all_levels(iter) :
- flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) :
+ return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) :
bch2_btree_iter_peek(iter);
}
@@ -633,61 +638,34 @@ static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *
return bch2_btree_iter_peek_slot(iter);
}
+int __bch2_btree_trans_too_many_iters(struct btree_trans *);
+
static inline int btree_trans_too_many_iters(struct btree_trans *trans)
{
- if (hweight64(trans->paths_allocated) > BTREE_ITER_MAX - 8) {
- trace_and_count(trans->c, trans_restart_too_many_iters, trans, _THIS_IP_);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters);
- }
+ if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_INITIAL - 8)
+ return __bch2_btree_trans_too_many_iters(trans);
return 0;
}
-struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
-
-static inline struct bkey_s_c
-__bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
- struct btree_iter *iter, unsigned flags)
-{
- struct bkey_s_c k;
-
- while (btree_trans_too_many_iters(trans) ||
- (k = bch2_btree_iter_peek_type(iter, flags),
- bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
- bch2_trans_begin(trans);
-
- return k;
-}
-
-static inline struct bkey_s_c
-__bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos end,
- unsigned flags)
-{
- struct bkey_s_c k;
-
- while (btree_trans_too_many_iters(trans) ||
- (k = bch2_btree_iter_peek_upto_type(iter, end, flags),
- bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
- bch2_trans_begin(trans);
-
- return k;
-}
-
+/*
+ * goto instead of loop, so that when used inside for_each_btree_key()
+ * break/continue work correctly
+ */
#define lockrestart_do(_trans, _do) \
({ \
+ __label__ transaction_restart; \
u32 _restart_count; \
int _ret2; \
+transaction_restart: \
+ _restart_count = bch2_trans_begin(_trans); \
+ _ret2 = (_do); \
\
- do { \
- _restart_count = bch2_trans_begin(_trans); \
- _ret2 = (_do); \
- } while (bch2_err_matches(_ret2, BCH_ERR_transaction_restart)); \
+ if (bch2_err_matches(_ret2, BCH_ERR_transaction_restart)) \
+ goto transaction_restart; \
\
if (!_ret2) \
bch2_trans_verify_not_restarted(_trans, _restart_count);\
- \
_ret2; \
})
@@ -716,91 +694,56 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
_ret2 ?: trans_was_restarted(_trans, _restart_count); \
})
-#define for_each_btree_key2(_trans, _iter, _btree_id, \
- _start, _flags, _k, _do) \
+#define for_each_btree_key_upto(_trans, _iter, _btree_id, \
+ _start, _end, _flags, _k, _do) \
({ \
+ struct btree_iter _iter; \
+ struct bkey_s_c _k; \
int _ret3 = 0; \
\
bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
(_start), (_flags)); \
\
- while (1) { \
- u32 _restart_count = bch2_trans_begin(_trans); \
- \
- _ret3 = 0; \
- (_k) = bch2_btree_iter_peek_type(&(_iter), (_flags)); \
- if (!(_k).k) \
- break; \
+ do { \
+ _ret3 = lockrestart_do(_trans, ({ \
+ (_k) = bch2_btree_iter_peek_upto_type(&(_iter), \
+ _end, (_flags)); \
+ if (!(_k).k) \
+ break; \
\
- _ret3 = bkey_err(_k) ?: (_do); \
- if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\
- continue; \
- if (_ret3) \
- break; \
- bch2_trans_verify_not_restarted(_trans, _restart_count);\
- if (!bch2_btree_iter_advance(&(_iter))) \
- break; \
- } \
+ bkey_err(_k) ?: (_do); \
+ })); \
+ } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \
\
bch2_trans_iter_exit((_trans), &(_iter)); \
_ret3; \
})
-#define for_each_btree_key2_upto(_trans, _iter, _btree_id, \
- _start, _end, _flags, _k, _do) \
-({ \
- int _ret3 = 0; \
- \
- bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
- (_start), (_flags)); \
- \
- while (1) { \
- u32 _restart_count = bch2_trans_begin(_trans); \
- \
- _ret3 = 0; \
- (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, (_flags));\
- if (!(_k).k) \
- break; \
- \
- _ret3 = bkey_err(_k) ?: (_do); \
- if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\
- continue; \
- if (_ret3) \
- break; \
- bch2_trans_verify_not_restarted(_trans, _restart_count);\
- if (!bch2_btree_iter_advance(&(_iter))) \
- break; \
- } \
- \
- bch2_trans_iter_exit((_trans), &(_iter)); \
- _ret3; \
-})
+#define for_each_btree_key(_trans, _iter, _btree_id, \
+ _start, _flags, _k, _do) \
+ for_each_btree_key_upto(_trans, _iter, _btree_id, _start, \
+ SPOS_MAX, _flags, _k, _do)
#define for_each_btree_key_reverse(_trans, _iter, _btree_id, \
_start, _flags, _k, _do) \
({ \
+ struct btree_iter _iter; \
+ struct bkey_s_c _k; \
int _ret3 = 0; \
\
bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
(_start), (_flags)); \
\
- while (1) { \
- u32 _restart_count = bch2_trans_begin(_trans); \
- (_k) = bch2_btree_iter_peek_prev_type(&(_iter), (_flags));\
- if (!(_k).k) { \
- _ret3 = 0; \
- break; \
- } \
+ do { \
+ _ret3 = lockrestart_do(_trans, ({ \
+ (_k) = bch2_btree_iter_peek_prev_type(&(_iter), \
+ (_flags)); \
+ if (!(_k).k) \
+ break; \
\
- _ret3 = bkey_err(_k) ?: (_do); \
- if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\
- continue; \
- if (_ret3) \
- break; \
- bch2_trans_verify_not_restarted(_trans, _restart_count);\
- if (!bch2_btree_iter_rewind(&(_iter))) \
- break; \
- } \
+ bkey_err(_k) ?: (_do); \
+ })); \
+ } while (!_ret3 && bch2_btree_iter_rewind(&(_iter))); \
\
bch2_trans_iter_exit((_trans), &(_iter)); \
_ret3; \
@@ -810,7 +753,7 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
_start, _iter_flags, _k, \
_disk_res, _journal_seq, _commit_flags,\
_do) \
- for_each_btree_key2(_trans, _iter, _btree_id, _start, _iter_flags, _k,\
+ for_each_btree_key(_trans, _iter, _btree_id, _start, _iter_flags, _k,\
(_do) ?: bch2_trans_commit(_trans, (_disk_res),\
(_journal_seq), (_commit_flags)))
@@ -826,32 +769,31 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
_start, _end, _iter_flags, _k, \
_disk_res, _journal_seq, _commit_flags,\
_do) \
- for_each_btree_key2_upto(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\
+ for_each_btree_key_upto(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\
(_do) ?: bch2_trans_commit(_trans, (_disk_res),\
(_journal_seq), (_commit_flags)))
-#define for_each_btree_key(_trans, _iter, _btree_id, \
- _start, _flags, _k, _ret) \
- for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
- (_start), (_flags)); \
- (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\
- !((_ret) = bkey_err(_k)) && (_k).k; \
- bch2_btree_iter_advance(&(_iter)))
+struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
-#define for_each_btree_key_upto(_trans, _iter, _btree_id, \
- _start, _end, _flags, _k, _ret) \
- for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
- (_start), (_flags)); \
- (_k) = __bch2_btree_iter_peek_upto_and_restart((_trans), \
- &(_iter), _end, _flags),\
- !((_ret) = bkey_err(_k)) && (_k).k; \
- bch2_btree_iter_advance(&(_iter)))
+static inline struct bkey_s_c
+__bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
+ struct btree_iter *iter, unsigned flags)
+{
+ struct bkey_s_c k;
-#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \
+ while (btree_trans_too_many_iters(trans) ||
+ (k = bch2_btree_iter_peek_type(iter, flags),
+ bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
+ bch2_trans_begin(trans);
+
+ return k;
+}
+
+#define for_each_btree_key_old(_trans, _iter, _btree_id, \
_start, _flags, _k, _ret) \
for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
(_start), (_flags)); \
- (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \
+ (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\
!((_ret) = bkey_err(_k)) && (_k).k; \
bch2_btree_iter_advance(&(_iter)))
@@ -863,24 +805,25 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
!((_ret) = bkey_err(_k)) && (_k).k; \
bch2_btree_iter_advance(&(_iter)))
-#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _ret) \
- for (; \
- (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\
- !((_ret) = bkey_err(_k)) && (_k).k; \
- bch2_btree_iter_advance(&(_iter)))
-
-#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \
- for (; \
- (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \
- !((_ret) = bkey_err(_k)) && (_k).k; \
- bch2_btree_iter_advance(&(_iter)))
-
#define for_each_btree_key_upto_continue_norestart(_iter, _end, _flags, _k, _ret)\
for (; \
(_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags), \
!((_ret) = bkey_err(_k)) && (_k).k; \
bch2_btree_iter_advance(&(_iter)))
+#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \
+ _start, _flags, _k, _ret) \
+ for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, _start,\
+ SPOS_MAX, _flags, _k, _ret)
+
+#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \
+ for_each_btree_key_upto_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret)
+
+/*
+ * This should not be used in a fastpath without first trying _do in
+ * nonblocking mode - it causes excessive transaction restarts and can
+ * potentially livelock:
+ */
#define drop_locks_do(_trans, _do) \
({ \
bch2_trans_unlock(_trans); \
@@ -912,10 +855,7 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
_p; \
})
-/* new multiple iterator interface: */
-
void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
-void bch2_btree_path_to_text(struct printbuf *, struct btree_path *);
void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
void bch2_dump_trans_updates(struct btree_trans *);
void bch2_dump_trans_paths_updates(struct btree_trans *);
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
index ec52f50d24..719a94a849 100644
--- a/fs/bcachefs/btree_journal_iter.c
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -73,6 +73,7 @@ static size_t bch2_journal_key_search(struct journal_keys *keys,
return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
}
+/* Returns first non-overwritten key >= search key: */
struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
unsigned level, struct bpos pos,
struct bpos end_pos, size_t *idx)
@@ -86,12 +87,26 @@ search:
if (!*idx)
*idx = __bch2_journal_key_search(keys, btree_id, level, pos);
+ while (*idx &&
+ __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
+ --(*idx);
+ iters++;
+ if (iters == 10) {
+ *idx = 0;
+ goto search;
+ }
+ }
+
while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
return NULL;
- if (__journal_key_cmp(btree_id, level, pos, k) <= 0 &&
- !k->overwritten)
+ if (k->overwritten) {
+ (*idx)++;
+ continue;
+ }
+
+ if (__journal_key_cmp(btree_id, level, pos, k) <= 0)
return k->k;
(*idx)++;
@@ -162,7 +177,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
struct journal_keys *keys = &c->journal_keys;
size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
- BUG_ON(test_bit(BCH_FS_RW, &c->flags));
+ BUG_ON(test_bit(BCH_FS_rw, &c->flags));
if (idx < keys->size &&
journal_key_cmp(&n, &keys->d[idx]) == 0) {
@@ -452,9 +467,7 @@ static void __journal_keys_sort(struct journal_keys *keys)
src = dst = keys->d;
while (src < keys->d + keys->nr) {
while (src + 1 < keys->d + keys->nr &&
- src[0].btree_id == src[1].btree_id &&
- src[0].level == src[1].level &&
- bpos_eq(src[0].k->k.p, src[1].k->k.p))
+ !journal_key_cmp(src, src + 1))
src++;
*dst++ = *src++;
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 1b7a5668df..74e52fd28a 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -630,7 +630,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
if (ret)
goto out;
- ck = (void *) c_iter.path->l[0].b;
+ ck = (void *) btree_iter_path(trans, &c_iter)->l[0].b;
if (!ck)
goto out;
@@ -645,22 +645,29 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
if (journal_seq && ck->journal.seq != journal_seq)
goto out;
+ trans->journal_res.seq = ck->journal.seq;
+
/*
- * Since journal reclaim depends on us making progress here, and the
- * allocator/copygc depend on journal reclaim making progress, we need
- * to be using alloc reserves:
+ * If we're at the end of the journal, we really want to free up space
+ * in the journal right away - we don't want to pin that old journal
+ * sequence number with a new btree node write, we want to re-journal
+ * the update
*/
+ if (ck->journal.seq == journal_last_seq(j))
+ commit_flags |= BCH_WATERMARK_reclaim;
+
+ if (ck->journal.seq != journal_last_seq(j) ||
+ j->watermark == BCH_WATERMARK_stripe)
+ commit_flags |= BCH_TRANS_COMMIT_no_journal_res;
+
ret = bch2_btree_iter_traverse(&b_iter) ?:
bch2_trans_update(trans, &b_iter, ck->k,
BTREE_UPDATE_KEY_CACHE_RECLAIM|
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
BTREE_TRIGGER_NORUN) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL|
- (ck->journal.seq == journal_last_seq(j)
- ? BCH_WATERMARK_reclaim
- : 0)|
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc|
commit_flags);
bch2_fs_fatal_err_on(ret &&
@@ -673,7 +680,8 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
bch2_journal_pin_drop(j, &ck->journal);
- BUG_ON(!btree_node_locked(c_iter.path, 0));
+ struct btree_path *path = btree_iter_path(trans, &c_iter);
+ BUG_ON(!btree_node_locked(path, 0));
if (!evict) {
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
@@ -682,19 +690,20 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
}
} else {
struct btree_path *path2;
+ unsigned i;
evict:
- trans_for_each_path(trans, path2)
- if (path2 != c_iter.path)
+ trans_for_each_path(trans, path2, i)
+ if (path2 != path)
__bch2_btree_path_unlock(trans, path2);
- bch2_btree_node_lock_write_nofail(trans, c_iter.path, &ck->c);
+ bch2_btree_node_lock_write_nofail(trans, path, &ck->c);
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
atomic_long_dec(&c->btree_key_cache.nr_dirty);
}
- mark_btree_node_locked_noreset(c_iter.path, 0, BTREE_NODE_UNLOCKED);
+ mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
bkey_cached_evict(&c->btree_key_cache, ck);
bkey_cached_free_fast(&c->btree_key_cache, ck);
}
@@ -732,9 +741,9 @@ int bch2_btree_key_cache_journal_flush(struct journal *j,
}
six_unlock_read(&ck->c.lock);
- ret = commit_do(trans, NULL, NULL, 0,
+ ret = lockrestart_do(trans,
btree_key_cache_flush_pos(trans, key, seq,
- BTREE_INSERT_JOURNAL_RECLAIM, false));
+ BCH_TRANS_COMMIT_journal_reclaim, false));
unlock:
srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
@@ -742,28 +751,12 @@ unlock:
return ret;
}
-/*
- * Flush and evict a key from the key cache:
- */
-int bch2_btree_key_cache_flush(struct btree_trans *trans,
- enum btree_id id, struct bpos pos)
-{
- struct bch_fs *c = trans->c;
- struct bkey_cached_key key = { id, pos };
-
- /* Fastpath - assume it won't be found: */
- if (!bch2_btree_key_cache_find(c, id, pos))
- return 0;
-
- return btree_key_cache_flush_pos(trans, key, 0, 0, true);
-}
-
bool bch2_btree_insert_key_cached(struct btree_trans *trans,
unsigned flags,
struct btree_insert_entry *insert_entry)
{
struct bch_fs *c = trans->c;
- struct bkey_cached *ck = (void *) insert_entry->path->l[0].b;
+ struct bkey_cached *ck = (void *) (trans->paths + insert_entry->path)->l[0].b;
struct bkey_i *insert = insert_entry->k;
bool kick_reclaim = false;
@@ -773,7 +766,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
ck->valid = true;
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
+ EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
set_bit(BKEY_CACHED_DIRTY, &ck->flags);
atomic_long_inc(&c->btree_key_cache.nr_dirty);
@@ -1000,7 +993,7 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
if (atomic_long_read(&bc->nr_dirty) &&
!bch2_journal_error(&c->journal) &&
- test_bit(BCH_FS_WAS_RW, &c->flags))
+ test_bit(BCH_FS_was_rw, &c->flags))
panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n",
atomic_long_read(&bc->nr_dirty));
diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h
index be3acde2ca..e6b2cd0dd2 100644
--- a/fs/bcachefs/btree_key_cache.h
+++ b/fs/bcachefs/btree_key_cache.h
@@ -31,8 +31,6 @@ int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *,
bool bch2_btree_insert_key_cached(struct btree_trans *, unsigned,
struct btree_insert_entry *);
-int bch2_btree_key_cache_flush(struct btree_trans *,
- enum btree_id, struct bpos);
void bch2_btree_key_cache_drop(struct btree_trans *,
struct btree_path *);
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
index 3d48834d09..6843974423 100644
--- a/fs/bcachefs/btree_locking.c
+++ b/fs/bcachefs/btree_locking.c
@@ -32,13 +32,14 @@ struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans,
{
struct btree_path *path;
struct six_lock_count ret;
+ unsigned i;
memset(&ret, 0, sizeof(ret));
if (IS_ERR_OR_NULL(b))
return ret;
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
if (path != skip && &path->l[level].b->c == b) {
int t = btree_node_locked_type(path, level);
@@ -85,8 +86,14 @@ static noinline void print_cycle(struct printbuf *out, struct lock_graph *g)
prt_printf(out, "Found lock cycle (%u entries):", g->nr);
prt_newline(out);
- for (i = g->g; i < g->g + g->nr; i++)
+ for (i = g->g; i < g->g + g->nr; i++) {
+ struct task_struct *task = READ_ONCE(i->trans->locking_wait.task);
+ if (!task)
+ continue;
+
bch2_btree_trans_to_text(out, i->trans);
+ bch2_prt_task_backtrace(out, task, i == g->g ? 5 : 1, GFP_NOWAIT);
+ }
}
static noinline void print_chain(struct printbuf *out, struct lock_graph *g)
@@ -94,9 +101,10 @@ static noinline void print_chain(struct printbuf *out, struct lock_graph *g)
struct trans_waiting_for_lock *i;
for (i = g->g; i != g->g + g->nr; i++) {
+ struct task_struct *task = i->trans->locking_wait.task;
if (i != g->g)
prt_str(out, "<- ");
- prt_printf(out, "%u ", i->trans->locking_wait.task->pid);
+ prt_printf(out, "%u ", task ?task->pid : 0);
}
prt_newline(out);
}
@@ -142,10 +150,27 @@ static bool lock_graph_remove_non_waiters(struct lock_graph *g)
return false;
}
+static void trace_would_deadlock(struct lock_graph *g, struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+
+ count_event(c, trans_restart_would_deadlock);
+
+ if (trace_trans_restart_would_deadlock_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ buf.atomic++;
+ print_cycle(&buf, g);
+
+ trace_trans_restart_would_deadlock(trans, buf.buf);
+ printbuf_exit(&buf);
+ }
+}
+
static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i)
{
if (i == g->g) {
- trace_and_count(i->trans->c, trans_restart_would_deadlock, i->trans, _RET_IP_);
+ trace_would_deadlock(g, i->trans);
return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock);
} else {
i->trans->lock_must_abort = true;
@@ -202,7 +227,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle)
prt_printf(&buf, "backtrace:");
prt_newline(&buf);
printbuf_indent_add(&buf, 2);
- bch2_prt_task_backtrace(&buf, trans->locking_wait.task);
+ bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT);
printbuf_indent_sub(&buf, 2);
prt_newline(&buf);
}
@@ -262,27 +287,40 @@ int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle)
struct lock_graph g;
struct trans_waiting_for_lock *top;
struct btree_bkey_cached_common *b;
- struct btree_path *path;
- unsigned path_idx;
- int ret;
+ btree_path_idx_t path_idx;
+ int ret = 0;
+
+ g.nr = 0;
if (trans->lock_must_abort) {
if (cycle)
return -1;
- trace_and_count(trans->c, trans_restart_would_deadlock, trans, _RET_IP_);
+ trace_would_deadlock(&g, trans);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock);
}
- g.nr = 0;
lock_graph_down(&g, trans);
+
+ /* trans->paths is rcu protected vs. freeing */
+ rcu_read_lock();
+ if (cycle)
+ cycle->atomic++;
next:
if (!g.nr)
- return 0;
+ goto out;
top = &g.g[g.nr - 1];
- trans_for_each_path_safe_from(top->trans, path, path_idx, top->path_idx) {
+ struct btree_path *paths = rcu_dereference(top->trans->paths);
+ if (!paths)
+ goto up;
+
+ unsigned long *paths_allocated = trans_paths_allocated(paths);
+
+ trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths),
+ path_idx, top->path_idx) {
+ struct btree_path *path = paths + path_idx;
if (!path->nodes_locked)
continue;
@@ -348,18 +386,23 @@ next:
ret = lock_graph_descend(&g, trans, cycle);
if (ret)
- return ret;
+ goto out;
goto next;
}
raw_spin_unlock(&b->lock.wait_lock);
}
}
-
+up:
if (g.nr > 1 && cycle)
print_chain(cycle, &g);
lock_graph_up(&g);
goto next;
+out:
+ if (cycle)
+ --cycle->atomic;
+ rcu_read_unlock();
+ return ret;
}
int bch2_six_check_for_deadlock(struct six_lock *lock, void *p)
@@ -398,7 +441,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
struct btree_bkey_cached_common *b)
{
struct btree_path *linked;
- unsigned i;
+ unsigned i, iter;
int ret;
/*
@@ -412,7 +455,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
* already taken are no longer needed:
*/
- trans_for_each_path(trans, linked) {
+ trans_for_each_path(trans, linked, iter) {
if (!linked->nodes_locked)
continue;
@@ -588,8 +631,7 @@ int bch2_btree_path_relock_intent(struct btree_trans *trans,
}
__flatten
-bool bch2_btree_path_relock_norestart(struct btree_trans *trans,
- struct btree_path *path, unsigned long trace_ip)
+bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_path *path)
{
struct get_locks_fail f;
@@ -599,7 +641,7 @@ bool bch2_btree_path_relock_norestart(struct btree_trans *trans,
int __bch2_btree_path_relock(struct btree_trans *trans,
struct btree_path *path, unsigned long trace_ip)
{
- if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) {
+ if (!bch2_btree_path_relock_norestart(trans, path)) {
trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path);
}
@@ -624,8 +666,6 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans,
unsigned new_locks_want,
struct get_locks_fail *f)
{
- struct btree_path *linked;
-
if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f))
return true;
@@ -648,8 +688,11 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans,
* before interior nodes - now that's handled by
* bch2_btree_path_traverse_all().
*/
- if (!path->cached && !trans->in_traverse_all)
- trans_for_each_path(trans, linked)
+ if (!path->cached && !trans->in_traverse_all) {
+ struct btree_path *linked;
+ unsigned i;
+
+ trans_for_each_path(trans, linked, i)
if (linked != path &&
linked->cached == path->cached &&
linked->btree_id == path->btree_id &&
@@ -657,6 +700,7 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans,
linked->locks_want = new_locks_want;
btree_path_get_locks(trans, linked, true, NULL);
}
+ }
return false;
}
@@ -665,7 +709,7 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans,
struct btree_path *path,
unsigned new_locks_want)
{
- unsigned l;
+ unsigned l, old_locks_want = path->locks_want;
if (trans->restarted)
return;
@@ -689,8 +733,7 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans,
bch2_btree_path_verify_locks(path);
- path->downgrade_seq++;
- trace_path_downgrade(trans, _RET_IP_, path);
+ trace_path_downgrade(trans, _RET_IP_, path, old_locks_want);
}
/* Btree transaction locking: */
@@ -698,40 +741,70 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans,
void bch2_trans_downgrade(struct btree_trans *trans)
{
struct btree_path *path;
+ unsigned i;
if (trans->restarted)
return;
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
bch2_btree_path_downgrade(trans, path);
}
int bch2_trans_relock(struct btree_trans *trans)
{
struct btree_path *path;
+ unsigned i;
if (unlikely(trans->restarted))
return -((int) trans->restarted);
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i) {
+ struct get_locks_fail f;
+
if (path->should_be_locked &&
- !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) {
- trace_and_count(trans->c, trans_restart_relock, trans, _RET_IP_, path);
+ !btree_path_get_locks(trans, path, false, &f)) {
+ if (trace_trans_restart_relock_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bpos_to_text(&buf, path->pos);
+ prt_printf(&buf, " l=%u seq=%u node seq=",
+ f.l, path->l[f.l].lock_seq);
+ if (IS_ERR_OR_NULL(f.b)) {
+ prt_str(&buf, bch2_err_str(PTR_ERR(f.b)));
+ } else {
+ prt_printf(&buf, "%u", f.b->c.lock.seq);
+
+ struct six_lock_count c =
+ bch2_btree_node_lock_counts(trans, NULL, &f.b->c, f.l);
+ prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
+
+ c = six_lock_counts(&f.b->c.lock);
+ prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
+ }
+
+ trace_trans_restart_relock(trans, _RET_IP_, buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ count_event(trans->c, trans_restart_relock);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
}
+ }
+
return 0;
}
int bch2_trans_relock_notrace(struct btree_trans *trans)
{
struct btree_path *path;
+ unsigned i;
if (unlikely(trans->restarted))
return -((int) trans->restarted);
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
if (path->should_be_locked &&
- !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) {
+ !bch2_btree_path_relock_norestart(trans, path)) {
return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
}
return 0;
@@ -740,16 +813,18 @@ int bch2_trans_relock_notrace(struct btree_trans *trans)
void bch2_trans_unlock_noassert(struct btree_trans *trans)
{
struct btree_path *path;
+ unsigned i;
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
__bch2_btree_path_unlock(trans, path);
}
void bch2_trans_unlock(struct btree_trans *trans)
{
struct btree_path *path;
+ unsigned i;
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
__bch2_btree_path_unlock(trans, path);
}
@@ -762,8 +837,9 @@ void bch2_trans_unlock_long(struct btree_trans *trans)
bool bch2_trans_locked(struct btree_trans *trans)
{
struct btree_path *path;
+ unsigned i;
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
if (path->nodes_locked)
return true;
return false;
@@ -809,8 +885,9 @@ void bch2_btree_path_verify_locks(struct btree_path *path)
void bch2_trans_verify_locks(struct btree_trans *trans)
{
struct btree_path *path;
+ unsigned i;
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
bch2_btree_path_verify_locks(path);
}
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
index 11b0a2c8cd..4bd72c855d 100644
--- a/fs/bcachefs/btree_locking.h
+++ b/fs/bcachefs/btree_locking.h
@@ -122,12 +122,9 @@ static void btree_trans_lock_hold_time_update(struct btree_trans *trans,
struct btree_path *path, unsigned level)
{
#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
- struct btree_transaction_stats *s = btree_trans_stats(trans);
-
- if (s)
- __bch2_time_stats_update(&s->lock_hold_times,
- path->l[level].lock_taken_time,
- local_clock());
+ __bch2_time_stats_update(&btree_trans_stats(trans)->lock_hold_times,
+ path->l[level].lock_taken_time,
+ local_clock());
#endif
}
@@ -175,6 +172,7 @@ bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_pat
struct btree *b)
{
struct btree_path *linked;
+ unsigned i;
EBUG_ON(path->l[b->c.level].b != b);
EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock));
@@ -182,7 +180,7 @@ bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_pat
mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
- trans_for_each_path_with_node(trans, b, linked)
+ trans_for_each_path_with_node(trans, b, linked, i)
linked->l[b->c.level].lock_seq++;
six_unlock_write(&b->c.lock);
@@ -242,8 +240,9 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans,
enum btree_node_locked_type want)
{
struct btree_path *path;
+ unsigned i;
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
if (&path->l[level].b->c == b &&
btree_node_locked_type(path, level) >= want) {
six_lock_increment(&b->lock, (enum six_lock_type) want);
@@ -263,7 +262,6 @@ static inline int btree_node_lock(struct btree_trans *trans,
int ret = 0;
EBUG_ON(level >= BTREE_MAX_DEPTH);
- EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
if (likely(six_trylock_type(&b->lock, type)) ||
btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) ||
@@ -314,8 +312,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *,
/* relock: */
-bool bch2_btree_path_relock_norestart(struct btree_trans *,
- struct btree_path *, unsigned long);
+bool bch2_btree_path_relock_norestart(struct btree_trans *, struct btree_path *);
int __bch2_btree_path_relock(struct btree_trans *,
struct btree_path *, unsigned long);
@@ -355,12 +352,6 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans,
/* upgrade */
-
-struct get_locks_fail {
- unsigned l;
- struct btree *b;
-};
-
bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *,
struct btree_path *, unsigned,
struct get_locks_fail *);
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index 12907beda9..30d69a6d13 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -12,6 +12,7 @@
#include "errcode.h"
#include "error.h"
#include "journal.h"
+#include "journal_io.h"
#include "journal_reclaim.h"
#include "replicas.h"
#include "snapshot.h"
@@ -23,7 +24,7 @@ static void verify_update_old_key(struct btree_trans *trans, struct btree_insert
#ifdef CONFIG_BCACHEFS_DEBUG
struct bch_fs *c = trans->c;
struct bkey u;
- struct bkey_s_c k = bch2_btree_path_peek_slot_exact(i->path, &u);
+ struct bkey_s_c k = bch2_btree_path_peek_slot_exact(trans->paths + i->path, &u);
if (unlikely(trans->journal_replay_not_finished)) {
struct bkey_i *j_k =
@@ -41,23 +42,23 @@ static void verify_update_old_key(struct btree_trans *trans, struct btree_insert
#endif
}
-static inline struct btree_path_level *insert_l(struct btree_insert_entry *i)
+static inline struct btree_path_level *insert_l(struct btree_trans *trans, struct btree_insert_entry *i)
{
- return i->path->l + i->level;
+ return (trans->paths + i->path)->l + i->level;
}
static inline bool same_leaf_as_prev(struct btree_trans *trans,
struct btree_insert_entry *i)
{
return i != trans->updates &&
- insert_l(&i[0])->b == insert_l(&i[-1])->b;
+ insert_l(trans, &i[0])->b == insert_l(trans, &i[-1])->b;
}
static inline bool same_leaf_as_next(struct btree_trans *trans,
struct btree_insert_entry *i)
{
return i + 1 < trans->updates + trans->nr_updates &&
- insert_l(&i[0])->b == insert_l(&i[1])->b;
+ insert_l(trans, &i[0])->b == insert_l(trans, &i[1])->b;
}
inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
@@ -84,7 +85,7 @@ static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btre
if (same_leaf_as_prev(trans, i))
continue;
- bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b);
+ bch2_btree_node_unlock_write(trans, trans->paths + i->path, insert_l(trans, i)->b);
}
trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
@@ -93,19 +94,17 @@ static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btre
static inline int bch2_trans_lock_write(struct btree_trans *trans)
{
- struct btree_insert_entry *i;
-
EBUG_ON(trans->write_locked);
trans_for_each_update(trans, i) {
if (same_leaf_as_prev(trans, i))
continue;
- if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c))
+ if (bch2_btree_node_lock_write(trans, trans->paths + i->path, &insert_l(trans, i)->b->c))
return trans_lock_write_fail(trans, i);
if (!i->cached)
- bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
+ bch2_btree_node_prep_for_write(trans, trans->paths + i->path, insert_l(trans, i)->b);
}
trans->write_locked = true;
@@ -115,12 +114,10 @@ static inline int bch2_trans_lock_write(struct btree_trans *trans)
static inline void bch2_trans_unlock_write(struct btree_trans *trans)
{
if (likely(trans->write_locked)) {
- struct btree_insert_entry *i;
-
trans_for_each_update(trans, i)
if (!same_leaf_as_prev(trans, i))
- bch2_btree_node_unlock_write_inlined(trans, i->path,
- insert_l(i)->b);
+ bch2_btree_node_unlock_write_inlined(trans,
+ trans->paths + i->path, insert_l(trans, i)->b);
trans->write_locked = false;
}
}
@@ -142,8 +139,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans,
EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
EBUG_ON(bpos_lt(insert->k.p, b->data->min_key));
EBUG_ON(bpos_gt(insert->k.p, b->data->max_key));
- EBUG_ON(insert->k.u64s >
- bch_btree_keys_u64s_remaining(trans->c, b));
+ EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b));
EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos));
k = bch2_btree_node_iter_peek_all(node_iter, b);
@@ -163,7 +159,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans,
k->type = KEY_TYPE_deleted;
if (k->needs_whiteout)
- push_whiteout(trans->c, b, insert->k.p);
+ push_whiteout(b, insert->k.p);
k->needs_whiteout = false;
if (k >= btree_bset_last(b)->start) {
@@ -287,7 +283,7 @@ inline void bch2_btree_insert_key_leaf(struct btree_trans *trans,
bch2_btree_add_journal_pin(c, b, journal_seq);
if (unlikely(!btree_node_dirty(b))) {
- EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
+ EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
set_btree_node_dirty_acct(c, b);
}
@@ -311,10 +307,12 @@ inline void bch2_btree_insert_key_leaf(struct btree_trans *trans,
static inline void btree_insert_entry_checks(struct btree_trans *trans,
struct btree_insert_entry *i)
{
- BUG_ON(!bpos_eq(i->k->k.p, i->path->pos));
- BUG_ON(i->cached != i->path->cached);
- BUG_ON(i->level != i->path->level);
- BUG_ON(i->btree_id != i->path->btree_id);
+ struct btree_path *path = trans->paths + i->path;
+
+ BUG_ON(!bpos_eq(i->k->k.p, path->pos));
+ BUG_ON(i->cached != path->cached);
+ BUG_ON(i->level != path->level);
+ BUG_ON(i->btree_id != path->btree_id);
EBUG_ON(!i->level &&
btree_type_has_snapshots(i->btree_id) &&
!(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) &&
@@ -349,9 +347,7 @@ static noinline void journal_transaction_name(struct btree_trans *trans)
static inline int btree_key_can_insert(struct btree_trans *trans,
struct btree *b, unsigned u64s)
{
- struct bch_fs *c = trans->c;
-
- if (!bch2_btree_node_insert_fits(c, b, u64s))
+ if (!bch2_btree_node_insert_fits(b, u64s))
return -BCH_ERR_btree_insert_btree_node_full;
return 0;
@@ -361,8 +357,6 @@ noinline static int
btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
struct btree_path *path, unsigned new_u64s)
{
- struct bch_fs *c = trans->c;
- struct btree_insert_entry *i;
struct bkey_cached *ck = (void *) path->l[0].b;
struct bkey_i *new_k;
int ret;
@@ -372,7 +366,7 @@ btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
if (!new_k) {
- bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
+ bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
bch2_btree_id_str(path->btree_id), new_u64s);
return -BCH_ERR_ENOMEM_btree_key_cache_insert;
}
@@ -401,7 +395,6 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
{
struct bch_fs *c = trans->c;
struct bkey_cached *ck = (void *) path->l[0].b;
- struct btree_insert_entry *i;
unsigned new_u64s;
struct bkey_i *new_k;
@@ -409,7 +402,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
bch2_btree_key_cache_must_wait(c) &&
- !(flags & BTREE_INSERT_JOURNAL_RECLAIM))
+ !(flags & BCH_TRANS_COMMIT_journal_reclaim))
return -BCH_ERR_btree_insert_need_journal_reclaim;
/*
@@ -422,7 +415,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
return 0;
new_u64s = roundup_pow_of_two(u64s);
- new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT);
+ new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
if (unlikely(!new_k))
return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s);
@@ -452,25 +445,15 @@ static int run_one_mem_trigger(struct btree_trans *trans,
if (unlikely(flags & BTREE_TRIGGER_NORUN))
return 0;
- if (!btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)))
- return 0;
-
- if (old_ops->atomic_trigger == new_ops->atomic_trigger) {
- ret = bch2_mark_key(trans, i->btree_id, i->level,
- old, bkey_i_to_s_c(new),
+ if (old_ops->trigger == new_ops->trigger) {
+ ret = bch2_key_trigger(trans, i->btree_id, i->level,
+ old, bkey_i_to_s(new),
BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
} else {
- struct bkey _deleted = KEY(0, 0, 0);
- struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL };
-
- _deleted.p = i->path->pos;
-
- ret = bch2_mark_key(trans, i->btree_id, i->level,
- deleted, bkey_i_to_s_c(new),
- BTREE_TRIGGER_INSERT|flags) ?:
- bch2_mark_key(trans, i->btree_id, i->level,
- old, deleted,
- BTREE_TRIGGER_OVERWRITE|flags);
+ ret = bch2_key_trigger_new(trans, i->btree_id, i->level,
+ bkey_i_to_s(new), flags) ?:
+ bch2_key_trigger_old(trans, i->btree_id, i->level,
+ old, flags);
}
return ret;
@@ -488,6 +471,7 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
struct bkey_s_c old = { &old_k, i->old_v };
const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
+ unsigned flags = i->flags|BTREE_TRIGGER_TRANSACTIONAL;
verify_update_old_key(trans, i);
@@ -497,19 +481,18 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
if (!i->insert_trigger_run &&
!i->overwrite_trigger_run &&
- old_ops->trans_trigger == new_ops->trans_trigger) {
+ old_ops->trigger == new_ops->trigger) {
i->overwrite_trigger_run = true;
i->insert_trigger_run = true;
- return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k,
- BTREE_TRIGGER_INSERT|
- BTREE_TRIGGER_OVERWRITE|
- i->flags) ?: 1;
+ return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(i->k),
+ BTREE_TRIGGER_INSERT|
+ BTREE_TRIGGER_OVERWRITE|flags) ?: 1;
} else if (overwrite && !i->overwrite_trigger_run) {
i->overwrite_trigger_run = true;
- return bch2_trans_mark_old(trans, i->btree_id, i->level, old, i->flags) ?: 1;
+ return bch2_key_trigger_old(trans, i->btree_id, i->level, old, flags) ?: 1;
} else if (!overwrite && !i->insert_trigger_run) {
i->insert_trigger_run = true;
- return bch2_trans_mark_new(trans, i->btree_id, i->level, i->k, i->flags) ?: 1;
+ return bch2_key_trigger_new(trans, i->btree_id, i->level, bkey_i_to_s(i->k), flags) ?: 1;
} else {
return 0;
}
@@ -551,7 +534,7 @@ static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
{
- struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates;
+ struct btree_insert_entry *btree_id_start = trans->updates;
unsigned btree_id = 0;
int ret = 0;
@@ -597,10 +580,6 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
{
- struct bch_fs *c = trans->c;
- struct btree_insert_entry *i;
- int ret = 0;
-
trans_for_each_update(trans, i) {
/*
* XXX: synchronization of cached update triggers with gc
@@ -608,14 +587,15 @@ static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
*/
BUG_ON(i->cached || i->level);
- if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) {
- ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC);
+ if (btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)) &&
+ gc_visited(trans->c, gc_pos_btree_node(insert_l(trans, i)->b))) {
+ int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC);
if (ret)
- break;
+ return ret;
}
}
- return ret;
+ return 0;
}
static inline int
@@ -624,8 +604,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
unsigned long trace_ip)
{
struct bch_fs *c = trans->c;
- struct btree_insert_entry *i;
- struct btree_write_buffered_key *wb;
struct btree_trans_commit_hook *h;
unsigned u64s = 0;
int ret;
@@ -650,23 +628,21 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
u64s += i->k->k.u64s;
ret = !i->cached
- ? btree_key_can_insert(trans, insert_l(i)->b, u64s)
- : btree_key_can_insert_cached(trans, flags, i->path, u64s);
+ ? btree_key_can_insert(trans, insert_l(trans, i)->b, u64s)
+ : btree_key_can_insert_cached(trans, flags, trans->paths + i->path, u64s);
if (ret) {
*stopped_at = i;
return ret;
}
- }
- if (trans->nr_wb_updates &&
- trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size)
- return -BCH_ERR_btree_insert_need_flush_buffer;
+ i->k->k.needs_whiteout = false;
+ }
/*
* Don't get journal reservation until after we know insert will
* succeed:
*/
- if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
+ if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
ret = bch2_trans_journal_res_get(trans,
(flags & BCH_WATERMARK_MASK)|
JOURNAL_RES_GET_NONBLOCK);
@@ -675,8 +651,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
if (unlikely(trans->journal_transaction_names))
journal_transaction_name(trans);
- } else {
- trans->journal_res.seq = c->journal.replay_journal_seq;
}
/*
@@ -685,7 +659,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
*/
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
- !(flags & BTREE_INSERT_JOURNAL_REPLAY)) {
+ !(flags & BCH_TRANS_COMMIT_no_journal_res)) {
if (bch2_journal_seq_verify)
trans_for_each_update(trans, i)
i->k->k.version.lo = trans->journal_res.seq;
@@ -698,13 +672,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas))
return -BCH_ERR_btree_insert_need_mark_replicas;
- if (trans->nr_wb_updates) {
- EBUG_ON(flags & BTREE_INSERT_JOURNAL_REPLAY);
-
- ret = bch2_btree_insert_keys_write_buffer(trans);
- if (ret)
- goto revert_fs_usage;
- }
+ /* XXX: we only want to run this if deltas are nonzero */
+ bch2_trans_account_disk_usage_change(trans);
h = trans->hooks;
while (h) {
@@ -715,8 +684,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
}
trans_for_each_update(trans, i)
- if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) {
- ret = run_one_mem_trigger(trans, i, i->flags);
+ if (BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS & (1U << i->bkey_type)) {
+ ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_ATOMIC|i->flags);
if (ret)
goto fatal_err;
}
@@ -727,16 +696,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
goto fatal_err;
}
- if (unlikely(trans->extra_journal_entries.nr)) {
- memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res),
- trans->extra_journal_entries.data,
- trans->extra_journal_entries.nr);
-
- trans->journal_res.offset += trans->extra_journal_entries.nr;
- trans->journal_res.u64s -= trans->extra_journal_entries.nr;
- }
-
- if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
+ if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
struct journal *j = &c->journal;
struct jset_entry *entry;
@@ -765,33 +725,27 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
bkey_copy((struct bkey_i *) entry->start, i->k);
}
- trans_for_each_wb_update(trans, wb) {
- entry = bch2_journal_add_entry(j, &trans->journal_res,
- BCH_JSET_ENTRY_btree_keys,
- wb->btree, 0,
- wb->k.k.u64s);
- bkey_copy((struct bkey_i *) entry->start, &wb->k);
- }
+ memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res),
+ trans->journal_entries,
+ trans->journal_entries_u64s);
+
+ trans->journal_res.offset += trans->journal_entries_u64s;
+ trans->journal_res.u64s -= trans->journal_entries_u64s;
if (trans->journal_seq)
*trans->journal_seq = trans->journal_res.seq;
}
trans_for_each_update(trans, i) {
- i->k->k.needs_whiteout = false;
+ struct btree_path *path = trans->paths + i->path;
if (!i->cached) {
- u64 seq = trans->journal_res.seq;
-
- if (i->flags & BTREE_UPDATE_PREJOURNAL)
- seq = i->seq;
-
- bch2_btree_insert_key_leaf(trans, i->path, i->k, seq);
+ bch2_btree_insert_key_leaf(trans, path, i->k, trans->journal_res.seq);
} else if (!i->key_cache_already_flushed)
bch2_btree_insert_key_cached(trans, flags, i);
else {
- bch2_btree_key_cache_drop(trans, i->path);
- btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE);
+ bch2_btree_key_cache_drop(trans, path);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
}
}
@@ -806,14 +760,8 @@ revert_fs_usage:
static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
{
- struct btree_insert_entry *i;
- struct btree_write_buffered_key *wb;
-
trans_for_each_update(trans, i)
bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p);
-
- trans_for_each_wb_update(trans, wb)
- bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p);
}
static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans,
@@ -841,6 +789,33 @@ static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans,
return -EINVAL;
}
+/*
+ * Report a journal entry attached to @trans that failed validation at commit
+ * time.
+ *
+ * Builds a message naming the transaction (trans->fn) followed by the text
+ * form of the offending entry @i, prints it at KERN_ERR, then calls
+ * bch2_inconsistent_error() and bch2_dump_trans_updates().
+ *
+ * Always returns -EINVAL.
+ */
+static noinline int bch2_trans_commit_journal_entry_invalid(struct btree_trans *trans,
+						   struct jset_entry *i)
+{
+	struct bch_fs *c = trans->c;
+	struct printbuf buf = PRINTBUF;
+
+	/* This path reports a journal entry, not a bkey: say so in the log */
+	prt_printf(&buf, "invalid journal entry on commit from %s", trans->fn);
+	prt_newline(&buf);
+	printbuf_indent_add(&buf, 2);
+
+	bch2_journal_entry_to_text(&buf, c, i);
+	prt_newline(&buf);
+
+	bch2_print_string_as_lines(KERN_ERR, buf.buf);
+	/* buf is local and not used past this point: release its storage */
+	printbuf_exit(&buf);
+
+	bch2_inconsistent_error(c);
+	bch2_dump_trans_updates(trans);
+
+	return -EINVAL;
+}
+
+/*
+ * Journal pin flush callback that does no work and always returns 0.
+ * do_bch2_trans_commit() passes it to bch2_journal_pin_add() when the commit
+ * succeeded and trans->journal_pin is set.
+ */
+static int bch2_trans_commit_journal_pin_flush(struct journal *j,
+ struct journal_entry_pin *_pin, u64 seq)
+{
+ return 0;
+}
+
/*
* Get journal reservation, take write locks, and attempt to do btree update(s):
*/
@@ -849,7 +824,6 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
unsigned long trace_ip)
{
struct bch_fs *c = trans->c;
- struct btree_insert_entry *i;
int ret = 0, u64s_delta = 0;
trans_for_each_update(trans, i) {
@@ -884,13 +858,15 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
if (!ret && trans->journal_pin)
bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
- trans->journal_pin, NULL);
+ trans->journal_pin,
+ bch2_trans_commit_journal_pin_flush);
/*
* Drop journal reservation after dropping write locks, since dropping
* the journal reservation may kick off a journal write:
*/
- bch2_journal_res_put(&c->journal, &trans->journal_res);
+ if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
+ bch2_journal_res_put(&c->journal, &trans->journal_res);
return ret;
}
@@ -916,7 +892,8 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
case -BCH_ERR_btree_insert_btree_node_full:
ret = bch2_btree_split_leaf(trans, i->path, flags);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path);
+ trace_and_count(c, trans_restart_btree_node_split, trans,
+ trace_ip, trans->paths + i->path);
break;
case -BCH_ERR_btree_insert_need_mark_replicas:
ret = drop_locks_do(trans,
@@ -927,7 +904,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
* XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
* flag
*/
- if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
+ if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
(flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) {
ret = -BCH_ERR_journal_reclaim_would_deadlock;
break;
@@ -950,30 +927,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
ret = bch2_trans_relock(trans);
break;
- case -BCH_ERR_btree_insert_need_flush_buffer: {
- struct btree_write_buffer *wb = &c->btree_write_buffer;
-
- ret = 0;
-
- if (wb->state.nr > wb->size * 3 / 4) {
- bch2_trans_unlock(trans);
- mutex_lock(&wb->flush_lock);
-
- if (wb->state.nr > wb->size * 3 / 4) {
- bch2_trans_begin(trans);
- ret = __bch2_btree_write_buffer_flush(trans,
- flags|BTREE_INSERT_NOCHECK_RW, true);
- if (!ret) {
- trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
- ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
- }
- } else {
- mutex_unlock(&wb->flush_lock);
- ret = bch2_trans_relock(trans);
- }
- }
- break;
- }
default:
BUG_ON(ret >= 0);
break;
@@ -982,8 +935,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) &&
- !(flags & BTREE_INSERT_NOWAIT) &&
- (flags & BTREE_INSERT_NOFAIL), c,
+ (flags & BCH_TRANS_COMMIT_no_enospc), c,
"%s: incorrectly got %s\n", __func__, bch2_err_str(ret));
return ret;
@@ -995,8 +947,8 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags)
struct bch_fs *c = trans->c;
int ret;
- if (likely(!(flags & BTREE_INSERT_LAZY_RW)) ||
- test_bit(BCH_FS_STARTED, &c->flags))
+ if (likely(!(flags & BCH_TRANS_COMMIT_lazy_rw)) ||
+ test_bit(BCH_FS_started, &c->flags))
return -BCH_ERR_erofs_trans_commit;
ret = drop_locks_do(trans, bch2_fs_read_write_early(c));
@@ -1016,7 +968,6 @@ static noinline int
do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
- struct btree_insert_entry *i;
int ret = 0;
trans_for_each_update(trans, i) {
@@ -1030,18 +981,15 @@ do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
{
+ struct btree_insert_entry *errored_at = NULL;
struct bch_fs *c = trans->c;
- struct btree_insert_entry *i = NULL;
- struct btree_write_buffered_key *wb;
int ret = 0;
if (!trans->nr_updates &&
- !trans->nr_wb_updates &&
- !trans->extra_journal_entries.nr)
+ !trans->journal_entries_u64s)
goto out_reset;
- if (flags & BTREE_INSERT_GC_LOCK_HELD)
- lockdep_assert_held(&c->gc_lock);
+ memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));
ret = bch2_trans_commit_run_triggers(trans);
if (ret)
@@ -1051,7 +999,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
struct printbuf buf = PRINTBUF;
enum bkey_invalid_flags invalid_flags = 0;
- if (!(flags & BTREE_INSERT_JOURNAL_REPLAY))
+ if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT;
if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
@@ -1064,47 +1012,52 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
return ret;
}
- if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) {
+ for (struct jset_entry *i = trans->journal_entries;
+ i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
+ i = vstruct_next(i)) {
+ enum bkey_invalid_flags invalid_flags = 0;
+
+ if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
+ invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT;
+
+ if (unlikely(bch2_journal_entry_validate(c, NULL, i,
+ bcachefs_metadata_version_current,
+ CPU_BIG_ENDIAN, invalid_flags)))
+ ret = bch2_trans_commit_journal_entry_invalid(trans, i);
+
+ if (ret)
+ return ret;
+ }
+
+ if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) {
ret = do_bch2_trans_commit_to_journal_replay(trans);
goto out_reset;
}
- if (!(flags & BTREE_INSERT_NOCHECK_RW) &&
+ if (!(flags & BCH_TRANS_COMMIT_no_check_rw) &&
unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) {
ret = bch2_trans_commit_get_rw_cold(trans, flags);
if (ret)
goto out_reset;
}
- if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 &&
- mutex_trylock(&c->btree_write_buffer.flush_lock)) {
- bch2_trans_begin(trans);
- bch2_trans_unlock(trans);
-
- ret = __bch2_btree_write_buffer_flush(trans,
- flags|BTREE_INSERT_NOCHECK_RW, true);
- if (!ret) {
- trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
- ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
- }
- goto out;
- }
-
- EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
+ EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
- trans->journal_u64s = trans->extra_journal_entries.nr;
+ trans->journal_u64s = trans->journal_entries_u64s;
trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
if (trans->journal_transaction_names)
trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
trans_for_each_update(trans, i) {
- EBUG_ON(!i->path->should_be_locked);
+ struct btree_path *path = trans->paths + i->path;
+
+ EBUG_ON(!path->should_be_locked);
- ret = bch2_btree_path_upgrade(trans, i->path, i->level + 1);
+ ret = bch2_btree_path_upgrade(trans, path, i->level + 1);
if (unlikely(ret))
goto out;
- EBUG_ON(!btree_node_intent_locked(i->path, i->level));
+ EBUG_ON(!btree_node_intent_locked(path, i->level));
if (i->key_cache_already_flushed)
continue;
@@ -1120,22 +1073,21 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
trans->journal_u64s += jset_u64s(i->old_k.u64s);
}
- trans_for_each_wb_update(trans, wb)
- trans->journal_u64s += jset_u64s(wb->k.k.u64s);
-
- if (trans->extra_journal_res) {
+ if (trans->extra_disk_res) {
ret = bch2_disk_reservation_add(c, trans->disk_res,
- trans->extra_journal_res,
- (flags & BTREE_INSERT_NOFAIL)
+ trans->extra_disk_res,
+ (flags & BCH_TRANS_COMMIT_no_enospc)
? BCH_DISK_RESERVATION_NOFAIL : 0);
if (ret)
goto err;
}
retry:
+ errored_at = NULL;
bch2_trans_verify_not_in_restart(trans);
- memset(&trans->journal_res, 0, sizeof(trans->journal_res));
+ if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
+ memset(&trans->journal_res, 0, sizeof(trans->journal_res));
- ret = do_bch2_trans_commit(trans, flags, &i, _RET_IP_);
+ ret = do_bch2_trans_commit(trans, flags, &errored_at, _RET_IP_);
/* make sure we didn't drop or screw up locks: */
bch2_trans_verify_locks(trans);
@@ -1145,7 +1097,7 @@ retry:
trace_and_count(c, transaction_commit, trans, _RET_IP_);
out:
- if (likely(!(flags & BTREE_INSERT_NOCHECK_RW)))
+ if (likely(!(flags & BCH_TRANS_COMMIT_no_check_rw)))
bch2_write_ref_put(c, BCH_WRITE_REF_trans);
out_reset:
if (!ret)
@@ -1154,9 +1106,21 @@ out_reset:
return ret;
err:
- ret = bch2_trans_commit_error(trans, flags, i, ret, _RET_IP_);
+ ret = bch2_trans_commit_error(trans, flags, errored_at, ret, _RET_IP_);
if (ret)
goto out;
+ /*
+ * We might have done another transaction commit in the error path -
+ * i.e. btree write buffer flush - which will have made use of
+ * trans->journal_res, but with BCH_TRANS_COMMIT_no_journal_res that is
+ * how the journal sequence number to pin is passed in - so we must
+ * restart:
+ */
+ if (flags & BCH_TRANS_COMMIT_no_journal_res) {
+ ret = -BCH_ERR_transaction_restart_nested;
+ goto out;
+ }
+
goto retry;
}
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 60453ba86c..4a5a64499e 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -185,33 +185,32 @@ struct btree_node_iter {
* Iterate over all possible positions, synthesizing deleted keys for holes:
*/
static const __maybe_unused u16 BTREE_ITER_SLOTS = 1 << 0;
-static const __maybe_unused u16 BTREE_ITER_ALL_LEVELS = 1 << 1;
/*
* Indicates that intent locks should be taken on leaf nodes, because we expect
* to be doing updates:
*/
-static const __maybe_unused u16 BTREE_ITER_INTENT = 1 << 2;
+static const __maybe_unused u16 BTREE_ITER_INTENT = 1 << 1;
/*
* Causes the btree iterator code to prefetch additional btree nodes from disk:
*/
-static const __maybe_unused u16 BTREE_ITER_PREFETCH = 1 << 3;
+static const __maybe_unused u16 BTREE_ITER_PREFETCH = 1 << 2;
/*
* Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
* @pos or the first key strictly greater than @pos
*/
-static const __maybe_unused u16 BTREE_ITER_IS_EXTENTS = 1 << 4;
-static const __maybe_unused u16 BTREE_ITER_NOT_EXTENTS = 1 << 5;
-static const __maybe_unused u16 BTREE_ITER_CACHED = 1 << 6;
-static const __maybe_unused u16 BTREE_ITER_WITH_KEY_CACHE = 1 << 7;
-static const __maybe_unused u16 BTREE_ITER_WITH_UPDATES = 1 << 8;
-static const __maybe_unused u16 BTREE_ITER_WITH_JOURNAL = 1 << 9;
-static const __maybe_unused u16 __BTREE_ITER_ALL_SNAPSHOTS = 1 << 10;
-static const __maybe_unused u16 BTREE_ITER_ALL_SNAPSHOTS = 1 << 11;
-static const __maybe_unused u16 BTREE_ITER_FILTER_SNAPSHOTS = 1 << 12;
-static const __maybe_unused u16 BTREE_ITER_NOPRESERVE = 1 << 13;
-static const __maybe_unused u16 BTREE_ITER_CACHED_NOFILL = 1 << 14;
-static const __maybe_unused u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 15;
-#define __BTREE_ITER_FLAGS_END 16
+static const __maybe_unused u16 BTREE_ITER_IS_EXTENTS = 1 << 3;
+static const __maybe_unused u16 BTREE_ITER_NOT_EXTENTS = 1 << 4;
+static const __maybe_unused u16 BTREE_ITER_CACHED = 1 << 5;
+static const __maybe_unused u16 BTREE_ITER_WITH_KEY_CACHE = 1 << 6;
+static const __maybe_unused u16 BTREE_ITER_WITH_UPDATES = 1 << 7;
+static const __maybe_unused u16 BTREE_ITER_WITH_JOURNAL = 1 << 8;
+static const __maybe_unused u16 __BTREE_ITER_ALL_SNAPSHOTS = 1 << 9;
+static const __maybe_unused u16 BTREE_ITER_ALL_SNAPSHOTS = 1 << 10;
+static const __maybe_unused u16 BTREE_ITER_FILTER_SNAPSHOTS = 1 << 11;
+static const __maybe_unused u16 BTREE_ITER_NOPRESERVE = 1 << 12;
+static const __maybe_unused u16 BTREE_ITER_CACHED_NOFILL = 1 << 13;
+static const __maybe_unused u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 14;
+#define __BTREE_ITER_FLAGS_END 15
enum btree_path_uptodate {
BTREE_ITER_UPTODATE = 0,
@@ -223,13 +222,12 @@ enum btree_path_uptodate {
#define TRACK_PATH_ALLOCATED
#endif
+typedef u16 btree_path_idx_t;
+
struct btree_path {
- u8 idx;
- u8 sorted_idx;
+ btree_path_idx_t sorted_idx;
u8 ref;
u8 intent_ref;
- u32 alloc_seq;
- u32 downgrade_seq;
/* btree_iter_copy starts here: */
struct bpos pos;
@@ -283,13 +281,12 @@ static inline unsigned long btree_path_ip_allocated(struct btree_path *path)
*/
struct btree_iter {
struct btree_trans *trans;
- struct btree_path *path;
- struct btree_path *update_path;
- struct btree_path *key_cache_path;
+ btree_path_idx_t path;
+ btree_path_idx_t update_path;
+ btree_path_idx_t key_cache_path;
enum btree_id btree_id:8;
- unsigned min_depth:3;
- unsigned advanced:1;
+ u8 min_depth;
/* btree_iter_copy starts here: */
u16 flags;
@@ -306,7 +303,6 @@ struct btree_iter {
/* BTREE_ITER_WITH_JOURNAL: */
size_t journal_idx;
- struct bpos journal_pos;
#ifdef TRACK_PATH_ALLOCATED
unsigned long ip_allocated;
#endif
@@ -354,16 +350,16 @@ struct btree_insert_entry {
* to the size of the key being overwritten in the btree:
*/
u8 old_btree_u64s;
+ btree_path_idx_t path;
struct bkey_i *k;
- struct btree_path *path;
- u64 seq;
/* key being overwritten: */
struct bkey old_k;
const struct bch_val *old_v;
unsigned long ip_allocated;
};
-#define BTREE_ITER_MAX 64
+#define BTREE_ITER_INITIAL 64
+#define BTREE_ITER_MAX (1U << 10)
struct btree_trans_commit_hook;
typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *);
@@ -377,25 +373,30 @@ struct btree_trans_commit_hook {
#define BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS 10000
+/*
+ * A path count followed by a flexible array of btree_path.
+ *
+ * struct btree_trans embeds one of these (trans_paths) directly before its
+ * _paths[BTREE_ITER_INITIAL] array.
+ * NOTE(review): presumably so that paths[] overlays _paths - confirm.
+ */
+struct btree_trans_paths {
+ unsigned long nr_paths;
+ struct btree_path paths[];
+};
+
struct btree_trans {
struct bch_fs *c;
- const char *fn;
- struct closure ref;
- struct list_head list;
- u64 last_begin_time;
- u8 lock_may_not_fail;
- u8 lock_must_abort;
- struct btree_bkey_cached_common *locking;
- struct six_lock_waiter locking_wait;
+ unsigned long *paths_allocated;
+ struct btree_path *paths;
+ btree_path_idx_t *sorted;
+ struct btree_insert_entry *updates;
- int srcu_idx;
+ void *mem;
+ unsigned mem_top;
+ unsigned mem_bytes;
+ btree_path_idx_t nr_sorted;
+ btree_path_idx_t nr_paths;
+ btree_path_idx_t nr_paths_max;
u8 fn_idx;
- u8 nr_sorted;
u8 nr_updates;
- u8 nr_wb_updates;
- u8 wb_updates_size;
+ u8 lock_must_abort;
+ bool lock_may_not_fail:1;
bool srcu_held:1;
bool used_mempool:1;
bool in_traverse_all:1;
@@ -407,41 +408,59 @@ struct btree_trans {
bool write_locked:1;
enum bch_errcode restarted:16;
u32 restart_count;
+
+ u64 last_begin_time;
unsigned long last_begin_ip;
unsigned long last_restarted_ip;
unsigned long srcu_lock_time;
- /*
- * For when bch2_trans_update notices we'll be splitting a compressed
- * extent:
- */
- unsigned extra_journal_res;
- unsigned nr_max_paths;
-
- u64 paths_allocated;
-
- unsigned mem_top;
- unsigned mem_max;
- unsigned mem_bytes;
- void *mem;
-
- u8 sorted[BTREE_ITER_MAX + 8];
- struct btree_path paths[BTREE_ITER_MAX];
- struct btree_insert_entry updates[BTREE_ITER_MAX];
- struct btree_write_buffered_key *wb_updates;
+ const char *fn;
+ struct btree_bkey_cached_common *locking;
+ struct six_lock_waiter locking_wait;
+ int srcu_idx;
/* update path: */
+ u16 journal_entries_u64s;
+ u16 journal_entries_size;
+ struct jset_entry *journal_entries;
+
struct btree_trans_commit_hook *hooks;
- darray_u64 extra_journal_entries;
struct journal_entry_pin *journal_pin;
struct journal_res journal_res;
u64 *journal_seq;
struct disk_reservation *disk_res;
+
+ struct bch_fs_usage_base fs_usage_delta;
+
unsigned journal_u64s;
+ unsigned extra_disk_res; /* XXX kill */
struct replicas_delta_list *fs_usage_deltas;
+
+ /* Entries before this are zeroed out on every bch2_trans_get() call */
+
+ struct list_head list;
+ struct closure ref;
+
+ unsigned long _paths_allocated[BITS_TO_LONGS(BTREE_ITER_INITIAL)];
+ struct btree_trans_paths trans_paths;
+ struct btree_path _paths[BTREE_ITER_INITIAL];
+ btree_path_idx_t _sorted[BTREE_ITER_INITIAL + 4];
+ struct btree_insert_entry _updates[BTREE_ITER_INITIAL];
};
+static inline struct btree_path *btree_iter_path(struct btree_trans *trans, struct btree_iter *iter)
+{
+ return trans->paths + iter->path;
+}
+
+static inline struct btree_path *btree_iter_key_cache_path(struct btree_trans *trans, struct btree_iter *iter)
+{
+ return iter->key_cache_path
+ ? trans->paths + iter->key_cache_path
+ : NULL;
+}
+
#define BCH_BTREE_WRITE_TYPES() \
x(initial, 0) \
x(init_next_bset, 1) \
@@ -637,7 +656,7 @@ const char *bch2_btree_node_type_str(enum btree_node_type);
BIT_ULL(BKEY_TYPE_reflink)| \
BIT_ULL(BKEY_TYPE_btree))
-#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \
+#define BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS \
(BIT_ULL(BKEY_TYPE_alloc)| \
BIT_ULL(BKEY_TYPE_inodes)| \
BIT_ULL(BKEY_TYPE_stripes)| \
@@ -645,7 +664,7 @@ const char *bch2_btree_node_type_str(enum btree_node_type);
#define BTREE_NODE_TYPE_HAS_TRIGGERS \
(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \
- BTREE_NODE_TYPE_HAS_MEM_TRIGGERS)
+ BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS)
static inline bool btree_node_type_needs_gc(enum btree_node_type type)
{
@@ -722,4 +741,9 @@ enum btree_node_sibling {
btree_next_sib,
};
+struct get_locks_fail {
+ unsigned l;
+ struct btree *b;
+};
+
#endif /* _BCACHEFS_BTREE_TYPES_H */
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
index 2fd3c8cc6f..c3ff365acc 100644
--- a/fs/bcachefs/btree_update.c
+++ b/fs/bcachefs/btree_update.c
@@ -24,7 +24,7 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
}
static int __must_check
-bch2_trans_update_by_path(struct btree_trans *, struct btree_path *,
+bch2_trans_update_by_path(struct btree_trans *, btree_path_idx_t,
struct bkey_i *, enum btree_update_flags,
unsigned long ip);
@@ -200,7 +200,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
*/
if (nr_splits > 1 &&
(compressed_sectors = bch2_bkey_sectors_compressed(old)))
- trans->extra_journal_res += compressed_sectors * (nr_splits - 1);
+ trans->extra_disk_res += compressed_sectors * (nr_splits - 1);
if (front_split) {
update = bch2_bkey_make_mut_noupdate(trans, old);
@@ -339,21 +339,22 @@ err:
}
static noinline int flush_new_cached_update(struct btree_trans *trans,
- struct btree_path *path,
struct btree_insert_entry *i,
enum btree_update_flags flags,
unsigned long ip)
{
- struct btree_path *btree_path;
struct bkey k;
int ret;
- btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
- BTREE_ITER_INTENT, _THIS_IP_);
- ret = bch2_btree_path_traverse(trans, btree_path, 0);
+ btree_path_idx_t path_idx =
+ bch2_path_get(trans, i->btree_id, i->old_k.p, 1, 0,
+ BTREE_ITER_INTENT, _THIS_IP_);
+ ret = bch2_btree_path_traverse(trans, path_idx, 0);
if (ret)
goto out;
+ struct btree_path *btree_path = trans->paths + path_idx;
+
/*
* The old key in the insert entry might actually refer to an existing
* key in the btree that has been deleted from cache and not yet
@@ -368,43 +369,34 @@ static noinline int flush_new_cached_update(struct btree_trans *trans,
i->flags |= BTREE_TRIGGER_NORUN;
btree_path_set_should_be_locked(btree_path);
- ret = bch2_trans_update_by_path(trans, btree_path, i->k, flags, ip);
+ ret = bch2_trans_update_by_path(trans, path_idx, i->k, flags, ip);
out:
- bch2_path_put(trans, btree_path, true);
+ bch2_path_put(trans, path_idx, true);
return ret;
}
static int __must_check
-bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
+bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
struct bkey_i *k, enum btree_update_flags flags,
unsigned long ip)
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i, n;
- u64 seq = 0;
int cmp;
+ struct btree_path *path = trans->paths + path_idx;
EBUG_ON(!path->should_be_locked);
- EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX);
+ EBUG_ON(trans->nr_updates >= trans->nr_paths);
EBUG_ON(!bpos_eq(k->k.p, path->pos));
- /*
- * The transaction journal res hasn't been allocated at this point.
- * That occurs at commit time. Reuse the seq field to pass in the seq
- * of a prejournaled key.
- */
- if (flags & BTREE_UPDATE_PREJOURNAL)
- seq = trans->journal_res.seq;
-
n = (struct btree_insert_entry) {
.flags = flags,
.bkey_type = __btree_node_type(path->level, path->btree_id),
.btree_id = path->btree_id,
.level = path->level,
.cached = path->cached,
- .path = path,
+ .path = path_idx,
.k = k,
- .seq = seq,
.ip_allocated = ip,
};
@@ -418,7 +410,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
* Pending updates are kept sorted: first, find position of new update,
* then delete/trim any updates the new update overwrites:
*/
- trans_for_each_update(trans, i) {
+ for (i = trans->updates; i < trans->updates + trans->nr_updates; i++) {
cmp = btree_insert_entry_cmp(&n, i);
if (cmp <= 0)
break;
@@ -432,7 +424,6 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
i->cached = n.cached;
i->k = n.k;
i->path = n.path;
- i->seq = n.seq;
i->ip_allocated = n.ip_allocated;
} else {
array_insert_item(trans->updates, trans->nr_updates,
@@ -452,7 +443,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
}
}
- __btree_path_get(i->path, true);
+ __btree_path_get(trans->paths + i->path, true);
/*
* If a key is present in the key cache, it must also exist in the
@@ -462,7 +453,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
* work:
*/
if (path->cached && bkey_deleted(&i->old_k))
- return flush_new_cached_update(trans, path, i, flags, ip);
+ return flush_new_cached_update(trans, i, flags, ip);
return 0;
}
@@ -471,9 +462,11 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
struct btree_iter *iter,
struct btree_path *path)
{
- if (!iter->key_cache_path ||
- !iter->key_cache_path->should_be_locked ||
- !bpos_eq(iter->key_cache_path->pos, iter->pos)) {
+ struct btree_path *key_cache_path = btree_iter_key_cache_path(trans, iter);
+
+ if (!key_cache_path ||
+ !key_cache_path->should_be_locked ||
+ !bpos_eq(key_cache_path->pos, iter->pos)) {
struct bkey_cached *ck;
int ret;
@@ -488,19 +481,18 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
iter->flags & BTREE_ITER_INTENT,
_THIS_IP_);
- ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
- BTREE_ITER_CACHED);
+ ret = bch2_btree_path_traverse(trans, iter->key_cache_path, BTREE_ITER_CACHED);
if (unlikely(ret))
return ret;
- ck = (void *) iter->key_cache_path->l[0].b;
+ ck = (void *) trans->paths[iter->key_cache_path].l[0].b;
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
}
- btree_path_set_should_be_locked(iter->key_cache_path);
+ btree_path_set_should_be_locked(trans->paths + iter->key_cache_path);
}
return 0;
@@ -509,7 +501,7 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_i *k, enum btree_update_flags flags)
{
- struct btree_path *path = iter->update_path ?: iter->path;
+ btree_path_idx_t path_idx = iter->update_path ?: iter->path;
int ret;
if (iter->flags & BTREE_ITER_IS_EXTENTS)
@@ -529,6 +521,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
/*
* Ensure that updates to cached btrees go to the key cache:
*/
+ struct btree_path *path = trans->paths + path_idx;
if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
!path->cached &&
!path->level &&
@@ -537,27 +530,15 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
if (ret)
return ret;
- path = iter->key_cache_path;
+ path_idx = iter->key_cache_path;
}
- return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_);
+ return bch2_trans_update_by_path(trans, path_idx, k, flags, _RET_IP_);
}
-/*
- * Add a transaction update for a key that has already been journaled.
- */
-int __must_check bch2_trans_update_seq(struct btree_trans *trans, u64 seq,
- struct btree_iter *iter, struct bkey_i *k,
- enum btree_update_flags flags)
-{
- trans->journal_res.seq = seq;
- return bch2_trans_update(trans, iter, k, flags|BTREE_UPDATE_NOJOURNAL|
- BTREE_UPDATE_PREJOURNAL);
-}
-
-static noinline int bch2_btree_insert_clone_trans(struct btree_trans *trans,
- enum btree_id btree,
- struct bkey_i *k)
+int bch2_btree_insert_clone_trans(struct btree_trans *trans,
+ enum btree_id btree,
+ struct bkey_i *k)
{
struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(&k->k));
int ret = PTR_ERR_OR_ZERO(n);
@@ -568,60 +549,30 @@ static noinline int bch2_btree_insert_clone_trans(struct btree_trans *trans,
return bch2_btree_insert_trans(trans, btree, n, 0);
}
-int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
- enum btree_id btree,
- struct bkey_i *k)
+struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s)
{
- struct btree_write_buffered_key *i;
- int ret;
-
- EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size);
- EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
-
- if (unlikely(trans->journal_replay_not_finished))
- return bch2_btree_insert_clone_trans(trans, btree, k);
-
- trans_for_each_wb_update(trans, i) {
- if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) {
- bkey_copy(&i->k, k);
- return 0;
- }
- }
+ unsigned new_top = trans->journal_entries_u64s + u64s;
+ unsigned old_size = trans->journal_entries_size;
- if (!trans->wb_updates ||
- trans->nr_wb_updates == trans->wb_updates_size) {
- struct btree_write_buffered_key *u;
+ if (new_top > trans->journal_entries_size) {
+ trans->journal_entries_size = roundup_pow_of_two(new_top);
- if (trans->nr_wb_updates == trans->wb_updates_size) {
- struct btree_transaction_stats *s = btree_trans_stats(trans);
-
- BUG_ON(trans->wb_updates_size > U8_MAX / 2);
- trans->wb_updates_size = max(1, trans->wb_updates_size * 2);
- if (s)
- s->wb_updates_size = trans->wb_updates_size;
- }
-
- u = bch2_trans_kmalloc_nomemzero(trans,
- trans->wb_updates_size *
- sizeof(struct btree_write_buffered_key));
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- return ret;
-
- if (trans->nr_wb_updates)
- memcpy(u, trans->wb_updates, trans->nr_wb_updates *
- sizeof(struct btree_write_buffered_key));
- trans->wb_updates = u;
+ btree_trans_stats(trans)->journal_entries_size = trans->journal_entries_size;
}
- trans->wb_updates[trans->nr_wb_updates] = (struct btree_write_buffered_key) {
- .btree = btree,
- };
+ struct jset_entry *n =
+ bch2_trans_kmalloc_nomemzero(trans,
+ trans->journal_entries_size * sizeof(u64));
+ if (IS_ERR(n))
+ return ERR_CAST(n);
- bkey_copy(&trans->wb_updates[trans->nr_wb_updates].k, k);
- trans->nr_wb_updates++;
+ if (trans->journal_entries)
+ memcpy(n, trans->journal_entries, old_size * sizeof(u64));
+ trans->journal_entries = n;
- return 0;
+ struct jset_entry *e = btree_trans_journal_entries_top(trans);
+ trans->journal_entries_u64s = new_top;
+ return e;
}
int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
@@ -733,20 +684,6 @@ int bch2_btree_delete_at(struct btree_trans *trans,
return bch2_btree_delete_extent_at(trans, iter, 0, update_flags);
}
-int bch2_btree_delete_at_buffered(struct btree_trans *trans,
- enum btree_id btree, struct bpos pos)
-{
- struct bkey_i *k;
-
- k = bch2_trans_kmalloc(trans, sizeof(*k));
- if (IS_ERR(k))
- return PTR_ERR(k);
-
- bkey_init(&k->k);
- k->k.p = pos;
- return bch2_trans_update_buffered(trans, btree, k);
-}
-
int bch2_btree_delete(struct btree_trans *trans,
enum btree_id btree, struct bpos pos,
unsigned update_flags)
@@ -809,7 +746,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?:
bch2_trans_commit(trans, &disk_res, journal_seq,
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
bch2_disk_reservation_put(trans->c, &disk_res);
err:
/*
@@ -851,56 +788,26 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
struct bpos pos, bool set)
{
- struct bkey_i *k;
- int ret = 0;
+ struct bkey_i k;
- k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k));
- ret = PTR_ERR_OR_ZERO(k);
- if (unlikely(ret))
- return ret;
+ bkey_init(&k.k);
+ k.k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
+ k.k.p = pos;
- bkey_init(&k->k);
- k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
- k->k.p = pos;
-
- return bch2_trans_update_buffered(trans, btree, k);
+ return bch2_trans_update_buffered(trans, btree, &k);
}
-__printf(2, 0)
-static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list args)
+static int __bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf, unsigned u64s)
{
- struct printbuf buf = PRINTBUF;
- struct jset_entry_log *l;
- unsigned u64s;
- int ret;
-
- prt_vprintf(&buf, fmt, args);
- ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
- if (ret)
- goto err;
-
- u64s = DIV_ROUND_UP(buf.pos, sizeof(u64));
-
- ret = darray_make_room(entries, jset_u64s(u64s));
+ struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s));
+ int ret = PTR_ERR_OR_ZERO(e);
if (ret)
- goto err;
+ return ret;
- l = (void *) &darray_top(*entries);
- l->entry.u64s = cpu_to_le16(u64s);
- l->entry.btree_id = 0;
- l->entry.level = 1;
- l->entry.type = BCH_JSET_ENTRY_log;
- l->entry.pad[0] = 0;
- l->entry.pad[1] = 0;
- l->entry.pad[2] = 0;
- memcpy(l->d, buf.buf, buf.pos);
- while (buf.pos & 7)
- l->d[buf.pos++] = '\0';
-
- entries->nr += jset_u64s(u64s);
-err:
- printbuf_exit(&buf);
- return ret;
+ struct jset_entry_log *l = container_of(e, struct jset_entry_log, entry);
+ journal_entry_init(e, BCH_JSET_ENTRY_log, 0, 1, u64s);
+ memcpy(l->d, buf->buf, buf->pos);
+ return 0;
}
__printf(3, 0)
@@ -908,16 +815,32 @@ static int
__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
va_list args)
{
- int ret;
+ struct printbuf buf = PRINTBUF;
+ prt_vprintf(&buf, fmt, args);
+
+ unsigned u64s = DIV_ROUND_UP(buf.pos, sizeof(u64));
+ prt_chars(&buf, '\0', u64s * sizeof(u64) - buf.pos);
+
+ int ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
+ if (ret)
+ goto err;
if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) {
- ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args);
+ ret = darray_make_room(&c->journal.early_journal_entries, jset_u64s(u64s));
+ if (ret)
+ goto err;
+
+ struct jset_entry_log *l = (void *) &darray_top(c->journal.early_journal_entries);
+ journal_entry_init(&l->entry, BCH_JSET_ENTRY_log, 0, 1, u64s);
+ memcpy(l->d, buf.buf, buf.pos);
+ c->journal.early_journal_entries.nr += jset_u64s(u64s);
} else {
ret = bch2_trans_do(c, NULL, NULL,
- BTREE_INSERT_LAZY_RW|commit_flags,
- __bch2_trans_log_msg(&trans->extra_journal_entries, fmt, args));
+ BCH_TRANS_COMMIT_lazy_rw|commit_flags,
+ __bch2_trans_log_msg(trans, &buf, u64s));
}
-
+err:
+ printbuf_exit(&buf);
return ret;
}
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index 9816d22865..b9382b7b28 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -21,42 +21,32 @@ void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *,
struct bkey_i *, u64);
-enum btree_insert_flags {
+#define BCH_TRANS_COMMIT_FLAGS() \
+ x(no_enospc, "don't check for enospc") \
+ x(no_check_rw, "don't attempt to take a ref on c->writes") \
+ x(lazy_rw, "go read-write if we haven't yet - only for use in recovery") \
+ x(no_journal_res, "don't take a journal reservation, instead " \
+ "pin journal entry referred to by trans->journal_res.seq") \
+ x(journal_reclaim, "operation required for journal reclaim; may return error" \
+ "instead of deadlocking if BCH_WATERMARK_reclaim not specified")\
+
+enum __bch_trans_commit_flags {
/* First bits for bch_watermark: */
- __BTREE_INSERT_NOFAIL = BCH_WATERMARK_BITS,
- __BTREE_INSERT_NOCHECK_RW,
- __BTREE_INSERT_LAZY_RW,
- __BTREE_INSERT_JOURNAL_REPLAY,
- __BTREE_INSERT_JOURNAL_RECLAIM,
- __BTREE_INSERT_NOWAIT,
- __BTREE_INSERT_GC_LOCK_HELD,
- __BCH_HASH_SET_MUST_CREATE,
- __BCH_HASH_SET_MUST_REPLACE,
+ __BCH_TRANS_COMMIT_FLAGS_START = BCH_WATERMARK_BITS,
+#define x(n, ...) __BCH_TRANS_COMMIT_##n,
+ BCH_TRANS_COMMIT_FLAGS()
+#undef x
};
-/* Don't check for -ENOSPC: */
-#define BTREE_INSERT_NOFAIL BIT(__BTREE_INSERT_NOFAIL)
-
-#define BTREE_INSERT_NOCHECK_RW BIT(__BTREE_INSERT_NOCHECK_RW)
-#define BTREE_INSERT_LAZY_RW BIT(__BTREE_INSERT_LAZY_RW)
-
-/* Insert is for journal replay - don't get journal reservations: */
-#define BTREE_INSERT_JOURNAL_REPLAY BIT(__BTREE_INSERT_JOURNAL_REPLAY)
-
-/* Insert is being called from journal reclaim path: */
-#define BTREE_INSERT_JOURNAL_RECLAIM BIT(__BTREE_INSERT_JOURNAL_RECLAIM)
-
-/* Don't block on allocation failure (for new btree nodes: */
-#define BTREE_INSERT_NOWAIT BIT(__BTREE_INSERT_NOWAIT)
-#define BTREE_INSERT_GC_LOCK_HELD BIT(__BTREE_INSERT_GC_LOCK_HELD)
-
-#define BCH_HASH_SET_MUST_CREATE BIT(__BCH_HASH_SET_MUST_CREATE)
-#define BCH_HASH_SET_MUST_REPLACE BIT(__BCH_HASH_SET_MUST_REPLACE)
+enum bch_trans_commit_flags {
+#define x(n, ...) BCH_TRANS_COMMIT_##n = BIT(__BCH_TRANS_COMMIT_##n),
+ BCH_TRANS_COMMIT_FLAGS()
+#undef x
+};
int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *,
unsigned, unsigned);
int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned);
-int bch2_btree_delete_at_buffered(struct btree_trans *, enum btree_id, struct bpos);
int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned);
int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id,
@@ -74,6 +64,12 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool);
+static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans,
+ enum btree_id btree, struct bpos pos)
+{
+ return bch2_btree_bit_mod(trans, btree, pos, false);
+}
+
int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id,
struct bpos, struct bpos);
@@ -105,10 +101,44 @@ int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *,
int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *,
struct bkey_i *, enum btree_update_flags);
-int __must_check bch2_trans_update_seq(struct btree_trans *, u64, struct btree_iter *,
- struct bkey_i *, enum btree_update_flags);
-int __must_check bch2_trans_update_buffered(struct btree_trans *,
- enum btree_id, struct bkey_i *);
+
+struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *, unsigned);
+
+static inline struct jset_entry *btree_trans_journal_entries_top(struct btree_trans *trans)
+{
+ return (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
+}
+
+static inline struct jset_entry *
+bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s)
+{
+ if (!trans->journal_entries ||
+ trans->journal_entries_u64s + u64s > trans->journal_entries_size)
+ return __bch2_trans_jset_entry_alloc(trans, u64s);
+
+ struct jset_entry *e = btree_trans_journal_entries_top(trans);
+ trans->journal_entries_u64s += u64s;
+ return e;
+}
+
+int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *);
+
+static inline int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
+ enum btree_id btree,
+ struct bkey_i *k)
+{
+ if (unlikely(trans->journal_replay_not_finished))
+ return bch2_btree_insert_clone_trans(trans, btree, k);
+
+ struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s));
+ int ret = PTR_ERR_OR_ZERO(e);
+ if (ret)
+ return ret;
+
+ journal_entry_init(e, BCH_JSET_ENTRY_write_buffer_keys, btree, 0, k->k.u64s);
+ bkey_copy(e->start, k);
+ return 0;
+}
void bch2_trans_commit_hook(struct btree_trans *,
struct btree_trans_commit_hook *);
@@ -157,28 +187,19 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do))
#define trans_for_each_update(_trans, _i) \
- for ((_i) = (_trans)->updates; \
+ for (struct btree_insert_entry *_i = (_trans)->updates; \
(_i) < (_trans)->updates + (_trans)->nr_updates; \
(_i)++)
-#define trans_for_each_wb_update(_trans, _i) \
- for ((_i) = (_trans)->wb_updates; \
- (_i) < (_trans)->wb_updates + (_trans)->nr_wb_updates; \
- (_i)++)
-
static inline void bch2_trans_reset_updates(struct btree_trans *trans)
{
- struct btree_insert_entry *i;
-
trans_for_each_update(trans, i)
bch2_path_put(trans, i->path, true);
- trans->extra_journal_res = 0;
trans->nr_updates = 0;
- trans->nr_wb_updates = 0;
- trans->wb_updates = NULL;
+ trans->journal_entries_u64s = 0;
trans->hooks = NULL;
- trans->extra_journal_entries.nr = 0;
+ trans->extra_disk_res = 0;
if (trans->fs_usage_deltas) {
trans->fs_usage_deltas->used = 0;
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 239fcc3c7c..4530b14ff2 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -25,24 +25,24 @@
#include <linux/random.h>
static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
- struct btree_path *, struct btree *,
+ btree_path_idx_t, struct btree *,
struct keylist *, unsigned);
static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
-static struct btree_path *get_unlocked_mut_path(struct btree_trans *trans,
- enum btree_id btree_id,
- unsigned level,
- struct bpos pos)
+static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans,
+ enum btree_id btree_id,
+ unsigned level,
+ struct bpos pos)
{
- struct btree_path *path;
-
- path = bch2_path_get(trans, btree_id, pos, level + 1, level,
+ btree_path_idx_t path_idx = bch2_path_get(trans, btree_id, pos, level + 1, level,
BTREE_ITER_NOPRESERVE|
BTREE_ITER_INTENT, _RET_IP_);
- path = bch2_btree_path_make_mut(trans, path, true, _RET_IP_);
+ path_idx = bch2_btree_path_make_mut(trans, path_idx, true, _RET_IP_);
+
+ struct btree_path *path = trans->paths + path_idx;
bch2_btree_path_downgrade(trans, path);
__bch2_btree_path_unlock(trans, path);
- return path;
+ return path_idx;
}
/* Debug code: */
@@ -159,14 +159,16 @@ static bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
{
size_t u64s = btree_node_u64s_with_format(nr, &b->format, new_f);
- return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c);
+ return __vstruct_bytes(struct btree_node, u64s) < btree_buf_bytes(b);
}
/* Btree node freeing/allocation: */
-static void __btree_node_free(struct bch_fs *c, struct btree *b)
+static void __btree_node_free(struct btree_trans *trans, struct btree *b)
{
- trace_and_count(c, btree_node_free, c, b);
+ struct bch_fs *c = trans->c;
+
+ trace_and_count(c, btree_node_free, trans, b);
BUG_ON(btree_node_write_blocked(b));
BUG_ON(btree_node_dirty(b));
@@ -188,15 +190,15 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans,
struct btree *b)
{
struct bch_fs *c = trans->c;
- unsigned level = b->c.level;
+ unsigned i, level = b->c.level;
bch2_btree_node_lock_write_nofail(trans, path, &b->c);
bch2_btree_node_hash_remove(&c->btree_cache, b);
- __btree_node_free(c, b);
+ __btree_node_free(trans, b);
six_unlock_write(&b->c.lock);
mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED);
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
if (path->l[level].b == b) {
btree_node_unlock(trans, path, level);
path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
@@ -210,7 +212,7 @@ static void bch2_btree_node_free_never_used(struct btree_update *as,
struct bch_fs *c = as->c;
struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL];
struct btree_path *path;
- unsigned level = b->c.level;
+ unsigned i, level = b->c.level;
BUG_ON(!list_empty(&b->write_blocked));
BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as));
@@ -233,7 +235,7 @@ static void bch2_btree_node_free_never_used(struct btree_update *as,
six_unlock_intent(&b->c.lock);
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
if (path->l[level].b == b) {
btree_node_unlock(trans, path, level);
path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
@@ -278,7 +280,8 @@ retry:
writepoint_ptr(&c->btree_write_point),
&devs_have,
res->nr_replicas,
- c->opts.metadata_replicas_required,
+ min(res->nr_replicas,
+ c->opts.metadata_replicas_required),
watermark, 0, cl, &wp);
if (unlikely(ret))
return ERR_PTR(ret);
@@ -363,7 +366,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as,
ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id);
BUG_ON(ret);
- trace_and_count(c, btree_node_alloc, c, b);
+ trace_and_count(c, btree_node_alloc, trans, b);
bch2_increment_clock(c, btree_sectors(c), WRITE);
return b;
}
@@ -453,7 +456,7 @@ static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
- __btree_node_free(c, b);
+ __btree_node_free(trans, b);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
}
@@ -466,7 +469,6 @@ static int bch2_btree_reserve_get(struct btree_trans *trans,
unsigned flags,
struct closure *cl)
{
- struct bch_fs *c = as->c;
struct btree *b;
unsigned interior;
int ret = 0;
@@ -476,11 +478,8 @@ static int bch2_btree_reserve_get(struct btree_trans *trans,
/*
* Protects reaping from the btree node cache and using the btree node
* open bucket reserve:
- *
- * BTREE_INSERT_NOWAIT only applies to btree node allocation, not
- * blocking on this lock:
*/
- ret = bch2_btree_cache_cannibalize_lock(c, cl);
+ ret = bch2_btree_cache_cannibalize_lock(trans, cl);
if (ret)
return ret;
@@ -488,9 +487,8 @@ static int bch2_btree_reserve_get(struct btree_trans *trans,
struct prealloc_nodes *p = as->prealloc_nodes + interior;
while (p->nr < nr_nodes[interior]) {
- b = __bch2_btree_node_alloc(trans, &as->disk_res,
- flags & BTREE_INSERT_NOWAIT ? NULL : cl,
- interior, flags);
+ b = __bch2_btree_node_alloc(trans, &as->disk_res, cl,
+ interior, flags);
if (IS_ERR(b)) {
ret = PTR_ERR(b);
goto err;
@@ -500,7 +498,7 @@ static int bch2_btree_reserve_get(struct btree_trans *trans,
}
}
err:
- bch2_btree_cache_cannibalize_unlock(c);
+ bch2_btree_cache_cannibalize_unlock(trans);
return ret;
}
@@ -559,24 +557,20 @@ static void btree_update_add_key(struct btree_update *as,
static int btree_update_nodes_written_trans(struct btree_trans *trans,
struct btree_update *as)
{
- struct bkey_i *k;
- int ret;
-
- ret = darray_make_room(&trans->extra_journal_entries, as->journal_u64s);
+ struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, as->journal_u64s);
+ int ret = PTR_ERR_OR_ZERO(e);
if (ret)
return ret;
- memcpy(&darray_top(trans->extra_journal_entries),
- as->journal_entries,
- as->journal_u64s * sizeof(u64));
- trans->extra_journal_entries.nr += as->journal_u64s;
+ memcpy(e, as->journal_entries, as->journal_u64s * sizeof(u64));
trans->journal_pin = &as->journal;
for_each_keylist_key(&as->old_keys, k) {
unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
- ret = bch2_trans_mark_old(trans, as->btree_id, level, bkey_i_to_s_c(k), 0);
+ ret = bch2_key_trigger_old(trans, as->btree_id, level, bkey_i_to_s_c(k),
+ BTREE_TRIGGER_TRANSACTIONAL);
if (ret)
return ret;
}
@@ -584,7 +578,8 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans,
for_each_keylist_key(&as->new_keys, k) {
unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
- ret = bch2_trans_mark_new(trans, as->btree_id, level, k, 0);
+ ret = bch2_key_trigger_new(trans, as->btree_id, level, bkey_i_to_s(k),
+ BTREE_TRIGGER_TRANSACTIONAL);
if (ret)
return ret;
}
@@ -645,9 +640,9 @@ static void btree_update_nodes_written(struct btree_update *as)
*/
ret = commit_do(trans, &as->disk_res, &journal_seq,
BCH_WATERMARK_reclaim|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_JOURNAL_RECLAIM,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_journal_reclaim,
btree_update_nodes_written_trans(trans, as));
bch2_trans_unlock(trans);
@@ -655,10 +650,11 @@ static void btree_update_nodes_written(struct btree_update *as)
"%s(): error %s", __func__, bch2_err_str(ret));
err:
if (as->b) {
- struct btree_path *path;
b = as->b;
- path = get_unlocked_mut_path(trans, as->btree_id, b->c.level, b->key.k.p);
+ btree_path_idx_t path_idx = get_unlocked_mut_path(trans,
+ as->btree_id, b->c.level, b->key.k.p);
+ struct btree_path *path = trans->paths + path_idx;
/*
* @b is the node we did the final insert into:
*
@@ -728,7 +724,7 @@ err:
btree_node_write_if_need(c, b, SIX_LOCK_intent);
btree_node_unlock(trans, path, b->c.level);
- bch2_path_put(trans, path, true);
+ bch2_path_put(trans, path_idx, true);
}
bch2_journal_pin_drop(&c->journal, &as->journal);
@@ -815,6 +811,12 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b)
mutex_unlock(&c->btree_interior_update_lock);
}
+static int bch2_update_reparent_journal_pin_flush(struct journal *j,
+ struct journal_entry_pin *_pin, u64 seq)
+{
+ return 0;
+}
+
static void btree_update_reparent(struct btree_update *as,
struct btree_update *child)
{
@@ -825,7 +827,8 @@ static void btree_update_reparent(struct btree_update *as,
child->b = NULL;
child->mode = BTREE_INTERIOR_UPDATING_AS;
- bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL);
+ bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal,
+ bch2_update_reparent_journal_pin_flush);
}
static void btree_update_updated_root(struct btree_update *as, struct btree *b)
@@ -934,6 +937,12 @@ static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct b
b->ob.v[--b->ob.nr];
}
+static int bch2_btree_update_will_free_node_journal_pin_flush(struct journal *j,
+ struct journal_entry_pin *_pin, u64 seq)
+{
+ return 0;
+}
+
/*
* @b is being split/rewritten: it may have pointers to not-yet-written btree
* nodes and thus outstanding btree_updates - redirect @b's
@@ -985,11 +994,13 @@ static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
* when the new nodes are persistent and reachable on disk:
*/
w = btree_current_write(b);
- bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
+ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
+ bch2_btree_update_will_free_node_journal_pin_flush);
bch2_journal_pin_drop(&c->journal, &w->journal);
w = btree_prev_write(b);
- bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
+ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
+ bch2_btree_update_will_free_node_journal_pin_flush);
bch2_journal_pin_drop(&c->journal, &w->journal);
mutex_unlock(&c->btree_interior_update_lock);
@@ -1039,7 +1050,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
struct bch_fs *c = trans->c;
struct btree_update *as;
u64 start_time = local_clock();
- int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
+ int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc)
? BCH_DISK_RESERVATION_NOFAIL : 0;
unsigned nr_nodes[2] = { 0, 0 };
unsigned update_level = level;
@@ -1057,7 +1068,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
flags &= ~BCH_WATERMARK_MASK;
flags |= watermark;
- if (!(flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
+ if (!(flags & BCH_TRANS_COMMIT_journal_reclaim) &&
watermark < c->journal.watermark) {
struct journal_res res = { 0 };
@@ -1087,16 +1098,14 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
* Always check for space for two keys, even if we won't have to
* split at prior level - it might have been a merge instead:
*/
- if (bch2_btree_node_insert_fits(c, path->l[update_level].b,
+ if (bch2_btree_node_insert_fits(path->l[update_level].b,
BKEY_BTREE_PTR_U64s_MAX * 2))
break;
split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
}
- if (flags & BTREE_INSERT_GC_LOCK_HELD)
- lockdep_assert_held(&c->gc_lock);
- else if (!down_read_trylock(&c->gc_lock)) {
+ if (!down_read_trylock(&c->gc_lock)) {
ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0));
if (ret) {
up_read(&c->gc_lock);
@@ -1110,7 +1119,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
as->c = c;
as->start_time = start_time;
as->mode = BTREE_INTERIOR_NO_UPDATE;
- as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD);
+ as->took_gc_lock = true;
as->btree_id = path->btree_id;
as->update_level = update_level;
INIT_LIST_HEAD(&as->list);
@@ -1153,7 +1162,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
* flag
*/
if (bch2_err_matches(ret, ENOSPC) &&
- (flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
+ (flags & BCH_TRANS_COMMIT_journal_reclaim) &&
watermark != BCH_WATERMARK_reclaim) {
ret = -BCH_ERR_journal_reclaim_would_deadlock;
goto err;
@@ -1183,6 +1192,9 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
return as;
err:
bch2_btree_update_free(as, trans);
+ if (!bch2_err_matches(ret, ENOSPC) &&
+ !bch2_err_matches(ret, EROFS))
+ bch_err_fn_ratelimited(c, ret);
return ERR_PTR(ret);
}
@@ -1214,7 +1226,7 @@ static void bch2_btree_set_root(struct btree_update *as,
struct bch_fs *c = as->c;
struct btree *old;
- trace_and_count(c, btree_node_set_root, c, b);
+ trace_and_count(c, btree_node_set_root, trans, b);
old = btree_node_root(c, b);
@@ -1390,7 +1402,7 @@ static void __btree_split_node(struct btree_update *as,
unsigned u64s = nr_keys[i].nr_keys * n[i]->data->format.key_u64s +
nr_keys[i].val_u64s;
- if (__vstruct_bytes(struct btree_node, u64s) > btree_bytes(as->c))
+ if (__vstruct_bytes(struct btree_node, u64s) > btree_buf_bytes(b))
n[i]->data->format = b->format;
btree_node_set_format(n[i], n[i]->data->format);
@@ -1445,10 +1457,12 @@ static void __btree_split_node(struct btree_update *as,
*/
static void btree_split_insert_keys(struct btree_update *as,
struct btree_trans *trans,
- struct btree_path *path,
+ btree_path_idx_t path_idx,
struct btree *b,
struct keylist *keys)
{
+ struct btree_path *path = trans->paths + path_idx;
+
if (!bch2_keylist_empty(keys) &&
bpos_le(bch2_keylist_front(keys)->k.p, b->data->max_key)) {
struct btree_node_iter node_iter;
@@ -1462,25 +1476,25 @@ static void btree_split_insert_keys(struct btree_update *as,
}
static int btree_split(struct btree_update *as, struct btree_trans *trans,
- struct btree_path *path, struct btree *b,
+ btree_path_idx_t path, struct btree *b,
struct keylist *keys, unsigned flags)
{
struct bch_fs *c = as->c;
- struct btree *parent = btree_node_parent(path, b);
+ struct btree *parent = btree_node_parent(trans->paths + path, b);
struct btree *n1, *n2 = NULL, *n3 = NULL;
- struct btree_path *path1 = NULL, *path2 = NULL;
+ btree_path_idx_t path1 = 0, path2 = 0;
u64 start_time = local_clock();
int ret = 0;
BUG_ON(!parent && (b != btree_node_root(c, b)));
- BUG_ON(parent && !btree_node_intent_locked(path, b->c.level + 1));
+ BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1));
bch2_btree_interior_update_will_free_node(as, b);
if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) {
struct btree *n[2];
- trace_and_count(c, btree_node_split, c, b);
+ trace_and_count(c, btree_node_split, trans, b);
n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level);
n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level);
@@ -1501,15 +1515,15 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
six_unlock_write(&n2->c.lock);
six_unlock_write(&n1->c.lock);
- path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p);
+ path1 = get_unlocked_mut_path(trans, as->btree_id, n1->c.level, n1->key.k.p);
six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, path1, n1);
+ mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, trans->paths + path1, n1);
- path2 = get_unlocked_mut_path(trans, path->btree_id, n2->c.level, n2->key.k.p);
+ path2 = get_unlocked_mut_path(trans, as->btree_id, n2->c.level, n2->key.k.p);
six_lock_increment(&n2->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, path2, n2->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, path2, n2);
+ mark_btree_node_locked(trans, trans->paths + path2, n2->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, trans->paths + path2, n2);
/*
* Note that on recursive parent_keys == keys, so we
@@ -1526,11 +1540,11 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
bch2_btree_update_add_new_node(as, n3);
six_unlock_write(&n3->c.lock);
- path2->locks_want++;
- BUG_ON(btree_node_locked(path2, n3->c.level));
+ trans->paths[path2].locks_want++;
+ BUG_ON(btree_node_locked(trans->paths + path2, n3->c.level));
six_lock_increment(&n3->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, path2, n3->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, path2, n3);
+ mark_btree_node_locked(trans, trans->paths + path2, n3->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, trans->paths + path2, n3);
n3->sib_u64s[0] = U16_MAX;
n3->sib_u64s[1] = U16_MAX;
@@ -1538,7 +1552,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);
}
} else {
- trace_and_count(c, btree_node_compact, c, b);
+ trace_and_count(c, btree_node_compact, trans, b);
n1 = bch2_btree_node_alloc_replacement(as, trans, b);
@@ -1551,10 +1565,10 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
bch2_btree_update_add_new_node(as, n1);
six_unlock_write(&n1->c.lock);
- path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p);
+ path1 = get_unlocked_mut_path(trans, as->btree_id, n1->c.level, n1->key.k.p);
six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, path1, n1);
+ mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, trans->paths + path1, n1);
if (parent)
bch2_keylist_add(&as->parent_keys, &n1->key);
@@ -1568,10 +1582,10 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
if (ret)
goto err;
} else if (n3) {
- bch2_btree_set_root(as, trans, path, n3);
+ bch2_btree_set_root(as, trans, trans->paths + path, n3);
} else {
/* Root filled up but didn't need to be split */
- bch2_btree_set_root(as, trans, path, n1);
+ bch2_btree_set_root(as, trans, trans->paths + path, n1);
}
if (n3) {
@@ -1591,13 +1605,13 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
* node after another thread has locked and updated the new node, thus
* seeing stale data:
*/
- bch2_btree_node_free_inmem(trans, path, b);
+ bch2_btree_node_free_inmem(trans, trans->paths + path, b);
if (n3)
- bch2_trans_node_add(trans, n3);
+ bch2_trans_node_add(trans, trans->paths + path, n3);
if (n2)
- bch2_trans_node_add(trans, n2);
- bch2_trans_node_add(trans, n1);
+ bch2_trans_node_add(trans, trans->paths + path2, n2);
+ bch2_trans_node_add(trans, trans->paths + path1, n1);
if (n3)
six_unlock_intent(&n3->c.lock);
@@ -1606,11 +1620,11 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
six_unlock_intent(&n1->c.lock);
out:
if (path2) {
- __bch2_btree_path_unlock(trans, path2);
+ __bch2_btree_path_unlock(trans, trans->paths + path2);
bch2_path_put(trans, path2, true);
}
if (path1) {
- __bch2_btree_path_unlock(trans, path1);
+ __bch2_btree_path_unlock(trans, trans->paths + path1);
bch2_path_put(trans, path1, true);
}
@@ -1638,13 +1652,14 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
struct keylist *keys)
{
struct btree_path *linked;
+ unsigned i;
__bch2_btree_insert_keys_interior(as, trans, path, b,
path->l[b->c.level].iter, keys);
btree_update_updated_node(as, b);
- trans_for_each_path_with_node(trans, b, linked)
+ trans_for_each_path_with_node(trans, b, linked, i)
bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
bch2_trans_verify_paths(trans);
@@ -1655,7 +1670,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
*
* @as: btree_update object
* @trans: btree_trans object
- * @path: path that points to current node
+ * @path_idx: path that points to current node
* @b: node to insert keys into
* @keys: list of keys to insert
* @flags: transaction commit flags
@@ -1667,10 +1682,11 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
* for leaf nodes -- inserts into interior nodes have to be atomic.
*/
static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
- struct btree_path *path, struct btree *b,
+ btree_path_idx_t path_idx, struct btree *b,
struct keylist *keys, unsigned flags)
{
struct bch_fs *c = as->c;
+ struct btree_path *path = trans->paths + path_idx;
int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
@@ -1688,7 +1704,7 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
bch2_btree_node_prep_for_write(trans, path, b);
- if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) {
+ if (!bch2_btree_node_insert_fits(b, bch2_keylist_u64s(keys))) {
bch2_btree_node_unlock_write(trans, path, b);
goto split;
}
@@ -1723,19 +1739,22 @@ split:
return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
}
- return btree_split(as, trans, path, b, keys, flags);
+ return btree_split(as, trans, path_idx, b, keys, flags);
}
int bch2_btree_split_leaf(struct btree_trans *trans,
- struct btree_path *path,
+ btree_path_idx_t path,
unsigned flags)
{
- struct btree *b = path_l(path)->b;
+ /* btree_split & merge may both cause paths array to be reallocated */
+
+ struct btree *b = path_l(trans->paths + path)->b;
struct btree_update *as;
unsigned l;
int ret = 0;
- as = bch2_btree_update_start(trans, path, path->level,
+ as = bch2_btree_update_start(trans, trans->paths + path,
+ trans->paths[path].level,
true, flags);
if (IS_ERR(as))
return PTR_ERR(as);
@@ -1748,20 +1767,21 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
bch2_btree_update_done(as, trans);
- for (l = path->level + 1; btree_node_intent_locked(path, l) && !ret; l++)
+ for (l = trans->paths[path].level + 1;
+ btree_node_intent_locked(&trans->paths[path], l) && !ret;
+ l++)
ret = bch2_foreground_maybe_merge(trans, path, l, flags);
return ret;
}
int __bch2_foreground_maybe_merge(struct btree_trans *trans,
- struct btree_path *path,
+ btree_path_idx_t path,
unsigned level,
unsigned flags,
enum btree_node_sibling sib)
{
struct bch_fs *c = trans->c;
- struct btree_path *sib_path = NULL, *new_path = NULL;
struct btree_update *as;
struct bkey_format_state new_s;
struct bkey_format new_f;
@@ -1769,13 +1789,15 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
struct btree *b, *m, *n, *prev, *next, *parent;
struct bpos sib_pos;
size_t sib_u64s;
+ enum btree_id btree = trans->paths[path].btree_id;
+ btree_path_idx_t sib_path = 0, new_path = 0;
u64 start_time = local_clock();
int ret = 0;
- BUG_ON(!path->should_be_locked);
- BUG_ON(!btree_node_locked(path, level));
+ BUG_ON(!trans->paths[path].should_be_locked);
+ BUG_ON(!btree_node_locked(&trans->paths[path], level));
- b = path->l[level].b;
+ b = trans->paths[path].l[level].b;
if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) ||
(sib == btree_next_sib && bpos_eq(b->data->max_key, SPOS_MAX))) {
@@ -1787,18 +1809,18 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
? bpos_predecessor(b->data->min_key)
: bpos_successor(b->data->max_key);
- sib_path = bch2_path_get(trans, path->btree_id, sib_pos,
+ sib_path = bch2_path_get(trans, btree, sib_pos,
U8_MAX, level, BTREE_ITER_INTENT, _THIS_IP_);
ret = bch2_btree_path_traverse(trans, sib_path, false);
if (ret)
goto err;
- btree_path_set_should_be_locked(sib_path);
+ btree_path_set_should_be_locked(trans->paths + sib_path);
- m = sib_path->l[level].b;
+ m = trans->paths[sib_path].l[level].b;
- if (btree_node_parent(path, b) !=
- btree_node_parent(sib_path, m)) {
+ if (btree_node_parent(trans->paths + path, b) !=
+ btree_node_parent(trans->paths + sib_path, m)) {
b->sib_u64s[sib] = U16_MAX;
goto out;
}
@@ -1851,14 +1873,14 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
goto out;
- parent = btree_node_parent(path, b);
- as = bch2_btree_update_start(trans, path, level, false,
- BTREE_INSERT_NOFAIL|flags);
+ parent = btree_node_parent(trans->paths + path, b);
+ as = bch2_btree_update_start(trans, trans->paths + path, level, false,
+ BCH_TRANS_COMMIT_no_enospc|flags);
ret = PTR_ERR_OR_ZERO(as);
if (ret)
goto err;
- trace_and_count(c, btree_node_merge, c, b);
+ trace_and_count(c, btree_node_merge, trans, b);
bch2_btree_interior_update_will_free_node(as, b);
bch2_btree_interior_update_will_free_node(as, m);
@@ -1882,10 +1904,10 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
bch2_btree_update_add_new_node(as, n);
six_unlock_write(&n->c.lock);
- new_path = get_unlocked_mut_path(trans, path->btree_id, n->c.level, n->key.k.p);
+ new_path = get_unlocked_mut_path(trans, btree, n->c.level, n->key.k.p);
six_lock_increment(&n->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, new_path, n);
+ mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, trans->paths + new_path, n);
bkey_init(&delete.k);
delete.k.p = prev->key.k.p;
@@ -1903,10 +1925,10 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
bch2_btree_update_get_open_buckets(as, n);
bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
- bch2_btree_node_free_inmem(trans, path, b);
- bch2_btree_node_free_inmem(trans, sib_path, m);
+ bch2_btree_node_free_inmem(trans, trans->paths + path, b);
+ bch2_btree_node_free_inmem(trans, trans->paths + sib_path, m);
- bch2_trans_node_add(trans, n);
+ bch2_trans_node_add(trans, trans->paths + path, n);
bch2_trans_verify_paths(trans);
@@ -1934,16 +1956,16 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
unsigned flags)
{
struct bch_fs *c = trans->c;
- struct btree_path *new_path = NULL;
struct btree *n, *parent;
struct btree_update *as;
+ btree_path_idx_t new_path = 0;
int ret;
- flags |= BTREE_INSERT_NOFAIL;
+ flags |= BCH_TRANS_COMMIT_no_enospc;
- parent = btree_node_parent(iter->path, b);
- as = bch2_btree_update_start(trans, iter->path, b->c.level,
- false, flags);
+ struct btree_path *path = btree_iter_path(trans, iter);
+ parent = btree_node_parent(path, b);
+ as = bch2_btree_update_start(trans, path, b->c.level, false, flags);
ret = PTR_ERR_OR_ZERO(as);
if (ret)
goto out;
@@ -1958,27 +1980,27 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p);
six_lock_increment(&n->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, new_path, n);
+ mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, trans->paths + new_path, n);
- trace_and_count(c, btree_node_rewrite, c, b);
+ trace_and_count(c, btree_node_rewrite, trans, b);
if (parent) {
bch2_keylist_add(&as->parent_keys, &n->key);
- ret = bch2_btree_insert_node(as, trans, iter->path, parent,
- &as->parent_keys, flags);
+ ret = bch2_btree_insert_node(as, trans, iter->path,
+ parent, &as->parent_keys, flags);
if (ret)
goto err;
} else {
- bch2_btree_set_root(as, trans, iter->path, n);
+ bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n);
}
bch2_btree_update_get_open_buckets(as, n);
bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
- bch2_btree_node_free_inmem(trans, iter->path, b);
+ bch2_btree_node_free_inmem(trans, btree_iter_path(trans, iter), b);
- bch2_trans_node_add(trans, n);
+ bch2_trans_node_add(trans, trans->paths + iter->path, n);
six_unlock_intent(&n->c.lock);
bch2_btree_update_done(as, trans);
@@ -2047,8 +2069,7 @@ static void async_btree_node_rewrite_work(struct work_struct *work)
ret = bch2_trans_do(c, NULL, NULL, 0,
async_btree_node_rewrite_trans(trans, a));
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
kfree(a);
}
@@ -2071,7 +2092,7 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
a->seq = b->data->keys.seq;
INIT_WORK(&a->work, async_btree_node_rewrite_work);
- if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) {
+ if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) {
mutex_lock(&c->pending_node_rewrites_lock);
list_add(&a->list, &c->pending_node_rewrites);
mutex_unlock(&c->pending_node_rewrites_lock);
@@ -2079,15 +2100,15 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
}
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) {
- if (test_bit(BCH_FS_STARTED, &c->flags)) {
+ if (test_bit(BCH_FS_started, &c->flags)) {
bch_err(c, "%s: error getting c->writes ref", __func__);
kfree(a);
return;
}
ret = bch2_fs_read_write_early(c);
+ bch_err_msg(c, ret, "going read-write");
if (ret) {
- bch_err_msg(c, ret, "going read-write");
kfree(a);
return;
}
@@ -2138,13 +2159,12 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
int ret;
if (!skip_triggers) {
- ret = bch2_trans_mark_old(trans, b->c.btree_id, b->c.level + 1,
- bkey_i_to_s_c(&b->key), 0);
- if (ret)
- return ret;
-
- ret = bch2_trans_mark_new(trans, b->c.btree_id, b->c.level + 1,
- new_key, 0);
+ ret = bch2_key_trigger_old(trans, b->c.btree_id, b->c.level + 1,
+ bkey_i_to_s_c(&b->key),
+ BTREE_TRIGGER_TRANSACTIONAL) ?:
+ bch2_key_trigger_new(trans, b->c.btree_id, b->c.level + 1,
+ bkey_i_to_s(new_key),
+ BTREE_TRIGGER_TRANSACTIONAL);
if (ret)
return ret;
}
@@ -2156,7 +2176,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
BUG_ON(ret);
}
- parent = btree_node_parent(iter->path, b);
+ parent = btree_node_parent(btree_iter_path(trans, iter), b);
if (parent) {
bch2_trans_copy_iter(&iter2, iter);
@@ -2164,10 +2184,11 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
iter2.flags & BTREE_ITER_INTENT,
_THIS_IP_);
- BUG_ON(iter2.path->level != b->c.level);
- BUG_ON(!bpos_eq(iter2.path->pos, new_key->k.p));
+ struct btree_path *path2 = btree_iter_path(trans, &iter2);
+ BUG_ON(path2->level != b->c.level);
+ BUG_ON(!bpos_eq(path2->pos, new_key->k.p));
- btree_path_set_level_up(trans, iter2.path);
+ btree_path_set_level_up(trans, path2);
trans->paths_sorted = false;
@@ -2178,23 +2199,23 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
} else {
BUG_ON(btree_node_root(c, b) != b);
- ret = darray_make_room(&trans->extra_journal_entries,
+ struct jset_entry *e = bch2_trans_jset_entry_alloc(trans,
jset_u64s(new_key->k.u64s));
+ ret = PTR_ERR_OR_ZERO(e);
if (ret)
return ret;
- journal_entry_set((void *) &darray_top(trans->extra_journal_entries),
+ journal_entry_set(e,
BCH_JSET_ENTRY_btree_root,
b->c.btree_id, b->c.level,
new_key, new_key->k.u64s);
- trans->extra_journal_entries.nr += jset_u64s(new_key->k.u64s);
}
ret = bch2_trans_commit(trans, NULL, NULL, commit_flags);
if (ret)
goto err;
- bch2_btree_node_lock_write_nofail(trans, iter->path, &b->c);
+ bch2_btree_node_lock_write_nofail(trans, btree_iter_path(trans, iter), &b->c);
if (new_hash) {
mutex_lock(&c->btree_cache.lock);
@@ -2209,7 +2230,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
bkey_copy(&b->key, new_key);
}
- bch2_btree_node_unlock_write(trans, iter->path, b);
+ bch2_btree_node_unlock_write(trans, btree_iter_path(trans, iter), b);
out:
bch2_trans_iter_exit(trans, &iter2);
return ret;
@@ -2228,7 +2249,7 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
{
struct bch_fs *c = trans->c;
struct btree *new_hash = NULL;
- struct btree_path *path = iter->path;
+ struct btree_path *path = btree_iter_path(trans, iter);
struct closure cl;
int ret = 0;
@@ -2243,7 +2264,7 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
* btree_iter_traverse():
*/
if (btree_ptr_hash_val(new_key) != b->hash_val) {
- ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+ ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
if (ret) {
ret = drop_locks_do(trans, (closure_sync(&cl), 0));
if (ret)
@@ -2267,7 +2288,7 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
six_unlock_intent(&new_hash->c.lock);
}
closure_sync(&cl);
- bch2_btree_cache_cannibalize_unlock(c);
+ bch2_btree_cache_cannibalize_unlock(trans);
return ret;
}
@@ -2286,7 +2307,7 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
goto out;
/* has node been freed? */
- if (iter.path->l[b->c.level].b != b) {
+ if (btree_iter_path(trans, &iter)->l[b->c.level].b != b) {
/* node has been freed: */
BUG_ON(!btree_node_dying(b));
goto out;
@@ -2328,12 +2349,12 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id)
closure_init_stack(&cl);
do {
- ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+ ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
closure_sync(&cl);
} while (ret);
b = bch2_btree_node_mem_alloc(trans, false);
- bch2_btree_cache_cannibalize_unlock(c);
+ bch2_btree_cache_cannibalize_unlock(trans);
set_btree_node_fake(b);
set_btree_node_need_rewrite(b);
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index a6668992a2..c593c925d1 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -117,16 +117,17 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
struct btree *,
struct bkey_format);
-int bch2_btree_split_leaf(struct btree_trans *, struct btree_path *, unsigned);
+int bch2_btree_split_leaf(struct btree_trans *, btree_path_idx_t, unsigned);
-int __bch2_foreground_maybe_merge(struct btree_trans *, struct btree_path *,
+int __bch2_foreground_maybe_merge(struct btree_trans *, btree_path_idx_t,
unsigned, unsigned, enum btree_node_sibling);
static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans,
- struct btree_path *path,
+ btree_path_idx_t path_idx,
unsigned level, unsigned flags,
enum btree_node_sibling sib)
{
+ struct btree_path *path = trans->paths + path_idx;
struct btree *b;
EBUG_ON(!btree_node_locked(path, level));
@@ -135,11 +136,11 @@ static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans,
if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold)
return 0;
- return __bch2_foreground_maybe_merge(trans, path, level, flags, sib);
+ return __bch2_foreground_maybe_merge(trans, path_idx, level, flags, sib);
}
static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
- struct btree_path *path,
+ btree_path_idx_t path,
unsigned level,
unsigned flags)
{
@@ -183,21 +184,19 @@ static inline void btree_node_reset_sib_u64s(struct btree *b)
b->sib_u64s[1] = b->nr.live_u64s;
}
-static inline void *btree_data_end(struct bch_fs *c, struct btree *b)
+static inline void *btree_data_end(struct btree *b)
{
- return (void *) b->data + btree_bytes(c);
+ return (void *) b->data + btree_buf_bytes(b);
}
-static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c,
- struct btree *b)
+static inline struct bkey_packed *unwritten_whiteouts_start(struct btree *b)
{
- return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s);
+ return (void *) ((u64 *) btree_data_end(b) - b->whiteout_u64s);
}
-static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c,
- struct btree *b)
+static inline struct bkey_packed *unwritten_whiteouts_end(struct btree *b)
{
- return btree_data_end(c, b);
+ return btree_data_end(b);
}
static inline void *write_block(struct btree *b)
@@ -220,13 +219,11 @@ static inline bool bkey_written(struct btree *b, struct bkey_packed *k)
return __btree_addr_written(b, k);
}
-static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
- struct btree *b,
- void *end)
+static inline ssize_t __bch2_btree_u64s_remaining(struct btree *b, void *end)
{
ssize_t used = bset_byte_offset(b, end) / sizeof(u64) +
b->whiteout_u64s;
- ssize_t total = c->opts.btree_node_size >> 3;
+ ssize_t total = btree_buf_bytes(b) >> 3;
/* Always leave one extra u64 for bch2_varint_decode: */
used++;
@@ -234,10 +231,9 @@ static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
return total - used;
}
-static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
- struct btree *b)
+static inline size_t bch2_btree_keys_u64s_remaining(struct btree *b)
{
- ssize_t remaining = __bch_btree_u64s_remaining(c, b,
+ ssize_t remaining = __bch2_btree_u64s_remaining(b,
btree_bkey_last(b, bset_tree_last(b)));
BUG_ON(remaining < 0);
@@ -259,14 +255,13 @@ static inline unsigned btree_write_set_buffer(struct btree *b)
return 8 << BTREE_WRITE_SET_U64s_BITS;
}
-static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
- struct btree *b)
+static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct btree *b)
{
struct bset_tree *t = bset_tree_last(b);
struct btree_node_entry *bne = max(write_block(b),
(void *) btree_bkey_last(b, bset_tree_last(b)));
ssize_t remaining_space =
- __bch_btree_u64s_remaining(c, b, bne->keys.start);
+ __bch2_btree_u64s_remaining(b, bne->keys.start);
if (unlikely(bset_written(b, bset(b, t)))) {
if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
@@ -280,12 +275,11 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
return NULL;
}
-static inline void push_whiteout(struct bch_fs *c, struct btree *b,
- struct bpos pos)
+static inline void push_whiteout(struct btree *b, struct bpos pos)
{
struct bkey_packed k;
- BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s);
+ BUG_ON(bch2_btree_keys_u64s_remaining(b) < BKEY_U64s);
EBUG_ON(btree_node_just_written(b));
if (!bkey_pack_pos(&k, pos, b)) {
@@ -298,20 +292,19 @@ static inline void push_whiteout(struct bch_fs *c, struct btree *b,
k.needs_whiteout = true;
b->whiteout_u64s += k.u64s;
- bkey_p_copy(unwritten_whiteouts_start(c, b), &k);
+ bkey_p_copy(unwritten_whiteouts_start(b), &k);
}
/*
* write lock must be held on @b (else the dirty bset that we were going to
* insert into could be written out from under us)
*/
-static inline bool bch2_btree_node_insert_fits(struct bch_fs *c,
- struct btree *b, unsigned u64s)
+static inline bool bch2_btree_node_insert_fits(struct btree *b, unsigned u64s)
{
if (unlikely(btree_node_need_rewrite(b)))
return false;
- return u64s <= bch_btree_keys_u64s_remaining(c, b);
+ return u64s <= bch2_btree_keys_u64s_remaining(b);
}
void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *);
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
index 4e6241db51..ac78448619 100644
--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -7,45 +7,143 @@
#include "btree_write_buffer.h"
#include "error.h"
#include "journal.h"
+#include "journal_io.h"
#include "journal_reclaim.h"
-#include <linux/sort.h>
+#include <linux/prefetch.h>
-static int btree_write_buffered_key_cmp(const void *_l, const void *_r)
+static int bch2_btree_write_buffer_journal_flush(struct journal *,
+ struct journal_entry_pin *, u64);
+
+static int bch2_journal_keys_to_write_buffer(struct bch_fs *, struct journal_buf *);
+
+static inline bool __wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r)
{
- const struct btree_write_buffered_key *l = _l;
- const struct btree_write_buffered_key *r = _r;
+ return (cmp_int(l->hi, r->hi) ?:
+ cmp_int(l->mi, r->mi) ?:
+ cmp_int(l->lo, r->lo)) >= 0;
+}
- return cmp_int(l->btree, r->btree) ?:
- bpos_cmp(l->k.k.p, r->k.k.p) ?:
- cmp_int(l->journal_seq, r->journal_seq) ?:
- cmp_int(l->journal_offset, r->journal_offset);
+static inline bool wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r)
+{
+#ifdef CONFIG_X86_64
+ int cmp;
+
+ asm("mov (%[l]), %%rax;"
+ "sub (%[r]), %%rax;"
+ "mov 8(%[l]), %%rax;"
+ "sbb 8(%[r]), %%rax;"
+ "mov 16(%[l]), %%rax;"
+ "sbb 16(%[r]), %%rax;"
+ : "=@ccae" (cmp)
+ : [l] "r" (l), [r] "r" (r)
+ : "rax", "cc");
+
+ EBUG_ON(cmp != __wb_key_ref_cmp(l, r));
+ return cmp;
+#else
+ return __wb_key_ref_cmp(l, r);
+#endif
}
-static int btree_write_buffered_journal_cmp(const void *_l, const void *_r)
+/* Compare excluding idx, the low 24 bits: */
+static inline bool wb_key_eq(const void *_l, const void *_r)
{
- const struct btree_write_buffered_key *l = _l;
- const struct btree_write_buffered_key *r = _r;
+ const struct wb_key_ref *l = _l;
+ const struct wb_key_ref *r = _r;
- return cmp_int(l->journal_seq, r->journal_seq);
+ return !((l->hi ^ r->hi)|
+ (l->mi ^ r->mi)|
+ ((l->lo >> 24) ^ (r->lo >> 24)));
}
-static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
- struct btree_iter *iter,
- struct btree_write_buffered_key *wb,
- unsigned commit_flags,
- bool *write_locked,
- size_t *fast)
+static noinline void wb_sort(struct wb_key_ref *base, size_t num)
+{
+ size_t n = num, a = num / 2;
+
+ if (!a) /* num < 2 || size == 0 */
+ return;
+
+ for (;;) {
+ size_t b, c, d;
+
+ if (a) /* Building heap: sift down --a */
+ --a;
+ else if (--n) /* Sorting: Extract root to --n */
+ swap(base[0], base[n]);
+ else /* Sort complete */
+ break;
+
+ /*
+ * Sift element at "a" down into heap. This is the
+ * "bottom-up" variant, which significantly reduces
+ * calls to cmp_func(): we find the sift-down path all
+ * the way to the leaves (one compare per level), then
+ * backtrack to find where to insert the target element.
+ *
+ * Because elements tend to sift down close to the leaves,
+ * this uses fewer compares than doing two per level
+ * on the way down. (A bit more than half as many on
+ * average, 3/4 worst-case.)
+ */
+ for (b = a; c = 2*b + 1, (d = c + 1) < n;)
+ b = wb_key_ref_cmp(base + c, base + d) ? c : d;
+ if (d == n) /* Special case last leaf with no sibling */
+ b = c;
+
+ /* Now backtrack from "b" to the correct location for "a" */
+ while (b != a && wb_key_ref_cmp(base + a, base + b))
+ b = (b - 1) / 2;
+ c = b; /* Where "a" belongs */
+ while (b != a) { /* Shift it into place */
+ b = (b - 1) / 2;
+ swap(base[b], base[c]);
+ }
+ }
+}
+
+static noinline int wb_flush_one_slowpath(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct btree_write_buffered_key *wb)
+{
+ struct btree_path *path = btree_iter_path(trans, iter);
+
+ bch2_btree_node_unlock_write(trans, path, path->l[0].b);
+
+ trans->journal_res.seq = wb->journal_seq;
+
+ return bch2_trans_update(trans, iter, &wb->k,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_journal_res|
+ BCH_TRANS_COMMIT_journal_reclaim);
+}
+
+static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *iter,
+ struct btree_write_buffered_key *wb,
+ bool *write_locked, size_t *fast)
{
- struct bch_fs *c = trans->c;
struct btree_path *path;
int ret;
+ EBUG_ON(!wb->journal_seq);
+ EBUG_ON(!trans->c->btree_write_buffer.flushing.pin.seq);
+ EBUG_ON(trans->c->btree_write_buffer.flushing.pin.seq > wb->journal_seq);
+
ret = bch2_btree_iter_traverse(iter);
if (ret)
return ret;
- path = iter->path;
+ /*
+ * We can't clone a path that has write locks: unshare it now, before
+ * set_pos and traverse():
+ */
+ if (btree_iter_path(trans, iter)->ref > 1)
+ iter->path = __bch2_btree_path_make_mut(trans, iter->path, true, _THIS_IP_);
+
+ path = btree_iter_path(trans, iter);
if (!*write_locked) {
ret = bch2_btree_node_lock_write(trans, path, &path->l[0].b->c);
@@ -56,52 +154,14 @@ static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
*write_locked = true;
}
- if (!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s)) {
- bch2_btree_node_unlock_write(trans, path, path->l[0].b);
+ if (unlikely(!bch2_btree_node_insert_fits(path->l[0].b, wb->k.k.u64s))) {
*write_locked = false;
- goto trans_commit;
+ return wb_flush_one_slowpath(trans, iter, wb);
}
bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq);
(*fast)++;
-
- if (path->ref > 1) {
- /*
- * We can't clone a path that has write locks: if the path is
- * shared, unlock before set_pos(), traverse():
- */
- bch2_btree_node_unlock_write(trans, path, path->l[0].b);
- *write_locked = false;
- }
return 0;
-trans_commit:
- return bch2_trans_update_seq(trans, wb->journal_seq, iter, &wb->k,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
- bch2_trans_commit(trans, NULL, NULL,
- commit_flags|
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_JOURNAL_RECLAIM);
-}
-
-static union btree_write_buffer_state btree_write_buffer_switch(struct btree_write_buffer *wb)
-{
- union btree_write_buffer_state old, new;
- u64 v = READ_ONCE(wb->state.v);
-
- do {
- old.v = new.v = v;
-
- new.nr = 0;
- new.idx++;
- } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v);
-
- while (old.idx == 0 ? wb->state.ref0 : wb->state.ref1)
- cpu_relax();
-
- smp_mb();
-
- return old;
}
/*
@@ -124,41 +184,87 @@ btree_write_buffered_insert(struct btree_trans *trans,
bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k),
BTREE_ITER_CACHED|BTREE_ITER_INTENT);
+ trans->journal_res.seq = wb->journal_seq;
+
ret = bch2_btree_iter_traverse(&iter) ?:
- bch2_trans_update_seq(trans, wb->journal_seq, &iter, &wb->k,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ bch2_trans_update(trans, &iter, &wb->k,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
-int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_flags,
- bool locked)
+static void move_keys_from_inc_to_flushing(struct btree_write_buffer *wb)
+{
+ struct bch_fs *c = container_of(wb, struct bch_fs, btree_write_buffer);
+ struct journal *j = &c->journal;
+
+ if (!wb->inc.keys.nr)
+ return;
+
+ bch2_journal_pin_add(j, wb->inc.keys.data[0].journal_seq, &wb->flushing.pin,
+ bch2_btree_write_buffer_journal_flush);
+
+ darray_resize(&wb->flushing.keys, min_t(size_t, 1U << 20, wb->flushing.keys.nr + wb->inc.keys.nr));
+ darray_resize(&wb->sorted, wb->flushing.keys.size);
+
+ if (!wb->flushing.keys.nr && wb->sorted.size >= wb->inc.keys.nr) {
+ swap(wb->flushing.keys, wb->inc.keys);
+ goto out;
+ }
+
+ size_t nr = min(darray_room(wb->flushing.keys),
+ wb->sorted.size - wb->flushing.keys.nr);
+ nr = min(nr, wb->inc.keys.nr);
+
+ memcpy(&darray_top(wb->flushing.keys),
+ wb->inc.keys.data,
+ sizeof(wb->inc.keys.data[0]) * nr);
+
+ memmove(wb->inc.keys.data,
+ wb->inc.keys.data + nr,
+ sizeof(wb->inc.keys.data[0]) * (wb->inc.keys.nr - nr));
+
+ wb->flushing.keys.nr += nr;
+ wb->inc.keys.nr -= nr;
+out:
+ if (!wb->inc.keys.nr)
+ bch2_journal_pin_drop(j, &wb->inc.pin);
+ else
+ bch2_journal_pin_update(j, wb->inc.keys.data[0].journal_seq, &wb->inc.pin,
+ bch2_btree_write_buffer_journal_flush);
+
+ if (j->watermark) {
+ spin_lock(&j->lock);
+ bch2_journal_set_watermark(j);
+ spin_unlock(&j->lock);
+ }
+
+ BUG_ON(wb->sorted.size < wb->flushing.keys.nr);
+}
+
+static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
struct journal *j = &c->journal;
struct btree_write_buffer *wb = &c->btree_write_buffer;
- struct journal_entry_pin pin;
- struct btree_write_buffered_key *i, *keys;
struct btree_iter iter = { NULL };
- size_t nr = 0, skipped = 0, fast = 0, slowpath = 0;
+ size_t skipped = 0, fast = 0, slowpath = 0;
bool write_locked = false;
- union btree_write_buffer_state s;
int ret = 0;
- memset(&pin, 0, sizeof(pin));
-
- if (!locked && !mutex_trylock(&wb->flush_lock))
- return 0;
-
- bch2_journal_pin_copy(j, &pin, &wb->journal_pin, NULL);
- bch2_journal_pin_drop(j, &wb->journal_pin);
+ bch2_trans_unlock(trans);
+ bch2_trans_begin(trans);
- s = btree_write_buffer_switch(wb);
- keys = wb->keys[s.idx];
- nr = s.nr;
+ mutex_lock(&wb->inc.lock);
+ move_keys_from_inc_to_flushing(wb);
+ mutex_unlock(&wb->inc.lock);
- if (race_fault())
- goto slowpath;
+ for (size_t i = 0; i < wb->flushing.keys.nr; i++) {
+ wb->sorted.data[i].idx = i;
+ wb->sorted.data[i].btree = wb->flushing.keys.data[i].btree;
+ memcpy(&wb->sorted.data[i].pos, &wb->flushing.keys.data[i].k.k.p, sizeof(struct bpos));
+ }
+ wb->sorted.nr = wb->flushing.keys.nr;
/*
* We first sort so that we can detect and skip redundant updates, and
@@ -168,208 +274,373 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
* However, since we're not flushing in the order they appear in the
* journal we won't be able to drop our journal pin until everything is
* flushed - which means this could deadlock the journal if we weren't
- * passing BTREE_INSERT_JOURNAL_RECLAIM. This causes the update to fail
+ * passing BCH_TRANS_COMMIT_journal_reclaim. This causes the update to fail
* if it would block taking a journal reservation.
*
* If that happens, simply skip the key so we can optimistically insert
* as many keys as possible in the fast path.
*/
- sort(keys, nr, sizeof(keys[0]),
- btree_write_buffered_key_cmp, NULL);
+ wb_sort(wb->sorted.data, wb->sorted.nr);
+
+ darray_for_each(wb->sorted, i) {
+ struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx];
+
+ for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++)
+ prefetch(&wb->flushing.keys.data[n->idx]);
+
+ BUG_ON(!k->journal_seq);
+
+ if (i + 1 < &darray_top(wb->sorted) &&
+ wb_key_eq(i, i + 1)) {
+ struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx];
- for (i = keys; i < keys + nr; i++) {
- if (i + 1 < keys + nr &&
- i[0].btree == i[1].btree &&
- bpos_eq(i[0].k.k.p, i[1].k.k.p)) {
skipped++;
- i->journal_seq = 0;
+ n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq);
+ k->journal_seq = 0;
continue;
}
- if (write_locked &&
- (iter.path->btree_id != i->btree ||
- bpos_gt(i->k.k.p, iter.path->l[0].b->key.k.p))) {
- bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b);
- write_locked = false;
+ if (write_locked) {
+ struct btree_path *path = btree_iter_path(trans, &iter);
+
+ if (path->btree_id != i->btree ||
+ bpos_gt(k->k.k.p, path->l[0].b->key.k.p)) {
+ bch2_btree_node_unlock_write(trans, path, path->l[0].b);
+ write_locked = false;
+ }
}
- if (!iter.path || iter.path->btree_id != i->btree) {
+ if (!iter.path || iter.btree_id != k->btree) {
bch2_trans_iter_exit(trans, &iter);
- bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p,
+ bch2_trans_iter_init(trans, &iter, k->btree, k->k.k.p,
BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS);
}
- bch2_btree_iter_set_pos(&iter, i->k.k.p);
- iter.path->preserve = false;
+ bch2_btree_iter_set_pos(&iter, k->k.k.p);
+ btree_iter_path(trans, &iter)->preserve = false;
do {
- ret = bch2_btree_write_buffer_flush_one(trans, &iter, i,
- commit_flags, &write_locked, &fast);
+ if (race_fault()) {
+ ret = -BCH_ERR_journal_reclaim_would_deadlock;
+ break;
+ }
+
+ ret = wb_flush_one(trans, &iter, k, &write_locked, &fast);
if (!write_locked)
bch2_trans_begin(trans);
} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
- if (ret == -BCH_ERR_journal_reclaim_would_deadlock) {
+ if (!ret) {
+ k->journal_seq = 0;
+ } else if (ret == -BCH_ERR_journal_reclaim_would_deadlock) {
slowpath++;
- continue;
- }
- if (ret)
+ ret = 0;
+ } else
break;
-
- i->journal_seq = 0;
}
- if (write_locked)
- bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b);
+ if (write_locked) {
+ struct btree_path *path = btree_iter_path(trans, &iter);
+ bch2_btree_node_unlock_write(trans, path, path->l[0].b);
+ }
bch2_trans_iter_exit(trans, &iter);
- trace_write_buffer_flush(trans, nr, skipped, fast, wb->size);
-
- if (slowpath)
- goto slowpath;
+ if (ret)
+ goto err;
+ if (slowpath) {
+ /*
+ * Flush in the order the keys were present in the journal, so
+ * that we can release journal pins as we go.
+ * The fastpath zeroed journal_seq on keys it successfully flushed,
+ * so we can skip those here.
+ */
+ trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr);
+
+ darray_for_each(wb->flushing.keys, i) {
+ if (!i->journal_seq)
+ continue;
+
+ bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
+ bch2_btree_write_buffer_journal_flush);
+
+ bch2_trans_begin(trans);
+
+ ret = commit_do(trans, NULL, NULL,
+ BCH_WATERMARK_reclaim|
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_no_journal_res|
+ BCH_TRANS_COMMIT_journal_reclaim,
+ btree_write_buffered_insert(trans, i));
+ if (ret)
+ goto err;
+ }
+ }
+err:
bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
-out:
- bch2_journal_pin_drop(j, &pin);
- mutex_unlock(&wb->flush_lock);
+ trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0);
+ bch2_journal_pin_drop(j, &wb->flushing.pin);
+ wb->flushing.keys.nr = 0;
return ret;
-slowpath:
- trace_write_buffer_flush_slowpath(trans, i - keys, nr);
+}
- /*
- * Now sort the rest by journal seq and bump the journal pin as we go.
- * The slowpath zapped the seq of keys that were successfully flushed so
- * we can skip those here.
- */
- sort(keys, nr, sizeof(keys[0]),
- btree_write_buffered_journal_cmp,
- NULL);
+static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 seq)
+{
+ struct journal *j = &c->journal;
+ struct journal_buf *buf;
+ int ret = 0;
- commit_flags &= ~BCH_WATERMARK_MASK;
- commit_flags |= BCH_WATERMARK_reclaim;
+ while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, seq))) {
+ ret = bch2_journal_keys_to_write_buffer(c, buf);
+ mutex_unlock(&j->buf_lock);
+ }
- for (i = keys; i < keys + nr; i++) {
- if (!i->journal_seq)
- continue;
+ return ret;
+}
- if (i->journal_seq > pin.seq) {
- struct journal_entry_pin pin2;
+static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+ int ret = 0, fetch_from_journal_err;
- memset(&pin2, 0, sizeof(pin2));
+ do {
+ bch2_trans_unlock(trans);
- bch2_journal_pin_add(j, i->journal_seq, &pin2, NULL);
- bch2_journal_pin_drop(j, &pin);
- bch2_journal_pin_copy(j, &pin, &pin2, NULL);
- bch2_journal_pin_drop(j, &pin2);
- }
+ fetch_from_journal_err = fetch_wb_keys_from_journal(c, seq);
- ret = commit_do(trans, NULL, NULL,
- commit_flags|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_JOURNAL_RECLAIM,
- btree_write_buffered_insert(trans, i));
- if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)))
- break;
- }
+ /*
+ * On memory allocation failure, bch2_btree_write_buffer_flush_locked()
+ * is not guaranteed to empty wb->inc:
+ */
+ mutex_lock(&wb->flushing.lock);
+ ret = bch2_btree_write_buffer_flush_locked(trans);
+ mutex_unlock(&wb->flushing.lock);
+ } while (!ret &&
+ (fetch_from_journal_err ||
+ (wb->inc.pin.seq && wb->inc.pin.seq <= seq) ||
+ (wb->flushing.pin.seq && wb->flushing.pin.seq <= seq)));
- goto out;
+ return ret;
}
-int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans)
+static int bch2_btree_write_buffer_journal_flush(struct journal *j,
+ struct journal_entry_pin *_pin, u64 seq)
{
- bch2_trans_unlock(trans);
- mutex_lock(&trans->c->btree_write_buffer.flush_lock);
- return __bch2_btree_write_buffer_flush(trans, 0, true);
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+
+ return bch2_trans_run(c, btree_write_buffer_flush_seq(trans, seq));
}
-int bch2_btree_write_buffer_flush(struct btree_trans *trans)
+int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans)
{
- return __bch2_btree_write_buffer_flush(trans, 0, false);
+ struct bch_fs *c = trans->c;
+
+ trace_and_count(c, write_buffer_flush_sync, trans, _RET_IP_);
+
+ return btree_write_buffer_flush_seq(trans, journal_cur_seq(&c->journal));
}
-static int bch2_btree_write_buffer_journal_flush(struct journal *j,
- struct journal_entry_pin *_pin, u64 seq)
+int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *trans)
{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bch_fs *c = trans->c;
struct btree_write_buffer *wb = &c->btree_write_buffer;
+ int ret = 0;
- mutex_lock(&wb->flush_lock);
+ if (mutex_trylock(&wb->flushing.lock)) {
+ ret = bch2_btree_write_buffer_flush_locked(trans);
+ mutex_unlock(&wb->flushing.lock);
+ }
- return bch2_trans_run(c,
- __bch2_btree_write_buffer_flush(trans, BTREE_INSERT_NOCHECK_RW, true));
+ return ret;
}
-static inline u64 btree_write_buffer_ref(int idx)
+int bch2_btree_write_buffer_tryflush(struct btree_trans *trans)
{
- return ((union btree_write_buffer_state) {
- .ref0 = idx == 0,
- .ref1 = idx == 1,
- }).v;
+ struct bch_fs *c = trans->c;
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer))
+ return -BCH_ERR_erofs_no_writes;
+
+ int ret = bch2_btree_write_buffer_flush_nocheck_rw(trans);
+ bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
+ return ret;
}
-int bch2_btree_insert_keys_write_buffer(struct btree_trans *trans)
+static void bch2_btree_write_buffer_flush_work(struct work_struct *work)
{
- struct bch_fs *c = trans->c;
+ struct bch_fs *c = container_of(work, struct bch_fs, btree_write_buffer.flush_work);
struct btree_write_buffer *wb = &c->btree_write_buffer;
- struct btree_write_buffered_key *i;
- union btree_write_buffer_state old, new;
- int ret = 0;
- u64 v;
+ int ret;
+
+ mutex_lock(&wb->flushing.lock);
+ do {
+ ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans));
+ } while (!ret && bch2_btree_write_buffer_should_flush(c));
+ mutex_unlock(&wb->flushing.lock);
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
+}
- trans_for_each_wb_update(trans, i) {
- EBUG_ON(i->k.k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
+int bch2_journal_key_to_wb_slowpath(struct bch_fs *c,
+ struct journal_keys_to_wb *dst,
+ enum btree_id btree, struct bkey_i *k)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+ int ret;
+retry:
+ ret = darray_make_room_gfp(&dst->wb->keys, 1, GFP_KERNEL);
+ if (!ret && dst->wb == &wb->flushing)
+ ret = darray_resize(&wb->sorted, wb->flushing.keys.size);
+
+ if (unlikely(ret)) {
+ if (dst->wb == &c->btree_write_buffer.flushing) {
+ mutex_unlock(&dst->wb->lock);
+ dst->wb = &c->btree_write_buffer.inc;
+ bch2_journal_pin_add(&c->journal, dst->seq, &dst->wb->pin,
+ bch2_btree_write_buffer_journal_flush);
+ goto retry;
+ }
- i->journal_seq = trans->journal_res.seq;
- i->journal_offset = trans->journal_res.offset;
+ return ret;
}
- preempt_disable();
- v = READ_ONCE(wb->state.v);
- do {
- old.v = new.v = v;
+ dst->room = darray_room(dst->wb->keys);
+ if (dst->wb == &wb->flushing)
+ dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr);
+ BUG_ON(!dst->room);
+ BUG_ON(!dst->seq);
+
+ struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys);
+ wb_k->journal_seq = dst->seq;
+ wb_k->btree = btree;
+ bkey_copy(&wb_k->k, k);
+ dst->wb->keys.nr++;
+ dst->room--;
+ return 0;
+}
+
+void bch2_journal_keys_to_write_buffer_start(struct bch_fs *c, struct journal_keys_to_wb *dst, u64 seq)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ if (mutex_trylock(&wb->flushing.lock)) {
+ mutex_lock(&wb->inc.lock);
+ move_keys_from_inc_to_flushing(wb);
- new.v += btree_write_buffer_ref(new.idx);
- new.nr += trans->nr_wb_updates;
- if (new.nr > wb->size) {
- ret = -BCH_ERR_btree_insert_need_flush_buffer;
- goto out;
+ /*
+ * Attempt to skip wb->inc, and add keys directly to
+ * wb->flushing, saving us a copy later:
+ */
+
+ if (!wb->inc.keys.nr) {
+ dst->wb = &wb->flushing;
+ } else {
+ mutex_unlock(&wb->flushing.lock);
+ dst->wb = &wb->inc;
}
- } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v);
+ } else {
+ mutex_lock(&wb->inc.lock);
+ dst->wb = &wb->inc;
+ }
- memcpy(wb->keys[new.idx] + old.nr,
- trans->wb_updates,
- sizeof(trans->wb_updates[0]) * trans->nr_wb_updates);
+ dst->room = darray_room(dst->wb->keys);
+ if (dst->wb == &wb->flushing)
+ dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr);
+ dst->seq = seq;
- bch2_journal_pin_add(&c->journal, trans->journal_res.seq, &wb->journal_pin,
+ bch2_journal_pin_add(&c->journal, seq, &dst->wb->pin,
bch2_btree_write_buffer_journal_flush);
+}
- atomic64_sub_return_release(btree_write_buffer_ref(new.idx), &wb->state.counter);
+void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ if (!dst->wb->keys.nr)
+ bch2_journal_pin_drop(&c->journal, &dst->wb->pin);
+
+ if (bch2_btree_write_buffer_should_flush(c) &&
+ __bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer) &&
+ !queue_work(system_unbound_wq, &c->btree_write_buffer.flush_work))
+ bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
+
+ if (dst->wb == &wb->flushing)
+ mutex_unlock(&wb->flushing.lock);
+ mutex_unlock(&wb->inc.lock);
+}
+
+static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf)
+{
+ struct journal_keys_to_wb dst;
+ struct jset_entry *entry;
+ struct bkey_i *k;
+ int ret = 0;
+
+ bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq));
+
+ for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) {
+ jset_entry_for_each_key(entry, k) {
+ ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k);
+ if (ret)
+ goto out;
+ }
+
+ entry->type = BCH_JSET_ENTRY_btree_keys;
+ }
+
+ buf->need_flush_to_write_buffer = false;
out:
- preempt_enable();
+ bch2_journal_keys_to_write_buffer_end(c, &dst);
+ return ret;
+}
+
+static int wb_keys_resize(struct btree_write_buffer_keys *wb, size_t new_size)
+{
+ if (wb->keys.size >= new_size)
+ return 0;
+
+ if (!mutex_trylock(&wb->lock))
+ return -EINTR;
+
+ int ret = darray_resize(&wb->keys, new_size);
+ mutex_unlock(&wb->lock);
return ret;
}
+int bch2_btree_write_buffer_resize(struct bch_fs *c, size_t new_size)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ return wb_keys_resize(&wb->flushing, new_size) ?:
+ wb_keys_resize(&wb->inc, new_size);
+}
+
void bch2_fs_btree_write_buffer_exit(struct bch_fs *c)
{
struct btree_write_buffer *wb = &c->btree_write_buffer;
- BUG_ON(wb->state.nr && !bch2_journal_error(&c->journal));
+ BUG_ON((wb->inc.keys.nr || wb->flushing.keys.nr) &&
+ !bch2_journal_error(&c->journal));
- kvfree(wb->keys[1]);
- kvfree(wb->keys[0]);
+ darray_exit(&wb->sorted);
+ darray_exit(&wb->flushing.keys);
+ darray_exit(&wb->inc.keys);
}
int bch2_fs_btree_write_buffer_init(struct bch_fs *c)
{
struct btree_write_buffer *wb = &c->btree_write_buffer;
- mutex_init(&wb->flush_lock);
- wb->size = c->opts.btree_write_buffer_size;
+ mutex_init(&wb->inc.lock);
+ mutex_init(&wb->flushing.lock);
+ INIT_WORK(&wb->flush_work, bch2_btree_write_buffer_flush_work);
- wb->keys[0] = kvmalloc_array(wb->size, sizeof(*wb->keys[0]), GFP_KERNEL);
- wb->keys[1] = kvmalloc_array(wb->size, sizeof(*wb->keys[1]), GFP_KERNEL);
- if (!wb->keys[0] || !wb->keys[1])
- return -BCH_ERR_ENOMEM_fs_btree_write_buffer_init;
+ /* Will be resized by journal as needed: */
+ unsigned initial_size = 1 << 16;
- return 0;
+ return darray_make_room(&wb->inc.keys, initial_size) ?:
+ darray_make_room(&wb->flushing.keys, initial_size) ?:
+ darray_make_room(&wb->sorted, initial_size);
}
diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h
index 322df1c830..eebcd2b152 100644
--- a/fs/bcachefs/btree_write_buffer.h
+++ b/fs/bcachefs/btree_write_buffer.h
@@ -2,12 +2,59 @@
#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_H
#define _BCACHEFS_BTREE_WRITE_BUFFER_H
-int __bch2_btree_write_buffer_flush(struct btree_trans *, unsigned, bool);
+#include "bkey.h"
+
+static inline bool bch2_btree_write_buffer_should_flush(struct bch_fs *c)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ return wb->inc.keys.nr + wb->flushing.keys.nr > wb->inc.keys.size / 4;
+}
+
+static inline bool bch2_btree_write_buffer_must_wait(struct bch_fs *c)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ return wb->inc.keys.nr > wb->inc.keys.size * 3 / 4;
+}
+
+struct btree_trans;
int bch2_btree_write_buffer_flush_sync(struct btree_trans *);
-int bch2_btree_write_buffer_flush(struct btree_trans *);
+int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *);
+int bch2_btree_write_buffer_tryflush(struct btree_trans *);
+
+struct journal_keys_to_wb {
+ struct btree_write_buffer_keys *wb;
+ size_t room;
+ u64 seq;
+};
+
+int bch2_journal_key_to_wb_slowpath(struct bch_fs *,
+ struct journal_keys_to_wb *,
+ enum btree_id, struct bkey_i *);
+
+static inline int bch2_journal_key_to_wb(struct bch_fs *c,
+ struct journal_keys_to_wb *dst,
+ enum btree_id btree, struct bkey_i *k)
+{
+ EBUG_ON(!dst->seq);
+
+ if (unlikely(!dst->room))
+ return bch2_journal_key_to_wb_slowpath(c, dst, btree, k);
+
+ struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys);
+ wb_k->journal_seq = dst->seq;
+ wb_k->btree = btree;
+ bkey_copy(&wb_k->k, k);
+ dst->wb->keys.nr++;
+ dst->room--;
+ return 0;
+}
-int bch2_btree_insert_keys_write_buffer(struct btree_trans *);
+void bch2_journal_keys_to_write_buffer_start(struct bch_fs *, struct journal_keys_to_wb *, u64);
+void bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_to_wb *);
+int bch2_btree_write_buffer_resize(struct bch_fs *, size_t);
void bch2_fs_btree_write_buffer_exit(struct bch_fs *);
int bch2_fs_btree_write_buffer_init(struct bch_fs *);
diff --git a/fs/bcachefs/btree_write_buffer_types.h b/fs/bcachefs/btree_write_buffer_types.h
index 99993ba77a..9b9433de9c 100644
--- a/fs/bcachefs/btree_write_buffer_types.h
+++ b/fs/bcachefs/btree_write_buffer_types.h
@@ -2,43 +2,56 @@
#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
#define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
+#include "darray.h"
#include "journal_types.h"
#define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4
#define BTREE_WRITE_BUFERED_U64s_MAX (BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX)
-struct btree_write_buffered_key {
- u64 journal_seq;
- unsigned journal_offset;
- enum btree_id btree;
- __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX);
-};
-
-union btree_write_buffer_state {
+struct wb_key_ref {
+union {
struct {
- atomic64_t counter;
- };
-
- struct {
- u64 v;
- };
-
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ unsigned idx:24;
+ u8 pos[sizeof(struct bpos)];
+ enum btree_id btree:8;
+#else
+ enum btree_id btree:8;
+ u8 pos[sizeof(struct bpos)];
+ unsigned idx:24;
+#endif
+ } __packed;
struct {
- u64 nr:23;
- u64 idx:1;
- u64 ref0:20;
- u64 ref1:20;
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ u64 lo;
+ u64 mi;
+ u64 hi;
+#else
+ u64 hi;
+ u64 mi;
+ u64 lo;
+#endif
};
};
+};
-struct btree_write_buffer {
- struct mutex flush_lock;
- struct journal_entry_pin journal_pin;
+struct btree_write_buffered_key {
+ enum btree_id btree:8;
+ u64 journal_seq:56;
+ __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX);
+};
- union btree_write_buffer_state state;
- size_t size;
+struct btree_write_buffer_keys {
+ DARRAY(struct btree_write_buffered_key) keys;
+ struct journal_entry_pin pin;
+ struct mutex lock;
+};
- struct btree_write_buffered_key *keys[2];
+struct btree_write_buffer {
+ DARRAY(struct wb_key_ref) sorted;
+ struct btree_write_buffer_keys inc;
+ struct btree_write_buffer_keys flushing;
+ struct work_struct flush_work;
};
#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 5a91d3189f..54f7826ac4 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -25,7 +25,7 @@
#include <linux/preempt.h>
-static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage,
+static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage,
enum bch_data_type data_type,
s64 sectors)
{
@@ -47,31 +47,27 @@ static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage,
void bch2_fs_usage_initialize(struct bch_fs *c)
{
- struct bch_fs_usage *usage;
- struct bch_dev *ca;
- unsigned i;
-
percpu_down_write(&c->mark_lock);
- usage = c->usage_base;
+ struct bch_fs_usage *usage = c->usage_base;
- for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+ for (unsigned i = 0; i < ARRAY_SIZE(c->usage); i++)
bch2_fs_usage_acc_to_base(c, i);
- for (i = 0; i < BCH_REPLICAS_MAX; i++)
- usage->reserved += usage->persistent_reserved[i];
+ for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++)
+ usage->b.reserved += usage->persistent_reserved[i];
- for (i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry *e =
+ for (unsigned i = 0; i < c->replicas.nr; i++) {
+ struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
- fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]);
+ fs_usage_data_type_to_base(&usage->b, e->data_type, usage->replicas[i]);
}
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
struct bch_dev_usage dev = bch2_dev_usage_read(ca);
- usage->hidden += (dev.d[BCH_DATA_sb].buckets +
- dev.d[BCH_DATA_journal].buckets) *
+ usage->b.hidden += (dev.d[BCH_DATA_sb].buckets +
+ dev.d[BCH_DATA_journal].buckets) *
ca->mi.bucket_size;
}
@@ -158,8 +154,7 @@ retry:
void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
{
- struct bch_dev *ca;
- unsigned i, u64s = fs_usage_u64s(c);
+ unsigned u64s = fs_usage_u64s(c);
BUG_ON(idx >= ARRAY_SIZE(c->usage));
@@ -171,7 +166,7 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
percpu_memset(c->usage[idx], 0, u64s * sizeof(u64));
rcu_read_lock();
- for_each_member_device_rcu(ca, c, i, NULL) {
+ for_each_member_device_rcu(c, ca, NULL) {
u64s = dev_usage_u64s();
acc_u64s_percpu((u64 *) ca->usage_base,
@@ -193,15 +188,15 @@ void bch2_fs_usage_to_text(struct printbuf *out,
prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity);
prt_printf(out, "hidden:\t\t\t\t%llu\n",
- fs_usage->u.hidden);
+ fs_usage->u.b.hidden);
prt_printf(out, "data:\t\t\t\t%llu\n",
- fs_usage->u.data);
+ fs_usage->u.b.data);
prt_printf(out, "cached:\t\t\t\t%llu\n",
- fs_usage->u.cached);
+ fs_usage->u.b.cached);
prt_printf(out, "reserved:\t\t\t%llu\n",
- fs_usage->u.reserved);
+ fs_usage->u.b.reserved);
prt_printf(out, "nr_inodes:\t\t\t%llu\n",
- fs_usage->u.nr_inodes);
+ fs_usage->u.b.nr_inodes);
prt_printf(out, "online reserved:\t\t%llu\n",
fs_usage->online_reserved);
@@ -214,7 +209,7 @@ void bch2_fs_usage_to_text(struct printbuf *out,
}
for (i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry *e =
+ struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
prt_printf(out, "\t");
@@ -230,10 +225,10 @@ static u64 reserve_factor(u64 r)
u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage)
{
- return min(fs_usage->u.hidden +
- fs_usage->u.btree +
- fs_usage->u.data +
- reserve_factor(fs_usage->u.reserved +
+ return min(fs_usage->u.b.hidden +
+ fs_usage->u.b.btree +
+ fs_usage->u.b.data +
+ reserve_factor(fs_usage->u.b.reserved +
fs_usage->online_reserved),
c->capacity);
}
@@ -245,17 +240,17 @@ __bch2_fs_usage_read_short(struct bch_fs *c)
u64 data, reserved;
ret.capacity = c->capacity -
- bch2_fs_usage_read_one(c, &c->usage_base->hidden);
+ bch2_fs_usage_read_one(c, &c->usage_base->b.hidden);
- data = bch2_fs_usage_read_one(c, &c->usage_base->data) +
- bch2_fs_usage_read_one(c, &c->usage_base->btree);
- reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) +
+ data = bch2_fs_usage_read_one(c, &c->usage_base->b.data) +
+ bch2_fs_usage_read_one(c, &c->usage_base->b.btree);
+ reserved = bch2_fs_usage_read_one(c, &c->usage_base->b.reserved) +
percpu_u64_get(c->online_reserved);
ret.used = min(ret.capacity, data + reserve_factor(reserved));
ret.free = ret.capacity - ret.used;
- ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes);
+ ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->b.nr_inodes);
return ret;
}
@@ -277,18 +272,34 @@ void bch2_dev_usage_init(struct bch_dev *ca)
ca->usage_base->d[BCH_DATA_free].buckets = ca->mi.nbuckets - ca->mi.first_bucket;
}
-static inline int bucket_sectors_fragmented(struct bch_dev *ca,
- struct bch_alloc_v4 a)
+void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage)
{
- return a.dirty_sectors
- ? max(0, (int) ca->mi.bucket_size - (int) a.dirty_sectors)
- : 0;
+ prt_tab(out);
+ prt_str(out, "buckets");
+ prt_tab_rjust(out);
+ prt_str(out, "sectors");
+ prt_tab_rjust(out);
+ prt_str(out, "fragmented");
+ prt_tab_rjust(out);
+ prt_newline(out);
+
+ for (unsigned i = 0; i < BCH_DATA_NR; i++) {
+ bch2_prt_data_type(out, i);
+ prt_tab(out);
+ prt_u64(out, usage->d[i].buckets);
+ prt_tab_rjust(out);
+ prt_u64(out, usage->d[i].sectors);
+ prt_tab_rjust(out);
+ prt_u64(out, usage->d[i].fragmented);
+ prt_tab_rjust(out);
+ prt_newline(out);
+ }
}
-static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
- struct bch_alloc_v4 old,
- struct bch_alloc_v4 new,
- u64 journal_seq, bool gc)
+void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
+ const struct bch_alloc_v4 *old,
+ const struct bch_alloc_v4 *new,
+ u64 journal_seq, bool gc)
{
struct bch_fs_usage *fs_usage;
struct bch_dev_usage *u;
@@ -296,56 +307,51 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
preempt_disable();
fs_usage = fs_usage_ptr(c, journal_seq, gc);
- if (data_type_is_hidden(old.data_type))
- fs_usage->hidden -= ca->mi.bucket_size;
- if (data_type_is_hidden(new.data_type))
- fs_usage->hidden += ca->mi.bucket_size;
+ if (data_type_is_hidden(old->data_type))
+ fs_usage->b.hidden -= ca->mi.bucket_size;
+ if (data_type_is_hidden(new->data_type))
+ fs_usage->b.hidden += ca->mi.bucket_size;
u = dev_usage_ptr(ca, journal_seq, gc);
- u->d[old.data_type].buckets--;
- u->d[new.data_type].buckets++;
-
- u->buckets_ec -= (int) !!old.stripe;
- u->buckets_ec += (int) !!new.stripe;
+ u->d[old->data_type].buckets--;
+ u->d[new->data_type].buckets++;
- u->d[old.data_type].sectors -= old.dirty_sectors;
- u->d[new.data_type].sectors += new.dirty_sectors;
+ u->d[old->data_type].sectors -= bch2_bucket_sectors_dirty(*old);
+ u->d[new->data_type].sectors += bch2_bucket_sectors_dirty(*new);
- u->d[BCH_DATA_cached].sectors += new.cached_sectors;
- u->d[BCH_DATA_cached].sectors -= old.cached_sectors;
+ u->d[BCH_DATA_cached].sectors += new->cached_sectors;
+ u->d[BCH_DATA_cached].sectors -= old->cached_sectors;
- u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old);
- u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new);
+ u->d[old->data_type].fragmented -= bch2_bucket_sectors_fragmented(ca, *old);
+ u->d[new->data_type].fragmented += bch2_bucket_sectors_fragmented(ca, *new);
preempt_enable();
}
-static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca,
- struct bucket old, struct bucket new,
- u64 journal_seq, bool gc)
+static inline struct bch_alloc_v4 bucket_m_to_alloc(struct bucket b)
{
- struct bch_alloc_v4 old_a = {
- .gen = old.gen,
- .data_type = old.data_type,
- .dirty_sectors = old.dirty_sectors,
- .cached_sectors = old.cached_sectors,
- .stripe = old.stripe,
- };
- struct bch_alloc_v4 new_a = {
- .gen = new.gen,
- .data_type = new.data_type,
- .dirty_sectors = new.dirty_sectors,
- .cached_sectors = new.cached_sectors,
- .stripe = new.stripe,
+ return (struct bch_alloc_v4) {
+ .gen = b.gen,
+ .data_type = b.data_type,
+ .dirty_sectors = b.dirty_sectors,
+ .cached_sectors = b.cached_sectors,
+ .stripe = b.stripe,
};
+}
- bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc);
+void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca,
+ struct bucket *old, struct bucket *new)
+{
+ struct bch_alloc_v4 old_a = bucket_m_to_alloc(*old);
+ struct bch_alloc_v4 new_a = bucket_m_to_alloc(*new);
+
+ bch2_dev_usage_update(c, ca, &old_a, &new_a, 0, true);
}
static inline int __update_replicas(struct bch_fs *c,
struct bch_fs_usage *fs_usage,
- struct bch_replicas_entry *r,
+ struct bch_replicas_entry_v1 *r,
s64 sectors)
{
int idx = bch2_replicas_entry_idx(c, r);
@@ -353,14 +359,14 @@ static inline int __update_replicas(struct bch_fs *c,
if (idx < 0)
return -1;
- fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
+ fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors);
fs_usage->replicas[idx] += sectors;
return 0;
}
-static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k,
- struct bch_replicas_entry *r, s64 sectors,
- unsigned journal_seq, bool gc)
+int bch2_update_replicas(struct bch_fs *c, struct bkey_s_c k,
+ struct bch_replicas_entry_v1 *r, s64 sectors,
+ unsigned journal_seq, bool gc)
{
struct bch_fs_usage *fs_usage;
int idx, ret = 0;
@@ -388,7 +394,7 @@ static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k,
preempt_disable();
fs_usage = fs_usage_ptr(c, journal_seq, gc);
- fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
+ fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors);
fs_usage->replicas[idx] += sectors;
preempt_enable();
err:
@@ -407,7 +413,7 @@ static inline int update_cached_sectors(struct bch_fs *c,
bch2_replicas_entry_cached(&r.e, dev);
- return update_replicas(c, k, &r.e, sectors, journal_seq, gc);
+ return bch2_update_replicas(c, k, &r.e, sectors, journal_seq, gc);
}
static int __replicas_deltas_realloc(struct btree_trans *trans, unsigned more,
@@ -453,9 +459,9 @@ int bch2_replicas_deltas_realloc(struct btree_trans *trans, unsigned more)
__replicas_deltas_realloc(trans, more, _gfp));
}
-static inline int update_replicas_list(struct btree_trans *trans,
- struct bch_replicas_entry *r,
- s64 sectors)
+int bch2_update_replicas_list(struct btree_trans *trans,
+ struct bch_replicas_entry_v1 *r,
+ s64 sectors)
{
struct replicas_delta_list *d;
struct replicas_delta *n;
@@ -481,139 +487,13 @@ static inline int update_replicas_list(struct btree_trans *trans,
return 0;
}
-static inline int update_cached_sectors_list(struct btree_trans *trans,
- unsigned dev, s64 sectors)
+int bch2_update_cached_sectors_list(struct btree_trans *trans, unsigned dev, s64 sectors)
{
struct bch_replicas_padded r;
bch2_replicas_entry_cached(&r.e, dev);
- return update_replicas_list(trans, &r.e, sectors);
-}
-
-int bch2_mark_alloc(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
-{
- bool gc = flags & BTREE_TRIGGER_GC;
- u64 journal_seq = trans->journal_res.seq;
- u64 bucket_journal_seq;
- struct bch_fs *c = trans->c;
- struct bch_alloc_v4 old_a_convert, new_a_convert;
- const struct bch_alloc_v4 *old_a, *new_a;
- struct bch_dev *ca;
- int ret = 0;
-
- /*
- * alloc btree is read in by bch2_alloc_read, not gc:
- */
- if ((flags & BTREE_TRIGGER_GC) &&
- !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE))
- return 0;
-
- if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans,
- "alloc key for invalid device or bucket"))
- return -EIO;
-
- ca = bch_dev_bkey_exists(c, new.k->p.inode);
-
- old_a = bch2_alloc_to_v4(old, &old_a_convert);
- new_a = bch2_alloc_to_v4(new, &new_a_convert);
-
- bucket_journal_seq = new_a->journal_seq;
-
- if ((flags & BTREE_TRIGGER_INSERT) &&
- data_type_is_empty(old_a->data_type) !=
- data_type_is_empty(new_a->data_type) &&
- new.k->type == KEY_TYPE_alloc_v4) {
- struct bch_alloc_v4 *v = (struct bch_alloc_v4 *) new.v;
-
- EBUG_ON(!journal_seq);
-
- /*
- * If the btree updates referring to a bucket weren't flushed
- * before the bucket became empty again, then the we don't have
- * to wait on a journal flush before we can reuse the bucket:
- */
- v->journal_seq = bucket_journal_seq =
- data_type_is_empty(new_a->data_type) &&
- (journal_seq == v->journal_seq ||
- bch2_journal_noflush_seq(&c->journal, v->journal_seq))
- ? 0 : journal_seq;
- }
-
- if (!data_type_is_empty(old_a->data_type) &&
- data_type_is_empty(new_a->data_type) &&
- bucket_journal_seq) {
- ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
- c->journal.flushed_seq_ondisk,
- new.k->p.inode, new.k->p.offset,
- bucket_journal_seq);
- if (ret) {
- bch2_fs_fatal_error(c,
- "error setting bucket_needs_journal_commit: %i", ret);
- return ret;
- }
- }
-
- percpu_down_read(&c->mark_lock);
- if (!gc && new_a->gen != old_a->gen)
- *bucket_gen(ca, new.k->p.offset) = new_a->gen;
-
- bch2_dev_usage_update(c, ca, *old_a, *new_a, journal_seq, gc);
-
- if (gc) {
- struct bucket *g = gc_bucket(ca, new.k->p.offset);
-
- bucket_lock(g);
-
- g->gen_valid = 1;
- g->gen = new_a->gen;
- g->data_type = new_a->data_type;
- g->stripe = new_a->stripe;
- g->stripe_redundancy = new_a->stripe_redundancy;
- g->dirty_sectors = new_a->dirty_sectors;
- g->cached_sectors = new_a->cached_sectors;
-
- bucket_unlock(g);
- }
- percpu_up_read(&c->mark_lock);
-
- /*
- * need to know if we're getting called from the invalidate path or
- * not:
- */
-
- if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
- old_a->cached_sectors) {
- ret = update_cached_sectors(c, new, ca->dev_idx,
- -((s64) old_a->cached_sectors),
- journal_seq, gc);
- if (ret) {
- bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors",
- __func__);
- return ret;
- }
- }
-
- if (new_a->data_type == BCH_DATA_free &&
- (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk))
- closure_wake_up(&c->freelist_wait);
-
- if (new_a->data_type == BCH_DATA_need_discard &&
- (!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk))
- bch2_do_discards(c);
-
- if (old_a->data_type != BCH_DATA_cached &&
- new_a->data_type == BCH_DATA_cached &&
- should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
- bch2_do_invalidates(c);
-
- if (new_a->data_type == BCH_DATA_need_gc_gens)
- bch2_do_gc_gens(c);
-
- return 0;
+ return bch2_update_replicas_list(trans, &r.e, sectors);
}
int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
@@ -643,8 +523,8 @@ int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
if (bch2_fs_inconsistent_on(g->data_type &&
g->data_type != data_type, c,
"different types of data in same bucket: %s, %s",
- bch2_data_types[g->data_type],
- bch2_data_types[data_type])) {
+ bch2_data_type_str(g->data_type),
+ bch2_data_type_str(data_type))) {
ret = -EIO;
goto err;
}
@@ -652,37 +532,33 @@ int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c,
"bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > bucket size",
ca->dev_idx, b, g->gen,
- bch2_data_types[g->data_type ?: data_type],
+ bch2_data_type_str(g->data_type ?: data_type),
g->dirty_sectors, sectors)) {
ret = -EIO;
goto err;
}
-
g->data_type = data_type;
g->dirty_sectors += sectors;
new = *g;
err:
bucket_unlock(g);
if (!ret)
- bch2_dev_usage_update_m(c, ca, old, new, 0, true);
+ bch2_dev_usage_update_m(c, ca, &old, &new);
percpu_up_read(&c->mark_lock);
return ret;
}
-static int check_bucket_ref(struct btree_trans *trans,
- struct bkey_s_c k,
- const struct bch_extent_ptr *ptr,
- s64 sectors, enum bch_data_type ptr_data_type,
- u8 b_gen, u8 bucket_data_type,
- u32 dirty_sectors, u32 cached_sectors)
+int bch2_check_bucket_ref(struct btree_trans *trans,
+ struct bkey_s_c k,
+ const struct bch_extent_ptr *ptr,
+ s64 sectors, enum bch_data_type ptr_data_type,
+ u8 b_gen, u8 bucket_data_type,
+ u32 bucket_sectors)
{
struct bch_fs *c = trans->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
size_t bucket_nr = PTR_BUCKET_NR(ca, ptr);
- u32 bucket_sectors = !ptr->cached
- ? dirty_sectors
- : cached_sectors;
struct printbuf buf = PRINTBUF;
int ret = 0;
@@ -699,7 +575,7 @@ static int check_bucket_ref(struct btree_trans *trans,
"bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
- bch2_data_types[bucket_data_type ?: ptr_data_type],
+ bch2_data_type_str(bucket_data_type ?: ptr_data_type),
ptr->gen,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
ret = -EIO;
@@ -712,7 +588,7 @@ static int check_bucket_ref(struct btree_trans *trans,
"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
- bch2_data_types[bucket_data_type ?: ptr_data_type],
+ bch2_data_type_str(bucket_data_type ?: ptr_data_type),
ptr->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
@@ -727,7 +603,7 @@ static int check_bucket_ref(struct btree_trans *trans,
"while marking %s",
ptr->dev, bucket_nr, b_gen,
*bucket_gen(ca, bucket_nr),
- bch2_data_types[bucket_data_type ?: ptr_data_type],
+ bch2_data_type_str(bucket_data_type ?: ptr_data_type),
ptr->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
@@ -748,8 +624,8 @@ static int check_bucket_ref(struct btree_trans *trans,
"bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
- bch2_data_types[bucket_data_type],
- bch2_data_types[ptr_data_type],
+ bch2_data_type_str(bucket_data_type),
+ bch2_data_type_str(ptr_data_type),
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
ret = -EIO;
@@ -762,7 +638,7 @@ static int check_bucket_ref(struct btree_trans *trans,
"bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
- bch2_data_types[bucket_data_type ?: ptr_data_type],
+ bch2_data_type_str(bucket_data_type ?: ptr_data_type),
bucket_sectors, sectors,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
@@ -777,508 +653,6 @@ err:
goto out;
}
-static int mark_stripe_bucket(struct btree_trans *trans,
- struct bkey_s_c k,
- unsigned ptr_idx,
- unsigned flags)
-{
- struct bch_fs *c = trans->c;
- u64 journal_seq = trans->journal_res.seq;
- const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
- unsigned nr_data = s->nr_blocks - s->nr_redundant;
- bool parity = ptr_idx >= nr_data;
- enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe;
- s64 sectors = parity ? le16_to_cpu(s->sectors) : 0;
- const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx;
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket old, new, *g;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- BUG_ON(!(flags & BTREE_TRIGGER_GC));
-
- /* * XXX doesn't handle deletion */
-
- percpu_down_read(&c->mark_lock);
- g = PTR_GC_BUCKET(ca, ptr);
-
- if (g->dirty_sectors ||
- (g->stripe && g->stripe != k.k->p.offset)) {
- bch2_fs_inconsistent(c,
- "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
- ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen,
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- ret = -EINVAL;
- goto err;
- }
-
- bucket_lock(g);
- old = *g;
-
- ret = check_bucket_ref(trans, k, ptr, sectors, data_type,
- g->gen, g->data_type,
- g->dirty_sectors, g->cached_sectors);
- if (ret)
- goto err;
-
- g->data_type = data_type;
- g->dirty_sectors += sectors;
-
- g->stripe = k.k->p.offset;
- g->stripe_redundancy = s->nr_redundant;
- new = *g;
-err:
- bucket_unlock(g);
- if (!ret)
- bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true);
- percpu_up_read(&c->mark_lock);
- printbuf_exit(&buf);
- return ret;
-}
-
-static int __mark_pointer(struct btree_trans *trans,
- struct bkey_s_c k,
- const struct bch_extent_ptr *ptr,
- s64 sectors, enum bch_data_type ptr_data_type,
- u8 bucket_gen, u8 *bucket_data_type,
- u32 *dirty_sectors, u32 *cached_sectors)
-{
- u32 *dst_sectors = !ptr->cached
- ? dirty_sectors
- : cached_sectors;
- int ret = check_bucket_ref(trans, k, ptr, sectors, ptr_data_type,
- bucket_gen, *bucket_data_type,
- *dirty_sectors, *cached_sectors);
-
- if (ret)
- return ret;
-
- *dst_sectors += sectors;
-
- if (!*dirty_sectors && !*cached_sectors)
- *bucket_data_type = 0;
- else if (*bucket_data_type != BCH_DATA_stripe)
- *bucket_data_type = ptr_data_type;
-
- return 0;
-}
-
-static int bch2_mark_pointer(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k,
- struct extent_ptr_decoded p,
- s64 sectors,
- unsigned flags)
-{
- u64 journal_seq = trans->journal_res.seq;
- struct bch_fs *c = trans->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
- struct bucket old, new, *g;
- enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p);
- u8 bucket_data_type;
- int ret = 0;
-
- BUG_ON(!(flags & BTREE_TRIGGER_GC));
-
- percpu_down_read(&c->mark_lock);
- g = PTR_GC_BUCKET(ca, &p.ptr);
- bucket_lock(g);
- old = *g;
-
- bucket_data_type = g->data_type;
- ret = __mark_pointer(trans, k, &p.ptr, sectors,
- data_type, g->gen,
- &bucket_data_type,
- &g->dirty_sectors,
- &g->cached_sectors);
- if (!ret)
- g->data_type = bucket_data_type;
-
- new = *g;
- bucket_unlock(g);
- if (!ret)
- bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true);
- percpu_up_read(&c->mark_lock);
-
- return ret;
-}
-
-static int bch2_mark_stripe_ptr(struct btree_trans *trans,
- struct bkey_s_c k,
- struct bch_extent_stripe_ptr p,
- enum bch_data_type data_type,
- s64 sectors,
- unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct bch_replicas_padded r;
- struct gc_stripe *m;
-
- BUG_ON(!(flags & BTREE_TRIGGER_GC));
-
- m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL);
- if (!m) {
- bch_err(c, "error allocating memory for gc_stripes, idx %llu",
- (u64) p.idx);
- return -BCH_ERR_ENOMEM_mark_stripe_ptr;
- }
-
- mutex_lock(&c->ec_stripes_heap_lock);
-
- if (!m || !m->alive) {
- mutex_unlock(&c->ec_stripes_heap_lock);
- bch_err_ratelimited(c, "pointer to nonexistent stripe %llu",
- (u64) p.idx);
- bch2_inconsistent_error(c);
- return -EIO;
- }
-
- m->block_sectors[p.block] += sectors;
-
- r = m->r;
- mutex_unlock(&c->ec_stripes_heap_lock);
-
- r.e.data_type = data_type;
- update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true);
-
- return 0;
-}
-
-static int __mark_extent(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, unsigned flags)
-{
- u64 journal_seq = trans->journal_res.seq;
- struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- struct bch_replicas_padded r;
- enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
- ? BCH_DATA_btree
- : BCH_DATA_user;
- s64 sectors = bkey_is_btree_ptr(k.k)
- ? btree_sectors(c)
- : k.k->size;
- s64 dirty_sectors = 0;
- bool stale;
- int ret;
-
- BUG_ON(!(flags & BTREE_TRIGGER_GC));
-
- r.e.data_type = data_type;
- r.e.nr_devs = 0;
- r.e.nr_required = 1;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- s64 disk_sectors = ptr_disk_sectors(sectors, p);
-
- if (flags & BTREE_TRIGGER_OVERWRITE)
- disk_sectors = -disk_sectors;
-
- ret = bch2_mark_pointer(trans, btree_id, level, k, p, disk_sectors, flags);
- if (ret < 0)
- return ret;
-
- stale = ret > 0;
-
- if (p.ptr.cached) {
- if (!stale) {
- ret = update_cached_sectors(c, k, p.ptr.dev,
- disk_sectors, journal_seq, true);
- if (ret) {
- bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors",
- __func__);
- return ret;
- }
- }
- } else if (!p.has_ec) {
- dirty_sectors += disk_sectors;
- r.e.devs[r.e.nr_devs++] = p.ptr.dev;
- } else {
- ret = bch2_mark_stripe_ptr(trans, k, p.ec, data_type,
- disk_sectors, flags);
- if (ret)
- return ret;
-
- /*
- * There may be other dirty pointers in this extent, but
- * if so they're not required for mounting if we have an
- * erasure coded pointer in this extent:
- */
- r.e.nr_required = 0;
- }
- }
-
- if (r.e.nr_devs) {
- ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, true);
- if (ret) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, k);
- bch2_fs_fatal_error(c, "%s(): no replicas entry for %s", __func__, buf.buf);
- printbuf_exit(&buf);
- return ret;
- }
- }
-
- return 0;
-}
-
-int bch2_mark_extent(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
-{
- return mem_trigger_run_overwrite_then_insert(__mark_extent, trans, btree_id, level, old, new, flags);
-}
-
-int bch2_mark_stripe(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
-{
- bool gc = flags & BTREE_TRIGGER_GC;
- u64 journal_seq = trans->journal_res.seq;
- struct bch_fs *c = trans->c;
- u64 idx = new.k->p.offset;
- const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
- ? bkey_s_c_to_stripe(old).v : NULL;
- const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
- ? bkey_s_c_to_stripe(new).v : NULL;
- unsigned i;
- int ret;
-
- BUG_ON(gc && old_s);
-
- if (!gc) {
- struct stripe *m = genradix_ptr(&c->stripes, idx);
-
- if (!m) {
- struct printbuf buf1 = PRINTBUF;
- struct printbuf buf2 = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf1, c, old);
- bch2_bkey_val_to_text(&buf2, c, new);
- bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n"
- "old %s\n"
- "new %s", idx, buf1.buf, buf2.buf);
- printbuf_exit(&buf2);
- printbuf_exit(&buf1);
- bch2_inconsistent_error(c);
- return -1;
- }
-
- if (!new_s) {
- bch2_stripes_heap_del(c, m, idx);
-
- memset(m, 0, sizeof(*m));
- } else {
- m->sectors = le16_to_cpu(new_s->sectors);
- m->algorithm = new_s->algorithm;
- m->nr_blocks = new_s->nr_blocks;
- m->nr_redundant = new_s->nr_redundant;
- m->blocks_nonempty = 0;
-
- for (i = 0; i < new_s->nr_blocks; i++)
- m->blocks_nonempty += !!stripe_blockcount_get(new_s, i);
-
- if (!old_s)
- bch2_stripes_heap_insert(c, m, idx);
- else
- bch2_stripes_heap_update(c, m, idx);
- }
- } else {
- struct gc_stripe *m =
- genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
-
- if (!m) {
- bch_err(c, "error allocating memory for gc_stripes, idx %llu",
- idx);
- return -BCH_ERR_ENOMEM_mark_stripe;
- }
- /*
- * This will be wrong when we bring back runtime gc: we should
- * be unmarking the old key and then marking the new key
- */
- m->alive = true;
- m->sectors = le16_to_cpu(new_s->sectors);
- m->nr_blocks = new_s->nr_blocks;
- m->nr_redundant = new_s->nr_redundant;
-
- for (i = 0; i < new_s->nr_blocks; i++)
- m->ptrs[i] = new_s->ptrs[i];
-
- bch2_bkey_to_replicas(&m->r.e, new);
-
- /*
- * gc recalculates this field from stripe ptr
- * references:
- */
- memset(m->block_sectors, 0, sizeof(m->block_sectors));
-
- for (i = 0; i < new_s->nr_blocks; i++) {
- ret = mark_stripe_bucket(trans, new, i, flags);
- if (ret)
- return ret;
- }
-
- ret = update_replicas(c, new, &m->r.e,
- ((s64) m->sectors * m->nr_redundant),
- journal_seq, gc);
- if (ret) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, new);
- bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf);
- printbuf_exit(&buf);
- return ret;
- }
- }
-
- return 0;
-}
-
-static int __mark_reservation(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct bch_fs_usage *fs_usage;
- unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
- s64 sectors = (s64) k.k->size;
-
- BUG_ON(!(flags & BTREE_TRIGGER_GC));
-
- if (flags & BTREE_TRIGGER_OVERWRITE)
- sectors = -sectors;
- sectors *= replicas;
-
- percpu_down_read(&c->mark_lock);
- preempt_disable();
-
- fs_usage = fs_usage_ptr(c, trans->journal_res.seq, flags & BTREE_TRIGGER_GC);
- replicas = clamp_t(unsigned, replicas, 1,
- ARRAY_SIZE(fs_usage->persistent_reserved));
-
- fs_usage->reserved += sectors;
- fs_usage->persistent_reserved[replicas - 1] += sectors;
-
- preempt_enable();
- percpu_up_read(&c->mark_lock);
-
- return 0;
-}
-
-int bch2_mark_reservation(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
-{
- return mem_trigger_run_overwrite_then_insert(__mark_reservation, trans, btree_id, level, old, new, flags);
-}
-
-static s64 __bch2_mark_reflink_p(struct btree_trans *trans,
- struct bkey_s_c_reflink_p p,
- u64 start, u64 end,
- u64 *idx, unsigned flags, size_t r_idx)
-{
- struct bch_fs *c = trans->c;
- struct reflink_gc *r;
- int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
- u64 next_idx = end;
- s64 ret = 0;
- struct printbuf buf = PRINTBUF;
-
- if (r_idx >= c->reflink_gc_nr)
- goto not_found;
-
- r = genradix_ptr(&c->reflink_gc_table, r_idx);
- next_idx = min(next_idx, r->offset - r->size);
- if (*idx < next_idx)
- goto not_found;
-
- BUG_ON((s64) r->refcount + add < 0);
-
- r->refcount += add;
- *idx = r->offset;
- return 0;
-not_found:
- if (fsck_err(c, reflink_p_to_missing_reflink_v,
- "pointer to missing indirect extent\n"
- " %s\n"
- " missing range %llu-%llu",
- (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf),
- *idx, next_idx)) {
- struct bkey_i_error *new;
-
- new = bch2_trans_kmalloc(trans, sizeof(*new));
- ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- goto err;
-
- bkey_init(&new->k);
- new->k.type = KEY_TYPE_error;
- new->k.p = bkey_start_pos(p.k);
- new->k.p.offset += *idx - start;
- bch2_key_resize(&new->k, next_idx - *idx);
- ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i,
- BTREE_TRIGGER_NORUN);
- }
-
- *idx = next_idx;
-err:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-static int __mark_reflink_p(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
- struct reflink_gc *ref;
- size_t l, r, m;
- u64 idx = le64_to_cpu(p.v->idx), start = idx;
- u64 end = le64_to_cpu(p.v->idx) + p.k->size;
- int ret = 0;
-
- BUG_ON(!(flags & BTREE_TRIGGER_GC));
-
- if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_reflink_p_fix) {
- idx -= le32_to_cpu(p.v->front_pad);
- end += le32_to_cpu(p.v->back_pad);
- }
-
- l = 0;
- r = c->reflink_gc_nr;
- while (l < r) {
- m = l + (r - l) / 2;
-
- ref = genradix_ptr(&c->reflink_gc_table, m);
- if (ref->offset <= idx)
- l = m + 1;
- else
- r = m;
- }
-
- while (idx < end && !ret)
- ret = __bch2_mark_reflink_p(trans, p, start, end,
- &idx, flags, l++);
-
- return ret;
-}
-
-int bch2_mark_reflink_p(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
-{
- return mem_trigger_run_overwrite_then_insert(__mark_reflink_p, trans, btree_id, level, old, new, flags);
-}
-
void bch2_trans_fs_usage_revert(struct btree_trans *trans,
struct replicas_delta_list *deltas)
{
@@ -1303,11 +677,11 @@ void bch2_trans_fs_usage_revert(struct btree_trans *trans,
BUG_ON(__update_replicas(c, dst, &d->r, -d->delta));
}
- dst->nr_inodes -= deltas->nr_inodes;
+ dst->b.nr_inodes -= deltas->nr_inodes;
for (i = 0; i < BCH_REPLICAS_MAX; i++) {
added -= deltas->persistent_reserved[i];
- dst->reserved -= deltas->persistent_reserved[i];
+ dst->b.reserved -= deltas->persistent_reserved[i];
dst->persistent_reserved[i] -= deltas->persistent_reserved[i];
}
@@ -1320,48 +694,25 @@ void bch2_trans_fs_usage_revert(struct btree_trans *trans,
percpu_up_read(&c->mark_lock);
}
-int bch2_trans_fs_usage_apply(struct btree_trans *trans,
- struct replicas_delta_list *deltas)
+void bch2_trans_account_disk_usage_change(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
+ u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
static int warned_disk_usage = 0;
bool warn = false;
- u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
- struct replicas_delta *d, *d2;
- struct replicas_delta *top = (void *) deltas->d + deltas->used;
- struct bch_fs_usage *dst;
- s64 added = 0, should_not_have_added;
- unsigned i;
percpu_down_read(&c->mark_lock);
preempt_disable();
- dst = fs_usage_ptr(c, trans->journal_res.seq, false);
+ struct bch_fs_usage_base *dst = &fs_usage_ptr(c, trans->journal_res.seq, false)->b;
+ struct bch_fs_usage_base *src = &trans->fs_usage_delta;
- for (d = deltas->d; d != top; d = replicas_delta_next(d)) {
- switch (d->r.data_type) {
- case BCH_DATA_btree:
- case BCH_DATA_user:
- case BCH_DATA_parity:
- added += d->delta;
- }
-
- if (__update_replicas(c, dst, &d->r, d->delta))
- goto need_mark;
- }
-
- dst->nr_inodes += deltas->nr_inodes;
-
- for (i = 0; i < BCH_REPLICAS_MAX; i++) {
- added += deltas->persistent_reserved[i];
- dst->reserved += deltas->persistent_reserved[i];
- dst->persistent_reserved[i] += deltas->persistent_reserved[i];
- }
+ s64 added = src->btree + src->data + src->reserved;
/*
* Not allowed to reduce sectors_available except by getting a
* reservation:
*/
- should_not_have_added = added - (s64) disk_res_sectors;
+ s64 should_not_have_added = added - (s64) disk_res_sectors;
if (unlikely(should_not_have_added > 0)) {
u64 old, new, v = atomic64_read(&c->sectors_available);
@@ -1380,6 +731,13 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans,
this_cpu_sub(*c->online_reserved, added);
}
+ dst->hidden += src->hidden;
+ dst->btree += src->btree;
+ dst->data += src->data;
+ dst->cached += src->cached;
+ dst->reserved += src->reserved;
+ dst->nr_inodes += src->nr_inodes;
+
preempt_enable();
percpu_up_read(&c->mark_lock);
@@ -1387,6 +745,34 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans,
bch2_trans_inconsistent(trans,
"disk usage increased %lli more than %llu sectors reserved)",
should_not_have_added, disk_res_sectors);
+}
+
+int bch2_trans_fs_usage_apply(struct btree_trans *trans,
+ struct replicas_delta_list *deltas)
+{
+ struct bch_fs *c = trans->c;
+ struct replicas_delta *d, *d2;
+ struct replicas_delta *top = (void *) deltas->d + deltas->used;
+ struct bch_fs_usage *dst;
+ unsigned i;
+
+ percpu_down_read(&c->mark_lock);
+ preempt_disable();
+ dst = fs_usage_ptr(c, trans->journal_res.seq, false);
+
+ for (d = deltas->d; d != top; d = replicas_delta_next(d))
+ if (__update_replicas(c, dst, &d->r, d->delta))
+ goto need_mark;
+
+ dst->b.nr_inodes += deltas->nr_inodes;
+
+ for (i = 0; i < BCH_REPLICAS_MAX; i++) {
+ dst->b.reserved += deltas->persistent_reserved[i];
+ dst->persistent_reserved[i] += deltas->persistent_reserved[i];
+ }
+
+ preempt_enable();
+ percpu_up_read(&c->mark_lock);
return 0;
need_mark:
/* revert changes: */
@@ -1398,92 +784,184 @@ need_mark:
return -1;
}
-/* trans_mark: */
+/* KEY_TYPE_extent: */
+
+static int __mark_pointer(struct btree_trans *trans,
+ struct bkey_s_c k,
+ const struct bch_extent_ptr *ptr,
+ s64 sectors, enum bch_data_type ptr_data_type,
+ u8 bucket_gen, u8 *bucket_data_type,
+ u32 *dirty_sectors, u32 *cached_sectors)
+{
+ u32 *dst_sectors = !ptr->cached
+ ? dirty_sectors
+ : cached_sectors;
+ int ret = bch2_check_bucket_ref(trans, k, ptr, sectors, ptr_data_type,
+ bucket_gen, *bucket_data_type, *dst_sectors);
+
+ if (ret)
+ return ret;
+
+ *dst_sectors += sectors;
+
+ if (!*dirty_sectors && !*cached_sectors)
+ *bucket_data_type = 0;
+ else if (*bucket_data_type != BCH_DATA_stripe)
+ *bucket_data_type = ptr_data_type;
+
+ return 0;
+}
-static inline int bch2_trans_mark_pointer(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, struct extent_ptr_decoded p,
- unsigned flags)
+static int bch2_trigger_pointer(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, struct extent_ptr_decoded p,
+ s64 *sectors,
+ unsigned flags)
{
bool insert = !(flags & BTREE_TRIGGER_OVERWRITE);
- struct btree_iter iter;
- struct bkey_i_alloc_v4 *a;
struct bpos bucket;
struct bch_backpointer bp;
- s64 sectors;
- int ret;
bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket, &bp);
- sectors = bp.bucket_len;
- if (!insert)
- sectors = -sectors;
+ *sectors = insert ? bp.bucket_len : -((s64) bp.bucket_len);
- a = bch2_trans_start_alloc_update(trans, &iter, bucket);
- if (IS_ERR(a))
- return PTR_ERR(a);
+ if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ struct btree_iter iter;
+ struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, &iter, bucket);
+ int ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ return ret;
- ret = __mark_pointer(trans, k, &p.ptr, sectors, bp.data_type,
- a->v.gen, &a->v.data_type,
- &a->v.dirty_sectors, &a->v.cached_sectors) ?:
- bch2_trans_update(trans, &iter, &a->k_i, 0);
- bch2_trans_iter_exit(trans, &iter);
+ ret = __mark_pointer(trans, k, &p.ptr, *sectors, bp.data_type,
+ a->v.gen, &a->v.data_type,
+ &a->v.dirty_sectors, &a->v.cached_sectors) ?:
+ bch2_trans_update(trans, &iter, &a->k_i, 0);
+ bch2_trans_iter_exit(trans, &iter);
- if (ret)
- return ret;
-
- if (!p.ptr.cached) {
- ret = bch2_bucket_backpointer_mod(trans, bucket, bp, k, insert);
if (ret)
return ret;
+
+ if (!p.ptr.cached) {
+ ret = bch2_bucket_backpointer_mod(trans, bucket, bp, k, insert);
+ if (ret)
+ return ret;
+ }
+ }
+
+ if (flags & BTREE_TRIGGER_GC) {
+ struct bch_fs *c = trans->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
+ enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p);
+
+ percpu_down_read(&c->mark_lock);
+ struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
+ bucket_lock(g);
+ struct bucket old = *g;
+
+ u8 bucket_data_type = g->data_type;
+ int ret = __mark_pointer(trans, k, &p.ptr, *sectors,
+ data_type, g->gen,
+ &bucket_data_type,
+ &g->dirty_sectors,
+ &g->cached_sectors);
+ if (ret) {
+ bucket_unlock(g);
+ percpu_up_read(&c->mark_lock);
+ return ret;
+ }
+
+ g->data_type = bucket_data_type;
+ struct bucket new = *g;
+ bucket_unlock(g);
+ bch2_dev_usage_update_m(c, ca, &old, &new);
+ percpu_up_read(&c->mark_lock);
}
return 0;
}
-static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
- struct extent_ptr_decoded p,
- s64 sectors, enum bch_data_type data_type)
+static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
+ struct bkey_s_c k,
+ struct extent_ptr_decoded p,
+ enum bch_data_type data_type,
+ s64 sectors, unsigned flags)
{
- struct btree_iter iter;
- struct bkey_i_stripe *s;
- struct bch_replicas_padded r;
- int ret = 0;
+ if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ struct btree_iter iter;
+ struct bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_stripes, POS(0, p.ec.idx),
+ BTREE_ITER_WITH_UPDATES, stripe);
+ int ret = PTR_ERR_OR_ZERO(s);
+ if (unlikely(ret)) {
+ bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans,
+ "pointer to nonexistent stripe %llu",
+ (u64) p.ec.idx);
+ goto err;
+ }
- s = bch2_bkey_get_mut_typed(trans, &iter,
- BTREE_ID_stripes, POS(0, p.ec.idx),
- BTREE_ITER_WITH_UPDATES, stripe);
- ret = PTR_ERR_OR_ZERO(s);
- if (unlikely(ret)) {
- bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans,
- "pointer to nonexistent stripe %llu",
- (u64) p.ec.idx);
- goto err;
- }
+ if (!bch2_ptr_matches_stripe(&s->v, p)) {
+ bch2_trans_inconsistent(trans,
+ "stripe pointer doesn't match stripe %llu",
+ (u64) p.ec.idx);
+ ret = -EIO;
+ goto err;
+ }
- if (!bch2_ptr_matches_stripe(&s->v, p)) {
- bch2_trans_inconsistent(trans,
- "stripe pointer doesn't match stripe %llu",
- (u64) p.ec.idx);
- ret = -EIO;
- goto err;
+ stripe_blockcount_set(&s->v, p.ec.block,
+ stripe_blockcount_get(&s->v, p.ec.block) +
+ sectors);
+
+ struct bch_replicas_padded r;
+ bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i));
+ r.e.data_type = data_type;
+ ret = bch2_update_replicas_list(trans, &r.e, sectors);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
}
- stripe_blockcount_set(&s->v, p.ec.block,
- stripe_blockcount_get(&s->v, p.ec.block) +
- sectors);
+ if (flags & BTREE_TRIGGER_GC) {
+ struct bch_fs *c = trans->c;
- bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i));
- r.e.data_type = data_type;
- ret = update_replicas_list(trans, &r.e, sectors);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
+ BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
+ struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL);
+ if (!m) {
+ bch_err(c, "error allocating memory for gc_stripes, idx %llu",
+ (u64) p.ec.idx);
+ return -BCH_ERR_ENOMEM_mark_stripe_ptr;
+ }
+
+ mutex_lock(&c->ec_stripes_heap_lock);
+
+ if (!m || !m->alive) {
+ mutex_unlock(&c->ec_stripes_heap_lock);
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch_err_ratelimited(c, "pointer to nonexistent stripe %llu\n while marking %s",
+ (u64) p.ec.idx, buf.buf);
+ printbuf_exit(&buf);
+ bch2_inconsistent_error(c);
+ return -EIO;
+ }
+
+ m->block_sectors[p.ec.block] += sectors;
+
+ struct bch_replicas_padded r = m->r;
+ mutex_unlock(&c->ec_stripes_heap_lock);
+
+ r.e.data_type = data_type;
+ bch2_update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true);
+ }
+
+ return 0;
}
-static int __trans_mark_extent(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, unsigned flags)
+static int __trigger_extent(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
{
+ bool gc = flags & BTREE_TRIGGER_GC;
struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
@@ -1492,11 +970,7 @@ static int __trans_mark_extent(struct btree_trans *trans,
enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
? BCH_DATA_btree
: BCH_DATA_user;
- s64 sectors = bkey_is_btree_ptr(k.k)
- ? btree_sectors(c)
- : k.k->size;
s64 dirty_sectors = 0;
- bool stale;
int ret = 0;
r.e.data_type = data_type;
@@ -1504,21 +978,20 @@ static int __trans_mark_extent(struct btree_trans *trans,
r.e.nr_required = 1;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- s64 disk_sectors = ptr_disk_sectors(sectors, p);
-
- if (flags & BTREE_TRIGGER_OVERWRITE)
- disk_sectors = -disk_sectors;
-
- ret = bch2_trans_mark_pointer(trans, btree_id, level, k, p, flags);
+ s64 disk_sectors;
+ ret = bch2_trigger_pointer(trans, btree_id, level, k, p, &disk_sectors, flags);
if (ret < 0)
return ret;
- stale = ret > 0;
+ bool stale = ret > 0;
if (p.ptr.cached) {
if (!stale) {
- ret = update_cached_sectors_list(trans, p.ptr.dev,
- disk_sectors);
+ ret = !gc
+ ? bch2_update_cached_sectors_list(trans, p.ptr.dev, disk_sectors)
+ : update_cached_sectors(c, k, p.ptr.dev, disk_sectors, 0, true);
+ bch2_fs_fatal_err_on(ret && gc, c, "%s(): no replicas entry while updating cached sectors",
+ __func__);
if (ret)
return ret;
}
@@ -1526,324 +999,122 @@ static int __trans_mark_extent(struct btree_trans *trans,
dirty_sectors += disk_sectors;
r.e.devs[r.e.nr_devs++] = p.ptr.dev;
} else {
- ret = bch2_trans_mark_stripe_ptr(trans, p,
- disk_sectors, data_type);
+ ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags);
if (ret)
return ret;
+ /*
+ * There may be other dirty pointers in this extent, but
+ * if so they're not required for mounting if we have an
+ * erasure coded pointer in this extent:
+ */
r.e.nr_required = 0;
}
}
- if (r.e.nr_devs)
- ret = update_replicas_list(trans, &r.e, dirty_sectors);
-
- return ret;
-}
-
-int bch2_trans_mark_extent(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_i *new,
- unsigned flags)
-{
- struct bch_fs *c = trans->c;
- int mod = (int) bch2_bkey_needs_rebalance(c, bkey_i_to_s_c(new)) -
- (int) bch2_bkey_needs_rebalance(c, old);
+ if (r.e.nr_devs) {
+ ret = !gc
+ ? bch2_update_replicas_list(trans, &r.e, dirty_sectors)
+ : bch2_update_replicas(c, k, &r.e, dirty_sectors, 0, true);
+ if (unlikely(ret && gc)) {
+ struct printbuf buf = PRINTBUF;
- if (mod) {
- int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new->k.p, mod > 0);
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch2_fs_fatal_error(c, "%s(): no replicas entry for %s", __func__, buf.buf);
+ printbuf_exit(&buf);
+ }
if (ret)
return ret;
}
- return trigger_run_overwrite_then_insert(__trans_mark_extent, trans, btree_id, level, old, new, flags);
-}
-
-static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
- struct bkey_s_c_stripe s,
- unsigned idx, bool deleting)
-{
- struct bch_fs *c = trans->c;
- const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
- struct btree_iter iter;
- struct bkey_i_alloc_v4 *a;
- enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant
- ? BCH_DATA_parity : 0;
- s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0;
- int ret = 0;
-
- if (deleting)
- sectors = -sectors;
-
- a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr));
- if (IS_ERR(a))
- return PTR_ERR(a);
-
- ret = check_bucket_ref(trans, s.s_c, ptr, sectors, data_type,
- a->v.gen, a->v.data_type,
- a->v.dirty_sectors, a->v.cached_sectors);
- if (ret)
- goto err;
-
- if (!deleting) {
- if (bch2_trans_inconsistent_on(a->v.stripe ||
- a->v.stripe_redundancy, trans,
- "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)",
- iter.pos.inode, iter.pos.offset, a->v.gen,
- bch2_data_types[a->v.data_type],
- a->v.dirty_sectors,
- a->v.stripe, s.k->p.offset)) {
- ret = -EIO;
- goto err;
- }
-
- if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans,
- "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu",
- iter.pos.inode, iter.pos.offset, a->v.gen,
- bch2_data_types[a->v.data_type],
- a->v.dirty_sectors,
- s.k->p.offset)) {
- ret = -EIO;
- goto err;
- }
-
- a->v.stripe = s.k->p.offset;
- a->v.stripe_redundancy = s.v->nr_redundant;
- a->v.data_type = BCH_DATA_stripe;
- } else {
- if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset ||
- a->v.stripe_redundancy != s.v->nr_redundant, trans,
- "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)",
- iter.pos.inode, iter.pos.offset, a->v.gen,
- s.k->p.offset, a->v.stripe)) {
- ret = -EIO;
- goto err;
- }
-
- a->v.stripe = 0;
- a->v.stripe_redundancy = 0;
- a->v.data_type = alloc_data_type(a->v, BCH_DATA_user);
- }
-
- a->v.dirty_sectors += sectors;
- if (data_type)
- a->v.data_type = !deleting ? data_type : 0;
-
- ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
- if (ret)
- goto err;
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
+ return 0;
}
-int bch2_trans_mark_stripe(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_i *new,
- unsigned flags)
+int bch2_trigger_extent(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s new,
+ unsigned flags)
{
- const struct bch_stripe *old_s = NULL;
- struct bch_stripe *new_s = NULL;
- struct bch_replicas_padded r;
- unsigned i, nr_blocks;
- int ret = 0;
-
- if (old.k->type == KEY_TYPE_stripe)
- old_s = bkey_s_c_to_stripe(old).v;
- if (new->k.type == KEY_TYPE_stripe)
- new_s = &bkey_i_to_stripe(new)->v;
-
- /*
- * If the pointers aren't changing, we don't need to do anything:
- */
- if (new_s && old_s &&
- new_s->nr_blocks == old_s->nr_blocks &&
- new_s->nr_redundant == old_s->nr_redundant &&
- !memcmp(old_s->ptrs, new_s->ptrs,
- new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
+ struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c);
+ struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old);
+ unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start;
+ unsigned old_ptrs_bytes = (void *) old_ptrs.end - (void *) old_ptrs.start;
+
+ /* if pointers aren't changing - nothing to do: */
+ if (new_ptrs_bytes == old_ptrs_bytes &&
+ !memcmp(new_ptrs.start,
+ old_ptrs.start,
+ new_ptrs_bytes))
return 0;
- BUG_ON(new_s && old_s &&
- (new_s->nr_blocks != old_s->nr_blocks ||
- new_s->nr_redundant != old_s->nr_redundant));
+ if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ struct bch_fs *c = trans->c;
+ int mod = (int) bch2_bkey_needs_rebalance(c, new.s_c) -
+ (int) bch2_bkey_needs_rebalance(c, old);
- nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks;
-
- if (new_s) {
- s64 sectors = le16_to_cpu(new_s->sectors);
-
- bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(new));
- ret = update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
- if (ret)
- return ret;
- }
-
- if (old_s) {
- s64 sectors = -((s64) le16_to_cpu(old_s->sectors));
-
- bch2_bkey_to_replicas(&r.e, old);
- ret = update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
- if (ret)
- return ret;
- }
-
- for (i = 0; i < nr_blocks; i++) {
- if (new_s && old_s &&
- !memcmp(&new_s->ptrs[i],
- &old_s->ptrs[i],
- sizeof(new_s->ptrs[i])))
- continue;
-
- if (new_s) {
- ret = bch2_trans_mark_stripe_bucket(trans,
- bkey_i_to_s_c_stripe(new), i, false);
- if (ret)
- break;
- }
-
- if (old_s) {
- ret = bch2_trans_mark_stripe_bucket(trans,
- bkey_s_c_to_stripe(old), i, true);
+ if (mod) {
+ int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new.k->p, mod > 0);
if (ret)
- break;
+ return ret;
}
}
- return ret;
+ if (flags & (BTREE_TRIGGER_TRANSACTIONAL|BTREE_TRIGGER_GC))
+ return trigger_run_overwrite_then_insert(__trigger_extent, trans, btree_id, level, old, new, flags);
+
+ return 0;
}
-static int __trans_mark_reservation(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, unsigned flags)
+/* KEY_TYPE_reservation */
+
+static int __trigger_reservation(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
{
+ struct bch_fs *c = trans->c;
unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
- s64 sectors = (s64) k.k->size;
- struct replicas_delta_list *d;
- int ret;
+ s64 sectors = (s64) k.k->size * replicas;
if (flags & BTREE_TRIGGER_OVERWRITE)
sectors = -sectors;
- sectors *= replicas;
-
- ret = bch2_replicas_deltas_realloc(trans, 0);
- if (ret)
- return ret;
- d = trans->fs_usage_deltas;
- replicas = clamp_t(unsigned, replicas, 1,
- ARRAY_SIZE(d->persistent_reserved));
-
- d->persistent_reserved[replicas - 1] += sectors;
- return 0;
-}
-
-int bch2_trans_mark_reservation(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old,
- struct bkey_i *new,
- unsigned flags)
-{
- return trigger_run_overwrite_then_insert(__trans_mark_reservation, trans, btree_id, level, old, new, flags);
-}
-
-static int trans_mark_reflink_p_segment(struct btree_trans *trans,
- struct bkey_s_c_reflink_p p,
- u64 *idx, unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_i *k;
- __le64 *refcount;
- int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
- struct printbuf buf = PRINTBUF;
- int ret;
+ if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ int ret = bch2_replicas_deltas_realloc(trans, 0);
+ if (ret)
+ return ret;
- k = bch2_bkey_get_mut_noupdate(trans, &iter,
- BTREE_ID_reflink, POS(0, *idx),
- BTREE_ITER_WITH_UPDATES);
- ret = PTR_ERR_OR_ZERO(k);
- if (ret)
- goto err;
+ struct replicas_delta_list *d = trans->fs_usage_deltas;
+ replicas = min(replicas, ARRAY_SIZE(d->persistent_reserved));
- refcount = bkey_refcount(k);
- if (!refcount) {
- bch2_bkey_val_to_text(&buf, c, p.s_c);
- bch2_trans_inconsistent(trans,
- "nonexistent indirect extent at %llu while marking\n %s",
- *idx, buf.buf);
- ret = -EIO;
- goto err;
+ d->persistent_reserved[replicas - 1] += sectors;
}
- if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) {
- bch2_bkey_val_to_text(&buf, c, p.s_c);
- bch2_trans_inconsistent(trans,
- "indirect extent refcount underflow at %llu while marking\n %s",
- *idx, buf.buf);
- ret = -EIO;
- goto err;
- }
+ if (flags & BTREE_TRIGGER_GC) {
+ percpu_down_read(&c->mark_lock);
+ preempt_disable();
- if (flags & BTREE_TRIGGER_INSERT) {
- struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
- u64 pad;
+ struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage_gc);
- pad = max_t(s64, le32_to_cpu(v->front_pad),
- le64_to_cpu(v->idx) - bkey_start_offset(&k->k));
- BUG_ON(pad > U32_MAX);
- v->front_pad = cpu_to_le32(pad);
+ replicas = min(replicas, ARRAY_SIZE(fs_usage->persistent_reserved));
+ fs_usage->b.reserved += sectors;
+ fs_usage->persistent_reserved[replicas - 1] += sectors;
- pad = max_t(s64, le32_to_cpu(v->back_pad),
- k->k.p.offset - p.k->size - le64_to_cpu(v->idx));
- BUG_ON(pad > U32_MAX);
- v->back_pad = cpu_to_le32(pad);
+ preempt_enable();
+ percpu_up_read(&c->mark_lock);
}
- le64_add_cpu(refcount, add);
-
- bch2_btree_iter_set_pos_to_extent_start(&iter);
- ret = bch2_trans_update(trans, &iter, k, 0);
- if (ret)
- goto err;
-
- *idx = k->k.p.offset;
-err:
- bch2_trans_iter_exit(trans, &iter);
- printbuf_exit(&buf);
- return ret;
+ return 0;
}
-static int __trans_mark_reflink_p(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, unsigned flags)
+int bch2_trigger_reservation(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s new,
+ unsigned flags)
{
- struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
- u64 idx, end_idx;
- int ret = 0;
-
- idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad);
- end_idx = le64_to_cpu(p.v->idx) + p.k->size +
- le32_to_cpu(p.v->back_pad);
-
- while (idx < end_idx && !ret)
- ret = trans_mark_reflink_p_segment(trans, p, &idx, flags);
- return ret;
+ return trigger_run_overwrite_then_insert(__trigger_reservation, trans, btree_id, level, old, new, flags);
}
-int bch2_trans_mark_reflink_p(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old,
- struct bkey_i *new,
- unsigned flags)
-{
- if (flags & BTREE_TRIGGER_INSERT) {
- struct bch_reflink_p *v = &bkey_i_to_reflink_p(new)->v;
-
- v->front_pad = v->back_pad = 0;
- }
-
- return trigger_run_overwrite_then_insert(__trans_mark_reflink_p, trans, btree_id, level, old, new, flags);
-}
+/* Mark superblocks: */
static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
struct bch_dev *ca, size_t b,
@@ -1871,9 +1142,9 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
"bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
iter.pos.inode, iter.pos.offset, a->v.gen,
- bch2_data_types[a->v.data_type],
- bch2_data_types[type],
- bch2_data_types[type]);
+ bch2_data_type_str(a->v.data_type),
+ bch2_data_type_str(type),
+ bch2_data_type_str(type));
ret = -EIO;
goto err;
}
@@ -1974,17 +1245,13 @@ int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca)
{
int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(trans, ca));
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
int bch2_trans_mark_dev_sbs(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i;
-
- for_each_online_member(ca, c, i) {
+ for_each_online_member(c, ca) {
int ret = bch2_trans_mark_dev_sb(c, ca);
if (ret) {
percpu_ref_put(&ca->ref);
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 21f6cb3569..6387e039f7 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -203,6 +203,7 @@ static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
}
void bch2_dev_usage_init(struct bch_dev *);
+void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev_usage *);
static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark)
{
@@ -301,6 +302,12 @@ u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage_online *);
struct bch_fs_usage_short
bch2_fs_usage_read_short(struct bch_fs *);
+void bch2_dev_usage_update(struct bch_fs *, struct bch_dev *,
+ const struct bch_alloc_v4 *,
+ const struct bch_alloc_v4 *, u64, bool);
+void bch2_dev_usage_update_m(struct bch_fs *, struct bch_dev *,
+ struct bucket *, struct bucket *);
+
/* key/bucket marking: */
static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c,
@@ -315,43 +322,41 @@ static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c,
: c->usage[journal_seq & JOURNAL_BUF_MASK]);
}
+int bch2_update_replicas(struct bch_fs *, struct bkey_s_c,
+ struct bch_replicas_entry_v1 *, s64,
+ unsigned, bool);
+int bch2_update_replicas_list(struct btree_trans *,
+ struct bch_replicas_entry_v1 *, s64);
+int bch2_update_cached_sectors_list(struct btree_trans *, unsigned, s64);
int bch2_replicas_deltas_realloc(struct btree_trans *, unsigned);
void bch2_fs_usage_initialize(struct bch_fs *);
+int bch2_check_bucket_ref(struct btree_trans *, struct bkey_s_c,
+ const struct bch_extent_ptr *,
+ s64, enum bch_data_type, u8, u8, u32);
+
int bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
size_t, enum bch_data_type, unsigned,
struct gc_pos, unsigned);
-int bch2_mark_alloc(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s_c, unsigned);
-int bch2_mark_extent(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s_c, unsigned);
-int bch2_mark_stripe(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s_c, unsigned);
-int bch2_mark_reservation(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s_c, unsigned);
-int bch2_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s_c, unsigned);
-
-int bch2_trans_mark_extent(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
-int bch2_trans_mark_stripe(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
-int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
-int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
-
-#define mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\
+int bch2_trigger_extent(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s, unsigned);
+int bch2_trigger_reservation(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s, unsigned);
+
+#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\
({ \
int ret = 0; \
\
if (_old.k->type) \
ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_INSERT); \
if (!ret && _new.k->type) \
- ret = _fn(_trans, _btree_id, _level, _new, _flags & ~BTREE_TRIGGER_OVERWRITE); \
+ ret = _fn(_trans, _btree_id, _level, _new.s_c, _flags & ~BTREE_TRIGGER_OVERWRITE);\
ret; \
})
-#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags) \
- mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, bkey_i_to_s_c(_new), _flags)
+void bch2_trans_account_disk_usage_change(struct btree_trans *);
void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *);
int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
@@ -382,6 +387,21 @@ static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b)
return false;
}
+/*
+ * Return the name of @type for use in log and error messages. Values at
+ * or above BCH_DATA_NR yield a fixed placeholder string instead of
+ * indexing __bch2_data_types[].
+ */
+static inline const char *bch2_data_type_str(enum bch_data_type type)
+{
+	if (type >= BCH_DATA_NR)
+		return "(invalid data type)";
+
+	return __bch2_data_types[type];
+}
+
+/*
+ * Append the name of @type to @out. A value at or above BCH_DATA_NR is
+ * printed as "(invalid data type N)" rather than looked up in
+ * __bch2_data_types[].
+ */
+static inline void bch2_prt_data_type(struct printbuf *out, enum bch_data_type type)
+{
+	if (type >= BCH_DATA_NR) {
+		prt_printf(out, "(invalid data type %u)", type);
+		return;
+	}
+
+	prt_str(out, __bch2_data_types[type]);
+}
+
/* disk reservations: */
static inline void bch2_disk_reservation_put(struct bch_fs *c,
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index 2a9dab9006..6a31740222 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -33,8 +33,6 @@ struct bucket_gens {
};
struct bch_dev_usage {
- u64 buckets_ec;
-
struct {
u64 buckets;
u64 sectors; /* _compressed_ sectors: */
@@ -47,23 +45,18 @@ struct bch_dev_usage {
} d[BCH_DATA_NR];
};
-struct bch_fs_usage {
- /* all fields are in units of 512 byte sectors: */
+struct bch_fs_usage_base {
u64 hidden;
u64 btree;
u64 data;
u64 cached;
u64 reserved;
u64 nr_inodes;
+};
- /* XXX: add stats for compression ratio */
-#if 0
- u64 uncompressed;
- u64 compressed;
-#endif
-
- /* broken out: */
-
+struct bch_fs_usage {
+ /* all fields are in units of 512 byte sectors: */
+ struct bch_fs_usage_base b;
u64 persistent_reserved[BCH_REPLICAS_MAX];
u64 replicas[];
};
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index 64000c8da5..226b39c176 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -7,22 +7,27 @@
#include "chardev.h"
#include "journal.h"
#include "move.h"
+#include "recovery.h"
#include "replicas.h"
#include "super.h"
#include "super-io.h"
+#include "thread_with_file.h"
-#include <linux/anon_inodes.h>
#include <linux/cdev.h>
#include <linux/device.h>
-#include <linux/file.h>
#include <linux/fs.h>
#include <linux/ioctl.h>
-#include <linux/kthread.h>
#include <linux/major.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
+/*
+ * copy_to_user() wrapper that yields an error code: 0 when copy_to_user()
+ * returns 0, -EFAULT for any nonzero return.
+ */
+__must_check
+static int copy_to_user_errcode(void __user *to, const void *from, unsigned long n)
+{
+	if (copy_to_user(to, from, n))
+		return -EFAULT;
+
+	return 0;
+}
+
/* returns with ref on ca->ref */
static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
unsigned flags)
@@ -132,8 +137,106 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg
}
#endif
+/*
+ * Per-run state for an fsck started through the fsck ioctls; the thread
+ * functions recover it from the embedded @thr with container_of().
+ * Released by bch2_fsck_thread_exit().
+ */
+struct fsck_thread {
+ struct thread_with_stdio thr; /* handle passed to bch2_run_thread_with_stdio() */
+ struct bch_fs *c; /* assigned by the online fsck ioctl only */
+ char **devs; /* offline fsck: device path strings copied from userspace */
+ size_t nr_devs; /* number of slots in devs */
+ struct bch_opts opts; /* starts as bch2_opts_empty(), then filled from the user's option string */
+};
+
+/*
+ * Free a struct fsck_thread: each device path string (when the array
+ * exists), the array itself, then the containing structure.
+ */
+static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr)
+{
+	struct fsck_thread *fsck = container_of(_thr, struct fsck_thread, thr);
+	char **paths = fsck->devs;
+
+	for (size_t i = 0; paths && i < fsck->nr_devs; i++)
+		kfree(paths[i]);
+
+	kfree(paths);
+	kfree(fsck);
+}
+
+/*
+ * Thread body for offline fsck: open the filesystem on the supplied
+ * devices with the supplied options, store the outcome of the open in the
+ * thread's ret field, stop the filesystem again if the open succeeded,
+ * then call thread_with_stdio_done(). Always returns 0.
+ */
+static int bch2_fsck_offline_thread_fn(void *arg)
+{
+	struct fsck_thread *fsck = container_of(arg, struct fsck_thread, thr);
+	struct bch_fs *c = bch2_fs_open(fsck->devs, fsck->nr_devs, fsck->opts);
+
+	fsck->thr.thr.ret = PTR_ERR_OR_ZERO(c);
+
+	if (fsck->thr.thr.ret == 0)
+		bch2_fs_stop(c);
+
+	thread_with_stdio_done(&fsck->thr);
+	return 0;
+}
+
+/*
+ * BCH_IOCTL_FSCK_OFFLINE handler.
+ *
+ * Copies the argument struct, the array of device path pointers, each
+ * path string and (optionally) a mount-option string from userspace into
+ * a struct fsck_thread, then hands it to bch2_run_thread_with_stdio() and
+ * returns that call's result.
+ *
+ * Errors: -EFAULT if copying the argument struct or the device array
+ * faults, -EINVAL if any flag is set, -EPERM without CAP_SYS_ADMIN,
+ * -ENOMEM on allocation failure, otherwise whatever strndup_user(),
+ * bch2_parse_mount_opts() or bch2_run_thread_with_stdio() returned.
+ * On any negative result the fsck_thread is freed here.
+ */
+static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
+{
+	struct bch_ioctl_fsck_offline arg;
+	struct fsck_thread *thr = NULL;
+	u64 *devs = NULL;
+	long ret = 0;
+
+	if (copy_from_user(&arg, user_arg, sizeof(arg)))
+		return -EFAULT;
+
+	if (arg.flags)
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (!(devs = kcalloc(arg.nr_devs, sizeof(*devs), GFP_KERNEL)) ||
+	    !(thr = kzalloc(sizeof(*thr), GFP_KERNEL)) ||
+	    !(thr->devs = kcalloc(arg.nr_devs, sizeof(*thr->devs), GFP_KERNEL))) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	thr->opts = bch2_opts_empty();
+	/* thr->devs[] is zero-filled, so the exit path can walk all nr_devs slots */
+	thr->nr_devs = arg.nr_devs;
+
+	if (copy_from_user(devs, &user_arg->devs[0],
+			   array_size(sizeof(user_arg->devs[0]), arg.nr_devs))) {
+		/* a faulting user pointer is -EFAULT, same as for the arg struct */
+		ret = -EFAULT;
+		goto err;
+	}
+
+	for (size_t i = 0; i < arg.nr_devs; i++) {
+		char *path = strndup_user((char __user *)(unsigned long) devs[i], PATH_MAX);
+
+		/*
+		 * Store only successfully copied strings: the error path
+		 * kfree()s every thr->devs[] slot and must never be handed
+		 * an ERR_PTR.
+		 */
+		ret = PTR_ERR_OR_ZERO(path);
+		if (ret)
+			goto err;
+
+		thr->devs[i] = path;
+	}
+
+	if (arg.opts) {
+		char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
+
+		ret = PTR_ERR_OR_ZERO(optstr) ?:
+			bch2_parse_mount_opts(NULL, &thr->opts, optstr);
+		/* on failure optstr is an ERR_PTR, not memory we own */
+		if (!IS_ERR(optstr))
+			kfree(optstr);
+
+		if (ret)
+			goto err;
+	}
+
+	opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
+
+	ret = bch2_run_thread_with_stdio(&thr->thr,
+			bch2_fsck_thread_exit,
+			bch2_fsck_offline_thread_fn);
+err:
+	if (ret < 0) {
+		if (thr)
+			bch2_fsck_thread_exit(&thr->thr);
+		pr_err("ret %s", bch2_err_str(ret));
+	}
+	kfree(devs);
+	return ret;
+}
+
static long bch2_global_ioctl(unsigned cmd, void __user *arg)
{
+ long ret;
+
switch (cmd) {
#if 0
case BCH_IOCTL_ASSEMBLE:
@@ -141,18 +244,25 @@ static long bch2_global_ioctl(unsigned cmd, void __user *arg)
case BCH_IOCTL_INCREMENTAL:
return bch2_ioctl_incremental(arg);
#endif
+ case BCH_IOCTL_FSCK_OFFLINE: {
+ ret = bch2_ioctl_fsck_offline(arg);
+ break;
+ }
default:
- return -ENOTTY;
+ ret = -ENOTTY;
+ break;
}
+
+ if (ret < 0)
+ ret = bch2_err_class(ret);
+ return ret;
}
static long bch2_ioctl_query_uuid(struct bch_fs *c,
struct bch_ioctl_query_uuid __user *user_arg)
{
- if (copy_to_user(&user_arg->uuid, &c->sb.user_uuid,
- sizeof(c->sb.user_uuid)))
- return -EFAULT;
- return 0;
+ return copy_to_user_errcode(&user_arg->uuid, &c->sb.user_uuid,
+ sizeof(c->sb.user_uuid));
}
#if 0
@@ -295,31 +405,27 @@ static long bch2_ioctl_disk_set_state(struct bch_fs *c,
}
struct bch_data_ctx {
+ struct thread_with_file thr;
+
struct bch_fs *c;
struct bch_ioctl_data arg;
struct bch_move_stats stats;
-
- int ret;
-
- struct task_struct *thread;
};
static int bch2_data_thread(void *arg)
{
- struct bch_data_ctx *ctx = arg;
-
- ctx->ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg);
+ struct bch_data_ctx *ctx = container_of(arg, struct bch_data_ctx, thr);
+ ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg);
ctx->stats.data_type = U8_MAX;
return 0;
}
static int bch2_data_job_release(struct inode *inode, struct file *file)
{
- struct bch_data_ctx *ctx = file->private_data;
+ struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr);
- kthread_stop(ctx->thread);
- put_task_struct(ctx->thread);
+ bch2_thread_with_file_exit(&ctx->thr);
kfree(ctx);
return 0;
}
@@ -327,7 +433,7 @@ static int bch2_data_job_release(struct inode *inode, struct file *file)
static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
size_t len, loff_t *ppos)
{
- struct bch_data_ctx *ctx = file->private_data;
+ struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr);
struct bch_fs *c = ctx->c;
struct bch_ioctl_data_event e = {
.type = BCH_DATA_EVENT_PROGRESS,
@@ -341,10 +447,7 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
if (len < sizeof(e))
return -EINVAL;
- if (copy_to_user(buf, &e, sizeof(e)))
- return -EFAULT;
-
- return sizeof(e);
+ return copy_to_user_errcode(buf, &e, sizeof(e)) ?: sizeof(e);
}
static const struct file_operations bcachefs_data_ops = {
@@ -356,10 +459,8 @@ static const struct file_operations bcachefs_data_ops = {
static long bch2_ioctl_data(struct bch_fs *c,
struct bch_ioctl_data arg)
{
- struct bch_data_ctx *ctx = NULL;
- struct file *file = NULL;
- unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK;
- int ret, fd = -1;
+ struct bch_data_ctx *ctx;
+ int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -374,35 +475,11 @@ static long bch2_ioctl_data(struct bch_fs *c,
ctx->c = c;
ctx->arg = arg;
- ctx->thread = kthread_create(bch2_data_thread, ctx,
- "bch-data/%s", c->name);
- if (IS_ERR(ctx->thread)) {
- ret = PTR_ERR(ctx->thread);
- goto err;
- }
-
- ret = get_unused_fd_flags(flags);
+ ret = bch2_run_thread_with_file(&ctx->thr,
+ &bcachefs_data_ops,
+ bch2_data_thread);
if (ret < 0)
- goto err;
- fd = ret;
-
- file = anon_inode_getfile("[bcachefs]", &bcachefs_data_ops, ctx, flags);
- if (IS_ERR(file)) {
- ret = PTR_ERR(file);
- goto err;
- }
-
- get_task_struct(ctx->thread);
- wake_up_process(ctx->thread);
- fd_install(fd, file);
-
- return fd;
-err:
- if (fd >= 0)
- put_unused_fd(fd);
- if (!IS_ERR_OR_NULL(ctx->thread))
- kthread_stop(ctx->thread);
- kfree(ctx);
+ kfree(ctx);
return ret;
}
@@ -416,7 +493,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c,
unsigned i;
int ret = 0;
- if (!test_bit(BCH_FS_STARTED, &c->flags))
+ if (!test_bit(BCH_FS_started, &c->flags))
return -EINVAL;
if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes))
@@ -443,7 +520,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c,
dst_end = (void *) arg->replicas + replica_entries_bytes;
for (i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry *src_e =
+ struct bch_replicas_entry_v1 *src_e =
cpu_replicas_entry(&c->replicas, i);
/* check that we have enough space for one replicas entry */
@@ -473,14 +550,15 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c,
if (ret)
goto err;
- if (copy_to_user(user_arg, arg,
- sizeof(*arg) + arg->replica_entries_bytes))
- ret = -EFAULT;
+
+ ret = copy_to_user_errcode(user_arg, arg,
+ sizeof(*arg) + arg->replica_entries_bytes);
err:
kfree(arg);
return ret;
}
+/* obsolete, didn't allow for new data types: */
static long bch2_ioctl_dev_usage(struct bch_fs *c,
struct bch_ioctl_dev_usage __user *user_arg)
{
@@ -489,7 +567,7 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c,
struct bch_dev *ca;
unsigned i;
- if (!test_bit(BCH_FS_STARTED, &c->flags))
+ if (!test_bit(BCH_FS_started, &c->flags))
return -EINVAL;
if (copy_from_user(&arg, user_arg, sizeof(arg)))
@@ -510,7 +588,6 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c,
arg.state = ca->mi.state;
arg.bucket_size = ca->mi.bucket_size;
arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket;
- arg.buckets_ec = src.buckets_ec;
for (i = 0; i < BCH_DATA_NR; i++) {
arg.d[i].buckets = src.d[i].buckets;
@@ -520,10 +597,58 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c,
percpu_ref_put(&ca->ref);
- if (copy_to_user(user_arg, &arg, sizeof(arg)))
+ return copy_to_user_errcode(user_arg, &arg, sizeof(arg));
+}
+
+static long bch2_ioctl_dev_usage_v2(struct bch_fs *c,
+ struct bch_ioctl_dev_usage_v2 __user *user_arg)
+{
+ struct bch_ioctl_dev_usage_v2 arg;
+ struct bch_dev_usage src;
+ struct bch_dev *ca;
+ int ret = 0;
+
+ if (!test_bit(BCH_FS_started, &c->flags))
+ return -EINVAL;
+
+ if (copy_from_user(&arg, user_arg, sizeof(arg)))
return -EFAULT;
- return 0;
+ if ((arg.flags & ~BCH_BY_INDEX) ||
+ arg.pad[0] ||
+ arg.pad[1] ||
+ arg.pad[2])
+ return -EINVAL;
+
+ ca = bch2_device_lookup(c, arg.dev, arg.flags);
+ if (IS_ERR(ca))
+ return PTR_ERR(ca);
+
+ src = bch2_dev_usage_read(ca);
+
+ arg.state = ca->mi.state;
+ arg.bucket_size = ca->mi.bucket_size;
+ arg.nr_data_types = min(arg.nr_data_types, BCH_DATA_NR);
+ arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket;
+
+ ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg));
+ if (ret)
+ goto err;
+
+ for (unsigned i = 0; i < arg.nr_data_types; i++) {
+ struct bch_ioctl_dev_usage_type t = {
+ .buckets = src.d[i].buckets,
+ .sectors = src.d[i].sectors,
+ .fragmented = src.d[i].fragmented,
+ };
+
+ ret = copy_to_user_errcode(&user_arg->d[i], &t, sizeof(t));
+ if (ret)
+ goto err;
+ }
+err:
+ percpu_ref_put(&ca->ref);
+ return ret;
}
static long bch2_ioctl_read_super(struct bch_fs *c,
@@ -560,9 +685,8 @@ static long bch2_ioctl_read_super(struct bch_fs *c,
goto err;
}
- if (copy_to_user((void __user *)(unsigned long)arg.sb, sb,
- vstruct_bytes(sb)))
- ret = -EFAULT;
+ ret = copy_to_user_errcode((void __user *)(unsigned long)arg.sb, sb,
+ vstruct_bytes(sb));
err:
if (!IS_ERR_OR_NULL(ca))
percpu_ref_put(&ca->ref);
@@ -574,8 +698,6 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c,
struct bch_ioctl_disk_get_idx arg)
{
dev_t dev = huge_decode_dev(arg.dev);
- struct bch_dev *ca;
- unsigned i;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -583,10 +705,10 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c,
if (!dev)
return -EINVAL;
- for_each_online_member(ca, c, i)
+ for_each_online_member(c, ca)
if (ca->dev == dev) {
percpu_ref_put(&ca->io_ref);
- return i;
+ return ca->dev_idx;
}
return -BCH_ERR_ENOENT_dev_idx_not_found;
@@ -641,6 +763,97 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
return ret;
}
+/*
+ * Thread body for online fsck on the filesystem in thr->c.
+ *
+ * Points c->stdio at this thread's stdio, overrides c->opts.fix_errors for
+ * the duration of the run, runs the online recovery passes starting at
+ * check_alloc_info, then undoes the stdio and fix_errors changes and drops
+ * the online_fsck_mutex and ro ref that bch2_ioctl_fsck_online() took.
+ * Always returns 0; a failure of the recovery passes is only reported
+ * through bch_err_fn().
+ */
+static int bch2_fsck_online_thread_fn(void *arg)
+{
+ struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr);
+ struct bch_fs *c = thr->c;
+
+ c->stdio_filter = current;
+ c->stdio = &thr->thr.stdio;
+
+ /*
+ * XXX: can we figure out a way to do this without mucking with c->opts?
+ */
+ unsigned old_fix_errors = c->opts.fix_errors;
+ /* use the caller's fix_errors if one was given, otherwise ask */
+ if (opt_defined(thr->opts, fix_errors))
+ c->opts.fix_errors = thr->opts.fix_errors;
+ else
+ c->opts.fix_errors = FSCK_FIX_ask;
+
+ /* NOTE(review): unlike fix_errors, opts.fsck is not restored below - confirm intended */
+ c->opts.fsck = true;
+ set_bit(BCH_FS_fsck_running, &c->flags);
+
+ c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;
+ int ret = bch2_run_online_recovery_passes(c);
+
+ clear_bit(BCH_FS_fsck_running, &c->flags);
+ bch_err_fn(c, ret);
+
+ c->stdio = NULL;
+ c->stdio_filter = NULL;
+ c->opts.fix_errors = old_fix_errors;
+
+ thread_with_stdio_done(&thr->thr);
+
+ /* release what bch2_ioctl_fsck_online() acquired before starting us */
+ up(&c->online_fsck_mutex);
+ bch2_ro_ref_put(c);
+ return 0;
+}
+
+/*
+ * BCH_IOCTL_FSCK_ONLINE handler: start an fsck thread on filesystem @c.
+ *
+ * Takes a ro ref on @c and the online_fsck_mutex semaphore (trylock, so a
+ * second concurrent request gets -EAGAIN), parses the optional option
+ * string into the thread's opts, and starts bch2_fsck_online_thread_fn()
+ * via bch2_run_thread_with_stdio(), whose result is returned.
+ *
+ * On success the thread function releases the semaphore and the ro ref;
+ * on any negative result they are released here and the thread state is
+ * freed.
+ *
+ * Errors: -EINVAL if any flag is set, -EPERM without CAP_SYS_ADMIN,
+ * -EROFS if no ro ref could be taken, -EAGAIN if the semaphore is held,
+ * -ENOMEM on allocation failure, otherwise whatever strndup_user(),
+ * bch2_parse_mount_opts() or bch2_run_thread_with_stdio() returned.
+ */
+static long bch2_ioctl_fsck_online(struct bch_fs *c,
+				   struct bch_ioctl_fsck_online arg)
+{
+	struct fsck_thread *thr = NULL;
+	long ret = 0;
+
+	if (arg.flags)
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (!bch2_ro_ref_tryget(c))
+		return -EROFS;
+
+	if (down_trylock(&c->online_fsck_mutex)) {
+		bch2_ro_ref_put(c);
+		return -EAGAIN;
+	}
+
+	thr = kzalloc(sizeof(*thr), GFP_KERNEL);
+	if (!thr) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	thr->c = c;
+	thr->opts = bch2_opts_empty();
+
+	if (arg.opts) {
+		char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
+
+		ret = PTR_ERR_OR_ZERO(optstr) ?:
+			bch2_parse_mount_opts(c, &thr->opts, optstr);
+		/* on failure optstr is an ERR_PTR, not memory we own */
+		if (!IS_ERR(optstr))
+			kfree(optstr);
+
+		if (ret)
+			goto err;
+	}
+
+	ret = bch2_run_thread_with_stdio(&thr->thr,
+			bch2_fsck_thread_exit,
+			bch2_fsck_online_thread_fn);
+err:
+	if (ret < 0) {
+		bch_err_fn(c, ret);
+		if (thr)
+			bch2_fsck_thread_exit(&thr->thr);
+		up(&c->online_fsck_mutex);
+		bch2_ro_ref_put(c);
+	}
+	return ret;
+}
+
#define BCH_IOCTL(_name, _argtype) \
do { \
_argtype i; \
@@ -662,6 +875,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
return bch2_ioctl_fs_usage(c, arg);
case BCH_IOCTL_DEV_USAGE:
return bch2_ioctl_dev_usage(c, arg);
+ case BCH_IOCTL_DEV_USAGE_V2:
+ return bch2_ioctl_dev_usage_v2(c, arg);
#if 0
case BCH_IOCTL_START:
BCH_IOCTL(start, struct bch_ioctl_start);
@@ -674,7 +889,7 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx);
}
- if (!test_bit(BCH_FS_STARTED, &c->flags))
+ if (!test_bit(BCH_FS_started, &c->flags))
return -EINVAL;
switch (cmd) {
@@ -694,7 +909,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize);
case BCH_IOCTL_DISK_RESIZE_JOURNAL:
BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal);
-
+ case BCH_IOCTL_FSCK_ONLINE:
+ BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online);
default:
return -ENOTTY;
}
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
index 13998388c5..1b8c2c1016 100644
--- a/fs/bcachefs/checksum.h
+++ b/fs/bcachefs/checksum.h
@@ -45,6 +45,29 @@ struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce,
bch2_checksum(_c, _type, _nonce, _start, vstruct_end(_i) - _start);\
})
+/*
+ * Print @csum to @out as a run of hex bytes.
+ *
+ * The number of bytes printed is bch_crc_bytes[@type] for a known checksum
+ * type; for an out-of-range @type, 16 bytes are printed.
+ */
+static inline void bch2_csum_to_text(struct printbuf *out,
+				     enum bch_csum_type type,
+				     struct bch_csum csum)
+{
+	const u8 *byte = (u8 *) &csum;
+	unsigned nr_bytes = 16;
+
+	if (type < BCH_CSUM_NR)
+		nr_bytes = bch_crc_bytes[type];
+
+	for (const u8 *end = byte + nr_bytes; byte < end; byte++)
+		prt_hex_byte(out, *byte);
+}
+
+/*
+ * Format a checksum mismatch message into @out:
+ * "checksum error: got <hex> should be <hex> type <name>".
+ *
+ * @type:	checksum type; may be out of range (e.g. read from a corrupt
+ *		key), in which case the numeric value is printed instead of
+ *		indexing past the end of the name table
+ * @expected:	checksum that was stored
+ * @got:	checksum that was computed
+ */
+static inline void bch2_csum_err_msg(struct printbuf *out,
+				     enum bch_csum_type type,
+				     struct bch_csum expected,
+				     struct bch_csum got)
+{
+	prt_printf(out, "checksum error: got ");
+	bch2_csum_to_text(out, type, got);
+	prt_str(out, " should be ");
+	bch2_csum_to_text(out, type, expected);
+
+	/* Same bound bch2_csum_to_text() applies before using @type as an index */
+	if (type < BCH_CSUM_NR)
+		prt_printf(out, " type %s", bch2_csum_types[type]);
+	else
+		prt_printf(out, " type (invalid checksum type %u)", type);
+}
+
int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
int bch2_request_key(struct bch_sb *, struct bch_key *);
#ifndef __KERNEL__
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
index 51af8ea230..33df8cf86b 100644
--- a/fs/bcachefs/compress.c
+++ b/fs/bcachefs/compress.c
@@ -572,10 +572,6 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
ZSTD_parameters params = zstd_get_params(zstd_max_clevel(),
c->opts.encoded_extent_max);
- /*
- * ZSTD is lying: if we allocate the size of the workspace it says it
- * requires, it returns memory allocation errors
- */
c->zstd_workspace_size = zstd_cctx_workspace_bound(&params.cParams);
struct {
diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h
index 607fd5e232..58c2eb4557 100644
--- a/fs/bcachefs/compress.h
+++ b/fs/bcachefs/compress.h
@@ -47,6 +47,14 @@ static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v)
return __bch2_compression_opt_to_type[bch2_compression_decode(v).type];
}
+/*
+ * Print the name of compression type @type to @out, or a placeholder
+ * containing the numeric value when @type is out of range.
+ */
+static inline void bch2_prt_compression_type(struct printbuf *out, enum bch_compression_type type)
+{
+	if (type >= BCH_COMPRESSION_TYPE_NR) {
+		prt_printf(out, "(invalid compression type %u)", type);
+		return;
+	}
+
+	prt_str(out, __bch2_compression_types[type]);
+}
+
int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *,
struct bch_extent_crc_unpacked *);
int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h
index e367c625f0..4b340d13ca 100644
--- a/fs/bcachefs/darray.h
+++ b/fs/bcachefs/darray.h
@@ -20,6 +20,7 @@ struct { \
#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0)
typedef DARRAY(char) darray_char;
+typedef DARRAY(char *) darray_str;
int __bch2_darray_resize(darray_char *, size_t, size_t, gfp_t);
@@ -81,11 +82,14 @@ static inline int __darray_make_room(darray_char *d, size_t t_size, size_t more,
#define darray_remove_item(_d, _pos) \
array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data)
+/*
+ * Iterate over darray @_d using a caller-declared element pointer @_i.
+ * @_i is parenthesized at every use so that any lvalue expression may be
+ * passed, not just a plain identifier.
+ */
+#define __darray_for_each(_d, _i)						\
+	for ((_i) = (_d).data; (_i) < (_d).data + (_d).nr; (_i)++)
+
#define darray_for_each(_d, _i) \
- for (_i = (_d).data; _i < (_d).data + (_d).nr; _i++)
+ for (typeof(&(_d).data[0]) _i = (_d).data; _i < (_d).data + (_d).nr; _i++)
#define darray_for_each_reverse(_d, _i) \
- for (_i = (_d).data + (_d).nr - 1; _i >= (_d).data; --_i)
+ for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data; --_i)
#define darray_init(_d) \
do { \
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 37d6ecae8c..4150feca42 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -267,19 +267,31 @@ restart_drop_extra_replicas:
goto out;
}
+ if (trace_data_update_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "\nold: ");
+ bch2_bkey_val_to_text(&buf, c, old);
+ prt_str(&buf, "\nk: ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_str(&buf, "\nnew: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+
+ trace_data_update(c, buf.buf);
+ printbuf_exit(&buf);
+ }
+
ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, bkey_start_pos(&insert->k)) ?:
bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, insert->k.p) ?:
- bch2_bkey_set_needs_rebalance(c, insert,
- op->opts.background_target,
- op->opts.background_compression) ?:
+ bch2_bkey_set_needs_rebalance(c, insert, &op->opts) ?:
bch2_trans_update(trans, &iter, insert,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, &op->res,
NULL,
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL|
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc|
m->data_opts.btree_insert_flags);
if (!ret) {
bch2_btree_iter_set_pos(&iter, next_pos);
@@ -300,14 +312,14 @@ next:
}
continue;
nowork:
- if (m->stats && m->stats) {
+ if (m->stats) {
BUG_ON(k.k->p.offset <= iter.pos.offset);
atomic64_inc(&m->stats->keys_raced);
atomic64_add(k.k->p.offset - iter.pos.offset,
&m->stats->sectors_raced);
}
- this_cpu_inc(c->counters[BCH_COUNTER_move_extent_fail]);
+ count_event(c, move_extent_fail);
bch2_btree_iter_advance(&iter);
goto next;
@@ -342,7 +354,6 @@ void bch2_data_update_exit(struct data_update *update)
struct bch_fs *c = update->op.c;
struct bkey_ptrs_c ptrs =
bch2_bkey_ptrs_c(bkey_i_to_s_c(update->k.k));
- const struct bch_extent_ptr *ptr;
bkey_for_each_ptr(ptrs, ptr) {
if (c->opts.nocow_enabled)
@@ -363,7 +374,6 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans,
struct bio *bio = &update->op.wbio.bio;
struct bkey_i_extent *e;
struct write_point *wp;
- struct bch_extent_ptr *ptr;
struct closure cl;
struct btree_iter iter;
struct bkey_s_c k;
@@ -404,6 +414,8 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans,
continue;
}
+ bch_err_fn_ratelimited(c, ret);
+
if (ret)
return;
@@ -476,7 +488,7 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
return bch2_trans_relock(trans) ?:
bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
- bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}
int bch2_data_update_init(struct btree_trans *trans,
@@ -493,7 +505,6 @@ int bch2_data_update_init(struct btree_trans *trans,
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
- const struct bch_extent_ptr *ptr;
unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas;
unsigned ptrs_locked = 0;
int ret = 0;
@@ -516,7 +527,7 @@ int bch2_data_update_init(struct btree_trans *trans,
BCH_WRITE_DATA_ENCODED|
BCH_WRITE_MOVE|
m->data_opts.write_flags;
- m->op.compression_opt = io_opts.background_compression ?: io_opts.compression;
+ m->op.compression_opt = background_compression(io_opts);
m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;
bkey_for_each_ptr(ptrs, ptr)
@@ -639,7 +650,6 @@ done:
void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
unsigned i = 0;
bkey_for_each_ptr(ptrs, ptr) {
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 57c5128db1..7bdba8507f 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -44,19 +44,19 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
return false;
bio = bio_alloc_bioset(ca->disk_sb.bdev,
- buf_pages(n_sorted, btree_bytes(c)),
+ buf_pages(n_sorted, btree_buf_bytes(b)),
REQ_OP_READ|REQ_META,
GFP_NOFS,
&c->btree_bio);
bio->bi_iter.bi_sector = pick.ptr.offset;
- bch2_bio_map(bio, n_sorted, btree_bytes(c));
+ bch2_bio_map(bio, n_sorted, btree_buf_bytes(b));
submit_bio_wait(bio);
bio_put(bio);
percpu_ref_put(&ca->io_ref);
- memcpy(n_ondisk, n_sorted, btree_bytes(c));
+ memcpy(n_ondisk, n_sorted, btree_buf_bytes(b));
v->written = 0;
if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error)
@@ -137,7 +137,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
mutex_lock(&c->verify_lock);
if (!c->verify_ondisk) {
- c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
+ c->verify_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL);
if (!c->verify_ondisk)
goto out;
}
@@ -199,19 +199,19 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
return;
}
- n_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
+ n_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL);
if (!n_ondisk) {
prt_printf(out, "memory allocation failure\n");
goto out;
}
bio = bio_alloc_bioset(ca->disk_sb.bdev,
- buf_pages(n_ondisk, btree_bytes(c)),
+ buf_pages(n_ondisk, btree_buf_bytes(b)),
REQ_OP_READ|REQ_META,
GFP_NOFS,
&c->btree_bio);
bio->bi_iter.bi_sector = pick.ptr.offset;
- bch2_bio_map(bio, n_ondisk, btree_bytes(c));
+ bch2_bio_map(bio, n_ondisk, btree_buf_bytes(b));
ret = submit_bio_wait(bio);
if (ret) {
@@ -293,7 +293,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
out:
if (bio)
bio_put(bio);
- kvpfree(n_ondisk, btree_bytes(c));
+ kvpfree(n_ondisk, btree_buf_bytes(b));
percpu_ref_put(&ca->io_ref);
}
@@ -366,35 +366,23 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
size_t size, loff_t *ppos)
{
struct dump_iter *i = file->private_data;
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- ssize_t ret;
i->ubuf = buf;
i->size = size;
i->ret = 0;
- ret = flush_buf(i);
- if (ret)
- return ret;
-
- trans = bch2_trans_get(i->c);
- ret = for_each_btree_key2(trans, iter, i->id, i->from,
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k, ({
- bch2_bkey_val_to_text(&i->buf, i->c, k);
- prt_newline(&i->buf);
- drop_locks_do(trans, flush_buf(i));
- }));
- i->from = iter.pos;
-
- bch2_trans_put(trans);
-
- if (!ret)
- ret = flush_buf(i);
-
- return ret ?: i->ret;
+ return flush_buf(i) ?:
+ bch2_trans_run(i->c,
+ for_each_btree_key(trans, iter, i->id, i->from,
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ bch2_bkey_val_to_text(&i->buf, i->c, k);
+ prt_newline(&i->buf);
+ bch2_trans_unlock(trans);
+ i->from = bpos_successor(iter.pos);
+ flush_buf(i);
+ }))) ?:
+ i->ret;
}
static const struct file_operations btree_debug_ops = {
@@ -462,44 +450,32 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
size_t size, loff_t *ppos)
{
struct dump_iter *i = file->private_data;
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- ssize_t ret;
i->ubuf = buf;
i->size = size;
i->ret = 0;
- ret = flush_buf(i);
- if (ret)
- return ret;
-
- trans = bch2_trans_get(i->c);
-
- ret = for_each_btree_key2(trans, iter, i->id, i->from,
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k, ({
- struct btree_path_level *l = &iter.path->l[0];
- struct bkey_packed *_k =
- bch2_btree_node_iter_peek(&l->iter, l->b);
-
- if (bpos_gt(l->b->key.k.p, i->prev_node)) {
- bch2_btree_node_to_text(&i->buf, i->c, l->b);
- i->prev_node = l->b->key.k.p;
- }
-
- bch2_bfloat_to_text(&i->buf, l->b, _k);
- drop_locks_do(trans, flush_buf(i));
- }));
- i->from = iter.pos;
-
- bch2_trans_put(trans);
-
- if (!ret)
- ret = flush_buf(i);
-
- return ret ?: i->ret;
+ return flush_buf(i) ?:
+ bch2_trans_run(i->c,
+ for_each_btree_key(trans, iter, i->id, i->from,
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ struct btree_path_level *l =
+ &btree_iter_path(trans, &iter)->l[0];
+ struct bkey_packed *_k =
+ bch2_btree_node_iter_peek(&l->iter, l->b);
+
+ if (bpos_gt(l->b->key.k.p, i->prev_node)) {
+ bch2_btree_node_to_text(&i->buf, i->c, l->b);
+ i->prev_node = l->b->key.k.p;
+ }
+
+ bch2_bfloat_to_text(&i->buf, l->b, _k);
+ bch2_trans_unlock(trans);
+ i->from = bpos_successor(iter.pos);
+ flush_buf(i);
+ }))) ?:
+ i->ret;
}
static const struct file_operations bfloat_failed_debug_ops = {
@@ -616,7 +592,6 @@ static const struct file_operations cached_btree_nodes_ops = {
.read = bch2_cached_btree_nodes_read,
};
-#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS
static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
size_t size, loff_t *ppos)
{
@@ -632,7 +607,9 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
restart:
seqmutex_lock(&c->btree_trans_lock);
list_for_each_entry(trans, &c->btree_trans_list, list) {
- if (trans->locking_wait.task->pid <= i->iter)
+ struct task_struct *task = READ_ONCE(trans->locking_wait.task);
+
+ if (!task || task->pid <= i->iter)
continue;
closure_get(&trans->ref);
@@ -650,11 +627,11 @@ restart:
prt_printf(&i->buf, "backtrace:");
prt_newline(&i->buf);
printbuf_indent_add(&i->buf, 2);
- bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task);
+ bch2_prt_task_backtrace(&i->buf, task, 0, GFP_KERNEL);
printbuf_indent_sub(&i->buf, 2);
prt_newline(&i->buf);
- i->iter = trans->locking_wait.task->pid;
+ i->iter = task->pid;
closure_put(&trans->ref);
@@ -678,7 +655,6 @@ static const struct file_operations btree_transactions_ops = {
.release = bch2_dump_release,
.read = bch2_btree_transactions_read,
};
-#endif /* CONFIG_BCACHEFS_DEBUG_TRANSACTIONS */
static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
size_t size, loff_t *ppos)
@@ -717,7 +693,7 @@ static const struct file_operations journal_pins_ops = {
.read = bch2_journal_pins_read,
};
-static int lock_held_stats_open(struct inode *inode, struct file *file)
+static int btree_transaction_stats_open(struct inode *inode, struct file *file)
{
struct bch_fs *c = inode->i_private;
struct dump_iter *i;
@@ -727,7 +703,7 @@ static int lock_held_stats_open(struct inode *inode, struct file *file)
if (!i)
return -ENOMEM;
- i->iter = 0;
+ i->iter = 1;
i->c = c;
i->buf = PRINTBUF;
file->private_data = i;
@@ -735,7 +711,7 @@ static int lock_held_stats_open(struct inode *inode, struct file *file)
return 0;
}
-static int lock_held_stats_release(struct inode *inode, struct file *file)
+static int btree_transaction_stats_release(struct inode *inode, struct file *file)
{
struct dump_iter *i = file->private_data;
@@ -745,8 +721,8 @@ static int lock_held_stats_release(struct inode *inode, struct file *file)
return 0;
}
-static ssize_t lock_held_stats_read(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
+static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
{
struct dump_iter *i = file->private_data;
struct bch_fs *c = i->c;
@@ -779,6 +755,13 @@ static ssize_t lock_held_stats_read(struct file *file, char __user *buf,
prt_printf(&i->buf, "Max mem used: %u", s->max_mem);
prt_newline(&i->buf);
+ prt_printf(&i->buf, "Transaction duration:");
+ prt_newline(&i->buf);
+
+ printbuf_indent_add(&i->buf, 2);
+ bch2_time_stats_to_text(&i->buf, &s->duration);
+ printbuf_indent_sub(&i->buf, 2);
+
if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) {
prt_printf(&i->buf, "Lock hold times:");
prt_newline(&i->buf);
@@ -810,11 +793,11 @@ static ssize_t lock_held_stats_read(struct file *file, char __user *buf,
return i->ret;
}
-static const struct file_operations lock_held_stats_op = {
- .owner = THIS_MODULE,
- .open = lock_held_stats_open,
- .release = lock_held_stats_release,
- .read = lock_held_stats_read,
+static const struct file_operations btree_transaction_stats_op = {
+ .owner = THIS_MODULE,
+ .open = btree_transaction_stats_open,
+ .release = btree_transaction_stats_release,
+ .read = btree_transaction_stats_read,
};
static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
@@ -835,7 +818,9 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
restart:
seqmutex_lock(&c->btree_trans_lock);
list_for_each_entry(trans, &c->btree_trans_list, list) {
- if (trans->locking_wait.task->pid <= i->iter)
+ struct task_struct *task = READ_ONCE(trans->locking_wait.task);
+
+ if (!task || task->pid <= i->iter)
continue;
closure_get(&trans->ref);
@@ -850,7 +835,7 @@ restart:
bch2_check_for_deadlock(trans, &i->buf);
- i->iter = trans->locking_wait.task->pid;
+ i->iter = task->pid;
closure_put(&trans->ref);
@@ -897,16 +882,14 @@ void bch2_fs_debug_init(struct bch_fs *c)
debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir,
c->btree_debug, &cached_btree_nodes_ops);
-#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS
debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir,
c->btree_debug, &btree_transactions_ops);
-#endif
debugfs_create_file("journal_pins", 0400, c->fs_debug_dir,
c->btree_debug, &journal_pins_ops);
debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir,
- c, &lock_held_stats_op);
+ c, &btree_transaction_stats_op);
debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir,
c->btree_debug, &btree_deadlock_ops);
@@ -947,8 +930,6 @@ void bch2_debug_exit(void)
int __init bch2_debug_init(void)
{
- int ret = 0;
-
bch_debug = debugfs_create_dir("bcachefs", NULL);
- return ret;
+ return 0;
}
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index 2bfff0da70..4ae1e9f002 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -65,7 +65,7 @@ static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r)
const struct qstr l_name = bch2_dirent_get_name(l);
const struct qstr *r_name = _r;
- return l_name.len - r_name->len ?: memcmp(l_name.name, r_name->name, l_name.len);
+ return !qstr_eq(l_name, *r_name);
}
static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
@@ -75,7 +75,7 @@ static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
const struct qstr l_name = bch2_dirent_get_name(l);
const struct qstr r_name = bch2_dirent_get_name(r);
- return l_name.len - r_name.len ?: memcmp(l_name.name, r_name.name, l_name.len);
+ return !qstr_eq(l_name, r_name);
}
static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k)
@@ -198,10 +198,39 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
return dirent;
}
+/*
+ * Create a dirent in directory inode @dir within a specific @snapshot,
+ * addressing the directory by (inode, snapshot) instead of by subvol_inum.
+ *
+ * @type/@name/@dst_inum describe the new entry; @str_hash_flags is passed
+ * through to the hash-set operation. *@dir_offset receives the offset of the
+ * dirent key once the hash-set operation has been attempted (it is written
+ * regardless of that operation's result).
+ *
+ * Returns 0 on success or an error from key creation / the hash-set.
+ */
+int bch2_dirent_create_snapshot(struct btree_trans *trans,
+			u64 dir, u32 snapshot,
+			const struct bch_hash_info *hash_info,
+			u8 type, const struct qstr *name, u64 dst_inum,
+			u64 *dir_offset,
+			bch_str_hash_flags_t str_hash_flags)
+{
+	subvol_inum zero_inum = { 0 };
+	struct bkey_i_dirent *new_dirent =
+		dirent_create_key(trans, zero_inum, type, name, dst_inum);
+
+	if (IS_ERR(new_dirent))
+		return PTR_ERR(new_dirent);
+
+	/* The key was built without a directory; fill in its position here */
+	new_dirent->k.p.inode		= dir;
+	new_dirent->k.p.snapshot	= snapshot;
+
+	int ret = bch2_hash_set_snapshot(trans, bch2_dirent_hash_desc, hash_info,
+					 zero_inum, snapshot,
+					 &new_dirent->k_i, str_hash_flags,
+					 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+
+	*dir_offset = new_dirent->k.p.offset;
+	return ret;
+}
+
int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
const struct bch_hash_info *hash_info,
u8 type, const struct qstr *name, u64 dst_inum,
- u64 *dir_offset, int flags)
+ u64 *dir_offset,
+ bch_str_hash_flags_t str_hash_flags)
{
struct bkey_i_dirent *dirent;
int ret;
@@ -212,7 +241,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
return ret;
ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
- dir, &dirent->k_i, flags);
+ dir, &dirent->k_i, str_hash_flags);
*dir_offset = dirent->k.p.offset;
return ret;
@@ -470,17 +499,11 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,
const struct qstr *name, subvol_inum *inum)
{
struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- int ret;
-retry:
- bch2_trans_begin(trans);
+ struct btree_iter iter = { NULL };
- ret = __bch2_dirent_lookup_trans(trans, &iter, dir, hash_info,
- name, inum, 0);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
- if (!ret)
- bch2_trans_iter_exit(trans, &iter);
+ int ret = lockrestart_do(trans,
+ __bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0));
+ bch2_trans_iter_exit(trans, &iter);
bch2_trans_put(trans);
return ret;
}
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
index 1e3431990a..21ffeb78f0 100644
--- a/fs/bcachefs/dirent.h
+++ b/fs/bcachefs/dirent.h
@@ -35,9 +35,14 @@ static inline unsigned dirent_val_u64s(unsigned len)
int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
struct bkey_s_c_dirent, subvol_inum *);
+int bch2_dirent_create_snapshot(struct btree_trans *, u64, u32,
+ const struct bch_hash_info *, u8,
+ const struct qstr *, u64, u64 *,
+ bch_str_hash_flags_t);
int bch2_dirent_create(struct btree_trans *, subvol_inum,
const struct bch_hash_info *, u8,
- const struct qstr *, u64, u64 *, int);
+ const struct qstr *, u64, u64 *,
+ bch_str_hash_flags_t);
static inline unsigned vfs_d_type(unsigned type)
{
diff --git a/fs/bcachefs/dirent_format.h b/fs/bcachefs/dirent_format.h
new file mode 100644
index 0000000000..5e116b88e8
--- /dev/null
+++ b/fs/bcachefs/dirent_format.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DIRENT_FORMAT_H
+#define _BCACHEFS_DIRENT_FORMAT_H
+
+/*
+ * Dirents (and xattrs) have to implement string lookups; since our b-tree
+ * doesn't support arbitrary length strings for the key, we instead index by a
+ * 64 bit hash (currently truncated sha1) of the string, stored in the offset
+ * field of the key - using linear probing to resolve hash collisions. This also
+ * provides us with the readdir cookie posix requires.
+ *
+ * Linear probing requires us to use whiteouts for deletions, in the event of a
+ * collision:
+ */
+
+/*
+ * Value of a dirent key: the target of the entry, its type, and the name as
+ * a trailing variable-length byte array. The struct is packed and 8-byte
+ * aligned.
+ */
+struct bch_dirent {
+ struct bch_val v;
+
+ /* Target inode number: */
+ union {
+ __le64 d_inum;
+ struct { /* DT_SUBVOL */
+ __le32 d_child_subvol;
+ __le32 d_parent_subvol;
+ };
+ };
+
+ /*
+ * Copy of mode bits 12-15 from the target inode - so userspace can get
+ * the filetype without having to do a stat()
+ */
+ __u8 d_type;
+
+ /* Entry name; flexible array member, so the length is not stored here */
+ __u8 d_name[];
+} __packed __aligned(8);
+
+/* d_type value for which the d_child_subvol/d_parent_subvol arm is used */
+#define DT_SUBVOL 16
+/* One past DT_SUBVOL - presumably the exclusive upper bound for d_type; confirm */
+#define BCH_DT_MAX 17
+
+/* NOTE(review): presumably the maximum dirent name length in bytes - confirm */
+#define BCH_NAME_MAX 512
+
+#endif /* _BCACHEFS_DIRENT_FORMAT_H */
diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c
index 4d0cb0ccff..06a7df529b 100644
--- a/fs/bcachefs/disk_groups.c
+++ b/fs/bcachefs/disk_groups.c
@@ -89,19 +89,14 @@ err:
void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c)
{
- struct bch_disk_groups_cpu *g;
- struct bch_dev *ca;
- int i;
- unsigned iter;
-
out->atomic++;
rcu_read_lock();
- g = rcu_dereference(c->disk_groups);
+ struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
if (!g)
goto out;
- for (i = 0; i < g->nr; i++) {
+ for (unsigned i = 0; i < g->nr; i++) {
if (i)
prt_printf(out, " ");
@@ -111,7 +106,7 @@ void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c)
}
prt_printf(out, "[parent %d devs", g->entries[i].parent);
- for_each_member_device_rcu(ca, c, iter, &g->entries[i].devs)
+ for_each_member_device_rcu(c, ca, &g->entries[i].devs)
prt_printf(out, " %s", ca->name);
prt_printf(out, "]");
}
@@ -562,7 +557,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
: NULL;
if (ca && percpu_ref_tryget(&ca->io_ref)) {
- prt_printf(out, "/dev/%pg", ca->disk_sb.bdev);
+ prt_printf(out, "/dev/%s", ca->name);
percpu_ref_put(&ca->io_ref);
} else if (ca) {
prt_printf(out, "offline device %u", t.dev);
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 2a77de18c0..d503af2700 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -3,6 +3,7 @@
/* erasure coding */
#include "bcachefs.h"
+#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "bkey_buf.h"
@@ -156,12 +157,311 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
}
}
+/* Triggers: */
+
+/*
+ * Transactional trigger helper: update the alloc key of the bucket referenced
+ * by block @idx of stripe @s, either attaching the bucket to the stripe
+ * (@deleting == false) or detaching it (@deleting == true).
+ *
+ * Returns 0 on success, -EIO if the bucket's current state is inconsistent
+ * with the requested change, or an error from the alloc update / bucket ref
+ * check / transaction update.
+ */
+static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
+ struct bkey_s_c_stripe s,
+ unsigned idx, bool deleting)
+{
+ struct bch_fs *c = trans->c;
+ const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
+ struct btree_iter iter;
+ struct bkey_i_alloc_v4 *a;
+ /* Blocks at or past (nr_blocks - nr_redundant) are parity blocks */
+ enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant
+ ? BCH_DATA_parity : 0;
+ /* Only parity blocks contribute sectors here; data blocks contribute 0 */
+ s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0;
+ int ret = 0;
+
+ if (deleting)
+ sectors = -sectors;
+
+ a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr));
+ if (IS_ERR(a))
+ return PTR_ERR(a);
+
+ ret = bch2_check_bucket_ref(trans, s.s_c, ptr, sectors, data_type,
+ a->v.gen, a->v.data_type,
+ a->v.dirty_sectors);
+ if (ret)
+ goto err;
+
+ if (!deleting) {
+ /* The bucket must not already belong to a stripe */
+ if (bch2_trans_inconsistent_on(a->v.stripe ||
+ a->v.stripe_redundancy, trans,
+ "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)",
+ iter.pos.inode, iter.pos.offset, a->v.gen,
+ bch2_data_type_str(a->v.data_type),
+ a->v.dirty_sectors,
+ a->v.stripe, s.k->p.offset)) {
+ ret = -EIO;
+ goto err;
+ }
+
+ /* A parity bucket must not already contain dirty data */
+ if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans,
+ "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu",
+ iter.pos.inode, iter.pos.offset, a->v.gen,
+ bch2_data_type_str(a->v.data_type),
+ a->v.dirty_sectors,
+ s.k->p.offset)) {
+ ret = -EIO;
+ goto err;
+ }
+
+ a->v.stripe = s.k->p.offset;
+ a->v.stripe_redundancy = s.v->nr_redundant;
+ a->v.data_type = BCH_DATA_stripe;
+ } else {
+ /* The bucket must currently be marked as belonging to this stripe */
+ if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset ||
+ a->v.stripe_redundancy != s.v->nr_redundant, trans,
+ "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)",
+ iter.pos.inode, iter.pos.offset, a->v.gen,
+ s.k->p.offset, a->v.stripe)) {
+ ret = -EIO;
+ goto err;
+ }
+
+ a->v.stripe = 0;
+ a->v.stripe_redundancy = 0;
+ a->v.data_type = alloc_data_type(a->v, BCH_DATA_user);
+ }
+
+ a->v.dirty_sectors += sectors;
+ /* For parity blocks this overrides the data_type chosen above */
+ if (data_type)
+ a->v.data_type = !deleting ? data_type : 0;
+
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+ /* This check is redundant: the err label follows immediately either way */
+ if (ret)
+ goto err;
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+/*
+ * GC-time marking of one bucket referenced by a stripe.
+ *
+ * @k:		stripe key
+ * @ptr_idx:	index of the block/pointer within the stripe
+ * @flags:	trigger flags; BTREE_TRIGGER_GC must be set
+ *
+ * Updates the GC copy of the bucket (data type, dirty sectors, owning stripe
+ * and redundancy) under the bucket lock, then folds the change into device
+ * usage. Runs with c->mark_lock held for read.
+ *
+ * Returns 0 on success, -EINVAL if the bucket already holds dirty data or
+ * belongs to a different stripe, or the error from bch2_check_bucket_ref().
+ */
+static int mark_stripe_bucket(struct btree_trans *trans,
+			      struct bkey_s_c k,
+			      unsigned ptr_idx,
+			      unsigned flags)
+{
+	struct bch_fs *c = trans->c;
+	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+	unsigned nr_data = s->nr_blocks - s->nr_redundant;
+	bool parity = ptr_idx >= nr_data;
+	enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe;
+	s64 sectors = parity ? le16_to_cpu(s->sectors) : 0;
+	const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx;
+	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+	struct bucket old, new, *g;
+	struct printbuf buf = PRINTBUF;
+	int ret = 0;
+
+	BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
+	/* XXX doesn't handle deletion */
+
+	percpu_down_read(&c->mark_lock);
+	g = PTR_GC_BUCKET(ca, ptr);
+
+	if (g->dirty_sectors ||
+	    (g->stripe && g->stripe != k.k->p.offset)) {
+		bch2_fs_inconsistent(c,
+			      "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
+			      ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen,
+			      (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+		ret = -EINVAL;
+		/*
+		 * The bucket lock has not been taken yet: skip the unlock,
+		 * otherwise we would release a lock we do not hold.
+		 */
+		goto out;
+	}
+
+	bucket_lock(g);
+	old = *g;
+
+	ret = bch2_check_bucket_ref(trans, k, ptr, sectors, data_type,
+				    g->gen, g->data_type,
+				    g->dirty_sectors);
+	if (ret)
+		goto err;
+
+	g->data_type = data_type;
+	g->dirty_sectors += sectors;
+
+	g->stripe = k.k->p.offset;
+	g->stripe_redundancy = s->nr_redundant;
+	new = *g;
+err:
+	bucket_unlock(g);
+	/* old/new are only both valid when the update went through */
+	if (!ret)
+		bch2_dev_usage_update_m(c, ca, &old, &new);
+out:
+	percpu_up_read(&c->mark_lock);
+	printbuf_exit(&buf);
+	return ret;
+}
+
+/*
+ * Trigger for stripe keys. Depending on which of the TRANSACTIONAL, ATOMIC
+ * and GC bits are set in @flags, this:
+ *  - TRANSACTIONAL: updates the replicas list and the alloc keys of the
+ *    buckets whose pointers changed between @old and @_new
+ *  - ATOMIC: updates the in-memory stripe entry (c->stripes) and the stripes
+ *    heap
+ *  - GC: fills in the gc_stripes entry, marks every bucket of the new stripe
+ *    and updates replicas accounting
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int bch2_trigger_stripe(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s _new,
+ unsigned flags)
+{
+ struct bkey_s_c new = _new.s_c;
+ struct bch_fs *c = trans->c;
+ u64 idx = new.k->p.offset;
+ /* old_s/new_s are NULL when the respective key is not a stripe */
+ const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
+ ? bkey_s_c_to_stripe(old).v : NULL;
+ const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
+ ? bkey_s_c_to_stripe(new).v : NULL;
+
+ if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ /*
+ * If the pointers aren't changing, we don't need to do anything:
+ */
+ if (new_s && old_s &&
+ new_s->nr_blocks == old_s->nr_blocks &&
+ new_s->nr_redundant == old_s->nr_redundant &&
+ !memcmp(old_s->ptrs, new_s->ptrs,
+ new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
+ return 0;
+
+ /* An existing stripe may not change its block/redundancy counts */
+ BUG_ON(new_s && old_s &&
+ (new_s->nr_blocks != old_s->nr_blocks ||
+ new_s->nr_redundant != old_s->nr_redundant));
+
+ /* Account the new stripe's redundant sectors ... */
+ if (new_s) {
+ s64 sectors = le16_to_cpu(new_s->sectors);
+
+ struct bch_replicas_padded r;
+ bch2_bkey_to_replicas(&r.e, new);
+ int ret = bch2_update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
+ if (ret)
+ return ret;
+ }
+
+ /* ... and subtract the old stripe's */
+ if (old_s) {
+ s64 sectors = -((s64) le16_to_cpu(old_s->sectors));
+
+ struct bch_replicas_padded r;
+ bch2_bkey_to_replicas(&r.e, old);
+ int ret = bch2_update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
+ if (ret)
+ return ret;
+ }
+
+ /* Re-mark only the blocks whose pointer actually changed */
+ unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks;
+ for (unsigned i = 0; i < nr_blocks; i++) {
+ if (new_s && old_s &&
+ !memcmp(&new_s->ptrs[i],
+ &old_s->ptrs[i],
+ sizeof(new_s->ptrs[i])))
+ continue;
+
+ if (new_s) {
+ int ret = bch2_trans_mark_stripe_bucket(trans,
+ bkey_s_c_to_stripe(new), i, false);
+ if (ret)
+ return ret;
+ }
+
+ if (old_s) {
+ int ret = bch2_trans_mark_stripe_bucket(trans,
+ bkey_s_c_to_stripe(old), i, true);
+ if (ret)
+ return ret;
+ }
+ }
+ }
+
+ if (flags & BTREE_TRIGGER_ATOMIC) {
+ struct stripe *m = genradix_ptr(&c->stripes, idx);
+
+ /* No in-memory entry for this stripe index: report and fail */
+ if (!m) {
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf1, c, old);
+ bch2_bkey_val_to_text(&buf2, c, new);
+ bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n"
+ "old %s\n"
+ "new %s", idx, buf1.buf, buf2.buf);
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
+ bch2_inconsistent_error(c);
+ return -1;
+ }
+
+ if (!new_s) {
+ /* Stripe deleted: remove from the heap and clear the entry */
+ bch2_stripes_heap_del(c, m, idx);
+
+ memset(m, 0, sizeof(*m));
+ } else {
+ m->sectors = le16_to_cpu(new_s->sectors);
+ m->algorithm = new_s->algorithm;
+ m->nr_blocks = new_s->nr_blocks;
+ m->nr_redundant = new_s->nr_redundant;
+ m->blocks_nonempty = 0;
+
+ /* Count the blocks with a nonzero block count */
+ for (unsigned i = 0; i < new_s->nr_blocks; i++)
+ m->blocks_nonempty += !!stripe_blockcount_get(new_s, i);
+
+ if (!old_s)
+ bch2_stripes_heap_insert(c, m, idx);
+ else
+ bch2_stripes_heap_update(c, m, idx);
+ }
+ }
+
+ if (flags & BTREE_TRIGGER_GC) {
+ struct gc_stripe *m =
+ genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
+
+ if (!m) {
+ bch_err(c, "error allocating memory for gc_stripes, idx %llu",
+ idx);
+ return -BCH_ERR_ENOMEM_mark_stripe;
+ }
+ /*
+ * This will be wrong when we bring back runtime gc: we should
+ * be unmarking the old key and then marking the new key
+ */
+ /*
+ * NOTE(review): new_s is dereferenced below without a NULL check,
+ * so this branch assumes the new key is a stripe - confirm that
+ * callers never pass a non-stripe new key with BTREE_TRIGGER_GC.
+ */
+ m->alive = true;
+ m->sectors = le16_to_cpu(new_s->sectors);
+ m->nr_blocks = new_s->nr_blocks;
+ m->nr_redundant = new_s->nr_redundant;
+
+ for (unsigned i = 0; i < new_s->nr_blocks; i++)
+ m->ptrs[i] = new_s->ptrs[i];
+
+ bch2_bkey_to_replicas(&m->r.e, new);
+
+ /*
+ * gc recalculates this field from stripe ptr
+ * references:
+ */
+ memset(m->block_sectors, 0, sizeof(m->block_sectors));
+
+ for (unsigned i = 0; i < new_s->nr_blocks; i++) {
+ int ret = mark_stripe_bucket(trans, new, i, flags);
+ if (ret)
+ return ret;
+ }
+
+ int ret = bch2_update_replicas(c, new, &m->r.e,
+ ((s64) m->sectors * m->nr_redundant),
+ 0, true);
+ if (ret) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, new);
+ bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf);
+ printbuf_exit(&buf);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
/* returns blocknr in stripe that we matched: */
static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s,
struct bkey_s_c k, unsigned *block)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
unsigned i, nr_data = s->nr_blocks - s->nr_redundant;
bkey_for_each_ptr(ptrs, ptr)
@@ -791,28 +1091,22 @@ static void ec_stripe_delete_work(struct work_struct *work)
{
struct bch_fs *c =
container_of(work, struct bch_fs, ec_stripe_delete_work);
- struct btree_trans *trans = bch2_trans_get(c);
- int ret;
- u64 idx;
while (1) {
mutex_lock(&c->ec_stripes_heap_lock);
- idx = stripe_idx_to_delete(c);
+ u64 idx = stripe_idx_to_delete(c);
mutex_unlock(&c->ec_stripes_heap_lock);
if (!idx)
break;
- ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
- ec_stripe_delete(trans, idx));
- if (ret) {
- bch_err_fn(c, ret);
+ int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ ec_stripe_delete(trans, idx));
+ bch_err_fn(c, ret);
+ if (ret)
break;
- }
}
- bch2_trans_put(trans);
-
bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
}
@@ -983,8 +1277,8 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
while (1) {
ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL,
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc,
ec_stripe_update_extent(trans, bucket_pos, bucket.gen,
s, &bp_pos));
if (ret)
@@ -1005,7 +1299,7 @@ static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
int ret = 0;
- ret = bch2_btree_write_buffer_flush(trans);
+ ret = bch2_btree_write_buffer_flush_sync(trans);
if (ret)
goto err;
@@ -1121,21 +1415,20 @@ static void ec_stripe_create(struct ec_stripe_new *s)
}
ret = bch2_trans_do(c, &s->res, NULL,
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL,
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc,
ec_stripe_key_update(trans,
bkey_i_to_stripe(&s->new_stripe.key),
!s->have_existing_stripe));
+ bch_err_msg(c, ret, "creating stripe key");
if (ret) {
- bch_err(c, "error creating stripe: error creating stripe key");
goto err;
}
ret = ec_stripe_update_extents(c, &s->new_stripe);
- if (ret) {
- bch_err_msg(c, ret, "creating stripe: error updating pointers");
+ bch_err_msg(c, ret, "error updating extents");
+ if (ret)
goto err;
- }
err:
bch2_disk_reservation_put(c, &s->res);
@@ -1250,18 +1543,17 @@ static int unsigned_cmp(const void *_l, const void *_r)
static unsigned pick_blocksize(struct bch_fs *c,
struct bch_devs_mask *devs)
{
- struct bch_dev *ca;
- unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX];
+ unsigned nr = 0, sizes[BCH_SB_MEMBERS_MAX];
struct {
unsigned nr, size;
} cur = { 0, 0 }, best = { 0, 0 };
- for_each_member_device_rcu(ca, c, i, devs)
+ for_each_member_device_rcu(c, ca, devs)
sizes[nr++] = ca->mi.bucket_size;
sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL);
- for (i = 0; i < nr; i++) {
+ for (unsigned i = 0; i < nr; i++) {
if (sizes[i] != cur.size) {
if (cur.nr > best.nr)
best = cur;
@@ -1344,8 +1636,6 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
enum bch_watermark watermark)
{
struct ec_stripe_head *h;
- struct bch_dev *ca;
- unsigned i;
h = kzalloc(sizeof(*h), GFP_KERNEL);
if (!h)
@@ -1362,13 +1652,13 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
rcu_read_lock();
h->devs = target_rw_devs(c, BCH_DATA_user, target);
- for_each_member_device_rcu(ca, c, i, &h->devs)
+ for_each_member_device_rcu(c, ca, &h->devs)
if (!ca->mi.durability)
- __clear_bit(i, h->devs.d);
+ __clear_bit(ca->dev_idx, h->devs.d);
h->blocksize = pick_blocksize(c, &h->devs);
- for_each_member_device_rcu(ca, c, i, &h->devs)
+ for_each_member_device_rcu(c, ca, &h->devs)
if (ca->mi.bucket_size == h->blocksize)
h->nr_active_devs++;
@@ -1415,7 +1705,7 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans,
if (ret)
return ERR_PTR(ret);
- if (test_bit(BCH_FS_GOING_RO, &c->flags)) {
+ if (test_bit(BCH_FS_going_ro, &c->flags)) {
h = ERR_PTR(-BCH_ERR_erofs_no_writes);
goto found;
}
@@ -1833,44 +2123,32 @@ void bch2_fs_ec_flush(struct bch_fs *c)
int bch2_stripes_read(struct bch_fs *c)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- const struct bch_stripe *s;
- struct stripe *m;
- unsigned i;
- int ret;
-
- for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
- BTREE_ITER_PREFETCH, k, ret) {
- if (k.k->type != KEY_TYPE_stripe)
- continue;
-
- ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
- if (ret)
- break;
-
- s = bkey_s_c_to_stripe(k).v;
-
- m = genradix_ptr(&c->stripes, k.k->p.offset);
- m->sectors = le16_to_cpu(s->sectors);
- m->algorithm = s->algorithm;
- m->nr_blocks = s->nr_blocks;
- m->nr_redundant = s->nr_redundant;
- m->blocks_nonempty = 0;
+ int ret = bch2_trans_run(c,
+ for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ({
+ if (k.k->type != KEY_TYPE_stripe)
+ continue;
- for (i = 0; i < s->nr_blocks; i++)
- m->blocks_nonempty += !!stripe_blockcount_get(s, i);
+ ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
+ if (ret)
+ break;
- bch2_stripes_heap_insert(c, m, k.k->p.offset);
- }
- bch2_trans_iter_exit(trans, &iter);
+ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
- bch2_trans_put(trans);
+ struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset);
+ m->sectors = le16_to_cpu(s->sectors);
+ m->algorithm = s->algorithm;
+ m->nr_blocks = s->nr_blocks;
+ m->nr_redundant = s->nr_redundant;
+ m->blocks_nonempty = 0;
- if (ret)
- bch_err_fn(c, ret);
+ for (unsigned i = 0; i < s->nr_blocks; i++)
+ m->blocks_nonempty += !!stripe_blockcount_get(s, i);
+ bch2_stripes_heap_insert(c, m, k.k->p.offset);
+ 0;
+ })));
+ bch_err_fn(c, ret);
return ret;
}
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
index 7d0237c981..f4369b02e8 100644
--- a/fs/bcachefs/ec.h
+++ b/fs/bcachefs/ec.h
@@ -12,13 +12,14 @@ int bch2_stripe_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
+int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s, unsigned);
#define bch2_bkey_ops_stripe ((struct bkey_ops) { \
.key_invalid = bch2_stripe_invalid, \
.val_to_text = bch2_stripe_to_text, \
.swab = bch2_ptr_swab, \
- .trans_trigger = bch2_trans_mark_stripe, \
- .atomic_trigger = bch2_mark_stripe, \
+ .trigger = bch2_trigger_stripe, \
.min_val_size = 8, \
})
diff --git a/fs/bcachefs/ec_format.h b/fs/bcachefs/ec_format.h
new file mode 100644
index 0000000000..44ce88ba08
--- /dev/null
+++ b/fs/bcachefs/ec_format.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EC_FORMAT_H
+#define _BCACHEFS_EC_FORMAT_H
+
+struct bch_stripe { /* value format of a KEY_TYPE_stripe key (see bch2_bkey_ops_stripe) */
+ struct bch_val v;
+ __le16 sectors; /* little-endian; readers convert with le16_to_cpu() */
+ __u8 algorithm;
+ __u8 nr_blocks; /* number of entries in ptrs[] */
+ __u8 nr_redundant; /* number of data blocks is nr_blocks - nr_redundant */
+
+ __u8 csum_granularity_bits;
+ __u8 csum_type;
+ __u8 pad; /* sectors..pad total 8 bytes; bch2_bkey_ops_stripe sets min_val_size = 8 */
+
+ struct bch_extent_ptr ptrs[]; /* flexible array, indexed by block number */
+} __packed __aligned(8);
+
+#endif /* _BCACHEFS_EC_FORMAT_H */
diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h
index e2b02a82de..976426da3a 100644
--- a/fs/bcachefs/ec_types.h
+++ b/fs/bcachefs/ec_types.h
@@ -5,7 +5,7 @@
#include "bcachefs_format.h"
struct bch_replicas_padded {
- struct bch_replicas_entry e;
+ struct bch_replicas_entry_v1 e;
u8 pad[BCH_BKEY_PTRS_MAX];
};
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index ac90faccdc..8c40c2067a 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -73,7 +73,6 @@
x(ENOMEM, ENOMEM_fsck_add_nlink) \
x(ENOMEM, ENOMEM_journal_key_insert) \
x(ENOMEM, ENOMEM_journal_keys_sort) \
- x(ENOMEM, ENOMEM_journal_replay) \
x(ENOMEM, ENOMEM_read_superblock_clean) \
x(ENOMEM, ENOMEM_fs_alloc) \
x(ENOMEM, ENOMEM_fs_name_alloc) \
@@ -152,7 +151,6 @@
x(BCH_ERR_btree_insert_fail, btree_insert_need_mark_replicas) \
x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \
x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \
- x(BCH_ERR_btree_insert_fail, btree_insert_need_flush_buffer) \
x(0, backpointer_to_overwritten_btree_node) \
x(0, lock_fail_root_changed) \
x(0, journal_reclaim_would_deadlock) \
@@ -172,10 +170,12 @@
x(EINVAL, device_size_too_small) \
x(EINVAL, device_not_a_member_of_filesystem) \
x(EINVAL, device_has_been_removed) \
+ x(EINVAL, device_splitbrain) \
x(EINVAL, device_already_online) \
x(EINVAL, insufficient_devices_to_start) \
x(EINVAL, invalid) \
x(EINVAL, internal_fsck_err) \
+ x(EINVAL, opt_parse_error) \
x(EROFS, erofs_trans_commit) \
x(EROFS, erofs_no_writes) \
x(EROFS, erofs_journal_err) \
@@ -225,6 +225,7 @@
x(BCH_ERR_operation_blocked, nocow_lock_blocked) \
x(EIO, btree_node_read_err) \
x(EIO, sb_not_downgraded) \
+ x(EIO, btree_write_all_failed) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \
@@ -236,6 +237,7 @@
x(BCH_ERR_nopromote, nopromote_unwritten) \
x(BCH_ERR_nopromote, nopromote_congested) \
x(BCH_ERR_nopromote, nopromote_in_flight) \
+ x(BCH_ERR_nopromote, nopromote_no_writes) \
x(BCH_ERR_nopromote, nopromote_enomem)
enum bch_errcode {
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index 25cf78a7b9..d32c8bebe4 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -2,12 +2,13 @@
#include "bcachefs.h"
#include "error.h"
#include "super.h"
+#include "thread_with_file.h"
#define FSCK_ERR_RATELIMIT_NR 10
bool bch2_inconsistent_error(struct bch_fs *c)
{
- set_bit(BCH_FS_ERROR, &c->flags);
+ set_bit(BCH_FS_error, &c->flags);
switch (c->opts.errors) {
case BCH_ON_ERROR_continue:
@@ -26,8 +27,8 @@ bool bch2_inconsistent_error(struct bch_fs *c)
void bch2_topology_error(struct bch_fs *c)
{
- set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags);
- if (test_bit(BCH_FS_FSCK_DONE, &c->flags))
+ set_bit(BCH_FS_topology_error, &c->flags);
+ if (!test_bit(BCH_FS_fsck_running, &c->flags))
bch2_inconsistent_error(c);
}
@@ -69,40 +70,66 @@ enum ask_yn {
YN_ALLYES,
};
+static enum ask_yn parse_yn_response(char *buf) /* map a one-character reply to an ask_yn value; -1 if unrecognized */
+{
+ buf = strim(buf); /* strip surrounding whitespace; buf now points at the trimmed text */
+
+ if (strlen(buf) == 1) /* only exact single-character replies are accepted */
+ switch (buf[0]) {
+ case 'n':
+ return YN_NO;
+ case 'y':
+ return YN_YES;
+ case 'N': /* upper case selects the "for all errors of this type" variants */
+ return YN_ALLNO;
+ case 'Y':
+ return YN_ALLYES;
+ }
+ return -1; /* NOTE(review): -1 is not an ask_yn enumerator; callers store the result in an int and test < 0 */
+}
+
#ifdef __KERNEL__
-#define bch2_fsck_ask_yn() YN_NO
+static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c) /* __KERNEL__ variant: prompt through c->stdio and parse the reply */
+{
+ struct stdio_redirect *stdio = c->stdio;
+
+ if (c->stdio_filter && c->stdio_filter != current) /* a filter is set and does not match current: do not prompt */
+ stdio = NULL;
+
+ if (!stdio) /* nowhere to ask, so answer "no" */
+ return YN_NO;
+
+ char buf[100];
+ int ret;
+
+ do {
+ bch2_print(c, " (y,n, or Y,N for all errors of this type) ");
+
+ int r = bch2_stdio_redirect_readline(stdio, buf, sizeof(buf) - 1); /* reserve one byte for the terminator written below */
+ if (r < 0) /* read failed: treat as "no" */
+ return YN_NO;
+ buf[r] = '\0';
+ } while ((ret = parse_yn_response(buf)) < 0); /* re-prompt until the reply parses */
+
+ return ret;
+}
#else
#include "tools-util.h"
-enum ask_yn bch2_fsck_ask_yn(void)
+static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c)
{
char *buf = NULL;
size_t buflen = 0;
- bool ret;
+ int ret;
- while (true) {
+ do {
fputs(" (y,n, or Y,N for all errors of this type) ", stdout);
fflush(stdout);
if (getline(&buf, &buflen, stdin) < 0)
die("error reading from standard input");
-
- strim(buf);
- if (strlen(buf) != 1)
- continue;
-
- switch (buf[0]) {
- case 'n':
- return YN_NO;
- case 'y':
- return YN_YES;
- case 'N':
- return YN_ALLNO;
- case 'Y':
- return YN_ALLYES;
- }
- }
+ } while ((ret = parse_yn_response(buf)) < 0);
free(buf);
return ret;
@@ -114,7 +141,7 @@ static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt)
{
struct fsck_err_state *s;
- if (test_bit(BCH_FS_FSCK_DONE, &c->flags))
+ if (!test_bit(BCH_FS_fsck_running, &c->flags))
return NULL;
list_for_each_entry(s, &c->fsck_error_msgs, list)
@@ -152,7 +179,8 @@ int bch2_fsck_err(struct bch_fs *c,
struct printbuf buf = PRINTBUF, *out = &buf;
int ret = -BCH_ERR_fsck_ignore;
- if (test_bit(err, c->sb.errors_silent))
+ if ((flags & FSCK_CAN_FIX) &&
+ test_bit(err, c->sb.errors_silent))
return -BCH_ERR_fsck_fix;
bch2_sb_error_count(c, err);
@@ -196,7 +224,7 @@ int bch2_fsck_err(struct bch_fs *c,
prt_printf(out, bch2_log_msg(c, ""));
#endif
- if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) {
+ if (!test_bit(BCH_FS_fsck_running, &c->flags)) {
if (c->opts.errors != BCH_ON_ERROR_continue ||
!(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) {
prt_str(out, ", shutting down");
@@ -221,10 +249,13 @@ int bch2_fsck_err(struct bch_fs *c,
int ask;
prt_str(out, ": fix?");
- bch2_print_string_as_lines(KERN_ERR, out->buf);
+ if (bch2_fs_stdio_redirect(c))
+ bch2_print(c, "%s", out->buf);
+ else
+ bch2_print_string_as_lines(KERN_ERR, out->buf);
print = false;
- ask = bch2_fsck_ask_yn();
+ ask = bch2_fsck_ask_yn(c);
if (ask >= YN_ALLNO && s)
s->fix = ask == YN_ALLNO
@@ -253,10 +284,14 @@ int bch2_fsck_err(struct bch_fs *c,
!(flags & FSCK_CAN_IGNORE)))
ret = -BCH_ERR_fsck_errors_not_fixed;
- if (print)
- bch2_print_string_as_lines(KERN_ERR, out->buf);
+ if (print) {
+ if (bch2_fs_stdio_redirect(c))
+ bch2_print(c, "%s\n", out->buf);
+ else
+ bch2_print_string_as_lines(KERN_ERR, out->buf);
+ }
- if (!test_bit(BCH_FS_FSCK_DONE, &c->flags) &&
+ if (test_bit(BCH_FS_fsck_running, &c->flags) &&
(ret != -BCH_ERR_fsck_fix &&
ret != -BCH_ERR_fsck_ignore))
bch_err(c, "Unable to continue, halting");
@@ -274,10 +309,10 @@ int bch2_fsck_err(struct bch_fs *c,
bch2_inconsistent_error(c);
if (ret == -BCH_ERR_fsck_fix) {
- set_bit(BCH_FS_ERRORS_FIXED, &c->flags);
+ set_bit(BCH_FS_errors_fixed, &c->flags);
} else {
- set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags);
- set_bit(BCH_FS_ERROR, &c->flags);
+ set_bit(BCH_FS_errors_not_fixed, &c->flags);
+ set_bit(BCH_FS_error, &c->flags);
}
return ret;
diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c
index 21af6fb8ce..b9033bb4f1 100644
--- a/fs/bcachefs/extent_update.c
+++ b/fs/bcachefs/extent_update.c
@@ -100,7 +100,7 @@ static int count_iters_for_insert(struct btree_trans *trans,
return ret2 ?: ret;
}
-#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3)
+#define EXTENT_ITERS_MAX (BTREE_ITER_INITIAL / 3)
int bch2_extent_atomic_end(struct btree_trans *trans,
struct btree_iter *iter,
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 9d8afcb597..61395b113d 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -8,6 +8,7 @@
#include "bcachefs.h"
#include "bkey_methods.h"
+#include "btree_cache.h"
#include "btree_gc.h"
#include "btree_io.h"
#include "btree_iter.h"
@@ -843,7 +844,6 @@ void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
bkey_for_each_ptr(ptrs, ptr)
if (ptr->dev == dev)
@@ -855,7 +855,6 @@ const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned
bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
bkey_for_each_ptr(ptrs, ptr)
if (bch2_dev_in_target(c, ptr->dev, target) &&
@@ -1020,12 +1019,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
struct bch_extent_crc_unpacked crc =
bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
- prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s",
+ prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress ",
crc.compressed_size,
crc.uncompressed_size,
crc.offset, crc.nonce,
- bch2_csum_types[crc.csum_type],
- bch2_compression_types[crc.compression_type]);
+ bch2_csum_types[crc.csum_type]);
+ bch2_prt_compression_type(out, crc.compression_type);
break;
}
case BCH_EXTENT_ENTRY_stripe_ptr: {
@@ -1065,7 +1064,6 @@ static int extent_ptr_invalid(struct bch_fs *c,
struct printbuf *err)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr2;
u64 bucket;
u32 bucket_offset;
struct bch_dev *ca;
@@ -1307,7 +1305,6 @@ unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k,
}
incompressible:
if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) {
- const struct bch_extent_ptr *ptr;
unsigned i = 0;
bkey_for_each_ptr(ptrs, ptr) {
@@ -1338,10 +1335,12 @@ bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k)
}
int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k,
- unsigned target, unsigned compression)
+ struct bch_io_opts *opts)
{
struct bkey_s k = bkey_i_to_s(_k);
struct bch_extent_rebalance *r;
+ unsigned target = opts->background_target;
+ unsigned compression = background_compression(*opts);
bool needs_rebalance;
if (!bkey_extent_is_direct_data(k.k))
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index a2ce8a3be1..6bf839d69e 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -300,7 +300,7 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
bkey_extent_entry_for_each_from(_p, _entry, _p.start)
#define __bkey_for_each_ptr(_start, _end, _ptr) \
- for ((_ptr) = (_start); \
+ for (typeof(_start) (_ptr) = (_start); \
((_ptr) = __bkey_ptr_next(_ptr, _end)); \
(_ptr)++)
@@ -415,8 +415,7 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
.key_invalid = bch2_btree_ptr_invalid, \
.val_to_text = bch2_btree_ptr_to_text, \
.swab = bch2_ptr_swab, \
- .trans_trigger = bch2_trans_mark_extent, \
- .atomic_trigger = bch2_mark_extent, \
+ .trigger = bch2_trigger_extent, \
})
#define bch2_bkey_ops_btree_ptr_v2 ((struct bkey_ops) { \
@@ -424,8 +423,7 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
.val_to_text = bch2_btree_ptr_v2_to_text, \
.swab = bch2_ptr_swab, \
.compat = bch2_btree_ptr_v2_compat, \
- .trans_trigger = bch2_trans_mark_extent, \
- .atomic_trigger = bch2_mark_extent, \
+ .trigger = bch2_trigger_extent, \
.min_val_size = 40, \
})
@@ -439,8 +437,7 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
.swab = bch2_ptr_swab, \
.key_normalize = bch2_extent_normalize, \
.key_merge = bch2_extent_merge, \
- .trans_trigger = bch2_trans_mark_extent, \
- .atomic_trigger = bch2_mark_extent, \
+ .trigger = bch2_trigger_extent, \
})
/* KEY_TYPE_reservation: */
@@ -454,8 +451,7 @@ bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
.key_invalid = bch2_reservation_invalid, \
.val_to_text = bch2_reservation_to_text, \
.key_merge = bch2_reservation_merge, \
- .trans_trigger = bch2_trans_mark_reservation, \
- .atomic_trigger = bch2_mark_reservation, \
+ .trigger = bch2_trigger_reservation, \
.min_val_size = 8, \
})
@@ -547,7 +543,6 @@ static inline bool bkey_extent_is_allocation(const struct bkey *k)
static inline bool bkey_extent_is_unwritten(struct bkey_s_c k)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
bkey_for_each_ptr(ptrs, ptr)
if (ptr->unwritten)
@@ -565,10 +560,9 @@ static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k)
{
struct bch_devs_list ret = (struct bch_devs_list) { 0 };
struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
bkey_for_each_ptr(p, ptr)
- ret.devs[ret.nr++] = ptr->dev;
+ ret.data[ret.nr++] = ptr->dev;
return ret;
}
@@ -577,11 +571,10 @@ static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k)
{
struct bch_devs_list ret = (struct bch_devs_list) { 0 };
struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
bkey_for_each_ptr(p, ptr)
if (!ptr->cached)
- ret.devs[ret.nr++] = ptr->dev;
+ ret.data[ret.nr++] = ptr->dev;
return ret;
}
@@ -590,11 +583,10 @@ static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
{
struct bch_devs_list ret = (struct bch_devs_list) { 0 };
struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
bkey_for_each_ptr(p, ptr)
if (ptr->cached)
- ret.devs[ret.nr++] = ptr->dev;
+ ret.data[ret.nr++] = ptr->dev;
return ret;
}
@@ -716,7 +708,7 @@ unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c,
bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c);
int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *,
- unsigned, unsigned);
+ struct bch_io_opts *);
/* Generic extent code: */
diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h
new file mode 100644
index 0000000000..3bd2fdbb08
--- /dev/null
+++ b/fs/bcachefs/extents_format.h
@@ -0,0 +1,295 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EXTENTS_FORMAT_H
+#define _BCACHEFS_EXTENTS_FORMAT_H
+
+/*
+ * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
+ * preceded by checksum/compression information (bch_extent_crc32 or
+ * bch_extent_crc64).
+ *
+ * One major determining factor in the format of extents is how we handle and
+ * represent extents that have been partially overwritten and thus trimmed:
+ *
+ * If an extent is not checksummed or compressed, when the extent is trimmed we
+ * don't have to remember the extent we originally allocated and wrote: we can
+ * merely adjust ptr->offset to point to the start of the data that is currently
+ * live. The size field in struct bkey records the current (live) size of the
+ * extent, and is also used to mean "size of region on disk that we point to" in
+ * this case.
+ *
+ * Thus an extent that is not checksummed or compressed will consist only of a
+ * list of bch_extent_ptrs, with none of the fields in
+ * bch_extent_crc32/bch_extent_crc64.
+ *
+ * When an extent is checksummed or compressed, it's not possible to read only
+ * the data that is currently live: we have to read the entire extent that was
+ * originally written, and then return only the part of the extent that is
+ * currently live.
+ *
+ * Thus, in addition to the current size of the extent in struct bkey, we need
+ * to store the size of the originally allocated space - this is the
+ * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
+ * when the extent is trimmed, instead of modifying the offset field of the
+ * pointer, we keep a second smaller offset field - "offset into the original
+ * extent of the currently live region".
+ *
+ * The other major determining factor is replication and data migration:
+ *
+ * Each pointer may have its own bch_extent_crc32/64. When doing a replicated
+ * write, we will initially write all the replicas in the same format, with the
+ * same checksum type and compression format - however, when copygc runs later (or
+ * tiering/cache promotion, anything that moves data), it is not in general
+ * going to rewrite all the pointers at once - one of the replicas may be in a
+ * bucket on one device that has very little fragmentation while another lives
+ * in a bucket that has become heavily fragmented, and thus is being rewritten
+ * sooner than the rest.
+ *
+ * Thus it will only move a subset of the pointers (or in the case of
+ * tiering/cache promotion perhaps add a single pointer without dropping any
+ * current pointers), and if the extent has been partially overwritten it must
+ * write only the currently live portion (or copygc would not be able to reduce
+ * fragmentation!) - which necessitates a different bch_extent_crc format for
+ * the new pointer.
+ *
+ * But in the interests of space efficiency, we don't want to store one
+ * bch_extent_crc for each pointer if we don't have to.
+ *
+ * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
+ * bch_extent_ptrs appended arbitrarily one after the other. We determine the
+ * type of a given entry with a scheme similar to utf8 (except we're encoding a
+ * type, not a size), encoding the type in the position of the first set bit:
+ *
+ * bch_extent_ptr - 0b1
+ * bch_extent_crc32 - 0b10
+ * bch_extent_crc64 - 0b100
+ *
+ * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and
+ * bch_extent_crc64 is the least constrained).
+ *
+ * Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
+ * until the next bch_extent_crc32/64.
+ *
+ * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
+ * is neither checksummed nor compressed.
+ */
+
+#define BCH_EXTENT_ENTRY_TYPES() \
+ x(ptr, 0) \
+ x(crc32, 1) \
+ x(crc64, 2) \
+ x(crc128, 3) \
+ x(stripe_ptr, 4) \
+ x(rebalance, 5) /* x(name, n): struct bch_extent_<name> has a `type` bitfield n + 1 bits wide */
+#define BCH_EXTENT_ENTRY_MAX 6 /* one past the highest n listed above */
+
+enum bch_extent_entry_type {
+#define x(f, n) BCH_EXTENT_ENTRY_##f = n, /* one enumerator per list entry */
+ BCH_EXTENT_ENTRY_TYPES()
+#undef x
+};
+
+/* Smallest checksum/compression entry. Compressed/uncompressed size are stored biased by 1: */
+struct bch_extent_crc32 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u32 type:2, /* bitfields sum to 2 + 7 + 7 + 7 + 1 + 4 + 4 = 32 bits */
+ _compressed_size:7,
+ _uncompressed_size:7,
+ offset:7,
+ _unused:1,
+ csum_type:4,
+ compression_type:4;
+ __u32 csum;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u32 csum; /* same members as above, declared in reverse order */
+ __u32 compression_type:4,
+ csum_type:4,
+ _unused:1,
+ offset:7,
+ _uncompressed_size:7,
+ _compressed_size:7,
+ type:2;
+#endif
+} __packed __aligned(8);
+
+#define CRC32_SIZE_MAX (1U << 7) /* 7-bit size fields, biased by 1 */
+#define CRC32_NONCE_MAX 0 /* this entry has no nonce field */
+
+struct bch_extent_crc64 { /* mid-sized checksum/compression entry: one 64-bit bitfield word plus csum_lo */
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:3, /* bitfields sum to 3 + 9 + 9 + 9 + 10 + 4 + 4 + 16 = 64 bits */
+ _compressed_size:9,
+ _uncompressed_size:9,
+ offset:9,
+ nonce:10,
+ csum_type:4,
+ compression_type:4,
+ csum_hi:16;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 csum_hi:16, /* same bitfields as above, declared in reverse order */
+ compression_type:4,
+ csum_type:4,
+ nonce:10,
+ offset:9,
+ _uncompressed_size:9,
+ _compressed_size:9,
+ type:3;
+#endif
+ __u64 csum_lo; /* together with csum_hi, 80 bits of checksum storage */
+} __packed __aligned(8);
+
+#define CRC64_SIZE_MAX (1U << 9) /* 9-bit size fields */
+#define CRC64_NONCE_MAX ((1U << 10) - 1) /* largest value of the 10-bit nonce field */
+
+struct bch_extent_crc128 { /* largest checksum/compression entry: one 64-bit bitfield word plus a struct bch_csum */
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:4, /* bitfields sum to 4 + 13 + 13 + 13 + 13 + 4 + 4 = 64 bits */
+ _compressed_size:13,
+ _uncompressed_size:13,
+ offset:13,
+ nonce:13,
+ csum_type:4,
+ compression_type:4;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 compression_type:4, /* same bitfields as above, declared in reverse order */
+ csum_type:4,
+ nonce:13,
+ offset:13,
+ _uncompressed_size:13,
+ _compressed_size:13,
+ type:4;
+#endif
+ struct bch_csum csum;
+} __packed __aligned(8);
+
+#define CRC128_SIZE_MAX (1U << 13) /* 13-bit size fields */
+#define CRC128_NONCE_MAX ((1U << 13) - 1) /* largest value of the 13-bit nonce field */
+
+/*
+ * @unwritten - pointer hasn't been written to, just reserved
+ */
+struct bch_extent_ptr {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:1, /* bitfields sum to 1 + 1 + 1 + 1 + 44 + 8 + 8 = 64 bits */
+ cached:1,
+ unused:1,
+ unwritten:1,
+ offset:44, /* 8 petabytes */
+ dev:8,
+ gen:8;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 gen:8, /* same bitfields as above, declared in reverse order */
+ dev:8,
+ offset:44,
+ unwritten:1,
+ unused:1,
+ cached:1,
+ type:1;
+#endif
+} __packed __aligned(8);
+
+struct bch_extent_stripe_ptr { /* extent entry with block, redundancy and idx fields; presumably names a stripe block - TODO confirm */
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:5, /* bitfields sum to 5 + 8 + 4 + 47 = 64 bits */
+ block:8,
+ redundancy:4,
+ idx:47;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 idx:47, /* same bitfields as above, declared in reverse order */
+ redundancy:4,
+ block:8,
+ type:5;
+#endif
+}; /* NOTE(review): unlike the crc/ptr entries, no __packed __aligned(8) here */
+
+struct bch_extent_rebalance { /* extent entry carrying a target and a compression option */
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:6, /* bitfields sum to 6 + 34 + 8 + 16 = 64 bits */
+ unused:34,
+ compression:8, /* enum bch_compression_opt */
+ target:16;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 target:16, /* same bitfields as above, declared in reverse order */
+ compression:8,
+ unused:34,
+ type:6;
+#endif
+}; /* NOTE(review): unlike the crc/ptr entries, no __packed __aligned(8) here */
+
+union bch_extent_entry { /* overlays a `type` word with one member per BCH_EXTENT_ENTRY_TYPES() entry */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64
+ unsigned long type;
+#elif __BITS_PER_LONG == 32
+ struct { /* remaining case (not little-endian, 32-bit long): pad word first, then type */
+ unsigned long pad;
+ unsigned long type;
+ };
+#else
+#error edit for your odd byteorder.
+#endif
+
+#define x(f, n) struct bch_extent_##f f; /* e.g. x(ptr, 0) becomes `struct bch_extent_ptr ptr;` */
+ BCH_EXTENT_ENTRY_TYPES()
+#undef x
+};
+
+struct bch_btree_ptr { /* value made up only of extent pointers */
+ struct bch_val v;
+
+ __u64 _data[0]; /* zero-length: occupies no space, shares its offset with start[] */
+ struct bch_extent_ptr start[]; /* flexible array of pointers */
+} __packed __aligned(8);
+
+struct bch_btree_ptr_v2 { /* like bch_btree_ptr, with extra fixed fields ahead of the pointer array */
+ struct bch_val v;
+
+ __u64 mem_ptr;
+ __le64 seq;
+ __le16 sectors_written;
+ __le16 flags; /* bit accessors are generated by LE16_BITMASK() below */
+ struct bpos min_key;
+ __u64 _data[0]; /* zero-length: occupies no space, shares its offset with start[] */
+ struct bch_extent_ptr start[]; /* flexible array of pointers */
+} __packed __aligned(8);
+
+LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1); /* presumably covers bit 0 of flags - TODO confirm the macro's range convention */
+
+struct bch_extent { /* value made up of a sequence of bch_extent_entry unions */
+ struct bch_val v;
+
+ __u64 _data[0]; /* zero-length: occupies no space, shares its offset with start[] */
+ union bch_extent_entry start[]; /* flexible array of entries */
+} __packed __aligned(8);
+
+/* Maximum size (in u64s) a single pointer could be: one bch_extent_crc128 plus one bch_extent_ptr */
+#define BKEY_EXTENT_PTR_U64s_MAX\
+ ((sizeof(struct bch_extent_crc128) + \
+ sizeof(struct bch_extent_ptr)) / sizeof(__u64))
+
+/* Maximum possible size (in u64s) of an entire extent value: */
+#define BKEY_EXTENT_VAL_U64s_MAX \
+ (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) /* NOTE(review): the reason for the leading 1 and the extra pointer slot is not stated here */
+
+/* Maximum possible size of an entire extent, key + value: */
+#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
+
+/* Btree pointers don't carry around checksums, so only the v2 header plus BCH_REPLICAS_MAX pointers is counted: */
+#define BKEY_BTREE_PTR_VAL_U64s_MAX \
+ ((sizeof(struct bch_btree_ptr_v2) + \
+ sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64))
+#define BKEY_BTREE_PTR_U64s_MAX \
+ (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) /* key + value, in u64s */
+
+struct bch_reservation { /* value format used with bch2_bkey_ops_reservation */
+ struct bch_val v;
+
+ __le32 generation;
+ __u8 nr_replicas;
+ __u8 pad[3]; /* generation + nr_replicas + pad total 8 bytes; the ops table sets min_val_size = 8 */
+} __packed __aligned(8);
+
+struct bch_inline_data { /* value consisting of a raw byte array */
+ struct bch_val v;
+ u8 data[]; /* flexible array; NOTE(review): uses u8 where the rest of this header uses __u8 */
+}; /* NOTE(review): no __packed __aligned(8), unlike the other value structs in this header */
+
+#endif /* _BCACHEFS_EXTENTS_FORMAT_H */
diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h
index 05429c9631..b04750dbf8 100644
--- a/fs/bcachefs/eytzinger.h
+++ b/fs/bcachefs/eytzinger.h
@@ -156,7 +156,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
}
#define eytzinger1_for_each(_i, _size) \
- for ((_i) = eytzinger1_first((_size)); \
+ for (unsigned (_i) = eytzinger1_first((_size)); \
(_i) != 0; \
(_i) = eytzinger1_next((_i), (_size)))
@@ -227,7 +227,7 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
}
#define eytzinger0_for_each(_i, _size) \
- for ((_i) = eytzinger0_first((_size)); \
+ for (unsigned (_i) = eytzinger0_first((_size)); \
(_i) != -1; \
(_i) = eytzinger0_next((_i), (_size)))
@@ -261,11 +261,11 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
#define eytzinger0_find(base, nr, size, _cmp, search) \
({ \
- void *_base = (base); \
- void *_search = (search); \
- size_t _nr = (nr); \
- size_t _size = (size); \
- size_t _i = 0; \
+ void *_base = (base); \
+ const void *_search = (search); \
+ size_t _nr = (nr); \
+ size_t _size = (size); \
+ size_t _i = 0; \
int _res; \
\
while (_i < _nr && \
diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c
index 4496cf91a4..1c1ea0f0c6 100644
--- a/fs/bcachefs/fs-common.c
+++ b/fs/bcachefs/fs-common.c
@@ -166,10 +166,8 @@ int bch2_create_trans(struct btree_trans *trans,
if (ret)
goto err;
- if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
- new_inode->bi_dir = dir_u->bi_inum;
- new_inode->bi_dir_offset = dir_offset;
- }
+ new_inode->bi_dir = dir_u->bi_inum;
+ new_inode->bi_dir_offset = dir_offset;
}
inode_iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
@@ -228,10 +226,8 @@ int bch2_link_trans(struct btree_trans *trans,
if (ret)
goto err;
- if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
- inode_u->bi_dir = dir.inum;
- inode_u->bi_dir_offset = dir_offset;
- }
+ inode_u->bi_dir = dir.inum;
+ inode_u->bi_dir_offset = dir_offset;
ret = bch2_inode_write(trans, &dir_iter, dir_u) ?:
bch2_inode_write(trans, &inode_iter, inode_u);
@@ -414,21 +410,19 @@ int bch2_rename_trans(struct btree_trans *trans,
goto err;
}
- if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
- src_inode_u->bi_dir = dst_dir_u->bi_inum;
- src_inode_u->bi_dir_offset = dst_offset;
+ src_inode_u->bi_dir = dst_dir_u->bi_inum;
+ src_inode_u->bi_dir_offset = dst_offset;
- if (mode == BCH_RENAME_EXCHANGE) {
- dst_inode_u->bi_dir = src_dir_u->bi_inum;
- dst_inode_u->bi_dir_offset = src_offset;
- }
+ if (mode == BCH_RENAME_EXCHANGE) {
+ dst_inode_u->bi_dir = src_dir_u->bi_inum;
+ dst_inode_u->bi_dir_offset = src_offset;
+ }
- if (mode == BCH_RENAME_OVERWRITE &&
- dst_inode_u->bi_dir == dst_dir_u->bi_inum &&
- dst_inode_u->bi_dir_offset == src_offset) {
- dst_inode_u->bi_dir = 0;
- dst_inode_u->bi_dir_offset = 0;
- }
+ if (mode == BCH_RENAME_OVERWRITE &&
+ dst_inode_u->bi_dir == dst_dir_u->bi_inum &&
+ dst_inode_u->bi_dir_offset == src_offset) {
+ dst_inode_u->bi_dir = 0;
+ dst_inode_u->bi_dir_offset = 0;
}
if (mode == BCH_RENAME_OVERWRITE) {
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index 52f0e7acda..27710cdd57 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -52,26 +52,20 @@ struct readpages_iter {
static int readpages_iter_init(struct readpages_iter *iter,
struct readahead_control *ractl)
{
- struct folio **fi;
- int ret;
-
- memset(iter, 0, sizeof(*iter));
+ struct folio *folio;
- iter->mapping = ractl->mapping;
+ *iter = (struct readpages_iter) { ractl->mapping };
- ret = bch2_filemap_get_contig_folios_d(iter->mapping,
- ractl->_index << PAGE_SHIFT,
- (ractl->_index + ractl->_nr_pages) << PAGE_SHIFT,
- 0, mapping_gfp_mask(iter->mapping),
- &iter->folios);
- if (ret)
- return ret;
+ while ((folio = __readahead_folio(ractl))) {
+ if (!bch2_folio_create(folio, GFP_KERNEL) ||
+ darray_push(&iter->folios, folio)) {
+ bch2_folio_release(folio);
+ ractl->_nr_pages += folio_nr_pages(folio);
+ ractl->_index -= folio_nr_pages(folio);
+ return iter->folios.nr ? 0 : -ENOMEM;
+ }
- darray_for_each(iter->folios, fi) {
- ractl->_nr_pages -= 1U << folio_order(*fi);
- __bch2_folio_create(*fi, __GFP_NOFAIL|GFP_KERNEL);
- folio_put(*fi);
- folio_put(*fi);
+ folio_put(folio);
}
return 0;
@@ -273,12 +267,12 @@ void bch2_readahead(struct readahead_control *ractl)
struct btree_trans *trans = bch2_trans_get(c);
struct folio *folio;
struct readpages_iter readpages_iter;
- int ret;
bch2_inode_opts_get(&opts, c, &inode->ei_inode);
- ret = readpages_iter_init(&readpages_iter, ractl);
- BUG_ON(ret);
+ int ret = readpages_iter_init(&readpages_iter, ractl);
+ if (ret)
+ return;
bch2_pagecache_add_get(inode);
@@ -309,18 +303,6 @@ void bch2_readahead(struct readahead_control *ractl)
darray_exit(&readpages_iter.folios);
}
-static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio,
- subvol_inum inum, struct folio *folio)
-{
- bch2_folio_create(folio, __GFP_NOFAIL);
-
- rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
- rbio->bio.bi_iter.bi_sector = folio_sector(folio);
- BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
-
- bch2_trans_run(c, (bchfs_read(trans, rbio, inum, NULL), 0));
-}
-
static void bch2_read_single_folio_end_io(struct bio *bio)
{
complete(bio->bi_private);
@@ -335,6 +317,9 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
int ret;
DECLARE_COMPLETION_ONSTACK(done);
+ if (!bch2_folio_create(folio, GFP_KERNEL))
+ return -ENOMEM;
+
bch2_inode_opts_get(&opts, c, &inode->ei_inode);
rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read),
@@ -342,7 +327,11 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
rbio->bio.bi_private = &done;
rbio->bio.bi_end_io = bch2_read_single_folio_end_io;
- __bchfs_readfolio(c, rbio, inode_inum(inode), folio);
+ rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
+ rbio->bio.bi_iter.bi_sector = folio_sector(folio);
+ BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
+
+ bch2_trans_run(c, (bchfs_read(trans, rbio, inode_inum(inode), NULL), 0));
wait_for_completion(&done);
ret = blk_status_to_errno(rbio->bio.bi_status);
@@ -638,7 +627,7 @@ do_io:
/* Check for writing past i_size: */
WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) >
round_up(i_size, block_bytes(c)) &&
- !test_bit(BCH_FS_EMERGENCY_RO, &c->flags),
+ !test_bit(BCH_FS_emergency_ro, &c->flags),
"writing past i_size: %llu > %llu (unrounded %llu)\n",
bio_end_sector(&w->io->op.wbio.bio) << 9,
round_up(i_size, block_bytes(c)),
@@ -826,7 +815,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch2_folio_reservation res;
folios fs;
- struct folio **fi, *f;
+ struct folio *f;
unsigned copied = 0, f_offset, f_copied;
u64 end = pos + len, f_pos, f_len;
loff_t last_folio_pos = inode->v.i_size;
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
index 84e20c3ada..33cb6da3a5 100644
--- a/fs/bcachefs/fs-io-direct.c
+++ b/fs/bcachefs/fs-io-direct.c
@@ -77,7 +77,8 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
bch2_inode_opts_get(&opts, c, &inode->ei_inode);
- if ((offset|iter->count) & (block_bytes(c) - 1))
+ /* bios must be 512 byte aligned: */
+ if ((offset|iter->count) & (SECTOR_SIZE - 1))
return -EINVAL;
ret = min_t(loff_t, iter->count,
@@ -87,6 +88,8 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
return ret;
shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c));
+ if (shorten >= iter->count)
+ shorten = 0;
iter->count -= shorten;
bio = bio_alloc_bioset(NULL,
diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c
index ff664fd0d8..d359aa9b33 100644
--- a/fs/bcachefs/fs-io-pagecache.c
+++ b/fs/bcachefs/fs-io-pagecache.c
@@ -309,39 +309,49 @@ void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode,
}
}
-void bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
- u64 start, u64 end)
+int bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
+ u64 *start, u64 end,
+ bool nonblocking)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- pgoff_t index = start >> PAGE_SECTORS_SHIFT;
+ pgoff_t index = *start >> PAGE_SECTORS_SHIFT;
pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
struct folio_batch fbatch;
s64 i_sectors_delta = 0;
- unsigned i, j;
+ int ret = 0;
- if (end <= start)
- return;
+ if (end <= *start)
+ return 0;
folio_batch_init(&fbatch);
while (filemap_get_folios(inode->v.i_mapping,
&index, end_index, &fbatch)) {
- for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ for (unsigned i = 0; i < folio_batch_count(&fbatch); i++) {
struct folio *folio = fbatch.folios[i];
+
+ if (!nonblocking)
+ folio_lock(folio);
+ else if (!folio_trylock(folio)) {
+ folio_batch_release(&fbatch);
+ ret = -EAGAIN;
+ break;
+ }
+
u64 folio_start = folio_sector(folio);
u64 folio_end = folio_end_sector(folio);
- unsigned folio_offset = max(start, folio_start) - folio_start;
- unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
- struct bch_folio *s;
BUG_ON(end <= folio_start);
- folio_lock(folio);
- s = bch2_folio(folio);
+ *start = min(end, folio_end);
+ struct bch_folio *s = bch2_folio(folio);
if (s) {
+ unsigned folio_offset = max(*start, folio_start) - folio_start;
+ unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
+
spin_lock(&s->lock);
- for (j = folio_offset; j < folio_offset + folio_len; j++) {
+ for (unsigned j = folio_offset; j < folio_offset + folio_len; j++) {
i_sectors_delta -= s->s[j].state == SECTOR_dirty;
bch2_folio_sector_set(folio, s, j,
folio_sector_reserve(s->s[j].state));
@@ -356,6 +366,7 @@ void bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
}
bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
+ return ret;
}
static inline unsigned sectors_to_reserve(struct bch_folio_sector *s,
diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h
index 27f712ae37..8cbaba6565 100644
--- a/fs/bcachefs/fs-io-pagecache.h
+++ b/fs/bcachefs/fs-io-pagecache.h
@@ -143,7 +143,7 @@ int bch2_folio_set(struct bch_fs *, subvol_inum, struct folio **, unsigned);
void bch2_bio_page_state_set(struct bio *, struct bkey_s_c);
void bch2_mark_pagecache_unallocated(struct bch_inode_info *, u64, u64);
-void bch2_mark_pagecache_reserved(struct bch_inode_info *, u64, u64);
+int bch2_mark_pagecache_reserved(struct bch_inode_info *, u64 *, u64, bool);
int bch2_get_folio_disk_reservation(struct bch_fs *,
struct bch_inode_info *,
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index fffa1743df..8c70123b6a 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -192,13 +192,17 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- int ret, ret2, ret3;
+ int ret;
ret = file_write_and_wait_range(file, start, end);
- ret2 = sync_inode_metadata(&inode->v, 1);
- ret3 = bch2_flush_inode(c, inode);
-
- return bch2_err_class(ret ?: ret2 ?: ret3);
+ if (ret)
+ goto out;
+ ret = sync_inode_metadata(&inode->v, 1);
+ if (ret)
+ goto out;
+ ret = bch2_flush_inode(c, inode);
+out:
+ return bch2_err_class(ret);
}
/* truncate: */
@@ -671,8 +675,11 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
- drop_locks_do(trans,
- (bch2_mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0));
+ if (bch2_mark_pagecache_reserved(inode, &hole_start,
+ iter.pos.offset, true))
+ drop_locks_do(trans,
+ bch2_mark_pagecache_reserved(inode, &hole_start,
+ iter.pos.offset, false));
bkey_err:
bch2_quota_reservation_put(c, inode, &quota_res);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -861,7 +868,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
abs(pos_src - pos_dst) < len)
return -EINVAL;
- bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
+ lock_two_nondirectories(&src->v, &dst->v);
+ bch2_lock_inodes(INODE_PAGECACHE_BLOCK, src, dst);
inode_dio_wait(&src->v);
inode_dio_wait(&dst->v);
@@ -914,7 +922,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
ret = bch2_flush_inode(c, dst);
err:
bch2_quota_reservation_put(c, dst, &quota_res);
- bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
+ bch2_unlock_inodes(INODE_PAGECACHE_BLOCK, src, dst);
+ unlock_two_nondirectories(&src->v, &dst->v);
return bch2_err_class(ret);
}
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index dbc87747ea..3dc8630ff9 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -285,34 +285,26 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg)
bch_notice(c, "shutdown by ioctl type %u", flags);
- down_write(&c->vfs_sb->s_umount);
-
switch (flags) {
case FSOP_GOING_FLAGS_DEFAULT:
- ret = freeze_bdev(c->vfs_sb->s_bdev);
+ ret = bdev_freeze(c->vfs_sb->s_bdev);
if (ret)
- goto err;
-
+ break;
bch2_journal_flush(&c->journal);
- c->vfs_sb->s_flags |= SB_RDONLY;
bch2_fs_emergency_read_only(c);
- thaw_bdev(c->vfs_sb->s_bdev);
+ bdev_thaw(c->vfs_sb->s_bdev);
break;
-
case FSOP_GOING_FLAGS_LOGFLUSH:
bch2_journal_flush(&c->journal);
fallthrough;
-
case FSOP_GOING_FLAGS_NOLOGFLUSH:
- c->vfs_sb->s_flags |= SB_RDONLY;
bch2_fs_emergency_read_only(c);
break;
default:
ret = -EINVAL;
break;
}
-err:
- up_write(&c->vfs_sb->s_umount);
+
return ret;
}
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 49da8db1d9..77ae65542d 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -93,7 +93,7 @@ retry:
BTREE_ITER_INTENT) ?:
(set ? set(trans, inode, &inode_u, p) : 0) ?:
bch2_inode_write(trans, &iter, &inode_u) ?:
- bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
/*
* the btree node lock protects inode->ei_inode, not ei_update_lock;
@@ -435,7 +435,7 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
bch2_subvol_is_ro(c, inode->ei_subvol) ?:
__bch2_link(c, inode, dir, dentry);
if (unlikely(ret))
- return ret;
+ return bch2_err_class(ret);
ihold(&inode->v);
d_instantiate(dentry, &inode->v);
@@ -455,7 +455,7 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL,
+ BCH_TRANS_COMMIT_no_enospc,
bch2_unlink_trans(trans,
inode_inum(dir), &dir_u,
&inode_u, &dentry->d_name,
@@ -487,8 +487,9 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
struct bch_inode_info *dir= to_bch_ei(vdir);
struct bch_fs *c = dir->v.i_sb->s_fs_info;
- return bch2_subvol_is_ro(c, dir->ei_subvol) ?:
+ int ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
__bch2_unlink(vdir, dentry, false);
+ return bch2_err_class(ret);
}
static int bch2_symlink(struct mnt_idmap *idmap,
@@ -523,7 +524,7 @@ static int bch2_symlink(struct mnt_idmap *idmap,
return 0;
err:
iput(&inode->v);
- return ret;
+ return bch2_err_class(ret);
}
static int bch2_mkdir(struct mnt_idmap *idmap,
@@ -641,7 +642,7 @@ err:
src_inode,
dst_inode);
- return ret;
+ return bch2_err_class(ret);
}
static void bch2_setattr_copy(struct mnt_idmap *idmap,
@@ -729,7 +730,7 @@ retry:
ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
btree_err:
bch2_trans_iter_exit(trans, &inode_iter);
@@ -1012,15 +1013,13 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
{
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- int ret;
if (!dir_emit_dots(file, ctx))
return 0;
- ret = bch2_readdir(c, inode_inum(inode), ctx);
- if (ret)
- bch_err_fn(c, ret);
+ int ret = bch2_readdir(c, inode_inum(inode), ctx);
+ bch_err_fn(c, ret);
return bch2_err_class(ret);
}
@@ -1131,7 +1130,7 @@ static const struct address_space_operations bch_address_space_operations = {
#ifdef CONFIG_MIGRATION
.migrate_folio = filemap_migrate_folio,
#endif
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
};
struct bcachefs_fid {
@@ -1500,7 +1499,7 @@ static void bch2_evict_inode(struct inode *vinode)
void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s)
{
- struct bch_inode_info *inode, **i;
+ struct bch_inode_info *inode;
DARRAY(struct bch_inode_info *) grabbed;
bool clean_pass = false, this_pass_clean;
@@ -1626,43 +1625,18 @@ static struct bch_fs *bch2_path_to_fs(const char *path)
return c ?: ERR_PTR(-ENOENT);
}
-static char **split_devs(const char *_dev_name, unsigned *nr)
-{
- char *dev_name = NULL, **devs = NULL, *s;
- size_t i = 0, nr_devs = 0;
-
- dev_name = kstrdup(_dev_name, GFP_KERNEL);
- if (!dev_name)
- return NULL;
-
- for (s = dev_name; s; s = strchr(s + 1, ':'))
- nr_devs++;
-
- devs = kcalloc(nr_devs + 1, sizeof(const char *), GFP_KERNEL);
- if (!devs) {
- kfree(dev_name);
- return NULL;
- }
-
- while ((s = strsep(&dev_name, ":")))
- devs[i++] = s;
-
- *nr = nr_devs;
- return devs;
-}
-
static int bch2_remount(struct super_block *sb, int *flags, char *data)
{
struct bch_fs *c = sb->s_fs_info;
struct bch_opts opts = bch2_opts_empty();
int ret;
- opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
-
ret = bch2_parse_mount_opts(c, &opts, data);
if (ret)
goto err;
+ opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
+
if (opts.read_only != c->opts.read_only) {
down_write(&c->state_lock);
@@ -1696,11 +1670,9 @@ err:
static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
{
struct bch_fs *c = root->d_sb->s_fs_info;
- struct bch_dev *ca;
- unsigned i;
bool first = true;
- for_each_online_member(ca, c, i) {
+ for_each_online_member(c, ca) {
if (!first)
seq_putc(seq, ':');
first = false;
@@ -1770,7 +1742,7 @@ static int bch2_unfreeze(struct super_block *sb)
struct bch_fs *c = sb->s_fs_info;
int ret;
- if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
+ if (test_bit(BCH_FS_emergency_ro, &c->flags))
return 0;
down_write(&c->state_lock);
@@ -1805,17 +1777,18 @@ static int bch2_noset_super(struct super_block *s, void *data)
return -EBUSY;
}
+typedef DARRAY(struct bch_fs *) darray_fs;
+
static int bch2_test_super(struct super_block *s, void *data)
{
struct bch_fs *c = s->s_fs_info;
- struct bch_fs **devs = data;
- unsigned i;
+ darray_fs *d = data;
if (!c)
return false;
- for (i = 0; devs[i]; i++)
- if (c != devs[i])
+ darray_for_each(*d, i)
+ if (c != *i)
return false;
return true;
}
@@ -1824,13 +1797,9 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
struct bch_fs *c;
- struct bch_dev *ca;
struct super_block *sb;
struct inode *vinode;
struct bch_opts opts = bch2_opts_empty();
- char **devs;
- struct bch_fs **devs_to_fs = NULL;
- unsigned i, nr_devs;
int ret;
opt_set(opts, read_only, (flags & SB_RDONLY) != 0);
@@ -1842,25 +1811,25 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
if (!dev_name || strlen(dev_name) == 0)
return ERR_PTR(-EINVAL);
- devs = split_devs(dev_name, &nr_devs);
- if (!devs)
- return ERR_PTR(-ENOMEM);
+ darray_str devs;
+ ret = bch2_split_devs(dev_name, &devs);
+ if (ret)
+ return ERR_PTR(ret);
- devs_to_fs = kcalloc(nr_devs + 1, sizeof(void *), GFP_KERNEL);
- if (!devs_to_fs) {
- sb = ERR_PTR(-ENOMEM);
- goto got_sb;
+ darray_fs devs_to_fs = {};
+ darray_for_each(devs, i) {
+ ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i));
+ if (ret) {
+ sb = ERR_PTR(ret);
+ goto got_sb;
+ }
}
- for (i = 0; i < nr_devs; i++)
- devs_to_fs[i] = bch2_path_to_fs(devs[i]);
-
- sb = sget(fs_type, bch2_test_super, bch2_noset_super,
- flags|SB_NOSEC, devs_to_fs);
+ sb = sget(fs_type, bch2_test_super, bch2_noset_super, flags|SB_NOSEC, &devs_to_fs);
if (!IS_ERR(sb))
goto got_sb;
- c = bch2_fs_open(devs, nr_devs, opts);
+ c = bch2_fs_open(devs.data, devs.nr, opts);
if (IS_ERR(c)) {
sb = ERR_CAST(c);
goto got_sb;
@@ -1880,9 +1849,8 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
if (IS_ERR(sb))
bch2_fs_stop(c);
got_sb:
- kfree(devs_to_fs);
- kfree(devs[0]);
- kfree(devs);
+ darray_exit(&devs_to_fs);
+ bch2_darray_str_exit(&devs);
if (IS_ERR(sb)) {
ret = PTR_ERR(sb);
@@ -1923,7 +1891,7 @@ got_sb:
sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
- for_each_online_member(ca, c, i) {
+ for_each_online_member(c, ca) {
struct block_device *bdev = ca->disk_sb.bdev;
/* XXX: create an anonymous device for multi device filesystems */
@@ -1944,10 +1912,9 @@ got_sb:
vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
ret = PTR_ERR_OR_ZERO(vinode);
- if (ret) {
- bch_err_msg(c, ret, "mounting: error getting root inode");
+ bch_err_msg(c, ret, "mounting: error getting root inode");
+ if (ret)
goto err_put_super;
- }
sb->s_root = d_make_root(vinode);
if (!sb->s_root) {
diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h
index 5edf1d4b9e..c3af7225ff 100644
--- a/fs/bcachefs/fs.h
+++ b/fs/bcachefs/fs.h
@@ -77,9 +77,8 @@ static inline int ptrcmp(void *l, void *r)
}
enum bch_inode_lock_op {
- INODE_LOCK = (1U << 0),
- INODE_PAGECACHE_BLOCK = (1U << 1),
- INODE_UPDATE_LOCK = (1U << 2),
+ INODE_PAGECACHE_BLOCK = (1U << 0),
+ INODE_UPDATE_LOCK = (1U << 1),
};
#define bch2_lock_inodes(_locks, ...) \
@@ -91,8 +90,6 @@ do { \
\
for (i = 1; i < ARRAY_SIZE(a); i++) \
if (a[i] != a[i - 1]) { \
- if ((_locks) & INODE_LOCK) \
- down_write_nested(&a[i]->v.i_rwsem, i); \
if ((_locks) & INODE_PAGECACHE_BLOCK) \
bch2_pagecache_block_get(a[i]);\
if ((_locks) & INODE_UPDATE_LOCK) \
@@ -109,8 +106,6 @@ do { \
\
for (i = 1; i < ARRAY_SIZE(a); i++) \
if (a[i] != a[i - 1]) { \
- if ((_locks) & INODE_LOCK) \
- up_write(&a[i]->v.i_rwsem); \
if ((_locks) & INODE_PAGECACHE_BLOCK) \
bch2_pagecache_block_put(a[i]);\
if ((_locks) & INODE_UPDATE_LOCK) \
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index e0c5cd119a..6a760777ba 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -20,8 +20,6 @@
#include <linux/bsearch.h>
#include <linux/dcache.h> /* struct qstr */
-#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
-
/*
* XXX: this is handling transaction restarts without returning
* -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
@@ -29,19 +27,16 @@
static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
u32 snapshot)
{
- struct btree_iter iter;
- struct bkey_s_c k;
u64 sectors = 0;
- int ret;
- for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
+ int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
SPOS(inum, 0, snapshot),
POS(inum, U64_MAX),
- 0, k, ret)
+ 0, k, ({
if (bkey_extent_is_allocation(k.k))
sectors += k.k->size;
-
- bch2_trans_iter_exit(trans, &iter);
+ 0;
+ }));
return ret ?: sectors;
}
@@ -49,45 +44,23 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
u32 snapshot)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_s_c_dirent d;
u64 subdirs = 0;
- int ret;
-
- for_each_btree_key_upto(trans, iter, BTREE_ID_dirents,
- SPOS(inum, 0, snapshot),
- POS(inum, U64_MAX),
- 0, k, ret) {
- if (k.k->type != KEY_TYPE_dirent)
- continue;
- d = bkey_s_c_to_dirent(k);
- if (d.v->d_type == DT_DIR)
+ int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_dirents,
+ SPOS(inum, 0, snapshot),
+ POS(inum, U64_MAX),
+ 0, k, ({
+ if (k.k->type == KEY_TYPE_dirent &&
+ bkey_s_c_to_dirent(k).v->d_type == DT_DIR)
subdirs++;
- }
- bch2_trans_iter_exit(trans, &iter);
+ 0;
+ }));
return ret ?: subdirs;
}
-static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot,
- u32 *subvol)
-{
- struct bch_snapshot s;
- int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots,
- POS(0, snapshot), 0,
- snapshot, &s);
- if (!ret)
- *subvol = le32_to_cpu(s.subvol);
- else if (bch2_err_matches(ret, ENOENT))
- bch_err(trans->c, "snapshot %u not found", snapshot);
- return ret;
-
-}
-
-static int __subvol_lookup(struct btree_trans *trans, u32 subvol,
- u32 *snapshot, u64 *inum)
+static int subvol_lookup(struct btree_trans *trans, u32 subvol,
+ u32 *snapshot, u64 *inum)
{
struct bch_subvolume s;
int ret;
@@ -99,12 +72,6 @@ static int __subvol_lookup(struct btree_trans *trans, u32 subvol,
return ret;
}
-static int subvol_lookup(struct btree_trans *trans, u32 subvol,
- u32 *snapshot, u64 *inum)
-{
- return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, inum));
-}
-
static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
struct bch_inode_unpacked *inode)
{
@@ -132,7 +99,7 @@ err:
return ret;
}
-static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
+static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
struct bch_inode_unpacked *inode,
u32 *snapshot)
{
@@ -152,29 +119,19 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
if (!ret)
*snapshot = iter.pos.snapshot;
err:
- bch_err_msg(trans->c, ret, "fetching inode %llu:%u", inode_nr, *snapshot);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
-static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
- struct bch_inode_unpacked *inode,
- u32 *snapshot)
-{
- return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot));
-}
-
-static int __lookup_dirent(struct btree_trans *trans,
+static int lookup_dirent_in_snapshot(struct btree_trans *trans,
struct bch_hash_info hash_info,
subvol_inum dir, struct qstr *name,
- u64 *target, unsigned *type)
+ u64 *target, unsigned *type, u32 snapshot)
{
struct btree_iter iter;
struct bkey_s_c_dirent d;
- int ret;
-
- ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc,
- &hash_info, dir, name, 0);
+ int ret = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
+ &hash_info, dir, name, 0, snapshot);
if (ret)
return ret;
@@ -207,12 +164,9 @@ static int fsck_write_inode(struct btree_trans *trans,
struct bch_inode_unpacked *inode,
u32 snapshot)
{
- int ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
- __write_inode(trans, inode, snapshot));
- if (ret)
- bch_err_fn(trans->c, ret);
+ int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ __write_inode(trans, inode, snapshot));
+ bch_err_fn(trans->c, ret);
return ret;
}
@@ -242,35 +196,44 @@ err:
}
/* Get lost+found, create if it doesn't exist: */
-static int lookup_lostfound(struct btree_trans *trans, u32 subvol,
+static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
struct bch_inode_unpacked *lostfound)
{
struct bch_fs *c = trans->c;
- struct bch_inode_unpacked root;
- struct bch_hash_info root_hash_info;
struct qstr lostfound_str = QSTR("lost+found");
- subvol_inum root_inum = { .subvol = subvol };
u64 inum = 0;
unsigned d_type = 0;
- u32 snapshot;
int ret;
- ret = __subvol_lookup(trans, subvol, &snapshot, &root_inum.inum);
+ struct bch_snapshot_tree st;
+ ret = bch2_snapshot_tree_lookup(trans,
+ bch2_snapshot_tree(c, snapshot), &st);
+ if (ret)
+ return ret;
+
+ subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) };
+ u32 subvol_snapshot;
+
+ ret = subvol_lookup(trans, le32_to_cpu(st.master_subvol),
+ &subvol_snapshot, &root_inum.inum);
+ bch_err_msg(c, ret, "looking up root subvol");
if (ret)
return ret;
- ret = __lookup_inode(trans, root_inum.inum, &root, &snapshot);
+ struct bch_inode_unpacked root_inode;
+ struct bch_hash_info root_hash_info;
+ u32 root_inode_snapshot = snapshot;
+ ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot);
+ bch_err_msg(c, ret, "looking up root inode");
if (ret)
return ret;
- root_hash_info = bch2_hash_info_init(c, &root);
+ root_hash_info = bch2_hash_info_init(c, &root_inode);
- ret = __lookup_dirent(trans, root_hash_info, root_inum,
- &lostfound_str, &inum, &d_type);
- if (bch2_err_matches(ret, ENOENT)) {
- bch_notice(c, "creating lost+found");
+ ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
+ &lostfound_str, &inum, &d_type, snapshot);
+ if (bch2_err_matches(ret, ENOENT))
goto create_lostfound;
- }
bch_err_fn(c, ret);
if (ret)
@@ -285,20 +248,53 @@ static int lookup_lostfound(struct btree_trans *trans, u32 subvol,
* The bch2_check_dirents pass has already run, dangling dirents
* shouldn't exist here:
*/
- return __lookup_inode(trans, inum, lostfound, &snapshot);
+ ret = lookup_inode(trans, inum, lostfound, &snapshot);
+ bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
+ inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
+ return ret;
create_lostfound:
+ /*
+ * XXX: we could have a nicer log message here if we had a nice way to
+ * walk backpointers to print a path
+ */
+ bch_notice(c, "creating lost+found in snapshot %u", le32_to_cpu(st.root_snapshot));
+
+ u64 now = bch2_current_time(c);
+ struct btree_iter lostfound_iter = { NULL };
+ u64 cpu = raw_smp_processor_id();
+
bch2_inode_init_early(c, lostfound);
+ bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
+ lostfound->bi_dir = root_inode.bi_inum;
+
+ root_inode.bi_nlink++;
+
+ ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
+ if (ret)
+ goto err;
- ret = bch2_create_trans(trans, root_inum, &root,
- lostfound, &lostfound_str,
- 0, 0, S_IFDIR|0700, 0, NULL, NULL,
- (subvol_inum) { }, 0);
+ bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot);
+ ret = bch2_btree_iter_traverse(&lostfound_iter);
+ if (ret)
+ goto err;
+
+ ret = bch2_dirent_create_snapshot(trans,
+ root_inode.bi_inum, snapshot, &root_hash_info,
+ mode_to_type(lostfound->bi_mode),
+ &lostfound_str,
+ lostfound->bi_inum,
+ &lostfound->bi_dir_offset,
+ BCH_HASH_SET_MUST_CREATE) ?:
+ bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+err:
bch_err_msg(c, ret, "creating lost+found");
+ bch2_trans_iter_exit(trans, &lostfound_iter);
return ret;
}
-static int __reattach_inode(struct btree_trans *trans,
+static int reattach_inode(struct btree_trans *trans,
struct bch_inode_unpacked *inode,
u32 inode_snapshot)
{
@@ -307,14 +303,9 @@ static int __reattach_inode(struct btree_trans *trans,
char name_buf[20];
struct qstr name;
u64 dir_offset = 0;
- u32 subvol;
int ret;
- ret = __snapshot_lookup_subvol(trans, inode_snapshot, &subvol);
- if (ret)
- return ret;
-
- ret = lookup_lostfound(trans, subvol, &lostfound);
+ ret = lookup_lostfound(trans, inode_snapshot, &lostfound);
if (ret)
return ret;
@@ -331,15 +322,12 @@ static int __reattach_inode(struct btree_trans *trans,
snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
name = (struct qstr) QSTR(name_buf);
- ret = bch2_dirent_create(trans,
- (subvol_inum) {
- .subvol = subvol,
- .inum = lostfound.bi_inum,
- },
- &dir_hash,
- inode_d_type(inode),
- &name, inode->bi_inum, &dir_offset,
- BCH_HASH_SET_MUST_CREATE);
+ ret = bch2_dirent_create_snapshot(trans,
+ lostfound.bi_inum, inode_snapshot,
+ &dir_hash,
+ inode_d_type(inode),
+ &name, inode->bi_inum, &dir_offset,
+ BCH_HASH_SET_MUST_CREATE);
if (ret)
return ret;
@@ -349,18 +337,6 @@ static int __reattach_inode(struct btree_trans *trans,
return __write_inode(trans, inode, inode_snapshot);
}
-static int reattach_inode(struct btree_trans *trans,
- struct bch_inode_unpacked *inode,
- u32 inode_snapshot)
-{
- int ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL,
- __reattach_inode(trans, inode, inode_snapshot));
- bch_err_msg(trans->c, ret, "reattaching inode %llu", inode->bi_inum);
- return ret;
-}
-
static int remove_backpointer(struct btree_trans *trans,
struct bch_inode_unpacked *inode)
{
@@ -405,7 +381,7 @@ static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s
};
int ret = 0;
- darray_for_each(s->ids, i) {
+ __darray_for_each(s->ids, i) {
if (i->id == id)
return 0;
if (i->id > id)
@@ -422,7 +398,7 @@ static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s
static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
enum btree_id btree_id, struct bpos pos)
{
- struct snapshots_seen_entry *i, n = {
+ struct snapshots_seen_entry n = {
.id = pos.snapshot,
.equiv = bch2_snapshot_equiv(c, pos.snapshot),
};
@@ -448,7 +424,7 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
bch2_btree_id_str(btree_id),
pos.inode, pos.offset,
i->id, n.id, n.equiv);
- set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags);
+ set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots);
}
}
@@ -593,14 +569,13 @@ static int get_inodes_all_snapshots(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
- u32 restart_count = trans->restart_count;
int ret;
w->recalculate_sums = false;
w->inodes.nr = 0;
- for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum),
- BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
if (k.k->p.offset != inum)
break;
@@ -613,8 +588,7 @@ static int get_inodes_all_snapshots(struct btree_trans *trans,
return ret;
w->first_this_inode = true;
-
- return trans_was_restarted(trans, restart_count);
+ return 0;
}
static struct inode_walker_entry *
@@ -625,7 +599,7 @@ lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w,
snapshot = bch2_snapshot_equiv(c, snapshot);
- darray_for_each(w->inodes, i)
+ __darray_for_each(w->inodes, i)
if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot))
goto found;
@@ -667,11 +641,8 @@ static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
if (ret)
return ERR_PTR(ret);
} else if (bkey_cmp(w->last_pos, pos)) {
- struct inode_walker_entry *i;
-
darray_for_each(w->inodes, i)
i->seen_this_pos = false;
-
}
w->last_pos = pos;
@@ -756,9 +727,7 @@ static int hash_redo_key(struct btree_trans *trans,
k.k->p.snapshot, tmp,
BCH_HASH_SET_MUST_CREATE,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW);
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}
static int hash_check_key(struct btree_trans *trans,
@@ -826,6 +795,18 @@ fsck_err:
goto out;
}
+static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ bch2_trans_iter_exit(trans, &iter);
+ return k.k->type == KEY_TYPE_set;
+}
+
static int check_inode(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
@@ -867,7 +848,7 @@ static int check_inode(struct btree_trans *trans,
c, inode_snapshot_mismatch,
"inodes in different snapshots don't match")) {
bch_err(c, "repair not implemented yet");
- return -EINVAL;
+ return -BCH_ERR_fsck_repair_unimplemented;
}
if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) &&
@@ -890,14 +871,22 @@ static int check_inode(struct btree_trans *trans,
return 0;
}
+	if (u.bi_flags & BCH_INODE_unlinked) {
+		/* 1: on the deleted list, 0: missing from it, < 0: lookup error */
+		ret = check_inode_deleted_list(trans, k.k->p);
+		if (ret < 0)
+			return ret;
+
+		/* An unlinked inode is only inconsistent when it is NOT on the list: */
+		fsck_err_on(!ret, c, unlinked_inode_not_on_deleted_list,
+			    "inode %llu:%u unlinked, but not on deleted list",
+			    u.bi_inum, k.k->p.snapshot);
+		ret = 0;
+	}
+
if (u.bi_flags & BCH_INODE_unlinked &&
(!c->sb.clean ||
fsck_err(c, inode_unlinked_but_clean,
"filesystem marked clean, but inode %llu unlinked",
u.bi_inum))) {
- bch2_trans_unlock(trans);
- bch2_fs_lazy_rw(c);
-
ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
bch_err_msg(c, ret, "in fsck deleting inode");
return ret;
@@ -910,9 +899,6 @@ static int check_inode(struct btree_trans *trans,
u.bi_inum))) {
bch_verbose(c, "truncating inode %llu", u.bi_inum);
- bch2_trans_unlock(trans);
- bch2_fs_lazy_rw(c);
-
/*
* XXX: need to truncate partial blocks too here - or ideally
* just switch units to bytes and that issue goes away
@@ -976,27 +962,22 @@ fsck_err:
return ret;
}
-noinline_for_stack
int bch2_check_inodes(struct bch_fs *c)
{
bool full = c->opts.fsck;
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
struct bch_inode_unpacked prev = { 0 };
struct snapshots_seen s;
- struct bkey_s_c k;
- int ret;
snapshots_seen_init(&s);
- ret = for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
- POS_MIN,
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
- check_inode(trans, &iter, k, &prev, &s, full));
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
+ POS_MIN,
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_inode(trans, &iter, k, &prev, &s, full)));
snapshots_seen_exit(&s);
- bch2_trans_put(trans);
bch_err_fn(c, ret);
return ret;
}
@@ -1023,29 +1004,9 @@ static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
: le64_to_cpu(d.v->d_inum) == inode->bi_inum;
}
-static int inode_backpointer_exists(struct btree_trans *trans,
- struct bch_inode_unpacked *inode,
- u32 snapshot)
-{
- struct btree_iter iter;
- struct bkey_s_c_dirent d;
- int ret;
-
- d = dirent_get_by_pos(trans, &iter,
- SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot));
- ret = bkey_err(d);
- if (ret)
- return bch2_err_matches(ret, ENOENT) ? 0 : ret;
-
- ret = dirent_points_to_inode(d, inode);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
{
struct bch_fs *c = trans->c;
- struct inode_walker_entry *i;
u32 restart_count = trans->restart_count;
int ret = 0;
s64 count2;
@@ -1094,11 +1055,8 @@ struct extent_ends {
static void extent_ends_reset(struct extent_ends *extent_ends)
{
- struct extent_end *i;
-
darray_for_each(extent_ends->e, i)
snapshots_seen_exit(&i->seen);
-
extent_ends->e.nr = 0;
}
@@ -1130,7 +1088,7 @@ static int extent_ends_at(struct bch_fs *c,
if (!n.seen.ids.data)
return -BCH_ERR_ENOMEM_fsck_extent_ends_at;
- darray_for_each(extent_ends->e, i) {
+ __darray_for_each(extent_ends->e, i) {
if (i->snapshot == k.k->p.snapshot) {
snapshots_seen_exit(&i->seen);
*i = n;
@@ -1220,13 +1178,12 @@ static int overlapping_extents_found(struct btree_trans *trans,
swap(k1, k2);
}
- trans->extra_journal_res += bch2_bkey_sectors_compressed(k2);
+ trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);
ret = bch2_trans_update_extent_overwrite(trans, old_iter,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
k1, k2) ?:
- bch2_trans_commit(trans, &res, NULL,
- BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL);
+ bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
bch2_disk_reservation_put(c, &res);
if (ret)
@@ -1270,7 +1227,6 @@ static int check_overlapping_extents(struct btree_trans *trans,
bool *fixed)
{
struct bch_fs *c = trans->c;
- struct extent_end *i;
int ret = 0;
/* transaction restart, running again */
@@ -1451,32 +1407,28 @@ int bch2_check_extents(struct bch_fs *c)
{
struct inode_walker w = inode_walker_init();
struct snapshots_seen s;
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
struct extent_ends extent_ends;
struct disk_reservation res = { 0 };
- int ret = 0;
snapshots_seen_init(&s);
extent_ends_init(&extent_ends);
- ret = for_each_btree_key_commit(trans, iter, BTREE_ID_extents,
- POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
- &res, NULL,
- BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({
- bch2_disk_reservation_put(c, &res);
- check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
- check_extent_overbig(trans, &iter, k);
- })) ?:
- check_i_sectors(trans, &w);
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_extents,
+ POS(BCACHEFS_ROOT_INO, 0),
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ &res, NULL,
+ BCH_TRANS_COMMIT_no_enospc, ({
+ bch2_disk_reservation_put(c, &res);
+ check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
+ check_extent_overbig(trans, &iter, k);
+ })) ?:
+ check_i_sectors(trans, &w));
bch2_disk_reservation_put(c, &res);
extent_ends_exit(&extent_ends);
inode_walker_exit(&w);
snapshots_seen_exit(&s);
- bch2_trans_put(trans);
bch_err_fn(c, ret);
return ret;
@@ -1484,24 +1436,19 @@ int bch2_check_extents(struct bch_fs *c)
int bch2_check_indirect_extents(struct bch_fs *c)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
struct disk_reservation res = { 0 };
- int ret = 0;
- ret = for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
- POS_MIN,
- BTREE_ITER_PREFETCH, k,
- &res, NULL,
- BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({
- bch2_disk_reservation_put(c, &res);
- check_extent_overbig(trans, &iter, k);
- }));
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
+ POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ &res, NULL,
+ BCH_TRANS_COMMIT_no_enospc, ({
+ bch2_disk_reservation_put(c, &res);
+ check_extent_overbig(trans, &iter, k);
+ })));
bch2_disk_reservation_put(c, &res);
- bch2_trans_put(trans);
-
bch_err_fn(c, ret);
return ret;
}
@@ -1509,7 +1456,6 @@ int bch2_check_indirect_extents(struct bch_fs *c)
static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
{
struct bch_fs *c = trans->c;
- struct inode_walker_entry *i;
u32 restart_count = trans->restart_count;
int ret = 0;
s64 count2;
@@ -1553,8 +1499,8 @@ static int check_dirent_target(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct bkey_i_dirent *n;
- bool backpointer_exists = true;
struct printbuf buf = PRINTBUF;
+ struct btree_iter bp_iter = { NULL };
int ret = 0;
if (!target->bi_dir &&
@@ -1568,25 +1514,37 @@ static int check_dirent_target(struct btree_trans *trans,
}
if (!inode_points_to_dirent(target, d)) {
- ret = inode_backpointer_exists(trans, target, d.k->p.snapshot);
- if (ret < 0)
+ struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
+ SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot));
+ ret = bkey_err(bp_dirent);
+ if (ret && !bch2_err_matches(ret, ENOENT))
goto err;
- backpointer_exists = ret;
+ bool backpointer_exists = !ret;
ret = 0;
+ bch2_bkey_val_to_text(&buf, c, d.s_c);
+ prt_newline(&buf);
+ if (backpointer_exists)
+ bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
+
if (fsck_err_on(S_ISDIR(target->bi_mode) && backpointer_exists,
c, inode_dir_multiple_links,
- "directory %llu with multiple links",
- target->bi_inum)) {
+ "directory %llu:%u with multiple links\n%s",
+ target->bi_inum, target_snapshot, buf.buf)) {
ret = __remove_dirent(trans, d.k->p);
goto out;
}
+ /*
+ * hardlinked file with nlink 0:
+ * We're just adjusting nlink here so check_nlinks() will pick
+ * it up, it ignores inodes with nlink 0
+ */
if (fsck_err_on(backpointer_exists && !target->bi_nlink,
c, inode_multiple_links_but_nlink_0,
- "inode %llu type %s has multiple links but i_nlink 0",
- target->bi_inum, bch2_d_types[d.v->d_type])) {
+ "inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
+ target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
target->bi_nlink++;
target->bi_flags &= ~BCH_INODE_unlinked;
@@ -1636,13 +1594,12 @@ static int check_dirent_target(struct btree_trans *trans,
d = dirent_i_to_s_c(n);
}
- if (d.v->d_type == DT_SUBVOL &&
- target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) &&
- (c->sb.version < bcachefs_metadata_version_subvol_dirent ||
- fsck_err(c, dirent_d_parent_subvol_wrong,
- "dirent has wrong d_parent_subvol field: got %u, should be %u",
- le32_to_cpu(d.v->d_parent_subvol),
- target->bi_parent_subvol))) {
+ if (fsck_err_on(d.v->d_type == DT_SUBVOL &&
+ target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol),
+ c, dirent_d_parent_subvol_wrong,
+ "dirent has wrong d_parent_subvol field: got %u, should be %u",
+ le32_to_cpu(d.v->d_parent_subvol),
+ target->bi_parent_subvol)) {
n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
ret = PTR_ERR_OR_ZERO(n);
if (ret)
@@ -1660,6 +1617,7 @@ static int check_dirent_target(struct btree_trans *trans,
out:
err:
fsck_err:
+ bch2_trans_iter_exit(trans, &bp_iter);
printbuf_exit(&buf);
bch_err_fn(c, ret);
return ret;
@@ -1701,7 +1659,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
goto err;
}
- BUG_ON(!iter->path->should_be_locked);
+ BUG_ON(!btree_iter_path(trans, iter)->should_be_locked);
i = walk_inode(trans, dir, equiv, k.k->type == KEY_TYPE_whiteout);
ret = PTR_ERR_OR_ZERO(i);
@@ -1754,7 +1712,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
u32 target_snapshot;
u64 target_inum;
- ret = __subvol_lookup(trans, target_subvol,
+ ret = subvol_lookup(trans, target_subvol,
&target_snapshot, &target_inum);
if (ret && !bch2_err_matches(ret, ENOENT))
goto err;
@@ -1766,7 +1724,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
goto err;
}
- ret = __lookup_inode(trans, target_inum,
+ ret = lookup_inode(trans, target_inum,
&subvol_root, &target_snapshot);
if (ret && !bch2_err_matches(ret, ENOENT))
goto err;
@@ -1842,22 +1800,18 @@ int bch2_check_dirents(struct bch_fs *c)
struct inode_walker target = inode_walker_init();
struct snapshots_seen s;
struct bch_hash_info hash_info;
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
snapshots_seen_init(&s);
- ret = for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
- POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
- k,
- NULL, NULL,
- BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
- check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s));
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
+ POS(BCACHEFS_ROOT_INO, 0),
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
+ k,
+ NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc,
+ check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)));
- bch2_trans_put(trans);
snapshots_seen_exit(&s);
inode_walker_exit(&dir);
inode_walker_exit(&target);
@@ -1908,8 +1862,6 @@ int bch2_check_xattrs(struct bch_fs *c)
{
struct inode_walker inode = inode_walker_init();
struct bch_hash_info hash_info;
- struct btree_iter iter;
- struct bkey_s_c k;
int ret = 0;
ret = bch2_trans_run(c,
@@ -1918,7 +1870,7 @@ int bch2_check_xattrs(struct bch_fs *c)
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
k,
NULL, NULL,
- BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ BCH_TRANS_COMMIT_no_enospc,
check_xattr(trans, &iter, k, &hash_info, &inode)));
bch_err_fn(c, ret);
return ret;
@@ -1932,7 +1884,7 @@ static int check_root_trans(struct btree_trans *trans)
u64 inum;
int ret;
- ret = __subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
+ ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
@@ -1948,18 +1900,13 @@ static int check_root_trans(struct btree_trans *trans)
root_subvol.v.flags = 0;
root_subvol.v.snapshot = cpu_to_le32(snapshot);
root_subvol.v.inode = cpu_to_le64(inum);
- ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
- bch2_btree_insert_trans(trans, BTREE_ID_subvolumes,
- &root_subvol.k_i, 0));
+ ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol.k_i, 0);
bch_err_msg(c, ret, "writing root subvol");
if (ret)
goto err;
-
}
- ret = __lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
+ ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
@@ -1983,11 +1930,7 @@ fsck_err:
/* Get root directory, create if it doesn't exist: */
int bch2_check_root(struct bch_fs *c)
{
- int ret;
-
- ret = bch2_trans_do(c, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
+ int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_root_trans(trans));
bch_err_fn(c, ret);
return ret;
@@ -2002,13 +1945,10 @@ typedef DARRAY(struct pathbuf_entry) pathbuf;
static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
{
- struct pathbuf_entry *i;
-
darray_for_each(*p, i)
if (i->inum == inum &&
i->snapshot == snapshot)
return true;
-
return false;
}
@@ -2057,10 +1997,10 @@ static int check_path(struct btree_trans *trans,
break;
}
- ret = lockrestart_do(trans,
- PTR_ERR_OR_ZERO((d = dirent_get_by_pos(trans, &dirent_iter,
- SPOS(inode->bi_dir, inode->bi_dir_offset,
- parent_snapshot))).k));
+ d = dirent_get_by_pos(trans, &dirent_iter,
+ SPOS(inode->bi_dir, inode->bi_dir_offset,
+ parent_snapshot));
+ ret = bkey_err(d.s_c);
if (ret && !bch2_err_matches(ret, ENOENT))
break;
@@ -2097,13 +2037,12 @@ static int check_path(struct btree_trans *trans,
ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot);
if (ret) {
/* Should have been caught in dirents pass */
- bch_err(c, "error looking up parent directory: %i", ret);
+ if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch_err(c, "error looking up parent directory: %i", ret);
break;
}
if (path_is_dup(p, inode->bi_inum, snapshot)) {
- struct pathbuf_entry *i;
-
/* XXX print path */
bch_err(c, "directory structure loop");
@@ -2111,20 +2050,19 @@ static int check_path(struct btree_trans *trans,
pr_err("%llu:%u", i->inum, i->snapshot);
pr_err("%llu:%u", inode->bi_inum, snapshot);
- if (!fsck_err(c, dir_loop,
- "directory structure loop"))
+ if (!fsck_err(c, dir_loop, "directory structure loop"))
return 0;
- ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
- remove_backpointer(trans, inode));
- if (ret) {
- bch_err(c, "error removing dirent: %i", ret);
+ ret = remove_backpointer(trans, inode);
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch_err_msg(c, ret, "removing dirent");
+ if (ret)
break;
- }
ret = reattach_inode(trans, inode, snapshot);
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch_err_msg(c, ret, "reattaching inode %llu", inode->bi_inum);
+ break;
}
}
fsck_err:
@@ -2139,37 +2077,28 @@ fsck_err:
*/
int bch2_check_directory_structure(struct bch_fs *c)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
struct bch_inode_unpacked u;
pathbuf path = { 0, };
int ret;
- for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN,
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
- if (!bkey_is_inode(k.k))
- continue;
+ ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+ if (!bkey_is_inode(k.k))
+ continue;
- ret = bch2_inode_unpack(k, &u);
- if (ret) {
- /* Should have been caught earlier in fsck: */
- bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret);
- break;
- }
+ BUG_ON(bch2_inode_unpack(k, &u));
- if (u.bi_flags & BCH_INODE_unlinked)
- continue;
+ if (u.bi_flags & BCH_INODE_unlinked)
+ continue;
- ret = check_path(trans, &path, &u, iter.pos.snapshot);
- if (ret)
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
+ check_path(trans, &path, &u, iter.pos.snapshot);
+ })));
darray_exit(&path);
+
bch_err_fn(c, ret);
return ret;
}
@@ -2255,47 +2184,39 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c,
struct nlink_table *t,
u64 start, u64 *end)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_inode_unpacked u;
- int ret = 0;
-
- for_each_btree_key(trans, iter, BTREE_ID_inodes,
- POS(0, start),
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
- if (!bkey_is_inode(k.k))
- continue;
-
- /* Should never fail, checked by bch2_inode_invalid: */
- BUG_ON(bch2_inode_unpack(k, &u));
-
- /*
- * Backpointer and directory structure checks are sufficient for
- * directories, since they can't have hardlinks:
- */
- if (S_ISDIR(u.bi_mode))
- continue;
+ int ret = bch2_trans_run(c,
+ for_each_btree_key(trans, iter, BTREE_ID_inodes,
+ POS(0, start),
+ BTREE_ITER_INTENT|
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ if (!bkey_is_inode(k.k))
+ continue;
- if (!u.bi_nlink)
- continue;
+ /* Should never fail, checked by bch2_inode_invalid: */
+ struct bch_inode_unpacked u;
+ BUG_ON(bch2_inode_unpack(k, &u));
- ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
- if (ret) {
- *end = k.k->p.offset;
- ret = 0;
- break;
- }
+ /*
+ * Backpointer and directory structure checks are sufficient for
+ * directories, since they can't have hardlinks:
+ */
+ if (S_ISDIR(u.bi_mode))
+ continue;
- }
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
+ if (!u.bi_nlink)
+ continue;
- if (ret)
- bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
+ ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
+ if (ret) {
+ *end = k.k->p.offset;
+ ret = 0;
+ break;
+ }
+ 0;
+ })));
+ bch_err_fn(c, ret);
return ret;
}
@@ -2303,42 +2224,34 @@ noinline_for_stack
static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
u64 range_start, u64 range_end)
{
- struct btree_trans *trans = bch2_trans_get(c);
struct snapshots_seen s;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_s_c_dirent d;
- int ret;
snapshots_seen_init(&s);
- for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
- ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
- if (ret)
- break;
-
- switch (k.k->type) {
- case KEY_TYPE_dirent:
- d = bkey_s_c_to_dirent(k);
+ int ret = bch2_trans_run(c,
+ for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
+ if (ret)
+ break;
- if (d.v->d_type != DT_DIR &&
- d.v->d_type != DT_SUBVOL)
- inc_link(c, &s, links, range_start, range_end,
- le64_to_cpu(d.v->d_inum),
- bch2_snapshot_equiv(c, d.k->p.snapshot));
- break;
- }
- }
- bch2_trans_iter_exit(trans, &iter);
+ if (k.k->type == KEY_TYPE_dirent) {
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
- if (ret)
- bch_err(c, "error in fsck: btree error %i while walking dirents", ret);
+ if (d.v->d_type != DT_DIR &&
+ d.v->d_type != DT_SUBVOL)
+ inc_link(c, &s, links, range_start, range_end,
+ le64_to_cpu(d.v->d_inum),
+ bch2_snapshot_equiv(c, d.k->p.snapshot));
+ }
+ 0;
+ })));
- bch2_trans_put(trans);
snapshots_seen_exit(&s);
+
+ bch_err_fn(c, ret);
return ret;
}
@@ -2389,19 +2302,16 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c,
struct nlink_table *links,
u64 range_start, u64 range_end)
{
- struct btree_iter iter;
- struct bkey_s_c k;
size_t idx = 0;
- int ret = 0;
- ret = bch2_trans_run(c,
+ int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
POS(0, range_start),
BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
if (ret < 0) {
- bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
+ bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret));
return ret;
}
@@ -2447,7 +2357,6 @@ static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
{
struct bkey_s_c_reflink_p p;
struct bkey_i_reflink_p *u;
- int ret;
if (k.k->type != KEY_TYPE_reflink_p)
return 0;
@@ -2458,7 +2367,7 @@ static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
return 0;
u = bch2_trans_kmalloc(trans, sizeof(*u));
- ret = PTR_ERR_OR_ZERO(u);
+ int ret = PTR_ERR_OR_ZERO(u);
if (ret)
return ret;
@@ -2471,19 +2380,15 @@ static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
int bch2_fix_reflink_p(struct bch_fs *c)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
return 0;
- ret = bch2_trans_run(c,
+ int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
BTREE_ID_extents, POS_MIN,
BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|
BTREE_ITER_ALL_SNAPSHOTS, k,
- NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
fix_reflink_p_key(trans, &iter, k)));
bch_err_fn(c, ret);
return ret;
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 9309cfeecd..086f0090b0 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -506,22 +506,33 @@ fsck_err:
static void __bch2_inode_unpacked_to_text(struct printbuf *out,
struct bch_inode_unpacked *inode)
{
- prt_printf(out, "mode=%o ", inode->bi_mode);
+ printbuf_indent_add(out, 2);
+ prt_printf(out, "mode=%o", inode->bi_mode);
+ prt_newline(out);
prt_str(out, "flags=");
prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1));
prt_printf(out, " (%x)", inode->bi_flags);
+ prt_newline(out);
- prt_printf(out, " journal_seq=%llu bi_size=%llu bi_sectors=%llu bi_version=%llu",
- inode->bi_journal_seq,
- inode->bi_size,
- inode->bi_sectors,
- inode->bi_version);
+	/* One field per line; each value is terminated by its own newline. */
+	prt_printf(out, "journal_seq=%llu", inode->bi_journal_seq);
+	prt_newline(out);
+
+	prt_printf(out, "bi_size=%llu", inode->bi_size);
+	prt_newline(out);
+
+	prt_printf(out, "bi_sectors=%llu", inode->bi_sectors);
+	prt_newline(out);
+
+	/*
+	 * Terminate bi_version too, so the first x() field printed below starts
+	 * on a line of its own instead of running on from it.
+	 */
+	prt_printf(out, "bi_version=%llu", inode->bi_version);
+	prt_newline(out);
#define x(_name, _bits) \
- prt_printf(out, " "#_name "=%llu", (u64) inode->_name);
+ prt_printf(out, #_name "=%llu", (u64) inode->_name); \
+ prt_newline(out);
BCH_INODE_FIELDS_v3()
#undef x
+ printbuf_indent_sub(out, 2);
}
void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
@@ -561,64 +572,46 @@ static inline bool bkey_is_deleted_inode(struct bkey_s_c k)
return bkey_inode_flags(k) & BCH_INODE_unlinked;
}
-int bch2_trans_mark_inode(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old,
- struct bkey_i *new,
- unsigned flags)
+int bch2_trigger_inode(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old,
+ struct bkey_s new,
+ unsigned flags)
{
- int nr = bkey_is_inode(&new->k) - bkey_is_inode(old.k);
- bool old_deleted = bkey_is_deleted_inode(old);
- bool new_deleted = bkey_is_deleted_inode(bkey_i_to_s_c(new));
+ s64 nr = bkey_is_inode(new.k) - bkey_is_inode(old.k);
- if (nr) {
- int ret = bch2_replicas_deltas_realloc(trans, 0);
- struct replicas_delta_list *d = trans->fs_usage_deltas;
+ if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ if (nr) {
+ int ret = bch2_replicas_deltas_realloc(trans, 0);
+ if (ret)
+ return ret;
- if (ret)
- return ret;
+ trans->fs_usage_deltas->nr_inodes += nr;
+ }
- d->nr_inodes += nr;
+ bool old_deleted = bkey_is_deleted_inode(old);
+ bool new_deleted = bkey_is_deleted_inode(new.s_c);
+ if (old_deleted != new_deleted) {
+ int ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new.k->p, new_deleted);
+ if (ret)
+ return ret;
+ }
}
- if (old_deleted != new_deleted) {
- int ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new->k.p, new_deleted);
- if (ret)
- return ret;
- }
+ if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) {
+ BUG_ON(!trans->journal_res.seq);
- return 0;
-}
-
-int bch2_mark_inode(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct bch_fs_usage *fs_usage;
- u64 journal_seq = trans->journal_res.seq;
-
- if (flags & BTREE_TRIGGER_INSERT) {
- struct bch_inode_v3 *v = (struct bch_inode_v3 *) new.v;
-
- BUG_ON(!journal_seq);
- BUG_ON(new.k->type != KEY_TYPE_inode_v3);
-
- v->bi_journal_seq = cpu_to_le64(journal_seq);
+ bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq);
}
if (flags & BTREE_TRIGGER_GC) {
- percpu_down_read(&c->mark_lock);
- preempt_disable();
+ struct bch_fs *c = trans->c;
- fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC);
- fs_usage->nr_inodes += bkey_is_inode(new.k);
- fs_usage->nr_inodes -= bkey_is_inode(old.k);
-
- preempt_enable();
+ percpu_down_read(&c->mark_lock);
+ this_cpu_add(c->usage_gc->b.nr_inodes, nr);
percpu_up_read(&c->mark_lock);
}
+
return 0;
}
@@ -831,7 +824,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans,
ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
err:
if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
break;
@@ -894,7 +887,7 @@ retry:
ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
err:
bch2_trans_iter_exit(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -1058,7 +1051,7 @@ retry:
ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
err:
bch2_trans_iter_exit(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -1155,51 +1148,48 @@ delete:
int bch2_delete_dead_inodes(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
bool need_another_pass;
int ret;
again:
need_another_pass = false;
- ret = bch2_btree_write_buffer_flush_sync(trans);
- if (ret)
- goto err;
-
/*
* Weird transaction restart handling here because on successful delete,
* bch2_inode_rm_snapshot() will return a nested transaction restart,
* but we can't retry because the btree write buffer won't have been
* flushed and we'd spin:
*/
- for_each_btree_key(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
- ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
- may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass));
- if (ret < 0)
- break;
-
- if (ret) {
- if (!test_bit(BCH_FS_RW, &c->flags)) {
- bch2_trans_unlock(trans);
- bch2_fs_lazy_rw(c);
- }
-
+ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+ ret = may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass);
+ if (ret > 0) {
bch_verbose(c, "deleting unlinked inode %llu:%u", k.k->p.offset, k.k->p.snapshot);
ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot);
- if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
- break;
+ /*
+ * We don't want to loop here: a transaction restart
+ * error here means we handled a transaction restart and
+ * we're actually done, but if we loop we'll retry the
+ * same key because the write buffer hasn't been flushed
+ * yet
+ */
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+ ret = 0;
+ continue;
+ }
}
- }
- bch2_trans_iter_exit(trans, &iter);
- if (!ret && need_another_pass)
+ ret;
+ }));
+
+ if (!ret && need_another_pass) {
+ ret = bch2_btree_write_buffer_flush_sync(trans);
+ if (ret)
+ goto err;
goto again;
+ }
err:
bch2_trans_put(trans);
-
return ret;
}
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index 88818a332b..b63f312581 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -17,32 +17,27 @@ int bch2_inode_v3_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-int bch2_trans_mark_inode(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_i *, unsigned);
-int bch2_mark_inode(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s, unsigned);
#define bch2_bkey_ops_inode ((struct bkey_ops) { \
.key_invalid = bch2_inode_invalid, \
.val_to_text = bch2_inode_to_text, \
- .trans_trigger = bch2_trans_mark_inode, \
- .atomic_trigger = bch2_mark_inode, \
+ .trigger = bch2_trigger_inode, \
.min_val_size = 16, \
})
#define bch2_bkey_ops_inode_v2 ((struct bkey_ops) { \
.key_invalid = bch2_inode_v2_invalid, \
.val_to_text = bch2_inode_to_text, \
- .trans_trigger = bch2_trans_mark_inode, \
- .atomic_trigger = bch2_mark_inode, \
+ .trigger = bch2_trigger_inode, \
.min_val_size = 32, \
})
#define bch2_bkey_ops_inode_v3 ((struct bkey_ops) { \
.key_invalid = bch2_inode_v3_invalid, \
.val_to_text = bch2_inode_to_text, \
- .trans_trigger = bch2_trans_mark_inode, \
- .atomic_trigger = bch2_mark_inode, \
+ .trigger = bch2_trigger_inode, \
.min_val_size = 48, \
})
diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h
new file mode 100644
index 0000000000..83d107331e
--- /dev/null
+++ b/fs/bcachefs/inode_format.h
@@ -0,0 +1,166 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_INODE_FORMAT_H
+#define _BCACHEFS_INODE_FORMAT_H
+
+#define BLOCKDEV_INODE_MAX 4096
+#define BCACHEFS_ROOT_INO 4096
+
+/*
+ * First of the three inode value layouts declared in this header: a fixed
+ * header followed by the variable-length fields[] payload.
+ */
+struct bch_inode {
+ struct bch_val v;
+
+ __le64 bi_hash_seed;
+ /* 32-bit flags word; the INODE_* LE32_BITMASK()s below are sub-fields of it */
+ __le32 bi_flags;
+ __le16 bi_mode;
+ __u8 fields[];
+} __packed __aligned(8);
+
+/*
+ * Second inode value layout. Compared with struct bch_inode it adds
+ * bi_journal_seq and makes bi_flags a 64-bit word.
+ */
+struct bch_inode_v2 {
+ struct bch_val v;
+
+ __le64 bi_journal_seq;
+ __le64 bi_hash_seed;
+ /* 64-bit flags word; the INODEv2_* LE64_BITMASK()s below are sub-fields of it */
+ __le64 bi_flags;
+ __le16 bi_mode;
+ __u8 fields[];
+} __packed __aligned(8);
+
+/*
+ * Third inode value layout. There is no separate bi_mode member (see the
+ * INODEv3_MODE bitmask over bi_flags below), and bi_sectors, bi_size and
+ * bi_version are fixed members placed ahead of fields[].
+ */
+struct bch_inode_v3 {
+ struct bch_val v;
+
+ __le64 bi_journal_seq;
+ __le64 bi_hash_seed;
+ /* 64-bit flags word; the INODEv3_* LE64_BITMASK()s below are sub-fields of it */
+ __le64 bi_flags;
+ __le64 bi_sectors;
+ __le64 bi_size;
+ __le64 bi_version;
+ __u8 fields[];
+} __packed __aligned(8);
+
+/*
+ * _CUR is the offset of fields[] within struct bch_inode_v3, in units of
+ * sizeof(__u64).
+ * NOTE(review): _INITIAL is presumably the same quantity as of when the v3
+ * layout was introduced - confirm against the users of INODEv3_FIELDS_START.
+ */
+#define INODEv3_FIELDS_START_INITIAL 6
+#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64))
+
+/* Value carrying a single 32-bit generation number plus 32 bits of padding. */
+struct bch_inode_generation {
+ struct bch_val v;
+
+ __le32 bi_generation;
+ __le32 pad;
+} __packed __aligned(8);
+
+/*
+ * bi_subvol and bi_parent_subvol are only set for subvolume roots:
+ */
+
+/*
+ * x-macro list of inode fields for the v2 layout: callers define
+ * x(name, arg) before expanding it.
+ * NOTE(review): the second argument looks like the field width in bits -
+ * confirm against the code that expands this list.
+ */
+#define BCH_INODE_FIELDS_v2() \
+ x(bi_atime, 96) \
+ x(bi_ctime, 96) \
+ x(bi_mtime, 96) \
+ x(bi_otime, 96) \
+ x(bi_size, 64) \
+ x(bi_sectors, 64) \
+ x(bi_uid, 32) \
+ x(bi_gid, 32) \
+ x(bi_nlink, 32) \
+ x(bi_generation, 32) \
+ x(bi_dev, 32) \
+ x(bi_data_checksum, 8) \
+ x(bi_compression, 8) \
+ x(bi_project, 32) \
+ x(bi_background_compression, 8) \
+ x(bi_data_replicas, 8) \
+ x(bi_promote_target, 16) \
+ x(bi_foreground_target, 16) \
+ x(bi_background_target, 16) \
+ x(bi_erasure_code, 16) \
+ x(bi_fields_set, 16) \
+ x(bi_dir, 64) \
+ x(bi_dir_offset, 64) \
+ x(bi_subvol, 32) \
+ x(bi_parent_subvol, 32)
+
+/*
+ * x-macro list of inode fields for the v3 layout. Relative to
+ * BCH_INODE_FIELDS_v2() it has no bi_size/bi_sectors entries (those are
+ * fixed members of struct bch_inode_v3) and it appends bi_nocow.
+ */
+#define BCH_INODE_FIELDS_v3() \
+ x(bi_atime, 96) \
+ x(bi_ctime, 96) \
+ x(bi_mtime, 96) \
+ x(bi_otime, 96) \
+ x(bi_uid, 32) \
+ x(bi_gid, 32) \
+ x(bi_nlink, 32) \
+ x(bi_generation, 32) \
+ x(bi_dev, 32) \
+ x(bi_data_checksum, 8) \
+ x(bi_compression, 8) \
+ x(bi_project, 32) \
+ x(bi_background_compression, 8) \
+ x(bi_data_replicas, 8) \
+ x(bi_promote_target, 16) \
+ x(bi_foreground_target, 16) \
+ x(bi_background_target, 16) \
+ x(bi_erasure_code, 16) \
+ x(bi_fields_set, 16) \
+ x(bi_dir, 64) \
+ x(bi_dir_offset, 64) \
+ x(bi_subvol, 32) \
+ x(bi_parent_subvol, 32) \
+ x(bi_nocow, 8)
+
+/*
+ * Subset of BCH_INODE_FIELDS: every name here matches a
+ * BCH_INODE_FIELDS_v3() entry with its bi_ prefix removed.
+ */
+#define BCH_INODE_OPTS() \
+ x(data_checksum, 8) \
+ x(compression, 8) \
+ x(project, 32) \
+ x(background_compression, 8) \
+ x(data_replicas, 8) \
+ x(promote_target, 16) \
+ x(foreground_target, 16) \
+ x(background_target, 16) \
+ x(erasure_code, 16) \
+ x(nocow, 8)
+
+/*
+ * One Inode_opt_<name> enumerator per BCH_INODE_OPTS() entry, in list
+ * order; Inode_opt_nr follows the last one and so equals their count.
+ */
+enum inode_opt_id {
+#define x(name, ...) \
+ Inode_opt_##name,
+ BCH_INODE_OPTS()
+#undef x
+ Inode_opt_nr,
+};
+
+/* x-macro list of inode flags: x(name, bit number within bi_flags) */
+#define BCH_INODE_FLAGS() \
+ x(sync, 0) \
+ x(immutable, 1) \
+ x(append, 2) \
+ x(nodump, 3) \
+ x(noatime, 4) \
+ x(i_size_dirty, 5) \
+ x(i_sectors_dirty, 6) \
+ x(unlinked, 7) \
+ x(backptr_untrusted, 8)
+
+/* bits 20+ reserved for packed fields below: */
+
+/* Flag masks: BCH_INODE_<name> == 1U << <bit number> */
+enum bch_inode_flags {
+#define x(t, n) BCH_INODE_##t = 1U << n,
+ BCH_INODE_FLAGS()
+#undef x
+};
+
+/* Flag bit numbers: __BCH_INODE_<name> == <bit number> */
+enum __bch_inode_flags {
+#define x(t, n) __BCH_INODE_##t = n,
+ BCH_INODE_FLAGS()
+#undef x
+};
+
+/*
+ * Sub-fields packed into the bi_flags word of each inode layout.
+ * NOTE(review): the two trailing numbers look like start and end bit
+ * offsets within bi_flags - confirm against the LE32_BITMASK()/
+ * LE64_BITMASK() definitions.
+ */
+LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24);
+LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
+LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32);
+
+LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24);
+LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31);
+
+LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24);
+LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31);
+
+LE64_BITMASK(INODEv3_FIELDS_START,
+ struct bch_inode_v3, bi_flags, 31, 36);
+LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52);
+
+#endif /* _BCACHEFS_INODE_FORMAT_H */
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index bebc11444e..1baf78594c 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -34,8 +34,7 @@ int bch2_extent_fallocate(struct btree_trans *trans,
struct open_buckets open_buckets = { 0 };
struct bkey_s_c k;
struct bkey_buf old, new;
- unsigned sectors_allocated = 0;
- bool have_reservation = false;
+ unsigned sectors_allocated = 0, new_replicas;
bool unwritten = opts.nocow &&
c->sb.version >= bcachefs_metadata_version_unwritten_extents;
int ret;
@@ -50,28 +49,20 @@ int bch2_extent_fallocate(struct btree_trans *trans,
return ret;
sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset);
+ new_replicas = max(0, (int) opts.data_replicas -
+ (int) bch2_bkey_nr_ptrs_fully_allocated(k));
- if (!have_reservation) {
- unsigned new_replicas =
- max(0, (int) opts.data_replicas -
- (int) bch2_bkey_nr_ptrs_fully_allocated(k));
- /*
- * Get a disk reservation before (in the nocow case) calling
- * into the allocator:
- */
- ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0);
- if (unlikely(ret))
- goto err;
-
- bch2_bkey_buf_reassemble(&old, c, k);
- }
+ /*
+ * Get a disk reservation before (in the nocow case) calling
+ * into the allocator:
+ */
+ ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0);
+ if (unlikely(ret))
+ goto err_noprint;
- if (have_reservation) {
- if (!bch2_extents_match(k, bkey_i_to_s_c(old.k)))
- goto err;
+ bch2_bkey_buf_reassemble(&old, c, k);
- bch2_key_resize(&new.k->k, sectors);
- } else if (!unwritten) {
+ if (!unwritten) {
struct bkey_i_reservation *reservation;
bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64));
@@ -83,7 +74,6 @@ int bch2_extent_fallocate(struct btree_trans *trans,
struct bkey_i_extent *e;
struct bch_devs_list devs_have;
struct write_point *wp;
- struct bch_extent_ptr *ptr;
devs_have.nr = 0;
@@ -118,14 +108,17 @@ int bch2_extent_fallocate(struct btree_trans *trans,
ptr->unwritten = true;
}
- have_reservation = true;
-
ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res,
0, i_sectors_delta, true);
err:
if (!ret && sectors_allocated)
bch2_increment_clock(c, sectors_allocated, WRITE);
-
+ if (should_print_err(ret))
+ bch_err_inum_offset_ratelimited(c,
+ inum.inum,
+ iter->pos.offset << 9,
+ "%s(): error: %s", __func__, bch2_err_str(ret));
+err_noprint:
bch2_open_buckets_put(c, &open_buckets);
bch2_disk_reservation_put(c, &disk_res);
bch2_bkey_buf_exit(&new, c);
@@ -256,7 +249,7 @@ static int __bch2_resume_logged_op_truncate(struct btree_trans *trans,
u64 new_i_size = le64_to_cpu(op->v.new_i_size);
int ret;
- ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
truncate_set_isize(trans, inum, new_i_size));
if (ret)
goto err;
@@ -378,7 +371,7 @@ case LOGGED_OP_FINSERT_start:
op->v.state = LOGGED_OP_FINSERT_shift_extents;
if (insert) {
- ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
adjust_i_size(trans, inum, src_offset, len) ?:
bch2_logged_op_update(trans, &op->k_i));
if (ret)
@@ -390,7 +383,7 @@ case LOGGED_OP_FINSERT_start:
if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto err;
- ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_logged_op_update(trans, &op->k_i));
}
@@ -449,13 +442,11 @@ case LOGGED_OP_FINSERT_shift_extents:
op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset);
- ret = bch2_bkey_set_needs_rebalance(c, copy,
- opts.background_target,
- opts.background_compression) ?:
+ ret = bch2_bkey_set_needs_rebalance(c, copy, &opts) ?:
bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
bch2_logged_op_update(trans, &op->k_i) ?:
- bch2_trans_commit(trans, &disk_res, NULL, BTREE_INSERT_NOFAIL);
+ bch2_trans_commit(trans, &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc);
btree_err:
bch2_disk_reservation_put(c, &disk_res);
@@ -470,12 +461,12 @@ btree_err:
op->v.state = LOGGED_OP_FINSERT_finish;
if (!insert) {
- ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
adjust_i_size(trans, inum, src_offset, shift) ?:
bch2_logged_op_update(trans, &op->k_i));
} else {
/* We need an inode update to update bi_journal_seq for fsync: */
- ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
adjust_i_size(trans, inum, 0, 0) ?:
bch2_logged_op_update(trans, &op->k_i));
}
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index 36763865fa..3c574d8873 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -80,7 +80,7 @@ struct promote_op {
struct bpos pos;
struct data_update write;
- struct bio_vec bi_inline_vecs[0]; /* must be last */
+ struct bio_vec bi_inline_vecs[]; /* must be last */
};
static const struct rhashtable_params bch_promote_params = {
@@ -172,11 +172,13 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
int ret;
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
- return NULL;
+ return ERR_PTR(-BCH_ERR_nopromote_no_writes);
- op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOFS);
- if (!op)
+ op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_KERNEL);
+ if (!op) {
+ ret = -BCH_ERR_nopromote_enomem;
goto err;
+ }
op->start_time = local_clock();
op->pos = pos;
@@ -187,24 +189,29 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
*/
*rbio = kzalloc(sizeof(struct bch_read_bio) +
sizeof(struct bio_vec) * pages,
- GFP_NOFS);
- if (!*rbio)
+ GFP_KERNEL);
+ if (!*rbio) {
+ ret = -BCH_ERR_nopromote_enomem;
goto err;
+ }
rbio_init(&(*rbio)->bio, opts);
bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);
- if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9,
- GFP_NOFS))
+ if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, GFP_KERNEL)) {
+ ret = -BCH_ERR_nopromote_enomem;
goto err;
+ }
(*rbio)->bounce = true;
(*rbio)->split = true;
(*rbio)->kmalloc = true;
if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
- bch_promote_params))
+ bch_promote_params)) {
+ ret = -BCH_ERR_nopromote_in_flight;
goto err;
+ }
bio = &op->write.op.wbio.bio;
bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);
@@ -223,9 +230,8 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
* -BCH_ERR_ENOSPC_disk_reservation:
*/
if (ret) {
- ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
- bch_promote_params);
- BUG_ON(ret);
+ BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
+ bch_promote_params));
goto err;
}
@@ -239,7 +245,7 @@ err:
*rbio = NULL;
kfree(op);
bch2_write_ref_put(c, BCH_WRITE_REF_promote);
- return NULL;
+ return ERR_PTR(ret);
}
noinline
@@ -274,10 +280,9 @@ static struct promote_op *promote_alloc(struct btree_trans *trans,
? BTREE_ID_reflink
: BTREE_ID_extents,
k, pos, pick, opts, sectors, rbio);
- if (!promote) {
- ret = -BCH_ERR_nopromote_enomem;
+ ret = PTR_ERR_OR_ZERO(promote);
+ if (ret)
goto nopromote;
- }
*bounce = true;
*read_full = promote_full;
@@ -526,7 +531,7 @@ out:
static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
{
- bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL,
+ bch2_trans_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
__bch2_rbio_narrow_crcs(trans, rbio));
}
@@ -637,12 +642,17 @@ csum_err:
goto out;
}
+ struct printbuf buf = PRINTBUF;
+ buf.atomic++;
+ prt_str(&buf, "data ");
+ bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);
+
bch_err_inum_offset_ratelimited(ca,
rbio->read_pos.inode,
rbio->read_pos.offset << 9,
- "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)",
- rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
- csum.hi, csum.lo, bch2_csum_types[crc.csum_type]);
+ "data %s", buf.buf);
+ printbuf_exit(&buf);
+
bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
goto out;
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index 8c8cb1541a..2c098ac017 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -316,8 +316,8 @@ int bch2_extent_update(struct btree_trans *trans,
i_sectors_delta) ?:
bch2_trans_update(trans, iter, k, 0) ?:
bch2_trans_commit(trans, disk_res, NULL,
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc);
if (unlikely(ret))
return ret;
@@ -362,9 +362,7 @@ static int bch2_write_index_default(struct bch_write_op *op)
bkey_start_pos(&sk.k->k),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- ret = bch2_bkey_set_needs_rebalance(c, sk.k,
- op->opts.background_target,
- op->opts.background_compression) ?:
+ ret = bch2_bkey_set_needs_rebalance(c, sk.k, &op->opts) ?:
bch2_extent_update(trans, inum, &iter, sk.k,
&op->res,
op->new_i_size, &op->i_sectors_delta,
@@ -396,17 +394,14 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
bool nocow)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
- const struct bch_extent_ptr *ptr;
struct bch_write_bio *n;
- struct bch_dev *ca;
BUG_ON(c->opts.nochanges);
bkey_for_each_ptr(ptrs, ptr) {
- BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
- !c->devs[ptr->dev]);
+ BUG_ON(!bch2_dev_exists2(c, ptr->dev));
- ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
if (to_entry(ptr + 1) < ptrs.end) {
n = to_wbio(bio_alloc_clone(NULL, &wbio->bio,
@@ -1109,16 +1104,14 @@ static bool bch2_extent_is_writeable(struct bch_write_op *op,
static inline void bch2_nocow_write_unlock(struct bch_write_op *op)
{
struct bch_fs *c = op->c;
- const struct bch_extent_ptr *ptr;
- struct bkey_i *k;
for_each_keylist_key(&op->insert_keys, k) {
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
bkey_for_each_ptr(ptrs, ptr)
bch2_bucket_nocow_unlock(&c->nocow_locks,
- PTR_BUCKET_POS(c, ptr),
- BUCKET_NOCOW_LOCK_UPDATE);
+ PTR_BUCKET_POS(c, ptr),
+ BUCKET_NOCOW_LOCK_UPDATE);
}
}
@@ -1128,25 +1121,20 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
struct bkey_s_c k,
u64 new_i_size)
{
- struct bkey_i *new;
- struct bkey_ptrs ptrs;
- struct bch_extent_ptr *ptr;
- int ret;
-
if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) {
/* trace this */
return 0;
}
- new = bch2_bkey_make_mut_noupdate(trans, k);
- ret = PTR_ERR_OR_ZERO(new);
+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
+ int ret = PTR_ERR_OR_ZERO(new);
if (ret)
return ret;
bch2_cut_front(bkey_start_pos(&orig->k), new);
bch2_cut_back(orig->k.p, new);
- ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
bkey_for_each_ptr(ptrs, ptr)
ptr->unwritten = 0;
@@ -1167,16 +1155,12 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
{
struct bch_fs *c = op->c;
struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_i *orig;
- struct bkey_s_c k;
- int ret;
for_each_keylist_key(&op->insert_keys, orig) {
- ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents,
+ int ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents,
bkey_start_pos(&orig->k), orig->k.p,
BTREE_ITER_INTENT, k,
- NULL, NULL, BTREE_INSERT_NOFAIL, ({
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size);
}));
@@ -1228,10 +1212,7 @@ static void bch2_nocow_write(struct bch_write_op *op)
struct btree_trans *trans;
struct btree_iter iter;
struct bkey_s_c k;
- struct bkey_ptrs_c ptrs;
- const struct bch_extent_ptr *ptr;
DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets;
- struct bucket_to_lock *i;
u32 snapshot;
struct bucket_to_lock *stale_at;
int ret;
@@ -1273,7 +1254,7 @@ retry:
break;
/* Get iorefs before dropping btree locks: */
- ptrs = bch2_bkey_ptrs_c(k);
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
bkey_for_each_ptr(ptrs, ptr) {
struct bpos b = PTR_BUCKET_POS(c, ptr);
struct nocow_lock_bucket *l =
@@ -1464,6 +1445,11 @@ err:
op->flags |= BCH_WRITE_DONE;
if (ret < 0) {
+ if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT))
+ bch_err_inum_offset_ratelimited(c,
+ op->pos.inode,
+ op->pos.offset << 9,
+ "%s(): error: %s", __func__, bch2_err_str(ret));
op->error = ret;
break;
}
@@ -1578,6 +1564,7 @@ CLOSURE_CALLBACK(bch2_write)
BUG_ON(!op->write_point.v);
BUG_ON(bkey_eq(op->pos, POS_MAX));
+ op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas);
op->start_time = local_clock();
bch2_keylist_init(&op->insert_keys, op->inline_keys);
wbio_init(bio)->put_bio = false;
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 8cf238be62..bc890776eb 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -10,6 +10,7 @@
#include "bkey_methods.h"
#include "btree_gc.h"
#include "btree_update.h"
+#include "btree_write_buffer.h"
#include "buckets.h"
#include "error.h"
#include "journal.h"
@@ -26,6 +27,47 @@ static const char * const bch2_journal_errors[] = {
NULL
};
+static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq)
+{
+ union journal_res_state s = READ_ONCE(j->reservations);
+ unsigned i = seq & JOURNAL_BUF_MASK;
+ struct journal_buf *buf = j->buf + i;
+
+ prt_printf(out, "seq:");
+ prt_tab(out);
+ prt_printf(out, "%llu", seq);
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ prt_printf(out, "refcount:");
+ prt_tab(out);
+ prt_printf(out, "%u", journal_state_count(s, i));
+ prt_newline(out);
+
+ prt_printf(out, "size:");
+ prt_tab(out);
+ prt_human_readable_u64(out, vstruct_bytes(buf->data));
+ prt_newline(out);
+
+ prt_printf(out, "expires");
+ prt_tab(out);
+ prt_printf(out, "%li jiffies", buf->expires - jiffies);
+ prt_newline(out);
+
+ printbuf_indent_sub(out, 2);
+}
+
+static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j)
+{
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 24);
+
+ for (u64 seq = journal_last_unwritten_seq(j);
+ seq <= journal_cur_seq(j);
+ seq++)
+ bch2_journal_buf_to_text(out, j, seq);
+}
+
static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
{
return seq > j->seq_ondisk;
@@ -155,7 +197,7 @@ void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write)
* We don't close a journal_buf until the next journal_buf is finished writing,
* and can be opened again - this also initializes the next journal_buf:
*/
-static void __journal_entry_close(struct journal *j, unsigned closed_val)
+static void __journal_entry_close(struct journal *j, unsigned closed_val, bool trace)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *buf = journal_cur_buf(j);
@@ -184,6 +226,18 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val)
/* Close out old buffer: */
buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
+ if (trace_journal_entry_close_enabled() && trace) {
+ struct printbuf pbuf = PRINTBUF;
+ pbuf.atomic++;
+
+ prt_str(&pbuf, "entry size: ");
+ prt_human_readable_u64(&pbuf, vstruct_bytes(buf->data));
+ prt_newline(&pbuf);
+ bch2_prt_task_backtrace(&pbuf, current, 1, GFP_NOWAIT);
+ trace_journal_entry_close(c, pbuf.buf);
+ printbuf_exit(&pbuf);
+ }
+
sectors = vstruct_blocks_plus(buf->data, c->block_bits,
buf->u64s_reserved) << c->block_bits;
BUG_ON(sectors > buf->sectors);
@@ -222,7 +276,7 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val)
void bch2_journal_halt(struct journal *j)
{
spin_lock(&j->lock);
- __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL);
+ __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true);
if (!j->err_seq)
j->err_seq = journal_cur_seq(j);
journal_wake(j);
@@ -236,7 +290,7 @@ static bool journal_entry_want_write(struct journal *j)
/* Don't close it yet if we already have a write in flight: */
if (ret)
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
else if (nr_unwritten_journal_entries(j)) {
struct journal_buf *buf = journal_cur_buf(j);
@@ -330,6 +384,7 @@ static int journal_entry_open(struct journal *j)
buf->must_flush = false;
buf->separate_flush = false;
buf->flush_time = 0;
+ buf->need_flush_to_write_buffer = true;
memset(buf->data, 0, sizeof(*buf->data));
buf->data->seq = cpu_to_le64(journal_cur_seq(j));
@@ -363,11 +418,6 @@ static int journal_entry_open(struct journal *j)
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
- if (j->res_get_blocked_start)
- bch2_time_stats_update(j->blocked_time,
- j->res_get_blocked_start);
- j->res_get_blocked_start = 0;
-
mod_delayed_work(c->io_complete_wq,
&j->write_work,
msecs_to_jiffies(c->opts.journal_flush_delay));
@@ -407,7 +457,7 @@ static void journal_write_work(struct work_struct *work)
if (delta > 0)
mod_delayed_work(c->io_complete_wq, &j->write_work, delta);
else
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
unlock:
spin_unlock(&j->lock);
}
@@ -464,18 +514,23 @@ retry:
buf->buf_size < JOURNAL_ENTRY_SIZE_MAX)
j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false);
ret = journal_entry_open(j);
- if (ret == JOURNAL_ERR_max_in_flight)
- trace_and_count(c, journal_entry_full, c);
-unlock:
- if ((ret && ret != JOURNAL_ERR_insufficient_devices) &&
- !j->res_get_blocked_start) {
- j->res_get_blocked_start = local_clock() ?: 1;
- trace_and_count(c, journal_full, c);
- }
+ if (ret == JOURNAL_ERR_max_in_flight) {
+ track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
+ &j->max_in_flight_start, true);
+ if (trace_journal_entry_full_enabled()) {
+ struct printbuf buf = PRINTBUF;
+ buf.atomic++;
+ bch2_journal_bufs_to_text(&buf, j);
+ trace_journal_entry_full(c, buf.buf);
+ printbuf_exit(&buf);
+ }
+ count_event(c, journal_entry_full);
+ }
+unlock:
can_discard = j->can_discard;
spin_unlock(&j->lock);
@@ -553,7 +608,7 @@ void bch2_journal_entry_res_resize(struct journal *j,
/*
* Not enough room in current journal entry, have to flush it:
*/
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
} else {
journal_cur_buf(j)->u64s_reserved += d;
}
@@ -610,7 +665,7 @@ recheck_need_open:
struct journal_res res = { 0 };
if (journal_entry_is_open(j))
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
spin_unlock(&j->lock);
@@ -774,6 +829,48 @@ void bch2_journal_block(struct journal *j)
journal_quiesce(j);
}
+static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq)
+{
+ struct journal_buf *ret = NULL;
+
+ mutex_lock(&j->buf_lock);
+ spin_lock(&j->lock);
+ max_seq = min(max_seq, journal_cur_seq(j));
+
+ for (u64 seq = journal_last_unwritten_seq(j);
+ seq <= max_seq;
+ seq++) {
+ unsigned idx = seq & JOURNAL_BUF_MASK;
+ struct journal_buf *buf = j->buf + idx;
+
+ if (buf->need_flush_to_write_buffer) {
+ if (seq == journal_cur_seq(j))
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
+
+ union journal_res_state s;
+ s.v = atomic64_read_acquire(&j->reservations.counter);
+
+ ret = journal_state_count(s, idx)
+ ? ERR_PTR(-EAGAIN)
+ : buf;
+ break;
+ }
+ }
+
+ spin_unlock(&j->lock);
+ if (IS_ERR_OR_NULL(ret))
+ mutex_unlock(&j->buf_lock);
+ return ret;
+}
+
+struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq)
+{
+ struct journal_buf *ret;
+
+ wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, max_seq)) != ERR_PTR(-EAGAIN));
+ return ret;
+}
+
/* allocate journal on a device: */
static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
@@ -955,8 +1052,7 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
break;
}
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
unlock:
up_write(&c->state_lock);
return ret;
@@ -986,17 +1082,13 @@ int bch2_dev_journal_alloc(struct bch_dev *ca)
ret = __bch2_set_nr_journal_buckets(ca, nr, true, NULL);
err:
- if (ret)
- bch_err_fn(ca, ret);
+ bch_err_fn(ca, ret);
return ret;
}
int bch2_fs_journal_alloc(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i;
-
- for_each_online_member(ca, c, i) {
+ for_each_online_member(c, ca) {
if (ca->journal.nr)
continue;
@@ -1225,6 +1317,7 @@ int bch2_fs_journal_init(struct journal *j)
static struct lock_class_key res_key;
unsigned i;
+ mutex_init(&j->buf_lock);
spin_lock_init(&j->lock);
spin_lock_init(&j->err_lock);
init_waitqueue_head(&j->wait);
@@ -1260,10 +1353,8 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
union journal_res_state s;
- struct bch_dev *ca;
unsigned long now = jiffies;
- u64 seq;
- unsigned i;
+ u64 nr_writes = j->nr_flush_writes + j->nr_noflush_writes;
if (!out->nr_tabstops)
printbuf_tabstop_push(out, 24);
@@ -1275,20 +1366,23 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size);
prt_printf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j));
prt_printf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk);
- prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j));
+ prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j));
prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk);
- prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk);
- prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]);
- prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved);
+ prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk);
+ prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]);
+ prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved);
prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes);
- prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes);
- prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim);
+ prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes);
+ prt_printf(out, "average write size:\t");
+ prt_human_readable_u64(out, nr_writes ? div64_u64(j->entry_bytes_written, nr_writes) : 0);
+ prt_newline(out);
+ prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim);
prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim);
prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked);
- prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now)
+ prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now)
? jiffies_to_msecs(j->next_reclaim - jiffies) : 0);
- prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
- prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]);
+ prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
+ prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]);
prt_printf(out, "current entry:\t\t");
switch (s.cur_entry_offset) {
@@ -1304,35 +1398,9 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
}
prt_newline(out);
-
- for (seq = journal_cur_seq(j);
- seq >= journal_last_unwritten_seq(j);
- --seq) {
- i = seq & JOURNAL_BUF_MASK;
-
- prt_printf(out, "unwritten entry:");
- prt_tab(out);
- prt_printf(out, "%llu", seq);
- prt_newline(out);
- printbuf_indent_add(out, 2);
-
- prt_printf(out, "refcount:");
- prt_tab(out);
- prt_printf(out, "%u", journal_state_count(s, i));
- prt_newline(out);
-
- prt_printf(out, "sectors:");
- prt_tab(out);
- prt_printf(out, "%u", j->buf[i].sectors);
- prt_newline(out);
-
- prt_printf(out, "expires");
- prt_tab(out);
- prt_printf(out, "%li jiffies", j->buf[i].expires - jiffies);
- prt_newline(out);
-
- printbuf_indent_sub(out, 2);
- }
+ prt_printf(out, "unwritten entries:");
+ prt_newline(out);
+ bch2_journal_bufs_to_text(out, j);
prt_printf(out,
"replay done:\t\t%i\n",
@@ -1352,8 +1420,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
j->space[journal_space_total].next_entry,
j->space[journal_space_total].total);
- for_each_member_device_rcu(ca, c, i,
- &c->rw_devs[BCH_DATA_journal]) {
+ for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
struct journal_device *ja = &ca->journal;
if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d))
@@ -1362,7 +1429,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
if (!ja->nr)
continue;
- prt_printf(out, "dev %u:\n", i);
+ prt_printf(out, "dev %u:\n", ca->dev_idx);
prt_printf(out, "\tnr\t\t%u\n", ja->nr);
prt_printf(out, "\tbucket size\t%u\n", ca->mi.bucket_size);
prt_printf(out, "\tavailable\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free);
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 2f768e11ae..4544ce24bb 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -119,7 +119,6 @@ static inline void journal_wake(struct journal *j)
{
wake_up(&j->wait);
closure_wake_up(&j->async_wait);
- closure_wake_up(&j->preres_wait);
}
static inline struct journal_buf *journal_cur_buf(struct journal *j)
@@ -239,8 +238,6 @@ bch2_journal_add_entry(struct journal *j, struct journal_res *res,
static inline bool journal_entry_empty(struct jset *j)
{
- struct jset_entry *i;
-
if (j->seq != j->last_seq)
return false;
@@ -426,6 +423,7 @@ static inline void bch2_journal_set_replay_done(struct journal *j)
void bch2_journal_unblock(struct journal *);
void bch2_journal_block(struct journal *);
+struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq);
void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 6ab756a485..47805193f1 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -4,6 +4,7 @@
#include "alloc_foreground.h"
#include "btree_io.h"
#include "btree_update_interior.h"
+#include "btree_write_buffer.h"
#include "buckets.h"
#include "checksum.h"
#include "disk_groups.h"
@@ -26,11 +27,15 @@ static struct nonce journal_nonce(const struct jset *jset)
}};
}
-static bool jset_csum_good(struct bch_fs *c, struct jset *j)
+static bool jset_csum_good(struct bch_fs *c, struct jset *j, struct bch_csum *csum)
{
- return bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)) &&
- !bch2_crc_cmp(j->csum,
- csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j));
+ if (!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j))) {
+ *csum = (struct bch_csum) {};
+ return false;
+ }
+
+ *csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j);
+ return !bch2_crc_cmp(j->csum, *csum);
}
static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
@@ -678,17 +683,12 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs
prt_printf(out, "dev=%u", le32_to_cpu(u->dev));
for (i = 0; i < nr_types; i++) {
- if (i < BCH_DATA_NR)
- prt_printf(out, " %s", bch2_data_types[i]);
- else
- prt_printf(out, " (unknown data type %u)", i);
+ bch2_prt_data_type(out, i);
prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
le64_to_cpu(u->d[i].buckets),
le64_to_cpu(u->d[i].sectors),
le64_to_cpu(u->d[i].fragmented));
}
-
- prt_printf(out, " buckets_ec: %llu", le64_to_cpu(u->buckets_ec));
}
static int journal_entry_log_validate(struct bch_fs *c,
@@ -725,6 +725,22 @@ static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs
journal_entry_btree_keys_to_text(out, c, entry);
}
+static int journal_entry_write_buffer_keys_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
+{
+ return journal_entry_btree_keys_validate(c, jset, entry,
+ version, big_endian, READ);
+}
+
+static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ journal_entry_btree_keys_to_text(out, c, entry);
+}
+
struct jset_entry_ops {
int (*validate)(struct bch_fs *, struct jset *,
struct jset_entry *, unsigned, int,
@@ -768,7 +784,6 @@ void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
enum bkey_invalid_flags flags)
{
- struct jset_entry *entry;
unsigned version = le32_to_cpu(jset->version);
int ret = 0;
@@ -920,6 +935,7 @@ static int journal_read_bucket(struct bch_dev *ca,
u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
end = offset + ca->mi.bucket_size;
bool saw_bad = false, csum_good;
+ struct printbuf err = PRINTBUF;
int ret = 0;
pr_debug("reading %u", bucket);
@@ -952,7 +968,7 @@ reread:
* found on a different device, and missing or
* no journal entries will be handled later
*/
- return 0;
+ goto out;
}
j = buf->data;
@@ -969,12 +985,12 @@ reread:
ret = journal_read_buf_realloc(buf,
vstruct_bytes(j));
if (ret)
- return ret;
+ goto err;
}
goto reread;
case JOURNAL_ENTRY_NONE:
if (!saw_bad)
- return 0;
+ goto out;
/*
* On checksum error we don't really trust the size
* field of the journal entry we read, so try reading
@@ -983,7 +999,7 @@ reread:
sectors = block_sectors(c);
goto next_block;
default:
- return ret;
+ goto err;
}
/*
@@ -993,20 +1009,28 @@ reread:
* bucket:
*/
if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
- return 0;
+ goto out;
ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
- csum_good = jset_csum_good(c, j);
+ enum bch_csum_type csum_type = JSET_CSUM_TYPE(j);
+ struct bch_csum csum;
+ csum_good = jset_csum_good(c, j, &csum);
+
if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum,
- "journal checksum error"))
+ "%s",
+ (printbuf_reset(&err),
+ prt_str(&err, "journal "),
+ bch2_csum_err_msg(&err, csum_type, j->csum, csum),
+ err.buf)))
saw_bad = true;
ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
j->encrypted_start,
vstruct_end(j) - (void *) j->encrypted_start);
bch2_fs_fatal_err_on(ret, c,
- "error decrypting journal entry: %i", ret);
+ "error decrypting journal entry: %s",
+ bch2_err_str(ret));
mutex_lock(&jlist->lock);
ret = journal_entry_add(c, ca, (struct journal_ptr) {
@@ -1025,7 +1049,7 @@ reread:
case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
break;
default:
- return ret;
+ goto err;
}
next_block:
pr_debug("next");
@@ -1034,7 +1058,11 @@ next_block:
j = ((void *) j) + (sectors << 9);
}
- return 0;
+out:
+ ret = 0;
+err:
+ printbuf_exit(&err);
+ return ret;
}
static CLOSURE_CALLBACK(bch2_journal_read_device)
@@ -1156,8 +1184,6 @@ int bch2_journal_read(struct bch_fs *c,
struct journal_list jlist;
struct journal_replay *i, **_i, *prev = NULL;
struct genradix_iter radix_iter;
- struct bch_dev *ca;
- unsigned iter;
struct printbuf buf = PRINTBUF;
bool degraded = false, last_write_torn = false;
u64 seq;
@@ -1168,7 +1194,7 @@ int bch2_journal_read(struct bch_fs *c,
jlist.last_seq = 0;
jlist.ret = 0;
- for_each_member_device(ca, c, iter) {
+ for_each_member_device(c, ca) {
if (!c->opts.fsck &&
!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
continue;
@@ -1334,7 +1360,7 @@ int bch2_journal_read(struct bch_fs *c,
continue;
for (ptr = 0; ptr < i->nr_ptrs; ptr++) {
- ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev);
+ struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev);
if (!i->ptrs[ptr].csum_good)
bch_err_dev_offset(ca, i->ptrs[ptr].sector,
@@ -1452,6 +1478,8 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w)
c->opts.foreground_target;
unsigned i, replicas = 0, replicas_want =
READ_ONCE(c->opts.metadata_replicas);
+ unsigned replicas_need = min_t(unsigned, replicas_want,
+ READ_ONCE(c->opts.metadata_replicas_required));
rcu_read_lock();
retry:
@@ -1500,11 +1528,13 @@ done:
BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);
- return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
+ return replicas >= replicas_need ? 0 : -EROFS;
}
static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+
/* we aren't holding j->lock: */
unsigned new_size = READ_ONCE(j->buf_size_want);
void *new_buf;
@@ -1512,6 +1542,11 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
if (buf->buf_size >= new_size)
return;
+ size_t btree_write_buffer_size = new_size / 64;
+
+ if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size))
+ return;
+
new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
if (!new_buf)
return;
@@ -1604,6 +1639,9 @@ static CLOSURE_CALLBACK(journal_write_done)
bch2_journal_reclaim_fast(j);
bch2_journal_space_available(j);
+ track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
+ &j->max_in_flight_start, false);
+
closure_wake_up(&w->wait);
journal_wake(j);
@@ -1656,7 +1694,6 @@ static CLOSURE_CALLBACK(do_journal_write)
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
struct journal_buf *w = journal_last_unwritten_buf(j);
- struct bch_extent_ptr *ptr;
struct bio *bio;
unsigned sectors = vstruct_sectors(w->data, c->block_bits);
@@ -1700,11 +1737,13 @@ static CLOSURE_CALLBACK(do_journal_write)
static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct jset_entry *start, *end, *i, *next, *prev = NULL;
+ struct jset_entry *start, *end;
struct jset *jset = w->data;
+ struct journal_keys_to_wb wb = { NULL };
unsigned sectors, bytes, u64s;
- bool validate_before_checksum = false;
unsigned long btree_roots_have = 0;
+ bool validate_before_checksum = false;
+ u64 seq = le64_to_cpu(jset->seq);
int ret;
/*
@@ -1715,7 +1754,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
* If we wanted to be really fancy here, we could sort all the keys in
* the jset and drop keys that were overwritten - probably not worth it:
*/
- vstruct_for_each_safe(jset, i, next) {
+ vstruct_for_each(jset, i) {
unsigned u64s = le16_to_cpu(i->u64s);
/* Empty entry: */
@@ -1732,40 +1771,40 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
* to c->btree_roots we have to get any missing btree roots and
* add them to this journal entry:
*/
- if (i->type == BCH_JSET_ENTRY_btree_root) {
+ switch (i->type) {
+ case BCH_JSET_ENTRY_btree_root:
bch2_journal_entry_to_btree_root(c, i);
__set_bit(i->btree_id, &btree_roots_have);
+ break;
+ case BCH_JSET_ENTRY_write_buffer_keys:
+ EBUG_ON(!w->need_flush_to_write_buffer);
+
+ if (!wb.wb)
+ bch2_journal_keys_to_write_buffer_start(c, &wb, seq);
+
+ struct bkey_i *k;
+ jset_entry_for_each_key(i, k) {
+ ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k);
+ if (ret) {
+ bch2_fs_fatal_error(c, "-ENOMEM flushing journal keys to btree write buffer");
+ bch2_journal_keys_to_write_buffer_end(c, &wb);
+ return ret;
+ }
+ }
+ i->type = BCH_JSET_ENTRY_btree_keys;
+ break;
}
-
- /* Can we merge with previous entry? */
- if (prev &&
- i->btree_id == prev->btree_id &&
- i->level == prev->level &&
- i->type == prev->type &&
- i->type == BCH_JSET_ENTRY_btree_keys &&
- le16_to_cpu(prev->u64s) + u64s <= U16_MAX) {
- memmove_u64s_down(vstruct_next(prev),
- i->_data,
- u64s);
- le16_add_cpu(&prev->u64s, u64s);
- continue;
- }
-
- /* Couldn't merge, move i into new position (after prev): */
- prev = prev ? vstruct_next(prev) : jset->start;
- if (i != prev)
- memmove_u64s_down(prev, i, jset_u64s(u64s));
}
- prev = prev ? vstruct_next(prev) : jset->start;
- jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
+ if (wb.wb)
+ bch2_journal_keys_to_write_buffer_end(c, &wb);
+ w->need_flush_to_write_buffer = false;
start = end = vstruct_last(jset);
end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
- bch2_journal_super_entries_add_common(c, &end,
- le64_to_cpu(jset->seq));
+ bch2_journal_super_entries_add_common(c, &end, seq);
u64s = (u64 *) end - (u64 *) start;
BUG_ON(u64s > j->entry_u64s_reserved);
@@ -1788,7 +1827,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset))
- j->last_empty_seq = le64_to_cpu(jset->seq);
+ j->last_empty_seq = seq;
if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
validate_before_checksum = true;
@@ -1847,7 +1886,7 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
(!w->must_flush &&
(jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) {
- w->noflush = true;
+ w->noflush = true;
SET_JSET_NO_FLUSH(w->data, true);
w->data->last_seq = 0;
w->last_seq = 0;
@@ -1866,12 +1905,11 @@ CLOSURE_CALLBACK(bch2_journal_write)
{
closure_type(j, struct journal, io);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_dev *ca;
struct journal_buf *w = journal_last_unwritten_buf(j);
struct bch_replicas_padded replicas;
struct bio *bio;
struct printbuf journal_debug_buf = PRINTBUF;
- unsigned i, nr_rw_members = 0;
+ unsigned nr_rw_members = 0;
int ret;
BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
@@ -1884,12 +1922,16 @@ CLOSURE_CALLBACK(bch2_journal_write)
if (ret)
goto err;
+ mutex_lock(&j->buf_lock);
journal_buf_realloc(j, w);
ret = bch2_journal_write_prep(j, w);
+ mutex_unlock(&j->buf_lock);
if (ret)
goto err;
+ j->entry_bytes_written += vstruct_bytes(w->data);
+
while (1) {
spin_lock(&j->lock);
ret = journal_write_alloc(j, w);
@@ -1927,7 +1969,7 @@ CLOSURE_CALLBACK(bch2_journal_write)
if (c->opts.nochanges)
goto no_io;
- for_each_rw_member(ca, c, i)
+ for_each_rw_member(c, ca)
nr_rw_members++;
if (nr_rw_members > 1)
@@ -1944,7 +1986,7 @@ CLOSURE_CALLBACK(bch2_journal_write)
goto err;
if (!JSET_NO_FLUSH(w->data) && w->separate_flush) {
- for_each_rw_member(ca, c, i) {
+ for_each_rw_member(c, ca) {
percpu_ref_get(&ca->io_ref);
bio = ca->journal.bio;
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index ec712104ad..c33dca6415 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -3,6 +3,7 @@
#include "bcachefs.h"
#include "btree_key_cache.h"
#include "btree_update.h"
+#include "btree_write_buffer.h"
#include "buckets.h"
#include "errcode.h"
#include "error.h"
@@ -50,17 +51,24 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j,
return available;
}
-static inline void journal_set_watermark(struct journal *j, bool low_on_space)
+void bch2_journal_set_watermark(struct journal *j)
{
- unsigned watermark = BCH_WATERMARK_stripe;
-
- if (low_on_space)
- watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
- if (fifo_free(&j->pin) < j->pin.size / 4)
- watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
-
- if (watermark == j->watermark)
- return;
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ bool low_on_space = j->space[journal_space_clean].total * 4 <=
+ j->space[journal_space_total].total;
+ bool low_on_pin = fifo_free(&j->pin) < j->pin.size / 4;
+ bool low_on_wb = bch2_btree_write_buffer_must_wait(c);
+ unsigned watermark = low_on_space || low_on_pin || low_on_wb
+ ? BCH_WATERMARK_reclaim
+ : BCH_WATERMARK_stripe;
+
+ if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space],
+ &j->low_on_space_start, low_on_space) ||
+ track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin],
+ &j->low_on_pin_start, low_on_pin) ||
+ track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full],
+ &j->write_buffer_full_start, low_on_wb))
+ trace_and_count(c, journal_full, c);
swap(watermark, j->watermark);
if (watermark > j->watermark)
@@ -128,15 +136,13 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne
enum journal_space_from from)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_dev *ca;
- unsigned i, pos, nr_devs = 0;
+ unsigned pos, nr_devs = 0;
struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX];
BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space));
rcu_read_lock();
- for_each_member_device_rcu(ca, c, i,
- &c->rw_devs[BCH_DATA_journal]) {
+ for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
if (!ca->journal.nr)
continue;
@@ -165,19 +171,17 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne
void bch2_journal_space_available(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_dev *ca;
unsigned clean, clean_ondisk, total;
unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
j->buf[1].buf_size >> 9);
- unsigned i, nr_online = 0, nr_devs_want;
+ unsigned nr_online = 0, nr_devs_want;
bool can_discard = false;
int ret = 0;
lockdep_assert_held(&j->lock);
rcu_read_lock();
- for_each_member_device_rcu(ca, c, i,
- &c->rw_devs[BCH_DATA_journal]) {
+ for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
struct journal_device *ja = &ca->journal;
if (!ja->nr)
@@ -201,14 +205,14 @@ void bch2_journal_space_available(struct journal *j)
j->can_discard = can_discard;
- if (nr_online < c->opts.metadata_replicas_required) {
+ if (nr_online < metadata_replicas_required(c)) {
ret = JOURNAL_ERR_insufficient_devices;
goto out;
}
nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas);
- for (i = 0; i < journal_space_nr; i++)
+ for (unsigned i = 0; i < journal_space_nr; i++)
j->space[i] = __journal_space_available(j, nr_devs_want, i);
clean_ondisk = j->space[journal_space_clean_ondisk].total;
@@ -226,7 +230,7 @@ void bch2_journal_space_available(struct journal *j)
else
clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
- journal_set_watermark(j, clean * 4 <= total);
+ bch2_journal_set_watermark(j);
out:
j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0;
j->cur_entry_error = ret;
@@ -255,12 +259,10 @@ static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
void bch2_journal_do_discards(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_dev *ca;
- unsigned iter;
mutex_lock(&j->discard_lock);
- for_each_rw_member(ca, c, iter) {
+ for_each_rw_member(c, ca) {
struct journal_device *ja = &ca->journal;
while (should_discard_bucket(j, ja)) {
@@ -299,6 +301,7 @@ void bch2_journal_reclaim_fast(struct journal *j)
* all btree nodes got written out
*/
while (!fifo_empty(&j->pin) &&
+ j->pin.front <= j->seq_ondisk &&
!atomic_read(&fifo_peek_front(&j->pin).count)) {
j->pin.front++;
popped = true;
@@ -367,15 +370,36 @@ static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn)
return JOURNAL_PIN_other;
}
-void bch2_journal_pin_set(struct journal *j, u64 seq,
+static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq,
struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
+ journal_pin_flush_fn flush_fn,
+ enum journal_pin_type type)
+{
+ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
+
+ /*
+ * flush_fn is how we identify journal pins in debugfs, so must always
+ * exist, even if it doesn't do anything:
+ */
+ BUG_ON(!flush_fn);
+
+ atomic_inc(&pin_list->count);
+ pin->seq = seq;
+ pin->flush = flush_fn;
+ list_add(&pin->list, &pin_list->list[type]);
+}
+
+void bch2_journal_pin_copy(struct journal *j,
+ struct journal_entry_pin *dst,
+ struct journal_entry_pin *src,
+ journal_pin_flush_fn flush_fn)
{
- struct journal_entry_pin_list *pin_list;
bool reclaim;
spin_lock(&j->lock);
+ u64 seq = READ_ONCE(src->seq);
+
if (seq < journal_last_seq(j)) {
/*
* bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on
@@ -387,18 +411,34 @@ void bch2_journal_pin_set(struct journal *j, u64 seq,
return;
}
- pin_list = journal_seq_pin(j, seq);
+ reclaim = __journal_pin_drop(j, dst);
- reclaim = __journal_pin_drop(j, pin);
+ bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn));
- atomic_inc(&pin_list->count);
- pin->seq = seq;
- pin->flush = flush_fn;
+ if (reclaim)
+ bch2_journal_reclaim_fast(j);
+ spin_unlock(&j->lock);
- if (flush_fn)
- list_add(&pin->list, &pin_list->list[journal_pin_type(flush_fn)]);
- else
- list_add(&pin->list, &pin_list->flushed);
+ /*
+ * If the journal is currently full, we might want to call flush_fn
+ * immediately:
+ */
+ journal_wake(j);
+}
+
+void bch2_journal_pin_set(struct journal *j, u64 seq,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
+{
+ bool reclaim;
+
+ spin_lock(&j->lock);
+
+ BUG_ON(seq < journal_last_seq(j));
+
+ reclaim = __journal_pin_drop(j, pin);
+
+ bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn));
if (reclaim)
bch2_journal_reclaim_fast(j);
@@ -537,13 +577,11 @@ static size_t journal_flush_pins(struct journal *j,
static u64 journal_seq_to_flush(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_dev *ca;
u64 seq_to_flush = 0;
- unsigned iter;
spin_lock(&j->lock);
- for_each_rw_member(ca, c, iter) {
+ for_each_rw_member(c, ca) {
struct journal_device *ja = &ca->journal;
unsigned nr_buckets, bucket_to_flush;
@@ -747,10 +785,9 @@ int bch2_journal_reclaim_start(struct journal *j)
p = kthread_create(bch2_journal_reclaim_thread, j,
"bch-reclaim/%s", c->name);
ret = PTR_ERR_OR_ZERO(p);
- if (ret) {
- bch_err_msg(c, ret, "creating journal reclaim thread");
+ bch_err_msg(c, ret, "creating journal reclaim thread");
+ if (ret)
return ret;
- }
get_task_struct(p);
j->reclaim_thread = p;
@@ -796,6 +833,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
{
+ /* time_stats this */
bool did_work = false;
if (!test_bit(JOURNAL_STARTED, &j->flags))
@@ -854,9 +892,11 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
journal_seq_pin(j, seq)->devs);
seq++;
- spin_unlock(&j->lock);
- ret = bch2_mark_replicas(c, &replicas.e);
- spin_lock(&j->lock);
+ if (replicas.e.nr_devs) {
+ spin_unlock(&j->lock);
+ ret = bch2_mark_replicas(c, &replicas.e);
+ spin_lock(&j->lock);
+ }
}
spin_unlock(&j->lock);
err:
diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h
index 494d1a6edd..ec84c33452 100644
--- a/fs/bcachefs/journal_reclaim.h
+++ b/fs/bcachefs/journal_reclaim.h
@@ -16,6 +16,7 @@ static inline void journal_reclaim_kick(struct journal *j)
unsigned bch2_journal_dev_buckets_available(struct journal *,
struct journal_device *,
enum journal_space_from);
+void bch2_journal_set_watermark(struct journal *);
void bch2_journal_space_available(struct journal *);
static inline bool journal_pin_active(struct journal_entry_pin *pin)
@@ -47,17 +48,10 @@ static inline void bch2_journal_pin_add(struct journal *j, u64 seq,
bch2_journal_pin_set(j, seq, pin, flush_fn);
}
-static inline void bch2_journal_pin_copy(struct journal *j,
- struct journal_entry_pin *dst,
- struct journal_entry_pin *src,
- journal_pin_flush_fn flush_fn)
-{
- /* Guard against racing with journal_pin_drop(src): */
- u64 seq = READ_ONCE(src->seq);
-
- if (seq)
- bch2_journal_pin_add(j, seq, dst, flush_fn);
-}
+void bch2_journal_pin_copy(struct journal *,
+ struct journal_entry_pin *,
+ struct journal_entry_pin *,
+ journal_pin_flush_fn);
static inline void bch2_journal_pin_update(struct journal *j, u64 seq,
struct journal_entry_pin *pin,
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
index f9d9aa95bf..0200e299cf 100644
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -267,7 +267,7 @@ retry:
while (!(ret = PTR_ERR_OR_ZERO(b)) &&
b &&
- !test_bit(BCH_FS_STOPPING, &c->flags))
+ !test_bit(BCH_FS_stopping, &c->flags))
b = bch2_btree_iter_next_node(&iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index a756b69582..38817c7a08 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -36,6 +36,7 @@ struct journal_buf {
bool noflush; /* write has already been kicked off, and was noflush */
bool must_flush; /* something wants a flush */
bool separate_flush;
+ bool need_flush_to_write_buffer;
};
/*
@@ -182,6 +183,12 @@ struct journal {
darray_u64 early_journal_entries;
/*
+ * Protects journal_buf->data, when accessing without a journal
+ * reservation: for synchronization between the btree write buffer code
+ * and the journal write path:
+ */
+ struct mutex buf_lock;
+ /*
* Two journal entries -- one is currently open for new entries, the
* other is possibly being written out.
*/
@@ -195,7 +202,6 @@ struct journal {
/* Used when waiting because the journal was full */
wait_queue_head_t wait;
struct closure_waitlist async_wait;
- struct closure_waitlist preres_wait;
struct closure io;
struct delayed_work write_work;
@@ -262,15 +268,19 @@ struct journal {
unsigned long last_flush_write;
- u64 res_get_blocked_start;
u64 write_start_time;
u64 nr_flush_writes;
u64 nr_noflush_writes;
+ u64 entry_bytes_written;
+
+ u64 low_on_space_start;
+ u64 low_on_pin_start;
+ u64 max_in_flight_start;
+ u64 write_buffer_full_start;
struct bch2_time_stats *flush_write_time;
struct bch2_time_stats *noflush_write_time;
- struct bch2_time_stats *blocked_time;
struct bch2_time_stats *flush_seq_time;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c
index 5699cd4873..1b828bddd1 100644
--- a/fs/bcachefs/keylist.c
+++ b/fs/bcachefs/keylist.c
@@ -43,8 +43,6 @@ void bch2_keylist_pop_front(struct keylist *l)
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_verify_keylist_sorted(struct keylist *l)
{
- struct bkey_i *k;
-
for_each_keylist_key(l, k)
BUG_ON(bkey_next(k) != l->top &&
bpos_ge(k->k.p, bkey_next(k)->k.p));
diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h
index fe759c7031..e687e0e9ae 100644
--- a/fs/bcachefs/keylist.h
+++ b/fs/bcachefs/keylist.h
@@ -50,18 +50,16 @@ static inline struct bkey_i *bch2_keylist_front(struct keylist *l)
}
#define for_each_keylist_key(_keylist, _k) \
- for (_k = (_keylist)->keys; \
+ for (struct bkey_i *_k = (_keylist)->keys; \
_k != (_keylist)->top; \
_k = bkey_next(_k))
static inline u64 keylist_sectors(struct keylist *keys)
{
- struct bkey_i *k;
u64 ret = 0;
for_each_keylist_key(keys, k)
ret += k->k.size;
-
return ret;
}
diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c
index 8640f7dee0..ad598105c5 100644
--- a/fs/bcachefs/logged_ops.c
+++ b/fs/bcachefs/logged_ops.c
@@ -54,16 +54,12 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
int bch2_resume_logged_ops(struct bch_fs *c)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- ret = bch2_trans_run(c,
- for_each_btree_key2(trans, iter,
- BTREE_ID_logged_ops, POS_MIN, BTREE_ITER_PREFETCH, k,
+ int ret = bch2_trans_run(c,
+ for_each_btree_key(trans, iter,
+ BTREE_ID_logged_ops, POS_MIN,
+ BTREE_ITER_PREFETCH, k,
resume_logged_op(trans, &iter, k)));
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
@@ -85,13 +81,13 @@ static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
int bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
{
- return commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ return commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
__bch2_logged_op_start(trans, k));
}
void bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k)
{
- int ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_btree_delete(trans, BTREE_ID_logged_ops, k->k.p, 0));
/*
* This needs to be a fatal error because we've left an unfinished
diff --git a/fs/bcachefs/logged_ops_format.h b/fs/bcachefs/logged_ops_format.h
new file mode 100644
index 0000000000..6a4bf7129d
--- /dev/null
+++ b/fs/bcachefs/logged_ops_format.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_LOGGED_OPS_FORMAT_H
+#define _BCACHEFS_LOGGED_OPS_FORMAT_H
+
+struct bch_logged_op_truncate {
+ struct bch_val v;
+ __le32 subvol;
+ __le32 pad;
+ __le64 inum;
+ __le64 new_i_size;
+};
+
+enum logged_op_finsert_state {
+ LOGGED_OP_FINSERT_start,
+ LOGGED_OP_FINSERT_shift_extents,
+ LOGGED_OP_FINSERT_finish,
+};
+
+struct bch_logged_op_finsert {
+ struct bch_val v;
+ __u8 state;
+ __u8 pad[3];
+ __le32 subvol;
+ __le64 inum;
+ __le64 dst_offset;
+ __le64 src_offset;
+ __le64 pos;
+};
+
+#endif /* _BCACHEFS_LOGGED_OPS_FORMAT_H */
diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c
index a5cc0ed195..7a4ca5a28b 100644
--- a/fs/bcachefs/lru.c
+++ b/fs/bcachefs/lru.c
@@ -147,18 +147,13 @@ fsck_err:
int bch2_check_lrus(struct bch_fs *c)
{
- struct btree_iter iter;
- struct bkey_s_c k;
struct bpos last_flushed_pos = POS_MIN;
- int ret = 0;
-
- ret = bch2_trans_run(c,
+ int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
bch2_check_lru_key(trans, &iter, k, &last_flushed_pos)));
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
diff --git a/fs/bcachefs/mean_and_variance.c b/fs/bcachefs/mean_and_variance.c
index 1f0801e2e5..bf0ef668fd 100644
--- a/fs/bcachefs/mean_and_variance.c
+++ b/fs/bcachefs/mean_and_variance.c
@@ -62,6 +62,7 @@ EXPORT_SYMBOL_GPL(u128_div);
/**
* mean_and_variance_get_mean() - get mean from @s
+ * @s: mean and variance number of samples and their sums
*/
s64 mean_and_variance_get_mean(struct mean_and_variance s)
{
@@ -71,6 +72,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_mean);
/**
* mean_and_variance_get_variance() - get variance from @s1
+ * @s1: mean and variance number of samples and sums
*
* see linked pdf equation 12.
*/
@@ -89,6 +91,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_variance);
/**
* mean_and_variance_get_stddev() - get standard deviation from @s
+ * @s: mean and variance number of samples and their sums
*/
u32 mean_and_variance_get_stddev(struct mean_and_variance s)
{
@@ -98,8 +101,8 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev);
/**
* mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update()
- * @s1: ..
- * @s2: ..
+ * @s: mean and variance number of samples and their sums
+ * @x: new value to include in the &mean_and_variance_weighted
*
* see linked pdf: function derived from equations 140-143 where alpha = 2^w.
* values are stored bitshifted for performance and added precision.
@@ -129,6 +132,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update);
/**
* mean_and_variance_weighted_get_mean() - get mean from @s
+ * @s: mean and variance number of samples and their sums
*/
s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s)
{
@@ -138,6 +142,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean);
/**
* mean_and_variance_weighted_get_variance() -- get variance from @s
+ * @s: mean and variance number of samples and their sums
*/
u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s)
{
@@ -148,6 +153,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance);
/**
* mean_and_variance_weighted_get_stddev() - get standard deviation from @s
+ * @s: mean and variance number of samples and their sums
*/
u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s)
{
diff --git a/fs/bcachefs/mean_and_variance.h b/fs/bcachefs/mean_and_variance.h
index 056e797383..64df11ab42 100644
--- a/fs/bcachefs/mean_and_variance.h
+++ b/fs/bcachefs/mean_and_variance.h
@@ -12,6 +12,9 @@
/*
* u128_u: u128 user mode, because not all architectures support a real int128
* type
+ *
+ * We don't use this version in userspace, because in userspace we link with
+ * Rust and rustc has issues with u128.
*/
#if defined(__SIZEOF_INT128__) && defined(__KERNEL__) && !defined(CONFIG_PARISC)
diff --git a/fs/bcachefs/mean_and_variance_test.c b/fs/bcachefs/mean_and_variance_test.c
index 019583c3ca..51093fa848 100644
--- a/fs/bcachefs/mean_and_variance_test.c
+++ b/fs/bcachefs/mean_and_variance_test.c
@@ -130,20 +130,8 @@ static void mean_and_variance_test_1(struct kunit *test)
d, mean, stddev, weighted_mean, weighted_stddev);
}
-static void mean_and_variance_test_2(struct kunit *test)
-{
- s64 d[] = { 100, 10, 10, 10, 10, 10, 10 };
- s64 mean[] = { 10, 10, 10, 10, 10, 10, 10 };
- s64 stddev[] = { 9, 9, 9, 9, 9, 9, 9 };
- s64 weighted_mean[] = { 32, 27, 22, 19, 17, 15, 14 };
- s64 weighted_stddev[] = { 38, 35, 31, 27, 24, 21, 18 };
-
- do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2,
- d, mean, stddev, weighted_mean, weighted_stddev);
-}
-
/* Test behaviour where we switch from one steady state to another: */
-static void mean_and_variance_test_3(struct kunit *test)
+static void mean_and_variance_test_2(struct kunit *test)
{
s64 d[] = { 100, 100, 100, 100, 100 };
s64 mean[] = { 22, 32, 40, 46, 50 };
@@ -155,18 +143,6 @@ static void mean_and_variance_test_3(struct kunit *test)
d, mean, stddev, weighted_mean, weighted_stddev);
}
-static void mean_and_variance_test_4(struct kunit *test)
-{
- s64 d[] = { 100, 100, 100, 100, 100 };
- s64 mean[] = { 10, 11, 12, 13, 14 };
- s64 stddev[] = { 9, 13, 15, 17, 19 };
- s64 weighted_mean[] = { 32, 49, 61, 71, 78 };
- s64 weighted_stddev[] = { 38, 44, 44, 41, 38 };
-
- do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2,
- d, mean, stddev, weighted_mean, weighted_stddev);
-}
-
static void mean_and_variance_fast_divpow2(struct kunit *test)
{
s64 i;
@@ -224,8 +200,6 @@ static struct kunit_case mean_and_variance_test_cases[] = {
KUNIT_CASE(mean_and_variance_weighted_advanced_test),
KUNIT_CASE(mean_and_variance_test_1),
KUNIT_CASE(mean_and_variance_test_2),
- KUNIT_CASE(mean_and_variance_test_3),
- KUNIT_CASE(mean_and_variance_test_4),
{}
};
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index e3a51f6d6c..5623cee3ef 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -79,8 +79,6 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
{
struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
enum btree_id id;
int ret = 0;
@@ -90,7 +88,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
ret = for_each_btree_key_commit(trans, iter, id, POS_MIN,
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
- NULL, NULL, BTREE_INSERT_NOFAIL,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags));
if (ret)
break;
@@ -145,10 +143,9 @@ retry:
continue;
}
- if (ret) {
- bch_err_msg(c, ret, "updating btree node key");
+ bch_err_msg(c, ret, "updating btree node key");
+ if (ret)
break;
- }
next:
bch2_btree_iter_next_node(&iter);
}
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index f3dac4511a..bf68ea4944 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -6,9 +6,11 @@
#include "backpointers.h"
#include "bkey_buf.h"
#include "btree_gc.h"
+#include "btree_io.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
+#include "compress.h"
#include "disk_groups.h"
#include "ec.h"
#include "errcode.h"
@@ -27,12 +29,53 @@
#include <linux/ioprio.h>
#include <linux/kthread.h>
-static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k)
+const char * const bch2_data_ops_strs[] = {
+#define x(t, n, ...) [n] = #t,
+ BCH_DATA_OPS()
+#undef x
+ NULL
+};
+
+static void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ printbuf_tabstop_push(out, 20);
+ prt_str(out, "rewrite ptrs:");
+ prt_tab(out);
+ bch2_prt_u64_base2(out, data_opts->rewrite_ptrs);
+ prt_newline(out);
+
+ prt_str(out, "kill ptrs: ");
+ prt_tab(out);
+ bch2_prt_u64_base2(out, data_opts->kill_ptrs);
+ prt_newline(out);
+
+ prt_str(out, "target: ");
+ prt_tab(out);
+ bch2_target_to_text(out, c, data_opts->target);
+ prt_newline(out);
+
+ prt_str(out, "compression: ");
+ prt_tab(out);
+ bch2_compression_opt_to_text(out, background_compression(*io_opts));
+ prt_newline(out);
+
+ prt_str(out, "extra replicas: ");
+ prt_tab(out);
+ prt_u64(out, data_opts->extra_replicas);
+}
+
+static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
if (trace_move_extent_enabled()) {
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, k);
+ prt_newline(&buf);
+ bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
trace_move_extent(c, buf.buf);
printbuf_exit(&buf);
}
@@ -63,7 +106,7 @@ struct moving_io {
struct data_update write;
/* Must be last since it is variable size */
- struct bio_vec bi_inline_vecs[0];
+ struct bio_vec bi_inline_vecs[];
};
static void move_free(struct moving_io *io)
@@ -104,6 +147,15 @@ static void move_write(struct moving_io *io)
return;
}
+ if (trace_move_extent_write_enabled()) {
+ struct bch_fs *c = io->write.op.c;
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k));
+ trace_move_extent_write(c, buf.buf);
+ printbuf_exit(&buf);
+ }
+
closure_get(&io->write.ctxt->cl);
atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
atomic_inc(&io->write.ctxt->write_ios);
@@ -211,7 +263,7 @@ void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c)
trace_move_data(c, stats);
}
-void bch2_move_stats_init(struct bch_move_stats *stats, char *name)
+void bch2_move_stats_init(struct bch_move_stats *stats, const char *name)
{
memset(stats, 0, sizeof(*stats));
stats->data_type = BCH_DATA_user;
@@ -234,9 +286,10 @@ int bch2_move_extent(struct moving_context *ctxt,
unsigned sectors = k.k->size, pages;
int ret = -ENOMEM;
+ trace_move_extent2(c, k, &io_opts, &data_opts);
+
if (ctxt->stats)
ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);
- trace_move_extent2(c, k);
bch2_data_update_opts_normalize(k, &data_opts);
@@ -342,7 +395,8 @@ err:
bch2_err_matches(ret, BCH_ERR_transaction_restart))
return ret;
- this_cpu_inc(c->counters[BCH_COUNTER_move_extent_start_fail]);
+ count_event(c, move_extent_start_fail);
+
if (trace_move_extent_start_fail_enabled()) {
struct printbuf buf = PRINTBUF;
@@ -364,13 +418,10 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
int ret = 0;
if (io_opts->cur_inum != extent_k.k->p.inode) {
- struct btree_iter iter;
- struct bkey_s_c k;
-
io_opts->d.nr = 0;
- for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode),
- BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode),
+ BTREE_ITER_ALL_SNAPSHOTS, k, ({
if (k.k->p.offset != extent_k.k->p.inode)
break;
@@ -383,11 +434,8 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
bch2_inode_opts_get(&e.io_opts, trans->c, &inode);
- ret = darray_push(&io_opts->d, e);
- if (ret)
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
+ darray_push(&io_opts->d, e);
+ }));
io_opts->cur_inum = extent_k.k->p.inode;
}
@@ -395,12 +443,10 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
if (ret)
return ERR_PTR(ret);
- if (extent_k.k->p.snapshot) {
- struct snapshot_io_opts_entry *i;
+ if (extent_k.k->p.snapshot)
darray_for_each(io_opts->d, i)
if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot))
return &i->io_opts;
- }
return &io_opts->fs_io_opts;
}
@@ -628,7 +674,7 @@ int bch2_move_data(struct bch_fs *c,
return ret;
}
-int __bch2_evacuate_bucket(struct moving_context *ctxt,
+int bch2_evacuate_bucket(struct moving_context *ctxt,
struct move_bucket_in_flight *bucket_in_flight,
struct bpos bucket, int gen,
struct data_update_opts _data_opts)
@@ -664,21 +710,19 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
bch2_trans_iter_exit(trans, &iter);
- if (ret) {
- bch_err_msg(c, ret, "looking up alloc key");
+ bch_err_msg(c, ret, "looking up alloc key");
+ if (ret)
goto err;
- }
a = bch2_alloc_to_v4(k, &a_convert);
- dirty_sectors = a->dirty_sectors;
+ dirty_sectors = bch2_bucket_sectors_dirty(*a);
bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size;
fragmentation = a->fragmentation_lru;
- ret = bch2_btree_write_buffer_flush(trans);
- if (ret) {
- bch_err_msg(c, ret, "flushing btree write buffer");
+ ret = bch2_btree_write_buffer_tryflush(trans);
+ bch_err_msg(c, ret, "flushing btree write buffer");
+ if (ret)
goto err;
- }
while (!(ret = bch2_move_ratelimit(ctxt))) {
if (is_kthread && kthread_should_stop())
@@ -697,9 +741,6 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
break;
if (!bp.level) {
- const struct bch_extent_ptr *ptr;
- unsigned i = 0;
-
k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0);
ret = bkey_err(k);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -722,6 +763,7 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
data_opts.target = io_opts.background_target;
data_opts.rewrite_ptrs = 0;
+ unsigned i = 0;
bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
if (ptr->dev == bucket.inode) {
data_opts.rewrite_ptrs |= 1U << i;
@@ -763,6 +805,8 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
if (!b)
goto next;
+ unsigned sectors = btree_ptr_sectors_written(&b->key);
+
ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
bch2_trans_iter_exit(trans, &iter);
@@ -772,11 +816,10 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
goto err;
if (ctxt->rate)
- bch2_ratelimit_increment(ctxt->rate,
- c->opts.btree_node_size >> 9);
+ bch2_ratelimit_increment(ctxt->rate, sectors);
if (ctxt->stats) {
- atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen);
- atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved);
+ atomic64_add(sectors, &ctxt->stats->sectors_seen);
+ atomic64_add(sectors, &ctxt->stats->sectors_moved);
}
}
next:
@@ -789,31 +832,13 @@ err:
return ret;
}
-int bch2_evacuate_bucket(struct bch_fs *c,
- struct bpos bucket, int gen,
- struct data_update_opts data_opts,
- struct bch_ratelimit *rate,
- struct bch_move_stats *stats,
- struct write_point_specifier wp,
- bool wait_on_copygc)
-{
- struct moving_context ctxt;
- int ret;
-
- bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
- ret = __bch2_evacuate_bucket(&ctxt, NULL, bucket, gen, data_opts);
- bch2_moving_ctxt_exit(&ctxt);
-
- return ret;
-}
-
typedef bool (*move_btree_pred)(struct bch_fs *, void *,
struct btree *, struct bch_io_opts *,
struct data_update_opts *);
static int bch2_move_btree(struct bch_fs *c,
- enum btree_id start_btree_id, struct bpos start_pos,
- enum btree_id end_btree_id, struct bpos end_pos,
+ struct bbpos start,
+ struct bbpos end,
move_btree_pred pred, void *arg,
struct bch_move_stats *stats)
{
@@ -823,7 +848,7 @@ static int bch2_move_btree(struct bch_fs *c,
struct btree_trans *trans;
struct btree_iter iter;
struct btree *b;
- enum btree_id id;
+ enum btree_id btree;
struct data_update_opts data_opts;
int ret = 0;
@@ -834,15 +859,15 @@ static int bch2_move_btree(struct bch_fs *c,
stats->data_type = BCH_DATA_btree;
- for (id = start_btree_id;
- id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1);
- id++) {
- stats->pos = BBPOS(id, POS_MIN);
+ for (btree = start.btree;
+ btree <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
+ btree ++) {
+ stats->pos = BBPOS(btree, POS_MIN);
- if (!bch2_btree_id_root(c, id)->b)
+ if (!bch2_btree_id_root(c, btree)->b)
continue;
- bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
+ bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0,
BTREE_ITER_PREFETCH);
retry:
ret = 0;
@@ -852,8 +877,8 @@ retry:
if (kthread && kthread_should_stop())
break;
- if ((cmp_int(id, end_btree_id) ?:
- bpos_cmp(b->key.k.p, end_pos)) > 0)
+ if ((cmp_int(btree, end.btree) ?:
+ bpos_cmp(b->key.k.p, end.pos)) > 0)
break;
stats->pos = BBPOS(iter.btree_id, iter.pos);
@@ -910,7 +935,6 @@ static bool migrate_pred(struct bch_fs *c, void *arg,
struct data_update_opts *data_opts)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
struct bch_ioctl_data *op = arg;
unsigned i = 0;
@@ -990,8 +1014,8 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
int ret;
ret = bch2_move_btree(c,
- 0, POS_MIN,
- BTREE_ID_NR, SPOS_MAX,
+ BBPOS_MIN,
+ BBPOS_MAX,
rewrite_old_nodes_pred, c, stats);
if (!ret) {
mutex_lock(&c->sb_lock);
@@ -1006,79 +1030,109 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
return ret;
}
+static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, /* predicate: flags surplus pointers of k in data_opts->kill_ptrs; arg is not read in this body */
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ unsigned durability = bch2_bkey_durability(c, k); /* running total; reduced each time a pointer is flagged */
+ unsigned replicas = bkey_is_btree_ptr(k.k) /* target: metadata_replicas for btree pointers, otherwise io_opts->data_replicas */
+ ? c->opts.metadata_replicas
+ : io_opts->data_replicas;
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ unsigned i = 0; /* position of the current pointer; selects its bit in kill_ptrs */
+
+ bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
+ unsigned d = bch2_extent_ptr_durability(c, &p);
+
+ if (d && durability - d >= replicas) { /* flag only pointers that contribute durability and whose removal still leaves >= replicas */
+ data_opts->kill_ptrs |= BIT(i);
+ durability -= d;
+ }
+
+ i++;
+ }
+
+ return data_opts->kill_ptrs != 0; /* true when any kill bit is set; bits are only OR-ed in here, never cleared */
+}
+
+static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg, /* same predicate, but takes a struct btree instead of a key */
+ struct btree *b,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); /* delegates, passing the node's own key */
+}
+
int bch2_data_job(struct bch_fs *c, /* dispatches on op.op; returns the helpers' result, or -EINVAL for an unknown op or out-of-range migrate device */
struct bch_move_stats *stats,
struct bch_ioctl_data op)
{
+ struct bbpos start = BBPOS(op.start_btree, op.start_pos);
+ struct bbpos end = BBPOS(op.end_btree, op.end_pos);
int ret = 0;
+ if (op.op >= BCH_DATA_OP_NR) /* checked before op.op is used to index bch2_data_ops_strs[] */
+ return -EINVAL;
+
+ bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]);
+
switch (op.op) {
- case BCH_DATA_OP_REREPLICATE:
- bch2_move_stats_init(stats, "rereplicate");
+ case BCH_DATA_OP_rereplicate:
stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, -1);
-
- ret = bch2_move_btree(c,
- op.start_btree, op.start_pos,
- op.end_btree, op.end_pos,
+ ret = bch2_move_btree(c, start, end,
rereplicate_btree_pred, c, stats) ?: ret; /* x ?: ret — a nonzero result from this step replaces ret, otherwise the earlier ret is kept */
- ret = bch2_replicas_gc2(c) ?: ret;
-
- ret = bch2_move_data(c,
- (struct bbpos) { op.start_btree, op.start_pos },
- (struct bbpos) { op.end_btree, op.end_pos },
+ ret = bch2_move_data(c, start, end,
NULL,
stats,
writepoint_hashed((unsigned long) current),
true,
rereplicate_pred, c) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
-
- bch2_move_stats_exit(stats, c);
break;
- case BCH_DATA_OP_MIGRATE:
+ case BCH_DATA_OP_migrate:
if (op.migrate.dev >= c->sb.nr_devices)
return -EINVAL; /* NOTE(review): this now returns after bch2_move_stats_init() without reaching bch2_move_stats_exit() at the end — confirm that is harmless */
- bch2_move_stats_init(stats, "migrate");
stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
-
- ret = bch2_move_btree(c,
- op.start_btree, op.start_pos,
- op.end_btree, op.end_pos,
+ ret = bch2_move_btree(c, start, end,
migrate_btree_pred, &op, stats) ?: ret;
- ret = bch2_replicas_gc2(c) ?: ret;
-
- ret = bch2_move_data(c,
- (struct bbpos) { op.start_btree, op.start_pos },
- (struct bbpos) { op.end_btree, op.end_pos },
+ ret = bch2_move_data(c, start, end,
NULL,
stats,
writepoint_hashed((unsigned long) current),
true,
migrate_pred, &op) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
-
- bch2_move_stats_exit(stats, c);
break;
- case BCH_DATA_OP_REWRITE_OLD_NODES:
- bch2_move_stats_init(stats, "rewrite_old_nodes");
+ case BCH_DATA_OP_rewrite_old_nodes:
ret = bch2_scan_old_btree_nodes(c, stats);
- bch2_move_stats_exit(stats, c);
+ break;
+ case BCH_DATA_OP_drop_extra_replicas:
+ ret = bch2_move_btree(c, start, end,
+ drop_extra_replicas_btree_pred, c, stats) ?: ret;
+ ret = bch2_move_data(c, start, end, NULL, stats,
+ writepoint_hashed((unsigned long) current),
+ true,
+ drop_extra_replicas_pred, c) ?: ret;
+ ret = bch2_replicas_gc2(c) ?: ret;
break;
default:
ret = -EINVAL;
}
+ bch2_move_stats_exit(stats, c); /* single exit call shared by every path that leaves the switch */
return ret;
}
void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) /* writes "<name>: data type=<type> pos=<bbpos>" and a newline to out */
{
- prt_printf(out, "%s: data type=%s pos=",
- stats->name,
- bch2_data_types[stats->data_type]);
+ prt_printf(out, "%s: data type=", stats->name); /* single '=', matching the replaced format; the type is appended by the next call */
+ bch2_prt_data_type(out, stats->data_type);
+ prt_str(out, " pos=");
bch2_bbpos_to_text(out, stats->pos);
prt_newline(out);
printbuf_indent_add(out, 2);
diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h
index c5a7aed2e1..9baf3093a6 100644
--- a/fs/bcachefs/move.h
+++ b/fs/bcachefs/move.h
@@ -75,6 +75,8 @@ do { \
typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c,
struct bch_io_opts *, struct data_update_opts *);
+extern const char * const bch2_data_ops_strs[];
+
void bch2_moving_ctxt_exit(struct moving_context *);
void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *,
struct bch_ratelimit *, struct bch_move_stats *,
@@ -134,23 +136,17 @@ int bch2_move_data(struct bch_fs *,
bool,
move_pred_fn, void *);
-int __bch2_evacuate_bucket(struct moving_context *,
+int bch2_evacuate_bucket(struct moving_context *,
struct move_bucket_in_flight *,
struct bpos, int,
struct data_update_opts);
-int bch2_evacuate_bucket(struct bch_fs *, struct bpos, int,
- struct data_update_opts,
- struct bch_ratelimit *,
- struct bch_move_stats *,
- struct write_point_specifier,
- bool);
int bch2_data_job(struct bch_fs *,
struct bch_move_stats *,
struct bch_ioctl_data);
void bch2_move_stats_to_text(struct printbuf *, struct bch_move_stats *);
void bch2_move_stats_exit(struct bch_move_stats *, struct bch_fs *);
-void bch2_move_stats_init(struct bch_move_stats *, char *);
+void bch2_move_stats_init(struct bch_move_stats *, const char *);
void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *);
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index a84e79f79e..69e06a84da 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -91,7 +91,7 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
a = bch2_alloc_to_v4(k, &_a);
b->k.gen = a->gen;
- b->sectors = a->dirty_sectors;
+ b->sectors = bch2_bucket_sectors_dirty(*a);
ret = data_type_movable(a->data_type) &&
a->fragmentation_lru &&
@@ -145,20 +145,21 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt,
{
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4);
size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0;
int ret;
move_buckets_wait(ctxt, buckets_in_flight, false);
- ret = bch2_btree_write_buffer_flush(trans);
- if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()",
+ ret = bch2_btree_write_buffer_tryflush(trans);
+ if (bch2_err_matches(ret, EROFS))
+ return ret;
+
+ if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_tryflush()",
__func__, bch2_err_str(ret)))
return ret;
- ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru,
+ ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru,
lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0),
lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX),
0, k, ({
@@ -167,15 +168,23 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt,
saw++;
- if (!bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p)))
+ ret2 = bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p));
+ if (ret2 < 0)
+ goto err;
+
+ if (!ret2)
not_movable++;
else if (bucket_in_flight(buckets_in_flight, b.k))
in_flight++;
else {
- ret2 = darray_push(buckets, b) ?: buckets->nr >= nr_to_get;
- if (ret2 >= 0)
- sectors += b.sectors;
+ ret2 = darray_push(buckets, b);
+ if (ret2)
+ goto err;
+ sectors += b.sectors;
}
+
+ ret2 = buckets->nr >= nr_to_get;
+err:
ret2;
}));
@@ -198,7 +207,6 @@ static int bch2_copygc(struct moving_context *ctxt,
};
move_buckets buckets = { 0 };
struct move_bucket_in_flight *f;
- struct move_bucket *i;
u64 moved = atomic64_read(&ctxt->stats->sectors_moved);
int ret = 0;
@@ -221,7 +229,7 @@ static int bch2_copygc(struct moving_context *ctxt,
break;
}
- ret = __bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket,
+ ret = bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket,
f->bucket.k.gen, data_opts);
if (ret)
goto err;
@@ -259,19 +267,16 @@ err:
*/
unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned dev_idx;
s64 wait = S64_MAX, fragmented_allowed, fragmented;
- unsigned i;
- for_each_rw_member(ca, c, dev_idx) {
+ for_each_rw_member(c, ca) {
struct bch_dev_usage usage = bch2_dev_usage_read(ca);
fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) *
ca->mi.bucket_size) >> 1);
fragmented = 0;
- for (i = 0; i < BCH_DATA_NR; i++)
+ for (unsigned i = 0; i < BCH_DATA_NR; i++)
if (data_type_movable(i))
fragmented += usage.d[i].fragmented;
@@ -313,9 +318,9 @@ static int bch2_copygc_thread(void *arg)
if (!buckets)
return -ENOMEM;
ret = rhashtable_init(&buckets->table, &bch_move_bucket_params);
+ bch_err_msg(c, ret, "allocating copygc buckets in flight");
if (ret) {
kfree(buckets);
- bch_err_msg(c, ret, "allocating copygc buckets in flight");
return ret;
}
@@ -334,7 +339,8 @@ static int bch2_copygc_thread(void *arg)
if (!c->copy_gc_enabled) {
move_buckets_wait(&ctxt, buckets, true);
- kthread_wait_freezable(c->copy_gc_enabled);
+ kthread_wait_freezable(c->copy_gc_enabled ||
+ kthread_should_stop());
}
if (unlikely(freezing(current))) {
@@ -411,10 +417,9 @@ int bch2_copygc_start(struct bch_fs *c)
t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name);
ret = PTR_ERR_OR_ZERO(t);
- if (ret) {
- bch_err_msg(c, ret, "creating copygc thread");
+ bch_err_msg(c, ret, "creating copygc thread");
+ if (ret)
return ret;
- }
get_task_struct(t);
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index 8dd4046cca..b1ed0b9a20 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -52,7 +52,7 @@ const char * const bch2_csum_opts[] = {
NULL
};
-const char * const bch2_compression_types[] = {
+const char * const __bch2_compression_types[] = {
BCH_COMPRESSION_TYPES()
NULL
};
@@ -72,7 +72,7 @@ const char * const bch2_str_hash_opts[] = {
NULL
};
-const char * const bch2_data_types[] = {
+const char * const __bch2_data_types[] = {
BCH_DATA_TYPES()
NULL
};
@@ -279,14 +279,14 @@ int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err)
if (err)
prt_printf(err, "%s: not a multiple of 512",
opt->attr.name);
- return -EINVAL;
+ return -BCH_ERR_opt_parse_error;
}
if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) {
if (err)
prt_printf(err, "%s: must be a power of two",
opt->attr.name);
- return -EINVAL;
+ return -BCH_ERR_opt_parse_error;
}
if (opt->fn.validate)
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 8526f17745..9a4b7faa37 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -18,11 +18,11 @@ extern const char * const bch2_sb_compat[];
extern const char * const __bch2_btree_ids[];
extern const char * const bch2_csum_types[];
extern const char * const bch2_csum_opts[];
-extern const char * const bch2_compression_types[];
+extern const char * const __bch2_compression_types[];
extern const char * const bch2_compression_opts[];
extern const char * const bch2_str_hash_types[];
extern const char * const bch2_str_hash_opts[];
-extern const char * const bch2_data_types[];
+extern const char * const __bch2_data_types[];
extern const char * const bch2_member_states[];
extern const char * const bch2_jset_entry_types[];
extern const char * const bch2_fs_usage_types[];
@@ -233,11 +233,6 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, true, \
NULL, "Stash pointer to in memory btree node in btree ptr")\
- x(btree_write_buffer_size, u32, \
- OPT_FS|OPT_MOUNT, \
- OPT_UINT(16, (1U << 20) - 1), \
- BCH2_NO_SB_OPT, 1U << 13, \
- NULL, "Number of btree write buffer entries") \
x(gc_reserve_percent, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_UINT(5, 21), \
@@ -394,7 +389,7 @@ enum fsck_err_opts {
BCH2_NO_SB_OPT, BCH_SB_SECTOR, \
"offset", "Sector offset of superblock") \
x(read_only, u8, \
- OPT_FS, \
+ OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, NULL) \
@@ -419,6 +414,11 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, "Allocate the buckets_nouse bitmap") \
+ x(stdio, u64, \
+ 0, \
+ OPT_UINT(0, S64_MAX), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Pointer to a struct stdio_redirect") \
x(project, u8, \
OPT_INODE, \
OPT_BOOL(), \
@@ -458,7 +458,13 @@ enum fsck_err_opts {
OPT_UINT(0, BCH_REPLICAS_MAX), \
BCH2_NO_SB_OPT, 1, \
"n", "Data written to this device will be considered\n"\
- "to have already been replicated n times")
+ "to have already been replicated n times") \
+ x(btree_node_prefetch, u8, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, true, \
+ NULL, "BTREE_ITER_PREFETCH causes btree nodes to be\n"\
+ " prefetched sequentially")
struct bch_opts {
#define x(_name, _bits, ...) unsigned _name##_defined:1;
@@ -558,6 +564,11 @@ struct bch_io_opts {
#undef x
};
+static inline unsigned background_compression(struct bch_io_opts opts) /* picks the compression setting to use for background work from opts */
+{
+ return opts.background_compression ?: opts.compression; /* falls back to opts.compression when background_compression is 0 */
+}
+
struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts);
bool bch2_opt_is_inode_opt(enum bch_opt_id);
diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c
index accf246c32..b27d229259 100644
--- a/fs/bcachefs/printbuf.c
+++ b/fs/bcachefs/printbuf.c
@@ -56,6 +56,7 @@ void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args)
va_copy(args2, args);
len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args2);
+ va_end(args2);
} while (len + 1 >= printbuf_remaining(out) &&
!bch2_printbuf_make_room(out, len + 1));
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c
index a54647c36b..e68b34eab9 100644
--- a/fs/bcachefs/quota.c
+++ b/fs/bcachefs/quota.c
@@ -599,14 +599,9 @@ advance:
int bch2_fs_quota_read(struct bch_fs *c)
{
- struct bch_sb_field_quota *sb_quota;
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
mutex_lock(&c->sb_lock);
- sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
+ struct bch_sb_field_quota *sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
if (!sb_quota) {
mutex_unlock(&c->sb_lock);
return -BCH_ERR_ENOSPC_sb_quota;
@@ -615,19 +610,14 @@ int bch2_fs_quota_read(struct bch_fs *c)
bch2_sb_quota_read(c);
mutex_unlock(&c->sb_lock);
- trans = bch2_trans_get(c);
-
- ret = for_each_btree_key2(trans, iter, BTREE_ID_quotas,
- POS_MIN, BTREE_ITER_PREFETCH, k,
- __bch2_quota_set(c, k, NULL)) ?:
- for_each_btree_key2(trans, iter, BTREE_ID_inodes,
- POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
- bch2_fs_quota_read_inode(trans, &iter, k));
-
- bch2_trans_put(trans);
-
- if (ret)
- bch_err_fn(c, ret);
+ int ret = bch2_trans_run(c,
+ for_each_btree_key(trans, iter, BTREE_ID_quotas, POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ __bch2_quota_set(c, k, NULL)) ?:
+ for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN,
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ bch2_fs_quota_read_inode(trans, &iter, k)));
+ bch_err_fn(c, ret);
return ret;
}
diff --git a/fs/bcachefs/quota_format.h b/fs/bcachefs/quota_format.h
new file mode 100644
index 0000000000..dc34347ef6
--- /dev/null
+++ b/fs/bcachefs/quota_format.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_QUOTA_FORMAT_H
+#define _BCACHEFS_QUOTA_FORMAT_H
+
+/* KEY_TYPE_quota: */
+
+enum quota_types { /* QTYP_NR sizes bch_sb_field_quota.q[] */
+ QTYP_USR = 0, /* presumably user quotas — TODO confirm against users of this enum */
+ QTYP_GRP = 1, /* presumably group quotas — TODO confirm */
+ QTYP_PRJ = 2, /* presumably project quotas — TODO confirm */
+ QTYP_NR = 3, /* number of quota types listed above */
+};
+
+enum quota_counters { /* Q_COUNTERS sizes the c[] arrays in bch_quota and bch_sb_quota_type */
+ Q_SPC = 0, /* presumably the space counter — TODO confirm */
+ Q_INO = 1, /* presumably the inode counter — TODO confirm */
+ Q_COUNTERS = 2, /* number of counters listed above */
+};
+
+struct bch_quota_counter { /* one hard/soft limit pair, both little-endian 64-bit */
+ __le64 hardlimit;
+ __le64 softlimit;
+};
+
+struct bch_quota { /* value layout for KEY_TYPE_quota: a bch_val followed by one limit pair per counter */
+ struct bch_val v;
+ struct bch_quota_counter c[Q_COUNTERS];
+} __packed __aligned(8);
+
+/* BCH_SB_FIELD_quota: */
+
+struct bch_sb_quota_counter { /* time limit and warning limit for one counter, both little-endian 32-bit */
+ __le32 timelimit;
+ __le32 warnlimit;
+};
+
+struct bch_sb_quota_type { /* settings for one quota type: a flags word plus one bch_sb_quota_counter per counter */
+ __le64 flags;
+ struct bch_sb_quota_counter c[Q_COUNTERS];
+};
+
+struct bch_sb_field_quota { /* BCH_SB_FIELD_quota layout: the common field header followed by one entry per quota type */
+ struct bch_sb_field field;
+ struct bch_sb_quota_type q[QTYP_NR];
+} __packed __aligned(8);
+
+#endif /* _BCACHEFS_QUOTA_FORMAT_H */
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index dd6fed2581..22d1017aa4 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -69,7 +69,7 @@ err:
int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
{
- int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
__bch2_set_rebalance_needs_scan(trans, inum));
rebalance_wakeup(c);
return ret;
@@ -125,7 +125,7 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
extent_entry_drop(bkey_i_to_s(n),
(void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n)));
- return bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+ return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}
static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
@@ -171,6 +171,20 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
return bkey_s_c_null;
}
+ if (trace_rebalance_extent_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "target=");
+ bch2_target_to_text(&buf, c, r->target);
+ prt_str(&buf, " compression=");
+ bch2_compression_opt_to_text(&buf, r->compression);
+ prt_str(&buf, " ");
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ trace_rebalance_extent(c, buf.buf);
+ printbuf_exit(&buf);
+ }
+
return k;
}
@@ -239,13 +253,12 @@ static bool rebalance_pred(struct bch_fs *c, void *arg,
if (k.k->p.inode) {
target = io_opts->background_target;
- compression = io_opts->background_compression ?: io_opts->compression;
+ compression = background_compression(*io_opts);
} else {
const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
target = r ? r->target : io_opts->background_target;
- compression = r ? r->compression :
- (io_opts->background_compression ?: io_opts->compression);
+ compression = r ? r->compression : background_compression(*io_opts);
}
data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression);
@@ -273,7 +286,7 @@ static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
r->state = BCH_REBALANCE_scanning;
ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?:
- commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_clear_rebalance_needs_scan(trans, inum, cookie));
bch2_move_stats_exit(&r->scan_stats, trans->c);
@@ -371,7 +384,6 @@ static int bch2_rebalance_thread(void *arg)
struct bch_fs *c = arg;
struct bch_fs_rebalance *r = &c->rebalance;
struct moving_context ctxt;
- int ret;
set_freezable();
@@ -379,8 +391,7 @@ static int bch2_rebalance_thread(void *arg)
writepoint_ptr(&c->rebalance_write_point),
true);
- while (!kthread_should_stop() &&
- !(ret = do_rebalance(&ctxt)))
+ while (!kthread_should_stop() && !do_rebalance(&ctxt))
;
bch2_moving_ctxt_exit(&ctxt);
@@ -456,10 +467,9 @@ int bch2_rebalance_start(struct bch_fs *c)
p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name);
ret = PTR_ERR_OR_ZERO(p);
- if (ret) {
- bch_err_msg(c, ret, "creating rebalance thread");
+ bch_err_msg(c, ret, "creating rebalance thread");
+ if (ret)
return ret;
- }
get_task_struct(p);
rcu_assign_pointer(c->rebalance.thread, p);
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 5cf7d05320..21e13bb433 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -99,6 +99,11 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
unsigned update_flags = BTREE_TRIGGER_NORUN;
int ret;
+ if (k->overwritten)
+ return 0;
+
+ trans->journal_res.seq = k->journal_seq;
+
/*
* BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to
* keep the key cache coherent with the underlying btree. Nothing
@@ -140,27 +145,13 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
static int bch2_journal_replay(struct bch_fs *c)
{
struct journal_keys *keys = &c->journal_keys;
- struct journal_key **keys_sorted, *k;
+ DARRAY(struct journal_key *) keys_sorted = { 0 };
struct journal *j = &c->journal;
u64 start_seq = c->journal_replay_seq_start;
u64 end_seq = c->journal_replay_seq_start;
- size_t i;
+ struct btree_trans *trans = bch2_trans_get(c);
int ret = 0;
- move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
- keys->gap = keys->nr;
-
- keys_sorted = kvmalloc_array(keys->nr, sizeof(*keys_sorted), GFP_KERNEL);
- if (!keys_sorted)
- return -BCH_ERR_ENOMEM_journal_replay;
-
- for (i = 0; i < keys->nr; i++)
- keys_sorted[i] = &keys->d[i];
-
- sort(keys_sorted, keys->nr,
- sizeof(keys_sorted[0]),
- journal_sort_seq_cmp, NULL);
-
if (keys->nr) {
ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)",
keys->nr, start_seq, end_seq);
@@ -170,27 +161,67 @@ static int bch2_journal_replay(struct bch_fs *c)
BUG_ON(!atomic_read(&keys->ref));
- for (i = 0; i < keys->nr; i++) {
- k = keys_sorted[i];
+ /*
+ * First, attempt to replay keys in sorted order. This is more
+ * efficient - better locality of btree access - but some might fail if
+ * that would cause a journal deadlock.
+ */
+ for (size_t i = 0; i < keys->nr; i++) {
+ cond_resched();
+
+ struct journal_key *k = keys->d + i;
+
+ /* Skip fastpath if we're low on space in the journal */
+ ret = c->journal.watermark ? -1 :
+ commit_do(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_journal_reclaim|
+ (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0),
+ bch2_journal_replay_key(trans, k));
+ BUG_ON(!ret && !k->overwritten);
+ if (ret) {
+ ret = darray_push(&keys_sorted, k);
+ if (ret)
+ goto err;
+ }
+ }
+ /*
+ * Now, replay any remaining keys in the order in which they appear in
+ * the journal, unpinning those journal entries as we go:
+ */
+ sort(keys_sorted.data, keys_sorted.nr,
+ sizeof(keys_sorted.data[0]),
+ journal_sort_seq_cmp, NULL);
+
+ darray_for_each(keys_sorted, kp) {
cond_resched();
+ struct journal_key *k = *kp;
+
replay_now_at(j, k->journal_seq);
- ret = bch2_trans_do(c, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL|
- (!k->allocated
- ? BTREE_INSERT_JOURNAL_REPLAY|BCH_WATERMARK_reclaim
- : 0),
+ ret = commit_do(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc|
+ (!k->allocated
+ ? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim
+ : 0),
bch2_journal_replay_key(trans, k));
- if (ret) {
- bch_err(c, "journal replay: error while replaying key at btree %s level %u: %s",
- bch2_btree_id_str(k->btree_id), k->level, bch2_err_str(ret));
+ bch_err_msg(c, ret, "while replaying key at btree %s level %u:",
+ bch2_btree_id_str(k->btree_id), k->level);
+ if (ret)
goto err;
- }
+
+ BUG_ON(!k->overwritten);
}
+ /*
+ * We need to put our btree_trans before calling flush_all_pins(), since
+ * that will use a btree_trans internally
+ */
+ bch2_trans_put(trans);
+ trans = NULL;
+
if (!c->opts.keep_journal)
bch2_journal_keys_put_initial(c);
@@ -198,16 +229,14 @@ static int bch2_journal_replay(struct bch_fs *c)
j->replay_journal_seq = 0;
bch2_journal_set_replay_done(j);
- bch2_journal_flush_all_pins(j);
- ret = bch2_journal_error(j);
- if (keys->nr && !ret)
+ if (keys->nr)
bch2_journal_log_msg(c, "journal replay finished");
err:
- kvfree(keys_sorted);
-
- if (ret)
- bch_err_fn(c, ret);
+ if (trans)
+ bch2_trans_put(trans);
+ darray_exit(&keys_sorted);
+ bch_err_fn(c, ret);
return ret;
}
@@ -251,7 +280,7 @@ static int journal_replay_entry_early(struct bch_fs *c,
le64_to_cpu(u->v);
break;
case BCH_FS_USAGE_inodes:
- c->usage_base->nr_inodes = le64_to_cpu(u->v);
+ c->usage_base->b.nr_inodes = le64_to_cpu(u->v);
break;
case BCH_FS_USAGE_key_version:
atomic64_set(&c->key_version,
@@ -275,8 +304,6 @@ static int journal_replay_entry_early(struct bch_fs *c,
struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev));
unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
- ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec);
-
for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) {
ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets);
ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors);
@@ -317,14 +344,11 @@ static int journal_replay_entry_early(struct bch_fs *c,
static int journal_replay_early(struct bch_fs *c,
struct bch_sb_field_clean *clean)
{
- struct jset_entry *entry;
- int ret;
-
if (clean) {
- for (entry = clean->start;
+ for (struct jset_entry *entry = clean->start;
entry != vstruct_end(&clean->field);
entry = vstruct_next(entry)) {
- ret = journal_replay_entry_early(c, entry);
+ int ret = journal_replay_entry_early(c, entry);
if (ret)
return ret;
}
@@ -339,7 +363,7 @@ static int journal_replay_early(struct bch_fs *c,
continue;
vstruct_for_each(&i->j, entry) {
- ret = journal_replay_entry_early(c, entry);
+ int ret = journal_replay_entry_early(c, entry);
if (ret)
return ret;
}
@@ -435,8 +459,7 @@ static int bch2_initialize_subvolumes(struct bch_fs *c)
ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?:
bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?:
bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0);
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
@@ -474,10 +497,9 @@ err:
noinline_for_stack
static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
{
- int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW,
+ int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
__bch2_fs_upgrade_for_subvolumes(trans));
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
@@ -495,7 +517,20 @@ static int bch2_check_allocations(struct bch_fs *c)
static int bch2_set_may_go_rw(struct bch_fs *c) /* sets BCH_FS_may_go_rw and, when needed, goes read-write early; returns 0 or that call's result */
{
- set_bit(BCH_FS_MAY_GO_RW, &c->flags);
+ struct journal_keys *keys = &c->journal_keys;
+
+ /*
+ * After we go RW, the journal keys buffer can't be modified (except for
+ * setting journal_key->overwritten): it will be accessed by multiple
+ * threads
+ */
+ move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); /* requests gap position keys->nr, i.e. the end of the used entries */
+ keys->gap = keys->nr;
+
+ set_bit(BCH_FS_may_go_rw, &c->flags);
+
+ if (keys->nr || c->opts.fsck || !c->sb.clean) /* journal keys present, fsck requested, or c->sb.clean not set */
+ return bch2_fs_read_write_early(c);
return 0;
}
@@ -542,8 +577,9 @@ u64 bch2_recovery_passes_from_stable(u64 v)
static bool check_version_upgrade(struct bch_fs *c)
{
- unsigned latest_compatible = bch2_latest_compatible_version(c->sb.version);
unsigned latest_version = bcachefs_metadata_version_current;
+ unsigned latest_compatible = min(latest_version,
+ bch2_latest_compatible_version(c->sb.version));
unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
unsigned new_version = 0;
@@ -562,7 +598,7 @@ static bool check_version_upgrade(struct bch_fs *c)
new_version = latest_version;
break;
case BCH_VERSION_UPGRADE_none:
- new_version = old_version;
+ new_version = min(old_version, latest_version);
break;
}
}
@@ -589,17 +625,15 @@ static bool check_version_upgrade(struct bch_fs *c)
bch2_version_to_text(&buf, new_version);
prt_newline(&buf);
- u64 recovery_passes = bch2_upgrade_recovery_passes(c, old_version, new_version);
- if (recovery_passes) {
- if ((recovery_passes & RECOVERY_PASS_ALL_FSCK) == RECOVERY_PASS_ALL_FSCK)
- prt_str(&buf, "fsck required");
- else {
- prt_str(&buf, "running recovery passes: ");
- prt_bitflags(&buf, bch2_recovery_passes, recovery_passes);
- }
-
- c->recovery_passes_explicit |= recovery_passes;
- c->opts.fix_errors = FSCK_FIX_yes;
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+ __le64 passes = ext->recovery_passes_required[0];
+ bch2_sb_set_upgrade(c, old_version, new_version);
+ passes = ext->recovery_passes_required[0] & ~passes;
+
+ if (passes) {
+ prt_str(&buf, " running recovery passes: ");
+ prt_bitflags(&buf, bch2_recovery_passes,
+ bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
}
bch_info(c, "%s", buf.buf);
@@ -625,7 +659,7 @@ u64 bch2_fsck_recovery_passes(void)
static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
{
- struct recovery_pass_fn *p = recovery_pass_fns + c->curr_recovery_pass;
+ struct recovery_pass_fn *p = recovery_pass_fns + pass;
if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read)
return false;
@@ -642,39 +676,62 @@ static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pa
static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
{
+ struct recovery_pass_fn *p = recovery_pass_fns + pass;
int ret;
- c->curr_recovery_pass = pass;
+ if (!(p->when & PASS_SILENT))
+ bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."),
+ bch2_recovery_passes[pass]);
+ ret = p->fn(c);
+ if (ret)
+ return ret;
+ if (!(p->when & PASS_SILENT))
+ bch2_print(c, KERN_CONT " done\n");
- if (should_run_recovery_pass(c, pass)) {
- struct recovery_pass_fn *p = recovery_pass_fns + pass;
+ return 0;
+}
- if (!(p->when & PASS_SILENT))
- printk(KERN_INFO bch2_log_msg(c, "%s..."),
- bch2_recovery_passes[pass]);
- ret = p->fn(c);
- if (ret)
- return ret;
- if (!(p->when & PASS_SILENT))
- printk(KERN_CONT " done\n");
+static int bch2_run_recovery_passes(struct bch_fs *c)
+{
+ int ret = 0;
- c->recovery_passes_complete |= BIT_ULL(pass);
+ while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) {
+ if (should_run_recovery_pass(c, c->curr_recovery_pass)) {
+ unsigned pass = c->curr_recovery_pass;
+
+ ret = bch2_run_recovery_pass(c, c->curr_recovery_pass);
+ if (bch2_err_matches(ret, BCH_ERR_restart_recovery) ||
+ (ret && c->curr_recovery_pass < pass))
+ continue;
+ if (ret)
+ break;
+
+ c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass);
+ }
+ c->curr_recovery_pass++;
+ c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass);
}
- return 0;
+ return ret;
}
-static int bch2_run_recovery_passes(struct bch_fs *c)
+int bch2_run_online_recovery_passes(struct bch_fs *c)
{
int ret = 0;
- while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) {
- ret = bch2_run_recovery_pass(c, c->curr_recovery_pass);
- if (bch2_err_matches(ret, BCH_ERR_restart_recovery))
+ for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) {
+ struct recovery_pass_fn *p = recovery_pass_fns + i;
+
+ if (!(p->when & PASS_ONLINE))
+ continue;
+
+ ret = bch2_run_recovery_pass(c, i);
+ if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) {
+ i = c->curr_recovery_pass;
continue;
+ }
if (ret)
break;
- c->curr_recovery_pass++;
}
return ret;
@@ -718,7 +775,7 @@ int bch2_fs_recovery(struct bch_fs *c)
goto err;
}
- if (!(c->opts.nochanges && c->opts.norecovery)) {
+ if (!c->opts.nochanges) {
mutex_lock(&c->sb_lock);
bool write_sb = false;
@@ -748,7 +805,7 @@ int bch2_fs_recovery(struct bch_fs *c)
if (bch2_check_version_downgrade(c)) {
struct printbuf buf = PRINTBUF;
- prt_str(&buf, "Version downgrade required:\n");
+ prt_str(&buf, "Version downgrade required:");
__le64 passes = ext->recovery_passes_required[0];
bch2_sb_set_downgrade(c,
@@ -756,7 +813,7 @@ int bch2_fs_recovery(struct bch_fs *c)
BCH_VERSION_MINOR(c->sb.version));
passes = ext->recovery_passes_required[0] & ~passes;
if (passes) {
- prt_str(&buf, " running recovery passes: ");
+ prt_str(&buf, "\n running recovery passes: ");
prt_bitflags(&buf, bch2_recovery_passes,
bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
}
@@ -779,6 +836,9 @@ int bch2_fs_recovery(struct bch_fs *c)
if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
+ if (c->opts.fsck)
+ set_bit(BCH_FS_fsck_running, &c->flags);
+
ret = bch2_blacklist_table_initialize(c);
if (ret) {
bch_err(c, "error initializing blacklist table");
@@ -919,13 +979,17 @@ use_clean:
if (ret)
goto err;
+ clear_bit(BCH_FS_fsck_running, &c->flags);
+
/* If we fixed errors, verify that fs is actually clean now: */
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
- test_bit(BCH_FS_ERRORS_FIXED, &c->flags) &&
- !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags) &&
- !test_bit(BCH_FS_ERROR, &c->flags)) {
+ test_bit(BCH_FS_errors_fixed, &c->flags) &&
+ !test_bit(BCH_FS_errors_not_fixed, &c->flags) &&
+ !test_bit(BCH_FS_error, &c->flags)) {
+ bch2_flush_fsck_errs(c);
+
bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean");
- clear_bit(BCH_FS_ERRORS_FIXED, &c->flags);
+ clear_bit(BCH_FS_errors_fixed, &c->flags);
c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;
@@ -933,13 +997,13 @@ use_clean:
if (ret)
goto err;
- if (test_bit(BCH_FS_ERRORS_FIXED, &c->flags) ||
- test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) {
+ if (test_bit(BCH_FS_errors_fixed, &c->flags) ||
+ test_bit(BCH_FS_errors_not_fixed, &c->flags)) {
bch_err(c, "Second fsck run was not clean");
- set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags);
+ set_bit(BCH_FS_errors_not_fixed, &c->flags);
}
- set_bit(BCH_FS_ERRORS_FIXED, &c->flags);
+ set_bit(BCH_FS_errors_fixed, &c->flags);
}
if (enabled_qtypes(c)) {
@@ -958,13 +1022,13 @@ use_clean:
write_sb = true;
}
- if (!test_bit(BCH_FS_ERROR, &c->flags) &&
+ if (!test_bit(BCH_FS_error, &c->flags) &&
!(c->disk_sb.sb->compat[0] & cpu_to_le64(1ULL << BCH_COMPAT_alloc_info))) {
c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
write_sb = true;
}
- if (!test_bit(BCH_FS_ERROR, &c->flags)) {
+ if (!test_bit(BCH_FS_error, &c->flags)) {
struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
if (ext &&
(!bch2_is_zero(ext->recovery_passes_required, sizeof(ext->recovery_passes_required)) ||
@@ -976,8 +1040,8 @@ use_clean:
}
if (c->opts.fsck &&
- !test_bit(BCH_FS_ERROR, &c->flags) &&
- !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) {
+ !test_bit(BCH_FS_error, &c->flags) &&
+ !test_bit(BCH_FS_errors_not_fixed, &c->flags)) {
SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0);
SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 0);
write_sb = true;
@@ -993,8 +1057,12 @@ use_clean:
bch2_move_stats_init(&stats, "recovery");
- bch_info(c, "scanning for old btree nodes");
- ret = bch2_fs_read_write(c) ?:
+ struct printbuf buf = PRINTBUF;
+ bch2_version_to_text(&buf, c->sb.version_min);
+ bch_info(c, "scanning for old btree nodes: min_version %s", buf.buf);
+ printbuf_exit(&buf);
+
+ ret = bch2_fs_read_write_early(c) ?:
bch2_scan_old_btree_nodes(c, &stats);
if (ret)
goto err;
@@ -1007,7 +1075,6 @@ use_clean:
ret = 0;
out:
- set_bit(BCH_FS_FSCK_DONE, &c->flags);
bch2_flush_fsck_errs(c);
if (!c->opts.keep_journal &&
@@ -1015,13 +1082,14 @@ out:
bch2_journal_keys_put_initial(c);
kfree(clean);
- if (!ret && test_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags)) {
+ if (!ret &&
+ test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) &&
+ !c->opts.nochanges) {
bch2_fs_read_write_early(c);
bch2_delete_dead_snapshots_async(c);
}
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
err:
fsck_err:
@@ -1034,8 +1102,6 @@ int bch2_fs_initialize(struct bch_fs *c)
struct bch_inode_unpacked root_inode, lostfound_inode;
struct bkey_inode_buf packed_inode;
struct qstr lostfound = QSTR("lost+found");
- struct bch_dev *ca;
- unsigned i;
int ret;
bch_notice(c, "initializing new filesystem");
@@ -1054,13 +1120,12 @@ int bch2_fs_initialize(struct bch_fs *c)
mutex_unlock(&c->sb_lock);
c->curr_recovery_pass = ARRAY_SIZE(recovery_pass_fns);
- set_bit(BCH_FS_MAY_GO_RW, &c->flags);
- set_bit(BCH_FS_FSCK_DONE, &c->flags);
+ set_bit(BCH_FS_may_go_rw, &c->flags);
- for (i = 0; i < BTREE_ID_NR; i++)
+ for (unsigned i = 0; i < BTREE_ID_NR; i++)
bch2_btree_root_alloc(c, i);
- for_each_member_device(ca, c, i)
+ for_each_member_device(c, ca)
bch2_dev_usage_init(ca);
ret = bch2_fs_journal_alloc(c);
@@ -1088,7 +1153,7 @@ int bch2_fs_initialize(struct bch_fs *c)
if (ret)
goto err;
- for_each_online_member(ca, c, i)
+ for_each_online_member(c, ca)
ca->new_fs_bucket_idx = 0;
ret = bch2_fs_freespace_init(c);
@@ -1112,10 +1177,9 @@ int bch2_fs_initialize(struct bch_fs *c)
packed_inode.inode.k.p.snapshot = U32_MAX;
ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0);
- if (ret) {
- bch_err_msg(c, ret, "creating root directory");
+ bch_err_msg(c, ret, "creating root directory");
+ if (ret)
goto err;
- }
bch2_inode_init_early(c, &lostfound_inode);
@@ -1126,10 +1190,11 @@ int bch2_fs_initialize(struct bch_fs *c)
&lostfound,
0, 0, S_IFDIR|0700, 0,
NULL, NULL, (subvol_inum) { 0 }, 0));
- if (ret) {
- bch_err_msg(c, ret, "creating lost+found");
+ bch_err_msg(c, ret, "creating lost+found");
+ if (ret)
goto err;
- }
+
+ c->recovery_pass_done = ARRAY_SIZE(recovery_pass_fns) - 1;
if (enabled_qtypes(c)) {
ret = bch2_fs_quota_read(c);
@@ -1138,10 +1203,9 @@ int bch2_fs_initialize(struct bch_fs *c)
}
ret = bch2_journal_flush(&c->journal);
- if (ret) {
- bch_err_msg(c, ret, "writing first journal entry");
+ bch_err_msg(c, ret, "writing first journal entry");
+ if (ret)
goto err;
- }
mutex_lock(&c->sb_lock);
SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
@@ -1152,6 +1216,6 @@ int bch2_fs_initialize(struct bch_fs *c)
return 0;
err:
- bch_err_fn(ca, ret);
+ bch_err_fn(c, ret);
return ret;
}
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
index 3a554b0751..4e9d24719b 100644
--- a/fs/bcachefs/recovery.h
+++ b/fs/bcachefs/recovery.h
@@ -31,6 +31,7 @@ static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c,
}
}
+int bch2_run_online_recovery_passes(struct bch_fs *);
u64 bch2_fsck_recovery_passes(void);
int bch2_fs_recovery(struct bch_fs *);
diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_types.h
index d37c6fd30e..fa0c8efd2a 100644
--- a/fs/bcachefs/recovery_types.h
+++ b/fs/bcachefs/recovery_types.h
@@ -6,6 +6,7 @@
#define PASS_FSCK BIT(1)
#define PASS_UNCLEAN BIT(2)
#define PASS_ALWAYS BIT(3)
+#define PASS_ONLINE BIT(4)
/*
* Passes may be reordered, but the second field is a persistent identifier and
@@ -22,18 +23,18 @@
x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \
x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \
x(journal_replay, 9, PASS_ALWAYS) \
- x(check_alloc_info, 10, PASS_FSCK) \
- x(check_lrus, 11, PASS_FSCK) \
- x(check_btree_backpointers, 12, PASS_FSCK) \
- x(check_backpointers_to_extents, 13, PASS_FSCK) \
- x(check_extents_to_backpointers, 14, PASS_FSCK) \
- x(check_alloc_to_lru_refs, 15, PASS_FSCK) \
+ x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK) \
+ x(check_lrus, 11, PASS_ONLINE|PASS_FSCK) \
+ x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK) \
+ x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK) \
+ x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK) \
+ x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \
x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \
x(bucket_gens_init, 17, 0) \
- x(check_snapshot_trees, 18, PASS_FSCK) \
- x(check_snapshots, 19, PASS_FSCK) \
- x(check_subvols, 20, PASS_FSCK) \
- x(delete_dead_snapshots, 21, PASS_FSCK) \
+ x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \
+ x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \
+ x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \
+ x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \
x(fs_upgrade_for_subvolumes, 22, 0) \
x(resume_logged_ops, 23, PASS_ALWAYS) \
x(check_inodes, 24, PASS_FSCK) \
@@ -41,8 +42,8 @@
x(check_indirect_extents, 26, PASS_FSCK) \
x(check_dirents, 27, PASS_FSCK) \
x(check_xattrs, 28, PASS_FSCK) \
- x(check_root, 29, PASS_FSCK) \
- x(check_directory_structure, 30, PASS_FSCK) \
+ x(check_root, 29, PASS_ONLINE|PASS_FSCK) \
+ x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \
x(check_nlinks, 31, PASS_FSCK) \
x(delete_dead_inodes, 32, PASS_FSCK|PASS_UNCLEAN) \
x(fix_reflink_p, 33, 0) \
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index 37d16e04e6..c47c66c2b3 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -3,6 +3,7 @@
#include "bkey_buf.h"
#include "btree_update.h"
#include "buckets.h"
+#include "error.h"
#include "extents.h"
#include "inode.h"
#include "io_misc.h"
@@ -33,15 +34,14 @@ int bch2_reflink_p_invalid(struct bch_fs *c, struct bkey_s_c k,
struct printbuf *err)
{
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+ int ret = 0;
- if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix &&
- le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad)) {
- prt_printf(err, "idx < front_pad (%llu < %u)",
- le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad));
- return -EINVAL;
- }
-
- return 0;
+ bkey_fsck_err_on(le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad),
+ c, err, reflink_p_front_pad_bad,
+ "idx < front_pad (%llu < %u)",
+ le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad));
+fsck_err:
+ return ret;
}
void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c,
@@ -73,6 +73,184 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r
return true;
}
+static int trans_trigger_reflink_p_segment(struct btree_trans *trans,
+ struct bkey_s_c_reflink_p p,
+ u64 *idx, unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_i *k;
+ __le64 *refcount;
+ int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ k = bch2_bkey_get_mut_noupdate(trans, &iter,
+ BTREE_ID_reflink, POS(0, *idx),
+ BTREE_ITER_WITH_UPDATES);
+ ret = PTR_ERR_OR_ZERO(k);
+ if (ret)
+ goto err;
+
+ refcount = bkey_refcount(bkey_i_to_s(k));
+ if (!refcount) {
+ bch2_bkey_val_to_text(&buf, c, p.s_c);
+ bch2_trans_inconsistent(trans,
+ "nonexistent indirect extent at %llu while marking\n %s",
+ *idx, buf.buf);
+ ret = -EIO;
+ goto err;
+ }
+
+ if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) {
+ bch2_bkey_val_to_text(&buf, c, p.s_c);
+ bch2_trans_inconsistent(trans,
+ "indirect extent refcount underflow at %llu while marking\n %s",
+ *idx, buf.buf);
+ ret = -EIO;
+ goto err;
+ }
+
+ if (flags & BTREE_TRIGGER_INSERT) {
+ struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
+ u64 pad;
+
+ pad = max_t(s64, le32_to_cpu(v->front_pad),
+ le64_to_cpu(v->idx) - bkey_start_offset(&k->k));
+ BUG_ON(pad > U32_MAX);
+ v->front_pad = cpu_to_le32(pad);
+
+ pad = max_t(s64, le32_to_cpu(v->back_pad),
+ k->k.p.offset - p.k->size - le64_to_cpu(v->idx));
+ BUG_ON(pad > U32_MAX);
+ v->back_pad = cpu_to_le32(pad);
+ }
+
+ le64_add_cpu(refcount, add);
+
+ bch2_btree_iter_set_pos_to_extent_start(&iter);
+ ret = bch2_trans_update(trans, &iter, k, 0);
+ if (ret)
+ goto err;
+
+ *idx = k->k.p.offset;
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans,
+ struct bkey_s_c_reflink_p p,
+ u64 *idx, unsigned flags, size_t r_idx)
+{
+ struct bch_fs *c = trans->c;
+ struct reflink_gc *r;
+ int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
+ u64 start = le64_to_cpu(p.v->idx);
+ u64 end = le64_to_cpu(p.v->idx) + p.k->size;
+ u64 next_idx = end + le32_to_cpu(p.v->back_pad);
+ s64 ret = 0;
+ struct printbuf buf = PRINTBUF;
+
+ if (r_idx >= c->reflink_gc_nr)
+ goto not_found;
+
+ r = genradix_ptr(&c->reflink_gc_table, r_idx);
+ next_idx = min(next_idx, r->offset - r->size);
+ if (*idx < next_idx)
+ goto not_found;
+
+ BUG_ON((s64) r->refcount + add < 0);
+
+ r->refcount += add;
+ *idx = r->offset;
+ return 0;
+not_found:
+ if (fsck_err(c, reflink_p_to_missing_reflink_v,
+ "pointer to missing indirect extent\n"
+ " %s\n"
+ " missing range %llu-%llu",
+ (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf),
+ *idx, next_idx)) {
+ struct bkey_i *update = bch2_bkey_make_mut_noupdate(trans, p.s_c);
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ goto err;
+
+ if (next_idx <= start) {
+ bkey_i_to_reflink_p(update)->v.front_pad = cpu_to_le32(start - next_idx);
+ } else if (*idx >= end) {
+ bkey_i_to_reflink_p(update)->v.back_pad = cpu_to_le32(*idx - end);
+ } else {
+ bkey_error_init(update);
+ update->k.p = p.k->p;
+ update->k.p.offset = next_idx;
+ update->k.size = next_idx - *idx;
+ set_bkey_val_u64s(&update->k, 0);
+ }
+
+ ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, update, BTREE_TRIGGER_NORUN);
+ }
+
+ *idx = next_idx;
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int __trigger_reflink_p(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+ int ret = 0;
+
+ u64 idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad);
+ u64 end = le64_to_cpu(p.v->idx) + p.k->size + le32_to_cpu(p.v->back_pad);
+
+ if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ while (idx < end && !ret)
+ ret = trans_trigger_reflink_p_segment(trans, p, &idx, flags);
+ }
+
+ if (flags & BTREE_TRIGGER_GC) {
+ size_t l = 0, r = c->reflink_gc_nr;
+
+ while (l < r) {
+ size_t m = l + (r - l) / 2;
+ struct reflink_gc *ref = genradix_ptr(&c->reflink_gc_table, m);
+ if (ref->offset <= idx)
+ l = m + 1;
+ else
+ r = m;
+ }
+
+ while (idx < end && !ret)
+ ret = gc_trigger_reflink_p_segment(trans, p, &idx, flags, l++);
+ }
+
+ return ret;
+}
+
+int bch2_trigger_reflink_p(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old,
+ struct bkey_s new,
+ unsigned flags)
+{
+ if ((flags & BTREE_TRIGGER_TRANSACTIONAL) &&
+ (flags & BTREE_TRIGGER_INSERT)) {
+ struct bch_reflink_p *v = bkey_s_to_reflink_p(new).v;
+
+ v->front_pad = v->back_pad = 0;
+ }
+
+ return trigger_run_overwrite_then_insert(__trigger_reflink_p, trans, btree_id, level, old, new, flags);
+}
+
/* indirect extents */
int bch2_reflink_v_invalid(struct bch_fs *c, struct bkey_s_c k,
@@ -104,32 +282,26 @@ bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r
}
#endif
-static inline void check_indirect_extent_deleting(struct bkey_i *new, unsigned *flags)
+static inline void check_indirect_extent_deleting(struct bkey_s new, unsigned *flags)
{
if ((*flags & BTREE_TRIGGER_INSERT) && !*bkey_refcount(new)) {
- new->k.type = KEY_TYPE_deleted;
- new->k.size = 0;
- set_bkey_val_u64s(&new->k, 0);;
+ new.k->type = KEY_TYPE_deleted;
+ new.k->size = 0;
+ set_bkey_val_u64s(new.k, 0);
*flags &= ~BTREE_TRIGGER_INSERT;
}
}
-int bch2_trans_mark_reflink_v(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_i *new,
- unsigned flags)
+int bch2_trigger_reflink_v(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s new,
+ unsigned flags)
{
- check_indirect_extent_deleting(new, &flags);
-
- if (old.k->type == KEY_TYPE_reflink_v &&
- new->k.type == KEY_TYPE_reflink_v &&
- old.k->u64s == new->k.u64s &&
- !memcmp(bkey_s_c_to_reflink_v(old).v->start,
- bkey_i_to_reflink_v(new)->v.start,
- bkey_val_bytes(&new->k) - 8))
- return 0;
+ if ((flags & BTREE_TRIGGER_TRANSACTIONAL) &&
+ (flags & BTREE_TRIGGER_INSERT))
+ check_indirect_extent_deleting(new, &flags);
- return bch2_trans_mark_extent(trans, btree_id, level, old, new, flags);
+ return bch2_trigger_extent(trans, btree_id, level, old, new, flags);
}
/* indirect inline data */
@@ -152,9 +324,9 @@ void bch2_indirect_inline_data_to_text(struct printbuf *out,
min(datalen, 32U), d.v->data);
}
-int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans,
+int bch2_trigger_indirect_inline_data(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_i *new,
+ struct bkey_s_c old, struct bkey_s new,
unsigned flags)
{
check_indirect_extent_deleting(new, &flags);
@@ -197,7 +369,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k));
- refcount = bkey_refcount(r_v);
+ refcount = bkey_refcount(bkey_i_to_s(r_v));
*refcount = 0;
memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k));
@@ -314,6 +486,13 @@ s64 bch2_remap_range(struct bch_fs *c,
bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot);
+ if (dst_inum.inum < src_inum.inum) {
+ /* Avoid some lock cycle transaction restarts */
+ ret = bch2_btree_iter_traverse(&dst_iter);
+ if (ret)
+ continue;
+ }
+
dst_done = dst_iter.pos.offset - dst_start.offset;
src_want = POS(src_start.inode, src_start.offset + dst_done);
bch2_btree_iter_set_pos(&src_iter, src_want);
@@ -366,9 +545,7 @@ s64 bch2_remap_range(struct bch_fs *c,
min(src_k.k->p.offset - src_want.offset,
dst_end.offset - dst_iter.pos.offset));
- ret = bch2_bkey_set_needs_rebalance(c, new_dst.k,
- opts.background_target,
- opts.background_compression) ?:
+ ret = bch2_bkey_set_needs_rebalance(c, new_dst.k, &opts) ?:
bch2_extent_update(trans, dst_inum, &dst_iter,
new_dst.k, &disk_res,
new_i_size, i_sectors_delta,
@@ -398,7 +575,7 @@ s64 bch2_remap_range(struct bch_fs *c,
inode_u.bi_size = new_i_size;
ret2 = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
}
bch2_trans_iter_exit(trans, &inode_iter);
diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h
index 8ccf3f9c49..4d88672897 100644
--- a/fs/bcachefs/reflink.h
+++ b/fs/bcachefs/reflink.h
@@ -9,13 +9,14 @@ int bch2_reflink_p_invalid(struct bch_fs *, struct bkey_s_c,
void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
+int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s, unsigned);
#define bch2_bkey_ops_reflink_p ((struct bkey_ops) { \
.key_invalid = bch2_reflink_p_invalid, \
.val_to_text = bch2_reflink_p_to_text, \
.key_merge = bch2_reflink_p_merge, \
- .trans_trigger = bch2_trans_mark_reflink_p, \
- .atomic_trigger = bch2_mark_reflink_p, \
+ .trigger = bch2_trigger_reflink_p, \
.min_val_size = 16, \
})
@@ -23,15 +24,14 @@ int bch2_reflink_v_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
-int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s, unsigned);
#define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \
.key_invalid = bch2_reflink_v_invalid, \
.val_to_text = bch2_reflink_v_to_text, \
.swab = bch2_ptr_swab, \
- .trans_trigger = bch2_trans_mark_reflink_v, \
- .atomic_trigger = bch2_mark_extent, \
+ .trigger = bch2_trigger_reflink_v, \
.min_val_size = 8, \
})
@@ -39,15 +39,15 @@ int bch2_indirect_inline_data_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_indirect_inline_data_to_text(struct printbuf *,
struct bch_fs *, struct bkey_s_c);
-int bch2_trans_mark_indirect_inline_data(struct btree_trans *,
+int bch2_trigger_indirect_inline_data(struct btree_trans *,
enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_i *,
+ struct bkey_s_c, struct bkey_s,
unsigned);
#define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \
.key_invalid = bch2_indirect_inline_data_invalid, \
.val_to_text = bch2_indirect_inline_data_to_text, \
- .trans_trigger = bch2_trans_mark_indirect_inline_data, \
+ .trigger = bch2_trigger_indirect_inline_data, \
.min_val_size = 8, \
})
@@ -63,13 +63,13 @@ static inline const __le64 *bkey_refcount_c(struct bkey_s_c k)
}
}
-static inline __le64 *bkey_refcount(struct bkey_i *k)
+static inline __le64 *bkey_refcount(struct bkey_s k)
{
- switch (k->k.type) {
+ switch (k.k->type) {
case KEY_TYPE_reflink_v:
- return &bkey_i_to_reflink_v(k)->v.refcount;
+ return &bkey_s_to_reflink_v(k).v->refcount;
case KEY_TYPE_indirect_inline_data:
- return &bkey_i_to_indirect_inline_data(k)->v.refcount;
+ return &bkey_s_to_indirect_inline_data(k).v->refcount;
default:
return NULL;
}
diff --git a/fs/bcachefs/reflink_format.h b/fs/bcachefs/reflink_format.h
new file mode 100644
index 0000000000..6772eebb1f
--- /dev/null
+++ b/fs/bcachefs/reflink_format.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REFLINK_FORMAT_H
+#define _BCACHEFS_REFLINK_FORMAT_H
+
+struct bch_reflink_p {
+ struct bch_val v;
+ __le64 idx;
+ /*
+ * A reflink pointer might point to an indirect extent which is then
+ * later split (by copygc or rebalance). If we only pointed to part of
+ * the original indirect extent, and then one of the fragments is
+ * outside the range we point to, we'd leak a refcount: so when creating
+ * reflink pointers, we need to store pad values to remember the full
+ * range we were taking a reference on.
+ */
+ __le32 front_pad;
+ __le32 back_pad;
+} __packed __aligned(8);
+
+struct bch_reflink_v {
+ struct bch_val v;
+ __le64 refcount;
+ union bch_extent_entry start[0];
+ __u64 _data[];
+} __packed __aligned(8);
+
+struct bch_indirect_inline_data {
+ struct bch_val v;
+ __le64 refcount;
+ u8 data[];
+};
+
+#endif /* _BCACHEFS_REFLINK_FORMAT_H */
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index 1c4a8f5c92..cc2672c120 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -17,7 +17,7 @@ static int bch2_memcmp(const void *l, const void *r, size_t size)
/* Replicas tracking - in memory: */
-static void verify_replicas_entry(struct bch_replicas_entry *e)
+static void verify_replicas_entry(struct bch_replicas_entry_v1 *e)
{
#ifdef CONFIG_BCACHEFS_DEBUG
unsigned i;
@@ -32,7 +32,7 @@ static void verify_replicas_entry(struct bch_replicas_entry *e)
#endif
}
-void bch2_replicas_entry_sort(struct bch_replicas_entry *e)
+void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
{
bubble_sort(e->devs, e->nr_devs, u8_cmp);
}
@@ -45,36 +45,26 @@ static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
struct bch_replicas_entry_v0 *e)
{
- unsigned i;
-
- if (e->data_type < BCH_DATA_NR)
- prt_printf(out, "%s", bch2_data_types[e->data_type]);
- else
- prt_printf(out, "(invalid data type %u)", e->data_type);
+ bch2_prt_data_type(out, e->data_type);
prt_printf(out, ": %u [", e->nr_devs);
- for (i = 0; i < e->nr_devs; i++)
+ for (unsigned i = 0; i < e->nr_devs; i++)
prt_printf(out, i ? " %u" : "%u", e->devs[i]);
prt_printf(out, "]");
}
void bch2_replicas_entry_to_text(struct printbuf *out,
- struct bch_replicas_entry *e)
+ struct bch_replicas_entry_v1 *e)
{
- unsigned i;
-
- if (e->data_type < BCH_DATA_NR)
- prt_printf(out, "%s", bch2_data_types[e->data_type]);
- else
- prt_printf(out, "(invalid data type %u)", e->data_type);
+ bch2_prt_data_type(out, e->data_type);
prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs);
- for (i = 0; i < e->nr_devs; i++)
+ for (unsigned i = 0; i < e->nr_devs; i++)
prt_printf(out, i ? " %u" : "%u", e->devs[i]);
prt_printf(out, "]");
}
-int bch2_replicas_entry_validate(struct bch_replicas_entry *r,
+int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
struct bch_sb *sb,
struct printbuf *err)
{
@@ -104,7 +94,7 @@ bad:
void bch2_cpu_replicas_to_text(struct printbuf *out,
struct bch_replicas_cpu *r)
{
- struct bch_replicas_entry *e;
+ struct bch_replicas_entry_v1 *e;
bool first = true;
for_each_cpu_replicas_entry(r, e) {
@@ -117,7 +107,7 @@ void bch2_cpu_replicas_to_text(struct printbuf *out,
}
static void extent_to_replicas(struct bkey_s_c k,
- struct bch_replicas_entry *r)
+ struct bch_replicas_entry_v1 *r)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
@@ -137,7 +127,7 @@ static void extent_to_replicas(struct bkey_s_c k,
}
static void stripe_to_replicas(struct bkey_s_c k,
- struct bch_replicas_entry *r)
+ struct bch_replicas_entry_v1 *r)
{
struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
const struct bch_extent_ptr *ptr;
@@ -150,7 +140,7 @@ static void stripe_to_replicas(struct bkey_s_c k,
r->devs[r->nr_devs++] = ptr->dev;
}
-void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
+void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e,
struct bkey_s_c k)
{
e->nr_devs = 0;
@@ -175,12 +165,10 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
bch2_replicas_entry_sort(e);
}
-void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
+void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e,
enum bch_data_type data_type,
struct bch_devs_list devs)
{
- unsigned i;
-
BUG_ON(!data_type ||
data_type == BCH_DATA_sb ||
data_type >= BCH_DATA_NR);
@@ -189,8 +177,8 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
e->nr_devs = 0;
e->nr_required = 1;
- for (i = 0; i < devs.nr; i++)
- e->devs[e->nr_devs++] = devs.devs[i];
+ darray_for_each(devs, i)
+ e->devs[e->nr_devs++] = *i;
bch2_replicas_entry_sort(e);
}
@@ -198,7 +186,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
static struct bch_replicas_cpu
cpu_replicas_add_entry(struct bch_fs *c,
struct bch_replicas_cpu *old,
- struct bch_replicas_entry *new_entry)
+ struct bch_replicas_entry_v1 *new_entry)
{
unsigned i;
struct bch_replicas_cpu new = {
@@ -231,7 +219,7 @@ cpu_replicas_add_entry(struct bch_fs *c,
}
static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
- struct bch_replicas_entry *search)
+ struct bch_replicas_entry_v1 *search)
{
int idx, entry_size = replicas_entry_bytes(search);
@@ -249,7 +237,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
}
int bch2_replicas_entry_idx(struct bch_fs *c,
- struct bch_replicas_entry *search)
+ struct bch_replicas_entry_v1 *search)
{
bch2_replicas_entry_sort(search);
@@ -257,13 +245,13 @@ int bch2_replicas_entry_idx(struct bch_fs *c,
}
static bool __replicas_has_entry(struct bch_replicas_cpu *r,
- struct bch_replicas_entry *search)
+ struct bch_replicas_entry_v1 *search)
{
return __replicas_entry_idx(r, search) >= 0;
}
bool bch2_replicas_marked(struct bch_fs *c,
- struct bch_replicas_entry *search)
+ struct bch_replicas_entry_v1 *search)
{
bool marked;
@@ -380,7 +368,7 @@ err:
static unsigned reserve_journal_replicas(struct bch_fs *c,
struct bch_replicas_cpu *r)
{
- struct bch_replicas_entry *e;
+ struct bch_replicas_entry_v1 *e;
unsigned journal_res_u64s = 0;
/* nr_inodes: */
@@ -405,7 +393,7 @@ static unsigned reserve_journal_replicas(struct bch_fs *c,
noinline
static int bch2_mark_replicas_slowpath(struct bch_fs *c,
- struct bch_replicas_entry *new_entry)
+ struct bch_replicas_entry_v1 *new_entry)
{
struct bch_replicas_cpu new_r, new_gc;
int ret = 0;
@@ -470,7 +458,7 @@ err:
goto out;
}
-int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r)
+int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r)
{
return likely(bch2_replicas_marked(c, r))
? 0 : bch2_mark_replicas_slowpath(c, r);
@@ -521,7 +509,7 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret)
int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
- struct bch_replicas_entry *e;
+ struct bch_replicas_entry_v1 *e;
unsigned i = 0;
lockdep_assert_held(&c->replicas_gc_lock);
@@ -596,7 +584,7 @@ retry:
}
for (i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry *e =
+ struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
if (e->data_type == BCH_DATA_journal ||
@@ -627,7 +615,7 @@ retry:
}
int bch2_replicas_set_usage(struct bch_fs *c,
- struct bch_replicas_entry *r,
+ struct bch_replicas_entry_v1 *r,
u64 sectors)
{
int ret, idx = bch2_replicas_entry_idx(c, r);
@@ -660,7 +648,7 @@ static int
__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
struct bch_replicas_cpu *cpu_r)
{
- struct bch_replicas_entry *e, *dst;
+ struct bch_replicas_entry_v1 *e, *dst;
unsigned nr = 0, entry_size = 0, idx = 0;
for_each_replicas_entry(sb_r, e) {
@@ -698,7 +686,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
nr++;
}
- entry_size += sizeof(struct bch_replicas_entry) -
+ entry_size += sizeof(struct bch_replicas_entry_v1) -
sizeof(struct bch_replicas_entry_v0);
cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
@@ -709,7 +697,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
cpu_r->entry_size = entry_size;
for_each_replicas_entry(sb_r, e) {
- struct bch_replicas_entry *dst =
+ struct bch_replicas_entry_v1 *dst =
cpu_replicas_entry(cpu_r, idx++);
dst->data_type = e->data_type;
@@ -753,7 +741,7 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
{
struct bch_sb_field_replicas_v0 *sb_r;
struct bch_replicas_entry_v0 *dst;
- struct bch_replicas_entry *src;
+ struct bch_replicas_entry_v1 *src;
size_t bytes;
bytes = sizeof(struct bch_sb_field_replicas);
@@ -791,7 +779,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
struct bch_replicas_cpu *r)
{
struct bch_sb_field_replicas *sb_r;
- struct bch_replicas_entry *dst, *src;
+ struct bch_replicas_entry_v1 *dst, *src;
bool need_v1 = false;
size_t bytes;
@@ -842,7 +830,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
bch2_memcmp, NULL);
for (i = 0; i < cpu_r->nr; i++) {
- struct bch_replicas_entry *e =
+ struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(cpu_r, i);
int ret = bch2_replicas_entry_validate(e, sb, err);
@@ -850,7 +838,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
return ret;
if (i + 1 < cpu_r->nr) {
- struct bch_replicas_entry *n =
+ struct bch_replicas_entry_v1 *n =
cpu_replicas_entry(cpu_r, i + 1);
BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0);
@@ -887,7 +875,7 @@ static void bch2_sb_replicas_to_text(struct printbuf *out,
struct bch_sb_field *f)
{
struct bch_sb_field_replicas *r = field_to_type(f, replicas);
- struct bch_replicas_entry *e;
+ struct bch_replicas_entry_v1 *e;
bool first = true;
for_each_replicas_entry(r, e) {
@@ -949,7 +937,7 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
unsigned flags, bool print)
{
- struct bch_replicas_entry *e;
+ struct bch_replicas_entry_v1 *e;
bool ret = true;
percpu_down_read(&c->mark_lock);
@@ -1009,7 +997,7 @@ unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
replicas_v0 = bch2_sb_field_get(sb, replicas_v0);
if (replicas) {
- struct bch_replicas_entry *r;
+ struct bch_replicas_entry_v1 *r;
for_each_replicas_entry(replicas, r)
for (i = 0; i < r->nr_devs; i++)
diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h
index f70a642775..654a4b26d3 100644
--- a/fs/bcachefs/replicas.h
+++ b/fs/bcachefs/replicas.h
@@ -6,28 +6,28 @@
#include "eytzinger.h"
#include "replicas_types.h"
-void bch2_replicas_entry_sort(struct bch_replicas_entry *);
+void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *);
void bch2_replicas_entry_to_text(struct printbuf *,
- struct bch_replicas_entry *);
-int bch2_replicas_entry_validate(struct bch_replicas_entry *,
+ struct bch_replicas_entry_v1 *);
+int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *,
struct bch_sb *, struct printbuf *);
void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
-static inline struct bch_replicas_entry *
+static inline struct bch_replicas_entry_v1 *
cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
{
return (void *) r->entries + r->entry_size * i;
}
int bch2_replicas_entry_idx(struct bch_fs *,
- struct bch_replicas_entry *);
+ struct bch_replicas_entry_v1 *);
-void bch2_devlist_to_replicas(struct bch_replicas_entry *,
+void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *,
enum bch_data_type,
struct bch_devs_list);
-bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *);
+bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry_v1 *);
int bch2_mark_replicas(struct bch_fs *,
- struct bch_replicas_entry *);
+ struct bch_replicas_entry_v1 *);
static inline struct replicas_delta *
replicas_delta_next(struct replicas_delta *d)
@@ -37,9 +37,9 @@ replicas_delta_next(struct replicas_delta *d)
int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *);
-void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c);
+void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *, struct bkey_s_c);
-static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e,
+static inline void bch2_replicas_entry_cached(struct bch_replicas_entry_v1 *e,
unsigned dev)
{
e->data_type = BCH_DATA_cached;
@@ -59,7 +59,7 @@ int bch2_replicas_gc_start(struct bch_fs *, unsigned);
int bch2_replicas_gc2(struct bch_fs *);
int bch2_replicas_set_usage(struct bch_fs *,
- struct bch_replicas_entry *,
+ struct bch_replicas_entry_v1 *,
u64);
#define for_each_cpu_replicas_entry(_r, _i) \
diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h
index 5cfff489bb..ac90d142c4 100644
--- a/fs/bcachefs/replicas_types.h
+++ b/fs/bcachefs/replicas_types.h
@@ -5,12 +5,12 @@
struct bch_replicas_cpu {
unsigned nr;
unsigned entry_size;
- struct bch_replicas_entry *entries;
+ struct bch_replicas_entry_v1 *entries;
};
struct replicas_delta {
s64 delta;
- struct bch_replicas_entry r;
+ struct bch_replicas_entry_v1 r;
} __packed;
struct replicas_delta_list {
@@ -21,7 +21,7 @@ struct replicas_delta_list {
u64 nr_inodes;
u64 persistent_reserved[BCH_REPLICAS_MAX];
struct {} memset_end;
- struct replicas_delta d[0];
+ struct replicas_delta d[];
};
#endif /* _BCACHEFS_REPLICAS_TYPES_H */
diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c
index c76ad8ea5e..b6bf0ebe7e 100644
--- a/fs/bcachefs/sb-clean.c
+++ b/fs/bcachefs/sb-clean.c
@@ -191,13 +191,10 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
struct jset_entry **end,
u64 journal_seq)
{
- struct bch_dev *ca;
- unsigned i, dev;
-
percpu_down_read(&c->mark_lock);
if (!journal_seq) {
- for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+ for (unsigned i = 0; i < ARRAY_SIZE(c->usage); i++)
bch2_fs_usage_acc_to_base(c, i);
} else {
bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK);
@@ -210,7 +207,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = BCH_FS_USAGE_inodes;
- u->v = cpu_to_le64(c->usage_base->nr_inodes);
+ u->v = cpu_to_le64(c->usage_base->b.nr_inodes);
}
{
@@ -223,7 +220,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
u->v = cpu_to_le64(atomic64_read(&c->key_version));
}
- for (i = 0; i < BCH_REPLICAS_MAX; i++) {
+ for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) {
struct jset_entry_usage *u =
container_of(jset_entry_init(end, sizeof(*u)),
struct jset_entry_usage, entry);
@@ -234,8 +231,8 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]);
}
- for (i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry *e =
+ for (unsigned i = 0; i < c->replicas.nr; i++) {
+ struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
struct jset_entry_data_usage *u =
container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs),
@@ -247,7 +244,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
"embedded variable length struct");
}
- for_each_member_device(ca, c, dev) {
+ for_each_member_device(c, ca) {
unsigned b = sizeof(struct jset_entry_dev_usage) +
sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR;
struct jset_entry_dev_usage *u =
@@ -255,10 +252,9 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
struct jset_entry_dev_usage, entry);
u->entry.type = BCH_JSET_ENTRY_dev_usage;
- u->dev = cpu_to_le32(dev);
- u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec);
+ u->dev = cpu_to_le32(ca->dev_idx);
- for (i = 0; i < BCH_DATA_NR; i++) {
+ for (unsigned i = 0; i < BCH_DATA_NR; i++) {
u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets);
u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors);
u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented);
@@ -267,7 +263,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
percpu_up_read(&c->mark_lock);
- for (i = 0; i < 2; i++) {
+ for (unsigned i = 0; i < 2; i++) {
struct jset_entry_clock *clock =
container_of(jset_entry_init(end, sizeof(*clock)),
struct jset_entry_clock, entry);
diff --git a/fs/bcachefs/counters.c b/fs/bcachefs/sb-counters.c
index 02a996e06a..7dc898761b 100644
--- a/fs/bcachefs/counters.c
+++ b/fs/bcachefs/sb-counters.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "super-io.h"
-#include "counters.h"
+#include "sb-counters.h"
/* BCH_SB_FIELD_counters */
diff --git a/fs/bcachefs/counters.h b/fs/bcachefs/sb-counters.h
index 4778aa19bf..81f8aec9fc 100644
--- a/fs/bcachefs/counters.h
+++ b/fs/bcachefs/sb-counters.h
@@ -1,11 +1,10 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_COUNTERS_H
-#define _BCACHEFS_COUNTERS_H
+#ifndef _BCACHEFS_SB_COUNTERS_H
+#define _BCACHEFS_SB_COUNTERS_H
#include "bcachefs.h"
#include "super-io.h"
-
int bch2_sb_counters_to_cpu(struct bch_fs *);
int bch2_sb_counters_from_cpu(struct bch_fs *);
@@ -14,4 +13,4 @@ int bch2_fs_counters_init(struct bch_fs *);
extern const struct bch_sb_field_ops bch_sb_field_ops_counters;
-#endif // _BCACHEFS_COUNTERS_H
+#endif // _BCACHEFS_SB_COUNTERS_H
diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h
new file mode 100644
index 0000000000..62ea478215
--- /dev/null
+++ b/fs/bcachefs/sb-counters_format.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_COUNTERS_FORMAT_H
+#define _BCACHEFS_SB_COUNTERS_FORMAT_H
+
+#define BCH_PERSISTENT_COUNTERS() \
+ x(io_read, 0) \
+ x(io_write, 1) \
+ x(io_move, 2) \
+ x(bucket_invalidate, 3) \
+ x(bucket_discard, 4) \
+ x(bucket_alloc, 5) \
+ x(bucket_alloc_fail, 6) \
+ x(btree_cache_scan, 7) \
+ x(btree_cache_reap, 8) \
+ x(btree_cache_cannibalize, 9) \
+ x(btree_cache_cannibalize_lock, 10) \
+ x(btree_cache_cannibalize_lock_fail, 11) \
+ x(btree_cache_cannibalize_unlock, 12) \
+ x(btree_node_write, 13) \
+ x(btree_node_read, 14) \
+ x(btree_node_compact, 15) \
+ x(btree_node_merge, 16) \
+ x(btree_node_split, 17) \
+ x(btree_node_rewrite, 18) \
+ x(btree_node_alloc, 19) \
+ x(btree_node_free, 20) \
+ x(btree_node_set_root, 21) \
+ x(btree_path_relock_fail, 22) \
+ x(btree_path_upgrade_fail, 23) \
+ x(btree_reserve_get_fail, 24) \
+ x(journal_entry_full, 25) \
+ x(journal_full, 26) \
+ x(journal_reclaim_finish, 27) \
+ x(journal_reclaim_start, 28) \
+ x(journal_write, 29) \
+ x(read_promote, 30) \
+ x(read_bounce, 31) \
+ x(read_split, 33) \
+ x(read_retry, 32) \
+ x(read_reuse_race, 34) \
+ x(move_extent_read, 35) \
+ x(move_extent_write, 36) \
+ x(move_extent_finish, 37) \
+ x(move_extent_fail, 38) \
+ x(move_extent_start_fail, 39) \
+ x(copygc, 40) \
+ x(copygc_wait, 41) \
+ x(gc_gens_end, 42) \
+ x(gc_gens_start, 43) \
+ x(trans_blocked_journal_reclaim, 44) \
+ x(trans_restart_btree_node_reused, 45) \
+ x(trans_restart_btree_node_split, 46) \
+ x(trans_restart_fault_inject, 47) \
+ x(trans_restart_iter_upgrade, 48) \
+ x(trans_restart_journal_preres_get, 49) \
+ x(trans_restart_journal_reclaim, 50) \
+ x(trans_restart_journal_res_get, 51) \
+ x(trans_restart_key_cache_key_realloced, 52) \
+ x(trans_restart_key_cache_raced, 53) \
+ x(trans_restart_mark_replicas, 54) \
+ x(trans_restart_mem_realloced, 55) \
+ x(trans_restart_memory_allocation_failure, 56) \
+ x(trans_restart_relock, 57) \
+ x(trans_restart_relock_after_fill, 58) \
+ x(trans_restart_relock_key_cache_fill, 59) \
+ x(trans_restart_relock_next_node, 60) \
+ x(trans_restart_relock_parent_for_fill, 61) \
+ x(trans_restart_relock_path, 62) \
+ x(trans_restart_relock_path_intent, 63) \
+ x(trans_restart_too_many_iters, 64) \
+ x(trans_restart_traverse, 65) \
+ x(trans_restart_upgrade, 66) \
+ x(trans_restart_would_deadlock, 67) \
+ x(trans_restart_would_deadlock_write, 68) \
+ x(trans_restart_injected, 69) \
+ x(trans_restart_key_cache_upgrade, 70) \
+ x(trans_traverse_all, 71) \
+ x(transaction_commit, 72) \
+ x(write_super, 73) \
+ x(trans_restart_would_deadlock_recursion_limit, 74) \
+ x(trans_restart_write_buffer_flush, 75) \
+ x(trans_restart_split_race, 76) \
+ x(write_buffer_flush_slowpath, 77) \
+ x(write_buffer_flush_sync, 78)
+
+enum bch_persistent_counters {
+#define x(t, n, ...) BCH_COUNTER_##t,
+ BCH_PERSISTENT_COUNTERS()
+#undef x
+ BCH_COUNTER_NR
+};
+
+struct bch_sb_field_counters {
+ struct bch_sb_field field;
+ __le64 d[];
+};
+
+#endif /* _BCACHEFS_SB_COUNTERS_FORMAT_H */
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index 4919237bbe..441dcb1bf1 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -12,33 +12,105 @@
#include "sb-errors.h"
#include "super-io.h"
+#define RECOVERY_PASS_ALL_FSCK BIT_ULL(63)
+
/*
- * Downgrade table:
- * When dowgrading past certain versions, we need to run certain recovery passes
- * and fix certain errors:
+ * Upgrade, downgrade tables - run certain recovery passes, fix certain errors
*
* x(version, recovery_passes, errors...)
*/
+#define UPGRADE_TABLE() \
+ x(backpointers, \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(inode_v3, \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(unwritten_extents, \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(bucket_gens, \
+ BIT_ULL(BCH_RECOVERY_PASS_bucket_gens_init)| \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(lru_v2, \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(fragmentation_lru, \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(no_bps_in_alloc_keys, \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(snapshot_trees, \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(snapshot_skiplists, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_snapshots), \
+ BCH_FSCK_ERR_snapshot_bad_depth, \
+ BCH_FSCK_ERR_snapshot_bad_skiplist) \
+ x(deleted_inodes, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \
+ BCH_FSCK_ERR_unlinked_inode_not_on_deleted_list) \
+ x(rebalance_work, \
+ BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance))
#define DOWNGRADE_TABLE()
-struct downgrade_entry {
+struct upgrade_downgrade_entry {
u64 recovery_passes;
u16 version;
u16 nr_errors;
const u16 *errors;
};
-#define x(ver, passes, ...) static const u16 ver_##errors[] = { __VA_ARGS__ };
+#define x(ver, passes, ...) static const u16 upgrade_##ver##_errors[] = { __VA_ARGS__ };
+UPGRADE_TABLE()
+#undef x
+
+static const struct upgrade_downgrade_entry upgrade_table[] = {
+#define x(ver, passes, ...) { \
+ .recovery_passes = passes, \
+ .version = bcachefs_metadata_version_##ver,\
+ .nr_errors = ARRAY_SIZE(upgrade_##ver##_errors), \
+ .errors = upgrade_##ver##_errors, \
+},
+UPGRADE_TABLE()
+#undef x
+};
+
+void bch2_sb_set_upgrade(struct bch_fs *c,
+ unsigned old_version,
+ unsigned new_version)
+{
+ lockdep_assert_held(&c->sb_lock);
+
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+
+ for (const struct upgrade_downgrade_entry *i = upgrade_table;
+ i < upgrade_table + ARRAY_SIZE(upgrade_table);
+ i++)
+ if (i->version > old_version && i->version <= new_version) {
+ u64 passes = i->recovery_passes;
+
+ if (passes & RECOVERY_PASS_ALL_FSCK)
+ passes |= bch2_fsck_recovery_passes();
+ passes &= ~RECOVERY_PASS_ALL_FSCK;
+
+ ext->recovery_passes_required[0] |=
+ cpu_to_le64(bch2_recovery_passes_to_stable(passes));
+
+ for (const u16 *e = i->errors;
+ e < i->errors + i->nr_errors;
+ e++) {
+ __set_bit(*e, c->sb.errors_silent);
+ ext->errors_silent[*e / 64] |= cpu_to_le64(BIT_ULL(*e % 64));
+ }
+ }
+}
+
+#define x(ver, passes, ...) static const u16 downgrade_##ver##_errors[] = { __VA_ARGS__ };
DOWNGRADE_TABLE()
#undef x
-static const struct downgrade_entry downgrade_table[] = {
+static const struct upgrade_downgrade_entry downgrade_table[] = {
#define x(ver, passes, ...) { \
.recovery_passes = passes, \
.version = bcachefs_metadata_version_##ver,\
- .nr_errors = ARRAY_SIZE(ver_##errors), \
- .errors = ver_##errors, \
+ .nr_errors = ARRAY_SIZE(downgrade_##ver##_errors), \
+ .errors = downgrade_##ver##_errors, \
},
DOWNGRADE_TABLE()
#undef x
@@ -118,7 +190,7 @@ int bch2_sb_downgrade_update(struct bch_fs *c)
darray_char table = {};
int ret = 0;
- for (const struct downgrade_entry *src = downgrade_table;
+ for (const struct upgrade_downgrade_entry *src = downgrade_table;
src < downgrade_table + ARRAY_SIZE(downgrade_table);
src++) {
if (BCH_VERSION_MAJOR(src->version) != BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version)))
diff --git a/fs/bcachefs/sb-downgrade.h b/fs/bcachefs/sb-downgrade.h
index bc48fd2ca7..57e6c916fc 100644
--- a/fs/bcachefs/sb-downgrade.h
+++ b/fs/bcachefs/sb-downgrade.h
@@ -5,6 +5,7 @@
extern const struct bch_sb_field_ops bch_sb_field_ops_downgrade;
int bch2_sb_downgrade_update(struct bch_fs *);
+void bch2_sb_set_upgrade(struct bch_fs *, unsigned, unsigned);
void bch2_sb_set_downgrade(struct bch_fs *, unsigned, unsigned);
#endif /* _BCACHEFS_SB_DOWNGRADE_H */
diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h
index 3504c2d09c..c08aacdfd0 100644
--- a/fs/bcachefs/sb-errors_types.h
+++ b/fs/bcachefs/sb-errors_types.h
@@ -248,7 +248,9 @@
x(root_inode_not_dir, 240) \
x(dir_loop, 241) \
x(hash_table_key_duplicate, 242) \
- x(hash_table_key_wrong_offset, 243)
+ x(hash_table_key_wrong_offset, 243) \
+ x(unlinked_inode_not_on_deleted_list, 244) \
+ x(reflink_p_front_pad_bad, 245)
enum bch_sb_error_id {
#define x(t, n) BCH_FSCK_ERR_##t = n,
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
index bed0f857fe..eff5ce18c6 100644
--- a/fs/bcachefs/sb-members.c
+++ b/fs/bcachefs/sb-members.c
@@ -235,6 +235,11 @@ static void member_to_text(struct printbuf *out,
prt_printf(out, "(never)");
prt_newline(out);
+ prt_printf(out, "Last superblock write:");
+ prt_tab(out);
+ prt_u64(out, le64_to_cpu(m.seq));
+ prt_newline(out);
+
prt_printf(out, "State:");
prt_tab(out);
prt_printf(out, "%s",
@@ -246,7 +251,7 @@ static void member_to_text(struct printbuf *out,
prt_printf(out, "Data allowed:");
prt_tab(out);
if (BCH_MEMBER_DATA_ALLOWED(&m))
- prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m));
+ prt_bitflags(out, __bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m));
else
prt_printf(out, "(none)");
prt_newline(out);
@@ -254,11 +259,16 @@ static void member_to_text(struct printbuf *out,
prt_printf(out, "Has data:");
prt_tab(out);
if (data_have)
- prt_bitflags(out, bch2_data_types, data_have);
+ prt_bitflags(out, __bch2_data_types, data_have);
else
prt_printf(out, "(none)");
prt_newline(out);
+ prt_str(out, "Durability:");
+ prt_tab(out);
+ prt_printf(out, "%llu", BCH_MEMBER_DURABILITY(&m) ? BCH_MEMBER_DURABILITY(&m) - 1 : 1);
+ prt_newline(out);
+
prt_printf(out, "Discard:");
prt_tab(out);
prt_printf(out, "%llu", BCH_MEMBER_DISCARD(&m));
@@ -353,14 +363,12 @@ const struct bch_sb_field_ops bch_sb_field_ops_members_v2 = {
void bch2_sb_members_from_cpu(struct bch_fs *c)
{
struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
- struct bch_dev *ca;
- unsigned i, e;
rcu_read_lock();
- for_each_member_device_rcu(ca, c, i, NULL) {
- struct bch_member *m = __bch2_members_v2_get_mut(mi, i);
+ for_each_member_device_rcu(c, ca, NULL) {
+ struct bch_member *m = __bch2_members_v2_get_mut(mi, ca->dev_idx);
- for (e = 0; e < BCH_MEMBER_ERROR_NR; e++)
+ for (unsigned e = 0; e < BCH_MEMBER_ERROR_NR; e++)
m->errors[e] = cpu_to_le64(atomic64_read(&ca->errors[e]));
}
rcu_read_unlock();
@@ -413,7 +421,7 @@ void bch2_dev_errors_reset(struct bch_dev *ca)
m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
for (unsigned i = 0; i < ARRAY_SIZE(m->errors_at_reset); i++)
m->errors_at_reset[i] = cpu_to_le64(atomic64_read(&ca->errors[i]));
- m->errors_reset_time = ktime_get_real_seconds();
+ m->errors_reset_time = cpu_to_le64(ktime_get_real_seconds());
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h
index 03613e3eb8..be0a941832 100644
--- a/fs/bcachefs/sb-members.h
+++ b/fs/bcachefs/sb-members.h
@@ -2,6 +2,8 @@
#ifndef _BCACHEFS_SB_MEMBERS_H
#define _BCACHEFS_SB_MEMBERS_H
+#include "darray.h"
+
extern char * const bch2_member_error_strs[];
static inline struct bch_member *
@@ -47,23 +49,18 @@ static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs)
static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs,
unsigned dev)
{
- unsigned i;
-
- for (i = 0; i < devs.nr; i++)
- if (devs.devs[i] == dev)
+ darray_for_each(devs, i)
+ if (*i == dev)
return true;
-
return false;
}
static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,
unsigned dev)
{
- unsigned i;
-
- for (i = 0; i < devs->nr; i++)
- if (devs->devs[i] == dev) {
- array_remove_item(devs->devs, devs->nr, i);
+ darray_for_each(*devs, i)
+ if (*i == dev) {
+ darray_remove_item(devs, i);
return;
}
}
@@ -72,40 +69,48 @@ static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
unsigned dev)
{
if (!bch2_dev_list_has_dev(*devs, dev)) {
- BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs));
- devs->devs[devs->nr++] = dev;
+ BUG_ON(devs->nr >= ARRAY_SIZE(devs->data));
+ devs->data[devs->nr++] = dev;
}
}
static inline struct bch_devs_list bch2_dev_list_single(unsigned dev)
{
- return (struct bch_devs_list) { .nr = 1, .devs[0] = dev };
+ return (struct bch_devs_list) { .nr = 1, .data[0] = dev };
}
-static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
- const struct bch_devs_mask *mask)
+static inline struct bch_dev *__bch2_next_dev_idx(struct bch_fs *c, unsigned idx,
+ const struct bch_devs_mask *mask)
{
struct bch_dev *ca = NULL;
- while ((*iter = mask
- ? find_next_bit(mask->d, c->sb.nr_devices, *iter)
- : *iter) < c->sb.nr_devices &&
- !(ca = rcu_dereference_check(c->devs[*iter],
+ while ((idx = mask
+ ? find_next_bit(mask->d, c->sb.nr_devices, idx)
+ : idx) < c->sb.nr_devices &&
+ !(ca = rcu_dereference_check(c->devs[idx],
lockdep_is_held(&c->state_lock))))
- (*iter)++;
+ idx++;
return ca;
}
-#define for_each_member_device_rcu(ca, c, iter, mask) \
- for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++)
+static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, struct bch_dev *ca,
+ const struct bch_devs_mask *mask)
+{
+ return __bch2_next_dev_idx(c, ca ? ca->dev_idx + 1 : 0, mask);
+}
-static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter)
+#define for_each_member_device_rcu(_c, _ca, _mask) \
+ for (struct bch_dev *_ca = NULL; \
+ (_ca = __bch2_next_dev((_c), _ca, (_mask)));)
+
+static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca)
{
- struct bch_dev *ca;
+ if (ca)
+ percpu_ref_put(&ca->ref);
rcu_read_lock();
- if ((ca = __bch2_next_dev(c, iter, NULL)))
+ if ((ca = __bch2_next_dev(c, ca, NULL)))
percpu_ref_get(&ca->ref);
rcu_read_unlock();
@@ -115,41 +120,42 @@ static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter
/*
* If you break early, you must drop your ref on the current device
*/
-#define for_each_member_device(ca, c, iter) \
- for ((iter) = 0; \
- (ca = bch2_get_next_dev(c, &(iter))); \
- percpu_ref_put(&ca->ref), (iter)++)
+#define __for_each_member_device(_c, _ca) \
+ for (; (_ca = bch2_get_next_dev(_c, _ca));)
+
+#define for_each_member_device(_c, _ca) \
+ for (struct bch_dev *_ca = NULL; \
+ (_ca = bch2_get_next_dev(_c, _ca));)
static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
- unsigned *iter,
- int state_mask)
+ struct bch_dev *ca,
+ unsigned state_mask)
{
- struct bch_dev *ca;
+ if (ca)
+ percpu_ref_put(&ca->io_ref);
rcu_read_lock();
- while ((ca = __bch2_next_dev(c, iter, NULL)) &&
+ while ((ca = __bch2_next_dev(c, ca, NULL)) &&
(!((1 << ca->mi.state) & state_mask) ||
!percpu_ref_tryget(&ca->io_ref)))
- (*iter)++;
+ ;
rcu_read_unlock();
return ca;
}
-#define __for_each_online_member(ca, c, iter, state_mask) \
- for ((iter) = 0; \
- (ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \
- percpu_ref_put(&ca->io_ref), (iter)++)
+#define __for_each_online_member(_c, _ca, state_mask) \
+ for (struct bch_dev *_ca = NULL; \
+ (_ca = bch2_get_next_online_dev(_c, _ca, state_mask));)
-#define for_each_online_member(ca, c, iter) \
- __for_each_online_member(ca, c, iter, ~0)
+#define for_each_online_member(c, ca) \
+ __for_each_online_member(c, ca, ~0)
-#define for_each_rw_member(ca, c, iter) \
- __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw)
+#define for_each_rw_member(c, ca) \
+ __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw))
-#define for_each_readable_member(ca, c, iter) \
- __for_each_online_member(ca, c, iter, \
- (1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro))
+#define for_each_readable_member(c, ca) \
+ __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro))
/*
* If a key exists that references a device, the device won't be going away and
@@ -175,11 +181,9 @@ static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx)
static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
{
struct bch_devs_mask devs;
- struct bch_dev *ca;
- unsigned i;
memset(&devs, 0, sizeof(devs));
- for_each_online_member(ca, c, i)
+ for_each_online_member(c, ca)
__set_bit(ca->dev_idx, devs.d);
return devs;
}
diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c
index 97790445e6..3a494c5d12 100644
--- a/fs/bcachefs/six.c
+++ b/fs/bcachefs/six.c
@@ -324,101 +324,57 @@ bool six_relock_ip(struct six_lock *lock, enum six_lock_type type,
}
EXPORT_SYMBOL_GPL(six_relock_ip);
-#ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER
+#ifdef CONFIG_BCACHEFS_SIX_OPTIMISTIC_SPIN
-static inline bool six_can_spin_on_owner(struct six_lock *lock)
+static inline bool six_owner_running(struct six_lock *lock)
{
- struct task_struct *owner;
- bool ret;
-
- if (need_resched())
- return false;
-
+ /*
+ * When there's no owner, we might have preempted between the owner
+ * acquiring the lock and setting the owner field. If we're an RT task
+ * that will live-lock because we won't let the owner complete.
+ */
rcu_read_lock();
- owner = READ_ONCE(lock->owner);
- ret = !owner || owner_on_cpu(owner);
+ struct task_struct *owner = READ_ONCE(lock->owner);
+ bool ret = owner ? owner_on_cpu(owner) : !rt_task(current);
rcu_read_unlock();
return ret;
}
-static inline bool six_spin_on_owner(struct six_lock *lock,
- struct task_struct *owner,
- u64 end_time)
+static inline bool six_optimistic_spin(struct six_lock *lock,
+ struct six_lock_waiter *wait,
+ enum six_lock_type type)
{
- bool ret = true;
unsigned loop = 0;
-
- rcu_read_lock();
- while (lock->owner == owner) {
- /*
- * Ensure we emit the owner->on_cpu, dereference _after_
- * checking lock->owner still matches owner. If that fails,
- * owner might point to freed memory. If it still matches,
- * the rcu_read_lock() ensures the memory stays valid.
- */
- barrier();
-
- if (!owner_on_cpu(owner) || need_resched()) {
- ret = false;
- break;
- }
-
- if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) {
- six_set_bitmask(lock, SIX_LOCK_NOSPIN);
- ret = false;
- break;
- }
-
- cpu_relax();
- }
- rcu_read_unlock();
-
- return ret;
-}
-
-static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
-{
- struct task_struct *task = current;
u64 end_time;
if (type == SIX_LOCK_write)
return false;
- preempt_disable();
- if (!six_can_spin_on_owner(lock))
- goto fail;
+ if (lock->wait_list.next != &wait->list)
+ return false;
- if (!osq_lock(&lock->osq))
- goto fail;
+ if (atomic_read(&lock->state) & SIX_LOCK_NOSPIN)
+ return false;
+ preempt_disable();
end_time = sched_clock() + 10 * NSEC_PER_USEC;
- while (1) {
- struct task_struct *owner;
-
+ while (!need_resched() && six_owner_running(lock)) {
/*
- * If there's an owner, wait for it to either
- * release the lock or go to sleep.
+ * Ensures that writes to the waitlist entry happen after we see
+ * wait->lock_acquired: pairs with the smp_store_release in
+ * __six_lock_wakeup
*/
- owner = READ_ONCE(lock->owner);
- if (owner && !six_spin_on_owner(lock, owner, end_time))
- break;
-
- if (do_six_trylock(lock, type, false)) {
- osq_unlock(&lock->osq);
+ if (smp_load_acquire(&wait->lock_acquired)) {
preempt_enable();
return true;
}
- /*
- * When there's no owner, we might have preempted between the
- * owner acquiring the lock and setting the owner field. If
- * we're an RT task that will live-lock because we won't let
- * the owner complete.
- */
- if (!owner && (need_resched() || rt_task(task)))
+ if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) {
+ six_set_bitmask(lock, SIX_LOCK_NOSPIN);
break;
+ }
/*
* The cpu_relax() call is a compiler barrier which forces
@@ -429,24 +385,15 @@ static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type
cpu_relax();
}
- osq_unlock(&lock->osq);
-fail:
preempt_enable();
-
- /*
- * If we fell out of the spin path because of need_resched(),
- * reschedule now, before we try-lock again. This avoids getting
- * scheduled out right after we obtained the lock.
- */
- if (need_resched())
- schedule();
-
return false;
}
-#else /* CONFIG_SIX_LOCK_SPIN_ON_OWNER */
+#else /* CONFIG_BCACHEFS_SIX_OPTIMISTIC_SPIN */
-static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
+static inline bool six_optimistic_spin(struct six_lock *lock,
+ struct six_lock_waiter *wait,
+ enum six_lock_type type)
{
return false;
}
@@ -470,9 +417,6 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type,
trace_contention_begin(lock, 0);
lock_contended(&lock->dep_map, ip);
- if (six_optimistic_spin(lock, type))
- goto out;
-
wait->task = current;
wait->lock_want = type;
wait->lock_acquired = false;
@@ -510,6 +454,9 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type,
ret = 0;
}
+ if (six_optimistic_spin(lock, wait, type))
+ goto out;
+
while (1) {
set_current_state(TASK_UNINTERRUPTIBLE);
diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h
index 4c268b0b83..68d46fd7f3 100644
--- a/fs/bcachefs/six.h
+++ b/fs/bcachefs/six.h
@@ -15,7 +15,7 @@
* will have to take write locks for the full duration of the operation.
*
* But by adding an intent state, which is exclusive with other intent locks but
- * not with readers, we can take intent locks at thte start of the operation,
+ * not with readers, we can take intent locks at the start of the operation,
* and then take write locks only for the actual update to each individual
* nodes, without deadlocking.
*
@@ -65,8 +65,8 @@
*
* Reentrancy:
*
- * Six locks are not by themselves reentrent, but have counters for both the
- * read and intent states that can be used to provide reentrency by an upper
+ * Six locks are not by themselves reentrant, but have counters for both the
+ * read and intent states that can be used to provide reentrancy by an upper
* layer that tracks held locks. If a lock is known to already be held in the
* read or intent state, six_lock_increment() can be used to bump the "lock
* held in this state" counter, increasing the number of unlock calls that
@@ -127,10 +127,6 @@
#include <linux/sched.h>
#include <linux/types.h>
-#ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER
-#include <linux/osq_lock.h>
-#endif
-
enum six_lock_type {
SIX_LOCK_read,
SIX_LOCK_intent,
@@ -143,9 +139,6 @@ struct six_lock {
unsigned intent_lock_recurse;
struct task_struct *owner;
unsigned __percpu *readers;
-#ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER
- struct optimistic_spin_queue osq;
-#endif
raw_spinlock_t wait_lock;
struct list_head wait_list;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index bf5d6f4e9a..ac6ba04d55 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -123,7 +123,7 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
struct snapshot_table *t;
bool ret;
- EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots);
+ EBUG_ON(c->recovery_pass_done <= BCH_RECOVERY_PASS_check_snapshots);
rcu_read_lock();
t = rcu_dereference(c->snapshots);
@@ -276,7 +276,7 @@ static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
mutex_unlock(&c->snapshot_table_lock);
}
-int bch2_mark_snapshot(struct btree_trans *trans,
+static int __bch2_mark_snapshot(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
@@ -318,7 +318,7 @@ int bch2_mark_snapshot(struct btree_trans *trans,
__set_is_ancestor_bitmap(c, id);
if (BCH_SNAPSHOT_DELETED(s.v)) {
- set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags);
+ set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots)
bch2_delete_dead_snapshots_async(c);
}
@@ -330,6 +330,14 @@ err:
return ret;
}
+int bch2_mark_snapshot(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c old, struct bkey_s new,
+ unsigned flags)
+{
+ return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags);
+}
+
int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
struct bch_snapshot *s)
{
@@ -459,7 +467,6 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
- struct bkey_s_c_subvolume s;
bool found = false;
int ret;
@@ -468,7 +475,7 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
if (k.k->type != KEY_TYPE_subvolume)
continue;
- s = bkey_s_c_to_subvolume(k);
+ struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
continue;
if (!BCH_SUBVOLUME_SNAP(s.v)) {
@@ -582,19 +589,13 @@ fsck_err:
*/
int bch2_check_snapshot_trees(struct bch_fs *c)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- ret = bch2_trans_run(c,
+ int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
BTREE_ID_snapshot_trees, POS_MIN,
BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_snapshot_tree(trans, &iter, k)));
-
- if (ret)
- bch_err(c, "error %i checking snapshot trees", ret);
+ bch_err_fn(c, ret);
return ret;
}
@@ -727,7 +728,7 @@ static int check_snapshot(struct btree_trans *trans,
return 0;
memset(&s, 0, sizeof(s));
- memcpy(&s, k.v, bkey_val_bytes(k.k));
+ memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k)));
id = le32_to_cpu(s.parent);
if (id) {
@@ -813,11 +814,10 @@ static int check_snapshot(struct btree_trans *trans,
real_depth = bch2_snapshot_depth(c, parent_id);
- if (le32_to_cpu(s.depth) != real_depth &&
- (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists ||
- fsck_err(c, snapshot_bad_depth,
- "snapshot with incorrect depth field, should be %u:\n %s",
- real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) {
+ if (fsck_err_on(le32_to_cpu(s.depth) != real_depth,
+ c, snapshot_bad_depth,
+ "snapshot with incorrect depth field, should be %u:\n %s",
+ real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
ret = PTR_ERR_OR_ZERO(u);
if (ret)
@@ -831,11 +831,9 @@ static int check_snapshot(struct btree_trans *trans,
if (ret < 0)
goto err;
- if (!ret &&
- (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists ||
- fsck_err(c, snapshot_bad_skiplist,
- "snapshot with bad skiplist field:\n %s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) {
+ if (fsck_err_on(!ret, c, snapshot_bad_skiplist,
+ "snapshot with bad skiplist field:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
ret = PTR_ERR_OR_ZERO(u);
if (ret)
@@ -856,22 +854,17 @@ fsck_err:
int bch2_check_snapshots(struct bch_fs *c)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
/*
* We iterate backwards as checking/fixing the depth field requires that
* the parent's depth already be correct:
*/
- ret = bch2_trans_run(c,
+ int ret = bch2_trans_run(c,
for_each_btree_key_reverse_commit(trans, iter,
- BTREE_ID_snapshots, POS_MAX,
- BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
- check_snapshot(trans, &iter, k)));
- if (ret)
- bch_err_fn(c, ret);
+ BTREE_ID_snapshots, POS_MAX,
+ BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_snapshot(trans, &iter, k)));
+ bch_err_fn(c, ret);
return ret;
}
@@ -1060,6 +1053,8 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
n->v.subvol = cpu_to_le32(snapshot_subvols[i]);
n->v.tree = cpu_to_le32(tree);
n->v.depth = cpu_to_le32(depth);
+ n->v.btime.lo = cpu_to_le64(bch2_current_time(c));
+ n->v.btime.hi = 0;
for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));
@@ -1067,7 +1062,7 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32);
SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);
- ret = bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
+ ret = __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
if (ret)
goto err;
@@ -1315,7 +1310,6 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
struct bch_fs *c = trans->c;
u32 nr_deleted_ancestors = 0;
struct bkey_i_snapshot *s;
- u32 *i;
int ret;
if (k.k->type != KEY_TYPE_snapshot)
@@ -1368,23 +1362,19 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
int bch2_delete_dead_snapshots(struct bch_fs *c)
{
struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_s_c_snapshot snap;
snapshot_id_list deleted = { 0 };
snapshot_id_list deleted_interior = { 0 };
- u32 *i, id;
+ u32 id;
int ret = 0;
- if (!test_and_clear_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags))
+ if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
return 0;
- if (!test_bit(BCH_FS_STARTED, &c->flags)) {
+ if (!test_bit(BCH_FS_started, &c->flags)) {
ret = bch2_fs_read_write_early(c);
- if (ret) {
- bch_err_msg(c, ret, "deleting dead snapshots: error going rw");
+ bch_err_msg(c, ret, "deleting dead snapshots: error going rw");
+ if (ret)
return ret;
- }
}
trans = bch2_trans_get(c);
@@ -1397,37 +1387,29 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
POS_MIN, 0, k,
NULL, NULL, 0,
bch2_delete_redundant_snapshot(trans, k));
- if (ret) {
- bch_err_msg(c, ret, "deleting redundant snapshots");
+ bch_err_msg(c, ret, "deleting redundant snapshots");
+ if (ret)
goto err;
- }
- ret = for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k,
+ ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k,
bch2_snapshot_set_equiv(trans, k));
- if (ret) {
- bch_err_msg(c, ret, "in bch2_snapshots_set_equiv");
+ bch_err_msg(c, ret, "in bch2_snapshots_set_equiv");
+ if (ret)
goto err;
- }
- for_each_btree_key(trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k, ret) {
+ ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k, ({
if (k.k->type != KEY_TYPE_snapshot)
continue;
- snap = bkey_s_c_to_snapshot(k);
- if (BCH_SNAPSHOT_DELETED(snap.v)) {
- ret = snapshot_list_add(c, &deleted, k.k->p.offset);
- if (ret)
- break;
- }
- }
- bch2_trans_iter_exit(trans, &iter);
-
- if (ret) {
- bch_err_msg(c, ret, "walking snapshots");
+ BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v)
+ ? snapshot_list_add(c, &deleted, k.k->p.offset)
+ : 0;
+ }));
+ bch_err_msg(c, ret, "walking snapshots");
+ if (ret)
goto err;
- }
for (id = 0; id < BTREE_ID_NR; id++) {
struct bpos last_pos = POS_MIN;
@@ -1449,36 +1431,36 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
ret = for_each_btree_key_commit(trans, iter,
id, POS_MIN,
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
- &res, NULL, BTREE_INSERT_NOFAIL,
+ &res, NULL, BCH_TRANS_COMMIT_no_enospc,
snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?:
for_each_btree_key_commit(trans, iter,
id, POS_MIN,
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
- &res, NULL, BTREE_INSERT_NOFAIL,
+ &res, NULL, BCH_TRANS_COMMIT_no_enospc,
move_key_to_correct_snapshot(trans, &iter, k));
bch2_disk_reservation_put(c, &res);
darray_exit(&equiv_seen);
- if (ret) {
- bch_err_msg(c, ret, "deleting keys from dying snapshots");
+ bch_err_msg(c, ret, "deleting keys from dying snapshots");
+ if (ret)
goto err;
- }
}
bch2_trans_unlock(trans);
down_write(&c->snapshot_create_lock);
- for_each_btree_key(trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k, ret) {
+ ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k, ({
u32 snapshot = k.k->p.offset;
u32 equiv = bch2_snapshot_equiv(c, snapshot);
- if (equiv != snapshot)
- snapshot_list_add(c, &deleted_interior, snapshot);
- }
- bch2_trans_iter_exit(trans, &iter);
+ equiv != snapshot
+ ? snapshot_list_add(c, &deleted_interior, snapshot)
+ : 0;
+ }));
+ bch_err_msg(c, ret, "walking snapshots");
if (ret)
goto err_create_lock;
@@ -1489,7 +1471,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
*/
ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
BTREE_ITER_INTENT, k,
- NULL, NULL, BTREE_INSERT_NOFAIL,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior));
if (ret)
goto err_create_lock;
@@ -1497,19 +1479,17 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
darray_for_each(deleted, i) {
ret = commit_do(trans, NULL, NULL, 0,
bch2_snapshot_node_delete(trans, *i));
- if (ret) {
- bch_err_msg(c, ret, "deleting snapshot %u", *i);
+ bch_err_msg(c, ret, "deleting snapshot %u", *i);
+ if (ret)
goto err_create_lock;
- }
}
darray_for_each(deleted_interior, i) {
ret = commit_do(trans, NULL, NULL, 0,
bch2_snapshot_node_delete(trans, *i));
- if (ret) {
- bch_err_msg(c, ret, "deleting snapshot %u", *i);
+ bch_err_msg(c, ret, "deleting snapshot %u", *i);
+ if (ret)
goto err_create_lock;
- }
}
err_create_lock:
up_write(&c->snapshot_create_lock);
@@ -1517,8 +1497,7 @@ err:
darray_exit(&deleted_interior);
darray_exit(&deleted);
bch2_trans_put(trans);
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
@@ -1680,7 +1659,7 @@ static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct
if (BCH_SNAPSHOT_DELETED(snap.v) ||
bch2_snapshot_equiv(c, k.k->p.offset) != k.k->p.offset ||
(ret = bch2_snapshot_needs_delete(trans, k)) > 0) {
- set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags);
+ set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
return 0;
}
@@ -1689,21 +1668,16 @@ static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct
int bch2_snapshots_read(struct bch_fs *c)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- ret = bch2_trans_run(c,
- for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k,
- bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
+ int ret = bch2_trans_run(c,
+ for_each_btree_key(trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k,
+ __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
bch2_snapshot_set_equiv(trans, k) ?:
bch2_check_snapshot_needs_deletion(trans, k)) ?:
- for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k,
+ for_each_btree_key(trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k,
(set_is_ancestor_bitmap(c, k.k->p.offset), 0)));
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h
index f09a22f442..7c66ffc063 100644
--- a/fs/bcachefs/snapshot.h
+++ b/fs/bcachefs/snapshot.h
@@ -22,12 +22,12 @@ void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
int bch2_snapshot_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s_c, unsigned);
+ struct bkey_s_c, struct bkey_s, unsigned);
#define bch2_bkey_ops_snapshot ((struct bkey_ops) { \
.key_invalid = bch2_snapshot_invalid, \
.val_to_text = bch2_snapshot_to_text, \
- .atomic_trigger = bch2_mark_snapshot, \
+ .trigger = bch2_mark_snapshot, \
.min_val_size = 24, \
})
@@ -202,8 +202,6 @@ static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id)
static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id)
{
- u32 *i;
-
darray_for_each(*s, i)
if (*i == id)
return true;
@@ -212,8 +210,6 @@ static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id)
static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id)
{
- u32 *i;
-
darray_for_each(*s, i)
if (bch2_snapshot_is_ancestor(c, id, *i))
return true;
diff --git a/fs/bcachefs/snapshot_format.h b/fs/bcachefs/snapshot_format.h
new file mode 100644
index 0000000000..aabcd3a74c
--- /dev/null
+++ b/fs/bcachefs/snapshot_format.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SNAPSHOT_FORMAT_H
+#define _BCACHEFS_SNAPSHOT_FORMAT_H
+
+struct bch_snapshot {
+ struct bch_val v;
+ __le32 flags;
+ __le32 parent;
+ __le32 children[2];
+ __le32 subvol;
+ /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */
+ __le32 tree;
+ __le32 depth;
+ __le32 skip[3];
+ bch_le128 btime;
+};
+
+LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1)
+
+/* True if a subvolume points to this snapshot node: */
+LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2)
+
+/*
+ * Snapshot trees:
+ *
+ * The snapshot_trees btree gives us a persistent identifier for each tree of
+ * bch_snapshot nodes, and allows us to record and easily find the root/master
+ * subvolume that other snapshots were created from:
+ */
+struct bch_snapshot_tree {
+ struct bch_val v;
+ __le32 master_subvol;
+ __le32 root_snapshot;
+};
+
+#endif /* _BCACHEFS_SNAPSHOT_FORMAT_H */
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index ae21a8cca1..fcaa5a8887 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -15,6 +15,16 @@
#include <crypto/hash.h>
#include <crypto/sha2.h>
+typedef unsigned __bitwise bch_str_hash_flags_t;
+
+enum bch_str_hash_flags {
+ __BCH_HASH_SET_MUST_CREATE,
+ __BCH_HASH_SET_MUST_REPLACE,
+};
+
+#define BCH_HASH_SET_MUST_CREATE (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_CREATE)
+#define BCH_HASH_SET_MUST_REPLACE (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_REPLACE)
+
static inline enum bch_str_hash_type
bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
{
@@ -150,21 +160,16 @@ static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, s
}
static __always_inline int
-bch2_hash_lookup(struct btree_trans *trans,
+bch2_hash_lookup_in_snapshot(struct btree_trans *trans,
struct btree_iter *iter,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
subvol_inum inum, const void *key,
- unsigned flags)
+ unsigned flags, u32 snapshot)
{
struct bkey_s_c k;
- u32 snapshot;
int ret;
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- return ret;
-
for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
SPOS(inum.inum, desc.hash_key(info, key), snapshot),
POS(inum.inum, U64_MAX),
@@ -185,6 +190,19 @@ bch2_hash_lookup(struct btree_trans *trans,
}
static __always_inline int
+bch2_hash_lookup(struct btree_trans *trans,
+ struct btree_iter *iter,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ subvol_inum inum, const void *key,
+ unsigned flags)
+{
+ u32 snapshot;
+ return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
+ bch2_hash_lookup_in_snapshot(trans, iter, desc, info, inum, key, flags, snapshot);
+}
+
+static __always_inline int
bch2_hash_hole(struct btree_trans *trans,
struct btree_iter *iter,
const struct bch_hash_desc desc,
@@ -246,7 +264,7 @@ int bch2_hash_set_snapshot(struct btree_trans *trans,
const struct bch_hash_info *info,
subvol_inum inum, u32 snapshot,
struct bkey_i *insert,
- int flags,
+ bch_str_hash_flags_t str_hash_flags,
int update_flags)
{
struct btree_iter iter, slot = { NULL };
@@ -269,7 +287,7 @@ int bch2_hash_set_snapshot(struct btree_trans *trans,
}
if (!slot.path &&
- !(flags & BCH_HASH_SET_MUST_REPLACE))
+ !(str_hash_flags & BCH_HASH_SET_MUST_REPLACE))
bch2_trans_copy_iter(&slot, &iter);
if (k.k->type != KEY_TYPE_hash_whiteout)
@@ -287,16 +305,16 @@ found:
found = true;
not_found:
- if (!found && (flags & BCH_HASH_SET_MUST_REPLACE)) {
+ if (!found && (str_hash_flags & BCH_HASH_SET_MUST_REPLACE)) {
ret = -BCH_ERR_ENOENT_str_hash_set_must_replace;
- } else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) {
+ } else if (found && (str_hash_flags & BCH_HASH_SET_MUST_CREATE)) {
ret = -EEXIST;
} else {
if (!found && slot.path)
swap(iter, slot);
insert->k.p = iter.pos;
- ret = bch2_trans_update(trans, &iter, insert, 0);
+ ret = bch2_trans_update(trans, &iter, insert, update_flags);
}
goto out;
@@ -307,7 +325,8 @@ int bch2_hash_set(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
subvol_inum inum,
- struct bkey_i *insert, int flags)
+ struct bkey_i *insert,
+ bch_str_hash_flags_t str_hash_flags)
{
u32 snapshot;
int ret;
@@ -319,7 +338,7 @@ int bch2_hash_set(struct btree_trans *trans,
insert->k.p.inode = inum.inum;
return bch2_hash_set_snapshot(trans, desc, info, inum,
- snapshot, insert, flags, 0);
+ snapshot, insert, str_hash_flags, 0);
}
static __always_inline
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
index 22b34a8e4d..7c67c28d3e 100644
--- a/fs/bcachefs/subvolume.c
+++ b/fs/bcachefs/subvolume.c
@@ -37,11 +37,8 @@ static int check_subvol(struct btree_trans *trans,
return ret;
if (BCH_SUBVOLUME_UNLINKED(subvol.v)) {
- bch2_fs_lazy_rw(c);
-
ret = bch2_subvolume_delete(trans, iter->pos.offset);
- if (ret)
- bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
+ bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
return ret ?: -BCH_ERR_transaction_restart_nested;
}
@@ -82,17 +79,12 @@ fsck_err:
int bch2_check_subvols(struct bch_fs *c)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- ret = bch2_trans_run(c,
+ int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
- BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
- check_subvol(trans, &iter, k)));
- if (ret)
- bch_err_fn(c, ret);
+ BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_subvol(trans, &iter, k)));
+ bch_err_fn(c, ret);
return ret;
}
@@ -228,8 +220,6 @@ static int bch2_subvolume_reparent(struct btree_trans *trans,
*/
static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_delete)
{
- struct btree_iter iter;
- struct bkey_s_c k;
struct bch_subvolume s;
return lockrestart_do(trans,
@@ -237,7 +227,7 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d
BTREE_ITER_CACHED, &s)) ?:
for_each_btree_key_commit(trans, iter,
BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_NOFAIL,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_subvolume_reparent(trans, &iter, k,
subvolid_to_delete, le32_to_cpu(s.parent)));
}
@@ -274,7 +264,7 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
{
return bch2_subvolumes_reparent(trans, subvolid) ?:
- commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
__bch2_subvolume_delete(trans, subvolid));
}
@@ -299,10 +289,9 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor
for (id = s.data; id < s.data + s.nr; id++) {
ret = bch2_trans_run(c, bch2_subvolume_delete(trans, *id));
- if (ret) {
- bch_err_msg(c, ret, "deleting subvolume %u", *id);
+ bch_err_msg(c, ret, "deleting subvolume %u", *id);
+ if (ret)
break;
- }
}
darray_exit(&s);
diff --git a/fs/bcachefs/subvolume_format.h b/fs/bcachefs/subvolume_format.h
new file mode 100644
index 0000000000..af79134b07
--- /dev/null
+++ b/fs/bcachefs/subvolume_format.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUBVOLUME_FORMAT_H
+#define _BCACHEFS_SUBVOLUME_FORMAT_H
+
+#define SUBVOL_POS_MIN POS(0, 1)
+#define SUBVOL_POS_MAX POS(0, S32_MAX)
+#define BCACHEFS_ROOT_SUBVOL 1
+
+struct bch_subvolume {
+ struct bch_val v;
+ __le32 flags;
+ __le32 snapshot;
+ __le64 inode;
+ /*
+ * Snapshot subvolumes form a tree, separate from the snapshot nodes
+ * tree - if this subvolume is a snapshot, this is the ID of the
+ * subvolume it was created from:
+ *
+ * This is _not_ necessarily the subvolume of the directory containing
+ * this subvolume:
+ */
+ __le32 parent;
+ __le32 pad;
+ bch_le128 otime;
+};
+
+LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1)
+/*
+ * We need to know whether a subvolume is a snapshot so we can know whether we
+ * can delete it (or whether it should just be rm -rf'd)
+ */
+LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2)
+LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3)
+
+#endif /* _BCACHEFS_SUBVOLUME_FORMAT_H */
diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h
index 2d2e66a4e4..ae644adfc3 100644
--- a/fs/bcachefs/subvolume_types.h
+++ b/fs/bcachefs/subvolume_types.h
@@ -20,7 +20,11 @@ struct snapshot_t {
};
struct snapshot_table {
+#ifndef RUST_BINDGEN
DECLARE_FLEX_ARRAY(struct snapshot_t, s);
+#else
+ struct snapshot_t s[0];
+#endif
};
typedef struct {
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index c925dc5742..36988add58 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -2,7 +2,6 @@
#include "bcachefs.h"
#include "checksum.h"
-#include "counters.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
@@ -13,6 +12,7 @@
#include "replicas.h"
#include "quota.h"
#include "sb-clean.h"
+#include "sb-counters.h"
#include "sb-downgrade.h"
#include "sb-errors.h"
#include "sb-members.h"
@@ -30,14 +30,12 @@ static const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
struct bch2_metadata_version {
u16 version;
const char *name;
- u64 recovery_passes;
};
static const struct bch2_metadata_version bch2_metadata_versions[] = {
-#define x(n, v, _recovery_passes) { \
+#define x(n, v) { \
.version = v, \
.name = #n, \
- .recovery_passes = _recovery_passes, \
},
BCH_METADATA_VERSIONS()
#undef x
@@ -70,24 +68,6 @@ unsigned bch2_latest_compatible_version(unsigned v)
return v;
}
-u64 bch2_upgrade_recovery_passes(struct bch_fs *c,
- unsigned old_version,
- unsigned new_version)
-{
- u64 ret = 0;
-
- for (const struct bch2_metadata_version *i = bch2_metadata_versions;
- i < bch2_metadata_versions + ARRAY_SIZE(bch2_metadata_versions);
- i++)
- if (i->version > old_version && i->version <= new_version) {
- if (i->recovery_passes & RECOVERY_PASS_ALL_FSCK)
- ret |= bch2_fsck_recovery_passes();
- ret |= i->recovery_passes;
- }
-
- return ret &= ~RECOVERY_PASS_ALL_FSCK;
-}
-
const char * const bch2_sb_fields[] = {
#define x(name, nr) #name,
BCH_SB_FIELDS()
@@ -101,8 +81,6 @@ static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *,
struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *sb,
enum bch_sb_field_type type)
{
- struct bch_sb_field *f;
-
/* XXX: need locking around superblock to access optional fields */
vstruct_for_each(sb, f)
@@ -164,8 +142,8 @@ void bch2_sb_field_delete(struct bch_sb_handle *sb,
void bch2_free_super(struct bch_sb_handle *sb)
{
kfree(sb->bio);
- if (!IS_ERR_OR_NULL(sb->bdev))
- blkdev_put(sb->bdev, sb->holder);
+ if (!IS_ERR_OR_NULL(sb->bdev_handle))
+ bdev_release(sb->bdev_handle);
kfree(sb->holder);
kfree(sb->sb_name);
@@ -192,8 +170,12 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;
if (new_bytes > max_bytes) {
- pr_err("%pg: superblock too big: want %zu but have %llu",
- sb->bdev, new_bytes, max_bytes);
+ struct printbuf buf = PRINTBUF;
+
+ prt_bdevname(&buf, sb->bdev);
+ prt_printf(&buf, ": superblock too big: want %zu but have %llu", new_bytes, max_bytes);
+ pr_err("%s", buf.buf);
+ printbuf_exit(&buf);
return -BCH_ERR_ENOSPC_sb;
}
}
@@ -241,14 +223,12 @@ struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb,
if (sb->fs_sb) {
struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb);
- struct bch_dev *ca;
- unsigned i;
lockdep_assert_held(&c->sb_lock);
/* XXX: we're not checking that offline device have enough space */
- for_each_online_member(ca, c, i) {
+ for_each_online_member(c, ca) {
struct bch_sb_handle *dev_sb = &ca->disk_sb;
if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) {
@@ -368,7 +348,6 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
int rw)
{
struct bch_sb *sb = disk_sb->sb;
- struct bch_sb_field *f;
struct bch_sb_field_members_v1 *mi;
enum bch_opt_id opt_id;
u16 block_size;
@@ -514,8 +493,6 @@ static void le_bitvector_to_cpu(unsigned long *dst, unsigned long *src, unsigned
static void bch2_sb_update(struct bch_fs *c)
{
struct bch_sb *src = c->disk_sb.sb;
- struct bch_dev *ca;
- unsigned i;
lockdep_assert_held(&c->sb_lock);
@@ -546,7 +523,7 @@ static void bch2_sb_update(struct bch_fs *c)
le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent,
sizeof(c->sb.errors_silent) * 8);
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
struct bch_member m = bch2_sb_member_get(src, ca->dev_idx);
ca->mi = bch2_mi_to_cpu(&m);
}
@@ -571,6 +548,7 @@ static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src)
dst->time_base_lo = src->time_base_lo;
dst->time_base_hi = src->time_base_hi;
dst->time_precision = src->time_precision;
+ dst->write_time = src->write_time;
memcpy(dst->flags, src->flags, sizeof(dst->flags));
memcpy(dst->features, src->features, sizeof(dst->features));
@@ -634,7 +612,6 @@ int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca)
static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err)
{
- struct bch_csum csum;
size_t bytes;
int ret;
reread:
@@ -650,7 +627,9 @@ reread:
if (!uuid_equal(&sb->sb->magic, &BCACHE_MAGIC) &&
!uuid_equal(&sb->sb->magic, &BCHFS_MAGIC)) {
- prt_printf(err, "Not a bcachefs superblock");
+ prt_str(err, "Not a bcachefs superblock (got magic ");
+ pr_uuid(err, sb->sb->magic.b);
+ prt_str(err, ")");
return -BCH_ERR_invalid_sb_magic;
}
@@ -673,17 +652,16 @@ reread:
goto reread;
}
- if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) {
+ enum bch_csum_type csum_type = BCH_SB_CSUM_TYPE(sb->sb);
+ if (csum_type >= BCH_CSUM_NR) {
prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb));
return -BCH_ERR_invalid_sb_csum_type;
}
/* XXX: verify MACs */
- csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb),
- null_nonce(), sb->sb);
-
+ struct bch_csum csum = csum_vstruct(NULL, csum_type, null_nonce(), sb->sb);
if (bch2_crc_cmp(csum, sb->sb->csum)) {
- prt_printf(err, "bad checksum");
+ bch2_csum_err_msg(err, csum_type, sb->sb->csum, csum);
return -BCH_ERR_invalid_sb_csum;
}
@@ -692,12 +670,13 @@ reread:
return 0;
}
-int bch2_read_super(const char *path, struct bch_opts *opts,
- struct bch_sb_handle *sb)
+static int __bch2_read_super(const char *path, struct bch_opts *opts,
+ struct bch_sb_handle *sb, bool ignore_notbchfs_msg)
{
u64 offset = opt_get(*opts, sb);
struct bch_sb_layout layout;
struct printbuf err = PRINTBUF;
+ struct printbuf err2 = PRINTBUF;
__le64 *i;
int ret;
#ifndef __KERNEL__
@@ -725,21 +704,22 @@ retry:
if (!opt_get(*opts, nochanges))
sb->mode |= BLK_OPEN_WRITE;
- sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
- if (IS_ERR(sb->bdev) &&
- PTR_ERR(sb->bdev) == -EACCES &&
+ sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
+ if (IS_ERR(sb->bdev_handle) &&
+ PTR_ERR(sb->bdev_handle) == -EACCES &&
opt_get(*opts, read_only)) {
sb->mode &= ~BLK_OPEN_WRITE;
- sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
- if (!IS_ERR(sb->bdev))
+ sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
+ if (!IS_ERR(sb->bdev_handle))
opt_set(*opts, nochanges, true);
}
- if (IS_ERR(sb->bdev)) {
- ret = PTR_ERR(sb->bdev);
- goto out;
+ if (IS_ERR(sb->bdev_handle)) {
+ ret = PTR_ERR(sb->bdev_handle);
+ goto err;
}
+ sb->bdev = sb->bdev_handle->bdev;
ret = bch2_sb_realloc(sb, 0);
if (ret) {
@@ -760,8 +740,14 @@ retry:
if (opt_defined(*opts, sb))
goto err;
- printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s\n",
+ prt_printf(&err2, "bcachefs (%s): error reading default superblock: %s\n",
path, err.buf);
+ if (ret == -BCH_ERR_invalid_sb_magic && ignore_notbchfs_msg)
+ printk(KERN_INFO "%s", err2.buf);
+ else
+ printk(KERN_ERR "%s", err2.buf);
+
+ printbuf_exit(&err2);
printbuf_reset(&err);
/*
@@ -837,6 +823,20 @@ err_no_print:
goto out;
}
+int bch2_read_super(const char *path, struct bch_opts *opts,
+ struct bch_sb_handle *sb)
+{
+ return __bch2_read_super(path, opts, sb, false);
+}
+
+/* provide a silenced version for mount.bcachefs */
+
+int bch2_read_super_silent(const char *path, struct bch_opts *opts,
+ struct bch_sb_handle *sb)
+{
+ return __bch2_read_super(path, opts, sb, true);
+}
+
/* write superblock: */
static void write_super_endio(struct bio *bio)
@@ -905,9 +905,8 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
int bch2_write_super(struct bch_fs *c)
{
struct closure *cl = &c->sb_write;
- struct bch_dev *ca;
struct printbuf err = PRINTBUF;
- unsigned i, sb = 0, nr_wrote;
+ unsigned sb = 0, nr_wrote;
struct bch_devs_mask sb_written;
bool wrote, can_mount_without_written, can_mount_with_written;
unsigned degraded_flags = BCH_FORCE_IF_DEGRADED;
@@ -929,9 +928,14 @@ int bch2_write_super(struct bch_fs *c)
le64_add_cpu(&c->disk_sb.sb->seq, 1);
- if (test_bit(BCH_FS_ERROR, &c->flags))
+ struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+ for_each_online_member(c, ca)
+ __bch2_members_v2_get_mut(mi, ca->dev_idx)->seq = c->disk_sb.sb->seq;
+ c->disk_sb.sb->write_time = cpu_to_le64(ktime_get_real_seconds());
+
+ if (test_bit(BCH_FS_error, &c->flags))
SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1);
- if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags))
+ if (test_bit(BCH_FS_topology_error, &c->flags))
SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 1);
SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN);
@@ -942,10 +946,10 @@ int bch2_write_super(struct bch_fs *c)
bch2_sb_errors_from_cpu(c);
bch2_sb_downgrade_update(c);
- for_each_online_member(ca, c, i)
+ for_each_online_member(c, ca)
bch2_sb_from_fs(c, ca);
- for_each_online_member(ca, c, i) {
+ for_each_online_member(c, ca) {
printbuf_reset(&err);
ret = bch2_sb_validate(&ca->disk_sb, &err, WRITE);
@@ -978,16 +982,16 @@ int bch2_write_super(struct bch_fs *c)
return -BCH_ERR_sb_not_downgraded;
}
- for_each_online_member(ca, c, i) {
+ for_each_online_member(c, ca) {
__set_bit(ca->dev_idx, sb_written.d);
ca->sb_write_error = 0;
}
- for_each_online_member(ca, c, i)
+ for_each_online_member(c, ca)
read_back_super(c, ca);
closure_sync(cl);
- for_each_online_member(ca, c, i) {
+ for_each_online_member(c, ca) {
if (ca->sb_write_error)
continue;
@@ -1014,7 +1018,7 @@ int bch2_write_super(struct bch_fs *c)
do {
wrote = false;
- for_each_online_member(ca, c, i)
+ for_each_online_member(c, ca)
if (!ca->sb_write_error &&
sb < ca->disk_sb.sb->layout.nr_superblocks) {
write_one_super(c, ca, sb);
@@ -1024,7 +1028,7 @@ int bch2_write_super(struct bch_fs *c)
sb++;
} while (wrote);
- for_each_online_member(ca, c, i) {
+ for_each_online_member(c, ca) {
if (ca->sb_write_error)
__clear_bit(ca->dev_idx, sb_written.d);
else
@@ -1036,7 +1040,7 @@ int bch2_write_super(struct bch_fs *c)
can_mount_with_written =
bch2_have_enough_devs(c, sb_written, degraded_flags, false);
- for (i = 0; i < ARRAY_SIZE(sb_written.d); i++)
+ for (unsigned i = 0; i < ARRAY_SIZE(sb_written.d); i++)
sb_written.d[i] = ~sb_written.d[i];
can_mount_without_written =
@@ -1193,8 +1197,8 @@ static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f,
return ret;
}
-void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
+void __bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
{
unsigned type = le32_to_cpu(f->type);
const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type);
@@ -1202,6 +1206,15 @@ void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
if (!out->nr_tabstops)
printbuf_tabstop_push(out, 32);
+ if (ops->to_text)
+ ops->to_text(out, sb, f);
+}
+
+void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ unsigned type = le32_to_cpu(f->type);
+
if (type < BCH_SB_FIELD_NR)
prt_printf(out, "%s", bch2_sb_fields[type]);
else
@@ -1210,11 +1223,7 @@ void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
prt_printf(out, " (size %zu):", vstruct_bytes(f));
prt_newline(out);
- if (ops->to_text) {
- printbuf_indent_add(out, 2);
- ops->to_text(out, sb, f);
- printbuf_indent_sub(out, 2);
- }
+ __bch2_sb_field_to_text(out, sb, f);
}
void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l)
@@ -1243,7 +1252,6 @@ void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l)
void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
bool print_layout, unsigned fields)
{
- struct bch_sb_field *f;
u64 fields_have = 0;
unsigned nr_devices = 0;
@@ -1263,6 +1271,11 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
pr_uuid(out, sb->uuid.b);
prt_newline(out);
+ prt_printf(out, "Magic number:");
+ prt_tab(out);
+ pr_uuid(out, sb->magic.b);
+ prt_newline(out);
+
prt_str(out, "Device index:");
prt_tab(out);
prt_printf(out, "%u", sb->dev_idx);
@@ -1301,9 +1314,16 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
prt_printf(out, "%llu", le64_to_cpu(sb->seq));
prt_newline(out);
+ prt_printf(out, "Time of last write:");
+ prt_tab(out);
+ bch2_prt_datetime(out, le64_to_cpu(sb->write_time));
+ prt_newline(out);
+
prt_printf(out, "Superblock size:");
prt_tab(out);
- prt_printf(out, "%zu", vstruct_bytes(sb));
+ prt_units_u64(out, vstruct_bytes(sb));
+ prt_str(out, "/");
+ prt_units_u64(out, 512ULL << sb->layout.sb_max_size_bits);
prt_newline(out);
prt_printf(out, "Clean:");
diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
index e41e5de531..95e80e0631 100644
--- a/fs/bcachefs/super-io.h
+++ b/fs/bcachefs/super-io.h
@@ -19,10 +19,6 @@ static inline bool bch2_version_compatible(u16 version)
void bch2_version_to_text(struct printbuf *, unsigned);
unsigned bch2_latest_compatible_version(unsigned);
-u64 bch2_upgrade_recovery_passes(struct bch_fs *c,
- unsigned,
- unsigned);
-
static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f)
{
return le32_to_cpu(f->u64s) * sizeof(u64);
@@ -84,6 +80,7 @@ void bch2_free_super(struct bch_sb_handle *);
int bch2_sb_realloc(struct bch_sb_handle *, unsigned);
int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *);
+int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *);
int bch2_write_super(struct bch_fs *);
void __bch2_check_set_feature(struct bch_fs *, unsigned);
@@ -96,6 +93,8 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat)
bool bch2_check_version_downgrade(struct bch_fs *);
void bch2_sb_upgrade(struct bch_fs *, unsigned);
+void __bch2_sb_field_to_text(struct printbuf *, struct bch_sb *,
+ struct bch_sb_field *);
void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *,
struct bch_sb_field *);
void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 818ec467a0..6b23e11825 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -23,7 +23,6 @@
#include "checksum.h"
#include "clock.h"
#include "compress.h"
-#include "counters.h"
#include "debug.h"
#include "disk_groups.h"
#include "ec.h"
@@ -49,6 +48,7 @@
#include "recovery.h"
#include "replicas.h"
#include "sb-clean.h"
+#include "sb-counters.h"
#include "sb-errors.h"
#include "sb-members.h"
#include "snapshot.h"
@@ -79,6 +79,36 @@ MODULE_SOFTDEP("pre: chacha20");
MODULE_SOFTDEP("pre: poly1305");
MODULE_SOFTDEP("pre: xxhash");
+const char * const bch2_fs_flag_strs[] = {
+#define x(n) #n,
+ BCH_FS_FLAGS()
+#undef x
+ NULL
+};
+
+void __bch2_print(struct bch_fs *c, const char *fmt, ...)
+{
+ struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);
+
+ va_list args;
+ va_start(args, fmt);
+ if (likely(!stdio)) {
+ vprintk(fmt, args);
+ } else {
+ unsigned long flags;
+
+ if (fmt[0] == KERN_SOH[0])
+ fmt += 2;
+
+ spin_lock_irqsave(&stdio->output_lock, flags);
+ prt_vprintf(&stdio->output_buf, fmt, args);
+ spin_unlock_irqrestore(&stdio->output_lock, flags);
+
+ wake_up(&stdio->output_wait);
+ }
+ va_end(args);
+}
+
#define KTYPE(type) \
static const struct attribute_group type ## _group = { \
.attrs = type ## _files \
@@ -134,14 +164,12 @@ static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);
struct bch_fs *bch2_dev_to_fs(dev_t dev)
{
struct bch_fs *c;
- struct bch_dev *ca;
- unsigned i;
mutex_lock(&bch_fs_list_lock);
rcu_read_lock();
list_for_each_entry(c, &bch_fs_list, list)
- for_each_member_device_rcu(ca, c, i, NULL)
+ for_each_member_device_rcu(c, ca, NULL)
if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) {
closure_get(&c->cl);
goto found;
@@ -182,14 +210,13 @@ struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid)
static void bch2_dev_usage_journal_reserve(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i, nr = 0, u64s =
+ unsigned nr = 0, u64s =
((sizeof(struct jset_entry_dev_usage) +
sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR)) /
sizeof(u64);
rcu_read_lock();
- for_each_member_device_rcu(ca, c, i, NULL)
+ for_each_member_device_rcu(c, ca, NULL)
nr++;
rcu_read_unlock();
@@ -216,8 +243,7 @@ static void bch2_dev_usage_journal_reserve(struct bch_fs *c)
static void __bch2_fs_read_only(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i, clean_passes = 0;
+ unsigned clean_passes = 0;
u64 seq = 0;
bch2_fs_ec_stop(c);
@@ -246,14 +272,14 @@ static void __bch2_fs_read_only(struct bch_fs *c)
journal_cur_seq(&c->journal));
if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) &&
- !test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
- set_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags);
+ !test_bit(BCH_FS_emergency_ro, &c->flags))
+ set_bit(BCH_FS_clean_shutdown, &c->flags);
bch2_fs_journal_stop(&c->journal);
/*
* After stopping journal:
*/
- for_each_member_device(ca, c, i)
+ for_each_member_device(c, ca)
bch2_dev_allocator_remove(c, ca);
}
@@ -262,25 +288,27 @@ static void bch2_writes_disabled(struct percpu_ref *writes)
{
struct bch_fs *c = container_of(writes, struct bch_fs, writes);
- set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
+ set_bit(BCH_FS_write_disable_complete, &c->flags);
wake_up(&bch2_read_only_wait);
}
#endif
void bch2_fs_read_only(struct bch_fs *c)
{
- if (!test_bit(BCH_FS_RW, &c->flags)) {
+ if (!test_bit(BCH_FS_rw, &c->flags)) {
bch2_journal_reclaim_stop(&c->journal);
return;
}
- BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
+ BUG_ON(test_bit(BCH_FS_write_disable_complete, &c->flags));
+
+ bch_verbose(c, "going read-only");
/*
* Block new foreground-end write operations from starting - any new
* writes will return -EROFS:
*/
- set_bit(BCH_FS_GOING_RO, &c->flags);
+ set_bit(BCH_FS_going_ro, &c->flags);
#ifndef BCH_WRITE_REF_DEBUG
percpu_ref_kill(&c->writes);
#else
@@ -300,33 +328,42 @@ void bch2_fs_read_only(struct bch_fs *c)
* that going RO is complete:
*/
wait_event(bch2_read_only_wait,
- test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) ||
- test_bit(BCH_FS_EMERGENCY_RO, &c->flags));
+ test_bit(BCH_FS_write_disable_complete, &c->flags) ||
+ test_bit(BCH_FS_emergency_ro, &c->flags));
+
+ bool writes_disabled = test_bit(BCH_FS_write_disable_complete, &c->flags);
+ if (writes_disabled)
+ bch_verbose(c, "finished waiting for writes to stop");
__bch2_fs_read_only(c);
wait_event(bch2_read_only_wait,
- test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
+ test_bit(BCH_FS_write_disable_complete, &c->flags));
- clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
- clear_bit(BCH_FS_GOING_RO, &c->flags);
+ if (!writes_disabled)
+ bch_verbose(c, "finished waiting for writes to stop");
+
+ clear_bit(BCH_FS_write_disable_complete, &c->flags);
+ clear_bit(BCH_FS_going_ro, &c->flags);
+ clear_bit(BCH_FS_rw, &c->flags);
if (!bch2_journal_error(&c->journal) &&
- !test_bit(BCH_FS_ERROR, &c->flags) &&
- !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) &&
- test_bit(BCH_FS_STARTED, &c->flags) &&
- test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags) &&
+ !test_bit(BCH_FS_error, &c->flags) &&
+ !test_bit(BCH_FS_emergency_ro, &c->flags) &&
+ test_bit(BCH_FS_started, &c->flags) &&
+ test_bit(BCH_FS_clean_shutdown, &c->flags) &&
!c->opts.norecovery) {
BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));
BUG_ON(atomic_read(&c->btree_cache.dirty));
BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
- BUG_ON(c->btree_write_buffer.state.nr);
+ BUG_ON(c->btree_write_buffer.inc.keys.nr);
+ BUG_ON(c->btree_write_buffer.flushing.keys.nr);
bch_verbose(c, "marking filesystem clean");
bch2_fs_mark_clean(c);
+ } else {
+ bch_verbose(c, "done going read-only, filesystem not clean");
}
-
- clear_bit(BCH_FS_RW, &c->flags);
}
static void bch2_fs_read_only_work(struct work_struct *work)
@@ -346,7 +383,7 @@ static void bch2_fs_read_only_async(struct bch_fs *c)
bool bch2_fs_emergency_read_only(struct bch_fs *c)
{
- bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags);
+ bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);
bch2_journal_halt(&c->journal);
bch2_fs_read_only_async(c);
@@ -383,28 +420,16 @@ static int bch2_fs_read_write_late(struct bch_fs *c)
static int __bch2_fs_read_write(struct bch_fs *c, bool early)
{
- struct bch_dev *ca;
- unsigned i;
int ret;
- if (test_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags)) {
+ if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) {
bch_err(c, "cannot go rw, unfixed btree errors");
return -BCH_ERR_erofs_unfixed_errors;
}
- if (test_bit(BCH_FS_RW, &c->flags))
+ if (test_bit(BCH_FS_rw, &c->flags))
return 0;
- if (c->opts.norecovery)
- return -BCH_ERR_erofs_norecovery;
-
- /*
- * nochanges is used for fsck -n mode - we have to allow going rw
- * during recovery for that to work:
- */
- if (c->opts.nochanges && (!early || c->opts.read_only))
- return -BCH_ERR_erofs_nochanges;
-
bch_info(c, "going read-write");
ret = bch2_sb_members_v2_init(c);
@@ -415,7 +440,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
if (ret)
goto err;
- clear_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags);
+ clear_bit(BCH_FS_clean_shutdown, &c->flags);
/*
* First journal write must be a flush write: after a clean shutdown we
@@ -425,17 +450,17 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
*/
set_bit(JOURNAL_NEED_FLUSH_WRITE, &c->journal.flags);
- for_each_rw_member(ca, c, i)
+ for_each_rw_member(c, ca)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
- set_bit(BCH_FS_RW, &c->flags);
- set_bit(BCH_FS_WAS_RW, &c->flags);
+ set_bit(BCH_FS_rw, &c->flags);
+ set_bit(BCH_FS_was_rw, &c->flags);
#ifndef BCH_WRITE_REF_DEBUG
percpu_ref_reinit(&c->writes);
#else
- for (i = 0; i < BCH_WRITE_REF_NR; i++) {
+ for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) {
BUG_ON(atomic_long_read(&c->writes[i]));
atomic_long_inc(&c->writes[i]);
}
@@ -463,7 +488,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
bch2_do_pending_node_rewrites(c);
return 0;
err:
- if (test_bit(BCH_FS_RW, &c->flags))
+ if (test_bit(BCH_FS_rw, &c->flags))
bch2_fs_read_only(c);
else
__bch2_fs_read_only(c);
@@ -472,6 +497,12 @@ err:
int bch2_fs_read_write(struct bch_fs *c)
{
+ if (c->opts.norecovery)
+ return -BCH_ERR_erofs_norecovery;
+
+ if (c->opts.nochanges)
+ return -BCH_ERR_erofs_nochanges;
+
return __bch2_fs_read_write(c, false);
}
@@ -558,12 +589,9 @@ static void bch2_fs_release(struct kobject *kobj)
void __bch2_fs_stop(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i;
-
bch_verbose(c, "shutting down");
- set_bit(BCH_FS_STOPPING, &c->flags);
+ set_bit(BCH_FS_stopping, &c->flags);
cancel_work_sync(&c->journal_seq_blacklist_gc_work);
@@ -571,7 +599,7 @@ void __bch2_fs_stop(struct bch_fs *c)
bch2_fs_read_only(c);
up_write(&c->state_lock);
- for_each_member_device(ca, c, i)
+ for_each_member_device(c, ca)
if (ca->kobj.state_in_sysfs &&
ca->disk_sb.bdev)
sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
@@ -582,6 +610,9 @@ void __bch2_fs_stop(struct bch_fs *c)
bch2_fs_debug_exit(c);
bch2_fs_chardev_exit(c);
+ bch2_ro_ref_put(c);
+ wait_event(c->ro_ref_wait, !refcount_read(&c->ro_ref));
+
kobject_put(&c->counters_kobj);
kobject_put(&c->time_stats);
kobject_put(&c->opts_dir);
@@ -590,7 +621,7 @@ void __bch2_fs_stop(struct bch_fs *c)
/* btree prefetch might have kicked off reads in the background: */
bch2_btree_flush_all_reads(c);
- for_each_member_device(ca, c, i)
+ for_each_member_device(c, ca)
cancel_work_sync(&ca->io_error_work);
cancel_work_sync(&c->read_only_work);
@@ -629,8 +660,6 @@ void bch2_fs_stop(struct bch_fs *c)
static int bch2_fs_online(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i;
int ret = 0;
lockdep_assert_held(&bch_fs_list_lock);
@@ -651,7 +680,9 @@ static int bch2_fs_online(struct bch_fs *c)
ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?:
kobject_add(&c->internal, &c->kobj, "internal") ?:
kobject_add(&c->opts_dir, &c->kobj, "options") ?:
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
kobject_add(&c->time_stats, &c->kobj, "time_stats") ?:
+#endif
kobject_add(&c->counters_kobj, &c->kobj, "counters") ?:
bch2_opts_create_sysfs_files(&c->opts_dir);
if (ret) {
@@ -661,7 +692,7 @@ static int bch2_fs_online(struct bch_fs *c)
down_write(&c->state_lock);
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
ret = bch2_dev_sysfs_online(c, ca);
if (ret) {
bch_err(c, "error creating sysfs objects");
@@ -690,6 +721,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
goto out;
}
+ c->stdio = (void *)(unsigned long) opts.stdio;
+
__module_get(THIS_MODULE);
closure_init(&c->cl, NULL);
@@ -710,6 +743,10 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
mutex_init(&c->btree_root_lock);
INIT_WORK(&c->read_only_work, bch2_fs_read_only_work);
+ refcount_set(&c->ro_ref, 1);
+ init_waitqueue_head(&c->ro_ref_wait);
+ sema_init(&c->online_fsck_mutex, 1);
+
init_rwsem(&c->gc_lock);
mutex_init(&c->gc_gens_lock);
atomic_set(&c->journal_keys.ref, 1);
@@ -763,7 +800,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write];
c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write];
- c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal];
c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq];
bch2_fs_btree_cache_init_early(&c->btree_cache);
@@ -832,7 +868,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
!(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
!(c->io_complete_wq = alloc_workqueue("bcachefs_io",
- WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 1)) ||
+ WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 512)) ||
!(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
WQ_FREEZABLE, 0)) ||
#ifndef BCH_WRITE_REF_DEBUG
@@ -847,7 +883,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
!(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
!(c->online_reserved = alloc_percpu(u64)) ||
mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
- btree_bytes(c)) ||
+ c->opts.btree_node_size) ||
mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
!(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
sizeof(u64), GFP_KERNEL))) {
@@ -946,16 +982,14 @@ static void print_mount_opts(struct bch_fs *c)
int bch2_fs_start(struct bch_fs *c)
{
- struct bch_dev *ca;
time64_t now = ktime_get_real_seconds();
- unsigned i;
int ret;
print_mount_opts(c);
down_write(&c->state_lock);
- BUG_ON(test_bit(BCH_FS_STARTED, &c->flags));
+ BUG_ON(test_bit(BCH_FS_started, &c->flags));
mutex_lock(&c->sb_lock);
@@ -965,12 +999,12 @@ int bch2_fs_start(struct bch_fs *c)
goto err;
}
- for_each_online_member(ca, c, i)
- bch2_members_v2_get_mut(c->disk_sb.sb, i)->last_mount = cpu_to_le64(now);
+ for_each_online_member(c, ca)
+ bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now);
mutex_unlock(&c->sb_lock);
- for_each_rw_member(ca, c, i)
+ for_each_rw_member(c, ca)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
@@ -990,12 +1024,12 @@ int bch2_fs_start(struct bch_fs *c)
goto err;
}
- set_bit(BCH_FS_STARTED, &c->flags);
+ set_bit(BCH_FS_started, &c->flags);
- if (c->opts.read_only || c->opts.nochanges) {
+ if (c->opts.read_only) {
bch2_fs_read_only(c);
} else {
- ret = !test_bit(BCH_FS_RW, &c->flags)
+ ret = !test_bit(BCH_FS_rw, &c->flags)
? bch2_fs_read_write(c)
: bch2_fs_read_write_late(c);
if (ret)
@@ -1003,12 +1037,13 @@ int bch2_fs_start(struct bch_fs *c)
}
ret = 0;
-out:
+err:
+ if (ret)
+ bch_err_msg(c, ret, "starting filesystem");
+ else
+ bch_verbose(c, "done starting filesystem");
up_write(&c->state_lock);
return ret;
-err:
- bch_err_msg(c, ret, "starting filesystem");
- goto out;
}
static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
@@ -1025,20 +1060,83 @@ static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
return 0;
}
-static int bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb)
+static int bch2_dev_in_fs(struct bch_sb_handle *fs,
+ struct bch_sb_handle *sb)
{
- struct bch_sb *newest =
- le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? fs : sb;
+ if (fs == sb)
+ return 0;
- if (!uuid_equal(&fs->uuid, &sb->uuid))
+ if (!uuid_equal(&fs->sb->uuid, &sb->sb->uuid))
return -BCH_ERR_device_not_a_member_of_filesystem;
- if (!bch2_dev_exists(newest, sb->dev_idx))
+ if (!bch2_dev_exists(fs->sb, sb->sb->dev_idx))
return -BCH_ERR_device_has_been_removed;
- if (fs->block_size != sb->block_size)
+ if (fs->sb->block_size != sb->sb->block_size)
return -BCH_ERR_mismatched_block_size;
+ if (le16_to_cpu(fs->sb->version) < bcachefs_metadata_version_member_seq ||
+ le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_member_seq)
+ return 0;
+
+ if (fs->sb->seq == sb->sb->seq &&
+ fs->sb->write_time != sb->sb->write_time) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "Split brain detected between ");
+ prt_bdevname(&buf, sb->bdev);
+ prt_str(&buf, " and ");
+ prt_bdevname(&buf, fs->bdev);
+ prt_char(&buf, ':');
+ prt_newline(&buf);
+ prt_printf(&buf, "seq=%llu but write_time different, got", le64_to_cpu(sb->sb->seq));
+ prt_newline(&buf);
+
+ prt_bdevname(&buf, fs->bdev);
+ prt_char(&buf, ' ');
+ bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time));
+ prt_newline(&buf);
+
+ prt_bdevname(&buf, sb->bdev);
+ prt_char(&buf, ' ');
+ bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));
+ prt_newline(&buf);
+
+ prt_printf(&buf, "Not using older sb");
+
+ pr_err("%s", buf.buf);
+ printbuf_exit(&buf);
+ return -BCH_ERR_device_splitbrain;
+ }
+
+ struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx);
+ u64 seq_from_fs = le64_to_cpu(m.seq);
+ u64 seq_from_member = le64_to_cpu(sb->sb->seq);
+
+ if (seq_from_fs && seq_from_fs < seq_from_member) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "Split brain detected between ");
+ prt_bdevname(&buf, sb->bdev);
+ prt_str(&buf, " and ");
+ prt_bdevname(&buf, fs->bdev);
+ prt_char(&buf, ':');
+ prt_newline(&buf);
+
+ prt_bdevname(&buf, fs->bdev);
+ prt_str(&buf, " believes seq of ");
+ prt_bdevname(&buf, sb->bdev);
+ prt_printf(&buf, " to be %llu, but ", seq_from_fs);
+ prt_bdevname(&buf, sb->bdev);
+ prt_printf(&buf, " has %llu\n", seq_from_member);
+ prt_str(&buf, "Not using ");
+ prt_bdevname(&buf, sb->bdev);
+
+ pr_err("%s", buf.buf);
+ printbuf_exit(&buf);
+ return -BCH_ERR_device_splitbrain;
+ }
+
return 0;
}
@@ -1284,9 +1382,14 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
bch2_dev_sysfs_online(c, ca);
+ struct printbuf name = PRINTBUF;
+ prt_bdevname(&name, ca->disk_sb.bdev);
+
if (c->sb.nr_devices == 1)
- snprintf(c->name, sizeof(c->name), "%pg", ca->disk_sb.bdev);
- snprintf(ca->name, sizeof(ca->name), "%pg", ca->disk_sb.bdev);
+ strscpy(c->name, name.buf, sizeof(c->name));
+ strscpy(ca->name, name.buf, sizeof(ca->name));
+
+ printbuf_exit(&name);
rebalance_wakeup(c);
return 0;
@@ -1307,8 +1410,7 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
enum bch_member_state new_state, int flags)
{
struct bch_devs_mask new_online_devs;
- struct bch_dev *ca2;
- int i, nr_rw = 0, required;
+ int nr_rw = 0, required;
lockdep_assert_held(&c->state_lock);
@@ -1320,16 +1422,16 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
return true;
/* do we have enough devices to write to? */
- for_each_member_device(ca2, c, i)
+ for_each_member_device(c, ca2)
if (ca2 != ca)
nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw;
required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
? c->opts.metadata_replicas
- : c->opts.metadata_replicas_required,
+ : metadata_replicas_required(c),
!(flags & BCH_FORCE_IF_DATA_DEGRADED)
? c->opts.data_replicas
- : c->opts.data_replicas_required);
+ : data_replicas_required(c));
return nr_rw >= required;
case BCH_MEMBER_STATE_failed:
@@ -1468,9 +1570,7 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
BTREE_TRIGGER_NORUN, NULL) ?:
bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
BTREE_TRIGGER_NORUN, NULL);
- if (ret)
- bch_err_msg(c, ret, "removing dev alloc info");
-
+ bch_err_msg(c, ret, "removing dev alloc info");
return ret;
}
@@ -1497,40 +1597,35 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
__bch2_dev_read_only(c, ca);
ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
- if (ret) {
- bch_err_msg(ca, ret, "dropping data");
+ bch_err_msg(ca, ret, "dropping data");
+ if (ret)
goto err;
- }
ret = bch2_dev_remove_alloc(c, ca);
- if (ret) {
- bch_err_msg(ca, ret, "deleting alloc info");
+ bch_err_msg(ca, ret, "deleting alloc info");
+ if (ret)
goto err;
- }
ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
- if (ret) {
- bch_err_msg(ca, ret, "flushing journal");
+ bch_err_msg(ca, ret, "flushing journal");
+ if (ret)
goto err;
- }
ret = bch2_journal_flush(&c->journal);
- if (ret) {
- bch_err(ca, "journal error");
+ bch_err_msg(ca, ret, "in bch2_journal_flush()");
+ if (ret)
goto err;
- }
ret = bch2_replicas_gc2(c);
- if (ret) {
- bch_err_msg(ca, ret, "in replicas_gc2()");
+ bch_err_msg(ca, ret, "in replicas_gc2()");
+ if (ret)
goto err;
- }
data = bch2_dev_has_data(c, ca);
if (data) {
struct printbuf data_has = PRINTBUF;
- prt_bitflags(&data_has, bch2_data_types, data);
+ prt_bitflags(&data_has, __bch2_data_types, data);
bch_err(ca, "Remove failed, still has data (%s)", data_has.buf);
printbuf_exit(&data_has);
ret = -EBUSY;
@@ -1596,10 +1691,9 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
int ret;
ret = bch2_read_super(path, &opts, &sb);
- if (ret) {
- bch_err_msg(c, ret, "reading super");
+ bch_err_msg(c, ret, "reading super");
+ if (ret)
goto err;
- }
dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);
@@ -1612,10 +1706,8 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
}
ret = bch2_dev_may_add(sb.sb, c);
- if (ret) {
- bch_err_fn(c, ret);
+ if (ret)
goto err;
- }
ca = __bch2_dev_alloc(c, &dev_mi);
if (!ca) {
@@ -1630,19 +1722,17 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
goto err;
ret = bch2_dev_journal_alloc(ca);
- if (ret) {
- bch_err_msg(c, ret, "allocating journal");
+ bch_err_msg(c, ret, "allocating journal");
+ if (ret)
goto err;
- }
down_write(&c->state_lock);
mutex_lock(&c->sb_lock);
ret = bch2_sb_from_fs(c, ca);
- if (ret) {
- bch_err_msg(c, ret, "setting up new superblock");
+ bch_err_msg(c, ret, "setting up new superblock");
+ if (ret)
goto err_unlock;
- }
if (dynamic_fault("bcachefs:add:no_slot"))
goto no_slot;
@@ -1681,10 +1771,9 @@ have_slot:
if (BCH_MEMBER_GROUP(&dev_mi)) {
ret = __bch2_dev_group_set(c, ca, label.buf);
- if (ret) {
- bch_err_msg(c, ret, "creating new label");
+ bch_err_msg(c, ret, "creating new label");
+ if (ret)
goto err_unlock;
- }
}
bch2_write_super(c);
@@ -1693,16 +1782,14 @@ have_slot:
bch2_dev_usage_journal_reserve(c);
ret = bch2_trans_mark_dev_sb(c, ca);
- if (ret) {
- bch_err_msg(ca, ret, "marking new superblock");
+ bch_err_msg(ca, ret, "marking new superblock");
+ if (ret)
goto err_late;
- }
ret = bch2_fs_freespace_init(c);
- if (ret) {
- bch_err_msg(ca, ret, "initializing free space");
+ bch_err_msg(ca, ret, "initializing free space");
+ if (ret)
goto err_late;
- }
ca->new_fs_bucket_idx = 0;
@@ -1721,6 +1808,7 @@ err:
bch2_free_super(&sb);
printbuf_exit(&label);
printbuf_exit(&errbuf);
+ bch_err_fn(c, ret);
return ret;
err_late:
up_write(&c->state_lock);
@@ -1747,11 +1835,10 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
dev_idx = sb.sb->dev_idx;
- ret = bch2_dev_in_fs(c->disk_sb.sb, sb.sb);
- if (ret) {
- bch_err_msg(c, ret, "bringing %s online", path);
+ ret = bch2_dev_in_fs(&c->disk_sb, &sb);
+ bch_err_msg(c, ret, "bringing %s online", path);
+ if (ret)
goto err;
- }
ret = bch2_dev_attach_bdev(c, &sb);
if (ret)
@@ -1760,10 +1847,9 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
ca = bch_dev_locked(c, dev_idx);
ret = bch2_trans_mark_dev_sb(c, ca);
- if (ret) {
- bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path);
+ bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path);
+ if (ret)
goto err;
- }
if (ca->mi.state == BCH_MEMBER_STATE_rw)
__bch2_dev_read_write(c, ca);
@@ -1842,10 +1928,9 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
}
ret = bch2_dev_buckets_resize(c, ca, nbuckets);
- if (ret) {
- bch_err_msg(ca, ret, "resizing buckets");
+ bch_err_msg(ca, ret, "resizing buckets");
+ if (ret)
goto err;
- }
ret = bch2_trans_mark_dev_sb(c, ca);
if (ret)
@@ -1879,28 +1964,30 @@ err:
/* return with ref on ca->ref: */
struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
{
- struct bch_dev *ca;
- unsigned i;
-
rcu_read_lock();
- for_each_member_device_rcu(ca, c, i, NULL)
- if (!strcmp(name, ca->name))
- goto found;
- ca = ERR_PTR(-BCH_ERR_ENOENT_dev_not_found);
-found:
+ for_each_member_device_rcu(c, ca, NULL)
+ if (!strcmp(name, ca->name)) {
+ rcu_read_unlock();
+ return ca;
+ }
rcu_read_unlock();
-
- return ca;
+ return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found);
}
/* Filesystem open: */
+static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r)
+{
+ return cmp_int(le64_to_cpu(l->seq), le64_to_cpu(r->seq)) ?:
+ cmp_int(le64_to_cpu(l->write_time), le64_to_cpu(r->write_time));
+}
+
struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
struct bch_opts opts)
{
DARRAY(struct bch_sb_handle) sbs = { 0 };
struct bch_fs *c = NULL;
- struct bch_sb_handle *sb, *best = NULL;
+ struct bch_sb_handle *best = NULL;
struct printbuf errbuf = PRINTBUF;
int ret = 0;
@@ -1926,20 +2013,27 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
BUG_ON(darray_push(&sbs, sb));
}
+ if (opts.nochanges && !opts.read_only) {
+ ret = -BCH_ERR_erofs_nochanges;
+ goto err_print;
+ }
+
darray_for_each(sbs, sb)
- if (!best || le64_to_cpu(sb->sb->seq) > le64_to_cpu(best->sb->seq))
+ if (!best || sb_cmp(sb->sb, best->sb) > 0)
best = sb;
darray_for_each_reverse(sbs, sb) {
- if (sb != best && !bch2_dev_exists(best->sb, sb->sb->dev_idx)) {
- pr_info("%pg has been removed, skipping", sb->bdev);
+ ret = bch2_dev_in_fs(best, sb);
+
+ if (ret == -BCH_ERR_device_has_been_removed ||
+ ret == -BCH_ERR_device_splitbrain) {
bch2_free_super(sb);
darray_remove_item(&sbs, sb);
best -= best > sb;
+ ret = 0;
continue;
}
- ret = bch2_dev_in_fs(best->sb, sb->sb);
if (ret)
goto err_print;
}
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
index bf762df180..dada09331d 100644
--- a/fs/bcachefs/super.h
+++ b/fs/bcachefs/super.h
@@ -8,6 +8,8 @@
#include <linux/math64.h>
+extern const char * const bch2_fs_flag_strs[];
+
struct bch_fs *bch2_dev_to_fs(dev_t);
struct bch_fs *bch2_uuid_to_fs(__uuid_t);
@@ -37,8 +39,8 @@ int bch2_fs_read_write_early(struct bch_fs *);
*/
static inline void bch2_fs_lazy_rw(struct bch_fs *c)
{
- if (!test_bit(BCH_FS_RW, &c->flags) &&
- !test_bit(BCH_FS_WAS_RW, &c->flags))
+ if (!test_bit(BCH_FS_rw, &c->flags) &&
+ !test_bit(BCH_FS_was_rw, &c->flags))
bch2_fs_read_write_early(c);
}
diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h
index 9c1fd4ca2b..0e5a14fc8e 100644
--- a/fs/bcachefs/super_types.h
+++ b/fs/bcachefs/super_types.h
@@ -4,6 +4,7 @@
struct bch_sb_handle {
struct bch_sb *sb;
+ struct bdev_handle *bdev_handle;
struct block_device *bdev;
char *sb_name;
struct bio *bio;
@@ -22,7 +23,7 @@ struct bch_devs_mask {
struct bch_devs_list {
u8 nr;
- u8 devs[BCH_BKEY_PTRS_MAX];
+ u8 data[BCH_BKEY_PTRS_MAX];
};
struct bch_member_cpu {
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index f3cb7115b5..cee80c47fe 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -21,6 +21,7 @@
#include "btree_gc.h"
#include "buckets.h"
#include "clock.h"
+#include "compress.h"
#include "disk_groups.h"
#include "ec.h"
#include "inode.h"
@@ -145,6 +146,7 @@ rw_attribute(gc_gens_pos);
read_attribute(uuid);
read_attribute(minor);
+read_attribute(flags);
read_attribute(bucket_size);
read_attribute(first_bucket);
read_attribute(nbuckets);
@@ -246,7 +248,7 @@ static size_t bch2_btree_cache_size(struct bch_fs *c)
mutex_lock(&c->btree_cache.lock);
list_for_each_entry(b, &c->btree_cache.live, list)
- ret += btree_bytes(c);
+ ret += btree_buf_bytes(b);
mutex_unlock(&c->btree_cache.lock);
return ret;
@@ -255,19 +257,18 @@ static size_t bch2_btree_cache_size(struct bch_fs *c)
static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c)
{
struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
enum btree_id id;
- u64 nr_uncompressed_extents = 0,
- nr_compressed_extents = 0,
- nr_incompressible_extents = 0,
- uncompressed_sectors = 0,
- incompressible_sectors = 0,
- compressed_sectors_compressed = 0,
- compressed_sectors_uncompressed = 0;
+ struct compression_type_stats {
+ u64 nr_extents;
+ u64 sectors_compressed;
+ u64 sectors_uncompressed;
+ } s[BCH_COMPRESSION_TYPE_NR];
+ u64 compressed_incompressible = 0;
int ret = 0;
- if (!test_bit(BCH_FS_STARTED, &c->flags))
+ memset(s, 0, sizeof(s));
+
+ if (!test_bit(BCH_FS_started, &c->flags))
return -EPERM;
trans = bch2_trans_get(c);
@@ -276,39 +277,33 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
if (!btree_type_has_ptrs(id))
continue;
- ret = for_each_btree_key2(trans, iter, id, POS_MIN,
- BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ ret = for_each_btree_key(trans, iter, id, POS_MIN,
+ BTREE_ITER_ALL_SNAPSHOTS, k, ({
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ struct bch_extent_crc_unpacked crc;
const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- bool compressed = false, uncompressed = false, incompressible = false;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- switch (p.crc.compression_type) {
- case BCH_COMPRESSION_TYPE_none:
- uncompressed = true;
- uncompressed_sectors += k.k->size;
- break;
- case BCH_COMPRESSION_TYPE_incompressible:
- incompressible = true;
- incompressible_sectors += k.k->size;
- break;
- default:
- compressed_sectors_compressed +=
- p.crc.compressed_size;
- compressed_sectors_uncompressed +=
- p.crc.uncompressed_size;
- compressed = true;
- break;
+ bool compressed = false, incompressible = false;
+
+ bkey_for_each_crc(k.k, ptrs, crc, entry) {
+ incompressible |= crc.compression_type == BCH_COMPRESSION_TYPE_incompressible;
+ compressed |= crc_is_compressed(crc);
+
+ if (crc_is_compressed(crc)) {
+ s[crc.compression_type].nr_extents++;
+ s[crc.compression_type].sectors_compressed += crc.compressed_size;
+ s[crc.compression_type].sectors_uncompressed += crc.uncompressed_size;
}
}
- if (incompressible)
- nr_incompressible_extents++;
- else if (uncompressed)
- nr_uncompressed_extents++;
- else if (compressed)
- nr_compressed_extents++;
+ compressed_incompressible += compressed && incompressible;
+
+ if (!compressed) {
+ unsigned t = incompressible ? BCH_COMPRESSION_TYPE_incompressible : 0;
+
+ s[t].nr_extents++;
+ s[t].sectors_compressed += k.k->size;
+ s[t].sectors_uncompressed += k.k->size;
+ }
0;
}));
}
@@ -318,26 +313,45 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
if (ret)
return ret;
- prt_printf(out, "uncompressed:\n");
- prt_printf(out, " nr extents: %llu\n", nr_uncompressed_extents);
- prt_printf(out, " size: ");
- prt_human_readable_u64(out, uncompressed_sectors << 9);
- prt_printf(out, "\n");
+ prt_str(out, "type");
+ printbuf_tabstop_push(out, 12);
+ prt_tab(out);
- prt_printf(out, "compressed:\n");
- prt_printf(out, " nr extents: %llu\n", nr_compressed_extents);
- prt_printf(out, " compressed size: ");
- prt_human_readable_u64(out, compressed_sectors_compressed << 9);
- prt_printf(out, "\n");
- prt_printf(out, " uncompressed size: ");
- prt_human_readable_u64(out, compressed_sectors_uncompressed << 9);
- prt_printf(out, "\n");
+ prt_str(out, "compressed");
+ printbuf_tabstop_push(out, 16);
+ prt_tab_rjust(out);
+
+ prt_str(out, "uncompressed");
+ printbuf_tabstop_push(out, 16);
+ prt_tab_rjust(out);
+
+ prt_str(out, "average extent size");
+ printbuf_tabstop_push(out, 24);
+ prt_tab_rjust(out);
+ prt_newline(out);
+
+ for (unsigned i = 0; i < ARRAY_SIZE(s); i++) {
+ bch2_prt_compression_type(out, i);
+ prt_tab(out);
+
+ prt_human_readable_u64(out, s[i].sectors_compressed << 9);
+ prt_tab_rjust(out);
+
+ prt_human_readable_u64(out, s[i].sectors_uncompressed << 9);
+ prt_tab_rjust(out);
+
+ prt_human_readable_u64(out, s[i].nr_extents
+ ? div_u64(s[i].sectors_uncompressed << 9, s[i].nr_extents)
+ : 0);
+ prt_tab_rjust(out);
+ prt_newline(out);
+ }
+
+ if (compressed_incompressible) {
+ prt_printf(out, "%llu compressed & incompressible extents", compressed_incompressible);
+ prt_newline(out);
+ }
- prt_printf(out, "incompressible:\n");
- prt_printf(out, " nr extents: %llu\n", nr_incompressible_extents);
- prt_printf(out, " size: ");
- prt_human_readable_u64(out, incompressible_sectors << 9);
- prt_printf(out, "\n");
return 0;
}
@@ -370,6 +384,9 @@ SHOW(bch2_fs)
sysfs_print(minor, c->minor);
sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b);
+ if (attr == &sysfs_flags)
+ prt_bitflags(out, bch2_fs_flag_strs, c->flags);
+
sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c));
if (attr == &sysfs_btree_write_stats)
@@ -483,12 +500,12 @@ STORE(bch2_fs)
/* Debugging: */
- if (!test_bit(BCH_FS_STARTED, &c->flags))
+ if (!test_bit(BCH_FS_started, &c->flags))
return -EPERM;
/* Debugging: */
- if (!test_bit(BCH_FS_RW, &c->flags))
+ if (!test_bit(BCH_FS_rw, &c->flags))
return -EROFS;
if (attr == &sysfs_prune_cache) {
@@ -620,6 +637,7 @@ STORE(bch2_fs_internal)
SYSFS_OPS(bch2_fs_internal);
struct attribute *bch2_fs_internal_files[] = {
+ &sysfs_flags,
&sysfs_journal_debug,
&sysfs_btree_updates,
&sysfs_btree_cache,
@@ -708,8 +726,10 @@ STORE(bch2_fs_opts_dir)
bch2_opt_set_sb(c, opt, v);
bch2_opt_set_by_id(&c->opts, id, v);
- if ((id == Opt_background_target ||
- id == Opt_background_compression) && v)
+ if (v &&
+ (id == Opt_background_target ||
+ id == Opt_background_compression ||
+ (id == Opt_compression && !c->opts.background_compression)))
bch2_set_rebalance_needs_scan(c, 0);
ret = size;
@@ -786,32 +806,7 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
printbuf_tabstop_push(out, 16);
printbuf_tabstop_push(out, 16);
- prt_tab(out);
- prt_str(out, "buckets");
- prt_tab_rjust(out);
- prt_str(out, "sectors");
- prt_tab_rjust(out);
- prt_str(out, "fragmented");
- prt_tab_rjust(out);
- prt_newline(out);
-
- for (i = 0; i < BCH_DATA_NR; i++) {
- prt_str(out, bch2_data_types[i]);
- prt_tab(out);
- prt_u64(out, stats.d[i].buckets);
- prt_tab_rjust(out);
- prt_u64(out, stats.d[i].sectors);
- prt_tab_rjust(out);
- prt_u64(out, stats.d[i].fragmented);
- prt_tab_rjust(out);
- prt_newline(out);
- }
-
- prt_str(out, "ec");
- prt_tab(out);
- prt_u64(out, stats.buckets_ec);
- prt_tab_rjust(out);
- prt_newline(out);
+ bch2_dev_usage_to_text(out, &stats);
prt_newline(out);
@@ -891,7 +886,7 @@ static void dev_io_done_to_text(struct printbuf *out, struct bch_dev *ca)
for (i = 1; i < BCH_DATA_NR; i++)
prt_printf(out, "%-12s:%12llu\n",
- bch2_data_types[i],
+ bch2_data_type_str(i),
percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9);
}
}
@@ -916,7 +911,7 @@ SHOW(bch2_dev)
}
if (attr == &sysfs_has_data) {
- prt_bitflags(out, bch2_data_types, bch2_dev_has_data(c, ca));
+ prt_bitflags(out, __bch2_data_types, bch2_dev_has_data(c, ca));
prt_char(out, '\n');
}
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
index 2fc9e60c75..b3fe9fc577 100644
--- a/fs/bcachefs/tests.c
+++ b/fs/bcachefs/tests.c
@@ -107,9 +107,6 @@ err:
static int test_iterate(struct bch_fs *c, u64 nr)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter = { NULL };
- struct bkey_s_c k;
u64 i;
int ret = 0;
@@ -127,49 +124,43 @@ static int test_iterate(struct bch_fs *c, u64 nr)
ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0);
bch_err_msg(c, ret, "insert error");
if (ret)
- goto err;
+ return ret;
}
pr_info("iterating forwards");
-
i = 0;
- ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- 0, k, ({
- BUG_ON(k.k->p.offset != i++);
- 0;
- }));
+ ret = bch2_trans_run(c,
+ for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ 0, k, ({
+ BUG_ON(k.k->p.offset != i++);
+ 0;
+ })));
bch_err_msg(c, ret, "error iterating forwards");
if (ret)
- goto err;
+ return ret;
BUG_ON(i != nr);
pr_info("iterating backwards");
- ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_xattrs,
- SPOS(0, U64_MAX, U32_MAX), 0, k,
- ({
+ ret = bch2_trans_run(c,
+ for_each_btree_key_reverse(trans, iter, BTREE_ID_xattrs,
+ SPOS(0, U64_MAX, U32_MAX), 0, k, ({
BUG_ON(k.k->p.offset != --i);
0;
- }));
+ })));
bch_err_msg(c, ret, "error iterating backwards");
if (ret)
- goto err;
+ return ret;
BUG_ON(i);
-err:
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- return ret;
+ return 0;
}
static int test_iterate_extents(struct bch_fs *c, u64 nr)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter = { NULL };
- struct bkey_s_c k;
u64 i;
int ret = 0;
@@ -188,51 +179,45 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr)
ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0);
bch_err_msg(c, ret, "insert error");
if (ret)
- goto err;
+ return ret;
}
pr_info("iterating forwards");
-
i = 0;
- ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- 0, k, ({
- BUG_ON(bkey_start_offset(k.k) != i);
- i = k.k->p.offset;
- 0;
- }));
+ ret = bch2_trans_run(c,
+ for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ 0, k, ({
+ BUG_ON(bkey_start_offset(k.k) != i);
+ i = k.k->p.offset;
+ 0;
+ })));
bch_err_msg(c, ret, "error iterating forwards");
if (ret)
- goto err;
+ return ret;
BUG_ON(i != nr);
pr_info("iterating backwards");
- ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_extents,
- SPOS(0, U64_MAX, U32_MAX), 0, k,
- ({
+ ret = bch2_trans_run(c,
+ for_each_btree_key_reverse(trans, iter, BTREE_ID_extents,
+ SPOS(0, U64_MAX, U32_MAX), 0, k, ({
BUG_ON(k.k->p.offset != i);
i = bkey_start_offset(k.k);
0;
- }));
+ })));
bch_err_msg(c, ret, "error iterating backwards");
if (ret)
- goto err;
+ return ret;
BUG_ON(i);
-err:
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- return ret;
+ return 0;
}
static int test_iterate_slots(struct bch_fs *c, u64 nr)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter = { NULL };
- struct bkey_s_c k;
u64 i;
int ret = 0;
@@ -250,57 +235,48 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0);
bch_err_msg(c, ret, "insert error");
if (ret)
- goto err;
+ return ret;
}
pr_info("iterating forwards");
-
i = 0;
- ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- 0, k, ({
- BUG_ON(k.k->p.offset != i);
- i += 2;
- 0;
- }));
+ ret = bch2_trans_run(c,
+ for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ 0, k, ({
+ BUG_ON(k.k->p.offset != i);
+ i += 2;
+ 0;
+ })));
bch_err_msg(c, ret, "error iterating forwards");
if (ret)
- goto err;
+ return ret;
BUG_ON(i != nr * 2);
pr_info("iterating forwards by slots");
-
i = 0;
- ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- BTREE_ITER_SLOTS, k, ({
- if (i >= nr * 2)
- break;
+ ret = bch2_trans_run(c,
+ for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ BTREE_ITER_SLOTS, k, ({
+ if (i >= nr * 2)
+ break;
- BUG_ON(k.k->p.offset != i);
- BUG_ON(bkey_deleted(k.k) != (i & 1));
+ BUG_ON(k.k->p.offset != i);
+ BUG_ON(bkey_deleted(k.k) != (i & 1));
- i++;
- 0;
- }));
- if (ret < 0) {
- bch_err_msg(c, ret, "error iterating forwards by slots");
- goto err;
- }
- ret = 0;
-err:
- bch2_trans_put(trans);
+ i++;
+ 0;
+ })));
+ bch_err_msg(c, ret, "error iterating forwards by slots");
return ret;
}
static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter = { NULL };
- struct bkey_s_c k;
u64 i;
int ret = 0;
@@ -319,50 +295,45 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0);
bch_err_msg(c, ret, "insert error");
if (ret)
- goto err;
+ return ret;
}
pr_info("iterating forwards");
-
i = 0;
- ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- 0, k, ({
- BUG_ON(bkey_start_offset(k.k) != i + 8);
- BUG_ON(k.k->size != 8);
- i += 16;
- 0;
- }));
+ ret = bch2_trans_run(c,
+ for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ 0, k, ({
+ BUG_ON(bkey_start_offset(k.k) != i + 8);
+ BUG_ON(k.k->size != 8);
+ i += 16;
+ 0;
+ })));
bch_err_msg(c, ret, "error iterating forwards");
if (ret)
- goto err;
+ return ret;
BUG_ON(i != nr);
pr_info("iterating forwards by slots");
-
i = 0;
- ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- BTREE_ITER_SLOTS, k, ({
- if (i == nr)
- break;
- BUG_ON(bkey_deleted(k.k) != !(i % 16));
+ ret = bch2_trans_run(c,
+ for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ BTREE_ITER_SLOTS, k, ({
+ if (i == nr)
+ break;
+ BUG_ON(bkey_deleted(k.k) != !(i % 16));
- BUG_ON(bkey_start_offset(k.k) != i);
- BUG_ON(k.k->size != 8);
- i = k.k->p.offset;
- 0;
- }));
+ BUG_ON(bkey_start_offset(k.k) != i);
+ BUG_ON(k.k->size != 8);
+ i = k.k->p.offset;
+ 0;
+ })));
bch_err_msg(c, ret, "error iterating forwards by slots");
- if (ret)
- goto err;
- ret = 0;
-err:
- bch2_trans_put(trans);
- return 0;
+ return ret;
}
/*
@@ -736,8 +707,6 @@ static int rand_delete(struct bch_fs *c, u64 nr)
static int seq_insert(struct bch_fs *c, u64 nr)
{
- struct btree_iter iter;
- struct bkey_s_c k;
struct bkey_i_cookie insert;
bkey_cookie_init(&insert.k_i);
@@ -756,11 +725,8 @@ static int seq_insert(struct bch_fs *c, u64 nr)
static int seq_lookup(struct bch_fs *c, u64 nr)
{
- struct btree_iter iter;
- struct bkey_s_c k;
-
return bch2_trans_run(c,
- for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
+ for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
0, k,
0));
@@ -768,9 +734,6 @@ static int seq_lookup(struct bch_fs *c, u64 nr)
static int seq_overwrite(struct bch_fs *c, u64 nr)
{
- struct btree_iter iter;
- struct bkey_s_c k;
-
return bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX),
diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c
new file mode 100644
index 0000000000..9220d7de10
--- /dev/null
+++ b/fs/bcachefs/thread_with_file.c
@@ -0,0 +1,299 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "printbuf.h"
+#include "thread_with_file.h"
+
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
+#include <linux/kthread.h>
+#include <linux/pagemap.h>
+#include <linux/poll.h>
+
+void bch2_thread_with_file_exit(struct thread_with_file *thr)
+{
+ if (thr->task) {
+ kthread_stop(thr->task);
+ put_task_struct(thr->task);
+ }
+}
+
+int bch2_run_thread_with_file(struct thread_with_file *thr,
+ const struct file_operations *fops,
+ int (*fn)(void *))
+{
+ struct file *file = NULL;
+ int ret, fd = -1;
+ unsigned fd_flags = O_CLOEXEC;
+
+ if (fops->read && fops->write)
+ fd_flags |= O_RDWR;
+ else if (fops->read)
+ fd_flags |= O_RDONLY;
+ else if (fops->write)
+ fd_flags |= O_WRONLY;
+
+ char name[TASK_COMM_LEN];
+ get_task_comm(name, current);
+
+ thr->ret = 0;
+ thr->task = kthread_create(fn, thr, "%s", name);
+ ret = PTR_ERR_OR_ZERO(thr->task);
+ if (ret)
+ return ret;
+
+ ret = get_unused_fd_flags(fd_flags);
+ if (ret < 0)
+ goto err;
+ fd = ret;
+
+ file = anon_inode_getfile(name, fops, thr, fd_flags);
+ ret = PTR_ERR_OR_ZERO(file);
+ if (ret)
+ goto err;
+
+ get_task_struct(thr->task);
+ wake_up_process(thr->task);
+ fd_install(fd, file);
+ return fd;
+err:
+ if (fd >= 0)
+ put_unused_fd(fd);
+ if (thr->task)
+ kthread_stop(thr->task);
+ return ret;
+}
+
+static inline bool thread_with_stdio_has_output(struct thread_with_stdio *thr)
+{
+ return thr->stdio.output_buf.pos ||
+ thr->output2.nr ||
+ thr->thr.done;
+}
+
+static ssize_t thread_with_stdio_read(struct file *file, char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ struct thread_with_stdio *thr =
+ container_of(file->private_data, struct thread_with_stdio, thr);
+ size_t copied = 0, b;
+ int ret = 0;
+
+ if ((file->f_flags & O_NONBLOCK) &&
+ !thread_with_stdio_has_output(thr))
+ return -EAGAIN;
+
+ ret = wait_event_interruptible(thr->stdio.output_wait,
+ thread_with_stdio_has_output(thr));
+ if (ret)
+ return ret;
+
+ if (thr->thr.done)
+ return 0;
+
+ while (len) {
+ ret = darray_make_room(&thr->output2, thr->stdio.output_buf.pos);
+ if (ret)
+ break;
+
+ spin_lock_irq(&thr->stdio.output_lock);
+ b = min_t(size_t, darray_room(thr->output2), thr->stdio.output_buf.pos);
+
+ memcpy(&darray_top(thr->output2), thr->stdio.output_buf.buf, b);
+ memmove(thr->stdio.output_buf.buf,
+ thr->stdio.output_buf.buf + b,
+ thr->stdio.output_buf.pos - b);
+
+ thr->output2.nr += b;
+ thr->stdio.output_buf.pos -= b;
+ spin_unlock_irq(&thr->stdio.output_lock);
+
+ b = min(len, thr->output2.nr);
+ if (!b)
+ break;
+
+ b -= copy_to_user(buf, thr->output2.data, b);
+ if (!b) {
+ ret = -EFAULT;
+ break;
+ }
+
+ copied += b;
+ buf += b;
+ len -= b;
+
+ memmove(thr->output2.data,
+ thr->output2.data + b,
+ thr->output2.nr - b);
+ thr->output2.nr -= b;
+ }
+
+ return copied ?: ret;
+}
+
+static int thread_with_stdio_release(struct inode *inode, struct file *file)
+{
+ struct thread_with_stdio *thr =
+ container_of(file->private_data, struct thread_with_stdio, thr);
+
+ bch2_thread_with_file_exit(&thr->thr);
+ printbuf_exit(&thr->stdio.input_buf);
+ printbuf_exit(&thr->stdio.output_buf);
+ darray_exit(&thr->output2);
+ thr->exit(thr);
+ return 0;
+}
+
+#define WRITE_BUFFER 4096
+
+static inline bool thread_with_stdio_has_input_space(struct thread_with_stdio *thr)
+{
+ return thr->stdio.input_buf.pos < WRITE_BUFFER || thr->thr.done;
+}
+
+static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubuf,
+ size_t len, loff_t *ppos)
+{
+ struct thread_with_stdio *thr =
+ container_of(file->private_data, struct thread_with_stdio, thr);
+ struct printbuf *buf = &thr->stdio.input_buf;
+ size_t copied = 0;
+ ssize_t ret = 0;
+
+ while (len) {
+ if (thr->thr.done) {
+ ret = -EPIPE;
+ break;
+ }
+
+ size_t b = len - fault_in_readable(ubuf, len);
+ if (!b) {
+ ret = -EFAULT;
+ break;
+ }
+
+ spin_lock(&thr->stdio.input_lock);
+ if (buf->pos < WRITE_BUFFER)
+ bch2_printbuf_make_room(buf, min(b, WRITE_BUFFER - buf->pos));
+ b = min(len, printbuf_remaining_size(buf));
+
+ if (b && !copy_from_user_nofault(&buf->buf[buf->pos], ubuf, b)) {
+ ubuf += b;
+ len -= b;
+ copied += b;
+ buf->pos += b;
+ }
+ spin_unlock(&thr->stdio.input_lock);
+
+ if (b) {
+ wake_up(&thr->stdio.input_wait);
+ } else {
+ if ((file->f_flags & O_NONBLOCK)) {
+ ret = -EAGAIN;
+ break;
+ }
+
+ ret = wait_event_interruptible(thr->stdio.input_wait,
+ thread_with_stdio_has_input_space(thr));
+ if (ret)
+ break;
+ }
+ }
+
+ return copied ?: ret;
+}
+
+static __poll_t thread_with_stdio_poll(struct file *file, struct poll_table_struct *wait)
+{
+ struct thread_with_stdio *thr =
+ container_of(file->private_data, struct thread_with_stdio, thr);
+
+ poll_wait(file, &thr->stdio.output_wait, wait);
+ poll_wait(file, &thr->stdio.input_wait, wait);
+
+ __poll_t mask = 0;
+
+ if (thread_with_stdio_has_output(thr))
+ mask |= EPOLLIN;
+ if (thread_with_stdio_has_input_space(thr))
+ mask |= EPOLLOUT;
+ if (thr->thr.done)
+ mask |= EPOLLHUP|EPOLLERR;
+ return mask;
+}
+
+static const struct file_operations thread_with_stdio_fops = {
+ .release = thread_with_stdio_release,
+ .read = thread_with_stdio_read,
+ .write = thread_with_stdio_write,
+ .poll = thread_with_stdio_poll,
+ .llseek = no_llseek,
+};
+
+int bch2_run_thread_with_stdio(struct thread_with_stdio *thr,
+ void (*exit)(struct thread_with_stdio *),
+ int (*fn)(void *))
+{
+ thr->stdio.input_buf = PRINTBUF;
+ thr->stdio.input_buf.atomic++;
+ spin_lock_init(&thr->stdio.input_lock);
+ init_waitqueue_head(&thr->stdio.input_wait);
+
+ thr->stdio.output_buf = PRINTBUF;
+ thr->stdio.output_buf.atomic++;
+ spin_lock_init(&thr->stdio.output_lock);
+ init_waitqueue_head(&thr->stdio.output_wait);
+
+ darray_init(&thr->output2);
+ thr->exit = exit;
+
+ return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, fn);
+}
+
+int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *buf, size_t len)
+{
+ wait_event(stdio->input_wait,
+ stdio->input_buf.pos || stdio->done);
+
+ if (stdio->done)
+ return -1;
+
+ spin_lock(&stdio->input_lock);
+ int ret = min(len, stdio->input_buf.pos);
+ stdio->input_buf.pos -= ret;
+ memcpy(buf, stdio->input_buf.buf, ret);
+ memmove(stdio->input_buf.buf,
+ stdio->input_buf.buf + ret,
+ stdio->input_buf.pos);
+ spin_unlock(&stdio->input_lock);
+
+ wake_up(&stdio->input_wait);
+ return ret;
+}
+
+int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *buf, size_t len)
+{
+ wait_event(stdio->input_wait,
+ stdio->input_buf.pos || stdio->done);
+
+ if (stdio->done)
+ return -1;
+
+ spin_lock(&stdio->input_lock);
+ int ret = min(len, stdio->input_buf.pos);
+ char *n = memchr(stdio->input_buf.buf, '\n', ret);
+ if (n)
+ ret = min(ret, n + 1 - stdio->input_buf.buf);
+ stdio->input_buf.pos -= ret;
+ memcpy(buf, stdio->input_buf.buf, ret);
+ memmove(stdio->input_buf.buf,
+ stdio->input_buf.buf + ret,
+ stdio->input_buf.pos);
+ spin_unlock(&stdio->input_lock);
+
+ wake_up(&stdio->input_wait);
+ return ret;
+}
+
+#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h
new file mode 100644
index 0000000000..05879c5048
--- /dev/null
+++ b/fs/bcachefs/thread_with_file.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_THREAD_WITH_FILE_H
+#define _BCACHEFS_THREAD_WITH_FILE_H
+
+#include "thread_with_file_types.h"
+
+struct task_struct;
+
+struct thread_with_file {
+ struct task_struct *task;
+ int ret;
+ bool done;
+};
+
+void bch2_thread_with_file_exit(struct thread_with_file *);
+int bch2_run_thread_with_file(struct thread_with_file *,
+ const struct file_operations *,
+ int (*fn)(void *));
+
+struct thread_with_stdio {
+ struct thread_with_file thr;
+ struct stdio_redirect stdio;
+ DARRAY(char) output2;
+ void (*exit)(struct thread_with_stdio *);
+};
+
+static inline void thread_with_stdio_done(struct thread_with_stdio *thr)
+{
+ thr->thr.done = true;
+ thr->stdio.done = true;
+ wake_up(&thr->stdio.input_wait);
+ wake_up(&thr->stdio.output_wait);
+}
+
+int bch2_run_thread_with_stdio(struct thread_with_stdio *,
+ void (*exit)(struct thread_with_stdio *),
+ int (*fn)(void *));
+int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t);
+int bch2_stdio_redirect_readline(struct stdio_redirect *, char *, size_t);
+
+#endif /* _BCACHEFS_THREAD_WITH_FILE_H */
diff --git a/fs/bcachefs/thread_with_file_types.h b/fs/bcachefs/thread_with_file_types.h
new file mode 100644
index 0000000000..90b5e645e9
--- /dev/null
+++ b/fs/bcachefs/thread_with_file_types.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H
+#define _BCACHEFS_THREAD_WITH_FILE_TYPES_H
+
+struct stdio_redirect {
+ spinlock_t output_lock;
+ wait_queue_head_t output_wait;
+ struct printbuf output_buf;
+
+ spinlock_t input_lock;
+ wait_queue_head_t input_wait;
+ struct printbuf input_buf;
+ bool done;
+};
+
+#endif /* _BCACHEFS_THREAD_WITH_FILE_TYPES_H */
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index fd49b63562..293b90d704 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -32,22 +32,68 @@ DECLARE_EVENT_CLASS(bpos,
TP_printk("%llu:%llu:%u", __entry->p_inode, __entry->p_offset, __entry->p_snapshot)
);
-DECLARE_EVENT_CLASS(bkey,
- TP_PROTO(struct bch_fs *c, const char *k),
- TP_ARGS(c, k),
+DECLARE_EVENT_CLASS(fs_str,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str),
TP_STRUCT__entry(
- __string(k, k )
+ __field(dev_t, dev )
+ __string(str, str )
),
TP_fast_assign(
- __assign_str(k, k);
+ __entry->dev = c->dev;
+ __assign_str(str, str);
),
- TP_printk("%s", __get_str(k))
+ TP_printk("%d,%d\n%s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str))
);
-DECLARE_EVENT_CLASS(btree_node,
+DECLARE_EVENT_CLASS(trans_str,
+ TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *str),
+ TP_ARGS(trans, caller_ip, str),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __string(str, str )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = trans->c->dev;
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __assign_str(str, str);
+ ),
+
+ TP_printk("%d,%d %s %pS %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->trans_fn, (void *) __entry->caller_ip, __get_str(str))
+);
+
+DECLARE_EVENT_CLASS(trans_str_nocaller,
+ TP_PROTO(struct btree_trans *trans, const char *str),
+ TP_ARGS(trans, str),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __array(char, trans_fn, 32 )
+ __string(str, str )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = trans->c->dev;
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __assign_str(str, str);
+ ),
+
+ TP_printk("%d,%d %s %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->trans_fn, __get_str(str))
+);
+
+DECLARE_EVENT_CLASS(btree_node_nofs,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b),
@@ -72,6 +118,33 @@ DECLARE_EVENT_CLASS(btree_node,
__entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot)
);
+DECLARE_EVENT_CLASS(btree_node,
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __array(char, trans_fn, 32 )
+ __field(u8, level )
+ __field(u8, btree_id )
+ TRACE_BPOS_entries(pos)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = trans->c->dev;
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->level = b->c.level;
+ __entry->btree_id = b->c.btree_id;
+ TRACE_BPOS_assign(pos, b->key.k.p);
+ ),
+
+ TP_printk("%d,%d %s %u %s %llu:%llu:%u",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn,
+ __entry->level,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot)
+);
+
DECLARE_EVENT_CLASS(bch_fs,
TP_PROTO(struct bch_fs *c),
TP_ARGS(c),
@@ -87,6 +160,23 @@ DECLARE_EVENT_CLASS(bch_fs,
TP_printk("%d,%d", MAJOR(__entry->dev), MINOR(__entry->dev))
);
+DECLARE_EVENT_CLASS(btree_trans,
+ TP_PROTO(struct btree_trans *trans),
+ TP_ARGS(trans),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __array(char, trans_fn, 32 )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = trans->c->dev;
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ ),
+
+ TP_printk("%d,%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn)
+);
+
DECLARE_EVENT_CLASS(bio,
TP_PROTO(struct bio *bio),
TP_ARGS(bio),
@@ -183,9 +273,14 @@ DEFINE_EVENT(bch_fs, journal_full,
TP_ARGS(c)
);
-DEFINE_EVENT(bch_fs, journal_entry_full,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
+DEFINE_EVENT(fs_str, journal_entry_full,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
+
+DEFINE_EVENT(fs_str, journal_entry_close,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
DEFINE_EVENT(bio, journal_write,
@@ -286,36 +381,36 @@ TRACE_EVENT(btree_cache_scan,
__entry->nr_to_scan, __entry->can_free, __entry->ret)
);
-DEFINE_EVENT(btree_node, btree_cache_reap,
+DEFINE_EVENT(btree_node_nofs, btree_cache_reap,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
);
-DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock_fail,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
+DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock_fail,
+ TP_PROTO(struct btree_trans *trans),
+ TP_ARGS(trans)
);
-DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
+DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock,
+ TP_PROTO(struct btree_trans *trans),
+ TP_ARGS(trans)
);
-DEFINE_EVENT(bch_fs, btree_cache_cannibalize,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
+DEFINE_EVENT(btree_trans, btree_cache_cannibalize,
+ TP_PROTO(struct btree_trans *trans),
+ TP_ARGS(trans)
);
-DEFINE_EVENT(bch_fs, btree_cache_cannibalize_unlock,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
+DEFINE_EVENT(btree_trans, btree_cache_cannibalize_unlock,
+ TP_PROTO(struct btree_trans *trans),
+ TP_ARGS(trans)
);
/* Btree */
DEFINE_EVENT(btree_node, btree_node_read,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
);
TRACE_EVENT(btree_node_write,
@@ -339,13 +434,13 @@ TRACE_EVENT(btree_node_write,
);
DEFINE_EVENT(btree_node, btree_node_alloc,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
);
DEFINE_EVENT(btree_node, btree_node_free,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
);
TRACE_EVENT(btree_reserve_get_fail,
@@ -377,28 +472,28 @@ TRACE_EVENT(btree_reserve_get_fail,
);
DEFINE_EVENT(btree_node, btree_node_compact,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
);
DEFINE_EVENT(btree_node, btree_node_merge,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
);
DEFINE_EVENT(btree_node, btree_node_split,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
);
DEFINE_EVENT(btree_node, btree_node_rewrite,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
);
DEFINE_EVENT(btree_node, btree_node_set_root,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
);
TRACE_EVENT(btree_path_relock_fail,
@@ -433,7 +528,7 @@ TRACE_EVENT(btree_path_relock_fail,
__entry->level = path->level;
TRACE_BPOS_assign(pos, path->pos);
- c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level),
+ c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level);
__entry->self_read_count = c.n[SIX_LOCK_read];
__entry->self_intent_count = c.n[SIX_LOCK_intent];
@@ -717,44 +812,32 @@ TRACE_EVENT(bucket_evacuate,
__entry->dev_idx, __entry->bucket)
);
-DEFINE_EVENT(bkey, move_extent,
- TP_PROTO(struct bch_fs *c, const char *k),
- TP_ARGS(c, k)
+DEFINE_EVENT(fs_str, move_extent,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
-DEFINE_EVENT(bkey, move_extent_read,
- TP_PROTO(struct bch_fs *c, const char *k),
- TP_ARGS(c, k)
+DEFINE_EVENT(fs_str, move_extent_read,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
-DEFINE_EVENT(bkey, move_extent_write,
- TP_PROTO(struct bch_fs *c, const char *k),
- TP_ARGS(c, k)
+DEFINE_EVENT(fs_str, move_extent_write,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
-DEFINE_EVENT(bkey, move_extent_finish,
- TP_PROTO(struct bch_fs *c, const char *k),
- TP_ARGS(c, k)
+DEFINE_EVENT(fs_str, move_extent_finish,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
-TRACE_EVENT(move_extent_fail,
- TP_PROTO(struct bch_fs *c, const char *msg),
- TP_ARGS(c, msg),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __string(msg, msg )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __assign_str(msg, msg);
- ),
-
- TP_printk("%d:%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(msg))
+DEFINE_EVENT(fs_str, move_extent_fail,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
-DEFINE_EVENT(bkey, move_extent_start_fail,
+DEFINE_EVENT(fs_str, move_extent_start_fail,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
@@ -930,7 +1013,7 @@ TRACE_EVENT(trans_restart_split_race,
__entry->level = b->c.level;
__entry->written = b->written;
__entry->blocks = btree_blocks(trans->c);
- __entry->u64s_remaining = bch_btree_keys_u64s_remaining(trans->c, b);
+ __entry->u64s_remaining = bch2_btree_keys_u64s_remaining(b);
),
TP_printk("%s %pS l=%u written %u/%u u64s remaining %u",
@@ -987,10 +1070,11 @@ DEFINE_EVENT(transaction_event, trans_restart_key_cache_raced,
TP_ARGS(trans, caller_ip)
);
-DEFINE_EVENT(transaction_event, trans_restart_too_many_iters,
+DEFINE_EVENT(trans_str, trans_restart_too_many_iters,
TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
+ unsigned long caller_ip,
+ const char *paths),
+ TP_ARGS(trans, caller_ip, paths)
);
DECLARE_EVENT_CLASS(transaction_restart_iter,
@@ -1036,8 +1120,6 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split,
TP_ARGS(trans, caller_ip, path)
);
-struct get_locks_fail;
-
TRACE_EVENT(trans_restart_upgrade,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
@@ -1056,8 +1138,6 @@ TRACE_EVENT(trans_restart_upgrade,
__field(u8, level )
__field(u32, path_seq )
__field(u32, node_seq )
- __field(u32, path_alloc_seq )
- __field(u32, downgrade_seq)
TRACE_BPOS_entries(pos)
),
@@ -1070,12 +1150,10 @@ TRACE_EVENT(trans_restart_upgrade,
__entry->level = f->l;
__entry->path_seq = path->l[f->l].lock_seq;
__entry->node_seq = IS_ERR_OR_NULL(f->b) ? 0 : f->b->c.lock.seq;
- __entry->path_alloc_seq = path->alloc_seq;
- __entry->downgrade_seq = path->downgrade_seq;
TRACE_BPOS_assign(pos, path->pos)
),
- TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u alloc_seq %u downgrade_seq %u",
+ TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u",
__entry->trans_fn,
(void *) __entry->caller_ip,
bch2_btree_id_str(__entry->btree_id),
@@ -1086,16 +1164,12 @@ TRACE_EVENT(trans_restart_upgrade,
__entry->new_locks_want,
__entry->level,
__entry->path_seq,
- __entry->node_seq,
- __entry->path_alloc_seq,
- __entry->downgrade_seq)
+ __entry->node_seq)
);
-DEFINE_EVENT(transaction_restart_iter, trans_restart_relock,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path)
+DEFINE_EVENT(trans_str, trans_restart_relock,
+ TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *str),
+ TP_ARGS(trans, caller_ip, str)
);
DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node,
@@ -1160,10 +1234,10 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure,
TP_ARGS(trans, caller_ip, path)
);
-DEFINE_EVENT(transaction_event, trans_restart_would_deadlock,
+DEFINE_EVENT(trans_str_nocaller, trans_restart_would_deadlock,
TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
+ const char *cycle),
+ TP_ARGS(trans, cycle)
);
DEFINE_EVENT(transaction_event, trans_restart_would_deadlock_recursion_limit,
@@ -1252,22 +1326,37 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced,
TRACE_EVENT(path_downgrade,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path),
+ struct btree_path *path,
+ unsigned old_locks_want),
+ TP_ARGS(trans, caller_ip, path, old_locks_want),
TP_STRUCT__entry(
__array(char, trans_fn, 32 )
__field(unsigned long, caller_ip )
+ __field(unsigned, old_locks_want )
+ __field(unsigned, new_locks_want )
+ __field(unsigned, btree )
+ TRACE_BPOS_entries(pos)
),
TP_fast_assign(
strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
__entry->caller_ip = caller_ip;
+ __entry->old_locks_want = old_locks_want;
+ __entry->new_locks_want = path->locks_want;
+ __entry->btree = path->btree_id;
+ TRACE_BPOS_assign(pos, path->pos);
),
- TP_printk("%s %pS",
+ TP_printk("%s %pS locks_want %u -> %u %s %llu:%llu:%u",
__entry->trans_fn,
- (void *) __entry->caller_ip)
+ (void *) __entry->caller_ip,
+ __entry->old_locks_want,
+ __entry->new_locks_want,
+ bch2_btree_id_str(__entry->btree),
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot)
);
DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush,
@@ -1298,21 +1387,48 @@ TRACE_EVENT(write_buffer_flush,
__entry->nr, __entry->size, __entry->skipped, __entry->fast)
);
+TRACE_EVENT(write_buffer_flush_sync,
+ TP_PROTO(struct btree_trans *trans, unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ ),
+
+ TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip)
+);
+
TRACE_EVENT(write_buffer_flush_slowpath,
- TP_PROTO(struct btree_trans *trans, size_t nr, size_t size),
- TP_ARGS(trans, nr, size),
+ TP_PROTO(struct btree_trans *trans, size_t slowpath, size_t total),
+ TP_ARGS(trans, slowpath, total),
TP_STRUCT__entry(
- __field(size_t, nr )
- __field(size_t, size )
+ __field(size_t, slowpath )
+ __field(size_t, total )
),
TP_fast_assign(
- __entry->nr = nr;
- __entry->size = size;
+ __entry->slowpath = slowpath;
+ __entry->total = total;
),
- TP_printk("%zu/%zu", __entry->nr, __entry->size)
+ TP_printk("%zu/%zu", __entry->slowpath, __entry->total)
+);
+
+DEFINE_EVENT(fs_str, rebalance_extent,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
+
+DEFINE_EVENT(fs_str, data_update,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
#endif /* _TRACE_BCACHEFS_H */
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index 3b7c349f26..3a32faa86b 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -241,12 +241,17 @@ bool bch2_is_zero(const void *_p, size_t n)
return true;
}
-void bch2_prt_u64_binary(struct printbuf *out, u64 v, unsigned nr_bits)
+void bch2_prt_u64_base2_nbits(struct printbuf *out, u64 v, unsigned nr_bits)
{
while (nr_bits)
prt_char(out, '0' + ((v >> --nr_bits) & 1));
}
+void bch2_prt_u64_base2(struct printbuf *out, u64 v)
+{
+ bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1);
+}
+
void bch2_print_string_as_lines(const char *prefix, const char *lines)
{
const char *p;
@@ -267,14 +272,14 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines)
console_unlock();
}
-int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task)
+int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr,
+ gfp_t gfp)
{
#ifdef CONFIG_STACKTRACE
unsigned nr_entries = 0;
- int ret = 0;
stack->nr = 0;
- ret = darray_make_room(stack, 32);
+ int ret = darray_make_room_gfp(stack, 32, gfp);
if (ret)
return ret;
@@ -282,9 +287,9 @@ int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task)
return -1;
do {
- nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, 0);
+ nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, skipnr + 1);
} while (nr_entries == stack->size &&
- !(ret = darray_make_room(stack, stack->size * 2)));
+ !(ret = darray_make_room_gfp(stack, stack->size * 2, gfp)));
stack->nr = nr_entries;
up_read(&task->signal->exec_update_lock);
@@ -297,24 +302,74 @@ int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task)
void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack)
{
- unsigned long *i;
-
darray_for_each(*stack, i) {
prt_printf(out, "[<0>] %pB", (void *) *i);
prt_newline(out);
}
}
-int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task)
+int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task, unsigned skipnr, gfp_t gfp)
{
bch_stacktrace stack = { 0 };
- int ret = bch2_save_backtrace(&stack, task);
+ int ret = bch2_save_backtrace(&stack, task, skipnr + 1, gfp);
bch2_prt_backtrace(out, &stack);
darray_exit(&stack);
return ret;
}
+#ifndef __KERNEL__
+#include <time.h>
+void bch2_prt_datetime(struct printbuf *out, time64_t sec)
+{
+ time_t t = sec;
+ char buf[64];
+ ctime_r(&t, buf);
+ strim(buf);
+ prt_str(out, buf);
+}
+#else
+void bch2_prt_datetime(struct printbuf *out, time64_t sec)
+{
+ char buf[64];
+ snprintf(buf, sizeof(buf), "%ptT", &sec);
+ prt_u64(out, sec);
+}
+#endif
+
+static const struct time_unit {
+ const char *name;
+ u64 nsecs;
+} time_units[] = {
+ { "ns", 1 },
+ { "us", NSEC_PER_USEC },
+ { "ms", NSEC_PER_MSEC },
+ { "s", NSEC_PER_SEC },
+ { "m", (u64) NSEC_PER_SEC * 60},
+ { "h", (u64) NSEC_PER_SEC * 3600},
+ { "eon", U64_MAX },
+};
+
+static const struct time_unit *pick_time_units(u64 ns)
+{
+ const struct time_unit *u;
+
+ for (u = time_units;
+ u + 1 < time_units + ARRAY_SIZE(time_units) &&
+ ns >= u[1].nsecs << 1;
+ u++)
+ ;
+
+ return u;
+}
+
+void bch2_pr_time_units(struct printbuf *out, u64 ns)
+{
+ const struct time_unit *u = pick_time_units(ns);
+
+ prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
+}
+
/* time stats: */
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
@@ -359,6 +414,7 @@ static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration);
stats->max_duration = max(stats->max_duration, duration);
stats->min_duration = min(stats->min_duration, duration);
+ stats->total_duration += duration;
bch2_quantiles_update(&stats->quantiles, duration);
}
@@ -373,29 +429,33 @@ static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
stats->last_event = end;
}
+static void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
+ struct bch2_time_stat_buffer *b)
+{
+ for (struct bch2_time_stat_buffer_entry *i = b->entries;
+ i < b->entries + ARRAY_SIZE(b->entries);
+ i++)
+ bch2_time_stats_update_one(stats, i->start, i->end);
+ b->nr = 0;
+}
+
static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
struct bch2_time_stat_buffer *b)
{
- struct bch2_time_stat_buffer_entry *i;
unsigned long flags;
spin_lock_irqsave(&stats->lock, flags);
- for (i = b->entries;
- i < b->entries + ARRAY_SIZE(b->entries);
- i++)
- bch2_time_stats_update_one(stats, i->start, i->end);
+ __bch2_time_stats_clear_buffer(stats, b);
spin_unlock_irqrestore(&stats->lock, flags);
-
- b->nr = 0;
}
void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
{
unsigned long flags;
- WARN_RATELIMIT(!stats->min_duration || !stats->min_freq,
- "time_stats: min_duration = %llu, min_freq = %llu",
- stats->min_duration, stats->min_freq);
+ WARN_ONCE(!stats->duration_stats_weighted.weight ||
+ !stats->freq_stats_weighted.weight,
+ "uninitialized time_stats");
if (!stats->buffer) {
spin_lock_irqsave(&stats->lock, flags);
@@ -424,40 +484,6 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
preempt_enable();
}
}
-#endif
-
-static const struct time_unit {
- const char *name;
- u64 nsecs;
-} time_units[] = {
- { "ns", 1 },
- { "us", NSEC_PER_USEC },
- { "ms", NSEC_PER_MSEC },
- { "s", NSEC_PER_SEC },
- { "m", (u64) NSEC_PER_SEC * 60},
- { "h", (u64) NSEC_PER_SEC * 3600},
- { "eon", U64_MAX },
-};
-
-static const struct time_unit *pick_time_units(u64 ns)
-{
- const struct time_unit *u;
-
- for (u = time_units;
- u + 1 < time_units + ARRAY_SIZE(time_units) &&
- ns >= u[1].nsecs << 1;
- u++)
- ;
-
- return u;
-}
-
-void bch2_pr_time_units(struct printbuf *out, u64 ns)
-{
- const struct time_unit *u = pick_time_units(ns);
-
- prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
-}
static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
{
@@ -468,26 +494,6 @@ static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
prt_printf(out, "%s", u->name);
}
-#ifndef __KERNEL__
-#include <time.h>
-void bch2_prt_datetime(struct printbuf *out, time64_t sec)
-{
- time_t t = sec;
- char buf[64];
- ctime_r(&t, buf);
- prt_str(out, buf);
-}
-#else
-void bch2_prt_datetime(struct printbuf *out, time64_t sec)
-{
- char buf[64];
- snprintf(buf, sizeof(buf), "%ptT", &sec);
- prt_u64(out, sec);
-}
-#endif
-
-#define TABSTOP_SIZE 12
-
static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns)
{
prt_str(out, name);
@@ -496,12 +502,24 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64
prt_newline(out);
}
+#define TABSTOP_SIZE 12
+
void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats)
{
const struct time_unit *u;
s64 f_mean = 0, d_mean = 0;
u64 q, last_q = 0, f_stddev = 0, d_stddev = 0;
int i;
+
+ if (stats->buffer) {
+ int cpu;
+
+ spin_lock_irq(&stats->lock);
+ for_each_possible_cpu(cpu)
+ __bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
+ spin_unlock_irq(&stats->lock);
+ }
+
/*
* avoid divide by zero
*/
@@ -547,6 +565,7 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
pr_name_and_units(out, "min:", stats->min_duration);
pr_name_and_units(out, "max:", stats->max_duration);
+ pr_name_and_units(out, "total:", stats->total_duration);
prt_printf(out, "mean:");
prt_tab(out);
@@ -604,6 +623,9 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
last_q = q;
}
}
+#else
+void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) {}
+#endif
void bch2_time_stats_exit(struct bch2_time_stats *stats)
{
@@ -1158,3 +1180,39 @@ u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr)
return ret;
}
+
+void bch2_darray_str_exit(darray_str *d)
+{
+ darray_for_each(*d, i)
+ kfree(*i);
+ darray_exit(d);
+}
+
+int bch2_split_devs(const char *_dev_name, darray_str *ret)
+{
+ darray_init(ret);
+
+ char *dev_name, *s, *orig;
+
+ dev_name = orig = kstrdup(_dev_name, GFP_KERNEL);
+ if (!dev_name)
+ return -ENOMEM;
+
+ while ((s = strsep(&dev_name, ":"))) {
+ char *p = kstrdup(s, GFP_KERNEL);
+ if (!p)
+ goto err;
+
+ if (darray_push(ret, p)) {
+ kfree(p);
+ goto err;
+ }
+ }
+
+ kfree(orig);
+ return 0;
+err:
+ bch2_darray_str_exit(ret);
+ kfree(orig);
+ return -ENOMEM;
+}
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index b701f7fe07..b414736d59 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -342,14 +342,24 @@ bool bch2_is_zero(const void *, size_t);
u64 bch2_read_flag_list(char *, const char * const[]);
-void bch2_prt_u64_binary(struct printbuf *, u64, unsigned);
+void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned);
+void bch2_prt_u64_base2(struct printbuf *, u64);
void bch2_print_string_as_lines(const char *prefix, const char *lines);
typedef DARRAY(unsigned long) bch_stacktrace;
-int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *);
+int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t);
void bch2_prt_backtrace(struct printbuf *, bch_stacktrace *);
-int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *);
+int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *, unsigned, gfp_t);
+
+static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev)
+{
+#ifdef __KERNEL__
+ prt_printf(out, "%pg", bdev);
+#else
+ prt_str(out, bdev->name);
+#endif
+}
#define NR_QUANTILES 15
#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES)
@@ -374,8 +384,9 @@ struct bch2_time_stat_buffer {
struct bch2_time_stats {
spinlock_t lock;
/* all fields are in nanoseconds */
- u64 max_duration;
u64 min_duration;
+ u64 max_duration;
+ u64 total_duration;
u64 max_freq;
u64 min_freq;
u64 last_event;
@@ -390,15 +401,39 @@ struct bch2_time_stats {
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
-#else
-static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {}
-#endif
static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
{
__bch2_time_stats_update(stats, start, local_clock());
}
+static inline bool track_event_change(struct bch2_time_stats *stats,
+ u64 *start, bool v)
+{
+ if (v != !!*start) {
+ if (!v) {
+ bch2_time_stats_update(stats, *start);
+ *start = 0;
+ } else {
+ *start = local_clock() ?: 1;
+ return true;
+ }
+ }
+
+ return false;
+}
+#else
+static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {}
+static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) {}
+static inline bool track_event_change(struct bch2_time_stats *stats,
+ u64 *start, bool v)
+{
+ bool ret = v && !*start;
+ *start = v;
+ return ret;
+}
+#endif
+
void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *);
void bch2_time_stats_exit(struct bch2_time_stats *);
@@ -831,4 +866,14 @@ static inline int cmp_le32(__le32 l, __le32 r)
#include <linux/uuid.h>
+#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
+
+static inline bool qstr_eq(const struct qstr l, const struct qstr r)
+{
+ return l.len == r.len && !memcmp(l.name, r.name, l.len);
+}
+
+void bch2_darray_str_exit(darray_str *);
+int bch2_split_devs(const char *, darray_str *);
+
#endif /* _BCACHEFS_UTIL_H */
diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h
index a6561b4b36..2ad338e282 100644
--- a/fs/bcachefs/vstructs.h
+++ b/fs/bcachefs/vstructs.h
@@ -48,14 +48,14 @@
((void *) ((u64 *) (_s)->_data + __vstruct_u64s(_s)))
#define vstruct_for_each(_s, _i) \
- for (_i = (_s)->start; \
+ for (typeof(&(_s)->start[0]) _i = (_s)->start; \
_i < vstruct_last(_s); \
_i = vstruct_next(_i))
-#define vstruct_for_each_safe(_s, _i, _t) \
- for (_i = (_s)->start; \
- _i < vstruct_last(_s) && (_t = vstruct_next(_i), true); \
- _i = _t)
+#define vstruct_for_each_safe(_s, _i) \
+ for (typeof(&(_s)->start[0]) _next, _i = (_s)->start; \
+ _i < vstruct_last(_s) && (_next = vstruct_next(_i), true); \
+ _i = _next)
#define vstruct_idx(_s, _idx) \
((typeof(&(_s)->start[0])) ((_s)->_data + (_idx)))
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index 5a1858fb98..9c0d231603 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -590,8 +590,9 @@ err:
mutex_unlock(&inode->ei_update_lock);
if (value &&
- (opt_id == Opt_background_compression ||
- opt_id == Opt_background_target))
+ (opt_id == Opt_background_target ||
+ opt_id == Opt_background_compression ||
+ (opt_id == Opt_compression && !inode_opt_get(c, &inode->ei_inode, background_compression))))
bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum);
return bch2_err_class(ret);
diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h
new file mode 100644
index 0000000000..e9f8105395
--- /dev/null
+++ b/fs/bcachefs/xattr_format.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_XATTR_FORMAT_H
+#define _BCACHEFS_XATTR_FORMAT_H
+
+#define KEY_TYPE_XATTR_INDEX_USER 0
+#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1
+#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2
+#define KEY_TYPE_XATTR_INDEX_TRUSTED 3
+#define KEY_TYPE_XATTR_INDEX_SECURITY 4
+
+struct bch_xattr {
+ struct bch_val v;
+ __u8 x_type;
+ __u8 x_name_len;
+ __le16 x_val_len;
+ __u8 x_name[];
+} __packed __aligned(8);
+
+#endif /* _BCACHEFS_XATTR_FORMAT_H */