diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-06-19 21:00:51 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-06-19 21:00:51 +0000 |
commit | 6d03a247468059b0e59c821ef39e6762d4d6fc30 (patch) | |
tree | 17b9c00de2c62e68c965c742cdbc206f77a375da /fs/bcachefs/btree_update_interior.c | |
parent | Releasing progress-linux version 6.8.12-1~progress7.99u1. (diff) | |
download | linux-6d03a247468059b0e59c821ef39e6762d4d6fc30.tar.xz linux-6d03a247468059b0e59c821ef39e6762d4d6fc30.zip |
Merging upstream version 6.9.2.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'fs/bcachefs/btree_update_interior.c')
-rw-r--r-- | fs/bcachefs/btree_update_interior.c | 498 |
1 files changed, 343 insertions, 155 deletions
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 4530b14ff2..b4efd8cc4d 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" +#include "bkey_buf.h" #include "bkey_methods.h" #include "btree_cache.h" #include "btree_gc.h" @@ -18,15 +19,23 @@ #include "journal.h" #include "journal_reclaim.h" #include "keylist.h" +#include "recovery_passes.h" #include "replicas.h" +#include "sb-members.h" #include "super-io.h" #include "trace.h" #include <linux/random.h> +static const char * const bch2_btree_update_modes[] = { +#define x(t) #t, + BTREE_UPDATE_MODES() +#undef x + NULL +}; + static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, - btree_path_idx_t, struct btree *, - struct keylist *, unsigned); + btree_path_idx_t, struct btree *, struct keylist *); static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans, @@ -45,56 +54,103 @@ static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans, return path_idx; } -/* Debug code: */ - /* * Verify that child nodes correctly span parent node's range: */ -static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) +int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) { -#ifdef CONFIG_BCACHEFS_DEBUG - struct bpos next_node = b->data->min_key; - struct btree_node_iter iter; + struct bch_fs *c = trans->c; + struct bpos node_min = b->key.k.type == KEY_TYPE_btree_ptr_v2 + ? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key + : b->data->min_key; + struct btree_and_journal_iter iter; struct bkey_s_c k; - struct bkey_s_c_btree_ptr_v2 bp; - struct bkey unpacked; - struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + struct printbuf buf = PRINTBUF; + struct bkey_buf prev; + int ret = 0; - BUG_ON(!b->c.level); + BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && + !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, + b->data->min_key)); - if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) - return; + if (!b->c.level) + return 0; - bch2_btree_node_iter_init_from_start(&iter, b); + bch2_bkey_buf_init(&prev); + bkey_init(&prev.k->k); + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); - while (1) { - k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked); + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { if (k.k->type != KEY_TYPE_btree_ptr_v2) - break; - bp = bkey_s_c_to_btree_ptr_v2(k); + goto out; - if (!bpos_eq(next_node, bp.v->min_key)) { - bch2_dump_btree_node(c, b); - bch2_bpos_to_text(&buf1, next_node); - bch2_bpos_to_text(&buf2, bp.v->min_key); - panic("expected next min_key %s got %s\n", buf1.buf, buf2.buf); - } + struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - bch2_btree_node_iter_advance(&iter, b); + struct bpos expected_min = bkey_deleted(&prev.k->k) + ? node_min + : bpos_successor(prev.k->k.p); - if (bch2_btree_node_iter_end(&iter)) { - if (!bpos_eq(k.k->p, b->key.k.p)) { - bch2_dump_btree_node(c, b); - bch2_bpos_to_text(&buf1, b->key.k.p); - bch2_bpos_to_text(&buf2, k.k->p); - panic("expected end %s got %s\n", buf1.buf, buf2.buf); - } - break; + if (!bpos_eq(expected_min, bp.v->min_key)) { + bch2_topology_error(c); + + printbuf_reset(&buf); + prt_str(&buf, "end of prev node doesn't match start of next node\n"), + prt_printf(&buf, " in btree %s level %u node ", + bch2_btree_id_str(b->c.btree_id), b->c.level); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_str(&buf, "\n prev "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); + prt_str(&buf, "\n next "); + bch2_bkey_val_to_text(&buf, c, k); + + need_fsck_err(c, btree_node_topology_bad_min_key, "%s", buf.buf); + goto topology_repair; } - next_node = bpos_successor(k.k->p); + bch2_bkey_buf_reassemble(&prev, c, k); + bch2_btree_and_journal_iter_advance(&iter); + } + + if (bkey_deleted(&prev.k->k)) { + bch2_topology_error(c); + + printbuf_reset(&buf); + prt_str(&buf, "empty interior node\n"); + prt_printf(&buf, " in btree %s level %u node ", + bch2_btree_id_str(b->c.btree_id), b->c.level); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + + need_fsck_err(c, btree_node_topology_empty_interior_node, "%s", buf.buf); + goto topology_repair; + } else if (!bpos_eq(prev.k->k.p, b->key.k.p)) { + bch2_topology_error(c); + + printbuf_reset(&buf); + prt_str(&buf, "last child node doesn't end at end of parent node\n"); + prt_printf(&buf, " in btree %s level %u node ", + bch2_btree_id_str(b->c.btree_id), b->c.level); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_str(&buf, "\n last key "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); + + need_fsck_err(c, btree_node_topology_bad_max_key, "%s", buf.buf); + goto topology_repair; } -#endif +out: +fsck_err: + bch2_btree_and_journal_iter_exit(&iter); + bch2_bkey_buf_exit(&prev, c); + printbuf_exit(&buf); + return ret; +topology_repair: + if ((c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) && + c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) { + bch2_inconsistent_error(c); + ret = -BCH_ERR_btree_need_topology_repair; + } else { + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); + } + goto out; } /* Calculate ideal packed bkey format for new btree nodes: */ @@ -255,7 +311,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, struct open_buckets obs = { .nr = 0 }; struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; - unsigned nr_reserve = watermark > BCH_WATERMARK_reclaim + unsigned nr_reserve = watermark < BCH_WATERMARK_reclaim ? BTREE_NODE_RESERVE : 0; int ret; @@ -550,6 +606,26 @@ static void btree_update_add_key(struct btree_update *as, bch2_keylist_push(keys); } +static bool btree_update_new_nodes_marked_sb(struct btree_update *as) +{ + for_each_keylist_key(&as->new_keys, k) + if (!bch2_dev_btree_bitmap_marked(as->c, bkey_i_to_s_c(k))) + return false; + return true; +} + +static void btree_update_new_nodes_mark_sb(struct btree_update *as) +{ + struct bch_fs *c = as->c; + + mutex_lock(&c->sb_lock); + for_each_keylist_key(&as->new_keys, k) + bch2_dev_btree_bitmap_mark(c, bkey_i_to_s_c(k)); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); +} + /* * The transactional part of an interior btree node update, where we journal the * update we did to the interior node and update alloc info: @@ -607,6 +683,9 @@ static void btree_update_nodes_written(struct btree_update *as) if (ret) goto err; + if (!btree_update_new_nodes_marked_sb(as)) + btree_update_new_nodes_mark_sb(as); + /* * Wait for any in flight writes to finish before we free the old nodes * on disk: @@ -639,7 +718,7 @@ static void btree_update_nodes_written(struct btree_update *as) * which may require allocations as well. */ ret = commit_do(trans, &as->disk_res, &journal_seq, - BCH_WATERMARK_reclaim| + BCH_WATERMARK_interior_updates| BCH_TRANS_COMMIT_no_enospc| BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_journal_reclaim, @@ -647,11 +726,15 @@ static void btree_update_nodes_written(struct btree_update *as) bch2_trans_unlock(trans); bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c, - "%s(): error %s", __func__, bch2_err_str(ret)); + "%s", bch2_err_str(ret)); err: - if (as->b) { - - b = as->b; + /* + * We have to be careful because another thread might be getting ready + * to free as->b and calling btree_update_reparent() on us - we'll + * recheck under btree_update_lock below: + */ + b = READ_ONCE(as->b); + if (b) { btree_path_idx_t path_idx = get_unlocked_mut_path(trans, as->btree_id, b->c.level, b->key.k.p); struct btree_path *path = trans->paths + path_idx; @@ -795,15 +878,17 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b) { struct bch_fs *c = as->c; - mutex_lock(&c->btree_interior_update_lock); - list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); - - BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); + BUG_ON(as->mode != BTREE_UPDATE_none); + BUG_ON(as->update_level_end < b->c.level); BUG_ON(!btree_node_dirty(b)); BUG_ON(!b->c.level); - as->mode = BTREE_INTERIOR_UPDATING_NODE; + mutex_lock(&c->btree_interior_update_lock); + list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); + + as->mode = BTREE_UPDATE_node; as->b = b; + as->update_level_end = b->c.level; set_btree_node_write_blocked(b); list_add(&as->write_blocked_list, &b->write_blocked); @@ -825,7 +910,7 @@ static void btree_update_reparent(struct btree_update *as, lockdep_assert_held(&c->btree_interior_update_lock); child->b = NULL; - child->mode = BTREE_INTERIOR_UPDATING_AS; + child->mode = BTREE_UPDATE_update; bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, bch2_update_reparent_journal_pin_flush); @@ -836,7 +921,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b) struct bkey_i *insert = &b->key; struct bch_fs *c = as->c; - BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); + BUG_ON(as->mode != BTREE_UPDATE_none); BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > ARRAY_SIZE(as->journal_entries)); @@ -850,7 +935,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b) mutex_lock(&c->btree_interior_update_lock); list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); - as->mode = BTREE_INTERIOR_UPDATING_ROOT; + as->mode = BTREE_UPDATE_root; mutex_unlock(&c->btree_interior_update_lock); } @@ -1028,7 +1113,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans * struct bch_fs *c = as->c; u64 start_time = as->start_time; - BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE); + BUG_ON(as->mode == BTREE_UPDATE_none); if (as->took_gc_lock) up_read(&as->c->gc_lock); @@ -1045,7 +1130,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans * static struct btree_update * bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, - unsigned level, bool split, unsigned flags) + unsigned level_start, bool split, unsigned flags) { struct bch_fs *c = trans->c; struct btree_update *as; @@ -1053,7 +1138,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc) ? BCH_DISK_RESERVATION_NOFAIL : 0; unsigned nr_nodes[2] = { 0, 0 }; - unsigned update_level = level; + unsigned level_end = level_start; enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; int ret = 0; u32 restart_count = trans->restart_count; @@ -1068,29 +1153,30 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, flags &= ~BCH_WATERMARK_MASK; flags |= watermark; - if (!(flags & BCH_TRANS_COMMIT_journal_reclaim) && - watermark < c->journal.watermark) { - struct journal_res res = { 0 }; + if (watermark < BCH_WATERMARK_reclaim && + test_bit(JOURNAL_SPACE_LOW, &c->journal.flags)) { + if (flags & BCH_TRANS_COMMIT_journal_reclaim) + return ERR_PTR(-BCH_ERR_journal_reclaim_would_deadlock); - ret = drop_locks_do(trans, - bch2_journal_res_get(&c->journal, &res, 1, - watermark|JOURNAL_RES_GET_CHECK)); + bch2_trans_unlock(trans); + wait_event(c->journal.wait, !test_bit(JOURNAL_SPACE_LOW, &c->journal.flags)); + ret = bch2_trans_relock(trans); if (ret) return ERR_PTR(ret); } while (1) { - nr_nodes[!!update_level] += 1 + split; - update_level++; + nr_nodes[!!level_end] += 1 + split; + level_end++; - ret = bch2_btree_path_upgrade(trans, path, update_level + 1); + ret = bch2_btree_path_upgrade(trans, path, level_end + 1); if (ret) return ERR_PTR(ret); - if (!btree_path_node(path, update_level)) { + if (!btree_path_node(path, level_end)) { /* Allocating new root? */ nr_nodes[1] += split; - update_level = BTREE_MAX_DEPTH; + level_end = BTREE_MAX_DEPTH; break; } @@ -1098,11 +1184,11 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, * Always check for space for two keys, even if we won't have to * split at prior level - it might have been a merge instead: */ - if (bch2_btree_node_insert_fits(path->l[update_level].b, + if (bch2_btree_node_insert_fits(path->l[level_end].b, BKEY_BTREE_PTR_U64s_MAX * 2)) break; - split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c); + split = path->l[level_end].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c); } if (!down_read_trylock(&c->gc_lock)) { @@ -1116,12 +1202,15 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS); memset(as, 0, sizeof(*as)); closure_init(&as->cl, NULL); - as->c = c; - as->start_time = start_time; - as->mode = BTREE_INTERIOR_NO_UPDATE; - as->took_gc_lock = true; - as->btree_id = path->btree_id; - as->update_level = update_level; + as->c = c; + as->start_time = start_time; + as->ip_started = _RET_IP_; + as->mode = BTREE_UPDATE_none; + as->watermark = watermark; + as->took_gc_lock = true; + as->btree_id = path->btree_id; + as->update_level_start = level_start; + as->update_level_end = level_end; INIT_LIST_HEAD(&as->list); INIT_LIST_HEAD(&as->unwritten_list); INIT_LIST_HEAD(&as->write_blocked_list); @@ -1163,7 +1252,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, */ if (bch2_err_matches(ret, ENOSPC) && (flags & BCH_TRANS_COMMIT_journal_reclaim) && - watermark != BCH_WATERMARK_reclaim) { + watermark < BCH_WATERMARK_reclaim) { ret = -BCH_ERR_journal_reclaim_would_deadlock; goto err; } @@ -1193,7 +1282,8 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, err: bch2_btree_update_free(as, trans); if (!bch2_err_matches(ret, ENOSPC) && - !bch2_err_matches(ret, EROFS)) + !bch2_err_matches(ret, EROFS) && + ret != -BCH_ERR_journal_reclaim_would_deadlock) bch_err_fn_ratelimited(c, ret); return ERR_PTR(ret); } @@ -1208,33 +1298,35 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) mutex_unlock(&c->btree_cache.lock); mutex_lock(&c->btree_root_lock); - BUG_ON(btree_node_root(c, b) && - (b->c.level < btree_node_root(c, b)->c.level || - !btree_node_dying(btree_node_root(c, b)))); - bch2_btree_id_root(c, b->c.btree_id)->b = b; mutex_unlock(&c->btree_root_lock); bch2_recalc_btree_reserve(c); } -static void bch2_btree_set_root(struct btree_update *as, - struct btree_trans *trans, - struct btree_path *path, - struct btree *b) +static int bch2_btree_set_root(struct btree_update *as, + struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + bool nofail) { struct bch_fs *c = as->c; - struct btree *old; trace_and_count(c, btree_node_set_root, trans, b); - old = btree_node_root(c, b); + struct btree *old = btree_node_root(c, b); /* * Ensure no one is using the old root while we switch to the * new root: */ - bch2_btree_node_lock_write_nofail(trans, path, &old->c); + if (nofail) { + bch2_btree_node_lock_write_nofail(trans, path, &old->c); + } else { + int ret = bch2_btree_node_lock_write(trans, path, &old->c); + if (ret) + return ret; + } bch2_btree_set_root_inmem(c, b); @@ -1248,6 +1340,7 @@ static void bch2_btree_set_root(struct btree_update *as, * depend on the new root would have to update the new root. */ bch2_btree_node_unlock_write(trans, path, old); + return 0; } /* Interior node updates: */ @@ -1314,12 +1407,12 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, } static void -__bch2_btree_insert_keys_interior(struct btree_update *as, - struct btree_trans *trans, - struct btree_path *path, - struct btree *b, - struct btree_node_iter node_iter, - struct keylist *keys) +bch2_btree_insert_keys_interior(struct btree_update *as, + struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + struct btree_node_iter node_iter, + struct keylist *keys) { struct bkey_i *insert = bch2_keylist_front(keys); struct bkey_packed *k; @@ -1378,9 +1471,16 @@ static void __btree_split_node(struct btree_update *as, if (bkey_deleted(k)) continue; + uk = bkey_unpack_key(b, k); + + if (b->c.level && + u64s < n1_u64s && + u64s + k->u64s >= n1_u64s && + bch2_key_deleted_in_journal(trans, b->c.btree_id, b->c.level, uk.p)) + n1_u64s += k->u64s; + i = u64s >= n1_u64s; u64s += k->u64s; - uk = bkey_unpack_key(b, k); if (!i) n1_pos = uk.p; bch2_bkey_format_add_key(&format[i], &uk); @@ -1439,8 +1539,7 @@ static void __btree_split_node(struct btree_update *as, bch2_verify_btree_nr_keys(n[i]); - if (b->c.level) - btree_node_interior_verify(as->c, n[i]); + BUG_ON(bch2_btree_node_check_topology(trans, n[i])); } } @@ -1469,15 +1568,15 @@ static void btree_split_insert_keys(struct btree_update *as, bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p); - __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); + bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); - btree_node_interior_verify(as->c, b); + BUG_ON(bch2_btree_node_check_topology(trans, b)); } } static int btree_split(struct btree_update *as, struct btree_trans *trans, btree_path_idx_t path, struct btree *b, - struct keylist *keys, unsigned flags) + struct keylist *keys) { struct bch_fs *c = as->c; struct btree *parent = btree_node_parent(trans->paths + path, b); @@ -1486,9 +1585,14 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, u64 start_time = local_clock(); int ret = 0; + bch2_verify_btree_nr_keys(b); BUG_ON(!parent && (b != btree_node_root(c, b))); BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1)); + ret = bch2_btree_node_check_topology(trans, b); + if (ret) + return ret; + bch2_btree_interior_update_will_free_node(as, b); if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) { @@ -1578,16 +1682,17 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, if (parent) { /* Split a non root node */ - ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); - if (ret) - goto err; + ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys); } else if (n3) { - bch2_btree_set_root(as, trans, trans->paths + path, n3); + ret = bch2_btree_set_root(as, trans, trans->paths + path, n3, false); } else { /* Root filled up but didn't need to be split */ - bch2_btree_set_root(as, trans, trans->paths + path, n1); + ret = bch2_btree_set_root(as, trans, trans->paths + path, n1, false); } + if (ret) + goto err; + if (n3) { bch2_btree_update_get_open_buckets(as, n3); bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0); @@ -1644,27 +1749,6 @@ err: goto out; } -static void -bch2_btree_insert_keys_interior(struct btree_update *as, - struct btree_trans *trans, - struct btree_path *path, - struct btree *b, - struct keylist *keys) -{ - struct btree_path *linked; - unsigned i; - - __bch2_btree_insert_keys_interior(as, trans, path, b, - path->l[b->c.level].iter, keys); - - btree_update_updated_node(as, b); - - trans_for_each_path_with_node(trans, b, linked, i) - bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); - - bch2_trans_verify_paths(trans); -} - /** * bch2_btree_insert_node - insert bkeys into a given btree node * @@ -1673,7 +1757,6 @@ bch2_btree_insert_keys_interior(struct btree_update *as, * @path_idx: path that points to current node * @b: node to insert keys into * @keys: list of keys to insert - * @flags: transaction commit flags * * Returns: 0 on success, typically transaction restart error on failure * @@ -1683,10 +1766,11 @@ bch2_btree_insert_keys_interior(struct btree_update *as, */ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans, btree_path_idx_t path_idx, struct btree *b, - struct keylist *keys, unsigned flags) + struct keylist *keys) { struct bch_fs *c = as->c; - struct btree_path *path = trans->paths + path_idx; + struct btree_path *path = trans->paths + path_idx, *linked; + unsigned i; int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; @@ -1709,9 +1793,19 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t goto split; } - btree_node_interior_verify(c, b); + ret = bch2_btree_node_check_topology(trans, b); + if (ret) { + bch2_btree_node_unlock_write(trans, path, b); + return ret; + } + + bch2_btree_insert_keys_interior(as, trans, path, b, + path->l[b->c.level].iter, keys); + + trans_for_each_path_with_node(trans, b, linked, i) + bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); - bch2_btree_insert_keys_interior(as, trans, path, b, keys); + bch2_trans_verify_paths(trans); live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; @@ -1725,21 +1819,22 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t bch2_maybe_compact_whiteouts(c, b)) bch2_trans_node_reinit_iter(trans, b); + btree_update_updated_node(as, b); bch2_btree_node_unlock_write(trans, path, b); - btree_node_interior_verify(c, b); + BUG_ON(bch2_btree_node_check_topology(trans, b)); return 0; split: /* * We could attempt to avoid the transaction restart, by calling * bch2_btree_path_upgrade() and allocating more nodes: */ - if (b->c.level >= as->update_level) { + if (b->c.level >= as->update_level_end) { trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b); return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); } - return btree_split(as, trans, path_idx, b, keys, flags); + return btree_split(as, trans, path_idx, b, keys); } int bch2_btree_split_leaf(struct btree_trans *trans, @@ -1747,7 +1842,6 @@ int bch2_btree_split_leaf(struct btree_trans *trans, unsigned flags) { /* btree_split & merge may both cause paths array to be reallocated */ - struct btree *b = path_l(trans->paths + path)->b; struct btree_update *as; unsigned l; @@ -1759,7 +1853,7 @@ int bch2_btree_split_leaf(struct btree_trans *trans, if (IS_ERR(as)) return PTR_ERR(as); - ret = btree_split(as, trans, path, b, NULL, flags); + ret = btree_split(as, trans, path, b, NULL); if (ret) { bch2_btree_update_free(as, trans); return ret; @@ -1775,6 +1869,65 @@ int bch2_btree_split_leaf(struct btree_trans *trans, return ret; } +static void __btree_increase_depth(struct btree_update *as, struct btree_trans *trans, + btree_path_idx_t path_idx) +{ + struct bch_fs *c = as->c; + struct btree_path *path = trans->paths + path_idx; + struct btree *n, *b = bch2_btree_id_root(c, path->btree_id)->b; + + BUG_ON(!btree_node_locked(path, b->c.level)); + + n = __btree_root_alloc(as, trans, b->c.level + 1); + + bch2_btree_update_add_new_node(as, n); + six_unlock_write(&n->c.lock); + + path->locks_want++; + BUG_ON(btree_node_locked(path, n->c.level)); + six_lock_increment(&n->c.lock, SIX_LOCK_intent); + mark_btree_node_locked(trans, path, n->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, path, n); + + n->sib_u64s[0] = U16_MAX; + n->sib_u64s[1] = U16_MAX; + + bch2_keylist_add(&as->parent_keys, &b->key); + btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys); + + int ret = bch2_btree_set_root(as, trans, path, n, true); + BUG_ON(ret); + + bch2_btree_update_get_open_buckets(as, n); + bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); + bch2_trans_node_add(trans, path, n); + six_unlock_intent(&n->c.lock); + + mutex_lock(&c->btree_cache.lock); + list_add_tail(&b->list, &c->btree_cache.live); + mutex_unlock(&c->btree_cache.lock); + + bch2_trans_verify_locks(trans); +} + +int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b; + + if (btree_node_fake(b)) + return bch2_btree_split_leaf(trans, path, flags); + + struct btree_update *as = + bch2_btree_update_start(trans, trans->paths + path, b->c.level, true, flags); + if (IS_ERR(as)) + return PTR_ERR(as); + + __btree_increase_depth(as, trans, path); + bch2_btree_update_done(as, trans); + return 0; +} + int __bch2_foreground_maybe_merge(struct btree_trans *trans, btree_path_idx_t path, unsigned level, @@ -1797,6 +1950,22 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, BUG_ON(!trans->paths[path].should_be_locked); BUG_ON(!btree_node_locked(&trans->paths[path], level)); + /* + * Work around a deadlock caused by the btree write buffer not doing + * merges and leaving tons of merges for us to do - we really don't need + * to be doing merges at all from the interior update path, and if the + * interior update path is generating too many new interior updates we + * deadlock: + */ + if ((flags & BCH_WATERMARK_MASK) == BCH_WATERMARK_interior_updates) + return 0; + + if ((flags & BCH_WATERMARK_MASK) <= BCH_WATERMARK_reclaim) { + flags &= ~BCH_WATERMARK_MASK; + flags |= BCH_WATERMARK_btree; + flags |= BCH_TRANS_COMMIT_journal_reclaim; + } + b = trans->paths[path].l[level].b; if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) || @@ -1845,8 +2014,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, __func__, buf1.buf, buf2.buf); printbuf_exit(&buf1); printbuf_exit(&buf2); - bch2_topology_error(c); - ret = -EIO; + ret = bch2_topology_error(c); goto err; } @@ -1916,7 +2084,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_trans_verify_paths(trans); - ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); + ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys); if (ret) goto err_free_update; @@ -1943,6 +2111,10 @@ err: bch2_path_put(trans, new_path, true); bch2_path_put(trans, sib_path, true); bch2_trans_verify_locks(trans); + if (ret == -BCH_ERR_journal_reclaim_would_deadlock) + ret = 0; + if (!ret) + ret = bch2_trans_relock(trans); return ret; err_free_update: bch2_btree_node_free_never_used(as, trans, n); @@ -1987,14 +2159,14 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, if (parent) { bch2_keylist_add(&as->parent_keys, &n->key); - ret = bch2_btree_insert_node(as, trans, iter->path, - parent, &as->parent_keys, flags); - if (ret) - goto err; + ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys); } else { - bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n); + ret = bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n, false); } + if (ret) + goto err; + bch2_btree_update_get_open_buckets(as, n); bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); @@ -2069,7 +2241,7 @@ static void async_btree_node_rewrite_work(struct work_struct *work) ret = bch2_trans_do(c, NULL, NULL, 0, async_btree_node_rewrite_trans(trans, a)); - bch_err_fn(c, ret); + bch_err_fn_ratelimited(c, ret); bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite); kfree(a); } @@ -2116,7 +2288,7 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite); } - queue_work(c->btree_interior_update_worker, &a->work); + queue_work(c->btree_node_rewrite_worker, &a->work); } void bch2_do_pending_node_rewrites(struct bch_fs *c) @@ -2128,7 +2300,7 @@ void bch2_do_pending_node_rewrites(struct bch_fs *c) list_del(&a->list); bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite); - queue_work(c->btree_interior_update_worker, &a->work); + queue_work(c->btree_node_rewrite_worker, &a->work); } mutex_unlock(&c->pending_node_rewrites_lock); } @@ -2339,7 +2511,7 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b) bch2_btree_set_root_inmem(c, b); } -static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) +static int __bch2_btree_root_alloc_fake(struct btree_trans *trans, enum btree_id id, unsigned level) { struct bch_fs *c = trans->c; struct closure cl; @@ -2358,7 +2530,7 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) set_btree_node_fake(b); set_btree_node_need_rewrite(b); - b->c.level = 0; + b->c.level = level; b->c.btree_id = id; bkey_btree_ptr_init(&b->key); @@ -2385,9 +2557,23 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) return 0; } -void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) +void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level) { - bch2_trans_run(c, __bch2_btree_root_alloc(trans, id)); + bch2_trans_run(c, __bch2_btree_root_alloc_fake(trans, id, level)); +} + +static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as) +{ + prt_printf(out, "%ps: btree=%s l=%u-%u watermark=%s mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", + (void *) as->ip_started, + bch2_btree_id_str(as->btree_id), + as->update_level_start, + as->update_level_end, + bch2_watermarks[as->watermark], + bch2_btree_update_modes[as->mode], + as->nodes_written, + closure_nr_remaining(&as->cl), + as->journal.seq); } void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) @@ -2396,12 +2582,7 @@ void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) mutex_lock(&c->btree_interior_update_lock); list_for_each_entry(as, &c->btree_interior_update_list, list) - prt_printf(out, "%p m %u w %u r %u j %llu\n", - as, - as->mode, - as->nodes_written, - closure_nr_remaining(&as->cl), - as->journal.seq); + bch2_btree_update_to_text(out, as); mutex_unlock(&c->btree_interior_update_lock); } @@ -2465,6 +2646,8 @@ bch2_btree_roots_to_journal_entries(struct bch_fs *c, void bch2_fs_btree_interior_update_exit(struct bch_fs *c) { + if (c->btree_node_rewrite_worker) + destroy_workqueue(c->btree_node_rewrite_worker); if (c->btree_interior_update_worker) destroy_workqueue(c->btree_interior_update_worker); mempool_exit(&c->btree_interior_update_pool); @@ -2485,10 +2668,15 @@ void bch2_fs_btree_interior_update_init_early(struct bch_fs *c) int bch2_fs_btree_interior_update_init(struct bch_fs *c) { c->btree_interior_update_worker = - alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1); + alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8); if (!c->btree_interior_update_worker) return -BCH_ERR_ENOMEM_btree_interior_update_worker_init; + c->btree_node_rewrite_worker = + alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND); + if (!c->btree_node_rewrite_worker) + return -BCH_ERR_ENOMEM_btree_interior_update_worker_init; + if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, sizeof(struct btree_update))) return -BCH_ERR_ENOMEM_btree_interior_update_pool_init; |