Diffstat
-rw-r--r-- | fs/bcachefs/rebalance.c | 473
1 file changed, 473 insertions, 0 deletions
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
new file mode 100644
index 0000000000..dd6fed2581
--- /dev/null
+++ b/fs/bcachefs/rebalance.c
@@ -0,0 +1,473 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "btree_write_buffer.h"
+#include "buckets.h"
+#include "clock.h"
+#include "compress.h"
+#include "disk_groups.h"
+#include "errcode.h"
+#include "error.h"
+#include "inode.h"
+#include "move.h"
+#include "rebalance.h"
+#include "subvolume.h"
+#include "super-io.h"
+#include "trace.h"
+
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/sched/cputime.h>
+
+#define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1)
+
+static const char * const bch2_rebalance_state_strs[] = {
+#define x(t) #t,
+        BCH_REBALANCE_STATES()
+        NULL
+#undef x
+};
+
+static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum)
+{
+        struct btree_iter iter;
+        struct bkey_s_c k;
+        struct bkey_i_cookie *cookie;
+        u64 v;
+        int ret;
+
+        bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
+                             SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
+                             BTREE_ITER_INTENT);
+        k = bch2_btree_iter_peek_slot(&iter);
+        ret = bkey_err(k);
+        if (ret)
+                goto err;
+
+        v = k.k->type == KEY_TYPE_cookie
+                ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
+                : 0;
+
+        cookie = bch2_trans_kmalloc(trans, sizeof(*cookie));
+        ret = PTR_ERR_OR_ZERO(cookie);
+        if (ret)
+                goto err;
+
+        bkey_cookie_init(&cookie->k_i);
+        cookie->k.p = iter.pos;
+        cookie->v.cookie = cpu_to_le64(v + 1);
+
+        ret = bch2_trans_update(trans, &iter, &cookie->k_i, 0);
+err:
+        bch2_trans_iter_exit(trans, &iter);
+        return ret;
+}
+
+int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
+{
+        int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+                                __bch2_set_rebalance_needs_scan(trans, inum));
+        rebalance_wakeup(c);
+        return ret;
+}
+
+int bch2_set_fs_needs_rebalance(struct bch_fs *c)
+{
+        return bch2_set_rebalance_needs_scan(c, 0);
+}
+
+static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie)
+{
+        struct btree_iter iter;
+        struct bkey_s_c k;
+        u64 v;
+        int ret;
+
+        bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
+                             SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
+                             BTREE_ITER_INTENT);
+        k = bch2_btree_iter_peek_slot(&iter);
+        ret = bkey_err(k);
+        if (ret)
+                goto err;
+
+        v = k.k->type == KEY_TYPE_cookie
+                ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
+                : 0;
+
+        if (v == cookie)
+                ret = bch2_btree_delete_at(trans, &iter, 0);
+err:
+        bch2_trans_iter_exit(trans, &iter);
+        return ret;
+}
+
+static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans,
+                                            struct btree_iter *work_iter)
+{
+        return !kthread_should_stop()
+                ? bch2_btree_iter_peek(work_iter)
+                : bkey_s_c_null;
+}
+
+static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
+                                           struct btree_iter *iter,
+                                           struct bkey_s_c k)
+{
+        struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0);
+        int ret = PTR_ERR_OR_ZERO(n);
+        if (ret)
+                return ret;
+
+        extent_entry_drop(bkey_i_to_s(n),
+                          (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n)));
+        return bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+}
+
+static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
+                                             struct bpos work_pos,
+                                             struct btree_iter *extent_iter,
+                                             struct data_update_opts *data_opts)
+{
+        struct bch_fs *c = trans->c;
+        struct bkey_s_c k;
+
+        bch2_trans_iter_exit(trans, extent_iter);
+        bch2_trans_iter_init(trans, extent_iter,
+                             work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink,
+                             work_pos,
+                             BTREE_ITER_ALL_SNAPSHOTS);
+        k = bch2_btree_iter_peek_slot(extent_iter);
+        if (bkey_err(k))
+                return k;
+
+        const struct bch_extent_rebalance *r = k.k ? bch2_bkey_rebalance_opts(k) : NULL;
+        if (!r) {
+                /* raced due to btree write buffer, nothing to do */
+                return bkey_s_c_null;
+        }
+
+        memset(data_opts, 0, sizeof(*data_opts));
+
+        data_opts->rewrite_ptrs =
+                bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression);
+        data_opts->target = r->target;
+
+        if (!data_opts->rewrite_ptrs) {
+                /*
+                 * device we would want to write to offline? devices in target
+                 * changed?
+                 *
+                 * We'll now need a full scan before this extent is picked up
+                 * again:
+                 */
+                int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k);
+                if (ret)
+                        return bkey_s_c_err(ret);
+                return bkey_s_c_null;
+        }
+
+        return k;
+}
+
+noinline_for_stack
+static int do_rebalance_extent(struct moving_context *ctxt,
+                               struct bpos work_pos,
+                               struct btree_iter *extent_iter)
+{
+        struct btree_trans *trans = ctxt->trans;
+        struct bch_fs *c = trans->c;
+        struct bch_fs_rebalance *r = &trans->c->rebalance;
+        struct data_update_opts data_opts;
+        struct bch_io_opts io_opts;
+        struct bkey_s_c k;
+        struct bkey_buf sk;
+        int ret;
+
+        ctxt->stats = &r->work_stats;
+        r->state = BCH_REBALANCE_working;
+
+        bch2_bkey_buf_init(&sk);
+
+        ret = bkey_err(k = next_rebalance_extent(trans, work_pos,
+                                                 extent_iter, &data_opts));
+        if (ret || !k.k)
+                goto out;
+
+        ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
+        if (ret)
+                goto out;
+
+        atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
+
+        /*
+         * The iterator gets unlocked by __bch2_read_extent - need to
+         * save a copy of @k elsewhere:
+         */
+        bch2_bkey_buf_reassemble(&sk, c, k);
+        k = bkey_i_to_s_c(sk.k);
+
+        ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts);
+        if (ret) {
+                if (bch2_err_matches(ret, ENOMEM)) {
+                        /* memory allocation failure, wait for some IO to finish */
+                        bch2_move_ctxt_wait_for_io(ctxt);
+                        ret = -BCH_ERR_transaction_restart_nested;
+                }
+
+                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+                        goto out;
+
+                /* skip it and continue, XXX signal failure */
+                ret = 0;
+        }
+out:
+        bch2_bkey_buf_exit(&sk, c);
+        return ret;
+}
+
+static bool rebalance_pred(struct bch_fs *c, void *arg,
+                           struct bkey_s_c k,
+                           struct bch_io_opts *io_opts,
+                           struct data_update_opts *data_opts)
+{
+        unsigned target, compression;
+
+        if (k.k->p.inode) {
+                target = io_opts->background_target;
+                compression = io_opts->background_compression ?: io_opts->compression;
+        } else {
+                const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
+
+                target = r ? r->target : io_opts->background_target;
+                compression = r ? r->compression :
+                        (io_opts->background_compression ?: io_opts->compression);
+        }
+
+        data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression);
+        data_opts->target = target;
+        return data_opts->rewrite_ptrs != 0;
+}
+
+static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
+{
+        struct btree_trans *trans = ctxt->trans;
+        struct bch_fs_rebalance *r = &trans->c->rebalance;
+        int ret;
+
+        bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
+        ctxt->stats = &r->scan_stats;
+
+        if (!inum) {
+                r->scan_start = BBPOS_MIN;
+                r->scan_end = BBPOS_MAX;
+        } else {
+                r->scan_start = BBPOS(BTREE_ID_extents, POS(inum, 0));
+                r->scan_end = BBPOS(BTREE_ID_extents, POS(inum, U64_MAX));
+        }
+
+        r->state = BCH_REBALANCE_scanning;
+
+        ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?:
+                commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+                          bch2_clear_rebalance_needs_scan(trans, inum, cookie));
+
+        bch2_move_stats_exit(&r->scan_stats, trans->c);
+        return ret;
+}
+
+static void rebalance_wait(struct bch_fs *c)
+{
+        struct bch_fs_rebalance *r = &c->rebalance;
+        struct io_clock *clock = &c->io_clock[WRITE];
+        u64 now = atomic64_read(&clock->now);
+        u64 min_member_capacity = bch2_min_rw_member_capacity(c);
+
+        if (min_member_capacity == U64_MAX)
+                min_member_capacity = 128 * 2048;
+
+        r->wait_iotime_end = now + (min_member_capacity >> 6);
+
+        if (r->state != BCH_REBALANCE_waiting) {
+                r->wait_iotime_start = now;
+                r->wait_wallclock_start = ktime_get_real_ns();
+                r->state = BCH_REBALANCE_waiting;
+        }
+
+        bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT);
+}
+
+static int do_rebalance(struct moving_context *ctxt)
+{
+        struct btree_trans *trans = ctxt->trans;
+        struct bch_fs *c = trans->c;
+        struct bch_fs_rebalance *r = &c->rebalance;
+        struct btree_iter rebalance_work_iter, extent_iter = { NULL };
+        struct bkey_s_c k;
+        int ret = 0;
+
+        bch2_move_stats_init(&r->work_stats, "rebalance_work");
+        bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
+
+        bch2_trans_iter_init(trans, &rebalance_work_iter,
+                             BTREE_ID_rebalance_work, POS_MIN,
+                             BTREE_ITER_ALL_SNAPSHOTS);
+
+        while (!bch2_move_ratelimit(ctxt)) {
+                if (!r->enabled) {
+                        bch2_moving_ctxt_flush_all(ctxt);
+                        kthread_wait_freezable(r->enabled ||
+                                               kthread_should_stop());
+                }
+
+                if (kthread_should_stop())
+                        break;
+
+                bch2_trans_begin(trans);
+
+                ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter));
+                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+                        continue;
+                if (ret || !k.k)
+                        break;
+
+                ret = k.k->type == KEY_TYPE_cookie
+                        ? do_rebalance_scan(ctxt, k.k->p.inode,
+                                            le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie))
+                        : do_rebalance_extent(ctxt, k.k->p, &extent_iter);
+
+                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+                        continue;
+                if (ret)
+                        break;
+
+                bch2_btree_iter_advance(&rebalance_work_iter);
+        }
+
+        bch2_trans_iter_exit(trans, &extent_iter);
+        bch2_trans_iter_exit(trans, &rebalance_work_iter);
+        bch2_move_stats_exit(&r->scan_stats, c);
+
+        if (!ret &&
+            !kthread_should_stop() &&
+            !atomic64_read(&r->work_stats.sectors_seen) &&
+            !atomic64_read(&r->scan_stats.sectors_seen)) {
+                bch2_moving_ctxt_flush_all(ctxt);
+                bch2_trans_unlock_long(trans);
+                rebalance_wait(c);
+        }
+
+        if (!bch2_err_matches(ret, EROFS))
+                bch_err_fn(c, ret);
+        return ret;
+}
+
+static int bch2_rebalance_thread(void *arg)
+{
+        struct bch_fs *c = arg;
+        struct bch_fs_rebalance *r = &c->rebalance;
+        struct moving_context ctxt;
+        int ret;
+
+        set_freezable();
+
+        bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats,
+                              writepoint_ptr(&c->rebalance_write_point),
+                              true);
+
+        while (!kthread_should_stop() &&
+               !(ret = do_rebalance(&ctxt)))
+                ;
+
+        bch2_moving_ctxt_exit(&ctxt);
+
+        return 0;
+}
+
+void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
+{
+        struct bch_fs_rebalance *r = &c->rebalance;
+
+        prt_str(out, bch2_rebalance_state_strs[r->state]);
+        prt_newline(out);
+        printbuf_indent_add(out, 2);
+
+        switch (r->state) {
+        case BCH_REBALANCE_waiting: {
+                u64 now = atomic64_read(&c->io_clock[WRITE].now);
+
+                prt_str(out, "io wait duration: ");
+                bch2_prt_human_readable_s64(out, r->wait_iotime_end - r->wait_iotime_start);
+                prt_newline(out);
+
+                prt_str(out, "io wait remaining: ");
+                bch2_prt_human_readable_s64(out, r->wait_iotime_end - now);
+                prt_newline(out);
+
+                prt_str(out, "duration waited: ");
+                bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start);
+                prt_newline(out);
+                break;
+        }
+        case BCH_REBALANCE_working:
+                bch2_move_stats_to_text(out, &r->work_stats);
+                break;
+        case BCH_REBALANCE_scanning:
+                bch2_move_stats_to_text(out, &r->scan_stats);
+                break;
+        }
+        prt_newline(out);
+        printbuf_indent_sub(out, 2);
+}
+
+void bch2_rebalance_stop(struct bch_fs *c)
+{
+        struct task_struct *p;
+
+        c->rebalance.pd.rate.rate = UINT_MAX;
+        bch2_ratelimit_reset(&c->rebalance.pd.rate);
+
+        p = rcu_dereference_protected(c->rebalance.thread, 1);
+        c->rebalance.thread = NULL;
+
+        if (p) {
+                /* for sychronizing with rebalance_wakeup() */
+                synchronize_rcu();
+
+                kthread_stop(p);
+                put_task_struct(p);
+        }
+}
+
+int bch2_rebalance_start(struct bch_fs *c)
+{
+        struct task_struct *p;
+        int ret;
+
+        if (c->rebalance.thread)
+                return 0;
+
+        if (c->opts.nochanges)
+                return 0;
+
+        p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name);
+        ret = PTR_ERR_OR_ZERO(p);
+        if (ret) {
+                bch_err_msg(c, ret, "creating rebalance thread");
+                return ret;
+        }
+
+        get_task_struct(p);
+        rcu_assign_pointer(c->rebalance.thread, p);
+        wake_up_process(p);
+        return 0;
+}
+
+void bch2_fs_rebalance_init(struct bch_fs *c)
+{
+        bch2_pd_controller_init(&c->rebalance.pd);
+}