diff options
Diffstat (limited to 'fs/xfs/scrub/refcount_repair.c')
-rw-r--r-- | fs/xfs/scrub/refcount_repair.c | 794 |
1 files changed, 794 insertions, 0 deletions
diff --git a/fs/xfs/scrub/refcount_repair.c b/fs/xfs/scrub/refcount_repair.c new file mode 100644 index 0000000000..f38fccc42a --- /dev/null +++ b/fs/xfs/scrub/refcount_repair.c @@ -0,0 +1,794 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_inode.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_refcount.h" +#include "xfs_refcount_btree.h" +#include "xfs_error.h" +#include "xfs_ag.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" + +/* + * Rebuilding the Reference Count Btree + * ==================================== + * + * This algorithm is "borrowed" from xfs_repair. Imagine the rmap + * entries as rectangles representing extents of physical blocks, and + * that the rectangles can be laid down to allow them to overlap each + * other; then we know that we must emit a refcnt btree entry wherever + * the amount of overlap changes, i.e. the emission stimulus is + * level-triggered: + * + * - --- + * -- ----- ---- --- ------ + * -- ---- ----------- ---- --------- + * -------------------------------- ----------- + * ^ ^ ^^ ^^ ^ ^^ ^^^ ^^^^ ^ ^^ ^ ^ ^ + * 2 1 23 21 3 43 234 2123 1 01 2 3 0 + * + * For our purposes, a rmap is a tuple (startblock, len, fileoff, owner). + * + * Note that in the actual refcnt btree we don't store the refcount < 2 + * cases because the bnobt tells us which blocks are free; single-use + * blocks aren't recorded in the bnobt or the refcntbt. If the rmapbt + * supports storing multiple entries covering a given block we could + * theoretically dispense with the refcntbt and simply count rmaps, but + * that's inefficient in the (hot) write path, so we'll take the cost of + * the extra tree to save time. Also there's no guarantee that rmap + * will be enabled. + * + * Given an array of rmaps sorted by physical block number, a starting + * physical block (sp), a bag to hold rmaps that cover sp, and the next + * physical block where the level changes (np), we can reconstruct the + * refcount btree as follows: + * + * While there are still unprocessed rmaps in the array, + * - Set sp to the physical block (pblk) of the next unprocessed rmap. + * - Add to the bag all rmaps in the array where startblock == sp. + * - Set np to the physical block where the bag size will change. This + * is the minimum of (the pblk of the next unprocessed rmap) and + * (startblock + len of each rmap in the bag). + * - Record the bag size as old_bag_size. + * + * - While the bag isn't empty, + * - Remove from the bag all rmaps where startblock + len == np. + * - Add to the bag all rmaps in the array where startblock == np. + * - If the bag size isn't old_bag_size, store the refcount entry + * (sp, np - sp, bag_size) in the refcnt btree. + * - If the bag is empty, break out of the inner loop. + * - Set old_bag_size to the bag size + * - Set sp = np. + * - Set np to the physical block where the bag size will change. + * This is the minimum of (the pblk of the next unprocessed rmap) + * and (startblock + len of each rmap in the bag). + * + * Like all the other repairers, we make a list of all the refcount + * records we need, then reinitialize the refcount btree root and + * insert all the records. + */ + +/* The only parts of the rmap that we care about for computing refcounts. */ +struct xrep_refc_rmap { + xfs_agblock_t startblock; + xfs_extlen_t blockcount; +} __packed; + +struct xrep_refc { + /* refcount extents */ + struct xfarray *refcount_records; + + /* new refcountbt information */ + struct xrep_newbt new_btree; + + /* old refcountbt blocks */ + struct xagb_bitmap old_refcountbt_blocks; + + struct xfs_scrub *sc; + + /* get_records()'s position in the refcount record array. */ + xfarray_idx_t array_cur; + + /* # of refcountbt blocks */ + xfs_extlen_t btblocks; +}; + +/* Check for any obvious conflicts with this shared/CoW staging extent. */ +STATIC int +xrep_refc_check_ext( + struct xfs_scrub *sc, + const struct xfs_refcount_irec *rec) +{ + enum xbtree_recpacking outcome; + int error; + + if (xfs_refcount_check_irec(sc->sa.pag, rec) != NULL) + return -EFSCORRUPTED; + + /* Make sure this isn't free space. */ + error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rc_startblock, + rec->rc_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + /* Must not be an inode chunk. */ + error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur, + rec->rc_startblock, rec->rc_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + return 0; +} + +/* Record a reference count extent. */ +STATIC int +xrep_refc_stash( + struct xrep_refc *rr, + enum xfs_refc_domain domain, + xfs_agblock_t agbno, + xfs_extlen_t len, + uint64_t refcount) +{ + struct xfs_refcount_irec irec = { + .rc_startblock = agbno, + .rc_blockcount = len, + .rc_domain = domain, + }; + struct xfs_scrub *sc = rr->sc; + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + irec.rc_refcount = min_t(uint64_t, MAXREFCOUNT, refcount); + + error = xrep_refc_check_ext(rr->sc, &irec); + if (error) + return error; + + trace_xrep_refc_found(sc->sa.pag, &irec); + + return xfarray_append(rr->refcount_records, &irec); +} + +/* Record a CoW staging extent. */ +STATIC int +xrep_refc_stash_cow( + struct xrep_refc *rr, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + return xrep_refc_stash(rr, XFS_REFC_DOMAIN_COW, agbno, len, 1); +} + +/* Decide if an rmap could describe a shared extent. */ +static inline bool +xrep_refc_rmap_shareable( + struct xfs_mount *mp, + const struct xfs_rmap_irec *rmap) +{ + /* AG metadata are never sharable */ + if (XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner)) + return false; + + /* Metadata in files are never shareable */ + if (xfs_internal_inum(mp, rmap->rm_owner)) + return false; + + /* Metadata and unwritten file blocks are not shareable. */ + if (rmap->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK | + XFS_RMAP_UNWRITTEN)) + return false; + + return true; +} + +/* + * Walk along the reverse mapping records until we find one that could describe + * a shared extent. + */ +STATIC int +xrep_refc_walk_rmaps( + struct xrep_refc *rr, + struct xrep_refc_rmap *rrm, + bool *have_rec) +{ + struct xfs_rmap_irec rmap; + struct xfs_btree_cur *cur = rr->sc->sa.rmap_cur; + struct xfs_mount *mp = cur->bc_mp; + int have_gt; + int error = 0; + + *have_rec = false; + + /* + * Loop through the remaining rmaps. Remember CoW staging + * extents and the refcountbt blocks from the old tree for later + * disposal. We can only share written data fork extents, so + * keep looping until we find an rmap for one. + */ + do { + if (xchk_should_terminate(rr->sc, &error)) + return error; + + error = xfs_btree_increment(cur, 0, &have_gt); + if (error) + return error; + if (!have_gt) + return 0; + + error = xfs_rmap_get_rec(cur, &rmap, &have_gt); + if (error) + return error; + if (XFS_IS_CORRUPT(mp, !have_gt)) + return -EFSCORRUPTED; + + if (rmap.rm_owner == XFS_RMAP_OWN_COW) { + error = xrep_refc_stash_cow(rr, rmap.rm_startblock, + rmap.rm_blockcount); + if (error) + return error; + } else if (rmap.rm_owner == XFS_RMAP_OWN_REFC) { + /* refcountbt block, dump it when we're done. */ + rr->btblocks += rmap.rm_blockcount; + error = xagb_bitmap_set(&rr->old_refcountbt_blocks, + rmap.rm_startblock, rmap.rm_blockcount); + if (error) + return error; + } + } while (!xrep_refc_rmap_shareable(mp, &rmap)); + + rrm->startblock = rmap.rm_startblock; + rrm->blockcount = rmap.rm_blockcount; + *have_rec = true; + return 0; +} + +static inline uint32_t +xrep_refc_encode_startblock( + const struct xfs_refcount_irec *irec) +{ + uint32_t start; + + start = irec->rc_startblock & ~XFS_REFC_COWFLAG; + if (irec->rc_domain == XFS_REFC_DOMAIN_COW) + start |= XFS_REFC_COWFLAG; + + return start; +} + +/* Sort in the same order as the ondisk records. */ +static int +xrep_refc_extent_cmp( + const void *a, + const void *b) +{ + const struct xfs_refcount_irec *ap = a; + const struct xfs_refcount_irec *bp = b; + uint32_t sa, sb; + + sa = xrep_refc_encode_startblock(ap); + sb = xrep_refc_encode_startblock(bp); + + if (sa > sb) + return 1; + if (sa < sb) + return -1; + return 0; +} + +/* + * Sort the refcount extents by startblock or else the btree records will be in + * the wrong order. Make sure the records do not overlap in physical space. + */ +STATIC int +xrep_refc_sort_records( + struct xrep_refc *rr) +{ + struct xfs_refcount_irec irec; + xfarray_idx_t cur; + enum xfs_refc_domain dom = XFS_REFC_DOMAIN_SHARED; + xfs_agblock_t next_agbno = 0; + int error; + + error = xfarray_sort(rr->refcount_records, xrep_refc_extent_cmp, + XFARRAY_SORT_KILLABLE); + if (error) + return error; + + foreach_xfarray_idx(rr->refcount_records, cur) { + if (xchk_should_terminate(rr->sc, &error)) + return error; + + error = xfarray_load(rr->refcount_records, cur, &irec); + if (error) + return error; + + if (dom == XFS_REFC_DOMAIN_SHARED && + irec.rc_domain == XFS_REFC_DOMAIN_COW) { + dom = irec.rc_domain; + next_agbno = 0; + } + + if (dom != irec.rc_domain) + return -EFSCORRUPTED; + if (irec.rc_startblock < next_agbno) + return -EFSCORRUPTED; + + next_agbno = irec.rc_startblock + irec.rc_blockcount; + } + + return error; +} + +#define RRM_NEXT(r) ((r).startblock + (r).blockcount) +/* + * Find the next block where the refcount changes, given the next rmap we + * looked at and the ones we're already tracking. + */ +static inline int +xrep_refc_next_edge( + struct xfarray *rmap_bag, + struct xrep_refc_rmap *next_rrm, + bool next_valid, + xfs_agblock_t *nbnop) +{ + struct xrep_refc_rmap rrm; + xfarray_idx_t array_cur = XFARRAY_CURSOR_INIT; + xfs_agblock_t nbno = NULLAGBLOCK; + int error; + + if (next_valid) + nbno = next_rrm->startblock; + + while ((error = xfarray_iter(rmap_bag, &array_cur, &rrm)) == 1) + nbno = min_t(xfs_agblock_t, nbno, RRM_NEXT(rrm)); + + if (error) + return error; + + /* + * We should have found /something/ because either next_rrm is the next + * interesting rmap to look at after emitting this refcount extent, or + * there are other rmaps in rmap_bag contributing to the current + * sharing count. But if something is seriously wrong, bail out. + */ + if (nbno == NULLAGBLOCK) + return -EFSCORRUPTED; + + *nbnop = nbno; + return 0; +} + +/* + * Walk forward through the rmap btree to collect all rmaps starting at + * @bno in @rmap_bag. These represent the file(s) that share ownership of + * the current block. Upon return, the rmap cursor points to the last record + * satisfying the startblock constraint. + */ +static int +xrep_refc_push_rmaps_at( + struct xrep_refc *rr, + struct xfarray *rmap_bag, + xfs_agblock_t bno, + struct xrep_refc_rmap *rrm, + bool *have, + uint64_t *stack_sz) +{ + struct xfs_scrub *sc = rr->sc; + int have_gt; + int error; + + while (*have && rrm->startblock == bno) { + error = xfarray_store_anywhere(rmap_bag, rrm); + if (error) + return error; + (*stack_sz)++; + error = xrep_refc_walk_rmaps(rr, rrm, have); + if (error) + return error; + } + + error = xfs_btree_decrement(sc->sa.rmap_cur, 0, &have_gt); + if (error) + return error; + if (XFS_IS_CORRUPT(sc->mp, !have_gt)) + return -EFSCORRUPTED; + + return 0; +} + +/* Iterate all the rmap records to generate reference count data. */ +STATIC int +xrep_refc_find_refcounts( + struct xrep_refc *rr) +{ + struct xrep_refc_rmap rrm; + struct xfs_scrub *sc = rr->sc; + struct xfarray *rmap_bag; + char *descr; + uint64_t old_stack_sz; + uint64_t stack_sz = 0; + xfs_agblock_t sbno; + xfs_agblock_t cbno; + xfs_agblock_t nbno; + bool have; + int error; + + xrep_ag_btcur_init(sc, &sc->sa); + + /* + * Set up a sparse array to store all the rmap records that we're + * tracking to generate a reference count record. If this exceeds + * MAXREFCOUNT, we clamp rc_refcount. + */ + descr = xchk_xfile_ag_descr(sc, "rmap record bag"); + error = xfarray_create(descr, 0, sizeof(struct xrep_refc_rmap), + &rmap_bag); + kfree(descr); + if (error) + goto out_cur; + + /* Start the rmapbt cursor to the left of all records. */ + error = xfs_btree_goto_left_edge(sc->sa.rmap_cur); + if (error) + goto out_bag; + + /* Process reverse mappings into refcount data. */ + while (xfs_btree_has_more_records(sc->sa.rmap_cur)) { + /* Push all rmaps with pblk == sbno onto the stack */ + error = xrep_refc_walk_rmaps(rr, &rrm, &have); + if (error) + goto out_bag; + if (!have) + break; + sbno = cbno = rrm.startblock; + error = xrep_refc_push_rmaps_at(rr, rmap_bag, sbno, + &rrm, &have, &stack_sz); + if (error) + goto out_bag; + + /* Set nbno to the bno of the next refcount change */ + error = xrep_refc_next_edge(rmap_bag, &rrm, have, &nbno); + if (error) + goto out_bag; + + ASSERT(nbno > sbno); + old_stack_sz = stack_sz; + + /* While stack isn't empty... */ + while (stack_sz) { + xfarray_idx_t array_cur = XFARRAY_CURSOR_INIT; + + /* Pop all rmaps that end at nbno */ + while ((error = xfarray_iter(rmap_bag, &array_cur, + &rrm)) == 1) { + if (RRM_NEXT(rrm) != nbno) + continue; + error = xfarray_unset(rmap_bag, array_cur - 1); + if (error) + goto out_bag; + stack_sz--; + } + if (error) + goto out_bag; + + /* Push array items that start at nbno */ + error = xrep_refc_walk_rmaps(rr, &rrm, &have); + if (error) + goto out_bag; + if (have) { + error = xrep_refc_push_rmaps_at(rr, rmap_bag, + nbno, &rrm, &have, &stack_sz); + if (error) + goto out_bag; + } + + /* Emit refcount if necessary */ + ASSERT(nbno > cbno); + if (stack_sz != old_stack_sz) { + if (old_stack_sz > 1) { + error = xrep_refc_stash(rr, + XFS_REFC_DOMAIN_SHARED, + cbno, nbno - cbno, + old_stack_sz); + if (error) + goto out_bag; + } + cbno = nbno; + } + + /* Stack empty, go find the next rmap */ + if (stack_sz == 0) + break; + old_stack_sz = stack_sz; + sbno = nbno; + + /* Set nbno to the bno of the next refcount change */ + error = xrep_refc_next_edge(rmap_bag, &rrm, have, + &nbno); + if (error) + goto out_bag; + + ASSERT(nbno > sbno); + } + } + + ASSERT(stack_sz == 0); +out_bag: + xfarray_destroy(rmap_bag); +out_cur: + xchk_ag_btcur_free(&sc->sa); + return error; +} +#undef RRM_NEXT + +/* Retrieve refcountbt data for bulk load. */ +STATIC int +xrep_refc_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xfs_refcount_irec *irec = &cur->bc_rec.rc; + struct xrep_refc *rr = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + error = xfarray_load(rr->refcount_records, rr->array_cur++, + irec); + if (error) + return error; + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new btree blocks to the bulk loader. */ +STATIC int +xrep_refc_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_refc *rr = priv; + + return xrep_newbt_claim_block(cur, &rr->new_btree, ptr); +} + +/* Update the AGF counters. */ +STATIC int +xrep_refc_reset_counters( + struct xrep_refc *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_perag *pag = sc->sa.pag; + + /* + * After we commit the new btree to disk, it is possible that the + * process to reap the old btree blocks will race with the AIL trying + * to checkpoint the old btree blocks into the filesystem. If the new + * tree is shorter than the old one, the refcountbt write verifier will + * fail and the AIL will shut down the filesystem. + * + * To avoid this, save the old incore btree height values as the alt + * height values before re-initializing the perag info from the updated + * AGF to capture all the new values. + */ + pag->pagf_repair_refcount_level = pag->pagf_refcount_level; + + /* Reinitialize with the values we just logged. */ + return xrep_reinit_pagf(sc); +} + +/* + * Use the collected refcount information to stage a new refcount btree. If + * this is successful we'll return with the new btree root information logged + * to the repair transaction but not yet committed. + */ +STATIC int +xrep_refc_build_new_tree( + struct xrep_refc *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_btree_cur *refc_cur; + struct xfs_perag *pag = sc->sa.pag; + xfs_fsblock_t fsbno; + int error; + + error = xrep_refc_sort_records(rr); + if (error) + return error; + + /* + * Prepare to construct the new btree by reserving disk space for the + * new btree and setting up all the accounting information we'll need + * to root the new btree while it's under construction and before we + * attach it to the AG header. + */ + fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, xfs_refc_block(sc->mp)); + xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_REFC, fsbno, + XFS_AG_RESV_METADATA); + rr->new_btree.bload.get_records = xrep_refc_get_records; + rr->new_btree.bload.claim_block = xrep_refc_claim_block; + + /* Compute how many blocks we'll need. */ + refc_cur = xfs_refcountbt_stage_cursor(sc->mp, &rr->new_btree.afake, + pag); + error = xfs_btree_bload_compute_geometry(refc_cur, + &rr->new_btree.bload, + xfarray_length(rr->refcount_records)); + if (error) + goto err_cur; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + goto err_cur; + + /* Reserve the space we'll need for the new btree. */ + error = xrep_newbt_alloc_blocks(&rr->new_btree, + rr->new_btree.bload.nr_blocks); + if (error) + goto err_cur; + + /* + * Due to btree slack factors, it's possible for a new btree to be one + * level taller than the old btree. Update the incore btree height so + * that we don't trip the verifiers when writing the new btree blocks + * to disk. + */ + pag->pagf_repair_refcount_level = rr->new_btree.bload.btree_height; + + /* Add all observed refcount records. */ + rr->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(refc_cur, &rr->new_btree.bload, rr); + if (error) + goto err_level; + + /* + * Install the new btree in the AG header. After this point the old + * btree is no longer accessible and the new tree is live. + */ + xfs_refcountbt_commit_staged_btree(refc_cur, sc->tp, sc->sa.agf_bp); + xfs_btree_del_cursor(refc_cur, 0); + + /* Reset the AGF counters now that we've changed the btree shape. */ + error = xrep_refc_reset_counters(rr); + if (error) + goto err_newbt; + + /* Dispose of any unused blocks and the accounting information. */ + error = xrep_newbt_commit(&rr->new_btree); + if (error) + return error; + + return xrep_roll_ag_trans(sc); + +err_level: + pag->pagf_repair_refcount_level = 0; +err_cur: + xfs_btree_del_cursor(refc_cur, error); +err_newbt: + xrep_newbt_cancel(&rr->new_btree); + return error; +} + +/* + * Now that we've logged the roots of the new btrees, invalidate all of the + * old blocks and free them. + */ +STATIC int +xrep_refc_remove_old_tree( + struct xrep_refc *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_perag *pag = sc->sa.pag; + int error; + + /* Free the old refcountbt blocks if they're not in use. */ + error = xrep_reap_agblocks(sc, &rr->old_refcountbt_blocks, + &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA); + if (error) + return error; + + /* + * Now that we've zapped all the old refcountbt blocks we can turn off + * the alternate height mechanism and reset the per-AG space + * reservations. + */ + pag->pagf_repair_refcount_level = 0; + sc->flags |= XREP_RESET_PERAG_RESV; + return 0; +} + +/* Rebuild the refcount btree. */ +int +xrep_refcountbt( + struct xfs_scrub *sc) +{ + struct xrep_refc *rr; + struct xfs_mount *mp = sc->mp; + char *descr; + int error; + + /* We require the rmapbt to rebuild anything. */ + if (!xfs_has_rmapbt(mp)) + return -EOPNOTSUPP; + + rr = kzalloc(sizeof(struct xrep_refc), XCHK_GFP_FLAGS); + if (!rr) + return -ENOMEM; + rr->sc = sc; + + /* Set up enough storage to handle one refcount record per block. */ + descr = xchk_xfile_ag_descr(sc, "reference count records"); + error = xfarray_create(descr, mp->m_sb.sb_agblocks, + sizeof(struct xfs_refcount_irec), + &rr->refcount_records); + kfree(descr); + if (error) + goto out_rr; + + /* Collect all reference counts. */ + xagb_bitmap_init(&rr->old_refcountbt_blocks); + error = xrep_refc_find_refcounts(rr); + if (error) + goto out_bitmap; + + /* Rebuild the refcount information. */ + error = xrep_refc_build_new_tree(rr); + if (error) + goto out_bitmap; + + /* Kill the old tree. */ + error = xrep_refc_remove_old_tree(rr); + if (error) + goto out_bitmap; + +out_bitmap: + xagb_bitmap_destroy(&rr->old_refcountbt_blocks); + xfarray_destroy(rr->refcount_records); +out_rr: + kfree(rr); + return error; +} |