diff options
Diffstat (limited to 'fs/xfs/scrub')
54 files changed, 7575 insertions, 849 deletions
diff --git a/fs/xfs/scrub/agb_bitmap.h b/fs/xfs/scrub/agb_bitmap.h index ed08f76ff4..e488e1f4f6 100644 --- a/fs/xfs/scrub/agb_bitmap.h +++ b/fs/xfs/scrub/agb_bitmap.h @@ -65,4 +65,9 @@ int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap, int xagb_bitmap_set_btcur_path(struct xagb_bitmap *bitmap, struct xfs_btree_cur *cur); +static inline uint32_t xagb_bitmap_count_set_regions(struct xagb_bitmap *b) +{ + return xbitmap32_count_set_regions(&b->agbitmap); +} + #endif /* __XFS_SCRUB_AGB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 6c6e5eba42..e954f07679 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -556,28 +556,28 @@ xchk_agf( xchk_block_set_corrupt(sc, sc->sa.agf_bp); /* Check the AGF btree roots and levels */ - agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]); + agbno = be32_to_cpu(agf->agf_bno_root); if (!xfs_verify_agbno(pag, agbno)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); - agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]); + agbno = be32_to_cpu(agf->agf_cnt_root); if (!xfs_verify_agbno(pag, agbno)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); - level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]); + level = be32_to_cpu(agf->agf_bno_level); if (level <= 0 || level > mp->m_alloc_maxlevels) xchk_block_set_corrupt(sc, sc->sa.agf_bp); - level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); + level = be32_to_cpu(agf->agf_cnt_level); if (level <= 0 || level > mp->m_alloc_maxlevels) xchk_block_set_corrupt(sc, sc->sa.agf_bp); if (xfs_has_rmapbt(mp)) { - agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_RMAP]); + agbno = be32_to_cpu(agf->agf_rmap_root); if (!xfs_verify_agbno(pag, agbno)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); - level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); + level = be32_to_cpu(agf->agf_rmap_level); if (level <= 0 || level > mp->m_rmap_maxlevels) xchk_block_set_corrupt(sc, sc->sa.agf_bp); } diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 26bd1ff68f..427054b65b 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -174,8 +174,7 @@ xrep_agf_find_btrees( * We relied on the rmapbt to reconstruct the AGF. If we get a * different root then something's seriously wrong. */ - if (fab[XREP_AGF_RMAPBT].root != - be32_to_cpu(old_agf->agf_roots[XFS_BTNUM_RMAPi])) + if (fab[XREP_AGF_RMAPBT].root != be32_to_cpu(old_agf->agf_rmap_root)) return -EFSCORRUPTED; /* We must find the refcountbt root if that feature is enabled. */ @@ -224,20 +223,14 @@ xrep_agf_set_roots( struct xfs_agf *agf, struct xrep_find_ag_btree *fab) { - agf->agf_roots[XFS_BTNUM_BNOi] = - cpu_to_be32(fab[XREP_AGF_BNOBT].root); - agf->agf_levels[XFS_BTNUM_BNOi] = - cpu_to_be32(fab[XREP_AGF_BNOBT].height); + agf->agf_bno_root = cpu_to_be32(fab[XREP_AGF_BNOBT].root); + agf->agf_bno_level = cpu_to_be32(fab[XREP_AGF_BNOBT].height); - agf->agf_roots[XFS_BTNUM_CNTi] = - cpu_to_be32(fab[XREP_AGF_CNTBT].root); - agf->agf_levels[XFS_BTNUM_CNTi] = - cpu_to_be32(fab[XREP_AGF_CNTBT].height); + agf->agf_cnt_root = cpu_to_be32(fab[XREP_AGF_CNTBT].root); + agf->agf_cnt_level = cpu_to_be32(fab[XREP_AGF_CNTBT].height); - agf->agf_roots[XFS_BTNUM_RMAPi] = - cpu_to_be32(fab[XREP_AGF_RMAPBT].root); - agf->agf_levels[XFS_BTNUM_RMAPi] = - cpu_to_be32(fab[XREP_AGF_RMAPBT].height); + agf->agf_rmap_root = cpu_to_be32(fab[XREP_AGF_RMAPBT].root); + agf->agf_rmap_level = cpu_to_be32(fab[XREP_AGF_RMAPBT].height); if (xfs_has_reflink(sc->mp)) { agf->agf_refcount_root = @@ -262,8 +255,7 @@ xrep_agf_calc_from_btrees( int error; /* Update the AGF counters from the bnobt. */ - cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, - sc->sa.pag, XFS_BTNUM_BNO); + cur = xfs_bnobt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); error = xfs_alloc_query_all(cur, xrep_agf_walk_allocbt, &raa); if (error) goto err; @@ -276,8 +268,7 @@ xrep_agf_calc_from_btrees( agf->agf_longest = cpu_to_be32(raa.longest); /* Update the AGF counters from the cntbt. */ - cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, - sc->sa.pag, XFS_BTNUM_CNT); + cur = xfs_cntbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); error = xfs_btree_count_blocks(cur, &blocks); if (error) goto err; @@ -333,12 +324,9 @@ xrep_agf_commit_new( pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); pag->pagf_longest = be32_to_cpu(agf->agf_longest); - pag->pagf_levels[XFS_BTNUM_BNOi] = - be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]); - pag->pagf_levels[XFS_BTNUM_CNTi] = - be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); - pag->pagf_levels[XFS_BTNUM_RMAPi] = - be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]); + pag->pagf_bno_level = be32_to_cpu(agf->agf_bno_level); + pag->pagf_cnt_level = be32_to_cpu(agf->agf_cnt_level); + pag->pagf_rmap_level = be32_to_cpu(agf->agf_rmap_level); pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); @@ -559,16 +547,14 @@ xrep_agfl_collect_blocks( goto out_bmp; /* Find all blocks currently being used by the bnobt. */ - cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, - sc->sa.pag, XFS_BTNUM_BNO); + cur = xfs_bnobt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur); xfs_btree_del_cursor(cur, error); if (error) goto out_bmp; /* Find all blocks currently being used by the cntbt. */ - cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, - sc->sa.pag, XFS_BTNUM_CNT); + cur = xfs_cntbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur); xfs_btree_del_cursor(cur, error); if (error) @@ -908,7 +894,7 @@ xrep_agi_calc_from_btrees( xfs_agino_t freecount; int error; - cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp); error = xfs_ialloc_count_inodes(cur, &count, &freecount); if (error) goto err; @@ -928,8 +914,7 @@ xrep_agi_calc_from_btrees( if (xfs_has_finobt(mp) && xfs_has_inobtcounts(mp)) { xfs_agblock_t blocks; - cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp, - XFS_BTNUM_FINO); + cur = xfs_finobt_init_cursor(sc->sa.pag, sc->tp, agi_bp); error = xfs_btree_count_blocks(cur, &blocks); if (error) goto err; diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c index 45edda0968..d421b25392 100644 --- a/fs/xfs/scrub/alloc_repair.c +++ b/fs/xfs/scrub/alloc_repair.c @@ -687,8 +687,8 @@ xrep_abt_reset_counters( * height values before re-initializing the perag info from the updated * AGF to capture all the new values. */ - pag->pagf_repair_levels[XFS_BTNUM_BNOi] = pag->pagf_levels[XFS_BTNUM_BNOi]; - pag->pagf_repair_levels[XFS_BTNUM_CNTi] = pag->pagf_levels[XFS_BTNUM_CNTi]; + pag->pagf_repair_bno_level = pag->pagf_bno_level; + pag->pagf_repair_cnt_level = pag->pagf_cnt_level; /* Reinitialize with the values we just logged. */ return xrep_reinit_pagf(sc); @@ -735,10 +735,11 @@ xrep_abt_build_new_trees( ra->new_cntbt.bload.claim_block = xrep_abt_claim_block; /* Allocate cursors for the staged btrees. */ - bno_cur = xfs_allocbt_stage_cursor(sc->mp, &ra->new_bnobt.afake, - pag, XFS_BTNUM_BNO); - cnt_cur = xfs_allocbt_stage_cursor(sc->mp, &ra->new_cntbt.afake, - pag, XFS_BTNUM_CNT); + bno_cur = xfs_bnobt_init_cursor(sc->mp, NULL, NULL, pag); + xfs_btree_stage_afakeroot(bno_cur, &ra->new_bnobt.afake); + + cnt_cur = xfs_cntbt_init_cursor(sc->mp, NULL, NULL, pag); + xfs_btree_stage_afakeroot(cnt_cur, &ra->new_cntbt.afake); /* Last chance to abort before we start committing fixes. */ if (xchk_should_terminate(sc, &error)) @@ -765,10 +766,8 @@ xrep_abt_build_new_trees( * height so that we don't trip the verifiers when writing the new * btree blocks to disk. */ - pag->pagf_repair_levels[XFS_BTNUM_BNOi] = - ra->new_bnobt.bload.btree_height; - pag->pagf_repair_levels[XFS_BTNUM_CNTi] = - ra->new_cntbt.bload.btree_height; + pag->pagf_repair_bno_level = ra->new_bnobt.bload.btree_height; + pag->pagf_repair_cnt_level = ra->new_cntbt.bload.btree_height; /* Load the free space by length tree. */ ra->array_cur = XFARRAY_CURSOR_INIT; @@ -807,8 +806,8 @@ xrep_abt_build_new_trees( return xrep_roll_ag_trans(sc); err_levels: - pag->pagf_repair_levels[XFS_BTNUM_BNOi] = 0; - pag->pagf_repair_levels[XFS_BTNUM_CNTi] = 0; + pag->pagf_repair_bno_level = 0; + pag->pagf_repair_cnt_level = 0; err_cur: xfs_btree_del_cursor(cnt_cur, error); xfs_btree_del_cursor(bno_cur, error); @@ -838,8 +837,8 @@ xrep_abt_remove_old_trees( * Now that we've zapped all the old allocbt blocks we can turn off * the alternate height mechanism. */ - pag->pagf_repair_levels[XFS_BTNUM_BNOi] = 0; - pag->pagf_repair_levels[XFS_BTNUM_CNTi] = 0; + pag->pagf_repair_bno_level = 0; + pag->pagf_repair_cnt_level = 0; return 0; } diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c index 1449bb5262..0cb8d43912 100644 --- a/fs/xfs/scrub/bitmap.c +++ b/fs/xfs/scrub/bitmap.c @@ -566,3 +566,17 @@ xbitmap32_test( *len = bn->bn_start - start; return false; } + +/* Count the number of set regions in this bitmap. */ +uint32_t +xbitmap32_count_set_regions( + struct xbitmap32 *bitmap) +{ + struct xbitmap32_node *bn; + uint32_t nr = 0; + + for_each_xbitmap32_extent(bn, bitmap) + nr++; + + return nr; +} diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h index 2df8911606..710c1ac5e3 100644 --- a/fs/xfs/scrub/bitmap.h +++ b/fs/xfs/scrub/bitmap.h @@ -62,4 +62,6 @@ int xbitmap32_walk(struct xbitmap32 *bitmap, xbitmap32_walk_fn fn, bool xbitmap32_empty(struct xbitmap32 *bitmap); bool xbitmap32_test(struct xbitmap32 *bitmap, uint32_t start, uint32_t *len); +uint32_t xbitmap32_count_set_regions(struct xbitmap32 *bitmap); + #endif /* __XFS_SCRUB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index b169cddde6..24a15bf784 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -924,7 +924,7 @@ xchk_bmap( if (!ifp) return -ENOENT; - info.is_rt = whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip); + info.is_rt = xfs_ifork_is_realtime(ip, whichfork); info.whichfork = whichfork; info.is_shared = whichfork == XFS_DATA_FORK && xfs_is_reflink_inode(ip); info.sc = sc; diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c index a4bb89fdd5..1e656fab5e 100644 --- a/fs/xfs/scrub/bmap_repair.c +++ b/fs/xfs/scrub/bmap_repair.c @@ -639,7 +639,13 @@ xrep_bmap_build_new_fork( rb->new_bmapbt.bload.get_records = xrep_bmap_get_records; rb->new_bmapbt.bload.claim_block = xrep_bmap_claim_block; rb->new_bmapbt.bload.iroot_size = xrep_bmap_iroot_size; - bmap_cur = xfs_bmbt_stage_cursor(sc->mp, sc->ip, ifake); + + /* + * Allocate a new bmap btree cursor for reloading an inode block mapping + * data structure. + */ + bmap_cur = xfs_bmbt_init_cursor(sc->mp, NULL, sc->ip, XFS_STAGING_FORK); + xfs_btree_stage_ifakeroot(bmap_cur, ifake); /* * Figure out the size and format of the new fork, then fill it with diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index 1935b9ce18..fe678a0438 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -47,7 +47,7 @@ __xchk_btree_process_error( *error = 0; fallthrough; default: - if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) trace_xchk_ifork_btree_op_error(sc, cur, level, *error, ret_ip); else @@ -91,7 +91,7 @@ __xchk_btree_set_corrupt( { sc->sm->sm_flags |= errflag; - if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) trace_xchk_ifork_btree_error(sc, cur, level, ret_ip); else @@ -168,7 +168,7 @@ xchk_btree_rec( if (xfs_btree_keycmp_lt(cur, &key, keyp)) xchk_btree_set_corrupt(bs->sc, cur, 1); - if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) return; /* Is high_key(rec) no larger than the parent high key? */ @@ -215,7 +215,7 @@ xchk_btree_key( if (xfs_btree_keycmp_lt(cur, key, keyp)) xchk_btree_set_corrupt(bs->sc, cur, level); - if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) return; /* Is this block's high key no larger than the parent high key? */ @@ -236,22 +236,18 @@ xchk_btree_ptr_ok( int level, union xfs_btree_ptr *ptr) { - bool res; - /* A btree rooted in an inode has no block pointer to the root. */ - if ((bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + if (bs->cur->bc_ops->type == XFS_BTREE_TYPE_INODE && level == bs->cur->bc_nlevels) return true; /* Otherwise, check the pointers. */ - if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS) - res = xfs_btree_check_lptr(bs->cur, be64_to_cpu(ptr->l), level); - else - res = xfs_btree_check_sptr(bs->cur, be32_to_cpu(ptr->s), level); - if (!res) + if (__xfs_btree_check_ptr(bs->cur, ptr, 0, level)) { xchk_btree_set_corrupt(bs->sc, bs->cur, level); + return false; + } - return res; + return true; } /* Check that a btree block's sibling matches what we expect it. */ @@ -374,18 +370,21 @@ xchk_btree_check_block_owner( { xfs_agnumber_t agno; xfs_agblock_t agbno; - xfs_btnum_t btnum; bool init_sa; int error = 0; if (!bs->cur) return 0; - btnum = bs->cur->bc_btnum; agno = xfs_daddr_to_agno(bs->cur->bc_mp, daddr); agbno = xfs_daddr_to_agbno(bs->cur->bc_mp, daddr); - init_sa = bs->cur->bc_flags & XFS_BTREE_LONG_PTRS; + /* + * If the btree being examined is not itself a per-AG btree, initialize + * sc->sa so that we can check for the presence of an ownership record + * in the rmap btree for the AG containing the block. + */ + init_sa = bs->cur->bc_ops->type != XFS_BTREE_TYPE_AG; if (init_sa) { error = xchk_ag_init_existing(bs->sc, agno, &bs->sc->sa); if (!xchk_btree_xref_process_error(bs->sc, bs->cur, @@ -399,11 +398,11 @@ xchk_btree_check_block_owner( * have to nullify it (to shut down further block owner checks) if * self-xref encounters problems. */ - if (!bs->sc->sa.bno_cur && btnum == XFS_BTNUM_BNO) + if (!bs->sc->sa.bno_cur && xfs_btree_is_bno(bs->cur->bc_ops)) bs->cur = NULL; xchk_xref_is_only_owned_by(bs->sc, agbno, 1, bs->oinfo); - if (!bs->sc->sa.rmap_cur && btnum == XFS_BTNUM_RMAP) + if (!bs->sc->sa.rmap_cur && xfs_btree_is_rmap(bs->cur->bc_ops)) bs->cur = NULL; out_free: @@ -429,7 +428,7 @@ xchk_btree_check_owner( * up. */ if (bp == NULL) { - if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)) + if (cur->bc_ops->type != XFS_BTREE_TYPE_INODE) xchk_btree_set_corrupt(bs->sc, bs->cur, level); return 0; } @@ -442,7 +441,7 @@ xchk_btree_check_owner( * duplicate cursors. Therefore, save the buffer daddr for * later scanning. */ - if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) { + if (xfs_btree_is_bno(cur->bc_ops) || xfs_btree_is_rmap(cur->bc_ops)) { struct check_owner *co; co = kmalloc(sizeof(struct check_owner), XCHK_GFP_FLAGS); @@ -475,7 +474,7 @@ xchk_btree_check_iroot_minrecs( * existing filesystems, so instead we disable the check for data fork * bmap btrees when there's an attr fork. */ - if (bs->cur->bc_btnum == XFS_BTNUM_BMAP && + if (xfs_btree_is_bmap(bs->cur->bc_ops) && bs->cur->bc_ino.whichfork == XFS_DATA_FORK && xfs_inode_has_attr_fork(bs->sc->ip)) return false; @@ -508,7 +507,7 @@ xchk_btree_check_minrecs( * child block might be less than the standard minrecs, but that's ok * provided that there's only one direct child of the root. */ - if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE && level == cur->bc_nlevels - 2) { struct xfs_btree_block *root_block; struct xfs_buf *root_bp; @@ -562,7 +561,7 @@ xchk_btree_block_check_keys( return; } - if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) return; /* Make sure the high key of this block matches the parent. */ @@ -585,7 +584,6 @@ xchk_btree_get_block( struct xfs_btree_block **pblock, struct xfs_buf **pbp) { - xfs_failaddr_t failed_at; int error; *pblock = NULL; @@ -597,13 +595,7 @@ xchk_btree_get_block( return error; xfs_btree_get_block(bs->cur, level, pbp); - if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS) - failed_at = __xfs_btree_check_lblock(bs->cur, *pblock, - level, *pbp); - else - failed_at = __xfs_btree_check_sblock(bs->cur, *pblock, - level, *pbp); - if (failed_at) { + if (__xfs_btree_check_block(bs->cur, *pblock, level, *pbp)) { xchk_btree_set_corrupt(bs->sc, bs->cur, level); return 0; } @@ -664,7 +656,7 @@ xchk_btree_block_keys( if (xfs_btree_keycmp_ne(cur, &block_keys, parent_keys)) xchk_btree_set_corrupt(bs->sc, cur, 1); - if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) return; /* Get high keys */ @@ -728,7 +720,7 @@ xchk_btree( * error codes for us. */ level = cur->bc_nlevels - 1; - cur->bc_ops->init_ptr_from_cur(cur, &ptr); + xfs_btree_init_ptr_from_cur(cur, &ptr); if (!xchk_btree_ptr_ok(bs, cur->bc_nlevels, &ptr)) goto out; error = xchk_btree_get_block(bs, level, &ptr, &block, &bp); diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 81f2b96bb5..47a20cf520 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -29,6 +29,8 @@ #include "xfs_attr.h" #include "xfs_reflink.h" #include "xfs_ag.h" +#include "xfs_error.h" +#include "xfs_quota.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -82,6 +84,15 @@ __xchk_process_error( sc->ip ? sc->ip : XFS_I(file_inode(sc->file)), sc->sm, *error); break; + case -ECANCELED: + /* + * ECANCELED here means that the caller set one of the scrub + * outcome flags (corrupt, xfail, xcorrupt) and wants to exit + * quickly. Set error to zero and do not continue. + */ + trace_xchk_op_error(sc, agno, bno, *error, ret_ip); + *error = 0; + break; case -EFSBADCRC: case -EFSCORRUPTED: /* Note the badness but don't abort. */ @@ -89,8 +100,7 @@ __xchk_process_error( *error = 0; fallthrough; default: - trace_xchk_op_error(sc, agno, bno, *error, - ret_ip); + trace_xchk_op_error(sc, agno, bno, *error, ret_ip); break; } return false; @@ -136,6 +146,16 @@ __xchk_fblock_process_error( /* Used to restart an op with deadlock avoidance. */ trace_xchk_deadlock_retry(sc->ip, sc->sm, *error); break; + case -ECANCELED: + /* + * ECANCELED here means that the caller set one of the scrub + * outcome flags (corrupt, xfail, xcorrupt) and wants to exit + * quickly. Set error to zero and do not continue. + */ + trace_xchk_file_op_error(sc, whichfork, offset, *error, + ret_ip); + *error = 0; + break; case -EFSBADCRC: case -EFSCORRUPTED: /* Note the badness but don't abort. */ @@ -227,6 +247,19 @@ xchk_block_set_corrupt( trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address); } +#ifdef CONFIG_XFS_QUOTA +/* Record a corrupt quota counter. */ +void +xchk_qcheck_set_corrupt( + struct xfs_scrub *sc, + unsigned int dqtype, + xfs_dqid_t id) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + trace_xchk_qcheck_error(sc, dqtype, id, __return_address); +} +#endif + /* Record a corruption while cross-referencing. */ void xchk_block_xref_set_corrupt( @@ -427,7 +460,7 @@ xchk_perag_read_headers( * Grab the AG headers for the attached perag structure and wait for pending * intents to drain. */ -static int +int xchk_perag_drain_and_lock( struct xfs_scrub *sc) { @@ -555,46 +588,50 @@ xchk_ag_btcur_init( { struct xfs_mount *mp = sc->mp; - if (sa->agf_bp && - xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_BNO)) { + if (sa->agf_bp) { /* Set up a bnobt cursor for cross-referencing. */ - sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, - sa->pag, XFS_BTNUM_BNO); - } + sa->bno_cur = xfs_bnobt_init_cursor(mp, sc->tp, sa->agf_bp, + sa->pag); + xchk_ag_btree_del_cursor_if_sick(sc, &sa->bno_cur, + XFS_SCRUB_TYPE_BNOBT); - if (sa->agf_bp && - xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_CNT)) { /* Set up a cntbt cursor for cross-referencing. */ - sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, - sa->pag, XFS_BTNUM_CNT); - } - - /* Set up a inobt cursor for cross-referencing. */ - if (sa->agi_bp && - xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_INO)) { - sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, sa->agi_bp, - XFS_BTNUM_INO); - } - - /* Set up a finobt cursor for cross-referencing. */ - if (sa->agi_bp && xfs_has_finobt(mp) && - xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_FINO)) { - sa->fino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, sa->agi_bp, - XFS_BTNUM_FINO); - } - - /* Set up a rmapbt cursor for cross-referencing. */ - if (sa->agf_bp && xfs_has_rmapbt(mp) && - xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_RMAP)) { - sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp, + sa->cnt_cur = xfs_cntbt_init_cursor(mp, sc->tp, sa->agf_bp, sa->pag); + xchk_ag_btree_del_cursor_if_sick(sc, &sa->cnt_cur, + XFS_SCRUB_TYPE_CNTBT); + + /* Set up a rmapbt cursor for cross-referencing. */ + if (xfs_has_rmapbt(mp)) { + sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, + sa->agf_bp, sa->pag); + xchk_ag_btree_del_cursor_if_sick(sc, &sa->rmap_cur, + XFS_SCRUB_TYPE_RMAPBT); + } + + /* Set up a refcountbt cursor for cross-referencing. */ + if (xfs_has_reflink(mp)) { + sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp, + sa->agf_bp, sa->pag); + xchk_ag_btree_del_cursor_if_sick(sc, &sa->refc_cur, + XFS_SCRUB_TYPE_REFCNTBT); + } } - /* Set up a refcountbt cursor for cross-referencing. */ - if (sa->agf_bp && xfs_has_reflink(mp) && - xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_REFC)) { - sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp, - sa->agf_bp, sa->pag); + if (sa->agi_bp) { + /* Set up a inobt cursor for cross-referencing. */ + sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, + sa->agi_bp); + xchk_ag_btree_del_cursor_if_sick(sc, &sa->ino_cur, + XFS_SCRUB_TYPE_INOBT); + + /* Set up a finobt cursor for cross-referencing. */ + if (xfs_has_finobt(mp)) { + sa->fino_cur = xfs_finobt_init_cursor(sa->pag, sc->tp, + sa->agi_bp); + xchk_ag_btree_del_cursor_if_sick(sc, &sa->fino_cur, + XFS_SCRUB_TYPE_FINOBT); + } } } @@ -653,6 +690,13 @@ xchk_trans_cancel( sc->tp = NULL; } +int +xchk_trans_alloc_empty( + struct xfs_scrub *sc) +{ + return xfs_trans_alloc_empty(sc->mp, &sc->tp); +} + /* * Grab an empty transaction so that we can re-grab locked buffers if * one of our btrees turns out to be cyclic. @@ -672,7 +716,7 @@ xchk_trans_alloc( return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate, resblks, 0, 0, &sc->tp); - return xfs_trans_alloc_empty(sc->mp, &sc->tp); + return xchk_trans_alloc_empty(sc); } /* Set us up with a transaction and an empty context. */ @@ -1000,9 +1044,7 @@ xchk_irele( struct xfs_scrub *sc, struct xfs_inode *ip) { - if (current->journal_info != NULL) { - ASSERT(current->journal_info == sc->tp); - + if (sc->tp) { /* * If we are in a transaction, we /cannot/ drop the inode * ourselves, because the VFS will trigger writeback, which @@ -1259,6 +1301,15 @@ xchk_fsgates_enable( if (scrub_fsgates & XCHK_FSGATES_DRAIN) xfs_drain_wait_enable(); + if (scrub_fsgates & XCHK_FSGATES_QUOTA) + xfs_dqtrx_hook_enable(); + + if (scrub_fsgates & XCHK_FSGATES_DIRENTS) + xfs_dir_hook_enable(); + + if (scrub_fsgates & XCHK_FSGATES_RMAP) + xfs_rmap_hook_enable(); + sc->flags |= scrub_fsgates; } diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index da09580b45..89f7bbec88 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -32,6 +32,7 @@ xchk_should_terminate( } int xchk_trans_alloc(struct xfs_scrub *sc, uint resblks); +int xchk_trans_alloc_empty(struct xfs_scrub *sc); void xchk_trans_cancel(struct xfs_scrub *sc); bool xchk_process_error(struct xfs_scrub *sc, xfs_agnumber_t agno, @@ -54,6 +55,10 @@ void xchk_block_set_corrupt(struct xfs_scrub *sc, void xchk_ino_set_corrupt(struct xfs_scrub *sc, xfs_ino_t ino); void xchk_fblock_set_corrupt(struct xfs_scrub *sc, int whichfork, xfs_fileoff_t offset); +#ifdef CONFIG_XFS_QUOTA +void xchk_qcheck_set_corrupt(struct xfs_scrub *sc, unsigned int dqtype, + xfs_dqid_t id); +#endif void xchk_block_xref_set_corrupt(struct xfs_scrub *sc, struct xfs_buf *bp); @@ -105,6 +110,7 @@ xchk_setup_rtsummary(struct xfs_scrub *sc) #ifdef CONFIG_XFS_QUOTA int xchk_ino_dqattach(struct xfs_scrub *sc); int xchk_setup_quota(struct xfs_scrub *sc); +int xchk_setup_quotacheck(struct xfs_scrub *sc); #else static inline int xchk_ino_dqattach(struct xfs_scrub *sc) @@ -116,12 +122,19 @@ xchk_setup_quota(struct xfs_scrub *sc) { return -ENOENT; } +static inline int +xchk_setup_quotacheck(struct xfs_scrub *sc) +{ + return -ENOENT; +} #endif int xchk_setup_fscounters(struct xfs_scrub *sc); +int xchk_setup_nlinks(struct xfs_scrub *sc); void xchk_ag_free(struct xfs_scrub *sc, struct xchk_ag *sa); int xchk_ag_init(struct xfs_scrub *sc, xfs_agnumber_t agno, struct xchk_ag *sa); +int xchk_perag_drain_and_lock(struct xfs_scrub *sc); /* * Grab all AG resources, treating the inability to grab the perag structure as diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c index 1e82c727af..4de3f0f40f 100644 --- a/fs/xfs/scrub/cow_repair.c +++ b/fs/xfs/scrub/cow_repair.c @@ -609,6 +609,6 @@ xrep_bmap_cow( out_bitmap: xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks); xoff_bitmap_destroy(&xc->bad_fileoffs); - kmem_free(xc); + kfree(xc); return error; } diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index d86ab51af9..076a310b8e 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -93,11 +93,11 @@ xchk_dir_actor( return -ECANCELED; } - if (!strncmp(".", name->name, name->len)) { + if (xfs_dir2_samename(name, &xfs_name_dot)) { /* If this is "." then check that the inum matches the dir. */ if (ino != dp->i_ino) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); - } else if (!strncmp("..", name->name, name->len)) { + } else if (xfs_dir2_samename(name, &xfs_name_dotdot)) { /* * If this is ".." in the root inode, check that the inum * matches this dir. diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index 5799e9a94f..d310737c88 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -22,6 +22,7 @@ #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" +#include "scrub/fscounters.h" /* * FS Summary Counters @@ -48,17 +49,6 @@ * our tolerance for mismatch between expected and actual counter values. */ -struct xchk_fscounters { - struct xfs_scrub *sc; - uint64_t icount; - uint64_t ifree; - uint64_t fdblocks; - uint64_t frextents; - unsigned long long icount_min; - unsigned long long icount_max; - bool frozen; -}; - /* * Since the expected value computation is lockless but only browses incore * values, the percpu counters should be fairly close to each other. However, @@ -235,14 +225,19 @@ xchk_setup_fscounters( * Pause all writer activity in the filesystem while we're scrubbing to * reduce the likelihood of background perturbations to the counters * throwing off our calculations. + * + * If we're repairing, we need to prevent any other thread from + * changing the global fs summary counters while we're repairing them. + * This requires the fs to be frozen, which will disable background + * reclaim and purge all inactive inodes. */ - if (sc->flags & XCHK_TRY_HARDER) { + if ((sc->flags & XCHK_TRY_HARDER) || xchk_could_repair(sc)) { error = xchk_fscounters_freeze(sc); if (error) return error; } - return xfs_trans_alloc_empty(sc->mp, &sc->tp); + return xchk_trans_alloc_empty(sc); } /* @@ -254,7 +249,9 @@ xchk_setup_fscounters( * set the INCOMPLETE flag even when a negative errno is returned. This care * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED, * ECANCELED) that are absorbed into a scrub state flag update by - * xchk_*_process_error. + * xchk_*_process_error. Scrub and repair share the same incore data + * structures, so the INCOMPLETE flag is critical to prevent a repair based on + * insufficient information. */ /* Count free space btree blocks manually for pre-lazysbcount filesystems. */ @@ -482,6 +479,10 @@ xchk_fscount_within_range( if (curr_value == expected) return true; + /* We require exact matches when repair is running. */ + if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) + return false; + min_value = min(old_value, curr_value); max_value = max(old_value, curr_value); diff --git a/fs/xfs/scrub/fscounters.h b/fs/xfs/scrub/fscounters.h new file mode 100644 index 0000000000..461a13d25f --- /dev/null +++ b/fs/xfs/scrub/fscounters.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_FSCOUNTERS_H__ +#define __XFS_SCRUB_FSCOUNTERS_H__ + +struct xchk_fscounters { + struct xfs_scrub *sc; + uint64_t icount; + uint64_t ifree; + uint64_t fdblocks; + uint64_t frextents; + unsigned long long icount_min; + unsigned long long icount_max; + bool frozen; +}; + +#endif /* __XFS_SCRUB_FSCOUNTERS_H__ */ diff --git a/fs/xfs/scrub/fscounters_repair.c b/fs/xfs/scrub/fscounters_repair.c new file mode 100644 index 0000000000..94cdb852be --- /dev/null +++ b/fs/xfs/scrub/fscounters_repair.c @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_rmap.h" +#include "xfs_health.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/fscounters.h" + +/* + * FS Summary Counters + * =================== + * + * We correct errors in the filesystem summary counters by setting them to the + * values computed during the obligatory scrub phase. However, we must be + * careful not to allow any other thread to change the counters while we're + * computing and setting new values. To achieve this, we freeze the + * filesystem for the whole operation if the REPAIR flag is set. The checking + * function is stricter when we've frozen the fs. + */ + +/* + * Reset the superblock counters. Caller is responsible for freezing the + * filesystem during the calculation and reset phases. + */ +int +xrep_fscounters( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xchk_fscounters *fsc = sc->buf; + + /* + * Reinitialize the in-core counters from what we computed. We froze + * the filesystem, so there shouldn't be anyone else trying to modify + * these counters. + */ + if (!fsc->frozen) { + ASSERT(fsc->frozen); + return -EFSCORRUPTED; + } + + trace_xrep_reset_counters(mp, fsc); + + percpu_counter_set(&mp->m_icount, fsc->icount); + percpu_counter_set(&mp->m_ifree, fsc->ifree); + percpu_counter_set(&mp->m_fdblocks, fsc->fdblocks); + percpu_counter_set(&mp->m_frextents, fsc->frextents); + mp->m_sb.sb_frextents = fsc->frextents; + + return 0; +} diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index 531006910c..9020a6bef7 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -14,6 +14,7 @@ #include "xfs_health.h" #include "scrub/scrub.h" #include "scrub/health.h" +#include "scrub/common.h" /* * Scrub and In-Core Filesystem Health Assessments @@ -105,6 +106,8 @@ static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = { [XFS_SCRUB_TYPE_GQUOTA] = { XHG_FS, XFS_SICK_FS_GQUOTA }, [XFS_SCRUB_TYPE_PQUOTA] = { XHG_FS, XFS_SICK_FS_PQUOTA }, [XFS_SCRUB_TYPE_FSCOUNTERS] = { XHG_FS, XFS_SICK_FS_COUNTERS }, + [XFS_SCRUB_TYPE_QUOTACHECK] = { XHG_FS, XFS_SICK_FS_QUOTACHECK }, + [XFS_SCRUB_TYPE_NLINKS] = { XHG_FS, XFS_SICK_FS_NLINKS }, }; /* Return the health status mask for this scrub type. */ @@ -148,6 +151,24 @@ xchk_file_looks_zapped( } /* + * Scrub gave the filesystem a clean bill of health, so clear all the indirect + * markers of past problems (at least for the fs and ags) so that we can be + * healthy again. + */ +STATIC void +xchk_mark_all_healthy( + struct xfs_mount *mp) +{ + struct xfs_perag *pag; + xfs_agnumber_t agno; + + xfs_fs_mark_healthy(mp, XFS_SICK_FS_INDIRECT); + xfs_rt_mark_healthy(mp, XFS_SICK_RT_INDIRECT); + for_each_perag(mp, agno, pag) + xfs_ag_mark_healthy(pag, XFS_SICK_AG_INDIRECT); +} + +/* * Update filesystem health assessments based on what we found and did. * * If the scrubber finds errors, we mark sick whatever's mentioned in @@ -164,6 +185,18 @@ xchk_update_health( struct xfs_perag *pag; bool bad; + /* + * The HEALTHY scrub type is a request from userspace to clear all the + * indirect flags after a clean scan of the entire filesystem. As such + * there's no sick flag defined for it, so we branch here ahead of the + * mask check. + */ + if (sc->sm->sm_type == XFS_SCRUB_TYPE_HEALTHY && + !(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) { + xchk_mark_all_healthy(sc->mp); + return; + } + if (!sc->sick_mask) return; @@ -173,7 +206,7 @@ xchk_update_health( case XHG_AG: pag = xfs_perag_get(sc->mp, sc->sm->sm_agno); if (bad) - xfs_ag_mark_sick(pag, sc->sick_mask); + xfs_ag_mark_corrupt(pag, sc->sick_mask); else xfs_ag_mark_healthy(pag, sc->sick_mask); xfs_perag_put(pag); @@ -181,20 +214,30 @@ xchk_update_health( case XHG_INO: if (!sc->ip) return; - if (bad) - xfs_inode_mark_sick(sc->ip, sc->sick_mask); - else + if (bad) { + unsigned int mask = sc->sick_mask; + + /* + * If we're coming in for repairs then we don't want + * sickness flags to propagate to the incore health + * status if the inode gets inactivated before we can + * fix it. + */ + if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) + mask |= XFS_SICK_INO_FORGET; + xfs_inode_mark_corrupt(sc->ip, mask); + } else xfs_inode_mark_healthy(sc->ip, sc->sick_mask); break; case XHG_FS: if (bad) - xfs_fs_mark_sick(sc->mp, sc->sick_mask); + xfs_fs_mark_corrupt(sc->mp, sc->sick_mask); else xfs_fs_mark_healthy(sc->mp, sc->sick_mask); break; case XHG_RT: if (bad) - xfs_rt_mark_sick(sc->mp, sc->sick_mask); + xfs_rt_mark_corrupt(sc->mp, sc->sick_mask); else xfs_rt_mark_healthy(sc->mp, sc->sick_mask); break; @@ -205,13 +248,13 @@ xchk_update_health( } /* Is the given per-AG btree healthy enough for scanning? */ -bool -xchk_ag_btree_healthy_enough( +void +xchk_ag_btree_del_cursor_if_sick( struct xfs_scrub *sc, - struct xfs_perag *pag, - xfs_btnum_t btnum) + struct xfs_btree_cur **curp, + unsigned int sm_type) { - unsigned int mask = 0; + unsigned int mask = (*curp)->bc_ops->sick_mask; /* * We always want the cursor if it's the same type as whatever we're @@ -220,41 +263,8 @@ xchk_ag_btree_healthy_enough( * Otherwise, we're only interested in the btree for cross-referencing. * If we know the btree is bad then don't bother, just set XFAIL. */ - switch (btnum) { - case XFS_BTNUM_BNO: - if (sc->sm->sm_type == XFS_SCRUB_TYPE_BNOBT) - return true; - mask = XFS_SICK_AG_BNOBT; - break; - case XFS_BTNUM_CNT: - if (sc->sm->sm_type == XFS_SCRUB_TYPE_CNTBT) - return true; - mask = XFS_SICK_AG_CNTBT; - break; - case XFS_BTNUM_INO: - if (sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT) - return true; - mask = XFS_SICK_AG_INOBT; - break; - case XFS_BTNUM_FINO: - if (sc->sm->sm_type == XFS_SCRUB_TYPE_FINOBT) - return true; - mask = XFS_SICK_AG_FINOBT; - break; - case XFS_BTNUM_RMAP: - if (sc->sm->sm_type == XFS_SCRUB_TYPE_RMAPBT) - return true; - mask = XFS_SICK_AG_RMAPBT; - break; - case XFS_BTNUM_REFC: - if (sc->sm->sm_type == XFS_SCRUB_TYPE_REFCNTBT) - return true; - mask = XFS_SICK_AG_REFCNTBT; - break; - default: - ASSERT(0); - return true; - } + if (sc->sm->sm_type == sm_type) + return; /* * If we just repaired some AG metadata, sc->sick_mask will reflect all @@ -266,10 +276,42 @@ xchk_ag_btree_healthy_enough( type_to_health_flag[sc->sm->sm_type].group == XHG_AG) mask &= ~sc->sick_mask; - if (xfs_ag_has_sickness(pag, mask)) { + if (xfs_ag_has_sickness((*curp)->bc_ag.pag, mask)) { sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL; - return false; + xfs_btree_del_cursor(*curp, XFS_BTREE_NOERROR); + *curp = NULL; + } +} + +/* + * Quick scan to double-check that there isn't any evidence of lingering + * primary health problems. If we're still clear, then the health update will + * take care of clearing the indirect evidence. + */ +int +xchk_health_record( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_perag *pag; + xfs_agnumber_t agno; + + unsigned int sick; + unsigned int checked; + + xfs_fs_measure_sickness(mp, &sick, &checked); + if (sick & XFS_SICK_FS_PRIMARY) + xchk_set_corrupt(sc); + + xfs_rt_measure_sickness(mp, &sick, &checked); + if (sick & XFS_SICK_RT_PRIMARY) + xchk_set_corrupt(sc); + + for_each_perag(mp, agno, pag) { + xfs_ag_measure_sickness(pag, &sick, &checked); + if (sick & XFS_SICK_AG_PRIMARY) + xchk_set_corrupt(sc); } - return true; + return 0; } diff --git a/fs/xfs/scrub/health.h b/fs/xfs/scrub/health.h index a731b24673..63fc426eb5 100644 --- a/fs/xfs/scrub/health.h +++ b/fs/xfs/scrub/health.h @@ -8,9 +8,10 @@ unsigned int xchk_health_mask_for_scrub_type(__u32 scrub_type); void xchk_update_health(struct xfs_scrub *sc); -bool xchk_ag_btree_healthy_enough(struct xfs_scrub *sc, struct xfs_perag *pag, - xfs_btnum_t btnum); +void xchk_ag_btree_del_cursor_if_sick(struct xfs_scrub *sc, + struct xfs_btree_cur **curp, unsigned int sm_type); void xchk_mark_healthy_if_clean(struct xfs_scrub *sc, unsigned int mask); bool xchk_file_looks_zapped(struct xfs_scrub *sc, unsigned int mask); +int xchk_health_record(struct xfs_scrub *sc); #endif /* __XFS_SCRUB_HEALTH_H__ */ diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index a720fc6226..750d7b0cd2 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -76,7 +76,7 @@ xchk_inobt_xref_finobt( int has_record; int error; - ASSERT(cur->bc_btnum == XFS_BTNUM_FINO); + ASSERT(xfs_btree_is_fino(cur->bc_ops)); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_record); if (error) @@ -179,7 +179,7 @@ xchk_finobt_xref_inobt( int has_record; int error; - ASSERT(cur->bc_btnum == XFS_BTNUM_INO); + ASSERT(xfs_btree_is_ino(cur->bc_ops)); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_record); if (error) @@ -514,7 +514,7 @@ xchk_iallocbt_rec_alignment( * Otherwise, we expect that the finobt record is aligned to the * cluster alignment as told by the superblock. */ - if (bs->cur->bc_btnum == XFS_BTNUM_FINO) { + if (xfs_btree_is_fino(bs->cur->bc_ops)) { unsigned int imask; imask = min_t(unsigned int, XFS_INODES_PER_CHUNK, @@ -649,8 +649,7 @@ out: */ STATIC void xchk_iallocbt_xref_rmap_btreeblks( - struct xfs_scrub *sc, - int which) + struct xfs_scrub *sc) { xfs_filblks_t blocks; xfs_extlen_t inobt_blocks = 0; @@ -688,7 +687,6 @@ xchk_iallocbt_xref_rmap_btreeblks( STATIC void xchk_iallocbt_xref_rmap_inodes( struct xfs_scrub *sc, - int which, unsigned long long inodes) { xfs_filblks_t blocks; @@ -719,17 +717,14 @@ xchk_iallocbt( .next_startino = NULLAGINO, .next_cluster_ino = NULLAGINO, }; - xfs_btnum_t which; int error; switch (sc->sm->sm_type) { case XFS_SCRUB_TYPE_INOBT: cur = sc->sa.ino_cur; - which = XFS_BTNUM_INO; break; case XFS_SCRUB_TYPE_FINOBT: cur = sc->sa.fino_cur; - which = XFS_BTNUM_FINO; break; default: ASSERT(0); @@ -741,7 +736,7 @@ xchk_iallocbt( if (error) return error; - xchk_iallocbt_xref_rmap_btreeblks(sc, which); + xchk_iallocbt_xref_rmap_btreeblks(sc); /* * If we're scrubbing the inode btree, inode_blocks is the number of @@ -750,9 +745,8 @@ xchk_iallocbt( * knows about. We can't do this for the finobt since it only points * to inode chunks with free inodes. */ - if (which == XFS_BTNUM_INO) - xchk_iallocbt_xref_rmap_inodes(sc, which, iabt.inodes); - + if (sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT) + xchk_iallocbt_xref_rmap_inodes(sc, iabt.inodes); return error; } diff --git a/fs/xfs/scrub/ialloc_repair.c b/fs/xfs/scrub/ialloc_repair.c index b3f7182dd2..a00ec7ae17 100644 --- a/fs/xfs/scrub/ialloc_repair.c +++ b/fs/xfs/scrub/ialloc_repair.c @@ -369,7 +369,7 @@ xrep_ibt_check_inode_ext( * On a sparse inode fs, this cluster could be part of a sparse chunk. * Sparse clusters must be aligned to sparse chunk alignment. */ - if (xfs_has_sparseinodes(mp) && + if (xfs_has_sparseinodes(mp) && mp->m_sb.sb_spino_align && (!IS_ALIGNED(agbno, mp->m_sb.sb_spino_align) || !IS_ALIGNED(agbno + len, mp->m_sb.sb_spino_align))) return -EFSCORRUPTED; @@ -663,8 +663,8 @@ xrep_ibt_build_new_trees( ri->new_inobt.bload.claim_block = xrep_ibt_claim_block; ri->new_inobt.bload.get_records = xrep_ibt_get_records; - ino_cur = xfs_inobt_stage_cursor(sc->sa.pag, &ri->new_inobt.afake, - XFS_BTNUM_INO); + ino_cur = xfs_inobt_init_cursor(sc->sa.pag, NULL, NULL); + xfs_btree_stage_afakeroot(ino_cur, &ri->new_inobt.afake); error = xfs_btree_bload_compute_geometry(ino_cur, &ri->new_inobt.bload, xfarray_length(ri->inode_records)); if (error) @@ -684,8 +684,8 @@ xrep_ibt_build_new_trees( ri->new_finobt.bload.claim_block = xrep_fibt_claim_block; ri->new_finobt.bload.get_records = xrep_fibt_get_records; - fino_cur = xfs_inobt_stage_cursor(sc->sa.pag, - &ri->new_finobt.afake, XFS_BTNUM_FINO); + fino_cur = xfs_finobt_init_cursor(sc->sa.pag, NULL, NULL); + xfs_btree_stage_afakeroot(fino_cur, &ri->new_finobt.afake); error = xfs_btree_bload_compute_geometry(fino_cur, &ri->new_finobt.bload, ri->finobt_recs); if (error) diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c index 0ca62d59f8..eab380e95e 100644 --- a/fs/xfs/scrub/inode_repair.c +++ b/fs/xfs/scrub/inode_repair.c @@ -37,12 +37,15 @@ #include "xfs_attr_leaf.h" #include "xfs_log_priv.h" #include "xfs_health.h" +#include "xfs_symlink_remote.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" #include "scrub/trace.h" #include "scrub/repair.h" +#include "scrub/iscan.h" +#include "scrub/readdir.h" /* * Inode Record Repair @@ -126,6 +129,10 @@ struct xrep_inode { /* Must we remove all access from this file? */ bool zap_acls; + + /* Inode scanner to see if we can find the ftype from dirents */ + struct xchk_iscan ftype_iscan; + uint8_t alleged_ftype; }; /* @@ -227,26 +234,233 @@ xrep_dinode_header( dip->di_gen = cpu_to_be32(sc->sm->sm_gen); } -/* Turn di_mode into /something/ recognizable. */ -STATIC void +/* + * If this directory entry points to the scrub target inode, then the directory + * we're scanning is the parent of the scrub target inode. + */ +STATIC int +xrep_dinode_findmode_dirent( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xfs_dir2_dataptr_t dapos, + const struct xfs_name *name, + xfs_ino_t ino, + void *priv) +{ + struct xrep_inode *ri = priv; + int error = 0; + + if (xchk_should_terminate(ri->sc, &error)) + return error; + + if (ino != sc->sm->sm_ino) + return 0; + + /* Ignore garbage directory entry names. */ + if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len)) + return -EFSCORRUPTED; + + /* Don't pick up dot or dotdot entries; we only want child dirents. */ + if (xfs_dir2_samename(name, &xfs_name_dotdot) || + xfs_dir2_samename(name, &xfs_name_dot)) + return 0; + + /* + * Uhoh, more than one parent for this inode and they don't agree on + * the file type? + */ + if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN && + ri->alleged_ftype != name->type) { + trace_xrep_dinode_findmode_dirent_inval(ri->sc, dp, name->type, + ri->alleged_ftype); + return -EFSCORRUPTED; + } + + /* We found a potential parent; remember the ftype. */ + trace_xrep_dinode_findmode_dirent(ri->sc, dp, name->type); + ri->alleged_ftype = name->type; + return 0; +} + +/* + * If this is a directory, walk the dirents looking for any that point to the + * scrub target inode. + */ +STATIC int +xrep_dinode_findmode_walk_directory( + struct xrep_inode *ri, + struct xfs_inode *dp) +{ + struct xfs_scrub *sc = ri->sc; + unsigned int lock_mode; + int error = 0; + + /* + * Scan the directory to see if there it contains an entry pointing to + * the directory that we are repairing. + */ + lock_mode = xfs_ilock_data_map_shared(dp); + + /* + * If this directory is known to be sick, we cannot scan it reliably + * and must abort. + */ + if (xfs_inode_has_sickness(dp, XFS_SICK_INO_CORE | + XFS_SICK_INO_BMBTD | + XFS_SICK_INO_DIR)) { + error = -EFSCORRUPTED; + goto out_unlock; + } + + /* + * We cannot complete our parent pointer scan if a directory looks as + * though it has been zapped by the inode record repair code. + */ + if (xchk_dir_looks_zapped(dp)) { + error = -EBUSY; + goto out_unlock; + } + + error = xchk_dir_walk(sc, dp, xrep_dinode_findmode_dirent, ri); + if (error) + goto out_unlock; + +out_unlock: + xfs_iunlock(dp, lock_mode); + return error; +} + +/* + * Try to find the mode of the inode being repaired by looking for directories + * that point down to this file. + */ +STATIC int +xrep_dinode_find_mode( + struct xrep_inode *ri, + uint16_t *mode) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_inode *dp; + int error; + + /* No ftype means we have no other metadata to consult. */ + if (!xfs_has_ftype(sc->mp)) { + *mode = S_IFREG; + return 0; + } + + /* + * Scan all directories for parents that might point down to this + * inode. Skip the inode being repaired during the scan since it + * cannot be its own parent. Note that we still hold the AGI locked + * so there's a real possibility that _iscan_iter can return EBUSY. + */ + xchk_iscan_start(sc, 5000, 100, &ri->ftype_iscan); + ri->ftype_iscan.skip_ino = sc->sm->sm_ino; + ri->alleged_ftype = XFS_DIR3_FT_UNKNOWN; + while ((error = xchk_iscan_iter(&ri->ftype_iscan, &dp)) == 1) { + if (S_ISDIR(VFS_I(dp)->i_mode)) + error = xrep_dinode_findmode_walk_directory(ri, dp); + xchk_iscan_mark_visited(&ri->ftype_iscan, dp); + xchk_irele(sc, dp); + if (error < 0) + break; + if (xchk_should_terminate(sc, &error)) + break; + } + xchk_iscan_iter_finish(&ri->ftype_iscan); + xchk_iscan_teardown(&ri->ftype_iscan); + + if (error == -EBUSY) { + if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN) { + /* + * If we got an EBUSY after finding at least one + * dirent, that means the scan found an inode on the + * inactivation list and could not open it. Accept the + * alleged ftype and install a new mode below. + */ + error = 0; + } else if (!(sc->flags & XCHK_TRY_HARDER)) { + /* + * Otherwise, retry the operation one time to see if + * the reason for the delay is an inode from the same + * cluster buffer waiting on the inactivation list. + */ + error = -EDEADLOCK; + } + } + if (error) + return error; + + /* + * Convert the discovered ftype into the file mode. If all else fails, + * return S_IFREG. + */ + switch (ri->alleged_ftype) { + case XFS_DIR3_FT_DIR: + *mode = S_IFDIR; + break; + case XFS_DIR3_FT_WHT: + case XFS_DIR3_FT_CHRDEV: + *mode = S_IFCHR; + break; + case XFS_DIR3_FT_BLKDEV: + *mode = S_IFBLK; + break; + case XFS_DIR3_FT_FIFO: + *mode = S_IFIFO; + break; + case XFS_DIR3_FT_SOCK: + *mode = S_IFSOCK; + break; + case XFS_DIR3_FT_SYMLINK: + *mode = S_IFLNK; + break; + default: + *mode = S_IFREG; + break; + } + return 0; +} + +/* Turn di_mode into /something/ recognizable. Returns true if we succeed. */ +STATIC int xrep_dinode_mode( struct xrep_inode *ri, struct xfs_dinode *dip) { struct xfs_scrub *sc = ri->sc; uint16_t mode = be16_to_cpu(dip->di_mode); + int error; trace_xrep_dinode_mode(sc, dip); if (mode == 0 || xfs_mode_to_ftype(mode) != XFS_DIR3_FT_UNKNOWN) - return; + return 0; + + /* Try to fix the mode. If we cannot, then leave everything alone. */ + error = xrep_dinode_find_mode(ri, &mode); + switch (error) { + case -EINTR: + case -EBUSY: + case -EDEADLOCK: + /* temporary failure or fatal signal */ + return error; + case 0: + /* found mode */ + break; + default: + /* some other error, assume S_IFREG */ + mode = S_IFREG; + break; + } /* bad mode, so we set it to a file that only root can read */ - mode = S_IFREG; dip->di_mode = cpu_to_be16(mode); dip->di_uid = 0; dip->di_gid = 0; ri->zap_acls = true; + return 0; } /* Fix any conflicting flags that the verifiers complain about. */ @@ -1107,12 +1321,15 @@ xrep_dinode_core( /* Fix everything the verifier will complain about. */ dip = xfs_buf_offset(bp, ri->imap.im_boffset); xrep_dinode_header(sc, dip); - xrep_dinode_mode(ri, dip); + iget_error = xrep_dinode_mode(ri, dip); + if (iget_error) + goto write; xrep_dinode_flags(sc, dip, ri->rt_extents > 0); xrep_dinode_size(ri, dip); xrep_dinode_extsize_hints(sc, dip); xrep_dinode_zap_forks(ri, dip); +write: /* Write out the inode. */ trace_xrep_dinode_fixed(sc, dip); xfs_dinode_calc_crc(sc->mp, dip); @@ -1128,7 +1345,8 @@ xrep_dinode_core( * accessing the inode. If iget fails, we still need to commit the * changes. */ - iget_error = xchk_iget(sc, ino, &sc->ip); + if (!iget_error) + iget_error = xchk_iget(sc, ino, &sc->ip); if (!iget_error) xchk_ilock(sc, XFS_IOLOCK_EXCL); @@ -1496,6 +1714,13 @@ xrep_inode( ASSERT(ri != NULL); error = xrep_dinode_problems(ri); + if (error == -EBUSY) { + /* + * Directory scan to recover inode mode encountered a + * busy inode, so we did not continue repairing things. + */ + return 0; + } if (error) return error; diff --git a/fs/xfs/scrub/iscan.c b/fs/xfs/scrub/iscan.c new file mode 100644 index 0000000000..ec3478bc50 --- /dev/null +++ b/fs/xfs/scrub/iscan.c @@ -0,0 +1,767 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_ag.h" +#include "xfs_error.h" +#include "xfs_bit.h" +#include "xfs_icache.h" +#include "scrub/scrub.h" +#include "scrub/iscan.h" +#include "scrub/common.h" +#include "scrub/trace.h" + +/* + * Live File Scan + * ============== + * + * Live file scans walk every inode in a live filesystem. This is more or + * less like a regular iwalk, except that when we're advancing the scan cursor, + * we must ensure that inodes cannot be added or deleted anywhere between the + * old cursor value and the new cursor value. If we're advancing the cursor + * by one inode, the caller must hold that inode; if we're finding the next + * inode to scan, we must grab the AGI and hold it until we've updated the + * scan cursor. + * + * Callers are expected to use this code to scan all files in the filesystem to + * construct a new metadata index of some kind. The scan races against other + * live updates, which means there must be a provision to update the new index + * when updates are made to inodes that already been scanned. The iscan lock + * can be used in live update hook code to stop the scan and protect this data + * structure. + * + * To keep the new index up to date with other metadata updates being made to + * the live filesystem, it is assumed that the caller will add hooks as needed + * to be notified when a metadata update occurs. The inode scanner must tell + * the hook code when an inode has been visited with xchk_iscan_mark_visit. + * Hook functions can use xchk_iscan_want_live_update to decide if the + * scanner's observations must be updated. + */ + +/* + * If the inobt record @rec covers @iscan->skip_ino, mark the inode free so + * that the scan ignores that inode. + */ +STATIC void +xchk_iscan_mask_skipino( + struct xchk_iscan *iscan, + struct xfs_perag *pag, + struct xfs_inobt_rec_incore *rec, + xfs_agino_t lastrecino) +{ + struct xfs_scrub *sc = iscan->sc; + struct xfs_mount *mp = sc->mp; + xfs_agnumber_t skip_agno = XFS_INO_TO_AGNO(mp, iscan->skip_ino); + xfs_agnumber_t skip_agino = XFS_INO_TO_AGINO(mp, iscan->skip_ino); + + if (pag->pag_agno != skip_agno) + return; + if (skip_agino < rec->ir_startino) + return; + if (skip_agino > lastrecino) + return; + + rec->ir_free |= xfs_inobt_maskn(skip_agino - rec->ir_startino, 1); +} + +/* + * Set *cursor to the next allocated inode after whatever it's set to now. + * If there are no more inodes in this AG, cursor is set to NULLAGINO. + */ +STATIC int +xchk_iscan_find_next( + struct xchk_iscan *iscan, + struct xfs_buf *agi_bp, + struct xfs_perag *pag, + xfs_inofree_t *allocmaskp, + xfs_agino_t *cursor, + uint8_t *nr_inodesp) +{ + struct xfs_scrub *sc = iscan->sc; + struct xfs_inobt_rec_incore rec; + struct xfs_btree_cur *cur; + struct xfs_mount *mp = sc->mp; + struct xfs_trans *tp = sc->tp; + xfs_agnumber_t agno = pag->pag_agno; + xfs_agino_t lastino = NULLAGINO; + xfs_agino_t first, last; + xfs_agino_t agino = *cursor; + int has_rec; + int error; + + /* If the cursor is beyond the end of this AG, move to the next one. */ + xfs_agino_range(mp, agno, &first, &last); + if (agino > last) { + *cursor = NULLAGINO; + return 0; + } + + /* + * Look up the inode chunk for the current cursor position. If there + * is no chunk here, we want the next one. + */ + cur = xfs_inobt_init_cursor(pag, tp, agi_bp); + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_rec); + if (!error && !has_rec) + error = xfs_btree_increment(cur, 0, &has_rec); + for (; !error; error = xfs_btree_increment(cur, 0, &has_rec)) { + xfs_inofree_t allocmask; + + /* + * If we've run out of inobt records in this AG, move the + * cursor on to the next AG and exit. The caller can try + * again with the next AG. + */ + if (!has_rec) { + *cursor = NULLAGINO; + break; + } + + error = xfs_inobt_get_rec(cur, &rec, &has_rec); + if (error) + break; + if (!has_rec) { + error = -EFSCORRUPTED; + break; + } + + /* Make sure that we always move forward. */ + if (lastino != NULLAGINO && + XFS_IS_CORRUPT(mp, lastino >= rec.ir_startino)) { + error = -EFSCORRUPTED; + break; + } + lastino = rec.ir_startino + XFS_INODES_PER_CHUNK - 1; + + /* + * If this record only covers inodes that come before the + * cursor, advance to the next record. + */ + if (rec.ir_startino + XFS_INODES_PER_CHUNK <= agino) + continue; + + if (iscan->skip_ino) + xchk_iscan_mask_skipino(iscan, pag, &rec, lastino); + + /* + * If the incoming lookup put us in the middle of an inobt + * record, mark it and the previous inodes "free" so that the + * search for allocated inodes will start at the cursor. + * We don't care about ir_freecount here. + */ + if (agino >= rec.ir_startino) + rec.ir_free |= xfs_inobt_maskn(0, + agino + 1 - rec.ir_startino); + + /* + * If there are allocated inodes in this chunk, find them + * and update the scan cursor. + */ + allocmask = ~rec.ir_free; + if (hweight64(allocmask) > 0) { + int next = xfs_lowbit64(allocmask); + + ASSERT(next >= 0); + *cursor = rec.ir_startino + next; + *allocmaskp = allocmask >> next; + *nr_inodesp = XFS_INODES_PER_CHUNK - next; + break; + } + } + + xfs_btree_del_cursor(cur, error); + return error; +} + +/* + * Advance both the scan and the visited cursors. + * + * The inumber address space for a given filesystem is sparse, which means that + * the scan cursor can jump a long ways in a single iter() call. There are no + * inodes in these sparse areas, so we must move the visited cursor forward at + * the same time so that the scan user can receive live updates for inodes that + * may get created once we release the AGI buffer. + */ +static inline void +xchk_iscan_move_cursor( + struct xchk_iscan *iscan, + xfs_agnumber_t agno, + xfs_agino_t agino) +{ + struct xfs_scrub *sc = iscan->sc; + struct xfs_mount *mp = sc->mp; + xfs_ino_t cursor, visited; + + BUILD_BUG_ON(XFS_MAXINUMBER == NULLFSINO); + + /* + * Special-case ino == 0 here so that we never set visited_ino to + * NULLFSINO when wrapping around EOFS, for that will let through all + * live updates. + */ + cursor = XFS_AGINO_TO_INO(mp, agno, agino); + if (cursor == 0) + visited = XFS_MAXINUMBER; + else + visited = cursor - 1; + + mutex_lock(&iscan->lock); + iscan->cursor_ino = cursor; + iscan->__visited_ino = visited; + trace_xchk_iscan_move_cursor(iscan); + mutex_unlock(&iscan->lock); +} + +/* + * Prepare to return agno/agino to the iscan caller by moving the lastino + * cursor to the previous inode. Do this while we still hold the AGI so that + * no other threads can create or delete inodes in this AG. + */ +static inline void +xchk_iscan_finish( + struct xchk_iscan *iscan) +{ + mutex_lock(&iscan->lock); + iscan->cursor_ino = NULLFSINO; + + /* All live updates will be applied from now on */ + iscan->__visited_ino = NULLFSINO; + + mutex_unlock(&iscan->lock); +} + +/* + * Advance ino to the next inode that the inobt thinks is allocated, being + * careful to jump to the next AG if we've reached the right end of this AG's + * inode btree. Advancing ino effectively means that we've pushed the inode + * scan forward, so set the iscan cursor to (ino - 1) so that our live update + * predicates will track inode allocations in that part of the inode number + * key space once we release the AGI buffer. + * + * Returns 1 if there's a new inode to examine, 0 if we've run out of inodes, + * -ECANCELED if the live scan aborted, or the usual negative errno. + */ +STATIC int +xchk_iscan_advance( + struct xchk_iscan *iscan, + struct xfs_perag **pagp, + struct xfs_buf **agi_bpp, + xfs_inofree_t *allocmaskp, + uint8_t *nr_inodesp) +{ + struct xfs_scrub *sc = iscan->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_buf *agi_bp; + struct xfs_perag *pag; + xfs_agnumber_t agno; + xfs_agino_t agino; + int ret; + + ASSERT(iscan->cursor_ino >= iscan->__visited_ino); + + do { + if (xchk_iscan_aborted(iscan)) + return -ECANCELED; + + agno = XFS_INO_TO_AGNO(mp, iscan->cursor_ino); + pag = xfs_perag_get(mp, agno); + if (!pag) + return -ECANCELED; + + ret = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp); + if (ret) + goto out_pag; + + agino = XFS_INO_TO_AGINO(mp, iscan->cursor_ino); + ret = xchk_iscan_find_next(iscan, agi_bp, pag, allocmaskp, + &agino, nr_inodesp); + if (ret) + goto out_buf; + + if (agino != NULLAGINO) { + /* + * Found the next inode in this AG, so return it along + * with the AGI buffer and the perag structure to + * ensure it cannot go away. + */ + xchk_iscan_move_cursor(iscan, agno, agino); + *agi_bpp = agi_bp; + *pagp = pag; + return 1; + } + + /* + * Did not find any more inodes in this AG, move on to the next + * AG. + */ + agno = (agno + 1) % mp->m_sb.sb_agcount; + xchk_iscan_move_cursor(iscan, agno, 0); + xfs_trans_brelse(sc->tp, agi_bp); + xfs_perag_put(pag); + + trace_xchk_iscan_advance_ag(iscan); + } while (iscan->cursor_ino != iscan->scan_start_ino); + + xchk_iscan_finish(iscan); + return 0; + +out_buf: + xfs_trans_brelse(sc->tp, agi_bp); +out_pag: + xfs_perag_put(pag); + return ret; +} + +/* + * Grabbing the inode failed, so we need to back up the scan and ask the caller + * to try to _advance the scan again. Returns -EBUSY if we've run out of retry + * opportunities, -ECANCELED if the process has a fatal signal pending, or + * -EAGAIN if we should try again. + */ +STATIC int +xchk_iscan_iget_retry( + struct xchk_iscan *iscan, + bool wait) +{ + ASSERT(iscan->cursor_ino == iscan->__visited_ino + 1); + + if (!iscan->iget_timeout || + time_is_before_jiffies(iscan->__iget_deadline)) + return -EBUSY; + + if (wait) { + unsigned long relax; + + /* + * Sleep for a period of time to let the rest of the system + * catch up. If we return early, someone sent a kill signal to + * the calling process. + */ + relax = msecs_to_jiffies(iscan->iget_retry_delay); + trace_xchk_iscan_iget_retry_wait(iscan); + + if (schedule_timeout_killable(relax) || + xchk_iscan_aborted(iscan)) + return -ECANCELED; + } + + iscan->cursor_ino--; + return -EAGAIN; +} + +/* + * Grab an inode as part of an inode scan. While scanning this inode, the + * caller must ensure that no other threads can modify the inode until a call + * to xchk_iscan_visit succeeds. + * + * Returns the number of incore inodes grabbed; -EAGAIN if the caller should + * call again xchk_iscan_advance; -EBUSY if we couldn't grab an inode; + * -ECANCELED if there's a fatal signal pending; or some other negative errno. + */ +STATIC int +xchk_iscan_iget( + struct xchk_iscan *iscan, + struct xfs_perag *pag, + struct xfs_buf *agi_bp, + xfs_inofree_t allocmask, + uint8_t nr_inodes) +{ + struct xfs_scrub *sc = iscan->sc; + struct xfs_mount *mp = sc->mp; + xfs_ino_t ino = iscan->cursor_ino; + unsigned int idx = 0; + unsigned int i; + int error; + + ASSERT(iscan->__inodes[0] == NULL); + + /* Fill the first slot in the inode array. */ + error = xfs_iget(sc->mp, sc->tp, ino, XFS_IGET_NORETRY, 0, + &iscan->__inodes[idx]); + + trace_xchk_iscan_iget(iscan, error); + + if (error == -ENOENT || error == -EAGAIN) { + xfs_trans_brelse(sc->tp, agi_bp); + xfs_perag_put(pag); + + /* + * It's possible that this inode has lost all of its links but + * hasn't yet been inactivated. If we don't have a transaction + * or it's not writable, flush the inodegc workers and wait. + */ + xfs_inodegc_flush(mp); + return xchk_iscan_iget_retry(iscan, true); + } + + if (error == -EINVAL) { + xfs_trans_brelse(sc->tp, agi_bp); + xfs_perag_put(pag); + + /* + * We thought the inode was allocated, but the inode btree + * lookup failed, which means that it was freed since the last + * time we advanced the cursor. Back up and try again. This + * should never happen since still hold the AGI buffer from the + * inobt check, but we need to be careful about infinite loops. + */ + return xchk_iscan_iget_retry(iscan, false); + } + + if (error) { + xfs_trans_brelse(sc->tp, agi_bp); + xfs_perag_put(pag); + return error; + } + idx++; + ino++; + allocmask >>= 1; + + /* + * Now that we've filled the first slot in __inodes, try to fill the + * rest of the batch with consecutively ordered inodes. to reduce the + * number of _iter calls. Make a bitmap of unallocated inodes from the + * zeroes in the inuse bitmap; these inodes will not be scanned, but + * the _want_live_update predicate will pass through all live updates. + * + * If we can't iget an allocated inode, stop and return what we have. + */ + mutex_lock(&iscan->lock); + iscan->__batch_ino = ino - 1; + iscan->__skipped_inomask = 0; + mutex_unlock(&iscan->lock); + + for (i = 1; i < nr_inodes; i++, ino++, allocmask >>= 1) { + if (!(allocmask & 1)) { + ASSERT(!(iscan->__skipped_inomask & (1ULL << i))); + + mutex_lock(&iscan->lock); + iscan->cursor_ino = ino; + iscan->__skipped_inomask |= (1ULL << i); + mutex_unlock(&iscan->lock); + continue; + } + + ASSERT(iscan->__inodes[idx] == NULL); + + error = xfs_iget(sc->mp, sc->tp, ino, XFS_IGET_NORETRY, 0, + &iscan->__inodes[idx]); + if (error) + break; + + mutex_lock(&iscan->lock); + iscan->cursor_ino = ino; + mutex_unlock(&iscan->lock); + idx++; + } + + trace_xchk_iscan_iget_batch(sc->mp, iscan, nr_inodes, idx); + xfs_trans_brelse(sc->tp, agi_bp); + xfs_perag_put(pag); + return idx; +} + +/* + * Advance the visit cursor to reflect skipped inodes beyond whatever we + * scanned. + */ +STATIC void +xchk_iscan_finish_batch( + struct xchk_iscan *iscan) +{ + xfs_ino_t highest_skipped; + + mutex_lock(&iscan->lock); + + if (iscan->__batch_ino != NULLFSINO) { + highest_skipped = iscan->__batch_ino + + xfs_highbit64(iscan->__skipped_inomask); + iscan->__visited_ino = max(iscan->__visited_ino, + highest_skipped); + + trace_xchk_iscan_skip(iscan); + } + + iscan->__batch_ino = NULLFSINO; + iscan->__skipped_inomask = 0; + + mutex_unlock(&iscan->lock); +} + +/* + * Advance the inode scan cursor to the next allocated inode and return up to + * 64 consecutive allocated inodes starting with the cursor position. + */ +STATIC int +xchk_iscan_iter_batch( + struct xchk_iscan *iscan) +{ + struct xfs_scrub *sc = iscan->sc; + int ret; + + xchk_iscan_finish_batch(iscan); + + if (iscan->iget_timeout) + iscan->__iget_deadline = jiffies + + msecs_to_jiffies(iscan->iget_timeout); + + do { + struct xfs_buf *agi_bp = NULL; + struct xfs_perag *pag = NULL; + xfs_inofree_t allocmask = 0; + uint8_t nr_inodes = 0; + + ret = xchk_iscan_advance(iscan, &pag, &agi_bp, &allocmask, + &nr_inodes); + if (ret != 1) + return ret; + + if (xchk_iscan_aborted(iscan)) { + xfs_trans_brelse(sc->tp, agi_bp); + xfs_perag_put(pag); + ret = -ECANCELED; + break; + } + + ret = xchk_iscan_iget(iscan, pag, agi_bp, allocmask, nr_inodes); + } while (ret == -EAGAIN); + + return ret; +} + +/* + * Advance the inode scan cursor to the next allocated inode and return the + * incore inode structure associated with it. + * + * Returns 1 if there's a new inode to examine, 0 if we've run out of inodes, + * -ECANCELED if the live scan aborted, -EBUSY if the incore inode could not be + * grabbed, or the usual negative errno. + * + * If the function returns -EBUSY and the caller can handle skipping an inode, + * it may call this function again to continue the scan with the next allocated + * inode. + */ +int +xchk_iscan_iter( + struct xchk_iscan *iscan, + struct xfs_inode **ipp) +{ + unsigned int i; + int error; + + /* Find a cached inode, or go get another batch. */ + for (i = 0; i < XFS_INODES_PER_CHUNK; i++) { + if (iscan->__inodes[i]) + goto foundit; + } + + error = xchk_iscan_iter_batch(iscan); + if (error <= 0) + return error; + + ASSERT(iscan->__inodes[0] != NULL); + i = 0; + +foundit: + /* Give the caller our reference. */ + *ipp = iscan->__inodes[i]; + iscan->__inodes[i] = NULL; + return 1; +} + +/* Clean up an xfs_iscan_iter call by dropping any inodes that we still hold. */ +void +xchk_iscan_iter_finish( + struct xchk_iscan *iscan) +{ + struct xfs_scrub *sc = iscan->sc; + unsigned int i; + + for (i = 0; i < XFS_INODES_PER_CHUNK; i++) { + if (iscan->__inodes[i]) { + xchk_irele(sc, iscan->__inodes[i]); + iscan->__inodes[i] = NULL; + } + } +} + +/* Mark this inode scan finished and release resources. */ +void +xchk_iscan_teardown( + struct xchk_iscan *iscan) +{ + xchk_iscan_iter_finish(iscan); + xchk_iscan_finish(iscan); + mutex_destroy(&iscan->lock); +} + +/* Pick an AG from which to start a scan. */ +static inline xfs_ino_t +xchk_iscan_rotor( + struct xfs_mount *mp) +{ + static atomic_t agi_rotor; + unsigned int r = atomic_inc_return(&agi_rotor) - 1; + + /* + * Rotoring *backwards* through the AGs, so we add one here before + * subtracting from the agcount to arrive at an AG number. + */ + r = (r % mp->m_sb.sb_agcount) + 1; + + return XFS_AGINO_TO_INO(mp, mp->m_sb.sb_agcount - r, 0); +} + +/* + * Set ourselves up to start an inode scan. If the @iget_timeout and + * @iget_retry_delay parameters are set, the scan will try to iget each inode + * for @iget_timeout milliseconds. If an iget call indicates that the inode is + * waiting to be inactivated, the CPU will relax for @iget_retry_delay + * milliseconds after pushing the inactivation workers. + */ +void +xchk_iscan_start( + struct xfs_scrub *sc, + unsigned int iget_timeout, + unsigned int iget_retry_delay, + struct xchk_iscan *iscan) +{ + xfs_ino_t start_ino; + + start_ino = xchk_iscan_rotor(sc->mp); + + iscan->__batch_ino = NULLFSINO; + iscan->__skipped_inomask = 0; + + iscan->sc = sc; + clear_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate); + iscan->iget_timeout = iget_timeout; + iscan->iget_retry_delay = iget_retry_delay; + iscan->__visited_ino = start_ino; + iscan->cursor_ino = start_ino; + iscan->scan_start_ino = start_ino; + mutex_init(&iscan->lock); + memset(iscan->__inodes, 0, sizeof(iscan->__inodes)); + + trace_xchk_iscan_start(iscan, start_ino); +} + +/* + * Mark this inode as having been visited. Callers must hold a sufficiently + * exclusive lock on the inode to prevent concurrent modifications. + */ +void +xchk_iscan_mark_visited( + struct xchk_iscan *iscan, + struct xfs_inode *ip) +{ + mutex_lock(&iscan->lock); + iscan->__visited_ino = ip->i_ino; + trace_xchk_iscan_visit(iscan); + mutex_unlock(&iscan->lock); +} + +/* + * Did we skip this inode because it wasn't allocated when we loaded the batch? + * If so, it is newly allocated and will not be scanned. All live updates to + * this inode must be passed to the caller to maintain scan correctness. + */ +static inline bool +xchk_iscan_skipped( + const struct xchk_iscan *iscan, + xfs_ino_t ino) +{ + if (iscan->__batch_ino == NULLFSINO) + return false; + if (ino < iscan->__batch_ino) + return false; + if (ino >= iscan->__batch_ino + XFS_INODES_PER_CHUNK) + return false; + + return iscan->__skipped_inomask & (1ULL << (ino - iscan->__batch_ino)); +} + +/* + * Do we need a live update for this inode? This is true if the scanner thread + * has visited this inode and the scan hasn't been aborted due to errors. + * Callers must hold a sufficiently exclusive lock on the inode to prevent + * scanners from reading any inode metadata. + */ +bool +xchk_iscan_want_live_update( + struct xchk_iscan *iscan, + xfs_ino_t ino) +{ + bool ret = false; + + if (xchk_iscan_aborted(iscan)) + return false; + + mutex_lock(&iscan->lock); + + trace_xchk_iscan_want_live_update(iscan, ino); + + /* Scan is finished, caller should receive all updates. */ + if (iscan->__visited_ino == NULLFSINO) { + ret = true; + goto unlock; + } + + /* + * No inodes have been visited yet, so the visited cursor points at the + * start of the scan range. The caller should not receive any updates. + */ + if (iscan->scan_start_ino == iscan->__visited_ino) { + ret = false; + goto unlock; + } + + /* + * This inode was not allocated at the time of the iscan batch. + * The caller should receive all updates. + */ + if (xchk_iscan_skipped(iscan, ino)) { + ret = true; + goto unlock; + } + + /* + * The visited cursor hasn't yet wrapped around the end of the FS. If + * @ino is inside the starred range, the caller should receive updates: + * + * 0 ------------ S ************ V ------------ EOFS + */ + if (iscan->scan_start_ino <= iscan->__visited_ino) { + if (ino >= iscan->scan_start_ino && + ino <= iscan->__visited_ino) + ret = true; + + goto unlock; + } + + /* + * The visited cursor wrapped around the end of the FS. If @ino is + * inside the starred range, the caller should receive updates: + * + * 0 ************ V ------------ S ************ EOFS + */ + if (ino >= iscan->scan_start_ino || ino <= iscan->__visited_ino) + ret = true; + +unlock: + mutex_unlock(&iscan->lock); + return ret; +} diff --git a/fs/xfs/scrub/iscan.h b/fs/xfs/scrub/iscan.h new file mode 100644 index 0000000000..71f657552d --- /dev/null +++ b/fs/xfs/scrub/iscan.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_ISCAN_H__ +#define __XFS_SCRUB_ISCAN_H__ + +struct xchk_iscan { + struct xfs_scrub *sc; + + /* Lock to protect the scan cursor. */ + struct mutex lock; + + /* + * This is the first inode in the inumber address space that we + * examined. When the scan wraps around back to here, the scan is + * finished. + */ + xfs_ino_t scan_start_ino; + + /* This is the inode that will be examined next. */ + xfs_ino_t cursor_ino; + + /* If nonzero and non-NULL, skip this inode when scanning. */ + xfs_ino_t skip_ino; + + /* + * This is the last inode that we've successfully scanned, either + * because the caller scanned it, or we moved the cursor past an empty + * part of the inode address space. Scan callers should only use the + * xchk_iscan_visit function to modify this. + */ + xfs_ino_t __visited_ino; + + /* Operational state of the livescan. */ + unsigned long __opstate; + + /* Give up on iterating @cursor_ino if we can't iget it by this time. */ + unsigned long __iget_deadline; + + /* Amount of time (in ms) that we will try to iget an inode. */ + unsigned int iget_timeout; + + /* Wait this many ms to retry an iget. */ + unsigned int iget_retry_delay; + + /* + * The scan grabs batches of inodes and stashes them here before + * handing them out with _iter. Unallocated inodes are set in the + * mask so that all updates to that inode are selected for live + * update propagation. + */ + xfs_ino_t __batch_ino; + xfs_inofree_t __skipped_inomask; + struct xfs_inode *__inodes[XFS_INODES_PER_CHUNK]; +}; + +/* Set if the scan has been aborted due to some event in the fs. */ +#define XCHK_ISCAN_OPSTATE_ABORTED (1) + +static inline bool +xchk_iscan_aborted(const struct xchk_iscan *iscan) +{ + return test_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate); +} + +static inline void +xchk_iscan_abort(struct xchk_iscan *iscan) +{ + set_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate); +} + +void xchk_iscan_start(struct xfs_scrub *sc, unsigned int iget_timeout, + unsigned int iget_retry_delay, struct xchk_iscan *iscan); +void xchk_iscan_teardown(struct xchk_iscan *iscan); + +int xchk_iscan_iter(struct xchk_iscan *iscan, struct xfs_inode **ipp); +void xchk_iscan_iter_finish(struct xchk_iscan *iscan); + +void xchk_iscan_mark_visited(struct xchk_iscan *iscan, struct xfs_inode *ip); +bool xchk_iscan_want_live_update(struct xchk_iscan *iscan, xfs_ino_t ino); + +#endif /* __XFS_SCRUB_ISCAN_H__ */ diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c index bb6d980b4f..4a0271123d 100644 --- a/fs/xfs/scrub/newbt.c +++ b/fs/xfs/scrub/newbt.c @@ -239,7 +239,11 @@ xrep_newbt_alloc_ag_blocks( xrep_newbt_validate_ag_alloc_hint(xnr); - error = xfs_alloc_vextent_near_bno(&args, xnr->alloc_hint); + if (xnr->alloc_vextent) + error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint); + else + error = xfs_alloc_vextent_near_bno(&args, + xnr->alloc_hint); if (error) return error; if (args.fsbno == NULLFSBLOCK) @@ -309,7 +313,11 @@ xrep_newbt_alloc_file_blocks( xrep_newbt_validate_file_alloc_hint(xnr); - error = xfs_alloc_vextent_start_ag(&args, xnr->alloc_hint); + if (xnr->alloc_vextent) + error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint); + else + error = xfs_alloc_vextent_start_ag(&args, + xnr->alloc_hint); if (error) return error; if (args.fsbno == NULLFSBLOCK) @@ -535,7 +543,7 @@ xrep_newbt_claim_block( trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1, xnr->oinfo.oi_owner); - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno, agbno)); else diff --git a/fs/xfs/scrub/newbt.h b/fs/xfs/scrub/newbt.h index 89f8e3970b..3d804d31af 100644 --- a/fs/xfs/scrub/newbt.h +++ b/fs/xfs/scrub/newbt.h @@ -6,6 +6,8 @@ #ifndef __XFS_SCRUB_NEWBT_H__ #define __XFS_SCRUB_NEWBT_H__ +struct xfs_alloc_arg; + struct xrep_newbt_resv { /* Link to list of extents that we've reserved. */ struct list_head list; @@ -28,6 +30,11 @@ struct xrep_newbt_resv { struct xrep_newbt { struct xfs_scrub *sc; + /* Custom allocation function, or NULL for xfs_alloc_vextent */ + int (*alloc_vextent)(struct xfs_scrub *sc, + struct xfs_alloc_arg *args, + xfs_fsblock_t alloc_hint); + /* List of extents that we've reserved. */ struct list_head resv_list; diff --git a/fs/xfs/scrub/nlinks.c b/fs/xfs/scrub/nlinks.c new file mode 100644 index 0000000000..8a7d955789 --- /dev/null +++ b/fs/xfs/scrub/nlinks.c @@ -0,0 +1,930 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_iwalk.h" +#include "xfs_ialloc.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_ag.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/repair.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/iscan.h" +#include "scrub/nlinks.h" +#include "scrub/trace.h" +#include "scrub/readdir.h" + +/* + * Live Inode Link Count Checking + * ============================== + * + * Inode link counts are "summary" metadata, in the sense that they are + * computed as the number of directory entries referencing each file on the + * filesystem. Therefore, we compute the correct link counts by creating a + * shadow link count structure and walking every inode. + */ + +/* Set us up to scrub inode link counts. */ +int +xchk_setup_nlinks( + struct xfs_scrub *sc) +{ + xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS); + + sc->buf = kzalloc(sizeof(struct xchk_nlink_ctrs), XCHK_GFP_FLAGS); + if (!sc->buf) + return -ENOMEM; + + return xchk_setup_fs(sc); +} + +/* + * Part 1: Collecting file link counts. For each file, we create a shadow link + * counting structure, then walk the entire directory tree, incrementing parent + * and child link counts for each directory entry seen. + * + * To avoid false corruption reports in part 2, any failure in this part must + * set the INCOMPLETE flag even when a negative errno is returned. This care + * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED, + * ECANCELED) that are absorbed into a scrub state flag update by + * xchk_*_process_error. Scrub and repair share the same incore data + * structures, so the INCOMPLETE flag is critical to prevent a repair based on + * insufficient information. + * + * Because we are scanning a live filesystem, it's possible that another thread + * will try to update the link counts for an inode that we've already scanned. + * This will cause our counts to be incorrect. Therefore, we hook all + * directory entry updates because that is when link count updates occur. By + * shadowing transaction updates in this manner, live nlink check can ensure by + * locking the inode and the shadow structure that its own copies are not out + * of date. Because the hook code runs in a different process context from the + * scrub code and the scrub state flags are not accessed atomically, failures + * in the hook code must abort the iscan and the scrubber must notice the + * aborted scan and set the incomplete flag. + * + * Note that we use jump labels and srcu notifier hooks to minimize the + * overhead when live nlinks is /not/ running. Locking order for nlink + * observations is inode ILOCK -> iscan_lock/xchk_nlink_ctrs lock. + */ + +/* + * Add a delta to an nlink counter, clamping the value to U32_MAX. Because + * XFS_MAXLINK < U32_MAX, the checking code will produce the correct results + * even if we lose some precision. + */ +static inline void +careful_add( + xfs_nlink_t *nlinkp, + int delta) +{ + uint64_t new_value = (uint64_t)(*nlinkp) + delta; + + BUILD_BUG_ON(XFS_MAXLINK > U32_MAX); + *nlinkp = min_t(uint64_t, new_value, U32_MAX); +} + +/* Update incore link count information. Caller must hold the nlinks lock. */ +STATIC int +xchk_nlinks_update_incore( + struct xchk_nlink_ctrs *xnc, + xfs_ino_t ino, + int parents_delta, + int backrefs_delta, + int children_delta) +{ + struct xchk_nlink nl; + int error; + + if (!xnc->nlinks) + return 0; + + error = xfarray_load_sparse(xnc->nlinks, ino, &nl); + if (error) + return error; + + trace_xchk_nlinks_update_incore(xnc->sc->mp, ino, &nl, parents_delta, + backrefs_delta, children_delta); + + careful_add(&nl.parents, parents_delta); + careful_add(&nl.backrefs, backrefs_delta); + careful_add(&nl.children, children_delta); + + nl.flags |= XCHK_NLINK_WRITTEN; + error = xfarray_store(xnc->nlinks, ino, &nl); + if (error == -EFBIG) { + /* + * EFBIG means we tried to store data at too high a byte offset + * in the sparse array. IOWs, we cannot complete the check and + * must notify userspace that the check was incomplete. + */ + error = -ECANCELED; + } + return error; +} + +/* + * Apply a link count change from the regular filesystem into our shadow link + * count structure based on a directory update in progress. + */ +STATIC int +xchk_nlinks_live_update( + struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct xfs_dir_update_params *p = data; + struct xchk_nlink_ctrs *xnc; + int error; + + xnc = container_of(nb, struct xchk_nlink_ctrs, dhook.dirent_hook.nb); + + trace_xchk_nlinks_live_update(xnc->sc->mp, p->dp, action, p->ip->i_ino, + p->delta, p->name->name, p->name->len); + + /* + * If we've already scanned @dp, update the number of parents that link + * to @ip. If @ip is a subdirectory, update the number of child links + * going out of @dp. + */ + if (xchk_iscan_want_live_update(&xnc->collect_iscan, p->dp->i_ino)) { + mutex_lock(&xnc->lock); + error = xchk_nlinks_update_incore(xnc, p->ip->i_ino, p->delta, + 0, 0); + if (!error && S_ISDIR(VFS_IC(p->ip)->i_mode)) + error = xchk_nlinks_update_incore(xnc, p->dp->i_ino, 0, + 0, p->delta); + mutex_unlock(&xnc->lock); + if (error) + goto out_abort; + } + + /* + * If @ip is a subdirectory and we've already scanned it, update the + * number of backrefs pointing to @dp. + */ + if (S_ISDIR(VFS_IC(p->ip)->i_mode) && + xchk_iscan_want_live_update(&xnc->collect_iscan, p->ip->i_ino)) { + mutex_lock(&xnc->lock); + error = xchk_nlinks_update_incore(xnc, p->dp->i_ino, 0, + p->delta, 0); + mutex_unlock(&xnc->lock); + if (error) + goto out_abort; + } + + return NOTIFY_DONE; + +out_abort: + xchk_iscan_abort(&xnc->collect_iscan); + return NOTIFY_DONE; +} + +/* Bump the observed link count for the inode referenced by this entry. */ +STATIC int +xchk_nlinks_collect_dirent( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xfs_dir2_dataptr_t dapos, + const struct xfs_name *name, + xfs_ino_t ino, + void *priv) +{ + struct xchk_nlink_ctrs *xnc = priv; + bool dot = false, dotdot = false; + int error; + + /* Does this name make sense? */ + if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len)) { + error = -ECANCELED; + goto out_abort; + } + + if (name->len == 1 && name->name[0] == '.') + dot = true; + else if (name->len == 2 && name->name[0] == '.' && + name->name[1] == '.') + dotdot = true; + + /* Don't accept a '.' entry that points somewhere else. */ + if (dot && ino != dp->i_ino) { + error = -ECANCELED; + goto out_abort; + } + + /* Don't accept an invalid inode number. */ + if (!xfs_verify_dir_ino(sc->mp, ino)) { + error = -ECANCELED; + goto out_abort; + } + + /* Update the shadow link counts if we haven't already failed. */ + + if (xchk_iscan_aborted(&xnc->collect_iscan)) { + error = -ECANCELED; + goto out_incomplete; + } + + trace_xchk_nlinks_collect_dirent(sc->mp, dp, ino, name); + + mutex_lock(&xnc->lock); + + /* + * If this is a dotdot entry, it is a back link from dp to ino. How + * we handle this depends on whether or not dp is the root directory. + * + * The root directory is its own parent, so we pretend the dotdot entry + * establishes the "parent" of the root directory. Increment the + * number of parents of the root directory. + * + * Otherwise, increment the number of backrefs pointing back to ino. + */ + if (dotdot) { + if (dp == sc->mp->m_rootip) + error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0); + else + error = xchk_nlinks_update_incore(xnc, ino, 0, 1, 0); + if (error) + goto out_unlock; + } + + /* + * If this dirent is a forward link from dp to ino, increment the + * number of parents linking into ino. + */ + if (!dot && !dotdot) { + error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0); + if (error) + goto out_unlock; + } + + /* + * If this dirent is a forward link to a subdirectory, increment the + * number of child links of dp. + */ + if (!dot && !dotdot && name->type == XFS_DIR3_FT_DIR) { + error = xchk_nlinks_update_incore(xnc, dp->i_ino, 0, 0, 1); + if (error) + goto out_unlock; + } + + mutex_unlock(&xnc->lock); + return 0; + +out_unlock: + mutex_unlock(&xnc->lock); +out_abort: + xchk_iscan_abort(&xnc->collect_iscan); +out_incomplete: + xchk_set_incomplete(sc); + return error; +} + +/* Walk a directory to bump the observed link counts of the children. */ +STATIC int +xchk_nlinks_collect_dir( + struct xchk_nlink_ctrs *xnc, + struct xfs_inode *dp) +{ + struct xfs_scrub *sc = xnc->sc; + unsigned int lock_mode; + int error = 0; + + /* Prevent anyone from changing this directory while we walk it. */ + xfs_ilock(dp, XFS_IOLOCK_SHARED); + lock_mode = xfs_ilock_data_map_shared(dp); + + /* + * The dotdot entry of an unlinked directory still points to the last + * parent, but the parent no longer links to this directory. Skip the + * directory to avoid overcounting. + */ + if (VFS_I(dp)->i_nlink == 0) + goto out_unlock; + + /* + * We cannot count file links if the directory looks as though it has + * been zapped by the inode record repair code. + */ + if (xchk_dir_looks_zapped(dp)) { + error = -EBUSY; + goto out_abort; + } + + error = xchk_dir_walk(sc, dp, xchk_nlinks_collect_dirent, xnc); + if (error == -ECANCELED) { + error = 0; + goto out_unlock; + } + if (error) + goto out_abort; + + xchk_iscan_mark_visited(&xnc->collect_iscan, dp); + goto out_unlock; + +out_abort: + xchk_set_incomplete(sc); + xchk_iscan_abort(&xnc->collect_iscan); +out_unlock: + xfs_iunlock(dp, lock_mode); + xfs_iunlock(dp, XFS_IOLOCK_SHARED); + return error; +} + +/* If this looks like a valid pointer, count it. */ +static inline int +xchk_nlinks_collect_metafile( + struct xchk_nlink_ctrs *xnc, + xfs_ino_t ino) +{ + if (!xfs_verify_ino(xnc->sc->mp, ino)) + return 0; + + trace_xchk_nlinks_collect_metafile(xnc->sc->mp, ino); + return xchk_nlinks_update_incore(xnc, ino, 1, 0, 0); +} + +/* Bump the link counts of metadata files rooted in the superblock. */ +STATIC int +xchk_nlinks_collect_metafiles( + struct xchk_nlink_ctrs *xnc) +{ + struct xfs_mount *mp = xnc->sc->mp; + int error = -ECANCELED; + + + if (xchk_iscan_aborted(&xnc->collect_iscan)) + goto out_incomplete; + + mutex_lock(&xnc->lock); + error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_rbmino); + if (error) + goto out_abort; + + error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_rsumino); + if (error) + goto out_abort; + + error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_uquotino); + if (error) + goto out_abort; + + error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_gquotino); + if (error) + goto out_abort; + + error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_pquotino); + if (error) + goto out_abort; + mutex_unlock(&xnc->lock); + + return 0; + +out_abort: + mutex_unlock(&xnc->lock); + xchk_iscan_abort(&xnc->collect_iscan); +out_incomplete: + xchk_set_incomplete(xnc->sc); + return error; +} + +/* Advance the collection scan cursor for this non-directory file. */ +static inline int +xchk_nlinks_collect_file( + struct xchk_nlink_ctrs *xnc, + struct xfs_inode *ip) +{ + xfs_ilock(ip, XFS_IOLOCK_SHARED); + xchk_iscan_mark_visited(&xnc->collect_iscan, ip); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return 0; +} + +/* Walk all directories and count inode links. */ +STATIC int +xchk_nlinks_collect( + struct xchk_nlink_ctrs *xnc) +{ + struct xfs_scrub *sc = xnc->sc; + struct xfs_inode *ip; + int error; + + /* Count the rt and quota files that are rooted in the superblock. */ + error = xchk_nlinks_collect_metafiles(xnc); + if (error) + return error; + + /* + * Set up for a potentially lengthy filesystem scan by reducing our + * transaction resource usage for the duration. Specifically: + * + * Cancel the transaction to release the log grant space while we scan + * the filesystem. + * + * Create a new empty transaction to eliminate the possibility of the + * inode scan deadlocking on cyclical metadata. + * + * We pass the empty transaction to the file scanning function to avoid + * repeatedly cycling empty transactions. This can be done even though + * we take the IOLOCK to quiesce the file because empty transactions + * do not take sb_internal. + */ + xchk_trans_cancel(sc); + error = xchk_trans_alloc_empty(sc); + if (error) + return error; + + while ((error = xchk_iscan_iter(&xnc->collect_iscan, &ip)) == 1) { + if (S_ISDIR(VFS_I(ip)->i_mode)) + error = xchk_nlinks_collect_dir(xnc, ip); + else + error = xchk_nlinks_collect_file(xnc, ip); + xchk_irele(sc, ip); + if (error) + break; + + if (xchk_should_terminate(sc, &error)) + break; + } + xchk_iscan_iter_finish(&xnc->collect_iscan); + if (error) { + xchk_set_incomplete(sc); + /* + * If we couldn't grab an inode that was busy with a state + * change, change the error code so that we exit to userspace + * as quickly as possible. + */ + if (error == -EBUSY) + return -ECANCELED; + return error; + } + + /* + * Switch out for a real transaction in preparation for building a new + * tree. + */ + xchk_trans_cancel(sc); + return xchk_setup_fs(sc); +} + +/* + * Part 2: Comparing file link counters. Walk each inode and compare the link + * counts against our shadow information; and then walk each shadow link count + * structure (that wasn't covered in the first part), comparing it against the + * file. + */ + +/* Read the observed link count for comparison with the actual inode. */ +STATIC int +xchk_nlinks_comparison_read( + struct xchk_nlink_ctrs *xnc, + xfs_ino_t ino, + struct xchk_nlink *obs) +{ + struct xchk_nlink nl; + int error; + + error = xfarray_load_sparse(xnc->nlinks, ino, &nl); + if (error) + return error; + + nl.flags |= (XCHK_NLINK_COMPARE_SCANNED | XCHK_NLINK_WRITTEN); + + error = xfarray_store(xnc->nlinks, ino, &nl); + if (error == -EFBIG) { + /* + * EFBIG means we tried to store data at too high a byte offset + * in the sparse array. IOWs, we cannot complete the check and + * must notify userspace that the check was incomplete. This + * shouldn't really happen outside of the collection phase. + */ + xchk_set_incomplete(xnc->sc); + return -ECANCELED; + } + if (error) + return error; + + /* Copy the counters, but do not expose the internal state. */ + obs->parents = nl.parents; + obs->backrefs = nl.backrefs; + obs->children = nl.children; + obs->flags = 0; + return 0; +} + +/* Check our link count against an inode. */ +STATIC int +xchk_nlinks_compare_inode( + struct xchk_nlink_ctrs *xnc, + struct xfs_inode *ip) +{ + struct xchk_nlink obs; + struct xfs_scrub *sc = xnc->sc; + uint64_t total_links; + unsigned int actual_nlink; + int error; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + mutex_lock(&xnc->lock); + + if (xchk_iscan_aborted(&xnc->collect_iscan)) { + xchk_set_incomplete(xnc->sc); + error = -ECANCELED; + goto out_scanlock; + } + + error = xchk_nlinks_comparison_read(xnc, ip->i_ino, &obs); + if (error) + goto out_scanlock; + + /* + * If we don't have ftype to get an accurate count of the subdirectory + * entries in this directory, take advantage of the fact that on a + * consistent ftype=0 filesystem, the number of subdirectory + * backreferences (dotdot entries) pointing towards this directory + * should be equal to the number of subdirectory entries in the + * directory. + */ + if (!xfs_has_ftype(sc->mp) && S_ISDIR(VFS_I(ip)->i_mode)) + obs.children = obs.backrefs; + + total_links = xchk_nlink_total(ip, &obs); + actual_nlink = VFS_I(ip)->i_nlink; + + trace_xchk_nlinks_compare_inode(sc->mp, ip, &obs); + + /* + * If we found so many parents that we'd overflow i_nlink, we must flag + * this as a corruption. The VFS won't let users increase the link + * count, but it will let them decrease it. + */ + if (total_links > XFS_MAXLINK) { + xchk_ino_set_corrupt(sc, ip->i_ino); + goto out_corrupt; + } + + /* Link counts should match. */ + if (total_links != actual_nlink) { + xchk_ino_set_corrupt(sc, ip->i_ino); + goto out_corrupt; + } + + if (S_ISDIR(VFS_I(ip)->i_mode) && actual_nlink > 0) { + /* + * The collection phase ignores directories with zero link + * count, so we ignore them here too. + * + * The number of subdirectory backreferences (dotdot entries) + * pointing towards this directory should be equal to the + * number of subdirectory entries in the directory. + */ + if (obs.children != obs.backrefs) + xchk_ino_xref_set_corrupt(sc, ip->i_ino); + } else { + /* + * Non-directories and unlinked directories should not have + * back references. + */ + if (obs.backrefs != 0) { + xchk_ino_set_corrupt(sc, ip->i_ino); + goto out_corrupt; + } + + /* + * Non-directories and unlinked directories should not have + * children. + */ + if (obs.children != 0) { + xchk_ino_set_corrupt(sc, ip->i_ino); + goto out_corrupt; + } + } + + if (ip == sc->mp->m_rootip) { + /* + * For the root of a directory tree, both the '.' and '..' + * entries should point to the root directory. The dotdot + * entry is counted as a parent of the root /and/ a backref of + * the root directory. + */ + if (obs.parents != 1) { + xchk_ino_set_corrupt(sc, ip->i_ino); + goto out_corrupt; + } + } else if (actual_nlink > 0) { + /* + * Linked files that are not the root directory should have at + * least one parent. + */ + if (obs.parents == 0) { + xchk_ino_set_corrupt(sc, ip->i_ino); + goto out_corrupt; + } + } + +out_corrupt: + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + error = -ECANCELED; +out_scanlock: + mutex_unlock(&xnc->lock); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return error; +} + +/* + * Check our link count against an inode that wasn't checked previously. This + * is intended to catch directories with dangling links, though we could be + * racing with inode allocation in other threads. + */ +STATIC int +xchk_nlinks_compare_inum( + struct xchk_nlink_ctrs *xnc, + xfs_ino_t ino) +{ + struct xchk_nlink obs; + struct xfs_mount *mp = xnc->sc->mp; + struct xfs_trans *tp = xnc->sc->tp; + struct xfs_buf *agi_bp; + struct xfs_inode *ip; + int error; + + /* + * The first iget failed, so try again with the variant that returns + * either an incore inode or the AGI buffer. If the function returns + * EINVAL/ENOENT, it should have passed us the AGI buffer so that we + * can guarantee that the inode won't be allocated while we check for + * a zero link count in the observed link count data. + */ + error = xchk_iget_agi(xnc->sc, ino, &agi_bp, &ip); + if (!error) { + /* Actually got an inode, so use the inode compare. */ + error = xchk_nlinks_compare_inode(xnc, ip); + xchk_irele(xnc->sc, ip); + return error; + } + if (error == -ENOENT || error == -EINVAL) { + /* No inode was found. Check for zero link count below. */ + error = 0; + } + if (error) + goto out_agi; + + /* Ensure that we have protected against inode allocation/freeing. */ + if (agi_bp == NULL) { + ASSERT(agi_bp != NULL); + xchk_set_incomplete(xnc->sc); + return -ECANCELED; + } + + if (xchk_iscan_aborted(&xnc->collect_iscan)) { + xchk_set_incomplete(xnc->sc); + error = -ECANCELED; + goto out_agi; + } + + mutex_lock(&xnc->lock); + error = xchk_nlinks_comparison_read(xnc, ino, &obs); + if (error) + goto out_scanlock; + + trace_xchk_nlinks_check_zero(mp, ino, &obs); + + /* + * If we can't grab the inode, the link count had better be zero. We + * still hold the AGI to prevent inode allocation/freeing. + */ + if (xchk_nlink_total(NULL, &obs) != 0) { + xchk_ino_set_corrupt(xnc->sc, ino); + error = -ECANCELED; + } + +out_scanlock: + mutex_unlock(&xnc->lock); +out_agi: + if (agi_bp) + xfs_trans_brelse(tp, agi_bp); + return error; +} + +/* + * Try to visit every inode in the filesystem to compare the link count. Move + * on if we can't grab an inode, since we'll revisit unchecked nlink records in + * the second part. + */ +static int +xchk_nlinks_compare_iter( + struct xchk_nlink_ctrs *xnc, + struct xfs_inode **ipp) +{ + int error; + + do { + error = xchk_iscan_iter(&xnc->compare_iscan, ipp); + } while (error == -EBUSY); + + return error; +} + +/* Compare the link counts we observed against the live information. */ +STATIC int +xchk_nlinks_compare( + struct xchk_nlink_ctrs *xnc) +{ + struct xchk_nlink nl; + struct xfs_scrub *sc = xnc->sc; + struct xfs_inode *ip; + xfarray_idx_t cur = XFARRAY_CURSOR_INIT; + int error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* + * Create a new empty transaction so that we can advance the iscan + * cursor without deadlocking if the inobt has a cycle and push on the + * inactivation workqueue. + */ + xchk_trans_cancel(sc); + error = xchk_trans_alloc_empty(sc); + if (error) + return error; + + /* + * Use the inobt to walk all allocated inodes to compare the link + * counts. Inodes skipped by _compare_iter will be tried again in the + * next phase of the scan. + */ + xchk_iscan_start(sc, 0, 0, &xnc->compare_iscan); + while ((error = xchk_nlinks_compare_iter(xnc, &ip)) == 1) { + error = xchk_nlinks_compare_inode(xnc, ip); + xchk_iscan_mark_visited(&xnc->compare_iscan, ip); + xchk_irele(sc, ip); + if (error) + break; + + if (xchk_should_terminate(sc, &error)) + break; + } + xchk_iscan_iter_finish(&xnc->compare_iscan); + xchk_iscan_teardown(&xnc->compare_iscan); + if (error) + return error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* + * Walk all the non-null nlink observations that weren't checked in the + * previous step. + */ + mutex_lock(&xnc->lock); + while ((error = xfarray_iter(xnc->nlinks, &cur, &nl)) == 1) { + xfs_ino_t ino = cur - 1; + + if (nl.flags & XCHK_NLINK_COMPARE_SCANNED) + continue; + + mutex_unlock(&xnc->lock); + + error = xchk_nlinks_compare_inum(xnc, ino); + if (error) + return error; + + if (xchk_should_terminate(xnc->sc, &error)) + return error; + + mutex_lock(&xnc->lock); + } + mutex_unlock(&xnc->lock); + + return error; +} + +/* Tear down everything associated with a nlinks check. */ +static void +xchk_nlinks_teardown_scan( + void *priv) +{ + struct xchk_nlink_ctrs *xnc = priv; + + /* Discourage any hook functions that might be running. */ + xchk_iscan_abort(&xnc->collect_iscan); + + xfs_dir_hook_del(xnc->sc->mp, &xnc->dhook); + + xfarray_destroy(xnc->nlinks); + xnc->nlinks = NULL; + + xchk_iscan_teardown(&xnc->collect_iscan); + mutex_destroy(&xnc->lock); + xnc->sc = NULL; +} + +/* + * Scan all inodes in the entire filesystem to generate link count data. If + * the scan is successful, the counts will be left alive for a repair. If any + * error occurs, we'll tear everything down. + */ +STATIC int +xchk_nlinks_setup_scan( + struct xfs_scrub *sc, + struct xchk_nlink_ctrs *xnc) +{ + struct xfs_mount *mp = sc->mp; + char *descr; + unsigned long long max_inos; + xfs_agnumber_t last_agno = mp->m_sb.sb_agcount - 1; + xfs_agino_t first_agino, last_agino; + int error; + + ASSERT(xnc->sc == NULL); + xnc->sc = sc; + + mutex_init(&xnc->lock); + + /* Retry iget every tenth of a second for up to 30 seconds. */ + xchk_iscan_start(sc, 30000, 100, &xnc->collect_iscan); + + /* + * Set up enough space to store an nlink record for the highest + * possible inode number in this system. + */ + xfs_agino_range(mp, last_agno, &first_agino, &last_agino); + max_inos = XFS_AGINO_TO_INO(mp, last_agno, last_agino) + 1; + descr = xchk_xfile_descr(sc, "file link counts"); + error = xfarray_create(descr, min(XFS_MAXINUMBER + 1, max_inos), + sizeof(struct xchk_nlink), &xnc->nlinks); + kfree(descr); + if (error) + goto out_teardown; + + /* + * Hook into the directory entry code so that we can capture updates to + * file link counts. The hook only triggers for inodes that were + * already scanned, and the scanner thread takes each inode's ILOCK, + * which means that any in-progress inode updates will finish before we + * can scan the inode. + */ + ASSERT(sc->flags & XCHK_FSGATES_DIRENTS); + xfs_dir_hook_setup(&xnc->dhook, xchk_nlinks_live_update); + error = xfs_dir_hook_add(mp, &xnc->dhook); + if (error) + goto out_teardown; + + /* Use deferred cleanup to pass the inode link count data to repair. */ + sc->buf_cleanup = xchk_nlinks_teardown_scan; + return 0; + +out_teardown: + xchk_nlinks_teardown_scan(xnc); + return error; +} + +/* Scrub the link count of all inodes on the filesystem. */ +int +xchk_nlinks( + struct xfs_scrub *sc) +{ + struct xchk_nlink_ctrs *xnc = sc->buf; + int error = 0; + + /* Set ourselves up to check link counts on the live filesystem. */ + error = xchk_nlinks_setup_scan(sc, xnc); + if (error) + return error; + + /* Walk all inodes, picking up link count information. */ + error = xchk_nlinks_collect(xnc); + if (!xchk_xref_process_error(sc, 0, 0, &error)) + return error; + + /* Fail fast if we're not playing with a full dataset. */ + if (xchk_iscan_aborted(&xnc->collect_iscan)) + xchk_set_incomplete(sc); + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) + return 0; + + /* Compare link counts. */ + error = xchk_nlinks_compare(xnc); + if (!xchk_xref_process_error(sc, 0, 0, &error)) + return error; + + /* Check one last time for an incomplete dataset. */ + if (xchk_iscan_aborted(&xnc->collect_iscan)) + xchk_set_incomplete(sc); + + return 0; +} diff --git a/fs/xfs/scrub/nlinks.h b/fs/xfs/scrub/nlinks.h new file mode 100644 index 0000000000..a950f3daf2 --- /dev/null +++ b/fs/xfs/scrub/nlinks.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_NLINKS_H__ +#define __XFS_SCRUB_NLINKS_H__ + +/* Live link count control structure. */ +struct xchk_nlink_ctrs { + struct xfs_scrub *sc; + + /* Shadow link count data and its mutex. */ + struct xfarray *nlinks; + struct mutex lock; + + /* + * The collection step uses a separate iscan context from the compare + * step because the collection iscan coordinates live updates to the + * observation data while this scanner is running. The compare iscan + * is secondary and can be reinitialized as needed. + */ + struct xchk_iscan collect_iscan; + struct xchk_iscan compare_iscan; + + /* + * Hook into directory updates so that we can receive live updates + * from other writer threads. + */ + struct xfs_dir_hook dhook; +}; + +/* + * In-core link counts for a given inode in the filesystem. + * + * For an empty rootdir, the directory entries and the field to which they are + * accounted are as follows: + * + * Root directory: + * + * . points to self (root.child) + * .. points to self (root.parent) + * f1 points to a child file (f1.parent) + * d1 points to a child dir (d1.parent, root.child) + * + * Subdirectory d1: + * + * . points to self (d1.child) + * .. points to root dir (root.backref) + * f2 points to child file (f2.parent) + * f3 points to root.f1 (f1.parent) + * + * root.nlink == 3 (root.dot, root.dotdot, root.d1) + * d1.nlink == 2 (root.d1, d1.dot) + * f1.nlink == 2 (root.f1, d1.f3) + * f2.nlink == 1 (d1.f2) + */ +struct xchk_nlink { + /* Count of forward links from parent directories to this file. */ + xfs_nlink_t parents; + + /* + * Count of back links to this parent directory from child + * subdirectories. + */ + xfs_nlink_t backrefs; + + /* + * Count of forward links from this directory to all child files and + * the number of dot entries. Should be zero for non-directories. + */ + xfs_nlink_t children; + + /* Record state flags */ + unsigned int flags; +}; + +/* + * This incore link count has been written at least once. We never want to + * store an xchk_nlink that looks uninitialized. + */ +#define XCHK_NLINK_WRITTEN (1U << 0) + +/* Already checked this link count record. */ +#define XCHK_NLINK_COMPARE_SCANNED (1U << 1) + +/* Already made a repair with this link count record. */ +#define XREP_NLINK_DIRTY (1U << 2) + +/* Compute total link count, using large enough variables to detect overflow. */ +static inline uint64_t +xchk_nlink_total(struct xfs_inode *ip, const struct xchk_nlink *live) +{ + uint64_t ret = live->parents; + + /* Add one link count for the dot entry of any linked directory. */ + if (ip && S_ISDIR(VFS_I(ip)->i_mode) && VFS_I(ip)->i_nlink) + ret++; + return ret + live->children; +} + +#endif /* __XFS_SCRUB_NLINKS_H__ */ diff --git a/fs/xfs/scrub/nlinks_repair.c b/fs/xfs/scrub/nlinks_repair.c new file mode 100644 index 0000000000..b87618322f --- /dev/null +++ b/fs/xfs/scrub/nlinks_repair.c @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_bmap_util.h" +#include "xfs_iwalk.h" +#include "xfs_ialloc.h" +#include "xfs_sb.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/repair.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/iscan.h" +#include "scrub/nlinks.h" +#include "scrub/trace.h" + +/* + * Live Inode Link Count Repair + * ============================ + * + * Use the live inode link count information that we collected to replace the + * nlink values of the incore inodes. A scrub->repair cycle should have left + * the live data and hooks active, so this is safe so long as we make sure the + * inode is locked. + */ + +/* + * Correct the link count of the given inode. Because we have to grab locks + * and resources in a certain order, it's possible that this will be a no-op. + */ +STATIC int +xrep_nlinks_repair_inode( + struct xchk_nlink_ctrs *xnc) +{ + struct xchk_nlink obs; + struct xfs_scrub *sc = xnc->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip = sc->ip; + uint64_t total_links; + uint64_t actual_nlink; + bool dirty = false; + int error; + + xchk_ilock(sc, XFS_IOLOCK_EXCL); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &sc->tp); + if (error) + return error; + + xchk_ilock(sc, XFS_ILOCK_EXCL); + xfs_trans_ijoin(sc->tp, ip, 0); + + mutex_lock(&xnc->lock); + + if (xchk_iscan_aborted(&xnc->collect_iscan)) { + error = -ECANCELED; + goto out_scanlock; + } + + error = xfarray_load_sparse(xnc->nlinks, ip->i_ino, &obs); + if (error) + goto out_scanlock; + + /* + * We're done accessing the shared scan data, so we can drop the lock. + * We still hold @ip's ILOCK, so its link count cannot change. + */ + mutex_unlock(&xnc->lock); + + total_links = xchk_nlink_total(ip, &obs); + actual_nlink = VFS_I(ip)->i_nlink; + + /* + * Non-directories cannot have directories pointing up to them. + * + * We previously set error to zero, but set it again because one static + * checker author fears that programmers will fail to maintain this + * invariant and built their tool to flag this as a security risk. A + * different tool author made their bot complain about the redundant + * store. This is a never-ending and stupid battle; both tools missed + * *actual bugs* elsewhere; and I no longer care. + */ + if (!S_ISDIR(VFS_I(ip)->i_mode) && obs.children != 0) { + trace_xrep_nlinks_unfixable_inode(mp, ip, &obs); + error = 0; + goto out_trans; + } + + /* + * We did not find any links to this inode. If the inode agrees, we + * have nothing further to do. If not, the inode has a nonzero link + * count and we don't have anywhere to graft the child onto. Dropping + * a live inode's link count to zero can cause unexpected shutdowns in + * inactivation, so leave it alone. + */ + if (total_links == 0) { + if (actual_nlink != 0) + trace_xrep_nlinks_unfixable_inode(mp, ip, &obs); + goto out_trans; + } + + /* Commit the new link count if it changed. */ + if (total_links != actual_nlink) { + if (total_links > XFS_MAXLINK) { + trace_xrep_nlinks_unfixable_inode(mp, ip, &obs); + goto out_trans; + } + + trace_xrep_nlinks_update_inode(mp, ip, &obs); + + set_nlink(VFS_I(ip), total_links); + dirty = true; + } + + if (!dirty) { + error = 0; + goto out_trans; + } + + xfs_trans_log_inode(sc->tp, ip, XFS_ILOG_CORE); + + error = xrep_trans_commit(sc); + xchk_iunlock(sc, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + return error; + +out_scanlock: + mutex_unlock(&xnc->lock); +out_trans: + xchk_trans_cancel(sc); + xchk_iunlock(sc, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + return error; +} + +/* + * Try to visit every inode in the filesystem for repairs. Move on if we can't + * grab an inode, since we're still making forward progress. + */ +static int +xrep_nlinks_iter( + struct xchk_nlink_ctrs *xnc, + struct xfs_inode **ipp) +{ + int error; + + do { + error = xchk_iscan_iter(&xnc->compare_iscan, ipp); + } while (error == -EBUSY); + + return error; +} + +/* Commit the new inode link counters. */ +int +xrep_nlinks( + struct xfs_scrub *sc) +{ + struct xchk_nlink_ctrs *xnc = sc->buf; + int error; + + /* + * We need ftype for an accurate count of the number of child + * subdirectory links. Child subdirectories with a back link (dotdot + * entry) but no forward link are unfixable, so we cannot repair the + * link count of the parent directory based on the back link count + * alone. Filesystems without ftype support are rare (old V4) so we + * just skip out here. + */ + if (!xfs_has_ftype(sc->mp)) + return -EOPNOTSUPP; + + /* + * Use the inobt to walk all allocated inodes to compare and fix the + * link counts. Retry iget every tenth of a second for up to 30 + * seconds -- even if repair misses a few inodes, we still try to fix + * as many of them as we can. + */ + xchk_iscan_start(sc, 30000, 100, &xnc->compare_iscan); + ASSERT(sc->ip == NULL); + + while ((error = xrep_nlinks_iter(xnc, &sc->ip)) == 1) { + /* + * Commit the scrub transaction so that we can create repair + * transactions with the correct reservations. + */ + xchk_trans_cancel(sc); + + error = xrep_nlinks_repair_inode(xnc); + xchk_iscan_mark_visited(&xnc->compare_iscan, sc->ip); + xchk_irele(sc, sc->ip); + sc->ip = NULL; + if (error) + break; + + if (xchk_should_terminate(sc, &error)) + break; + + /* + * Create a new empty transaction so that we can advance the + * iscan cursor without deadlocking if the inobt has a cycle. + * We can only push the inactivation workqueues with an empty + * transaction. + */ + error = xchk_trans_alloc_empty(sc); + if (error) + break; + } + xchk_iscan_iter_finish(&xnc->compare_iscan); + xchk_iscan_teardown(&xnc->compare_iscan); + + return error; +} diff --git a/fs/xfs/scrub/quotacheck.c b/fs/xfs/scrub/quotacheck.c new file mode 100644 index 0000000000..c77eb2de8d --- /dev/null +++ b/fs/xfs/scrub/quotacheck.c @@ -0,0 +1,867 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_icache.h" +#include "xfs_bmap_util.h" +#include "xfs_ialloc.h" +#include "xfs_ag.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/repair.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/iscan.h" +#include "scrub/quota.h" +#include "scrub/quotacheck.h" +#include "scrub/trace.h" + +/* + * Live Quotacheck + * =============== + * + * Quota counters are "summary" metadata, in the sense that they are computed + * as the summation of the block usage counts for every file on the filesystem. + * Therefore, we compute the correct icount, bcount, and rtbcount values by + * creating a shadow quota counter structure and walking every inode. + */ + +/* Track the quota deltas for a dquot in a transaction. */ +struct xqcheck_dqtrx { + xfs_dqtype_t q_type; + xfs_dqid_t q_id; + + int64_t icount_delta; + + int64_t bcount_delta; + int64_t delbcnt_delta; + + int64_t rtbcount_delta; + int64_t delrtb_delta; +}; + +#define XQCHECK_MAX_NR_DQTRXS (XFS_QM_TRANS_DQTYPES * XFS_QM_TRANS_MAXDQS) + +/* + * Track the quota deltas for all dquots attached to a transaction if the + * quota deltas are being applied to an inode that we already scanned. + */ +struct xqcheck_dqacct { + struct rhash_head hash; + uintptr_t tx_id; + struct xqcheck_dqtrx dqtrx[XQCHECK_MAX_NR_DQTRXS]; + unsigned int refcount; +}; + +/* Free a shadow dquot accounting structure. */ +static void +xqcheck_dqacct_free( + void *ptr, + void *arg) +{ + struct xqcheck_dqacct *dqa = ptr; + + kfree(dqa); +} + +/* Set us up to scrub quota counters. */ +int +xchk_setup_quotacheck( + struct xfs_scrub *sc) +{ + if (!XFS_IS_QUOTA_ON(sc->mp)) + return -ENOENT; + + xchk_fsgates_enable(sc, XCHK_FSGATES_QUOTA); + + sc->buf = kzalloc(sizeof(struct xqcheck), XCHK_GFP_FLAGS); + if (!sc->buf) + return -ENOMEM; + + return xchk_setup_fs(sc); +} + +/* + * Part 1: Collecting dquot resource usage counts. For each xfs_dquot attached + * to each inode, we create a shadow dquot, and compute the inode count and add + * the data/rt block usage from what we see. + * + * To avoid false corruption reports in part 2, any failure in this part must + * set the INCOMPLETE flag even when a negative errno is returned. This care + * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED, + * ECANCELED) that are absorbed into a scrub state flag update by + * xchk_*_process_error. Scrub and repair share the same incore data + * structures, so the INCOMPLETE flag is critical to prevent a repair based on + * insufficient information. + * + * Because we are scanning a live filesystem, it's possible that another thread + * will try to update the quota counters for an inode that we've already + * scanned. This will cause our counts to be incorrect. Therefore, we hook + * the live transaction code in two places: (1) when the callers update the + * per-transaction dqtrx structure to log quota counter updates; and (2) when + * transaction commit actually logs those updates to the incore dquot. By + * shadowing transaction updates in this manner, live quotacheck can ensure + * by locking the dquot and the shadow structure that its own copies are not + * out of date. Because the hook code runs in a different process context from + * the scrub code and the scrub state flags are not accessed atomically, + * failures in the hook code must abort the iscan and the scrubber must notice + * the aborted scan and set the incomplete flag. + * + * Note that we use srcu notifier hooks to minimize the overhead when live + * quotacheck is /not/ running. + */ + +/* Update an incore dquot counter information from a live update. */ +static int +xqcheck_update_incore_counts( + struct xqcheck *xqc, + struct xfarray *counts, + xfs_dqid_t id, + int64_t inodes, + int64_t nblks, + int64_t rtblks) +{ + struct xqcheck_dquot xcdq; + int error; + + error = xfarray_load_sparse(counts, id, &xcdq); + if (error) + return error; + + xcdq.flags |= XQCHECK_DQUOT_WRITTEN; + xcdq.icount += inodes; + xcdq.bcount += nblks; + xcdq.rtbcount += rtblks; + + error = xfarray_store(counts, id, &xcdq); + if (error == -EFBIG) { + /* + * EFBIG means we tried to store data at too high a byte offset + * in the sparse array. IOWs, we cannot complete the check and + * must notify userspace that the check was incomplete. + */ + error = -ECANCELED; + } + return error; +} + +/* Decide if this is the shadow dquot accounting structure for a transaction. */ +static int +xqcheck_dqacct_obj_cmpfn( + struct rhashtable_compare_arg *arg, + const void *obj) +{ + const uintptr_t *tx_idp = arg->key; + const struct xqcheck_dqacct *dqa = obj; + + if (dqa->tx_id != *tx_idp) + return 1; + return 0; +} + +static const struct rhashtable_params xqcheck_dqacct_hash_params = { + .min_size = 32, + .key_len = sizeof(uintptr_t), + .key_offset = offsetof(struct xqcheck_dqacct, tx_id), + .head_offset = offsetof(struct xqcheck_dqacct, hash), + .automatic_shrinking = true, + .obj_cmpfn = xqcheck_dqacct_obj_cmpfn, +}; + +/* Find a shadow dqtrx slot for the given dquot. */ +STATIC struct xqcheck_dqtrx * +xqcheck_get_dqtrx( + struct xqcheck_dqacct *dqa, + xfs_dqtype_t q_type, + xfs_dqid_t q_id) +{ + int i; + + for (i = 0; i < XQCHECK_MAX_NR_DQTRXS; i++) { + if (dqa->dqtrx[i].q_type == 0 || + (dqa->dqtrx[i].q_type == q_type && + dqa->dqtrx[i].q_id == q_id)) + return &dqa->dqtrx[i]; + } + + return NULL; +} + +/* + * Create and fill out a quota delta tracking structure to shadow the updates + * going on in the regular quota code. + */ +static int +xqcheck_mod_live_ino_dqtrx( + struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct xfs_mod_ino_dqtrx_params *p = data; + struct xqcheck *xqc; + struct xqcheck_dqacct *dqa; + struct xqcheck_dqtrx *dqtrx; + int error; + + xqc = container_of(nb, struct xqcheck, qhook.mod_hook.nb); + + /* Skip quota reservation fields. */ + switch (action) { + case XFS_TRANS_DQ_BCOUNT: + case XFS_TRANS_DQ_DELBCOUNT: + case XFS_TRANS_DQ_ICOUNT: + case XFS_TRANS_DQ_RTBCOUNT: + case XFS_TRANS_DQ_DELRTBCOUNT: + break; + default: + return NOTIFY_DONE; + } + + /* Ignore dqtrx updates for quota types we don't care about. */ + switch (p->q_type) { + case XFS_DQTYPE_USER: + if (!xqc->ucounts) + return NOTIFY_DONE; + break; + case XFS_DQTYPE_GROUP: + if (!xqc->gcounts) + return NOTIFY_DONE; + break; + case XFS_DQTYPE_PROJ: + if (!xqc->pcounts) + return NOTIFY_DONE; + break; + default: + return NOTIFY_DONE; + } + + /* Skip inodes that haven't been scanned yet. */ + if (!xchk_iscan_want_live_update(&xqc->iscan, p->ino)) + return NOTIFY_DONE; + + /* Make a shadow quota accounting tracker for this transaction. */ + mutex_lock(&xqc->lock); + dqa = rhashtable_lookup_fast(&xqc->shadow_dquot_acct, &p->tx_id, + xqcheck_dqacct_hash_params); + if (!dqa) { + dqa = kzalloc(sizeof(struct xqcheck_dqacct), XCHK_GFP_FLAGS); + if (!dqa) + goto out_abort; + + dqa->tx_id = p->tx_id; + error = rhashtable_insert_fast(&xqc->shadow_dquot_acct, + &dqa->hash, xqcheck_dqacct_hash_params); + if (error) + goto out_abort; + } + + /* Find the shadow dqtrx (or an empty slot) here. */ + dqtrx = xqcheck_get_dqtrx(dqa, p->q_type, p->q_id); + if (!dqtrx) + goto out_abort; + if (dqtrx->q_type == 0) { + dqtrx->q_type = p->q_type; + dqtrx->q_id = p->q_id; + dqa->refcount++; + } + + /* Update counter */ + switch (action) { + case XFS_TRANS_DQ_BCOUNT: + dqtrx->bcount_delta += p->delta; + break; + case XFS_TRANS_DQ_DELBCOUNT: + dqtrx->delbcnt_delta += p->delta; + break; + case XFS_TRANS_DQ_ICOUNT: + dqtrx->icount_delta += p->delta; + break; + case XFS_TRANS_DQ_RTBCOUNT: + dqtrx->rtbcount_delta += p->delta; + break; + case XFS_TRANS_DQ_DELRTBCOUNT: + dqtrx->delrtb_delta += p->delta; + break; + } + + mutex_unlock(&xqc->lock); + return NOTIFY_DONE; + +out_abort: + xchk_iscan_abort(&xqc->iscan); + mutex_unlock(&xqc->lock); + return NOTIFY_DONE; +} + +/* + * Apply the transaction quota deltas to our shadow quota accounting info when + * the regular quota code are doing the same. + */ +static int +xqcheck_apply_live_dqtrx( + struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct xfs_apply_dqtrx_params *p = data; + struct xqcheck *xqc; + struct xqcheck_dqacct *dqa; + struct xqcheck_dqtrx *dqtrx; + struct xfarray *counts; + int error; + + xqc = container_of(nb, struct xqcheck, qhook.apply_hook.nb); + + /* Map the dquot type to an incore counter object. */ + switch (p->q_type) { + case XFS_DQTYPE_USER: + counts = xqc->ucounts; + break; + case XFS_DQTYPE_GROUP: + counts = xqc->gcounts; + break; + case XFS_DQTYPE_PROJ: + counts = xqc->pcounts; + break; + default: + return NOTIFY_DONE; + } + + if (xchk_iscan_aborted(&xqc->iscan) || counts == NULL) + return NOTIFY_DONE; + + /* + * Find the shadow dqtrx for this transaction and dquot, if any deltas + * need to be applied here. If not, we're finished early. + */ + mutex_lock(&xqc->lock); + dqa = rhashtable_lookup_fast(&xqc->shadow_dquot_acct, &p->tx_id, + xqcheck_dqacct_hash_params); + if (!dqa) + goto out_unlock; + dqtrx = xqcheck_get_dqtrx(dqa, p->q_type, p->q_id); + if (!dqtrx || dqtrx->q_type == 0) + goto out_unlock; + + /* Update our shadow dquot if we're committing. */ + if (action == XFS_APPLY_DQTRX_COMMIT) { + error = xqcheck_update_incore_counts(xqc, counts, p->q_id, + dqtrx->icount_delta, + dqtrx->bcount_delta + dqtrx->delbcnt_delta, + dqtrx->rtbcount_delta + dqtrx->delrtb_delta); + if (error) + goto out_abort; + } + + /* Free the shadow accounting structure if that was the last user. */ + dqa->refcount--; + if (dqa->refcount == 0) { + error = rhashtable_remove_fast(&xqc->shadow_dquot_acct, + &dqa->hash, xqcheck_dqacct_hash_params); + if (error) + goto out_abort; + xqcheck_dqacct_free(dqa, NULL); + } + + mutex_unlock(&xqc->lock); + return NOTIFY_DONE; + +out_abort: + xchk_iscan_abort(&xqc->iscan); +out_unlock: + mutex_unlock(&xqc->lock); + return NOTIFY_DONE; +} + +/* Record this inode's quota usage in our shadow quota counter data. */ +STATIC int +xqcheck_collect_inode( + struct xqcheck *xqc, + struct xfs_inode *ip) +{ + struct xfs_trans *tp = xqc->sc->tp; + xfs_filblks_t nblks, rtblks; + uint ilock_flags = 0; + xfs_dqid_t id; + bool isreg = S_ISREG(VFS_I(ip)->i_mode); + int error = 0; + + if (xfs_is_quota_inode(&tp->t_mountp->m_sb, ip->i_ino)) { + /* + * Quota files are never counted towards quota, so we do not + * need to take the lock. + */ + xchk_iscan_mark_visited(&xqc->iscan, ip); + return 0; + } + + /* Figure out the data / rt device block counts. */ + xfs_ilock(ip, XFS_IOLOCK_SHARED); + if (isreg) + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); + if (XFS_IS_REALTIME_INODE(ip)) { + /* + * Read in the data fork for rt files so that _count_blocks + * can count the number of blocks allocated from the rt volume. + * Inodes do not track that separately. + */ + ilock_flags = xfs_ilock_data_map_shared(ip); + error = xfs_iread_extents(tp, ip, XFS_DATA_FORK); + if (error) + goto out_abort; + } else { + ilock_flags = XFS_ILOCK_SHARED; + xfs_ilock(ip, XFS_ILOCK_SHARED); + } + xfs_inode_count_blocks(tp, ip, &nblks, &rtblks); + + if (xchk_iscan_aborted(&xqc->iscan)) { + error = -ECANCELED; + goto out_incomplete; + } + + /* Update the shadow dquot counters. */ + mutex_lock(&xqc->lock); + if (xqc->ucounts) { + id = xfs_qm_id_for_quotatype(ip, XFS_DQTYPE_USER); + error = xqcheck_update_incore_counts(xqc, xqc->ucounts, id, 1, + nblks, rtblks); + if (error) + goto out_mutex; + } + + if (xqc->gcounts) { + id = xfs_qm_id_for_quotatype(ip, XFS_DQTYPE_GROUP); + error = xqcheck_update_incore_counts(xqc, xqc->gcounts, id, 1, + nblks, rtblks); + if (error) + goto out_mutex; + } + + if (xqc->pcounts) { + id = xfs_qm_id_for_quotatype(ip, XFS_DQTYPE_PROJ); + error = xqcheck_update_incore_counts(xqc, xqc->pcounts, id, 1, + nblks, rtblks); + if (error) + goto out_mutex; + } + mutex_unlock(&xqc->lock); + + xchk_iscan_mark_visited(&xqc->iscan, ip); + goto out_ilock; + +out_mutex: + mutex_unlock(&xqc->lock); +out_abort: + xchk_iscan_abort(&xqc->iscan); +out_incomplete: + xchk_set_incomplete(xqc->sc); +out_ilock: + xfs_iunlock(ip, ilock_flags); + if (isreg) + xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return error; +} + +/* Walk all the allocated inodes and run a quota scan on them. */ +STATIC int +xqcheck_collect_counts( + struct xqcheck *xqc) +{ + struct xfs_scrub *sc = xqc->sc; + struct xfs_inode *ip; + int error; + + /* + * Set up for a potentially lengthy filesystem scan by reducing our + * transaction resource usage for the duration. Specifically: + * + * Cancel the transaction to release the log grant space while we scan + * the filesystem. + * + * Create a new empty transaction to eliminate the possibility of the + * inode scan deadlocking on cyclical metadata. + * + * We pass the empty transaction to the file scanning function to avoid + * repeatedly cycling empty transactions. This can be done without + * risk of deadlock between sb_internal and the IOLOCK (we take the + * IOLOCK to quiesce the file before scanning) because empty + * transactions do not take sb_internal. + */ + xchk_trans_cancel(sc); + error = xchk_trans_alloc_empty(sc); + if (error) + return error; + + while ((error = xchk_iscan_iter(&xqc->iscan, &ip)) == 1) { + error = xqcheck_collect_inode(xqc, ip); + xchk_irele(sc, ip); + if (error) + break; + + if (xchk_should_terminate(sc, &error)) + break; + } + xchk_iscan_iter_finish(&xqc->iscan); + if (error) { + xchk_set_incomplete(sc); + /* + * If we couldn't grab an inode that was busy with a state + * change, change the error code so that we exit to userspace + * as quickly as possible. + */ + if (error == -EBUSY) + return -ECANCELED; + return error; + } + + /* + * Switch out for a real transaction in preparation for building a new + * tree. + */ + xchk_trans_cancel(sc); + return xchk_setup_fs(sc); +} + +/* + * Part 2: Comparing dquot resource counters. Walk each xfs_dquot, comparing + * the resource usage counters against our shadow dquots; and then walk each + * shadow dquot (that wasn't covered in the first part), comparing it against + * the xfs_dquot. + */ + +/* + * Check the dquot data against what we observed. Caller must hold the dquot + * lock. + */ +STATIC int +xqcheck_compare_dquot( + struct xqcheck *xqc, + xfs_dqtype_t dqtype, + struct xfs_dquot *dq) +{ + struct xqcheck_dquot xcdq; + struct xfarray *counts = xqcheck_counters_for(xqc, dqtype); + int error; + + if (xchk_iscan_aborted(&xqc->iscan)) { + xchk_set_incomplete(xqc->sc); + return -ECANCELED; + } + + mutex_lock(&xqc->lock); + error = xfarray_load_sparse(counts, dq->q_id, &xcdq); + if (error) + goto out_unlock; + + if (xcdq.icount != dq->q_ino.count) + xchk_qcheck_set_corrupt(xqc->sc, dqtype, dq->q_id); + + if (xcdq.bcount != dq->q_blk.count) + xchk_qcheck_set_corrupt(xqc->sc, dqtype, dq->q_id); + + if (xcdq.rtbcount != dq->q_rtb.count) + xchk_qcheck_set_corrupt(xqc->sc, dqtype, dq->q_id); + + xcdq.flags |= (XQCHECK_DQUOT_COMPARE_SCANNED | XQCHECK_DQUOT_WRITTEN); + error = xfarray_store(counts, dq->q_id, &xcdq); + if (error == -EFBIG) { + /* + * EFBIG means we tried to store data at too high a byte offset + * in the sparse array. IOWs, we cannot complete the check and + * must notify userspace that the check was incomplete. This + * should never happen outside of the collection phase. + */ + xchk_set_incomplete(xqc->sc); + error = -ECANCELED; + } + mutex_unlock(&xqc->lock); + if (error) + return error; + + if (xqc->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return -ECANCELED; + + return 0; + +out_unlock: + mutex_unlock(&xqc->lock); + return error; +} + +/* + * Walk all the observed dquots, and make sure there's a matching incore + * dquot and that its counts match ours. + */ +STATIC int +xqcheck_walk_observations( + struct xqcheck *xqc, + xfs_dqtype_t dqtype) +{ + struct xqcheck_dquot xcdq; + struct xfs_dquot *dq; + struct xfarray *counts = xqcheck_counters_for(xqc, dqtype); + xfarray_idx_t cur = XFARRAY_CURSOR_INIT; + int error; + + mutex_lock(&xqc->lock); + while ((error = xfarray_iter(counts, &cur, &xcdq)) == 1) { + xfs_dqid_t id = cur - 1; + + if (xcdq.flags & XQCHECK_DQUOT_COMPARE_SCANNED) + continue; + + mutex_unlock(&xqc->lock); + + error = xfs_qm_dqget(xqc->sc->mp, id, dqtype, false, &dq); + if (error == -ENOENT) { + xchk_qcheck_set_corrupt(xqc->sc, dqtype, id); + return 0; + } + if (error) + return error; + + error = xqcheck_compare_dquot(xqc, dqtype, dq); + xfs_qm_dqput(dq); + if (error) + return error; + + if (xchk_should_terminate(xqc->sc, &error)) + return error; + + mutex_lock(&xqc->lock); + } + mutex_unlock(&xqc->lock); + + return error; +} + +/* Compare the quota counters we observed against the live dquots. */ +STATIC int +xqcheck_compare_dqtype( + struct xqcheck *xqc, + xfs_dqtype_t dqtype) +{ + struct xchk_dqiter cursor = { }; + struct xfs_scrub *sc = xqc->sc; + struct xfs_dquot *dq; + int error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* If the quota CHKD flag is cleared, we need to repair this quota. */ + if (!(xfs_quota_chkd_flag(dqtype) & sc->mp->m_qflags)) { + xchk_qcheck_set_corrupt(xqc->sc, dqtype, 0); + return 0; + } + + /* Compare what we observed against the actual dquots. */ + xchk_dqiter_init(&cursor, sc, dqtype); + while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { + error = xqcheck_compare_dquot(xqc, dqtype, dq); + xfs_qm_dqput(dq); + if (error) + break; + } + if (error) + return error; + + /* Walk all the observed dquots and compare to the incore ones. */ + return xqcheck_walk_observations(xqc, dqtype); +} + +/* Tear down everything associated with a quotacheck. */ +static void +xqcheck_teardown_scan( + void *priv) +{ + struct xqcheck *xqc = priv; + struct xfs_quotainfo *qi = xqc->sc->mp->m_quotainfo; + + /* Discourage any hook functions that might be running. */ + xchk_iscan_abort(&xqc->iscan); + + /* + * As noted above, the apply hook is responsible for cleaning up the + * shadow dquot accounting data when a transaction completes. The mod + * hook must be removed before the apply hook so that we don't + * mistakenly leave an active shadow account for the mod hook to get + * its hands on. No hooks should be running after these functions + * return. + */ + xfs_dqtrx_hook_del(qi, &xqc->qhook); + + if (xqc->shadow_dquot_acct.key_len) { + rhashtable_free_and_destroy(&xqc->shadow_dquot_acct, + xqcheck_dqacct_free, NULL); + xqc->shadow_dquot_acct.key_len = 0; + } + + if (xqc->pcounts) { + xfarray_destroy(xqc->pcounts); + xqc->pcounts = NULL; + } + + if (xqc->gcounts) { + xfarray_destroy(xqc->gcounts); + xqc->gcounts = NULL; + } + + if (xqc->ucounts) { + xfarray_destroy(xqc->ucounts); + xqc->ucounts = NULL; + } + + xchk_iscan_teardown(&xqc->iscan); + mutex_destroy(&xqc->lock); + xqc->sc = NULL; +} + +/* + * Scan all inodes in the entire filesystem to generate quota counter data. + * If the scan is successful, the quota data will be left alive for a repair. + * If any error occurs, we'll tear everything down. + */ +STATIC int +xqcheck_setup_scan( + struct xfs_scrub *sc, + struct xqcheck *xqc) +{ + char *descr; + struct xfs_quotainfo *qi = sc->mp->m_quotainfo; + unsigned long long max_dquots = XFS_DQ_ID_MAX + 1ULL; + int error; + + ASSERT(xqc->sc == NULL); + xqc->sc = sc; + + mutex_init(&xqc->lock); + + /* Retry iget every tenth of a second for up to 30 seconds. */ + xchk_iscan_start(sc, 30000, 100, &xqc->iscan); + + error = -ENOMEM; + if (xfs_this_quota_on(sc->mp, XFS_DQTYPE_USER)) { + descr = xchk_xfile_descr(sc, "user dquot records"); + error = xfarray_create(descr, max_dquots, + sizeof(struct xqcheck_dquot), &xqc->ucounts); + kfree(descr); + if (error) + goto out_teardown; + } + + if (xfs_this_quota_on(sc->mp, XFS_DQTYPE_GROUP)) { + descr = xchk_xfile_descr(sc, "group dquot records"); + error = xfarray_create(descr, max_dquots, + sizeof(struct xqcheck_dquot), &xqc->gcounts); + kfree(descr); + if (error) + goto out_teardown; + } + + if (xfs_this_quota_on(sc->mp, XFS_DQTYPE_PROJ)) { + descr = xchk_xfile_descr(sc, "project dquot records"); + error = xfarray_create(descr, max_dquots, + sizeof(struct xqcheck_dquot), &xqc->pcounts); + kfree(descr); + if (error) + goto out_teardown; + } + + /* + * Set up hash table to map transactions to our internal shadow dqtrx + * structures. + */ + error = rhashtable_init(&xqc->shadow_dquot_acct, + &xqcheck_dqacct_hash_params); + if (error) + goto out_teardown; + + /* + * Hook into the quota code. The hook only triggers for inodes that + * were already scanned, and the scanner thread takes each inode's + * ILOCK, which means that any in-progress inode updates will finish + * before we can scan the inode. + * + * The apply hook (which removes the shadow dquot accounting struct) + * must be installed before the mod hook so that we never fail to catch + * the end of a quota update sequence and leave stale shadow data. + */ + ASSERT(sc->flags & XCHK_FSGATES_QUOTA); + xfs_dqtrx_hook_setup(&xqc->qhook, xqcheck_mod_live_ino_dqtrx, + xqcheck_apply_live_dqtrx); + + error = xfs_dqtrx_hook_add(qi, &xqc->qhook); + if (error) + goto out_teardown; + + /* Use deferred cleanup to pass the quota count data to repair. */ + sc->buf_cleanup = xqcheck_teardown_scan; + return 0; + +out_teardown: + xqcheck_teardown_scan(xqc); + return error; +} + +/* Scrub all counters for a given quota type. */ +int +xchk_quotacheck( + struct xfs_scrub *sc) +{ + struct xqcheck *xqc = sc->buf; + int error = 0; + + /* Check quota counters on the live filesystem. */ + error = xqcheck_setup_scan(sc, xqc); + if (error) + return error; + + /* Walk all inodes, picking up quota information. */ + error = xqcheck_collect_counts(xqc); + if (!xchk_xref_process_error(sc, 0, 0, &error)) + return error; + + /* Fail fast if we're not playing with a full dataset. */ + if (xchk_iscan_aborted(&xqc->iscan)) + xchk_set_incomplete(sc); + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) + return 0; + + /* Compare quota counters. */ + if (xqc->ucounts) { + error = xqcheck_compare_dqtype(xqc, XFS_DQTYPE_USER); + if (!xchk_xref_process_error(sc, 0, 0, &error)) + return error; + } + if (xqc->gcounts) { + error = xqcheck_compare_dqtype(xqc, XFS_DQTYPE_GROUP); + if (!xchk_xref_process_error(sc, 0, 0, &error)) + return error; + } + if (xqc->pcounts) { + error = xqcheck_compare_dqtype(xqc, XFS_DQTYPE_PROJ); + if (!xchk_xref_process_error(sc, 0, 0, &error)) + return error; + } + + /* Check one last time for an incomplete dataset. */ + if (xchk_iscan_aborted(&xqc->iscan)) + xchk_set_incomplete(sc); + + return 0; +} diff --git a/fs/xfs/scrub/quotacheck.h b/fs/xfs/scrub/quotacheck.h new file mode 100644 index 0000000000..4ea5f249c9 --- /dev/null +++ b/fs/xfs/scrub/quotacheck.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_QUOTACHECK_H__ +#define __XFS_SCRUB_QUOTACHECK_H__ + +/* Quota counters for live quotacheck. */ +struct xqcheck_dquot { + /* block usage count */ + int64_t bcount; + + /* inode usage count */ + int64_t icount; + + /* realtime block usage count */ + int64_t rtbcount; + + /* Record state */ + unsigned int flags; +}; + +/* + * This incore dquot record has been written at least once. We never want to + * store an xqcheck_dquot that looks uninitialized. + */ +#define XQCHECK_DQUOT_WRITTEN (1U << 0) + +/* Already checked this dquot. */ +#define XQCHECK_DQUOT_COMPARE_SCANNED (1U << 1) + +/* Already repaired this dquot. */ +#define XQCHECK_DQUOT_REPAIR_SCANNED (1U << 2) + +/* Live quotacheck control structure. */ +struct xqcheck { + struct xfs_scrub *sc; + + /* Shadow dquot counter data. */ + struct xfarray *ucounts; + struct xfarray *gcounts; + struct xfarray *pcounts; + + /* Lock protecting quotacheck count observations */ + struct mutex lock; + + struct xchk_iscan iscan; + + /* Hooks into the quota code. */ + struct xfs_dqtrx_hook qhook; + + /* Shadow quota delta tracking structure. */ + struct rhashtable shadow_dquot_acct; +}; + +/* Return the incore counter array for a given quota type. */ +static inline struct xfarray * +xqcheck_counters_for( + struct xqcheck *xqc, + xfs_dqtype_t dqtype) +{ + switch (dqtype) { + case XFS_DQTYPE_USER: + return xqc->ucounts; + case XFS_DQTYPE_GROUP: + return xqc->gcounts; + case XFS_DQTYPE_PROJ: + return xqc->pcounts; + } + + ASSERT(0); + return NULL; +} + +#endif /* __XFS_SCRUB_QUOTACHECK_H__ */ diff --git a/fs/xfs/scrub/quotacheck_repair.c b/fs/xfs/scrub/quotacheck_repair.c new file mode 100644 index 0000000000..dd8554c755 --- /dev/null +++ b/fs/xfs/scrub/quotacheck_repair.c @@ -0,0 +1,261 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_icache.h" +#include "xfs_bmap_util.h" +#include "xfs_iwalk.h" +#include "xfs_ialloc.h" +#include "xfs_sb.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/repair.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/iscan.h" +#include "scrub/quota.h" +#include "scrub/quotacheck.h" +#include "scrub/trace.h" + +/* + * Live Quotacheck Repair + * ====================== + * + * Use the live quota counter information that we collected to replace the + * counter values in the incore dquots. A scrub->repair cycle should have left + * the live data and hooks active, so this is safe so long as we make sure the + * dquot is locked. + */ + +/* Commit new counters to a dquot. */ +static int +xqcheck_commit_dquot( + struct xqcheck *xqc, + xfs_dqtype_t dqtype, + struct xfs_dquot *dq) +{ + struct xqcheck_dquot xcdq; + struct xfarray *counts = xqcheck_counters_for(xqc, dqtype); + int64_t delta; + bool dirty = false; + int error = 0; + + /* Unlock the dquot just long enough to allocate a transaction. */ + xfs_dqunlock(dq); + error = xchk_trans_alloc(xqc->sc, 0); + xfs_dqlock(dq); + if (error) + return error; + + xfs_trans_dqjoin(xqc->sc->tp, dq); + + if (xchk_iscan_aborted(&xqc->iscan)) { + error = -ECANCELED; + goto out_cancel; + } + + mutex_lock(&xqc->lock); + error = xfarray_load_sparse(counts, dq->q_id, &xcdq); + if (error) + goto out_unlock; + + /* Adjust counters as needed. */ + delta = (int64_t)xcdq.icount - dq->q_ino.count; + if (delta) { + dq->q_ino.reserved += delta; + dq->q_ino.count += delta; + dirty = true; + } + + delta = (int64_t)xcdq.bcount - dq->q_blk.count; + if (delta) { + dq->q_blk.reserved += delta; + dq->q_blk.count += delta; + dirty = true; + } + + delta = (int64_t)xcdq.rtbcount - dq->q_rtb.count; + if (delta) { + dq->q_rtb.reserved += delta; + dq->q_rtb.count += delta; + dirty = true; + } + + xcdq.flags |= (XQCHECK_DQUOT_REPAIR_SCANNED | XQCHECK_DQUOT_WRITTEN); + error = xfarray_store(counts, dq->q_id, &xcdq); + if (error == -EFBIG) { + /* + * EFBIG means we tried to store data at too high a byte offset + * in the sparse array. IOWs, we cannot complete the repair + * and must cancel the whole operation. This should never + * happen, but we need to catch it anyway. + */ + error = -ECANCELED; + } + mutex_unlock(&xqc->lock); + if (error || !dirty) + goto out_cancel; + + trace_xrep_quotacheck_dquot(xqc->sc->mp, dq->q_type, dq->q_id); + + /* Commit the dirty dquot to disk. */ + dq->q_flags |= XFS_DQFLAG_DIRTY; + if (dq->q_id) + xfs_qm_adjust_dqtimers(dq); + xfs_trans_log_dquot(xqc->sc->tp, dq); + + /* + * Transaction commit unlocks the dquot, so we must re-lock it so that + * the caller can put the reference (which apparently requires a locked + * dquot). + */ + error = xrep_trans_commit(xqc->sc); + xfs_dqlock(dq); + return error; + +out_unlock: + mutex_unlock(&xqc->lock); +out_cancel: + xchk_trans_cancel(xqc->sc); + + /* Re-lock the dquot so the caller can put the reference. */ + xfs_dqlock(dq); + return error; +} + +/* Commit new quota counters for a particular quota type. */ +STATIC int +xqcheck_commit_dqtype( + struct xqcheck *xqc, + unsigned int dqtype) +{ + struct xchk_dqiter cursor = { }; + struct xqcheck_dquot xcdq; + struct xfs_scrub *sc = xqc->sc; + struct xfs_mount *mp = sc->mp; + struct xfarray *counts = xqcheck_counters_for(xqc, dqtype); + struct xfs_dquot *dq; + xfarray_idx_t cur = XFARRAY_CURSOR_INIT; + int error; + + /* + * Update the counters of every dquot that the quota file knows about. + */ + xchk_dqiter_init(&cursor, sc, dqtype); + while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { + error = xqcheck_commit_dquot(xqc, dqtype, dq); + xfs_qm_dqput(dq); + if (error) + break; + } + if (error) + return error; + + /* + * Make a second pass to deal with the dquots that we know about but + * the quota file previously did not know about. + */ + mutex_lock(&xqc->lock); + while ((error = xfarray_iter(counts, &cur, &xcdq)) == 1) { + xfs_dqid_t id = cur - 1; + + if (xcdq.flags & XQCHECK_DQUOT_REPAIR_SCANNED) + continue; + + mutex_unlock(&xqc->lock); + + /* + * Grab the dquot, allowing for dquot block allocation in a + * separate transaction. We committed the scrub transaction + * in a previous step, so we will not be creating nested + * transactions here. + */ + error = xfs_qm_dqget(mp, id, dqtype, true, &dq); + if (error) + return error; + + error = xqcheck_commit_dquot(xqc, dqtype, dq); + xfs_qm_dqput(dq); + if (error) + return error; + + mutex_lock(&xqc->lock); + } + mutex_unlock(&xqc->lock); + + return error; +} + +/* Figure out quota CHKD flags for the running quota types. */ +static inline unsigned int +xqcheck_chkd_flags( + struct xfs_mount *mp) +{ + unsigned int ret = 0; + + if (XFS_IS_UQUOTA_ON(mp)) + ret |= XFS_UQUOTA_CHKD; + if (XFS_IS_GQUOTA_ON(mp)) + ret |= XFS_GQUOTA_CHKD; + if (XFS_IS_PQUOTA_ON(mp)) + ret |= XFS_PQUOTA_CHKD; + return ret; +} + +/* Commit the new dquot counters. */ +int +xrep_quotacheck( + struct xfs_scrub *sc) +{ + struct xqcheck *xqc = sc->buf; + unsigned int qflags = xqcheck_chkd_flags(sc->mp); + int error; + + /* + * Clear the CHKD flag for the running quota types and commit the scrub + * transaction so that we can allocate new quota block mappings if we + * have to. If we crash after this point, the sb still has the CHKD + * flags cleared, so mount quotacheck will fix all of this up. + */ + xrep_update_qflags(sc, qflags, 0); + error = xrep_trans_commit(sc); + if (error) + return error; + + /* Commit the new counters to the dquots. */ + if (xqc->ucounts) { + error = xqcheck_commit_dqtype(xqc, XFS_DQTYPE_USER); + if (error) + return error; + } + if (xqc->gcounts) { + error = xqcheck_commit_dqtype(xqc, XFS_DQTYPE_GROUP); + if (error) + return error; + } + if (xqc->pcounts) { + error = xqcheck_commit_dqtype(xqc, XFS_DQTYPE_PROJ); + if (error) + return error; + } + + /* Set the CHKD flags now that we've fixed quota counts. */ + error = xchk_trans_alloc(sc, 0); + if (error) + return error; + + xrep_update_qflags(sc, 0, qflags); + return xrep_trans_commit(sc); +} diff --git a/fs/xfs/scrub/rcbag.c b/fs/xfs/scrub/rcbag.c new file mode 100644 index 0000000000..e1e52bc207 --- /dev/null +++ b/fs/xfs/scrub/rcbag.c @@ -0,0 +1,307 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_buf_mem.h" +#include "xfs_btree_mem.h" +#include "xfs_error.h" +#include "scrub/scrub.h" +#include "scrub/rcbag_btree.h" +#include "scrub/rcbag.h" +#include "scrub/trace.h" + +struct rcbag { + struct xfs_mount *mp; + struct xfbtree xfbtree; + uint64_t nr_items; +}; + +int +rcbag_init( + struct xfs_mount *mp, + struct xfs_buftarg *btp, + struct rcbag **bagp) +{ + struct rcbag *bag; + int error; + + bag = kzalloc(sizeof(struct rcbag), XCHK_GFP_FLAGS); + if (!bag) + return -ENOMEM; + + bag->nr_items = 0; + bag->mp = mp; + + error = rcbagbt_mem_init(mp, &bag->xfbtree, btp); + if (error) + goto out_bag; + + *bagp = bag; + return 0; + +out_bag: + kfree(bag); + return error; +} + +void +rcbag_free( + struct rcbag **bagp) +{ + struct rcbag *bag = *bagp; + + xfbtree_destroy(&bag->xfbtree); + kfree(bag); + *bagp = NULL; +} + +/* Track an rmap in the refcount bag. */ +int +rcbag_add( + struct rcbag *bag, + struct xfs_trans *tp, + const struct xfs_rmap_irec *rmap) +{ + struct rcbag_rec bagrec; + struct xfs_mount *mp = bag->mp; + struct xfs_btree_cur *cur; + int has; + int error; + + cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree); + error = rcbagbt_lookup_eq(cur, rmap, &has); + if (error) + goto out_cur; + + if (has) { + error = rcbagbt_get_rec(cur, &bagrec, &has); + if (error) + goto out_cur; + if (!has) { + error = -EFSCORRUPTED; + goto out_cur; + } + + bagrec.rbg_refcount++; + error = rcbagbt_update(cur, &bagrec); + if (error) + goto out_cur; + } else { + bagrec.rbg_startblock = rmap->rm_startblock; + bagrec.rbg_blockcount = rmap->rm_blockcount; + bagrec.rbg_refcount = 1; + + error = rcbagbt_insert(cur, &bagrec, &has); + if (error) + goto out_cur; + if (!has) { + error = -EFSCORRUPTED; + goto out_cur; + } + } + + xfs_btree_del_cursor(cur, 0); + + error = xfbtree_trans_commit(&bag->xfbtree, tp); + if (error) + return error; + + bag->nr_items++; + return 0; + +out_cur: + xfs_btree_del_cursor(cur, error); + xfbtree_trans_cancel(&bag->xfbtree, tp); + return error; +} + +/* Return the number of records in the bag. */ +uint64_t +rcbag_count( + const struct rcbag *rcbag) +{ + return rcbag->nr_items; +} + +static inline uint32_t rcbag_rec_next_bno(const struct rcbag_rec *r) +{ + return r->rbg_startblock + r->rbg_blockcount; +} + +/* + * Find the next block where the refcount changes, given the next rmap we + * looked at and the ones we're already tracking. + */ +int +rcbag_next_edge( + struct rcbag *bag, + struct xfs_trans *tp, + const struct xfs_rmap_irec *next_rmap, + bool next_valid, + uint32_t *next_bnop) +{ + struct rcbag_rec bagrec; + struct xfs_mount *mp = bag->mp; + struct xfs_btree_cur *cur; + uint32_t next_bno = NULLAGBLOCK; + int has; + int error; + + if (next_valid) + next_bno = next_rmap->rm_startblock; + + cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree); + error = xfs_btree_goto_left_edge(cur); + if (error) + goto out_cur; + + while (true) { + error = xfs_btree_increment(cur, 0, &has); + if (error) + goto out_cur; + if (!has) + break; + + error = rcbagbt_get_rec(cur, &bagrec, &has); + if (error) + goto out_cur; + if (!has) { + error = -EFSCORRUPTED; + goto out_cur; + } + + next_bno = min(next_bno, rcbag_rec_next_bno(&bagrec)); + } + + /* + * We should have found /something/ because either next_rrm is the next + * interesting rmap to look at after emitting this refcount extent, or + * there are other rmaps in rmap_bag contributing to the current + * sharing count. But if something is seriously wrong, bail out. + */ + if (next_bno == NULLAGBLOCK) { + error = -EFSCORRUPTED; + goto out_cur; + } + + xfs_btree_del_cursor(cur, 0); + + *next_bnop = next_bno; + return 0; + +out_cur: + xfs_btree_del_cursor(cur, error); + return error; +} + +/* Pop all refcount bag records that end at next_bno */ +int +rcbag_remove_ending_at( + struct rcbag *bag, + struct xfs_trans *tp, + uint32_t next_bno) +{ + struct rcbag_rec bagrec; + struct xfs_mount *mp = bag->mp; + struct xfs_btree_cur *cur; + int has; + int error; + + /* go to the right edge of the tree */ + cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree); + memset(&cur->bc_rec, 0xFF, sizeof(cur->bc_rec)); + error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, &has); + if (error) + goto out_cur; + + while (true) { + error = xfs_btree_decrement(cur, 0, &has); + if (error) + goto out_cur; + if (!has) + break; + + error = rcbagbt_get_rec(cur, &bagrec, &has); + if (error) + goto out_cur; + if (!has) { + error = -EFSCORRUPTED; + goto out_cur; + } + + if (rcbag_rec_next_bno(&bagrec) != next_bno) + continue; + + error = xfs_btree_delete(cur, &has); + if (error) + goto out_cur; + if (!has) { + error = -EFSCORRUPTED; + goto out_cur; + } + + bag->nr_items -= bagrec.rbg_refcount; + } + + xfs_btree_del_cursor(cur, 0); + return xfbtree_trans_commit(&bag->xfbtree, tp); +out_cur: + xfs_btree_del_cursor(cur, error); + xfbtree_trans_cancel(&bag->xfbtree, tp); + return error; +} + +/* Dump the rcbag. */ +void +rcbag_dump( + struct rcbag *bag, + struct xfs_trans *tp) +{ + struct rcbag_rec bagrec; + struct xfs_mount *mp = bag->mp; + struct xfs_btree_cur *cur; + unsigned long long nr = 0; + int has; + int error; + + cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree); + error = xfs_btree_goto_left_edge(cur); + if (error) + goto out_cur; + + while (true) { + error = xfs_btree_increment(cur, 0, &has); + if (error) + goto out_cur; + if (!has) + break; + + error = rcbagbt_get_rec(cur, &bagrec, &has); + if (error) + goto out_cur; + if (!has) { + error = -EFSCORRUPTED; + goto out_cur; + } + + xfs_err(bag->mp, "[%llu]: bno 0x%x fsbcount 0x%x refcount 0x%llx\n", + nr++, + (unsigned int)bagrec.rbg_startblock, + (unsigned int)bagrec.rbg_blockcount, + (unsigned long long)bagrec.rbg_refcount); + } + +out_cur: + xfs_btree_del_cursor(cur, error); +} diff --git a/fs/xfs/scrub/rcbag.h b/fs/xfs/scrub/rcbag.h new file mode 100644 index 0000000000..e29ef788ba --- /dev/null +++ b/fs/xfs/scrub/rcbag.h @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_RCBAG_H__ +#define __XFS_SCRUB_RCBAG_H__ + +struct xfs_mount; +struct rcbag; +struct xfs_buftarg; + +int rcbag_init(struct xfs_mount *mp, struct xfs_buftarg *btp, + struct rcbag **bagp); +void rcbag_free(struct rcbag **bagp); +int rcbag_add(struct rcbag *bag, struct xfs_trans *tp, + const struct xfs_rmap_irec *rmap); +uint64_t rcbag_count(const struct rcbag *bag); + +int rcbag_next_edge(struct rcbag *bag, struct xfs_trans *tp, + const struct xfs_rmap_irec *next_rmap, bool next_valid, + uint32_t *next_bnop); +int rcbag_remove_ending_at(struct rcbag *bag, struct xfs_trans *tp, + uint32_t next_bno); + +void rcbag_dump(struct rcbag *bag, struct xfs_trans *tp); + +#endif /* __XFS_SCRUB_RCBAG_H__ */ diff --git a/fs/xfs/scrub/rcbag_btree.c b/fs/xfs/scrub/rcbag_btree.c new file mode 100644 index 0000000000..709356dc62 --- /dev/null +++ b/fs/xfs/scrub/rcbag_btree.c @@ -0,0 +1,370 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_buf_mem.h" +#include "xfs_btree_mem.h" +#include "xfs_error.h" +#include "scrub/rcbag_btree.h" +#include "scrub/trace.h" + +static struct kmem_cache *rcbagbt_cur_cache; + +STATIC void +rcbagbt_init_key_from_rec( + union xfs_btree_key *key, + const union xfs_btree_rec *rec) +{ + struct rcbag_key *bag_key = (struct rcbag_key *)key; + const struct rcbag_rec *bag_rec = (const struct rcbag_rec *)rec; + + BUILD_BUG_ON(sizeof(struct rcbag_key) > sizeof(union xfs_btree_key)); + BUILD_BUG_ON(sizeof(struct rcbag_rec) > sizeof(union xfs_btree_rec)); + + bag_key->rbg_startblock = bag_rec->rbg_startblock; + bag_key->rbg_blockcount = bag_rec->rbg_blockcount; +} + +STATIC void +rcbagbt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + struct rcbag_rec *bag_rec = (struct rcbag_rec *)rec; + struct rcbag_rec *bag_irec = (struct rcbag_rec *)&cur->bc_rec; + + bag_rec->rbg_startblock = bag_irec->rbg_startblock; + bag_rec->rbg_blockcount = bag_irec->rbg_blockcount; + bag_rec->rbg_refcount = bag_irec->rbg_refcount; +} + +STATIC int64_t +rcbagbt_key_diff( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key) +{ + struct rcbag_rec *rec = (struct rcbag_rec *)&cur->bc_rec; + const struct rcbag_key *kp = (const struct rcbag_key *)key; + + if (kp->rbg_startblock > rec->rbg_startblock) + return 1; + if (kp->rbg_startblock < rec->rbg_startblock) + return -1; + + if (kp->rbg_blockcount > rec->rbg_blockcount) + return 1; + if (kp->rbg_blockcount < rec->rbg_blockcount) + return -1; + + return 0; +} + +STATIC int64_t +rcbagbt_diff_two_keys( + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2, + const union xfs_btree_key *mask) +{ + const struct rcbag_key *kp1 = (const struct rcbag_key *)k1; + const struct rcbag_key *kp2 = (const struct rcbag_key *)k2; + + ASSERT(mask == NULL); + + if (kp1->rbg_startblock > kp2->rbg_startblock) + return 1; + if (kp1->rbg_startblock < kp2->rbg_startblock) + return -1; + + if (kp1->rbg_blockcount > kp2->rbg_blockcount) + return 1; + if (kp1->rbg_blockcount < kp2->rbg_blockcount) + return -1; + + return 0; +} + +STATIC int +rcbagbt_keys_inorder( + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) +{ + const struct rcbag_key *kp1 = (const struct rcbag_key *)k1; + const struct rcbag_key *kp2 = (const struct rcbag_key *)k2; + + if (kp1->rbg_startblock > kp2->rbg_startblock) + return 0; + if (kp1->rbg_startblock < kp2->rbg_startblock) + return 1; + + if (kp1->rbg_blockcount > kp2->rbg_blockcount) + return 0; + if (kp1->rbg_blockcount < kp2->rbg_blockcount) + return 1; + + return 0; +} + +STATIC int +rcbagbt_recs_inorder( + struct xfs_btree_cur *cur, + const union xfs_btree_rec *r1, + const union xfs_btree_rec *r2) +{ + const struct rcbag_rec *rp1 = (const struct rcbag_rec *)r1; + const struct rcbag_rec *rp2 = (const struct rcbag_rec *)r2; + + if (rp1->rbg_startblock > rp2->rbg_startblock) + return 0; + if (rp1->rbg_startblock < rp2->rbg_startblock) + return 1; + + if (rp1->rbg_blockcount > rp2->rbg_blockcount) + return 0; + if (rp1->rbg_blockcount < rp2->rbg_blockcount) + return 1; + + return 0; +} + +static xfs_failaddr_t +rcbagbt_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + xfs_failaddr_t fa; + unsigned int level; + unsigned int maxrecs; + + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + + fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); + if (fa) + return fa; + + level = be16_to_cpu(block->bb_level); + if (level >= rcbagbt_maxlevels_possible()) + return __this_address; + + maxrecs = rcbagbt_maxrecs(mp, XFBNO_BLOCKSIZE, level == 0); + return xfs_btree_memblock_verify(bp, maxrecs); +} + +static void +rcbagbt_rw_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa = rcbagbt_verify(bp); + + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); +} + +/* skip crc checks on in-memory btrees to save time */ +static const struct xfs_buf_ops rcbagbt_mem_buf_ops = { + .name = "rcbagbt_mem", + .magic = { 0, cpu_to_be32(RCBAG_MAGIC) }, + .verify_read = rcbagbt_rw_verify, + .verify_write = rcbagbt_rw_verify, + .verify_struct = rcbagbt_verify, +}; + +static const struct xfs_btree_ops rcbagbt_mem_ops = { + .name = "rcbag", + .type = XFS_BTREE_TYPE_MEM, + + .rec_len = sizeof(struct rcbag_rec), + .key_len = sizeof(struct rcbag_key), + .ptr_len = XFS_BTREE_LONG_PTR_LEN, + + .lru_refs = 1, + .statoff = XFS_STATS_CALC_INDEX(xs_rcbag_2), + + .dup_cursor = xfbtree_dup_cursor, + .set_root = xfbtree_set_root, + .alloc_block = xfbtree_alloc_block, + .free_block = xfbtree_free_block, + .get_minrecs = xfbtree_get_minrecs, + .get_maxrecs = xfbtree_get_maxrecs, + .init_key_from_rec = rcbagbt_init_key_from_rec, + .init_rec_from_cur = rcbagbt_init_rec_from_cur, + .init_ptr_from_cur = xfbtree_init_ptr_from_cur, + .key_diff = rcbagbt_key_diff, + .buf_ops = &rcbagbt_mem_buf_ops, + .diff_two_keys = rcbagbt_diff_two_keys, + .keys_inorder = rcbagbt_keys_inorder, + .recs_inorder = rcbagbt_recs_inorder, +}; + +/* Create a cursor for an in-memory btree. */ +struct xfs_btree_cur * +rcbagbt_mem_cursor( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfbtree *xfbtree) +{ + struct xfs_btree_cur *cur; + + cur = xfs_btree_alloc_cursor(mp, tp, &rcbagbt_mem_ops, + rcbagbt_maxlevels_possible(), rcbagbt_cur_cache); + + cur->bc_mem.xfbtree = xfbtree; + cur->bc_nlevels = xfbtree->nlevels; + return cur; +} + +/* Create an in-memory refcount bag btree. */ +int +rcbagbt_mem_init( + struct xfs_mount *mp, + struct xfbtree *xfbt, + struct xfs_buftarg *btp) +{ + xfbt->owner = 0; + return xfbtree_init(mp, xfbt, btp, &rcbagbt_mem_ops); +} + +/* Calculate number of records in a refcount bag btree block. */ +static inline unsigned int +rcbagbt_block_maxrecs( + unsigned int blocklen, + bool leaf) +{ + if (leaf) + return blocklen / sizeof(struct rcbag_rec); + return blocklen / + (sizeof(struct rcbag_key) + sizeof(rcbag_ptr_t)); +} + +/* + * Calculate number of records in an refcount bag btree block. + */ +unsigned int +rcbagbt_maxrecs( + struct xfs_mount *mp, + unsigned int blocklen, + bool leaf) +{ + blocklen -= RCBAG_BLOCK_LEN; + return rcbagbt_block_maxrecs(blocklen, leaf); +} + +/* Compute the max possible height for refcount bag btrees. */ +unsigned int +rcbagbt_maxlevels_possible(void) +{ + unsigned int minrecs[2]; + unsigned int blocklen; + + blocklen = XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN; + + minrecs[0] = rcbagbt_block_maxrecs(blocklen, true) / 2; + minrecs[1] = rcbagbt_block_maxrecs(blocklen, false) / 2; + + return xfs_btree_space_to_height(minrecs, ULLONG_MAX); +} + +/* Calculate the refcount bag btree size for some records. */ +unsigned long long +rcbagbt_calc_size( + unsigned long long nr_records) +{ + unsigned int minrecs[2]; + unsigned int blocklen; + + blocklen = XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN; + + minrecs[0] = rcbagbt_block_maxrecs(blocklen, true) / 2; + minrecs[1] = rcbagbt_block_maxrecs(blocklen, false) / 2; + + return xfs_btree_calc_size(minrecs, nr_records); +} + +int __init +rcbagbt_init_cur_cache(void) +{ + rcbagbt_cur_cache = kmem_cache_create("xfs_rcbagbt_cur", + xfs_btree_cur_sizeof(rcbagbt_maxlevels_possible()), + 0, 0, NULL); + + if (!rcbagbt_cur_cache) + return -ENOMEM; + return 0; +} + +void +rcbagbt_destroy_cur_cache(void) +{ + kmem_cache_destroy(rcbagbt_cur_cache); + rcbagbt_cur_cache = NULL; +} + +/* Look up the refcount bag record corresponding to this reverse mapping. */ +int +rcbagbt_lookup_eq( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rmap, + int *success) +{ + struct rcbag_rec *rec = (struct rcbag_rec *)&cur->bc_rec; + + rec->rbg_startblock = rmap->rm_startblock; + rec->rbg_blockcount = rmap->rm_blockcount; + + return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, success); +} + +/* Get the data from the pointed-to record. */ +int +rcbagbt_get_rec( + struct xfs_btree_cur *cur, + struct rcbag_rec *rec, + int *has) +{ + union xfs_btree_rec *btrec; + int error; + + error = xfs_btree_get_rec(cur, &btrec, has); + if (error || !(*has)) + return error; + + memcpy(rec, btrec, sizeof(struct rcbag_rec)); + return 0; +} + +/* Update the record referred to by cur to the value given. */ +int +rcbagbt_update( + struct xfs_btree_cur *cur, + const struct rcbag_rec *rec) +{ + union xfs_btree_rec btrec; + + memcpy(&btrec, rec, sizeof(struct rcbag_rec)); + return xfs_btree_update(cur, &btrec); +} + +/* Update the record referred to by cur to the value given. */ +int +rcbagbt_insert( + struct xfs_btree_cur *cur, + const struct rcbag_rec *rec, + int *success) +{ + struct rcbag_rec *btrec = (struct rcbag_rec *)&cur->bc_rec; + + memcpy(btrec, rec, sizeof(struct rcbag_rec)); + return xfs_btree_insert(cur, success); +} diff --git a/fs/xfs/scrub/rcbag_btree.h b/fs/xfs/scrub/rcbag_btree.h new file mode 100644 index 0000000000..03cadb0325 --- /dev/null +++ b/fs/xfs/scrub/rcbag_btree.h @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_RCBAG_BTREE_H__ +#define __XFS_SCRUB_RCBAG_BTREE_H__ + +#ifdef CONFIG_XFS_BTREE_IN_MEM + +struct xfs_buf; +struct xfs_btree_cur; +struct xfs_mount; + +#define RCBAG_MAGIC 0x74826671 /* 'JRBG' */ + +struct rcbag_key { + uint32_t rbg_startblock; + uint32_t rbg_blockcount; +}; + +struct rcbag_rec { + uint32_t rbg_startblock; + uint32_t rbg_blockcount; + uint64_t rbg_refcount; +}; + +typedef __be64 rcbag_ptr_t; + +/* reflinks only exist on crc enabled filesystems */ +#define RCBAG_BLOCK_LEN XFS_BTREE_LBLOCK_CRC_LEN + +/* + * Record, key, and pointer address macros for btree blocks. + * + * (note that some of these may appear unused, but they are used in userspace) + */ +#define RCBAG_REC_ADDR(block, index) \ + ((struct rcbag_rec *) \ + ((char *)(block) + RCBAG_BLOCK_LEN + \ + (((index) - 1) * sizeof(struct rcbag_rec)))) + +#define RCBAG_KEY_ADDR(block, index) \ + ((struct rcbag_key *) \ + ((char *)(block) + RCBAG_BLOCK_LEN + \ + ((index) - 1) * sizeof(struct rcbag_key))) + +#define RCBAG_PTR_ADDR(block, index, maxrecs) \ + ((rcbag_ptr_t *) \ + ((char *)(block) + RCBAG_BLOCK_LEN + \ + (maxrecs) * sizeof(struct rcbag_key) + \ + ((index) - 1) * sizeof(rcbag_ptr_t))) + +unsigned int rcbagbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen, + bool leaf); + +unsigned long long rcbagbt_calc_size(unsigned long long nr_records); + +unsigned int rcbagbt_maxlevels_possible(void); + +int __init rcbagbt_init_cur_cache(void); +void rcbagbt_destroy_cur_cache(void); + +struct xfs_btree_cur *rcbagbt_mem_cursor(struct xfs_mount *mp, + struct xfs_trans *tp, struct xfbtree *xfbtree); +int rcbagbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree, + struct xfs_buftarg *btp); + +int rcbagbt_lookup_eq(struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rmap, int *success); +int rcbagbt_get_rec(struct xfs_btree_cur *cur, struct rcbag_rec *rec, int *has); +int rcbagbt_update(struct xfs_btree_cur *cur, const struct rcbag_rec *rec); +int rcbagbt_insert(struct xfs_btree_cur *cur, const struct rcbag_rec *rec, + int *success); + +#else +# define rcbagbt_init_cur_cache() 0 +# define rcbagbt_destroy_cur_cache() ((void)0) +#endif /* CONFIG_XFS_BTREE_IN_MEM */ + +#endif /* __XFS_SCRUB_RCBAG_BTREE_H__ */ diff --git a/fs/xfs/scrub/readdir.c b/fs/xfs/scrub/readdir.c index 16462332c8..dfdcb96b6c 100644 --- a/fs/xfs/scrub/readdir.c +++ b/fs/xfs/scrub/readdir.c @@ -281,7 +281,7 @@ xchk_dir_walk( return -EIO; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); - ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) return xchk_dir_walk_sf(sc, dp, dirent_fn, priv); @@ -332,7 +332,7 @@ xchk_dir_lookup( return -EIO; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); - ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { error = xfs_dir2_sf_lookup(&args); diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index f99eca7998..0252a3b5b6 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -114,7 +114,7 @@ xreap_put_freelist( int error; /* Make sure there's space on the freelist. */ - error = xrep_fix_freelist(sc, true); + error = xrep_fix_freelist(sc, 0); if (error) return error; diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index bf22f245bb..d0c7d4a29c 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -7,8 +7,10 @@ #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" +#include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_trans.h" #include "xfs_ag.h" #include "xfs_btree.h" #include "xfs_rmap.h" @@ -17,6 +19,7 @@ #include "scrub/common.h" #include "scrub/btree.h" #include "scrub/trace.h" +#include "scrub/repair.h" /* * Set us up to scrub reference count btrees. @@ -27,6 +30,15 @@ xchk_setup_ag_refcountbt( { if (xchk_need_intent_drain(sc)) xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + + if (xchk_could_repair(sc)) { + int error; + + error = xrep_setup_ag_refcountbt(sc); + if (error) + return error; + } + return xchk_setup_ag_btree(sc, false); } diff --git a/fs/xfs/scrub/refcount_repair.c b/fs/xfs/scrub/refcount_repair.c index f38fccc42a..a00d7ce7ae 100644 --- a/fs/xfs/scrub/refcount_repair.c +++ b/fs/xfs/scrub/refcount_repair.c @@ -25,6 +25,7 @@ #include "xfs_refcount_btree.h" #include "xfs_error.h" #include "xfs_ag.h" +#include "xfs_health.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -37,6 +38,7 @@ #include "scrub/xfarray.h" #include "scrub/newbt.h" #include "scrub/reap.h" +#include "scrub/rcbag.h" /* * Rebuilding the Reference Count Btree @@ -97,12 +99,6 @@ * insert all the records. */ -/* The only parts of the rmap that we care about for computing refcounts. */ -struct xrep_refc_rmap { - xfs_agblock_t startblock; - xfs_extlen_t blockcount; -} __packed; - struct xrep_refc { /* refcount extents */ struct xfarray *refcount_records; @@ -122,6 +118,20 @@ struct xrep_refc { xfs_extlen_t btblocks; }; +/* Set us up to repair refcount btrees. */ +int +xrep_setup_ag_refcountbt( + struct xfs_scrub *sc) +{ + char *descr; + int error; + + descr = xchk_xfile_ag_descr(sc, "rmap record bag"); + error = xrep_setup_xfbtree(sc, descr); + kfree(descr); + return error; +} + /* Check for any obvious conflicts with this shared/CoW staging extent. */ STATIC int xrep_refc_check_ext( @@ -223,10 +233,9 @@ xrep_refc_rmap_shareable( STATIC int xrep_refc_walk_rmaps( struct xrep_refc *rr, - struct xrep_refc_rmap *rrm, + struct xfs_rmap_irec *rmap, bool *have_rec) { - struct xfs_rmap_irec rmap; struct xfs_btree_cur *cur = rr->sc->sa.rmap_cur; struct xfs_mount *mp = cur->bc_mp; int have_gt; @@ -250,29 +259,30 @@ xrep_refc_walk_rmaps( if (!have_gt) return 0; - error = xfs_rmap_get_rec(cur, &rmap, &have_gt); + error = xfs_rmap_get_rec(cur, rmap, &have_gt); if (error) return error; - if (XFS_IS_CORRUPT(mp, !have_gt)) + if (XFS_IS_CORRUPT(mp, !have_gt)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } - if (rmap.rm_owner == XFS_RMAP_OWN_COW) { - error = xrep_refc_stash_cow(rr, rmap.rm_startblock, - rmap.rm_blockcount); + if (rmap->rm_owner == XFS_RMAP_OWN_COW) { + error = xrep_refc_stash_cow(rr, rmap->rm_startblock, + rmap->rm_blockcount); if (error) return error; - } else if (rmap.rm_owner == XFS_RMAP_OWN_REFC) { + } else if (rmap->rm_owner == XFS_RMAP_OWN_REFC) { /* refcountbt block, dump it when we're done. */ - rr->btblocks += rmap.rm_blockcount; + rr->btblocks += rmap->rm_blockcount; error = xagb_bitmap_set(&rr->old_refcountbt_blocks, - rmap.rm_startblock, rmap.rm_blockcount); + rmap->rm_startblock, + rmap->rm_blockcount); if (error) return error; } - } while (!xrep_refc_rmap_shareable(mp, &rmap)); + } while (!xrep_refc_rmap_shareable(mp, rmap)); - rrm->startblock = rmap.rm_startblock; - rrm->blockcount = rmap.rm_blockcount; *have_rec = true; return 0; } @@ -354,45 +364,6 @@ xrep_refc_sort_records( return error; } -#define RRM_NEXT(r) ((r).startblock + (r).blockcount) -/* - * Find the next block where the refcount changes, given the next rmap we - * looked at and the ones we're already tracking. - */ -static inline int -xrep_refc_next_edge( - struct xfarray *rmap_bag, - struct xrep_refc_rmap *next_rrm, - bool next_valid, - xfs_agblock_t *nbnop) -{ - struct xrep_refc_rmap rrm; - xfarray_idx_t array_cur = XFARRAY_CURSOR_INIT; - xfs_agblock_t nbno = NULLAGBLOCK; - int error; - - if (next_valid) - nbno = next_rrm->startblock; - - while ((error = xfarray_iter(rmap_bag, &array_cur, &rrm)) == 1) - nbno = min_t(xfs_agblock_t, nbno, RRM_NEXT(rrm)); - - if (error) - return error; - - /* - * We should have found /something/ because either next_rrm is the next - * interesting rmap to look at after emitting this refcount extent, or - * there are other rmaps in rmap_bag contributing to the current - * sharing count. But if something is seriously wrong, bail out. - */ - if (nbno == NULLAGBLOCK) - return -EFSCORRUPTED; - - *nbnop = nbno; - return 0; -} - /* * Walk forward through the rmap btree to collect all rmaps starting at * @bno in @rmap_bag. These represent the file(s) that share ownership of @@ -402,22 +373,21 @@ xrep_refc_next_edge( static int xrep_refc_push_rmaps_at( struct xrep_refc *rr, - struct xfarray *rmap_bag, + struct rcbag *rcstack, xfs_agblock_t bno, - struct xrep_refc_rmap *rrm, - bool *have, - uint64_t *stack_sz) + struct xfs_rmap_irec *rmap, + bool *have) { struct xfs_scrub *sc = rr->sc; int have_gt; int error; - while (*have && rrm->startblock == bno) { - error = xfarray_store_anywhere(rmap_bag, rrm); + while (*have && rmap->rm_startblock == bno) { + error = rcbag_add(rcstack, rr->sc->tp, rmap); if (error) return error; - (*stack_sz)++; - error = xrep_refc_walk_rmaps(rr, rrm, have); + + error = xrep_refc_walk_rmaps(rr, rmap, have); if (error) return error; } @@ -425,8 +395,10 @@ xrep_refc_push_rmaps_at( error = xfs_btree_decrement(sc->sa.rmap_cur, 0, &have_gt); if (error) return error; - if (XFS_IS_CORRUPT(sc->mp, !have_gt)) + if (XFS_IS_CORRUPT(sc->mp, !have_gt)) { + xfs_btree_mark_sick(sc->sa.rmap_cur); return -EFSCORRUPTED; + } return 0; } @@ -436,12 +408,9 @@ STATIC int xrep_refc_find_refcounts( struct xrep_refc *rr) { - struct xrep_refc_rmap rrm; struct xfs_scrub *sc = rr->sc; - struct xfarray *rmap_bag; - char *descr; - uint64_t old_stack_sz; - uint64_t stack_sz = 0; + struct rcbag *rcstack; + uint64_t old_stack_height; xfs_agblock_t sbno; xfs_agblock_t cbno; xfs_agblock_t nbno; @@ -451,14 +420,11 @@ xrep_refc_find_refcounts( xrep_ag_btcur_init(sc, &sc->sa); /* - * Set up a sparse array to store all the rmap records that we're - * tracking to generate a reference count record. If this exceeds + * Set up a bag to store all the rmap records that we're tracking to + * generate a reference count record. If the size of the bag exceeds * MAXREFCOUNT, we clamp rc_refcount. */ - descr = xchk_xfile_ag_descr(sc, "rmap record bag"); - error = xfarray_create(descr, 0, sizeof(struct xrep_refc_rmap), - &rmap_bag); - kfree(descr); + error = rcbag_init(sc->mp, sc->xmbtp, &rcstack); if (error) goto out_cur; @@ -469,62 +435,54 @@ xrep_refc_find_refcounts( /* Process reverse mappings into refcount data. */ while (xfs_btree_has_more_records(sc->sa.rmap_cur)) { + struct xfs_rmap_irec rmap; + /* Push all rmaps with pblk == sbno onto the stack */ - error = xrep_refc_walk_rmaps(rr, &rrm, &have); + error = xrep_refc_walk_rmaps(rr, &rmap, &have); if (error) goto out_bag; if (!have) break; - sbno = cbno = rrm.startblock; - error = xrep_refc_push_rmaps_at(rr, rmap_bag, sbno, - &rrm, &have, &stack_sz); + sbno = cbno = rmap.rm_startblock; + error = xrep_refc_push_rmaps_at(rr, rcstack, sbno, &rmap, + &have); if (error) goto out_bag; /* Set nbno to the bno of the next refcount change */ - error = xrep_refc_next_edge(rmap_bag, &rrm, have, &nbno); + error = rcbag_next_edge(rcstack, sc->tp, &rmap, have, &nbno); if (error) goto out_bag; ASSERT(nbno > sbno); - old_stack_sz = stack_sz; + old_stack_height = rcbag_count(rcstack); /* While stack isn't empty... */ - while (stack_sz) { - xfarray_idx_t array_cur = XFARRAY_CURSOR_INIT; - + while (rcbag_count(rcstack) > 0) { /* Pop all rmaps that end at nbno */ - while ((error = xfarray_iter(rmap_bag, &array_cur, - &rrm)) == 1) { - if (RRM_NEXT(rrm) != nbno) - continue; - error = xfarray_unset(rmap_bag, array_cur - 1); - if (error) - goto out_bag; - stack_sz--; - } + error = rcbag_remove_ending_at(rcstack, sc->tp, nbno); if (error) goto out_bag; /* Push array items that start at nbno */ - error = xrep_refc_walk_rmaps(rr, &rrm, &have); + error = xrep_refc_walk_rmaps(rr, &rmap, &have); if (error) goto out_bag; if (have) { - error = xrep_refc_push_rmaps_at(rr, rmap_bag, - nbno, &rrm, &have, &stack_sz); + error = xrep_refc_push_rmaps_at(rr, rcstack, + nbno, &rmap, &have); if (error) goto out_bag; } /* Emit refcount if necessary */ ASSERT(nbno > cbno); - if (stack_sz != old_stack_sz) { - if (old_stack_sz > 1) { + if (rcbag_count(rcstack) != old_stack_height) { + if (old_stack_height > 1) { error = xrep_refc_stash(rr, XFS_REFC_DOMAIN_SHARED, cbno, nbno - cbno, - old_stack_sz); + old_stack_height); if (error) goto out_bag; } @@ -532,13 +490,13 @@ xrep_refc_find_refcounts( } /* Stack empty, go find the next rmap */ - if (stack_sz == 0) + if (rcbag_count(rcstack) == 0) break; - old_stack_sz = stack_sz; + old_stack_height = rcbag_count(rcstack); sbno = nbno; /* Set nbno to the bno of the next refcount change */ - error = xrep_refc_next_edge(rmap_bag, &rrm, have, + error = rcbag_next_edge(rcstack, sc->tp, &rmap, have, &nbno); if (error) goto out_bag; @@ -547,14 +505,13 @@ xrep_refc_find_refcounts( } } - ASSERT(stack_sz == 0); + ASSERT(rcbag_count(rcstack) == 0); out_bag: - xfarray_destroy(rmap_bag); + rcbag_free(&rcstack); out_cur: xchk_ag_btcur_free(&sc->sa); return error; } -#undef RRM_NEXT /* Retrieve refcountbt data for bulk load. */ STATIC int @@ -653,8 +610,8 @@ xrep_refc_build_new_tree( rr->new_btree.bload.claim_block = xrep_refc_claim_block; /* Compute how many blocks we'll need. */ - refc_cur = xfs_refcountbt_stage_cursor(sc->mp, &rr->new_btree.afake, - pag); + refc_cur = xfs_refcountbt_init_cursor(sc->mp, NULL, NULL, pag); + xfs_btree_stage_afakeroot(refc_cur, &rr->new_btree.afake); error = xfs_btree_bload_compute_geometry(refc_cur, &rr->new_btree.bload, xfarray_length(rr->refcount_records)); diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 745d5b8f40..f43dce771c 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -30,12 +30,15 @@ #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_reflink.h" +#include "xfs_health.h" +#include "xfs_buf_mem.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" #include "scrub/repair.h" #include "scrub/bitmap.h" #include "scrub/stats.h" +#include "scrub/xfile.h" /* * Attempt to repair some metadata, if the metadata is corrupt and userspace @@ -400,7 +403,7 @@ xrep_calc_ag_resblks( int xrep_fix_freelist( struct xfs_scrub *sc, - bool can_shrink) + int alloc_flags) { struct xfs_alloc_arg args = {0}; @@ -410,8 +413,7 @@ xrep_fix_freelist( args.alignment = 1; args.pag = sc->sa.pag; - return xfs_alloc_fix_freelist(&args, - can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK); + return xfs_alloc_fix_freelist(&args, alloc_flags); } /* @@ -687,6 +689,44 @@ xrep_find_ag_btree_roots( } #ifdef CONFIG_XFS_QUOTA +/* Update some quota flags in the superblock. */ +void +xrep_update_qflags( + struct xfs_scrub *sc, + unsigned int clear_flags, + unsigned int set_flags) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_buf *bp; + + mutex_lock(&mp->m_quotainfo->qi_quotaofflock); + if ((mp->m_qflags & clear_flags) == 0 && + (mp->m_qflags & set_flags) == set_flags) + goto no_update; + + mp->m_qflags &= ~clear_flags; + mp->m_qflags |= set_flags; + + spin_lock(&mp->m_sb_lock); + mp->m_sb.sb_qflags &= ~clear_flags; + mp->m_sb.sb_qflags |= set_flags; + spin_unlock(&mp->m_sb_lock); + + /* + * Update the quota flags in the ondisk superblock without touching + * the summary counters. We have not quiesced inode chunk allocation, + * so we cannot coordinate with updates to the icount and ifree percpu + * counters. + */ + bp = xfs_trans_getsb(sc->tp); + xfs_sb_to_disk(bp->b_addr, &mp->m_sb); + xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF); + xfs_trans_log_buf(sc->tp, bp, 0, sizeof(struct xfs_dsb) - 1); + +no_update: + mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock); +} + /* Force a quotacheck the next time we mount. */ void xrep_force_quotacheck( @@ -699,13 +739,7 @@ xrep_force_quotacheck( if (!(flag & sc->mp->m_qflags)) return; - mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock); - sc->mp->m_qflags &= ~flag; - spin_lock(&sc->mp->m_sb_lock); - sc->mp->m_sb.sb_qflags &= ~flag; - spin_unlock(&sc->mp->m_sb_lock); - xfs_log_sb(sc->tp); - mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock); + xrep_update_qflags(sc, flag, 0); } /* @@ -799,20 +833,20 @@ xrep_ag_btcur_init( /* Set up a bnobt cursor for cross-referencing. */ if (sc->sm->sm_type != XFS_SCRUB_TYPE_BNOBT && sc->sm->sm_type != XFS_SCRUB_TYPE_CNTBT) { - sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, - sc->sa.pag, XFS_BTNUM_BNO); - sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, - sc->sa.pag, XFS_BTNUM_CNT); + sa->bno_cur = xfs_bnobt_init_cursor(mp, sc->tp, sa->agf_bp, + sc->sa.pag); + sa->cnt_cur = xfs_cntbt_init_cursor(mp, sc->tp, sa->agf_bp, + sc->sa.pag); } /* Set up a inobt cursor for cross-referencing. */ if (sc->sm->sm_type != XFS_SCRUB_TYPE_INOBT && sc->sm->sm_type != XFS_SCRUB_TYPE_FINOBT) { sa->ino_cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, - sa->agi_bp, XFS_BTNUM_INO); + sa->agi_bp); if (xfs_has_finobt(mp)) - sa->fino_cur = xfs_inobt_init_cursor(sc->sa.pag, - sc->tp, sa->agi_bp, XFS_BTNUM_FINO); + sa->fino_cur = xfs_finobt_init_cursor(sc->sa.pag, + sc->tp, sa->agi_bp); } /* Set up a rmapbt cursor for cross-referencing. */ @@ -1115,3 +1149,55 @@ xrep_metadata_inode_forks( return 0; } + +/* + * Set up an in-memory buffer cache so that we can use the xfbtree. Allocating + * a shmem file might take loks, so we cannot be in transaction context. Park + * our resources in the scrub context and let the teardown function take care + * of them at the right time. + */ +int +xrep_setup_xfbtree( + struct xfs_scrub *sc, + const char *descr) +{ + ASSERT(sc->tp == NULL); + + return xmbuf_alloc(sc->mp, descr, &sc->xmbtp); +} + +/* + * Create a dummy transaction for use in a live update hook function. This + * function MUST NOT be called from regular repair code because the current + * process' transaction is saved via the cookie. + */ +int +xrep_trans_alloc_hook_dummy( + struct xfs_mount *mp, + void **cookiep, + struct xfs_trans **tpp) +{ + int error; + + *cookiep = current->journal_info; + current->journal_info = NULL; + + error = xfs_trans_alloc_empty(mp, tpp); + if (!error) + return 0; + + current->journal_info = *cookiep; + *cookiep = NULL; + return error; +} + +/* Cancel a dummy transaction used by a live update hook function. */ +void +xrep_trans_cancel_hook_dummy( + void **cookiep, + struct xfs_trans *tp) +{ + xfs_trans_cancel(tp); + current->journal_info = *cookiep; + *cookiep = NULL; +} diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 17114327e6..ce082d9414 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -51,7 +51,7 @@ struct xbitmap; struct xagb_bitmap; struct xfsb_bitmap; -int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink); +int xrep_fix_freelist(struct xfs_scrub *sc, int alloc_flags); struct xrep_find_ag_btree { /* in: rmap owner of the btree we're looking for */ @@ -72,6 +72,8 @@ int xrep_find_ag_btree_roots(struct xfs_scrub *sc, struct xfs_buf *agf_bp, struct xrep_find_ag_btree *btree_info, struct xfs_buf *agfl_bp); #ifdef CONFIG_XFS_QUOTA +void xrep_update_qflags(struct xfs_scrub *sc, unsigned int clear_flags, + unsigned int set_flags); void xrep_force_quotacheck(struct xfs_scrub *sc, xfs_dqtype_t type); int xrep_ino_dqattach(struct xfs_scrub *sc); #else @@ -79,11 +81,15 @@ int xrep_ino_dqattach(struct xfs_scrub *sc); # define xrep_ino_dqattach(sc) (0) #endif /* CONFIG_XFS_QUOTA */ +int xrep_setup_xfbtree(struct xfs_scrub *sc, const char *descr); + int xrep_ino_ensure_extent_count(struct xfs_scrub *sc, int whichfork, xfs_extnum_t nextents); int xrep_reset_perag_resv(struct xfs_scrub *sc); int xrep_bmap(struct xfs_scrub *sc, int whichfork, bool allow_unwritten); int xrep_metadata_inode_forks(struct xfs_scrub *sc); +int xrep_setup_ag_rmapbt(struct xfs_scrub *sc); +int xrep_setup_ag_refcountbt(struct xfs_scrub *sc); /* Repair setup functions */ int xrep_setup_ag_allocbt(struct xfs_scrub *sc); @@ -109,11 +115,14 @@ int xrep_agfl(struct xfs_scrub *sc); int xrep_agi(struct xfs_scrub *sc); int xrep_allocbt(struct xfs_scrub *sc); int xrep_iallocbt(struct xfs_scrub *sc); +int xrep_rmapbt(struct xfs_scrub *sc); int xrep_refcountbt(struct xfs_scrub *sc); int xrep_inode(struct xfs_scrub *sc); int xrep_bmap_data(struct xfs_scrub *sc); int xrep_bmap_attr(struct xfs_scrub *sc); int xrep_bmap_cow(struct xfs_scrub *sc); +int xrep_nlinks(struct xfs_scrub *sc); +int xrep_fscounters(struct xfs_scrub *sc); #ifdef CONFIG_XFS_RT int xrep_rtbitmap(struct xfs_scrub *sc); @@ -123,13 +132,19 @@ int xrep_rtbitmap(struct xfs_scrub *sc); #ifdef CONFIG_XFS_QUOTA int xrep_quota(struct xfs_scrub *sc); +int xrep_quotacheck(struct xfs_scrub *sc); #else # define xrep_quota xrep_notsupported +# define xrep_quotacheck xrep_notsupported #endif /* CONFIG_XFS_QUOTA */ int xrep_reinit_pagf(struct xfs_scrub *sc); int xrep_reinit_pagi(struct xfs_scrub *sc); +int xrep_trans_alloc_hook_dummy(struct xfs_mount *mp, void **cookiep, + struct xfs_trans **tpp); +void xrep_trans_cancel_hook_dummy(void **cookiep, struct xfs_trans *tp); + #else #define xrep_ino_dqattach(sc) (0) @@ -171,6 +186,8 @@ xrep_setup_nothing( return 0; } #define xrep_setup_ag_allocbt xrep_setup_nothing +#define xrep_setup_ag_rmapbt xrep_setup_nothing +#define xrep_setup_ag_refcountbt xrep_setup_nothing #define xrep_setup_inode(sc, imap) ((void)0) @@ -184,6 +201,7 @@ xrep_setup_nothing( #define xrep_agi xrep_notsupported #define xrep_allocbt xrep_notsupported #define xrep_iallocbt xrep_notsupported +#define xrep_rmapbt xrep_notsupported #define xrep_refcountbt xrep_notsupported #define xrep_inode xrep_notsupported #define xrep_bmap_data xrep_notsupported @@ -191,6 +209,9 @@ xrep_setup_nothing( #define xrep_bmap_cow xrep_notsupported #define xrep_rtbitmap xrep_notsupported #define xrep_quota xrep_notsupported +#define xrep_quotacheck xrep_notsupported +#define xrep_nlinks xrep_notsupported +#define xrep_fscounters xrep_notsupported #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c index c99d1714f2..ba5bbc3fb7 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -25,6 +25,7 @@ #include "scrub/btree.h" #include "scrub/bitmap.h" #include "scrub/agb_bitmap.h" +#include "scrub/repair.h" /* * Set us up to scrub reverse mapping btrees. @@ -36,6 +37,14 @@ xchk_setup_ag_rmapbt( if (xchk_need_intent_drain(sc)) xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + if (xchk_could_repair(sc)) { + int error; + + error = xrep_setup_ag_rmapbt(sc); + if (error) + return error; + } + return xchk_setup_ag_btree(sc, false); } @@ -349,7 +358,7 @@ xchk_rmapbt_rec( struct xfs_rmap_irec irec; if (xfs_rmap_btrec_to_irec(rec, &irec) != NULL || - xfs_rmap_check_irec(bs->cur, &irec) != NULL) { + xfs_rmap_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } @@ -412,8 +421,8 @@ xchk_rmapbt_walk_ag_metadata( /* OWN_AG: bnobt, cntbt, rmapbt, and AGFL */ cur = sc->sa.bno_cur; if (!cur) - cur = xfs_allocbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, - sc->sa.pag, XFS_BTNUM_BNO); + cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag); error = xagb_bitmap_set_btblocks(&cr->ag_owned, cur); if (cur != sc->sa.bno_cur) xfs_btree_del_cursor(cur, error); @@ -422,8 +431,8 @@ xchk_rmapbt_walk_ag_metadata( cur = sc->sa.cnt_cur; if (!cur) - cur = xfs_allocbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, - sc->sa.pag, XFS_BTNUM_CNT); + cur = xfs_cntbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag); error = xagb_bitmap_set_btblocks(&cr->ag_owned, cur); if (cur != sc->sa.cnt_cur) xfs_btree_del_cursor(cur, error); @@ -447,8 +456,7 @@ xchk_rmapbt_walk_ag_metadata( /* OWN_INOBT: inobt, finobt */ cur = sc->sa.ino_cur; if (!cur) - cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, sc->sa.agi_bp, - XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, sc->sa.agi_bp); error = xagb_bitmap_set_btblocks(&cr->inobt_owned, cur); if (cur != sc->sa.ino_cur) xfs_btree_del_cursor(cur, error); @@ -458,8 +466,8 @@ xchk_rmapbt_walk_ag_metadata( if (xfs_has_finobt(sc->mp)) { cur = sc->sa.fino_cur; if (!cur) - cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, - sc->sa.agi_bp, XFS_BTNUM_FINO); + cur = xfs_finobt_init_cursor(sc->sa.pag, sc->tp, + sc->sa.agi_bp); error = xagb_bitmap_set_btblocks(&cr->inobt_owned, cur); if (cur != sc->sa.fino_cur) xfs_btree_del_cursor(cur, error); diff --git a/fs/xfs/scrub/rmap_repair.c b/fs/xfs/scrub/rmap_repair.c new file mode 100644 index 0000000000..e8e07b683e --- /dev/null +++ b/fs/xfs/scrub/rmap_repair.c @@ -0,0 +1,1697 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_buf_mem.h" +#include "xfs_btree_mem.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_refcount.h" +#include "xfs_refcount_btree.h" +#include "xfs_ag.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/iscan.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" + +/* + * Reverse Mapping Btree Repair + * ============================ + * + * This is the most involved of all the AG space btree rebuilds. Everywhere + * else in XFS we lock inodes and then AG data structures, but generating the + * list of rmap records requires that we be able to scan both block mapping + * btrees of every inode in the filesystem to see if it owns any extents in + * this AG. We can't tolerate any inode updates while we do this, so we + * freeze the filesystem to lock everyone else out, and grant ourselves + * special privileges to run transactions with regular background reclamation + * turned off. + * + * We also have to be very careful not to allow inode reclaim to start a + * transaction because all transactions (other than our own) will block. + * Deferred inode inactivation helps us out there. + * + * I) Reverse mappings for all non-space metadata and file data are collected + * according to the following algorithm: + * + * 1. For each fork of each inode: + * 1.1. Create a bitmap BMBIT to track bmbt blocks if necessary. + * 1.2. If the incore extent map isn't loaded, walk the bmbt to accumulate + * bmaps into rmap records (see 1.1.4). Set bits in BMBIT for each btree + * block. + * 1.3. If the incore extent map is loaded but the fork is in btree format, + * just visit the bmbt blocks to set the corresponding BMBIT areas. + * 1.4. From the incore extent map, accumulate each bmap that falls into our + * target AG. Remember, multiple bmap records can map to a single rmap + * record, so we cannot simply emit rmap records 1:1. + * 1.5. Emit rmap records for each extent in BMBIT and free it. + * 2. Create bitmaps INOBIT and ICHUNKBIT. + * 3. For each record in the inobt, set the corresponding areas in ICHUNKBIT, + * and set bits in INOBIT for each btree block. If the inobt has no records + * at all, we must be careful to record its root in INOBIT. + * 4. For each block in the finobt, set the corresponding INOBIT area. + * 5. Emit rmap records for each extent in INOBIT and ICHUNKBIT and free them. + * 6. Create bitmaps REFCBIT and COWBIT. + * 7. For each CoW staging extent in the refcountbt, set the corresponding + * areas in COWBIT. + * 8. For each block in the refcountbt, set the corresponding REFCBIT area. + * 9. Emit rmap records for each extent in REFCBIT and COWBIT and free them. + * A. Emit rmap for the AG headers. + * B. Emit rmap for the log, if there is one. + * + * II) The rmapbt shape and space metadata rmaps are computed as follows: + * + * 1. Count the rmaps collected in the previous step. (= NR) + * 2. Estimate the number of rmapbt blocks needed to store NR records. (= RMB) + * 3. Reserve RMB blocks through the newbt using the allocator in normap mode. + * 4. Create bitmap AGBIT. + * 5. For each reservation in the newbt, set the corresponding areas in AGBIT. + * 6. For each block in the AGFL, bnobt, and cntbt, set the bits in AGBIT. + * 7. Count the extents in AGBIT. (= AGNR) + * 8. Estimate the number of rmapbt blocks needed for NR + AGNR rmaps. (= RMB') + * 9. If RMB' >= RMB, reserve RMB' - RMB more newbt blocks, set RMB = RMB', + * and clear AGBIT. Go to step 5. + * A. Emit rmaps for each extent in AGBIT. + * + * III) The rmapbt is constructed and set in place as follows: + * + * 1. Sort the rmap records. + * 2. Bulk load the rmaps. + * + * IV) Reap the old btree blocks. + * + * 1. Create a bitmap OLDRMBIT. + * 2. For each gap in the new rmapbt, set the corresponding areas of OLDRMBIT. + * 3. For each extent in the bnobt, clear the corresponding parts of OLDRMBIT. + * 4. Reap the extents corresponding to the set areas in OLDRMBIT. These are + * the parts of the AG that the rmap didn't find during its scan of the + * primary metadata and aren't known to be in the free space, which implies + * that they were the old rmapbt blocks. + * 5. Commit. + * + * We use the 'xrep_rmap' prefix for all the rmap functions. + */ + +/* Context for collecting rmaps */ +struct xrep_rmap { + /* new rmapbt information */ + struct xrep_newbt new_btree; + + /* lock for the xfbtree and xfile */ + struct mutex lock; + + /* rmap records generated from primary metadata */ + struct xfbtree rmap_btree; + + struct xfs_scrub *sc; + + /* in-memory btree cursor for the xfs_btree_bload iteration */ + struct xfs_btree_cur *mcur; + + /* Hooks into rmap update code. */ + struct xfs_rmap_hook rhook; + + /* inode scan cursor */ + struct xchk_iscan iscan; + + /* Number of non-freespace records found. */ + unsigned long long nr_records; + + /* bnobt/cntbt contribution to btreeblks */ + xfs_agblock_t freesp_btblocks; + + /* old agf_rmap_blocks counter */ + unsigned int old_rmapbt_fsbcount; +}; + +/* Set us up to repair reverse mapping btrees. */ +int +xrep_setup_ag_rmapbt( + struct xfs_scrub *sc) +{ + struct xrep_rmap *rr; + char *descr; + int error; + + xchk_fsgates_enable(sc, XCHK_FSGATES_RMAP); + + descr = xchk_xfile_ag_descr(sc, "reverse mapping records"); + error = xrep_setup_xfbtree(sc, descr); + kfree(descr); + if (error) + return error; + + rr = kzalloc(sizeof(struct xrep_rmap), XCHK_GFP_FLAGS); + if (!rr) + return -ENOMEM; + + rr->sc = sc; + sc->buf = rr; + return 0; +} + +/* Make sure there's nothing funny about this mapping. */ +STATIC int +xrep_rmap_check_mapping( + struct xfs_scrub *sc, + const struct xfs_rmap_irec *rec) +{ + enum xbtree_recpacking outcome; + int error; + + if (xfs_rmap_check_irec(sc->sa.pag, rec) != NULL) + return -EFSCORRUPTED; + + /* Make sure this isn't free space. */ + error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock, + rec->rm_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + return 0; +} + +/* Store a reverse-mapping record. */ +static inline int +xrep_rmap_stash( + struct xrep_rmap *rr, + xfs_agblock_t startblock, + xfs_extlen_t blockcount, + uint64_t owner, + uint64_t offset, + unsigned int flags) +{ + struct xfs_rmap_irec rmap = { + .rm_startblock = startblock, + .rm_blockcount = blockcount, + .rm_owner = owner, + .rm_offset = offset, + .rm_flags = flags, + }; + struct xfs_scrub *sc = rr->sc; + struct xfs_btree_cur *mcur; + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + if (xchk_iscan_aborted(&rr->iscan)) + return -EFSCORRUPTED; + + trace_xrep_rmap_found(sc->mp, sc->sa.pag->pag_agno, &rmap); + + mutex_lock(&rr->lock); + mcur = xfs_rmapbt_mem_cursor(sc->sa.pag, sc->tp, &rr->rmap_btree); + error = xfs_rmap_map_raw(mcur, &rmap); + xfs_btree_del_cursor(mcur, error); + if (error) + goto out_cancel; + + error = xfbtree_trans_commit(&rr->rmap_btree, sc->tp); + if (error) + goto out_abort; + + mutex_unlock(&rr->lock); + return 0; + +out_cancel: + xfbtree_trans_cancel(&rr->rmap_btree, sc->tp); +out_abort: + xchk_iscan_abort(&rr->iscan); + mutex_unlock(&rr->lock); + return error; +} + +struct xrep_rmap_stash_run { + struct xrep_rmap *rr; + uint64_t owner; + unsigned int rmap_flags; +}; + +static int +xrep_rmap_stash_run( + uint32_t start, + uint32_t len, + void *priv) +{ + struct xrep_rmap_stash_run *rsr = priv; + struct xrep_rmap *rr = rsr->rr; + + return xrep_rmap_stash(rr, start, len, rsr->owner, 0, rsr->rmap_flags); +} + +/* + * Emit rmaps for every extent of bits set in the bitmap. Caller must ensure + * that the ranges are in units of FS blocks. + */ +STATIC int +xrep_rmap_stash_bitmap( + struct xrep_rmap *rr, + struct xagb_bitmap *bitmap, + const struct xfs_owner_info *oinfo) +{ + struct xrep_rmap_stash_run rsr = { + .rr = rr, + .owner = oinfo->oi_owner, + .rmap_flags = 0, + }; + + if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK) + rsr.rmap_flags |= XFS_RMAP_ATTR_FORK; + if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK) + rsr.rmap_flags |= XFS_RMAP_BMBT_BLOCK; + + return xagb_bitmap_walk(bitmap, xrep_rmap_stash_run, &rsr); +} + +/* Section (I): Finding all file and bmbt extents. */ + +/* Context for accumulating rmaps for an inode fork. */ +struct xrep_rmap_ifork { + /* + * Accumulate rmap data here to turn multiple adjacent bmaps into a + * single rmap. + */ + struct xfs_rmap_irec accum; + + /* Bitmap of bmbt blocks in this AG. */ + struct xagb_bitmap bmbt_blocks; + + struct xrep_rmap *rr; + + /* Which inode fork? */ + int whichfork; +}; + +/* Stash an rmap that we accumulated while walking an inode fork. */ +STATIC int +xrep_rmap_stash_accumulated( + struct xrep_rmap_ifork *rf) +{ + if (rf->accum.rm_blockcount == 0) + return 0; + + return xrep_rmap_stash(rf->rr, rf->accum.rm_startblock, + rf->accum.rm_blockcount, rf->accum.rm_owner, + rf->accum.rm_offset, rf->accum.rm_flags); +} + +/* Accumulate a bmbt record. */ +STATIC int +xrep_rmap_visit_bmbt( + struct xfs_btree_cur *cur, + struct xfs_bmbt_irec *rec, + void *priv) +{ + struct xrep_rmap_ifork *rf = priv; + struct xfs_mount *mp = rf->rr->sc->mp; + struct xfs_rmap_irec *accum = &rf->accum; + xfs_agblock_t agbno; + unsigned int rmap_flags = 0; + int error; + + if (XFS_FSB_TO_AGNO(mp, rec->br_startblock) != + rf->rr->sc->sa.pag->pag_agno) + return 0; + + agbno = XFS_FSB_TO_AGBNO(mp, rec->br_startblock); + if (rf->whichfork == XFS_ATTR_FORK) + rmap_flags |= XFS_RMAP_ATTR_FORK; + if (rec->br_state == XFS_EXT_UNWRITTEN) + rmap_flags |= XFS_RMAP_UNWRITTEN; + + /* If this bmap is adjacent to the previous one, just add it. */ + if (accum->rm_blockcount > 0 && + rec->br_startoff == accum->rm_offset + accum->rm_blockcount && + agbno == accum->rm_startblock + accum->rm_blockcount && + rmap_flags == accum->rm_flags) { + accum->rm_blockcount += rec->br_blockcount; + return 0; + } + + /* Otherwise stash the old rmap and start accumulating a new one. */ + error = xrep_rmap_stash_accumulated(rf); + if (error) + return error; + + accum->rm_startblock = agbno; + accum->rm_blockcount = rec->br_blockcount; + accum->rm_offset = rec->br_startoff; + accum->rm_flags = rmap_flags; + return 0; +} + +/* Add a btree block to the bitmap. */ +STATIC int +xrep_rmap_visit_iroot_btree_block( + struct xfs_btree_cur *cur, + int level, + void *priv) +{ + struct xrep_rmap_ifork *rf = priv; + struct xfs_buf *bp; + xfs_fsblock_t fsbno; + xfs_agblock_t agbno; + + xfs_btree_get_block(cur, level, &bp); + if (!bp) + return 0; + + fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); + if (XFS_FSB_TO_AGNO(cur->bc_mp, fsbno) != rf->rr->sc->sa.pag->pag_agno) + return 0; + + agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); + return xagb_bitmap_set(&rf->bmbt_blocks, agbno, 1); +} + +/* + * Iterate a metadata btree rooted in an inode to collect rmap records for + * anything in this fork that matches the AG. + */ +STATIC int +xrep_rmap_scan_iroot_btree( + struct xrep_rmap_ifork *rf, + struct xfs_btree_cur *cur) +{ + struct xfs_owner_info oinfo; + struct xrep_rmap *rr = rf->rr; + int error; + + xagb_bitmap_init(&rf->bmbt_blocks); + + /* Record all the blocks in the btree itself. */ + error = xfs_btree_visit_blocks(cur, xrep_rmap_visit_iroot_btree_block, + XFS_BTREE_VISIT_ALL, rf); + if (error) + goto out; + + /* Emit rmaps for the btree blocks. */ + xfs_rmap_ino_bmbt_owner(&oinfo, rf->accum.rm_owner, rf->whichfork); + error = xrep_rmap_stash_bitmap(rr, &rf->bmbt_blocks, &oinfo); + if (error) + goto out; + + /* Stash any remaining accumulated rmaps. */ + error = xrep_rmap_stash_accumulated(rf); +out: + xagb_bitmap_destroy(&rf->bmbt_blocks); + return error; +} + +static inline bool +is_rt_data_fork( + struct xfs_inode *ip, + int whichfork) +{ + return XFS_IS_REALTIME_INODE(ip) && whichfork == XFS_DATA_FORK; +} + +/* + * Iterate the block mapping btree to collect rmap records for anything in this + * fork that matches the AG. Sets @mappings_done to true if we've scanned the + * block mappings in this fork. + */ +STATIC int +xrep_rmap_scan_bmbt( + struct xrep_rmap_ifork *rf, + struct xfs_inode *ip, + bool *mappings_done) +{ + struct xrep_rmap *rr = rf->rr; + struct xfs_btree_cur *cur; + struct xfs_ifork *ifp; + int error; + + *mappings_done = false; + ifp = xfs_ifork_ptr(ip, rf->whichfork); + cur = xfs_bmbt_init_cursor(rr->sc->mp, rr->sc->tp, ip, rf->whichfork); + + if (!xfs_ifork_is_realtime(ip, rf->whichfork) && + xfs_need_iread_extents(ifp)) { + /* + * If the incore extent cache isn't loaded, scan the bmbt for + * mapping records. This avoids loading the incore extent + * tree, which will increase memory pressure at a time when + * we're trying to run as quickly as we possibly can. Ignore + * realtime extents. + */ + error = xfs_bmap_query_all(cur, xrep_rmap_visit_bmbt, rf); + if (error) + goto out_cur; + + *mappings_done = true; + } + + /* Scan for the bmbt blocks, which always live on the data device. */ + error = xrep_rmap_scan_iroot_btree(rf, cur); +out_cur: + xfs_btree_del_cursor(cur, error); + return error; +} + +/* + * Iterate the in-core extent cache to collect rmap records for anything in + * this fork that matches the AG. + */ +STATIC int +xrep_rmap_scan_iext( + struct xrep_rmap_ifork *rf, + struct xfs_ifork *ifp) +{ + struct xfs_bmbt_irec rec; + struct xfs_iext_cursor icur; + int error; + + for_each_xfs_iext(ifp, &icur, &rec) { + if (isnullstartblock(rec.br_startblock)) + continue; + error = xrep_rmap_visit_bmbt(NULL, &rec, rf); + if (error) + return error; + } + + return xrep_rmap_stash_accumulated(rf); +} + +/* Find all the extents from a given AG in an inode fork. */ +STATIC int +xrep_rmap_scan_ifork( + struct xrep_rmap *rr, + struct xfs_inode *ip, + int whichfork) +{ + struct xrep_rmap_ifork rf = { + .accum = { .rm_owner = ip->i_ino, }, + .rr = rr, + .whichfork = whichfork, + }; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); + int error = 0; + + if (!ifp) + return 0; + + if (ifp->if_format == XFS_DINODE_FMT_BTREE) { + bool mappings_done; + + /* + * Scan the bmap btree for data device mappings. This includes + * the btree blocks themselves, even if this is a realtime + * file. + */ + error = xrep_rmap_scan_bmbt(&rf, ip, &mappings_done); + if (error || mappings_done) + return error; + } else if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) { + return 0; + } + + /* Scan incore extent cache if this isn't a realtime file. */ + if (xfs_ifork_is_realtime(ip, whichfork)) + return 0; + + return xrep_rmap_scan_iext(&rf, ifp); +} + +/* + * Take ILOCK on a file that we want to scan. + * + * Select ILOCK_EXCL if the file has an unloaded data bmbt or has an unloaded + * attr bmbt. Otherwise, take ILOCK_SHARED. + */ +static inline unsigned int +xrep_rmap_scan_ilock( + struct xfs_inode *ip) +{ + uint lock_mode = XFS_ILOCK_SHARED; + + if (xfs_need_iread_extents(&ip->i_df)) { + lock_mode = XFS_ILOCK_EXCL; + goto lock; + } + + if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af)) + lock_mode = XFS_ILOCK_EXCL; + +lock: + xfs_ilock(ip, lock_mode); + return lock_mode; +} + +/* Record reverse mappings for a file. */ +STATIC int +xrep_rmap_scan_inode( + struct xrep_rmap *rr, + struct xfs_inode *ip) +{ + unsigned int lock_mode = 0; + int error; + + /* + * Directory updates (create/link/unlink/rename) drop the directory's + * ILOCK before finishing any rmapbt updates associated with directory + * shape changes. For this scan to coordinate correctly with the live + * update hook, we must take the only lock (i_rwsem) that is held all + * the way to dir op completion. This will get fixed by the parent + * pointer patchset. + */ + if (S_ISDIR(VFS_I(ip)->i_mode)) { + lock_mode = XFS_IOLOCK_SHARED; + xfs_ilock(ip, lock_mode); + } + lock_mode |= xrep_rmap_scan_ilock(ip); + + /* Check the data fork. */ + error = xrep_rmap_scan_ifork(rr, ip, XFS_DATA_FORK); + if (error) + goto out_unlock; + + /* Check the attr fork. */ + error = xrep_rmap_scan_ifork(rr, ip, XFS_ATTR_FORK); + if (error) + goto out_unlock; + + /* COW fork extents are "owned" by the refcount btree. */ + + xchk_iscan_mark_visited(&rr->iscan, ip); +out_unlock: + xfs_iunlock(ip, lock_mode); + return error; +} + +/* Section (I): Find all AG metadata extents except for free space metadata. */ + +struct xrep_rmap_inodes { + struct xrep_rmap *rr; + struct xagb_bitmap inobt_blocks; /* INOBIT */ + struct xagb_bitmap ichunk_blocks; /* ICHUNKBIT */ +}; + +/* Record inode btree rmaps. */ +STATIC int +xrep_rmap_walk_inobt( + struct xfs_btree_cur *cur, + const union xfs_btree_rec *rec, + void *priv) +{ + struct xfs_inobt_rec_incore irec; + struct xrep_rmap_inodes *ri = priv; + struct xfs_mount *mp = cur->bc_mp; + xfs_agblock_t agbno; + xfs_extlen_t aglen; + xfs_agino_t agino; + xfs_agino_t iperhole; + unsigned int i; + int error; + + /* Record the inobt blocks. */ + error = xagb_bitmap_set_btcur_path(&ri->inobt_blocks, cur); + if (error) + return error; + + xfs_inobt_btrec_to_irec(mp, rec, &irec); + if (xfs_inobt_check_irec(cur->bc_ag.pag, &irec) != NULL) + return -EFSCORRUPTED; + + agino = irec.ir_startino; + + /* Record a non-sparse inode chunk. */ + if (!xfs_inobt_issparse(irec.ir_holemask)) { + agbno = XFS_AGINO_TO_AGBNO(mp, agino); + aglen = max_t(xfs_extlen_t, 1, + XFS_INODES_PER_CHUNK / mp->m_sb.sb_inopblock); + + return xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen); + } + + /* Iterate each chunk. */ + iperhole = max_t(xfs_agino_t, mp->m_sb.sb_inopblock, + XFS_INODES_PER_HOLEMASK_BIT); + aglen = iperhole / mp->m_sb.sb_inopblock; + for (i = 0, agino = irec.ir_startino; + i < XFS_INOBT_HOLEMASK_BITS; + i += iperhole / XFS_INODES_PER_HOLEMASK_BIT, agino += iperhole) { + /* Skip holes. */ + if (irec.ir_holemask & (1 << i)) + continue; + + /* Record the inode chunk otherwise. */ + agbno = XFS_AGINO_TO_AGBNO(mp, agino); + error = xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen); + if (error) + return error; + } + + return 0; +} + +/* Collect rmaps for the blocks containing inode btrees and the inode chunks. */ +STATIC int +xrep_rmap_find_inode_rmaps( + struct xrep_rmap *rr) +{ + struct xrep_rmap_inodes ri = { + .rr = rr, + }; + struct xfs_scrub *sc = rr->sc; + int error; + + xagb_bitmap_init(&ri.inobt_blocks); + xagb_bitmap_init(&ri.ichunk_blocks); + + /* + * Iterate every record in the inobt so we can capture all the inode + * chunks and the blocks in the inobt itself. + */ + error = xfs_btree_query_all(sc->sa.ino_cur, xrep_rmap_walk_inobt, &ri); + if (error) + goto out_bitmap; + + /* + * Note that if there are zero records in the inobt then query_all does + * nothing and we have to account the empty inobt root manually. + */ + if (xagb_bitmap_empty(&ri.ichunk_blocks)) { + struct xfs_agi *agi = sc->sa.agi_bp->b_addr; + + error = xagb_bitmap_set(&ri.inobt_blocks, + be32_to_cpu(agi->agi_root), 1); + if (error) + goto out_bitmap; + } + + /* Scan the finobt too. */ + if (xfs_has_finobt(sc->mp)) { + error = xagb_bitmap_set_btblocks(&ri.inobt_blocks, + sc->sa.fino_cur); + if (error) + goto out_bitmap; + } + + /* Generate rmaps for everything. */ + error = xrep_rmap_stash_bitmap(rr, &ri.inobt_blocks, + &XFS_RMAP_OINFO_INOBT); + if (error) + goto out_bitmap; + error = xrep_rmap_stash_bitmap(rr, &ri.ichunk_blocks, + &XFS_RMAP_OINFO_INODES); + +out_bitmap: + xagb_bitmap_destroy(&ri.inobt_blocks); + xagb_bitmap_destroy(&ri.ichunk_blocks); + return error; +} + +/* Record a CoW staging extent. */ +STATIC int +xrep_rmap_walk_cowblocks( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *irec, + void *priv) +{ + struct xagb_bitmap *bitmap = priv; + + if (!xfs_refcount_check_domain(irec) || + irec->rc_domain != XFS_REFC_DOMAIN_COW) + return -EFSCORRUPTED; + + return xagb_bitmap_set(bitmap, irec->rc_startblock, irec->rc_blockcount); +} + +/* + * Collect rmaps for the blocks containing the refcount btree, and all CoW + * staging extents. + */ +STATIC int +xrep_rmap_find_refcount_rmaps( + struct xrep_rmap *rr) +{ + struct xagb_bitmap refcountbt_blocks; /* REFCBIT */ + struct xagb_bitmap cow_blocks; /* COWBIT */ + struct xfs_refcount_irec low = { + .rc_startblock = 0, + .rc_domain = XFS_REFC_DOMAIN_COW, + }; + struct xfs_refcount_irec high = { + .rc_startblock = -1U, + .rc_domain = XFS_REFC_DOMAIN_COW, + }; + struct xfs_scrub *sc = rr->sc; + int error; + + if (!xfs_has_reflink(sc->mp)) + return 0; + + xagb_bitmap_init(&refcountbt_blocks); + xagb_bitmap_init(&cow_blocks); + + /* refcountbt */ + error = xagb_bitmap_set_btblocks(&refcountbt_blocks, sc->sa.refc_cur); + if (error) + goto out_bitmap; + + /* Collect rmaps for CoW staging extents. */ + error = xfs_refcount_query_range(sc->sa.refc_cur, &low, &high, + xrep_rmap_walk_cowblocks, &cow_blocks); + if (error) + goto out_bitmap; + + /* Generate rmaps for everything. */ + error = xrep_rmap_stash_bitmap(rr, &cow_blocks, &XFS_RMAP_OINFO_COW); + if (error) + goto out_bitmap; + error = xrep_rmap_stash_bitmap(rr, &refcountbt_blocks, + &XFS_RMAP_OINFO_REFC); + +out_bitmap: + xagb_bitmap_destroy(&cow_blocks); + xagb_bitmap_destroy(&refcountbt_blocks); + return error; +} + +/* Generate rmaps for the AG headers (AGI/AGF/AGFL) */ +STATIC int +xrep_rmap_find_agheader_rmaps( + struct xrep_rmap *rr) +{ + struct xfs_scrub *sc = rr->sc; + + /* Create a record for the AG sb->agfl. */ + return xrep_rmap_stash(rr, XFS_SB_BLOCK(sc->mp), + XFS_AGFL_BLOCK(sc->mp) - XFS_SB_BLOCK(sc->mp) + 1, + XFS_RMAP_OWN_FS, 0, 0); +} + +/* Generate rmaps for the log, if it's in this AG. */ +STATIC int +xrep_rmap_find_log_rmaps( + struct xrep_rmap *rr) +{ + struct xfs_scrub *sc = rr->sc; + + if (!xfs_ag_contains_log(sc->mp, sc->sa.pag->pag_agno)) + return 0; + + return xrep_rmap_stash(rr, + XFS_FSB_TO_AGBNO(sc->mp, sc->mp->m_sb.sb_logstart), + sc->mp->m_sb.sb_logblocks, XFS_RMAP_OWN_LOG, 0, 0); +} + +/* Check and count all the records that we gathered. */ +STATIC int +xrep_rmap_check_record( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_rmap *rr = priv; + int error; + + error = xrep_rmap_check_mapping(rr->sc, rec); + if (error) + return error; + + rr->nr_records++; + return 0; +} + +/* + * Generate all the reverse-mappings for this AG, a list of the old rmapbt + * blocks, and the new btreeblks count. Figure out if we have enough free + * space to reconstruct the inode btrees. The caller must clean up the lists + * if anything goes wrong. This implements section (I) above. + */ +STATIC int +xrep_rmap_find_rmaps( + struct xrep_rmap *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xchk_ag *sa = &sc->sa; + struct xfs_inode *ip; + struct xfs_btree_cur *mcur; + int error; + + /* Find all the per-AG metadata. */ + xrep_ag_btcur_init(sc, &sc->sa); + + error = xrep_rmap_find_inode_rmaps(rr); + if (error) + goto end_agscan; + + error = xrep_rmap_find_refcount_rmaps(rr); + if (error) + goto end_agscan; + + error = xrep_rmap_find_agheader_rmaps(rr); + if (error) + goto end_agscan; + + error = xrep_rmap_find_log_rmaps(rr); +end_agscan: + xchk_ag_btcur_free(&sc->sa); + if (error) + return error; + + /* + * Set up for a potentially lengthy filesystem scan by reducing our + * transaction resource usage for the duration. Specifically: + * + * Unlock the AG header buffers and cancel the transaction to release + * the log grant space while we scan the filesystem. + * + * Create a new empty transaction to eliminate the possibility of the + * inode scan deadlocking on cyclical metadata. + * + * We pass the empty transaction to the file scanning function to avoid + * repeatedly cycling empty transactions. This can be done even though + * we take the IOLOCK to quiesce the file because empty transactions + * do not take sb_internal. + */ + sa->agf_bp = NULL; + sa->agi_bp = NULL; + xchk_trans_cancel(sc); + error = xchk_trans_alloc_empty(sc); + if (error) + return error; + + /* Iterate all AGs for inodes rmaps. */ + while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) { + error = xrep_rmap_scan_inode(rr, ip); + xchk_irele(sc, ip); + if (error) + break; + + if (xchk_should_terminate(sc, &error)) + break; + } + xchk_iscan_iter_finish(&rr->iscan); + if (error) + return error; + + /* + * Switch out for a real transaction and lock the AG headers in + * preparation for building a new tree. + */ + xchk_trans_cancel(sc); + error = xchk_setup_fs(sc); + if (error) + return error; + error = xchk_perag_drain_and_lock(sc); + if (error) + return error; + + /* + * If a hook failed to update the in-memory btree, we lack the data to + * continue the repair. + */ + if (xchk_iscan_aborted(&rr->iscan)) + return -EFSCORRUPTED; + + /* + * Now that we have everything locked again, we need to count the + * number of rmap records stashed in the btree. This should reflect + * all actively-owned space in the filesystem. At the same time, check + * all our records before we start building a new btree, which requires + * a bnobt cursor. + */ + mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree); + sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag); + + rr->nr_records = 0; + error = xfs_rmap_query_all(mcur, xrep_rmap_check_record, rr); + + xfs_btree_del_cursor(sc->sa.bno_cur, error); + sc->sa.bno_cur = NULL; + xfs_btree_del_cursor(mcur, error); + + return error; +} + +/* Section (II): Reserving space for new rmapbt and setting free space bitmap */ + +struct xrep_rmap_agfl { + struct xagb_bitmap *bitmap; + xfs_agnumber_t agno; +}; + +/* Add an AGFL block to the rmap list. */ +STATIC int +xrep_rmap_walk_agfl( + struct xfs_mount *mp, + xfs_agblock_t agbno, + void *priv) +{ + struct xrep_rmap_agfl *ra = priv; + + return xagb_bitmap_set(ra->bitmap, agbno, 1); +} + +/* + * Run one round of reserving space for the new rmapbt and recomputing the + * number of blocks needed to store the previously observed rmapbt records and + * the ones we'll create for the free space metadata. When we don't need more + * blocks, return a bitmap of OWN_AG extents in @freesp_blocks and set @done to + * true. + */ +STATIC int +xrep_rmap_try_reserve( + struct xrep_rmap *rr, + struct xfs_btree_cur *rmap_cur, + struct xagb_bitmap *freesp_blocks, + uint64_t *blocks_reserved, + bool *done) +{ + struct xrep_rmap_agfl ra = { + .bitmap = freesp_blocks, + .agno = rr->sc->sa.pag->pag_agno, + }; + struct xfs_scrub *sc = rr->sc; + struct xrep_newbt_resv *resv, *n; + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; + struct xfs_buf *agfl_bp; + uint64_t nr_blocks; /* RMB */ + uint64_t freesp_records; + int error; + + /* + * We're going to recompute new_btree.bload.nr_blocks at the end of + * this function to reflect however many btree blocks we need to store + * all the rmap records (including the ones that reflect the changes we + * made to support the new rmapbt blocks), so we save the old value + * here so we can decide if we've reserved enough blocks. + */ + nr_blocks = rr->new_btree.bload.nr_blocks; + + /* + * Make sure we've reserved enough space for the new btree. This can + * change the shape of the free space btrees, which can cause secondary + * interactions with the rmap records because all three space btrees + * have the same rmap owner. We'll account for all that below. + */ + error = xrep_newbt_alloc_blocks(&rr->new_btree, + nr_blocks - *blocks_reserved); + if (error) + return error; + + *blocks_reserved = rr->new_btree.bload.nr_blocks; + + /* Clear everything in the bitmap. */ + xagb_bitmap_destroy(freesp_blocks); + + /* Set all the bnobt blocks in the bitmap. */ + sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag); + error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.bno_cur); + xfs_btree_del_cursor(sc->sa.bno_cur, error); + sc->sa.bno_cur = NULL; + if (error) + return error; + + /* Set all the cntbt blocks in the bitmap. */ + sc->sa.cnt_cur = xfs_cntbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag); + error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.cnt_cur); + xfs_btree_del_cursor(sc->sa.cnt_cur, error); + sc->sa.cnt_cur = NULL; + if (error) + return error; + + /* Record our new btreeblks value. */ + rr->freesp_btblocks = xagb_bitmap_hweight(freesp_blocks) - 2; + + /* Set all the new rmapbt blocks in the bitmap. */ + list_for_each_entry_safe(resv, n, &rr->new_btree.resv_list, list) { + error = xagb_bitmap_set(freesp_blocks, resv->agbno, resv->len); + if (error) + return error; + } + + /* Set all the AGFL blocks in the bitmap. */ + error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp); + if (error) + return error; + + error = xfs_agfl_walk(sc->mp, agf, agfl_bp, xrep_rmap_walk_agfl, &ra); + if (error) + return error; + + /* Count the extents in the bitmap. */ + freesp_records = xagb_bitmap_count_set_regions(freesp_blocks); + + /* Compute how many blocks we'll need for all the rmaps. */ + error = xfs_btree_bload_compute_geometry(rmap_cur, + &rr->new_btree.bload, rr->nr_records + freesp_records); + if (error) + return error; + + /* We're done when we don't need more blocks. */ + *done = nr_blocks >= rr->new_btree.bload.nr_blocks; + return 0; +} + +/* + * Iteratively reserve space for rmap btree while recording OWN_AG rmaps for + * the free space metadata. This implements section (II) above. + */ +STATIC int +xrep_rmap_reserve_space( + struct xrep_rmap *rr, + struct xfs_btree_cur *rmap_cur) +{ + struct xagb_bitmap freesp_blocks; /* AGBIT */ + uint64_t blocks_reserved = 0; + bool done = false; + int error; + + /* Compute how many blocks we'll need for the rmaps collected so far. */ + error = xfs_btree_bload_compute_geometry(rmap_cur, + &rr->new_btree.bload, rr->nr_records); + if (error) + return error; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(rr->sc, &error)) + return error; + + xagb_bitmap_init(&freesp_blocks); + + /* + * Iteratively reserve space for the new rmapbt and recompute the + * number of blocks needed to store the previously observed rmapbt + * records and the ones we'll create for the free space metadata. + * Finish when we don't need more blocks. + */ + do { + error = xrep_rmap_try_reserve(rr, rmap_cur, &freesp_blocks, + &blocks_reserved, &done); + if (error) + goto out_bitmap; + } while (!done); + + /* Emit rmaps for everything in the free space bitmap. */ + xrep_ag_btcur_init(rr->sc, &rr->sc->sa); + error = xrep_rmap_stash_bitmap(rr, &freesp_blocks, &XFS_RMAP_OINFO_AG); + xchk_ag_btcur_free(&rr->sc->sa); + +out_bitmap: + xagb_bitmap_destroy(&freesp_blocks); + return error; +} + +/* Section (III): Building the new rmap btree. */ + +/* Update the AGF counters. */ +STATIC int +xrep_rmap_reset_counters( + struct xrep_rmap *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_perag *pag = sc->sa.pag; + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; + xfs_agblock_t rmap_btblocks; + + /* + * The AGF header contains extra information related to the reverse + * mapping btree, so we must update those fields here. + */ + rmap_btblocks = rr->new_btree.afake.af_blocks - 1; + agf->agf_btreeblks = cpu_to_be32(rr->freesp_btblocks + rmap_btblocks); + xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS); + + /* + * After we commit the new btree to disk, it is possible that the + * process to reap the old btree blocks will race with the AIL trying + * to checkpoint the old btree blocks into the filesystem. If the new + * tree is shorter than the old one, the rmapbt write verifier will + * fail and the AIL will shut down the filesystem. + * + * To avoid this, save the old incore btree height values as the alt + * height values before re-initializing the perag info from the updated + * AGF to capture all the new values. + */ + pag->pagf_repair_rmap_level = pag->pagf_rmap_level; + + /* Reinitialize with the values we just logged. */ + return xrep_reinit_pagf(sc); +} + +/* Retrieve rmapbt data for bulk load. */ +STATIC int +xrep_rmap_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xrep_rmap *rr = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + int stat = 0; + + error = xfs_btree_increment(rr->mcur, 0, &stat); + if (error) + return error; + if (!stat) + return -EFSCORRUPTED; + + error = xfs_rmap_get_rec(rr->mcur, &cur->bc_rec.r, &stat); + if (error) + return error; + if (!stat) + return -EFSCORRUPTED; + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new btree blocks to the bulk loader. */ +STATIC int +xrep_rmap_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_rmap *rr = priv; + + return xrep_newbt_claim_block(cur, &rr->new_btree, ptr); +} + +/* Custom allocation function for new rmap btrees. */ +STATIC int +xrep_rmap_alloc_vextent( + struct xfs_scrub *sc, + struct xfs_alloc_arg *args, + xfs_fsblock_t alloc_hint) +{ + int error; + + /* + * We don't want an rmap update on the allocation, since we iteratively + * compute the OWN_AG records /after/ allocating blocks for the records + * that we already know we need to store. Therefore, fix the freelist + * with the NORMAP flag set so that we don't also try to create an rmap + * for new AGFL blocks. + */ + error = xrep_fix_freelist(sc, XFS_ALLOC_FLAG_NORMAP); + if (error) + return error; + + /* + * If xrep_fix_freelist fixed the freelist by moving blocks from the + * free space btrees or by removing blocks from the AGFL and queueing + * an EFI to free the block, the transaction will be dirty. This + * second case is of interest to us. + * + * Later on, we will need to compare gaps in the new recordset against + * the block usage of all OWN_AG owners in order to free the old + * btree's blocks, which means that we can't have EFIs for former AGFL + * blocks attached to the repair transaction when we commit the new + * btree. + * + * xrep_newbt_alloc_blocks guarantees this for us by calling + * xrep_defer_finish to commit anything that fix_freelist may have + * added to the transaction. + */ + return xfs_alloc_vextent_near_bno(args, alloc_hint); +} + + +/* Count the records in this btree. */ +STATIC int +xrep_rmap_count_records( + struct xfs_btree_cur *cur, + unsigned long long *nr) +{ + int running = 1; + int error; + + *nr = 0; + + error = xfs_btree_goto_left_edge(cur); + if (error) + return error; + + while (running && !(error = xfs_btree_increment(cur, 0, &running))) { + if (running) + (*nr)++; + } + + return error; +} +/* + * Use the collected rmap information to stage a new rmap btree. If this is + * successful we'll return with the new btree root information logged to the + * repair transaction but not yet committed. This implements section (III) + * above. + */ +STATIC int +xrep_rmap_build_new_tree( + struct xrep_rmap *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_perag *pag = sc->sa.pag; + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; + struct xfs_btree_cur *rmap_cur; + xfs_fsblock_t fsbno; + int error; + + /* + * Preserve the old rmapbt block count so that we can adjust the + * per-AG rmapbt reservation after we commit the new btree root and + * want to dispose of the old btree blocks. + */ + rr->old_rmapbt_fsbcount = be32_to_cpu(agf->agf_rmap_blocks); + + /* + * Prepare to construct the new btree by reserving disk space for the + * new btree and setting up all the accounting information we'll need + * to root the new btree while it's under construction and before we + * attach it to the AG header. The new blocks are accounted to the + * rmapbt per-AG reservation, which we will adjust further after + * committing the new btree. + */ + fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, XFS_RMAP_BLOCK(sc->mp)); + xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_SKIP_UPDATE, + fsbno, XFS_AG_RESV_RMAPBT); + rr->new_btree.bload.get_records = xrep_rmap_get_records; + rr->new_btree.bload.claim_block = xrep_rmap_claim_block; + rr->new_btree.alloc_vextent = xrep_rmap_alloc_vextent; + rmap_cur = xfs_rmapbt_init_cursor(sc->mp, NULL, NULL, pag); + xfs_btree_stage_afakeroot(rmap_cur, &rr->new_btree.afake); + + /* + * Initialize @rr->new_btree, reserve space for the new rmapbt, + * and compute OWN_AG rmaps. + */ + error = xrep_rmap_reserve_space(rr, rmap_cur); + if (error) + goto err_cur; + + /* + * Count the rmapbt records again, because the space reservation + * for the rmapbt itself probably added more records to the btree. + */ + rr->mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, + &rr->rmap_btree); + + error = xrep_rmap_count_records(rr->mcur, &rr->nr_records); + if (error) + goto err_mcur; + + /* + * Due to btree slack factors, it's possible for a new btree to be one + * level taller than the old btree. Update the incore btree height so + * that we don't trip the verifiers when writing the new btree blocks + * to disk. + */ + pag->pagf_repair_rmap_level = rr->new_btree.bload.btree_height; + + /* + * Move the cursor to the left edge of the tree so that the first + * increment in ->get_records positions us at the first record. + */ + error = xfs_btree_goto_left_edge(rr->mcur); + if (error) + goto err_level; + + /* Add all observed rmap records. */ + error = xfs_btree_bload(rmap_cur, &rr->new_btree.bload, rr); + if (error) + goto err_level; + + /* + * Install the new btree in the AG header. After this point the old + * btree is no longer accessible and the new tree is live. + */ + xfs_rmapbt_commit_staged_btree(rmap_cur, sc->tp, sc->sa.agf_bp); + xfs_btree_del_cursor(rmap_cur, 0); + xfs_btree_del_cursor(rr->mcur, 0); + rr->mcur = NULL; + + /* + * Now that we've written the new btree to disk, we don't need to keep + * updating the in-memory btree. Abort the scan to stop live updates. + */ + xchk_iscan_abort(&rr->iscan); + + /* + * The newly committed rmap recordset includes mappings for the blocks + * that we reserved to build the new btree. If there is excess space + * reservation to be freed, the corresponding rmap records must also be + * removed. + */ + rr->new_btree.oinfo = XFS_RMAP_OINFO_AG; + + /* Reset the AGF counters now that we've changed the btree shape. */ + error = xrep_rmap_reset_counters(rr); + if (error) + goto err_newbt; + + /* Dispose of any unused blocks and the accounting information. */ + error = xrep_newbt_commit(&rr->new_btree); + if (error) + return error; + + return xrep_roll_ag_trans(sc); + +err_level: + pag->pagf_repair_rmap_level = 0; +err_mcur: + xfs_btree_del_cursor(rr->mcur, error); +err_cur: + xfs_btree_del_cursor(rmap_cur, error); +err_newbt: + xrep_newbt_cancel(&rr->new_btree); + return error; +} + +/* Section (IV): Reaping the old btree. */ + +struct xrep_rmap_find_gaps { + struct xagb_bitmap rmap_gaps; + xfs_agblock_t next_agbno; +}; + +/* Subtract each free extent in the bnobt from the rmap gaps. */ +STATIC int +xrep_rmap_find_freesp( + struct xfs_btree_cur *cur, + const struct xfs_alloc_rec_incore *rec, + void *priv) +{ + struct xrep_rmap_find_gaps *rfg = priv; + + return xagb_bitmap_clear(&rfg->rmap_gaps, rec->ar_startblock, + rec->ar_blockcount); +} + +/* Record the free space we find, as part of cleaning out the btree. */ +STATIC int +xrep_rmap_find_gaps( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_rmap_find_gaps *rfg = priv; + int error; + + if (rec->rm_startblock > rfg->next_agbno) { + error = xagb_bitmap_set(&rfg->rmap_gaps, rfg->next_agbno, + rec->rm_startblock - rfg->next_agbno); + if (error) + return error; + } + + rfg->next_agbno = max_t(xfs_agblock_t, rfg->next_agbno, + rec->rm_startblock + rec->rm_blockcount); + return 0; +} + +/* + * Reap the old rmapbt blocks. Now that the rmapbt is fully rebuilt, we make + * a list of gaps in the rmap records and a list of the extents mentioned in + * the bnobt. Any block that's in the new rmapbt gap list but not mentioned + * in the bnobt is a block from the old rmapbt and can be removed. + */ +STATIC int +xrep_rmap_remove_old_tree( + struct xrep_rmap *rr) +{ + struct xrep_rmap_find_gaps rfg = { + .next_agbno = 0, + }; + struct xfs_scrub *sc = rr->sc; + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; + struct xfs_perag *pag = sc->sa.pag; + struct xfs_btree_cur *mcur; + xfs_agblock_t agend; + int error; + + xagb_bitmap_init(&rfg.rmap_gaps); + + /* Compute free space from the new rmapbt. */ + mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree); + + error = xfs_rmap_query_all(mcur, xrep_rmap_find_gaps, &rfg); + xfs_btree_del_cursor(mcur, error); + if (error) + goto out_bitmap; + + /* Insert a record for space between the last rmap and EOAG. */ + agend = be32_to_cpu(agf->agf_length); + if (rfg.next_agbno < agend) { + error = xagb_bitmap_set(&rfg.rmap_gaps, rfg.next_agbno, + agend - rfg.next_agbno); + if (error) + goto out_bitmap; + } + + /* Compute free space from the existing bnobt. */ + sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag); + error = xfs_alloc_query_all(sc->sa.bno_cur, xrep_rmap_find_freesp, + &rfg); + xfs_btree_del_cursor(sc->sa.bno_cur, error); + sc->sa.bno_cur = NULL; + if (error) + goto out_bitmap; + + /* + * Free the "free" blocks that the new rmapbt knows about but the bnobt + * doesn't--these are the old rmapbt blocks. Credit the old rmapbt + * block usage count back to the per-AG rmapbt reservation (and not + * fdblocks, since the rmap btree lives in free space) to keep the + * reservation and free space accounting correct. + */ + error = xrep_reap_agblocks(sc, &rfg.rmap_gaps, + &XFS_RMAP_OINFO_ANY_OWNER, XFS_AG_RESV_RMAPBT); + if (error) + goto out_bitmap; + + /* + * Now that we've zapped all the old rmapbt blocks we can turn off + * the alternate height mechanism and reset the per-AG space + * reservation. + */ + pag->pagf_repair_rmap_level = 0; + sc->flags |= XREP_RESET_PERAG_RESV; +out_bitmap: + xagb_bitmap_destroy(&rfg.rmap_gaps); + return error; +} + +static inline bool +xrep_rmapbt_want_live_update( + struct xchk_iscan *iscan, + const struct xfs_owner_info *oi) +{ + if (xchk_iscan_aborted(iscan)) + return false; + + /* + * Before unlocking the AG header to perform the inode scan, we + * recorded reverse mappings for all AG metadata except for the OWN_AG + * metadata. IOWs, the in-memory btree knows about the AG headers, the + * two inode btrees, the CoW staging extents, and the refcount btrees. + * For these types of metadata, we need to record the live updates in + * the in-memory rmap btree. + * + * However, we do not scan the free space btrees or the AGFL until we + * have re-locked the AGF and are ready to reserve space for the new + * rmap btree, so we do not want live updates for OWN_AG metadata. + */ + if (XFS_RMAP_NON_INODE_OWNER(oi->oi_owner)) + return oi->oi_owner != XFS_RMAP_OWN_AG; + + /* Ignore updates to files that the scanner hasn't visited yet. */ + return xchk_iscan_want_live_update(iscan, oi->oi_owner); +} + +/* + * Apply a rmapbt update from the regular filesystem into our shadow btree. + * We're running from the thread that owns the AGF buffer and is generating + * the update, so we must be careful about which parts of the struct xrep_rmap + * that we change. + */ +static int +xrep_rmapbt_live_update( + struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct xfs_rmap_update_params *p = data; + struct xrep_rmap *rr; + struct xfs_mount *mp; + struct xfs_btree_cur *mcur; + struct xfs_trans *tp; + void *txcookie; + int error; + + rr = container_of(nb, struct xrep_rmap, rhook.rmap_hook.nb); + mp = rr->sc->mp; + + if (!xrep_rmapbt_want_live_update(&rr->iscan, &p->oinfo)) + goto out_unlock; + + trace_xrep_rmap_live_update(mp, rr->sc->sa.pag->pag_agno, action, p); + + error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp); + if (error) + goto out_abort; + + mutex_lock(&rr->lock); + mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, tp, &rr->rmap_btree); + error = __xfs_rmap_finish_intent(mcur, action, p->startblock, + p->blockcount, &p->oinfo, p->unwritten); + xfs_btree_del_cursor(mcur, error); + if (error) + goto out_cancel; + + error = xfbtree_trans_commit(&rr->rmap_btree, tp); + if (error) + goto out_cancel; + + xrep_trans_cancel_hook_dummy(&txcookie, tp); + mutex_unlock(&rr->lock); + return NOTIFY_DONE; + +out_cancel: + xfbtree_trans_cancel(&rr->rmap_btree, tp); + xrep_trans_cancel_hook_dummy(&txcookie, tp); +out_abort: + mutex_unlock(&rr->lock); + xchk_iscan_abort(&rr->iscan); +out_unlock: + return NOTIFY_DONE; +} + +/* Set up the filesystem scan components. */ +STATIC int +xrep_rmap_setup_scan( + struct xrep_rmap *rr) +{ + struct xfs_scrub *sc = rr->sc; + int error; + + mutex_init(&rr->lock); + + /* Set up in-memory rmap btree */ + error = xfs_rmapbt_mem_init(sc->mp, &rr->rmap_btree, sc->xmbtp, + sc->sa.pag->pag_agno); + if (error) + goto out_mutex; + + /* Retry iget every tenth of a second for up to 30 seconds. */ + xchk_iscan_start(sc, 30000, 100, &rr->iscan); + + /* + * Hook into live rmap operations so that we can update our in-memory + * btree to reflect live changes on the filesystem. Since we drop the + * AGF buffer to scan all the inodes, we need this piece to avoid + * installing a stale btree. + */ + ASSERT(sc->flags & XCHK_FSGATES_RMAP); + xfs_rmap_hook_setup(&rr->rhook, xrep_rmapbt_live_update); + error = xfs_rmap_hook_add(sc->sa.pag, &rr->rhook); + if (error) + goto out_iscan; + return 0; + +out_iscan: + xchk_iscan_teardown(&rr->iscan); + xfbtree_destroy(&rr->rmap_btree); +out_mutex: + mutex_destroy(&rr->lock); + return error; +} + +/* Tear down scan components. */ +STATIC void +xrep_rmap_teardown( + struct xrep_rmap *rr) +{ + struct xfs_scrub *sc = rr->sc; + + xchk_iscan_abort(&rr->iscan); + xfs_rmap_hook_del(sc->sa.pag, &rr->rhook); + xchk_iscan_teardown(&rr->iscan); + xfbtree_destroy(&rr->rmap_btree); + mutex_destroy(&rr->lock); +} + +/* Repair the rmap btree for some AG. */ +int +xrep_rmapbt( + struct xfs_scrub *sc) +{ + struct xrep_rmap *rr = sc->buf; + int error; + + error = xrep_rmap_setup_scan(rr); + if (error) + return error; + + /* + * Collect rmaps for everything in this AG that isn't space metadata. + * These rmaps won't change even as we try to allocate blocks. + */ + error = xrep_rmap_find_rmaps(rr); + if (error) + goto out_records; + + /* Rebuild the rmap information. */ + error = xrep_rmap_build_new_tree(rr); + if (error) + goto out_records; + + /* Kill the old tree. */ + error = xrep_rmap_remove_old_tree(rr); + if (error) + goto out_records; + +out_records: + xrep_rmap_teardown(rr); + return error; +} diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c index b1ff4f3332..5055092bd9 100644 --- a/fs/xfs/scrub/rtsummary.c +++ b/fs/xfs/scrub/rtsummary.c @@ -119,7 +119,7 @@ xfsum_load( xfs_rtsumoff_t sumoff, union xfs_suminfo_raw *rawinfo) { - return xfile_obj_load(sc->xfile, rawinfo, + return xfile_load(sc->xfile, rawinfo, sizeof(union xfs_suminfo_raw), sumoff << XFS_WORDLOG); } @@ -130,7 +130,7 @@ xfsum_store( xfs_rtsumoff_t sumoff, const union xfs_suminfo_raw rawinfo) { - return xfile_obj_store(sc->xfile, &rawinfo, + return xfile_store(sc->xfile, &rawinfo, sizeof(union xfs_suminfo_raw), sumoff << XFS_WORDLOG); } @@ -142,7 +142,7 @@ xfsum_copyout( union xfs_suminfo_raw *rawinfo, unsigned int nr_words) { - return xfile_obj_load(sc->xfile, rawinfo, nr_words << XFS_WORDLOG, + return xfile_load(sc->xfile, rawinfo, nr_words << XFS_WORDLOG, sumoff << XFS_WORDLOG); } diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index caf324c2b9..20fac9723c 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -15,6 +15,8 @@ #include "xfs_quota.h" #include "xfs_qm.h" #include "xfs_scrub.h" +#include "xfs_buf_mem.h" +#include "xfs_rmap.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -157,6 +159,15 @@ xchk_fsgates_disable( if (sc->flags & XCHK_FSGATES_DRAIN) xfs_drain_wait_disable(); + if (sc->flags & XCHK_FSGATES_QUOTA) + xfs_dqtrx_hook_disable(); + + if (sc->flags & XCHK_FSGATES_DIRENTS) + xfs_dir_hook_disable(); + + if (sc->flags & XCHK_FSGATES_RMAP) + xfs_rmap_hook_disable(); + sc->flags &= ~XCHK_FSGATES_ALL; } @@ -184,6 +195,10 @@ xchk_teardown( sc->flags &= ~XCHK_HAVE_FREEZE_PROT; mnt_drop_write_file(sc->file); } + if (sc->xmbtp) { + xmbuf_free(sc->xmbtp); + sc->xmbtp = NULL; + } if (sc->xfile) { xfile_destroy(sc->xfile); sc->xfile = NULL; @@ -267,7 +282,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .setup = xchk_setup_ag_rmapbt, .scrub = xchk_rmapbt, .has = xfs_has_rmapbt, - .repair = xrep_notsupported, + .repair = xrep_rmapbt, }, [XFS_SCRUB_TYPE_REFCNTBT] = { /* refcountbt */ .type = ST_PERAG, @@ -358,7 +373,25 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .type = ST_FS, .setup = xchk_setup_fscounters, .scrub = xchk_fscounters, - .repair = xrep_notsupported, + .repair = xrep_fscounters, + }, + [XFS_SCRUB_TYPE_QUOTACHECK] = { /* quota counters */ + .type = ST_FS, + .setup = xchk_setup_quotacheck, + .scrub = xchk_quotacheck, + .repair = xrep_quotacheck, + }, + [XFS_SCRUB_TYPE_NLINKS] = { /* inode link counts */ + .type = ST_FS, + .setup = xchk_setup_nlinks, + .scrub = xchk_nlinks, + .repair = xrep_nlinks, + }, + [XFS_SCRUB_TYPE_HEALTHY] = { /* fs healthy; clean all reminders */ + .type = ST_FS, + .setup = xchk_setup_fs, + .scrub = xchk_health_record, + .repair = xrep_notsupported, }, }; diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 7fc50654c4..9ad65b604f 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -99,6 +99,9 @@ struct xfs_scrub { /* xfile used by the scrubbers; freed at teardown. */ struct xfile *xfile; + /* buffer target for in-memory btrees; also freed at teardown. */ + struct xfs_buftarg *xmbtp; + /* Lock flags for @ip. */ uint ilock_flags; @@ -121,6 +124,9 @@ struct xfs_scrub { #define XCHK_HAVE_FREEZE_PROT (1U << 1) /* do we have freeze protection? */ #define XCHK_FSGATES_DRAIN (1U << 2) /* defer ops draining enabled */ #define XCHK_NEED_DRAIN (1U << 3) /* scrub needs to drain defer ops */ +#define XCHK_FSGATES_QUOTA (1U << 4) /* quota live update enabled */ +#define XCHK_FSGATES_DIRENTS (1U << 5) /* directory live update enabled */ +#define XCHK_FSGATES_RMAP (1U << 6) /* rmapbt live update enabled */ #define XREP_RESET_PERAG_RESV (1U << 30) /* must reset AG space reservation */ #define XREP_ALREADY_FIXED (1U << 31) /* checking our repair work */ @@ -130,7 +136,10 @@ struct xfs_scrub { * features are gated off via dynamic code patching, which is why the state * must be enabled during scrub setup and can only be torn down afterwards. */ -#define XCHK_FSGATES_ALL (XCHK_FSGATES_DRAIN) +#define XCHK_FSGATES_ALL (XCHK_FSGATES_DRAIN | \ + XCHK_FSGATES_QUOTA | \ + XCHK_FSGATES_DIRENTS | \ + XCHK_FSGATES_RMAP) /* Metadata scrubbers */ int xchk_tester(struct xfs_scrub *sc); @@ -167,14 +176,21 @@ xchk_rtsummary(struct xfs_scrub *sc) #endif #ifdef CONFIG_XFS_QUOTA int xchk_quota(struct xfs_scrub *sc); +int xchk_quotacheck(struct xfs_scrub *sc); #else static inline int xchk_quota(struct xfs_scrub *sc) { return -ENOENT; } +static inline int +xchk_quotacheck(struct xfs_scrub *sc) +{ + return -ENOENT; +} #endif int xchk_fscounters(struct xfs_scrub *sc); +int xchk_nlinks(struct xfs_scrub *sc); /* cross-referencing helpers */ void xchk_xref_is_used_space(struct xfs_scrub *sc, xfs_agblock_t agbno, diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c index cd91db4a55..42cafbed94 100644 --- a/fs/xfs/scrub/stats.c +++ b/fs/xfs/scrub/stats.c @@ -77,6 +77,8 @@ static const char *name_map[XFS_SCRUB_TYPE_NR] = { [XFS_SCRUB_TYPE_GQUOTA] = "grpquota", [XFS_SCRUB_TYPE_PQUOTA] = "prjquota", [XFS_SCRUB_TYPE_FSCOUNTERS] = "fscounters", + [XFS_SCRUB_TYPE_QUOTACHECK] = "quotacheck", + [XFS_SCRUB_TYPE_NLINKS] = "nlinks", }; /* Format the scrub stats into a text buffer, similar to pcp style. */ @@ -329,9 +331,9 @@ xchk_stats_register( if (!cs->cs_debugfs) return; - debugfs_create_file("stats", 0644, cs->cs_debugfs, cs, + debugfs_create_file("stats", 0444, cs->cs_debugfs, cs, &scrub_stats_fops); - debugfs_create_file("clear_stats", 0400, cs->cs_debugfs, cs, + debugfs_create_file("clear_stats", 0200, cs->cs_debugfs, cs, &clear_scrub_stats_fops); } diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c index ddff86713d..d77d8a9598 100644 --- a/fs/xfs/scrub/symlink.c +++ b/fs/xfs/scrub/symlink.c @@ -13,6 +13,7 @@ #include "xfs_inode.h" #include "xfs_symlink.h" #include "xfs_health.h" +#include "xfs_symlink_remote.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/health.h" @@ -67,7 +68,7 @@ xchk_symlink( } /* Remote symlink; must read the contents. */ - error = xfs_readlink_bmap_ilocked(sc->ip, sc->buf); + error = xfs_symlink_remote_read(sc->ip, sc->buf); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) return error; if (strnlen(sc->buf, XFS_SYMLINK_MAXLEN) < len) diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index d0e24ffaf7..3dd281d6d1 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -16,10 +16,16 @@ #include "xfs_rtbitmap.h" #include "xfs_quota.h" #include "xfs_quota_defs.h" +#include "xfs_da_format.h" +#include "xfs_dir2.h" +#include "xfs_rmap.h" #include "scrub/scrub.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" #include "scrub/quota.h" +#include "scrub/iscan.h" +#include "scrub/nlinks.h" +#include "scrub/fscounters.h" /* Figure out which block the btree cursor was pointing to. */ static inline xfs_fsblock_t @@ -32,7 +38,7 @@ xchk_btree_cur_fsbno( xfs_buf_daddr(cur->bc_levels[level].bp)); if (level == cur->bc_nlevels - 1 && - (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)) + cur->bc_ops->type == XFS_BTREE_TYPE_INODE) return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_ino.ip->i_ino); return NULLFSBLOCK; diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 6bbb4e8639..5b294be52c 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -15,11 +15,17 @@ #include <linux/tracepoint.h> #include "xfs_bit.h" +#include "xfs_quota_defs.h" +struct xfs_scrub; struct xfile; struct xfarray; struct xfarray_sortinfo; struct xchk_dqiter; +struct xchk_iscan; +struct xchk_nlink; +struct xchk_fscounters; +struct xfs_rmap_update_params; /* * ftrace's __print_symbolic requires that all enum values be wrapped in the @@ -27,14 +33,6 @@ struct xchk_dqiter; * ring buffer. Somehow this was only worth mentioning in the ftrace sample * code. */ -TRACE_DEFINE_ENUM(XFS_BTNUM_BNOi); -TRACE_DEFINE_ENUM(XFS_BTNUM_CNTi); -TRACE_DEFINE_ENUM(XFS_BTNUM_BMAPi); -TRACE_DEFINE_ENUM(XFS_BTNUM_INOi); -TRACE_DEFINE_ENUM(XFS_BTNUM_FINOi); -TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi); -TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi); - TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED); TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW); @@ -63,6 +61,9 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_UQUOTA); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_GQUOTA); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PQUOTA); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_QUOTACHECK); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_NLINKS); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_HEALTHY); #define XFS_SCRUB_TYPE_STRINGS \ { XFS_SCRUB_TYPE_PROBE, "probe" }, \ @@ -89,7 +90,10 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); { XFS_SCRUB_TYPE_UQUOTA, "usrquota" }, \ { XFS_SCRUB_TYPE_GQUOTA, "grpquota" }, \ { XFS_SCRUB_TYPE_PQUOTA, "prjquota" }, \ - { XFS_SCRUB_TYPE_FSCOUNTERS, "fscounters" } + { XFS_SCRUB_TYPE_FSCOUNTERS, "fscounters" }, \ + { XFS_SCRUB_TYPE_QUOTACHECK, "quotacheck" }, \ + { XFS_SCRUB_TYPE_NLINKS, "nlinks" }, \ + { XFS_SCRUB_TYPE_HEALTHY, "healthy" } #define XFS_SCRUB_FLAG_STRINGS \ { XFS_SCRUB_IFLAG_REPAIR, "repair" }, \ @@ -107,9 +111,21 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); { XCHK_HAVE_FREEZE_PROT, "nofreeze" }, \ { XCHK_FSGATES_DRAIN, "fsgates_drain" }, \ { XCHK_NEED_DRAIN, "need_drain" }, \ + { XCHK_FSGATES_QUOTA, "fsgates_quota" }, \ + { XCHK_FSGATES_DIRENTS, "fsgates_dirents" }, \ + { XCHK_FSGATES_RMAP, "fsgates_rmap" }, \ { XREP_RESET_PERAG_RESV, "reset_perag_resv" }, \ { XREP_ALREADY_FIXED, "already_fixed" } +TRACE_DEFINE_ENUM(XFS_RMAP_MAP); +TRACE_DEFINE_ENUM(XFS_RMAP_MAP_SHARED); +TRACE_DEFINE_ENUM(XFS_RMAP_UNMAP); +TRACE_DEFINE_ENUM(XFS_RMAP_UNMAP_SHARED); +TRACE_DEFINE_ENUM(XFS_RMAP_CONVERT); +TRACE_DEFINE_ENUM(XFS_RMAP_CONVERT_SHARED); +TRACE_DEFINE_ENUM(XFS_RMAP_ALLOC); +TRACE_DEFINE_ENUM(XFS_RMAP_FREE); + DECLARE_EVENT_CLASS(xchk_class, TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm, int error), @@ -395,6 +411,29 @@ DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_revalidate_bmap); DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_advance_bmap); DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_advance_incore); DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter); + +TRACE_EVENT(xchk_qcheck_error, + TP_PROTO(struct xfs_scrub *sc, xfs_dqtype_t dqtype, xfs_dqid_t id, + void *ret_ip), + TP_ARGS(sc, dqtype, id, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_dqtype_t, dqtype) + __field(xfs_dqid_t, id) + __field(void *, ret_ip) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->dqtype = dqtype; + __entry->id = id; + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d dquot type %s id 0x%x ret_ip %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->dqtype, XFS_DQTYPE_STRINGS), + __entry->id, + __entry->ret_ip) +); #endif /* CONFIG_XFS_QUOTA */ TRACE_EVENT(xchk_incomplete, @@ -423,7 +462,7 @@ TRACE_EVENT(xchk_btree_op_error, TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, type) - __field(xfs_btnum_t, btnum) + __string(name, cur->bc_ops->name) __field(int, level) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, bno) @@ -436,7 +475,7 @@ TRACE_EVENT(xchk_btree_op_error, __entry->dev = sc->mp->m_super->s_dev; __entry->type = sc->sm->sm_type; - __entry->btnum = cur->bc_btnum; + __assign_str(name, cur->bc_ops->name); __entry->level = level; __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); @@ -444,10 +483,10 @@ TRACE_EVENT(xchk_btree_op_error, __entry->error = error; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS", + TP_printk("dev %d:%d type %s %sbt level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __get_str(name), __entry->level, __entry->ptr, __entry->agno, @@ -465,7 +504,7 @@ TRACE_EVENT(xchk_ifork_btree_op_error, __field(xfs_ino_t, ino) __field(int, whichfork) __field(unsigned int, type) - __field(xfs_btnum_t, btnum) + __string(name, cur->bc_ops->name) __field(int, level) __field(int, ptr) __field(xfs_agnumber_t, agno) @@ -479,7 +518,7 @@ TRACE_EVENT(xchk_ifork_btree_op_error, __entry->ino = sc->ip->i_ino; __entry->whichfork = cur->bc_ino.whichfork; __entry->type = sc->sm->sm_type; - __entry->btnum = cur->bc_btnum; + __assign_str(name, cur->bc_ops->name); __entry->level = level; __entry->ptr = cur->bc_levels[level].ptr; __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); @@ -487,12 +526,12 @@ TRACE_EVENT(xchk_ifork_btree_op_error, __entry->error = error; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d ino 0x%llx fork %s type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS", + TP_printk("dev %d:%d ino 0x%llx fork %s type %s %sbt level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __get_str(name), __entry->level, __entry->ptr, __entry->agno, @@ -508,7 +547,7 @@ TRACE_EVENT(xchk_btree_error, TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, type) - __field(xfs_btnum_t, btnum) + __string(name, cur->bc_ops->name) __field(int, level) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, bno) @@ -519,17 +558,17 @@ TRACE_EVENT(xchk_btree_error, xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level); __entry->dev = sc->mp->m_super->s_dev; __entry->type = sc->sm->sm_type; - __entry->btnum = cur->bc_btnum; + __assign_str(name, cur->bc_ops->name); __entry->level = level; __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); __entry->ptr = cur->bc_levels[level].ptr; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS", + TP_printk("dev %d:%d type %s %sbt level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __get_str(name), __entry->level, __entry->ptr, __entry->agno, @@ -546,7 +585,7 @@ TRACE_EVENT(xchk_ifork_btree_error, __field(xfs_ino_t, ino) __field(int, whichfork) __field(unsigned int, type) - __field(xfs_btnum_t, btnum) + __string(name, cur->bc_ops->name) __field(int, level) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, bno) @@ -559,19 +598,19 @@ TRACE_EVENT(xchk_ifork_btree_error, __entry->ino = sc->ip->i_ino; __entry->whichfork = cur->bc_ino.whichfork; __entry->type = sc->sm->sm_type; - __entry->btnum = cur->bc_btnum; + __assign_str(name, cur->bc_ops->name); __entry->level = level; __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); __entry->ptr = cur->bc_levels[level].ptr; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d ino 0x%llx fork %s type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS", + TP_printk("dev %d:%d ino 0x%llx fork %s type %s %sbt level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __get_str(name), __entry->level, __entry->ptr, __entry->agno, @@ -586,7 +625,7 @@ DECLARE_EVENT_CLASS(xchk_sbtree_class, TP_STRUCT__entry( __field(dev_t, dev) __field(int, type) - __field(xfs_btnum_t, btnum) + __string(name, cur->bc_ops->name) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, bno) __field(int, level) @@ -598,17 +637,17 @@ DECLARE_EVENT_CLASS(xchk_sbtree_class, __entry->dev = sc->mp->m_super->s_dev; __entry->type = sc->sm->sm_type; - __entry->btnum = cur->bc_btnum; + __assign_str(name, cur->bc_ops->name); __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); __entry->level = level; __entry->nlevels = cur->bc_nlevels; __entry->ptr = cur->bc_levels[level].ptr; ), - TP_printk("dev %d:%d type %s btree %s agno 0x%x agbno 0x%x level %d nlevels %d ptr %d", + TP_printk("dev %d:%d type %s %sbt agno 0x%x agbno 0x%x level %d nlevels %d ptr %d", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __get_str(name), __entry->agno, __entry->bno, __entry->level, @@ -861,18 +900,11 @@ TRACE_EVENT(xfile_destroy, __field(loff_t, size) ), TP_fast_assign( - struct xfile_stat statbuf; - int ret; + struct inode *inode = file_inode(xf->file); - ret = xfile_stat(xf, &statbuf); - if (!ret) { - __entry->bytes = statbuf.bytes; - __entry->size = statbuf.size; - } else { - __entry->bytes = -1; - __entry->size = -1; - } - __entry->ino = file_inode(xf->file)->i_ino; + __entry->ino = inode->i_ino; + __entry->bytes = inode->i_blocks << SECTOR_SHIFT; + __entry->size = i_size_read(inode); ), TP_printk("xfino 0x%lx mem_bytes 0x%llx isize 0x%llx", __entry->ino, @@ -891,19 +923,12 @@ DECLARE_EVENT_CLASS(xfile_class, __field(unsigned long long, bytecount) ), TP_fast_assign( - struct xfile_stat statbuf; - int ret; + struct inode *inode = file_inode(xf->file); - ret = xfile_stat(xf, &statbuf); - if (!ret) { - __entry->bytes_used = statbuf.bytes; - __entry->size = statbuf.size; - } else { - __entry->bytes_used = -1; - __entry->size = -1; - } - __entry->ino = file_inode(xf->file)->i_ino; + __entry->ino = inode->i_ino; + __entry->bytes_used = inode->i_blocks << SECTOR_SHIFT; __entry->pos = pos; + __entry->size = i_size_read(inode); __entry->bytecount = bytecount; ), TP_printk("xfino 0x%lx mem_bytes 0x%llx pos 0x%llx bytecount 0x%llx isize 0x%llx", @@ -917,11 +942,11 @@ DECLARE_EVENT_CLASS(xfile_class, DEFINE_EVENT(xfile_class, name, \ TP_PROTO(struct xfile *xf, loff_t pos, unsigned long long bytecount), \ TP_ARGS(xf, pos, bytecount)) -DEFINE_XFILE_EVENT(xfile_pread); -DEFINE_XFILE_EVENT(xfile_pwrite); +DEFINE_XFILE_EVENT(xfile_load); +DEFINE_XFILE_EVENT(xfile_store); DEFINE_XFILE_EVENT(xfile_seek_data); -DEFINE_XFILE_EVENT(xfile_get_page); -DEFINE_XFILE_EVENT(xfile_put_page); +DEFINE_XFILE_EVENT(xfile_get_folio); +DEFINE_XFILE_EVENT(xfile_put_folio); TRACE_EVENT(xfarray_create, TP_PROTO(struct xfarray *xfa, unsigned long long required_capacity), @@ -968,7 +993,7 @@ TRACE_EVENT(xfarray_isort, __entry->hi - __entry->lo) ); -TRACE_EVENT(xfarray_pagesort, +TRACE_EVENT(xfarray_foliosort, TP_PROTO(struct xfarray_sortinfo *si, uint64_t lo, uint64_t hi), TP_ARGS(si, lo, hi), TP_STRUCT__entry( @@ -1039,6 +1064,47 @@ TRACE_EVENT(xfarray_sort, __entry->bytes) ); +TRACE_EVENT(xfarray_sort_scan, + TP_PROTO(struct xfarray_sortinfo *si, unsigned long long idx), + TP_ARGS(si, idx), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long long, nr) + __field(size_t, obj_size) + __field(unsigned long long, idx) + __field(unsigned long long, folio_pos) + __field(unsigned long, folio_bytes) + __field(unsigned long long, first_idx) + __field(unsigned long long, last_idx) + ), + TP_fast_assign( + __entry->nr = si->array->nr; + __entry->obj_size = si->array->obj_size; + __entry->ino = file_inode(si->array->xfile->file)->i_ino; + __entry->idx = idx; + if (si->folio) { + __entry->folio_pos = folio_pos(si->folio); + __entry->folio_bytes = folio_size(si->folio); + __entry->first_idx = si->first_folio_idx; + __entry->last_idx = si->last_folio_idx; + } else { + __entry->folio_pos = 0; + __entry->folio_bytes = 0; + __entry->first_idx = 0; + __entry->last_idx = 0; + } + ), + TP_printk("xfino 0x%lx nr %llu objsz %zu idx %llu folio_pos 0x%llx folio_bytes 0x%lx first_idx %llu last_idx %llu", + __entry->ino, + __entry->nr, + __entry->obj_size, + __entry->idx, + __entry->folio_pos, + __entry->folio_bytes, + __entry->first_idx, + __entry->last_idx) +); + TRACE_EVENT(xfarray_sort_stats, TP_PROTO(struct xfarray_sortinfo *si, int error), TP_ARGS(si, error), @@ -1119,6 +1185,323 @@ TRACE_EVENT(xchk_rtsum_record_free, ); #endif /* CONFIG_XFS_RT */ +DECLARE_EVENT_CLASS(xchk_iscan_class, + TP_PROTO(struct xchk_iscan *iscan), + TP_ARGS(iscan), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, cursor) + __field(xfs_ino_t, visited) + ), + TP_fast_assign( + __entry->dev = iscan->sc->mp->m_super->s_dev; + __entry->cursor = iscan->cursor_ino; + __entry->visited = iscan->__visited_ino; + ), + TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->cursor, + __entry->visited) +) +#define DEFINE_ISCAN_EVENT(name) \ +DEFINE_EVENT(xchk_iscan_class, name, \ + TP_PROTO(struct xchk_iscan *iscan), \ + TP_ARGS(iscan)) +DEFINE_ISCAN_EVENT(xchk_iscan_move_cursor); +DEFINE_ISCAN_EVENT(xchk_iscan_visit); +DEFINE_ISCAN_EVENT(xchk_iscan_skip); +DEFINE_ISCAN_EVENT(xchk_iscan_advance_ag); + +DECLARE_EVENT_CLASS(xchk_iscan_ino_class, + TP_PROTO(struct xchk_iscan *iscan, xfs_ino_t ino), + TP_ARGS(iscan, ino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, startino) + __field(xfs_ino_t, cursor) + __field(xfs_ino_t, visited) + __field(xfs_ino_t, ino) + ), + TP_fast_assign( + __entry->dev = iscan->sc->mp->m_super->s_dev; + __entry->startino = iscan->scan_start_ino; + __entry->cursor = iscan->cursor_ino; + __entry->visited = iscan->__visited_ino; + __entry->ino = ino; + ), + TP_printk("dev %d:%d iscan start 0x%llx cursor 0x%llx visited 0x%llx ino 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->startino, + __entry->cursor, + __entry->visited, + __entry->ino) +) +#define DEFINE_ISCAN_INO_EVENT(name) \ +DEFINE_EVENT(xchk_iscan_ino_class, name, \ + TP_PROTO(struct xchk_iscan *iscan, xfs_ino_t ino), \ + TP_ARGS(iscan, ino)) +DEFINE_ISCAN_INO_EVENT(xchk_iscan_want_live_update); +DEFINE_ISCAN_INO_EVENT(xchk_iscan_start); + +TRACE_EVENT(xchk_iscan_iget, + TP_PROTO(struct xchk_iscan *iscan, int error), + TP_ARGS(iscan, error), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, cursor) + __field(xfs_ino_t, visited) + __field(int, error) + ), + TP_fast_assign( + __entry->dev = iscan->sc->mp->m_super->s_dev; + __entry->cursor = iscan->cursor_ino; + __entry->visited = iscan->__visited_ino; + __entry->error = error; + ), + TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx error %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->cursor, + __entry->visited, + __entry->error) +); + +TRACE_EVENT(xchk_iscan_iget_batch, + TP_PROTO(struct xfs_mount *mp, struct xchk_iscan *iscan, + unsigned int nr, unsigned int avail), + TP_ARGS(mp, iscan, nr, avail), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, cursor) + __field(xfs_ino_t, visited) + __field(unsigned int, nr) + __field(unsigned int, avail) + __field(unsigned int, unavail) + __field(xfs_ino_t, batch_ino) + __field(unsigned long long, skipmask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->cursor = iscan->cursor_ino; + __entry->visited = iscan->__visited_ino; + __entry->nr = nr; + __entry->avail = avail; + __entry->unavail = hweight64(iscan->__skipped_inomask); + __entry->batch_ino = iscan->__batch_ino; + __entry->skipmask = iscan->__skipped_inomask; + ), + TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx batchino 0x%llx skipmask 0x%llx nr %u avail %u unavail %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->cursor, + __entry->visited, + __entry->batch_ino, + __entry->skipmask, + __entry->nr, + __entry->avail, + __entry->unavail) +); + +TRACE_EVENT(xchk_iscan_iget_retry_wait, + TP_PROTO(struct xchk_iscan *iscan), + TP_ARGS(iscan), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, cursor) + __field(xfs_ino_t, visited) + __field(unsigned int, retry_delay) + __field(unsigned long, remaining) + __field(unsigned int, iget_timeout) + ), + TP_fast_assign( + __entry->dev = iscan->sc->mp->m_super->s_dev; + __entry->cursor = iscan->cursor_ino; + __entry->visited = iscan->__visited_ino; + __entry->retry_delay = iscan->iget_retry_delay; + __entry->remaining = jiffies_to_msecs(iscan->__iget_deadline - jiffies); + __entry->iget_timeout = iscan->iget_timeout; + ), + TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx remaining %lu timeout %u delay %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->cursor, + __entry->visited, + __entry->remaining, + __entry->iget_timeout, + __entry->retry_delay) +); + +TRACE_EVENT(xchk_nlinks_collect_dirent, + TP_PROTO(struct xfs_mount *mp, struct xfs_inode *dp, + xfs_ino_t ino, const struct xfs_name *name), + TP_ARGS(mp, dp, ino, name), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dir) + __field(xfs_ino_t, ino) + __field(unsigned int, namelen) + __dynamic_array(char, name, name->len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->dir = dp->i_ino; + __entry->ino = ino; + __entry->namelen = name->len; + memcpy(__get_str(name), name->name, name->len); + ), + TP_printk("dev %d:%d dir 0x%llx -> ino 0x%llx name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dir, + __entry->ino, + __entry->namelen, + __get_str(name)) +); + +TRACE_EVENT(xchk_nlinks_collect_metafile, + TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino), + TP_ARGS(mp, ino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ino = ino; + ), + TP_printk("dev %d:%d ino 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino) +); + +TRACE_EVENT(xchk_nlinks_live_update, + TP_PROTO(struct xfs_mount *mp, const struct xfs_inode *dp, + int action, xfs_ino_t ino, int delta, + const char *name, unsigned int namelen), + TP_ARGS(mp, dp, action, ino, delta, name, namelen), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dir) + __field(int, action) + __field(xfs_ino_t, ino) + __field(int, delta) + __field(unsigned int, namelen) + __dynamic_array(char, name, namelen) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->dir = dp ? dp->i_ino : NULLFSINO; + __entry->action = action; + __entry->ino = ino; + __entry->delta = delta; + __entry->namelen = namelen; + memcpy(__get_str(name), name, namelen); + ), + TP_printk("dev %d:%d dir 0x%llx ino 0x%llx nlink_delta %d name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dir, + __entry->ino, + __entry->delta, + __entry->namelen, + __get_str(name)) +); + +TRACE_EVENT(xchk_nlinks_check_zero, + TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino, + const struct xchk_nlink *live), + TP_ARGS(mp, ino, live), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_nlink_t, parents) + __field(xfs_nlink_t, backrefs) + __field(xfs_nlink_t, children) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ino = ino; + __entry->parents = live->parents; + __entry->backrefs = live->backrefs; + __entry->children = live->children; + ), + TP_printk("dev %d:%d ino 0x%llx parents %u backrefs %u children %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parents, + __entry->backrefs, + __entry->children) +); + +TRACE_EVENT(xchk_nlinks_update_incore, + TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino, + const struct xchk_nlink *live, int parents_delta, + int backrefs_delta, int children_delta), + TP_ARGS(mp, ino, live, parents_delta, backrefs_delta, children_delta), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_nlink_t, parents) + __field(xfs_nlink_t, backrefs) + __field(xfs_nlink_t, children) + __field(int, parents_delta) + __field(int, backrefs_delta) + __field(int, children_delta) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ino = ino; + __entry->parents = live->parents; + __entry->backrefs = live->backrefs; + __entry->children = live->children; + __entry->parents_delta = parents_delta; + __entry->backrefs_delta = backrefs_delta; + __entry->children_delta = children_delta; + ), + TP_printk("dev %d:%d ino 0x%llx parents %d:%u backrefs %d:%u children %d:%u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parents_delta, + __entry->parents, + __entry->backrefs_delta, + __entry->backrefs, + __entry->children_delta, + __entry->children) +); + +DECLARE_EVENT_CLASS(xchk_nlinks_diff_class, + TP_PROTO(struct xfs_mount *mp, struct xfs_inode *ip, + const struct xchk_nlink *live), + TP_ARGS(mp, ip, live), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(uint8_t, ftype) + __field(xfs_nlink_t, nlink) + __field(xfs_nlink_t, parents) + __field(xfs_nlink_t, backrefs) + __field(xfs_nlink_t, children) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->ftype = xfs_mode_to_ftype(VFS_I(ip)->i_mode); + __entry->nlink = VFS_I(ip)->i_nlink; + __entry->parents = live->parents; + __entry->backrefs = live->backrefs; + __entry->children = live->children; + ), + TP_printk("dev %d:%d ino 0x%llx ftype %s nlink %u parents %u backrefs %u children %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR), + __entry->nlink, + __entry->parents, + __entry->backrefs, + __entry->children) +); +#define DEFINE_SCRUB_NLINKS_DIFF_EVENT(name) \ +DEFINE_EVENT(xchk_nlinks_diff_class, name, \ + TP_PROTO(struct xfs_mount *mp, struct xfs_inode *ip, \ + const struct xchk_nlink *live), \ + TP_ARGS(mp, ip, live)) +DEFINE_SCRUB_NLINKS_DIFF_EVENT(xchk_nlinks_compare_inode); + /* repair tracepoints */ #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) @@ -1223,7 +1606,6 @@ DEFINE_EVENT(xrep_rmap_class, name, \ uint64_t owner, uint64_t offset, unsigned int flags), \ TP_ARGS(mp, agno, agbno, len, owner, offset, flags)) DEFINE_REPAIR_RMAP_EVENT(xrep_ibt_walk_rmap); -DEFINE_REPAIR_RMAP_EVENT(xrep_rmap_extent_fn); DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_walk_rmap); TRACE_EVENT(xrep_abt_found, @@ -1341,6 +1723,38 @@ TRACE_EVENT(xrep_bmap_found, __entry->state) ); +TRACE_EVENT(xrep_rmap_found, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + const struct xfs_rmap_irec *rec), + TP_ARGS(mp, agno, rec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(uint64_t, owner) + __field(uint64_t, offset) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = rec->rm_startblock; + __entry->len = rec->rm_blockcount; + __entry->owner = rec->rm_owner; + __entry->offset = rec->rm_offset; + __entry->flags = rec->rm_flags; + ), + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len, + __entry->owner, + __entry->offset, + __entry->flags) +); + TRACE_EVENT(xrep_findroot_block, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, uint32_t magic, uint16_t level), @@ -1425,16 +1839,28 @@ TRACE_EVENT(xrep_calc_ag_resblks_btsize, __entry->refcbt_sz) ) TRACE_EVENT(xrep_reset_counters, - TP_PROTO(struct xfs_mount *mp), - TP_ARGS(mp), + TP_PROTO(struct xfs_mount *mp, struct xchk_fscounters *fsc), + TP_ARGS(mp, fsc), TP_STRUCT__entry( __field(dev_t, dev) + __field(uint64_t, icount) + __field(uint64_t, ifree) + __field(uint64_t, fdblocks) + __field(uint64_t, frextents) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; + __entry->icount = fsc->icount; + __entry->ifree = fsc->ifree; + __entry->fdblocks = fsc->fdblocks; + __entry->frextents = fsc->frextents; ), - TP_printk("dev %d:%d", - MAJOR(__entry->dev), MINOR(__entry->dev)) + TP_printk("dev %d:%d icount %llu ifree %llu fdblocks %llu frextents %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->icount, + __entry->ifree, + __entry->fdblocks, + __entry->frextents) ) DECLARE_EVENT_CLASS(xrep_newbt_extent_class, @@ -1645,6 +2071,55 @@ TRACE_EVENT(xrep_dinode_count_rmaps, __entry->attr_extents) ); +TRACE_EVENT(xrep_dinode_findmode_dirent, + TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *dp, + unsigned int ftype), + TP_ARGS(sc, dp, ftype), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_ino_t, parent_ino) + __field(unsigned int, ftype) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->sm->sm_ino; + __entry->parent_ino = dp->i_ino; + __entry->ftype = ftype; + ), + TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx ftype '%s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parent_ino, + __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR)) +); + +TRACE_EVENT(xrep_dinode_findmode_dirent_inval, + TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *dp, + unsigned int ftype, unsigned int found_ftype), + TP_ARGS(sc, dp, ftype, found_ftype), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_ino_t, parent_ino) + __field(unsigned int, ftype) + __field(unsigned int, found_ftype) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->sm->sm_ino; + __entry->parent_ino = dp->i_ino; + __entry->ftype = ftype; + __entry->found_ftype = found_ftype; + ), + TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx ftype '%s' found_ftype '%s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parent_ino, + __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR), + __print_symbolic(__entry->found_ftype, XFS_DIR3_FTYPE_STR)) +); + TRACE_EVENT(xrep_cow_mark_file_range, TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t startblock, xfs_fileoff_t startoff, xfs_filblks_t blockcount), @@ -1756,8 +2231,48 @@ DEFINE_EVENT(xrep_dquot_class, name, \ DEFINE_XREP_DQUOT_EVENT(xrep_dquot_item); DEFINE_XREP_DQUOT_EVENT(xrep_disk_dquot); DEFINE_XREP_DQUOT_EVENT(xrep_dquot_item_fill_bmap_hole); +DEFINE_XREP_DQUOT_EVENT(xrep_quotacheck_dquot); #endif /* CONFIG_XFS_QUOTA */ +DEFINE_SCRUB_NLINKS_DIFF_EVENT(xrep_nlinks_update_inode); +DEFINE_SCRUB_NLINKS_DIFF_EVENT(xrep_nlinks_unfixable_inode); + +TRACE_EVENT(xrep_rmap_live_update, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int op, + const struct xfs_rmap_update_params *p), + TP_ARGS(mp, agno, op, p), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(unsigned int, op) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(uint64_t, owner) + __field(uint64_t, offset) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->op = op; + __entry->agbno = p->startblock; + __entry->len = p->blockcount; + xfs_owner_info_unpack(&p->oinfo, &__entry->owner, + &__entry->offset, &__entry->flags); + if (p->unwritten) + __entry->flags |= XFS_RMAP_UNWRITTEN; + ), + TP_printk("dev %d:%d agno 0x%x op %d agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->op, + __entry->agbno, + __entry->len, + __entry->owner, + __entry->offset, + __entry->flags) +); + #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */ #endif /* _TRACE_XFS_SCRUB_TRACE_H */ diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c index f0f532c10a..17c982a482 100644 --- a/fs/xfs/scrub/xfarray.c +++ b/fs/xfs/scrub/xfarray.c @@ -16,7 +16,7 @@ * Large Arrays of Fixed-Size Records * ================================== * - * This memory array uses an xfile (which itself is a memfd "file") to store + * This memory array uses an xfile (which itself is a shmem file) to store * large numbers of fixed-size records in memory that can be paged out. This * puts less stress on the memory reclaim algorithms during an online repair * because we don't have to pin so much memory. However, array access is less @@ -136,7 +136,7 @@ xfarray_load( if (idx >= array->nr) return -ENODATA; - return xfile_obj_load(array->xfile, ptr, array->obj_size, + return xfile_load(array->xfile, ptr, array->obj_size, xfarray_pos(array, idx)); } @@ -152,7 +152,7 @@ xfarray_is_unset( if (array->unset_slots == 0) return false; - error = xfile_obj_load(array->xfile, temp, array->obj_size, pos); + error = xfile_load(array->xfile, temp, array->obj_size, pos); if (!error && xfarray_element_is_null(array, temp)) return true; @@ -184,7 +184,7 @@ xfarray_unset( return 0; memset(temp, 0, array->obj_size); - error = xfile_obj_store(array->xfile, temp, array->obj_size, pos); + error = xfile_store(array->xfile, temp, array->obj_size, pos); if (error) return error; @@ -209,7 +209,7 @@ xfarray_store( ASSERT(!xfarray_element_is_null(array, ptr)); - ret = xfile_obj_store(array->xfile, ptr, array->obj_size, + ret = xfile_store(array->xfile, ptr, array->obj_size, xfarray_pos(array, idx)); if (ret) return ret; @@ -245,12 +245,12 @@ xfarray_store_anywhere( for (pos = 0; pos < endpos && array->unset_slots > 0; pos += array->obj_size) { - error = xfile_obj_load(array->xfile, temp, array->obj_size, + error = xfile_load(array->xfile, temp, array->obj_size, pos); if (error || !xfarray_element_is_null(array, temp)) continue; - error = xfile_obj_store(array->xfile, ptr, array->obj_size, + error = xfile_store(array->xfile, ptr, array->obj_size, pos); if (error) return error; @@ -552,7 +552,7 @@ xfarray_isort( trace_xfarray_isort(si, lo, hi); xfarray_sort_bump_loads(si); - error = xfile_obj_load(si->array->xfile, scratch, len, lo_pos); + error = xfile_load(si->array->xfile, scratch, len, lo_pos); if (error) return error; @@ -560,88 +560,45 @@ xfarray_isort( sort(scratch, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL); xfarray_sort_bump_stores(si); - return xfile_obj_store(si->array->xfile, scratch, len, lo_pos); + return xfile_store(si->array->xfile, scratch, len, lo_pos); } -/* Grab a page for sorting records. */ -static inline int -xfarray_sort_get_page( - struct xfarray_sortinfo *si, - loff_t pos, - uint64_t len) -{ - int error; - - error = xfile_get_page(si->array->xfile, pos, len, &si->xfpage); - if (error) - return error; - - /* - * xfile pages must never be mapped into userspace, so we skip the - * dcache flush when mapping the page. - */ - si->page_kaddr = kmap_local_page(si->xfpage.page); - return 0; -} - -/* Release a page we grabbed for sorting records. */ -static inline int -xfarray_sort_put_page( - struct xfarray_sortinfo *si) -{ - if (!si->page_kaddr) - return 0; - - kunmap_local(si->page_kaddr); - si->page_kaddr = NULL; - - return xfile_put_page(si->array->xfile, &si->xfpage); -} - -/* Decide if these records are eligible for in-page sorting. */ -static inline bool -xfarray_want_pagesort( - struct xfarray_sortinfo *si, - xfarray_idx_t lo, - xfarray_idx_t hi) -{ - pgoff_t lo_page; - pgoff_t hi_page; - loff_t end_pos; - - /* We can only map one page at a time. */ - lo_page = xfarray_pos(si->array, lo) >> PAGE_SHIFT; - end_pos = xfarray_pos(si->array, hi) + si->array->obj_size - 1; - hi_page = end_pos >> PAGE_SHIFT; - - return lo_page == hi_page; -} - -/* Sort a bunch of records that all live in the same memory page. */ +/* + * Sort the records from lo to hi (inclusive) if they are all backed by the + * same memory folio. Returns 1 if it sorted, 0 if it did not, or a negative + * errno. + */ STATIC int -xfarray_pagesort( +xfarray_foliosort( struct xfarray_sortinfo *si, xfarray_idx_t lo, xfarray_idx_t hi) { + struct folio *folio; void *startp; loff_t lo_pos = xfarray_pos(si->array, lo); - uint64_t len = xfarray_pos(si->array, hi - lo); - int error = 0; + uint64_t len = xfarray_pos(si->array, hi - lo + 1); - trace_xfarray_pagesort(si, lo, hi); + /* No single folio could back this many records. */ + if (len > XFILE_MAX_FOLIO_SIZE) + return 0; xfarray_sort_bump_loads(si); - error = xfarray_sort_get_page(si, lo_pos, len); - if (error) - return error; + folio = xfile_get_folio(si->array->xfile, lo_pos, len, XFILE_ALLOC); + if (IS_ERR(folio)) + return PTR_ERR(folio); + if (!folio) + return 0; + + trace_xfarray_foliosort(si, lo, hi); xfarray_sort_bump_heapsorts(si); - startp = si->page_kaddr + offset_in_page(lo_pos); + startp = folio_address(folio) + offset_in_folio(folio, lo_pos); sort(startp, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL); xfarray_sort_bump_stores(si); - return xfarray_sort_put_page(si); + xfile_put_folio(si->array->xfile, folio); + return 1; } /* Return a pointer to the xfarray pivot record within the sortinfo struct. */ @@ -829,63 +786,78 @@ xfarray_qsort_push( return 0; } +static inline void +xfarray_sort_scan_done( + struct xfarray_sortinfo *si) +{ + if (si->folio) + xfile_put_folio(si->array->xfile, si->folio); + si->folio = NULL; +} + /* - * Load an element from the array into the first scratchpad and cache the page, - * if possible. + * Cache the folio backing the start of the given array element. If the array + * element is contained entirely within the folio, return a pointer to the + * cached folio. Otherwise, load the element into the scratchpad and return a + * pointer to the scratchpad. */ static inline int -xfarray_sort_load_cached( +xfarray_sort_scan( struct xfarray_sortinfo *si, xfarray_idx_t idx, - void *ptr) + void **ptrp) { loff_t idx_pos = xfarray_pos(si->array, idx); - pgoff_t startpage; - pgoff_t endpage; int error = 0; - /* - * If this load would split a page, release the cached page, if any, - * and perform a traditional read. - */ - startpage = idx_pos >> PAGE_SHIFT; - endpage = (idx_pos + si->array->obj_size - 1) >> PAGE_SHIFT; - if (startpage != endpage) { - error = xfarray_sort_put_page(si); - if (error) - return error; + if (xfarray_sort_terminated(si, &error)) + return error; - if (xfarray_sort_terminated(si, &error)) - return error; + trace_xfarray_sort_scan(si, idx); - return xfile_obj_load(si->array->xfile, ptr, - si->array->obj_size, idx_pos); - } + /* If the cached folio doesn't cover this index, release it. */ + if (si->folio && + (idx < si->first_folio_idx || idx > si->last_folio_idx)) + xfarray_sort_scan_done(si); - /* If the cached page is not the one we want, release it. */ - if (xfile_page_cached(&si->xfpage) && - xfile_page_index(&si->xfpage) != startpage) { - error = xfarray_sort_put_page(si); - if (error) - return error; + /* Grab the first folio that backs this array element. */ + if (!si->folio) { + loff_t next_pos; + + si->folio = xfile_get_folio(si->array->xfile, idx_pos, + si->array->obj_size, XFILE_ALLOC); + if (IS_ERR(si->folio)) + return PTR_ERR(si->folio); + + si->first_folio_idx = xfarray_idx(si->array, + folio_pos(si->folio) + si->array->obj_size - 1); + + next_pos = folio_pos(si->folio) + folio_size(si->folio); + si->last_folio_idx = xfarray_idx(si->array, next_pos - 1); + if (xfarray_pos(si->array, si->last_folio_idx + 1) > next_pos) + si->last_folio_idx--; + + trace_xfarray_sort_scan(si, idx); } /* - * If we don't have a cached page (and we know the load is contained - * in a single page) then grab it. + * If this folio still doesn't cover the desired element, it must cross + * a folio boundary. Read into the scratchpad and we're done. */ - if (!xfile_page_cached(&si->xfpage)) { - if (xfarray_sort_terminated(si, &error)) - return error; + if (idx < si->first_folio_idx || idx > si->last_folio_idx) { + void *temp = xfarray_scratch(si->array); - error = xfarray_sort_get_page(si, startpage << PAGE_SHIFT, - PAGE_SIZE); + error = xfile_load(si->array->xfile, temp, si->array->obj_size, + idx_pos); if (error) return error; + + *ptrp = temp; + return 0; } - memcpy(ptr, si->page_kaddr + offset_in_page(idx_pos), - si->array->obj_size); + /* Otherwise return a pointer to the array element in the folio. */ + *ptrp = folio_address(si->folio) + offset_in_folio(si->folio, idx_pos); return 0; } @@ -952,6 +924,8 @@ xfarray_sort( pivot = xfarray_sortinfo_pivot(si); while (si->stack_depth >= 0) { + int ret; + lo = si_lo[si->stack_depth]; hi = si_hi[si->stack_depth]; @@ -964,13 +938,13 @@ xfarray_sort( } /* - * If directly mapping the page and sorting can solve our + * If directly mapping the folio and sorting can solve our * problems, we're done. */ - if (xfarray_want_pagesort(si, lo, hi)) { - error = xfarray_pagesort(si, lo, hi); - if (error) - goto out_free; + ret = xfarray_foliosort(si, lo, hi); + if (ret < 0) + goto out_free; + if (ret == 1) { si->stack_depth--; continue; } @@ -995,25 +969,24 @@ xfarray_sort( * than the pivot is on the right side of the range. */ while (lo < hi) { + void *p; + /* * Decrement hi until it finds an a[hi] less than the * pivot value. */ - error = xfarray_sort_load_cached(si, hi, scratch); + error = xfarray_sort_scan(si, hi, &p); if (error) goto out_free; - while (xfarray_sort_cmp(si, scratch, pivot) >= 0 && - lo < hi) { + while (xfarray_sort_cmp(si, p, pivot) >= 0 && lo < hi) { hi--; - error = xfarray_sort_load_cached(si, hi, - scratch); + error = xfarray_sort_scan(si, hi, &p); if (error) goto out_free; } - error = xfarray_sort_put_page(si); - if (error) - goto out_free; - + if (p != scratch) + memcpy(scratch, p, si->array->obj_size); + xfarray_sort_scan_done(si); if (xfarray_sort_terminated(si, &error)) goto out_free; @@ -1028,21 +1001,18 @@ xfarray_sort( * Increment lo until it finds an a[lo] greater than * the pivot value. */ - error = xfarray_sort_load_cached(si, lo, scratch); + error = xfarray_sort_scan(si, lo, &p); if (error) goto out_free; - while (xfarray_sort_cmp(si, scratch, pivot) <= 0 && - lo < hi) { + while (xfarray_sort_cmp(si, p, pivot) <= 0 && lo < hi) { lo++; - error = xfarray_sort_load_cached(si, lo, - scratch); + error = xfarray_sort_scan(si, lo, &p); if (error) goto out_free; } - error = xfarray_sort_put_page(si); - if (error) - goto out_free; - + if (p != scratch) + memcpy(scratch, p, si->array->obj_size); + xfarray_sort_scan_done(si); if (xfarray_sort_terminated(si, &error)) goto out_free; diff --git a/fs/xfs/scrub/xfarray.h b/fs/xfs/scrub/xfarray.h index 62b9c506fd..acb2f94c56 100644 --- a/fs/xfs/scrub/xfarray.h +++ b/fs/xfs/scrub/xfarray.h @@ -45,6 +45,25 @@ int xfarray_store(struct xfarray *array, xfarray_idx_t idx, const void *ptr); int xfarray_store_anywhere(struct xfarray *array, const void *ptr); bool xfarray_element_is_null(struct xfarray *array, const void *ptr); +/* + * Load an array element, but zero the buffer if there's no data because we + * haven't stored to that array element yet. + */ +static inline int +xfarray_load_sparse( + struct xfarray *array, + uint64_t idx, + void *rec) +{ + int error = xfarray_load(array, idx, rec); + + if (error == -ENODATA) { + memset(rec, 0, array->obj_size); + return 0; + } + return error; +} + /* Append an element to the array. */ static inline int xfarray_append(struct xfarray *array, const void *ptr) { @@ -105,9 +124,14 @@ struct xfarray_sortinfo { /* XFARRAY_SORT_* flags; see below. */ unsigned int flags; - /* Cache a page here for faster access. */ - struct xfile_page xfpage; - void *page_kaddr; + /* Cache a folio here for faster scanning for pivots */ + struct folio *folio; + + /* First array index in folio that is completely readable */ + xfarray_idx_t first_folio_idx; + + /* Last array index in folio that is completely readable */ + xfarray_idx_t last_folio_idx; #ifdef DEBUG /* Performance statistics. */ diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c index 090c3ead43..8cdd863db5 100644 --- a/fs/xfs/scrub/xfile.c +++ b/fs/xfs/scrub/xfile.c @@ -34,13 +34,6 @@ * xfiles assume that the caller will handle all required concurrency * management; standard vfs locks (freezer and inode) are not taken. Reads * and writes are satisfied directly from the page cache. - * - * NOTE: The current shmemfs implementation has a quirk that in-kernel reads - * of a hole cause a page to be mapped into the file. If you are going to - * create a sparse xfile, please be careful about reading from uninitialized - * parts of the file. These pages are !Uptodate and will eventually be - * reclaimed if not written, but in the short term this boosts memory - * consumption. */ /* @@ -62,38 +55,27 @@ xfile_create( { struct inode *inode; struct xfile *xf; - int error = -ENOMEM; + int error; xf = kmalloc(sizeof(struct xfile), XCHK_GFP_FLAGS); if (!xf) return -ENOMEM; - xf->file = shmem_file_setup(description, isize, 0); - if (!xf->file) - goto out_xfile; + xf->file = shmem_kernel_file_setup(description, isize, VM_NORESERVE); if (IS_ERR(xf->file)) { error = PTR_ERR(xf->file); goto out_xfile; } - /* - * We want a large sparse file that we can pread, pwrite, and seek. - * xfile users are responsible for keeping the xfile hidden away from - * all other callers, so we skip timestamp updates and security checks. - * Make the inode only accessible by root, just in case the xfile ever - * escapes. - */ - xf->file->f_mode |= FMODE_PREAD | FMODE_PWRITE | FMODE_NOCMTIME | - FMODE_LSEEK; - xf->file->f_flags |= O_RDWR | O_LARGEFILE | O_NOATIME; inode = file_inode(xf->file); - inode->i_flags |= S_PRIVATE | S_NOCMTIME | S_NOATIME; - inode->i_mode &= ~0177; - inode->i_uid = GLOBAL_ROOT_UID; - inode->i_gid = GLOBAL_ROOT_GID; - lockdep_set_class(&inode->i_rwsem, &xfile_i_mutex_key); + /* + * We don't want to bother with kmapping data during repair, so don't + * allow highmem pages to back this mapping. + */ + mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL); + trace_xfile_create(xf); *xfilep = xf; @@ -118,164 +100,128 @@ xfile_destroy( } /* - * Read a memory object directly from the xfile's page cache. Unlike regular - * pread, we return -E2BIG and -EFBIG for reads that are too large or at too - * high an offset, instead of truncating the read. Otherwise, we return - * bytes read or an error code, like regular pread. + * Load an object. Since we're treating this file as "memory", any error or + * short IO is treated as a failure to allocate memory. */ -ssize_t -xfile_pread( +int +xfile_load( struct xfile *xf, void *buf, size_t count, loff_t pos) { struct inode *inode = file_inode(xf->file); - struct address_space *mapping = inode->i_mapping; - struct page *page = NULL; - ssize_t read = 0; unsigned int pflags; - int error = 0; if (count > MAX_RW_COUNT) - return -E2BIG; + return -ENOMEM; if (inode->i_sb->s_maxbytes - pos < count) - return -EFBIG; + return -ENOMEM; - trace_xfile_pread(xf, pos, count); + trace_xfile_load(xf, pos, count); pflags = memalloc_nofs_save(); while (count > 0) { - void *p, *kaddr; + struct folio *folio; unsigned int len; + unsigned int offset; - len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos)); - - /* - * In-kernel reads of a shmem file cause it to allocate a page - * if the mapping shows a hole. Therefore, if we hit ENOMEM - * we can continue by zeroing the caller's buffer. - */ - page = shmem_read_mapping_page_gfp(mapping, pos >> PAGE_SHIFT, - __GFP_NOWARN); - if (IS_ERR(page)) { - error = PTR_ERR(page); - if (error != -ENOMEM) - break; - - memset(buf, 0, len); - goto advance; - } - - if (PageUptodate(page)) { + if (shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio, + SGP_READ) < 0) + break; + if (!folio) { /* - * xfile pages must never be mapped into userspace, so - * we skip the dcache flush. + * No data stored at this offset, just zero the output + * buffer until the next page boundary. */ - kaddr = kmap_local_page(page); - p = kaddr + offset_in_page(pos); - memcpy(buf, p, len); - kunmap_local(kaddr); - } else { + len = min_t(ssize_t, count, + PAGE_SIZE - offset_in_page(pos)); memset(buf, 0, len); - } - put_page(page); + } else { + if (filemap_check_wb_err(inode->i_mapping, 0)) { + folio_unlock(folio); + folio_put(folio); + break; + } + + offset = offset_in_folio(folio, pos); + len = min_t(ssize_t, count, folio_size(folio) - offset); + memcpy(buf, folio_address(folio) + offset, len); -advance: + folio_unlock(folio); + folio_put(folio); + } count -= len; pos += len; buf += len; - read += len; } memalloc_nofs_restore(pflags); - if (read > 0) - return read; - return error; + if (count) + return -ENOMEM; + return 0; } /* - * Write a memory object directly to the xfile's page cache. Unlike regular - * pwrite, we return -E2BIG and -EFBIG for writes that are too large or at too - * high an offset, instead of truncating the write. Otherwise, we return - * bytes written or an error code, like regular pwrite. + * Store an object. Since we're treating this file as "memory", any error or + * short IO is treated as a failure to allocate memory. */ -ssize_t -xfile_pwrite( +int +xfile_store( struct xfile *xf, const void *buf, size_t count, loff_t pos) { struct inode *inode = file_inode(xf->file); - struct address_space *mapping = inode->i_mapping; - const struct address_space_operations *aops = mapping->a_ops; - struct page *page = NULL; - ssize_t written = 0; unsigned int pflags; - int error = 0; if (count > MAX_RW_COUNT) - return -E2BIG; + return -ENOMEM; if (inode->i_sb->s_maxbytes - pos < count) - return -EFBIG; + return -ENOMEM; - trace_xfile_pwrite(xf, pos, count); + trace_xfile_store(xf, pos, count); + + /* + * Increase the file size first so that shmem_get_folio(..., SGP_CACHE), + * actually allocates a folio instead of erroring out. + */ + if (pos + count > i_size_read(inode)) + i_size_write(inode, pos + count); pflags = memalloc_nofs_save(); while (count > 0) { - void *fsdata = NULL; - void *p, *kaddr; + struct folio *folio; unsigned int len; - int ret; - - len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos)); - - /* - * We call write_begin directly here to avoid all the freezer - * protection lock-taking that happens in the normal path. - * shmem doesn't support fs freeze, but lockdep doesn't know - * that and will trip over that. - */ - error = aops->write_begin(NULL, mapping, pos, len, &page, - &fsdata); - if (error) - break; + unsigned int offset; - /* - * xfile pages must never be mapped into userspace, so we skip - * the dcache flush. If the page is not uptodate, zero it - * before writing data. - */ - kaddr = kmap_local_page(page); - if (!PageUptodate(page)) { - memset(kaddr, 0, PAGE_SIZE); - SetPageUptodate(page); - } - p = kaddr + offset_in_page(pos); - memcpy(p, buf, len); - kunmap_local(kaddr); - - ret = aops->write_end(NULL, mapping, pos, len, len, page, - fsdata); - if (ret < 0) { - error = ret; + if (shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio, + SGP_CACHE) < 0) + break; + if (filemap_check_wb_err(inode->i_mapping, 0)) { + folio_unlock(folio); + folio_put(folio); break; } - written += ret; - if (ret != len) - break; + offset = offset_in_folio(folio, pos); + len = min_t(ssize_t, count, folio_size(folio) - offset); + memcpy(folio_address(folio) + offset, buf, len); + + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); - count -= ret; - pos += ret; - buf += ret; + count -= len; + pos += len; + buf += len; } memalloc_nofs_restore(pflags); - if (written > 0) - return written; - return error; + if (count) + return -ENOMEM; + return 0; } /* Find the next written area in the xfile data for a given offset. */ @@ -291,129 +237,76 @@ xfile_seek_data( return ret; } -/* Query stat information for an xfile. */ -int -xfile_stat( - struct xfile *xf, - struct xfile_stat *statbuf) -{ - struct kstat ks; - int error; - - error = vfs_getattr_nosec(&xf->file->f_path, &ks, - STATX_SIZE | STATX_BLOCKS, AT_STATX_DONT_SYNC); - if (error) - return error; - - statbuf->size = ks.size; - statbuf->bytes = ks.blocks << SECTOR_SHIFT; - return 0; -} - /* - * Grab the (locked) page for a memory object. The object cannot span a page - * boundary. Returns 0 (and a locked page) if successful, -ENOTBLK if we - * cannot grab the page, or the usual negative errno. + * Grab the (locked) folio for a memory object. The object cannot span a folio + * boundary. Returns the locked folio if successful, NULL if there was no + * folio or it didn't cover the range requested, or an ERR_PTR on failure. */ -int -xfile_get_page( +struct folio * +xfile_get_folio( struct xfile *xf, loff_t pos, - unsigned int len, - struct xfile_page *xfpage) + size_t len, + unsigned int flags) { struct inode *inode = file_inode(xf->file); - struct address_space *mapping = inode->i_mapping; - const struct address_space_operations *aops = mapping->a_ops; - struct page *page = NULL; - void *fsdata = NULL; - loff_t key = round_down(pos, PAGE_SIZE); + struct folio *folio = NULL; unsigned int pflags; int error; if (inode->i_sb->s_maxbytes - pos < len) - return -ENOMEM; - if (len > PAGE_SIZE - offset_in_page(pos)) - return -ENOTBLK; - - trace_xfile_get_page(xf, pos, len); + return ERR_PTR(-ENOMEM); - pflags = memalloc_nofs_save(); + trace_xfile_get_folio(xf, pos, len); /* - * We call write_begin directly here to avoid all the freezer - * protection lock-taking that happens in the normal path. shmem - * doesn't support fs freeze, but lockdep doesn't know that and will - * trip over that. + * Increase the file size first so that shmem_get_folio(..., SGP_CACHE), + * actually allocates a folio instead of erroring out. */ - error = aops->write_begin(NULL, mapping, key, PAGE_SIZE, &page, - &fsdata); + if ((flags & XFILE_ALLOC) && pos + len > i_size_read(inode)) + i_size_write(inode, pos + len); + + pflags = memalloc_nofs_save(); + error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio, + (flags & XFILE_ALLOC) ? SGP_CACHE : SGP_READ); + memalloc_nofs_restore(pflags); if (error) - goto out_pflags; + return ERR_PTR(error); - /* We got the page, so make sure we push out EOF. */ - if (i_size_read(inode) < pos + len) - i_size_write(inode, pos + len); + if (!folio) + return NULL; - /* - * If the page isn't up to date, fill it with zeroes before we hand it - * to the caller and make sure the backing store will hold on to them. - */ - if (!PageUptodate(page)) { - void *kaddr; + if (len > folio_size(folio) - offset_in_folio(folio, pos)) { + folio_unlock(folio); + folio_put(folio); + return NULL; + } - kaddr = kmap_local_page(page); - memset(kaddr, 0, PAGE_SIZE); - kunmap_local(kaddr); - SetPageUptodate(page); + if (filemap_check_wb_err(inode->i_mapping, 0)) { + folio_unlock(folio); + folio_put(folio); + return ERR_PTR(-EIO); } /* - * Mark each page dirty so that the contents are written to some - * backing store when we drop this buffer, and take an extra reference - * to prevent the xfile page from being swapped or removed from the - * page cache by reclaim if the caller unlocks the page. + * Mark the folio dirty so that it won't be reclaimed once we drop the + * (potentially last) reference in xfile_put_folio. */ - set_page_dirty(page); - get_page(page); - - xfpage->page = page; - xfpage->fsdata = fsdata; - xfpage->pos = key; -out_pflags: - memalloc_nofs_restore(pflags); - return error; + if (flags & XFILE_ALLOC) + folio_set_dirty(folio); + return folio; } /* - * Release the (locked) page for a memory object. Returns 0 or a negative - * errno. + * Release the (locked) folio for a memory object. */ -int -xfile_put_page( +void +xfile_put_folio( struct xfile *xf, - struct xfile_page *xfpage) + struct folio *folio) { - struct inode *inode = file_inode(xf->file); - struct address_space *mapping = inode->i_mapping; - const struct address_space_operations *aops = mapping->a_ops; - unsigned int pflags; - int ret; - - trace_xfile_put_page(xf, xfpage->pos, PAGE_SIZE); - - /* Give back the reference that we took in xfile_get_page. */ - put_page(xfpage->page); + trace_xfile_put_folio(xf, folio_pos(folio), folio_size(folio)); - pflags = memalloc_nofs_save(); - ret = aops->write_end(NULL, mapping, xfpage->pos, PAGE_SIZE, PAGE_SIZE, - xfpage->page, xfpage->fsdata); - memalloc_nofs_restore(pflags); - memset(xfpage, 0, sizeof(struct xfile_page)); - - if (ret < 0) - return ret; - if (ret != PAGE_SIZE) - return -EIO; - return 0; + folio_unlock(folio); + folio_put(folio); } diff --git a/fs/xfs/scrub/xfile.h b/fs/xfs/scrub/xfile.h index d56643b0f4..76d78dba7e 100644 --- a/fs/xfs/scrub/xfile.h +++ b/fs/xfs/scrub/xfile.h @@ -6,22 +6,6 @@ #ifndef __XFS_SCRUB_XFILE_H__ #define __XFS_SCRUB_XFILE_H__ -struct xfile_page { - struct page *page; - void *fsdata; - loff_t pos; -}; - -static inline bool xfile_page_cached(const struct xfile_page *xfpage) -{ - return xfpage->page != NULL; -} - -static inline pgoff_t xfile_page_index(const struct xfile_page *xfpage) -{ - return xfpage->page->index; -} - struct xfile { struct file *file; }; @@ -29,49 +13,17 @@ struct xfile { int xfile_create(const char *description, loff_t isize, struct xfile **xfilep); void xfile_destroy(struct xfile *xf); -ssize_t xfile_pread(struct xfile *xf, void *buf, size_t count, loff_t pos); -ssize_t xfile_pwrite(struct xfile *xf, const void *buf, size_t count, +int xfile_load(struct xfile *xf, void *buf, size_t count, loff_t pos); +int xfile_store(struct xfile *xf, const void *buf, size_t count, loff_t pos); -/* - * Load an object. Since we're treating this file as "memory", any error or - * short IO is treated as a failure to allocate memory. - */ -static inline int -xfile_obj_load(struct xfile *xf, void *buf, size_t count, loff_t pos) -{ - ssize_t ret = xfile_pread(xf, buf, count, pos); - - if (ret < 0 || ret != count) - return -ENOMEM; - return 0; -} - -/* - * Store an object. Since we're treating this file as "memory", any error or - * short IO is treated as a failure to allocate memory. - */ -static inline int -xfile_obj_store(struct xfile *xf, const void *buf, size_t count, loff_t pos) -{ - ssize_t ret = xfile_pwrite(xf, buf, count, pos); - - if (ret < 0 || ret != count) - return -ENOMEM; - return 0; -} - loff_t xfile_seek_data(struct xfile *xf, loff_t pos); -struct xfile_stat { - loff_t size; - unsigned long long bytes; -}; - -int xfile_stat(struct xfile *xf, struct xfile_stat *statbuf); +#define XFILE_MAX_FOLIO_SIZE (PAGE_SIZE << MAX_PAGECACHE_ORDER) -int xfile_get_page(struct xfile *xf, loff_t offset, unsigned int len, - struct xfile_page *xbuf); -int xfile_put_page(struct xfile *xf, struct xfile_page *xbuf); +#define XFILE_ALLOC (1 << 0) /* allocate folio if not present */ +struct folio *xfile_get_folio(struct xfile *xf, loff_t offset, size_t len, + unsigned int flags); +void xfile_put_folio(struct xfile *xf, struct folio *folio); #endif /* __XFS_SCRUB_XFILE_H__ */ |